diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /intl/icu/source/common/unisetspan.cpp | |
parent | Initial commit. (diff) | |
download | firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip |
Adding upstream version 115.7.0esr.upstream/115.7.0esrupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'intl/icu/source/common/unisetspan.cpp')
-rw-r--r-- | intl/icu/source/common/unisetspan.cpp | 1522 |
1 files changed, 1522 insertions, 0 deletions
diff --git a/intl/icu/source/common/unisetspan.cpp b/intl/icu/source/common/unisetspan.cpp new file mode 100644 index 0000000000..b7256d8fd1 --- /dev/null +++ b/intl/icu/source/common/unisetspan.cpp @@ -0,0 +1,1522 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +****************************************************************************** +* +* Copyright (C) 2007-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: unisetspan.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2007mar01 +* created by: Markus W. Scherer +*/ + +#include "unicode/utypes.h" +#include "unicode/uniset.h" +#include "unicode/ustring.h" +#include "unicode/utf8.h" +#include "unicode/utf16.h" +#include "cmemory.h" +#include "uvector.h" +#include "unisetspan.h" + +U_NAMESPACE_BEGIN + +/* + * List of offsets from the current position from where to try matching + * a code point or a string. + * Store offsets rather than indexes to simplify the code and use the same list + * for both increments (in span()) and decrements (in spanBack()). + * + * Assumption: The maximum offset is limited, and the offsets that are stored + * at any one time are relatively dense, that is, there are normally no gaps of + * hundreds or thousands of offset values. + * + * The implementation uses a circular buffer of byte flags, + * each indicating whether the corresponding offset is in the list. + * This avoids inserting into a sorted list of offsets (or absolute indexes) and + * physically moving part of the list. + * + * Note: In principle, the caller should setMaxLength() to the maximum of the + * max string length and U16_LENGTH/U8_LENGTH to account for + * "long" single code points. + * However, this implementation uses at least a staticList with more than + * U8_LENGTH entries anyway. + * + * Note: If maxLength were guaranteed to be no more than 32 or 64, + * the list could be stored as bit flags in a single integer. + * Rather than handling a circular buffer with a start list index, + * the integer would simply be shifted when lower offsets are removed. + * UnicodeSet does not have a limit on the lengths of strings. + */ +class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. +public: + OffsetList() : list(staticList), capacity(0), length(0), start(0) {} + + ~OffsetList() { + if(list!=staticList) { + uprv_free(list); + } + } + + // Call exactly once if the list is to be used. + void setMaxLength(int32_t maxLength) { + if(maxLength<=(int32_t)sizeof(staticList)) { + capacity=(int32_t)sizeof(staticList); + } else { + UBool *l=(UBool *)uprv_malloc(maxLength); + if(l!=nullptr) { + list=l; + capacity=maxLength; + } + } + uprv_memset(list, 0, capacity); + } + + void clear() { + uprv_memset(list, 0, capacity); + start=length=0; + } + + UBool isEmpty() const { + return (UBool)(length==0); + } + + // Reduce all stored offsets by delta, used when the current position + // moves by delta. + // There must not be any offsets lower than delta. + // If there is an offset equal to delta, it is removed. + // delta=[1..maxLength] + void shift(int32_t delta) { + int32_t i=start+delta; + if(i>=capacity) { + i-=capacity; + } + if(list[i]) { + list[i]=false; + --length; + } + start=i; + } + + // Add an offset. The list must not contain it yet. + // offset=[1..maxLength] + void addOffset(int32_t offset) { + int32_t i=start+offset; + if(i>=capacity) { + i-=capacity; + } + list[i]=true; + ++length; + } + + // offset=[1..maxLength] + UBool containsOffset(int32_t offset) const { + int32_t i=start+offset; + if(i>=capacity) { + i-=capacity; + } + return list[i]; + } + + // Find the lowest stored offset from a non-empty list, remove it, + // and reduce all other offsets by this minimum. + // Returns [1..maxLength]. + int32_t popMinimum() { + // Look for the next offset in list[start+1..capacity-1]. + int32_t i=start, result; + while(++i<capacity) { + if(list[i]) { + list[i]=false; + --length; + result=i-start; + start=i; + return result; + } + } + // i==capacity + + // Wrap around and look for the next offset in list[0..start]. + // Since the list is not empty, there will be one. + result=capacity-start; + i=0; + while(!list[i]) { + ++i; + } + list[i]=false; + --length; + start=i; + return result+=i; + } + +private: + UBool *list; + int32_t capacity; + int32_t length; + int32_t start; + + UBool staticList[16]; +}; + +// Get the number of UTF-8 bytes for a UTF-16 (sub)string. +static int32_t +getUTF8Length(const char16_t *s, int32_t length) { + UErrorCode errorCode=U_ZERO_ERROR; + int32_t length8=0; + u_strToUTF8(nullptr, 0, &length8, s, length, &errorCode); + if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { + return length8; + } else { + // The string contains an unpaired surrogate. + // Ignore this string. + return 0; + } +} + +// Append the UTF-8 version of the string to t and return the appended UTF-8 length. +static int32_t +appendUTF8(const char16_t *s, int32_t length, uint8_t *t, int32_t capacity) { + UErrorCode errorCode=U_ZERO_ERROR; + int32_t length8=0; + u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode); + if(U_SUCCESS(errorCode)) { + return length8; + } else { + // The string contains an unpaired surrogate. + // Ignore this string. + return 0; + } +} + +static inline uint8_t +makeSpanLengthByte(int32_t spanLength) { + // 0xfe==UnicodeSetStringSpan::LONG_SPAN + return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe; +} + +// Construct for all variants of span(), or only for any one variant. +// Initialize as little as possible, for single use. +UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, + const UVector &setStrings, + uint32_t which) + : spanSet(0, 0x10ffff), pSpanNotSet(nullptr), strings(setStrings), + utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr), + utf8Length(0), + maxLength16(0), maxLength8(0), + all((UBool)(which==ALL)) { + spanSet.retainAll(set); + if(which&NOT_CONTAINED) { + // Default to the same sets. + // addToSpanNotSet() will create a separate set if necessary. + pSpanNotSet=&spanSet; + } + + // Determine if the strings even need to be taken into account at all for span() etc. + // If any string is relevant, then all strings need to be used for + // span(longest match) but only the relevant ones for span(while contained). + // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH + // and do not store UTF-8 strings if !thisRelevant and CONTAINED. + // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) + // Also count the lengths of the UTF-8 versions of the strings for memory allocation. + int32_t stringsLength=strings.size(); + + int32_t i, spanLength; + UBool someRelevant=false; + for(i=0; i<stringsLength; ++i) { + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const char16_t *s16=string.getBuffer(); + int32_t length16=string.length(); + if (length16==0) { + continue; // skip the empty string + } + UBool thisRelevant; + spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); + if(spanLength<length16) { // Relevant string. + someRelevant=thisRelevant=true; + } else { + thisRelevant=false; + } + if((which&UTF16) && length16>maxLength16) { + maxLength16=length16; + } + if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { + int32_t length8=getUTF8Length(s16, length16); + utf8Length+=length8; + if(length8>maxLength8) { + maxLength8=length8; + } + } + } + if(!someRelevant) { + maxLength16=maxLength8=0; + return; + } + + // Freeze after checking for the need to use strings at all because freezing + // a set takes some time and memory which are wasted if there are no relevant strings. + if(all) { + spanSet.freeze(); + } + + uint8_t *spanBackLengths; + uint8_t *spanUTF8Lengths; + uint8_t *spanBackUTF8Lengths; + + // Allocate a block of meta data. + int32_t allocSize; + if(all) { + // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. + allocSize=stringsLength*(4+1+1+1+1)+utf8Length; + } else { + allocSize=stringsLength; // One set of span lengths. + if(which&UTF8) { + // UTF-8 lengths and UTF-8 strings. + allocSize+=stringsLength*4+utf8Length; + } + } + if(allocSize<=(int32_t)sizeof(staticLengths)) { + utf8Lengths=staticLengths; + } else { + utf8Lengths=(int32_t *)uprv_malloc(allocSize); + if(utf8Lengths==nullptr) { + maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return false. + return; // Out of memory. + } + } + + if(all) { + // Store span lengths for all span() variants. + spanLengths=(uint8_t *)(utf8Lengths+stringsLength); + spanBackLengths=spanLengths+stringsLength; + spanUTF8Lengths=spanBackLengths+stringsLength; + spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; + utf8=spanBackUTF8Lengths+stringsLength; + } else { + // Store span lengths for only one span() variant. + if(which&UTF8) { + spanLengths=(uint8_t *)(utf8Lengths+stringsLength); + utf8=spanLengths+stringsLength; + } else { + spanLengths=(uint8_t *)utf8Lengths; + } + spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; + } + + // Set the meta data and pSpanNotSet and write the UTF-8 strings. + int32_t utf8Count=0; // Count UTF-8 bytes written so far. + + for(i=0; i<stringsLength; ++i) { + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const char16_t *s16=string.getBuffer(); + int32_t length16=string.length(); + spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); + if(spanLength<length16 && length16>0) { // Relevant string. + if(which&UTF16) { + if(which&CONTAINED) { + if(which&FWD) { + spanLengths[i]=makeSpanLengthByte(spanLength); + } + if(which&BACK) { + spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED); + spanBackLengths[i]=makeSpanLengthByte(spanLength); + } + } else /* not CONTAINED, not all, but NOT_CONTAINED */ { + spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag. + } + } + if(which&UTF8) { + uint8_t *s8=utf8+utf8Count; + int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); + utf8Count+=utf8Lengths[i]=length8; + if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8. + spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED; + } else { // Relevant for UTF-8. + if(which&CONTAINED) { + if(which&FWD) { + spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); + spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); + } + if(which&BACK) { + spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); + spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength); + } + } else /* not CONTAINED, not all, but NOT_CONTAINED */ { + spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag. + } + } + } + if(which&NOT_CONTAINED) { + // Add string start and end code points to the spanNotSet so that + // a span(while not contained) stops before any string. + UChar32 c; + if(which&FWD) { + int32_t len=0; + U16_NEXT(s16, len, length16, c); + addToSpanNotSet(c); + } + if(which&BACK) { + int32_t len=length16; + U16_PREV(s16, 0, len, c); + addToSpanNotSet(c); + } + } + } else { // Irrelevant string. (Also the empty string.) + if(which&UTF8) { + if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. + uint8_t *s8=utf8+utf8Count; + int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); + utf8Count+=utf8Lengths[i]=length8; + } else { + utf8Lengths[i]=0; + } + } + if(all) { + spanLengths[i]=spanBackLengths[i]= + spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= + (uint8_t)ALL_CP_CONTAINED; + } else { + // All spanXYZLengths pointers contain the same address. + spanLengths[i]=(uint8_t)ALL_CP_CONTAINED; + } + } + } + + // Finish. + if(all) { + pSpanNotSet->freeze(); + } +} + +// Copy constructor. Assumes which==ALL for a frozen set. +UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, + const UVector &newParentSetStrings) + : spanSet(otherStringSpan.spanSet), pSpanNotSet(nullptr), strings(newParentSetStrings), + utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr), + utf8Length(otherStringSpan.utf8Length), + maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), + all(true) { + if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { + pSpanNotSet=&spanSet; + } else { + pSpanNotSet=otherStringSpan.pSpanNotSet->clone(); + } + + // Allocate a block of meta data. + // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. + int32_t stringsLength=strings.size(); + int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; + if(allocSize<=(int32_t)sizeof(staticLengths)) { + utf8Lengths=staticLengths; + } else { + utf8Lengths=(int32_t *)uprv_malloc(allocSize); + if(utf8Lengths==nullptr) { + maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return false. + return; // Out of memory. + } + } + + spanLengths=(uint8_t *)(utf8Lengths+stringsLength); + utf8=spanLengths+stringsLength*4; + uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); +} + +UnicodeSetStringSpan::~UnicodeSetStringSpan() { + if(pSpanNotSet!=nullptr && pSpanNotSet!=&spanSet) { + delete pSpanNotSet; + } + if(utf8Lengths!=nullptr && utf8Lengths!=staticLengths) { + uprv_free(utf8Lengths); + } +} + +void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { + if(pSpanNotSet==nullptr || pSpanNotSet==&spanSet) { + if(spanSet.contains(c)) { + return; // Nothing to do. + } + UnicodeSet *newSet=spanSet.cloneAsThawed(); + if(newSet==nullptr) { + return; // Out of memory. + } else { + pSpanNotSet=newSet; + } + } + pSpanNotSet->add(c); +} + +// Compare strings without any argument checks. Requires length>0. +static inline UBool +matches16(const char16_t *s, const char16_t *t, int32_t length) { + do { + if(*s++!=*t++) { + return false; + } + } while(--length>0); + return true; +} + +static inline UBool +matches8(const uint8_t *s, const uint8_t *t, int32_t length) { + do { + if(*s++!=*t++) { + return false; + } + } while(--length>0); + return true; +} + +// Compare 16-bit Unicode strings (which may be malformed UTF-16) +// at code point boundaries. +// That is, each edge of a match must not be in the middle of a surrogate pair. +static inline UBool +matches16CPB(const char16_t *s, int32_t start, int32_t limit, const char16_t *t, int32_t length) { + s+=start; + limit-=start; + return matches16(s, t, length) && + !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && + !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])); +} + +// Does the set contain the next code point? +// If so, return its length; otherwise return its negative length. +static inline int32_t +spanOne(const UnicodeSet &set, const char16_t *s, int32_t length) { + char16_t c=*s, c2; + if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { + return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; + } + return set.contains(c) ? 1 : -1; +} + +static inline int32_t +spanOneBack(const UnicodeSet &set, const char16_t *s, int32_t length) { + char16_t c=s[length-1], c2; + if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { + return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; + } + return set.contains(c) ? 1 : -1; +} + +static inline int32_t +spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { + UChar32 c=*s; + if(U8_IS_SINGLE(c)) { + return set.contains(c) ? 1 : -1; + } + // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD(). + int32_t i=0; + U8_NEXT_OR_FFFD(s, i, length, c); + return set.contains(c) ? i : -i; +} + +static inline int32_t +spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { + UChar32 c=s[length-1]; + if(U8_IS_SINGLE(c)) { + return set.contains(c) ? 1 : -1; + } + int32_t i=length-1; + c=utf8_prevCharSafeBody(s, 0, &i, c, -3); + length-=i; + return set.contains(c) ? length : -length; +} + +/* + * Note: In span() when spanLength==0 (after a string match, or at the beginning + * after an empty code point span) and in spanNot() and spanNotUTF8(), + * string matching could use a binary search + * because all string matches are done from the same start index. + * + * For UTF-8, this would require a comparison function that returns UTF-16 order. + * + * This optimization should not be necessary for normal UnicodeSets because + * most sets have no strings, and most sets with strings have + * very few very short strings. + * For cases with many strings, it might be better to use a different API + * and implementation with a DFA (state machine). + */ + +/* + * Algorithm for span(USET_SPAN_CONTAINED) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then remember to continue after it. + * + If a set string matches at the current position, then remember to continue after it. + * + Either recursively span for each code point or string match, + * or recursively span for all but the shortest one and + * iteratively continue the span with the shortest local match. + * + Remember the longest recursive span (the farthest end point). + * + If there is no match at the current position, neither for the code point there + * nor for any set string, then stop and return the longest recursive span length. + * + * Optimized implementation: + * + * (We assume that most sets will have very few very short strings. + * A span using a string-less set is extremely fast.) + * + * Create and cache a spanSet which contains all of the single code points + * of the original set but none of its strings. + * + * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). + * - Loop: + * + Try to match each set string at the end of the spanLength. + * ~ Set strings that start with set-contained code points must be matched + * with a partial overlap because the recursive algorithm would have tried + * to match them at every position. + * ~ Set strings that entirely consist of set-contained code points + * are irrelevant for span(USET_SPAN_CONTAINED) because the + * recursive algorithm would continue after them anyway + * and find the longest recursive match from their end. + * ~ Rather than recursing, note each end point of a set string match. + * + If no set string matched after spanSet.span(), then return + * with where the spanSet.span() ended. + * + If at least one set string matched after spanSet.span(), then + * pop the shortest string match end point and continue + * the loop, trying to match all set strings from there. + * + If at least one more set string matched after a previous string match, + * then test if the code point after the previous string match is also + * contained in the set. + * Continue the loop with the shortest end point of either this code point + * or a matching set string. + * + If no more set string matched after a previous string match, + * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). + * Stop if spanLength==0, otherwise continue the loop. + * + * By noting each end point of a set string match, + * the function visits each string position at most once and finishes + * in linear time. + * + * The recursive algorithm may visit the same string position many times + * if multiple paths lead to it and finishes in exponential time. + */ + +/* + * Algorithm for span(USET_SPAN_SIMPLE) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then remember to continue after it. + * + If a set string matches at the current position, then remember to continue after it. + * + Continue from the farthest match position and ignore all others. + * + If there is no match at the current position, + * then stop and return the current position. + * + * Optimized implementation: + * + * (Same assumption and spanSet as above.) + * + * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). + * - Loop: + * + Try to match each set string at the end of the spanLength. + * ~ Set strings that start with set-contained code points must be matched + * with a partial overlap because the standard algorithm would have tried + * to match them earlier. + * ~ Set strings that entirely consist of set-contained code points + * must be matched with a full overlap because the longest-match algorithm + * would hide set string matches that end earlier. + * Such set strings need not be matched earlier inside the code point span + * because the standard algorithm would then have continued after + * the set string match anyway. + * ~ Remember the longest set string match (farthest end point) from the earliest + * starting point. + * + If no set string matched after spanSet.span(), then return + * with where the spanSet.span() ended. + * + If at least one set string matched, then continue the loop after the + * longest match from the earliest position. + * + If no more set string matched after a previous string match, + * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). + * Stop if spanLength==0, otherwise continue the loop. + */ + +int32_t UnicodeSetStringSpan::span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const { + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + return spanNot(s, length); + } + int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); + if(spanLength==length) { + return length; + } + + // Consider strings; they may overlap with the span. + OffsetList offsets; + if(spanCondition==USET_SPAN_CONTAINED) { + // Use offset list to try all possibilities. + offsets.setMaxLength(maxLength16); + } + int32_t pos=spanLength, rest=length-pos; + int32_t i, stringsLength=strings.size(); + for(;;) { + if(spanCondition==USET_SPAN_CONTAINED) { + for(i=0; i<stringsLength; ++i) { + int32_t overlap=spanLengths[i]; + if(overlap==ALL_CP_CONTAINED) { + continue; // Irrelevant string. (Also the empty string.) + } + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const char16_t *s16=string.getBuffer(); + int32_t length16=string.length(); + U_ASSERT(length>0); + + // Try to match this string at pos-overlap..pos. + if(overlap>=LONG_SPAN) { + overlap=length16; + // While contained: No point matching fully inside the code point span. + U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t inc=length16-overlap; // Keep overlap+inc==length16. + for(;;) { + if(inc>rest) { + break; + } + // Try to match if the increment is not listed already. + if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { + if(inc==rest) { + return length; // Reached the end of the string. + } + offsets.addOffset(inc); + } + if(overlap==0) { + break; + } + --overlap; + ++inc; + } + } + } else /* USET_SPAN_SIMPLE */ { + int32_t maxInc=0, maxOverlap=0; + for(i=0; i<stringsLength; ++i) { + int32_t overlap=spanLengths[i]; + // For longest match, we do need to try to match even an all-contained string + // to find the match from the earliest start. + + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const char16_t *s16=string.getBuffer(); + int32_t length16=string.length(); + if (length16==0) { + continue; // skip the empty string + } + + // Try to match this string at pos-overlap..pos. + if(overlap>=LONG_SPAN) { + overlap=length16; + // Longest match: Need to match fully inside the code point span + // to find the match from the earliest start. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t inc=length16-overlap; // Keep overlap+inc==length16. + for(;;) { + if(inc>rest || overlap<maxOverlap) { + break; + } + // Try to match if the string is longer or starts earlier. + if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && + matches16CPB(s, pos-overlap, length, s16, length16) + ) { + maxInc=inc; // Longest match from earliest start. + maxOverlap=overlap; + break; + } + --overlap; + ++inc; + } + } + + if(maxInc!=0 || maxOverlap!=0) { + // Longest-match algorithm, and there was a string match. + // Simply continue after it. + pos+=maxInc; + rest-=maxInc; + if(rest==0) { + return length; // Reached the end of the string. + } + spanLength=0; // Match strings from after a string match. + continue; + } + } + // Finished trying to match all strings at pos. + + if(spanLength!=0 || pos==0) { + // The position is after an unlimited code point span (spanLength!=0), + // not after a string match. + // The only position where spanLength==0 after a span is pos==0. + // Otherwise, an unlimited code point span is only tried again when no + // strings match, and if such a non-initial span fails we stop. + if(offsets.isEmpty()) { + return pos; // No strings matched after a span. + } + // Match strings from after the next string match. + } else { + // The position is after a string match (or a single code point). + if(offsets.isEmpty()) { + // No more strings matched after a previous string match. + // Try another code point span from after the last string match. + spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); + if( spanLength==rest || // Reached the end of the string, or + spanLength==0 // neither strings nor span progressed. + ) { + return pos+spanLength; + } + pos+=spanLength; + rest-=spanLength; + continue; // spanLength>0: Match strings from after a span. + } else { + // Try to match only one code point from after a string match if some + // string matched beyond it, so that we try all possible positions + // and don't overshoot. + spanLength=spanOne(spanSet, s+pos, rest); + if(spanLength>0) { + if(spanLength==rest) { + return length; // Reached the end of the string. + } + // Match strings after this code point. + // There cannot be any increments below it because UnicodeSet strings + // contain multiple code points. + pos+=spanLength; + rest-=spanLength; + offsets.shift(spanLength); + spanLength=0; + continue; // Match strings from after a single code point. + } + // Match strings from after the next string match. + } + } + int32_t minOffset=offsets.popMinimum(); + pos+=minOffset; + rest-=minOffset; + spanLength=0; // Match strings from after a string match. + } +} + +int32_t UnicodeSetStringSpan::spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const { + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + return spanNotBack(s, length); + } + int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); + if(pos==0) { + return 0; + } + int32_t spanLength=length-pos; + + // Consider strings; they may overlap with the span. + OffsetList offsets; + if(spanCondition==USET_SPAN_CONTAINED) { + // Use offset list to try all possibilities. + offsets.setMaxLength(maxLength16); + } + int32_t i, stringsLength=strings.size(); + uint8_t *spanBackLengths=spanLengths; + if(all) { + spanBackLengths+=stringsLength; + } + for(;;) { + if(spanCondition==USET_SPAN_CONTAINED) { + for(i=0; i<stringsLength; ++i) { + int32_t overlap=spanBackLengths[i]; + if(overlap==ALL_CP_CONTAINED) { + continue; // Irrelevant string. (Also the empty string.) + } + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const char16_t *s16=string.getBuffer(); + int32_t length16=string.length(); + U_ASSERT(length>0); + + // Try to match this string at pos-(length16-overlap)..pos-length16. + if(overlap>=LONG_SPAN) { + overlap=length16; + // While contained: No point matching fully inside the code point span. + int32_t len1=0; + U16_FWD_1(s16, len1, overlap); + overlap-=len1; // Length of the string minus the first code point. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t dec=length16-overlap; // Keep dec+overlap==length16. + for(;;) { + if(dec>pos) { + break; + } + // Try to match if the decrement is not listed already. + if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { + if(dec==pos) { + return 0; // Reached the start of the string. + } + offsets.addOffset(dec); + } + if(overlap==0) { + break; + } + --overlap; + ++dec; + } + } + } else /* USET_SPAN_SIMPLE */ { + int32_t maxDec=0, maxOverlap=0; + for(i=0; i<stringsLength; ++i) { + int32_t overlap=spanBackLengths[i]; + // For longest match, we do need to try to match even an all-contained string + // to find the match from the latest end. + + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const char16_t *s16=string.getBuffer(); + int32_t length16=string.length(); + if (length16==0) { + continue; // skip the empty string + } + + // Try to match this string at pos-(length16-overlap)..pos-length16. + if(overlap>=LONG_SPAN) { + overlap=length16; + // Longest match: Need to match fully inside the code point span + // to find the match from the latest end. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t dec=length16-overlap; // Keep dec+overlap==length16. + for(;;) { + if(dec>pos || overlap<maxOverlap) { + break; + } + // Try to match if the string is longer or ends later. + if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && + matches16CPB(s, pos-dec, length, s16, length16) + ) { + maxDec=dec; // Longest match from latest end. + maxOverlap=overlap; + break; + } + --overlap; + ++dec; + } + } + + if(maxDec!=0 || maxOverlap!=0) { + // Longest-match algorithm, and there was a string match. + // Simply continue before it. + pos-=maxDec; + if(pos==0) { + return 0; // Reached the start of the string. + } + spanLength=0; // Match strings from before a string match. + continue; + } + } + // Finished trying to match all strings at pos. + + if(spanLength!=0 || pos==length) { + // The position is before an unlimited code point span (spanLength!=0), + // not before a string match. + // The only position where spanLength==0 before a span is pos==length. + // Otherwise, an unlimited code point span is only tried again when no + // strings match, and if such a non-initial span fails we stop. + if(offsets.isEmpty()) { + return pos; // No strings matched before a span. + } + // Match strings from before the next string match. + } else { + // The position is before a string match (or a single code point). + if(offsets.isEmpty()) { + // No more strings matched before a previous string match. + // Try another code point span from before the last string match. + int32_t oldPos=pos; + pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); + spanLength=oldPos-pos; + if( pos==0 || // Reached the start of the string, or + spanLength==0 // neither strings nor span progressed. + ) { + return pos; + } + continue; // spanLength>0: Match strings from before a span. + } else { + // Try to match only one code point from before a string match if some + // string matched beyond it, so that we try all possible positions + // and don't overshoot. + spanLength=spanOneBack(spanSet, s, pos); + if(spanLength>0) { + if(spanLength==pos) { + return 0; // Reached the start of the string. + } + // Match strings before this code point. + // There cannot be any decrements below it because UnicodeSet strings + // contain multiple code points. + pos-=spanLength; + offsets.shift(spanLength); + spanLength=0; + continue; // Match strings from before a single code point. + } + // Match strings from before the next string match. + } + } + pos-=offsets.popMinimum(); + spanLength=0; // Match strings from before a string match. + } +} + +int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + return spanNotUTF8(s, length); + } + int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED); + if(spanLength==length) { + return length; + } + + // Consider strings; they may overlap with the span. + OffsetList offsets; + if(spanCondition==USET_SPAN_CONTAINED) { + // Use offset list to try all possibilities. + offsets.setMaxLength(maxLength8); + } + int32_t pos=spanLength, rest=length-pos; + int32_t i, stringsLength=strings.size(); + uint8_t *spanUTF8Lengths=spanLengths; + if(all) { + spanUTF8Lengths+=2*stringsLength; + } + for(;;) { + const uint8_t *s8=utf8; + int32_t length8; + if(spanCondition==USET_SPAN_CONTAINED) { + for(i=0; i<stringsLength; ++i) { + length8=utf8Lengths[i]; + if(length8==0) { + continue; // String not representable in UTF-8. + } + int32_t overlap=spanUTF8Lengths[i]; + if(overlap==ALL_CP_CONTAINED) { + s8+=length8; + continue; // Irrelevant string. + } + + // Try to match this string at pos-overlap..pos. + if(overlap>=LONG_SPAN) { + overlap=length8; + // While contained: No point matching fully inside the code point span. + U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t inc=length8-overlap; // Keep overlap+inc==length8. + for(;;) { + if(inc>rest) { + break; + } + // Try to match if the increment is not listed already. + // Match at code point boundaries. (The UTF-8 strings were converted + // from UTF-16 and are guaranteed to be well-formed.) + if(!U8_IS_TRAIL(s[pos-overlap]) && + !offsets.containsOffset(inc) && + matches8(s+pos-overlap, s8, length8)) { + if(inc==rest) { + return length; // Reached the end of the string. + } + offsets.addOffset(inc); + } + if(overlap==0) { + break; + } + --overlap; + ++inc; + } + s8+=length8; + } + } else /* USET_SPAN_SIMPLE */ { + int32_t maxInc=0, maxOverlap=0; + for(i=0; i<stringsLength; ++i) { + length8=utf8Lengths[i]; + if(length8==0) { + continue; // String not representable in UTF-8. + } + int32_t overlap=spanUTF8Lengths[i]; + // For longest match, we do need to try to match even an all-contained string + // to find the match from the earliest start. + + // Try to match this string at pos-overlap..pos. + if(overlap>=LONG_SPAN) { + overlap=length8; + // Longest match: Need to match fully inside the code point span + // to find the match from the earliest start. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t inc=length8-overlap; // Keep overlap+inc==length8. + for(;;) { + if(inc>rest || overlap<maxOverlap) { + break; + } + // Try to match if the string is longer or starts earlier. + // Match at code point boundaries. (The UTF-8 strings were converted + // from UTF-16 and are guaranteed to be well-formed.) + if(!U8_IS_TRAIL(s[pos-overlap]) && + (overlap>maxOverlap || + /* redundant overlap==maxOverlap && */ inc>maxInc) && + matches8(s+pos-overlap, s8, length8)) { + maxInc=inc; // Longest match from earliest start. + maxOverlap=overlap; + break; + } + --overlap; + ++inc; + } + s8+=length8; + } + + if(maxInc!=0 || maxOverlap!=0) { + // Longest-match algorithm, and there was a string match. + // Simply continue after it. + pos+=maxInc; + rest-=maxInc; + if(rest==0) { + return length; // Reached the end of the string. + } + spanLength=0; // Match strings from after a string match. + continue; + } + } + // Finished trying to match all strings at pos. + + if(spanLength!=0 || pos==0) { + // The position is after an unlimited code point span (spanLength!=0), + // not after a string match. + // The only position where spanLength==0 after a span is pos==0. + // Otherwise, an unlimited code point span is only tried again when no + // strings match, and if such a non-initial span fails we stop. + if(offsets.isEmpty()) { + return pos; // No strings matched after a span. + } + // Match strings from after the next string match. + } else { + // The position is after a string match (or a single code point). + if(offsets.isEmpty()) { + // No more strings matched after a previous string match. + // Try another code point span from after the last string match. + spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED); + if( spanLength==rest || // Reached the end of the string, or + spanLength==0 // neither strings nor span progressed. + ) { + return pos+spanLength; + } + pos+=spanLength; + rest-=spanLength; + continue; // spanLength>0: Match strings from after a span. + } else { + // Try to match only one code point from after a string match if some + // string matched beyond it, so that we try all possible positions + // and don't overshoot. + spanLength=spanOneUTF8(spanSet, s+pos, rest); + if(spanLength>0) { + if(spanLength==rest) { + return length; // Reached the end of the string. + } + // Match strings after this code point. + // There cannot be any increments below it because UnicodeSet strings + // contain multiple code points. + pos+=spanLength; + rest-=spanLength; + offsets.shift(spanLength); + spanLength=0; + continue; // Match strings from after a single code point. + } + // Match strings from after the next string match. + } + } + int32_t minOffset=offsets.popMinimum(); + pos+=minOffset; + rest-=minOffset; + spanLength=0; // Match strings from after a string match. + } +} + +int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + return spanNotBackUTF8(s, length); + } + int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED); + if(pos==0) { + return 0; + } + int32_t spanLength=length-pos; + + // Consider strings; they may overlap with the span. + OffsetList offsets; + if(spanCondition==USET_SPAN_CONTAINED) { + // Use offset list to try all possibilities. + offsets.setMaxLength(maxLength8); + } + int32_t i, stringsLength=strings.size(); + uint8_t *spanBackUTF8Lengths=spanLengths; + if(all) { + spanBackUTF8Lengths+=3*stringsLength; + } + for(;;) { + const uint8_t *s8=utf8; + int32_t length8; + if(spanCondition==USET_SPAN_CONTAINED) { + for(i=0; i<stringsLength; ++i) { + length8=utf8Lengths[i]; + if(length8==0) { + continue; // String not representable in UTF-8. + } + int32_t overlap=spanBackUTF8Lengths[i]; + if(overlap==ALL_CP_CONTAINED) { + s8+=length8; + continue; // Irrelevant string. + } + + // Try to match this string at pos-(length8-overlap)..pos-length8. + if(overlap>=LONG_SPAN) { + overlap=length8; + // While contained: No point matching fully inside the code point span. + int32_t len1=0; + U8_FWD_1(s8, len1, overlap); + overlap-=len1; // Length of the string minus the first code point. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t dec=length8-overlap; // Keep dec+overlap==length8. + for(;;) { + if(dec>pos) { + break; + } + // Try to match if the decrement is not listed already. + // Match at code point boundaries. (The UTF-8 strings were converted + // from UTF-16 and are guaranteed to be well-formed.) + if( !U8_IS_TRAIL(s[pos-dec]) && + !offsets.containsOffset(dec) && + matches8(s+pos-dec, s8, length8) + ) { + if(dec==pos) { + return 0; // Reached the start of the string. + } + offsets.addOffset(dec); + } + if(overlap==0) { + break; + } + --overlap; + ++dec; + } + s8+=length8; + } + } else /* USET_SPAN_SIMPLE */ { + int32_t maxDec=0, maxOverlap=0; + for(i=0; i<stringsLength; ++i) { + length8=utf8Lengths[i]; + if(length8==0) { + continue; // String not representable in UTF-8. + } + int32_t overlap=spanBackUTF8Lengths[i]; + // For longest match, we do need to try to match even an all-contained string + // to find the match from the latest end. + + // Try to match this string at pos-(length8-overlap)..pos-length8. + if(overlap>=LONG_SPAN) { + overlap=length8; + // Longest match: Need to match fully inside the code point span + // to find the match from the latest end. + } + if(overlap>spanLength) { + overlap=spanLength; + } + int32_t dec=length8-overlap; // Keep dec+overlap==length8. + for(;;) { + if(dec>pos || overlap<maxOverlap) { + break; + } + // Try to match if the string is longer or ends later. + // Match at code point boundaries. (The UTF-8 strings were converted + // from UTF-16 and are guaranteed to be well-formed.) + if( !U8_IS_TRAIL(s[pos-dec]) && + (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && + matches8(s+pos-dec, s8, length8) + ) { + maxDec=dec; // Longest match from latest end. + maxOverlap=overlap; + break; + } + --overlap; + ++dec; + } + s8+=length8; + } + + if(maxDec!=0 || maxOverlap!=0) { + // Longest-match algorithm, and there was a string match. + // Simply continue before it. + pos-=maxDec; + if(pos==0) { + return 0; // Reached the start of the string. + } + spanLength=0; // Match strings from before a string match. + continue; + } + } + // Finished trying to match all strings at pos. + + if(spanLength!=0 || pos==length) { + // The position is before an unlimited code point span (spanLength!=0), + // not before a string match. + // The only position where spanLength==0 before a span is pos==length. + // Otherwise, an unlimited code point span is only tried again when no + // strings match, and if such a non-initial span fails we stop. + if(offsets.isEmpty()) { + return pos; // No strings matched before a span. + } + // Match strings from before the next string match. + } else { + // The position is before a string match (or a single code point). + if(offsets.isEmpty()) { + // No more strings matched before a previous string match. + // Try another code point span from before the last string match. + int32_t oldPos=pos; + pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED); + spanLength=oldPos-pos; + if( pos==0 || // Reached the start of the string, or + spanLength==0 // neither strings nor span progressed. + ) { + return pos; + } + continue; // spanLength>0: Match strings from before a span. + } else { + // Try to match only one code point from before a string match if some + // string matched beyond it, so that we try all possible positions + // and don't overshoot. + spanLength=spanOneBackUTF8(spanSet, s, pos); + if(spanLength>0) { + if(spanLength==pos) { + return 0; // Reached the start of the string. + } + // Match strings before this code point. + // There cannot be any decrements below it because UnicodeSet strings + // contain multiple code points. + pos-=spanLength; + offsets.shift(spanLength); + spanLength=0; + continue; // Match strings from before a single code point. + } + // Match strings from before the next string match. + } + } + pos-=offsets.popMinimum(); + spanLength=0; // Match strings from before a string match. + } +} + +/* + * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then return with the current position. + * + If a set string matches at the current position, then return with the current position. + * + * Optimized implementation: + * + * (Same assumption as for span() above.) + * + * Create and cache a spanNotSet which contains all of the single code points + * of the original set but none of its strings. + * For each set string add its initial code point to the spanNotSet. + * (Also add its final code point for spanNotBack().) + * + * - Loop: + * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). + * + If the current code point is in the original set, then + * return the current position. + * + If any set string matches at the current position, then + * return the current position. + * + If there is no match at the current position, neither for the code point there + * nor for any set string, then skip this code point and continue the loop. + * This happens for set-string-initial code points that were added to spanNotSet + * when there is not actually a match for such a set string. + */ + +int32_t UnicodeSetStringSpan::spanNot(const char16_t *s, int32_t length) const { + int32_t pos=0, rest=length; + int32_t i, stringsLength=strings.size(); + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); + if(i==rest) { + return length; // Reached the end of the string. + } + pos+=i; + rest-=i; + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int32_t cpLength=spanOne(spanSet, s+pos, rest); + if(cpLength>0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + for(i=0; i<stringsLength; ++i) { + if(spanLengths[i]==ALL_CP_CONTAINED) { + continue; // Irrelevant string. (Also the empty string.) + } + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const char16_t *s16=string.getBuffer(); + int32_t length16=string.length(); + U_ASSERT(length>0); + if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { + return pos; // There is a set element at pos. + } + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. + // cpLength<0 + pos-=cpLength; + rest+=cpLength; + } while(rest!=0); + return length; // Reached the end of the string. +} + +int32_t UnicodeSetStringSpan::spanNotBack(const char16_t *s, int32_t length) const { + int32_t pos=length; + int32_t i, stringsLength=strings.size(); + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); + if(pos==0) { + return 0; // Reached the start of the string. + } + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int32_t cpLength=spanOneBack(spanSet, s, pos); + if(cpLength>0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + for(i=0; i<stringsLength; ++i) { + // Use spanLengths rather than a spanBackLengths pointer because + // it is easier and we only need to know whether the string is irrelevant + // which is the same in either array. + if(spanLengths[i]==ALL_CP_CONTAINED) { + continue; // Irrelevant string. (Also the empty string.) + } + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); + const char16_t *s16=string.getBuffer(); + int32_t length16=string.length(); + U_ASSERT(length>0); + if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { + return pos; // There is a set element at pos. + } + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. + // cpLength<0 + pos+=cpLength; + } while(pos!=0); + return 0; // Reached the start of the string. +} + +int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { + int32_t pos=0, rest=length; + int32_t i, stringsLength=strings.size(); + uint8_t *spanUTF8Lengths=spanLengths; + if(all) { + spanUTF8Lengths+=2*stringsLength; + } + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED); + if(i==rest) { + return length; // Reached the end of the string. + } + pos+=i; + rest-=i; + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); + if(cpLength>0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + const uint8_t *s8=utf8; + int32_t length8; + for(i=0; i<stringsLength; ++i) { + length8=utf8Lengths[i]; + // ALL_CP_CONTAINED: Irrelevant string. + if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { + return pos; // There is a set element at pos. + } + s8+=length8; + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. + // cpLength<0 + pos-=cpLength; + rest+=cpLength; + } while(rest!=0); + return length; // Reached the end of the string. +} + +int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { + int32_t pos=length; + int32_t i, stringsLength=strings.size(); + uint8_t *spanBackUTF8Lengths=spanLengths; + if(all) { + spanBackUTF8Lengths+=3*stringsLength; + } + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED); + if(pos==0) { + return 0; // Reached the start of the string. + } + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); + if(cpLength>0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + const uint8_t *s8=utf8; + int32_t length8; + for(i=0; i<stringsLength; ++i) { + length8=utf8Lengths[i]; + // ALL_CP_CONTAINED: Irrelevant string. + if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { + return pos; // There is a set element at pos. + } + s8+=length8; + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. + // cpLength<0 + pos+=cpLength; + } while(pos!=0); + return 0; // Reached the start of the string. +} + +U_NAMESPACE_END |