diff options
Diffstat (limited to 'intl/icu/source/common/ucharstriebuilder.cpp')
-rw-r--r-- | intl/icu/source/common/ucharstriebuilder.cpp | 443 |
1 files changed, 443 insertions, 0 deletions
diff --git a/intl/icu/source/common/ucharstriebuilder.cpp b/intl/icu/source/common/ucharstriebuilder.cpp new file mode 100644 index 0000000000..049997a275 --- /dev/null +++ b/intl/icu/source/common/ucharstriebuilder.cpp @@ -0,0 +1,443 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2010-2012, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: ucharstriebuilder.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2010nov14 +* created by: Markus W. Scherer +*/ + +#include "unicode/utypes.h" +#include "unicode/ucharstrie.h" +#include "unicode/ucharstriebuilder.h" +#include "unicode/unistr.h" +#include "unicode/ustring.h" +#include "cmemory.h" +#include "uarrsort.h" +#include "uassert.h" +#include "uhash.h" +#include "ustr_imp.h" + +U_NAMESPACE_BEGIN + +/* + * Note: This builder implementation stores (string, value) pairs with full copies + * of the 16-bit-unit sequences, until the UCharsTrie is built. + * It might(!) take less memory if we collected the data in a temporary, dynamic trie. + */ + +class UCharsTrieElement : public UMemory { +public: + // Use compiler's default constructor, initializes nothing. + + void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode); + + UnicodeString getString(const UnicodeString &strings) const { + int32_t length=strings[stringOffset]; + return strings.tempSubString(stringOffset+1, length); + } + int32_t getStringLength(const UnicodeString &strings) const { + return strings[stringOffset]; + } + + UChar charAt(int32_t index, const UnicodeString &strings) const { + return strings[stringOffset+1+index]; + } + + int32_t getValue() const { return value; } + + int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const; + +private: + // The first strings unit contains the string length. + // (Compared with a stringLength field here, this saves 2 bytes per string.) + int32_t stringOffset; + int32_t value; +}; + +void +UCharsTrieElement::setTo(const UnicodeString &s, int32_t val, + UnicodeString &strings, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return; + } + int32_t length=s.length(); + if(length>0xffff) { + // Too long: We store the length in 1 unit. + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return; + } + stringOffset=strings.length(); + strings.append((UChar)length); + value=val; + strings.append(s); +} + +int32_t +UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const { + return getString(strings).compare(other.getString(strings)); +} + +UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/) + : elements(NULL), elementsCapacity(0), elementsLength(0), + uchars(NULL), ucharsCapacity(0), ucharsLength(0) {} + +UCharsTrieBuilder::~UCharsTrieBuilder() { + delete[] elements; + uprv_free(uchars); +} + +UCharsTrieBuilder & +UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return *this; + } + if(ucharsLength>0) { + // Cannot add elements after building. + errorCode=U_NO_WRITE_PERMISSION; + return *this; + } + if(elementsLength==elementsCapacity) { + int32_t newCapacity; + if(elementsCapacity==0) { + newCapacity=1024; + } else { + newCapacity=4*elementsCapacity; + } + UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity]; + if(newElements==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + return *this; + } + if(elementsLength>0) { + uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(UCharsTrieElement)); + } + delete[] elements; + elements=newElements; + elementsCapacity=newCapacity; + } + elements[elementsLength++].setTo(s, value, strings, errorCode); + if(U_SUCCESS(errorCode) && strings.isBogus()) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + } + return *this; +} + +U_CDECL_BEGIN + +static int32_t U_CALLCONV +compareElementStrings(const void *context, const void *left, const void *right) { + const UnicodeString *strings=static_cast<const UnicodeString *>(context); + const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left); + const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right); + return leftElement->compareStringTo(*rightElement, *strings); +} + +U_CDECL_END + +UCharsTrie * +UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { + buildUChars(buildOption, errorCode); + UCharsTrie *newTrie=NULL; + if(U_SUCCESS(errorCode)) { + newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength)); + if(newTrie==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + } else { + uchars=NULL; // The new trie now owns the array. + ucharsCapacity=0; + } + } + return newTrie; +} + +UnicodeString & +UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result, + UErrorCode &errorCode) { + buildUChars(buildOption, errorCode); + if(U_SUCCESS(errorCode)) { + result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength); + } + return result; +} + +void +UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return; + } + if(uchars!=NULL && ucharsLength>0) { + // Already built. + return; + } + if(ucharsLength==0) { + if(elementsLength==0) { + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return; + } + if(strings.isBogus()) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + return; + } + uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement), + compareElementStrings, &strings, + FALSE, // need not be a stable sort + &errorCode); + if(U_FAILURE(errorCode)) { + return; + } + // Duplicate strings are not allowed. + UnicodeString prev=elements[0].getString(strings); + for(int32_t i=1; i<elementsLength; ++i) { + UnicodeString current=elements[i].getString(strings); + if(prev==current) { + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + return; + } + prev.fastCopyFrom(current); + } + } + // Create and UChar-serialize the trie for the elements. + ucharsLength=0; + int32_t capacity=strings.length(); + if(capacity<1024) { + capacity=1024; + } + if(ucharsCapacity<capacity) { + uprv_free(uchars); + uchars=static_cast<UChar *>(uprv_malloc(capacity*2)); + if(uchars==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + ucharsCapacity=0; + return; + } + ucharsCapacity=capacity; + } + StringTrieBuilder::build(buildOption, elementsLength, errorCode); + if(uchars==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + } +} + +int32_t +UCharsTrieBuilder::getElementStringLength(int32_t i) const { + return elements[i].getStringLength(strings); +} + +UChar +UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const { + return elements[i].charAt(unitIndex, strings); +} + +int32_t +UCharsTrieBuilder::getElementValue(int32_t i) const { + return elements[i].getValue(); +} + +int32_t +UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const { + const UCharsTrieElement &firstElement=elements[first]; + const UCharsTrieElement &lastElement=elements[last]; + int32_t minStringLength=firstElement.getStringLength(strings); + while(++unitIndex<minStringLength && + firstElement.charAt(unitIndex, strings)== + lastElement.charAt(unitIndex, strings)) {} + return unitIndex; +} + +int32_t +UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const { + int32_t length=0; // Number of different units at unitIndex. + int32_t i=start; + do { + UChar unit=elements[i++].charAt(unitIndex, strings); + while(i<limit && unit==elements[i].charAt(unitIndex, strings)) { + ++i; + } + ++length; + } while(i<limit); + return length; +} + +int32_t +UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const { + do { + UChar unit=elements[i++].charAt(unitIndex, strings); + while(unit==elements[i].charAt(unitIndex, strings)) { + ++i; + } + } while(--count>0); + return i; +} + +int32_t +UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const { + while(unit==elements[i].charAt(unitIndex, strings)) { + ++i; + } + return i; +} + +UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode) + : LinearMatchNode(len, nextNode), s(units) { + hash=hash*37u+ustr_hashUCharsN(units, len); +} + +UBool +UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const { + if(this==&other) { + return TRUE; + } + if(!LinearMatchNode::operator==(other)) { + return FALSE; + } + const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other; + return 0==u_memcmp(s, o.s, length); +} + +void +UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) { + UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder; + next->write(builder); + b.write(s, length); + offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1); +} + +StringTrieBuilder::Node * +UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, + Node *nextNode) const { + return new UCTLinearMatchNode( + elements[i].getString(strings).getBuffer()+unitIndex, + length, + nextNode); +} + +UBool +UCharsTrieBuilder::ensureCapacity(int32_t length) { + if(uchars==NULL) { + return FALSE; // previous memory allocation had failed + } + if(length>ucharsCapacity) { + int32_t newCapacity=ucharsCapacity; + do { + newCapacity*=2; + } while(newCapacity<=length); + UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2)); + if(newUChars==NULL) { + // unable to allocate memory + uprv_free(uchars); + uchars=NULL; + ucharsCapacity=0; + return FALSE; + } + u_memcpy(newUChars+(newCapacity-ucharsLength), + uchars+(ucharsCapacity-ucharsLength), ucharsLength); + uprv_free(uchars); + uchars=newUChars; + ucharsCapacity=newCapacity; + } + return TRUE; +} + +int32_t +UCharsTrieBuilder::write(int32_t unit) { + int32_t newLength=ucharsLength+1; + if(ensureCapacity(newLength)) { + ucharsLength=newLength; + uchars[ucharsCapacity-ucharsLength]=(UChar)unit; + } + return ucharsLength; +} + +int32_t +UCharsTrieBuilder::write(const UChar *s, int32_t length) { + int32_t newLength=ucharsLength+length; + if(ensureCapacity(newLength)) { + ucharsLength=newLength; + u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length); + } + return ucharsLength; +} + +int32_t +UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) { + return write(elements[i].getString(strings).getBuffer()+unitIndex, length); +} + +int32_t +UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { + if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) { + return write(i|(isFinal<<15)); + } + UChar intUnits[3]; + int32_t length; + if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) { + intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead); + intUnits[1]=(UChar)((uint32_t)i>>16); + intUnits[2]=(UChar)i; + length=3; + // } else if(i<=UCharsTrie::kMaxOneUnitValue) { + // intUnits[0]=(UChar)(i); + // length=1; + } else { + intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16)); + intUnits[1]=(UChar)i; + length=2; + } + intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15)); + return write(intUnits, length); +} + +int32_t +UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { + if(!hasValue) { + return write(node); + } + UChar intUnits[3]; + int32_t length; + if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) { + intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead); + intUnits[1]=(UChar)((uint32_t)value>>16); + intUnits[2]=(UChar)value; + length=3; + } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) { + intUnits[0]=(UChar)((value+1)<<6); + length=1; + } else { + intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0)); + intUnits[1]=(UChar)value; + length=2; + } + intUnits[0]|=(UChar)node; + return write(intUnits, length); +} + +int32_t +UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) { + int32_t i=ucharsLength-jumpTarget; + U_ASSERT(i>=0); + if(i<=UCharsTrie::kMaxOneUnitDelta) { + return write(i); + } + UChar intUnits[3]; + int32_t length; + if(i<=UCharsTrie::kMaxTwoUnitDelta) { + intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16)); + length=1; + } else { + intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead); + intUnits[1]=(UChar)(i>>16); + length=2; + } + intUnits[length++]=(UChar)i; + return write(intUnits, length); +} + +U_NAMESPACE_END |