diff options
Diffstat (limited to '')
-rw-r--r-- | intl/icu/source/common/stringtriebuilder.cpp | 618 |
1 files changed, 618 insertions, 0 deletions
diff --git a/intl/icu/source/common/stringtriebuilder.cpp b/intl/icu/source/common/stringtriebuilder.cpp new file mode 100644 index 0000000000..6f9cc2e5c2 --- /dev/null +++ b/intl/icu/source/common/stringtriebuilder.cpp @@ -0,0 +1,618 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2010-2012, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: stringtriebuilder.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2010dec24 +* created by: Markus W. Scherer +*/ + +#include "utypeinfo.h" // for 'typeid' to work +#include "unicode/utypes.h" +#include "unicode/stringtriebuilder.h" +#include "uassert.h" +#include "uhash.h" + +U_CDECL_BEGIN + +static int32_t U_CALLCONV +hashStringTrieNode(const UHashTok key) { + return icu::StringTrieBuilder::hashNode(key.pointer); +} + +static UBool U_CALLCONV +equalStringTrieNodes(const UHashTok key1, const UHashTok key2) { + return icu::StringTrieBuilder::equalNodes(key1.pointer, key2.pointer); +} + +U_CDECL_END + +U_NAMESPACE_BEGIN + +StringTrieBuilder::StringTrieBuilder() : nodes(NULL) {} + +StringTrieBuilder::~StringTrieBuilder() { + deleteCompactBuilder(); +} + +void +StringTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return; + } + nodes=uhash_openSize(hashStringTrieNode, equalStringTrieNodes, NULL, + sizeGuess, &errorCode); + if(U_SUCCESS(errorCode)) { + if(nodes==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + } else { + uhash_setKeyDeleter(nodes, uprv_deleteUObject); + } + } +} + +void +StringTrieBuilder::deleteCompactBuilder() { + uhash_close(nodes); + nodes=NULL; +} + +void +StringTrieBuilder::build(UStringTrieBuildOption buildOption, int32_t elementsLength, + UErrorCode &errorCode) { + if(buildOption==USTRINGTRIE_BUILD_FAST) { + writeNode(0, elementsLength, 0); + } else /* USTRINGTRIE_BUILD_SMALL */ { + createCompactBuilder(2*elementsLength, errorCode); + Node *root=makeNode(0, elementsLength, 0, errorCode); + if(U_SUCCESS(errorCode)) { + root->markRightEdgesFirst(-1); + root->write(*this); + } + deleteCompactBuilder(); + } +} + +// Requires start<limit, +// and all strings of the [start..limit[ elements must be sorted and +// have a common prefix of length unitIndex. +int32_t +StringTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) { + UBool hasValue=FALSE; + int32_t value=0; + int32_t type; + if(unitIndex==getElementStringLength(start)) { + // An intermediate or final value. + value=getElementValue(start++); + if(start==limit) { + return writeValueAndFinal(value, TRUE); // final-value node + } + hasValue=TRUE; + } + // Now all [start..limit[ strings are longer than unitIndex. + int32_t minUnit=getElementUnit(start, unitIndex); + int32_t maxUnit=getElementUnit(limit-1, unitIndex); + if(minUnit==maxUnit) { + // Linear-match node: All strings have the same character at unitIndex. + int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex); + writeNode(start, limit, lastUnitIndex); + // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. + int32_t length=lastUnitIndex-unitIndex; + int32_t maxLinearMatchLength=getMaxLinearMatchLength(); + while(length>maxLinearMatchLength) { + lastUnitIndex-=maxLinearMatchLength; + length-=maxLinearMatchLength; + writeElementUnits(start, lastUnitIndex, maxLinearMatchLength); + write(getMinLinearMatch()+maxLinearMatchLength-1); + } + writeElementUnits(start, unitIndex, length); + type=getMinLinearMatch()+length-1; + } else { + // Branch node. + int32_t length=countElementUnits(start, limit, unitIndex); + // length>=2 because minUnit!=maxUnit. + writeBranchSubNode(start, limit, unitIndex, length); + if(--length<getMinLinearMatch()) { + type=length; + } else { + write(length); + type=0; + } + } + return writeValueAndType(hasValue, value, type); +} + +// start<limit && all strings longer than unitIndex && +// length different units at unitIndex +int32_t +StringTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) { + UChar middleUnits[kMaxSplitBranchLevels]; + int32_t lessThan[kMaxSplitBranchLevels]; + int32_t ltLength=0; + while(length>getMaxBranchLinearSubNodeLength()) { + // Branch on the middle unit. + // First, find the middle unit. + int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); + // Encode the less-than branch first. + middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit + lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2); + ++ltLength; + // Continue for the greater-or-equal branch. + start=i; + length=length-length/2; + } + // For each unit, find its elements array start and whether it has a final value. + int32_t starts[kMaxBranchLinearSubNodeLength]; + UBool isFinal[kMaxBranchLinearSubNodeLength-1]; + int32_t unitNumber=0; + do { + int32_t i=starts[unitNumber]=start; + UChar unit=getElementUnit(i++, unitIndex); + i=indexOfElementWithNextUnit(i, unitIndex, unit); + isFinal[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start); + start=i; + } while(++unitNumber<length-1); + // unitNumber==length-1, and the maxUnit elements range is [start..limit[ + starts[unitNumber]=start; + + // Write the sub-nodes in reverse order: The jump lengths are deltas from + // after their own positions, so if we wrote the minUnit sub-node first, + // then its jump delta would be larger. + // Instead we write the minUnit sub-node last, for a shorter delta. + int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1]; + do { + --unitNumber; + if(!isFinal[unitNumber]) { + jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1); + } + } while(unitNumber>0); + // The maxUnit sub-node is written as the very last one because we do + // not jump for it at all. + unitNumber=length-1; + writeNode(start, limit, unitIndex+1); + int32_t offset=write(getElementUnit(start, unitIndex)); + // Write the rest of this node's unit-value pairs. + while(--unitNumber>=0) { + start=starts[unitNumber]; + int32_t value; + if(isFinal[unitNumber]) { + // Write the final value for the one string ending with this unit. + value=getElementValue(start); + } else { + // Write the delta to the start position of the sub-node. + value=offset-jumpTargets[unitNumber]; + } + writeValueAndFinal(value, isFinal[unitNumber]); + offset=write(getElementUnit(start, unitIndex)); + } + // Write the split-branch nodes. + while(ltLength>0) { + --ltLength; + writeDeltaTo(lessThan[ltLength]); + offset=write(middleUnits[ltLength]); + } + return offset; +} + +// Requires start<limit, +// and all strings of the [start..limit[ elements must be sorted and +// have a common prefix of length unitIndex. +StringTrieBuilder::Node * +StringTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return NULL; + } + UBool hasValue=FALSE; + int32_t value=0; + if(unitIndex==getElementStringLength(start)) { + // An intermediate or final value. + value=getElementValue(start++); + if(start==limit) { + return registerFinalValue(value, errorCode); + } + hasValue=TRUE; + } + Node *node; + // Now all [start..limit[ strings are longer than unitIndex. + int32_t minUnit=getElementUnit(start, unitIndex); + int32_t maxUnit=getElementUnit(limit-1, unitIndex); + if(minUnit==maxUnit) { + // Linear-match node: All strings have the same character at unitIndex. + int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex); + Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode); + // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. + int32_t length=lastUnitIndex-unitIndex; + int32_t maxLinearMatchLength=getMaxLinearMatchLength(); + while(length>maxLinearMatchLength) { + lastUnitIndex-=maxLinearMatchLength; + length-=maxLinearMatchLength; + node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode); + nextNode=registerNode(node, errorCode); + } + node=createLinearMatchNode(start, unitIndex, length, nextNode); + } else { + // Branch node. + int32_t length=countElementUnits(start, limit, unitIndex); + // length>=2 because minUnit!=maxUnit. + Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode); + node=new BranchHeadNode(length, subNode); + } + if(hasValue && node!=NULL) { + if(matchNodesCanHaveValues()) { + ((ValueNode *)node)->setValue(value); + } else { + node=new IntermediateValueNode(value, registerNode(node, errorCode)); + } + } + return registerNode(node, errorCode); +} + +// start<limit && all strings longer than unitIndex && +// length different units at unitIndex +StringTrieBuilder::Node * +StringTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, + int32_t length, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return NULL; + } + UChar middleUnits[kMaxSplitBranchLevels]; + Node *lessThan[kMaxSplitBranchLevels]; + int32_t ltLength=0; + while(length>getMaxBranchLinearSubNodeLength()) { + // Branch on the middle unit. + // First, find the middle unit. + int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); + // Create the less-than branch. + middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit + lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode); + ++ltLength; + // Continue for the greater-or-equal branch. + start=i; + length=length-length/2; + } + if(U_FAILURE(errorCode)) { + return NULL; + } + ListBranchNode *listNode=new ListBranchNode(); + if(listNode==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + // For each unit, find its elements array start and whether it has a final value. + int32_t unitNumber=0; + do { + int32_t i=start; + UChar unit=getElementUnit(i++, unitIndex); + i=indexOfElementWithNextUnit(i, unitIndex, unit); + if(start==i-1 && unitIndex+1==getElementStringLength(start)) { + listNode->add(unit, getElementValue(start)); + } else { + listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode)); + } + start=i; + } while(++unitNumber<length-1); + // unitNumber==length-1, and the maxUnit elements range is [start..limit[ + UChar unit=getElementUnit(start, unitIndex); + if(start==limit-1 && unitIndex+1==getElementStringLength(start)) { + listNode->add(unit, getElementValue(start)); + } else { + listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode)); + } + Node *node=registerNode(listNode, errorCode); + // Create the split-branch nodes. + while(ltLength>0) { + --ltLength; + node=registerNode( + new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode); + } + return node; +} + +StringTrieBuilder::Node * +StringTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + delete newNode; + return NULL; + } + if(newNode==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + const UHashElement *old=uhash_find(nodes, newNode); + if(old!=NULL) { + delete newNode; + return (Node *)old->key.pointer; + } + // If uhash_puti() returns a non-zero value from an equivalent, previously + // registered node, then uhash_find() failed to find that and we will leak newNode. +#if U_DEBUG + int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. +#endif + uhash_puti(nodes, newNode, 1, &errorCode); + U_ASSERT(oldValue==0); + if(U_FAILURE(errorCode)) { + delete newNode; + return NULL; + } + return newNode; +} + +StringTrieBuilder::Node * +StringTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return NULL; + } + FinalValueNode key(value); + const UHashElement *old=uhash_find(nodes, &key); + if(old!=NULL) { + return (Node *)old->key.pointer; + } + Node *newNode=new FinalValueNode(value); + if(newNode==NULL) { + errorCode=U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + // If uhash_puti() returns a non-zero value from an equivalent, previously + // registered node, then uhash_find() failed to find that and we will leak newNode. +#if U_DEBUG + int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. +#endif + uhash_puti(nodes, newNode, 1, &errorCode); + U_ASSERT(oldValue==0); + if(U_FAILURE(errorCode)) { + delete newNode; + return NULL; + } + return newNode; +} + +int32_t +StringTrieBuilder::hashNode(const void *node) { + return ((const Node *)node)->hashCode(); +} + +UBool +StringTrieBuilder::equalNodes(const void *left, const void *right) { + return *(const Node *)left==*(const Node *)right; +} + +UBool +StringTrieBuilder::Node::operator==(const Node &other) const { + return this==&other || (typeid(*this)==typeid(other) && hash==other.hash); +} + +int32_t +StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) { + if(offset==0) { + offset=edgeNumber; + } + return edgeNumber; +} + +UBool +StringTrieBuilder::FinalValueNode::operator==(const Node &other) const { + if(this==&other) { + return TRUE; + } + if(!Node::operator==(other)) { + return FALSE; + } + const FinalValueNode &o=(const FinalValueNode &)other; + return value==o.value; +} + +void +StringTrieBuilder::FinalValueNode::write(StringTrieBuilder &builder) { + offset=builder.writeValueAndFinal(value, TRUE); +} + +UBool +StringTrieBuilder::ValueNode::operator==(const Node &other) const { + if(this==&other) { + return TRUE; + } + if(!Node::operator==(other)) { + return FALSE; + } + const ValueNode &o=(const ValueNode &)other; + return hasValue==o.hasValue && (!hasValue || value==o.value); +} + +UBool +StringTrieBuilder::IntermediateValueNode::operator==(const Node &other) const { + if(this==&other) { + return TRUE; + } + if(!ValueNode::operator==(other)) { + return FALSE; + } + const IntermediateValueNode &o=(const IntermediateValueNode &)other; + return next==o.next; +} + +int32_t +StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) { + if(offset==0) { + offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); + } + return edgeNumber; +} + +void +StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder &builder) { + next->write(builder); + offset=builder.writeValueAndFinal(value, FALSE); +} + +UBool +StringTrieBuilder::LinearMatchNode::operator==(const Node &other) const { + if(this==&other) { + return TRUE; + } + if(!ValueNode::operator==(other)) { + return FALSE; + } + const LinearMatchNode &o=(const LinearMatchNode &)other; + return length==o.length && next==o.next; +} + +int32_t +StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) { + if(offset==0) { + offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); + } + return edgeNumber; +} + +UBool +StringTrieBuilder::ListBranchNode::operator==(const Node &other) const { + if(this==&other) { + return TRUE; + } + if(!Node::operator==(other)) { + return FALSE; + } + const ListBranchNode &o=(const ListBranchNode &)other; + for(int32_t i=0; i<length; ++i) { + if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) { + return FALSE; + } + } + return TRUE; +} + +int32_t +StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) { + if(offset==0) { + firstEdgeNumber=edgeNumber; + int32_t step=0; + int32_t i=length; + do { + Node *edge=equal[--i]; + if(edge!=NULL) { + edgeNumber=edge->markRightEdgesFirst(edgeNumber-step); + } + // For all but the rightmost edge, decrement the edge number. + step=1; + } while(i>0); + offset=edgeNumber; + } + return edgeNumber; +} + +void +StringTrieBuilder::ListBranchNode::write(StringTrieBuilder &builder) { + // Write the sub-nodes in reverse order: The jump lengths are deltas from + // after their own positions, so if we wrote the minUnit sub-node first, + // then its jump delta would be larger. + // Instead we write the minUnit sub-node last, for a shorter delta. + int32_t unitNumber=length-1; + Node *rightEdge=equal[unitNumber]; + int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset(); + do { + --unitNumber; + if(equal[unitNumber]!=NULL) { + equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder); + } + } while(unitNumber>0); + // The maxUnit sub-node is written as the very last one because we do + // not jump for it at all. + unitNumber=length-1; + if(rightEdge==NULL) { + builder.writeValueAndFinal(values[unitNumber], TRUE); + } else { + rightEdge->write(builder); + } + offset=builder.write(units[unitNumber]); + // Write the rest of this node's unit-value pairs. + while(--unitNumber>=0) { + int32_t value; + UBool isFinal; + if(equal[unitNumber]==NULL) { + // Write the final value for the one string ending with this unit. + value=values[unitNumber]; + isFinal=TRUE; + } else { + // Write the delta to the start position of the sub-node. + U_ASSERT(equal[unitNumber]->getOffset()>0); + value=offset-equal[unitNumber]->getOffset(); + isFinal=FALSE; + } + builder.writeValueAndFinal(value, isFinal); + offset=builder.write(units[unitNumber]); + } +} + +UBool +StringTrieBuilder::SplitBranchNode::operator==(const Node &other) const { + if(this==&other) { + return TRUE; + } + if(!Node::operator==(other)) { + return FALSE; + } + const SplitBranchNode &o=(const SplitBranchNode &)other; + return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual; +} + +int32_t +StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) { + if(offset==0) { + firstEdgeNumber=edgeNumber; + edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber); + offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1); + } + return edgeNumber; +} + +void +StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder &builder) { + // Encode the less-than branch first. + lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder); + // Encode the greater-or-equal branch last because we do not jump for it at all. + greaterOrEqual->write(builder); + // Write this node. + U_ASSERT(lessThan->getOffset()>0); + builder.writeDeltaTo(lessThan->getOffset()); // less-than + offset=builder.write(unit); +} + +UBool +StringTrieBuilder::BranchHeadNode::operator==(const Node &other) const { + if(this==&other) { + return TRUE; + } + if(!ValueNode::operator==(other)) { + return FALSE; + } + const BranchHeadNode &o=(const BranchHeadNode &)other; + return length==o.length && next==o.next; +} + +int32_t +StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) { + if(offset==0) { + offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); + } + return edgeNumber; +} + +void +StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder &builder) { + next->write(builder); + if(length<=builder.getMinLinearMatch()) { + offset=builder.writeValueAndType(hasValue, value, length-1); + } else { + builder.write(length-1); + offset=builder.writeValueAndType(hasValue, value, 0); + } +} + +U_NAMESPACE_END |