summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/common/ucharstriebuilder.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/common/ucharstriebuilder.cpp')
-rw-r--r--intl/icu/source/common/ucharstriebuilder.cpp443
1 files changed, 443 insertions, 0 deletions
diff --git a/intl/icu/source/common/ucharstriebuilder.cpp b/intl/icu/source/common/ucharstriebuilder.cpp
new file mode 100644
index 0000000000..049997a275
--- /dev/null
+++ b/intl/icu/source/common/ucharstriebuilder.cpp
@@ -0,0 +1,443 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+* Copyright (C) 2010-2012, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* file name: ucharstriebuilder.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2010nov14
+* created by: Markus W. Scherer
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/ucharstrie.h"
+#include "unicode/ucharstriebuilder.h"
+#include "unicode/unistr.h"
+#include "unicode/ustring.h"
+#include "cmemory.h"
+#include "uarrsort.h"
+#include "uassert.h"
+#include "uhash.h"
+#include "ustr_imp.h"
+
+U_NAMESPACE_BEGIN
+
+/*
+ * Note: This builder implementation stores (string, value) pairs with full copies
+ * of the 16-bit-unit sequences, until the UCharsTrie is built.
+ * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
+ */
+
+class UCharsTrieElement : public UMemory {
+public:
+ // Use compiler's default constructor, initializes nothing.
+
+ void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
+
+ UnicodeString getString(const UnicodeString &strings) const {
+ int32_t length=strings[stringOffset];
+ return strings.tempSubString(stringOffset+1, length);
+ }
+ int32_t getStringLength(const UnicodeString &strings) const {
+ return strings[stringOffset];
+ }
+
+ UChar charAt(int32_t index, const UnicodeString &strings) const {
+ return strings[stringOffset+1+index];
+ }
+
+ int32_t getValue() const { return value; }
+
+ int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
+
+private:
+ // The first strings unit contains the string length.
+ // (Compared with a stringLength field here, this saves 2 bytes per string.)
+ int32_t stringOffset;
+ int32_t value;
+};
+
+void
+UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
+ UnicodeString &strings, UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) {
+ return;
+ }
+ int32_t length=s.length();
+ if(length>0xffff) {
+ // Too long: We store the length in 1 unit.
+ errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return;
+ }
+ stringOffset=strings.length();
+ strings.append((UChar)length);
+ value=val;
+ strings.append(s);
+}
+
+int32_t
+UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
+ return getString(strings).compare(other.getString(strings));
+}
+
+UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
+ : elements(NULL), elementsCapacity(0), elementsLength(0),
+ uchars(NULL), ucharsCapacity(0), ucharsLength(0) {}
+
+UCharsTrieBuilder::~UCharsTrieBuilder() {
+ delete[] elements;
+ uprv_free(uchars);
+}
+
+UCharsTrieBuilder &
+UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) {
+ return *this;
+ }
+ if(ucharsLength>0) {
+ // Cannot add elements after building.
+ errorCode=U_NO_WRITE_PERMISSION;
+ return *this;
+ }
+ if(elementsLength==elementsCapacity) {
+ int32_t newCapacity;
+ if(elementsCapacity==0) {
+ newCapacity=1024;
+ } else {
+ newCapacity=4*elementsCapacity;
+ }
+ UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
+ if(newElements==NULL) {
+ errorCode=U_MEMORY_ALLOCATION_ERROR;
+ return *this;
+ }
+ if(elementsLength>0) {
+ uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(UCharsTrieElement));
+ }
+ delete[] elements;
+ elements=newElements;
+ elementsCapacity=newCapacity;
+ }
+ elements[elementsLength++].setTo(s, value, strings, errorCode);
+ if(U_SUCCESS(errorCode) && strings.isBogus()) {
+ errorCode=U_MEMORY_ALLOCATION_ERROR;
+ }
+ return *this;
+}
+
+U_CDECL_BEGIN
+
+static int32_t U_CALLCONV
+compareElementStrings(const void *context, const void *left, const void *right) {
+ const UnicodeString *strings=static_cast<const UnicodeString *>(context);
+ const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left);
+ const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right);
+ return leftElement->compareStringTo(*rightElement, *strings);
+}
+
+U_CDECL_END
+
+UCharsTrie *
+UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
+ buildUChars(buildOption, errorCode);
+ UCharsTrie *newTrie=NULL;
+ if(U_SUCCESS(errorCode)) {
+ newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
+ if(newTrie==NULL) {
+ errorCode=U_MEMORY_ALLOCATION_ERROR;
+ } else {
+ uchars=NULL; // The new trie now owns the array.
+ ucharsCapacity=0;
+ }
+ }
+ return newTrie;
+}
+
+UnicodeString &
+UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
+ UErrorCode &errorCode) {
+ buildUChars(buildOption, errorCode);
+ if(U_SUCCESS(errorCode)) {
+ result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
+ }
+ return result;
+}
+
+void
+UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) {
+ return;
+ }
+ if(uchars!=NULL && ucharsLength>0) {
+ // Already built.
+ return;
+ }
+ if(ucharsLength==0) {
+ if(elementsLength==0) {
+ errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return;
+ }
+ if(strings.isBogus()) {
+ errorCode=U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement),
+ compareElementStrings, &strings,
+ FALSE, // need not be a stable sort
+ &errorCode);
+ if(U_FAILURE(errorCode)) {
+ return;
+ }
+ // Duplicate strings are not allowed.
+ UnicodeString prev=elements[0].getString(strings);
+ for(int32_t i=1; i<elementsLength; ++i) {
+ UnicodeString current=elements[i].getString(strings);
+ if(prev==current) {
+ errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
+ prev.fastCopyFrom(current);
+ }
+ }
+ // Create and UChar-serialize the trie for the elements.
+ ucharsLength=0;
+ int32_t capacity=strings.length();
+ if(capacity<1024) {
+ capacity=1024;
+ }
+ if(ucharsCapacity<capacity) {
+ uprv_free(uchars);
+ uchars=static_cast<UChar *>(uprv_malloc(capacity*2));
+ if(uchars==NULL) {
+ errorCode=U_MEMORY_ALLOCATION_ERROR;
+ ucharsCapacity=0;
+ return;
+ }
+ ucharsCapacity=capacity;
+ }
+ StringTrieBuilder::build(buildOption, elementsLength, errorCode);
+ if(uchars==NULL) {
+ errorCode=U_MEMORY_ALLOCATION_ERROR;
+ }
+}
+
+int32_t
+UCharsTrieBuilder::getElementStringLength(int32_t i) const {
+ return elements[i].getStringLength(strings);
+}
+
+UChar
+UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
+ return elements[i].charAt(unitIndex, strings);
+}
+
+int32_t
+UCharsTrieBuilder::getElementValue(int32_t i) const {
+ return elements[i].getValue();
+}
+
+int32_t
+UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
+ const UCharsTrieElement &firstElement=elements[first];
+ const UCharsTrieElement &lastElement=elements[last];
+ int32_t minStringLength=firstElement.getStringLength(strings);
+ while(++unitIndex<minStringLength &&
+ firstElement.charAt(unitIndex, strings)==
+ lastElement.charAt(unitIndex, strings)) {}
+ return unitIndex;
+}
+
+int32_t
+UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
+ int32_t length=0; // Number of different units at unitIndex.
+ int32_t i=start;
+ do {
+ UChar unit=elements[i++].charAt(unitIndex, strings);
+ while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
+ ++i;
+ }
+ ++length;
+ } while(i<limit);
+ return length;
+}
+
+int32_t
+UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
+ do {
+ UChar unit=elements[i++].charAt(unitIndex, strings);
+ while(unit==elements[i].charAt(unitIndex, strings)) {
+ ++i;
+ }
+ } while(--count>0);
+ return i;
+}
+
+int32_t
+UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const {
+ while(unit==elements[i].charAt(unitIndex, strings)) {
+ ++i;
+ }
+ return i;
+}
+
+UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
+ : LinearMatchNode(len, nextNode), s(units) {
+ hash=hash*37u+ustr_hashUCharsN(units, len);
+}
+
+UBool
+UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
+ if(this==&other) {
+ return TRUE;
+ }
+ if(!LinearMatchNode::operator==(other)) {
+ return FALSE;
+ }
+ const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other;
+ return 0==u_memcmp(s, o.s, length);
+}
+
+void
+UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
+ UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
+ next->write(builder);
+ b.write(s, length);
+ offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
+}
+
+StringTrieBuilder::Node *
+UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
+ Node *nextNode) const {
+ return new UCTLinearMatchNode(
+ elements[i].getString(strings).getBuffer()+unitIndex,
+ length,
+ nextNode);
+}
+
+UBool
+UCharsTrieBuilder::ensureCapacity(int32_t length) {
+ if(uchars==NULL) {
+ return FALSE; // previous memory allocation had failed
+ }
+ if(length>ucharsCapacity) {
+ int32_t newCapacity=ucharsCapacity;
+ do {
+ newCapacity*=2;
+ } while(newCapacity<=length);
+ UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2));
+ if(newUChars==NULL) {
+ // unable to allocate memory
+ uprv_free(uchars);
+ uchars=NULL;
+ ucharsCapacity=0;
+ return FALSE;
+ }
+ u_memcpy(newUChars+(newCapacity-ucharsLength),
+ uchars+(ucharsCapacity-ucharsLength), ucharsLength);
+ uprv_free(uchars);
+ uchars=newUChars;
+ ucharsCapacity=newCapacity;
+ }
+ return TRUE;
+}
+
+int32_t
+UCharsTrieBuilder::write(int32_t unit) {
+ int32_t newLength=ucharsLength+1;
+ if(ensureCapacity(newLength)) {
+ ucharsLength=newLength;
+ uchars[ucharsCapacity-ucharsLength]=(UChar)unit;
+ }
+ return ucharsLength;
+}
+
+int32_t
+UCharsTrieBuilder::write(const UChar *s, int32_t length) {
+ int32_t newLength=ucharsLength+length;
+ if(ensureCapacity(newLength)) {
+ ucharsLength=newLength;
+ u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
+ }
+ return ucharsLength;
+}
+
+int32_t
+UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
+ return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
+}
+
+int32_t
+UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
+ if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
+ return write(i|(isFinal<<15));
+ }
+ UChar intUnits[3];
+ int32_t length;
+ if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
+ intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead);
+ intUnits[1]=(UChar)((uint32_t)i>>16);
+ intUnits[2]=(UChar)i;
+ length=3;
+ // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
+ // intUnits[0]=(UChar)(i);
+ // length=1;
+ } else {
+ intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16));
+ intUnits[1]=(UChar)i;
+ length=2;
+ }
+ intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15));
+ return write(intUnits, length);
+}
+
+int32_t
+UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
+ if(!hasValue) {
+ return write(node);
+ }
+ UChar intUnits[3];
+ int32_t length;
+ if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
+ intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead);
+ intUnits[1]=(UChar)((uint32_t)value>>16);
+ intUnits[2]=(UChar)value;
+ length=3;
+ } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
+ intUnits[0]=(UChar)((value+1)<<6);
+ length=1;
+ } else {
+ intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0));
+ intUnits[1]=(UChar)value;
+ length=2;
+ }
+ intUnits[0]|=(UChar)node;
+ return write(intUnits, length);
+}
+
+int32_t
+UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
+ int32_t i=ucharsLength-jumpTarget;
+ U_ASSERT(i>=0);
+ if(i<=UCharsTrie::kMaxOneUnitDelta) {
+ return write(i);
+ }
+ UChar intUnits[3];
+ int32_t length;
+ if(i<=UCharsTrie::kMaxTwoUnitDelta) {
+ intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16));
+ length=1;
+ } else {
+ intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead);
+ intUnits[1]=(UChar)(i>>16);
+ length=2;
+ }
+ intUnits[length++]=(UChar)i;
+ return write(intUnits, length);
+}
+
+U_NAMESPACE_END