1 files changed, 529 insertions, 0 deletions
diff --git a/intl/icu/source/common/normlzr.cpp b/intl/icu/source/common/normlzr.cpp
new file mode 100644
index 0000000000..52b9ffd54a
--- /dev/null
+++ b/intl/icu/source/common/normlzr.cpp
@@ -0,0 +1,529 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+ *************************************************************************
+ * COPYRIGHT: 
+ * Copyright (c) 1996-2012, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ *************************************************************************
+ */
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_NORMALIZATION
+
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+#include "unicode/chariter.h"
+#include "unicode/schriter.h"
+#include "unicode/uchriter.h"
+#include "unicode/normlzr.h"
+#include "unicode/utf16.h"
+#include "cmemory.h"
+#include "normalizer2impl.h"
+#include "uprops.h"  // for uniset_getUnicode32Instance()
+
+#if defined(move32)
+ // System can define move32 intrinsics, but the char iters define move32 method
+ // using same undef trick in headers, so undef here to re-enable the method.
+#undef move32
+#endif
+
+U_NAMESPACE_BEGIN
+
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
+
+//-------------------------------------------------------------------------
+// Constructors and other boilerplate
+//-------------------------------------------------------------------------
+
+Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
+    UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0),
+    text(new StringCharacterIterator(str)),
+    currentIndex(0), nextIndex(0),
+    buffer(), bufferPos(0)
+{
+    init();
+}
+
+Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) :
+    UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0),
+    text(new UCharCharacterIterator(str, length)),
+    currentIndex(0), nextIndex(0),
+    buffer(), bufferPos(0)
+{
+    init();
+}
+
+Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
+    UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0),
+    text(iter.clone()),
+    currentIndex(0), nextIndex(0),
+    buffer(), bufferPos(0)
+{
+    init();
+}
+
+Normalizer::Normalizer(const Normalizer &copy) :
+    UObject(copy), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(copy.fUMode), fOptions(copy.fOptions),
+    text(copy.text->clone()),
+    currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
+    buffer(copy.buffer), bufferPos(copy.bufferPos)
+{
+    init();
+}
+
+void
+Normalizer::init() {
+    UErrorCode errorCode=U_ZERO_ERROR;
+    fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
+    if(fOptions&UNORM_UNICODE_3_2) {
+        delete fFilteredNorm2;
+        fNorm2=fFilteredNorm2=
+            new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
+    }
+    if(U_FAILURE(errorCode)) {
+        errorCode=U_ZERO_ERROR;
+        fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
+    }
+}
+
+Normalizer::~Normalizer()
+{
+    delete fFilteredNorm2;
+    delete text;
+}
+
+Normalizer* 
+Normalizer::clone() const
+{
+    return new Normalizer(*this);
+}
+
+/**
+ * Generates a hash code for this iterator.
+ */
+int32_t Normalizer::hashCode() const
+{
+    return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
+}
+    
+bool Normalizer::operator==(const Normalizer& that) const
+{
+    return
+        this==&that ||
+        (fUMode==that.fUMode &&
+        fOptions==that.fOptions &&
+        *text==*that.text &&
+        buffer==that.buffer &&
+        bufferPos==that.bufferPos &&
+        nextIndex==that.nextIndex);
+}
+
+//-------------------------------------------------------------------------
+// Static utility methods
+//-------------------------------------------------------------------------
+
+void U_EXPORT2
+Normalizer::normalize(const UnicodeString& source, 
+                      UNormalizationMode mode, int32_t options,
+                      UnicodeString& result, 
+                      UErrorCode &status) {
+    if(source.isBogus() || U_FAILURE(status)) {
+        result.setToBogus();
+        if(U_SUCCESS(status)) {
+            status=U_ILLEGAL_ARGUMENT_ERROR;
+        }
+    } else {
+        UnicodeString localDest;
+        UnicodeString *dest;
+
+        if(&source!=&result) {
+            dest=&result;
+        } else {
+            // the source and result strings are the same object, use a temporary one
+            dest=&localDest;
+        }
+        const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
+        if(U_SUCCESS(status)) {
+            if(options&UNORM_UNICODE_3_2) {
+                FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
+                    normalize(source, *dest, status);
+            } else {
+                n2->normalize(source, *dest, status);
+            }
+        }
+        if(dest==&localDest && U_SUCCESS(status)) {
+            result=*dest;
+        }
+    }
+}
+
+void U_EXPORT2
+Normalizer::compose(const UnicodeString& source, 
+                    UBool compat, int32_t options,
+                    UnicodeString& result, 
+                    UErrorCode &status) {
+    normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
+}
+
+void U_EXPORT2
+Normalizer::decompose(const UnicodeString& source, 
+                      UBool compat, int32_t options,
+                      UnicodeString& result, 
+                      UErrorCode &status) {
+    normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
+}
+
+UNormalizationCheckResult
+Normalizer::quickCheck(const UnicodeString& source,
+                       UNormalizationMode mode, int32_t options,
+                       UErrorCode &status) {
+    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
+    if(U_SUCCESS(status)) {
+        if(options&UNORM_UNICODE_3_2) {
+            return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
+                quickCheck(source, status);
+        } else {
+            return n2->quickCheck(source, status);
+        }
+    } else {
+        return UNORM_MAYBE;
+    }
+}
+
+UBool
+Normalizer::isNormalized(const UnicodeString& source,
+                         UNormalizationMode mode, int32_t options,
+                         UErrorCode &status) {
+    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
+    if(U_SUCCESS(status)) {
+        if(options&UNORM_UNICODE_3_2) {
+            return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
+                isNormalized(source, status);
+        } else {
+            return n2->isNormalized(source, status);
+        }
+    } else {
+        return false;
+    }
+}
+
+UnicodeString & U_EXPORT2
+Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
+                        UnicodeString &result,
+                        UNormalizationMode mode, int32_t options,
+                        UErrorCode &errorCode) {
+    if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
+        result.setToBogus();
+        if(U_SUCCESS(errorCode)) {
+            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        }
+    } else {
+        UnicodeString localDest;
+        UnicodeString *dest;
+
+        if(&right!=&result) {
+            dest=&result;
+        } else {
+            // the right and result strings are the same object, use a temporary one
+            dest=&localDest;
+        }
+        *dest=left;
+        const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
+        if(U_SUCCESS(errorCode)) {
+            if(options&UNORM_UNICODE_3_2) {
+                FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
+                    append(*dest, right, errorCode);
+            } else {
+                n2->append(*dest, right, errorCode);
+            }
+        }
+        if(dest==&localDest && U_SUCCESS(errorCode)) {
+            result=*dest;
+        }
+    }
+    return result;
+}
+
+//-------------------------------------------------------------------------
+// Iteration API
+//-------------------------------------------------------------------------
+
+/**
+ * Return the current character in the normalized text.
+ */
+UChar32 Normalizer::current() {
+    if(bufferPos<buffer.length() || nextNormalize()) {
+        return buffer.char32At(bufferPos);
+    } else {
+        return DONE;
+    }
+}
+
+/**
+ * Return the next character in the normalized text and advance
+ * the iteration position by one.  If the end
+ * of the text has already been reached, {@link #DONE} is returned.
+ */
+UChar32 Normalizer::next() {
+    if(bufferPos<buffer.length() ||  nextNormalize()) {
+        UChar32 c=buffer.char32At(bufferPos);
+        bufferPos+=U16_LENGTH(c);
+        return c;
+    } else {
+        return DONE;
+    }
+}
+
+/**
+ * Return the previous character in the normalized text and decrement
+ * the iteration position by one.  If the beginning
+ * of the text has already been reached, {@link #DONE} is returned.
+ */
+UChar32 Normalizer::previous() {
+    if(bufferPos>0 || previousNormalize()) {
+        UChar32 c=buffer.char32At(bufferPos-1);
+        bufferPos-=U16_LENGTH(c);
+        return c;
+    } else {
+        return DONE;
+    }
+}
+
+void Normalizer::reset() {
+    currentIndex=nextIndex=text->setToStart();
+    clearBuffer();
+}
+
+void
+Normalizer::setIndexOnly(int32_t index) {
+    text->setIndex(index);  // pins index
+    currentIndex=nextIndex=text->getIndex();
+    clearBuffer();
+}
+
+/**
+ * Return the first character in the normalized text.  This resets
+ * the <tt>Normalizer's</tt> position to the beginning of the text.
+ */
+UChar32 Normalizer::first() {
+    reset();
+    return next();
+}
+
+/**
+ * Return the last character in the normalized text.  This resets
+ * the <tt>Normalizer's</tt> position to be just before the
+ * the input text corresponding to that normalized character.
+ */
+UChar32 Normalizer::last() {
+    currentIndex=nextIndex=text->setToEnd();
+    clearBuffer();
+    return previous();
+}
+
+/**
+ * Retrieve the current iteration position in the input text that is
+ * being normalized.  This method is useful in applications such as
+ * searching, where you need to be able to determine the position in
+ * the input text that corresponds to a given normalized output character.
+ * <p>
+ * <b>Note:</b> This method sets the position in the <em>input</em>, while
+ * {@link #next} and {@link #previous} iterate through characters in the
+ * <em>output</em>.  This means that there is not necessarily a one-to-one
+ * correspondence between characters returned by <tt>next</tt> and
+ * <tt>previous</tt> and the indices passed to and returned from
+ * <tt>setIndex</tt> and {@link #getIndex}.
+ *
+ */
+int32_t Normalizer::getIndex() const {
+    if(bufferPos<buffer.length()) {
+        return currentIndex;
+    } else {
+        return nextIndex;
+    }
+}
+
+/**
+ * Retrieve the index of the start of the input text.  This is the begin index
+ * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
+ * over which this <tt>Normalizer</tt> is iterating
+ */
+int32_t Normalizer::startIndex() const {
+    return text->startIndex();
+}
+
+/**
+ * Retrieve the index of the end of the input text.  This is the end index
+ * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
+ * over which this <tt>Normalizer</tt> is iterating
+ */
+int32_t Normalizer::endIndex() const {
+    return text->endIndex();
+}
+
+//-------------------------------------------------------------------------
+// Property access methods
+//-------------------------------------------------------------------------
+
+void
+Normalizer::setMode(UNormalizationMode newMode) 
+{
+    fUMode = newMode;
+    init();
+}
+
+UNormalizationMode
+Normalizer::getUMode() const
+{
+    return fUMode;
+}
+
+void
+Normalizer::setOption(int32_t option, 
+                      UBool value) 
+{
+    if (value) {
+        fOptions |= option;
+    } else {
+        fOptions &= (~option);
+    }
+    init();
+}
+
+UBool
+Normalizer::getOption(int32_t option) const
+{
+    return (fOptions & option) != 0;
+}
+
+/**
+ * Set the input text over which this <tt>Normalizer</tt> will iterate.
+ * The iteration position is set to the beginning of the input text.
+ */
+void
+Normalizer::setText(const UnicodeString& newText, 
+                    UErrorCode &status)
+{
+    if (U_FAILURE(status)) {
+        return;
+    }
+    CharacterIterator *newIter = new StringCharacterIterator(newText);
+    if (newIter == nullptr) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    delete text;
+    text = newIter;
+    reset();
+}
+
+/**
+ * Set the input text over which this <tt>Normalizer</tt> will iterate.
+ * The iteration position is set to the beginning of the string.
+ */
+void
+Normalizer::setText(const CharacterIterator& newText, 
+                    UErrorCode &status) 
+{
+    if (U_FAILURE(status)) {
+        return;
+    }
+    CharacterIterator *newIter = newText.clone();
+    if (newIter == nullptr) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    delete text;
+    text = newIter;
+    reset();
+}
+
+void
+Normalizer::setText(ConstChar16Ptr newText,
+                    int32_t length,
+                    UErrorCode &status)
+{
+    if (U_FAILURE(status)) {
+        return;
+    }
+    CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
+    if (newIter == nullptr) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    delete text;
+    text = newIter;
+    reset();
+}
+
+/**
+ * Copies the text under iteration into the UnicodeString referred to by "result".
+ * @param result Receives a copy of the text under iteration.
+ */
+void
+Normalizer::getText(UnicodeString&  result) 
+{
+    text->getText(result);
+}
+
+//-------------------------------------------------------------------------
+// Private utility methods
+//-------------------------------------------------------------------------
+
+void Normalizer::clearBuffer() {
+    buffer.remove();
+    bufferPos=0;
+}
+
+UBool
+Normalizer::nextNormalize() {
+    clearBuffer();
+    currentIndex=nextIndex;
+    text->setIndex(nextIndex);
+    if(!text->hasNext()) {
+        return false;
+    }
+    // Skip at least one character so we make progress.
+    UnicodeString segment(text->next32PostInc());
+    while(text->hasNext()) {
+        UChar32 c;
+        if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
+            text->move32(-1, CharacterIterator::kCurrent);
+            break;
+        }
+        segment.append(c);
+    }
+    nextIndex=text->getIndex();
+    UErrorCode errorCode=U_ZERO_ERROR;
+    fNorm2->normalize(segment, buffer, errorCode);
+    return U_SUCCESS(errorCode) && !buffer.isEmpty();
+}
+
+UBool
+Normalizer::previousNormalize() {
+    clearBuffer();
+    nextIndex=currentIndex;
+    text->setIndex(currentIndex);
+    if(!text->hasPrevious()) {
+        return false;
+    }
+    UnicodeString segment;
+    while(text->hasPrevious()) {
+        UChar32 c=text->previous32();
+        segment.insert(0, c);
+        if(fNorm2->hasBoundaryBefore(c)) {
+            break;
+        }
+    }
+    currentIndex=text->getIndex();
+    UErrorCode errorCode=U_ZERO_ERROR;
+    fNorm2->normalize(segment, buffer, errorCode);
+    bufferPos=buffer.length();
+    return U_SUCCESS(errorCode) && !buffer.isEmpty();
+}
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_NORMALIZATION */