diff options
Diffstat (limited to 'intl/icu/source/common/normlzr.cpp')
-rw-r--r-- | intl/icu/source/common/normlzr.cpp | 529 |
1 files changed, 529 insertions, 0 deletions
diff --git a/intl/icu/source/common/normlzr.cpp b/intl/icu/source/common/normlzr.cpp new file mode 100644 index 0000000000..52b9ffd54a --- /dev/null +++ b/intl/icu/source/common/normlzr.cpp @@ -0,0 +1,529 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* + ************************************************************************* + * COPYRIGHT: + * Copyright (c) 1996-2012, International Business Machines Corporation and + * others. All Rights Reserved. + ************************************************************************* + */ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_NORMALIZATION + +#include "unicode/uniset.h" +#include "unicode/unistr.h" +#include "unicode/chariter.h" +#include "unicode/schriter.h" +#include "unicode/uchriter.h" +#include "unicode/normlzr.h" +#include "unicode/utf16.h" +#include "cmemory.h" +#include "normalizer2impl.h" +#include "uprops.h" // for uniset_getUnicode32Instance() + +#if defined(move32) + // System can define move32 intrinsics, but the char iters define move32 method + // using same undef trick in headers, so undef here to re-enable the method. +#undef move32 +#endif + +U_NAMESPACE_BEGIN + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) + +//------------------------------------------------------------------------- +// Constructors and other boilerplate +//------------------------------------------------------------------------- + +Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : + UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0), + text(new StringCharacterIterator(str)), + currentIndex(0), nextIndex(0), + buffer(), bufferPos(0) +{ + init(); +} + +Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) : + UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0), + text(new UCharCharacterIterator(str, length)), + currentIndex(0), nextIndex(0), + buffer(), bufferPos(0) +{ + init(); +} + +Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : + UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0), + text(iter.clone()), + currentIndex(0), nextIndex(0), + buffer(), bufferPos(0) +{ + init(); +} + +Normalizer::Normalizer(const Normalizer ©) : + UObject(copy), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(copy.fUMode), fOptions(copy.fOptions), + text(copy.text->clone()), + currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), + buffer(copy.buffer), bufferPos(copy.bufferPos) +{ + init(); +} + +void +Normalizer::init() { + UErrorCode errorCode=U_ZERO_ERROR; + fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode); + if(fOptions&UNORM_UNICODE_3_2) { + delete fFilteredNorm2; + fNorm2=fFilteredNorm2= + new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode)); + } + if(U_FAILURE(errorCode)) { + errorCode=U_ZERO_ERROR; + fNorm2=Normalizer2Factory::getNoopInstance(errorCode); + } +} + +Normalizer::~Normalizer() +{ + delete fFilteredNorm2; + delete text; +} + +Normalizer* +Normalizer::clone() const +{ + return new Normalizer(*this); +} + +/** + * Generates a hash code for this iterator. + */ +int32_t Normalizer::hashCode() const +{ + return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex; +} + +bool Normalizer::operator==(const Normalizer& that) const +{ + return + this==&that || + (fUMode==that.fUMode && + fOptions==that.fOptions && + *text==*that.text && + buffer==that.buffer && + bufferPos==that.bufferPos && + nextIndex==that.nextIndex); +} + +//------------------------------------------------------------------------- +// Static utility methods +//------------------------------------------------------------------------- + +void U_EXPORT2 +Normalizer::normalize(const UnicodeString& source, + UNormalizationMode mode, int32_t options, + UnicodeString& result, + UErrorCode &status) { + if(source.isBogus() || U_FAILURE(status)) { + result.setToBogus(); + if(U_SUCCESS(status)) { + status=U_ILLEGAL_ARGUMENT_ERROR; + } + } else { + UnicodeString localDest; + UnicodeString *dest; + + if(&source!=&result) { + dest=&result; + } else { + // the source and result strings are the same object, use a temporary one + dest=&localDest; + } + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); + if(U_SUCCESS(status)) { + if(options&UNORM_UNICODE_3_2) { + FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). + normalize(source, *dest, status); + } else { + n2->normalize(source, *dest, status); + } + } + if(dest==&localDest && U_SUCCESS(status)) { + result=*dest; + } + } +} + +void U_EXPORT2 +Normalizer::compose(const UnicodeString& source, + UBool compat, int32_t options, + UnicodeString& result, + UErrorCode &status) { + normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status); +} + +void U_EXPORT2 +Normalizer::decompose(const UnicodeString& source, + UBool compat, int32_t options, + UnicodeString& result, + UErrorCode &status) { + normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status); +} + +UNormalizationCheckResult +Normalizer::quickCheck(const UnicodeString& source, + UNormalizationMode mode, int32_t options, + UErrorCode &status) { + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); + if(U_SUCCESS(status)) { + if(options&UNORM_UNICODE_3_2) { + return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). + quickCheck(source, status); + } else { + return n2->quickCheck(source, status); + } + } else { + return UNORM_MAYBE; + } +} + +UBool +Normalizer::isNormalized(const UnicodeString& source, + UNormalizationMode mode, int32_t options, + UErrorCode &status) { + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); + if(U_SUCCESS(status)) { + if(options&UNORM_UNICODE_3_2) { + return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). + isNormalized(source, status); + } else { + return n2->isNormalized(source, status); + } + } else { + return false; + } +} + +UnicodeString & U_EXPORT2 +Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right, + UnicodeString &result, + UNormalizationMode mode, int32_t options, + UErrorCode &errorCode) { + if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) { + result.setToBogus(); + if(U_SUCCESS(errorCode)) { + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + } + } else { + UnicodeString localDest; + UnicodeString *dest; + + if(&right!=&result) { + dest=&result; + } else { + // the right and result strings are the same object, use a temporary one + dest=&localDest; + } + *dest=left; + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode); + if(U_SUCCESS(errorCode)) { + if(options&UNORM_UNICODE_3_2) { + FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)). + append(*dest, right, errorCode); + } else { + n2->append(*dest, right, errorCode); + } + } + if(dest==&localDest && U_SUCCESS(errorCode)) { + result=*dest; + } + } + return result; +} + +//------------------------------------------------------------------------- +// Iteration API +//------------------------------------------------------------------------- + +/** + * Return the current character in the normalized text. + */ +UChar32 Normalizer::current() { + if(bufferPos<buffer.length() || nextNormalize()) { + return buffer.char32At(bufferPos); + } else { + return DONE; + } +} + +/** + * Return the next character in the normalized text and advance + * the iteration position by one. If the end + * of the text has already been reached, {@link #DONE} is returned. + */ +UChar32 Normalizer::next() { + if(bufferPos<buffer.length() || nextNormalize()) { + UChar32 c=buffer.char32At(bufferPos); + bufferPos+=U16_LENGTH(c); + return c; + } else { + return DONE; + } +} + +/** + * Return the previous character in the normalized text and decrement + * the iteration position by one. If the beginning + * of the text has already been reached, {@link #DONE} is returned. + */ +UChar32 Normalizer::previous() { + if(bufferPos>0 || previousNormalize()) { + UChar32 c=buffer.char32At(bufferPos-1); + bufferPos-=U16_LENGTH(c); + return c; + } else { + return DONE; + } +} + +void Normalizer::reset() { + currentIndex=nextIndex=text->setToStart(); + clearBuffer(); +} + +void +Normalizer::setIndexOnly(int32_t index) { + text->setIndex(index); // pins index + currentIndex=nextIndex=text->getIndex(); + clearBuffer(); +} + +/** + * Return the first character in the normalized text. This resets + * the <tt>Normalizer's</tt> position to the beginning of the text. + */ +UChar32 Normalizer::first() { + reset(); + return next(); +} + +/** + * Return the last character in the normalized text. This resets + * the <tt>Normalizer's</tt> position to be just before the + * the input text corresponding to that normalized character. + */ +UChar32 Normalizer::last() { + currentIndex=nextIndex=text->setToEnd(); + clearBuffer(); + return previous(); +} + +/** + * Retrieve the current iteration position in the input text that is + * being normalized. This method is useful in applications such as + * searching, where you need to be able to determine the position in + * the input text that corresponds to a given normalized output character. + * <p> + * <b>Note:</b> This method sets the position in the <em>input</em>, while + * {@link #next} and {@link #previous} iterate through characters in the + * <em>output</em>. This means that there is not necessarily a one-to-one + * correspondence between characters returned by <tt>next</tt> and + * <tt>previous</tt> and the indices passed to and returned from + * <tt>setIndex</tt> and {@link #getIndex}. + * + */ +int32_t Normalizer::getIndex() const { + if(bufferPos<buffer.length()) { + return currentIndex; + } else { + return nextIndex; + } +} + +/** + * Retrieve the index of the start of the input text. This is the begin index + * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt> + * over which this <tt>Normalizer</tt> is iterating + */ +int32_t Normalizer::startIndex() const { + return text->startIndex(); +} + +/** + * Retrieve the index of the end of the input text. This is the end index + * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> + * over which this <tt>Normalizer</tt> is iterating + */ +int32_t Normalizer::endIndex() const { + return text->endIndex(); +} + +//------------------------------------------------------------------------- +// Property access methods +//------------------------------------------------------------------------- + +void +Normalizer::setMode(UNormalizationMode newMode) +{ + fUMode = newMode; + init(); +} + +UNormalizationMode +Normalizer::getUMode() const +{ + return fUMode; +} + +void +Normalizer::setOption(int32_t option, + UBool value) +{ + if (value) { + fOptions |= option; + } else { + fOptions &= (~option); + } + init(); +} + +UBool +Normalizer::getOption(int32_t option) const +{ + return (fOptions & option) != 0; +} + +/** + * Set the input text over which this <tt>Normalizer</tt> will iterate. + * The iteration position is set to the beginning of the input text. + */ +void +Normalizer::setText(const UnicodeString& newText, + UErrorCode &status) +{ + if (U_FAILURE(status)) { + return; + } + CharacterIterator *newIter = new StringCharacterIterator(newText); + if (newIter == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + delete text; + text = newIter; + reset(); +} + +/** + * Set the input text over which this <tt>Normalizer</tt> will iterate. + * The iteration position is set to the beginning of the string. + */ +void +Normalizer::setText(const CharacterIterator& newText, + UErrorCode &status) +{ + if (U_FAILURE(status)) { + return; + } + CharacterIterator *newIter = newText.clone(); + if (newIter == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + delete text; + text = newIter; + reset(); +} + +void +Normalizer::setText(ConstChar16Ptr newText, + int32_t length, + UErrorCode &status) +{ + if (U_FAILURE(status)) { + return; + } + CharacterIterator *newIter = new UCharCharacterIterator(newText, length); + if (newIter == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + delete text; + text = newIter; + reset(); +} + +/** + * Copies the text under iteration into the UnicodeString referred to by "result". + * @param result Receives a copy of the text under iteration. + */ +void +Normalizer::getText(UnicodeString& result) +{ + text->getText(result); +} + +//------------------------------------------------------------------------- +// Private utility methods +//------------------------------------------------------------------------- + +void Normalizer::clearBuffer() { + buffer.remove(); + bufferPos=0; +} + +UBool +Normalizer::nextNormalize() { + clearBuffer(); + currentIndex=nextIndex; + text->setIndex(nextIndex); + if(!text->hasNext()) { + return false; + } + // Skip at least one character so we make progress. + UnicodeString segment(text->next32PostInc()); + while(text->hasNext()) { + UChar32 c; + if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) { + text->move32(-1, CharacterIterator::kCurrent); + break; + } + segment.append(c); + } + nextIndex=text->getIndex(); + UErrorCode errorCode=U_ZERO_ERROR; + fNorm2->normalize(segment, buffer, errorCode); + return U_SUCCESS(errorCode) && !buffer.isEmpty(); +} + +UBool +Normalizer::previousNormalize() { + clearBuffer(); + nextIndex=currentIndex; + text->setIndex(currentIndex); + if(!text->hasPrevious()) { + return false; + } + UnicodeString segment; + while(text->hasPrevious()) { + UChar32 c=text->previous32(); + segment.insert(0, c); + if(fNorm2->hasBoundaryBefore(c)) { + break; + } + } + currentIndex=text->getIndex(); + UErrorCode errorCode=U_ZERO_ERROR; + fNorm2->normalize(segment, buffer, errorCode); + bufferPos=buffer.length(); + return U_SUCCESS(errorCode) && !buffer.isEmpty(); +} + +U_NAMESPACE_END + +#endif /* #if !UCONFIG_NO_NORMALIZATION */ |