summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/common/normalizer2.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/common/normalizer2.cpp')
-rw-r--r--intl/icu/source/common/normalizer2.cpp572
1 files changed, 572 insertions, 0 deletions
diff --git a/intl/icu/source/common/normalizer2.cpp b/intl/icu/source/common/normalizer2.cpp
new file mode 100644
index 0000000000..6856736b88
--- /dev/null
+++ b/intl/icu/source/common/normalizer2.cpp
@@ -0,0 +1,572 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2009-2016, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: normalizer2.cpp
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2009nov22
+* created by: Markus W. Scherer
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_NORMALIZATION
+
+#include "unicode/edits.h"
+#include "unicode/normalizer2.h"
+#include "unicode/stringoptions.h"
+#include "unicode/unistr.h"
+#include "unicode/unorm.h"
+#include "cstring.h"
+#include "mutex.h"
+#include "norm2allmodes.h"
+#include "normalizer2impl.h"
+#include "uassert.h"
+#include "ucln_cmn.h"
+
+using icu::Normalizer2Impl;
+
+#if NORM2_HARDCODE_NFC_DATA
+// NFC/NFD data machine-generated by gennorm2 --csource
+#define INCLUDED_FROM_NORMALIZER2_CPP
+#include "norm2_nfc_data.h"
+#endif
+
+U_NAMESPACE_BEGIN
+
+// Public API dispatch via Normalizer2 subclasses -------------------------- ***
+
+// Out-of-line definition of the (empty) Normalizer2 destructor.
+Normalizer2::~Normalizer2() {
+}
+
+// Base-class UTF-8 normalization.
+// Converts src to a UnicodeString, runs normalize() on it, and writes the
+// result to sink as UTF-8. The options argument is ignored here.
+// Recording edits is not supported by this implementation: a non-null edits
+// pointer sets U_UNSUPPORTED_ERROR and nothing is written to sink.
+void
+Normalizer2::normalizeUTF8(uint32_t /*options*/, StringPiece src, ByteSink &sink,
+                           Edits *edits, UErrorCode &errorCode) const {
+    if (U_SUCCESS(errorCode)) {
+        if (edits == nullptr) {
+            UnicodeString utf16Source = UnicodeString::fromUTF8(src);
+            normalize(utf16Source, errorCode).toUTF8(sink);
+        } else {
+            errorCode = U_UNSUPPORTED_ERROR;
+        }
+    }
+}
+
+// Base-class implementation: reports that there is no raw decomposition.
+// Both arguments are ignored and the output string is not touched.
+UBool
+Normalizer2::getRawDecomposition(UChar32 /*c*/, UnicodeString & /*decomposition*/) const {
+    return false;
+}
+
+// Base-class implementation: always returns U_SENTINEL, whatever the two
+// code points are.
+UChar32
+Normalizer2::composePair(UChar32 /*a*/, UChar32 /*b*/) const {
+    return U_SENTINEL;
+}
+
+// Base-class implementation: returns combining class 0 for every code point.
+uint8_t
+Normalizer2::getCombiningClass(UChar32 /*c*/) const {
+    return 0;
+}
+
+// Base-class UTF-8 check: converts s to a UnicodeString and delegates to
+// isNormalized(). Returns false without converting if errorCode already
+// indicates a failure.
+UBool
+Normalizer2::isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const {
+    if (U_FAILURE(errorCode)) {
+        return false;
+    }
+    return isNormalized(UnicodeString::fromUTF8(s), errorCode);
+}
+
+// Normalizer2 implementation for the old UNORM_NONE.
+// Text is copied or appended unchanged, and every query returns a fixed
+// answer ("already normalized", "boundary everywhere", "everything is inert").
+class NoopNormalizer2 : public Normalizer2 {
+    virtual ~NoopNormalizer2();
+
+    // Copies src into dest unchanged and returns dest.
+    // src and dest must be distinct objects; if they are the same object,
+    // U_ILLEGAL_ARGUMENT_ERROR is set and dest is left as it was.
+    virtual UnicodeString &
+    normalize(const UnicodeString &src,
+              UnicodeString &dest,
+              UErrorCode &errorCode) const override {
+        if(U_FAILURE(errorCode)) {
+            return dest;
+        }
+        if(&dest==&src) {
+            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        } else {
+            dest=src;
+        }
+        return dest;
+    }
+
+    // Passes the UTF-8 bytes of src through to sink.
+    // If edits is non-null, the whole input is recorded as unchanged
+    // (after a reset unless U_EDITS_NO_RESET is set in options).
+    // With U_OMIT_UNCHANGED_TEXT set, nothing is appended to sink.
+    // The sink is flushed in either case.
+    virtual void
+    normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
+                  Edits *edits, UErrorCode &errorCode) const override {
+        if(U_FAILURE(errorCode)) {
+            return;
+        }
+        if(edits!=nullptr) {
+            const bool keepExistingEdits=(options & U_EDITS_NO_RESET)!=0;
+            if(!keepExistingEdits) {
+                edits->reset();
+            }
+            edits->addUnchanged(src.length());
+        }
+        const bool omitUnchanged=(options & U_OMIT_UNCHANGED_TEXT)!=0;
+        if(!omitUnchanged) {
+            sink.Append(src.data(), src.length());
+        }
+        sink.Flush();
+    }
+
+    // Appends second to first unchanged and returns first.
+    // first and second must be distinct objects; otherwise
+    // U_ILLEGAL_ARGUMENT_ERROR is set and first is not modified.
+    virtual UnicodeString &
+    normalizeSecondAndAppend(UnicodeString &first,
+                             const UnicodeString &second,
+                             UErrorCode &errorCode) const override {
+        if(U_FAILURE(errorCode)) {
+            return first;
+        }
+        if(&first==&second) {
+            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        } else {
+            first.append(second);
+        }
+        return first;
+    }
+
+    // Same behavior as normalizeSecondAndAppend() above: a plain append
+    // with the same distinct-objects requirement.
+    virtual UnicodeString &
+    append(UnicodeString &first,
+           const UnicodeString &second,
+           UErrorCode &errorCode) const override {
+        if(U_FAILURE(errorCode)) {
+            return first;
+        }
+        if(&first==&second) {
+            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        } else {
+            first.append(second);
+        }
+        return first;
+    }
+
+    // Always reports "no decomposition"; the output string is not touched.
+    virtual UBool
+    getDecomposition(UChar32 /*c*/, UnicodeString & /*decomposition*/) const override {
+        return false;
+    }
+    // No need to override the default getRawDecomposition().
+
+    // Every string counts as normalized, as long as errorCode is a success code.
+    virtual UBool
+    isNormalized(const UnicodeString & /*s*/, UErrorCode &errorCode) const override {
+        return U_SUCCESS(errorCode);
+    }
+    virtual UBool
+    isNormalizedUTF8(StringPiece /*s*/, UErrorCode &errorCode) const override {
+        return U_SUCCESS(errorCode);
+    }
+
+    // The quick check always answers UNORM_YES; errorCode is not examined.
+    virtual UNormalizationCheckResult
+    quickCheck(const UnicodeString & /*s*/, UErrorCode & /*errorCode*/) const override {
+        return UNORM_YES;
+    }
+
+    // The "quick check yes" span is always the entire string.
+    virtual int32_t
+    spanQuickCheckYes(const UnicodeString &s, UErrorCode & /*errorCode*/) const override {
+        return s.length();
+    }
+
+    // These three queries return true for every code point.
+    virtual UBool hasBoundaryBefore(UChar32 /*c*/) const override {
+        return true;
+    }
+    virtual UBool hasBoundaryAfter(UChar32 /*c*/) const override {
+        return true;
+    }
+    virtual UBool isInert(UChar32 /*c*/) const override {
+        return true;
+    }
+};
+
+// Out-of-line (empty) destructor definitions for the Normalizer2 subclasses.
+NoopNormalizer2::~NoopNormalizer2() {
+}
+
+Normalizer2WithImpl::~Normalizer2WithImpl() {
+}
+
+DecomposeNormalizer2::~DecomposeNormalizer2() {
+}
+
+ComposeNormalizer2::~ComposeNormalizer2() {
+}
+
+FCDNormalizer2::~FCDNormalizer2() {
+}
+
+// instance cache ---------------------------------------------------------- ***
+
+U_CDECL_BEGIN
+static UBool U_CALLCONV uprv_normalizer2_cleanup();  // Forward declaration; the definition follows later in this file.
+U_CDECL_END
+
+static Normalizer2 *noopSingleton;  // Set by initNoopSingleton(); deleted by uprv_normalizer2_cleanup().
+static icu::UInitOnce noopInitOnce {};  // Passed to umtx_initOnce() by Normalizer2Factory::getNoopInstance().
+
+// Initializer for noopSingleton, invoked through umtx_initOnce().
+// Does nothing if errorCode already indicates a failure. Sets
+// U_MEMORY_ALLOCATION_ERROR if the allocation yields nullptr; otherwise
+// registers uprv_normalizer2_cleanup for UCLN_COMMON_NORMALIZER2.
+static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
+    if(U_SUCCESS(errorCode)) {
+        noopSingleton=new NoopNormalizer2;
+        if(noopSingleton!=nullptr) {
+            ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
+        } else {
+            errorCode=U_MEMORY_ALLOCATION_ERROR;
+        }
+    }
+}
+
+// Returns the no-op normalizer singleton, creating it on first use via
+// umtx_initOnce(). Returns nullptr if errorCode indicates a failure on entry;
+// otherwise returns whatever noopSingleton holds after initialization.
+const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
+    if(U_SUCCESS(errorCode)) {
+        umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
+        return noopSingleton;
+    }
+    return nullptr;
+}
+
+// Returns the address of the impl member of norm2.
+// The downcast is unchecked: norm2 is treated as a Normalizer2WithImpl
+// without any runtime type test, so callers must only pass such objects.
+const Normalizer2Impl *
+Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
+    const Normalizer2WithImpl *withImpl=static_cast<const Normalizer2WithImpl *>(norm2);
+    return &withImpl->impl;
+}
+
+// Deletes the Normalizer2Impl that the impl member points to.
+Norm2AllModes::~Norm2AllModes()
+{
+    delete impl;
+}
+
+// Wraps impl in a newly allocated Norm2AllModes.
+// Whenever nullptr is returned (errorCode was already a failure, or the
+// allocation yielded nullptr, which sets U_MEMORY_ALLOCATION_ERROR), impl
+// is deleted here, so the caller must not use it afterwards.
+Norm2AllModes *
+Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
+    Norm2AllModes *created=nullptr;
+    if(U_SUCCESS(errorCode)) {
+        created=new Norm2AllModes(impl);
+        if(created==nullptr) {
+            errorCode=U_MEMORY_ALLOCATION_ERROR;
+        }
+    }
+    if(created==nullptr) {
+        // Both failure paths end up here: release the impl handed to us.
+        delete impl;
+    }
+    return created;
+}
+
+#if NORM2_HARDCODE_NFC_DATA
+// Builds a Norm2AllModes from the hardcoded norm2_nfc_data_* tables.
+// Returns nullptr if errorCode is already a failure, or if allocating the
+// Normalizer2Impl yields nullptr (which sets U_MEMORY_ALLOCATION_ERROR).
+// Otherwise returns the result of createInstance().
+Norm2AllModes *
+Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) {
+        return nullptr;
+    }
+    Normalizer2Impl *nfcImpl=new Normalizer2Impl;
+    if(nfcImpl!=nullptr) {
+        nfcImpl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
+                      norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
+        return createInstance(nfcImpl, errorCode);
+    }
+    errorCode=U_MEMORY_ALLOCATION_ERROR;
+    return nullptr;
+}
+
+static Norm2AllModes *nfcSingleton;  // Set by initNFCSingleton(); deleted by uprv_normalizer2_cleanup().
+
+static icu::UInitOnce nfcInitOnce {};  // Passed to umtx_initOnce() by Norm2AllModes::getNFCInstance().
+
+// Initializer for nfcSingleton, invoked through umtx_initOnce().
+// Stores whatever createNFCInstance() returns (possibly nullptr) and then
+// registers uprv_normalizer2_cleanup regardless of the outcome.
+static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
+    Norm2AllModes *created=Norm2AllModes::createNFCInstance(errorCode);
+    nfcSingleton=created;
+    ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
+}
+
+// Returns the NFC Norm2AllModes singleton, creating it on first use via
+// umtx_initOnce(). Returns nullptr if errorCode indicates a failure on entry;
+// otherwise returns whatever nfcSingleton holds after initialization.
+const Norm2AllModes *
+Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
+    if(U_SUCCESS(errorCode)) {
+        umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
+        return nfcSingleton;
+    }
+    return nullptr;
+}
+
+// Returns the comp member of the NFC Norm2AllModes singleton,
+// or nullptr if that singleton is not available.
+const Normalizer2 *
+Normalizer2::getNFCInstance(UErrorCode &errorCode) {
+    const Norm2AllModes *nfcModes=Norm2AllModes::getNFCInstance(errorCode);
+    if(nfcModes==nullptr) {
+        return nullptr;
+    }
+    return &nfcModes->comp;
+}
+
+// Returns the decomp member of the NFC Norm2AllModes singleton,
+// or nullptr if that singleton is not available.
+const Normalizer2 *
+Normalizer2::getNFDInstance(UErrorCode &errorCode) {
+    const Norm2AllModes *nfcModes=Norm2AllModes::getNFCInstance(errorCode);
+    if(nfcModes==nullptr) {
+        return nullptr;
+    }
+    return &nfcModes->decomp;
+}
+
+// Returns the fcd member of the NFC Norm2AllModes singleton,
+// or nullptr if that singleton is not available.
+const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
+    const Norm2AllModes *nfcModes=Norm2AllModes::getNFCInstance(errorCode);
+    if(nfcModes==nullptr) {
+        return nullptr;
+    }
+    return &nfcModes->fcd;
+}
+
+// Returns the fcc member of the NFC Norm2AllModes singleton,
+// or nullptr if that singleton is not available.
+const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
+    const Norm2AllModes *nfcModes=Norm2AllModes::getNFCInstance(errorCode);
+    if(nfcModes==nullptr) {
+        return nullptr;
+    }
+    return &nfcModes->fcc;
+}
+
+// Returns the impl pointer held by the NFC Norm2AllModes singleton,
+// or nullptr if that singleton is not available.
+const Normalizer2Impl *
+Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
+    const Norm2AllModes *nfcModes=Norm2AllModes::getNFCInstance(errorCode);
+    if(nfcModes==nullptr) {
+        return nullptr;
+    }
+    return nfcModes->impl;
+}
+#endif // NORM2_HARDCODE_NFC_DATA
+
+U_CDECL_BEGIN
+
+// Cleanup callback registered with ucln_common_registerCleanup().
+// Deletes the singletons owned by this file, nulls their pointers, and
+// resets the matching UInitOnce objects. Always returns true.
+static UBool U_CALLCONV uprv_normalizer2_cleanup() {
+    // No-op normalizer singleton.
+    delete noopSingleton;
+    noopSingleton = nullptr;
+    noopInitOnce.reset();
+
+#if NORM2_HARDCODE_NFC_DATA
+    // NFC singleton; it only exists when the NFC data is hardcoded.
+    delete nfcSingleton;
+    nfcSingleton = nullptr;
+    nfcInitOnce.reset();
+#endif
+
+    return true;
+}
+
+U_CDECL_END
+
+U_NAMESPACE_END
+
+// C API ------------------------------------------------------------------- ***
+
+U_NAMESPACE_USE
+
+// C wrapper: returns Normalizer2::getNFCInstance() as an opaque UNormalizer2 pointer.
+// pErrorCode is dereferenced unconditionally.
+U_CAPI const UNormalizer2 * U_EXPORT2
+unorm2_getNFCInstance(UErrorCode *pErrorCode) {
+    const Normalizer2 *nfc=Normalizer2::getNFCInstance(*pErrorCode);
+    return reinterpret_cast<const UNormalizer2 *>(nfc);
+}
+
+// C wrapper: returns Normalizer2::getNFDInstance() as an opaque UNormalizer2 pointer.
+// pErrorCode is dereferenced unconditionally.
+U_CAPI const UNormalizer2 * U_EXPORT2
+unorm2_getNFDInstance(UErrorCode *pErrorCode) {
+    const Normalizer2 *nfd=Normalizer2::getNFDInstance(*pErrorCode);
+    return reinterpret_cast<const UNormalizer2 *>(nfd);
+}
+
+// C wrapper: deletes the Normalizer2 object behind the opaque handle.
+U_CAPI void U_EXPORT2
+unorm2_close(UNormalizer2 *norm2) {
+    Normalizer2 *normalizer=reinterpret_cast<Normalizer2 *>(norm2);
+    delete normalizer;
+}
+
+// C wrapper: normalizes src[0..length) into dest[0..capacity) and returns
+// the result of extracting the normalized string into dest.
+// length may be -1 for NUL-terminated input.
+// Sets U_ILLEGAL_ARGUMENT_ERROR and returns 0 when
+//   - src is nullptr with a nonzero length, or length is below -1,
+//   - dest is nullptr with a nonzero capacity, or capacity is negative,
+//   - src and dest are the same non-null pointer.
+U_CAPI int32_t U_EXPORT2
+unorm2_normalize(const UNormalizer2 *norm2,
+                 const char16_t *src, int32_t length,
+                 char16_t *dest, int32_t capacity,
+                 UErrorCode *pErrorCode) {
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    const bool badSrc= src==nullptr ? length!=0 : length<-1;
+    const bool badDest= dest==nullptr ? capacity!=0 : capacity<0;
+    const bool sameBuffer= src!=nullptr && src==dest;
+    if(badSrc || badDest || sameBuffer) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    UnicodeString destString(dest, 0, capacity);
+    // length==0: Nothing to do, and withImpl->normalize(nullptr, nullptr, buffer, ...) would crash.
+    if(length!=0) {
+        const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
+        const Normalizer2WithImpl *withImpl=dynamic_cast<const Normalizer2WithImpl *>(normalizer);
+        if(withImpl==nullptr) {
+            // Generic path through the public UnicodeString-based API.
+            UnicodeString srcString(length<0, src, length);
+            normalizer->normalize(srcString, destString, *pErrorCode);
+        } else {
+            // Avoid duplicate argument checking and support NUL-terminated src.
+            ReorderingBuffer buffer(withImpl->impl, destString);
+            if(buffer.init(length, *pErrorCode)) {
+                const char16_t *limit= length>=0 ? src+length : nullptr;
+                withImpl->normalize(src, limit, buffer, *pErrorCode);
+            }
+        }  // buffer goes out of scope here, before destString is extracted.
+    }
+    return destString.extract(dest, capacity, *pErrorCode);
+}
+
+static int32_t  // Shared helper behind unorm2_normalizeSecondAndAppend() and unorm2_append().
+normalizeSecondAndAppend(const UNormalizer2 *norm2,
+ char16_t *first, int32_t firstLength, int32_t firstCapacity,
+ const char16_t *second, int32_t secondLength,
+ UBool doNormalize,  // true: normalize second while appending; false: append without normalizing it.
+ UErrorCode *pErrorCode) {
+ if(U_FAILURE(*pErrorCode)) {  // Do nothing if the caller already has an error.
+ return 0;
+ }
+ if( (second==nullptr ? secondLength!=0 : secondLength<-1) ||  // null second requires length 0; otherwise length>=-1
+ (first==nullptr ? (firstCapacity!=0 || firstLength!=0) :  // null first requires length 0 and capacity 0
+ (firstCapacity<0 || firstLength<-1)) ||  // non-null first: capacity>=0 and length>=-1
+ (first==second && first!=nullptr)  // first and second must not be the same buffer
+ ) {
+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
+ }
+ UnicodeString firstString(first, firstLength, firstCapacity);  // Built over the caller's first[] buffer.
+ firstLength=firstString.length(); // In case it was -1.
+ // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(nullptr, nullptr, buffer, ...) would crash.
+ if(secondLength!=0) {
+ const Normalizer2 *n2=(const Normalizer2 *)norm2;
+ const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);  // nullptr if norm2 is some other Normalizer2 subclass
+ if(n2wi!=nullptr) {
+ // Avoid duplicate argument checking and support NUL-terminated src.
+ UnicodeString safeMiddle;  // Filled in by normalizeAndAppend(); used below to restore first[] on failure.
+ {
+ ReorderingBuffer buffer(n2wi->impl, firstString);
+ if(buffer.init(firstLength+secondLength+1, *pErrorCode)) { // destCapacity>=-1
+ n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : nullptr,
+ doNormalize, safeMiddle, buffer, *pErrorCode);
+ }
+ } // The ReorderingBuffer destructor finalizes firstString.
+ if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {  // Error, or the result does not fit into first[].
+ // Restore the modified suffix of the first string.
+ // This does not restore first[] array contents between firstLength and firstCapacity.
+ // (That might be uninitialized memory, as far as we know.)
+ if(first!=nullptr) { /* don't dereference nullptr */
+ safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());  // Copy safeMiddle back so that it ends at first+firstLength.
+ if(firstLength<firstCapacity) {
+ first[firstLength]=0; // NUL-terminate in case it was originally.
+ }
+ }
+ }
+ } else {
+ UnicodeString secondString(secondLength<0, second, secondLength);  // secondLength<0 means NUL-terminated input.
+ if(doNormalize) {
+ n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
+ } else {
+ n2->append(firstString, secondString, *pErrorCode);
+ }
+ }
+ }
+ return firstString.extract(first, firstCapacity, *pErrorCode);  // Returns the result of extracting into first[0..firstCapacity).
+}
+
+// C wrapper: forwards to the static normalizeSecondAndAppend() helper with
+// doNormalize=true, i.e. second is normalized while it is appended to first.
+U_CAPI int32_t U_EXPORT2
+unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
+                                char16_t *first, int32_t firstLength, int32_t firstCapacity,
+                                const char16_t *second, int32_t secondLength,
+                                UErrorCode *pErrorCode) {
+    const UBool doNormalize=true;
+    return normalizeSecondAndAppend(norm2, first, firstLength, firstCapacity,
+                                    second, secondLength, doNormalize, pErrorCode);
+}
+
+// C wrapper: forwards to the static normalizeSecondAndAppend() helper with
+// doNormalize=false, i.e. second is appended to first without normalizing it.
+U_CAPI int32_t U_EXPORT2
+unorm2_append(const UNormalizer2 *norm2,
+              char16_t *first, int32_t firstLength, int32_t firstCapacity,
+              const char16_t *second, int32_t secondLength,
+              UErrorCode *pErrorCode) {
+    const UBool doNormalize=false;
+    return normalizeSecondAndAppend(norm2, first, firstLength, firstCapacity,
+                                    second, secondLength, doNormalize, pErrorCode);
+}
+
+// C wrapper around Normalizer2::getDecomposition().
+// Returns -1 if c has no decomposition; otherwise the result of extracting
+// the decomposition into decomposition[0..capacity).
+// Sets U_ILLEGAL_ARGUMENT_ERROR and returns 0 if decomposition is nullptr
+// with a nonzero capacity, or if capacity is negative.
+U_CAPI int32_t U_EXPORT2
+unorm2_getDecomposition(const UNormalizer2 *norm2,
+                        UChar32 c, char16_t *decomposition, int32_t capacity,
+                        UErrorCode *pErrorCode) {
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    const bool badBuffer= decomposition==nullptr ? capacity!=0 : capacity<0;
+    if(badBuffer) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    UnicodeString destString(decomposition, 0, capacity);
+    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
+    if(!normalizer->getDecomposition(c, destString)) {
+        return -1;
+    }
+    return destString.extract(decomposition, capacity, *pErrorCode);
+}
+
+// C wrapper around Normalizer2::getRawDecomposition().
+// Returns -1 if c has no raw decomposition; otherwise the result of
+// extracting it into decomposition[0..capacity).
+// Sets U_ILLEGAL_ARGUMENT_ERROR and returns 0 if decomposition is nullptr
+// with a nonzero capacity, or if capacity is negative.
+U_CAPI int32_t U_EXPORT2
+unorm2_getRawDecomposition(const UNormalizer2 *norm2,
+                           UChar32 c, char16_t *decomposition, int32_t capacity,
+                           UErrorCode *pErrorCode) {
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    const bool badBuffer= decomposition==nullptr ? capacity!=0 : capacity<0;
+    if(badBuffer) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    UnicodeString destString(decomposition, 0, capacity);
+    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
+    if(!normalizer->getRawDecomposition(c, destString)) {
+        return -1;
+    }
+    return destString.extract(decomposition, capacity, *pErrorCode);
+}
+
+// C wrapper: forwards to Normalizer2::composePair() on the object behind norm2.
+U_CAPI UChar32 U_EXPORT2
+unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
+    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
+    return normalizer->composePair(a, b);
+}
+
+// C wrapper: forwards to Normalizer2::getCombiningClass() on the object behind norm2.
+U_CAPI uint8_t U_EXPORT2
+unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
+    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
+    return normalizer->getCombiningClass(c);
+}
+
+// C wrapper around Normalizer2::isNormalized().
+// length may be -1 for NUL-terminated input. Sets U_ILLEGAL_ARGUMENT_ERROR
+// and returns false if length is below -1, or if s is nullptr with a
+// nonzero length.
+U_CAPI UBool U_EXPORT2
+unorm2_isNormalized(const UNormalizer2 *norm2,
+                    const char16_t *s, int32_t length,
+                    UErrorCode *pErrorCode) {
+    if(U_FAILURE(*pErrorCode)) {
+        return false;
+    }
+    const bool badArgs= length<-1 || (s==nullptr && length!=0);
+    if(badArgs) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return false;
+    }
+    UnicodeString sString(length<0, s, length);
+    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
+    return normalizer->isNormalized(sString, *pErrorCode);
+}
+
+// C wrapper around Normalizer2::quickCheck().
+// length may be -1 for NUL-terminated input. Returns UNORM_NO if
+// *pErrorCode is already a failure. Sets U_ILLEGAL_ARGUMENT_ERROR and
+// returns UNORM_NO if length is below -1, or if s is nullptr with a
+// nonzero length.
+U_CAPI UNormalizationCheckResult U_EXPORT2
+unorm2_quickCheck(const UNormalizer2 *norm2,
+                  const char16_t *s, int32_t length,
+                  UErrorCode *pErrorCode) {
+    if(U_FAILURE(*pErrorCode)) {
+        return UNORM_NO;
+    }
+    const bool badArgs= length<-1 || (s==nullptr && length!=0);
+    if(badArgs) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return UNORM_NO;
+    }
+    UnicodeString sString(length<0, s, length);
+    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
+    return normalizer->quickCheck(sString, *pErrorCode);
+}
+
+// C wrapper around Normalizer2::spanQuickCheckYes().
+// length may be -1 for NUL-terminated input. Sets U_ILLEGAL_ARGUMENT_ERROR
+// and returns 0 if length is below -1, or if s is nullptr with a nonzero
+// length.
+U_CAPI int32_t U_EXPORT2
+unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
+                         const char16_t *s, int32_t length,
+                         UErrorCode *pErrorCode) {
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    const bool badArgs= length<-1 || (s==nullptr && length!=0);
+    if(badArgs) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    UnicodeString sString(length<0, s, length);
+    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
+    return normalizer->spanQuickCheckYes(sString, *pErrorCode);
+}
+
+// C wrapper: forwards to Normalizer2::hasBoundaryBefore() on the object behind norm2.
+U_CAPI UBool U_EXPORT2
+unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
+    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
+    return normalizer->hasBoundaryBefore(c);
+}
+
+// C wrapper: forwards to Normalizer2::hasBoundaryAfter() on the object behind norm2.
+U_CAPI UBool U_EXPORT2
+unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
+    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
+    return normalizer->hasBoundaryAfter(c);
+}
+
+// C wrapper: forwards to Normalizer2::isInert() on the object behind norm2.
+U_CAPI UBool U_EXPORT2
+unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
+    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
+    return normalizer->isInert(c);
+}
+
+// Some properties APIs ---------------------------------------------------- ***
+
+// Returns the combining class of c as reported by the NFD instance.
+// Errors are not reported to the caller: if the NFD instance cannot be
+// obtained, the function returns 0.
+U_CAPI uint8_t U_EXPORT2
+u_getCombiningClass(UChar32 c) {
+    UErrorCode status=U_ZERO_ERROR;
+    const Normalizer2 *nfd=Normalizer2::getNFDInstance(status);
+    if(U_FAILURE(status)) {
+        return 0;
+    }
+    return nfd->getCombiningClass(c);
+}
+
+// Returns getFCD16(c) from the NFC Normalizer2Impl.
+// Errors are not reported to the caller: if the NFC impl cannot be
+// obtained, the function returns 0.
+U_CFUNC uint16_t
+unorm_getFCD16(UChar32 c) {
+    UErrorCode status=U_ZERO_ERROR;
+    const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(status);
+    if(U_FAILURE(status)) {
+        return 0;
+    }
+    return nfcImpl->getFCD16(c);
+}
+
+#endif // !UCONFIG_NO_NORMALIZATION