summary refs log tree commit diff stats
path: root/intl/icu/source/common/characterproperties.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/common/characterproperties.cpp')
-rw-r--r-- intl/icu/source/common/characterproperties.cpp 383
1 files changed, 383 insertions, 0 deletions
diff --git a/intl/icu/source/common/characterproperties.cpp b/intl/icu/source/common/characterproperties.cpp
new file mode 100644
index 0000000000..7b50a4e205
--- /dev/null
+++ b/intl/icu/source/common/characterproperties.cpp
@@ -0,0 +1,383 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// characterproperties.cpp
+// created: 2018sep03 Markus W. Scherer
+
+#include "unicode/utypes.h"
+#include "unicode/localpointer.h"
+#include "unicode/uchar.h"
+#include "unicode/ucpmap.h"
+#include "unicode/ucptrie.h"
+#include "unicode/umutablecptrie.h"
+#include "unicode/uniset.h"
+#include "unicode/uscript.h"
+#include "unicode/uset.h"
+#include "cmemory.h"
+#include "mutex.h"
+#include "normalizer2impl.h"
+#include "uassert.h"
+#include "ubidi_props.h"
+#include "ucase.h"
+#include "ucln_cmn.h"
+#include "umutex.h"
+#include "uprops.h"
+
+using icu::LocalPointer;
+#if !UCONFIG_NO_NORMALIZATION
+using icu::Normalizer2Factory;
+using icu::Normalizer2Impl;
+#endif
+using icu::UInitOnce;
+using icu::UnicodeSet;
+
+namespace {
+
+// Forward declaration: the cleanup function is defined later in this file but
+// is registered by the one-time initializers that precede its definition's users.
+UBool U_CALLCONV characterproperties_cleanup();
+
+// Size of the inclusion-set cache: one slot per property source
+// (indexes 0..UPROPS_SRC_COUNT-1), followed by one slot per integer property
+// (index UPROPS_SRC_COUNT + prop - UCHAR_INT_START).
+constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START;
+
+// One cache slot: a lazily built inclusion set plus the guard that makes sure
+// it is built only once (via umtx_initOnce()).
+struct Inclusion {
+ // The cached set; stays nullptr until the slot's initializer succeeds.
+ UnicodeSet *fSet = nullptr;
+ // One-time-initialization state for fSet; reset again by the cleanup function.
+ UInitOnce fInitOnce = U_INITONCE_INITIALIZER;
+};
+Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
+
+// Cache of binary-property sets, indexed by the UProperty value.
+// Entries are created on demand by u_getBinaryPropertySet().
+UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
+
+// Cache of integer-property maps, indexed by (property - UCHAR_INT_START).
+// Entries are created on demand by u_getIntPropertyMap().
+UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
+
+// Held by u_getBinaryPropertySet() and u_getIntPropertyMap() while they look up
+// and fill sets[] and maps[].
+icu::UMutex cpMutex;
+
+//----------------------------------------------------------------
+// Inclusions list
+//----------------------------------------------------------------
+
+// USetAdder implementation
+// Does not use uset.h to reduce code dependencies
+// USetAdder callback: adds the single code point c to the target set.
+// The USet handle passed in is the UnicodeSet that was stored in the USetAdder.
+void U_CALLCONV
+_set_add(USet *set, UChar32 c) {
+    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
+    target->add(c);
+}
+
+// USetAdder callback: adds the inclusive code point range start..end to the
+// target set. The USet handle is the UnicodeSet stored in the USetAdder.
+void U_CALLCONV
+_set_addRange(USet *set, UChar32 start, UChar32 end) {
+    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
+    target->add(start, end);
+}
+
+// USetAdder callback: adds the string str[0..length) to the target set.
+// A negative length tells the UnicodeString constructor that str is terminated.
+void U_CALLCONV
+_set_addString(USet *set, const UChar *str, int32_t length) {
+    const UBool isTerminated = static_cast<UBool>(length < 0);
+    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
+    target->add(icu::UnicodeString(isTerminated, str, length));
+}
+
+// Cleanup hook registered with ucln_common_registerCleanup().
+// Frees every cached inclusion set, binary-property set and integer-property
+// map, clears the cache slots, and resets the one-time initializers so that
+// the caches can be rebuilt later. Always returns TRUE.
+UBool U_CALLCONV characterproperties_cleanup() {
+    for (int32_t slot = 0; slot < UPRV_LENGTHOF(gInclusions); ++slot) {
+        Inclusion &entry = gInclusions[slot];
+        delete entry.fSet;
+        entry.fSet = nullptr;
+        entry.fInitOnce.reset();
+    }
+    for (UnicodeSet *&cachedSet : sets) {
+        delete cachedSet;
+        cachedSet = nullptr;
+    }
+    for (UCPMap *&cachedMap : maps) {
+        // The cached maps are created as UCPTrie objects by makeMap().
+        ucptrie_close(reinterpret_cast<UCPTrie *>(cachedMap));
+        cachedMap = nullptr;
+    }
+    return TRUE;
+}
+
+// One-time initializer for the inclusion set of a single property source.
+//
+// Collects the "property starts" contributed by the data behind src into a new
+// UnicodeSet, compacts it, stores it in gInclusions[src] and registers the
+// cleanup function.
+//
+// On failure nothing is cached and errorCode is set:
+//   U_INTERNAL_PROGRAM_ERROR    for UPROPS_SRC_NONE or an unhandled source,
+//   U_MEMORY_ALLOCATION_ERROR   if the set cannot be allocated or became bogus,
+//   or whatever error one of the addPropertyStarts() providers reported.
+void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
+    // This function is invoked only via umtx_initOnce().
+    U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
+    if (src == UPROPS_SRC_NONE) {
+        errorCode = U_INTERNAL_PROGRAM_ERROR;
+        return;
+    }
+    U_ASSERT(gInclusions[src].fSet == nullptr);
+
+    LocalPointer<UnicodeSet> starts(new UnicodeSet());
+    if (starts.isNull()) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    // Callback table through which the data providers write into `starts`.
+    USetAdder adder = {
+        reinterpret_cast<USet *>(starts.getAlias()),
+        _set_add,
+        _set_addRange,
+        _set_addString,
+        nullptr, // don't need remove()
+        nullptr // don't need removeRange()
+    };
+
+    switch (src) {
+    case UPROPS_SRC_CHAR:
+        uchar_addPropertyStarts(&adder, &errorCode);
+        break;
+    case UPROPS_SRC_PROPSVEC:
+        upropsvec_addPropertyStarts(&adder, &errorCode);
+        break;
+    case UPROPS_SRC_CHAR_AND_PROPSVEC:
+        uchar_addPropertyStarts(&adder, &errorCode);
+        upropsvec_addPropertyStarts(&adder, &errorCode);
+        break;
+#if !UCONFIG_NO_NORMALIZATION
+    case UPROPS_SRC_CASE_AND_NORM: {
+        const Normalizer2Impl *nfc = Normalizer2Factory::getNFCImpl(errorCode);
+        if (U_SUCCESS(errorCode)) {
+            nfc->addPropertyStarts(&adder, errorCode);
+        }
+        ucase_addPropertyStarts(&adder, &errorCode);
+        break;
+    }
+    case UPROPS_SRC_NFC: {
+        const Normalizer2Impl *nfc = Normalizer2Factory::getNFCImpl(errorCode);
+        if (U_SUCCESS(errorCode)) {
+            nfc->addPropertyStarts(&adder, errorCode);
+        }
+        break;
+    }
+    case UPROPS_SRC_NFKC: {
+        const Normalizer2Impl *nfkc = Normalizer2Factory::getNFKCImpl(errorCode);
+        if (U_SUCCESS(errorCode)) {
+            nfkc->addPropertyStarts(&adder, errorCode);
+        }
+        break;
+    }
+    case UPROPS_SRC_NFKC_CF: {
+        const Normalizer2Impl *nfkcCf = Normalizer2Factory::getNFKC_CFImpl(errorCode);
+        if (U_SUCCESS(errorCode)) {
+            nfkcCf->addPropertyStarts(&adder, errorCode);
+        }
+        break;
+    }
+    case UPROPS_SRC_NFC_CANON_ITER: {
+        const Normalizer2Impl *nfc = Normalizer2Factory::getNFCImpl(errorCode);
+        if (U_SUCCESS(errorCode)) {
+            nfc->addCanonIterPropertyStarts(&adder, errorCode);
+        }
+        break;
+    }
+#endif
+    case UPROPS_SRC_CASE:
+        ucase_addPropertyStarts(&adder, &errorCode);
+        break;
+    case UPROPS_SRC_BIDI:
+        ubidi_addPropertyStarts(&adder, &errorCode);
+        break;
+    case UPROPS_SRC_INPC:
+    case UPROPS_SRC_INSC:
+    case UPROPS_SRC_VO:
+        uprops_addPropertyStarts(src, &adder, &errorCode);
+        break;
+    default:
+        errorCode = U_INTERNAL_PROGRAM_ERROR;
+        break;
+    }
+
+    // A provider error takes precedence; otherwise a bogus set is reported as
+    // an allocation failure.
+    if (U_SUCCESS(errorCode) && starts->isBogus()) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+    }
+    if (U_FAILURE(errorCode)) {
+        return;
+    }
+    // Compact for caching.
+    starts->compact();
+    gInclusions[src].fSet = starts.orphan();
+    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
+}
+
+// Returns the cached inclusion set for one property source, building it on
+// first use through umtx_initOnce().
+//
+// Returns nullptr if errorCode already indicates failure on entry, or if src
+// is out of range (errorCode becomes U_ILLEGAL_ARGUMENT_ERROR). Otherwise
+// returns the slot's set pointer, which is still nullptr if initialization
+// did not succeed.
+const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) {
+        return nullptr;
+    }
+    const bool inRange = 0 <= src && src < UPROPS_SRC_COUNT;
+    if (!inRange) {
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    Inclusion &slot = gInclusions[src];
+    umtx_initOnce(slot.fInitOnce, &initInclusion, src, errorCode);
+    return slot.fSet;
+}
+
+// One-time initializer for the inclusion set of a single integer property.
+//
+// Walks every code point in the inclusion set of the property's data source
+// and records each code point whose property value differs from the value
+// seen just before it. The result always contains U+0000, is compacted, and
+// is stored in the property's slot of gInclusions.
+//
+// On failure nothing is cached; errorCode is U_MEMORY_ALLOCATION_ERROR for
+// allocation problems, or whatever getInclusionsForSource() reported.
+void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
+    // This function is invoked only via umtx_initOnce().
+    U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
+    const int32_t slot = UPROPS_SRC_COUNT + prop - UCHAR_INT_START;
+    U_ASSERT(gInclusions[slot].fSet == nullptr);
+    const UnicodeSet *sourceStarts = getInclusionsForSource(uprops_getSource(prop), errorCode);
+    if (U_FAILURE(errorCode)) {
+        return;
+    }
+
+    LocalPointer<UnicodeSet> valueStarts(new UnicodeSet(0, 0));
+    if (valueStarts.isNull()) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    int32_t lastValue = 0;
+    const int32_t rangeCount = sourceStarts->getRangeCount();
+    for (int32_t r = 0; r < rangeCount; ++r) {
+        const UChar32 first = sourceStarts->getRangeStart(r);
+        const UChar32 last = sourceStarts->getRangeEnd(r);
+        for (UChar32 c = first; c <= last; ++c) {
+            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
+            const int32_t value = u_getIntPropertyValue(c, prop);
+            if (value == lastValue) {
+                continue;
+            }
+            valueStarts->add(c);
+            lastValue = value;
+        }
+    }
+
+    if (valueStarts->isBogus()) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    // Compact for caching.
+    valueStarts->compact();
+    gInclusions[slot].fSet = valueStarts.orphan();
+    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
+}
+
+} // namespace
+
+U_NAMESPACE_BEGIN
+
+// Returns the cached inclusion set for a property.
+//
+// Integer properties (UCHAR_INT_START <= prop < UCHAR_INT_LIMIT) have their own
+// per-property set, built on first use. Every other property uses the set of
+// its data source. Returns nullptr if errorCode indicates failure on entry;
+// the result may also be nullptr when building the set failed.
+const UnicodeSet *CharacterProperties::getInclusionsForProperty(
+        UProperty prop, UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) {
+        return nullptr;
+    }
+    const bool isIntProperty = UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT;
+    if (!isIntProperty) {
+        return getInclusionsForSource(uprops_getSource(prop), errorCode);
+    }
+    Inclusion &slot = gInclusions[UPROPS_SRC_COUNT + prop - UCHAR_INT_START];
+    umtx_initOnce(slot.fInitOnce, &initIntPropInclusion, prop, errorCode);
+    return slot.fSet;
+}
+
+U_NAMESPACE_END
+
+namespace {
+
+// Builds the set of all code points that have the given binary property.
+//
+// Only the code points in the property's inclusion set are tested with
+// u_hasBinaryProperty(); consecutive "true" answers are merged into ranges.
+//
+// @param property   the binary property to build the set for
+// @param errorCode  in/out ICU error code; nothing is done if it already
+//                   indicates failure
+// @return a new, frozen UnicodeSet owned by the caller, or nullptr on failure
+//         (errorCode is then set; U_MEMORY_ALLOCATION_ERROR for out-of-memory)
+UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) { return nullptr; }
+    LocalPointer<UnicodeSet> set(new UnicodeSet());
+    if (set.isNull()) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+        return nullptr;
+    }
+    const UnicodeSet *inclusions =
+        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
+    if (U_FAILURE(errorCode)) { return nullptr; }
+    int32_t numRanges = inclusions->getRangeCount();
+    // Start of the current run of code points that have the property,
+    // or -1 while we are outside such a run.
+    UChar32 startHasProperty = -1;
+
+    for (int32_t i = 0; i < numRanges; ++i) {
+        UChar32 rangeEnd = inclusions->getRangeEnd(i);
+        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
+            // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
+            if (u_hasBinaryProperty(c, property)) {
+                if (startHasProperty < 0) {
+                    // Transition from false to true.
+                    startHasProperty = c;
+                }
+            } else if (startHasProperty >= 0) {
+                // Transition from true to false.
+                set->add(startHasProperty, c - 1);
+                startHasProperty = -1;
+            }
+        }
+    }
+    if (startHasProperty >= 0) {
+        // The last run extends to the end of the code point space.
+        set->add(startHasProperty, 0x10FFFF);
+    }
+    // An add() that ran out of memory leaves the set bogus. Report that, as
+    // the inclusion-set initializers do, instead of caching incomplete data.
+    if (set->isBogus()) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+        return nullptr;
+    }
+    set->freeze();
+    return set.orphan();
+}
+
+// Builds an immutable code point map with the value of the given integer
+// property for every code point.
+//
+// Only the code points in the property's inclusion set are queried with
+// u_getIntPropertyValue(); runs of equal values are written to a mutable trie
+// as ranges. Runs whose value equals the trie's initial ("null") value are
+// not written because the trie already returns that value for them.
+//
+// @param property   the integer property to build the map for
+// @param errorCode  in/out ICU error code; nothing is done if it already
+//                   indicates failure
+// @return a new immutable map (a UCPTrie), or nullptr on failure
+UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) { return nullptr; }
+    // The script property uses USCRIPT_UNKNOWN as its default; all others use 0.
+    uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
+    icu::LocalUMutableCPTriePointer mutableTrie(
+        umutablecptrie_open(nullValue, nullValue, &errorCode));
+    const UnicodeSet *inclusions =
+        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
+    if (U_FAILURE(errorCode)) { return nullptr; }
+    int32_t numRanges = inclusions->getRangeCount();
+    // [start, current code point) is the current run; `value` is its value.
+    UChar32 start = 0;
+    uint32_t value = nullValue;
+
+    for (int32_t i = 0; i < numRanges; ++i) {
+        UChar32 rangeEnd = inclusions->getRangeEnd(i);
+        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
+            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
+            uint32_t nextValue = u_getIntPropertyValue(c, property);
+            if (value != nextValue) {
+                if (value != nullValue) {
+                    umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
+                }
+                start = c;
+                value = nextValue;
+            }
+        }
+    }
+    // Flush the final run. Compare against nullValue, exactly like the in-loop
+    // flush: comparing against 0 would drop a trailing run of value 0 when
+    // nullValue is non-zero, so those code points would read back as nullValue.
+    if (value != nullValue) {
+        umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
+    }
+
+    // Bidi class and general category get the "fast" trie type; all other
+    // properties get the "small" one.
+    UCPTrieType type;
+    if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
+        type = UCPTRIE_TYPE_FAST;
+    } else {
+        type = UCPTRIE_TYPE_SMALL;
+    }
+    // Pick the narrowest value width that can hold the property's maximum value.
+    UCPTrieValueWidth valueWidth;
+    // TODO: UCharacterProperty.IntProperty
+    int32_t max = u_getIntPropertyMaxValue(property);
+    if (max <= 0xff) {
+        valueWidth = UCPTRIE_VALUE_BITS_8;
+    } else if (max <= 0xffff) {
+        valueWidth = UCPTRIE_VALUE_BITS_16;
+    } else {
+        valueWidth = UCPTRIE_VALUE_BITS_32;
+    }
+    return reinterpret_cast<UCPMap *>(
+        umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
+}
+
+} // namespace
+
+U_NAMESPACE_USE
+
+// Returns the cached set of all code points that have the given binary
+// property, building and caching it under cpMutex on first use.
+//
+// Returns nullptr if *pErrorCode indicates failure on entry, if property is
+// not in 0..UCHAR_BINARY_LIMIT-1 (*pErrorCode becomes
+// U_ILLEGAL_ARGUMENT_ERROR), or if building the set failed.
+U_CAPI const USet * U_EXPORT2
+u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
+    if (U_FAILURE(*pErrorCode)) {
+        return nullptr;
+    }
+    const bool isBinaryProperty = 0 <= property && property < UCHAR_BINARY_LIMIT;
+    if (!isBinaryProperty) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    Mutex lock(&cpMutex);
+    UnicodeSet *&cached = sets[property];
+    if (cached == nullptr) {
+        cached = makeSet(property, *pErrorCode);
+    }
+    if (U_FAILURE(*pErrorCode)) {
+        return nullptr;
+    }
+    return cached->toUSet();
+}
+
+// Returns the cached code point map for the given integer property, building
+// and caching it under cpMutex on first use.
+//
+// Returns nullptr if *pErrorCode indicates failure on entry, or if property is
+// outside UCHAR_INT_START..UCHAR_INT_LIMIT-1 (*pErrorCode becomes
+// U_ILLEGAL_ARGUMENT_ERROR). Otherwise returns the cache entry, which is
+// nullptr when building the map failed.
+U_CAPI const UCPMap * U_EXPORT2
+u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
+    if (U_FAILURE(*pErrorCode)) {
+        return nullptr;
+    }
+    const bool isIntProperty = UCHAR_INT_START <= property && property < UCHAR_INT_LIMIT;
+    if (!isIntProperty) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    Mutex lock(&cpMutex);
+    UCPMap *&cached = maps[property - UCHAR_INT_START];
+    if (cached == nullptr) {
+        cached = makeMap(property, *pErrorCode);
+    }
+    return cached;
+}