diff options
Diffstat (limited to 'intl/icu/source/common/uniset_closure.cpp')
-rw-r--r-- | intl/icu/source/common/uniset_closure.cpp | 250 |
1 files changed, 250 insertions, 0 deletions
diff --git a/intl/icu/source/common/uniset_closure.cpp b/intl/icu/source/common/uniset_closure.cpp new file mode 100644 index 0000000000..882231ba1a --- /dev/null +++ b/intl/icu/source/common/uniset_closure.cpp @@ -0,0 +1,250 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uniset_closure.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2011may30 +* created by: Markus W. Scherer +* +* UnicodeSet::closeOver() and related methods moved here from uniset_props.cpp +* to simplify dependencies. +* In particular, this depends on the BreakIterator, but the BreakIterator +* code also builds UnicodeSets from patterns and needs uniset_props. +*/ + +#include "unicode/brkiter.h" +#include "unicode/locid.h" +#include "unicode/parsepos.h" +#include "unicode/uniset.h" +#include "cmemory.h" +#include "ruleiter.h" +#include "ucase.h" +#include "util.h" +#include "uvector.h" + +U_NAMESPACE_BEGIN + +// TODO memory debugging provided inside uniset.cpp +// could be made available here but probably obsolete with use of modern +// memory leak checker tools +#define _dbgct(me) + +//---------------------------------------------------------------- +// Constructors &c +//---------------------------------------------------------------- + +UnicodeSet::UnicodeSet(const UnicodeString& pattern, + uint32_t options, + const SymbolTable* symbols, + UErrorCode& status) { + applyPattern(pattern, options, symbols, status); + _dbgct(this); +} + +UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, + uint32_t options, + const SymbolTable* symbols, + UErrorCode& status) { + applyPattern(pattern, pos, options, symbols, status); + _dbgct(this); +} + +//---------------------------------------------------------------- +// Public API +//---------------------------------------------------------------- + +UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, + uint32_t options, + const SymbolTable* symbols, + UErrorCode& status) { + ParsePosition pos(0); + applyPattern(pattern, pos, options, symbols, status); + if (U_FAILURE(status)) return *this; + + int32_t i = pos.getIndex(); + + if (options & USET_IGNORE_SPACE) { + // Skip over trailing whitespace + ICU_Utility::skipWhitespace(pattern, i, TRUE); + } + + if (i != pattern.length()) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } + return *this; +} + +UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, + ParsePosition& pos, + uint32_t options, + const SymbolTable* symbols, + UErrorCode& status) { + if (U_FAILURE(status)) { + return *this; + } + if (isFrozen()) { + status = U_NO_WRITE_PERMISSION; + return *this; + } + // Need to build the pattern in a temporary string because + // _applyPattern calls add() etc., which set pat to empty. + UnicodeString rebuiltPat; + RuleCharacterIterator chars(pattern, symbols, pos); + applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, 0, status); + if (U_FAILURE(status)) return *this; + if (chars.inVariable()) { + // syntaxError(chars, "Extra chars in variable value"); + status = U_MALFORMED_SET; + return *this; + } + setPattern(rebuiltPat); + return *this; +} + +// USetAdder implementation +// Does not use uset.h to reduce code dependencies +static void U_CALLCONV +_set_add(USet *set, UChar32 c) { + ((UnicodeSet *)set)->add(c); +} + +static void U_CALLCONV +_set_addRange(USet *set, UChar32 start, UChar32 end) { + ((UnicodeSet *)set)->add(start, end); +} + +static void U_CALLCONV +_set_addString(USet *set, const UChar *str, int32_t length) { + ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); +} + +//---------------------------------------------------------------- +// Case folding API +//---------------------------------------------------------------- + +// add the result of a full case mapping to the set +// use str as a temporary string to avoid constructing one +static inline void +addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) { + if(result >= 0) { + if(result > UCASE_MAX_STRING_LENGTH) { + // add a single-code point case mapping + set.add(result); + } else { + // add a string case mapping from full with length result + str.setTo((UBool)FALSE, full, result); + set.add(str); + } + } + // result < 0: the code point mapped to itself, no need to add it + // see ucase.h +} + +UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { + if (isFrozen() || isBogus()) { + return *this; + } + if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) { + { + UnicodeSet foldSet(*this); + UnicodeString str; + USetAdder sa = { + foldSet.toUSet(), + _set_add, + _set_addRange, + _set_addString, + NULL, // don't need remove() + NULL // don't need removeRange() + }; + + // start with input set to guarantee inclusion + // USET_CASE: remove strings because the strings will actually be reduced (folded); + // therefore, start with no strings and add only those needed + if ((attribute & USET_CASE_INSENSITIVE) && foldSet.hasStrings()) { + foldSet.strings->removeAllElements(); + } + + int32_t n = getRangeCount(); + UChar32 result; + const UChar *full; + + for (int32_t i=0; i<n; ++i) { + UChar32 start = getRangeStart(i); + UChar32 end = getRangeEnd(i); + + if (attribute & USET_CASE_INSENSITIVE) { + // full case closure + for (UChar32 cp=start; cp<=end; ++cp) { + ucase_addCaseClosure(cp, &sa); + } + } else { + // add case mappings + // (does not add long s for regular s, or Kelvin for k, for example) + for (UChar32 cp=start; cp<=end; ++cp) { + result = ucase_toFullLower(cp, NULL, NULL, &full, UCASE_LOC_ROOT); + addCaseMapping(foldSet, result, full, str); + + result = ucase_toFullTitle(cp, NULL, NULL, &full, UCASE_LOC_ROOT); + addCaseMapping(foldSet, result, full, str); + + result = ucase_toFullUpper(cp, NULL, NULL, &full, UCASE_LOC_ROOT); + addCaseMapping(foldSet, result, full, str); + + result = ucase_toFullFolding(cp, &full, 0); + addCaseMapping(foldSet, result, full, str); + } + } + } + if (hasStrings()) { + if (attribute & USET_CASE_INSENSITIVE) { + for (int32_t j=0; j<strings->size(); ++j) { + str = *(const UnicodeString *) strings->elementAt(j); + str.foldCase(); + if(!ucase_addStringCaseClosure(str.getBuffer(), str.length(), &sa)) { + foldSet.add(str); // does not map to code points: add the folded string itself + } + } + } else { + Locale root(""); +#if !UCONFIG_NO_BREAK_ITERATION + UErrorCode status = U_ZERO_ERROR; + BreakIterator *bi = BreakIterator::createWordInstance(root, status); + if (U_SUCCESS(status)) { +#endif + const UnicodeString *pStr; + + for (int32_t j=0; j<strings->size(); ++j) { + pStr = (const UnicodeString *) strings->elementAt(j); + (str = *pStr).toLower(root); + foldSet.add(str); +#if !UCONFIG_NO_BREAK_ITERATION + (str = *pStr).toTitle(bi, root); + foldSet.add(str); +#endif + (str = *pStr).toUpper(root); + foldSet.add(str); + (str = *pStr).foldCase(); + foldSet.add(str); + } +#if !UCONFIG_NO_BREAK_ITERATION + } + delete bi; +#endif + } + } + *this = foldSet; + } + } + return *this; +} + +U_NAMESPACE_END |