diff options
Diffstat (limited to 'intl/icu/source/i18n/collationsets.h')
-rw-r--r-- | intl/icu/source/i18n/collationsets.h | 144 |
1 files changed, 144 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/collationsets.h b/intl/icu/source/i18n/collationsets.h new file mode 100644 index 0000000000..99aa194e76 --- /dev/null +++ b/intl/icu/source/i18n/collationsets.h @@ -0,0 +1,144 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2013-2014, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* collationsets.h +* +* created on: 2013feb09 +* created by: Markus W. Scherer +*/ + +#ifndef __COLLATIONSETS_H__ +#define __COLLATIONSETS_H__ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION + +#include "unicode/uniset.h" +#include "collation.h" + +U_NAMESPACE_BEGIN + +struct CollationData; + +/** + * Finds the set of characters and strings that sort differently in the tailoring + * from the base data. + * + * Every mapping in the tailoring needs to be compared to the base, + * because some mappings are copied for optimization, and + * all contractions for a character are copied if any contractions for that character + * are added, modified or removed. + * + * It might be simpler to re-parse the rule string, but: + * - That would require duplicating some of the from-rules builder code. + * - That would make the runtime code depend on the builder. + * - That would only work if we have the rule string, and we allow users to + * omit the rule string from data files. + */ +class TailoredSet : public UMemory { +public: + TailoredSet(UnicodeSet *t) + : data(nullptr), baseData(nullptr), + tailored(t), + suffix(nullptr), + errorCode(U_ZERO_ERROR) {} + + void forData(const CollationData *d, UErrorCode &errorCode); + + /** + * @return U_SUCCESS(errorCode) in C++, void in Java + * @internal only public for access by callback + */ + UBool handleCE32(UChar32 start, UChar32 end, uint32_t ce32); + +private: + void compare(UChar32 c, uint32_t ce32, uint32_t baseCE32); + void comparePrefixes(UChar32 c, const char16_t *p, const char16_t *q); + void compareContractions(UChar32 c, const char16_t *p, const char16_t *q); + + void addPrefixes(const CollationData *d, UChar32 c, const char16_t *p); + void addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32); + void addContractions(UChar32 c, const char16_t *p); + void addSuffix(UChar32 c, const UnicodeString &sfx); + void add(UChar32 c); + + /** Prefixes are reversed in the data structure. */ + void setPrefix(const UnicodeString &pfx) { + unreversedPrefix = pfx; + unreversedPrefix.reverse(); + } + void resetPrefix() { + unreversedPrefix.remove(); + } + + const CollationData *data; + const CollationData *baseData; + UnicodeSet *tailored; + UnicodeString unreversedPrefix; + const UnicodeString *suffix; + UErrorCode errorCode; +}; + +class ContractionsAndExpansions : public UMemory { +public: + class CESink : public UMemory { + public: + virtual ~CESink(); + virtual void handleCE(int64_t ce) = 0; + virtual void handleExpansion(const int64_t ces[], int32_t length) = 0; + }; + + ContractionsAndExpansions(UnicodeSet *con, UnicodeSet *exp, CESink *s, UBool prefixes) + : data(nullptr), + contractions(con), expansions(exp), + sink(s), + addPrefixes(prefixes), + checkTailored(0), + suffix(nullptr), + errorCode(U_ZERO_ERROR) {} + + void forData(const CollationData *d, UErrorCode &errorCode); + void forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec); + + // all following: @internal, only public for access by callback + + void handleCE32(UChar32 start, UChar32 end, uint32_t ce32); + + void handlePrefixes(UChar32 start, UChar32 end, uint32_t ce32); + void handleContractions(UChar32 start, UChar32 end, uint32_t ce32); + + void addExpansions(UChar32 start, UChar32 end); + void addStrings(UChar32 start, UChar32 end, UnicodeSet *set); + + /** Prefixes are reversed in the data structure. */ + void setPrefix(const UnicodeString &pfx) { + unreversedPrefix = pfx; + unreversedPrefix.reverse(); + } + void resetPrefix() { + unreversedPrefix.remove(); + } + + const CollationData *data; + UnicodeSet *contractions; + UnicodeSet *expansions; + CESink *sink; + UBool addPrefixes; + int8_t checkTailored; // -1: collected tailored +1: exclude tailored + UnicodeSet tailored; + UnicodeSet ranges; + UnicodeString unreversedPrefix; + const UnicodeString *suffix; + int64_t ces[Collation::MAX_EXPANSION_LENGTH]; + UErrorCode errorCode; +}; + +U_NAMESPACE_END + +#endif // !UCONFIG_NO_COLLATION +#endif // __COLLATIONSETS_H__ |