summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/i18n/collationdatabuilder.h
blob: fee444deee77dcac15d766e55f8521d4e5b271bc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationdatabuilder.h
*
* created on: 2012apr01
* created by: Markus W. Scherer
*/

#ifndef __COLLATIONDATABUILDER_H__
#define __COLLATIONDATABUILDER_H__

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/uversion.h"
#include "collation.h"
#include "collationdata.h"
#include "collationsettings.h"
#include "normalizer2impl.h"
#include "utrie2.h"
#include "uvectr32.h"
#include "uvectr64.h"
#include "uvector.h"

U_NAMESPACE_BEGIN

struct ConditionalCE32;

class CollationFastLatinBuilder;
class CopyHelper;
class DataBuilderCollationIterator;
class UCharsTrieBuilder;

/**
 * Low-level CollationData builder.
 * Takes (character, CE) pairs and builds them into runtime data structures.
 * Supports characters with context prefixes and contraction suffixes.
 */
class U_I18N_API CollationDataBuilder : public UObject {
public:
    /**
     * Collation element modifier. Interface class for a modifier
     * that changes a tailoring builder's temporary CEs to final CEs.
     * Called for every non-special CE32 and every expansion CE.
     */
    class CEModifier : public UObject {
    public:
        virtual ~CEModifier();
        /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */
        virtual int64_t modifyCE32(uint32_t ce32) const = 0;
        /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */
        virtual int64_t modifyCE(int64_t ce) const = 0;
    };

    CollationDataBuilder(UErrorCode &errorCode);

    virtual ~CollationDataBuilder();

    void initForTailoring(const CollationData *b, UErrorCode &errorCode);

    virtual UBool isCompressibleLeadByte(uint32_t b) const;

    inline UBool isCompressiblePrimary(uint32_t p) const {
        return isCompressibleLeadByte(p >> 24);
    }

    /**
     * @return TRUE if this builder has mappings (e.g., add() has been called)
     */
    UBool hasMappings() const { return modified; }

    /**
     * @return TRUE if c has CEs in this builder
     */
    UBool isAssigned(UChar32 c) const;

    /**
     * @return the three-byte primary if c maps to a single such CE and has no context data,
     * otherwise returns 0.
     */
    uint32_t getLongPrimaryIfSingleCE(UChar32 c) const;

    /**
     * @return the single CE for c.
     * Sets an error code if c does not have a single CE.
     */
    int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;

    void add(const UnicodeString &prefix, const UnicodeString &s,
             const int64_t ces[], int32_t cesLength,
             UErrorCode &errorCode);

    /**
     * Encodes the ces as either the returned ce32 by itself,
     * or by storing an expansion, with the returned ce32 referring to that.
     *
     * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
     */
    virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
    void addCE32(const UnicodeString &prefix, const UnicodeString &s,
                 uint32_t ce32, UErrorCode &errorCode);

    /**
     * Sets three-byte-primary CEs for a range of code points in code point order,
     * if it is worth doing; otherwise no change is made.
     * None of the code points in the range should have complex mappings so far
     * (expansions/contractions/prefixes).
     * @param start first code point
     * @param end last code point (inclusive)
     * @param primary primary weight for 'start'
     * @param step per-code point primary-weight increment
     * @param errorCode ICU in/out error code
     * @return TRUE if an OFFSET_TAG range was used for start..end
     */
    UBool maybeSetPrimaryRange(UChar32 start, UChar32 end,
                               uint32_t primary, int32_t step,
                               UErrorCode &errorCode);

    /**
     * Sets three-byte-primary CEs for a range of code points in code point order.
     * Sets range values if that is worth doing, or else individual values.
     * None of the code points in the range should have complex mappings so far
     * (expansions/contractions/prefixes).
     * @param start first code point
     * @param end last code point (inclusive)
     * @param primary primary weight for 'start'
     * @param step per-code point primary-weight increment
     * @param errorCode ICU in/out error code
     * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step
     */
    uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
                                          uint32_t primary, int32_t step,
                                          UErrorCode &errorCode);

    /**
     * Copies all mappings from the src builder, with modifications.
     * This builder here must not be built yet, and should be empty.
     */
    void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
                  UErrorCode &errorCode);

    void optimize(const UnicodeSet &set, UErrorCode &errorCode);
    void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode);

    void enableFastLatin() { fastLatinEnabled = TRUE; }
    virtual void build(CollationData &data, UErrorCode &errorCode);

    /**
     * Looks up CEs for s and appends them to the ces array.
     * Does not handle normalization: s should be in FCD form.
     *
     * Does not write completely ignorable CEs.
     * Does not write beyond Collation::MAX_EXPANSION_LENGTH.
     *
     * @return incremented cesLength
     */
    int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength);
    int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s,
                   int64_t ces[], int32_t cesLength);

protected:
    friend class CopyHelper;
    friend class DataBuilderCollationIterator;

    uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const;

    int32_t addCE(int64_t ce, UErrorCode &errorCode);
    int32_t addCE32(uint32_t ce32, UErrorCode &errorCode);
    int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode);

    inline ConditionalCE32 *getConditionalCE32(int32_t index) const {
        return static_cast<ConditionalCE32 *>(conditionalCE32s[index]);
    }
    inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const {
        return getConditionalCE32(Collation::indexFromCE32(ce32));
    }

    static uint32_t makeBuilderContextCE32(int32_t index) {
        return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index);
    }
    static inline UBool isBuilderContextCE32(uint32_t ce32) {
        return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG);
    }

    static uint32_t encodeOneCEAsCE32(int64_t ce);
    uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode);
    uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode);
    uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode);

    uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode);
    /**
     * Copies base contractions to a list of ConditionalCE32.
     * Sets cond->next to the index of the first new item
     * and returns the index of the last new item.
     */
    int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
                                         ConditionalCE32 *cond, UErrorCode &errorCode);

    UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode);
    void setDigitTags(UErrorCode &errorCode);
    void setLeadSurrogates(UErrorCode &errorCode);

    void buildMappings(CollationData &data, UErrorCode &errorCode);

    void clearContexts();
    void buildContexts(UErrorCode &errorCode);
    uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode);
    int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
                           UErrorCode &errorCode);

    void buildFastLatinTable(CollationData &data, UErrorCode &errorCode);

    int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength);

    static UChar32 jamoCpFromIndex(int32_t i) {
        // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27
        if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; }
        i -= Hangul::JAMO_L_COUNT;
        if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; }
        i -= Hangul::JAMO_V_COUNT;
        // i < 27
        return Hangul::JAMO_T_BASE + 1 + i;
    }

    /** @see Collation::BUILDER_DATA_TAG */
    static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100;

    const Normalizer2Impl &nfcImpl;
    const CollationData *base;
    const CollationSettings *baseSettings;
    UTrie2 *trie;
    UVector32 ce32s;
    UVector64 ce64s;
    UVector conditionalCE32s;  // vector of ConditionalCE32
    // Characters that have context (prefixes or contraction suffixes).
    UnicodeSet contextChars;
    // Serialized UCharsTrie structures for finalized contexts.
    UnicodeString contexts;
    UnicodeSet unsafeBackwardSet;
    UBool modified;

    UBool fastLatinEnabled;
    CollationFastLatinBuilder *fastLatinBuilder;

    DataBuilderCollationIterator *collIter;
};

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION
#endif  // __COLLATIONDATABUILDER_H__