summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/i18n/collationsettings.h
blob: 83e775d443bfd93e48fd2adf39b54a4c12e0ae4c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationsettings.h
*
* created on: 2013feb07
* created by: Markus W. Scherer
*/

#ifndef __COLLATIONSETTINGS_H__
#define __COLLATIONSETTINGS_H__

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/ucol.h"
#include "collation.h"
#include "sharedobject.h"
#include "umutex.h"

U_NAMESPACE_BEGIN

struct CollationData;

/**
 * Collation settings/options/attributes.
 * These are the values that can be changed via API.
 */
struct U_I18N_API CollationSettings : public SharedObject {
    /**
     * Options bit 0: Perform the FCD check on the input text and deliver normalized text.
     */
    static const int32_t CHECK_FCD = 1;
    /**
     * Options bit 1: Numeric collation.
     * Also known as CODAN = COllate Digits As Numbers.
     *
     * Treat digit sequences as numbers with CE sequences in numeric order,
     * rather than returning a normal CE for each digit.
     */
    static const int32_t NUMERIC = 2;
    /**
     * "Shifted" alternate handling, see ALTERNATE_MASK.
     */
    static const int32_t SHIFTED = 4;
    /**
     * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable.
     * Reserve values 8 and 0xc for shift-trimmed and blanked.
     */
    static const int32_t ALTERNATE_MASK = 0xc;
    /**
     * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value.
     */
    static const int32_t MAX_VARIABLE_SHIFT = 4;
    /** maxVariable options bit mask before shifting. */
    static const int32_t MAX_VARIABLE_MASK = 0x70;
    /** Options bit 7: Reserved/unused/0. */
    /**
     * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on.
     */
    static const int32_t UPPER_FIRST = 0x100;
    /**
     * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values)
     * unless case level is on (when they are *moved* into the separate case level).
     * By default, the case bits are removed from the tertiary weight (ignored).
     *
     * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to
     * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST.
     */
    static const int32_t CASE_FIRST = 0x200;
    /**
     * Options bit mask for caseFirst and upperFirst, before shifting.
     * Same value as caseFirst==upperFirst.
     */
    static const int32_t CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST;
    /**
     * Options bit 10: Insert the case level between the secondary and tertiary levels.
     */
    static const int32_t CASE_LEVEL = 0x400;
    /**
     * Options bit 11: Compare secondary weights backwards. ("French secondary")
     */
    static const int32_t BACKWARD_SECONDARY = 0x800;
    /**
     * Options bits 15..12: The 4-bit strength value bit field is shifted by this value.
     * It is the top used bit field in the options. (No need to mask after shifting.)
     */
    static const int32_t STRENGTH_SHIFT = 12;
    /** Strength options bit mask before shifting. */
    static const int32_t STRENGTH_MASK = 0xf000;

    /** maxVariable values */
    enum MaxVariable {
        MAX_VAR_SPACE,
        MAX_VAR_PUNCT,
        MAX_VAR_SYMBOL,
        MAX_VAR_CURRENCY
    };

    CollationSettings()
            : options((UCOL_DEFAULT_STRENGTH << STRENGTH_SHIFT) |
                      (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)),
              variableTop(0),
              reorderTable(NULL),
              minHighNoReorder(0),
              reorderRanges(NULL), reorderRangesLength(0),
              reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0),
              fastLatinOptions(-1) {}

    CollationSettings(const CollationSettings &other);
    virtual ~CollationSettings();

    UBool operator==(const CollationSettings &other) const;

    inline UBool operator!=(const CollationSettings &other) const {
        return !operator==(other);
    }

    int32_t hashCode() const;

    void resetReordering();
    void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length,
                         const uint32_t *ranges, int32_t rangesLength,
                         const uint8_t *table, UErrorCode &errorCode);
    void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength,
                       UErrorCode &errorCode);
    void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode);

    inline UBool hasReordering() const { return reorderTable != NULL; }
    static UBool reorderTableHasSplitBytes(const uint8_t table[256]);
    inline uint32_t reorder(uint32_t p) const {
        uint8_t b = reorderTable[p >> 24];
        if(b != 0 || p <= Collation::NO_CE_PRIMARY) {
            return ((uint32_t)b << 24) | (p & 0xffffff);
        } else {
            return reorderEx(p);
        }
    }

    void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);

    static int32_t getStrength(int32_t options) {
        return options >> STRENGTH_SHIFT;
    }

    int32_t getStrength() const {
        return getStrength(options);
    }

    /** Sets the options bit for an on/off attribute. */
    void setFlag(int32_t bit, UColAttributeValue value,
                 int32_t defaultOptions, UErrorCode &errorCode);

    UColAttributeValue getFlag(int32_t bit) const {
        return ((options & bit) != 0) ? UCOL_ON : UCOL_OFF;
    }

    void setCaseFirst(UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode);

    UColAttributeValue getCaseFirst() const {
        int32_t option = options & CASE_FIRST_AND_UPPER_MASK;
        return (option == 0) ? UCOL_OFF :
                (option == CASE_FIRST) ? UCOL_LOWER_FIRST : UCOL_UPPER_FIRST;
    }

    void setAlternateHandling(UColAttributeValue value,
                              int32_t defaultOptions, UErrorCode &errorCode);

    UColAttributeValue getAlternateHandling() const {
        return ((options & ALTERNATE_MASK) == 0) ? UCOL_NON_IGNORABLE : UCOL_SHIFTED;
    }

    void setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);

    MaxVariable getMaxVariable() const {
        return (MaxVariable)((options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT);
    }

    /**
     * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off.
     */
    static inline UBool isTertiaryWithCaseBits(int32_t options) {
        return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST;
    }
    static uint32_t getTertiaryMask(int32_t options) {
        // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
        return isTertiaryWithCaseBits(options) ?
                Collation::CASE_AND_TERTIARY_MASK : Collation::ONLY_TERTIARY_MASK;
    }

    static UBool sortsTertiaryUpperCaseFirst(int32_t options) {
        // On tertiary level, consider case bits and sort uppercase first
        // if caseLevel is off and caseFirst==upperFirst.
        return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK;
    }

    inline UBool dontCheckFCD() const {
        return (options & CHECK_FCD) == 0;
    }

    inline UBool hasBackwardSecondary() const {
        return (options & BACKWARD_SECONDARY) != 0;
    }

    inline UBool isNumeric() const {
        return (options & NUMERIC) != 0;
    }

    /** CHECK_FCD etc. */
    int32_t options;
    /** Variable-top primary weight. */
    uint32_t variableTop;
    /**
     * 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering.
     * A 0 entry at a non-zero index means that the primary lead byte is "split"
     * (there are different offsets for primaries that share that lead byte)
     * and the reordering offset must be determined via the reorderRanges.
     */
    const uint8_t *reorderTable;
    /** Limit of last reordered range. 0 if no reordering or no split bytes. */
    uint32_t minHighNoReorder;
    /**
     * Primary-weight ranges for script reordering,
     * to be used by reorder(p) for split-reordered primary lead bytes.
     *
     * Each entry is a (limit, offset) pair.
     * The upper 16 bits of the entry are the upper 16 bits of the
     * exclusive primary limit of a range.
     * Primaries between the previous limit and this one have their lead bytes
     * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
     *
     * CollationData::makeReorderRanges() writes a full list where the first range
     * (at least for terminators and separators) has a 0 offset.
     * The last range has a non-zero offset.
     * minHighNoReorder is set to the limit of that last range.
     *
     * In the settings object, the initial ranges before the first split lead byte
     * are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
     * If there are no split-reordered lead bytes, then no ranges are needed.
     */
    const uint32_t *reorderRanges;
    int32_t reorderRangesLength;
    /** Array of reorder codes; ignored if reorderCodesLength == 0. */
    const int32_t *reorderCodes;
    /** Number of reorder codes; 0 if no reordering. */
    int32_t reorderCodesLength;
    /**
     * Capacity of reorderCodes.
     * If 0, then the codes, the ranges, and the table are aliases.
     * Otherwise, this object owns the memory via the reorderCodes pointer;
     * the codes, the ranges, and the table are in the same memory block, in that order.
     */
    int32_t reorderCodesCapacity;

    /** Options for CollationFastLatin. Negative if disabled. */
    int32_t fastLatinOptions;
    uint16_t fastLatinPrimaries[0x180];

private:
    void setReorderArrays(const int32_t *codes, int32_t codesLength,
                          const uint32_t *ranges, int32_t rangesLength,
                          const uint8_t *table, UErrorCode &errorCode);
    uint32_t reorderEx(uint32_t p) const;
};

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION
#endif  // __COLLATIONSETTINGS_H__