intl/icu/source/i18n/usrchimp.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2001-2015 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  08/13/2001   synwee      Creation.
**********************************************************************
*/
#ifndef USRCHIMP_H
#define USRCHIMP_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/normalizer2.h"
#include "unicode/ucol.h"
#include "unicode/ucoleitr.h"
#include "unicode/ubrk.h"

/*
 * Field layout of a 32-bit collation element (CE) as implied by the masks
 * and shifts below: bits 31..16 primary, bits 15..8 secondary,
 * bits 7..0 tertiary.
 */
/* mask off anything but primary order (bits 31..16) */
#define UCOL_PRIMARYORDERMASK 0xffff0000
/* mask off anything but secondary order (bits 15..8) */
#define UCOL_SECONDARYORDERMASK 0x0000ff00
/* mask off anything but tertiary order (bits 7..0) */
#define UCOL_TERTIARYORDERMASK 0x000000ff
/* primary order shift: number of bits below the primary field */
#define UCOL_PRIMARYORDERSHIFT 16
/* secondary order shift: number of bits below the secondary field */
#define UCOL_SECONDARYORDERSHIFT 8

/* NOTE(review): presumably the value of an ignorable weight/CE - confirm against the users of this macro */
#define UCOL_IGNORABLE 0

/*
 * get weights from a CE
 * Each macro evaluates its argument exactly once and yields the field
 * shifted down to bit 0 (primary: 0..0xffff, secondary: 0..0xff,
 * tertiary: 0..0xff).
 * UCOL_PRIMARYORDER shifts by the literal 16 (the same value as
 * UCOL_PRIMARYORDERSHIFT) and then masks with 0xffff, so sign bits shifted
 * in from a negative signed argument are discarded.
 */
#define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)

/* bits 7 and 6 of a CE, i.e. the top two bits of the tertiary byte */
#define UCOL_CONTINUATION_MARKER 0xC0

/* true only when BOTH marker bits are set in CE; the argument is evaluated once */
#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)

/**
 * Sentinel returned in place of a processed collation element.
 * This indicates an error has occurred during processing or there are no
 * more CEs to be returned. It is the documented "no result" return value of
 * UCollationPCE::nextProcessed() and UCollationPCE::previousProcessed().
 * The value is U_INT64_MAX cast to int64_t.
 */
#define UCOL_PROCESSED_NULLORDER        ((int64_t)U_INT64_MAX)

U_NAMESPACE_BEGIN

// Forward declarations: this header only uses these types through pointers
// and references, so their full definitions are not needed here.
class CollationElementIterator;
class Collator;

/**
 * One entry of a PCEBuffer: a 64-bit collation element value together with
 * two int32_t indices.
 * NOTE(review): low/high presumably hold the ixLow/ixHigh values passed to
 * PCEBuffer::put(), i.e. the iterator offsets bracketing the CE - confirm
 * against the implementation.
 */
struct PCEI
{
    uint64_t ce;    // collation element value
    int32_t  low;   // lower index associated with ce
    int32_t  high;  // upper index associated with ce
};

/**
 * Buffer of PCEI entries with 16 entries of inline storage.
 * The member functions are only declared here; their behavior is defined in
 * the implementation file.
 * NOTE(review): buffer presumably points at defaultBuffer until more room is
 * needed and at separately allocated storage afterwards, which would make
 * copying a PCEBuffer unsafe - confirm against the implementation.
 */
struct PCEBuffer
{
    PCEI    defaultBuffer[16];  // inline storage for 16 entries
    PCEI   *buffer;             // storage currently in use
    int32_t bufferIndex;        // NOTE(review): presumably the number of entries currently stored - confirm
    int32_t bufferSize;         // NOTE(review): presumably the capacity of buffer, in entries - confirm

    PCEBuffer();
    ~PCEBuffer();

    void  reset();
    UBool isEmpty() const;
    // Adds an entry made of ce, ixLow and ixHigh; failures are reported through errorCode.
    void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
    // Returns a pointer to a buffered entry.
    // NOTE(review): presumably removes the most recently put entry and returns NULL when empty - confirm.
    const PCEI *get();
};

/**
 * Produces "processed" collation elements (64-bit values) from a collation
 * element iterator, in either direction, and reports the iterator indices
 * surrounding each element.
 *
 * An instance can be created from, or re-initialized with, either a C
 * UCollationElements handle or a C++ CollationElementIterator.
 */
class UCollationPCE : public UMemory {
private:
    PCEBuffer          pceBuffer;   // buffered PCEI entries; see previousProcessed() regarding buffer exhaustion
    CollationElementIterator *cei;  // the underlying collation element iterator
    // NOTE(review): the four fields below are presumably cached collator
    // attributes consulted by processCE() - confirm against the implementation.
    UCollationStrength strength;
    UBool              toShift;
    UBool              isShifted;
    uint32_t           variableTop;

public:
    UCollationPCE(UCollationElements *elems);
    UCollationPCE(CollationElementIterator *iter);
    ~UCollationPCE();

    void init(UCollationElements *elems);
    void init(CollationElementIterator *iter);

    /**
     * Get the processed ordering priority of the next collation element in the text.
     * A single character may contain more than one collation element.
     *
     * @param ixLow A pointer to an int32_t to receive the iterator index before fetching the CE.
     * @param ixHigh A pointer to an int32_t to receive the iterator index after fetching the CE.
     * @param status A pointer to a UErrorCode to receive any errors.
     * @return The next collation element's ordering, otherwise returns UCOL_PROCESSED_NULLORDER
     *         if an error has occurred or if the end of string has been reached.
     */
    int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
    /**
     * Get the processed ordering priority of the previous collation element in the text.
     * A single character may contain more than one collation element.
     *
     * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE.
     * @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE.
     * @param status A pointer to a UErrorCode to receive any errors. Notably
     *               a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
     *               buffer has been exhausted.
     * @return The previous collation element's ordering, otherwise returns
     *         UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
     *         string has been reached.
     */
    int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);

private:
    // Private initialization overload taking the collator itself.
    // NOTE(review): presumably shared by the two public init() overloads - confirm.
    void init(const Collator &coll);
    // Converts a 32-bit collation element into its 64-bit processed form.
    uint64_t processCE(uint32_t ce);
};

U_NAMESPACE_END

/* Number of elements in the inline cesBuffer and pcesBuffer arrays of UPattern. */
#define INITIAL_ARRAY_SIZE_       256

/**
 * State describing the text being searched and the most recent match.
 * UStringSearch refers to an instance of this struct through its `search`
 * pointer.
 */
struct USearch {
    // The text to search. It is stored here because the collation element
    // iterator does not have a getText API.
    const UChar              *text;
          int32_t             textLength; // exact length
          // NOTE(review): the next three fields look like search attributes
          // (overlapping matches, canonical matching, element comparison mode) - confirm.
          UBool               isOverlap;
          UBool               isCanonicalMatch;
          int16_t             elementComparisonType;
          UBreakIterator     *internalBreakIter;  // internal character break iterator, lazily created.
          UBreakIterator     *breakIter;          // caller-provided character break iterator
    // matchedIndex defaults to USEARCH_DONE.
    // If matchedIndex is USEARCH_DONE while we are not at the start or the
    // end of the text (depending on the iteration direction), it means that
    // no more matches can be found in that particular direction.
          int32_t             matchedIndex; 
          int32_t             matchedLength;
          UBool               isForwardSearching;
          UBool               reset;
};

/**
 * The search pattern together with two collation element arrays for it:
 * 32-bit values (ces) and 64-bit values (pces). Each array has
 * INITIAL_ARRAY_SIZE_ elements of inline storage.
 * NOTE(review): ces/pces presumably point at the inline buffers unless the
 * pattern needs more elements than they hold - confirm against the
 * implementation.
 */
struct UPattern {
    const UChar              *text;
          int32_t             textLength; // exact length
          // length required for backwards ce comparison
          int32_t             cesLength;
          int32_t            *ces;          // 32-bit collation elements
          int32_t             cesBuffer[INITIAL_ARRAY_SIZE_];
          int32_t             pcesLength;
          int64_t            *pces;         // 64-bit collation elements
          int64_t             pcesBuffer[INITIAL_ARRAY_SIZE_];
          // NOTE(review): presumably whether the pattern begins / ends with accents - confirm.
          UBool               hasPrefixAccents;
          UBool               hasSuffixAccents;
};

/**
 * Top-level string search object: combines the text/match state (search),
 * the pattern, the collator and the iterators used while searching.
 */
struct UStringSearch {
    struct USearch            *search;
    struct UPattern            pattern;   // held by value
    const  UCollator          *collator;
    const  icu::Normalizer2   *nfd;
    // Positions within the collation element iterator are used to determine
    // if we are at the start of the text.
           UCollationElements *textIter;
           icu::UCollationPCE *textProcessedIter;
    // Utility collation element iterator, used throughout the program for
    // temporary iteration.
           UCollationElements *utilIter;
           // NOTE(review): presumably true when this object owns collator and must close it - confirm.
           UBool               ownCollator;
           // NOTE(review): the fields below are presumably values cached from collator - confirm.
           UCollationStrength  strength;
           uint32_t            ceMask;
           uint32_t            variableTop;
           UBool               toShift;
};

/**
* Exact matches without checking the ends for extra accents.
* The match after the current position within the collation element iterator
* is to be found.
* After a match is found, the offset in the collation element iterator will
* be shifted to the start of the match.
* Implementation notes:
* For tertiary strength we can't use collator->tertiaryMask, because that is
* a preprocessed mask that takes case options into account. Since we are
* only concerned with exact matches, we don't need that.
* Alternate handling: since only the 16 most significant bits are used, we
* can safely do a compare without masking. If the ce is a variable, we mask
* and get only the primary values. No shifting to quaternary is required,
* since all primary values less than variabletop will need to be masked off
* anyway.
* If the end character is composite and the pattern ce does not match the
* text ce, we skip it until we find a match in the end composite character
* or until it has passed the character. This is so that we can match the
* pattern "a" with the text "\u00e6".
* @param strsrch string search data
* @param status error status if any
* @return true if an exact match is found, false otherwise
*/
U_CFUNC
UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);

/**
* Canonical matches.
* According to the definition, a match found here includes the whole span
* of beginning and ending accents if the match overlaps that region.
* @param strsrch string search data
* @param status error status if any
* @return true if a canonical match is found, false otherwise
*/
U_CFUNC
UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);

/**
* Gets the previous exact match.
* The comments on usearch_handleNextExact apply here as well.
* @param strsrch string search data
* @param status error status if any
* @return true if an exact match is found, false otherwise
*/
U_CFUNC
UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);

/**
* Canonical matches; judging by the name, the "previous" counterpart of
* usearch_handleNextCanonical.
* According to the definition, a match found here includes the whole span
* of beginning and ending accents if the match overlaps that region.
* @param strsrch string search data
* @param status error status if any
* @return true if a canonical match is found, false otherwise
*/
U_CFUNC
UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 
                                      UErrorCode    *status);

#endif /* #if !UCONFIG_NO_COLLATION */

#endif