summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/i18n/utf16collationiterator.h
blob: 34634bf494ab561aa97cc9174e2646ddda2bd17b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* utf16collationiterator.h
*
* created on: 2010oct27
* created by: Markus W. Scherer
*/

#ifndef __UTF16COLLATIONITERATOR_H__
#define __UTF16COLLATIONITERATOR_H__

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "cmemory.h"
#include "collation.h"
#include "collationdata.h"
#include "collationiterator.h"
#include "normalizer2impl.h"

U_NAMESPACE_BEGIN

/**
 * UTF-16 collation element and character iterator.
 * Handles normalized UTF-16 text inline, with length or NUL-terminated.
 * Unnormalized text is handled by a subclass.
 */
class U_I18N_API UTF16CollationIterator : public CollationIterator {
public:
    UTF16CollationIterator(const CollationData *d, UBool numeric,
                           const char16_t *s, const char16_t *p, const char16_t *lim)
            : CollationIterator(d, numeric),
              start(s), pos(p), limit(lim) {}

    UTF16CollationIterator(const UTF16CollationIterator &other, const char16_t *newText);

    virtual ~UTF16CollationIterator();

    virtual bool operator==(const CollationIterator &other) const override;

    virtual void resetToOffset(int32_t newOffset) override;

    virtual int32_t getOffset() const override;

    void setText(const char16_t *s, const char16_t *lim) {
        reset();
        start = pos = s;
        limit = lim;
    }

    virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;

    virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;

protected:
    // Copy constructor only for subclasses which set the pointers.
    UTF16CollationIterator(const UTF16CollationIterator &other)
            : CollationIterator(other),
              start(nullptr), pos(nullptr), limit(nullptr) {}

    virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;

    virtual char16_t handleGetTrailSurrogate() override;

    virtual UBool foundNULTerminator() override;

    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;

    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;

    // UTF-16 string pointers.
    // limit can be nullptr for NUL-terminated strings.
    const char16_t *start, *pos, *limit;
};

/**
 * Incrementally checks the input text for FCD and normalizes where necessary.
 */
class U_I18N_API FCDUTF16CollationIterator : public UTF16CollationIterator {
public:
    FCDUTF16CollationIterator(const CollationData *data, UBool numeric,
                              const char16_t *s, const char16_t *p, const char16_t *lim)
            : UTF16CollationIterator(data, numeric, s, p, lim),
              rawStart(s), segmentStart(p), segmentLimit(nullptr), rawLimit(lim),
              nfcImpl(data->nfcImpl),
              checkDir(1) {}

    FCDUTF16CollationIterator(const FCDUTF16CollationIterator &other, const char16_t *newText);

    virtual ~FCDUTF16CollationIterator();

    virtual bool operator==(const CollationIterator &other) const override;

    virtual void resetToOffset(int32_t newOffset) override;

    virtual int32_t getOffset() const override;

    virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;

    virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;

protected:
    virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;

    virtual UBool foundNULTerminator() override;

    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;

    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;

private:
    /**
     * Switches to forward checking if possible.
     * To be called when checkDir < 0 || (checkDir == 0 && pos == limit).
     * Returns with checkDir > 0 || (checkDir == 0 && pos != limit).
     */
    void switchToForward();

    /**
     * Extend the FCD text segment forward or normalize around pos.
     * To be called when checkDir > 0 && pos != limit.
     * @return true if success, checkDir == 0 and pos != limit
     */
    UBool nextSegment(UErrorCode &errorCode);

    /**
     * Switches to backward checking.
     * To be called when checkDir > 0 || (checkDir == 0 && pos == start).
     * Returns with checkDir < 0 || (checkDir == 0 && pos != start).
     */
    void switchToBackward();

    /**
     * Extend the FCD text segment backward or normalize around pos.
     * To be called when checkDir < 0 && pos != start.
     * @return true if success, checkDir == 0 and pos != start
     */
    UBool previousSegment(UErrorCode &errorCode);

    UBool normalize(const char16_t *from, const char16_t *to, UErrorCode &errorCode);

    // Text pointers: The input text is [rawStart, rawLimit[
    // where rawLimit can be nullptr for NUL-terminated text.
    //
    // checkDir > 0:
    //
    // The input text [segmentStart..pos[ passes the FCD check.
    // Moving forward checks incrementally.
    // segmentLimit is undefined. limit == rawLimit.
    //
    // checkDir < 0:
    // The input text [pos..segmentLimit[ passes the FCD check.
    // Moving backward checks incrementally.
    // segmentStart is undefined, start == rawStart.
    //
    // checkDir == 0:
    //
    // The input text [segmentStart..segmentLimit[ is being processed.
    // These pointers are at FCD boundaries.
    // Either this text segment already passes the FCD check
    // and segmentStart==start<=pos<=limit==segmentLimit,
    // or the current segment had to be normalized so that
    // [segmentStart..segmentLimit[ turned into the normalized string,
    // corresponding to normalized.getBuffer()==start<=pos<=limit==start+normalized.length().
    const char16_t *rawStart;
    const char16_t *segmentStart;
    const char16_t *segmentLimit;
    // rawLimit==nullptr for a NUL-terminated string.
    const char16_t *rawLimit;

    const Normalizer2Impl &nfcImpl;
    UnicodeString normalized;
    // Direction of incremental FCD check. See comments before rawStart.
    int8_t checkDir;
};

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION
#endif  // __UTF16COLLATIONITERATOR_H__