summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/i18n/utf8collationiterator.h
blob: 9a3ec45aeb41eb3a4141075292ac814d1f5b9668 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2016, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* utf8collationiterator.h
*
* created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
* created by: Markus W. Scherer
*/

#ifndef __UTF8COLLATIONITERATOR_H__
#define __UTF8COLLATIONITERATOR_H__

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "cmemory.h"
#include "collation.h"
#include "collationdata.h"
#include "collationiterator.h"
#include "normalizer2impl.h"

U_NAMESPACE_BEGIN

/**
 * UTF-8 collation element and character iterator.
 * Handles normalized UTF-8 text inline, with length or NUL-terminated.
 * Unnormalized text is handled by a subclass.
 */
class U_I18N_API UTF8CollationIterator : public CollationIterator {
public:
    UTF8CollationIterator(const CollationData *d, UBool numeric,
                          const uint8_t *s, int32_t p, int32_t len)
            : CollationIterator(d, numeric),
              u8(s), pos(p), length(len) {}

    virtual ~UTF8CollationIterator();

    virtual void resetToOffset(int32_t newOffset);

    virtual int32_t getOffset() const;

    virtual UChar32 nextCodePoint(UErrorCode &errorCode);

    virtual UChar32 previousCodePoint(UErrorCode &errorCode);

protected:
    /**
     * For byte sequences that are illegal in UTF-8, an error value may be returned
     * together with a bogus code point. The caller will ignore that code point.
     *
     * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
     * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE.
     *
     * Valid lead surrogates are returned from inside a normalized text segment,
     * where handleGetTrailSurrogate() will return the matching trail surrogate.
     */
    virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);

    virtual UBool foundNULTerminator();

    virtual UBool forbidSurrogateCodePoints() const;

    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);

    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);

    const uint8_t *u8;
    int32_t pos;
    int32_t length;  // <0 for NUL-terminated strings
};

/**
 * Incrementally checks the input text for FCD and normalizes where necessary.
 */
class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
public:
    FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
                             const uint8_t *s, int32_t p, int32_t len)
            : UTF8CollationIterator(data, numeric, s, p, len),
              state(CHECK_FWD), start(p),
              nfcImpl(data->nfcImpl) {}

    virtual ~FCDUTF8CollationIterator();

    virtual void resetToOffset(int32_t newOffset);

    virtual int32_t getOffset() const;

    virtual UChar32 nextCodePoint(UErrorCode &errorCode);

    virtual UChar32 previousCodePoint(UErrorCode &errorCode);

protected:
    virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);

    virtual UChar handleGetTrailSurrogate();

    virtual UBool foundNULTerminator();

    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);

    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);

private:
    UBool nextHasLccc() const;
    UBool previousHasTccc() const;

    /**
     * Switches to forward checking if possible.
     */
    void switchToForward();

    /**
     * Extends the FCD text segment forward or normalizes around pos.
     * @return TRUE if success
     */
    UBool nextSegment(UErrorCode &errorCode);

    /**
     * Switches to backward checking.
     */
    void switchToBackward();

    /**
     * Extends the FCD text segment backward or normalizes around pos.
     * @return TRUE if success
     */
    UBool previousSegment(UErrorCode &errorCode);

    UBool normalize(const UnicodeString &s, UErrorCode &errorCode);

    enum State {
        /**
         * The input text [start..pos[ passes the FCD check.
         * Moving forward checks incrementally.
         * limit is undefined.
         */
        CHECK_FWD,
        /**
         * The input text [pos..limit[ passes the FCD check.
         * Moving backward checks incrementally.
         * start is undefined.
         */
        CHECK_BWD,
        /**
         * The input text [start..limit[ passes the FCD check.
         * pos tracks the current text index.
         */
        IN_FCD_SEGMENT,
        /**
         * The input text [start..limit[ failed the FCD check and was normalized.
         * pos tracks the current index in the normalized string.
         */
        IN_NORMALIZED
    };

    State state;

    int32_t start;
    int32_t limit;

    const Normalizer2Impl &nfcImpl;
    UnicodeString normalized;
};

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION
#endif  // __UTF8COLLATIONITERATOR_H__