summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/common/rbbi_cache.h
blob: 597312e85c45368c43bcb56e261c1e2a7827e7e8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

// file: rbbi_cache.h
//
#ifndef RBBI_CACHE_H
#define RBBI_CACHE_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/rbbi.h"
#include "unicode/uobject.h"

#include "uvectr32.h"

U_NAMESPACE_BEGIN

/* DictionaryCache  stores the boundaries obtained from a run of dictionary characters.
 *                 Dictionary boundaries are moved first to this cache, then from here
 *                 to the main BreakCache, where they may inter-leave with non-dictionary
 *                 boundaries. The public BreakIterator API always fetches directly
 *                 from the main BreakCache, not from here.
 *
 *                 In common situations, the number of boundaries in a single dictionary run
 *                 should be quite small, it will be terminated by punctuation, spaces,
 *                 or any other non-dictionary characters. The main BreakCache may end
 *                 up with boundaries from multiple dictionary based runs.
 *
 *                 The boundaries are stored in a simple ArrayList (vector), with the
 *                 assumption that they will be accessed sequentially.
 */                 
class RuleBasedBreakIterator::DictionaryCache: public UMemory {
  public:
     DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status);
     ~DictionaryCache();

     void reset();

     UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
     UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex);

    /**
     * Populate the cache with the dictionary based boundaries within a region of text.
     * @param startPos  The start position of a range of text
     * @param endPos    The end position of a range of text
     * @param firstRuleStatus The rule status index that applies to the break at startPos
     * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
     * @internal
     */
    void populateDictionary(int32_t startPos, int32_t endPos,
                         int32_t firstRuleStatus, int32_t otherRuleStatus);



    RuleBasedBreakIterator *fBI;
    
    UVector32           fBreaks;                // A vector containing the boundaries.
    int32_t             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
                                                //    or preceding(). Optimizes sequential access.
    int32_t             fStart;                 // Text position of first boundary in cache.
    int32_t             fLimit;                 // Last boundary in cache. Which is the limit of the
                                                //    text segment being handled by the dictionary.
    int32_t             fFirstRuleStatusIndex;  // Rule status info for first boundary.
    int32_t             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.
};


/*
 * class BreakCache
 *
 * Cache of break boundary positions and rule status values.
 * Break iterator API functions, next(), previous(), etc., will use cached results
 * when possible, and otherwise cache new results as they are obtained.
 *
 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
 *
 * The cache is implemented as a single circular buffer.
 */

/*
 * size of the circular cache buffer.
 */

class RuleBasedBreakIterator::BreakCache: public UMemory {
  public:
                BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status);
    virtual     ~BreakCache();
    void        reset(int32_t pos = 0, int32_t ruleStatus = 0);
    void        next() {    if (fBufIdx == fEndBufIdx) {
                                nextOL();
                            } else {
                                fBufIdx = modChunkSize(fBufIdx + 1);
                                fTextIdx = fBI->fPosition = fBoundaries[fBufIdx];
                                fBI->fRuleStatusIndex = fStatuses[fBufIdx];
                            }
                }


    void        nextOL();
    void        previous(UErrorCode &status);

    // Move the iteration state to the position following the startPosition.
    // Input position must be pinned to the input length.
    void        following(int32_t startPosition, UErrorCode &status);

    void        preceding(int32_t startPosition, UErrorCode &status);

    /*
     * Update the state of the public BreakIterator (fBI) to reflect the
     * current state of the break iterator cache (this).
     */
    int32_t     current();

    /**
     * Add boundaries to the cache near the specified position.
     * The given position need not be a boundary itself.
     * The input position must be within the range of the text, and
     * on a code point boundary.
     * If the requested position is a break boundary, leave the iteration
     * position on it.
     * If the requested position is not a boundary, leave the iteration
     * position on the preceding boundary and include both the
     * preceding and following boundaries in the cache.
     * Additional boundaries, either preceding or following, may be added
     * to the cache as a side effect.
     *
     * Return false if the operation failed.
     */
    UBool populateNear(int32_t position, UErrorCode &status);

    /**
     *  Add boundary(s) to the cache following the current last boundary.
     *  Return false if at the end of the text, and no more boundaries can be added.
     *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
     */
    UBool populateFollowing();

    /**
     *  Add one or more boundaries to the cache preceding the first currently cached boundary.
     *  Leave the iteration position on the first added boundary.
     *  Return false if no boundaries could be added (if at the start of the text.)
     */
    UBool populatePreceding(UErrorCode &status);

    enum UpdatePositionValues {
        RetainCachePosition = 0,
        UpdateCachePosition = 1
    };

    /*
     * Add the boundary following the current position.
     * The current position can be left as it was, or changed to the newly added boundary,
     * as specified by the update parameter.
     */
    void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);


    /*
     * Add the boundary preceding the current position.
     * The current position can be left as it was, or changed to the newly added boundary,
     * as specified by the update parameter.
     */
    bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);

    /**
     *  Set the cache position to the specified position, or, if the position
     *  falls between to cached boundaries, to the preceding boundary.
     *  Fails if the requested position is outside of the range of boundaries currently held by the cache.
     *  The startPosition must be on a code point boundary.
     *
     *  Return true if successful, false if the specified position is after
     *  the last cached boundary or before the first.
     */
    UBool                   seek(int32_t startPosition);

    void dumpCache();

  private:
    static inline int32_t   modChunkSize(int index) { return index & (CACHE_SIZE - 1); }

    static constexpr int32_t CACHE_SIZE = 128;
    static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");

    RuleBasedBreakIterator *fBI;
    int32_t                 fStartBufIdx;
    int32_t                 fEndBufIdx;    // inclusive

    int32_t                 fTextIdx;
    int32_t                 fBufIdx;

    int32_t                 fBoundaries[CACHE_SIZE];
    uint16_t                fStatuses[CACHE_SIZE];

    UVector32               fSideBuffer;
};

U_NAMESPACE_END

#endif // #if !UCONFIG_NO_BREAK_ITERATION

#endif // RBBI_CACHE_H