summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/common/rbbitblb.h
blob: 01eee3da4ef095bc43e4445df30f81e6ae0e6cfe (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
//  rbbitblb.h
//

/*
**********************************************************************
*   Copyright (c) 2002-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/

#ifndef RBBITBLB_H
#define RBBITBLB_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/uobject.h"
#include "unicode/rbbi.h"
#include "rbbidata.h"
#include "rbbirb.h"
#include "rbbinode.h"


U_NAMESPACE_BEGIN

class RBBIRuleScanner;
class RBBIRuleBuilder;
class UVector32;

//
//  class RBBITableBuilder is part of the RBBI rule compiler.
//                         It builds the state transition table used by the RBBI runtime
//                         from the expression syntax tree generated by the rule scanner.
//
//                         This class is part of the RBBI implementation only.
//                         There is no user-visible public API here.
//

class RBBITableBuilder : public UMemory {
public:
    RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
    ~RBBITableBuilder();

    void     buildForwardTable();

    /** Return the runtime size in bytes of the built state table.  */
    int32_t  getTableSize() const;

    /** Fill in the runtime state table. Sufficient memory must exist at the specified location.
     */
    void     exportTable(void *where);

    /** Use 8 bits to encode the forward table */
    bool     use8BitsForTable() const;

    /**
     *  Find duplicate (redundant) character classes. Begin looking with categories.first.
     *  Duplicate, if found are returned in the categories parameter.
     *  This is an iterator-like function, used to identify character classes
     *  (state table columns) that can be eliminated.
     *  @param categories in/out parameter, specifies where to start looking for duplicates,
     *                and returns the first pair of duplicates found, if any.
     *  @return true if duplicate char classes were found, false otherwise.
     */
    bool     findDuplCharClassFrom(IntPair *categories);

    /** Remove a column from the state table. Used when two character categories
     *  have been found equivalent, and merged together, to eliminate the unneeded table column.
     */
    void     removeColumn(int32_t column);

    /**
     * Check for, and remove duplicate states (table rows).
     * @return the number of states removed.
     */
    int32_t  removeDuplicateStates();

    /** Build the safe reverse table from the already-constructed forward table. */
    void     buildSafeReverseTable(UErrorCode &status);

    /** Return the runtime size in bytes of the built safe reverse state table. */
    int32_t  getSafeTableSize() const;

    /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location.
     */
    void     exportSafeTable(void *where);

    /** Use 8 bits to encode the safe reverse table */
    bool     use8BitsForSafeTable() const;

private:
    void     calcNullable(RBBINode *n);
    void     calcFirstPos(RBBINode *n);
    void     calcLastPos(RBBINode  *n);
    void     calcFollowPos(RBBINode *n);
    void     calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode);
    void     bofFixup();
    void     buildStateTable();
    void     mapLookAheadRules();
    void     flagAcceptingStates();
    void     flagLookAheadStates();
    void     flagTaggedStates();
    void     mergeRuleStatusVals();

    /**
     * Merge redundant state table columns, eliminating character classes with identical behavior.
     * Done after the state tables are generated, just before converting to their run-time format.
     */
    int32_t  mergeColumns();

    void     addRuleRootNodes(UVector *dest, RBBINode *node);

    /**
     *  Find duplicate (redundant) states, beginning at the specified pair,
     *  within this state table. This is an iterator-like function, used to
     *  identify states (state table rows) that can be eliminated.
     *  @param states in/out parameter, specifies where to start looking for duplicates,
     *                and returns the first pair of duplicates found, if any.
     *  @return true if duplicate states were found, false otherwise.
     */
    bool findDuplicateState(IntPair *states);

    /** Remove a duplicate state.
     * @param duplStates The duplicate states. The first is kept, the second is removed.
     *                   All references to the second in the state table are retargeted
     *                   to the first.
     */
    void removeState(IntPair duplStates);

    /** Find the next duplicate state in the safe reverse table. An iterator function.
     *  @param states in/out parameter, specifies where to start looking for duplicates,
     *                and returns the first pair of duplicates found, if any.
     *  @return true if a duplicate pair of states was found.
     */
    bool findDuplicateSafeState(IntPair *states);

    /** Remove a duplicate state from the safe table.
     * @param duplStates The duplicate states. The first is kept, the second is removed.
     *                   All references to the second in the state table are retargeted
     *                   to the first.
     */
    void removeSafeState(IntPair duplStates);

    // Set functions for UVector.
    //   TODO:  make a USet subclass of UVector

    void     setAdd(UVector *dest, UVector *source);
    UBool    setEquals(UVector *a, UVector *b);

    void     sortedAdd(UVector **dest, int32_t val);

public:
#ifdef RBBI_DEBUG
    void     printSet(UVector *s);
    void     printPosSets(RBBINode *n /* = nullptr */);
    void     printStates();
    void     printRuleStatusTable();
    void     printReverseTable();
#else
    #define  printSet(s)
    #define  printPosSets(n)
    #define  printStates()
    #define  printRuleStatusTable()
    #define  printReverseTable()
#endif

private:
    RBBIRuleBuilder  *fRB;
    RBBINode         *&fTree;              // The root node of the parse tree to build a
                                           //   table for.
    UErrorCode       *fStatus;

    /** State Descriptors, UVector<RBBIStateDescriptor> */
    UVector          *fDStates;            //  D states (Aho's terminology)
                                           //  Index is state number
                                           //  Contents are RBBIStateDescriptor pointers.

    /** Synthesized safe table, UVector of UnicodeString, one string per table row.   */
    UVector          *fSafeTable;

    /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
    UVector32        *fLookAheadRuleMap = nullptr;

    /* Counter used when assigning lookahead rule numbers.
     * Contains the last look-ahead number already in use.
     * The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved
     * for non-lookahead accepting states. See the declarations of RBBIStateTableRowT.   */
    int32_t          fLASlotsInUse = ACCEPTING_UNCONDITIONAL;


    RBBITableBuilder(const RBBITableBuilder &other) = delete; // forbid copying of this class
    RBBITableBuilder &operator=(const RBBITableBuilder &other) = delete; // forbid copying of this class
};

//
//  RBBIStateDescriptor - The DFA is constructed as a set of these descriptors,
//                        one for each state.
class RBBIStateDescriptor : public UMemory {
public:
    UBool            fMarked;
    uint32_t         fAccepting;
    uint32_t         fLookAhead;
    UVector          *fTagVals;
    int32_t          fTagsIdx;
    UVector          *fPositions;          // Set of parse tree positions associated
                                           //   with this state.  Unordered (it's a set).
                                           //   UVector contents are RBBINode *

    UVector32        *fDtran;              // Transitions out of this state.
                                           //   indexed by input character
                                           //   contents is int index of dest state
                                           //   in RBBITableBuilder.fDStates

    RBBIStateDescriptor(int maxInputSymbol,  UErrorCode *fStatus);
    ~RBBIStateDescriptor();

private:
    RBBIStateDescriptor(const RBBIStateDescriptor &other) = delete; // forbid copying of this class
    RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other) = delete; // forbid copying of this class
};



U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

#endif