diff options
Diffstat (limited to 'intl/icu/source/common/rbbiscan.h')
-rw-r--r-- | intl/icu/source/common/rbbiscan.h | 167 |
1 files changed, 167 insertions, 0 deletions
diff --git a/intl/icu/source/common/rbbiscan.h b/intl/icu/source/common/rbbiscan.h new file mode 100644 index 0000000000..8a419b9d76 --- /dev/null +++ b/intl/icu/source/common/rbbiscan.h @@ -0,0 +1,167 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +// +// rbbiscan.h +// +// Copyright (C) 2002-2016, International Business Machines Corporation and others. +// All Rights Reserved. +// +// This file contains declarations for class RBBIRuleScanner +// + + +#ifndef RBBISCAN_H +#define RBBISCAN_H + +#include "unicode/utypes.h" +#include "unicode/uobject.h" +#include "unicode/rbbi.h" +#include "unicode/uniset.h" +#include "unicode/parseerr.h" +#include "uhash.h" +#include "uvector.h" +#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that + // looks up references to $variables within a set. +#include "rbbinode.h" +#include "rbbirpt.h" + +U_NAMESPACE_BEGIN + +class RBBIRuleBuilder; +class RBBISymbolTable; + + +//-------------------------------------------------------------------------------- +// +// class RBBIRuleScanner does the lowest level, character-at-a-time +// scanning of break iterator rules. +// +// The output of the scanner is parse trees for +// the rule expressions and a list of all Unicode Sets +// encountered. +// +//-------------------------------------------------------------------------------- + +class RBBIRuleScanner : public UMemory { +public: + + enum { + kStackSize = 100 // The size of the state stack for + }; // rules parsing. Corresponds roughly + // to the depth of parentheses nesting + // that is allowed in the rules. + + struct RBBIRuleChar { + UChar32 fChar; + UBool fEscaped; + RBBIRuleChar() : fChar(0), fEscaped(false) {} + }; + + RBBIRuleScanner(RBBIRuleBuilder *rb); + + + virtual ~RBBIRuleScanner(); + + void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. + // The character is delivered through c; the function returns void. NOTE(review): how end of input is signalled is not visible in this header -- confirm in the implementation. 
+ + UBool push(const RBBIRuleChar &c); // Push (unget) one character. + // Only a single character may be pushed. + + void parse(); // Parse the rules, generating two parse + // trees, one each for the forward and + // reverse rules, + // and a list of UnicodeSets encountered. + + int32_t numRules(); // Return the number of rules that have been seen. + + /** + * Return a rules string without unnecessary + * characters. + */ + static UnicodeString stripRules(const UnicodeString &rules); +private: + + UBool doParseActions(int32_t a); + void error(UErrorCode e); // error reporting convenience function. + void fixOpStack(RBBINode::OpPrecedence p); + // findSetFor: setToAdopt is optional (defaults to nullptr); its name suggests ownership is transferred -- TODO confirm. + void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = nullptr); + + UChar32 nextCharLL(); +#ifdef RBBI_DEBUG + void printNodeStack(const char *title); +#endif + RBBINode *pushNewNode(RBBINode::NodeType t); + void scanSet(); + + + RBBIRuleBuilder *fRB; // The rule builder that we are part of. + + int32_t fScanIndex; // Index of current character being processed + // in the rule input string. + int32_t fNextIndex; // Index of the next character, which + // is the first character not yet scanned. + UBool fQuoteMode; // Scan is in a 'quoted region' + int32_t fLineNum; // Line number in input file. + int32_t fCharNum; // Char position within the line. + UChar32 fLastChar; // Previous char, needed to count CR-LF + // as a single line, not two. + + RBBIRuleChar fC; // Current char for parse state machine + // processing. + UnicodeString fVarName; // $variableName, valid when we've just + // scanned one. + + RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule + // parsing. index by p[state][char-class] + + uint16_t fStack[kStackSize]; // State stack, holds state pushes + int32_t fStackPtr; // and pops as specified in the state + // transition rules. 
+ + RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created + // during the parse of a rule + int32_t fNodeStackPtr; + + + UBool fReverseRule; // True if the rule currently being scanned + // is a reverse direction rule (if it + // starts with a '!') + + UBool fLookAheadRule; // True if the rule includes a '/' + // somewhere within it. + + UBool fNoChainInRule; // True if the current rule starts with a '^'. + + RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of + // $variable symbols. + + UHashtable *fSetTable; // UnicodeSet hash table, holds indexes to + // the sets created while parsing rules. + // The key is the string used for creating + // the set. + + UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during + // the scanning of RBBI rules. The + // indices for these are assigned by the + // perl script that builds the state tables. + // See rbbirpt.h. + + int32_t fRuleNum; // Counts each rule as it is scanned. + + int32_t fOptionStart; // Input index of start of a !!option + // keyword, while being scanned. + + UnicodeSet *gRuleSet_rule_char; + UnicodeSet *gRuleSet_white_space; + UnicodeSet *gRuleSet_name_char; + UnicodeSet *gRuleSet_name_start_char; + + RBBIRuleScanner(const RBBIRuleScanner &other) = delete; // forbid copying of this class + RBBIRuleScanner &operator=(const RBBIRuleScanner &other) = delete; // forbid copying of this class +}; + +U_NAMESPACE_END + +#endif |