diff options
Diffstat (limited to 'intl/icu/source/i18n/regexcmp.h')
-rw-r--r-- | intl/icu/source/i18n/regexcmp.h | 234 |
1 files changed, 234 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/regexcmp.h b/intl/icu/source/i18n/regexcmp.h new file mode 100644 index 0000000000..81ac9e5178 --- /dev/null +++ b/intl/icu/source/i18n/regexcmp.h @@ -0,0 +1,234 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +// +// regexcmp.h +// +// Copyright (C) 2002-2016, International Business Machines Corporation and others. +// All Rights Reserved. +// +// This file contains declarations for the class RegexCompile +// +// This class is internal to the regular expression implementation. +// For the public Regular Expression API, see the file "unicode/regex.h" +// + + +#ifndef RBBISCAN_H +#define RBBISCAN_H + +#include "unicode/utypes.h" +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + +#include "unicode/parseerr.h" +#include "unicode/uniset.h" +#include "unicode/uobject.h" +#include "unicode/utext.h" +#include "uhash.h" +#include "uvector.h" +#include "uvectr32.h" + + + +U_NAMESPACE_BEGIN + + +//-------------------------------------------------------------------------------- +// +// class RegexCompile Contains the regular expression compiler. +// +//-------------------------------------------------------------------------------- +class RegexPattern; + + +class U_I18N_API RegexCompile : public UMemory { +public: + + enum { + kStackSize = 100 // The size of the state stack for + }; // pattern parsing. Corresponds roughly + // to the depth of parentheses nesting + // that is allowed in the rules. + + struct RegexPatternChar { + UChar32 fChar; + UBool fQuoted; + }; + + RegexCompile(RegexPattern *rp, UErrorCode &e); + + void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); + void compile(UText *pat, UParseError &pp, UErrorCode &e); + + + virtual ~RegexCompile(); + + void nextChar(RegexPatternChar &c); // Get the next char from the input stream. + + + // Categories of parentheses in pattern. + // The category is saved in the compile-time parentheses stack frame, and + // determines the code to be generated when the matching close ) is encountered. + enum EParenClass { + plain = -1, // No special handling + capturing = -2, + atomic = -3, + lookAhead = -4, + negLookAhead = -5, + flags = -6, + lookBehind = -7, + lookBehindN = -8 + }; + +private: + + + UBool doParseActions(int32_t a); + void error(UErrorCode e); // error reporting convenience function. + + UChar32 nextCharLL(); + UChar32 peekCharLL(); + UnicodeSet *scanProp(); + UnicodeSet *scanPosixProp(); + void handleCloseParen(); + int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern + // at the top of the just completed block + // or operation, and optionally ensure that + // there is space to add an opcode there. + void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for + // a reference to a UnicodeSet. + void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. + int32_t LoopOp); + UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier + void literalChar(UChar32 c); // Compile a literal char + void fixLiterals(UBool split=false); // Generate code for pending literal characters. + void insertOp(int32_t where); // Open up a slot for a new op in the + // generated code at the specified location. + void appendOp(int32_t op); // Append a new op to the compiled pattern. + void appendOp(int32_t type, int32_t val); // Build & append a new op to the compiled pattern. + int32_t buildOp(int32_t type, int32_t val); // Construct a new pcode instruction. + int32_t allocateData(int32_t size); // Allocate space in the matcher data area. + // Return index of the newly allocated data. + int32_t allocateStackData(int32_t size); // Allocate space in the match back-track stack frame. + // Return offset index in the frame. + int32_t minMatchLength(int32_t start, + int32_t end); + int32_t maxMatchLength(int32_t start, + int32_t end); + void matchStartType(); + void stripNOPs(); + + void setEval(int32_t op); + void setPushOp(int32_t op); + UChar32 scanNamedChar(); + UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); + +public: // Public for testing only. + static void U_EXPORT2 findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars); +private: + + + UErrorCode *fStatus; + RegexPattern *fRXPat; + UParseError *fParseErr; + + // + // Data associated with low level character scanning + // + int64_t fScanIndex; // Index of current character being processed + // in the rule input string. + UBool fQuoteMode; // Scan is in a \Q...\E quoted region + UBool fInBackslashQuote; // Scan is between a '\' and the following char. + UBool fEOLComments; // When scan is just after '(?', inhibit #... to + // end of line comments, in favor of (?#...) comments. + int64_t fLineNum; // Line number in input file. + int64_t fCharNum; // Char position within the line. + UChar32 fLastChar; // Previous char, needed to count CR-LF + // as a single line, not two. + UChar32 fPeekChar; // Saved char, if we've scanned ahead. + + + RegexPatternChar fC; // Current char for parse state machine + // processing. + + uint16_t fStack[kStackSize]; // State stack, holds state pushes + int32_t fStackPtr; // and pops as specified in the state + // transition rules. + + // + // Data associated with the generation of the pcode for the match engine + // + int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) + // Always has high bit (31) set so that flag values + // on the paren stack are distinguished from relocatable + // pcode addresses. + int32_t fNewModeFlags; // New flags, while compiling (?i, holds state + // until last flag is scanned. + UBool fSetModeFlag; // true for (?ismx, false for (?-ismx + + UnicodeString fLiteralChars; // Literal chars or strings from the pattern are accumulated here. + // Once completed, meaning that some non-literal pattern + // construct is encountered, the appropriate opcodes + // to match the literal will be generated, and this + // string will be cleared. + + int64_t fPatternLength; // Length of the input pattern string. + + UVector32 fParenStack; // parentheses stack. Each frame consists of + // the positions of compiled pattern operations + // needing fixup, followed by negative value. The + // first entry in each frame is the position of the + // spot reserved for use when a quantifier + // needs to add a SAVE at the start of a (block) + // The negative value (-1, -2,...) indicates + // the kind of paren that opened the frame. Some + // need special handling on close. + + + int32_t fMatchOpenParen; // The position in the compiled pattern + // of the slot reserved for a state save + // at the start of the most recently processed + // parenthesized block. Updated when processing + // a close to the location for the corresponding open. + + int32_t fMatchCloseParen; // The position in the pattern of the first + // location after the most recently processed + // parenthesized block. + + int32_t fIntervalLow; // {lower, upper} interval quantifier values. + int32_t fIntervalUpper; // Placed here temporarily, when pattern is + // initially scanned. Each new interval + // encountered overwrites these values. + // -1 for the upper interval value means none + // was specified (unlimited occurrences.) + + UStack fSetStack; // Stack of UnicodeSets, used while evaluating + // (at compile time) set expressions within + // the pattern. + UStack fSetOpStack; // Stack of pending set operators (&&, --, union) + + UChar32 fLastSetLiteral; // The last single code point added to a set. + // needed when "-y" is scanned, and we need + // to turn "x-y" into a range. + + UnicodeString *fCaptureName; // Named Capture, the group name is built up + // in this string while being scanned. +}; + +// Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions] +// The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself. + +enum SetOperations { + setStart = 0 << 16 | 1, + setEnd = 1 << 16 | 2, + setNegation = 2 << 16 | 3, + setCaseClose = 2 << 16 | 9, + setDifference2 = 3 << 16 | 4, // '--' set difference operator + setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator + setUnion = 4 << 16 | 6, // implicit union of adjacent items + setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet. + setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. + }; + +U_NAMESPACE_END +#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS +#endif // RBBISCAN_H |