diff options
Diffstat (limited to 'intl/icu/source/i18n/regexcst.txt')
-rw-r--r-- | intl/icu/source/i18n/regexcst.txt | 505 |
1 files changed, 505 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/regexcst.txt b/intl/icu/source/i18n/regexcst.txt new file mode 100644 index 0000000000..7e53578e24 --- /dev/null +++ b/intl/icu/source/i18n/regexcst.txt @@ -0,0 +1,505 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +#***************************************************************************** +# +# Copyright (C) 2002-2015, International Business Machines Corporation and others. +# All Rights Reserved. +# +#***************************************************************************** +# +# file: regexcst.txt +# ICU Regular Expression Parser State Table +# +# This state table is used when reading and parsing a regular expression pattern. +# The pattern parser uses a state machine; the data in this file define the +# state transitions that occur for each input character. +# +# *** This file defines the regex pattern grammar. This is it. +# *** The determination of what is accepted is here. +# +# This file is processed by a perl script "regexcst.pl" to produce initialized C arrays +# that are then built with the pattern parser. +# + +# +# Here is the syntax of the state definitions in this file: +# +# +#StateName: +# input-char n next-state ^push-state action +# input-char n next-state ^push-state action +# | | | | | +# | | | | |--- action to be performed by state machine +# | | | | See function RegexCompile::doParseActions() +# | | | | +# | | | |--- Push this named state onto the state stack. +# | | | Later, when next state is specified as "pop", +# | | | the pushed state will become the current state. +# | | | +# | | |--- Transition to this state if the current input character matches the input +# | | character or char class in the left hand column. "pop" causes the next +# | | state to be popped from the state stack. 
+# | | +# | |--- When making the state transition specified on this line, advance to the next +# | character from the input only if 'n' appears here. +# | +# |--- Character or named character classes to test for. If the current character being scanned +# matches, perform the actions and go to the state specified on this line. +# The input character is tested sequentially, in the order written. The characters and +# character classes tested for do not need to be mutually exclusive. The first match wins. +# + + + + +# +# start state, scan position is at the beginning of the pattern. +# +start: + default term doPatStart + + + + +# +# term. At a position where we can accept the start of most items in a pattern. +# +term: + quoted n expr-quant doLiteralChar + rule_char n expr-quant doLiteralChar + '[' n set-open ^set-finish doSetBegin + '(' n open-paren + '.' n expr-quant doDotAny + '^' n expr-quant doCaret + '$' n expr-quant doDollar + '\' n backslash + '|' n term doOrOperator + ')' n pop doCloseParen + eof term doPatFinish + default errorDeath doRuleError + + + +# +# expr-quant We've just finished scanning a term, now look for the optional +# trailing quantifier - *, +, ?, *?, etc. +# +expr-quant: + '*' n quant-star + '+' n quant-plus + '?' n quant-opt + '{' n interval-open doIntervalInit + '(' n open-paren-quant + default expr-cont + + +# +# expr-cont Expression, continuation. At a point where additional terms are +# allowed, but not required. No Quantifiers +# +expr-cont: + '|' n term doOrOperator + ')' n pop doCloseParen + default term + + +# +# open-paren-quant Special case handling for comments appearing before a quantifier, +# e.g. x(?#comment )* +# Open parens from expr-quant come here; anything but a (?# comment +# branches into the normal parenthesis sequence as quickly as possible. +# +open-paren-quant: + '?' 
n open-paren-quant2 doSuppressComments + default open-paren + +open-paren-quant2: + '#' n paren-comment ^expr-quant + default open-paren-extended + + +# +# open-paren We've got an open paren. We need to scan further to +# determine what kind of paren it is - plain (, (?:, (?>, or whatever. +# +open-paren: + '?' n open-paren-extended doSuppressComments + default term ^expr-quant doOpenCaptureParen + +open-paren-extended: + ':' n term ^expr-quant doOpenNonCaptureParen # (?: + '>' n term ^expr-quant doOpenAtomicParen # (?> + '=' n term ^expr-cont doOpenLookAhead # (?= + '!' n term ^expr-cont doOpenLookAheadNeg # (?! + '<' n open-paren-lookbehind + '#' n paren-comment ^term + 'i' paren-flag doBeginMatchMode + 'd' paren-flag doBeginMatchMode + 'm' paren-flag doBeginMatchMode + 's' paren-flag doBeginMatchMode + 'u' paren-flag doBeginMatchMode + 'w' paren-flag doBeginMatchMode + 'x' paren-flag doBeginMatchMode + '-' paren-flag doBeginMatchMode + '(' n errorDeath doConditionalExpr + '{' n errorDeath doPerlInline + default errorDeath doBadOpenParenType + +open-paren-lookbehind: + '=' n term ^expr-cont doOpenLookBehind # (?<= + '!' n term ^expr-cont doOpenLookBehindNeg # (?<! + ascii_letter named-capture doBeginNamedCapture # (?<name + default errorDeath doBadOpenParenType + + +# +# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')' +# +paren-comment: + ')' n pop + eof errorDeath doMismatchedParenErr + default n paren-comment + +# +# paren-flag Scanned a (?ismx-ismx flag setting +# +paren-flag: + 'i' n paren-flag doMatchMode + 'd' n paren-flag doMatchMode + 'm' n paren-flag doMatchMode + 's' n paren-flag doMatchMode + 'u' n paren-flag doMatchMode + 'w' n paren-flag doMatchMode + 'x' n paren-flag doMatchMode + '-' n paren-flag doMatchMode + ')' n term doSetMatchMode + ':' n term ^expr-quant doMatchModeParen + default errorDeath doBadModeFlag + +# +# named-capture (?<name> ... ), position currently on the name. 
+# +named-capture: + ascii_letter n named-capture doContinueNamedCapture + digit_char n named-capture doContinueNamedCapture + '>' n term ^expr-quant doOpenCaptureParen # common with non-named capture. + default errorDeath doBadNamedCapture + +# +# quant-star Scanning a '*' quantifier. Need to look ahead to decide +# between plain '*', '*?', '*+' +# +quant-star: + '?' n expr-cont doNGStar # *? + '+' n expr-cont doPossessiveStar # *+ + default expr-cont doStar + + +# +# quant-plus Scanning a '+' quantifier. Need to look ahead to decide +# between plain '+', '+?', '++' +# +quant-plus: + '?' n expr-cont doNGPlus # +? + '+' n expr-cont doPossessivePlus # ++ + default expr-cont doPlus + + +# +# quant-opt Scanning a '?' quantifier. Need to look ahead to decide +# between plain '?', '??', '?+' +# +quant-opt: + '?' n expr-cont doNGOpt # ?? + '+' n expr-cont doPossessiveOpt # ?+ + default expr-cont doOpt # ? + + +# +# Interval scanning a '{', the opening delimiter for an interval specification +# {number} or {min, max} or {min,} +# +interval-open: + digit_char interval-lower + default errorDeath doIntervalError + +interval-lower: + digit_char n interval-lower doIntevalLowerDigit + ',' n interval-upper + '}' n interval-type doIntervalSame # {n} + default errorDeath doIntervalError + +interval-upper: + digit_char n interval-upper doIntervalUpperDigit + '}' n interval-type + default errorDeath doIntervalError + +interval-type: + '?' n expr-cont doNGInterval # {n,m}? + '+' n expr-cont doPossessiveInterval # {n,m}+ + default expr-cont doInterval # {n,m} + + +# +# backslash # Backslash. Figure out which of the \thingies we have encountered. +# The low level next-char function will have preprocessed +# some of them already; those won't come here. 
+backslash: + 'A' n term doBackslashA + 'B' n term doBackslashB + 'b' n term doBackslashb + 'd' n expr-quant doBackslashd + 'D' n expr-quant doBackslashD + 'G' n term doBackslashG + 'h' n expr-quant doBackslashh + 'H' n expr-quant doBackslashH + 'k' n named-backref + 'N' expr-quant doNamedChar # \N{NAME} named char + 'p' expr-quant doProperty # \p{Lu} style property + 'P' expr-quant doProperty + 'R' n expr-quant doBackslashR + 'Q' n term doEnterQuoteMode + 'S' n expr-quant doBackslashS + 's' n expr-quant doBackslashs + 'v' n expr-quant doBackslashv + 'V' n expr-quant doBackslashV + 'W' n expr-quant doBackslashW + 'w' n expr-quant doBackslashw + 'X' n expr-quant doBackslashX + 'Z' n term doBackslashZ + 'z' n term doBackslashz + digit_char n expr-quant doBackRef # Will scan multiple digits + eof errorDeath doEscapeError + default n expr-quant doEscapedLiteralChar + + +# named-backref Scanned \k +# Leading to \k<captureName> +# Failure to get the full sequence is an error. +# +named-backref: + '<' n named-backref-2 doBeginNamedBackRef + default errorDeath doBadNamedCapture + +named-backref-2: + ascii_letter n named-backref-3 doContinueNamedBackRef + default errorDeath doBadNamedCapture + +named-backref-3: + ascii_letter n named-backref-3 doContinueNamedBackRef + digit_char n named-backref-3 doContinueNamedBackRef + '>' n expr-quant doCompleteNamedBackRef + default errorDeath doBadNamedCapture + + +# +# [set expression] parsing, +# All states involved in parsing set expressions have names beginning with "set-" +# + +set-open: + '^' n set-open2 doSetNegate + ':' set-posix doSetPosixProp + default set-open2 + +set-open2: + ']' n set-after-lit doSetLiteral + default set-start + +# set-posix: +# scanned a '[:' If it really is a [:property:], doSetPosixProp will have +# moved the scan to the closing ']'. If it wasn't a property +# expression, the scan will still be at the opening ':', which should +# be interpreted as a normal set expression. 
+set-posix: + ']' n pop doSetEnd + ':' set-start + default errorDeath doRuleError # should not be possible. + +# +# set-start after the [ and special case leading characters (^ and/or ]) but before +# everything else. A '-' is literal at this point. +# +set-start: + ']' n pop doSetEnd + '[' n set-open ^set-after-set doSetBeginUnion + '\' n set-escape + '-' n set-start-dash + '&' n set-start-amp + default n set-after-lit doSetLiteral + +# set-start-dash Turn "[--" into a syntax error. +# "[-x" is good, - and x are literals. +# +set-start-dash: + '-' errorDeath doRuleError + default set-after-lit doSetAddDash + +# set-start-amp Turn "[&&" into a syntax error. +# "[&x" is good, & and x are literals. +# +set-start-amp: + '&' errorDeath doRuleError + default set-after-lit doSetAddAmp + +# +# set-after-lit The last thing scanned was a literal character within a set. +# Can be followed by anything. Single '-' or '&' are +# literals in this context, not operators. +set-after-lit: + ']' n pop doSetEnd + '[' n set-open ^set-after-set doSetBeginUnion + '-' n set-lit-dash + '&' n set-lit-amp + '\' n set-escape + eof errorDeath doSetNoCloseError + default n set-after-lit doSetLiteral + +set-after-set: + ']' n pop doSetEnd + '[' n set-open ^set-after-set doSetBeginUnion + '-' n set-set-dash + '&' n set-set-amp + '\' n set-escape + eof errorDeath doSetNoCloseError + default n set-after-lit doSetLiteral + +set-after-range: + ']' n pop doSetEnd + '[' n set-open ^set-after-set doSetBeginUnion + '-' n set-range-dash + '&' n set-range-amp + '\' n set-escape + eof errorDeath doSetNoCloseError + default n set-after-lit doSetLiteral + + +# set-after-op +# After a -- or && +# It is an error to close a set at this point. +# +set-after-op: + '[' n set-open ^set-after-set doSetBeginUnion + ']' errorDeath doSetOpError + '\' n set-escape + default n set-after-lit doSetLiteral + +# +# set-set-amp +# Have scanned [[set]& +# Could be a '&' intersection operator, if a set follows. 
+# Could be the start of a '&&' operator. +# Otherwise is a literal. +set-set-amp: + '[' n set-open ^set-after-set doSetBeginIntersection1 + '&' n set-after-op doSetIntersection2 + default set-after-lit doSetAddAmp + + +# set-lit-amp Have scanned "[literals&" +# Could be a start of "&&" operator or a literal +# In [abc&[def]], the '&' is a literal +# +set-lit-amp: + '&' n set-after-op doSetIntersection2 + default set-after-lit doSetAddAmp + + +# +# set-set-dash +# Have scanned [set]- +# Could be a '-' difference operator, if a [set] follows. +# Could be the start of a '--' operator. +# Otherwise is a literal. +set-set-dash: + '[' n set-open ^set-after-set doSetBeginDifference1 + '-' n set-after-op doSetDifference2 + default set-after-lit doSetAddDash + + +# +# set-range-dash +# scanned a-b- or \w- +# any set or range like item where the trailing single '-' should +# be literal, not a set difference operation. +# A trailing "--" is still a difference operator. +set-range-dash: + '-' n set-after-op doSetDifference2 + default set-after-lit doSetAddDash + + +set-range-amp: + '&' n set-after-op doSetIntersection2 + default set-after-lit doSetAddAmp + + +# set-lit-dash +# Have scanned "[literals-" Could be a range or a -- operator or a literal +# In [abc-[def]], the '-' is a literal (confirmed with a Java test) +# [abc-\p{xx} the '-' is an error +# [abc-] the '-' is a literal +# [ab-xy] the '-' is a range +# +set-lit-dash: + '-' n set-after-op doSetDifference2 + '[' set-after-lit doSetAddDash + ']' set-after-lit doSetAddDash + '\' n set-lit-dash-escape + default n set-after-range doSetRange + +# set-lit-dash-escape +# +# scanned "[literal-\" +# Could be a range, if the \ introduces an escaped literal char or a named char. +# Otherwise it is an error. 
+# +set-lit-dash-escape: + 's' errorDeath doSetOpError + 'S' errorDeath doSetOpError + 'w' errorDeath doSetOpError + 'W' errorDeath doSetOpError + 'd' errorDeath doSetOpError + 'D' errorDeath doSetOpError + 'N' set-after-range doSetNamedRange + default n set-after-range doSetRange + + +# +# set-escape +# Common back-slash escape processing within set expressions +# +set-escape: + 'p' set-after-set doSetProp + 'P' set-after-set doSetProp + 'N' set-after-lit doSetNamedChar + 's' n set-after-range doSetBackslash_s + 'S' n set-after-range doSetBackslash_S + 'w' n set-after-range doSetBackslash_w + 'W' n set-after-range doSetBackslash_W + 'd' n set-after-range doSetBackslash_d + 'D' n set-after-range doSetBackslash_D + 'h' n set-after-range doSetBackslash_h + 'H' n set-after-range doSetBackslash_H + 'v' n set-after-range doSetBackslash_v + 'V' n set-after-range doSetBackslash_V + default n set-after-lit doSetLiteralEscaped + +# +# set-finish +# Have just encountered the final ']' that completes a [set], and +# arrived here via a pop. From here, we exit the set parsing world, and go +# back to generic regular expression parsing. +# +set-finish: + default expr-quant doSetFinish + + +# +# errorDeath. This state is specified as the next state whenever a syntax error +# in the pattern is detected. Barring bugs, the state machine will never +# actually get here, but will stop because of the action associated with the error. +# But, just in case, this state asks the state machine to exit. +errorDeath: + default n errorDeath doExit + + |