summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/i18n/regexcst.txt
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/i18n/regexcst.txt')
-rw-r--r--intl/icu/source/i18n/regexcst.txt505
1 files changed, 505 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/regexcst.txt b/intl/icu/source/i18n/regexcst.txt
new file mode 100644
index 0000000000..d69a7ea369
--- /dev/null
+++ b/intl/icu/source/i18n/regexcst.txt
@@ -0,0 +1,505 @@
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+#*****************************************************************************
+#
+# Copyright (C) 2002-2015, International Business Machines Corporation and others.
+# All Rights Reserved.
+#
+#*****************************************************************************
+#
+# file: regexcst.txt
+# ICU Regular Expression Parser State Table
+#
+# This state table is used when reading and parsing a regular expression pattern
+# The pattern parser uses a state machine; the data in this file define the
+# state transitions that occur for each input character.
+#
+# *** This file defines the regex pattern grammar. This is it.
+# *** The determination of what is accepted is here.
+#
+# This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
+# that are then built with the rule parser.
+#
+
+#
+# Here is the syntax of the state definitions in this file:
+#
+#
+#StateName:
+# input-char n next-state ^push-state action
+# input-char n next-state ^push-state action
+# | | | | |
+# | | | | |--- action to be performed by state machine
+# | | | | See function RegexCompile::doParseActions()
+# | | | |
+# | | | |--- Push this named state onto the state stack.
+# | | | Later, when next state is specified as "pop",
+# | | | the pushed state will become the current state.
+# | | |
+# | | |--- Transition to this state if the current input character matches the input
+# | | character or char class in the left hand column. "pop" causes the next
+# | | state to be popped from the state stack.
+# | |
+# | |--- When making the state transition specified on this line, advance to the next
+# | character from the input only if 'n' appears here.
+# |
+# |--- Character or named character classes to test for. If the current character being scanned
+# matches, perform the actions and go to the state specified on this line.
+# The input character is tested sequentially, in the order written. The characters and
+# character classes tested for do not need to be mutually exclusive. The first match wins.
+#
+
+
+
+
+#
+# start Initial state; the scan position is at the beginning of the pattern.
+# Whatever the first input is, run doPatStart without consuming it
+# (no 'n' on the row) and continue in term.
+#
+start:
+ default term doPatStart
+
+
+
+
+#
+# term. At a position where we can accept the start of most items in a pattern.
+# quoted / rule_char input, '.', '^' and '$' continue in expr-quant;
+# '[', '(' and '\' branch to set-open, open-paren and backslash.
+# Any input not listed runs doRuleError and ends in errorDeath.
+#
+term:
+ quoted n expr-quant doLiteralChar
+ rule_char n expr-quant doLiteralChar
+ '[' n set-open ^set-finish doSetBegin
+ '(' n open-paren
+ '.' n expr-quant doDotAny
+ '^' n expr-quant doCaret
+ '$' n expr-quant doDollar
+ '\' n backslash
+ '|' n term doOrOperator
+ ')' n pop doCloseParen
+ eof term doPatFinish
+ default errorDeath doRuleError
+
+
+
+#
+# expr-quant We've just finished scanning a term, now look for the optional
+# trailing quantifier - *, +, ?, *?, etc.
+# '{' begins an interval and '(' is routed through open-paren-quant;
+# anything else is left unconsumed for expr-cont.
+#
+expr-quant:
+ '*' n quant-star
+ '+' n quant-plus
+ '?' n quant-opt
+ '{' n interval-open doIntervalInit
+ '(' n open-paren-quant
+ default expr-cont
+
+
+#
+# expr-cont Expression, continuation. At a point where additional terms are
+# allowed, but not required. No Quantifiers
+# Only '|' and ')' are consumed here; any other input is passed,
+# unconsumed, to term.
+#
+expr-cont:
+ '|' n term doOrOperator
+ ')' n pop doCloseParen
+ default term
+
+
+#
+# open-paren-quant Special case handling for comments appearing before a quantifier,
+# e.g. x(?#comment )*
+# Open parens from expr-quant come here; anything but a (?# comment
+# branches into the normal parenthesis sequence as quickly as possible.
+#
+open-paren-quant:
+ '?' n open-paren-quant2 doSuppressComments
+ default open-paren
+
+#
+# open-paren-quant2 Have scanned "(?" following a term. For a "(?#" comment, expr-quant is
+# pushed so that the pop at the end of the comment resumes the search
+# for a quantifier. Anything else is left for open-paren-extended.
+#
+open-paren-quant2:
+ '#' n paren-comment ^expr-quant
+ default open-paren-extended
+
+
+#
+# open-paren We've got an open paren. We need to scan further to
+# determine what kind of group it is - plain (, (?:, (?>, or whatever.
+# Without a following '?', doOpenCaptureParen runs, expr-quant is pushed
+# for the eventual pop, and scanning continues in term.
+#
+open-paren:
+ '?' n open-paren-extended doSuppressComments
+ default term ^expr-quant doOpenCaptureParen
+
+#
+# open-paren-extended Have scanned "(?". The next character selects the kind of group.
+# The flag letters and '-' are not consumed here; paren-flag re-reads them.
+# "(?(" and "(?{" end in errorDeath via doConditionalExpr and doPerlInline.
+#
+open-paren-extended:
+ ':' n term ^expr-quant doOpenNonCaptureParen # (?:
+ '>' n term ^expr-quant doOpenAtomicParen # (?>
+ '=' n term ^expr-cont doOpenLookAhead # (?=
+ '!' n term ^expr-cont doOpenLookAheadNeg # (?!
+ '<' n open-paren-lookbehind
+ '#' n paren-comment ^term
+ 'i' paren-flag doBeginMatchMode
+ 'd' paren-flag doBeginMatchMode
+ 'm' paren-flag doBeginMatchMode
+ 's' paren-flag doBeginMatchMode
+ 'u' paren-flag doBeginMatchMode
+ 'w' paren-flag doBeginMatchMode
+ 'x' paren-flag doBeginMatchMode
+ '-' paren-flag doBeginMatchMode
+ '(' n errorDeath doConditionalExpr
+ '{' n errorDeath doPerlInline
+ default errorDeath doBadOpenParenType
+
+#
+# open-paren-lookbehind Have scanned "(?<". '=' or '!' opens a look-behind group; an ASCII
+# letter (left unconsumed) begins the name of a named capture group.
+#
+open-paren-lookbehind:
+ '=' n term ^expr-cont doOpenLookBehind # (?<=
+ '!' n term ^expr-cont doOpenLookBehindNeg # (?<!
+ ascii_letter named-capture doBeginNamedCapture # (?<name
+ default errorDeath doBadOpenParenType
+
+
+#
+# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
+# The ')' pops to the state that was pushed on entry (expr-quant or term).
+# Reaching eof before the ')' runs doMismatchedParenErr.
+#
+paren-comment:
+ ')' n pop
+ eof errorDeath doMismatchedParenErr
+ default n paren-comment
+
+#
+# paren-flag Scanned a (?ismx-ismx flag setting
+# ')' ends a bare flag setting, e.g. "(?i)", and returns to term;
+# ':' opens a flagged group, e.g. "(?i: ... )", pushing expr-quant.
+# Any character other than the rows listed runs doBadModeFlag.
+#
+paren-flag:
+ 'i' n paren-flag doMatchMode
+ 'd' n paren-flag doMatchMode
+ 'm' n paren-flag doMatchMode
+ 's' n paren-flag doMatchMode
+ 'u' n paren-flag doMatchMode
+ 'w' n paren-flag doMatchMode
+ 'x' n paren-flag doMatchMode
+ '-' n paren-flag doMatchMode
+ ')' n term doSetMatchMode
+ ':' n term ^expr-quant doMatchModeParen
+ default errorDeath doBadModeFlag
+
+#
+# named-capture (?<name> ... ), position currently on the name.
+# Name characters are ASCII letters and digits; '>' ends the name.
+# The first character has already been checked to be an ASCII letter
+# by open-paren-lookbehind, which does not consume it.
+#
+named-capture:
+ ascii_letter n named-capture doContinueNamedCapture
+ digit_char n named-capture doContinueNamedCapture
+ '>' n term ^expr-quant doOpenCaptureParen # common w non-named capture.
+ default errorDeath doBadNamedCapture
+
+#
+# quant-star Scanning a '*' quantifier. Need to look ahead to decide
+# between plain '*', '*?', '*+'
+# For a plain '*' the look-ahead character is not consumed.
+#
+quant-star:
+ '?' n expr-cont doNGStar # *?
+ '+' n expr-cont doPossessiveStar # *+
+ default expr-cont doStar # *
+
+
+#
+# quant-plus Scanning a '+' quantifier. Need to look ahead to decide
+# between plain '+', '+?', '++'
+# For a plain '+' the look-ahead character is not consumed.
+#
+quant-plus:
+ '?' n expr-cont doNGPlus # +?
+ '+' n expr-cont doPossessivePlus # ++
+ default expr-cont doPlus # +
+
+
+#
+# quant-opt Scanning a '?' quantifier. Need to look ahead to decide
+# between plain '?', '??', '?+'
+# For a plain '?' the look-ahead character is not consumed.
+#
+quant-opt:
+ '?' n expr-cont doNGOpt # ??
+ '+' n expr-cont doPossessiveOpt # ?+
+ default expr-cont doOpt # ?
+
+
+#
+# Interval scanning a '{', the opening delimiter for an interval specification
+# {number} or {min, max} or {min,}
+# interval-open requires a digit right after the '{'; the digit itself is
+# left unconsumed for interval-lower.
+#
+interval-open:
+ digit_char interval-lower
+ default errorDeath doIntervalError
+
+#
+# interval-lower Scanning the digits of the first number of an interval.
+# ',' moves on to the upper bound; '}' ends a single-number interval.
+# NOTE(review): "doIntevalLowerDigit" is spelled without the 'r'; presumably
+# it must match the action's definition elsewhere - do not "fix" it here alone.
+#
+interval-lower:
+ digit_char n interval-lower doIntevalLowerDigit
+ ',' n interval-upper
+ '}' n interval-type doIntervalSame # {n}
+ default errorDeath doIntervalError
+
+#
+# interval-upper Have scanned "{min,". Zero or more digits of the upper bound may follow,
+# then the closing '}'.
+#
+interval-upper:
+ digit_char n interval-upper doIntervalUpperDigit
+ '}' n interval-type
+ default errorDeath doIntervalError
+
+#
+# interval-type Have scanned the closing '}' of an interval. Look ahead for a trailing
+# '?' or '+' modifier; for a plain interval the look-ahead character
+# is not consumed.
+#
+interval-type:
+ '?' n expr-cont doNGInterval # {n,m}?
+ '+' n expr-cont doPossessiveInterval # {n,m}+
+ default expr-cont doInterval # {n,m}
+
+
+#
+# backslash Backslash. Figure out which of the \thingies we have encountered.
+# The low level next-char function will have preprocessed
+# some of them already; those won't come here.
+# The rows for 'N', 'p' and 'P' do not consume the letter.
+# eof directly after the backslash runs doEscapeError; any character
+# not listed is taken as an escaped literal.
+#
+backslash:
+ 'A' n term doBackslashA
+ 'B' n term doBackslashB
+ 'b' n term doBackslashb
+ 'd' n expr-quant doBackslashd
+ 'D' n expr-quant doBackslashD
+ 'G' n term doBackslashG
+ 'h' n expr-quant doBackslashh
+ 'H' n expr-quant doBackslashH
+ 'k' n named-backref
+ 'N' expr-quant doNamedChar # \N{NAME} named char
+ 'p' expr-quant doProperty # \p{Lu} style property
+ 'P' expr-quant doProperty
+ 'R' n expr-quant doBackslashR
+ 'Q' n term doEnterQuoteMode
+ 'S' n expr-quant doBackslashS
+ 's' n expr-quant doBackslashs
+ 'v' n expr-quant doBackslashv
+ 'V' n expr-quant doBackslashV
+ 'W' n expr-quant doBackslashW
+ 'w' n expr-quant doBackslashw
+ 'X' n expr-quant doBackslashX
+ 'Z' n term doBackslashZ
+ 'z' n term doBackslashz
+ digit_char n expr-quant doBackRef # Will scan multiple digits
+ eof errorDeath doEscapeError
+ default n expr-quant doEscapedLiteralChar
+
+
+# named-backref Scanned \k
+# Leading to \k<captureName>
+# Failure to get the full sequence is an error.
+# The '<' must follow immediately.
+#
+named-backref:
+ '<' n named-backref-2 doBeginNamedBackRef
+ default errorDeath doBadNamedCapture
+
+#
+# named-backref-2 Scanned "\k<". The first character of the name must be an ASCII letter.
+#
+named-backref-2:
+ ascii_letter n named-backref-3 doContinueNamedBackRef
+ default errorDeath doBadNamedCapture
+
+#
+# named-backref-3 Scanned "\k<" plus at least one name character. Further ASCII letters or
+# digits extend the name; '>' completes the back reference.
+#
+named-backref-3:
+ ascii_letter n named-backref-3 doContinueNamedBackRef
+ digit_char n named-backref-3 doContinueNamedBackRef
+ '>' n expr-quant doCompleteNamedBackRef
+ default errorDeath doBadNamedCapture
+
+
+#
+# [set expression] parsing,
+# All states involved in parsing set expressions have names beginning with "set-"
+#
+
+#
+# set-open Have scanned the opening '[' of a set. A leading '^' is consumed and
+# runs doSetNegate; a leading ':' is left unconsumed and handed to
+# set-posix after doSetPosixProp has run.
+#
+set-open:
+ '^' n set-open2 doSetNegate
+ ':' set-posix doSetPosixProp
+ default set-open2
+
+#
+# set-open2 After '[' or '[^'. A ']' in this first position is added as a literal
+# rather than closing the set.
+#
+set-open2:
+ ']' n set-after-lit doSetLiteral
+ default set-start
+
+# set-posix:
+# Scanned a '[:'. If it really is a [:property:], doSetPosixProp will have
+# moved the scan to the closing ']'. If it wasn't a property
+# expression, the scan will still be at the opening ':', which should
+# be interpreted as a normal set expression (the ':' is left unconsumed
+# for set-start).
+set-posix:
+ ']' n pop doSetEnd
+ ':' set-start
+ default errorDeath doRuleError # should not be possible.
+
+#
+# set-start after the [ and special case leading characters (^ and/or ]) but before
+# everything else. A '-' is literal at this point.
+# NOTE(review): unlike set-after-lit there is no eof row here, so eof
+# would take the default row - confirm this is intended.
+#
+set-start:
+ ']' n pop doSetEnd
+ '[' n set-open ^set-after-set doSetBeginUnion
+ '\' n set-escape
+ '-' n set-start-dash
+ '&' n set-start-amp
+ default n set-after-lit doSetLiteral
+
+# set-start-dash Turn "[--" into a syntax error.
+# "[-x" is good, - and x are literals.
+# Neither row consumes input: the default row runs doSetAddDash
+# and leaves the following character for set-after-lit.
+#
+set-start-dash:
+ '-' errorDeath doRuleError
+ default set-after-lit doSetAddDash
+
+# set-start-amp Turn "[&&" into a syntax error.
+# "[&x" is good, & and x are literals.
+# Neither row consumes input: the default row runs doSetAddAmp
+# and leaves the following character for set-after-lit.
+#
+set-start-amp:
+ '&' errorDeath doRuleError
+ default set-after-lit doSetAddAmp
+
+#
+# set-after-lit The last thing scanned was a literal character within a set.
+# Can be followed by anything. Single '-' or '&' are
+# literals in this context, not operators.
+# eof before the closing ']' runs doSetNoCloseError.
+#
+set-after-lit:
+ ']' n pop doSetEnd
+ '[' n set-open ^set-after-set doSetBeginUnion
+ '-' n set-lit-dash
+ '&' n set-lit-amp
+ '\' n set-escape
+ eof errorDeath doSetNoCloseError
+ default n set-after-lit doSetLiteral
+
+#
+# set-after-set The last thing scanned was a complete nested [set] or a \p / \P property.
+# A following '-' or '&' goes to set-set-dash / set-set-amp, where it
+# may turn out to be a set operator.
+#
+set-after-set:
+ ']' n pop doSetEnd
+ '[' n set-open ^set-after-set doSetBeginUnion
+ '-' n set-set-dash
+ '&' n set-set-amp
+ '\' n set-escape
+ eof errorDeath doSetNoCloseError
+ default n set-after-lit doSetLiteral
+
+#
+# set-after-range The last thing scanned was a range such as a-b, or a class escape such as \w.
+# A following '-' or '&' goes to set-range-dash / set-range-amp.
+#
+set-after-range:
+ ']' n pop doSetEnd
+ '[' n set-open ^set-after-set doSetBeginUnion
+ '-' n set-range-dash
+ '&' n set-range-amp
+ '\' n set-escape
+ eof errorDeath doSetNoCloseError
+ default n set-after-lit doSetLiteral
+
+
+# set-after-op
+# After a -- or &&
+# It is an error to close a set at this point: ']' runs doSetOpError.
+# An operand (nested set, escape or literal) must follow.
+#
+set-after-op:
+ '[' n set-open ^set-after-set doSetBeginUnion
+ ']' errorDeath doSetOpError
+ '\' n set-escape
+ default n set-after-lit doSetLiteral
+
+#
+# set-set-amp
+# Have scanned [[set]&
+# Could be a '&' intersection operator, if a set follows.
+# Could be the start of a '&&' operator.
+# Otherwise is a literal (the following character is not consumed).
+set-set-amp:
+ '[' n set-open ^set-after-set doSetBeginIntersection1
+ '&' n set-after-op doSetIntersection2
+ default set-after-lit doSetAddAmp
+
+
+# set-lit-amp Have scanned "[literals&"
+# Could be a start of "&&" operator or a literal
+# In [abc&[def]], the '&' is a literal
+# In the literal case the following character is not consumed.
+#
+set-lit-amp:
+ '&' n set-after-op doSetIntersection2
+ default set-after-lit doSetAddAmp
+
+
+#
+# set-set-dash
+# Have scanned [set]-
+# Could be a '-' difference operator, if a [set] follows.
+# Could be the start of a '--' operator.
+# Otherwise is a literal (the following character is not consumed).
+set-set-dash:
+ '[' n set-open ^set-after-set doSetBeginDifference1
+ '-' n set-after-op doSetDifference2
+ default set-after-lit doSetAddDash
+
+
+#
+# set-range-dash
+# scanned a-b- or \w-
+# any set or range like item where the trailing single '-' should
+# be literal, not a set difference operation.
+# A trailing "--" is still a difference operator.
+# In the literal case the following character is not consumed.
+set-range-dash:
+ '-' n set-after-op doSetDifference2
+ default set-after-lit doSetAddDash
+
+
+#
+# set-range-amp
+# scanned a-b& or \w&
+# A second '&' runs doSetIntersection2; otherwise the single '&' is added
+# by doSetAddAmp and the following character is not consumed.
+set-range-amp:
+ '&' n set-after-op doSetIntersection2
+ default set-after-lit doSetAddAmp
+
+
+# set-lit-dash
+# Have scanned "[literals-" Could be a range or a -- operator or a literal
+# In [abc-[def]], the '-' is a literal (confirmed with a Java test)
+# [abc-\p{xx} the '-' is an error
+# [abc-] the '-' is a literal
+# [ab-xy] the '-' is a range
+# The '[' and ']' rows do not consume: the '-' is added and the bracket
+# is then handled by set-after-lit.
+#
+set-lit-dash:
+ '-' n set-after-op doSetDifference2
+ '[' set-after-lit doSetAddDash
+ ']' set-after-lit doSetAddDash
+ '\' n set-lit-dash-escape
+ default n set-after-range doSetRange
+
+# set-lit-dash-escape
+#
+# scanned "[literal-\"
+# Could be a range, if the \ introduces an escaped literal char or a named char.
+# Otherwise it is an error.
+# NOTE(review): 'p', 'P', 'h', 'H', 'v' and 'V' have no row here, although
+# set-escape treats them as properties / classes, so they take the
+# default row (doSetRange) - confirm this is intended.
+#
+set-lit-dash-escape:
+ 's' errorDeath doSetOpError
+ 'S' errorDeath doSetOpError
+ 'w' errorDeath doSetOpError
+ 'W' errorDeath doSetOpError
+ 'd' errorDeath doSetOpError
+ 'D' errorDeath doSetOpError
+ 'N' set-after-range doSetNamedRange
+ default n set-after-range doSetRange
+
+
+#
+# set-escape
+# Common back-slash escape processing within set expressions
+# The rows for 'p', 'P' and 'N' do not consume the letter.
+# Properties continue in set-after-set, class escapes (\s \w \d \h \v and
+# their upper-case forms) in set-after-range, everything else in set-after-lit.
+#
+set-escape:
+ 'p' set-after-set doSetProp
+ 'P' set-after-set doSetProp
+ 'N' set-after-lit doSetNamedChar
+ 's' n set-after-range doSetBackslash_s
+ 'S' n set-after-range doSetBackslash_S
+ 'w' n set-after-range doSetBackslash_w
+ 'W' n set-after-range doSetBackslash_W
+ 'd' n set-after-range doSetBackslash_d
+ 'D' n set-after-range doSetBackslash_D
+ 'h' n set-after-range doSetBackslash_h
+ 'H' n set-after-range doSetBackslash_H
+ 'v' n set-after-range doSetBackslash_v
+ 'V' n set-after-range doSetBackslash_V
+ default n set-after-lit doSetLiteralEscaped
+
+#
+# set-finish
+# Have just encountered the final ']' that completes a [set], and
+# arrived here via a pop. From here, we exit the set parsing world, and go
+# back to generic regular expression parsing.
+# No input is consumed; doSetFinish runs and scanning resumes in expr-quant.
+#
+set-finish:
+ default expr-quant doSetFinish
+
+
+#
+# errorDeath. This state is specified as the next state whenever a syntax error
+# in the pattern is detected. Barring bugs, the state machine will never
+# actually get here, but will stop because of the action associated with the error.
+# But, just in case, this state asks the state machine to exit.
+errorDeath:
+ default n errorDeath doExit
+
+