summary refs log tree commit diff stats
path: root/src/backend/regex
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/regex')
-rw-r--r-- src/backend/regex/COPYRIGHT 84
-rw-r--r-- src/backend/regex/Makefile 29
-rw-r--r-- src/backend/regex/README 473
-rw-r--r-- src/backend/regex/regc_color.c 1186
-rw-r--r-- src/backend/regex/regc_cvec.c 138
-rw-r--r-- src/backend/regex/regc_lex.c 1044
-rw-r--r-- src/backend/regex/regc_locale.c 771
-rw-r--r-- src/backend/regex/regc_nfa.c 3882
-rw-r--r-- src/backend/regex/regc_pg_locale.c 940
-rw-r--r-- src/backend/regex/regcomp.c 2622
-rw-r--r-- src/backend/regex/rege_dfa.c 1106
-rw-r--r-- src/backend/regex/regerror.c 120
-rw-r--r-- src/backend/regex/regexec.c 1506
-rw-r--r-- src/backend/regex/regexport.c 293
-rw-r--r-- src/backend/regex/regfree.c 54
-rw-r--r-- src/backend/regex/regprefix.c 268
16 files changed, 14516 insertions, 0 deletions
diff --git a/src/backend/regex/COPYRIGHT b/src/backend/regex/COPYRIGHT
new file mode 100644
index 0000000..e50cfb1
--- /dev/null
+++ b/src/backend/regex/COPYRIGHT
@@ -0,0 +1,84 @@
+This regular expression package was originally developed by Henry Spencer.
+It bears the following copyright notice:
+
+**********************************************************************
+
+Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
+
+Development of this software was funded, in part, by Cray Research Inc.,
+UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+Corporation, none of whom are responsible for the results. The author
+thanks all of them.
+
+Redistribution and use in source and binary forms -- with or without
+modification -- are permitted for any purpose, provided that
+redistributions in source form retain this entire copyright notice and
+indicate the origin and nature of any modifications.
+
+I'd appreciate being given credit for this package in the documentation
+of software which uses it, but that is not a requirement.
+
+THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************
+
+PostgreSQL adopted the code out of Tcl 8.4.1. Portions of regc_locale.c
+and re_syntax.n were developed by Tcl developers other than Henry; these
+files bear the Tcl copyright and license notice:
+
+**********************************************************************
+
+This software is copyrighted by the Regents of the University of
+California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
+Corporation and other parties. The following terms apply to all files
+associated with the software unless explicitly disclaimed in
+individual files.
+
+The authors hereby grant permission to use, copy, modify, distribute,
+and license this software and its documentation for any purpose, provided
+that existing copyright notices are retained in all copies and that this
+notice is included verbatim in any distributions. No written agreement,
+license, or royalty fee is required for any of the authorized uses.
+Modifications to this software may be copyrighted by their authors
+and need not follow the licensing terms described here, provided that
+the new terms are clearly indicated on the first page of each file where
+they apply.
+
+IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
+FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
+DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
+IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
+NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
+MODIFICATIONS.
+
+GOVERNMENT USE: If you are acquiring this software on behalf of the
+U.S. government, the Government shall have only "Restricted Rights"
+in the software and related documentation as defined in the Federal
+Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
+are acquiring the software on behalf of the Department of Defense, the
+software shall be classified as "Commercial Computer Software" and the
+Government shall have only "Restricted Rights" as defined in Clause
+252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
+authors grant the U.S. Government and others acting in its behalf
+permission to use and distribute the software in accordance with the
+terms specified in this license.
+
+**********************************************************************
+
+Subsequent modifications to the code by the PostgreSQL project follow
+the same license terms as the rest of PostgreSQL.
diff --git a/src/backend/regex/Makefile b/src/backend/regex/Makefile
new file mode 100644
index 0000000..5210c16
--- /dev/null
+++ b/src/backend/regex/Makefile
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for backend/regex
+#
+# IDENTIFICATION
+# src/backend/regex/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/regex
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ regcomp.o \
+ regerror.o \
+ regexec.o \
+ regexport.o \
+ regfree.o \
+ regprefix.o
+
+include $(top_srcdir)/src/backend/common.mk
+
+# regcomp.c and regexec.c #include the other .c files listed below; mark those inclusion dependencies explicitly
+regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c \
+ regc_locale.c regc_pg_locale.c
+
+regexec.o: regexec.c rege_dfa.c
diff --git a/src/backend/regex/README b/src/backend/regex/README
new file mode 100644
index 0000000..930d8ce
--- /dev/null
+++ b/src/backend/regex/README
@@ -0,0 +1,473 @@
+Implementation notes about Henry Spencer's regex library
+========================================================
+
+If Henry ever had any internals documentation, he didn't publish it.
+So this file is an attempt to reverse-engineer some docs.
+
+General source-file layout
+--------------------------
+
+There are six separately-compilable source files, five of which expose
+exactly one exported function apiece:
+ regcomp.c: pg_regcomp
+ regexec.c: pg_regexec
+ regerror.c: pg_regerror
+ regfree.c: pg_regfree
+ regprefix.c: pg_regprefix
+(The pg_ prefixes were added by the Postgres project to distinguish this
+library version from any similar one that might be present on a particular
+system. They'd need to be removed or replaced in any standalone version
+of the library.)
+
+The sixth file, regexport.c, exposes multiple functions that allow extraction
+of info about a compiled regex (see regexport.h).
+
+There are additional source files regc_*.c that are #include'd in regcomp,
+and similarly additional source files rege_*.c that are #include'd in
+regexec. This was done to avoid exposing internal symbols globally;
+all functions not meant to be part of the library API are static.
+
+(Actually the above is a lie in one respect: there are two more global
+symbols, pg_set_regex_collation and pg_reg_getcolor in regcomp. These are
+not meant to be part of the API, but they have to be global because both
+regcomp and regexec call them. It'd be better to get rid of
+pg_set_regex_collation, as well as the static variables it sets, in favor of
+keeping the needed locale state in the regex structs. We have not done this
+yet for lack of a design for how to add application-specific state to the
+structs.)
+
+What's where in src/backend/regex/:
+
+regcomp.c Top-level regex compilation code
+regc_color.c Color map management
+regc_cvec.c Character vector (cvec) management
+regc_lex.c Lexer
+regc_nfa.c NFA handling
+regc_locale.c Application-specific locale code from Tcl project
+regc_pg_locale.c Postgres-added application-specific locale code
+regexec.c Top-level regex execution code
+rege_dfa.c DFA creation and execution
+regerror.c pg_regerror: generate text for a regex error code
+regfree.c pg_regfree: API to free a no-longer-needed regex_t
+regexport.c Functions for extracting info from a regex_t
+regprefix.c Code for extracting a common prefix from a regex_t
+
+The locale-specific code is concerned primarily with case-folding and with
+expanding locale-specific character classes, such as [[:alnum:]]. It
+really needs refactoring if this is ever to become a standalone library.
+
+The header files for the library are in src/include/regex/:
+
+regcustom.h Customizes library for particular application
+regerrs.h Error message list
+regex.h Exported API
+regexport.h Exported API for regexport.c
+regguts.h Internals declarations
+
+
+DFAs, NFAs, and all that
+------------------------
+
+This library is a hybrid DFA/NFA regex implementation. (If you've never
+heard either of those terms, get thee to a first-year comp sci textbook.)
+It might not be clear at first glance what that really means and how it
+relates to what you'll see in the code. Here's what really happens:
+
+* Initial parsing of a regex generates an NFA representation, with number
+of states approximately proportional to the length of the regexp.
+
+* The NFA is then optimized into a "compact NFA" representation, which is
+basically the same idea but without fields that are not going to be needed
+at runtime. It is simplified too: the compact format only allows "plain"
+and "LACON" arc types. The cNFA representation is what is passed from
+regcomp to regexec.
+
+* Unlike traditional NFA-based regex engines, we do not execute directly
+from the NFA representation, as that would require backtracking and so be
+very slow in some cases. Rather, we execute a DFA, which ideally can
+process an input string in linear time (O(M) for M characters of input)
+without backtracking. Each state of the DFA corresponds to a set of
+states of the NFA, that is all the states that the NFA might have been in
+upon reaching the current point in the input string. Therefore, an NFA
+with N states might require as many as 2^N states in the corresponding
+DFA, which could easily require unreasonable amounts of memory. We deal
+with this by materializing states of the DFA lazily (only when needed) and
+keeping them in a limited-size cache. The possible need to build the same
+state of the DFA repeatedly means this approach is not truly O(M) time;
+the worst case can be as much as O(M*N). That's still far better than the
+worst case for a backtracking NFA engine.
+
+If that were the end of it, we'd just say this is a DFA engine, with the
+use of NFAs being merely an implementation detail. However, a DFA engine
+cannot handle some important regex features such as capturing parens and
+back-references. If the parser finds that a regex uses these features
+(collectively called "messy cases" in the code), then we have to use
+NFA-style backtracking search after all.
+
+When using the NFA mode, the representation constructed by the parser
+consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are
+either plain regular expressions (which are executed as DFAs in the manner
+described above) or back-references (which try to match the input to some
+previous substring). Non-leaf nodes are capture nodes (which save the
+location of the substring currently matching their child node),
+concatenation, alternation, or iteration nodes. At execution time, the
+executor recursively scans the tree. At concatenation, alternation, or
+iteration nodes, it considers each possible alternative way of matching the
+input string, that is each place where the string could be split for a
+concatenation or iteration, or each child node for an alternation. It
+tries the next alternative if the match fails according to the child nodes.
+This is exactly the sort of backtracking search done by a traditional NFA
+regex engine. If there are many tree levels it can get very slow.
+
+But all is not lost: we can still be smarter than the average pure NFA
+engine. To do this, each subre node has an associated DFA, which
+represents what the node could possibly match insofar as a mathematically
+pure regex can describe that, which basically means "no backrefs".
+Before we perform any search of possible alternative sub-matches, we run
+the DFA to see if it thinks the proposed substring could possibly match.
+If not, we can reject the match immediately without iterating through many
+possibilities.
+
+As an example, consider the regex "(a[bc]+)\1". The compiled
+representation will have a top-level concatenation subre node. Its first
+child is a plain DFA node for "a[bc]+" (which is marked as being a capture
+node). The concatenation's second child is a backref node for \1.
+The DFA associated with the concatenation node will be "a[bc]+a[bc]+",
+where the backref has been replaced by a copy of the DFA for its referent
+expression. When executed, the concatenation node will have to search for
+a possible division of the input string that allows its two child nodes to
+each match their part of the string (and although this specific case can
+only succeed when the division is at the middle, the code does not know
+that, nor would it be true in general). However, we can first run the DFA
+and quickly reject any input that doesn't start with an "a" and contain
+one more "a" plus some number of b's and c's. If the DFA doesn't match,
+there is no need to recurse to the two child nodes for each possible
+string division point. In many cases, this prefiltering makes the search
+run much faster than a pure NFA engine could do. It is this behavior that
+justifies using the phrase "hybrid DFA/NFA engine" to describe Spencer's
+library.
+
+It's perhaps worth noting that separate capture subre nodes are a rarity:
+normally, we just mark a subre as capturing and that's it. However, it's
+legal to write a regex like "((x))" in which the same substring has to be
+captured by multiple sets of parentheses. Since a subre has room for only
+one "capno" field, a single subre can't handle that. We handle such cases
+by wrapping the base subre (which captures the innermost parens) in a
+no-op capture node, or even more than one for "(((x)))" etc. This is a
+little bit inefficient because we end up with multiple identical NFAs,
+but since the case is pointless and infrequent, it's not worth working
+harder.
+
+
+Colors and colormapping
+-----------------------
+
+In many common regex patterns, there are large numbers of characters that
+can be treated alike by the execution engine. A simple example is the
+pattern "[[:alpha:]][[:alnum:]]*" for an identifier. Basically the engine
+only needs to care whether an input symbol is a letter, a digit, or other.
+We could build the NFA or DFA with a separate arc for each possible letter
+and digit, but that's very wasteful of space and not so cheap to execute
+either, especially when dealing with Unicode which can have thousands of
+letters. Instead, the parser builds a "color map" that maps each possible
+input symbol to a "color", or equivalence class. The NFA or DFA
+representation then has arcs labeled with colors, not specific input
+symbols. At execution, the first thing the executor does with each input
+symbol is to look up its color in the color map, and then everything else
+works from the color only.
+
+To build the colormap, we start by assigning every possible input symbol
+the color WHITE, which means "other" (that is, at the end of parsing, the
+symbols that are still WHITE are those not explicitly referenced anywhere
+in the regex). When we see a simple literal character or a bracket
+expression in the regex, we want to assign that character, or all the
+characters represented by the bracket expression, a unique new color that
+can be used to label the NFA arc corresponding to the state transition for
+matching this character or bracket expression. The basic idea is:
+first, change the color assigned to a character to some new value;
+second, run through all the existing arcs in the partially-built NFA,
+and for each one referencing the character's old color, add a parallel
+arc referencing its new color (this keeps the reassignment from changing
+the semantics of what we already built); and third, add a new arc with
+the character's new color to the current pair of NFA states, denoting
+that seeing this character allows the state transition to be made.
+
+This is complicated a bit by not wanting to create more colors
+(equivalence classes) than absolutely necessary. In particular, if a
+bracket expression mentions two characters that had the same color before,
+they should still share the same color after we process the bracket, since
+there is still not a need to distinguish them. But we do need to
+distinguish them from other characters that previously had the same color
+yet are not listed in the bracket expression. To mechanize this, the code
+has a concept of "parent colors" and "subcolors", where a color's subcolor
+is the new color that we are giving to any characters of that color while
+parsing the current atom. (The word "parent" is a bit unfortunate here,
+because it suggests a long-lived relationship, but a subcolor link really
+only lasts for the duration of parsing a single atom.) In other words,
+a subcolor link means that we are in the process of splitting the parent color
+into two colors (equivalence classes), depending on whether or not each
+member character should be included by the current regex atom.
+
+As an example, suppose we have the regex "a\d\wx". Initially all possible
+character codes are labeled WHITE (color 0). To parse the atom "a", we
+create a new color (1), update "a"'s color map entry to 1, and create an
+arc labeled 1 between the first two states of the NFA. Now we see \d,
+which is really a bracket expression containing the digits "0"-"9".
+First we process "0", which is currently WHITE, so we create a new color
+(2), update "0"'s color map entry to 2, and create an arc labeled 2
+between the second and third states of the NFA. We also mark color WHITE
+as having the subcolor 2, which means that future relabelings of WHITE
+characters should also select 2 as the new color. Thus, when we process
+"1", we won't create a new color but re-use 2. We update "1"'s color map
+entry to 2, and then find that we don't need a new arc because there is
+already one labeled 2 between the second and third states of the NFA.
+Similarly for the other 8 digits, so there will be only one arc labeled 2
+between NFA states 2 and 3 for all members of this bracket expression.
+At completion of processing of the bracket expression, we call okcolors()
+which breaks all the existing parent/subcolor links; there is no longer a
+marker saying that WHITE characters should be relabeled 2. (Note:
+actually, we did the same creation and clearing of a subcolor link for the
+primitive atom "a", but it didn't do anything very interesting.) Now we
+come to the "\w" bracket expression, which for simplicity we assume expands
+to just "[a-z0-9]". We process "a", but observe that it is already the
+sole member of its color 1. This means there is no need to subdivide that
+equivalence class more finely, so we do not create any new color. We just
+make an arc labeled 1 between the third and fourth NFA states. Next we
+process "b", which is WHITE and far from the only WHITE character, so we
+create a new color (3), link that as WHITE's subcolor, relabel "b" as
+color 3, and make an arc labeled 3. As we process "c" through "z", each
+is relabeled from WHITE to 3, but no new arc is needed. Now we come to
+"0", which is not the only member of its color 2, so we suppose that a new
+color is needed and create color 4. We link 4 as subcolor of 2, relabel
+"0" as color 4 in the map, and add an arc for color 4. Next "1" through
+"9" are similarly relabeled as color 4, with no additional arcs needed.
+Having finished the bracket expression, we call okcolors(), which breaks
+the subcolor links. okcolors() further observes that we have removed
+every member of color 2 (the previous color of the digit characters).
+Therefore, it runs through the partial NFA built so far and relabels arcs
+labeled 2 to color 4; in particular the arc from NFA state 2 to state 3 is
+relabeled color 4. Then it frees up color 2, since we have no more use
+for that color. We now have an NFA in which transitions for digits are
+consistently labeled with color 4. Last, we come to the atom "x".
+"x" is currently labeled with color 3, and it's not the only member of
+that color, so we realize that we now need to distinguish "x" from other
+letters when we did not before. We create a new color, which might have
+been 5 but instead we recycle the unused color 2. "x" is relabeled 2 in
+the color map and 2 is linked as the subcolor of 3, and we add an arc for
+2 between states 4 and 5 of the NFA. Now we call okcolors(), which breaks
+the subcolor link between colors 3 and 2 and notices that both colors are
+nonempty. Therefore, it also runs through the existing NFA arcs and adds
+an additional arc labeled 2 wherever there is an arc labeled 3; this
+action ensures that characters of color 2 (i.e., "x") will still be
+considered as allowing any transitions they did before. We are now done
+parsing the regex, and we have these final color assignments:
+ color 1: "a"
+ color 2: "x"
+ color 3: other letters
+ color 4: digits
+and the NFA has these arcs:
+ states 1 -> 2 on color 1 (hence, "a" only)
+ states 2 -> 3 on color 4 (digits)
+ states 3 -> 4 on colors 1, 3, 4, and 2 (covering all \w characters)
+ states 4 -> 5 on color 2 ("x" only)
+which can be seen to be a correct representation of the regex.
+
+There is one more complexity, which is how to handle ".", that is a
+match-anything atom. We used to do that by generating a "rainbow"
+of arcs of all live colors between the two NFA states before and after
+the dot. That's expensive in itself when there are lots of colors,
+and it also typically adds lots of follow-on arc-splitting work for the
+color splitting logic. Now we handle this case by generating a single arc
+labeled with the special color RAINBOW, meaning all colors. Such arcs
+never need to be split, so they help keep NFAs small in this common case.
+(Note: this optimization doesn't help in REG_NLSTOP mode, where "." is
+not supposed to match newline. In that case we still handle "." by
+generating an almost-rainbow of all colors except newline's color.)
+
+Given this summary, we can see we need the following operations for
+colors:
+
+* A fast way to look up the current color assignment for any character
+ code. (This is needed during both parsing and execution, while the
+ remaining operations are needed only during parsing.)
+* A way to alter the color assignment for any given character code.
+* We must track the number of characters currently assigned to each
+ color, so that we can detect empty and singleton colors.
+* We must track all existing NFA arcs of a given color, so that we
+ can relabel them at need, or add parallel arcs of a new color when
+ an existing color has to be subdivided.
+
+The last two of these are handled with the "struct colordesc" array and
+the "colorchain" links in NFA arc structs.
+
+Ideally, we'd do the first two operations using a simple linear array
+storing the current color assignment for each character code.
+Unfortunately, that's not terribly workable for large charsets such as
+Unicode. Our solution is to divide the color map into two parts. A simple
+linear array is used for character codes up to MAX_SIMPLE_CHR, which can be
+chosen large enough to include all popular characters (so that the
+significantly-slower code paths about to be described are seldom invoked).
+Characters above that need be considered at compile time only if they
+appear explicitly in the regex pattern. We store each such mentioned
+character or character range as an entry in the "colormaprange" array in
+the colormap. (Overlapping ranges are split into unique subranges, so that
+each range in the finished list needs only a single color that describes
+all its characters.) When mapping a character above MAX_SIMPLE_CHR to a
+color at runtime, we search this list of ranges explicitly.
+
+That's still not quite enough, though, because of locale-dependent
+character classes such as [[:alpha:]]. In Unicode locales these classes
+may have thousands of entries that are above MAX_SIMPLE_CHR, and we
+certainly don't want to be searching large colormaprange arrays at runtime.
+Nor do we even want to spend the time to initialize cvec structures that
+exhaustively describe all of those characters. Our solution is to compute
+exact per-character colors at regex compile time only up to MAX_SIMPLE_CHR.
+For characters above that, we apply the <ctype.h> or <wctype.h> lookup
+functions at runtime for each locale-dependent character class used in the
+regex pattern, constructing a bitmap that describes which classes the
+runtime character belongs to. The per-character-range data structure
+mentioned above actually holds, for each range, a separate color entry
+for each possible combination of character class properties. That is,
+the color map for characters above MAX_SIMPLE_CHR is really a 2-D array,
+whose rows correspond to high characters or character ranges that are
+explicitly mentioned in the regex pattern, and whose columns correspond
+to sets of the locale-dependent character classes that are used in the
+regex.
+
+As an example, given the pattern '\w\u1234[\U0001D100-\U0001D1FF]'
+(and supposing that MAX_SIMPLE_CHR is less than 0x1234), we will need
+a high color map with three rows. One row is for the single character
+U+1234 (represented as a single-element range), one is for the range
+U+1D100..U+1D1FF, and the other row represents all remaining high
+characters. The color map has two columns, one for characters that
+satisfy iswalnum() and one for those that don't.
+
+We build this color map in parallel with scanning the regex. Each time
+we detect a new explicit high character (or range) or a locale-dependent
+character class, we split existing entries in the high color map so that
+characters we need to be able to distinguish will have distinct entries
+that can be given separate colors. Often, though, single entries in the
+high color map will represent very large sets of characters.
+
+If there are both explicit high characters/ranges and locale-dependent
+character classes, we may have entries in the high color map array that
+have non-WHITE colors but don't actually represent any real characters.
+(For example, in a row representing a singleton range, only one of the
+columns could possibly be a live entry; it's the one matching the actual
+locale properties for that single character.) We don't currently make
+any effort to reclaim such colors. In principle it could be done, but
+it's not clear that it's worth the trouble.
+
+
+Detailed semantics of an NFA
+----------------------------
+
+When trying to read dumped-out NFAs, it's helpful to know these facts:
+
+State 0 (additionally marked with "@" in dumpnfa's output) is always the
+goal state, and state 1 (additionally marked with ">") is the start state.
+(The code refers to these as the post state and pre state respectively.)
+
+The possible arc types are:
+
+ PLAIN arcs, which specify matching of any character of a given "color"
+ (see above). These are dumped as "[color_number]->to_state".
+ In addition there can be "rainbow" PLAIN arcs, which are dumped as
+ "[*]->to_state".
+
+ EMPTY arcs, which specify a no-op transition to another state. These
+ are dumped as "->to_state".
+
+ AHEAD constraints, which represent a "next character must be of this
+ color" constraint. AHEAD differs from a PLAIN arc in that the input
+ character is not consumed when crossing the arc. These are dumped as
+ ">color_number>->to_state", or possibly ">*>->to_state".
+
+ BEHIND constraints, which represent a "previous character must be of
+ this color" constraint, which likewise consumes no input. These are
+ dumped as "<color_number<->to_state", or possibly "<*<->to_state".
+
+ '^' arcs, which specify a beginning-of-input constraint. These are
+ dumped as "^0->to_state" or "^1->to_state" for beginning-of-string and
+ beginning-of-line constraints respectively.
+
+ '$' arcs, which specify an end-of-input constraint. These are dumped
+ as "$0->to_state" or "$1->to_state" for end-of-string and end-of-line
+ constraints respectively.
+
+ LACON constraints, which represent "(?=re)", "(?!re)", "(?<=re)", and
+ "(?<!re)" constraints, i.e. the input starting/ending at this point must
+ match (or not match) a given sub-RE, but the matching input is not
+ consumed. These are dumped as ":subtree_number:->to_state".
+
+If you see anything else (especially any question marks) in the display of
+an arc, it's dumpnfa() trying to tell you that there's something fishy
+about the arc; see the source code.
+
+The regex executor can only handle PLAIN and LACON transitions. The regex
+optimize() function is responsible for transforming the parser's output
+to get rid of all the other arc types. In particular, ^ and $ arcs that
+are not dropped as impossible will always end up adjacent to the pre or
+post state respectively, and then will be converted into PLAIN arcs that
+mention the special "colors" for BOS, BOL, EOS, or EOL.
+
+To decide whether a thus-transformed NFA matches a given substring of the
+input string, the executor essentially follows these rules:
+1. Start the NFA "looking at" the character *before* the given substring,
+or if the substring is at the start of the input, prepend an imaginary BOS
+character instead.
+2. Run the NFA until it has consumed the character *after* the given
+substring, or an imaginary following EOS character if the substring is at
+the end of the input.
+3. If the NFA is (or can be) in the goal state at this point, it matches.
+
+This definition is necessary to support regexes that begin or end with
+constraints such as \m and \M, which imply requirements on the adjacent
+character if any. The executor implements that by checking if the
+adjacent character (or BOS/BOL/EOS/EOL pseudo-character) is of the
+right color, and it does that in the same loop that checks characters
+within the match.
+
+So one can mentally execute an untransformed NFA by taking ^ and $ as
+ordinary constraints that match at start and end of input; but plain
+arcs out of the start state should be taken as matches for the character
+before the target substring, and similarly, plain arcs leading to the
+post state are matches for the character after the target substring.
+After the optimize() transformation, there are explicit arcs mentioning
+BOS/BOL/EOS/EOL adjacent to the pre-state and post-state. So a finished
+NFA for a pattern without anchors or adjacent-character constraints will
+have pre-state outarcs for RAINBOW (all possible character colors) as well
+as BOS and BOL, and likewise post-state inarcs for RAINBOW, EOS, and EOL.
+Also note that LACON arcs will never connect to the pre-state
+or post-state.
+
+
+Look-around constraints (LACONs)
+--------------------------------
+
+The regex compiler doesn't have much intelligence about LACONs; it just
+constructs a sub-NFA representing the pattern that the constraint says to
+match or not match, and puts a LACON arc referencing that sub-NFA into the
+main NFA. At runtime, the executor applies the sub-NFA at each point in
+the string where the constraint is relevant, and then traverses or doesn't
+traverse the arc. ("Traversal" means including the arc's to-state in the
+set of NFA states that are considered active at the next character.)
+
+The actual basic matching cycle of the executor is
+1. Identify the color of the next input character, then advance over it.
+2. Apply the DFA to follow all the matching "plain" arcs of the NFA.
+ (Notionally, the previous DFA state represents the set of states the
+ NFA could have been in before the character, and the new DFA state
+ represents the set of states the NFA could be in after the character.)
+3. If there are any LACON arcs leading out of any of the new NFA states,
+ apply each LACON constraint starting from the new next input character
+ (while not actually consuming any input). For each successful LACON,
+ add its to-state to the current set of NFA states. If any such
+ to-state has outgoing LACON arcs, process those in the same way.
+ (Mathematically speaking, we compute the transitive closure of the
+ set of states reachable by successful LACONs.)
+
+Thus, LACONs are always checked immediately after consuming a character
+via a plain arc. This is okay because the NFA's "pre" state only has
+plain out-arcs, so we can always consume a character (possibly a BOS
+pseudo-character as described above) before we need to worry about LACONs.
diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c
new file mode 100644
index 0000000..30bda0e
--- /dev/null
+++ b/src/backend/regex/regc_color.c
@@ -0,0 +1,1186 @@
+/*
+ * colorings of characters
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results. The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regc_color.c
+ *
+ *
+ * Note that there are some incestuous relationships between this code and
+ * NFA arc maintenance, which perhaps ought to be cleaned up sometime.
+ */
+
+
+
+#define CISERR() VISERR(cm->v) /* applies VISERR to cm->v; requires a variable named "cm" in scope */
+#define CERR(e) VERR(cm->v, (e)) /* applies VERR to cm->v with code e; requires a variable named "cm" in scope */
+
+
+
+/*
+ * initcm - set up new colormap
+ *
+ * Fills in the bookkeeping fields, makes WHITE the only color in use, and
+ * allocates the low colormap plus the first entry of the high colormap, all
+ * of it WHITE.  If an allocation fails, REG_ESPACE is reported through
+ * CERR() and the function returns early.
+ */
+static void
+initcm(struct vars *v,
+       struct colormap *cm)
+{
+    struct colordesc *white;
+    size_t      lobytes = (MAX_SIMPLE_CHR - CHR_MIN + 1) * sizeof(color);
+
+    cm->magic = CMMAGIC;
+    cm->v = v;
+
+    /* begin with the cm->cdspace descriptor array and an empty free list */
+    cm->ncds = NINLINECDS;
+    cm->cd = cm->cdspace;
+    cm->max = 0;
+    cm->free = 0;
+
+    /* the first descriptor is cm->cd[WHITE] */
+    white = cm->cd;
+    white->nschrs = MAX_SIMPLE_CHR - CHR_MIN + 1;
+    white->nuchrs = 1;
+    white->sub = NOSUB;
+    white->arcs = NULL;
+    white->firstchr = CHR_MIN;
+    white->flags = 0;
+
+    cm->locolormap = (color *) MALLOC(lobytes);
+    if (cm->locolormap == NULL)
+    {
+        CERR(REG_ESPACE);
+        /* NULL these so that freecm does not trip over them */
+        cm->cmranges = NULL;
+        cm->hicolormap = NULL;
+        return;
+    }
+    /* a bytewise fill is only correct because WHITE is zero */
+    memset(cm->locolormap, WHITE, lobytes);
+
+    memset(cm->classbits, 0, sizeof(cm->classbits));
+    cm->numcmranges = 0;
+    cm->cmranges = NULL;
+
+    /* space for a few rows (arbitrary), though only one row/col exists yet */
+    cm->maxarrayrows = 4;
+    cm->hiarrayrows = 1;
+    cm->hiarraycols = 1;
+    cm->hicolormap = (color *) MALLOC(cm->maxarrayrows * sizeof(color));
+    if (cm->hicolormap == NULL)
+    {
+        CERR(REG_ESPACE);
+        return;
+    }
+    /* the "all other characters" row starts out WHITE */
+    cm->hicolormap[0] = WHITE;
+}
+
+/*
+ * freecm - free dynamically-allocated things in a colormap
+ *
+ * The colormap struct itself is not freed, only the arrays hanging off it.
+ */
+static void
+freecm(struct colormap *cm)
+{
+    cm->magic = 0;
+
+    /* cm->cd needs freeing only when it no longer points at cm->cdspace */
+    if (cm->cd != cm->cdspace)
+    {
+        FREE(cm->cd);
+    }
+
+    /* each of the remaining arrays is freed only if it is non-NULL */
+    if (cm->locolormap)
+    {
+        FREE(cm->locolormap);
+    }
+    if (cm->cmranges)
+    {
+        FREE(cm->cmranges);
+    }
+    if (cm->hicolormap)
+    {
+        FREE(cm->hicolormap);
+    }
+}
+
+/*
+ * pg_reg_getcolor - slow case of GETCOLOR()
+ *
+ * Looks up the color of a chr above MAX_SIMPLE_CHR.  The high-colormap row
+ * is found by binary search over cm->cmranges (row zero when no range
+ * contains c); the column comes from cclass_column_index() whenever the map
+ * has more than one column.
+ */
+color
+pg_reg_getcolor(struct colormap *cm, chr c)
+{
+    int         row = 0;        /* row zero is used when no range matches */
+    int         lo = 0;
+    int         hi;
+
+    /* chrs that live in the locolormap must not be passed here */
+    assert(c > MAX_SIMPLE_CHR);
+
+    /* the colormapranges are in order, so bisect the interval [lo, hi) */
+    for (hi = cm->numcmranges; lo < hi;)
+    {
+        int         mid = lo + (hi - lo) / 2;
+        const colormaprange *range = &cm->cmranges[mid];
+
+        if (c < range->cmin)
+        {
+            hi = mid;
+            continue;
+        }
+        if (c > range->cmax)
+        {
+            lo = mid + 1;
+            continue;
+        }
+
+        /* cmin <= c <= cmax, so this range supplies the row */
+        row = range->rownum;
+        break;
+    }
+
+    /* fast path: a single column means no relevant cclasses */
+    if (cm->hiarraycols <= 1)
+        return cm->hicolormap[row];
+
+    /* otherwise the column choice is locale-dependent */
+    return cm->hicolormap[row * cm->hiarraycols +
+                          cclass_column_index(cm, c)];
+}
+
+/*
+ * maxcolor - report largest color number in use
+ *
+ * Yields COLORLESS instead when CISERR() reports an error.
+ */
+static color
+maxcolor(struct colormap *cm)
+{
+    return CISERR() ? COLORLESS : (color) cm->max;
+}
+
+/*
+ * newcolor - find a new color (must be assigned at once)
+ * Beware: may relocate the colordescs.
+ *
+ * A descriptor is taken from the free list (cm->free) when that is nonempty,
+ * else from the next slot past cm->max.  When neither is available the
+ * descriptor array is doubled in size, capped at MAX_COLOR + 1 entries.
+ * The chosen descriptor is reset to an empty, subcolor-less state and its
+ * index within cm->cd is returned.
+ *
+ * Returns COLORLESS if CISERR() is already true on entry, if cm->max has
+ * reached MAX_COLOR (REG_ECOLORS is reported), or if the array cannot be
+ * enlarged (REG_ESPACE is reported).
+ */
+static color /* COLORLESS for error */
+newcolor(struct colormap *cm)
+{
+ struct colordesc *cd;
+ size_t n;
+
+ if (CISERR())
+ return COLORLESS;
+
+ if (cm->free != 0)
+ {
+ assert(cm->free > 0);
+ assert((size_t) cm->free < cm->ncds);
+ cd = &cm->cd[cm->free];
+ assert(UNUSEDCOLOR(cd));
+ assert(cd->arcs == NULL);
+ cm->free = cd->sub; /* pop: free-list entries are linked through sub */
+ }
+ else if (cm->max < cm->ncds - 1) /* a slot past max is still available */
+ {
+ cm->max++;
+ cd = &cm->cd[cm->max];
+ }
+ else
+ {
+ /* oops, must allocate more */
+ struct colordesc *newCd;
+
+ if (cm->max == MAX_COLOR)
+ {
+ CERR(REG_ECOLORS);
+ return COLORLESS; /* too many colors */
+ }
+
+ n = cm->ncds * 2;
+ if (n > MAX_COLOR + 1)
+ n = MAX_COLOR + 1;
+ if (cm->cd == cm->cdspace) /* still on cm->cdspace: allocate and copy rather than REALLOC */
+ {
+ newCd = (struct colordesc *) MALLOC(n * sizeof(struct colordesc));
+ if (newCd != NULL)
+ memcpy(VS(newCd), VS(cm->cdspace), cm->ncds *
+ sizeof(struct colordesc));
+ }
+ else
+ newCd = (struct colordesc *)
+ REALLOC(cm->cd, n * sizeof(struct colordesc));
+ if (newCd == NULL)
+ {
+ CERR(REG_ESPACE);
+ return COLORLESS;
+ }
+ cm->cd = newCd;
+ cm->ncds = n;
+ assert(cm->max < cm->ncds - 1);
+ cm->max++;
+ cd = &cm->cd[cm->max];
+ }
+
+ cd->nschrs = 0;
+ cd->nuchrs = 0;
+ cd->sub = NOSUB;
+ cd->arcs = NULL;
+ cd->firstchr = CHR_MIN; /* in case never set otherwise */
+ cd->flags = 0;
+
+ return (color) (cd - cm->cd); /* the descriptor's index is the color number */
+}
+
+/*
+ * freecolor - free a color (must have no arcs or subcolor)
+ *
+ * Asking to free WHITE is silently ignored.  The descriptor is flagged
+ * FREECOL.  If the color was the highest-numbered one (cm->max), cm->max is
+ * lowered past any trailing unused colors, and the free list is then pruned
+ * of every entry that now lies above cm->max.  Otherwise the color is
+ * pushed onto the front of the free list, which is linked through the
+ * descriptors' sub fields and headed by cm->free.
+ */
+static void
+freecolor(struct colormap *cm,
+ color co)
+{
+ struct colordesc *cd = &cm->cd[co];
+ color pco,
+ nco; /* for freelist scan */
+
+ assert(co >= 0);
+ if (co == WHITE)
+ return;
+
+ assert(cd->arcs == NULL);
+ assert(cd->sub == NOSUB);
+ assert(cd->nschrs == 0);
+ assert(cd->nuchrs == 0);
+ cd->flags = FREECOL;
+
+ if ((size_t) co == cm->max)
+ {
+ while (cm->max > WHITE && UNUSEDCOLOR(&cm->cd[cm->max]))
+ cm->max--;
+ assert(cm->free >= 0);
+ while ((size_t) cm->free > cm->max) /* drop list heads that are now above max */
+ cm->free = cm->cd[cm->free].sub;
+ if (cm->free > 0)
+ {
+ assert(cm->free < cm->max);
+ pco = cm->free;
+ nco = cm->cd[pco].sub;
+ while (nco > 0) /* pco is a kept entry, nco the candidate after it */
+ if ((size_t) nco > cm->max)
+ {
+ /* take this one out of freelist */
+ nco = cm->cd[nco].sub;
+ cm->cd[pco].sub = nco;
+ }
+ else
+ {
+ assert(nco < cm->max);
+ pco = nco;
+ nco = cm->cd[pco].sub;
+ }
+ }
+ }
+ else
+ {
+ cd->sub = cm->free; /* push onto the front of the free list */
+ cm->free = (color) (cd - cm->cd);
+ }
+}
+
+/*
+ * pseudocolor - allocate a false color, to be managed by other means
+ *
+ * The new color is flagged PSEUDO and given a single nominal reference from
+ * the upper map.  Returns COLORLESS if CISERR() is true after allocation.
+ */
+static color
+pseudocolor(struct colormap *cm)
+{
+    color       pseudo = newcolor(cm);
+
+    if (CISERR())
+        return COLORLESS;
+
+    cm->cd[pseudo].nschrs = 0;
+    cm->cd[pseudo].nuchrs = 1;  /* pretend it is in the upper map */
+    cm->cd[pseudo].sub = NOSUB;
+    cm->cd[pseudo].arcs = NULL;
+    cm->cd[pseudo].firstchr = CHR_MIN;
+    cm->cd[pseudo].flags = PSEUDO;
+
+    return pseudo;
+}
+
+/*
+ * subcolor - allocate a new subcolor (if necessary) to this chr
+ *
+ * This works only for chrs that map into the low color map.
+ *
+ * Returns the color the chr ends up with, or COLORLESS on error.
+ */
+static color
+subcolor(struct colormap *cm, chr c)
+{
+    color      *slot;           /* the chr's entry in the low colormap */
+    color       parent;         /* color the chr has on entry */
+    color       child;          /* subcolor it should move to */
+
+    assert(c <= MAX_SIMPLE_CHR);
+
+    slot = &cm->locolormap[c - CHR_MIN];
+    parent = *slot;
+    child = newsub(cm, parent);
+    if (CISERR())
+        return COLORLESS;
+    assert(child != COLORLESS);
+
+    /* when newsub hands back the same color, the chr stays where it is */
+    if (child != parent)
+    {
+        cm->cd[parent].nschrs--;
+        /* the first chr to arrive becomes the subcolor's firstchr */
+        if (cm->cd[child].nschrs == 0)
+            cm->cd[child].firstchr = c;
+        cm->cd[child].nschrs++;
+        *slot = child;
+    }
+
+    return child;
+}
+
+/*
+ * subcolorhi - allocate a new subcolor (if necessary) to this colormap entry
+ *
+ * Counterpart of subcolor() for an entry of the high colormap; such an
+ * entry does not necessarily stand for exactly one chr code, so the nuchrs
+ * counts are the ones adjusted.  The entry *pco is updated in place.
+ *
+ * Returns the entry's resulting color, or COLORLESS on error.
+ */
+static color
+subcolorhi(struct colormap *cm, color *pco)
+{
+    color       parent = *pco;  /* color the entry has on entry */
+    color       child;          /* subcolor it should move to */
+
+    child = newsub(cm, parent);
+    if (CISERR())
+        return COLORLESS;
+    assert(child != COLORLESS);
+
+    /* when newsub hands back the same color, nothing needs to move */
+    if (child != parent)
+    {
+        cm->cd[parent].nuchrs--;
+        cm->cd[child].nuchrs++;
+        *pco = child;
+    }
+
+    return child;
+}
+
+/*
+ * newsub - allocate a new subcolor (if necessary) for a color
+ *
+ * Returns the color's existing open subcolor if it has one; the color
+ * itself if it has exactly one reference; otherwise a freshly allocated
+ * subcolor.  Returns COLORLESS if allocation fails.
+ */
+static color
+newsub(struct colormap *cm,
+       color co)
+{
+    color       child = cm->cd[co].sub;
+
+    /* a subcolor is already open for this color: reuse it */
+    if (child != NOSUB)
+        return child;
+
+    /* optimization: singly-referenced color need not be subcolored */
+    if (cm->cd[co].nschrs + cm->cd[co].nuchrs == 1)
+        return co;
+
+    /* no way around creating a subcolor */
+    child = newcolor(cm);
+    if (child == COLORLESS)
+    {
+        assert(CISERR());
+        return COLORLESS;
+    }
+    cm->cd[co].sub = child;
+    cm->cd[child].sub = child;  /* open subcolor points to self */
+
+    assert(child != NOSUB);
+    return child;
+}
+
+/*
+ * newhicolorrow - get a new row in the hicolormap, cloning it from oldrow
+ *
+ * Returns array index of new row.  Note the array might move.
+ *
+ * If the array cannot be enlarged, REG_ESPACE is reported and zero is
+ * returned.
+ */
+static int
+newhicolorrow(struct colormap *cm,
+              int oldrow)
+{
+    const int   newrow = cm->hiarrayrows;
+    color      *dst;
+    int         col;
+
+    /* double the allocation when every allocated row is already in use */
+    if (newrow >= cm->maxarrayrows)
+    {
+        color      *grown;
+
+        /* refuse if the doubled entry count could not be held in an int */
+        if (cm->maxarrayrows >= INT_MAX / (cm->hiarraycols * 2))
+        {
+            CERR(REG_ESPACE);
+            return 0;
+        }
+        grown = (color *) REALLOC(cm->hicolormap,
+                                  cm->maxarrayrows * 2 *
+                                  cm->hiarraycols * sizeof(color));
+        if (grown == NULL)
+        {
+            CERR(REG_ESPACE);
+            return 0;
+        }
+        cm->hicolormap = grown;
+        cm->maxarrayrows *= 2;
+    }
+    cm->hiarrayrows++;
+
+    /* clone the old row; addresses are taken only after any reallocation */
+    dst = &cm->hicolormap[newrow * cm->hiarraycols];
+    memcpy(dst,
+           &cm->hicolormap[oldrow * cm->hiarraycols],
+           cm->hiarraycols * sizeof(color));
+
+    /* each cloned entry is one more reference to its color */
+    for (col = 0; col < cm->hiarraycols; col++)
+        cm->cd[dst[col]].nuchrs++;
+
+    return newrow;
+}
+
+/*
+ * newhicolorcols - create a new set of columns in the high colormap
+ *
+ * Essentially, extends the 2-D array to the right with a copy of itself.
+ *
+ * The array is reallocated for maxarrayrows rows of twice the current
+ * width, then each of the hiarrayrows rows in use is rewritten at its new,
+ * wider position with its old contents stored in both halves.  Every color
+ * copied gets one more nuchrs reference.  Finally hiarraycols is doubled.
+ * If the doubled size fails the overflow check or the REALLOC fails,
+ * REG_ESPACE is reported and the colormap is left unchanged.
+ */
+static void
+newhicolorcols(struct colormap *cm)
+{
+ color *newarray;
+ int r,
+ c;
+
+ if (cm->hiarraycols >= INT_MAX / (cm->maxarrayrows * 2))
+ {
+ CERR(REG_ESPACE);
+ return;
+ }
+ newarray = (color *) REALLOC(cm->hicolormap,
+ cm->maxarrayrows *
+ cm->hiarraycols * 2 * sizeof(color));
+ if (newarray == NULL)
+ {
+ CERR(REG_ESPACE);
+ return;
+ }
+ cm->hicolormap = newarray;
+
+ /* Duplicate existing columns to the right, and increase ref counts */
+ /* Must work backwards in the array because we realloc'd in place */
+ for (r = cm->hiarrayrows - 1; r >= 0; r--)
+ {
+ color *oldrowptr = &newarray[r * cm->hiarraycols]; /* row r at the old width */
+ color *newrowptr = &newarray[r * cm->hiarraycols * 2]; /* row r at the doubled width */
+ color *newrowptr2 = newrowptr + cm->hiarraycols; /* right-hand half of the widened row */
+
+ for (c = 0; c < cm->hiarraycols; c++)
+ {
+ color co = oldrowptr[c];
+
+ newrowptr[c] = newrowptr2[c] = co;
+ cm->cd[co].nuchrs++;
+ }
+ }
+
+ cm->hiarraycols *= 2;
+}
+
+/*
+ * subcolorcvec - allocate new subcolors to cvec members, fill in arcs
+ *
+ * For each chr "c" represented by the cvec, do the equivalent of
+ * newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
+ *
+ * Note that in typical cases, many of the subcolors are the same.
+ * While newarc() would discard duplicate arc requests, we can save
+ * some cycles by not calling it repetitively to begin with. This is
+ * mechanized with the "lastsubcolor" state variable.
+ *
+ * The work proceeds in three passes: the cvec's individual chrs, then its
+ * ranges (the part of a range up to MAX_SIMPLE_CHR one chr at a time, any
+ * remainder through subcoloronerange() or subcoloronechr()), and finally
+ * its character class, if cclasscode is non-negative.  For the class, a
+ * column bit is assigned on first use (doubling the high colormap's
+ * columns), and every high-colormap entry whose column index has that bit
+ * set is subcolored.  Each pass returns early via NOERR() on error.
+ */
+static void
+subcolorcvec(struct vars *v,
+ struct cvec *cv,
+ struct state *lp,
+ struct state *rp)
+{
+ struct colormap *cm = v->cm;
+ color lastsubcolor = COLORLESS;
+ chr ch,
+ from,
+ to;
+ const chr *p;
+ int i;
+
+ /* ordinary characters */
+ for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--)
+ {
+ ch = *p;
+ subcoloronechr(v, ch, lp, rp, &lastsubcolor);
+ NOERR();
+ }
+
+ /* and the ranges */
+ for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) /* each range is a (from, to) pair */
+ {
+ from = *p;
+ to = *(p + 1);
+ if (from <= MAX_SIMPLE_CHR)
+ {
+ /* deal with simple chars one at a time */
+ chr lim = (to <= MAX_SIMPLE_CHR) ? to : MAX_SIMPLE_CHR;
+
+ while (from <= lim)
+ {
+ color sco = subcolor(cm, from);
+
+ NOERR();
+ if (sco != lastsubcolor)
+ {
+ newarc(v->nfa, PLAIN, sco, lp, rp);
+ NOERR();
+ lastsubcolor = sco;
+ }
+ from++;
+ }
+ }
+ /* deal with any part of the range that's above MAX_SIMPLE_CHR */
+ if (from < to)
+ subcoloronerange(v, from, to, lp, rp, &lastsubcolor);
+ else if (from == to)
+ subcoloronechr(v, from, lp, rp, &lastsubcolor);
+ NOERR();
+ }
+
+ /* and deal with cclass if any */
+ if (cv->cclasscode >= 0)
+ {
+ int classbit;
+ color *pco;
+ int r,
+ c;
+
+ /* Enlarge array if we don't have a column bit assignment for cclass */
+ if (cm->classbits[cv->cclasscode] == 0)
+ {
+ cm->classbits[cv->cclasscode] = cm->hiarraycols; /* the column count before doubling becomes this class's bit */
+ newhicolorcols(cm);
+ NOERR();
+ }
+ /* Apply subcolorhi() and make arc for each entry in relevant cols */
+ classbit = cm->classbits[cv->cclasscode];
+ pco = cm->hicolormap;
+ for (r = 0; r < cm->hiarrayrows; r++)
+ {
+ for (c = 0; c < cm->hiarraycols; c++)
+ {
+ if (c & classbit)
+ {
+ color sco = subcolorhi(cm, pco);
+
+ NOERR();
+ /* add the arc if needed */
+ if (sco != lastsubcolor)
+ {
+ newarc(v->nfa, PLAIN, sco, lp, rp);
+ NOERR();
+ lastsubcolor = sco;
+ }
+ }
+ pco++; /* advances for every entry, selected or not */
+ }
+ }
+ }
+}
+
+/*
+ * subcoloronechr - do subcolorcvec's work for a singleton chr
+ *
+ * We could just let subcoloronerange do this, but it's a bit more efficient
+ * if we exploit the single-chr case. Also, callers find it useful for this
+ * to be able to handle both low and high chr codes.
+ *
+ * For ch <= MAX_SIMPLE_CHR this is subcolor() plus an arc when the result
+ * differs from *lastsubcolor.  For a higher ch, a replacement cmranges
+ * array is built in which ch has a singleton range of its own: an existing
+ * singleton is reused, a containing range is split, and otherwise a new
+ * range is made with its row cloned from row zero.  subcoloronerow() then
+ * processes that range's row, and the new array replaces cm->cmranges.
+ * *lastsubcolor is updated whenever an arc is made.
+ */
+static void
+subcoloronechr(struct vars *v,
+ chr ch,
+ struct state *lp,
+ struct state *rp,
+ color *lastsubcolor)
+{
+ struct colormap *cm = v->cm;
+ colormaprange *newranges;
+ int numnewranges;
+ colormaprange *oldrange;
+ int oldrangen;
+ int newrow;
+
+ /* Easy case for low chr codes */
+ if (ch <= MAX_SIMPLE_CHR)
+ {
+ color sco = subcolor(cm, ch);
+
+ NOERR();
+ if (sco != *lastsubcolor)
+ {
+ newarc(v->nfa, PLAIN, sco, lp, rp);
+ *lastsubcolor = sco;
+ }
+ return;
+ }
+
+ /*
+ * Potentially, we could need two more colormapranges than we have now, if
+ * the given chr is in the middle of some existing range.
+ */
+ newranges = (colormaprange *)
+ MALLOC((cm->numcmranges + 2) * sizeof(colormaprange));
+ if (newranges == NULL)
+ {
+ CERR(REG_ESPACE);
+ return;
+ }
+ numnewranges = 0;
+
+ /* Ranges before target are unchanged */
+ for (oldrange = cm->cmranges, oldrangen = 0;
+ oldrangen < cm->numcmranges;
+ oldrange++, oldrangen++)
+ {
+ if (oldrange->cmax >= ch)
+ break;
+ newranges[numnewranges++] = *oldrange;
+ }
+
+ /* Match target chr against current range */
+ if (oldrangen >= cm->numcmranges || oldrange->cmin > ch)
+ {
+ /* chr does not belong to any existing range, make a new one */
+ newranges[numnewranges].cmin = ch;
+ newranges[numnewranges].cmax = ch;
+ /* row state should be cloned from the "all others" row */
+ newranges[numnewranges].rownum = newrow = newhicolorrow(cm, 0);
+ numnewranges++;
+ }
+ else if (oldrange->cmin == oldrange->cmax)
+ {
+ /* we have an existing singleton range matching the chr */
+ newranges[numnewranges++] = *oldrange;
+ newrow = oldrange->rownum;
+ /* we've now fully processed this old range */
+ oldrange++, oldrangen++;
+ }
+ else
+ {
+ /* chr is a subset of this existing range, must split it */
+ if (ch > oldrange->cmin)
+ {
+ /* emit portion of old range before chr */
+ newranges[numnewranges].cmin = oldrange->cmin;
+ newranges[numnewranges].cmax = ch - 1;
+ newranges[numnewranges].rownum = oldrange->rownum;
+ numnewranges++;
+ }
+ /* emit chr as singleton range, initially cloning from range */
+ newranges[numnewranges].cmin = ch;
+ newranges[numnewranges].cmax = ch;
+ newranges[numnewranges].rownum = newrow =
+ newhicolorrow(cm, oldrange->rownum);
+ numnewranges++;
+ if (ch < oldrange->cmax)
+ {
+ /* emit portion of old range after chr */
+ newranges[numnewranges].cmin = ch + 1;
+ newranges[numnewranges].cmax = oldrange->cmax;
+ /* must clone the row if we are making two new ranges from old */
+ newranges[numnewranges].rownum =
+ (ch > oldrange->cmin) ? newhicolorrow(cm, oldrange->rownum) :
+ oldrange->rownum;
+ numnewranges++;
+ }
+ /* we've now fully processed this old range */
+ oldrange++, oldrangen++;
+ }
+
+ /* Update colors in newrow and create arcs as needed */
+ subcoloronerow(v, newrow, lp, rp, lastsubcolor);
+
+ /* Ranges after target are unchanged */
+ for (; oldrangen < cm->numcmranges; oldrange++, oldrangen++)
+ {
+ newranges[numnewranges++] = *oldrange;
+ }
+
+ /* Assert our original space estimate was adequate */
+ assert(numnewranges <= (cm->numcmranges + 2));
+
+ /* And finally, store back the updated list of ranges */
+ if (cm->cmranges != NULL)
+ FREE(cm->cmranges);
+ cm->cmranges = newranges;
+ cm->numcmranges = numnewranges;
+}
+
+/*
+ * subcoloronerange - do subcolorcvec's work for a high range
+ *
+ * Both endpoints are inclusive, and the caller must ensure
+ * MAX_SIMPLE_CHR < from < to.  A replacement cmranges array is built in
+ * which [from, to] is exactly covered by whole ranges: parts of the target
+ * lying in no old range get new ranges with rows cloned from row zero, old
+ * ranges wholly inside the target are kept as they are, and old ranges
+ * straddling an endpoint are split, with the overlapping piece getting a
+ * cloned row.  subcoloronerow() is applied to the row of every range lying
+ * inside the target, and the new array then replaces cm->cmranges.
+ */
+static void
+subcoloronerange(struct vars *v,
+ chr from,
+ chr to,
+ struct state *lp,
+ struct state *rp,
+ color *lastsubcolor)
+{
+ struct colormap *cm = v->cm;
+ colormaprange *newranges;
+ int numnewranges;
+ colormaprange *oldrange;
+ int oldrangen;
+ int newrow;
+
+ /* Caller should take care of non-high-range cases */
+ assert(from > MAX_SIMPLE_CHR);
+ assert(from < to);
+
+ /*
+ * Potentially, if we have N non-adjacent ranges, we could need as many as
+ * 2N+1 result ranges (consider case where new range spans 'em all).
+ */
+ newranges = (colormaprange *)
+ MALLOC((cm->numcmranges * 2 + 1) * sizeof(colormaprange));
+ if (newranges == NULL)
+ {
+ CERR(REG_ESPACE);
+ return;
+ }
+ numnewranges = 0;
+
+ /* Ranges before target are unchanged */
+ for (oldrange = cm->cmranges, oldrangen = 0;
+ oldrangen < cm->numcmranges;
+ oldrange++, oldrangen++)
+ {
+ if (oldrange->cmax >= from)
+ break;
+ newranges[numnewranges++] = *oldrange;
+ }
+
+ /*
+ * Deal with ranges that (partially) overlap the target. As we process
+ * each such range, increase "from" to remove the dealt-with characters
+ * from the target range.
+ */
+ while (oldrangen < cm->numcmranges && oldrange->cmin <= to)
+ {
+ if (from < oldrange->cmin)
+ {
+ /* Handle portion of new range that corresponds to no old range */
+ newranges[numnewranges].cmin = from;
+ newranges[numnewranges].cmax = oldrange->cmin - 1;
+ /* row state should be cloned from the "all others" row */
+ newranges[numnewranges].rownum = newrow = newhicolorrow(cm, 0);
+ numnewranges++;
+ /* Update colors in newrow and create arcs as needed */
+ subcoloronerow(v, newrow, lp, rp, lastsubcolor);
+ /* We've now fully processed the part of new range before old */
+ from = oldrange->cmin;
+ }
+
+ if (from <= oldrange->cmin && to >= oldrange->cmax)
+ {
+ /* old range is fully contained in new, process it in-place */
+ newranges[numnewranges++] = *oldrange;
+ newrow = oldrange->rownum;
+ from = oldrange->cmax + 1;
+ }
+ else
+ {
+ /* some part of old range does not overlap new range */
+ if (from > oldrange->cmin)
+ {
+ /* emit portion of old range before new range */
+ newranges[numnewranges].cmin = oldrange->cmin;
+ newranges[numnewranges].cmax = from - 1;
+ newranges[numnewranges].rownum = oldrange->rownum;
+ numnewranges++;
+ }
+ /* emit common subrange, initially cloning from old range */
+ newranges[numnewranges].cmin = from;
+ newranges[numnewranges].cmax =
+ (to < oldrange->cmax) ? to : oldrange->cmax;
+ newranges[numnewranges].rownum = newrow =
+ newhicolorrow(cm, oldrange->rownum);
+ numnewranges++;
+ if (to < oldrange->cmax)
+ {
+ /* emit portion of old range after new range */
+ newranges[numnewranges].cmin = to + 1;
+ newranges[numnewranges].cmax = oldrange->cmax;
+ /* must clone the row if we are making two new ranges from old */
+ newranges[numnewranges].rownum =
+ (from > oldrange->cmin) ? newhicolorrow(cm, oldrange->rownum) :
+ oldrange->rownum;
+ numnewranges++;
+ }
+ from = oldrange->cmax + 1;
+ }
+ /* Update colors in newrow and create arcs as needed */
+ subcoloronerow(v, newrow, lp, rp, lastsubcolor);
+ /* we've now fully processed this old range */
+ oldrange++, oldrangen++;
+ }
+
+ if (from <= to) /* part of the target lies beyond the last overlapping old range */
+ {
+ /* Handle portion of new range that corresponds to no old range */
+ newranges[numnewranges].cmin = from;
+ newranges[numnewranges].cmax = to;
+ /* row state should be cloned from the "all others" row */
+ newranges[numnewranges].rownum = newrow = newhicolorrow(cm, 0);
+ numnewranges++;
+ /* Update colors in newrow and create arcs as needed */
+ subcoloronerow(v, newrow, lp, rp, lastsubcolor);
+ }
+
+ /* Ranges after target are unchanged */
+ for (; oldrangen < cm->numcmranges; oldrange++, oldrangen++)
+ {
+ newranges[numnewranges++] = *oldrange;
+ }
+
+ /* Assert our original space estimate was adequate */
+ assert(numnewranges <= (cm->numcmranges * 2 + 1));
+
+ /* And finally, store back the updated list of ranges */
+ if (cm->cmranges != NULL)
+ FREE(cm->cmranges);
+ cm->cmranges = newranges;
+ cm->numcmranges = numnewranges;
+}
+
+/*
+ * subcoloronerow - do subcolorcvec's work for one new row in the high colormap
+ *
+ * Every entry of the row is passed through subcolorhi(); an arc from lp to
+ * rp is made for each resulting subcolor that differs from the one most
+ * recently given an arc, as tracked in *lastsubcolor.
+ */
+static void
+subcoloronerow(struct vars *v,
+               int rownum,
+               struct state *lp,
+               struct state *rp,
+               color *lastsubcolor)
+{
+    struct colormap *cm = v->cm;
+    color      *row = &cm->hicolormap[rownum * cm->hiarraycols];
+    int         col;
+
+    for (col = 0; col < cm->hiarraycols; col++)
+    {
+        color       sco = subcolorhi(cm, &row[col]);
+
+        NOERR();
+
+        /* no new arc while the subcolor repeats the previous one */
+        if (sco == *lastsubcolor)
+            continue;
+
+        newarc(v->nfa, PLAIN, sco, lp, rp);
+        NOERR();
+        *lastsubcolor = sco;
+    }
+}
+
+/*
+ * okcolors - promote subcolors to full colors
+ *
+ * Scans every color descriptor up to CDEND(cm).  Colors that are unused,
+ * have no subcolor, or are themselves open subcolors are skipped.  For a
+ * parent with an open subcolor, the sub links of both are reset to NOSUB,
+ * and then either (a) the parent has no chrs left, so its arcs are
+ * relabeled with the subcolor and the parent is freed, or (b) the parent
+ * keeps some chrs, so each of its arcs gains a parallel arc of the subcolor.
+ */
+static void
+okcolors(struct nfa *nfa,
+ struct colormap *cm)
+{
+ struct colordesc *cd;
+ struct colordesc *end = CDEND(cm);
+ struct colordesc *scd;
+ struct arc *a;
+ color co;
+ color sco;
+
+ for (cd = cm->cd, co = 0; cd < end; cd++, co++)
+ {
+ sco = cd->sub;
+ if (UNUSEDCOLOR(cd) || sco == NOSUB)
+ {
+ /* has no subcolor, no further action */
+ }
+ else if (sco == co)
+ {
+ /* is subcolor, let parent deal with it */
+ }
+ else if (cd->nschrs == 0 && cd->nuchrs == 0)
+ {
+ /*
+ * Parent is now empty, so just change all its arcs to the
+ * subcolor, then free the parent.
+ *
+ * It is not obvious that simply relabeling the arcs like this is
+ * OK; it appears to risk creating duplicate arcs. We are
+ * basically relying on the assumption that processing of a
+ * bracket expression can't create arcs of both a color and its
+ * subcolor between the bracket's endpoints.
+ */
+ cd->sub = NOSUB;
+ scd = &cm->cd[sco];
+ assert(scd->nschrs > 0 || scd->nuchrs > 0);
+ assert(scd->sub == sco);
+ scd->sub = NOSUB;
+ while ((a = cd->arcs) != NULL) /* each pass unlinks the chain head, so this terminates */
+ {
+ assert(a->co == co);
+ uncolorchain(cm, a);
+ a->co = sco;
+ colorchain(cm, a);
+ }
+ freecolor(cm, co);
+ }
+ else
+ {
+ /* parent's arcs must gain parallel subcolor arcs */
+ cd->sub = NOSUB;
+ scd = &cm->cd[sco];
+ assert(scd->nschrs > 0 || scd->nuchrs > 0);
+ assert(scd->sub == sco);
+ scd->sub = NOSUB;
+ for (a = cd->arcs; a != NULL; a = a->colorchain)
+ {
+ assert(a->co == co);
+ newarc(nfa, a->type, sco, a->from, a->to);
+ }
+ }
+ }
+}
+
+/*
+ * colorchain - add this arc to the color chain of its color
+ *
+ * The arc becomes the new head of the doubly-linked chain hanging off its
+ * color's descriptor.
+ */
+static void
+colorchain(struct colormap *cm,
+           struct arc *a)
+{
+    struct colordesc *cd = &cm->cd[a->co];
+    struct arc *oldhead;
+
+    assert(a->co >= 0);
+
+    oldhead = cd->arcs;
+    if (oldhead != NULL)
+        oldhead->colorchainRev = a;
+
+    a->colorchainRev = NULL;
+    a->colorchain = oldhead;
+    cd->arcs = a;
+}
+
+/*
+ * uncolorchain - delete this arc from the color chain of its color
+ *
+ * The arc's neighbours are linked to each other (or the chain head is
+ * moved on, when the arc was first), and the arc's own links are cleared.
+ */
+static void
+uncolorchain(struct colormap *cm,
+             struct arc *a)
+{
+    struct colordesc *cd = &cm->cd[a->co];
+    struct arc *prev = a->colorchainRev;
+    struct arc *next = a->colorchain;
+
+    assert(a->co >= 0);
+
+    if (prev != NULL)
+    {
+        assert(prev->colorchain == a);
+        prev->colorchain = next;
+    }
+    else
+    {
+        /* no predecessor, so the arc must be the head of the chain */
+        assert(cd->arcs == a);
+        cd->arcs = next;
+    }
+
+    if (next != NULL)
+        next->colorchainRev = prev;
+
+    a->colorchain = NULL;       /* paranoia */
+    a->colorchainRev = NULL;
+}
+
+/*
+ * rainbow - add arcs of all full colors (but one) between specified states
+ *
+ * With no exception color (but == COLORLESS) a single arc labeled RAINBOW
+ * is made, which saves a lot of arc-munging later.  Otherwise one arc is
+ * made per eligible color, stopping early if CISERR() becomes true.
+ */
+static void
+rainbow(struct nfa *nfa,
+        struct colormap *cm,
+        int type,
+        color but,              /* COLORLESS if no exceptions */
+        struct state *from,
+        struct state *to)
+{
+    struct colordesc *cd;
+    struct colordesc *end = CDEND(cm);
+    color       co;
+
+    if (but == COLORLESS)
+    {
+        newarc(nfa, type, RAINBOW, from, to);
+        return;
+    }
+
+    /* the hard way: one arc for every color that survives the filters */
+    for (co = 0, cd = cm->cd; cd < end && !CISERR(); cd++, co++)
+    {
+        if (UNUSEDCOLOR(cd))
+            continue;
+        if (cd->sub == co)      /* its own sub: an open subcolor */
+            continue;
+        if (co == but)          /* the excepted color */
+            continue;
+        if (cd->flags & PSEUDO) /* pseudocolor */
+            continue;
+
+        newarc(nfa, type, co, from, to);
+    }
+}
+
+/*
+ * colorcomplement - add arcs of complementary colors
+ *
+ * Arcs from "from" to "to" are added for every color that is in use, is
+ * not a pseudocolor, and labels none of the PLAIN out-arcs of state "of".
+ * If "of" has a PLAIN RAINBOW out-arc, nothing is added.
+ *
+ * The calling sequence ought to be reconciled with cloneouts().
+ */
+static void
+colorcomplement(struct nfa *nfa,
+                struct colormap *cm,
+                int type,
+                struct state *of,
+                struct state *from,
+                struct state *to)
+{
+    struct colordesc *cd;
+    struct colordesc *end = CDEND(cm);
+    color       co;
+    struct arc *a;
+
+    assert(of != from);
+
+    /* RAINBOW already matches every color, so its complement is empty */
+    if (findarc(of, PLAIN, RAINBOW) != NULL)
+        return;
+
+    /* pass 1: put a transient mark on each color "of" has a PLAIN arc for */
+    for (a = of->outs; a != NULL; a = a->outchain)
+    {
+        if (a->type != PLAIN)
+            continue;
+
+        assert(a->co >= 0);
+        cd = &cm->cd[a->co];
+        assert(!UNUSEDCOLOR(cd));
+        cd->flags |= COLMARK;
+    }
+
+    /* pass 2: erase the marks, adding arcs for colors that had none */
+    for (co = 0, cd = cm->cd; cd < end && !CISERR(); cd++, co++)
+    {
+        if (cd->flags & COLMARK)
+        {
+            cd->flags &= ~COLMARK;
+            continue;
+        }
+        if (UNUSEDCOLOR(cd) || (cd->flags & PSEUDO))
+            continue;
+
+        newarc(nfa, type, co, from, to);
+    }
+}
+
+
+#ifdef REG_DEBUG
+
+/*
+ * dumpcolors - debugging output
+ *
+ * Prints cm->max, then one line per in-use color other than color 0,
+ * listing the low-map chrs that have that color.  If the high colormap has
+ * more than one row or column, its contents are printed as well: row zero
+ * labeled "other:", then one line for each colormap range.
+ */
+static void
+dumpcolors(struct colormap *cm,
+           FILE *f)
+{
+    struct colordesc *cd;
+    struct colordesc *end;
+    color       co;
+    int         rangeno;
+    int         col;
+
+    fprintf(f, "max %ld\n", (long) cm->max);
+    end = CDEND(cm);
+
+    /* color 0 is deliberately left out of the listing */
+    for (co = 1, cd = cm->cd + 1; cd < end; cd++, co++)
+    {
+        chr         c;
+
+        if (UNUSEDCOLOR(cd))
+            continue;
+
+        assert(cd->nschrs > 0 || cd->nuchrs > 0);
+        if (cd->flags & PSEUDO)
+            fprintf(f, "#%2ld(ps): ", (long) co);
+        else
+            fprintf(f, "#%2ld(%2d): ", (long) co, cd->nschrs + cd->nuchrs);
+
+        /* brute force: test every low chr against this color */
+        for (c = CHR_MIN; c <= MAX_SIMPLE_CHR; c++)
+        {
+            if (GETCOLOR(cm, c) == co)
+                dumpchr(c, f);
+        }
+        fprintf(f, "\n");
+    }
+
+    /* the high colormap is shown only when it holds something interesting */
+    if (cm->hiarrayrows <= 1 && cm->hiarraycols <= 1)
+        return;
+
+    fprintf(f, "other:\t");
+    for (col = 0; col < cm->hiarraycols; col++)
+        fprintf(f, "\t%ld", (long) cm->hicolormap[col]);
+    fprintf(f, "\n");
+
+    for (rangeno = 0; rangeno < cm->numcmranges; rangeno++)
+    {
+        const colormaprange *range = &cm->cmranges[rangeno];
+        const color *rowptr = &cm->hicolormap[range->rownum * cm->hiarraycols];
+
+        dumpchr(range->cmin, f);
+        fprintf(f, "..");
+        dumpchr(range->cmax, f);
+        fprintf(f, ":");
+        for (col = 0; col < cm->hiarraycols; col++)
+            fprintf(f, "\t%ld", (long) rowptr[col]);
+        fprintf(f, "\n");
+    }
+}
+
+/*
+ * dumpchr - print a chr
+ *
+ * Kind of char-centric but works well enough for debug use.
+ *
+ * A backslash is printed doubled, a chr above ' ' and up to '~' is printed
+ * as itself, and anything else as \u followed by at least four hex digits.
+ */
+static void
+dumpchr(chr c,
+        FILE *f)
+{
+    if (c == '\\')
+        fprintf(f, "\\\\");
+    else if (c > ' ' && c <= '~')
+        putc((char) c, f);
+    else
+    {
+        /* %lx consumes an unsigned long, so pass exactly that type */
+        fprintf(f, "\\u%04lx", (unsigned long) c);
+    }
+}
+
+#endif /* REG_DEBUG */
diff --git a/src/backend/regex/regc_cvec.c b/src/backend/regex/regc_cvec.c
new file mode 100644
index 0000000..1030621
--- /dev/null
+++ b/src/backend/regex/regc_cvec.c
@@ -0,0 +1,138 @@
+/*
+ * Utility functions for handling cvecs
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results. The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regc_cvec.c
+ *
+ */
+
+/*
+ * Notes:
+ * Only (selected) functions in _this_ file should treat the chr arrays
+ * of a cvec as non-constant.
+ */
+
+/*
+ * newcvec - allocate a new cvec
+ *
+ * Header and chr storage come from one allocation: the chrs array starts
+ * right after the struct, and the ranges array (two chrs per range) right
+ * after the chrs.  Returns NULL if the allocation fails.
+ */
+static struct cvec *
+newcvec(int nchrs,              /* to hold this many chrs... */
+        int nranges)            /* ... and this many ranges */
+{
+    size_t      nslots = (size_t) nchrs + (size_t) nranges * 2;
+    struct cvec *cv;
+
+    cv = (struct cvec *) MALLOC(sizeof(struct cvec) + nslots * sizeof(chr));
+    if (cv == NULL)
+        return NULL;
+
+    cv->chrs = (chr *) (((char *) cv) + sizeof(struct cvec));
+    cv->chrspace = nchrs;
+    cv->ranges = cv->chrs + nchrs;
+    cv->rangespace = nranges;
+
+    /* clearcvec sets the counts and class code, and returns cv */
+    return clearcvec(cv);
+}
+
+/*
+ * clearcvec - clear a possibly-new cvec
+ *
+ * Resets the chr and range counts to zero and the class code to -1; the
+ * capacity fields are left alone.  Returns pointer as convenience.
+ */
+static struct cvec *
+clearcvec(struct cvec *cv)
+{
+    assert(cv != NULL);
+
+    cv->cclasscode = -1;
+    cv->nranges = 0;
+    cv->nchrs = 0;
+
+    return cv;
+}
+
+/*
+ * addchr - add a chr to a cvec
+ *
+ * There must be room for it; that is only asserted, not checked.
+ */
+static void
+addchr(struct cvec *cv,         /* character vector */
+       chr c)                   /* character to add */
+{
+    assert(cv->nchrs < cv->chrspace);
+
+    cv->chrs[cv->nchrs] = c;
+    cv->nchrs++;
+}
+
+/*
+ * addrange - add a range to a cvec
+ *
+ * A range takes two consecutive slots of the ranges array, first chr then
+ * last chr.  There must be room for it; that is only asserted, not checked.
+ */
+static void
+addrange(struct cvec *cv,       /* character vector */
+         chr from,              /* first character of range */
+         chr to)                /* last character of range */
+{
+    chr        *pair;
+
+    assert(cv->nranges < cv->rangespace);
+
+    pair = &cv->ranges[cv->nranges * 2];
+    pair[0] = from;
+    pair[1] = to;
+    cv->nranges++;
+}
+
+/*
+ * getcvec - get a transient cvec, initialized to empty
+ *
+ * The cvec handed back stays valid only until getcvec is called again,
+ * since that call will normally reuse the space.  Callers must *not* free
+ * it themselves; it is cleaned up when the struct vars is destroyed.
+ *
+ * The usual customer is bracket-expression parsing, which needs the cvec
+ * just long enough to build arcs from it, so a transient object suits.
+ *
+ * On allocation failure REG_ESPACE is reported and NULL is returned.
+ */
+static struct cvec *
+getcvec(struct vars *v,         /* context */
+        int nchrs,              /* to hold this many chrs... */
+        int nranges)            /* ... and this many ranges */
+{
+    struct cvec *cur = v->cv;
+
+    if (cur != NULL)
+    {
+        /* the current transient cvec is big enough: just reset it */
+        if (nchrs <= cur->chrspace && nranges <= cur->rangespace)
+            return clearcvec(cur);
+
+        /* too small, so discard it before making a replacement */
+        freecvec(cur);
+    }
+
+    v->cv = newcvec(nchrs, nranges);
+    if (v->cv == NULL)
+        ERR(REG_ESPACE);
+
+    return v->cv;
+}
+
+/*
+ * freecvec - free a cvec
+ *
+ * One FREE releases the whole cvec, because newcvec() makes the struct and
+ * its chr storage with a single allocation.
+ */
+static void
+freecvec(struct cvec *cv)
+{
+    FREE(cv);
+}
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c
new file mode 100644
index 0000000..38c09b1
--- /dev/null
+++ b/src/backend/regex/regc_lex.c
@@ -0,0 +1,1044 @@
+/*
+ * lexical analyzer
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results. The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regc_lex.c
+ *
+ */
+
+/*
+ * scanning macros (know about v)
+ *
+ * ATEOS() is true once v->now has reached v->stop; HAVE(n) is true while
+ * at least n chrs remain. NEXT1/NEXT2/NEXT3 test whether the next one,
+ * two or three unread chrs equal the given characters, without consuming
+ * anything. SET/SETV store the token type (and value) in v->nexttype
+ * (and v->nextvalue). RET/RETV do the same and then return 1 from the
+ * enclosing function; FAILW reports error e via ERR and returns 0.
+ * LASTTYPE(t) tests the type recorded in v->lasttype.
+ */
+#define ATEOS() (v->now >= v->stop)
+#define HAVE(n) (v->stop - v->now >= (n))
+#define NEXT1(c) (!ATEOS() && *v->now == CHR(c))
+#define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
+#define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \
+ *(v->now+1) == CHR(b) && \
+ *(v->now+2) == CHR(c))
+#define SET(c) (v->nexttype = (c))
+#define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n))
+#define RET(c) return (SET(c), 1)
+#define RETV(c, n) return (SETV(c, n), 1)
+#define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */
+#define LASTTYPE(t) (v->lasttype == (t))
+
+/*
+ * lexical contexts
+ *
+ * The current context is kept in v->lexcon; INTOCON switches to a context
+ * and INCON tests for one.
+ */
+#define L_ERE 1 /* mainline ERE/ARE */
+#define L_BRE 2 /* mainline BRE */
+#define L_Q 3 /* REG_QUOTE */
+#define L_EBND 4 /* ERE/ARE bound */
+#define L_BBND 5 /* BRE bound */
+#define L_BRACK 6 /* brackets */
+#define L_CEL 7 /* collating element */
+#define L_ECL 8 /* equivalence class */
+#define L_CCL 9 /* character class */
+#define INTOCON(c) (v->lexcon = (c))
+#define INCON(con) (v->lexcon == (con))
+
+/*
+ * construct pointer past end of chr array
+ *
+ * The element count comes from sizeof, so the argument must be a true
+ * array of chr; a pointer would give the wrong answer.
+ */
+#define ENDOF(array) ((array) + sizeof(array)/sizeof(chr))
+
+/*
+ * lexstart - initialize the lexer and fetch the first token
+ *
+ * Leading prefixes are processed first, because they can change v->cflags
+ * and hence the lexical context scanning starts in.  If prefix processing
+ * reports an error, we return without choosing a context or scanning.
+ */
+static void
+lexstart(struct vars *v)
+{
+	int			startcon;
+
+	prefixes(v);				/* may turn on new type bits etc. */
+	NOERR();
+
+	/* pick the starting context from the (possibly updated) flags */
+	if (v->cflags & REG_QUOTE)
+	{
+		assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)));
+		startcon = L_Q;
+	}
+	else if (v->cflags & REG_EXTENDED)
+	{
+		assert(!(v->cflags & REG_QUOTE));
+		startcon = L_ERE;
+	}
+	else
+	{
+		assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));
+		startcon = L_BRE;
+	}
+	INTOCON(startcon);
+
+	/* EMPTY as the "previous" token tells next() we are at the very start */
+	v->nexttype = EMPTY;
+	next(v);					/* set up the first token */
+}
+
+/*
+ * prefixes - implement various special prefixes
+ *
+ * Examines the start of the RE (at v->now) for two kinds of prefix,
+ * advancing v->now past whatever is consumed and adjusting v->cflags:
+ *
+ * 1. A "***" director followed by one more chr: "***=" makes the rest of
+ * the RE a literal string, "***:" selects AREs, "***?" reports
+ * REG_BADPAT, and any other fourth chr reports REG_BADRPT.
+ * 2. For AREs only, embedded options of the form "(?letters)". An
+ * unrecognized letter, or a missing ')' after the letters, reports
+ * REG_BADOPT.
+ *
+ * Nothing is done if REG_QUOTE is already set on entry.
+ */
+static void
+prefixes(struct vars *v)
+{
+ /* literal string doesn't get any of this stuff */
+ if (v->cflags & REG_QUOTE)
+ return;
+
+ /* initial "***" gets special things */
+ if (HAVE(4) && NEXT3('*', '*', '*'))
+ switch (*(v->now + 3))
+ {
+ case CHR('?'): /* "***?" error, msg shows version */
+ ERR(REG_BADPAT);
+ return; /* proceed no further */
+ break;
+ case CHR('='): /* "***=" shifts to literal string */
+ NOTE(REG_UNONPOSIX);
+ v->cflags |= REG_QUOTE;
+ v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE);
+ v->now += 4;
+ return; /* and there can be no more prefixes */
+ break;
+ case CHR(':'): /* "***:" shifts to AREs */
+ NOTE(REG_UNONPOSIX);
+ v->cflags |= REG_ADVANCED;
+ v->now += 4;
+ break;
+ default: /* otherwise *** is just an error */
+ ERR(REG_BADRPT);
+ return;
+ break;
+ }
+
+ /* BREs and EREs don't get embedded options */
+ if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
+ return;
+
+ /* embedded options (AREs only) */
+ if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))
+ {
+ NOTE(REG_UNONPOSIX);
+ v->now += 2; /* step over "(?" to the first option letter */
+ for (; !ATEOS() && iscalpha(*v->now); v->now++)
+ switch (*v->now)
+ {
+ case CHR('b'): /* BREs (but why???) */
+ v->cflags &= ~(REG_ADVANCED | REG_QUOTE);
+ break;
+ case CHR('c'): /* case sensitive */
+ v->cflags &= ~REG_ICASE;
+ break;
+ case CHR('e'): /* plain EREs */
+ v->cflags |= REG_EXTENDED;
+ v->cflags &= ~(REG_ADVF | REG_QUOTE);
+ break;
+ case CHR('i'): /* case insensitive */
+ v->cflags |= REG_ICASE;
+ break;
+ case CHR('m'): /* Perloid synonym for n */
+ case CHR('n'): /* \n affects ^ $ . [^ */
+ v->cflags |= REG_NEWLINE;
+ break;
+ case CHR('p'): /* ~Perl, \n affects . [^ */
+ v->cflags |= REG_NLSTOP;
+ v->cflags &= ~REG_NLANCH;
+ break;
+ case CHR('q'): /* literal string */
+ v->cflags |= REG_QUOTE;
+ v->cflags &= ~REG_ADVANCED;
+ break;
+ case CHR('s'): /* single line, \n ordinary */
+ v->cflags &= ~REG_NEWLINE;
+ break;
+ case CHR('t'): /* tight syntax */
+ v->cflags &= ~REG_EXPANDED;
+ break;
+ case CHR('w'): /* weird, \n affects ^ $ only */
+ v->cflags &= ~REG_NLSTOP;
+ v->cflags |= REG_NLANCH;
+ break;
+ case CHR('x'): /* expanded syntax */
+ v->cflags |= REG_EXPANDED;
+ break;
+ default:
+ ERR(REG_BADOPT);
+ return;
+ }
+ if (!NEXT1(')'))
+ {
+ ERR(REG_BADOPT);
+ return;
+ }
+ v->now++; /* consume the closing ')' */
+ if (v->cflags & REG_QUOTE)
+ v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);
+ }
+}
+
+/*
+ * next - get next token
+ *
+ * Saves the type of the previous token in v->lasttype, then scans one
+ * token according to the current lexical context (v->lexcon), leaving its
+ * type in v->nexttype and, where applicable, its value in v->nextvalue.
+ * The context is switched as bounds, bracket expressions and their
+ * sub-elements are entered and left.
+ *
+ * Returns 1 on success and 0 on failure. Once an error has been reported,
+ * every later call fails immediately without scanning.
+ */
+static int /* 1 normal, 0 failure */
+next(struct vars *v)
+{
+ chr c;
+
+next_restart: /* loop here after eating a comment */
+
+ /* errors yield an infinite sequence of failures */
+ if (ISERR())
+ return 0; /* the error has set nexttype to EOS */
+
+ /* remember flavor of last token */
+ v->lasttype = v->nexttype;
+
+ /* REG_BOSONLY */
+ if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
+ {
+ /* at start of a REG_BOSONLY RE */
+ RETV(SBEGIN, 0); /* same as \A */
+ }
+
+ /* skip white space etc. if appropriate (not in literal or []) */
+ if (v->cflags & REG_EXPANDED)
+ switch (v->lexcon)
+ {
+ case L_ERE:
+ case L_BRE:
+ case L_EBND:
+ case L_BBND:
+ skip(v);
+ break;
+ }
+
+ /* handle EOS, depending on context */
+ if (ATEOS())
+ {
+ switch (v->lexcon)
+ {
+ case L_ERE:
+ case L_BRE:
+ case L_Q:
+ RET(EOS);
+ break;
+ case L_EBND:
+ case L_BBND:
+ FAILW(REG_EBRACE);
+ break;
+ case L_BRACK:
+ case L_CEL:
+ case L_ECL:
+ case L_CCL:
+ FAILW(REG_EBRACK);
+ break;
+ }
+ assert(NOTREACHED); /* every listed context returned above */
+ }
+
+ /* okay, time to actually get a character */
+ c = *v->now++;
+
+ /* deal with the easy contexts, punt EREs to code below */
+ switch (v->lexcon)
+ {
+ case L_BRE: /* punt BREs to separate function */
+ return brenext(v, c);
+ break;
+ case L_ERE: /* see below */
+ break;
+ case L_Q: /* literal strings are easy */
+ RETV(PLAIN, c);
+ break;
+ case L_BBND: /* bounds are fairly simple */
+ case L_EBND:
+ switch (c)
+ {
+ case CHR('0'):
+ case CHR('1'):
+ case CHR('2'):
+ case CHR('3'):
+ case CHR('4'):
+ case CHR('5'):
+ case CHR('6'):
+ case CHR('7'):
+ case CHR('8'):
+ case CHR('9'):
+ RETV(DIGIT, (chr) DIGITVAL(c));
+ break;
+ case CHR(','):
+ RET(',');
+ break;
+ case CHR('}'): /* ERE bound ends with } */
+ if (INCON(L_EBND))
+ {
+ INTOCON(L_ERE);
+ if ((v->cflags & REG_ADVF) && NEXT1('?'))
+ {
+ v->now++;
+ NOTE(REG_UNONPOSIX);
+ RETV('}', 0);
+ }
+ RETV('}', 1);
+ }
+ else
+ FAILW(REG_BADBR);
+ break;
+ case CHR('\\'): /* BRE bound ends with \} */
+ if (INCON(L_BBND) && NEXT1('}'))
+ {
+ v->now++;
+ INTOCON(L_BRE);
+ RETV('}', 1);
+ }
+ else
+ FAILW(REG_BADBR);
+ break;
+ default:
+ FAILW(REG_BADBR);
+ break;
+ }
+ assert(NOTREACHED);
+ break;
+ case L_BRACK: /* brackets are not too hard */
+ switch (c)
+ {
+ case CHR(']'):
+ if (LASTTYPE('['))
+ RETV(PLAIN, c);
+ else
+ {
+ INTOCON((v->cflags & REG_EXTENDED) ?
+ L_ERE : L_BRE);
+ RET(']');
+ }
+ break;
+ case CHR('\\'):
+ NOTE(REG_UBBS);
+ if (!(v->cflags & REG_ADVF))
+ RETV(PLAIN, c);
+ NOTE(REG_UNONPOSIX);
+ if (ATEOS())
+ FAILW(REG_EESCAPE);
+ if (!lexescape(v))
+ return 0;
+ switch (v->nexttype)
+ { /* not all escapes okay here */
+ case PLAIN:
+ case CCLASSS:
+ case CCLASSC:
+ return 1;
+ break;
+ }
+ /* not one of the acceptable escapes */
+ FAILW(REG_EESCAPE);
+ break;
+ case CHR('-'):
+ if (LASTTYPE('[') || NEXT1(']'))
+ RETV(PLAIN, c);
+ else
+ RETV(RANGE, c);
+ break;
+ case CHR('['):
+ if (ATEOS())
+ FAILW(REG_EBRACK);
+ switch (*v->now++)
+ {
+ case CHR('.'):
+ INTOCON(L_CEL);
+ /* might or might not be locale-specific */
+ RET(COLLEL);
+ break;
+ case CHR('='):
+ INTOCON(L_ECL);
+ NOTE(REG_ULOCALE);
+ RET(ECLASS);
+ break;
+ case CHR(':'):
+ INTOCON(L_CCL);
+ NOTE(REG_ULOCALE);
+ RET(CCLASS);
+ break;
+ default: /* oops */
+ v->now--; /* un-read the chr after '[' */
+ RETV(PLAIN, c);
+ break;
+ }
+ assert(NOTREACHED);
+ break;
+ default:
+ RETV(PLAIN, c);
+ break;
+ }
+ assert(NOTREACHED);
+ break;
+ case L_CEL: /* collating elements are easy */
+ if (c == CHR('.') && NEXT1(']'))
+ {
+ v->now++;
+ INTOCON(L_BRACK);
+ RETV(END, '.');
+ }
+ else
+ RETV(PLAIN, c);
+ break;
+ case L_ECL: /* ditto equivalence classes */
+ if (c == CHR('=') && NEXT1(']'))
+ {
+ v->now++;
+ INTOCON(L_BRACK);
+ RETV(END, '=');
+ }
+ else
+ RETV(PLAIN, c);
+ break;
+ case L_CCL: /* ditto character classes */
+ if (c == CHR(':') && NEXT1(']'))
+ {
+ v->now++;
+ INTOCON(L_BRACK);
+ RETV(END, ':');
+ }
+ else
+ RETV(PLAIN, c);
+ break;
+ default:
+ assert(NOTREACHED);
+ break;
+ }
+
+ /* that got rid of everything except EREs and AREs */
+ assert(INCON(L_ERE));
+
+ /* deal with EREs and AREs, except for backslashes */
+ switch (c)
+ {
+ case CHR('|'):
+ RET('|');
+ break;
+ case CHR('*'):
+ if ((v->cflags & REG_ADVF) && NEXT1('?'))
+ {
+ v->now++;
+ NOTE(REG_UNONPOSIX);
+ RETV('*', 0);
+ }
+ RETV('*', 1);
+ break;
+ case CHR('+'):
+ if ((v->cflags & REG_ADVF) && NEXT1('?'))
+ {
+ v->now++;
+ NOTE(REG_UNONPOSIX);
+ RETV('+', 0);
+ }
+ RETV('+', 1);
+ break;
+ case CHR('?'):
+ if ((v->cflags & REG_ADVF) && NEXT1('?'))
+ {
+ v->now++;
+ NOTE(REG_UNONPOSIX);
+ RETV('?', 0);
+ }
+ RETV('?', 1);
+ break;
+ case CHR('{'): /* bounds start or plain character */
+ if (v->cflags & REG_EXPANDED)
+ skip(v);
+ if (ATEOS() || !iscdigit(*v->now))
+ {
+ NOTE(REG_UBRACES);
+ NOTE(REG_UUNSPEC);
+ RETV(PLAIN, c);
+ }
+ else
+ {
+ NOTE(REG_UBOUNDS);
+ INTOCON(L_EBND);
+ RET('{');
+ }
+ assert(NOTREACHED);
+ break;
+ case CHR('('): /* parenthesis, or advanced extension */
+ if ((v->cflags & REG_ADVF) && NEXT1('?'))
+ {
+ NOTE(REG_UNONPOSIX);
+ v->now++;
+ if (ATEOS())
+ FAILW(REG_BADRPT);
+ switch (*v->now++)
+ {
+ case CHR(':'): /* non-capturing paren */
+ RETV('(', 0);
+ break;
+ case CHR('#'): /* comment */
+ while (!ATEOS() && *v->now != CHR(')'))
+ v->now++;
+ if (!ATEOS())
+ v->now++;
+ assert(v->nexttype == v->lasttype); /* no token was set, so restarting is safe */
+ goto next_restart;
+ case CHR('='): /* positive lookahead */
+ NOTE(REG_ULOOKAROUND);
+ RETV(LACON, LATYPE_AHEAD_POS);
+ break;
+ case CHR('!'): /* negative lookahead */
+ NOTE(REG_ULOOKAROUND);
+ RETV(LACON, LATYPE_AHEAD_NEG);
+ break;
+ case CHR('<'):
+ if (ATEOS())
+ FAILW(REG_BADRPT);
+ switch (*v->now++)
+ {
+ case CHR('='): /* positive lookbehind */
+ NOTE(REG_ULOOKAROUND);
+ RETV(LACON, LATYPE_BEHIND_POS);
+ break;
+ case CHR('!'): /* negative lookbehind */
+ NOTE(REG_ULOOKAROUND);
+ RETV(LACON, LATYPE_BEHIND_NEG);
+ break;
+ default:
+ FAILW(REG_BADRPT);
+ break;
+ }
+ assert(NOTREACHED);
+ break;
+ default:
+ FAILW(REG_BADRPT);
+ break;
+ }
+ assert(NOTREACHED);
+ }
+ RETV('(', 1);
+ break;
+ case CHR(')'):
+ if (LASTTYPE('('))
+ NOTE(REG_UUNSPEC);
+ RETV(')', c);
+ break;
+ case CHR('['): /* easy except for [[:<:]] and [[:>:]] */
+ if (HAVE(6) && *(v->now + 0) == CHR('[') &&
+ *(v->now + 1) == CHR(':') &&
+ (*(v->now + 2) == CHR('<') ||
+ *(v->now + 2) == CHR('>')) &&
+ *(v->now + 3) == CHR(':') &&
+ *(v->now + 4) == CHR(']') &&
+ *(v->now + 5) == CHR(']'))
+ {
+ c = *(v->now + 2);
+ v->now += 6;
+ NOTE(REG_UNONPOSIX);
+ RET((c == CHR('<')) ? '<' : '>');
+ }
+ INTOCON(L_BRACK);
+ if (NEXT1('^'))
+ {
+ v->now++;
+ RETV('[', 0);
+ }
+ RETV('[', 1);
+ break;
+ case CHR('.'):
+ RET('.');
+ break;
+ case CHR('^'):
+ RET('^');
+ break;
+ case CHR('$'):
+ RET('$');
+ break;
+ case CHR('\\'): /* mostly punt backslashes to code below */
+ if (ATEOS())
+ FAILW(REG_EESCAPE);
+ break;
+ default: /* ordinary character */
+ RETV(PLAIN, c);
+ break;
+ }
+
+ /* ERE/ARE backslash handling; backslash already eaten */
+ assert(!ATEOS());
+ if (!(v->cflags & REG_ADVF))
+ { /* only AREs have non-trivial escapes */
+ if (iscalnum(*v->now))
+ {
+ NOTE(REG_UBSALNUM);
+ NOTE(REG_UUNSPEC);
+ }
+ RETV(PLAIN, *v->now++);
+ }
+ return lexescape(v);
+}
+
+/*
+ * lexescape - parse an ARE backslash escape (backslash already eaten)
+ *
+ * This is used for ARE backslashes both normally and inside bracket
+ * expressions. In the latter case, not all escape types are allowed,
+ * but the caller must reject unwanted ones after we return.
+ *
+ * The caller must ensure at least one chr remains after the backslash.
+ * A chr that is not an ASCII letter or digit simply stands for itself.
+ * Letters and digits select a specific escape; an unrecognized one
+ * fails with REG_EESCAPE.
+ *
+ * Returns 1 with the token in v->nexttype / v->nextvalue, or 0 after
+ * reporting an error.
+ */
+static int
+lexescape(struct vars *v)
+{
+ chr c;
+ static const chr alert[] = {
+ CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
+ };
+ static const chr esc[] = {
+ CHR('E'), CHR('S'), CHR('C')
+ };
+ const chr *save;
+
+ assert(v->cflags & REG_ADVF);
+
+ assert(!ATEOS());
+ c = *v->now++;
+
+ /* if it's not alphanumeric ASCII, treat it as a plain character */
+ if (!('a' <= c && c <= 'z') &&
+ !('A' <= c && c <= 'Z') &&
+ !('0' <= c && c <= '9'))
+ RETV(PLAIN, c);
+
+ NOTE(REG_UNONPOSIX);
+ switch (c)
+ {
+ case CHR('a'):
+ RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
+ break;
+ case CHR('A'):
+ RETV(SBEGIN, 0);
+ break;
+ case CHR('b'):
+ RETV(PLAIN, CHR('\b'));
+ break;
+ case CHR('B'):
+ RETV(PLAIN, CHR('\\'));
+ break;
+ case CHR('c'):
+ NOTE(REG_UUNPORT);
+ if (ATEOS())
+ FAILW(REG_EESCAPE);
+ RETV(PLAIN, (chr) (*v->now++ & 037)); /* keep only the low 5 bits of the next chr */
+ break;
+ case CHR('d'):
+ NOTE(REG_ULOCALE);
+ RETV(CCLASSS, CC_DIGIT);
+ break;
+ case CHR('D'):
+ NOTE(REG_ULOCALE);
+ RETV(CCLASSC, CC_DIGIT);
+ break;
+ case CHR('e'):
+ NOTE(REG_UUNPORT);
+ RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
+ break;
+ case CHR('f'):
+ RETV(PLAIN, CHR('\f'));
+ break;
+ case CHR('m'):
+ RET('<');
+ break;
+ case CHR('M'):
+ RET('>');
+ break;
+ case CHR('n'):
+ RETV(PLAIN, CHR('\n'));
+ break;
+ case CHR('r'):
+ RETV(PLAIN, CHR('\r'));
+ break;
+ case CHR('s'):
+ NOTE(REG_ULOCALE);
+ RETV(CCLASSS, CC_SPACE);
+ break;
+ case CHR('S'):
+ NOTE(REG_ULOCALE);
+ RETV(CCLASSC, CC_SPACE);
+ break;
+ case CHR('t'):
+ RETV(PLAIN, CHR('\t'));
+ break;
+ case CHR('u'):
+ c = lexdigits(v, 16, 4, 4); /* exactly 4 hex digits */
+ if (ISERR() || !CHR_IS_IN_RANGE(c))
+ FAILW(REG_EESCAPE);
+ RETV(PLAIN, c);
+ break;
+ case CHR('U'):
+ c = lexdigits(v, 16, 8, 8); /* exactly 8 hex digits */
+ if (ISERR() || !CHR_IS_IN_RANGE(c))
+ FAILW(REG_EESCAPE);
+ RETV(PLAIN, c);
+ break;
+ case CHR('v'):
+ RETV(PLAIN, CHR('\v'));
+ break;
+ case CHR('w'):
+ NOTE(REG_ULOCALE);
+ RETV(CCLASSS, CC_WORD);
+ break;
+ case CHR('W'):
+ NOTE(REG_ULOCALE);
+ RETV(CCLASSC, CC_WORD);
+ break;
+ case CHR('x'):
+ NOTE(REG_UUNPORT);
+ c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */
+ if (ISERR() || !CHR_IS_IN_RANGE(c))
+ FAILW(REG_EESCAPE);
+ RETV(PLAIN, c);
+ break;
+ case CHR('y'):
+ NOTE(REG_ULOCALE);
+ RETV(WBDRY, 0);
+ break;
+ case CHR('Y'):
+ NOTE(REG_ULOCALE);
+ RETV(NWBDRY, 0);
+ break;
+ case CHR('Z'):
+ RETV(SEND, 0);
+ break;
+ case CHR('1'):
+ case CHR('2'):
+ case CHR('3'):
+ case CHR('4'):
+ case CHR('5'):
+ case CHR('6'):
+ case CHR('7'):
+ case CHR('8'):
+ case CHR('9'):
+ save = v->now; /* position just after the first digit */
+ v->now--; /* put first digit back */
+ c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */
+ if (ISERR())
+ FAILW(REG_EESCAPE);
+ /* ugly heuristic (first test is "exactly 1 digit?") */
+ if (v->now == save || ((int) c > 0 && (int) c <= v->nsubexp))
+ {
+ NOTE(REG_UBACKREF);
+ RETV(BACKREF, c);
+ }
+ /* oops, doesn't look like it's a backref after all... */
+ v->now = save;
+ /* and fall through into octal number */
+ /* FALLTHROUGH */
+ case CHR('0'):
+ NOTE(REG_UUNPORT);
+ v->now--; /* put first digit back */
+ c = lexdigits(v, 8, 1, 3);
+ if (ISERR())
+ FAILW(REG_EESCAPE);
+ if (c > 0xff)
+ {
+ /* out of range, so we handled one digit too much */
+ v->now--;
+ c >>= 3; /* drop that last octal digit from the value */
+ }
+ RETV(PLAIN, c);
+ break;
+ default:
+ /*
+ * Throw an error for unrecognized ASCII alpha escape sequences,
+ * which reserves them for future use if needed.
+ */
+ FAILW(REG_EESCAPE);
+ break;
+ }
+ assert(NOTREACHED);
+}
+
+/*
+ * lexdigits - scan a run of digits in the given base and return its value
+ *
+ * At most maxlen digits are consumed.  Scanning stops early at end of
+ * input or at the first chr that is not a digit of the base; that chr is
+ * left unconsumed.  If fewer than minlen digits were found, REG_EESCAPE is
+ * reported via ERR (a value is still returned).
+ *
+ * No overflow detection is attempted: the value is accumulated in an
+ * unsigned type, and callers should range-check the result whenever maxlen
+ * is large enough for wraparound to be possible.
+ */
+static chr						/* chr value; errors signalled via ERR */
+lexdigits(struct vars *v,
+		  int base,
+		  int minlen,
+		  int maxlen)
+{
+	const uchr	radix = (uchr) base;
+	uchr		value = 0;		/* unsigned to avoid overflow misbehavior */
+	int			ndigits = 0;
+
+	while (ndigits < maxlen && !ATEOS())
+	{
+		chr			ch = *v->now;	/* peek; consumed only once accepted */
+		int			digit;
+
+		switch (ch)
+		{
+			case CHR('0'):
+			case CHR('1'):
+			case CHR('2'):
+			case CHR('3'):
+			case CHR('4'):
+			case CHR('5'):
+			case CHR('6'):
+			case CHR('7'):
+			case CHR('8'):
+			case CHR('9'):
+				digit = DIGITVAL(ch);
+				break;
+			case CHR('a'):
+			case CHR('A'):
+				digit = 10;
+				break;
+			case CHR('b'):
+			case CHR('B'):
+				digit = 11;
+				break;
+			case CHR('c'):
+			case CHR('C'):
+				digit = 12;
+				break;
+			case CHR('d'):
+			case CHR('D'):
+				digit = 13;
+				break;
+			case CHR('e'):
+			case CHR('E'):
+				digit = 14;
+				break;
+			case CHR('f'):
+			case CHR('F'):
+				digit = 15;
+				break;
+			default:
+				digit = -1;		/* not a digit in any base we know */
+				break;
+		}
+
+		/* stop at anything that is not a plausible digit for this base */
+		if (digit < 0 || digit >= base)
+			break;
+
+		v->now++;
+		value = value * radix + (uchr) digit;
+		ndigits++;
+	}
+
+	if (ndigits < minlen)
+		ERR(REG_EESCAPE);
+
+	return (chr) value;
+}
+
+/*
+ * brenext - get next BRE token
+ *
+ * This is much like EREs except for all the stupid backslashes and the
+ * context-dependency of some things.
+ *
+ * c is the chr that next() has already consumed. Unescaped chrs are
+ * handled by the first switch; a backslash falls out of it and the chr
+ * following the backslash is handled by the second switch. A backslash
+ * at end of input fails with REG_EESCAPE.
+ */
+static int /* 1 normal, 0 failure */
+brenext(struct vars *v,
+ chr c)
+{
+ switch (c)
+ {
+ case CHR('*'):
+ if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
+ RETV(PLAIN, c);
+ RETV('*', 1);
+ break;
+ case CHR('['):
+ if (HAVE(6) && *(v->now + 0) == CHR('[') &&
+ *(v->now + 1) == CHR(':') &&
+ (*(v->now + 2) == CHR('<') ||
+ *(v->now + 2) == CHR('>')) &&
+ *(v->now + 3) == CHR(':') &&
+ *(v->now + 4) == CHR(']') &&
+ *(v->now + 5) == CHR(']'))
+ {
+ c = *(v->now + 2);
+ v->now += 6;
+ NOTE(REG_UNONPOSIX);
+ RET((c == CHR('<')) ? '<' : '>');
+ }
+ INTOCON(L_BRACK);
+ if (NEXT1('^'))
+ {
+ v->now++;
+ RETV('[', 0);
+ }
+ RETV('[', 1);
+ break;
+ case CHR('.'):
+ RET('.');
+ break;
+ case CHR('^'):
+ if (LASTTYPE(EMPTY))
+ RET('^');
+ if (LASTTYPE('('))
+ {
+ NOTE(REG_UUNSPEC);
+ RET('^');
+ }
+ RETV(PLAIN, c);
+ break;
+ case CHR('$'):
+ if (v->cflags & REG_EXPANDED)
+ skip(v);
+ if (ATEOS())
+ RET('$');
+ if (NEXT2('\\', ')'))
+ {
+ NOTE(REG_UUNSPEC);
+ RET('$');
+ }
+ RETV(PLAIN, c);
+ break;
+ case CHR('\\'):
+ break; /* see below */
+ default:
+ RETV(PLAIN, c);
+ break;
+ }
+
+ assert(c == CHR('\\'));
+
+ if (ATEOS())
+ FAILW(REG_EESCAPE);
+
+ c = *v->now++; /* the chr being escaped */
+ switch (c)
+ {
+ case CHR('{'):
+ INTOCON(L_BBND);
+ NOTE(REG_UBOUNDS);
+ RET('{');
+ break;
+ case CHR('('):
+ RETV('(', 1);
+ break;
+ case CHR(')'):
+ RETV(')', c);
+ break;
+ case CHR('<'):
+ NOTE(REG_UNONPOSIX);
+ RET('<');
+ break;
+ case CHR('>'):
+ NOTE(REG_UNONPOSIX);
+ RET('>');
+ break;
+ case CHR('1'):
+ case CHR('2'):
+ case CHR('3'):
+ case CHR('4'):
+ case CHR('5'):
+ case CHR('6'):
+ case CHR('7'):
+ case CHR('8'):
+ case CHR('9'):
+ NOTE(REG_UBACKREF);
+ RETV(BACKREF, (chr) DIGITVAL(c));
+ break;
+ default:
+ if (iscalnum(c))
+ {
+ NOTE(REG_UBSALNUM);
+ NOTE(REG_UUNSPEC);
+ }
+ RETV(PLAIN, c);
+ break;
+ }
+
+ assert(NOTREACHED);
+ return 0;
+}
+
+/*
+ * skip - step over white space and '#' comments (expanded syntax only)
+ *
+ * Advances v->now past any mixture of white space and comments, where a
+ * comment runs from '#' up to, but not including, the next newline or the
+ * end of input.  If anything at all was skipped, REG_UNONPOSIX is noted.
+ */
+static void
+skip(struct vars *v)
+{
+	const chr  *origin = v->now;
+
+	assert(v->cflags & REG_EXPANDED);
+
+	for (;;)
+	{
+		/* swallow a run of white space */
+		while (!ATEOS() && iscspace(*v->now))
+			v->now++;
+
+		/* anything other than the start of a comment ends the skipping */
+		if (!NEXT1('#'))
+			break;
+
+		/*
+		 * Discard the comment text.  The terminating newline is left in
+		 * place; the white-space loop above picks it up on the next pass.
+		 */
+		while (!ATEOS() && *v->now != CHR('\n'))
+			v->now++;
+	}
+
+	if (origin != v->now)
+		NOTE(REG_UNONPOSIX);
+}
+
+/*
+ * newline - supply the chr that represents a newline
+ *
+ * Providing this as a function helps keep uses of the CHR macro confined
+ * to this source file.
+ */
+static chr
+newline(void)
+{
+	const chr	nl = CHR('\n');
+
+	return nl;
+}
+
+/*
+ * chrnamed - return the chr known by a given (chr string) name
+ *
+ * The name is looked up with element().  Any error that lookup reports is
+ * discarded (v->err is saved and restored around the call) and lastresort
+ * is returned instead.  lastresort is likewise returned if range() cannot
+ * supply a cvec, or supplies one holding no individual chrs.
+ *
+ * NOTE(review): range() called with cases == 0 records its result as a
+ * range and adds no individual chrs, so the nchrs test below appears to
+ * send every successful lookup down the lastresort path as well.  That is
+ * pre-existing behavior and is deliberately left unchanged here.
+ *
+ * The code is a bit clumsy, but this routine gets only such specialized
+ * use that it hardly matters.
+ */
+static chr
+chrnamed(struct vars *v,
+		 const chr *startp,		/* start of name */
+		 const chr *endp,		/* just past end of name */
+		 chr lastresort)		/* what to return if name lookup fails */
+{
+	chr			c;
+	int			errsave;
+	int			e;
+	struct cvec *cv;
+
+	/* run the lookup with a clean error slate, then put the old state back */
+	errsave = v->err;
+	v->err = 0;
+	c = element(v, startp, endp);
+	e = v->err;
+	v->err = errsave;
+
+	if (e != 0)
+		return lastresort;
+
+	/*
+	 * range() returns NULL when it fails (for instance when getcvec cannot
+	 * allocate), so its result must be checked before being dereferenced.
+	 */
+	cv = range(v, c, c, 0);
+	if (cv == NULL || cv->nchrs == 0)
+		return lastresort;
+	return cv->chrs[0];
+}
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c
new file mode 100644
index 0000000..b5f3a73
--- /dev/null
+++ b/src/backend/regex/regc_locale.c
@@ -0,0 +1,771 @@
+/*
+ * regc_locale.c --
+ *
+ * This file contains locale-specific regexp routines.
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998 by Scriptics Corporation.
+ *
+ * This software is copyrighted by the Regents of the University of
+ * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
+ * Corporation and other parties. The following terms apply to all files
+ * associated with the software unless explicitly disclaimed in
+ * individual files.
+ *
+ * The authors hereby grant permission to use, copy, modify, distribute,
+ * and license this software and its documentation for any purpose, provided
+ * that existing copyright notices are retained in all copies and that this
+ * notice is included verbatim in any distributions. No written agreement,
+ * license, or royalty fee is required for any of the authorized uses.
+ * Modifications to this software may be copyrighted by their authors
+ * and need not follow the licensing terms described here, provided that
+ * the new terms are clearly indicated on the first page of each file where
+ * they apply.
+ *
+ * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
+ * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
+ * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
+ * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
+ * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
+ * MODIFICATIONS.
+ *
+ * GOVERNMENT USE: If you are acquiring this software on behalf of the
+ * U.S. government, the Government shall have only "Restricted Rights"
+ * in the software and related documentation as defined in the Federal
+ * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
+ * are acquiring the software on behalf of the Department of Defense, the
+ * software shall be classified as "Commercial Computer Software" and the
+ * Government shall have only "Restricted Rights" as defined in Clause
+ * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
+ * authors grant the U.S. Government and others acting in its behalf
+ * permission to use and distribute the software in accordance with the
+ * terms specified in this license.
+ *
+ * src/backend/regex/regc_locale.c
+ */
+
+/*
+ * ASCII character-name table
+ *
+ * Maps each recognized character name to its character code.  Several
+ * characters have more than one name.  The table is terminated by an
+ * entry whose name is NULL.
+ */
+static const struct cname
+{
+	const char *name;
+	const char	code;
+}			cnames[] =
+{
+	{"NUL", '\0'},
+	{"SOH", '\001'},
+	{"STX", '\002'},
+	{"ETX", '\003'},
+	{"EOT", '\004'},
+	{"ENQ", '\005'},
+	{"ACK", '\006'},
+	{"BEL", '\007'},
+	{"alert", '\007'},
+	{"BS", '\010'},
+	{"backspace", '\b'},
+	{"HT", '\011'},
+	{"tab", '\t'},
+	{"LF", '\012'},
+	{"newline", '\n'},
+	{"VT", '\013'},
+	{"vertical-tab", '\v'},
+	{"FF", '\014'},
+	{"form-feed", '\f'},
+	{"CR", '\015'},
+	{"carriage-return", '\r'},
+	{"SO", '\016'},
+	{"SI", '\017'},
+	{"DLE", '\020'},
+	{"DC1", '\021'},
+	{"DC2", '\022'},
+	{"DC3", '\023'},
+	{"DC4", '\024'},
+	{"NAK", '\025'},
+	{"SYN", '\026'},
+	{"ETB", '\027'},
+	{"CAN", '\030'},
+	{"EM", '\031'},
+	{"SUB", '\032'},
+	{"ESC", '\033'},
+	{"IS4", '\034'},
+	{"FS", '\034'},
+	{"IS3", '\035'},
+	{"GS", '\035'},
+	{"IS2", '\036'},
+	{"RS", '\036'},
+	{"IS1", '\037'},
+	{"US", '\037'},
+	{"space", ' '},
+	{"exclamation-mark", '!'},
+	{"quotation-mark", '"'},
+	{"number-sign", '#'},
+	{"dollar-sign", '$'},
+	{"percent-sign", '%'},
+	{"ampersand", '&'},
+	{"apostrophe", '\''},
+	{"left-parenthesis", '('},
+	{"right-parenthesis", ')'},
+	{"asterisk", '*'},
+	{"plus-sign", '+'},
+	{"comma", ','},
+	{"hyphen", '-'},
+	{"hyphen-minus", '-'},
+	{"period", '.'},
+	{"full-stop", '.'},
+	{"slash", '/'},
+	{"solidus", '/'},
+	{"zero", '0'},
+	{"one", '1'},
+	{"two", '2'},
+	{"three", '3'},
+	{"four", '4'},
+	{"five", '5'},
+	{"six", '6'},
+	{"seven", '7'},
+	{"eight", '8'},
+	{"nine", '9'},
+	{"colon", ':'},
+	{"semicolon", ';'},
+	{"less-than-sign", '<'},
+	{"equals-sign", '='},
+	{"greater-than-sign", '>'},
+	{"question-mark", '?'},
+	{"commercial-at", '@'},
+	{"left-square-bracket", '['},
+	{"backslash", '\\'},
+	{"reverse-solidus", '\\'},
+	{"right-square-bracket", ']'},
+	{"circumflex", '^'},
+	{"circumflex-accent", '^'},
+	{"underscore", '_'},
+	{"low-line", '_'},
+	{"grave-accent", '`'},
+	{"left-brace", '{'},
+	{"left-curly-bracket", '{'},
+	{"vertical-line", '|'},
+	{"right-brace", '}'},
+	{"right-curly-bracket", '}'},
+	{"tilde", '~'},
+	{"DEL", '\177'},
+	{NULL, 0}					/* end-of-table marker */
+};
+
+/*
+ * Names of the valid character classes, terminated by a NULL entry.
+ *
+ * A name's position in this array is used as its enum char_classes value,
+ * so the order here must match enum char_classes in regguts.h.
+ */
+static const char *const classNames[NUM_CCLASSES + 1] = {
+	"alnum",
+	"alpha",
+	"ascii",
+	"blank",
+	"cntrl",
+	"digit",
+	"graph",
+	"lower",
+	"print",
+	"punct",
+	"space",
+	"upper",
+	"xdigit",
+	"word",
+	NULL
+};
+
+/*
+ * We do not use the hard-wired Unicode classification tables that Tcl does.
+ * This is because (a) we need to deal with other encodings besides Unicode,
+ * and (b) we want to track the behavior of the libc locale routines as
+ * closely as possible. For example, it wouldn't be unreasonable for a
+ * locale to not consider every Unicode letter as a letter. So we build
+ * character classification cvecs by asking libc, even for Unicode.
+ */
+
+
+/*
+ * element - map collating-element name to chr
+ *
+ * A name that is exactly one chr long stands for that chr.  Longer names
+ * are looked up in the cnames table; if no entry matches, REG_ECOLLATE is
+ * reported and 0 is returned.
+ */
+static chr
+element(struct vars *v,			/* context */
+		const chr *startp,		/* points to start of name */
+		const chr *endp)		/* points just past end of name */
+{
+	const struct cname *entry;
+	size_t		namelen;
+
+	assert(startp < endp);
+	namelen = endp - startp;
+
+	/* generic: one-chr names stand for themselves */
+	if (namelen == 1)
+		return *startp;
+
+	NOTE(REG_ULOCALE);
+
+	/* walk the table until a name of the same length and content turns up */
+	for (entry = cnames; entry->name != NULL; entry++)
+	{
+		if (strlen(entry->name) != namelen)
+			continue;
+		if (pg_char_and_wchar_strncmp(entry->name, startp, namelen) == 0)
+			return CHR(entry->code);
+	}
+
+	/* couldn't find it */
+	ERR(REG_ECOLLATE);
+	return 0;
+}
+
+/*
+ * range - supply cvec for a range, including legality check
+ *
+ * Returns a transient cvec (see getcvec) describing the chrs from a to b
+ * inclusive, or NULL after reporting an error: REG_ERANGE if b precedes a,
+ * REG_ETOOBIG if the case-equivalents overflow the cvec, REG_CANCEL if a
+ * cancel was requested, or whatever getcvec reported.
+ */
+static struct cvec *
+range(struct vars *v,			/* context */
+	  chr a,					/* range start */
+	  chr b,					/* range end, might equal a */
+	  int cases)				/* case-independent? */
+{
+	struct cvec *cv;
+	int			nchrs;
+	chr			c;
+
+	/* a == b is always legal; otherwise a must come before b */
+	if (a != b && !before(a, b))
+	{
+		ERR(REG_ERANGE);
+		return NULL;
+	}
+
+	/* the case-sensitive version needs nothing beyond the range itself */
+	if (!cases)
+	{
+		cv = getcvec(v, 0, 1);
+		NOERRN();
+		addrange(cv, a, b);
+		return cv;
+	}
+
+	/*
+	 * For the case-independent version we do not try to work out when the
+	 * case-equivalents could themselves be expressed as ranges.  The chrs
+	 * as originally specified go in as one range, and every case-equivalent
+	 * lying outside that range is added as an individual chr.
+	 *
+	 * A very large range could need a great many chr slots, so the
+	 * allocation is capped at 100000 chrs (arbitrary) and the loop below
+	 * checks for overrun before each addition.
+	 */
+	nchrs = b - a + 1;
+	if (nchrs <= 0 || nchrs > 100000)
+		nchrs = 100000;
+
+	cv = getcvec(v, nchrs, 1);
+	NOERRN();
+	addrange(cv, a, b);
+
+	for (c = a; c <= b; c++)
+	{
+		int			pass;
+
+		/* pass 0 considers the lower-case form, pass 1 the upper-case form */
+		for (pass = 0; pass < 2; pass++)
+		{
+			chr			cc = (pass == 0) ? pg_wc_tolower(c) : pg_wc_toupper(c);
+
+			/* no distinct counterpart in this direction */
+			if (cc == c)
+				continue;
+			/* counterpart is already covered by the range [a, b] */
+			if (!before(cc, a) && !before(b, cc))
+				continue;
+
+			if (cv->nchrs >= cv->chrspace)
+			{
+				ERR(REG_ETOOBIG);
+				return NULL;
+			}
+			addchr(cv, cc);
+		}
+
+		if (CANCEL_REQUESTED(v->re))
+		{
+			ERR(REG_CANCEL);
+			return NULL;
+		}
+	}
+
+	return cv;
+}
+
+/*
+ * before - does chr x precede chr y, for purposes of range legality?
+ *
+ * Returns 1 if x is strictly less than y, and 0 otherwise.
+ */
+static int						/* predicate */
+before(chr x, chr y)
+{
+	return (x < y) ? 1 : 0;
+}
+
+/*
+ * eclass - supply cvec for an equivalence class
+ * Must include case counterparts on request.
+ *
+ * Apart from a fake class used for testing (REG_FAKE), an equivalence
+ * class contains just the given chr, plus its case counterparts (via
+ * allcases) when "cases" is set.
+ *
+ * Returns NULL if getcvec cannot supply a cvec; getcvec has already
+ * reported the error in that case.
+ */
+static struct cvec *
+eclass(struct vars *v,			/* context */
+	   chr c,					/* Collating element representing the
+								 * equivalence class. */
+	   int cases)				/* all cases? */
+{
+	struct cvec *cv;
+
+	/* crude fake equivalence class for testing */
+	if ((v->cflags & REG_FAKE) && c == 'x')
+	{
+		cv = getcvec(v, 4, 0);
+		if (cv == NULL)
+			return NULL;		/* allocation failed; error already set */
+		addchr(cv, CHR('x'));
+		addchr(cv, CHR('y'));
+		if (cases)
+		{
+			addchr(cv, CHR('X'));
+			addchr(cv, CHR('Y'));
+		}
+		return cv;
+	}
+
+	/* otherwise, none */
+	if (cases)
+		return allcases(v, c);
+
+	/*
+	 * getcvec returns NULL when it cannot allocate, so test for that
+	 * explicitly instead of merely asserting, which does nothing in a
+	 * non-assert build.
+	 */
+	cv = getcvec(v, 1, 0);
+	if (cv == NULL)
+		return NULL;
+	addchr(cv, c);
+	return cv;
+}
+
+/*
+ * lookupcclass - find the character class with the given name
+ *
+ * The name is compared against each entry of classNames; the index of the
+ * matching entry is returned as the enum value.
+ *
+ * If no entry matches, REG_ECTYPE is reported in *v and the return value
+ * is meaningless.
+ */
+static enum char_classes
+lookupcclass(struct vars *v,	/* context (for returning errors) */
+			 const chr *startp, /* where the name starts */
+			 const chr *endp)	/* just past the end of the name */
+{
+	size_t		len = endp - startp;
+	int			i;
+
+	for (i = 0; classNames[i] != NULL; i++)
+	{
+		const char *candidate = classNames[i];
+
+		/* a match needs both the same length and the same content */
+		if (strlen(candidate) != len)
+			continue;
+		if (pg_char_and_wchar_strncmp(candidate, startp, len) == 0)
+			return (enum char_classes) i;
+	}
+
+	ERR(REG_ECTYPE);
+	return (enum char_classes) 0;
+}
+
+/*
+ * cclasscvec - supply cvec for a character class
+ *
+ * Must include case counterparts if "cases" is true.
+ *
+ * The returned cvec might be either a transient cvec gotten from getcvec(),
+ * or a permanently cached one from pg_ctype_get_cache().  This is okay
+ * because callers are not supposed to explicitly free the result either way.
+ *
+ * Returns NULL, after reporting REG_ESPACE through ERR(), when no cvec
+ * could be obtained.
+ */
+static struct cvec *
+cclasscvec(struct vars *v,      /* context */
+           enum char_classes cclasscode,    /* class to build a cvec for */
+           int cases)           /* case-independent? */
+{
+    struct cvec *cv = NULL;
+
+    /*
+     * Remap lower and upper to alpha if the match is case insensitive.
+     */
+
+    if (cases &&
+        (cclasscode == CC_LOWER ||
+         cclasscode == CC_UPPER))
+        cclasscode = CC_ALPHA;
+
+    /*
+     * Now compute the character class contents.  For classes that are based
+     * on the behavior of a <wctype.h> or <ctype.h> function, we use
+     * pg_ctype_get_cache so that we can cache the results.  Other classes
+     * have definitions that are hard-wired here, and for those we just
+     * construct a transient cvec on the fly.
+     *
+     * Every hard-wired branch checks the getcvec() result before filling
+     * it in, so an allocation failure falls through to the common
+     * REG_ESPACE report below instead of dereferencing NULL.
+     *
+     * NB: keep this code in sync with cclass_column_index(), below.
+     */
+
+    switch (cclasscode)
+    {
+        case CC_PRINT:
+            cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode);
+            break;
+        case CC_ALNUM:
+            cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode);
+            break;
+        case CC_ALPHA:
+            cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode);
+            break;
+        case CC_WORD:
+            cv = pg_ctype_get_cache(pg_wc_isword, cclasscode);
+            break;
+        case CC_ASCII:
+            /* hard-wired meaning */
+            cv = getcvec(v, 0, 1);
+            if (cv)
+                addrange(cv, 0, 0x7f);
+            break;
+        case CC_BLANK:
+            /* hard-wired meaning */
+            cv = getcvec(v, 2, 0);
+            if (cv)
+            {
+                addchr(cv, '\t');
+                addchr(cv, ' ');
+            }
+            break;
+        case CC_CNTRL:
+            /* hard-wired meaning */
+            cv = getcvec(v, 0, 2);
+            if (cv)
+            {
+                addrange(cv, 0x0, 0x1f);
+                addrange(cv, 0x7f, 0x9f);
+            }
+            break;
+        case CC_DIGIT:
+            cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode);
+            break;
+        case CC_PUNCT:
+            cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode);
+            break;
+        case CC_XDIGIT:
+
+            /*
+             * It's not clear how to define this in non-western locales, and
+             * even less clear that there's any particular use in trying.  So
+             * just hard-wire the meaning.
+             */
+            cv = getcvec(v, 0, 3);
+            if (cv)
+            {
+                addrange(cv, '0', '9');
+                addrange(cv, 'a', 'f');
+                addrange(cv, 'A', 'F');
+            }
+            break;
+        case CC_SPACE:
+            cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode);
+            break;
+        case CC_LOWER:
+            cv = pg_ctype_get_cache(pg_wc_islower, cclasscode);
+            break;
+        case CC_UPPER:
+            cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode);
+            break;
+        case CC_GRAPH:
+            cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode);
+            break;
+    }
+
+    /* If cv is NULL now, the reason must be "out of memory" */
+    if (cv == NULL)
+        ERR(REG_ESPACE);
+    return cv;
+}
+
+/*
+ * cclass_column_index - get appropriate high colormap column index for chr
+ *
+ * The index is the OR of cm->classbits[] for every class that is in use
+ * (nonzero classbits entry) and whose pg_wc_isXXX() test accepts c.
+ */
+static int
+cclass_column_index(struct colormap *cm, chr c)
+{
+    int         column = 0;
+
+/* OR in the bits of class "cls" when it is in use and "test" accepts c */
+#define MERGE_CLASS(cls, test) \
+    do { \
+        if (cm->classbits[cls] && test(c)) \
+            column |= cm->classbits[cls]; \
+    } while (0)
+
+    /* Shouldn't go through all these pushups for simple chrs */
+    assert(c > MAX_SIMPLE_CHR);
+
+    /*
+     * Note: we should not see requests to consider cclasses that are not
+     * treated as locale-specific by cclasscvec(), above; the asserts below
+     * check that their classbits entries are zero.
+     */
+    MERGE_CLASS(CC_PRINT, pg_wc_isprint);
+    MERGE_CLASS(CC_ALNUM, pg_wc_isalnum);
+    MERGE_CLASS(CC_ALPHA, pg_wc_isalpha);
+    MERGE_CLASS(CC_WORD, pg_wc_isword);
+    assert(cm->classbits[CC_ASCII] == 0);
+    assert(cm->classbits[CC_BLANK] == 0);
+    assert(cm->classbits[CC_CNTRL] == 0);
+    MERGE_CLASS(CC_DIGIT, pg_wc_isdigit);
+    MERGE_CLASS(CC_PUNCT, pg_wc_ispunct);
+    assert(cm->classbits[CC_XDIGIT] == 0);
+    MERGE_CLASS(CC_SPACE, pg_wc_isspace);
+    MERGE_CLASS(CC_LOWER, pg_wc_islower);
+    MERGE_CLASS(CC_UPPER, pg_wc_isupper);
+    MERGE_CLASS(CC_GRAPH, pg_wc_isgraph);
+
+#undef MERGE_CLASS
+
+    return column;
+}
+
+/*
+ * allcases - supply cvec for all case counterparts of a chr (including itself)
+ *
+ * The result holds pg_wc_tolower(c) and, when it differs from that,
+ * pg_wc_toupper(c).
+ *
+ * This is a shortcut, preferably an efficient one, for simple characters;
+ * messy cases are done via range().
+ */
+static struct cvec *
+allcases(struct vars *v,        /* context */
+         chr c)                 /* character to get case equivs of */
+{
+    const chr   lower = pg_wc_tolower(c);
+    const chr   upper = pg_wc_toupper(c);
+    struct cvec *pair = getcvec(v, 2, 0);
+
+    addchr(pair, lower);
+    if (upper != lower)
+        addchr(pair, upper);
+    return pair;
+}
+
+/*
+ * cmp - chr-substring compare
+ *
+ * Backrefs need this.  It should preferably be efficient.
+ * Only equal/unequal needs to be reported.  The length is exact: the
+ * comparison covers all "len" chrs and must not stop at embedded NULs,
+ * which is why it is a byte-wise memcmp over len * sizeof(chr) bytes.
+ */
+static int                      /* 0 for equal, nonzero for unequal */
+cmp(const chr *x, const chr *y, /* strings to compare */
+    size_t len)                 /* exact length of comparison */
+{
+    const size_t nbytes = len * sizeof(chr);
+
+    return memcmp(VS(x), VS(y), nbytes);
+}
+
+/*
+ * casecmp - case-independent chr-substring compare
+ *
+ * REG_ICASE backrefs need this.  It should preferably be efficient.
+ * Only equal/unequal needs to be reported.  The length is exact, and the
+ * comparison must not stop at embedded NULs.
+ */
+static int                      /* 0 for equal, nonzero for unequal */
+casecmp(const chr *x, const chr *y, /* strings to compare */
+        size_t len)             /* exact length of comparison */
+{
+    size_t      pos;
+
+    for (pos = 0; pos < len; pos++)
+    {
+        /* identical chrs need no case folding at all */
+        if (x[pos] == y[pos])
+            continue;
+        if (pg_wc_tolower(x[pos]) != pg_wc_tolower(y[pos]))
+            return 1;
+    }
+    return 0;
+}
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c
new file mode 100644
index 0000000..60fb0be
--- /dev/null
+++ b/src/backend/regex/regc_nfa.c
@@ -0,0 +1,3882 @@
+/*
+ * NFA utilities.
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results. The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regc_nfa.c
+ *
+ *
+ * One or two things that technically ought to be in here
+ * are actually in color.c, thanks to some incestuous relationships in
+ * the color chains.
+ */
+
+/*
+ * Error-handling shorthands for code that has a "struct nfa *nfa" in scope:
+ * they forward to VISERR()/VERR() using the vars struct reachable as nfa->v.
+ */
+#define NISERR() VISERR(nfa->v)
+#define NERR(e) VERR(nfa->v, (e))
+
+
+/*
+ * newnfa - set up an NFA
+ *
+ * Allocates the nfa struct, gives every field a sane starting value, and
+ * creates the four standard states (post, pre, init, final) together with
+ * the arcs linking pre to init and final to post.  On any failure the
+ * partly built NFA is released and NULL is returned.
+ */
+static struct nfa *             /* the NFA, or NULL */
+newnfa(struct vars *v,
+       struct colormap *cm,
+       struct nfa *parent)      /* NULL if primary NFA */
+{
+    struct nfa *nfa = (struct nfa *) MALLOC(sizeof(struct nfa));
+
+    if (nfa == NULL)
+    {
+        ERR(REG_ESPACE);
+        return NULL;
+    }
+
+    /* Make the NFA minimally valid, so freenfa() will behave sanely */
+    nfa->v = v;
+    nfa->cm = cm;
+    nfa->parent = parent;       /* Precedes newfstate so parent is valid. */
+    nfa->flags = 0;
+    nfa->nstates = 0;
+    nfa->states = NULL;
+    nfa->slast = NULL;
+    nfa->freestates = NULL;
+    nfa->freearcs = NULL;
+    nfa->lastsb = NULL;
+    nfa->lastsbused = 0;
+    nfa->lastab = NULL;
+    nfa->lastabused = 0;
+    nfa->bos[0] = COLORLESS;
+    nfa->bos[1] = COLORLESS;
+    nfa->eos[0] = COLORLESS;
+    nfa->eos[1] = COLORLESS;
+    nfa->minmatchall = -1;
+    nfa->maxmatchall = -1;
+
+    /* Create required infrastructure; creation order fixes state numbers */
+    nfa->post = newfstate(nfa, '@');    /* number 0 */
+    nfa->pre = newfstate(nfa, '>');     /* number 1 */
+    nfa->init = newstate(nfa);  /* may become invalid later */
+    nfa->final = newstate(nfa);
+
+    if (!ISERR())
+    {
+        rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init);
+        newarc(nfa, '^', 1, nfa->pre, nfa->init);
+        newarc(nfa, '^', 0, nfa->pre, nfa->init);
+        rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post);
+        newarc(nfa, '$', 1, nfa->final, nfa->post);
+        newarc(nfa, '$', 0, nfa->final, nfa->post);
+
+        if (!ISERR())
+            return nfa;
+    }
+
+    /* something failed along the way: discard the partial NFA */
+    freenfa(nfa);
+    return NULL;
+}
+
+/*
+ * freenfa - free an entire NFA
+ *
+ * Releases every state batch and arc batch on the NFA's batch lists,
+ * deducting their sizes from the space accounting in nfa->v, and then
+ * frees the nfa struct itself.
+ */
+static void
+freenfa(struct nfa *nfa)
+{
+    struct statebatch *sb = nfa->lastsb;
+    struct arcbatch *ab = nfa->lastab;
+
+    while (sb != NULL)
+    {
+        struct statebatch *older = sb->next;
+
+        nfa->v->spaceused -= STATEBATCHSIZE(sb->nstates);
+        FREE(sb);
+        sb = older;
+    }
+    nfa->lastsb = NULL;
+
+    while (ab != NULL)
+    {
+        struct arcbatch *older = ab->next;
+
+        nfa->v->spaceused -= ARCBATCHSIZE(ab->narcs);
+        FREE(ab);
+        ab = older;
+    }
+    nfa->lastab = NULL;
+
+    nfa->nstates = -1;
+    FREE(nfa);
+}
+
+/*
+ * newstate - allocate an NFA state, with zero flag value
+ *
+ * Storage comes, in order of preference, from the NFA's free-state list,
+ * from an unused slot in the newest state batch, or from a freshly
+ * allocated batch.  Batch sizes start at FIRSTSBSIZE and double each time,
+ * capped at MAXSBSIZE.  The new state gets the next state number and is
+ * appended to the NFA's state list.
+ */
+static struct state *           /* NULL on error */
+newstate(struct nfa *nfa)
+{
+    struct state *st;
+
+    /*
+     * This is a handy place to check for operation cancel during regex
+     * compilation, since no code path will go very long without making a new
+     * state or arc.
+     */
+    if (CANCEL_REQUESTED(nfa->v->re))
+    {
+        NERR(REG_CANCEL);
+        return NULL;
+    }
+
+    if (nfa->freestates != NULL)
+    {
+        /* recycle the head of the freelist */
+        st = nfa->freestates;
+        nfa->freestates = st->next;
+    }
+    else if (nfa->lastsb != NULL && nfa->lastsbused < nfa->lastsb->nstates)
+    {
+        /* the newest batch still has an unused slot */
+        st = &nfa->lastsb->s[nfa->lastsbused++];
+    }
+    else
+    {
+        /* need a whole new batch */
+        struct statebatch *batch;
+        size_t      batchsize = FIRSTSBSIZE;
+
+        if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
+        {
+            NERR(REG_ETOOBIG);
+            return NULL;
+        }
+        if (nfa->lastsb != NULL)
+            batchsize = nfa->lastsb->nstates * 2;
+        if (batchsize > MAXSBSIZE)
+            batchsize = MAXSBSIZE;
+        batch = (struct statebatch *) MALLOC(STATEBATCHSIZE(batchsize));
+        if (batch == NULL)
+        {
+            NERR(REG_ESPACE);
+            return NULL;
+        }
+        nfa->v->spaceused += STATEBATCHSIZE(batchsize);
+        batch->nstates = batchsize;
+        batch->next = nfa->lastsb;
+        nfa->lastsb = batch;
+        nfa->lastsbused = 1;    /* slot 0 is handed out right now */
+        st = &batch->s[0];
+    }
+
+    /* initialize the state's own fields */
+    assert(nfa->nstates >= 0);
+    st->no = nfa->nstates++;
+    st->flag = 0;
+    st->nins = 0;
+    st->ins = NULL;
+    st->nouts = 0;
+    st->outs = NULL;
+    st->tmp = NULL;
+
+    /* append it to the NFA's doubly-linked state list */
+    st->next = NULL;
+    st->prev = nfa->slast;
+    if (nfa->states == NULL)
+        nfa->states = st;
+    if (nfa->slast != NULL)
+    {
+        assert(nfa->slast->next == NULL);
+        nfa->slast->next = st;
+    }
+    nfa->slast = st;
+
+    return st;
+}
+
+/*
+ * newfstate - allocate an NFA state with a specified flag value
+ *
+ * Thin wrapper around newstate() that stores "flag" (narrowed to char) in
+ * the new state.  A newstate() failure is passed through as NULL.
+ */
+static struct state *           /* NULL on error */
+newfstate(struct nfa *nfa, int flag)
+{
+    struct state *st = newstate(nfa);
+
+    if (st == NULL)
+        return NULL;
+    st->flag = (char) flag;
+    return st;
+}
+
+/*
+ * dropstate - delete a state's inarcs and outarcs and free it
+ *
+ * Each freearc() call unlinks the arc at the head of the chain, so the
+ * loops simply keep freeing the head until the chain is empty.
+ */
+static void
+dropstate(struct nfa *nfa,
+          struct state *s)
+{
+    while (s->ins != NULL)
+        freearc(nfa, s->ins);
+    while (s->outs != NULL)
+        freearc(nfa, s->outs);
+    freestate(nfa, s);
+}
+
+/*
+ * freestate - free a state, which has no in-arcs or out-arcs
+ *
+ * The state is unlinked from the NFA's doubly-linked state list, marked
+ * with FREESTATE as its number, and pushed onto the NFA's free-state list
+ * for later reuse; its memory is not released.
+ */
+static void
+freestate(struct nfa *nfa,
+          struct state *s)
+{
+    struct state *pred;
+    struct state *succ;
+
+    assert(s != NULL);
+    assert(s->nins == 0 && s->nouts == 0);
+
+    pred = s->prev;
+    succ = s->next;
+
+    s->no = FREESTATE;
+    s->flag = 0;
+
+    /* fix the backward link of whatever follows s (or the list tail) */
+    if (succ == NULL)
+    {
+        assert(s == nfa->slast);
+        nfa->slast = pred;
+    }
+    else
+        succ->prev = pred;
+
+    /* fix the forward link of whatever precedes s (or the list head) */
+    if (pred == NULL)
+    {
+        assert(s == nfa->states);
+        nfa->states = succ;
+    }
+    else
+        pred->next = succ;
+
+    /* don't delete it, put it on the free list */
+    s->prev = NULL;
+    s->next = nfa->freestates;
+    nfa->freestates = s;
+}
+
+/*
+ * newarc - set up a new arc within an NFA
+ *
+ * The arc is created only if no arc with the same type, color, source and
+ * target exists already; in general we never want duplicates.
+ *
+ * However: in principle, a RAINBOW arc is redundant with any plain arc
+ * (unless that arc is for a pseudocolor).  But we don't try to recognize
+ * that redundancy, either here or in allied operations such as moveins().
+ * The pseudocolor consideration makes that more costly than it seems worth.
+ */
+static void
+newarc(struct nfa *nfa,
+       int t,
+       color co,
+       struct state *from,
+       struct state *to)
+{
+    struct arc *scan;
+
+    assert(from != NULL && to != NULL);
+
+    /*
+     * This is a handy place to check for operation cancel during regex
+     * compilation, since no code path will go very long without making a new
+     * state or arc.
+     */
+    if (CANCEL_REQUESTED(nfa->v->re))
+    {
+        NERR(REG_CANCEL);
+        return;
+    }
+
+    /* look for an existing twin, walking whichever chain is shorter */
+    if (from->nouts <= to->nins)
+    {
+        scan = from->outs;
+        while (scan != NULL)
+        {
+            if (scan->to == to && scan->co == co && scan->type == t)
+                return;         /* already present */
+            scan = scan->outchain;
+        }
+    }
+    else
+    {
+        scan = to->ins;
+        while (scan != NULL)
+        {
+            if (scan->from == from && scan->co == co && scan->type == t)
+                return;         /* already present */
+            scan = scan->inchain;
+        }
+    }
+
+    /* no dup, so create the arc */
+    createarc(nfa, t, co, from, to);
+}
+
+/*
+ * createarc - create a new arc within an NFA
+ *
+ * This function must *only* be used after verifying that there is no existing
+ * identical arc (same type/color/from/to).
+ *
+ * Does nothing if allocarc() reported an error.
+ */
+static void
+createarc(struct nfa *nfa,
+          int t,
+          color co,
+          struct state *from,
+          struct state *to)
+{
+    struct arc *a = allocarc(nfa);
+
+    if (NISERR())
+        return;
+    assert(a != NULL);
+
+    a->from = from;
+    a->to = to;
+    a->type = t;
+    a->co = co;
+
+    /*
+     * The new arc goes at the head of both chains rather than the tail; it's
+     * simpler here, and freearc() is the same cost either way.  See also the
+     * logic in moveins() and its cohorts, as well as fixempties().
+     */
+
+    /* head of the target's in-chain */
+    a->inchainRev = NULL;
+    a->inchain = to->ins;
+    if (to->ins)
+        to->ins->inchainRev = a;
+    to->ins = a;
+    to->nins++;
+
+    /* head of the source's out-chain */
+    a->outchainRev = NULL;
+    a->outchain = from->outs;
+    if (from->outs)
+        from->outs->outchainRev = a;
+    from->outs = a;
+    from->nouts++;
+
+    /* colored arcs are put on the color chain only when nfa has no parent */
+    if (COLORED(a) && nfa->parent == NULL)
+        colorchain(nfa->cm, a);
+}
+
+/*
+ * allocarc - allocate a new arc within an NFA
+ *
+ * Storage comes, in order of preference, from the NFA's free-arc list, from
+ * an unused slot in the newest arc batch, or from a freshly allocated
+ * batch.  Batch sizes start at FIRSTABSIZE and double each time, capped at
+ * MAXABSIZE.  The arc's fields are left for the caller to fill in.
+ */
+static struct arc *             /* NULL for failure */
+allocarc(struct nfa *nfa)
+{
+    struct arcbatch *batch;
+    size_t      batchsize;
+
+    /* first choice: recycle the head of the freelist */
+    if (nfa->freearcs != NULL)
+    {
+        struct arc *recycled = nfa->freearcs;
+
+        nfa->freearcs = recycled->freechain;
+        return recycled;
+    }
+
+    /* second choice: an unused slot in the newest batch */
+    if (nfa->lastab != NULL && nfa->lastabused < nfa->lastab->narcs)
+        return &nfa->lastab->a[nfa->lastabused++];
+
+    /* otherwise a new batch is needed, if the space budget allows it */
+    if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
+    {
+        NERR(REG_ETOOBIG);
+        return NULL;
+    }
+    batchsize = FIRSTABSIZE;
+    if (nfa->lastab != NULL)
+        batchsize = nfa->lastab->narcs * 2;
+    if (batchsize > MAXABSIZE)
+        batchsize = MAXABSIZE;
+    batch = (struct arcbatch *) MALLOC(ARCBATCHSIZE(batchsize));
+    if (batch == NULL)
+    {
+        NERR(REG_ESPACE);
+        return NULL;
+    }
+    nfa->v->spaceused += ARCBATCHSIZE(batchsize);
+    batch->narcs = batchsize;
+    batch->next = nfa->lastab;
+    nfa->lastab = batch;
+    nfa->lastabused = 1;        /* slot 0 is handed out right now */
+
+    return &batch->a[0];
+}
+
+/*
+ * freearc - free an arc
+ *
+ * The arc is removed from the color chain (when it is on one), from its
+ * source's out-chain and from its target's in-chain; then it is wiped and
+ * pushed onto the NFA's free-arc list.  Its memory is not released.
+ */
+static void
+freearc(struct nfa *nfa,
+        struct arc *victim)
+{
+    struct state *from = victim->from;
+    struct state *to = victim->to;
+    struct arc *prev;
+    struct arc *next;
+
+    assert(victim->type != 0);
+
+    /* take it off color chain if necessary */
+    if (COLORED(victim) && nfa->parent == NULL)
+        uncolorchain(nfa->cm, victim);
+
+    /* take it off source's out-chain */
+    assert(from != NULL);
+    prev = victim->outchainRev;
+    next = victim->outchain;
+    if (prev != NULL)
+    {
+        assert(prev->outchain == victim);
+        prev->outchain = next;
+    }
+    else
+    {
+        /* no predecessor, so victim must be the chain head */
+        assert(from->outs == victim);
+        from->outs = next;
+    }
+    if (next != NULL)
+    {
+        assert(next->outchainRev == victim);
+        next->outchainRev = prev;
+    }
+    from->nouts--;
+
+    /* take it off target's in-chain */
+    assert(to != NULL);
+    prev = victim->inchainRev;
+    next = victim->inchain;
+    if (prev != NULL)
+    {
+        assert(prev->inchain == victim);
+        prev->inchain = next;
+    }
+    else
+    {
+        /* no predecessor, so victim must be the chain head */
+        assert(to->ins == victim);
+        to->ins = next;
+    }
+    if (next != NULL)
+    {
+        assert(next->inchainRev == victim);
+        next->inchainRev = prev;
+    }
+    to->nins--;
+
+    /* clean up and place on NFA's free list */
+    victim->type = 0;
+    victim->from = NULL;        /* precautions... */
+    victim->to = NULL;
+    victim->inchain = NULL;
+    victim->inchainRev = NULL;
+    victim->outchain = NULL;
+    victim->outchainRev = NULL;
+    victim->freechain = nfa->freearcs;
+    nfa->freearcs = victim;
+}
+
+/*
+ * changearcsource - flip an arc to have a different from state
+ *
+ * The arc is unlinked from its current source's out-chain and pushed onto
+ * the head of newfrom's out-chain; its in-chain membership is untouched.
+ *
+ * Caller must have verified that there is no pre-existing duplicate arc.
+ */
+static void
+changearcsource(struct arc *a, struct state *newfrom)
+{
+    struct state *oldfrom = a->from;
+    struct arc *prev = a->outchainRev;
+    struct arc *next = a->outchain;
+
+    assert(oldfrom != newfrom);
+    assert(oldfrom != NULL);
+
+    /* take it off old source's out-chain */
+    if (prev != NULL)
+    {
+        assert(prev->outchain == a);
+        prev->outchain = next;
+    }
+    else
+    {
+        /* no predecessor, so a must be the chain head */
+        assert(oldfrom->outs == a);
+        oldfrom->outs = next;
+    }
+    if (next != NULL)
+    {
+        assert(next->outchainRev == a);
+        next->outchainRev = prev;
+    }
+    oldfrom->nouts--;
+
+    /* prepend it to new source's out-chain */
+    a->from = newfrom;
+    a->outchainRev = NULL;
+    a->outchain = newfrom->outs;
+    if (newfrom->outs)
+        newfrom->outs->outchainRev = a;
+    newfrom->outs = a;
+    newfrom->nouts++;
+}
+
+/*
+ * changearctarget - flip an arc to have a different to state
+ *
+ * The arc is unlinked from its current target's in-chain and pushed onto
+ * the head of newto's in-chain; its out-chain membership is untouched.
+ *
+ * Caller must have verified that there is no pre-existing duplicate arc.
+ */
+static void
+changearctarget(struct arc *a, struct state *newto)
+{
+    struct state *oldto = a->to;
+    struct arc *prev = a->inchainRev;
+    struct arc *next = a->inchain;
+
+    assert(oldto != newto);
+    assert(oldto != NULL);
+
+    /* take it off old target's in-chain */
+    if (prev != NULL)
+    {
+        assert(prev->inchain == a);
+        prev->inchain = next;
+    }
+    else
+    {
+        /* no predecessor, so a must be the chain head */
+        assert(oldto->ins == a);
+        oldto->ins = next;
+    }
+    if (next != NULL)
+    {
+        assert(next->inchainRev == a);
+        next->inchainRev = prev;
+    }
+    oldto->nins--;
+
+    /* prepend it to new target's in-chain */
+    a->to = newto;
+    a->inchainRev = NULL;
+    a->inchain = newto->ins;
+    if (newto->ins)
+        newto->ins->inchainRev = a;
+    newto->ins = a;
+    newto->nins++;
+}
+
+/*
+ * hasnonemptyout - Does state have a non-EMPTY out arc?
+ *
+ * Returns 1 if any arc on the state's out-chain has a type other than
+ * EMPTY, else 0 (which includes a state with no out-arcs at all).
+ */
+static int
+hasnonemptyout(struct state *s)
+{
+    struct arc *a = s->outs;
+
+    /* skip over EMPTY arcs; stop at the first other arc or the chain end */
+    while (a != NULL && a->type == EMPTY)
+        a = a->outchain;
+
+    return (a != NULL) ? 1 : 0;
+}
+
+/*
+ * findarc - find arc, if any, from given source with given type and color
+ *
+ * Returns the first matching arc met while walking the out-chain, or NULL
+ * when there is none.  If more than one arc matches, which one is returned
+ * depends on chain order, so callers should treat the choice as arbitrary.
+ */
+static struct arc *
+findarc(struct state *s,
+        int type,
+        color co)
+{
+    struct arc *a = s->outs;
+
+    while (a != NULL)
+    {
+        if (a->type == type && a->co == co)
+            break;
+        a = a->outchain;
+    }
+    return a;
+}
+
+/*
+ * cparc - allocate a new arc within an NFA, copying details from old one
+ *
+ * The type and color are taken from "oa"; the endpoints are the given
+ * "from" and "to" states.  Creation goes through newarc(), so an arc that
+ * already exists is not duplicated.
+ */
+static void
+cparc(struct nfa *nfa,
+      struct arc *oa,
+      struct state *from,
+      struct state *to)
+{
+    int         arctype = oa->type;
+    color       arccolor = oa->co;
+
+    newarc(nfa, arctype, arccolor, from, to);
+}
+
+/*
+ * sortins - sort the in arcs of a state by from/color/type
+ *
+ * The in-chain is copied into a temporary pointer array, sorted with
+ * qsort() using sortins_cmp(), and then relinked in sorted order.  If the
+ * array cannot be allocated, REG_ESPACE is reported and the chain is left
+ * as it was.
+ */
+static void
+sortins(struct nfa *nfa,
+        struct state *s)
+{
+    struct arc **sorted;
+    struct arc *a;
+    int         n = s->nins;
+    int         i;
+
+    if (n <= 1)
+        return;                 /* nothing to do */
+
+    /* make an array of arc pointers ... */
+    sorted = (struct arc **) MALLOC(n * sizeof(struct arc *));
+    if (sorted == NULL)
+    {
+        NERR(REG_ESPACE);
+        return;
+    }
+    for (i = 0, a = s->ins; a != NULL; a = a->inchain)
+        sorted[i++] = a;
+    assert(i == n);
+
+    /* ... sort the array */
+    qsort(sorted, n, sizeof(struct arc *), sortins_cmp);
+
+    /* ... and rebuild arc list in order; the ends get NULL neighbors */
+    for (i = 0; i < n; i++)
+    {
+        a = sorted[i];
+        a->inchainRev = (i > 0) ? sorted[i - 1] : NULL;
+        a->inchain = (i < n - 1) ? sorted[i + 1] : NULL;
+    }
+    s->ins = sorted[0];
+
+    FREE(sorted);
+}
+
+/*
+ * sortins_cmp - qsort comparator ordering arcs by from-state number, then
+ * color, then type
+ *
+ * Both arguments point at "struct arc *" array elements.  The result is
+ * always exactly -1, 0 or +1; callers in this file switch on those values.
+ */
+static int
+sortins_cmp(const void *a, const void *b)
+{
+    const struct arc *aa = *((const struct arc *const *) a);
+    const struct arc *bb = *((const struct arc *const *) b);
+
+    /* we check the fields in the order they are most likely to be different */
+    if (aa->from->no != bb->from->no)
+        return (aa->from->no < bb->from->no) ? -1 : 1;
+    if (aa->co != bb->co)
+        return (aa->co < bb->co) ? -1 : 1;
+    if (aa->type != bb->type)
+        return (aa->type < bb->type) ? -1 : 1;
+    return 0;
+}
+
+/*
+ * sortouts - sort the out arcs of a state by to/color/type
+ *
+ * The out-chain is copied into a temporary pointer array, sorted with
+ * qsort() using sortouts_cmp(), and then relinked in sorted order.  If the
+ * array cannot be allocated, REG_ESPACE is reported and the chain is left
+ * as it was.
+ */
+static void
+sortouts(struct nfa *nfa,
+         struct state *s)
+{
+    struct arc **sorted;
+    struct arc *a;
+    int         n = s->nouts;
+    int         i;
+
+    if (n <= 1)
+        return;                 /* nothing to do */
+
+    /* make an array of arc pointers ... */
+    sorted = (struct arc **) MALLOC(n * sizeof(struct arc *));
+    if (sorted == NULL)
+    {
+        NERR(REG_ESPACE);
+        return;
+    }
+    for (i = 0, a = s->outs; a != NULL; a = a->outchain)
+        sorted[i++] = a;
+    assert(i == n);
+
+    /* ... sort the array */
+    qsort(sorted, n, sizeof(struct arc *), sortouts_cmp);
+
+    /* ... and rebuild arc list in order; the ends get NULL neighbors */
+    for (i = 0; i < n; i++)
+    {
+        a = sorted[i];
+        a->outchainRev = (i > 0) ? sorted[i - 1] : NULL;
+        a->outchain = (i < n - 1) ? sorted[i + 1] : NULL;
+    }
+    s->outs = sorted[0];
+
+    FREE(sorted);
+}
+
+/*
+ * sortouts_cmp - qsort comparator ordering arcs by to-state number, then
+ * color, then type
+ *
+ * Both arguments point at "struct arc *" array elements.  The result is
+ * always exactly -1, 0 or +1; callers in this file switch on those values.
+ */
+static int
+sortouts_cmp(const void *a, const void *b)
+{
+    const struct arc *aa = *((const struct arc *const *) a);
+    const struct arc *bb = *((const struct arc *const *) b);
+
+    /* we check the fields in the order they are most likely to be different */
+    if (aa->to->no != bb->to->no)
+        return (aa->to->no < bb->to->no) ? -1 : 1;
+    if (aa->co != bb->co)
+        return (aa->co < bb->co) ? -1 : 1;
+    if (aa->type != bb->type)
+        return (aa->type < bb->type) ? -1 : 1;
+    return 0;
+}
+
+/*
+ * Common decision logic about whether to use arc-by-arc operations or
+ * sort/merge.  With fewer than 4 source arcs the cost of sorting the
+ * destination arc list cannot be recouped, however large that list is, so
+ * the answer is always "no".  Otherwise sort/merge is chosen as soon as
+ * either list has more than 32 arcs, which limits the number of arc-by-arc
+ * comparisons to about 1000 (a somewhat arbitrary choice, but the breakeven
+ * point would probably be machine dependent anyway).
+ *
+ * Evaluates to 1 for sort/merge and 0 for arc-by-arc.
+ */
+#define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs) \
+    ((nsrcarcs) >= 4 && ((nsrcarcs) > 32 || (ndestarcs) > 32))
+
+/*
+ * moveins - move all in arcs of a state to another state
+ *
+ * You might think this could be done better by just updating the
+ * existing arcs, and you would be right if it weren't for the need
+ * for duplicate suppression, which makes it easier to just make new
+ * ones to exploit the suppression built into newarc.
+ *
+ * However, if we have a whole lot of arcs to deal with, retail duplicate
+ * checks become too slow. In that case we proceed by sorting and merging
+ * the arc lists, and then we can indeed just update the arcs in-place.
+ *
+ * On the other hand, it's also true that this is frequently called with
+ * a brand-new newState that has no existing in-arcs. In that case,
+ * de-duplication is unnecessary, so we can just blindly move all the arcs.
+ *
+ * The function returns early, leaving arcs on oldState, if a cancel is
+ * requested or the sort step reports an error in the sort-merge path.
+ */
+static void
+moveins(struct nfa *nfa,
+ struct state *oldState,
+ struct state *newState)
+{
+ assert(oldState != newState);
+
+ if (newState->nins == 0)
+ {
+ /* No need for de-duplication */
+ struct arc *a;
+
+ /* freearc() unlinks the chain head, so the loop ends once ins is empty */
+ while ((a = oldState->ins) != NULL)
+ {
+ createarc(nfa, a->type, a->co, a->from, newState);
+ freearc(nfa, a);
+ }
+ }
+ else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
+ {
+ /* With not too many arcs, just do them one at a time */
+ struct arc *a;
+
+ /* cparc() goes through newarc(), which suppresses duplicates */
+ while ((a = oldState->ins) != NULL)
+ {
+ cparc(nfa, a, a->from, newState);
+ freearc(nfa, a);
+ }
+ }
+ else
+ {
+ /*
+ * With many arcs, use a sort-merge approach. Note changearctarget()
+ * will put the arc onto the front of newState's chain, so it does not
+ * break our walk through the sorted part of the chain.
+ */
+ struct arc *oa;
+ struct arc *na;
+
+ /*
+ * Because we bypass newarc() in this code path, we'd better include a
+ * cancel check.
+ */
+ if (CANCEL_REQUESTED(nfa->v->re))
+ {
+ NERR(REG_CANCEL);
+ return;
+ }
+
+ sortins(nfa, oldState);
+ sortins(nfa, newState);
+ if (NISERR())
+ return; /* might have failed to sort */
+ oa = oldState->ins;
+ na = newState->ins;
+ while (oa != NULL && na != NULL)
+ {
+ struct arc *a = oa;
+
+ switch (sortins_cmp(&oa, &na))
+ {
+ case -1:
+ /* newState does not have anything matching oa */
+ /* step oa first: changearctarget() overwrites a->inchain */
+ oa = oa->inchain;
+
+ /*
+ * Rather than doing createarc+freearc, we can just unlink
+ * and relink the existing arc struct.
+ */
+ changearctarget(a, newState);
+ break;
+ case 0:
+ /* match, advance in both lists */
+ /* step oa first: freearc() clears a's chain links */
+ oa = oa->inchain;
+ na = na->inchain;
+ /* ... and drop duplicate arc from oldState */
+ freearc(nfa, a);
+ break;
+ case +1:
+ /* advance only na; oa might have a match later */
+ na = na->inchain;
+ break;
+ default:
+ assert(NOTREACHED);
+ }
+ }
+ /* newState's sorted arcs are exhausted; the rest of oa cannot be dups */
+ while (oa != NULL)
+ {
+ /* newState does not have anything matching oa */
+ struct arc *a = oa;
+
+ oa = oa->inchain;
+ changearctarget(a, newState);
+ }
+ }
+
+ /* every path reaching here has relinked or freed each old in-arc */
+ assert(oldState->nins == 0);
+ assert(oldState->ins == NULL);
+}
+
+/*
+ * copyins - copy in arcs of a state to another state
+ *
+ * The comments for moveins() apply here as well. However, in current
+ * usage, this is *only* called with brand-new target states, so that
+ * only the "no need for de-duplication" code path is ever reached.
+ * We keep the rest #ifdef'd out in case it's needed in the future.
+ *
+ * Unlike moveins(), oldState keeps all of its in-arcs.
+ */
+static void
+copyins(struct nfa *nfa,
+ struct state *oldState,
+ struct state *newState)
+{
+ assert(oldState != newState);
+ assert(newState->nins == 0); /* see comment above */
+
+ if (newState->nins == 0)
+ {
+ /* No need for de-duplication */
+ struct arc *a;
+
+ /* same type, color and source; only the target becomes newState */
+ for (a = oldState->ins; a != NULL; a = a->inchain)
+ createarc(nfa, a->type, a->co, a->from, newState);
+ }
+#ifdef NOT_USED /* see comment above */
+ else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
+ {
+ /* With not too many arcs, just do them one at a time */
+ struct arc *a;
+
+ for (a = oldState->ins; a != NULL; a = a->inchain)
+ cparc(nfa, a, a->from, newState);
+ }
+ else
+ {
+ /*
+ * With many arcs, use a sort-merge approach. Note that createarc()
+ * will put new arcs onto the front of newState's chain, so it does
+ * not break our walk through the sorted part of the chain.
+ */
+ struct arc *oa;
+ struct arc *na;
+
+ /*
+ * Because we bypass newarc() in this code path, we'd better include a
+ * cancel check.
+ */
+ if (CANCEL_REQUESTED(nfa->v->re))
+ {
+ NERR(REG_CANCEL);
+ return;
+ }
+
+ sortins(nfa, oldState);
+ sortins(nfa, newState);
+ if (NISERR())
+ return; /* might have failed to sort */
+ oa = oldState->ins;
+ na = newState->ins;
+ while (oa != NULL && na != NULL)
+ {
+ struct arc *a = oa;
+
+ switch (sortins_cmp(&oa, &na))
+ {
+ case -1:
+ /* newState does not have anything matching oa */
+ oa = oa->inchain;
+ createarc(nfa, a->type, a->co, a->from, newState);
+ break;
+ case 0:
+ /* match, advance in both lists */
+ oa = oa->inchain;
+ na = na->inchain;
+ break;
+ case +1:
+ /* advance only na; oa might have a match later */
+ na = na->inchain;
+ break;
+ default:
+ assert(NOTREACHED);
+ }
+ }
+ /* newState's sorted arcs are exhausted; the rest of oa cannot be dups */
+ while (oa != NULL)
+ {
+ /* newState does not have anything matching oa */
+ struct arc *a = oa;
+
+ oa = oa->inchain;
+ createarc(nfa, a->type, a->co, a->from, newState);
+ }
+ }
+#endif /* NOT_USED */
+}
+
+/*
+ * mergeins - merge a list of inarcs into a state
+ *
+ * This is much like copyins, but the source arcs are listed in an array,
+ * and are not guaranteed unique. It's okay to clobber the array contents.
+ *
+ * For each distinct (from, color, type) found in arcarray[0..arccount-1]
+ * that s does not already have as an in-arc, a new arc into s is created.
+ * The array is sorted and compacted in place.
+ */
+static void
+mergeins(struct nfa *nfa,
+ struct state *s,
+ struct arc **arcarray,
+ int arccount)
+{
+ struct arc *na;
+ int i;
+ int j;
+
+ if (arccount <= 0)
+ return;
+
+ /*
+ * Because we bypass newarc() in this code path, we'd better include a
+ * cancel check.
+ */
+ if (CANCEL_REQUESTED(nfa->v->re))
+ {
+ NERR(REG_CANCEL);
+ return;
+ }
+
+ /* Sort existing inarcs as well as proposed new ones */
+ sortins(nfa, s);
+ if (NISERR())
+ return; /* might have failed to sort */
+
+ qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp);
+
+ /*
+ * arcarray very likely includes dups, so we must eliminate them. (This
+ * could be folded into the next loop, but it's not worth the trouble.)
+ */
+ /* j indexes the last unique entry kept so far; i scans ahead of it */
+ j = 0;
+ for (i = 1; i < arccount; i++)
+ {
+ switch (sortins_cmp(&arcarray[j], &arcarray[i]))
+ {
+ case -1:
+ /* non-dup */
+ arcarray[++j] = arcarray[i];
+ break;
+ case 0:
+ /* dup */
+ break;
+ default:
+ /* trouble */
+ /* +1 would mean the array is not in sorted order */
+ assert(NOTREACHED);
+ }
+ }
+ arccount = j + 1;
+
+ /*
+ * Now merge into s' inchain. Note that createarc() will put new arcs
+ * onto the front of s's chain, so it does not break our walk through the
+ * sorted part of the chain.
+ */
+ i = 0;
+ na = s->ins;
+ while (i < arccount && na != NULL)
+ {
+ struct arc *a = arcarray[i];
+
+ switch (sortins_cmp(&a, &na))
+ {
+ case -1:
+ /* s does not have anything matching a */
+ createarc(nfa, a->type, a->co, a->from, s);
+ i++;
+ break;
+ case 0:
+ /* match, advance in both lists */
+ i++;
+ na = na->inchain;
+ break;
+ case +1:
+ /* advance only na; array might have a match later */
+ na = na->inchain;
+ break;
+ default:
+ assert(NOTREACHED);
+ }
+ }
+ /* s's sorted arcs are exhausted; remaining array entries cannot be dups */
+ while (i < arccount)
+ {
+ /* s does not have anything matching a */
+ struct arc *a = arcarray[i];
+
+ createarc(nfa, a->type, a->co, a->from, s);
+ i++;
+ }
+}
+
+/*
+ * moveouts - move all out arcs of a state to another state
+ *
+ * See comments for moveins()
+ *
+ * This is the out-arc mirror of moveins(): the same three strategies are
+ * used, working on the outs chains with sortouts()/changearcsource().
+ * The function returns early, leaving arcs on oldState, if a cancel is
+ * requested or the sort step reports an error in the sort-merge path.
+ */
+static void
+moveouts(struct nfa *nfa,
+ struct state *oldState,
+ struct state *newState)
+{
+ assert(oldState != newState);
+
+ if (newState->nouts == 0)
+ {
+ /* No need for de-duplication */
+ struct arc *a;
+
+ /* freearc() unlinks the chain head, so the loop ends once outs is empty */
+ while ((a = oldState->outs) != NULL)
+ {
+ createarc(nfa, a->type, a->co, newState, a->to);
+ freearc(nfa, a);
+ }
+ }
+ else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
+ {
+ /* With not too many arcs, just do them one at a time */
+ struct arc *a;
+
+ /* cparc() goes through newarc(), which suppresses duplicates */
+ while ((a = oldState->outs) != NULL)
+ {
+ cparc(nfa, a, newState, a->to);
+ freearc(nfa, a);
+ }
+ }
+ else
+ {
+ /*
+ * With many arcs, use a sort-merge approach. Note changearcsource()
+ * will put the arc onto the front of newState's chain, so it does not
+ * break our walk through the sorted part of the chain.
+ */
+ struct arc *oa;
+ struct arc *na;
+
+ /*
+ * Because we bypass newarc() in this code path, we'd better include a
+ * cancel check.
+ */
+ if (CANCEL_REQUESTED(nfa->v->re))
+ {
+ NERR(REG_CANCEL);
+ return;
+ }
+
+ sortouts(nfa, oldState);
+ sortouts(nfa, newState);
+ if (NISERR())
+ return; /* might have failed to sort */
+ oa = oldState->outs;
+ na = newState->outs;
+ while (oa != NULL && na != NULL)
+ {
+ struct arc *a = oa;
+
+ switch (sortouts_cmp(&oa, &na))
+ {
+ case -1:
+ /* newState does not have anything matching oa */
+ /* step oa first: changearcsource() overwrites a->outchain */
+ oa = oa->outchain;
+
+ /*
+ * Rather than doing createarc+freearc, we can just unlink
+ * and relink the existing arc struct.
+ */
+ changearcsource(a, newState);
+ break;
+ case 0:
+ /* match, advance in both lists */
+ /* step oa first: freearc() clears a's chain links */
+ oa = oa->outchain;
+ na = na->outchain;
+ /* ... and drop duplicate arc from oldState */
+ freearc(nfa, a);
+ break;
+ case +1:
+ /* advance only na; oa might have a match later */
+ na = na->outchain;
+ break;
+ default:
+ assert(NOTREACHED);
+ }
+ }
+ /* newState's sorted arcs are exhausted; the rest of oa cannot be dups */
+ while (oa != NULL)
+ {
+ /* newState does not have anything matching oa */
+ struct arc *a = oa;
+
+ oa = oa->outchain;
+ changearcsource(a, newState);
+ }
+ }
+
+ /* every path reaching here has relinked or freed each old out-arc */
+ assert(oldState->nouts == 0);
+ assert(oldState->outs == NULL);
+}
+
+/*
+ * copyouts - copy out arcs of a state to another state
+ *
+ * For every outarc of oldState, an arc with the same type, color and
+ * destination is created from newState. oldState's outarc chain is only
+ * read here. newState must start with no outarcs (asserted below); the
+ * code that would de-duplicate against pre-existing outarcs of newState is
+ * compiled only when NOT_USED is defined.
+ *
+ * See comments for copyins()
+ */
+static void
+copyouts(struct nfa *nfa,
+ struct state *oldState,
+ struct state *newState)
+{
+ assert(oldState != newState);
+ assert(newState->nouts == 0); /* see comment above */
+
+ if (newState->nouts == 0)
+ {
+ /* No need for de-duplication */
+ struct arc *a;
+
+ for (a = oldState->outs; a != NULL; a = a->outchain)
+ createarc(nfa, a->type, a->co, newState, a->to);
+ }
+#ifdef NOT_USED /* see comment above */
+ else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
+ {
+ /* With not too many arcs, just do them one at a time */
+ struct arc *a;
+
+ for (a = oldState->outs; a != NULL; a = a->outchain)
+ cparc(nfa, a, newState, a->to);
+ }
+ else
+ {
+ /*
+ * With many arcs, use a sort-merge approach. Note that createarc()
+ * will put new arcs onto the front of newState's chain, so it does
+ * not break our walk through the sorted part of the chain.
+ */
+ struct arc *oa;
+ struct arc *na;
+
+ /*
+ * Because we bypass newarc() in this code path, we'd better include a
+ * cancel check.
+ */
+ if (CANCEL_REQUESTED(nfa->v->re))
+ {
+ NERR(REG_CANCEL);
+ return;
+ }
+
+ sortouts(nfa, oldState);
+ sortouts(nfa, newState);
+ if (NISERR())
+ return; /* might have failed to sort */
+ oa = oldState->outs;
+ na = newState->outs;
+ /* merge-walk both sorted chains in parallel */
+ while (oa != NULL && na != NULL)
+ {
+ /* remember the current arc; oa is advanced before a is copied */
+ struct arc *a = oa;
+
+ switch (sortouts_cmp(&oa, &na))
+ {
+ case -1:
+ /* newState does not have anything matching oa */
+ oa = oa->outchain;
+ createarc(nfa, a->type, a->co, newState, a->to);
+ break;
+ case 0:
+ /* match, advance in both lists */
+ oa = oa->outchain;
+ na = na->outchain;
+ break;
+ case +1:
+ /* advance only na; oa might have a match later */
+ na = na->outchain;
+ break;
+ default:
+ assert(NOTREACHED);
+ }
+ }
+ /* newState's sorted chain is exhausted; the rest of oa is all new */
+ while (oa != NULL)
+ {
+ /* newState does not have anything matching oa */
+ struct arc *a = oa;
+
+ oa = oa->outchain;
+ createarc(nfa, a->type, a->co, newState, a->to);
+ }
+ }
+#endif /* NOT_USED */
+}
+
+/*
+ * cloneouts - copy out arcs of a state to another state pair, modifying type
+ *
+ * For each outarc of "old" (all of which must be PLAIN), an arc of the given
+ * type and the same color is made from "from" to "to". The only types
+ * accepted are AHEAD and BEHIND, which interpret "co" the same way PLAIN
+ * does; this would make no sense for LACONs.
+ */
+static void
+cloneouts(struct nfa *nfa,
+ struct state *old,
+ struct state *from,
+ struct state *to,
+ int type)
+{
+ struct arc *src;
+
+ assert(old != from);
+ assert(type == AHEAD || type == BEHIND);
+
+ src = old->outs;
+ while (src != NULL)
+ {
+ assert(src->type == PLAIN);
+ newarc(nfa, type, src->co, from, to);
+ src = src->outchain;
+ }
+}
+
+/*
+ * delsub - delete a sub-NFA, updating subre pointers if necessary
+ *
+ * The work is done by the recursive deltraverse(), which uses each state's
+ * tmp pointer as its already-seen marker.
+ */
+static void
+delsub(struct nfa *nfa,
+ struct state *lp, /* the sub-NFA goes from here... */
+ struct state *rp) /* ...to here, *not* inclusive */
+{
+ assert(lp != rp);
+
+ /* a non-NULL tmp makes deltraverse() leave the right end alone */
+ rp->tmp = rp;
+
+ deltraverse(nfa, lp, lp);
+
+ if (!NISERR())
+ {
+ /* the post-conditions are only trustworthy without an error */
+ assert(lp->nouts == 0 && rp->nins == 0); /* did the job */
+ assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */
+
+ /* remove the marks from both ends */
+ rp->tmp = NULL;
+ lp->tmp = NULL;
+ }
+}
+
+/*
+ * deltraverse - the recursive heart of delsub
+ * Its essential task is to get rid of every out-arc of the given state,
+ * freeing destination states that are left with no in-arcs.
+ */
+static void
+deltraverse(struct nfa *nfa,
+ struct state *leftend,
+ struct state *s)
+{
+ struct arc *out;
+
+ /* Recursion depth is input-driven, so guard against stack overflow */
+ if (STACK_TOO_DEEP(nfa->v->re))
+ {
+ NERR(REG_ETOOBIG);
+ return;
+ }
+
+ /* nothing to do, or this state is already being worked on */
+ if (s->nouts == 0 || s->tmp != NULL)
+ return;
+
+ s->tmp = s; /* mark as in progress */
+
+ /* keep consuming the head of the chain until the chain is empty */
+ for (out = s->outs; out != NULL; out = s->outs)
+ {
+ struct state *dest = out->to;
+
+ deltraverse(nfa, leftend, dest);
+ if (NISERR())
+ return; /* asserts might not hold after failure */
+ assert(dest->nouts == 0 || dest->tmp != NULL);
+ freearc(nfa, out);
+ if (dest->nins == 0 && dest->tmp == NULL)
+ {
+ /* unmarked and no longer pointed at by anything: release it */
+ assert(dest->nouts == 0);
+ freestate(nfa, dest);
+ }
+ }
+
+ assert(s->no != FREESTATE); /* we're still here */
+ assert(s == leftend || s->nins != 0); /* and still reachable */
+ assert(s->nouts == 0); /* but have no outarcs */
+
+ s->tmp = NULL; /* we're done here */
+}
+
+/*
+ * dupnfa - duplicate sub-NFA
+ *
+ * A recursive walk (duptraverse) in which each state's tmp pointer both
+ * marks the state as visited and points at its duplicate. When start and
+ * stop coincide, the duplicate is just one EMPTY arc from "from" to "to".
+ */
+static void
+dupnfa(struct nfa *nfa,
+ struct state *start, /* duplicate of subNFA starting here */
+ struct state *stop, /* and stopping here */
+ struct state *from, /* stringing duplicate from here */
+ struct state *to) /* to here */
+{
+ if (start != stop)
+ {
+ /* pre-assign "to" as the duplicate of the stop state */
+ stop->tmp = to;
+ duptraverse(nfa, start, from);
+
+ /* all that is left is wiping the tmp pointers again */
+ stop->tmp = NULL;
+ cleartraverse(nfa, start);
+ }
+ else
+ {
+ /* degenerate sub-NFA */
+ newarc(nfa, EMPTY, 0, from, to);
+ }
+}
+
+/*
+ * duptraverse - recursive heart of dupnfa
+ *
+ * Gives s a duplicate (stmp if supplied, else a new state), recorded in
+ * s->tmp, then duplicates each outarc of s onto that duplicate after first
+ * making sure the arc's destination has a duplicate too.
+ */
+static void
+duptraverse(struct nfa *nfa,
+ struct state *s,
+ struct state *stmp) /* s's duplicate, or NULL */
+{
+ struct arc *out;
+
+ /* Recursion depth is input-driven, so guard against stack overflow */
+ if (STACK_TOO_DEEP(nfa->v->re))
+ {
+ NERR(REG_ETOOBIG);
+ return;
+ }
+
+ /* a non-NULL tmp means this state was handled already */
+ if (s->tmp != NULL)
+ return;
+
+ if (stmp == NULL)
+ s->tmp = newstate(nfa);
+ else
+ s->tmp = stmp;
+
+ if (s->tmp == NULL)
+ {
+ /* newstate() came back empty-handed */
+ assert(NISERR());
+ return;
+ }
+
+ out = s->outs;
+ while (out != NULL && !NISERR())
+ {
+ duptraverse(nfa, out->to, (struct state *) NULL);
+ if (NISERR())
+ break;
+ assert(out->to->tmp != NULL);
+ cparc(nfa, out, s->tmp, out->to->tmp);
+ out = out->outchain;
+ }
+}
+
+/*
+ * removeconstraints - remove any constraints in an NFA
+ *
+ * Every constraint arc is swapped for an EMPTY arc, which amounts to
+ * treating all constraints as automatically satisfied.
+ */
+static void
+removeconstraints(struct nfa *nfa,
+ struct state *start, /* process subNFA starting here */
+ struct state *stop) /* and stopping here */
+{
+ if (start != stop)
+ {
+ /* a non-NULL tmp keeps the traversal from processing stop */
+ stop->tmp = stop;
+ removetraverse(nfa, start);
+
+ /* all that is left is wiping the tmp pointers again */
+ stop->tmp = NULL;
+ cleartraverse(nfa, start);
+ }
+}
+
+/*
+ * removetraverse - recursive heart of removeconstraints
+ *
+ * Marks s via its tmp pointer, then for each outarc first recurses into the
+ * arc's destination and afterwards replaces the arc by an EMPTY arc if it is
+ * a constraint. An arc of unrecognized type raises REG_ASSERT.
+ */
+static void
+removetraverse(struct nfa *nfa,
+ struct state *s)
+{
+ struct arc *cur;
+ struct arc *following;
+
+ /* Recursion depth is input-driven, so guard against stack overflow */
+ if (STACK_TOO_DEEP(nfa->v->re))
+ {
+ NERR(REG_ETOOBIG);
+ return;
+ }
+
+ /* a non-NULL tmp means this state was handled already */
+ if (s->tmp != NULL)
+ return;
+
+ s->tmp = s;
+ cur = s->outs;
+ while (cur != NULL && !NISERR())
+ {
+ removetraverse(nfa, cur->to);
+ if (NISERR())
+ break;
+
+ /* fetch the successor now, since cur may be freed below */
+ following = cur->outchain;
+
+ if (cur->type == PLAIN || cur->type == EMPTY)
+ {
+ /* not a constraint: leave it as it is */
+ }
+ else if (cur->type == AHEAD ||
+ cur->type == BEHIND ||
+ cur->type == '^' ||
+ cur->type == '$' ||
+ cur->type == LACON)
+ {
+ /* constraint: put an EMPTY arc in its place */
+ newarc(nfa, EMPTY, 0, s, cur->to);
+ freearc(nfa, cur);
+ }
+ else
+ {
+ /* unknown arc type */
+ NERR(REG_ASSERT);
+ }
+
+ cur = following;
+ }
+}
+
+/*
+ * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
+ *
+ * Resets s->tmp to NULL and recurses along the outarcs; a state whose tmp
+ * is already NULL stops the descent.
+ */
+static void
+cleartraverse(struct nfa *nfa,
+ struct state *s)
+{
+ struct arc *out;
+
+ /* Recursion depth is input-driven, so guard against stack overflow */
+ if (STACK_TOO_DEEP(nfa->v->re))
+ {
+ NERR(REG_ETOOBIG);
+ return;
+ }
+
+ /* already clear: nothing below here needs visiting */
+ if (s->tmp == NULL)
+ return;
+ s->tmp = NULL;
+
+ out = s->outs;
+ while (out != NULL)
+ {
+ cleartraverse(nfa, out->to);
+ out = out->outchain;
+ }
+}
+
+/*
+ * single_color_transition - does getting from s1 to s2 cross one PLAIN arc?
+ *
+ * When the only way from s1 to s2 is a single PLAIN match (of one color or
+ * any of several), the result is a state all of whose outarcs are PLAIN arcs
+ * of those color(s). In every other case the result is NULL.
+ *
+ * This runs before the NFA is optimized, so EMPTY arcs may be present and
+ * are to be ignored; a leading EMPTY is the reason the returned state is
+ * not necessarily s1 itself.
+ *
+ * Several parallel PLAIN arcs are worth supporting because a bracket
+ * construct such as [abc] can come out as one arc or as several, depending
+ * on earlier atoms in the expression, and that implementation detail should
+ * not cause user-visible performance differences.
+ */
+static struct state *
+single_color_transition(struct state *s1, struct state *s2)
+{
+ struct state *src = s1;
+ struct state *dst = s2;
+ struct arc *out;
+
+ /* Step over a lone leading EMPTY arc */
+ if (src->nouts == 1 && src->outs->type == EMPTY)
+ src = src->outs->to;
+ /* ... and step back over a lone trailing EMPTY arc */
+ if (dst->nins == 1 && dst->ins->type == EMPTY)
+ dst = dst->ins->from;
+
+ /* Reject a possible single-state loop, and a source with no outarcs */
+ if (src == dst || src->outs == NULL)
+ return NULL;
+
+ /* Every outarc has to be a PLAIN arc that lands on dst */
+ out = src->outs;
+ while (out != NULL)
+ {
+ if (!(out->type == PLAIN && out->to == dst))
+ return NULL;
+ out = out->outchain;
+ }
+
+ /* src is the state that owns the relevant outarcs */
+ return src;
+}
+
+/*
+ * specialcolors - fill in special colors for an NFA
+ *
+ * An NFA without a parent gets fresh pseudocolors for its two bos[] and two
+ * eos[] slots (false colors for BOS, BOL, EOS, EOL); an NFA with a parent
+ * copies the parent's values, none of which may be COLORLESS.
+ */
+static void
+specialcolors(struct nfa *nfa)
+{
+ struct nfa *parent = nfa->parent;
+ int i;
+
+ if (parent != NULL)
+ {
+ /* inherit, in the order bos[0], bos[1], eos[0], eos[1] */
+ for (i = 0; i < 2; i++)
+ {
+ assert(parent->bos[i] != COLORLESS);
+ nfa->bos[i] = parent->bos[i];
+ }
+ for (i = 0; i < 2; i++)
+ {
+ assert(parent->eos[i] != COLORLESS);
+ nfa->eos[i] = parent->eos[i];
+ }
+ return;
+ }
+
+ /* top-level NFA: allocate, in the same order */
+ for (i = 0; i < 2; i++)
+ nfa->bos[i] = pseudocolor(nfa->cm);
+ for (i = 0; i < 2; i++)
+ nfa->eos[i] = pseudocolor(nfa->cm);
+}
+
+/*
+ * optimize - optimize an NFA
+ *
+ * Despite the name, the point here is less "optimization" (although useless
+ * NFA states do get removed) than bringing the NFA into a shape the regex
+ * executor can cope with. The executor, and the cNFA format it consumes,
+ * understand only PLAIN and LACON arcs, whereas the parser's output also
+ * contains EMPTY (do-nothing) arcs and the constraint arcs ^, $, AHEAD and
+ * BEHIND, all of which have to be eliminated here. EMPTY arcs go first,
+ * then the constraint arcs. In both cases the hard part is removing
+ * circular loops of the arc type in question; that is unavoidable anyway,
+ * because such a loop would let the executor go around forever without
+ * consuming any input.
+ *
+ * The steps run in this order: cleanup, fixempties, fixconstraintloops,
+ * pullback, pushfwd, cleanup, and finally analyze, whose result is returned.
+ */
+static long /* re_info bits */
+optimize(struct nfa *nfa,
+ FILE *f) /* for debug output; NULL none */
+{
+ long info;
+#ifdef REG_DEBUG
+ const int verbose = (f != NULL);
+
+ if (verbose)
+ fprintf(f, "\ninitial cleanup:\n");
+#endif
+
+ /* may simplify situation */
+ cleanup(nfa);
+#ifdef REG_DEBUG
+ if (verbose)
+ {
+ dumpnfa(nfa, f);
+ fprintf(f, "\nempties:\n");
+ }
+#endif
+
+ /* get rid of EMPTY arcs */
+ fixempties(nfa, f);
+#ifdef REG_DEBUG
+ if (verbose)
+ fprintf(f, "\nconstraints:\n");
+#endif
+
+ /* constraint loops first, then move constraints out to the ends */
+ fixconstraintloops(nfa, f);
+ pullback(nfa, f);
+ pushfwd(nfa, f);
+#ifdef REG_DEBUG
+ if (verbose)
+ fprintf(f, "\nfinal cleanup:\n");
+#endif
+
+ /* final tidying */
+ cleanup(nfa);
+#ifdef REG_DEBUG
+ if (verbose)
+ dumpnfa(nfa, f);
+#endif
+
+ /* and analysis */
+ info = analyze(nfa);
+ return info;
+}
+
+/*
+ * pullback - pull back constraints backward to eliminate them
+ *
+ * Repeatedly scans all states, calling pull() on every '^' or BEHIND outarc,
+ * until a full pass makes no progress or an error is recorded. Afterwards,
+ * any '^' arcs leaving the pre state are replaced by PLAIN arcs using the
+ * bos[] colors.
+ */
+static void
+pullback(struct nfa *nfa,
+ FILE *f) /* for debug output; NULL none */
+{
+ struct state *s;
+ struct state *nexts;
+ struct arc *a;
+ struct arc *nexta;
+ struct state *intermediates;
+ int progress;
+
+ /* find and pull until there are no more */
+ do
+ {
+ progress = 0;
+ for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+ {
+ /* fetch the successor first, since s may be dropped below */
+ nexts = s->next;
+ /* the list of intermediate states is kept per from-state */
+ intermediates = NULL;
+ for (a = s->outs; a != NULL && !NISERR(); a = nexta)
+ {
+ /* fetch the successor first, since pull() frees the arc */
+ nexta = a->outchain;
+ if (a->type == '^' || a->type == BEHIND)
+ if (pull(nfa, a, &intermediates))
+ progress = 1;
+ }
+ /* clear tmp fields of intermediate states created here */
+ while (intermediates != NULL)
+ {
+ struct state *ns = intermediates->tmp;
+
+ intermediates->tmp = NULL;
+ intermediates = ns;
+ }
+ /* if s is now useless, get rid of it */
+ if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+ dropstate(nfa, s);
+ }
+ if (progress && f != NULL)
+ dumpnfa(nfa, f);
+ } while (progress && !NISERR());
+ if (NISERR())
+ return;
+
+ /*
+ * Any ^ constraints we were able to pull to the start state can now be
+ * replaced by PLAIN arcs referencing the BOS or BOL colors. There should
+ * be no other ^ or BEHIND arcs left in the NFA, though we do not check
+ * that here (compact() will fail if so).
+ */
+ for (a = nfa->pre->outs; a != NULL; a = nexta)
+ {
+ /* fetch the successor first, since a may be freed below */
+ nexta = a->outchain;
+ if (a->type == '^')
+ {
+ /* a->co is used as the index into bos[] */
+ assert(a->co == 0 || a->co == 1);
+ newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
+ freearc(nfa, a);
+ }
+ }
+}
+
+/*
+ * pull - pull a back constraint backward past its source state
+ *
+ * Returns 1 if successful (which it always is unless the source is the
+ * start state or we have an internal error), 0 if nothing happened.
+ *
+ * A significant property of this function is that it deletes no pre-existing
+ * states, and no outarcs of the constraint's from state other than the given
+ * constraint arc. This makes the loops in pullback() safe, at the cost that
+ * we may leave useless states behind. Therefore, we leave it to pullback()
+ * to delete such states.
+ *
+ * If the from state has multiple back-constraint outarcs, and/or multiple
+ * compatible constraint inarcs, we only need to create one new intermediate
+ * state per combination of predecessor and successor states. *intermediates
+ * points to a list of such intermediate states for this from state (chained
+ * through their tmp fields).
+ *
+ * Whenever 1 is returned, the arc passed in as con has been freed, so the
+ * caller must not touch it afterwards.
+ */
+static int
+pull(struct nfa *nfa,
+ struct arc *con,
+ struct state **intermediates)
+{
+ struct state *from = con->from;
+ struct state *to = con->to;
+ struct arc *a;
+ struct arc *nexta;
+ struct state *s;
+
+ assert(from != to); /* should have gotten rid of this earlier */
+ if (from->flag) /* can't pull back beyond start */
+ return 0;
+ if (from->nins == 0)
+ { /* unreachable */
+ freearc(nfa, con);
+ return 1;
+ }
+
+ /*
+ * First, clone from state if necessary to avoid other outarcs. This may
+ * seem wasteful, but it simplifies the logic, and we'll get rid of the
+ * clone state again at the bottom.
+ */
+ if (from->nouts > 1)
+ {
+ s = newstate(nfa);
+ if (NISERR())
+ return 0;
+ copyins(nfa, from, s); /* duplicate inarcs */
+ cparc(nfa, con, s, to); /* move constraint arc */
+ freearc(nfa, con);
+ if (NISERR())
+ return 0;
+ /* from here on, work on the clone and its sole outarc */
+ from = s;
+ con = from->outs;
+ }
+ assert(from->nouts == 1);
+
+ /* propagate the constraint into the from state's inarcs */
+ for (a = from->ins; a != NULL && !NISERR(); a = nexta)
+ {
+ /* fetch the successor first, since a may be freed below */
+ nexta = a->inchain;
+ switch (combine(nfa, con, a))
+ {
+ case INCOMPATIBLE: /* destroy the arc */
+ freearc(nfa, a);
+ break;
+ case SATISFIED: /* no action needed */
+ break;
+ case COMPATIBLE: /* swap the two arcs, more or less */
+ /* need an intermediate state, but might have one already */
+ for (s = *intermediates; s != NULL; s = s->tmp)
+ {
+ assert(s->nins > 0 && s->nouts > 0);
+ /* reusable if it sits between a's source and "to" */
+ if (s->ins->from == a->from && s->outs->to == to)
+ break;
+ }
+ if (s == NULL)
+ {
+ s = newstate(nfa);
+ if (NISERR())
+ return 0;
+ /* push the new state onto the caller's list */
+ s->tmp = *intermediates;
+ *intermediates = s;
+ }
+ /* constraint now precedes the intermediate, a follows it */
+ cparc(nfa, con, a->from, s);
+ cparc(nfa, a, s, to);
+ freearc(nfa, a);
+ break;
+ case REPLACEARC: /* replace arc's color */
+ newarc(nfa, a->type, con->co, a->from, to);
+ freearc(nfa, a);
+ break;
+ default:
+ assert(NOTREACHED);
+ break;
+ }
+ }
+
+ /* remaining inarcs, if any, incorporate the constraint */
+ moveins(nfa, from, to);
+ freearc(nfa, con);
+ /* from state is now useless, but we leave it to pullback() to clean up */
+ return 1;
+}
+
+/*
+ * pushfwd - push forward constraints forward to eliminate them
+ *
+ * Repeatedly scans all states, calling push() on every '$' or AHEAD inarc,
+ * until a full pass makes no progress or an error is recorded. Afterwards,
+ * any '$' arcs entering the post state are replaced by PLAIN arcs using the
+ * eos[] colors.
+ */
+static void
+pushfwd(struct nfa *nfa,
+ FILE *f) /* for debug output; NULL none */
+{
+ struct state *s;
+ struct state *nexts;
+ struct arc *a;
+ struct arc *nexta;
+ struct state *intermediates;
+ int progress;
+
+ /* find and push until there are no more */
+ do
+ {
+ progress = 0;
+ for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+ {
+ /* fetch the successor first, since s may be dropped below */
+ nexts = s->next;
+ /* the list of intermediate states is kept per to-state */
+ intermediates = NULL;
+ for (a = s->ins; a != NULL && !NISERR(); a = nexta)
+ {
+ /* fetch the successor first, since push() frees the arc */
+ nexta = a->inchain;
+ if (a->type == '$' || a->type == AHEAD)
+ if (push(nfa, a, &intermediates))
+ progress = 1;
+ }
+ /* clear tmp fields of intermediate states created here */
+ while (intermediates != NULL)
+ {
+ struct state *ns = intermediates->tmp;
+
+ intermediates->tmp = NULL;
+ intermediates = ns;
+ }
+ /* if s is now useless, get rid of it */
+ if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+ dropstate(nfa, s);
+ }
+ if (progress && f != NULL)
+ dumpnfa(nfa, f);
+ } while (progress && !NISERR());
+ if (NISERR())
+ return;
+
+ /*
+ * Any $ constraints we were able to push to the post state can now be
+ * replaced by PLAIN arcs referencing the EOS or EOL colors. There should
+ * be no other $ or AHEAD arcs left in the NFA, though we do not check
+ * that here (compact() will fail if so).
+ */
+ for (a = nfa->post->ins; a != NULL; a = nexta)
+ {
+ /* fetch the successor first, since a may be freed below */
+ nexta = a->inchain;
+ if (a->type == '$')
+ {
+ /* a->co is used as the index into eos[] */
+ assert(a->co == 0 || a->co == 1);
+ newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
+ freearc(nfa, a);
+ }
+ }
+}
+
+/*
+ * push - push a forward constraint forward past its destination state
+ *
+ * Returns 1 if successful (which it always is unless the destination is the
+ * post state or we have an internal error), 0 if nothing happened.
+ *
+ * A significant property of this function is that it deletes no pre-existing
+ * states, and no inarcs of the constraint's to state other than the given
+ * constraint arc. This makes the loops in pushfwd() safe, at the cost that
+ * we may leave useless states behind. Therefore, we leave it to pushfwd()
+ * to delete such states.
+ *
+ * If the to state has multiple forward-constraint inarcs, and/or multiple
+ * compatible constraint outarcs, we only need to create one new intermediate
+ * state per combination of predecessor and successor states. *intermediates
+ * points to a list of such intermediate states for this to state (chained
+ * through their tmp fields).
+ *
+ * Whenever 1 is returned, the arc passed in as con has been freed, so the
+ * caller must not touch it afterwards.
+ */
+static int
+push(struct nfa *nfa,
+ struct arc *con,
+ struct state **intermediates)
+{
+ struct state *from = con->from;
+ struct state *to = con->to;
+ struct arc *a;
+ struct arc *nexta;
+ struct state *s;
+
+ assert(to != from); /* should have gotten rid of this earlier */
+ if (to->flag) /* can't push forward beyond end */
+ return 0;
+ if (to->nouts == 0)
+ { /* dead end */
+ freearc(nfa, con);
+ return 1;
+ }
+
+ /*
+ * First, clone to state if necessary to avoid other inarcs. This may
+ * seem wasteful, but it simplifies the logic, and we'll get rid of the
+ * clone state again at the bottom.
+ */
+ if (to->nins > 1)
+ {
+ s = newstate(nfa);
+ if (NISERR())
+ return 0;
+ copyouts(nfa, to, s); /* duplicate outarcs */
+ cparc(nfa, con, from, s); /* move constraint arc */
+ freearc(nfa, con);
+ if (NISERR())
+ return 0;
+ /* from here on, work on the clone and its sole inarc */
+ to = s;
+ con = to->ins;
+ }
+ assert(to->nins == 1);
+
+ /* propagate the constraint into the to state's outarcs */
+ for (a = to->outs; a != NULL && !NISERR(); a = nexta)
+ {
+ /* fetch the successor first, since a may be freed below */
+ nexta = a->outchain;
+ switch (combine(nfa, con, a))
+ {
+ case INCOMPATIBLE: /* destroy the arc */
+ freearc(nfa, a);
+ break;
+ case SATISFIED: /* no action needed */
+ break;
+ case COMPATIBLE: /* swap the two arcs, more or less */
+ /* need an intermediate state, but might have one already */
+ for (s = *intermediates; s != NULL; s = s->tmp)
+ {
+ assert(s->nins > 0 && s->nouts > 0);
+ /* reusable if it sits between "from" and a's target */
+ if (s->ins->from == from && s->outs->to == a->to)
+ break;
+ }
+ if (s == NULL)
+ {
+ s = newstate(nfa);
+ if (NISERR())
+ return 0;
+ /* push the new state onto the caller's list */
+ s->tmp = *intermediates;
+ *intermediates = s;
+ }
+ /* a now precedes the intermediate, the constraint follows it */
+ cparc(nfa, con, s, a->to);
+ cparc(nfa, a, from, s);
+ freearc(nfa, a);
+ break;
+ case REPLACEARC: /* replace arc's color */
+ newarc(nfa, a->type, con->co, from, a->to);
+ freearc(nfa, a);
+ break;
+ default:
+ assert(NOTREACHED);
+ break;
+ }
+ }
+
+ /* remaining outarcs, if any, incorporate the constraint */
+ moveouts(nfa, to, from);
+ freearc(nfa, con);
+ /* to state is now useless, but we leave it to pushfwd() to clean up */
+ return 1;
+}
+
+/*
+ * combine - constraint lands on an arc, what happens?
+ *
+ * #def INCOMPATIBLE 1 // destroys arc
+ * #def SATISFIED 2 // constraint satisfied
+ * #def COMPATIBLE 3 // compatible but not satisfied yet
+ * #def REPLACEARC 4 // replace arc's color with constraint color
+ *
+ * The verdict depends on the pair (constraint type, arc type) and, for some
+ * pairs, on the two colors. A pair that is not listed trips the assertion
+ * at the bottom and yields INCOMPATIBLE.
+ */
+static int
+combine(struct nfa *nfa,
+ struct arc *con,
+ struct arc *a)
+{
+#define CA(ct,at) (((ct)<<CHAR_BIT) | (at))
+
+ switch (CA(con->type, a->type))
+ {
+ /* newlines are handled separately */
+ case CA('^', PLAIN):
+ case CA('$', PLAIN):
+ /* collision, dissimilar constraints */
+ case CA('^', BEHIND):
+ case CA(BEHIND, '^'):
+ case CA('$', AHEAD):
+ case CA(AHEAD, '$'):
+ return INCOMPATIBLE;
+
+ /* color constraints meet colors */
+ case CA(AHEAD, PLAIN):
+ case CA(BEHIND, PLAIN):
+ /* collision, similar constraints */
+ case CA(AHEAD, AHEAD):
+ case CA(BEHIND, BEHIND):
+ /* same color on both sides: true duplication */
+ if (con->co == a->co)
+ return SATISFIED;
+ if (con->co == RAINBOW)
+ {
+ /* con is satisfied unless arc's color is a pseudocolor */
+ return (nfa->cm->cd[a->co].flags & PSEUDO) ?
+ INCOMPATIBLE : SATISFIED;
+ }
+ if (a->co == RAINBOW)
+ {
+ /*
+ * con is incompatible if it's for a pseudocolor (this is
+ * hypothetical; we make no such constraints today);
+ * otherwise, constraint constrains arc to be only its color
+ */
+ return (nfa->cm->cd[con->co].flags & PSEUDO) ?
+ INCOMPATIBLE : REPLACEARC;
+ }
+ return INCOMPATIBLE;
+
+ /* collision, similar constraints */
+ case CA('^', '^'):
+ case CA('$', '$'):
+ /* satisfied only by true duplication */
+ return (con->co == a->co) ? SATISFIED : INCOMPATIBLE;
+
+ /* constraints passing each other */
+ case CA('^', '$'):
+ case CA('^', AHEAD):
+ case CA(BEHIND, '$'):
+ case CA(BEHIND, AHEAD):
+ case CA('$', '^'):
+ case CA('$', BEHIND):
+ case CA(AHEAD, '^'):
+ case CA(AHEAD, BEHIND):
+ case CA('^', LACON):
+ case CA(BEHIND, LACON):
+ case CA('$', LACON):
+ case CA(AHEAD, LACON):
+ return COMPATIBLE;
+
+ default:
+ break;
+ }
+ assert(NOTREACHED);
+ return INCOMPATIBLE; /* for benefit of blind compilers */
+}
+
+/*
+ * fixempties - get rid of EMPTY arcs
+ *
+ * Works in phases: (1) drop states whose sole outarc is an EMPTY; (2) drop
+ * states whose sole inarc is an EMPTY; (3) for each remaining state worth
+ * updating, merge in the original non-EMPTY inarcs of every state that can
+ * reach it through EMPTY arcs; (4) free all EMPTY arcs; (5) drop states
+ * left without inarcs or without outarcs. States with flag set are never
+ * dropped. Returns early, leaving the NFA partially processed, if an error
+ * is recorded or a workspace allocation fails.
+ */
+static void
+fixempties(struct nfa *nfa,
+ FILE *f) /* for debug output; NULL none */
+{
+ struct state *s;
+ struct state *s2;
+ struct state *nexts;
+ struct arc *a;
+ struct arc *nexta;
+ int totalinarcs;
+ struct arc **inarcsorig;
+ struct arc **arcarray;
+ int arccount;
+ int prevnins;
+ int nskip;
+
+ /*
+ * First, get rid of any states whose sole out-arc is an EMPTY, since
+ * they're basically just aliases for their successor. The parsing
+ * algorithm creates enough of these that it's worth special-casing this.
+ */
+ for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+ {
+ /* fetch the successor first, since s may be dropped below */
+ nexts = s->next;
+ if (s->flag || s->nouts != 1)
+ continue;
+ a = s->outs;
+ assert(a != NULL && a->outchain == NULL);
+ if (a->type != EMPTY)
+ continue;
+ /* a self-loop has no other state to hand the inarcs to */
+ if (s != a->to)
+ moveins(nfa, s, a->to);
+ dropstate(nfa, s);
+ }
+
+ /*
+ * Similarly, get rid of any state with a single EMPTY in-arc, by folding
+ * it into its predecessor.
+ */
+ for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+ {
+ /* fetch the successor first, since s may be dropped below */
+ nexts = s->next;
+ /* while we're at it, ensure tmp fields are clear for next step */
+ assert(s->tmp == NULL);
+ if (s->flag || s->nins != 1)
+ continue;
+ a = s->ins;
+ assert(a != NULL && a->inchain == NULL);
+ if (a->type != EMPTY)
+ continue;
+ /* a self-loop has no other state to hand the outarcs to */
+ if (s != a->from)
+ moveouts(nfa, s, a->from);
+ dropstate(nfa, s);
+ }
+
+ if (NISERR())
+ return;
+
+ /*
+ * For each remaining NFA state, find all other states from which it is
+ * reachable by a chain of one or more EMPTY arcs. Then generate new arcs
+ * that eliminate the need for each such chain.
+ *
+ * We could replace a chain of EMPTY arcs that leads from a "from" state
+ * to a "to" state either by pushing non-EMPTY arcs forward (linking
+ * directly from "from"'s predecessors to "to") or by pulling them back
+ * (linking directly from "from" to "to"'s successors). We choose to
+ * always do the former; this choice is somewhat arbitrary, but the
+ * approach below requires that we uniformly do one or the other.
+ *
+ * Suppose we have a chain of N successive EMPTY arcs (where N can easily
+ * approach the size of the NFA). All of the intermediate states must
+ * have additional inarcs and outarcs, else they'd have been removed by
+ * the steps above. Assuming their inarcs are mostly not empties, we will
+ * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
+ * state in the chain must be duplicated to lead to all its successor
+ * states as well. So there is no hope of doing less than O(N^2) work;
+ * however, we should endeavor to keep the big-O cost from being even
+ * worse than that, which it can easily become without care. In
+ * particular, suppose we were to copy all S1's inarcs forward to S2, and
+ * then also to S3, and then later we consider pushing S2's inarcs forward
+ * to S3. If we include the arcs already copied from S1 in that, we'd be
+ * doing O(N^3) work. (The duplicate-arc elimination built into newarc()
+ * and its cohorts would get rid of the extra arcs, but not without cost.)
+ *
+ * We can avoid this cost by treating only arcs that existed at the start
+ * of this phase as candidates to be pushed forward. To identify those,
+ * we remember the first inarc each state had to start with. We rely on
+ * the fact that newarc() and friends put new arcs on the front of their
+ * to-states' inchains, and that this phase never deletes arcs, so that
+ * the original arcs must be the last arcs in their to-states' inchains.
+ *
+ * So the process here is that, for each state in the NFA, we gather up
+ * all non-EMPTY inarcs of states that can reach the target state via
+ * EMPTY arcs. We then sort, de-duplicate, and merge these arcs into the
+ * target state's inchain. (We can safely use sort-merge for this as long
+ * as we update each state's original-arcs pointer after we add arcs to
+ * it; the sort step of mergeins probably changed the order of the old
+ * arcs.)
+ *
+ * Another refinement worth making is that, because we only add non-EMPTY
+ * arcs during this phase, and all added arcs have the same from-state as
+ * the non-EMPTY arc they were cloned from, we know ahead of time that any
+ * states having only EMPTY outarcs will be useless for lack of outarcs
+ * after we drop the EMPTY arcs. (They cannot gain non-EMPTY outarcs if
+ * they had none to start with.) So we need not bother to update the
+ * inchains of such states at all.
+ */
+
+ /* Remember the states' first original inarcs */
+ /* ... and while at it, count how many old inarcs there are altogether */
+ inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *));
+ if (inarcsorig == NULL)
+ {
+ NERR(REG_ESPACE);
+ return;
+ }
+ totalinarcs = 0;
+ for (s = nfa->states; s != NULL; s = s->next)
+ {
+ /* the array is indexed by state number */
+ inarcsorig[s->no] = s->ins;
+ totalinarcs += s->nins;
+ }
+
+ /*
+ * Create a workspace for accumulating the inarcs to be added to the
+ * current target state. totalinarcs is probably a considerable
+ * overestimate of the space needed, but the NFA is unlikely to be large
+ * enough at this point to make it worth being smarter.
+ */
+ arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *));
+ if (arcarray == NULL)
+ {
+ NERR(REG_ESPACE);
+ FREE(inarcsorig);
+ return;
+ }
+
+ /* And iterate over the target states */
+ for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
+ {
+ /* Ignore target states without non-EMPTY outarcs, per note above */
+ if (!s->flag && !hasnonemptyout(s))
+ continue;
+
+ /* Find predecessor states and accumulate their original inarcs */
+ arccount = 0;
+ /* walk the tmp-linked list from the last state found back to s */
+ for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts)
+ {
+ /* Add s2's original inarcs to arcarray[], but ignore empties */
+ for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain)
+ {
+ if (a->type != EMPTY)
+ arcarray[arccount++] = a;
+ }
+
+ /* Reset the tmp fields as we walk back */
+ nexts = s2->tmp;
+ s2->tmp = NULL;
+ }
+ /* s itself was marked by emptyreachable() too */
+ s->tmp = NULL;
+ assert(arccount <= totalinarcs);
+
+ /* Remember how many original inarcs this state has */
+ prevnins = s->nins;
+
+ /* Add non-duplicate inarcs to target state */
+ mergeins(nfa, s, arcarray, arccount);
+
+ /* Now we must update the state's inarcsorig pointer */
+ /* skip the arcs just added, which sit at the front of the chain */
+ nskip = s->nins - prevnins;
+ a = s->ins;
+ while (nskip-- > 0)
+ a = a->inchain;
+ inarcsorig[s->no] = a;
+ }
+
+ FREE(arcarray);
+ FREE(inarcsorig);
+
+ if (NISERR())
+ return;
+
+ /*
+ * Now remove all the EMPTY arcs, since we don't need them anymore.
+ */
+ for (s = nfa->states; s != NULL; s = s->next)
+ {
+ for (a = s->outs; a != NULL; a = nexta)
+ {
+ /* fetch the successor first, since a may be freed below */
+ nexta = a->outchain;
+ if (a->type == EMPTY)
+ freearc(nfa, a);
+ }
+ }
+
+ /*
+ * And remove any states that have become useless. (This cleanup is not
+ * very thorough, and would be even less so if we tried to combine it with
+ * the previous step; but cleanup() will take care of anything we miss.)
+ */
+ for (s = nfa->states; s != NULL; s = nexts)
+ {
+ /* fetch the successor first, since s may be dropped below */
+ nexts = s->next;
+ if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+ dropstate(nfa, s);
+ }
+
+ if (f != NULL)
+ dumpnfa(nfa, f);
+}
+
+/*
+ * emptyreachable - recursively find all states that can reach s by EMPTY arcs
+ *
+ * The result is the last state found. Each found state's tmp field points
+ * at the state found just before it, and the chain ends at s, so the whole
+ * set can be enumerated without scanning the NFA.
+ *
+ * fixempties() is the only caller, and it hands in the inarcsorig[] array it
+ * maintains. Starting each inarc walk from that array's entry skips all
+ * newly added inarcs, none of which can be EMPTY.
+ *
+ * Recursion can go as deep as the longest loop-free chain of EMPTY arcs.
+ * That is bounded by the size of the NFA, which may still be too deep for
+ * the stack, hence the explicit check.
+ */
+static struct state *
+emptyreachable(struct nfa *nfa,
+ struct state *s,
+ struct state *lastfound,
+ struct arc **inarcsorig)
+{
+ struct state *tail;
+ struct arc *in;
+
+ /* Recursion depth is input-driven, so guard against stack overflow */
+ if (STACK_TOO_DEEP(nfa->v->re))
+ {
+ NERR(REG_ETOOBIG);
+ return lastfound;
+ }
+
+ /* link s in behind the previous find; s is now the newest one */
+ s->tmp = lastfound;
+ tail = s;
+
+ in = inarcsorig[s->no];
+ while (in != NULL)
+ {
+ /* follow EMPTY inarcs back to sources that are not yet linked in */
+ if (in->type == EMPTY && in->from->tmp == NULL)
+ tail = emptyreachable(nfa, in->from, tail, inarcsorig);
+ in = in->inchain;
+ }
+ return tail;
+}
+
+/*
+ * isconstraintarc - report whether an arc has one of the constraint types
+ *
+ * Returns 1 for '^', '$', BEHIND, AHEAD and LACON arcs, and 0 for any other
+ * arc type.
+ */
+static inline int
+isconstraintarc(struct arc *a)
+{
+    /* anchors */
+    if (a->type == '^' || a->type == '$')
+        return 1;
+    /* the remaining constraint arc types */
+    if (a->type == BEHIND || a->type == AHEAD || a->type == LACON)
+        return 1;
+    return 0;
+}
+
+/*
+ * hasconstraintout - does any out arc of state s have a constraint type?
+ *
+ * Returns 1 as soon as such an arc is found, 0 if the out-arc chain is
+ * exhausted without finding one.
+ */
+static int
+hasconstraintout(struct state *s)
+{
+    struct arc *outarc = s->outs;
+
+    /* Advance past non-constraint arcs; stop at the first constraint arc */
+    while (outarc != NULL && !isconstraintarc(outarc))
+        outarc = outarc->outchain;
+
+    return (outarc != NULL) ? 1 : 0;
+}
+
+/*
+ * fixconstraintloops - eliminate loops made up solely of constraint arcs
+ *
+ * Going around a loop whose arcs are all constraints consumes no input, so
+ * such a loop contributes nothing useful.  Worse, pullback/pushfwd would
+ * loop forever on one, so these loops have to be removed before those
+ * steps run.
+ *
+ * If f is not NULL, the resulting NFA is dumped to it for debugging.
+ */
+static void
+fixconstraintloops(struct nfa *nfa,
+                   FILE *f)     /* for debug output; NULL none */
+{
+    struct state *st;
+    struct state *stnext;
+    struct arc *arc;
+    struct arc *arcnext;
+    int         sawconstraint = 0;
+    int         brokeloop;
+
+    /*
+     * First pass: a constraint arc from a state to itself can simply be
+     * deleted.  Self-loops are much more common than multi-state loops, so
+     * handling them separately is worthwhile.  Along the way, remember
+     * whether any other constraint arc exists at all.
+     */
+    st = nfa->states;
+    while (st != NULL && !NISERR())
+    {
+        stnext = st->next;      /* fetch now; st may be dropped below */
+
+        /* the next step relies on all tmp fields starting out clear */
+        assert(st->tmp == NULL);
+
+        arc = st->outs;
+        while (arc != NULL && !NISERR())
+        {
+            arcnext = arc->outchain;    /* fetch now; arc may be freed */
+            if (isconstraintarc(arc))
+            {
+                if (arc->to != st)
+                    sawconstraint = 1;
+                else
+                    freearc(nfa, arc);
+            }
+            arc = arcnext;
+        }
+
+        /* A state left with no outarcs is useless, unless it is flagged */
+        if (st->nouts == 0 && !st->flag)
+            dropstate(nfa, st);
+
+        st = stnext;
+    }
+
+    /* Without any surviving constraint arcs there can be no loops */
+    if (NISERR() || !sawconstraint)
+        return;
+
+    /*
+     * Second pass: search outward from each remaining state for a constraint
+     * loop.  Whenever one is found (and broken), abandon the scan and begin
+     * again from the first state.  Carrying information over from the
+     * aborted scan would add a lot of complexity for a case that is rare.
+     */
+    do
+    {
+        brokeloop = 0;
+        for (st = nfa->states; st != NULL && !NISERR(); st = st->next)
+        {
+            if (findconstraintloop(nfa, st))
+            {
+                brokeloop = 1;
+                break;
+            }
+        }
+    } while (brokeloop);
+
+    if (NISERR())
+        return;
+
+    /*
+     * Final pass: drop states that have become useless.  This is not a
+     * thorough cleanup; cleanup() will catch whatever is missed here.
+     *
+     * findconstraintloop deliberately leaves some tmp fields set, so they
+     * are cleared here as well.
+     */
+    for (st = nfa->states; st != NULL; st = stnext)
+    {
+        stnext = st->next;
+        st->tmp = NULL;
+        if ((st->nins == 0 || st->nouts == 0) && !st->flag)
+            dropstate(nfa, st);
+    }
+
+    if (f != NULL)
+        dumpnfa(nfa, f);
+}
+
+/*
+ * findconstraintloop - recursive search for a loop of constraint arcs
+ *
+ * Returns 1 if a loop was found, in which case it has already been broken
+ * by breakconstraintloop(); returns 0 if no loop is reachable from s.
+ *
+ * While a state is being explored, its tmp field points at the successor
+ * currently being followed.  On a return of 1 all tmp fields are NULL,
+ * because breakconstraintloop clears them.  On a return of 0, each state
+ * proven not to lead into a loop is left marked with tmp == itself, so that
+ * later calls need not examine it again.  (That marking is unambiguous only
+ * because single-state loops were removed beforehand.)
+ *
+ * The loop that gets found need not contain the state we were called on;
+ * any loop reachable from it counts.
+ *
+ * Recursion depth can reach one more than the length of the longest
+ * loop-free chain of constraint arcs, hence the stack depth check.
+ */
+static int
+findconstraintloop(struct nfa *nfa, struct state *s)
+{
+    struct arc *outarc;
+    struct state *target;
+
+    /* Recursion depth depends on the regex, so guard against overflow */
+    if (STACK_TOO_DEEP(nfa->v->re))
+    {
+        NERR(REG_ETOOBIG);
+        return 1;               /* to exit as quickly as possible */
+    }
+
+    /* Self-marked: already proven not to lead into any loop */
+    if (s->tmp == s)
+        return 0;
+
+    /* Marked with a successor: s is on the current path, so we have a loop */
+    if (s->tmp != NULL)
+    {
+        breakconstraintloop(nfa, s);
+        /* breakconstraintloop has reset all the tmp fields */
+        return 1;
+    }
+
+    /* Unmarked: explore each constraint outarc in turn */
+    for (outarc = s->outs; outarc != NULL; outarc = outarc->outchain)
+    {
+        if (!isconstraintarc(outarc))
+            continue;
+
+        target = outarc->to;
+        assert(target != s);
+        s->tmp = target;
+        if (findconstraintloop(nfa, target))
+            return 1;
+    }
+
+    /*
+     * Nothing reachable from s forms a constraint loop.  Self-mark s so the
+     * result is remembered.
+     */
+    s->tmp = s;
+    return 0;
+}
+
+/*
+ * breakconstraintloop - break a loop of constraint arcs
+ *
+ * sinitial is any one member state of the loop. Each loop member's tmp
+ * field links to its successor within the loop. (Note that this function
+ * will reset all the tmp fields to NULL.)
+ *
+ * We can break the loop by, for any one state S1 in the loop, cloning its
+ * loop successor state S2 (and possibly following states), and then moving
+ * all S1->S2 constraint arcs to point to the cloned S2. The cloned S2 should
+ * copy any non-constraint outarcs of S2. Constraint outarcs should be
+ * dropped if they point back to S1, else they need to be copied as arcs to
+ * similarly cloned states S3, S4, etc. In general, each cloned state copies
+ * non-constraint outarcs, drops constraint outarcs that would lead to itself
+ * or any earlier cloned state, and sends other constraint outarcs to newly
+ * cloned states. No cloned state will have any inarcs that aren't constraint
+ * arcs or do not lead from S1 or earlier-cloned states. It's okay to drop
+ * constraint back-arcs since they would not take us to any state we've not
+ * already been in; therefore, no new constraint loop is created. In this way
+ * we generate a modified NFA that can still represent every useful state
+ * sequence, but not sequences that represent state loops with no consumption
+ * of input data. Note that the set of cloned states will certainly include
+ * all of the loop member states other than S1, and it may also include
+ * non-loop states that are reachable from S2 via constraint arcs. This is
+ * important because there is no guarantee that findconstraintloop found a
+ * maximal loop (and searching for one would be NP-hard, so don't try).
+ * Frequently the "non-loop states" are actually part of a larger loop that
+ * we didn't notice, and indeed there may be several overlapping loops.
+ * This technique ensures convergence in such cases, while considering only
+ * the originally-found loop does not.
+ *
+ * If there is only one S1->S2 constraint arc, then that constraint is
+ * certainly satisfied when we enter any of the clone states. This means that
+ * in the common case where many of the constraint arcs are identically
+ * labeled, we can merge together clone states linked by a similarly-labeled
+ * constraint: if we can get to the first one we can certainly get to the
+ * second, so there's no need to distinguish. This greatly reduces the number
+ * of new states needed, so we preferentially break the given loop at a state
+ * pair where this is true.
+ *
+ * Furthermore, it's fairly common to find that a cloned successor state has
+ * no outarcs, especially if we're a bit aggressive about removing unnecessary
+ * outarcs. If that happens, then there is simply not any interesting state
+ * that can be reached through the predecessor's loop arcs, which means we can
+ * break the loop just by removing those loop arcs, with no new states added.
+ */
+static void
+breakconstraintloop(struct nfa *nfa, struct state *sinitial)
+{
+ struct state *s;
+ struct state *shead; /* loop member whose arcs to stail get moved */
+ struct state *stail; /* shead's loop successor; the state we clone */
+ struct state *sclone; /* clone of stail; NULL if it proved useless */
+ struct state *nexts;
+ struct arc *refarc; /* lone shead->stail constraint arc, else NULL */
+ struct arc *a;
+ struct arc *nexta;
+
+ /*
+ * Start by identifying which loop step we want to break at.
+ * Preferentially this is one with only one constraint arc. (XXX are
+ * there any other secondary heuristics we want to use here?) Set refarc
+ * to point to the selected lone constraint arc, if there is one.
+ *
+ * The loop is walked once, starting and ending at sinitial, by following
+ * the tmp links. Once refarc has been set, later loop steps are not
+ * examined, so the first qualifying step wins.
+ */
+ refarc = NULL;
+ s = sinitial;
+ do
+ {
+ nexts = s->tmp; /* tmp links each loop member to its successor */
+ assert(nexts != s); /* should not see any one-element loops */
+ if (refarc == NULL)
+ {
+ int narcs = 0; /* number of constraint arcs from s to nexts */
+
+ for (a = s->outs; a != NULL; a = a->outchain)
+ {
+ if (a->to == nexts && isconstraintarc(a))
+ {
+ refarc = a;
+ narcs++;
+ }
+ }
+ assert(narcs > 0);
+ if (narcs > 1)
+ refarc = NULL; /* multiple constraint arcs here, no good */
+ }
+ s = nexts;
+ } while (s != sinitial);
+
+ if (refarc)
+ {
+ /* break at the refarc */
+ shead = refarc->from;
+ stail = refarc->to;
+ assert(stail == shead->tmp);
+ }
+ else
+ {
+ /* for lack of a better idea, break after sinitial */
+ shead = sinitial;
+ stail = sinitial->tmp;
+ }
+
+ /*
+ * Reset the tmp fields so that we can use them for local storage in
+ * clonesuccessorstates. (findconstraintloop won't mind, since it's just
+ * going to abandon its search anyway.)
+ *
+ * This clears the field in every state of the NFA, not only in the loop
+ * members, and it must come after shead/stail were read out of the tmp
+ * links above.
+ */
+ for (s = nfa->states; s != NULL; s = s->next)
+ s->tmp = NULL;
+
+ /*
+ * Recursively build clone state(s) as needed.
+ *
+ * sclone receives copies of stail's outarcs; shead is passed as the
+ * predecessor state, and nfa->nstates as the size of the donemaps.
+ */
+ sclone = newstate(nfa);
+ if (sclone == NULL)
+ {
+ assert(NISERR());
+ return;
+ }
+
+ clonesuccessorstates(nfa, stail, sclone, shead, refarc,
+ NULL, NULL, nfa->nstates);
+
+ if (NISERR())
+ return;
+
+ /*
+ * It's possible that sclone has no outarcs at all, in which case it's
+ * useless. (We don't try extremely hard to get rid of useless states
+ * here, but this is an easy and fairly common case.)
+ */
+ if (sclone->nouts == 0)
+ {
+ freestate(nfa, sclone);
+ sclone = NULL; /* tells the loop below to just drop the arcs */
+ }
+
+ /*
+ * Move shead's constraint-loop arcs to point to sclone, or just drop them
+ * if we discovered we don't need sclone.
+ */
+ for (a = shead->outs; a != NULL; a = nexta)
+ {
+ nexta = a->outchain; /* fetch now, since a is freed below */
+ if (a->to == stail && isconstraintarc(a))
+ {
+ if (sclone)
+ cparc(nfa, a, shead, sclone);
+ freearc(nfa, a);
+ if (NISERR())
+ break;
+ }
+ }
+}
+
+/*
+ * clonesuccessorstates - create a tree of constraint-arc successor states
+ *
+ * ssource is the state to be cloned, and sclone is the state to copy its
+ * outarcs into. sclone's inarcs, if any, should already be set up.
+ *
+ * spredecessor is the original predecessor state that we are trying to build
+ * successors for (it may not be the immediate predecessor of ssource).
+ * refarc, if not NULL, is the original constraint arc that is known to have
+ * been traversed out of spredecessor to reach the successor(s).
+ *
+ * For each cloned successor state, we transiently create a "donemap" that is
+ * a boolean array showing which source states we've already visited for this
+ * clone state. This prevents infinite recursion as well as useless repeat
+ * visits to the same state subtree (which can add up fast, since typical NFAs
+ * have multiple redundant arc pathways). Each donemap is a char array
+ * indexed by state number. The donemaps are all of the same size "nstates",
+ * which is nfa->nstates as of the start of the recursion. This is enough to
+ * have entries for all pre-existing states, but *not* entries for clone
+ * states created during the recursion. That's okay since we have no need to
+ * mark those.
+ *
+ * curdonemap is NULL when recursing to a new sclone state, or sclone's
+ * donemap when we are recursing without having created a new state (which we
+ * do when we decide we can merge a successor state into the current clone
+ * state). outerdonemap is NULL at the top level and otherwise the parent
+ * clone state's donemap.
+ *
+ * The successor states we create and fill here form a strict tree structure,
+ * with each state having exactly one predecessor, except that the toplevel
+ * state has no inarcs as yet (breakconstraintloop will add its inarcs from
+ * spredecessor after we're done). Thus, we can examine sclone's inarcs back
+ * to the root, plus refarc if any, to identify the set of constraints already
+ * known valid at the current point. This allows us to avoid generating extra
+ * successor states.
+ */
+static void
+clonesuccessorstates(struct nfa *nfa,
+ struct state *ssource,
+ struct state *sclone,
+ struct state *spredecessor,
+ struct arc *refarc,
+ char *curdonemap,
+ char *outerdonemap,
+ int nstates)
+{
+ char *donemap; /* donemap in use: curdonemap, or one made here */
+ struct arc *a;
+
+ /* Since this is recursive, it could be driven to stack overflow */
+ if (STACK_TOO_DEEP(nfa->v->re))
+ {
+ NERR(REG_ETOOBIG);
+ return;
+ }
+
+ /*
+ * If this state hasn't already got a donemap, create one. A donemap
+ * created here is owned by this call level and is freed at the bottom of
+ * the function; one passed in as curdonemap is left for its owner to free.
+ */
+ donemap = curdonemap;
+ if (donemap == NULL)
+ {
+ donemap = (char *) MALLOC(nstates * sizeof(char));
+ if (donemap == NULL)
+ {
+ NERR(REG_ESPACE);
+ return;
+ }
+
+ if (outerdonemap != NULL)
+ {
+ /*
+ * Not at outermost recursion level, so copy the outer level's
+ * donemap; this ensures that we see states in process of being
+ * visited at outer levels, or already merged into predecessor
+ * states, as ones we shouldn't traverse back to.
+ */
+ memcpy(donemap, outerdonemap, nstates * sizeof(char));
+ }
+ else
+ {
+ /* At outermost level, only spredecessor is off-limits */
+ memset(donemap, 0, nstates * sizeof(char));
+ assert(spredecessor->no < nstates);
+ donemap[spredecessor->no] = 1;
+ }
+ }
+
+ /* Mark ssource as visited in the donemap */
+ assert(ssource->no < nstates);
+ assert(donemap[ssource->no] == 0);
+ donemap[ssource->no] = 1;
+
+ /*
+ * We proceed by first cloning all of ssource's outarcs, creating new
+ * clone states as needed but not doing more with them than that. Then in
+ * a second pass, recurse to process the child clone states. This allows
+ * us to have only one child clone state per reachable source state, even
+ * when there are multiple outarcs leading to the same state. Also, when
+ * we do visit a child state, its set of inarcs is known exactly, which
+ * makes it safe to apply the constraint-is-already-checked optimization.
+ * Also, this ensures that we've merged all the states we can into the
+ * current clone before we recurse to any children, thus possibly saving
+ * them from making extra images of those states.
+ *
+ * While this function runs, child clone states of the current state are
+ * marked by setting their tmp fields to point to the original state they
+ * were cloned from. This makes it possible to detect multiple outarcs
+ * leading to the same state, and also makes it easy to distinguish clone
+ * states from original states (which will have tmp == NULL).
+ */
+ for (a = ssource->outs; a != NULL && !NISERR(); a = a->outchain)
+ {
+ struct state *sto = a->to;
+
+ /*
+ * We do not consider cloning successor states that have no constraint
+ * outarcs; just link to them as-is. They cannot be part of a
+ * constraint loop so there is no need to make copies. In particular,
+ * this rule keeps us from trying to clone the post state, which would
+ * be a bad idea.
+ */
+ if (isconstraintarc(a) && hasconstraintout(sto))
+ {
+ struct state *prevclone; /* existing child clone of sto, if any */
+ int canmerge; /* can sto's outarcs be folded into sclone? */
+ struct arc *a2;
+
+ /*
+ * Back-link constraint arcs must not be followed. Nor is there a
+ * need to revisit states previously merged into this clone.
+ */
+ assert(sto->no < nstates);
+ if (donemap[sto->no] != 0)
+ continue;
+
+ /*
+ * Check whether we already have a child clone state for this
+ * source state.
+ */
+ prevclone = NULL;
+ for (a2 = sclone->outs; a2 != NULL; a2 = a2->outchain)
+ {
+ if (a2->to->tmp == sto)
+ {
+ prevclone = a2->to;
+ break;
+ }
+ }
+
+ /*
+ * If this arc is labeled the same as refarc, or the same as any
+ * arc we must have traversed to get to sclone, then no additional
+ * constraints need to be met to get to sto, so we should just
+ * merge its outarcs into sclone.
+ *
+ * "Labeled the same" means that both the type and the co field
+ * of the two arcs are equal.
+ */
+ if (refarc && a->type == refarc->type && a->co == refarc->co)
+ canmerge = 1;
+ else
+ {
+ struct state *s; /* walks from sclone back along first inarcs */
+
+ canmerge = 0;
+ for (s = sclone; s->ins; s = s->ins->from)
+ {
+ if (s->nins == 1 &&
+ a->type == s->ins->type && a->co == s->ins->co)
+ {
+ canmerge = 1;
+ break;
+ }
+ }
+ }
+
+ if (canmerge)
+ {
+ /*
+ * We can merge into sclone. If we previously made a child
+ * clone state, drop it; there's no need to visit it. (This
+ * can happen if ssource has multiple pathways to sto, and we
+ * only just now found one that is provably a no-op.)
+ */
+ if (prevclone)
+ dropstate(nfa, prevclone); /* kills our outarc, too */
+
+ /*
+ * Recurse to merge sto's outarcs into sclone. Passing our
+ * own donemap as curdonemap tells the callee that it is not
+ * the outermost level for sclone.
+ */
+ clonesuccessorstates(nfa,
+ sto,
+ sclone,
+ spredecessor,
+ refarc,
+ donemap,
+ outerdonemap,
+ nstates);
+ /* sto should now be marked as previously visited */
+ assert(NISERR() || donemap[sto->no] == 1);
+ }
+ else if (prevclone)
+ {
+ /*
+ * We already have a clone state for this successor, so just
+ * make another arc to it.
+ */
+ cparc(nfa, a, sclone, prevclone);
+ }
+ else
+ {
+ /*
+ * We need to create a new successor clone state.
+ */
+ struct state *stoclone;
+
+ stoclone = newstate(nfa);
+ if (stoclone == NULL)
+ {
+ assert(NISERR());
+ break;
+ }
+ /* Mark it as to what it's a clone of */
+ stoclone->tmp = sto;
+ /* ... and add the outarc leading to it */
+ cparc(nfa, a, sclone, stoclone);
+ }
+ }
+ else
+ {
+ /*
+ * Non-constraint outarcs just get copied to sclone, as do outarcs
+ * leading to states with no constraint outarc.
+ */
+ cparc(nfa, a, sclone, sto);
+ }
+ }
+
+ /*
+ * If we are at outer level for this clone state, recurse to all its child
+ * clone states, clearing their tmp fields as we go. (If we're not
+ * outermost for sclone, leave this to be done by the outer call level.)
+ * Note that if we have multiple outarcs leading to the same clone state,
+ * it will only be recursed-to once.
+ *
+ * The loop stops early if an error has been reported, but the donemap is
+ * freed regardless.
+ */
+ if (curdonemap == NULL)
+ {
+ for (a = sclone->outs; a != NULL && !NISERR(); a = a->outchain)
+ {
+ struct state *stoclone = a->to;
+ struct state *sto = stoclone->tmp; /* non-NULL only for child clones */
+
+ if (sto != NULL)
+ {
+ stoclone->tmp = NULL; /* unmark, so it is visited only once */
+ clonesuccessorstates(nfa,
+ sto,
+ stoclone,
+ spredecessor,
+ refarc,
+ NULL,
+ donemap,
+ nstates);
+ }
+ }
+
+ /* Don't forget to free sclone's donemap when done with it */
+ FREE(donemap);
+ }
+}
+
+/*
+ * cleanup - tidy up the NFA once optimization is done
+ *
+ * Drops every unflagged state that is not both reachable from the pre state
+ * and able to reach the post state, then renumbers the surviving states
+ * consecutively from zero and sets nfa->nstates to the new count.  Does
+ * nothing if an error has already been reported.
+ */
+static void
+cleanup(struct nfa *nfa)
+{
+    struct state *st;
+    struct state *stnext;
+    int         nkept;
+
+    if (NISERR())
+        return;
+
+    /*
+     * Two marking passes: first mark with "pre" everything reachable from
+     * the pre state, then re-mark with "post" those marked states that can
+     * also reach the post state.  Any state not marked "post" afterwards is
+     * unreachable or a dead end.
+     */
+    markreachable(nfa, nfa->pre, (struct state *) NULL, nfa->pre);
+    markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
+
+    st = nfa->states;
+    while (st != NULL && !NISERR())
+    {
+        stnext = st->next;      /* fetch now; st may be dropped below */
+        if (!st->flag && st->tmp != nfa->post)
+            dropstate(nfa, st);
+        st = stnext;
+    }
+
+    assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
+    cleartraverse(nfa, nfa->pre);
+    assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == NULL);
+    /* the nins==0 (final unreachable) case will be caught later */
+
+    /* hand out fresh consecutive numbers to the states that remain */
+    nkept = 0;
+    st = nfa->states;
+    while (st != NULL)
+    {
+        st->no = nkept;
+        nkept++;
+        st = st->next;
+    }
+    nfa->nstates = nkept;
+}
+
+/*
+ * markreachable - recursively mark the states reachable from s
+ *
+ * A state is processed only if its tmp field currently equals "okay"; its
+ * tmp field is then set to "mark" and the search continues along each of
+ * its out arcs.  If the stack is too deep, REG_ETOOBIG is reported and the
+ * search is abandoned at that point.
+ */
+static void
+markreachable(struct nfa *nfa,
+              struct state *s,
+              struct state *okay,   /* consider only states with this mark */
+              struct state *mark)   /* the value to mark with */
+{
+    struct arc *outarc;
+
+    /* Recursion depth depends on the regex, so guard against overflow */
+    if (STACK_TOO_DEEP(nfa->v->re))
+    {
+        NERR(REG_ETOOBIG);
+        return;
+    }
+
+    if (s->tmp == okay)
+    {
+        s->tmp = mark;
+
+        outarc = s->outs;
+        while (outarc != NULL)
+        {
+            markreachable(nfa, outarc->to, okay, mark);
+            outarc = outarc->outchain;
+        }
+    }
+}
+
+/*
+ * markcanreach - recursively mark the states that can reach s
+ *
+ * A state is processed only if its tmp field currently equals "okay"; its
+ * tmp field is then set to "mark" and the search continues backwards along
+ * each of its in arcs.  If the stack is too deep, REG_ETOOBIG is reported
+ * and the search is abandoned at that point.
+ */
+static void
+markcanreach(struct nfa *nfa,
+             struct state *s,
+             struct state *okay,    /* consider only states with this mark */
+             struct state *mark)    /* the value to mark with */
+{
+    struct arc *inarc;
+
+    /* Recursion depth depends on the regex, so guard against overflow */
+    if (STACK_TOO_DEEP(nfa->v->re))
+    {
+        NERR(REG_ETOOBIG);
+        return;
+    }
+
+    if (s->tmp == okay)
+    {
+        s->tmp = mark;
+
+        inarc = s->ins;
+        while (inarc != NULL)
+        {
+            markcanreach(nfa, inarc->from, okay, mark);
+            inarc = inarc->inchain;
+        }
+    }
+}
+
+/*
+ * analyze - derive potentially useful facts about an optimized NFA
+ *
+ * Returns re_info bits for the caller to OR in: REG_UIMPOSSIBLE if the pre
+ * state has no out arcs, REG_UEMPTYMATCH if the post state lies two arcs
+ * away from the pre state, otherwise 0.  Returns 0 at once if an error has
+ * already been reported.  As a side effect, checkmatchall() is run on the
+ * NFA unless REG_UIMPOSSIBLE is returned.
+ */
+static long                     /* re_info bits to be ORed in */
+analyze(struct nfa *nfa)
+{
+    struct arc *first;
+    struct arc *second;
+
+    if (NISERR())
+        return 0;
+
+    /* With no way out of the pre state, nothing can ever match */
+    if (nfa->pre->outs == NULL)
+        return REG_UIMPOSSIBLE;
+
+    /* See whether the NFA matches all strings (maybe with length bounds) */
+    checkmatchall(nfa);
+
+    /* See whether a zero-length string can possibly match */
+    first = nfa->pre->outs;
+    while (first != NULL)
+    {
+        second = first->to->outs;
+        while (second != NULL)
+        {
+            if (second->to == nfa->post)
+                return REG_UEMPTYMATCH;
+            second = second->outchain;
+        }
+        first = first->outchain;
+    }
+
+    return 0;
+}
+
+/*
+ * checkmatchall - does the NFA represent no more than a string length test?
+ *
+ * If so, set nfa->minmatchall and nfa->maxmatchall correctly (they are -1
+ * to begin with) and set the MATCHALL bit in nfa->flags.
+ *
+ * To succeed, we require all arcs to be PLAIN RAINBOW arcs, except for those
+ * for pseudocolors (i.e., BOS/BOL/EOS/EOL). We must be able to reach the
+ * post state via RAINBOW arcs, and if there are any loops in the graph, they
+ * must be loop-to-self arcs, ensuring that each loop iteration consumes
+ * exactly one character. (Longer loops are problematic because they create
+ * non-consecutive possible match lengths; we have no good way to represent
+ * that situation for lengths beyond the DUPINF limit.)
+ *
+ * Pseudocolor arcs complicate things a little. We know that they can only
+ * appear as pre-state outarcs (for BOS/BOL) or post-state inarcs (for
+ * EOS/EOL). There, they must exactly replicate the parallel RAINBOW arcs,
+ * e.g. if the pre state has one RAINBOW outarc to state 2, it must have BOS
+ * and BOL outarcs to state 2, and no others. Missing or extra pseudocolor
+ * arcs can occur, meaning that the NFA involves some constraint on the
+ * adjacent characters, which makes it not a matchall NFA.
+ */
+static void
+checkmatchall(struct nfa *nfa)
+{
+ bool **haspaths; /* per-state path-length arrays, indexed by state no */
+ struct state *s;
+ int i;
+
+ /*
+ * If there are too many states, don't bother trying to detect matchall.
+ * This limit serves to bound the time and memory we could consume below.
+ * Note that even if the graph is all-RAINBOW, if there are significantly
+ * more than DUPINF states then it's likely that there are paths of length
+ * more than DUPINF, which would force us to fail anyhow. In practice,
+ * plausible ways of writing a matchall regex with maximum finite path
+ * length K tend not to have very many more than K states.
+ */
+ if (nfa->nstates > DUPINF * 2)
+ return;
+
+ /*
+ * First, scan all the states to verify that only RAINBOW arcs appear,
+ * plus pseudocolor arcs adjacent to the pre and post states. This lets
+ * us quickly eliminate most cases that aren't matchall NFAs.
+ *
+ * Nothing has been allocated yet, so the early returns in this loop
+ * need no cleanup.
+ */
+ for (s = nfa->states; s != NULL; s = s->next)
+ {
+ struct arc *a;
+
+ for (a = s->outs; a != NULL; a = a->outchain)
+ {
+ if (a->type != PLAIN)
+ return; /* any LACONs make it non-matchall */
+ if (a->co != RAINBOW)
+ {
+ if (nfa->cm->cd[a->co].flags & PSEUDO)
+ {
+ /*
+ * Pseudocolor arc: verify it's in a valid place (this
+ * seems quite unlikely to fail, but let's be sure).
+ */
+ if (s == nfa->pre &&
+ (a->co == nfa->bos[0] || a->co == nfa->bos[1]))
+ /* okay BOS/BOL arc */ ;
+ else if (a->to == nfa->post &&
+ (a->co == nfa->eos[0] || a->co == nfa->eos[1]))
+ /* okay EOS/EOL arc */ ;
+ else
+ return; /* unexpected pseudocolor arc */
+ /* We'll check these arcs some more below. */
+ }
+ else
+ return; /* any other color makes it non-matchall */
+ }
+ }
+ /* Also, assert that the tmp fields are available for use. */
+ assert(s->tmp == NULL);
+ }
+
+ /*
+ * The next cheapest check we can make is to verify that the BOS/BOL
+ * outarcs of the pre state reach the same states as its RAINBOW outarcs.
+ * If they don't, the NFA expresses some constraints on the character
+ * before the matched string, making it non-matchall. Likewise, the
+ * EOS/EOL inarcs of the post state must match its RAINBOW inarcs.
+ */
+ if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) ||
+ !check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) ||
+ !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) ||
+ !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]))
+ return;
+
+ /*
+ * Initialize an array of path-length arrays, in which
+ * checkmatchall_recurse will return per-state results. This lets us
+ * memo-ize the recursive search and avoid exponential time consumption.
+ * Entries start out NULL and are filled in as states get visited.
+ */
+ haspaths = (bool **) MALLOC(nfa->nstates * sizeof(bool *));
+ if (haspaths == NULL)
+ return; /* fail quietly */
+ memset(haspaths, 0, nfa->nstates * sizeof(bool *));
+
+ /*
+ * Recursively search the graph for all-RAINBOW paths to the "post" state,
+ * starting at the "pre" state, and computing the lengths of the paths.
+ * (Given the preceding checks, there should be at least one such path.
+ * However we could get back a false result anyway, in case there are
+ * multi-state loops, paths exceeding DUPINF+1 length, or non-algorithmic
+ * failures such as ENOMEM.)
+ */
+ if (checkmatchall_recurse(nfa, nfa->pre, haspaths))
+ {
+ /* The useful result is the path length array for the pre state */
+ bool *haspath = haspaths[nfa->pre->no];
+ int minmatch, /* shortest possible path length */
+ maxmatch, /* last length in the first consecutive run */
+ morematch; /* scans for lengths beyond that run */
+
+ assert(haspath != NULL);
+
+ /*
+ * haspath[] now represents the set of possible path lengths; but we
+ * want to reduce that to a min and max value, because it doesn't seem
+ * worth complicating regexec.c to deal with nonconsecutive possible
+ * match lengths. Find min and max of first run of lengths, then
+ * verify there are no nonconsecutive lengths.
+ */
+ for (minmatch = 0; minmatch <= DUPINF + 1; minmatch++)
+ {
+ if (haspath[minmatch])
+ break;
+ }
+ assert(minmatch <= DUPINF + 1); /* else checkmatchall_recurse lied */
+ for (maxmatch = minmatch; maxmatch < DUPINF + 1; maxmatch++)
+ {
+ if (!haspath[maxmatch + 1])
+ break;
+ }
+ for (morematch = maxmatch + 1; morematch <= DUPINF + 1; morematch++)
+ {
+ if (haspath[morematch])
+ {
+ haspath = NULL; /* fail, there are nonconsecutive lengths */
+ break;
+ }
+ }
+
+ if (haspath != NULL)
+ {
+ /*
+ * Success, so record the info. Here we have a fine point: the
+ * path length from the pre state includes the pre-to-initial
+ * transition, so it's one more than the actually matched string
+ * length. (We avoided counting the final-to-post transition
+ * within checkmatchall_recurse, but not this one.) This is why
+ * checkmatchall_recurse allows one more level of path length than
+ * might seem necessary. This decrement also takes care of
+ * converting checkmatchall_recurse's definition of "infinity" as
+ * "DUPINF+1" to our normal representation as "DUPINF".
+ */
+ assert(minmatch > 0); /* else pre and post states were adjacent */
+ nfa->minmatchall = minmatch - 1;
+ nfa->maxmatchall = maxmatch - 1;
+ nfa->flags |= MATCHALL;
+ }
+ }
+
+ /*
+ * Clean up. Setting haspath to NULL above only changed the local copy
+ * of the pointer; the array itself is still referenced from haspaths[]
+ * and is freed here along with all the others.
+ */
+ for (i = 0; i < nfa->nstates; i++)
+ {
+ if (haspaths[i] != NULL)
+ FREE(haspaths[i]);
+ }
+ FREE(haspaths);
+}
+
+/*
+ * checkmatchall_recurse - recursive search for checkmatchall
+ *
+ * s is the state to be examined in this recursion level.
+ * haspaths[] is an array of per-state exit path length arrays.
+ *
+ * We return true if the search was performed successfully, false if
+ * we had to fail because of multi-state loops or other internal reasons.
+ * (Because "dead" states that can't reach the post state have been
+ * eliminated, and we already verified that only RAINBOW and matching
+ * pseudocolor arcs exist, every state should have RAINBOW path(s) to
+ * the post state. Hence we take a false result from recursive calls
+ * as meaning that we'd better fail altogether, not just that that
+ * particular state can't reach the post state.)
+ *
+ * On success, we store a malloc'd result array in haspaths[s->no],
+ * showing the possible path lengths from s to the post state.
+ * Each state's haspath[] array is of length DUPINF+2. The entries from
+ * k = 0 to DUPINF are true if there is an all-RAINBOW path of length k
+ * from this state to the string end. haspath[DUPINF+1] is true if all
+ * path lengths >= DUPINF+1 are possible. (Situations that cannot be
+ * represented under these rules cause failure.)
+ *
+ * checkmatchall is responsible for eventually freeing the haspath[] arrays.
+ */
+static bool
+checkmatchall_recurse(struct nfa *nfa, struct state *s, bool **haspaths)
+{
+ bool result = false; /* set true once a path to post is known */
+ bool foundloop = false; /* true if s has a RAINBOW arc to itself */
+ bool *haspath; /* path-length array being built for s */
+ struct arc *a;
+
+ /*
+ * Since this is recursive, it could be driven to stack overflow. But we
+ * need not treat that as a hard failure; just deem the NFA non-matchall.
+ */
+ if (STACK_TOO_DEEP(nfa->v->re))
+ return false;
+
+ /* In case the search takes a long time, check for cancel */
+ if (CANCEL_REQUESTED(nfa->v->re))
+ {
+ NERR(REG_CANCEL);
+ return false;
+ }
+
+ /* Create a haspath array for this state, initially all false */
+ haspath = (bool *) MALLOC((DUPINF + 2) * sizeof(bool));
+ if (haspath == NULL)
+ return false; /* again, treat as non-matchall */
+ memset(haspath, 0, (DUPINF + 2) * sizeof(bool));
+
+ /*
+ * Mark this state as being visited. A non-NULL tmp field is what the
+ * loop below tests to recognize a state that is still busy.
+ */
+ assert(s->tmp == NULL);
+ s->tmp = s;
+
+ for (a = s->outs; a != NULL; a = a->outchain)
+ {
+ if (a->co != RAINBOW)
+ continue; /* ignore pseudocolor arcs */
+ if (a->to == nfa->post)
+ {
+ /* We found an all-RAINBOW path to the post state */
+ result = true;
+
+ /*
+ * Mark this state as being zero steps away from the string end
+ * (the transition to the post state isn't counted).
+ */
+ haspath[0] = true;
+ }
+ else if (a->to == s)
+ {
+ /* We found a cycle of length 1, which we'll deal with below. */
+ foundloop = true;
+ }
+ else if (a->to->tmp != NULL)
+ {
+ /* It's busy, so we found a cycle of length > 1, so fail. */
+ result = false;
+ break;
+ }
+ else
+ {
+ /* Consider paths forward through this to-state. */
+ bool *nexthaspath; /* the to-state's path-length array */
+ int i;
+
+ /* If to-state was not already visited, recurse */
+ if (haspaths[a->to->no] == NULL)
+ {
+ result = checkmatchall_recurse(nfa, a->to, haspaths);
+ /* Fail if any recursive path fails */
+ if (!result)
+ break;
+ }
+ else
+ {
+ /* The previous visit must have found path(s) to the end */
+ result = true;
+ }
+ assert(a->to->tmp == NULL);
+ nexthaspath = haspaths[a->to->no];
+ assert(nexthaspath != NULL);
+
+ /*
+ * Now, for every path of length i from a->to to the string end,
+ * there is a path of length i + 1 from s to the string end.
+ */
+ if (nexthaspath[DUPINF] != nexthaspath[DUPINF + 1])
+ {
+ /*
+ * a->to has a path of length exactly DUPINF, but not longer;
+ * or it has paths of all lengths > DUPINF but not one of
+ * exactly that length. In either case, we cannot represent
+ * the possible path lengths from s correctly, so fail.
+ */
+ result = false;
+ break;
+ }
+ /* Merge knowledge of these path lengths into what we have */
+ for (i = 0; i < DUPINF; i++)
+ haspath[i + 1] |= nexthaspath[i];
+ /* Infinity + 1 is still infinity */
+ haspath[DUPINF + 1] |= nexthaspath[DUPINF + 1];
+ }
+ }
+
+ if (result && foundloop)
+ {
+ /*
+ * If there is a length-1 loop at this state, then find the shortest
+ * known path length to the end. The loop means that every larger
+ * path length is possible, too. (It doesn't matter whether any of
+ * the longer lengths were already known possible.)
+ */
+ int i;
+
+ for (i = 0; i <= DUPINF; i++)
+ {
+ if (haspath[i])
+ break;
+ }
+ for (i++; i <= DUPINF + 1; i++) /* every length above the shortest */
+ haspath[i] = true;
+ }
+
+ /*
+ * Report out the completed path length map. This is done on the failure
+ * paths that break out of the loop above as well, so that the array
+ * stays referenced from haspaths[] and checkmatchall can free it.
+ */
+ assert(s->no < nfa->nstates);
+ assert(haspaths[s->no] == NULL);
+ haspaths[s->no] = haspath;
+
+ /* Mark state no longer busy */
+ s->tmp = NULL;
+
+ return result;
+}
+
+/*
+ * check_out_colors_match - subroutine for checkmatchall
+ *
+ * Report whether the to-states of s's out-arcs colored co1 are exactly the
+ * to-states of s's out-arcs colored co2.  Arc types are not looked at;
+ * checkmatchall is stated to have verified already that every arc in the
+ * NFA is PLAIN.
+ *
+ * The tmp fields of the to-states serve as scratch marks, and every mark
+ * set here is cleared again before returning.  The method is linear in the
+ * number of out-arcs, on the assumption that the NFA has no duplicate arcs.
+ */
+static bool
+check_out_colors_match(struct state *s, color co1, color co2)
+{
+    struct arc *arc;
+    bool        matched = true;
+
+    /* Pass 1: put a mark on each state reached by a co1 arc */
+    for (arc = s->outs; arc != NULL; arc = arc->outchain)
+    {
+        if (arc->co != co1)
+            continue;
+        assert(arc->to->tmp == NULL);
+        arc->to->tmp = arc->to;
+    }
+
+    /* Pass 2: each co2 arc must land on a mark, which it then removes */
+    for (arc = s->outs; arc != NULL; arc = arc->outchain)
+    {
+        if (arc->co != co2)
+            continue;
+        if (arc->to->tmp == NULL)
+            matched = false;    /* unmatched co2 arc */
+        else
+            arc->to->tmp = NULL;
+    }
+
+    /* Pass 3: a mark that survived means an unmatched co1 arc; clear it */
+    for (arc = s->outs; arc != NULL; arc = arc->outchain)
+    {
+        if (arc->co != co1 || arc->to->tmp == NULL)
+            continue;
+        matched = false;        /* unmatched co1 arc */
+        arc->to->tmp = NULL;
+    }
+
+    return matched;
+}
+
+/*
+ * check_in_colors_match - subroutine for checkmatchall
+ *
+ * Report whether the from-states of s's in-arcs colored co1 are exactly the
+ * from-states of s's in-arcs colored co2.  Arc types are not looked at;
+ * checkmatchall is stated to have verified already that every arc in the
+ * NFA is PLAIN.
+ *
+ * The tmp fields of the from-states serve as scratch marks, and every mark
+ * set here is cleared again before returning.
+ */
+static bool
+check_in_colors_match(struct state *s, color co1, color co2)
+{
+    struct arc *arc;
+    bool        matched = true;
+
+    /* Pass 1: put a mark on each state that reaches s by a co1 arc */
+    for (arc = s->ins; arc != NULL; arc = arc->inchain)
+    {
+        if (arc->co != co1)
+            continue;
+        assert(arc->from->tmp == NULL);
+        arc->from->tmp = arc->from;
+    }
+
+    /* Pass 2: each co2 arc must start at a mark, which it then removes */
+    for (arc = s->ins; arc != NULL; arc = arc->inchain)
+    {
+        if (arc->co != co2)
+            continue;
+        if (arc->from->tmp == NULL)
+            matched = false;    /* unmatched co2 arc */
+        else
+            arc->from->tmp = NULL;
+    }
+
+    /* Pass 3: a mark that survived means an unmatched co1 arc; clear it */
+    for (arc = s->ins; arc != NULL; arc = arc->inchain)
+    {
+        if (arc->co != co1 || arc->from->tmp == NULL)
+            continue;
+        matched = false;        /* unmatched co1 arc */
+        arc->from->tmp = NULL;
+    }
+
+    return matched;
+}
+
+/*
+ * compact - construct the compact representation of an NFA
+ *
+ * Fills *cnfa from *nfa.  The result has one flag byte and one arc-list
+ * pointer per state, plus a single arcs array in which each state's
+ * out-arcs (sorted by carcsort) are followed by a COLORLESS endmarker.
+ *
+ * Errors are reported through NERR: REG_ESPACE if any of the three arrays
+ * cannot be allocated (whatever was allocated is freed before returning),
+ * and REG_ASSERT if an out-arc is neither PLAIN nor LACON (in that case
+ * the function returns immediately without freeing the arrays here).
+ */
+static void
+compact(struct nfa *nfa,
+ struct cnfa *cnfa)
+{
+ struct state *s;
+ struct arc *a;
+ size_t nstates;
+ size_t narcs;
+ struct carc *ca;
+ struct carc *first;
+
+ assert(!NISERR());
+
+ /* Count states and arcs so the arrays can be sized exactly */
+ nstates = 0;
+ narcs = 0;
+ for (s = nfa->states; s != NULL; s = s->next)
+ {
+ nstates++;
+ narcs += s->nouts + 1; /* need one extra for endmarker */
+ }
+
+ cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
+ cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
+ cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
+ if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
+ {
+ /* Release whichever allocations did succeed, then report failure */
+ if (cnfa->stflags != NULL)
+ FREE(cnfa->stflags);
+ if (cnfa->states != NULL)
+ FREE(cnfa->states);
+ if (cnfa->arcs != NULL)
+ FREE(cnfa->arcs);
+ NERR(REG_ESPACE);
+ return;
+ }
+ /* Copy the scalar properties of the NFA */
+ cnfa->nstates = nstates;
+ cnfa->pre = nfa->pre->no;
+ cnfa->post = nfa->post->no;
+ cnfa->bos[0] = nfa->bos[0];
+ cnfa->bos[1] = nfa->bos[1];
+ cnfa->eos[0] = nfa->eos[0];
+ cnfa->eos[1] = nfa->eos[1];
+ cnfa->ncolors = maxcolor(nfa->cm) + 1;
+ cnfa->flags = nfa->flags;
+ cnfa->minmatchall = nfa->minmatchall;
+ cnfa->maxmatchall = nfa->maxmatchall;
+
+ /* ca walks through the arcs array as each state's arcs are emitted */
+ ca = cnfa->arcs;
+ for (s = nfa->states; s != NULL; s = s->next)
+ {
+ assert((size_t) s->no < nstates);
+ cnfa->stflags[s->no] = 0;
+ cnfa->states[s->no] = ca;
+ first = ca;
+ for (a = s->outs; a != NULL; a = a->outchain)
+ switch (a->type)
+ {
+ case PLAIN:
+ ca->co = a->co;
+ ca->to = a->to->no;
+ ca++;
+ break;
+ case LACON:
+ assert(s->no != cnfa->pre);
+ assert(a->co >= 0);
+ /* LACON arcs are stored with color numbers offset by ncolors */
+ ca->co = (color) (cnfa->ncolors + a->co);
+ ca->to = a->to->no;
+ ca++;
+ cnfa->flags |= HASLACONS;
+ break;
+ default:
+ NERR(REG_ASSERT);
+ return;
+ }
+ /* Sort this state's arcs, then terminate its list with the endmarker */
+ carcsort(first, ca - first);
+ ca->co = COLORLESS;
+ ca->to = 0;
+ ca++;
+ }
+ assert(ca == &cnfa->arcs[narcs]);
+ assert(cnfa->nstates != 0);
+
+ /* mark no-progress states */
+ for (a = nfa->pre->outs; a != NULL; a = a->outchain)
+ cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
+ cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
+}
+
+/*
+ * carcsort - sort compacted-NFA arcs by color
+ *
+ * Orders the n arcs beginning at first using carc_cmp.  An array of fewer
+ * than two arcs is left untouched.
+ */
+static void
+carcsort(struct carc *first, size_t n)
+{
+    if (n <= 1)
+        return;                 /* nothing to sort */
+
+    qsort(first, n, sizeof(struct carc), carc_cmp);
+}
+
+/*
+ * carc_cmp - qsort comparator for compacted arcs
+ *
+ * Orders primarily by color and secondarily by target state number.
+ * Returns -1, +1, or 0 in the usual comparator convention.
+ */
+static int
+carc_cmp(const void *a, const void *b)
+{
+    const struct carc *lhs = (const struct carc *) a;
+    const struct carc *rhs = (const struct carc *) b;
+
+    if (lhs->co != rhs->co)
+        return (lhs->co < rhs->co) ? -1 : +1;
+    if (lhs->to != rhs->to)
+        return (lhs->to < rhs->to) ? -1 : +1;
+
+    /* This is unreached, since there should be no duplicate arcs now: */
+    return 0;
+}
+
+/*
+ * freecnfa - free a compacted NFA
+ *
+ * Releases the stflags, states, and arcs arrays of *cnfa and then applies
+ * ZAPCNFA to it.  The caller must not pass a cnfa for which NULLCNFA is
+ * already true (checked by assertion only).
+ */
+static void
+freecnfa(struct cnfa *cnfa)
+{
+ assert(!NULLCNFA(*cnfa)); /* not empty already */
+ FREE(cnfa->stflags);
+ FREE(cnfa->states);
+ FREE(cnfa->arcs);
+ ZAPCNFA(*cnfa);
+}
+
+/*
+ * dumpnfa - dump an NFA in human-readable form
+ *
+ * Writes to f a header line (pre/post state numbers, any non-COLORLESS
+ * bos/eos entries, and flag information), then each state via dumpstate(),
+ * then state and arc totals.  The color map is dumped too when the NFA has
+ * no parent.  The body is compiled only under REG_DEBUG; otherwise this is
+ * a no-op.
+ */
+static void
+dumpnfa(struct nfa *nfa,
+        FILE *f)
+{
+#ifdef REG_DEBUG
+    struct state *st;
+    int         statecount = 0;
+    int         arccount = 0;
+
+    /* header line */
+    fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
+    if (nfa->bos[0] != COLORLESS)
+        fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
+    if (nfa->bos[1] != COLORLESS)
+        fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
+    if (nfa->eos[0] != COLORLESS)
+        fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
+    if (nfa->eos[1] != COLORLESS)
+        fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
+    if (nfa->flags & HASLACONS)
+        fprintf(f, ", haslacons");
+    if (nfa->flags & MATCHALL)
+    {
+        fprintf(f, ", minmatchall %d", nfa->minmatchall);
+        if (nfa->maxmatchall != DUPINF)
+            fprintf(f, ", maxmatchall %d", nfa->maxmatchall);
+        else
+            fprintf(f, ", maxmatchall inf");
+    }
+    fprintf(f, "\n");
+
+    /* one entry per state, counting as we go */
+    st = nfa->states;
+    while (st != NULL)
+    {
+        dumpstate(st, f);
+        statecount++;
+        arccount += st->nouts;
+        st = st->next;
+    }
+    fprintf(f, "total of %d states, %d arcs\n", statecount, arccount);
+
+    /* only an NFA without a parent dumps the color map */
+    if (nfa->parent == NULL)
+        dumpcolors(nfa->cm, f);
+    fflush(f);
+#endif
+}
+
+#ifdef REG_DEBUG /* subordinates of dumpnfa */
+
+/*
+ * dumpstate - dump an NFA state in human-readable form
+ *
+ * Prints the state number, a "T" if its tmp field is set, and its flag
+ * character ('.' if the flag is zero), followed by its out-arcs.  Also
+ * reports a broken prev/next linkage and any arc on the in-chain whose
+ * "to" is not this state.
+ */
+static void
+dumpstate(struct state *s,
+          FILE *f)
+{
+    struct arc *in;
+    const char *tmpmark = (s->tmp != NULL) ? "T" : "";
+    int         flagch = (s->flag) ? s->flag : '.';
+
+    fprintf(f, "%d%s%c", s->no, tmpmark, flagch);
+
+    if (s->prev != NULL && s->prev->next != s)
+        fprintf(f, "\tstate chain bad\n");
+
+    if (s->nouts != 0)
+        dumparcs(s, f);
+    else
+        fprintf(f, "\tno out arcs\n");
+
+    /* sanity-check the in-chain */
+    for (in = s->ins; in != NULL; in = in->inchain)
+    {
+        if (in->to == s)
+            continue;
+        fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
+                in->from->no, in->to->no, s->no);
+    }
+    fflush(f);
+}
+
+/*
+ * dumparcs - dump out-arcs in human-readable form
+ *
+ * Prints the out-arcs of s, five per line, starting from the tail of the
+ * out-chain and walking back toward its head.  The state must have at
+ * least one out-arc.
+ */
+static void
+dumparcs(struct state *s,
+         FILE *f)
+{
+    struct arc *cur;
+    int         column = 1;
+
+    /* printing oldest arcs first is usually clearer: find the chain's tail */
+    cur = s->outs;
+    assert(cur != NULL);
+    for (; cur->outchain != NULL; cur = cur->outchain)
+        ;
+
+    /* now walk backwards through the chain */
+    for (; cur != NULL; cur = cur->outchainRev)
+    {
+        dumparc(cur, s, f);
+        if (column == 5)
+        {
+            fprintf(f, "\n");
+            column = 1;
+        }
+        else
+            column++;
+    }
+
+    /* finish a partially filled line */
+    if (column != 1)
+        fprintf(f, "\n");
+}
+
+/*
+ * dumparc - dump one outarc in readable form, including prefixing tab
+ *
+ * Prints a type-specific rendering of the arc's color, then "->" and the
+ * target state number.  Inconsistencies are flagged inline: "?N?" if the
+ * arc's from-state is not s, and "?!?" if the arc is absent from its
+ * from-state's out-chain or its to-state's in-chain.
+ */
+static void
+dumparc(struct arc *a,
+        struct state *s,
+        FILE *f)
+{
+    struct arc *scan;
+
+    fprintf(f, "\t");
+    switch (a->type)
+    {
+        case PLAIN:
+            if (a->co != RAINBOW)
+                fprintf(f, "[%ld]", (long) a->co);
+            else
+                fprintf(f, "[*]");
+            break;
+        case AHEAD:
+            if (a->co != RAINBOW)
+                fprintf(f, ">%ld>", (long) a->co);
+            else
+                fprintf(f, ">*>");
+            break;
+        case BEHIND:
+            if (a->co != RAINBOW)
+                fprintf(f, "<%ld<", (long) a->co);
+            else
+                fprintf(f, "<*<");
+            break;
+        case LACON:
+            fprintf(f, ":%ld:", (long) a->co);
+            break;
+        case '^':
+        case '$':
+            fprintf(f, "%c%d", a->type, (int) a->co);
+            break;
+        case EMPTY:
+            break;
+        default:
+            fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
+            break;
+    }
+
+    if (a->from != s)
+        fprintf(f, "?%d?", a->from->no);
+
+    /* is the arc really on its from-state's out-chain? */
+    scan = a->from->outs;
+    while (scan != NULL && scan != a)
+        scan = scan->outchain;
+    if (scan == NULL)
+        fprintf(f, "?!?");      /* missing from out-chain */
+
+    fprintf(f, "->");
+    if (a->to == NULL)
+    {
+        fprintf(f, "NULL");
+        return;
+    }
+    fprintf(f, "%d", a->to->no);
+
+    /* is the arc really on its to-state's in-chain? */
+    scan = a->to->ins;
+    while (scan != NULL && scan != a)
+        scan = scan->inchain;
+    if (scan == NULL)
+        fprintf(f, "?!?");      /* missing from in-chain */
+}
+#endif /* REG_DEBUG */
+
+/*
+ * dumpcnfa - dump a compacted NFA in human-readable form
+ *
+ * Writes to f a header line (pre/post state numbers, any non-COLORLESS
+ * bos/eos entries, and flag information), then each state via
+ * dumpcstate().  Only built when REG_DEBUG is defined.
+ */
+#ifdef REG_DEBUG
+static void
+dumpcnfa(struct cnfa *cnfa,
+         FILE *f)
+{
+    int         stateno;
+
+    /* header line */
+    fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
+    if (cnfa->bos[0] != COLORLESS)
+        fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]);
+    if (cnfa->bos[1] != COLORLESS)
+        fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]);
+    if (cnfa->eos[0] != COLORLESS)
+        fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]);
+    if (cnfa->eos[1] != COLORLESS)
+        fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
+    if (cnfa->flags & HASLACONS)
+        fprintf(f, ", haslacons");
+    if (cnfa->flags & MATCHALL)
+    {
+        fprintf(f, ", minmatchall %d", cnfa->minmatchall);
+        if (cnfa->maxmatchall != DUPINF)
+            fprintf(f, ", maxmatchall %d", cnfa->maxmatchall);
+        else
+            fprintf(f, ", maxmatchall inf");
+    }
+    fprintf(f, "\n");
+
+    /* one entry per state */
+    stateno = 0;
+    while (stateno < cnfa->nstates)
+    {
+        dumpcstate(stateno, cnfa, f);
+        stateno++;
+    }
+    fflush(f);
+}
+#endif
+
+#ifdef REG_DEBUG /* subordinates of dumpcnfa */
+
+/*
+ * dumpcstate - dump a compacted-NFA state in human-readable form
+ *
+ * Prints the state number followed by ":" if the state is flagged
+ * CNFA_NOPROGRESS or "." if not, then its arcs up to the COLORLESS
+ * endmarker, five per line.  Arcs whose color is not below ncolors are
+ * shown in ":n:" form with ncolors subtracted.
+ */
+static void
+dumpcstate(int st,
+           struct cnfa *cnfa,
+           FILE *f)
+{
+    struct carc *start = cnfa->states[st];
+    struct carc *ca;
+    const char *progmark = (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".";
+    int         column = 1;
+
+    fprintf(f, "%d%s", st, progmark);
+
+    for (ca = start; ca->co != COLORLESS; ca++)
+    {
+        if (ca->co == RAINBOW)
+            fprintf(f, "\t[*]->%d", ca->to);
+        else if (ca->co >= cnfa->ncolors)
+            fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
+        else
+            fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
+
+        if (column == 5)
+        {
+            fprintf(f, "\n");
+            column = 1;
+        }
+        else
+            column++;
+    }
+
+    /* end the line if the state had no arcs or the last line is partial */
+    if (ca == start || column != 1)
+        fprintf(f, "\n");
+    fflush(f);
+}
+
+#endif /* REG_DEBUG */
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
new file mode 100644
index 0000000..e1f9df0
--- /dev/null
+++ b/src/backend/regex/regc_pg_locale.c
@@ -0,0 +1,940 @@
+/*-------------------------------------------------------------------------
+ *
+ * regc_pg_locale.c
+ * ctype functions adapted to work on pg_wchar (a/k/a chr),
+ * and functions to cache the results of wholesale ctype probing.
+ *
+ * This file is #included by regcomp.c; it's not meant to compile standalone.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/regex/regc_pg_locale.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "catalog/pg_collation.h"
+#include "utils/pg_locale.h"
+
+/*
+ * To provide as much functionality as possible on a variety of platforms,
+ * without going so far as to implement everything from scratch, we use
+ * several implementation strategies depending on the situation:
+ *
+ * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
+ * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
+ * collations don't give a fig about multibyte characters.
+ *
+ * 2. In the "default" collation (which is supposed to obey LC_CTYPE):
+ *
+ * 2a. When working in UTF8 encoding, we use the <wctype.h> functions.
+ * This assumes that every platform uses Unicode codepoints directly
+ * as the wchar_t representation of Unicode. On some platforms
+ * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
+ *
+ * 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar
+ * values up to 255, and punt for values above that. This is 100% correct
+ * only in single-byte encodings such as LATINn. However, non-Unicode
+ * multibyte encodings are mostly Far Eastern character sets for which the
+ * properties being tested here aren't very relevant for higher code values
+ * anyway. The difficulty with using the <wctype.h> functions with
+ * non-Unicode multibyte encodings is that we can have no certainty that
+ * the platform's wchar_t representation matches what we do in pg_wchar
+ * conversions.
+ *
+ * 3. Other collations are only supported on platforms that HAVE_LOCALE_T.
+ * Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h>
+ * functions, under exactly the same cases as #2.
+ *
+ * There is one notable difference between cases 2 and 3: in the "default"
+ * collation we force ASCII letters to follow ASCII upcase/downcase rules,
+ * while in a non-default collation we just let the library functions do what
+ * they will. The case where this matters is treatment of I/i in Turkish,
+ * and the behavior is meant to match the upper()/lower() SQL functions.
+ *
+ * We store the active collation setting in static variables. In principle
+ * it could be passed down to here via the regex library's "struct vars" data
+ * structure; but that would require somewhat invasive changes in the regex
+ * library, and right now there's no real benefit to be gained from that.
+ *
+ * NB: the coding here assumes pg_wchar is an unsigned type.
+ */
+
+/*
+ * Selects which family of character-classification routines the pg_wc_*
+ * functions in this file dispatch to.  The value is chosen by
+ * pg_set_regex_collation().
+ */
+typedef enum
+{
+ PG_REGEX_LOCALE_C, /* C locale (encoding independent) */
+ PG_REGEX_LOCALE_WIDE, /* Use <wctype.h> functions */
+ PG_REGEX_LOCALE_1BYTE, /* Use <ctype.h> functions */
+ PG_REGEX_LOCALE_WIDE_L, /* Use locale_t <wctype.h> functions */
+ PG_REGEX_LOCALE_1BYTE_L, /* Use locale_t <ctype.h> functions */
+ PG_REGEX_LOCALE_ICU /* Use ICU uchar.h functions */
+} PG_Locale_Strategy;
+
+/* Active collation settings; all three are assigned by pg_set_regex_collation() */
+static PG_Locale_Strategy pg_regex_strategy; /* dispatch key for pg_wc_* functions */
+static pg_locale_t pg_regex_locale; /* locale object; 0 in the C-locale path */
+static Oid pg_regex_collation; /* collation OID recorded for these settings */
+
+/*
+ * Hard-wired character properties for C locale
+ *
+ * Each PG_IS* macro is a single-bit flag (PG_ISALNUM is the union of the
+ * digit and alpha bits).  The flags are OR'ed together to form the entries
+ * of pg_char_properties[] and are tested against those entries with "&".
+ */
+#define PG_ISDIGIT 0x01
+#define PG_ISALPHA 0x02
+#define PG_ISALNUM (PG_ISDIGIT | PG_ISALPHA)
+#define PG_ISUPPER 0x04
+#define PG_ISLOWER 0x08
+#define PG_ISGRAPH 0x10
+#define PG_ISPRINT 0x20
+#define PG_ISPUNCT 0x40
+#define PG_ISSPACE 0x80
+
+/*
+ * Property flags for the 128 ASCII codes, indexed by character code.
+ * Each entry is the OR of the PG_IS* bits that apply to that character;
+ * the comment at the start of each line names the character.  Callers
+ * must check that the code is <= 127 before indexing.
+ */
+static const unsigned char pg_char_properties[128] = {
+ /* NUL */ 0,
+ /* ^A */ 0,
+ /* ^B */ 0,
+ /* ^C */ 0,
+ /* ^D */ 0,
+ /* ^E */ 0,
+ /* ^F */ 0,
+ /* ^G */ 0,
+ /* ^H */ 0,
+ /* ^I */ PG_ISSPACE,
+ /* ^J */ PG_ISSPACE,
+ /* ^K */ PG_ISSPACE,
+ /* ^L */ PG_ISSPACE,
+ /* ^M */ PG_ISSPACE,
+ /* ^N */ 0,
+ /* ^O */ 0,
+ /* ^P */ 0,
+ /* ^Q */ 0,
+ /* ^R */ 0,
+ /* ^S */ 0,
+ /* ^T */ 0,
+ /* ^U */ 0,
+ /* ^V */ 0,
+ /* ^W */ 0,
+ /* ^X */ 0,
+ /* ^Y */ 0,
+ /* ^Z */ 0,
+ /* ^[ */ 0,
+ /* ^\ */ 0,
+ /* ^] */ 0,
+ /* ^^ */ 0,
+ /* ^_ */ 0,
+ /* */ PG_ISPRINT | PG_ISSPACE,
+ /* ! */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* " */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* # */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* $ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* % */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* & */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ' */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ( */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ) */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* * */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* + */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* , */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* - */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* . */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* / */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* 0 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 1 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 2 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 3 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 4 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 5 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 6 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 7 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 8 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* 9 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+ /* : */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ; */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* < */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* = */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* > */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ? */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* @ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* A */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* B */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* C */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* D */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* E */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* F */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* G */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* H */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* I */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* J */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* K */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* L */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* M */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* N */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* O */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* P */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* Q */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* R */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* S */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* T */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* U */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* V */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* W */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* X */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* Y */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* Z */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+ /* [ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* \ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ] */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ^ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* _ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ` */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* a */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* b */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* c */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* d */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* e */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* f */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* g */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* h */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* i */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* j */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* k */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* l */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* m */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* n */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* o */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* p */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* q */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* r */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* s */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* t */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* u */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* v */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* w */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* x */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* y */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* z */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+ /* { */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* | */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* } */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* ~ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+ /* DEL */ 0
+};
+
+
+/*
+ * pg_set_regex_collation: set collation for these functions to obey
+ *
+ * This is called when beginning compilation or execution of a regexp.
+ * Since there's no need for reentrancy of regexp operations, it's okay
+ * to store the results in static variables.
+ *
+ * Sets pg_regex_strategy, pg_regex_locale and pg_regex_collation.
+ * Raises an error (via ereport) if the collation OID is invalid or if the
+ * collation's locale is marked nondeterministic.
+ */
+void
+pg_set_regex_collation(Oid collation)
+{
+ if (!OidIsValid(collation))
+ {
+ /*
+ * This typically means that the parser could not resolve a conflict
+ * of implicit collations, so report it that way.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INDETERMINATE_COLLATION),
+ errmsg("could not determine which collation to use for regular expression"),
+ errhint("Use the COLLATE clause to set the collation explicitly.")));
+ }
+
+ if (lc_ctype_is_c(collation))
+ {
+ /* C/POSIX collations use this path regardless of database encoding */
+ pg_regex_strategy = PG_REGEX_LOCALE_C;
+ pg_regex_locale = 0;
+ /* note: the recorded OID is C_COLLATION_OID, not the caller's value */
+ pg_regex_collation = C_COLLATION_OID;
+ }
+ else
+ {
+ /*
+ * NB: pg_newlocale_from_collation will fail if not HAVE_LOCALE_T; the
+ * case of pg_regex_locale != 0 but not HAVE_LOCALE_T does not have to
+ * be considered below.
+ */
+ pg_regex_locale = pg_newlocale_from_collation(collation);
+
+ if (pg_regex_locale && !pg_regex_locale->deterministic)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("nondeterministic collations are not supported for regular expressions")));
+
+ /*
+ * Pick the strategy: ICU if the locale's provider is ICU (only when
+ * built with USE_ICU); otherwise the wide-character routines for a
+ * UTF8 database and the single-byte routines for anything else, in
+ * the locale_t ("_L") flavor when a locale object is present.
+ */
+#ifdef USE_ICU
+ if (pg_regex_locale && pg_regex_locale->provider == COLLPROVIDER_ICU)
+ pg_regex_strategy = PG_REGEX_LOCALE_ICU;
+ else
+#endif
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ if (pg_regex_locale)
+ pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L;
+ else
+ pg_regex_strategy = PG_REGEX_LOCALE_WIDE;
+ }
+ else
+ {
+ if (pg_regex_locale)
+ pg_regex_strategy = PG_REGEX_LOCALE_1BYTE_L;
+ else
+ pg_regex_strategy = PG_REGEX_LOCALE_1BYTE;
+ }
+
+ pg_regex_collation = collation;
+ }
+}
+
+/*
+ * pg_wc_isdigit - is c a digit under the active regex locale strategy?
+ *
+ * Returns nonzero if so, else 0.  The test applied depends on
+ * pg_regex_strategy:
+ *	C: the hard-wired pg_char_properties table, for codes <= 127 only.
+ *	WIDE / WIDE_L: iswdigit() / iswdigit_l(), provided wchar_t is at least
+ *		4 bytes or c <= 0xFFFF; otherwise falls through to the 1-byte case.
+ *	1BYTE / 1BYTE_L: isdigit() / isdigit_l(), for codes <= UCHAR_MAX only.
+ *	ICU: u_isdigit().
+ * Codes outside the ranges above are reported as 0.
+ */
+static int
+pg_wc_isdigit(pg_wchar c)
+{
+ switch (pg_regex_strategy)
+ {
+ case PG_REGEX_LOCALE_C:
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISDIGIT));
+ case PG_REGEX_LOCALE_WIDE:
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswdigit((wint_t) c);
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE:
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ isdigit((unsigned char) c));
+ case PG_REGEX_LOCALE_WIDE_L:
+#ifdef HAVE_LOCALE_T
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
+#endif
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+ /* without HAVE_LOCALE_T, control drops to the final return 0 */
+ break;
+ case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+ return u_isdigit(c);
+#endif
+ break;
+ }
+ return 0; /* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_isalpha - is c alphabetic under the active regex locale strategy?
+ *
+ * Returns nonzero if so, else 0.  Dispatches on pg_regex_strategy: the C
+ * strategy uses the hard-wired table (codes <= 127), the wide strategies
+ * use iswalpha()/iswalpha_l() when the code fits in wchar_t, the 1-byte
+ * strategies use isalpha()/isalpha_l() (codes <= UCHAR_MAX), and the ICU
+ * strategy uses u_isalpha().  Out-of-range codes are reported as 0.
+ */
+static int
+pg_wc_isalpha(pg_wchar c)
+{
+ switch (pg_regex_strategy)
+ {
+ case PG_REGEX_LOCALE_C:
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISALPHA));
+ case PG_REGEX_LOCALE_WIDE:
+ /* skip the wide test only if wchar_t is narrow and c exceeds 16 bits */
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswalpha((wint_t) c);
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE:
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ isalpha((unsigned char) c));
+ case PG_REGEX_LOCALE_WIDE_L:
+#ifdef HAVE_LOCALE_T
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
+#endif
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+ /* without HAVE_LOCALE_T, control drops to the final return 0 */
+ break;
+ case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+ return u_isalpha(c);
+#endif
+ break;
+ }
+ return 0; /* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_isalnum - is c alphanumeric under the active regex locale strategy?
+ *
+ * Returns nonzero if so, else 0.  Dispatches on pg_regex_strategy: the C
+ * strategy uses the hard-wired table (codes <= 127; PG_ISALNUM covers both
+ * the digit and alpha bits), the wide strategies use
+ * iswalnum()/iswalnum_l() when the code fits in wchar_t, the 1-byte
+ * strategies use isalnum()/isalnum_l() (codes <= UCHAR_MAX), and the ICU
+ * strategy uses u_isalnum().  Out-of-range codes are reported as 0.
+ */
+static int
+pg_wc_isalnum(pg_wchar c)
+{
+ switch (pg_regex_strategy)
+ {
+ case PG_REGEX_LOCALE_C:
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISALNUM));
+ case PG_REGEX_LOCALE_WIDE:
+ /* skip the wide test only if wchar_t is narrow and c exceeds 16 bits */
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswalnum((wint_t) c);
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE:
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ isalnum((unsigned char) c));
+ case PG_REGEX_LOCALE_WIDE_L:
+#ifdef HAVE_LOCALE_T
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
+#endif
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+ /* without HAVE_LOCALE_T, control drops to the final return 0 */
+ break;
+ case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+ return u_isalnum(c);
+#endif
+ break;
+ }
+ return 0; /* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_isword - is c a "word" character?
+ *
+ * Word characters are the alnum class plus underscore.  Returns 1 for an
+ * underscore, and otherwise whatever pg_wc_isalnum() reports.
+ */
+static int
+pg_wc_isword(pg_wchar c)
+{
+    return (c == CHR('_')) ? 1 : pg_wc_isalnum(c);
+}
+
+/*
+ * pg_wc_isupper - is c upper case under the active regex locale strategy?
+ *
+ * Returns nonzero if so, else 0.  Dispatches on pg_regex_strategy: the C
+ * strategy uses the hard-wired table (codes <= 127), the wide strategies
+ * use iswupper()/iswupper_l() when the code fits in wchar_t, the 1-byte
+ * strategies use isupper()/isupper_l() (codes <= UCHAR_MAX), and the ICU
+ * strategy uses u_isupper().  Out-of-range codes are reported as 0.
+ */
+static int
+pg_wc_isupper(pg_wchar c)
+{
+ switch (pg_regex_strategy)
+ {
+ case PG_REGEX_LOCALE_C:
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISUPPER));
+ case PG_REGEX_LOCALE_WIDE:
+ /* skip the wide test only if wchar_t is narrow and c exceeds 16 bits */
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswupper((wint_t) c);
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE:
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ isupper((unsigned char) c));
+ case PG_REGEX_LOCALE_WIDE_L:
+#ifdef HAVE_LOCALE_T
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
+#endif
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ isupper_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+ /* without HAVE_LOCALE_T, control drops to the final return 0 */
+ break;
+ case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+ return u_isupper(c);
+#endif
+ break;
+ }
+ return 0; /* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_islower - is c lower case under the active regex locale strategy?
+ *
+ * Returns nonzero if so, else 0.  Dispatches on pg_regex_strategy: the C
+ * strategy uses the hard-wired table (codes <= 127), the wide strategies
+ * use iswlower()/iswlower_l() when the code fits in wchar_t, the 1-byte
+ * strategies use islower()/islower_l() (codes <= UCHAR_MAX), and the ICU
+ * strategy uses u_islower().  Out-of-range codes are reported as 0.
+ */
+static int
+pg_wc_islower(pg_wchar c)
+{
+ switch (pg_regex_strategy)
+ {
+ case PG_REGEX_LOCALE_C:
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISLOWER));
+ case PG_REGEX_LOCALE_WIDE:
+ /* skip the wide test only if wchar_t is narrow and c exceeds 16 bits */
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswlower((wint_t) c);
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE:
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ islower((unsigned char) c));
+ case PG_REGEX_LOCALE_WIDE_L:
+#ifdef HAVE_LOCALE_T
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
+#endif
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ islower_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+ /* without HAVE_LOCALE_T, control drops to the final return 0 */
+ break;
+ case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+ return u_islower(c);
+#endif
+ break;
+ }
+ return 0; /* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_isgraph - is c a graphic character under the active regex locale
+ * strategy?
+ *
+ * Returns nonzero if so, else 0.  Dispatches on pg_regex_strategy: the C
+ * strategy uses the hard-wired table (codes <= 127), the wide strategies
+ * use iswgraph()/iswgraph_l() when the code fits in wchar_t, the 1-byte
+ * strategies use isgraph()/isgraph_l() (codes <= UCHAR_MAX), and the ICU
+ * strategy uses u_isgraph().  Out-of-range codes are reported as 0.
+ */
+static int
+pg_wc_isgraph(pg_wchar c)
+{
+ switch (pg_regex_strategy)
+ {
+ case PG_REGEX_LOCALE_C:
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISGRAPH));
+ case PG_REGEX_LOCALE_WIDE:
+ /* skip the wide test only if wchar_t is narrow and c exceeds 16 bits */
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswgraph((wint_t) c);
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE:
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ isgraph((unsigned char) c));
+ case PG_REGEX_LOCALE_WIDE_L:
+#ifdef HAVE_LOCALE_T
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
+#endif
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+ /* without HAVE_LOCALE_T, control drops to the final return 0 */
+ break;
+ case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+ return u_isgraph(c);
+#endif
+ break;
+ }
+ return 0; /* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_isprint - is c printable under the active regex locale strategy?
+ *
+ * Returns nonzero if so, else 0.  Dispatches on pg_regex_strategy: the C
+ * strategy uses the hard-wired table (codes <= 127), the wide strategies
+ * use iswprint()/iswprint_l() when the code fits in wchar_t, the 1-byte
+ * strategies use isprint()/isprint_l() (codes <= UCHAR_MAX), and the ICU
+ * strategy uses u_isprint().  Out-of-range codes are reported as 0.
+ */
+static int
+pg_wc_isprint(pg_wchar c)
+{
+ switch (pg_regex_strategy)
+ {
+ case PG_REGEX_LOCALE_C:
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISPRINT));
+ case PG_REGEX_LOCALE_WIDE:
+ /* skip the wide test only if wchar_t is narrow and c exceeds 16 bits */
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswprint((wint_t) c);
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE:
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ isprint((unsigned char) c));
+ case PG_REGEX_LOCALE_WIDE_L:
+#ifdef HAVE_LOCALE_T
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
+#endif
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ isprint_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+ /* without HAVE_LOCALE_T, control drops to the final return 0 */
+ break;
+ case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+ return u_isprint(c);
+#endif
+ break;
+ }
+ return 0; /* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_ispunct - is c punctuation under the active regex locale strategy?
+ *
+ * Returns nonzero if so, else 0.  Dispatches on pg_regex_strategy: the C
+ * strategy uses the hard-wired table (codes <= 127), the wide strategies
+ * use iswpunct()/iswpunct_l() when the code fits in wchar_t, the 1-byte
+ * strategies use ispunct()/ispunct_l() (codes <= UCHAR_MAX), and the ICU
+ * strategy uses u_ispunct().  Out-of-range codes are reported as 0.
+ */
+static int
+pg_wc_ispunct(pg_wchar c)
+{
+ switch (pg_regex_strategy)
+ {
+ case PG_REGEX_LOCALE_C:
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISPUNCT));
+ case PG_REGEX_LOCALE_WIDE:
+ /* skip the wide test only if wchar_t is narrow and c exceeds 16 bits */
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswpunct((wint_t) c);
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE:
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ ispunct((unsigned char) c));
+ case PG_REGEX_LOCALE_WIDE_L:
+#ifdef HAVE_LOCALE_T
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
+#endif
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+ /* without HAVE_LOCALE_T, control drops to the final return 0 */
+ break;
+ case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+ return u_ispunct(c);
+#endif
+ break;
+ }
+ return 0; /* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_isspace - is c white space under the active regex locale strategy?
+ *
+ * Returns nonzero if so, else 0.  Dispatches on pg_regex_strategy: the C
+ * strategy uses the hard-wired table (codes <= 127), the wide strategies
+ * use iswspace()/iswspace_l() when the code fits in wchar_t, the 1-byte
+ * strategies use isspace()/isspace_l() (codes <= UCHAR_MAX), and the ICU
+ * strategy uses u_isspace().  Out-of-range codes are reported as 0.
+ */
+static int
+pg_wc_isspace(pg_wchar c)
+{
+ switch (pg_regex_strategy)
+ {
+ case PG_REGEX_LOCALE_C:
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISSPACE));
+ case PG_REGEX_LOCALE_WIDE:
+ /* skip the wide test only if wchar_t is narrow and c exceeds 16 bits */
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswspace((wint_t) c);
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE:
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ isspace((unsigned char) c));
+ case PG_REGEX_LOCALE_WIDE_L:
+#ifdef HAVE_LOCALE_T
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
+#endif
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+ return (c <= (pg_wchar) UCHAR_MAX &&
+ isspace_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+ /* without HAVE_LOCALE_T, control drops to the final return 0 */
+ break;
+ case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+ return u_isspace(c);
+#endif
+ break;
+ }
+ return 0; /* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_toupper - map c to upper case under the active regex locale strategy
+ *
+ * Dispatches on pg_regex_strategy.  In the C, WIDE and 1BYTE strategies,
+ * codes <= 127 always go through pg_ascii_toupper(); in the locale_t
+ * ("_L") strategies they are left to the library functions.  Codes that
+ * the selected library routine cannot handle (beyond wchar_t's range for
+ * the wide routines, beyond UCHAR_MAX for the 1-byte routines) are
+ * returned unchanged.  The ICU strategy uses u_toupper().
+ */
+static pg_wchar
+pg_wc_toupper(pg_wchar c)
+{
+ switch (pg_regex_strategy)
+ {
+ case PG_REGEX_LOCALE_C:
+ if (c <= (pg_wchar) 127)
+ return pg_ascii_toupper((unsigned char) c);
+ return c;
+ case PG_REGEX_LOCALE_WIDE:
+ /* force C behavior for ASCII characters, per comments above */
+ if (c <= (pg_wchar) 127)
+ return pg_ascii_toupper((unsigned char) c);
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return towupper((wint_t) c);
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE:
+ /* force C behavior for ASCII characters, per comments above */
+ if (c <= (pg_wchar) 127)
+ return pg_ascii_toupper((unsigned char) c);
+ if (c <= (pg_wchar) UCHAR_MAX)
+ return toupper((unsigned char) c);
+ return c;
+ case PG_REGEX_LOCALE_WIDE_L:
+#ifdef HAVE_LOCALE_T
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return towupper_l((wint_t) c, pg_regex_locale->info.lt);
+#endif
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+ if (c <= (pg_wchar) UCHAR_MAX)
+ return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
+#endif
+ /* no usable library routine for this code: leave it unchanged */
+ return c;
+ case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+ return u_toupper(c);
+#endif
+ break;
+ }
+ return 0; /* can't get here, but keep compiler quiet */
+}
+
+static pg_wchar
+pg_wc_tolower(pg_wchar c) /* map c to lower case under the active pg_regex_strategy */
+{
+ switch (pg_regex_strategy)
+ {
+ case PG_REGEX_LOCALE_C:
+ if (c <= (pg_wchar) 127)
+ return pg_ascii_tolower((unsigned char) c);
+ return c; /* codes above 127 are returned unchanged */
+ case PG_REGEX_LOCALE_WIDE:
+ /* force C behavior for ASCII characters, per comments above */
+ if (c <= (pg_wchar) 127)
+ return pg_ascii_tolower((unsigned char) c);
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) /* use the wide API only when wchar_t is >= 4 bytes or c <= 0xFFFF */
+ return towlower((wint_t) c);
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE:
+ /* force C behavior for ASCII characters, per comments above */
+ if (c <= (pg_wchar) 127)
+ return pg_ascii_tolower((unsigned char) c);
+ if (c <= (pg_wchar) UCHAR_MAX)
+ return tolower((unsigned char) c);
+ return c; /* codes above UCHAR_MAX are returned unchanged */
+ case PG_REGEX_LOCALE_WIDE_L:
+#ifdef HAVE_LOCALE_T
+ if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+ return towlower_l((wint_t) c, pg_regex_locale->info.lt);
+#endif
+ /* FALL THRU */
+ case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+ if (c <= (pg_wchar) UCHAR_MAX)
+ return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
+#endif
+ return c; /* unchanged if above UCHAR_MAX, or always when HAVE_LOCALE_T is not defined */
+ case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+ return u_tolower(c);
+#endif
+ break; /* executed only when USE_ICU is not defined */
+ }
+ return 0; /* can't get here, but keep compiler quiet */
+}
+
+
+/*
+ * These functions cache the results of probing libc's ctype behavior for
+ * all character codes of interest in a given encoding/collation. The
+ * result is provided as a "struct cvec", but notice that the representation
+ * is a touch different from a cvec created by regc_cvec.c: we allocate the
+ * chrs[] and ranges[] arrays separately from the struct so that we can
+ * realloc them larger at need. This is okay since the cvecs made here
+ * should never be freed by freecvec().
+ *
+ * We use malloc not palloc since we mustn't lose control on out-of-memory;
+ * the main regex code expects us to return a failure indication instead.
+ */
+
+typedef int (*pg_wc_probefunc) (pg_wchar c); /* character-class test: nonzero if c is in the class */
+
+typedef struct pg_ctype_cache
+{
+ pg_wc_probefunc probefunc; /* pg_wc_isalpha or a sibling */
+ Oid collation; /* collation this entry is for */
+ struct cvec cv; /* cache entry contents */
+ struct pg_ctype_cache *next; /* chain link */
+} pg_ctype_cache;
+
+static pg_ctype_cache *pg_ctype_cache_list = NULL; /* head of the cache chain; entries are looked up by (probefunc, collation) */
+
+/*
+ * Add a chr or range to pcc->cv; return false if run out of memory
+ *
+ * chr1 is the first matching character code and nchrs the length of the run
+ * of consecutive matching codes that starts there.  A run of exactly one is
+ * appended to cv.chrs[]; a longer run is appended to cv.ranges[] as a
+ * (first, last) pair.  The target array is doubled in size when it is full.
+ *
+ * The capacity field (chrspace/rangespace) is updated only after realloc()
+ * has succeeded, so that on failure pcc->cv still accurately describes the
+ * old, untouched array.  The old array remains owned by pcc in that case.
+ */
+static bool
+store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
+{
+	chr		   *newchrs;
+
+	if (nchrs > 1)
+	{
+		if (pcc->cv.nranges >= pcc->cv.rangespace)
+		{
+			/* each range occupies two chr slots: low and high bound */
+			newchrs = (chr *) realloc(pcc->cv.ranges,
+									  (size_t) pcc->cv.rangespace * 2 *
+									  sizeof(chr) * 2);
+			if (newchrs == NULL)
+				return false;
+			pcc->cv.ranges = newchrs;
+			pcc->cv.rangespace *= 2;
+		}
+		pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
+		pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
+		pcc->cv.nranges++;
+	}
+	else
+	{
+		assert(nchrs == 1);
+		if (pcc->cv.nchrs >= pcc->cv.chrspace)
+		{
+			newchrs = (chr *) realloc(pcc->cv.chrs,
+									  (size_t) pcc->cv.chrspace * 2 *
+									  sizeof(chr));
+			if (newchrs == NULL)
+				return false;
+			pcc->cv.chrs = newchrs;
+			pcc->cv.chrspace *= 2;
+		}
+		pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
+	}
+	return true;
+}
+
+/*
+ * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
+ * chrs satisfying the probe function. The active collation is the one
+ * previously set by pg_set_regex_collation. Return NULL if out of memory.
+ *
+ * Results are cached in pg_ctype_cache_list, keyed by the pair
+ * (probefunc, pg_regex_collation); a cache hit returns the stored cvec
+ * without rescanning.
+ *
+ * cclasscode is stored in the result's cclasscode field, except that the
+ * field is set to -1 when the scan limit is imposed by the locale strategy
+ * (127 or UCHAR_MAX) rather than by MAX_SIMPLE_CHR.
+ *
+ * Note that the result must not be freed or modified by caller.
+ */
+static struct cvec *
+pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
+{
+ pg_ctype_cache *pcc;
+ pg_wchar max_chr;
+ pg_wchar cur_chr;
+ int nmatches;
+ chr *newchrs;
+
+ /*
+ * Do we already have the answer cached?
+ */
+ for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
+ {
+ if (pcc->probefunc == probefunc &&
+ pcc->collation == pg_regex_collation)
+ return &pcc->cv;
+ }
+
+ /*
+ * Nope, so initialize some workspace ...
+ */
+ pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
+ if (pcc == NULL)
+ return NULL;
+ pcc->probefunc = probefunc;
+ pcc->collation = pg_regex_collation;
+ pcc->cv.nchrs = 0;
+ pcc->cv.chrspace = 128;
+ pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
+ pcc->cv.nranges = 0;
+ pcc->cv.rangespace = 64;
+ pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2); /* two chrs (low, high) per range */
+ if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL) /* both pointers are set by now, so the cleanup code can test each */
+ goto out_of_memory;
+ pcc->cv.cclasscode = cclasscode;
+
+ /*
+ * Decide how many character codes we ought to look through. In general
+ * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
+ * runtime using the "high colormap" mechanism. However, in C locale
+ * there's no need to go further than 127, and if we only have a 1-byte
+ * <ctype.h> API there's no need to go further than that can handle.
+ *
+ * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
+ * output cvec as not having any locale-dependent behavior, since there
+ * will be no need to do any run-time locale checks. (The #if's here
+ * would always be true for production values of MAX_SIMPLE_CHR, but it's
+ * useful to allow it to be small for testing purposes.)
+ */
+ switch (pg_regex_strategy)
+ {
+ case PG_REGEX_LOCALE_C:
+#if MAX_SIMPLE_CHR >= 127
+ max_chr = (pg_wchar) 127;
+ pcc->cv.cclasscode = -1;
+#else
+ max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+#endif
+ break;
+ case PG_REGEX_LOCALE_WIDE:
+ case PG_REGEX_LOCALE_WIDE_L:
+ max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+ break;
+ case PG_REGEX_LOCALE_1BYTE:
+ case PG_REGEX_LOCALE_1BYTE_L:
+#if MAX_SIMPLE_CHR >= UCHAR_MAX
+ max_chr = (pg_wchar) UCHAR_MAX;
+ pcc->cv.cclasscode = -1;
+#else
+ max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+#endif
+ break;
+ case PG_REGEX_LOCALE_ICU:
+ max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+ break;
+ default:
+ max_chr = 0; /* can't get here, but keep compiler quiet */
+ break;
+ }
+
+ /*
+ * And scan 'em ...
+ *
+ * Consecutive matching codes are accumulated into a run and stored once
+ * the run ends.
+ */
+ nmatches = 0; /* number of consecutive matches */
+
+ for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
+ {
+ if ((*probefunc) (cur_chr))
+ nmatches++;
+ else if (nmatches > 0)
+ {
+ if (!store_match(pcc, cur_chr - nmatches, nmatches)) /* the run ended at cur_chr - 1 */
+ goto out_of_memory;
+ nmatches = 0;
+ }
+ }
+
+ if (nmatches > 0) /* flush a run that extended through max_chr */
+ if (!store_match(pcc, cur_chr - nmatches, nmatches))
+ goto out_of_memory;
+
+ /*
+ * We might have allocated more memory than needed, if so free it
+ *
+ * An unused array is released entirely; a partly used one is shrunk to
+ * fit.  A failed shrinking realloc is treated like any other
+ * out-of-memory condition.
+ */
+ if (pcc->cv.nchrs == 0)
+ {
+ free(pcc->cv.chrs);
+ pcc->cv.chrs = NULL;
+ pcc->cv.chrspace = 0;
+ }
+ else if (pcc->cv.nchrs < pcc->cv.chrspace)
+ {
+ newchrs = (chr *) realloc(pcc->cv.chrs,
+ pcc->cv.nchrs * sizeof(chr));
+ if (newchrs == NULL)
+ goto out_of_memory;
+ pcc->cv.chrs = newchrs;
+ pcc->cv.chrspace = pcc->cv.nchrs;
+ }
+ if (pcc->cv.nranges == 0)
+ {
+ free(pcc->cv.ranges);
+ pcc->cv.ranges = NULL;
+ pcc->cv.rangespace = 0;
+ }
+ else if (pcc->cv.nranges < pcc->cv.rangespace)
+ {
+ newchrs = (chr *) realloc(pcc->cv.ranges,
+ pcc->cv.nranges * sizeof(chr) * 2);
+ if (newchrs == NULL)
+ goto out_of_memory;
+ pcc->cv.ranges = newchrs;
+ pcc->cv.rangespace = pcc->cv.nranges;
+ }
+
+ /*
+ * Success, link it into cache chain
+ */
+ pcc->next = pg_ctype_cache_list; /* push onto the front of the list */
+ pg_ctype_cache_list = pcc;
+
+ return &pcc->cv;
+
+ /*
+ * Failure, clean up
+ *
+ * The entry was never linked into the cache chain, so freeing it here
+ * leaves the chain untouched.
+ */
+out_of_memory:
+ if (pcc->cv.chrs)
+ free(pcc->cv.chrs);
+ if (pcc->cv.ranges)
+ free(pcc->cv.ranges);
+ free(pcc);
+
+ return NULL;
+}
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
new file mode 100644
index 0000000..4737380
--- /dev/null
+++ b/src/backend/regex/regcomp.c
@@ -0,0 +1,2622 @@
+/*
+ * re_*comp and friends - compile REs
+ * This file #includes several others (see the bottom).
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results. The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regcomp.c
+ *
+ */
+
+#include "regex/regguts.h"
+
+/*
+ * forward declarations, up here so forward datatypes etc. are defined early
+ */
+/* === regcomp.c === */
+static void moresubs(struct vars *, int);
+static int freev(struct vars *, int);
+static void makesearch(struct vars *, struct nfa *);
+static struct subre *parse(struct vars *, int, int, struct state *, struct state *);
+static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int);
+static struct subre *parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
+static void nonword(struct vars *, int, struct state *, struct state *);
+static void word(struct vars *, int, struct state *, struct state *);
+static void charclass(struct vars *, enum char_classes,
+ struct state *, struct state *);
+static void charclasscomplement(struct vars *, enum char_classes,
+ struct state *, struct state *);
+static int scannum(struct vars *);
+static void repeat(struct vars *, struct state *, struct state *, int, int);
+static void bracket(struct vars *, struct state *, struct state *);
+static void cbracket(struct vars *, struct state *, struct state *);
+static void brackpart(struct vars *, struct state *, struct state *, bool *);
+static const chr *scanplain(struct vars *);
+static void onechr(struct vars *, chr, struct state *, struct state *);
+static void optimizebracket(struct vars *, struct state *, struct state *);
+static void wordchrs(struct vars *);
+static void processlacon(struct vars *, struct state *, struct state *, int,
+ struct state *, struct state *);
+static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
+static void freesubre(struct vars *, struct subre *);
+static void freesubreandsiblings(struct vars *, struct subre *);
+static void freesrnode(struct vars *, struct subre *);
+static void removecaptures(struct vars *, struct subre *);
+static int numst(struct subre *, int);
+static void markst(struct subre *);
+static void cleanst(struct vars *);
+static long nfatree(struct vars *, struct subre *, FILE *);
+static long nfanode(struct vars *, struct subre *, int, FILE *);
+static int newlacon(struct vars *, struct state *, struct state *, int);
+static void freelacons(struct subre *, int);
+static void rfree(regex_t *);
+static int rcancelrequested(void);
+static int rstacktoodeep(void);
+
+#ifdef REG_DEBUG
+static void dump(regex_t *, FILE *);
+static void dumpst(struct subre *, FILE *, int);
+static void stdump(struct subre *, FILE *, int);
+static const char *stid(struct subre *, char *, size_t);
+#endif
+/* === regc_lex.c === */
+static void lexstart(struct vars *);
+static void prefixes(struct vars *);
+static int next(struct vars *);
+static int lexescape(struct vars *);
+static chr lexdigits(struct vars *, int, int, int);
+static int brenext(struct vars *, chr);
+static void skip(struct vars *);
+static chr newline(void);
+static chr chrnamed(struct vars *, const chr *, const chr *, chr);
+
+/* === regc_color.c === */
+static void initcm(struct vars *, struct colormap *);
+static void freecm(struct colormap *);
+static color maxcolor(struct colormap *);
+static color newcolor(struct colormap *);
+static void freecolor(struct colormap *, color);
+static color pseudocolor(struct colormap *);
+static color subcolor(struct colormap *, chr);
+static color subcolorhi(struct colormap *, color *);
+static color newsub(struct colormap *, color);
+static int newhicolorrow(struct colormap *, int);
+static void newhicolorcols(struct colormap *);
+static void subcolorcvec(struct vars *, struct cvec *, struct state *, struct state *);
+static void subcoloronechr(struct vars *, chr, struct state *, struct state *, color *);
+static void subcoloronerange(struct vars *, chr, chr, struct state *, struct state *, color *);
+static void subcoloronerow(struct vars *, int, struct state *, struct state *, color *);
+static void okcolors(struct nfa *, struct colormap *);
+static void colorchain(struct colormap *, struct arc *);
+static void uncolorchain(struct colormap *, struct arc *);
+static void rainbow(struct nfa *, struct colormap *, int, color, struct state *, struct state *);
+static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *);
+
+#ifdef REG_DEBUG
+static void dumpcolors(struct colormap *, FILE *);
+static void dumpchr(chr, FILE *);
+#endif
+/* === regc_nfa.c === */
+static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *);
+static void freenfa(struct nfa *);
+static struct state *newstate(struct nfa *);
+static struct state *newfstate(struct nfa *, int flag);
+static void dropstate(struct nfa *, struct state *);
+static void freestate(struct nfa *, struct state *);
+static void newarc(struct nfa *, int, color, struct state *, struct state *);
+static void createarc(struct nfa *, int, color, struct state *, struct state *);
+static struct arc *allocarc(struct nfa *);
+static void freearc(struct nfa *, struct arc *);
+static void changearcsource(struct arc *, struct state *);
+static void changearctarget(struct arc *, struct state *);
+static int hasnonemptyout(struct state *);
+static struct arc *findarc(struct state *, int, color);
+static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
+static void sortins(struct nfa *, struct state *);
+static int sortins_cmp(const void *, const void *);
+static void sortouts(struct nfa *, struct state *);
+static int sortouts_cmp(const void *, const void *);
+static void moveins(struct nfa *, struct state *, struct state *);
+static void copyins(struct nfa *, struct state *, struct state *);
+static void mergeins(struct nfa *, struct state *, struct arc **, int);
+static void moveouts(struct nfa *, struct state *, struct state *);
+static void copyouts(struct nfa *, struct state *, struct state *);
+static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
+static void delsub(struct nfa *, struct state *, struct state *);
+static void deltraverse(struct nfa *, struct state *, struct state *);
+static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
+static void duptraverse(struct nfa *, struct state *, struct state *);
+static void removeconstraints(struct nfa *, struct state *, struct state *);
+static void removetraverse(struct nfa *, struct state *);
+static void cleartraverse(struct nfa *, struct state *);
+static struct state *single_color_transition(struct state *, struct state *);
+static void specialcolors(struct nfa *);
+static long optimize(struct nfa *, FILE *);
+static void pullback(struct nfa *, FILE *);
+static int pull(struct nfa *, struct arc *, struct state **);
+static void pushfwd(struct nfa *, FILE *);
+static int push(struct nfa *, struct arc *, struct state **);
+
+#define INCOMPATIBLE 1 /* destroys arc */
+#define SATISFIED 2 /* constraint satisfied */
+#define COMPATIBLE 3 /* compatible but not satisfied yet */
+#define REPLACEARC 4 /* replace arc's color with constraint color */
+static int combine(struct nfa *nfa, struct arc *con, struct arc *a);
+static void fixempties(struct nfa *, FILE *);
+static struct state *emptyreachable(struct nfa *, struct state *,
+ struct state *, struct arc **);
+static int isconstraintarc(struct arc *);
+static int hasconstraintout(struct state *);
+static void fixconstraintloops(struct nfa *, FILE *);
+static int findconstraintloop(struct nfa *, struct state *);
+static void breakconstraintloop(struct nfa *, struct state *);
+static void clonesuccessorstates(struct nfa *, struct state *, struct state *,
+ struct state *, struct arc *,
+ char *, char *, int);
+static void cleanup(struct nfa *);
+static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
+static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
+static long analyze(struct nfa *);
+static void checkmatchall(struct nfa *);
+static bool checkmatchall_recurse(struct nfa *, struct state *, bool **);
+static bool check_out_colors_match(struct state *, color, color);
+static bool check_in_colors_match(struct state *, color, color);
+static void compact(struct nfa *, struct cnfa *);
+static void carcsort(struct carc *, size_t);
+static int carc_cmp(const void *, const void *);
+static void freecnfa(struct cnfa *);
+static void dumpnfa(struct nfa *, FILE *);
+
+#ifdef REG_DEBUG
+static void dumpstate(struct state *, FILE *);
+static void dumparcs(struct state *, FILE *);
+static void dumparc(struct arc *, struct state *, FILE *);
+static void dumpcnfa(struct cnfa *, FILE *);
+static void dumpcstate(int, struct cnfa *, FILE *);
+#endif
+/* === regc_cvec.c === */
+static struct cvec *newcvec(int, int);
+static struct cvec *clearcvec(struct cvec *);
+static void addchr(struct cvec *, chr);
+static void addrange(struct cvec *, chr, chr);
+static struct cvec *getcvec(struct vars *, int, int);
+static void freecvec(struct cvec *);
+
+/* === regc_pg_locale.c === */
+static int pg_wc_isdigit(pg_wchar c);
+static int pg_wc_isalpha(pg_wchar c);
+static int pg_wc_isalnum(pg_wchar c);
+static int pg_wc_isword(pg_wchar c);
+static int pg_wc_isupper(pg_wchar c);
+static int pg_wc_islower(pg_wchar c);
+static int pg_wc_isgraph(pg_wchar c);
+static int pg_wc_isprint(pg_wchar c);
+static int pg_wc_ispunct(pg_wchar c);
+static int pg_wc_isspace(pg_wchar c);
+static pg_wchar pg_wc_toupper(pg_wchar c);
+static pg_wchar pg_wc_tolower(pg_wchar c);
+
+/* === regc_locale.c === */
+static chr element(struct vars *, const chr *, const chr *);
+static struct cvec *range(struct vars *, chr, chr, int);
+static int before(chr, chr);
+static struct cvec *eclass(struct vars *, chr, int);
+static enum char_classes lookupcclass(struct vars *, const chr *, const chr *);
+static struct cvec *cclasscvec(struct vars *, enum char_classes, int);
+static int cclass_column_index(struct colormap *, chr);
+static struct cvec *allcases(struct vars *, chr);
+static int cmp(const chr *, const chr *, size_t);
+static int casecmp(const chr *, const chr *, size_t);
+
+
+/*
+ * internal variables, bundled for easy passing around
+ *
+ * One instance lives on pg_regcomp()'s stack for the duration of a
+ * compilation; freev() releases whatever substructures are still attached.
+ */
+struct vars
+{
+ regex_t *re; /* regex_t being filled in; pg_regcomp sets this to NULL on success so freev() won't free it */
+ const chr *now; /* scan pointer into string */
+ const chr *stop; /* end of string */
+ int err; /* error code (0 if none) */
+ int cflags; /* copy of compile flags */
+ int lasttype; /* type of previous token */
+ int nexttype; /* type of next token */
+ chr nextvalue; /* value (if any) of next token */
+ int lexcon; /* lexical context type (see regc_lex.c) */
+ int nsubexp; /* subexpression count */
+ struct subre **subs; /* subRE pointer vector */
+ size_t nsubs; /* length of vector */
+ struct subre *sub10[10]; /* initial vector, enough for most */
+ struct nfa *nfa; /* the NFA */
+ struct colormap *cm; /* character color map */
+ color nlcolor; /* color of newline */
+ struct state *wordchrs; /* state in nfa holding word-char outarcs */
+ struct subre *tree; /* subexpression tree */
+ struct subre *treechain; /* all tree nodes allocated */
+ struct subre *treefree; /* any free tree nodes */
+ int ntree; /* number of tree nodes, plus one */
+ struct cvec *cv; /* interface cvec */
+ struct cvec *cv2; /* utility cvec */
+ struct subre *lacons; /* lookaround-constraint vector */
+ int nlacons; /* size of lacons[]; note that only slots
+ * numbered 1 .. nlacons-1 are used */
+ size_t spaceused; /* approx. space used for compilation */
+};
+
+/*
+ * parsing macros; most know that `v' is the struct vars pointer
+ *
+ * VERR/ERR keep the first error code recorded and always force the next
+ * token type to EOS.  NOERR/NOERRN/NOERRZ expand to a braced statement that
+ * returns from the enclosing function if an error has been recorded.
+ */
+#define NEXT() (next(v)) /* advance by one token */
+#define SEE(t) (v->nexttype == (t)) /* is next token this? */
+#define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */
+#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
+#define ISERR() VISERR(v) /* VISERR applied to the local `v' */
+#define VERR(vv,e) ((vv)->nexttype = EOS, \
+ (vv)->err = ((vv)->err ? (vv)->err : (e))) /* an already-recorded error is not overwritten */
+#define ERR(e) VERR(v, e) /* record an error */
+#define NOERR() {if (ISERR()) return;} /* if error seen, return */
+#define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */
+#define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */
+#define INSIST(c, e) do { if (!(c)) ERR(e); } while (0) /* error if c false */
+#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
+#define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y) /* add an EMPTY arc from state x to state y */
+
+/*
+ * token type codes, some also used as NFA arc types
+ *
+ * Each code defined here is a distinct character constant.  Some tokens are
+ * instead represented by the literal character itself; parse(), for
+ * example, tests for '|' and ')'.
+ */
+#define EMPTY 'n' /* no token present */
+#define EOS 'e' /* end of string */
+#define PLAIN 'p' /* ordinary character */
+#define DIGIT 'd' /* digit (in bound) */
+#define BACKREF 'b' /* back reference */
+#define COLLEL 'I' /* start of [. */
+#define ECLASS 'E' /* start of [= */
+#define CCLASS 'C' /* start of [: */
+#define END 'X' /* end of [. [= [: */
+#define CCLASSS 's' /* char class shorthand escape */
+#define CCLASSC 'c' /* complement char class shorthand escape */
+#define RANGE 'R' /* - within [] which might be range delim. */
+#define LACON 'L' /* lookaround constraint subRE */
+#define AHEAD 'a' /* color-lookahead arc */
+#define BEHIND 'r' /* color-lookbehind arc */
+#define WBDRY 'w' /* word boundary constraint */
+#define NWBDRY 'W' /* non-word-boundary constraint */
+#define SBEGIN 'A' /* beginning of string (even if not BOL) */
+#define SEND 'Z' /* end of string (even if not EOL) */
+
+/* is an arc colored, and hence should belong to a color chain? */
+/* the test on "co" eliminates RAINBOW arcs, which we don't bother to chain */
+/* (only PLAIN, AHEAD and BEHIND arcs with a non-negative color qualify) */
+#define COLORED(a) \
+ ((a)->co >= 0 && \
+ ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND))
+
+
+/*
+ * static function list
+ *
+ * pg_regcomp() stores the address of this table in re->re_fns.
+ */
+static const struct fns functions = {
+ rfree, /* regfree insides */
+ rcancelrequested, /* check for cancel request */
+ rstacktoodeep /* check for stack getting dangerously deep */
+};
+
+
+
+/*
+ * pg_regcomp - compile regular expression
+ *
+ * re: regex_t to fill in; must not be NULL
+ * string, len: the pattern, as an array of len chrs; string must not be NULL
+ * flags: REG_* compile flags; REG_QUOTE combined with any of REG_ADVANCED,
+ *		REG_EXPANDED or REG_NEWLINE, and REG_ADVF without REG_EXTENDED,
+ *		are rejected with REG_INVARG
+ * collation: passed to pg_set_regex_collation() and saved in
+ *		re->re_collation
+ *
+ * Returns 0 on success, else a nonzero error code (REG_INVARG for bad
+ * arguments, REG_ESPACE for allocation failure, or whatever error was
+ * recorded in v->err during compilation).
+ *
+ * Note: on failure, no resources remain allocated, so pg_regfree()
+ * need not be applied to re.
+ */
+int
+pg_regcomp(regex_t *re,
+ const chr *string,
+ size_t len,
+ int flags,
+ Oid collation)
+{
+ struct vars var;
+ struct vars *v = &var;
+ struct guts *g;
+ int i;
+ size_t j;
+
+#ifdef REG_DEBUG
+ FILE *debug = (flags & REG_PROGRESS) ? stdout : (FILE *) NULL;
+#else
+ FILE *debug = (FILE *) NULL;
+#endif
+
+#define CNOERR() { if (ISERR()) return freev(v, v->err); } /* on recorded error: release everything and return the code */
+
+ /* sanity checks */
+
+ if (re == NULL || string == NULL)
+ return REG_INVARG;
+ if ((flags & REG_QUOTE) &&
+ (flags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)))
+ return REG_INVARG;
+ if (!(flags & REG_EXTENDED) && (flags & REG_ADVF))
+ return REG_INVARG;
+
+ /* Initialize locale-dependent support */
+ pg_set_regex_collation(collation);
+
+ /* initial setup (after which freev() is callable) */
+ v->re = re;
+ v->now = string;
+ v->stop = v->now + len;
+ v->err = 0;
+ v->cflags = flags;
+ v->nsubexp = 0;
+ v->subs = v->sub10; /* start with the built-in vector; moresubs() enlarges at need */
+ v->nsubs = 10;
+ for (j = 0; j < v->nsubs; j++)
+ v->subs[j] = NULL;
+ v->nfa = NULL;
+ v->cm = NULL;
+ v->nlcolor = COLORLESS;
+ v->wordchrs = NULL;
+ v->tree = NULL;
+ v->treechain = NULL;
+ v->treefree = NULL;
+ v->cv = NULL;
+ v->cv2 = NULL;
+ v->lacons = NULL;
+ v->nlacons = 0;
+ v->spaceused = 0;
+ re->re_magic = REMAGIC;
+ re->re_info = 0; /* bits get set during parse */
+ re->re_csize = sizeof(chr);
+ re->re_collation = collation;
+ re->re_guts = NULL;
+ re->re_fns = VS(&functions);
+
+ /* more complex setup, malloced things */
+ re->re_guts = VS(MALLOC(sizeof(struct guts)));
+ if (re->re_guts == NULL)
+ return freev(v, REG_ESPACE);
+ g = (struct guts *) re->re_guts;
+ g->tree = NULL;
+ initcm(v, &g->cmap);
+ v->cm = &g->cmap; /* the colormap lives inside the guts struct */
+ g->lacons = NULL;
+ g->nlacons = 0;
+ ZAPCNFA(g->search);
+ v->nfa = newnfa(v, v->cm, (struct nfa *) NULL);
+ CNOERR();
+ /* set up a reasonably-sized transient cvec for getcvec usage */
+ v->cv = newcvec(100, 20);
+ if (v->cv == NULL)
+ return freev(v, REG_ESPACE);
+
+ /* parsing */
+ lexstart(v); /* also handles prefixes */
+ if ((v->cflags & REG_NLSTOP) || (v->cflags & REG_NLANCH))
+ {
+ /* assign newline a unique color */
+ v->nlcolor = subcolor(v->cm, newline());
+ okcolors(v->nfa, v->cm);
+ }
+ CNOERR();
+ v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
+ assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */
+ CNOERR();
+ assert(v->tree != NULL);
+
+ /* finish setup of nfa and its subre tree */
+ specialcolors(v->nfa);
+ CNOERR();
+#ifdef REG_DEBUG
+ if (debug != NULL)
+ {
+ fprintf(debug, "\n\n\n========= RAW ==========\n");
+ dumpnfa(v->nfa, debug);
+ dumpst(v->tree, debug, 1);
+ }
+#endif
+ if (v->cflags & REG_NOSUB)
+ removecaptures(v, v->tree);
+ v->ntree = numst(v->tree, 1);
+ markst(v->tree);
+ cleanst(v);
+#ifdef REG_DEBUG
+ if (debug != NULL)
+ {
+ fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
+ dumpst(v->tree, debug, 1);
+ }
+#endif
+
+ /* build compacted NFAs for tree and lacons */
+ re->re_info |= nfatree(v, v->tree, debug);
+ CNOERR();
+ assert(v->nlacons == 0 || v->lacons != NULL);
+ for (i = 1; i < v->nlacons; i++) /* slot 0 of lacons[] is not used */
+ {
+ struct subre *lasub = &v->lacons[i];
+
+#ifdef REG_DEBUG
+ if (debug != NULL)
+ fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
+#endif
+
+ /* Prepend .* to pattern if it's a lookbehind LACON */
+ nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->latype), debug);
+ }
+ CNOERR();
+ if (v->tree->flags & SHORTER)
+ NOTE(REG_USHORTEST);
+
+ /* build compacted NFA for fast search (tree and lacons were done above) */
+#ifdef REG_DEBUG
+ if (debug != NULL)
+ fprintf(debug, "\n\n\n========= SEARCH ==========\n");
+#endif
+ /* can sacrifice main NFA now, so use it as work area */
+ (DISCARD) optimize(v->nfa, debug);
+ CNOERR();
+ makesearch(v, v->nfa);
+ CNOERR();
+ compact(v->nfa, &g->search);
+ CNOERR();
+
+ /* looks okay, package it up */
+ re->re_nsub = v->nsubexp;
+ v->re = NULL; /* freev no longer frees re */
+ g->magic = GUTSMAGIC;
+ g->cflags = v->cflags;
+ g->info = re->re_info;
+ g->nsub = re->re_nsub;
+ g->tree = v->tree;
+ v->tree = NULL; /* ownership moved to g, so freev() must not free it */
+ g->ntree = v->ntree;
+ g->compare = (v->cflags & REG_ICASE) ? casecmp : cmp;
+ g->lacons = v->lacons;
+ v->lacons = NULL; /* likewise now owned by g */
+ g->nlacons = v->nlacons;
+
+#ifdef REG_DEBUG
+ if (flags & REG_DUMP)
+ {
+ dump(re, stdout);
+ fflush(stdout);
+ }
+#endif
+
+ assert(v->err == 0);
+ return freev(v, 0); /* releases the remaining transient workspace; returns v->err, which is 0 here */
+}
+
+/*
+ * moresubs - enlarge subRE vector
+ *
+ * Grows v->subs so that index "wanted" becomes valid.  The new length is
+ * wanted * 3 / 2 + 1; every newly added slot is set to NULL.  While the
+ * vector is still the built-in v->sub10 array, a separate vector is
+ * allocated and the existing entries are copied into it; otherwise the
+ * existing allocation is resized.  On allocation failure REG_ESPACE is
+ * recorded and the vector is left as it was.
+ */
+static void
+moresubs(struct vars *v,
+		 int wanted)			/* want enough room for this one */
+{
+	size_t		newlen;
+	struct subre **newvec;
+
+	assert(wanted > 0 && (size_t) wanted >= v->nsubs);
+	newlen = (size_t) wanted * 3 / 2 + 1;
+
+	if (v->subs != v->sub10)
+	{
+		/* already separately allocated: resize in place */
+		newvec = (struct subre **) REALLOC(v->subs,
+										   newlen * sizeof(struct subre *));
+	}
+	else
+	{
+		/* still using the built-in array: allocate and copy */
+		newvec = (struct subre **) MALLOC(newlen * sizeof(struct subre *));
+		if (newvec != NULL)
+			memcpy(VS(newvec), VS(v->subs),
+				   v->nsubs * sizeof(struct subre *));
+	}
+
+	if (newvec == NULL)
+	{
+		ERR(REG_ESPACE);
+		return;
+	}
+
+	/* adopt the new vector and clear the added tail */
+	v->subs = newvec;
+	while (v->nsubs < newlen)
+	{
+		v->subs[v->nsubs] = NULL;
+		v->nsubs++;
+	}
+
+	assert(v->nsubs == newlen);
+	assert((size_t) wanted < v->nsubs);
+}
+
+/*
+ * freev - free vars struct's substructures where necessary
+ *
+ * Releases whatever is still attached to *v: the regex_t (unless v->re has
+ * been set to NULL), a separately allocated subs vector, the NFA, the subre
+ * tree and any remaining tree nodes, both cvecs, and the lacons array.
+ *
+ * If err is nonzero and no error has been recorded yet, err becomes the
+ * recorded error.  Always returns the recorded error code (0 if none), to
+ * make error-handling code terser.
+ */
+static int
+freev(struct vars *v,
+	  int err)
+{
+	/* the regex_t itself, if the caller still wants it freed */
+	if (v->re)
+		rfree(v->re);
+
+	/* the subRE vector, unless it is the built-in array */
+	if (v->subs != v->sub10)
+		FREE(v->subs);
+
+	if (v->nfa)
+		freenfa(v->nfa);
+
+	/* the tree first, then whatever nodes are still on the chain */
+	if (v->tree)
+		freesubre(v, v->tree);
+	if (v->treechain)
+		cleanst(v);
+
+	if (v->cv)
+		freecvec(v->cv);
+	if (v->cv2)
+		freecvec(v->cv2);
+
+	if (v->lacons)
+		freelacons(v->lacons, v->nlacons);
+
+	/* leaves v->err unchanged if err is 0 or an error is already recorded */
+	ERR(err);
+
+	return v->err;
+}
+
+/*
+ * makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
+ * NFA must have been optimize()d already.
+ *
+ * Works in place on "nfa".  Errors from the helper routines are left
+ * recorded in v->err, and the function then returns early.
+ */
+static void
+makesearch(struct vars *v,
+ struct nfa *nfa)
+{
+ struct arc *a;
+ struct arc *b;
+ struct state *pre = nfa->pre;
+ struct state *s;
+ struct state *s2;
+ struct state *slist;
+
+ /* no loops are needed if it's anchored */
+ for (a = pre->outs; a != NULL; a = a->outchain)
+ {
+ assert(a->type == PLAIN);
+ if (a->co != nfa->bos[0] && a->co != nfa->bos[1])
+ break; /* found an out-arc of pre whose color is neither bos[] color */
+ }
+ if (a != NULL) /* loop above exited early, i.e. not every out-arc of pre is a bos[] arc */
+ {
+ /* add implicit .* in front */
+ rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
+
+ /* and ^* and \A* too -- not always necessary, but harmless */
+ newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
+ newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
+
+ /*
+ * The pattern is still MATCHALL if it was before, but the max match
+ * length is now infinity.
+ */
+ if (nfa->flags & MATCHALL)
+ nfa->maxmatchall = DUPINF;
+ }
+
+ /*
+ * Now here's the subtle part. Because many REs have no lookback
+ * constraints, often knowing when you were in the pre state tells you
+ * little; it's the next state(s) that are informative. But some of them
+ * may have other inarcs, i.e. it may be possible to make actual progress
+ * and then return to one of them. We must de-optimize such cases,
+ * splitting each such state into progress and no-progress states.
+ */
+
+ /* first, make a list of the states reachable from pre and elsewhere */
+ slist = NULL;
+ for (a = pre->outs; a != NULL; a = a->outchain)
+ {
+ s = a->to;
+ for (b = s->ins; b != NULL; b = b->inchain)
+ {
+ if (b->from != pre)
+ break; /* s has an in-arc from some state other than pre */
+ }
+
+ /*
+ * We want to mark states as being in the list already by having non
+ * NULL tmp fields, but we can't just store the old slist value in tmp
+ * because that doesn't work for the first such state. Instead, the
+ * first list entry gets its own address in tmp.
+ */
+ if (b != NULL && s->tmp == NULL) /* has a non-pre in-arc and is not yet listed */
+ {
+ s->tmp = (slist != NULL) ? slist : s;
+ slist = s;
+ }
+ }
+
+ /* do the splits */
+ for (s = slist; s != NULL; s = s2)
+ {
+ s2 = newstate(nfa); /* s2 is the new copy of s here; it is reused as the list cursor below */
+ NOERR();
+ copyouts(nfa, s, s2);
+ NOERR();
+ for (a = s->ins; a != NULL; a = b)
+ {
+ b = a->inchain; /* fetch the successor first, since a may be freed below */
+ if (a->from != pre)
+ {
+ cparc(nfa, a, a->from, s2); /* redirect this in-arc to the new state */
+ freearc(nfa, a);
+ }
+ }
+ s2 = (s->tmp != s) ? s->tmp : NULL; /* next list entry; a self-pointing tmp marks the end of the list */
+ s->tmp = NULL; /* clean up while we're at it */
+ }
+}
+
+/*
+ * parse - parse an RE
+ *
+ * This is actually just the top level, which parses a bunch of branches
+ * tied together with '|'. If there's more than one, they appear in the
+ * tree as the children of a '|' subre.
+ *
+ * Each branch is handed to parsebranch() together with a freshly made pair
+ * of states, which are tied to init and final by EMPTY arcs. The loop ends
+ * when no further '|' can be eaten; if the token seen at that point is not
+ * the stopper, REG_EPAREN is reported.
+ *
+ * The result is the '|' subre, with two exceptions handled at the bottom:
+ * a lone branch is returned directly (the '|' node is freed), and a
+ * multi-branch RE whose accumulated flags are not MESSY has its children
+ * freed and is relabeled as a childless '=' node.
+ *
+ * NOERRN() is used after every step that can fail; it is defined outside
+ * this section and presumably returns NULL once an error is recorded.
+ */
+static struct subre *
+parse(struct vars *v,
+ int stopper, /* EOS or ')' */
+ int type, /* LACON (lookaround subRE) or PLAIN */
+ struct state *init, /* initial state */
+ struct state *final) /* final state */
+{
+ struct subre *branches; /* top level */
+ struct subre *lastbranch; /* latest branch */
+
+ assert(stopper == ')' || stopper == EOS);
+
+ branches = subre(v, '|', LONGER, init, final);
+ NOERRN();
+ lastbranch = NULL;
+ do
+ { /* a branch */
+ struct subre *branch;
+ struct state *left; /* scaffolding for branch */
+ struct state *right;
+
+ left = newstate(v->nfa);
+ right = newstate(v->nfa);
+ NOERRN();
+ EMPTYARC(init, left);
+ EMPTYARC(right, final);
+ NOERRN();
+ branch = parsebranch(v, stopper, type, left, right, 0);
+ NOERRN();
+ if (lastbranch)
+ lastbranch->sibling = branch; /* append to the sibling chain */
+ else
+ branches->child = branch; /* first branch heads the chain */
+ branches->flags |= UP(branches->flags | branch->flags);
+ lastbranch = branch;
+ } while (EAT('|'));
+ assert(SEE(stopper) || SEE(EOS));
+
+ if (!SEE(stopper))
+ {
+ assert(stopper == ')' && SEE(EOS));
+ ERR(REG_EPAREN);
+ }
+
+ /* optimize out simple cases */
+ if (lastbranch == branches->child)
+ { /* only one branch */
+ assert(lastbranch->sibling == NULL);
+ freesrnode(v, branches);
+ branches = lastbranch;
+ }
+ else if (!MESSY(branches->flags))
+ { /* no interesting innards */
+ freesubreandsiblings(v, branches->child);
+ branches->child = NULL;
+ branches->op = '=';
+ }
+
+ return branches;
+}
+
+/*
+ * parsebranch - parse one branch of an RE
+ *
+ * This mostly manages concatenation, working closely with parseqatom().
+ * Concatenated things are bundled up as much as possible, with separate
+ * '.' nodes introduced only when necessary due to substructure.
+ *
+ * parseqatom() is called repeatedly until '|', the stopper, or EOS is seen.
+ * For every call after the first, a new state becomes the left end of the
+ * next construct and moveins() is applied to "right" and that state first
+ * (parseqatom() asserts that its left state has no out-arcs and its right
+ * state has no in-arcs).
+ *
+ * If nothing at all was parsed, left and right are joined by an EMPTY arc,
+ * and REG_UUNSPEC is noted unless "partial" is nonzero.
+ *
+ * Returns the subre for the branch: the value of the last parseqatom()
+ * call, or the initial tentative '=' node if the branch was empty.
+ * NOERRN() is defined outside this section; it presumably returns NULL
+ * once an error has been recorded.
+ */
+static struct subre *
+parsebranch(struct vars *v,
+ int stopper, /* EOS or ')' */
+ int type, /* LACON (lookaround subRE) or PLAIN */
+ struct state *left, /* leftmost state */
+ struct state *right, /* rightmost state */
+ int partial) /* is this only part of a branch? */
+{
+ struct state *lp; /* left end of current construct */
+ int seencontent; /* is there anything in this branch yet? */
+ struct subre *t;
+
+ lp = left;
+ seencontent = 0;
+ t = subre(v, '=', 0, left, right); /* op '=' is tentative */
+ NOERRN();
+ while (!SEE('|') && !SEE(stopper) && !SEE(EOS))
+ {
+ if (seencontent)
+ { /* implicit concat operator */
+ lp = newstate(v->nfa);
+ NOERRN();
+ moveins(v->nfa, right, lp);
+ }
+ seencontent = 1;
+
+ /* NB, recursion in parseqatom() may swallow rest of branch */
+ t = parseqatom(v, stopper, type, lp, right, t);
+ NOERRN();
+ }
+
+ if (!seencontent)
+ { /* empty branch */
+ if (!partial)
+ NOTE(REG_UUNSPEC);
+ assert(lp == left);
+ EMPTYARC(left, right);
+ }
+
+ return t;
+}
+
+/*
+ * parseqatom - parse one quantified atom or constraint of an RE
+ *
+ * The bookkeeping near the end cooperates very closely with parsebranch();
+ * in particular, it contains a recursion that can involve parsing the rest
+ * of the branch, making this function's name somewhat inaccurate.
+ *
+ * Usually, the return value is just "top", but in some cases where we
+ * have parsed the rest of the branch, we may deem "top" redundant and
+ * free it, returning some child subre instead.
+ *
+ * Outline of the code below:
+ *
+ * 1. Dispatch on the current token. Constraints (^, $, word boundaries,
+ *    lookaround) add their arcs between lp and rp and return "top" at
+ *    once. A stray quantifier reports REG_BADRPT and an unrecognized token
+ *    type reports REG_ASSERT; both also return "top". The remaining cases
+ *    build an atom and fall out of the switch.
+ *
+ * 2. Parse an optional quantifier into m, n and qprefer. With no
+ *    quantifier, m = n = 1 and qprefer = 0.
+ *
+ * 3. {0} or {0,0} discards what was built and leaves only an EMPTY arc from
+ *    lp to rp (an atom flagged CAP is merely unlinked, not freed).
+ *
+ * 4. If the atom is neither parenthesized-capturing nor a backref, and the
+ *    combined flags are not MESSY, repeat() is applied directly and "top"
+ *    is returned with updated flags.
+ *
+ * 5. Otherwise the "hard part" builds explicit subre structure around the
+ *    atom and then recurses into parsebranch() for the rest of the branch.
+ *
+ * On entry lp must have no out-arcs and rp no in-arcs (both are asserted).
+ * "top" must not carry MESSY flags when step 4 is reached, and must be a
+ * childless '=' node when step 5 is reached (both asserted).
+ *
+ * NOERRN() is defined outside this section; it presumably returns NULL
+ * once an error has been recorded.
+ */
+static struct subre *
+parseqatom(struct vars *v,
+ int stopper, /* EOS or ')' */
+ int type, /* LACON (lookaround subRE) or PLAIN */
+ struct state *lp, /* left state to hang it on */
+ struct state *rp, /* right state to hang it on */
+ struct subre *top) /* subtree top */
+{
+ struct state *s; /* temporaries for new states */
+ struct state *s2;
+
+#define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
+ int m,
+ n;
+ struct subre *atom; /* atom's subtree */
+ struct subre *t;
+ int cap; /* capturing parens? */
+ int latype; /* lookaround constraint type */
+ int subno; /* capturing-parens or backref number */
+ int atomtype;
+ int qprefer; /* quantifier short/long preference */
+ int f;
+ struct subre **atomp; /* where the pointer to atom is */
+
+ /* initial bookkeeping */
+ atom = NULL;
+ assert(lp->nouts == 0); /* must string new code */
+ assert(rp->nins == 0); /* between lp and rp */
+ subno = 0; /* just to shut lint up */
+
+ /* an atom or constraint... */
+ atomtype = v->nexttype;
+ switch (atomtype)
+ {
+ /* first, constraints, which end by returning */
+ case '^':
+ ARCV('^', 1);
+ if (v->cflags & REG_NLANCH)
+ ARCV(BEHIND, v->nlcolor);
+ NEXT();
+ return top;
+ break;
+ case '$':
+ ARCV('$', 1);
+ if (v->cflags & REG_NLANCH)
+ ARCV(AHEAD, v->nlcolor);
+ NEXT();
+ return top;
+ break;
+ case SBEGIN:
+ ARCV('^', 1); /* BOL */
+ ARCV('^', 0); /* or BOS */
+ NEXT();
+ return top;
+ break;
+ case SEND:
+ ARCV('$', 1); /* EOL */
+ ARCV('$', 0); /* or EOS */
+ NEXT();
+ return top;
+ break;
+ case '<':
+ wordchrs(v);
+ s = newstate(v->nfa);
+ NOERRN();
+ nonword(v, BEHIND, lp, s);
+ word(v, AHEAD, s, rp);
+ NEXT();
+ return top;
+ break;
+ case '>':
+ wordchrs(v);
+ s = newstate(v->nfa);
+ NOERRN();
+ word(v, BEHIND, lp, s);
+ nonword(v, AHEAD, s, rp);
+ NEXT();
+ return top;
+ break;
+ case WBDRY:
+ wordchrs(v);
+ s = newstate(v->nfa);
+ NOERRN();
+ nonword(v, BEHIND, lp, s);
+ word(v, AHEAD, s, rp);
+ s = newstate(v->nfa);
+ NOERRN();
+ word(v, BEHIND, lp, s);
+ nonword(v, AHEAD, s, rp);
+ NEXT();
+ return top;
+ break;
+ case NWBDRY:
+ wordchrs(v);
+ s = newstate(v->nfa);
+ NOERRN();
+ word(v, BEHIND, lp, s);
+ word(v, AHEAD, s, rp);
+ s = newstate(v->nfa);
+ NOERRN();
+ nonword(v, BEHIND, lp, s);
+ nonword(v, AHEAD, s, rp);
+ NEXT();
+ return top;
+ break;
+ case LACON: /* lookaround constraint */
+ latype = v->nextvalue;
+ NEXT();
+ s = newstate(v->nfa);
+ s2 = newstate(v->nfa);
+ NOERRN();
+ t = parse(v, ')', LACON, s, s2);
+ freesubre(v, t); /* internal structure irrelevant */
+ NOERRN();
+ assert(SEE(')'));
+ NEXT();
+ processlacon(v, s, s2, latype, lp, rp);
+ return top;
+ break;
+ /* then errors, to get them out of the way */
+ case '*':
+ case '+':
+ case '?':
+ case '{':
+ ERR(REG_BADRPT);
+ return top;
+ break;
+ default:
+ ERR(REG_ASSERT);
+ return top;
+ break;
+ /* then plain characters, and minor variants on that theme */
+ case ')': /* unbalanced paren */
+ if ((v->cflags & REG_ADVANCED) != REG_EXTENDED)
+ {
+ ERR(REG_EPAREN);
+ return top;
+ }
+ /* legal in EREs due to specification botch */
+ NOTE(REG_UPBOTCH);
+ /* fall through into case PLAIN */
+ /* FALLTHROUGH */
+ case PLAIN:
+ onechr(v, v->nextvalue, lp, rp);
+ okcolors(v->nfa, v->cm);
+ NOERRN();
+ NEXT();
+ break;
+ case '[':
+ if (v->nextvalue == 1)
+ bracket(v, lp, rp);
+ else
+ cbracket(v, lp, rp);
+ assert(SEE(']') || ISERR());
+ NEXT();
+ break;
+ case CCLASSS:
+ charclass(v, (enum char_classes) v->nextvalue, lp, rp);
+ okcolors(v->nfa, v->cm);
+ NEXT();
+ break;
+ case CCLASSC:
+ charclasscomplement(v, (enum char_classes) v->nextvalue, lp, rp);
+ /* charclasscomplement() did okcolors() internally */
+ NEXT();
+ break;
+ case '.':
+ rainbow(v->nfa, v->cm, PLAIN,
+ (v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS,
+ lp, rp);
+ NEXT();
+ break;
+ /* and finally the ugly stuff */
+ case '(': /* value flags as capturing or non */
+ cap = (type == LACON) ? 0 : v->nextvalue;
+ if (cap)
+ {
+ v->nsubexp++;
+ subno = v->nsubexp;
+ if ((size_t) subno >= v->nsubs)
+ moresubs(v, subno);
+ }
+ else
+ atomtype = PLAIN; /* something that's not '(' */
+ NEXT();
+
+ /*
+ * Make separate endpoint states to keep this sub-NFA distinct
+ * from what surrounds it. We need to be sure that when we
+ * duplicate the sub-NFA for a backref, we get the right
+ * states/arcs and no others. In particular, letting a backref
+ * duplicate the sub-NFA from lp to rp would be quite wrong,
+ * because we may add quantification superstructure around this
+ * atom below. (Perhaps we could skip the extra states for
+ * non-capturing parens, but it seems not worth the trouble.)
+ */
+ s = newstate(v->nfa);
+ s2 = newstate(v->nfa);
+ NOERRN();
+ /* We may not need these arcs, but keep things connected for now */
+ EMPTYARC(lp, s);
+ EMPTYARC(s2, rp);
+ NOERRN();
+ atom = parse(v, ')', type, s, s2);
+ assert(SEE(')') || ISERR());
+ NEXT();
+ NOERRN();
+ if (cap)
+ {
+ if (atom->capno == 0)
+ {
+ /* normal case: just mark the atom as capturing */
+ atom->flags |= CAP;
+ atom->capno = subno;
+ }
+ else
+ {
+ /* generate no-op wrapper node to handle "((x))" */
+ t = subre(v, '(', atom->flags | CAP, s, s2);
+ NOERRN();
+ t->capno = subno;
+ t->child = atom;
+ atom = t;
+ }
+ assert(v->subs[subno] == NULL);
+ v->subs[subno] = atom;
+ }
+ /* postpone everything else pending possible {0} */
+ break;
+ case BACKREF: /* the Feature From The Black Lagoon */
+ INSIST(type != LACON, REG_ESUBREG);
+ subno = v->nextvalue;
+ assert(subno > 0);
+ INSIST(subno < v->nsubs, REG_ESUBREG);
+ NOERRN();
+ INSIST(v->subs[subno] != NULL, REG_ESUBREG);
+ NOERRN();
+ atom = subre(v, 'b', BACKR, lp, rp);
+ NOERRN();
+ atom->backno = subno;
+ v->subs[subno]->flags |= BRUSE;
+ EMPTYARC(lp, rp); /* temporarily, so there's something */
+ NEXT();
+ break;
+ }
+
+ /* ...and an atom may be followed by a quantifier */
+ switch (v->nexttype)
+ {
+ case '*':
+ m = 0;
+ n = DUPINF;
+ qprefer = (v->nextvalue) ? LONGER : SHORTER;
+ NEXT();
+ break;
+ case '+':
+ m = 1;
+ n = DUPINF;
+ qprefer = (v->nextvalue) ? LONGER : SHORTER;
+ NEXT();
+ break;
+ case '?':
+ m = 0;
+ n = 1;
+ qprefer = (v->nextvalue) ? LONGER : SHORTER;
+ NEXT();
+ break;
+ case '{':
+ NEXT();
+ m = scannum(v);
+ if (EAT(','))
+ {
+ if (SEE(DIGIT))
+ n = scannum(v);
+ else
+ n = DUPINF;
+ if (m > n)
+ {
+ ERR(REG_BADBR);
+ return top;
+ }
+ /* {m,n} exercises preference, even if it's {m,m} */
+ qprefer = (v->nextvalue) ? LONGER : SHORTER;
+ }
+ else
+ {
+ n = m;
+ /* {m} passes operand's preference through */
+ qprefer = 0;
+ }
+ if (!SEE('}'))
+ { /* catches errors too */
+ ERR(REG_BADBR);
+ return top;
+ }
+ NEXT();
+ break;
+ default: /* no quantifier */
+ m = n = 1;
+ qprefer = 0;
+ break;
+ }
+
+ /* annoying special case: {0} or {0,0} cancels everything */
+ if (m == 0 && n == 0)
+ {
+ /*
+ * If we had capturing subexpression(s) within the atom, we don't want
+ * to destroy them, because it's legal (if useless) to back-ref them
+ * later. Hence, just unlink the atom from lp/rp and then ignore it.
+ */
+ if (atom != NULL && (atom->flags & CAP))
+ {
+ delsub(v->nfa, lp, atom->begin);
+ delsub(v->nfa, atom->end, rp);
+ }
+ else
+ {
+ /* Otherwise, we can clean up any subre infrastructure we made */
+ if (atom != NULL)
+ freesubre(v, atom);
+ delsub(v->nfa, lp, rp);
+ }
+ EMPTYARC(lp, rp);
+ return top;
+ }
+
+ /* if not a messy case, avoid hard part */
+ assert(!MESSY(top->flags));
+ f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
+ if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f)))
+ {
+ if (!(m == 1 && n == 1))
+ repeat(v, lp, rp, m, n);
+ if (atom != NULL)
+ freesubre(v, atom);
+ top->flags = f;
+ return top;
+ }
+
+ /*
+ * hard part: something messy
+ *
+ * That is, capturing parens, back reference, short/long clash, or an atom
+ * with substructure containing one of those.
+ */
+
+ /* now we'll need a subre for the contents even if they're boring */
+ if (atom == NULL)
+ {
+ atom = subre(v, '=', 0, lp, rp);
+ NOERRN();
+ }
+
+ /*
+ * For what follows, we need the atom to have its own begin/end states
+ * that are distinct from lp/rp, so that we can wrap iteration structure
+ * around it. The parenthesized-atom case above already made suitable
+ * states (and we don't want to modify a capturing subre, since it's
+ * already recorded in v->subs[]). Otherwise, we need more states.
+ */
+ if (atom->begin == lp || atom->end == rp)
+ {
+ s = newstate(v->nfa);
+ s2 = newstate(v->nfa);
+ NOERRN();
+ moveouts(v->nfa, lp, s);
+ moveins(v->nfa, rp, s2);
+ atom->begin = s;
+ atom->end = s2;
+ }
+ else
+ {
+ /* The atom's OK, but we must temporarily disconnect it from lp/rp */
+ /* (this removes the EMPTY arcs we made above) */
+ delsub(v->nfa, lp, atom->begin);
+ delsub(v->nfa, atom->end, rp);
+ }
+
+ /*----------
+ * Prepare a general-purpose state skeleton.
+ *
+ * In the no-backrefs case, we want this:
+ *
+ * [lp] ---> [s] ---prefix---> ---atom---> ---rest---> [rp]
+ *
+ * where prefix is some repetitions of atom, and "rest" is the remainder
+ * of the branch. In the general case we need:
+ *
+ * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
+ *
+ * where the iterator wraps around the atom.
+ *
+ * We make the s state here for both cases; s2 is made below if needed
+ *----------
+ */
+ s = newstate(v->nfa); /* set up starting state */
+ NOERRN();
+ EMPTYARC(lp, s);
+ NOERRN();
+
+ /* break remaining subRE into x{...} and what follows */
+ t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
+ NOERRN();
+ t->child = atom;
+ atomp = &t->child;
+
+ /*
+ * Here we should recurse to fill t->child->sibling ... but we must
+ * postpone that to the end. One reason is that t->child may be replaced
+ * below, and we don't want to worry about its sibling link.
+ */
+
+ /*
+ * Convert top node to a concatenation of the prefix (top->child, covering
+ * whatever we parsed previously) and remaining (t). Note that the prefix
+ * could be empty, in which case this concatenation node is unnecessary.
+ * To keep things simple, we operate in a general way for now, and get rid
+ * of unnecessary subres below.
+ */
+ assert(top->op == '=' && top->child == NULL);
+ top->child = subre(v, '=', top->flags, top->begin, lp);
+ NOERRN();
+ top->op = '.';
+ top->child->sibling = t;
+ /* top->flags will get updated later */
+
+ /* if it's a backref, now is the time to replicate the subNFA */
+ if (atomtype == BACKREF)
+ {
+ assert(atom->begin->nouts == 1); /* just the EMPTY */
+ delsub(v->nfa, atom->begin, atom->end);
+ assert(v->subs[subno] != NULL);
+
+ /*
+ * And here's why the recursion got postponed: it must wait until the
+ * skeleton is filled in, because it may hit a backref that wants to
+ * copy the filled-in skeleton.
+ */
+ dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
+ atom->begin, atom->end);
+ NOERRN();
+
+ /* The backref node's NFA should not enforce any constraints */
+ removeconstraints(v->nfa, atom->begin, atom->end);
+ NOERRN();
+ }
+
+ /*
+ * It's quantifier time. If the atom is just a backref, we'll let it deal
+ * with quantifiers internally.
+ *
+ * Every arm of the chain below leaves s2 set to the state from which the
+ * rest of the branch is to be strung.
+ */
+ if (atomtype == BACKREF)
+ {
+ /* special case: backrefs have internal quantifiers */
+ EMPTYARC(s, atom->begin); /* empty prefix */
+ /* just stuff everything into atom */
+ repeat(v, atom->begin, atom->end, m, n);
+ atom->min = (short) m;
+ atom->max = (short) n;
+ atom->flags |= COMBINE(qprefer, atom->flags);
+ /* rest of branch can be strung starting from atom->end */
+ s2 = atom->end;
+ }
+ else if (m == 1 && n == 1 &&
+ (qprefer == 0 ||
+ (atom->flags & (LONGER | SHORTER | MIXED)) == 0 ||
+ qprefer == (atom->flags & (LONGER | SHORTER | MIXED))))
+ {
+ /* no/vacuous quantifier: done */
+ EMPTYARC(s, atom->begin); /* empty prefix */
+ /* rest of branch can be strung starting from atom->end */
+ s2 = atom->end;
+ }
+ else if (!(atom->flags & (CAP | BACKR)))
+ {
+ /*
+ * If there's no captures nor backrefs in the atom being repeated, we
+ * don't really care where the submatches of the iteration are, so we
+ * don't need an iteration node. Make a plain DFA node instead.
+ */
+ EMPTYARC(s, atom->begin); /* empty prefix */
+ repeat(v, atom->begin, atom->end, m, n);
+ f = COMBINE(qprefer, atom->flags);
+ t = subre(v, '=', f, atom->begin, atom->end);
+ NOERRN();
+ freesubre(v, atom);
+ *atomp = t;
+ /* rest of branch can be strung starting from t->end */
+ s2 = t->end;
+ }
+ else if (m > 0 && !(atom->flags & BACKR))
+ {
+ /*
+ * If there's no backrefs involved, we can turn x{m,n} into
+ * x{m-1,n-1}x, with capturing parens in only the second x. This is
+ * valid because we only care about capturing matches from the final
+ * iteration of the quantifier. It's a win because we can implement
+ * the backref-free left side as a plain DFA node, since we don't
+ * really care where its submatches are.
+ */
+ dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
+ assert(m >= 1 && m != DUPINF && n >= 1);
+ repeat(v, s, atom->begin, m - 1, (n == DUPINF) ? n : n - 1);
+ f = COMBINE(qprefer, atom->flags);
+ t = subre(v, '.', f, s, atom->end); /* prefix and atom */
+ NOERRN();
+ t->child = subre(v, '=', PREF(f), s, atom->begin);
+ NOERRN();
+ t->child->sibling = atom;
+ *atomp = t;
+ /* rest of branch can be strung starting from atom->end */
+ s2 = atom->end;
+ }
+ else
+ {
+ /* general case: need an iteration node */
+ s2 = newstate(v->nfa);
+ NOERRN();
+ moveouts(v->nfa, atom->end, s2);
+ NOERRN();
+ dupnfa(v->nfa, atom->begin, atom->end, s, s2);
+ repeat(v, s, s2, m, n);
+ f = COMBINE(qprefer, atom->flags);
+ t = subre(v, '*', f, s, s2);
+ NOERRN();
+ t->min = (short) m;
+ t->max = (short) n;
+ t->child = atom;
+ *atomp = t;
+ /* rest of branch is to be strung from iteration's end state */
+ }
+
+ /* and finally, look after that postponed recursion */
+ t = top->child->sibling;
+ if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
+ {
+ /* parse all the rest of the branch, and insert in t->child->sibling */
+ t->child->sibling = parsebranch(v, stopper, type, s2, rp, 1);
+ NOERRN();
+ assert(SEE('|') || SEE(stopper) || SEE(EOS));
+
+ /* here's the promised update of the flags */
+ t->flags |= COMBINE(t->flags, t->child->sibling->flags);
+ top->flags |= COMBINE(top->flags, t->flags);
+
+ /* neither t nor top could be directly marked for capture as yet */
+ assert(t->capno == 0);
+ assert(top->capno == 0);
+
+ /*
+ * At this point both top and t are concatenation (op == '.') subres,
+ * and we have top->child = prefix of branch, top->child->sibling = t,
+ * t->child = messy atom (with quantification superstructure if
+ * needed), t->child->sibling = rest of branch.
+ *
+ * If the messy atom was the first thing in the branch, then
+ * top->child is vacuous and we can get rid of one level of
+ * concatenation.
+ */
+ assert(top->child->op == '=');
+ if (top->child->begin == top->child->end)
+ {
+ assert(!MESSY(top->child->flags));
+ freesubre(v, top->child);
+ top->child = t->child;
+ freesrnode(v, t);
+ }
+
+ /*
+ * Otherwise, it's possible that t->child is not messy in itself, but
+ * we considered it messy because its greediness conflicts with what
+ * preceded it. Then it could be that the combination of t->child and
+ * the rest of the branch is also not messy, in which case we can get
+ * rid of the child concatenation by merging t->child and the rest of
+ * the branch into one plain DFA node.
+ */
+ else if (t->child->op == '=' &&
+ t->child->sibling->op == '=' &&
+ !MESSY(UP(t->child->flags | t->child->sibling->flags)))
+ {
+ t->op = '=';
+ t->flags = COMBINE(t->child->flags, t->child->sibling->flags);
+ freesubreandsiblings(v, t->child);
+ t->child = NULL;
+ }
+ }
+ else
+ {
+ /*
+ * There's nothing left in the branch, so we don't need the second
+ * concatenation node 't'. Just link s2 straight to rp.
+ */
+ EMPTYARC(s2, rp);
+ top->child->sibling = t->child;
+ top->flags |= COMBINE(top->flags, top->child->sibling->flags);
+ freesrnode(v, t);
+
+ /*
+ * Again, it could be that top->child is vacuous (if the messy atom
+ * was in fact the only thing in the branch). In that case we need no
+ * concatenation at all; just replace top with top->child->sibling.
+ */
+ assert(top->child->op == '=');
+ if (top->child->begin == top->child->end)
+ {
+ assert(!MESSY(top->child->flags));
+ t = top->child->sibling;
+ top->child->sibling = NULL;
+ freesubre(v, top);
+ top = t;
+ }
+ }
+
+ return top;
+}
+
+/*
+ * nonword - generate arcs for non-word-character ahead or behind
+ *
+ * Three things are strung between lp and rp: the matching anchor arc
+ * ('$' when looking AHEAD, '^' when looking BEHIND) with value 1, the same
+ * anchor with value 0, and the color complement of the v->wordchrs arc set
+ * in the requested direction.
+ */
+static void
+nonword(struct vars *v,
+		int dir,			/* AHEAD or BEHIND */
+		struct state *lp,
+		struct state *rp)
+{
+	int			edge;
+
+	assert(dir == AHEAD || dir == BEHIND);
+
+	/* pick the anchor type that corresponds to the look direction */
+	if (dir == AHEAD)
+		edge = '$';
+	else
+		edge = '^';
+
+	newarc(v->nfa, edge, 1, lp, rp);
+	newarc(v->nfa, edge, 0, lp, rp);
+
+	/* everything that is not a word character; \n needs no special care */
+	colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
+}
+
+/*
+ * word - generate arcs for word character ahead or behind
+ *
+ * The arcs hanging off v->wordchrs are cloned between lp and rp, with "dir"
+ * (AHEAD or BEHIND) passed along as the last argument of cloneouts().
+ */
+static void
+word(struct vars *v,
+	 int dir,				/* AHEAD or BEHIND */
+	 struct state *lp,
+	 struct state *rp)
+{
+	struct state *const wordset = v->wordchrs;
+
+	assert(dir == AHEAD || dir == BEHIND);
+
+	/* newline needs no special treatment here */
+	cloneouts(v->nfa, wordset, lp, rp, dir);
+}
+
+/*
+ * charclass - generate arcs for a character class
+ *
+ * Serves both stand-alone class atoms (\w and its sibling escapes) and
+ * class elements inside bracket expressions. okcolors() is deliberately
+ * not called here: the caller must do that once it has finished with the
+ * whole atom or bracket.
+ */
+static void
+charclass(struct vars *v,
+		  enum char_classes cls,
+		  struct state *lp,
+		  struct state *rp)
+{
+	struct cvec *members;
+
+	/* character classes are locale-dependent; record that fact */
+	NOTE(REG_ULOCALE);
+
+	/* fetch the (possibly cached) character vector for the class */
+	members = cclasscvec(v, cls, (v->cflags & REG_ICASE));
+	NOERR();
+
+	/* lay down the arcs, which can split colors along the way */
+	subcolorcvec(v, members, lp, rp);
+}
+
+/*
+ * charclasscomplement - generate arcs for a complemented character class
+ *
+ * This is used for both atoms (\W and sibling escapes) and for elements
+ * of bracket expressions. In bracket expressions, it is the caller's
+ * responsibility that there not be any open subcolors when this is called.
+ *
+ * The class itself is first built as self-loop arcs on a throwaway state,
+ * okcolors() is run, and then colorcomplement() generates the arcs from lp
+ * to rp using that state's arc set. The throwaway state is dropped again
+ * at the end.
+ *
+ * NOERR() follows every step that can fail; it is defined outside this
+ * section and presumably returns once an error has been recorded. Note
+ * that the dropstate() call at the bottom is not reached on those paths.
+ */
+static void
+charclasscomplement(struct vars *v,
+ enum char_classes cls,
+ struct state *lp,
+ struct state *rp)
+{
+ struct state *cstate;
+ struct cvec *cv;
+
+ /* make dummy state to hang temporary arcs on */
+ cstate = newstate(v->nfa);
+ NOERR();
+
+ /* obtain possibly-cached cvec for char class */
+ NOTE(REG_ULOCALE);
+ cv = cclasscvec(v, cls, (v->cflags & REG_ICASE));
+ NOERR();
+
+ /* build arcs for char class; this may cause color splitting */
+ subcolorcvec(v, cv, cstate, cstate);
+ NOERR();
+
+ /* clean up any subcolors in the arc set */
+ okcolors(v->nfa, v->cm);
+ NOERR();
+
+ /* now build output arcs for the complement of the char class */
+ colorcomplement(v->nfa, v->cm, PLAIN, cstate, lp, rp);
+ NOERR();
+
+ /* clean up dummy state */
+ dropstate(v->nfa, cstate);
+}
+
+/*
+ * scannum - scan a number
+ *
+ * Consumes DIGIT tokens and accumulates them as a decimal value.
+ * Accumulation stops as soon as the value has reached DUPMAX. If a digit
+ * is still pending afterwards, or the value ended up above DUPMAX,
+ * REG_BADBR is reported and 0 is returned.
+ */
+static int						/* value, <= DUPMAX */
+scannum(struct vars *v)
+{
+	int			value = 0;
+
+	for (;;)
+	{
+		if (!SEE(DIGIT) || value >= DUPMAX)
+			break;
+		value = value * 10 + v->nextvalue;
+		NEXT();
+	}
+
+	/* in range and no leftover digits: that's our answer */
+	if (!SEE(DIGIT) && value <= DUPMAX)
+		return value;
+
+	ERR(REG_BADBR);
+	return 0;
+}
+
+/*
+ * repeat - replicate subNFA for quantifiers
+ *
+ * The sub-NFA strung from lp to rp is modified to represent m to n
+ * repetitions of its initial contents.
+ *
+ * The duplication sequences used here are chosen carefully so that any
+ * pointers starting out pointing into the subexpression end up pointing into
+ * the last occurrence. (Note that it may not be strung between the same
+ * left and right end states, however!) This used to be important for the
+ * subRE tree, although the important bits are now handled by the in-line
+ * code in parse(), and when this is called, it doesn't matter any more.
+ *
+ * m and n are each classified by REDUCE() as 0, 1, SOME (greater than 1
+ * but not DUPINF) or INF (equal to DUPINF), and the switch dispatches on
+ * the resulting pair. The cases involving SOME recurse with smaller
+ * counts, so the recursion depth grows with the repetition counts. Any
+ * pair without a case of its own is reported as REG_ASSERT.
+ *
+ * NOERR() is defined outside this section; it presumably returns once an
+ * error has been recorded.
+ */
+static void
+repeat(struct vars *v,
+ struct state *lp,
+ struct state *rp,
+ int m,
+ int n)
+{
+#define SOME 2
+#define INF 3
+#define PAIR(x, y) ((x)*4 + (y))
+#define REDUCE(x) ( ((x) == DUPINF) ? INF : (((x) > 1) ? SOME : (x)) )
+ const int rm = REDUCE(m);
+ const int rn = REDUCE(n);
+ struct state *s;
+ struct state *s2;
+
+ switch (PAIR(rm, rn))
+ {
+ case PAIR(0, 0): /* empty string */
+ delsub(v->nfa, lp, rp);
+ EMPTYARC(lp, rp);
+ break;
+ case PAIR(0, 1): /* do as x| */
+ EMPTYARC(lp, rp);
+ break;
+ case PAIR(0, SOME): /* do as x{1,n}| */
+ repeat(v, lp, rp, 1, n);
+ NOERR();
+ EMPTYARC(lp, rp);
+ break;
+ case PAIR(0, INF): /* loop x around */
+ s = newstate(v->nfa);
+ NOERR();
+ moveouts(v->nfa, lp, s);
+ moveins(v->nfa, rp, s);
+ EMPTYARC(lp, s);
+ EMPTYARC(s, rp);
+ break;
+ case PAIR(1, 1): /* no action required */
+ break;
+ case PAIR(1, SOME): /* do as x{0,n-1}x = (x{1,n-1}|)x */
+ s = newstate(v->nfa);
+ NOERR();
+ moveouts(v->nfa, lp, s);
+ dupnfa(v->nfa, s, rp, lp, s);
+ NOERR();
+ repeat(v, lp, s, 1, n - 1);
+ NOERR();
+ EMPTYARC(lp, s);
+ break;
+ case PAIR(1, INF): /* add loopback arc */
+ s = newstate(v->nfa);
+ s2 = newstate(v->nfa);
+ NOERR();
+ moveouts(v->nfa, lp, s);
+ moveins(v->nfa, rp, s2);
+ EMPTYARC(lp, s);
+ EMPTYARC(s2, rp);
+ EMPTYARC(s2, s);
+ break;
+ case PAIR(SOME, SOME): /* do as x{m-1,n-1}x */
+ s = newstate(v->nfa);
+ NOERR();
+ moveouts(v->nfa, lp, s);
+ dupnfa(v->nfa, s, rp, lp, s);
+ NOERR();
+ repeat(v, lp, s, m - 1, n - 1);
+ break;
+ case PAIR(SOME, INF): /* do as x{m-1,}x */
+ s = newstate(v->nfa);
+ NOERR();
+ moveouts(v->nfa, lp, s);
+ dupnfa(v->nfa, s, rp, lp, s);
+ NOERR();
+ repeat(v, lp, s, m - 1, n);
+ break;
+ default:
+ ERR(REG_ASSERT);
+ break;
+ }
+}
+
+/*
+ * bracket - handle non-complemented bracket expression
+ *
+ * Also called from cbracket for complemented bracket expressions.
+ *
+ * On entry the current token must be '[' (asserted). brackpart() is called
+ * for each element until ']' or EOS is seen; the closing ']' itself is
+ * left for the caller to consume. Arcs for the bracket are strung between
+ * lp and rp.
+ *
+ * NOERR() is defined outside this section; it presumably returns once an
+ * error has been recorded.
+ */
+static void
+bracket(struct vars *v,
+ struct state *lp,
+ struct state *rp)
+{
+ /*
+ * We can't process complemented char classes (e.g. \W) immediately while
+ * scanning the bracket expression, else color bookkeeping gets confused.
+ * Instead, remember whether we saw any in have_cclassc[], and process
+ * them at the end.
+ */
+ bool have_cclassc[NUM_CCLASSES];
+ bool any_cclassc;
+ int i;
+
+ memset(have_cclassc, false, sizeof(have_cclassc));
+
+ assert(SEE('['));
+ NEXT();
+ while (!SEE(']') && !SEE(EOS))
+ brackpart(v, lp, rp, have_cclassc);
+ assert(SEE(']') || ISERR());
+
+ /* close up open subcolors from the positive bracket elements */
+ okcolors(v->nfa, v->cm);
+ NOERR();
+
+ /* now handle any complemented elements */
+ any_cclassc = false;
+ for (i = 0; i < NUM_CCLASSES; i++)
+ {
+ if (have_cclassc[i])
+ {
+ charclasscomplement(v, (enum char_classes) i, lp, rp);
+ NOERR();
+ any_cclassc = true;
+ }
+ }
+
+ /*
+ * If we had any complemented elements, see if we can optimize the bracket
+ * into a rainbow. Since a complemented element is the only way a WHITE
+ * arc could get into the result, there's no point in checking otherwise.
+ */
+ if (any_cclassc)
+ optimizebracket(v, lp, rp);
+}
+
+/*
+ * cbracket - handle complemented bracket expression
+ *
+ * We do it by calling bracket() with dummy endpoints, and then complementing
+ * the result. The alternative would be to invoke rainbow(), and then delete
+ * arcs as the b.e. is seen... but that gets messy, and is really quite
+ * infeasible now that rainbow() just puts out one RAINBOW arc.
+ *
+ * lp must have no out-arcs before the complement is generated (asserted
+ * below). Both dummy states are disposed of before a normal return.
+ *
+ * NOERR() is defined outside this section; it presumably returns once an
+ * error has been recorded, in which case the disposal calls at the bottom
+ * are not reached.
+ */
+static void
+cbracket(struct vars *v,
+ struct state *lp,
+ struct state *rp)
+{
+ struct state *left = newstate(v->nfa);
+ struct state *right = newstate(v->nfa);
+
+ NOERR();
+ bracket(v, left, right);
+
+ /*
+ * in NLSTOP mode, ensure newline is not part of the result set: adding
+ * the newline color to the set being complemented keeps it out of the
+ * complement
+ */
+ if (v->cflags & REG_NLSTOP)
+ newarc(v->nfa, PLAIN, v->nlcolor, left, right);
+ NOERR();
+
+ assert(lp->nouts == 0); /* all outarcs will be ours */
+
+ /*
+ * Easy part of complementing, and all there is to do since the MCCE code
+ * was removed. Note that the result of colorcomplement() cannot be a
+ * rainbow, since we don't allow empty brackets; so there's no point in
+ * calling optimizebracket() again.
+ */
+ colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
+ NOERR();
+ dropstate(v->nfa, left);
+ assert(right->nins == 0);
+ freestate(v->nfa, right);
+}
+
+/*
+ * brackpart - handle one item (or range) within a bracket expression
+ *
+ * The first switch deals with the leading token. Equivalence classes,
+ * character classes, and a plain character not followed by RANGE are
+ * finished there and return directly. A complemented class escape is only
+ * recorded in have_cclassc[] (indexed by the token's value) for the caller
+ * to process later. A plain character followed by RANGE, or a collating
+ * element, falls out of the switch with startc set.
+ *
+ * After that, an optional RANGE token and endpoint are parsed, and the
+ * arcs for startc..endc are added between lp and rp. An unacceptable token
+ * at the start of an item, or as a range endpoint, yields REG_ERANGE.
+ *
+ * NOERR() and INSIST() are defined outside this section; NOERR()
+ * presumably returns once an error has been recorded.
+ */
+static void
+brackpart(struct vars *v,
+ struct state *lp,
+ struct state *rp,
+ bool *have_cclassc)
+{
+ chr startc;
+ chr endc;
+ struct cvec *cv;
+ enum char_classes cls;
+ const chr *startp;
+ const chr *endp;
+
+ /* parse something, get rid of special cases, take shortcuts */
+ switch (v->nexttype)
+ {
+ case RANGE: /* a-b-c or other botch */
+ ERR(REG_ERANGE);
+ return;
+ break;
+ case PLAIN:
+ startc = v->nextvalue;
+ NEXT();
+ /* shortcut for ordinary chr (not range) */
+ if (!SEE(RANGE))
+ {
+ onechr(v, startc, lp, rp);
+ return;
+ }
+ NOERR();
+ break;
+ case COLLEL:
+ startp = v->now;
+ endp = scanplain(v);
+ INSIST(startp < endp, REG_ECOLLATE);
+ NOERR();
+ startc = element(v, startp, endp);
+ NOERR();
+ break;
+ case ECLASS:
+ startp = v->now;
+ endp = scanplain(v);
+ INSIST(startp < endp, REG_ECOLLATE);
+ NOERR();
+ startc = element(v, startp, endp);
+ NOERR();
+ cv = eclass(v, startc, (v->cflags & REG_ICASE));
+ NOERR();
+ subcolorcvec(v, cv, lp, rp);
+ return;
+ break;
+ case CCLASS:
+ startp = v->now;
+ endp = scanplain(v);
+ INSIST(startp < endp, REG_ECTYPE);
+ NOERR();
+ cls = lookupcclass(v, startp, endp);
+ NOERR();
+ charclass(v, cls, lp, rp);
+ return;
+ break;
+ case CCLASSS:
+ charclass(v, (enum char_classes) v->nextvalue, lp, rp);
+ NEXT();
+ return;
+ break;
+ case CCLASSC:
+ /* we cannot call charclasscomplement() immediately */
+ have_cclassc[v->nextvalue] = true;
+ NEXT();
+ return;
+ break;
+ default:
+ ERR(REG_ASSERT);
+ return;
+ break;
+ }
+
+ if (SEE(RANGE))
+ {
+ NEXT();
+ switch (v->nexttype)
+ {
+ case PLAIN:
+ case RANGE:
+ endc = v->nextvalue;
+ NEXT();
+ NOERR();
+ break;
+ case COLLEL:
+ startp = v->now;
+ endp = scanplain(v);
+ INSIST(startp < endp, REG_ECOLLATE);
+ NOERR();
+ endc = element(v, startp, endp);
+ NOERR();
+ break;
+ default:
+ ERR(REG_ERANGE);
+ return;
+ break;
+ }
+ }
+ else
+ endc = startc;
+
+ /*
+ * Ranges are unportable. Actually, standard C does guarantee that digits
+ * are contiguous, but making that an exception is just too complicated.
+ */
+ if (startc != endc)
+ NOTE(REG_UUNPORT);
+ cv = range(v, startc, endc, (v->cflags & REG_ICASE));
+ NOERR();
+ subcolorcvec(v, cv, lp, rp);
+}
+
+/*
+ * scanplain - scan PLAIN contents of [. etc.
+ *
+ * Called with the current token being COLLEL, ECLASS or CCLASS. That
+ * token is consumed, then every following PLAIN token, then the token
+ * after those (expected to be END unless an error is pending).
+ *
+ * The value returned is v->now as sampled just before the last PLAIN token
+ * was consumed, or just after the opening token if there were no PLAIN
+ * tokens at all.
+ *
+ * Certain bits of trickery in regc_lex.c rely on this code never trying
+ * to look past the final bracket of the [. etc.
+ */
+static const chr *				/* just after end of sequence */
+scanplain(struct vars *v)
+{
+	const chr  *mark;
+
+	assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
+	NEXT();
+
+	for (mark = v->now; SEE(PLAIN);)
+	{
+		/* remember where we were before stepping over this character */
+		mark = v->now;
+		NEXT();
+	}
+
+	assert(SEE(END) || ISERR());
+	NEXT();
+
+	return mark;
+}
+
+/*
+ * onechr - fill in arcs for a plain character, and possible case complements
+ *
+ * Mostly a fast path for the common case: when REG_ICASE is not set, the
+ * single character is handled by subcoloronechr(). With REG_ICASE, the
+ * general cvec machinery is used on the result of allcases().
+ */
+static void
+onechr(struct vars *v,
+	   chr c,
+	   struct state *lp,
+	   struct state *rp)
+{
+	if (v->cflags & REG_ICASE)
+	{
+		/* case-insensitive: need the general case after all */
+		subcolorcvec(v, allcases(v, c), lp, rp);
+	}
+	else
+	{
+		color		prevsub = COLORLESS;
+
+		subcoloronechr(v, c, lp, rp, &prevsub);
+	}
+}
+
+/*
+ * optimizebracket - see if bracket expression can be converted to RAINBOW
+ *
+ * A bracket such as "[\s\S]" can end up with one arc for every color.
+ * That is just an expensive way of writing a single RAINBOW arc, so detect
+ * the situation and substitute one. (It may look like a silly way to
+ * spell ".", but it is apparently a common idiom in other regex flavors,
+ * so it is worth handling well.)
+ */
+static void
+optimizebracket(struct vars *v,
+				struct state *lp,
+				struct state *rp)
+{
+	struct colordesc *const cdlimit = CDEND(v->cm);
+	struct colordesc *desc;
+	struct arc *oa;
+	bool		allcovered = true;
+
+	/*
+	 * Pass 1: walk lp's out-arcs, temporarily flagging each color they
+	 * mention. Every arc is expected to be a plain, non-RAINBOW arc
+	 * leading to rp. (Pseudocolors should not exist yet, but we check.)
+	 */
+	for (oa = lp->outs; oa != NULL; oa = oa->outchain)
+	{
+		assert(oa->type == PLAIN);
+		assert(oa->co >= 0);	/* i.e. not RAINBOW */
+		assert(oa->to == rp);
+		desc = &v->cm->cd[oa->co];
+		assert(!UNUSEDCOLOR(desc) && !(desc->flags & PSEUDO));
+		desc->flags |= COLMARK;
+	}
+
+	/*
+	 * Pass 2: walk the color table, removing the temporary flags again.
+	 * Any in-use, non-pseudo color that was not flagged defeats the
+	 * optimization.
+	 */
+	for (desc = v->cm->cd; desc < cdlimit; desc++)
+	{
+		if (!(desc->flags & COLMARK))
+		{
+			if (!UNUSEDCOLOR(desc) && !(desc->flags & PSEUDO))
+				allcovered = false;
+		}
+		else
+			desc->flags &= ~COLMARK;
+	}
+
+	if (allcovered)
+	{
+		/* every color has an arc: swap them all for one RAINBOW arc */
+		while ((oa = lp->outs) != NULL)
+			freearc(v->nfa, oa);
+		newarc(v->nfa, PLAIN, RAINBOW, lp, rp);
+	}
+}
+
+/*
+ * wordchrs - set up word-chr list for word-boundary stuff, if needed
+ *
+ * The list is kept as a bunch of circular arcs on an otherwise-unused state.
+ * That state is remembered in v->wordchrs, so a second call returns at once.
+ *
+ * Note that this must not be called while we have any open subcolors,
+ * else construction of the list would confuse color bookkeeping.
+ * Hence, we can't currently apply a similar optimization in
+ * charclass[complement](), as those need to be usable within bracket
+ * expressions.
+ *
+ * v->wordchrs is assigned only as the very last step.
+ * NOTE(review): each NOERR() below presumably returns from this function if
+ * an error has been recorded, which would leave v->wordchrs NULL -- confirm
+ * against the macro definition.
+ */
+static void
+wordchrs(struct vars *v)
+{
+ struct state *cstate;
+ struct cvec *cv;
+
+ if (v->wordchrs != NULL)
+ return; /* done already */
+
+ /* make dummy state to hang the cache arcs on */
+ cstate = newstate(v->nfa);
+ NOERR();
+
+ /* obtain possibly-cached cvec for \w characters */
+ NOTE(REG_ULOCALE);
+ cv = cclasscvec(v, CC_WORD, (v->cflags & REG_ICASE));
+ NOERR();
+
+ /* build the arcs (cstate to itself); this may cause color splitting */
+ subcolorcvec(v, cv, cstate, cstate);
+ NOERR();
+
+ /* close new open subcolors to ensure the cache entry is self-contained */
+ okcolors(v->nfa, v->cm);
+ NOERR();
+
+ /* success! save the cache pointer */
+ v->wordchrs = cstate;
+}
+
+/*
+ * processlacon - generate the NFA representation of a LACON
+ *
+ * In the general case this is just newlacon() + newarc().  However, when
+ * the lookaround's sub-RE is nothing but a single plain color arc (or set
+ * of arcs) -- typically a simple chr or a bracket expression -- it can be
+ * written directly as AHEAD/BEHIND constraint arcs between lp and rp:
+ *
+ *		positive lookahead		AHEAD(C)
+ *		negative lookahead		AHEAD(^C)|$
+ *		positive lookbehind		BEHIND(C)
+ *		negative lookbehind		BEHIND(^C)|^
+ */
+static void
+processlacon(struct vars *v,
+			 struct state *begin,	/* start of parsed LACON sub-re */
+			 struct state *end, /* end of parsed LACON sub-re */
+			 int latype,
+			 struct state *lp,	/* left state to hang it on */
+			 struct state *rp)	/* right state to hang it on */
+{
+	struct state *colorsrc;		/* state owning the color arcs, or NULL */
+	int			lookdir;		/* AHEAD or BEHIND */
+	int			edge;			/* '$' for lookahead, '^' for lookbehind */
+	int			laconno;
+
+	/* non-NULL only if the sub-RE is a single color transition */
+	colorsrc = single_color_transition(begin, end);
+
+	switch (latype)
+	{
+		case LATYPE_AHEAD_POS:
+		case LATYPE_BEHIND_POS:
+			if (colorsrc != NULL)
+			{
+				/* colorset C becomes AHEAD(C) or BEHIND(C) */
+				lookdir = (latype == LATYPE_AHEAD_POS) ? AHEAD : BEHIND;
+				cloneouts(v->nfa, colorsrc, lp, rp, lookdir);
+				return;
+			}
+			break;
+		case LATYPE_AHEAD_NEG:
+		case LATYPE_BEHIND_NEG:
+			if (colorsrc != NULL)
+			{
+				/* colorset C becomes AHEAD(^C)|$ or BEHIND(^C)|^ */
+				if (latype == LATYPE_AHEAD_NEG)
+				{
+					lookdir = AHEAD;
+					edge = '$';
+				}
+				else
+				{
+					lookdir = BEHIND;
+					edge = '^';
+				}
+				colorcomplement(v->nfa, v->cm, lookdir, colorsrc, lp, rp);
+				newarc(v->nfa, edge, 1, lp, rp);
+				newarc(v->nfa, edge, 0, lp, rp);
+				return;
+			}
+			break;
+		default:
+			assert(NOTREACHED);
+	}
+
+	/* General case: we need a LACON subre and arc */
+	laconno = newlacon(v, begin, end, latype);
+	newarc(v->nfa, LACON, laconno, lp, rp);
+}
+
+/*
+ * subre - allocate a subre
+ *
+ * Returns a node set up for operator op with the given flags and begin/end
+ * states; every other field gets its neutral default.  A node is taken from
+ * v->treefree when one is available, otherwise a new one is allocated and
+ * linked into v->treechain.  Returns NULL after reporting REG_ETOOBIG (stack
+ * too deep) or REG_ESPACE (allocation failure).
+ */
+static struct subre *
+subre(struct vars *v,
+	  int op,
+	  int flags,
+	  struct state *begin,
+	  struct state *end)
+{
+	struct subre *node = v->treefree;
+
+	/*
+	 * Checking for stack overflow here is sufficient to protect parse() and
+	 * its recursive subroutines.
+	 */
+	if (STACK_TOO_DEEP(v->re))
+	{
+		ERR(REG_ETOOBIG);
+		return NULL;
+	}
+
+	if (node == NULL)
+	{
+		/* nothing to recycle: allocate, and remember it in treechain */
+		node = (struct subre *) MALLOC(sizeof(struct subre));
+		if (node == NULL)
+		{
+			ERR(REG_ESPACE);
+			return NULL;
+		}
+		node->chain = v->treechain;
+		v->treechain = node;
+	}
+	else
+	{
+		/* pop the free list, which is threaded through the child links */
+		v->treefree = node->child;
+	}
+
+	assert(strchr("=b|.*(", op) != NULL);
+
+	/* what the caller asked for */
+	node->op = op;
+	node->flags = flags;
+	node->begin = begin;
+	node->end = end;
+
+	/* everything else starts out neutral */
+	node->latype = (char) -1;
+	node->id = 0;				/* will be assigned later */
+	node->capno = 0;
+	node->backno = 0;
+	node->min = 1;
+	node->max = 1;
+	node->child = NULL;
+	node->sibling = NULL;
+	ZAPCNFA(node->cnfa);
+
+	return node;
+}
+
+/*
+ * freesubre - free a subRE subtree
+ *
+ * Releases sr together with everything below it (its children and their
+ * siblings), but leaves sr's own siblings alone.  A NULL sr is a no-op.
+ */
+static void
+freesubre(struct vars *v,		/* might be NULL */
+		  struct subre *sr)
+{
+	if (sr != NULL)
+	{
+		/* children first, then the node itself */
+		if (sr->child != NULL)
+			freesubreandsiblings(v, sr->child);
+		freesrnode(v, sr);
+	}
+}
+
+/*
+ * freesubreandsiblings - free a subRE subtree
+ *
+ * Like freesubre(), but also walks the sibling chain starting at sr and
+ * frees each of those subtrees as well.
+ */
+static void
+freesubreandsiblings(struct vars *v,	/* might be NULL */
+					 struct subre *sr)
+{
+	struct subre *following;
+
+	for (; sr != NULL; sr = following)
+	{
+		/* fetch the link before the node is released */
+		following = sr->sibling;
+		freesubre(v, sr);
+	}
+}
+
+/*
+ * freesrnode - free one node in a subRE subtree
+ *
+ * Releases the node's compacted NFA, if any, and clears its flags and
+ * links.  While v is non-NULL and v->treechain is still set, the node is
+ * pushed onto v->treefree for reuse; otherwise it is handed to FREE().
+ * Children and siblings are not touched.
+ */
+static void
+freesrnode(struct vars *v,		/* might be NULL */
+		   struct subre *sr)
+{
+	if (sr == NULL)
+		return;
+
+	if (!NULLCNFA(sr->cnfa))
+		freecnfa(&sr->cnfa);
+
+	/* scrub the node */
+	sr->flags = 0;				/* in particular, not INUSE */
+	sr->child = NULL;
+	sr->sibling = NULL;
+	sr->begin = NULL;
+	sr->end = NULL;
+
+	if (v == NULL || v->treechain == NULL)
+	{
+		FREE(sr);
+		return;
+	}
+
+	/* we're still parsing, maybe we can reuse the subre */
+	sr->child = v->treefree;
+	v->treefree = sr;
+}
+
+/*
+ * removecaptures - remove unnecessary capture subREs
+ *
+ * If the caller said that it doesn't care about subexpression match data,
+ * the "capture" markers on subREs that no backref refers to can be deleted,
+ * and anything that thereby stops being messy can be simplified.
+ * Call this only if REG_NOSUB flag is set.
+ */
+static void
+removecaptures(struct vars *v,
+			   struct subre *t)
+{
+	struct subre *kid;
+
+	assert(t != NULL);
+
+	/*
+	 * Unless this node is itself a backref target, forget its capture number
+	 * and tentatively drop CAP; a child may put CAP back below.
+	 */
+	if ((t->flags & BRUSE) == 0)
+	{
+		t->flags &= ~CAP;
+		t->capno = 0;
+	}
+
+	/* Process the children, pulling any surviving CAP flag back up */
+	kid = t->child;
+	while (kid != NULL)
+	{
+		removecaptures(v, kid);
+		if (kid->flags & CAP)
+			t->flags |= CAP;
+		kid = kid->sibling;
+	}
+
+	/* still has captures or backrefs somewhere below?  then leave it be */
+	if (t->flags & (CAP | BACKR))
+		return;
+
+	/*
+	 * With neither captures nor backrefs left, nobody cares where the
+	 * sub-match boundaries of t are, so it collapses to a simple DFA node.
+	 * (Note in particular that MIXED child greediness is not a hindrance
+	 * here, so the MESSY() macro is not used.)
+	 */
+	if (t->child != NULL)
+		freesubreandsiblings(v, t->child);
+	t->child = NULL;
+	t->op = '=';
+	t->flags &= ~MIXED;
+}
+
+/*
+ * numst - number tree nodes (assigning "id" indexes)
+ *
+ * t gets id "start"; its descendants are then numbered consecutively,
+ * each child subtree in sibling order.  Returns the first unused number.
+ */
+static int						/* next number */
+numst(struct subre *t,
+	  int start)				/* starting point for subtree numbers */
+{
+	struct subre *kid;
+	int			nextid;
+
+	assert(t != NULL);
+
+	t->id = start;
+	nextid = start + 1;
+
+	kid = t->child;
+	while (kid != NULL)
+	{
+		nextid = numst(kid, nextid);
+		kid = kid->sibling;
+	}
+
+	return nextid;
+}
+
+/*
+ * markst - mark tree nodes as INUSE
+ *
+ * Sets INUSE on t and, recursively, on every node below it.
+ *
+ * Note: this is a great deal more subtle than it looks.  During initial
+ * parsing of a regex, all subres are linked into the treechain list;
+ * discarded ones are also linked into the treefree list for possible reuse.
+ * After we are done creating all subres required for a regex, we run markst()
+ * then cleanst(), which results in discarding all subres not reachable from
+ * v->tree.  We then clear v->treechain, indicating that subres must be found
+ * by descending from v->tree.  This changes the behavior of freesubre(): it
+ * will henceforth FREE() unwanted subres rather than sticking them into the
+ * treefree list.  (Doing that any earlier would result in dangling links in
+ * the treechain list.)  This all means that freev() will clean up correctly
+ * if invoked before or after markst()+cleanst(); but it would not work if
+ * called partway through this state conversion, so we mustn't error out
+ * in or between these two functions.
+ */
+static void
+markst(struct subre *t)
+{
+	struct subre *kid;
+
+	assert(t != NULL);
+
+	t->flags |= INUSE;
+
+	kid = t->child;
+	while (kid != NULL)
+	{
+		markst(kid);
+		kid = kid->sibling;
+	}
+}
+
+/*
+ * cleanst - free any tree nodes not marked INUSE
+ *
+ * Walks the whole v->treechain list, FREE()ing each node that lacks INUSE,
+ * and then empties both the treechain and treefree lists.
+ */
+static void
+cleanst(struct vars *v)
+{
+	struct subre *node = v->treechain;
+
+	while (node != NULL)
+	{
+		/* pick up the link first, since the node may be about to vanish */
+		struct subre *following = node->chain;
+
+		if ((node->flags & INUSE) == 0)
+			FREE(node);
+		node = following;
+	}
+
+	v->treechain = NULL;
+	v->treefree = NULL;			/* just on general principles */
+}
+
+/*
+ * nfatree - turn a subRE subtree into a tree of compacted NFAs
+ *
+ * Every descendant is processed before t itself.  Only the result of
+ * nfanode() for t is returned; the children's results are discarded.
+ */
+static long						/* optimize results from top node */
+nfatree(struct vars *v,
+		struct subre *t,
+		FILE *f)				/* for debug output */
+{
+	struct subre *kid;
+
+	assert(t != NULL && t->begin != NULL);
+
+	kid = t->child;
+	while (kid != NULL)
+	{
+		(DISCARD) nfatree(v, kid, f);
+		kid = kid->sibling;
+	}
+
+	return nfanode(v, t, 0, f);
+}
+
+/*
+ * nfanode - do one NFA for nfatree or lacons
+ *
+ * Builds a scratch NFA from the states t->begin .. t->end, runs
+ * specialcolors() and optimize() on it, and compacts the result into
+ * t->cnfa.  The scratch NFA is freed before returning.
+ *
+ * If converttosearch is true, apply makesearch() to the NFA.
+ *
+ * Returns whatever optimize() returned, or 0 if that step was never reached.
+ * Each step after dupnfa() runs only while ISERR() is false.
+ */
+static long /* optimize results */
+nfanode(struct vars *v,
+ struct subre *t,
+ int converttosearch,
+ FILE *f) /* for debug output */
+{
+ struct nfa *nfa;
+ long ret = 0;
+
+ assert(t->begin != NULL);
+
+#ifdef REG_DEBUG
+ /* in debug builds, announce which tree node is being processed */
+ if (f != NULL)
+ {
+ char idbuf[50];
+
+ fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
+ stid(t, idbuf, sizeof(idbuf)));
+ }
+#endif
+ nfa = newnfa(v, v->cm, v->nfa);
+ /* NOTE(review): NOERRZ() presumably returns 0 if newnfa() failed -- confirm */
+ NOERRZ();
+ /* copy this node's piece of the main NFA between the new init and final */
+ dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
+ if (!ISERR())
+ specialcolors(nfa);
+ if (!ISERR())
+ ret = optimize(nfa, f);
+ if (converttosearch && !ISERR())
+ makesearch(v, nfa);
+ if (!ISERR())
+ compact(nfa, &t->cnfa);
+
+ /* the scratch NFA is released on both the success and the error path */
+ freenfa(nfa);
+ return ret;
+}
+
+/*
+ * newlacon - allocate a lookaround-constraint subRE
+ *
+ * Extends the v->lacons array by one entry, records begin, end and latype
+ * in it, and resets its cnfa.  Entry 0 is never used, so the first number
+ * handed out is 1.  On allocation failure, reports REG_ESPACE and returns 0;
+ * the existing array is left as it was.
+ */
+static int						/* lacon number */
+newlacon(struct vars *v,
+		 struct state *begin,
+		 struct state *end,
+		 int latype)
+{
+	int			slot;
+	struct subre *grown;
+	struct subre *entry;
+
+	if (v->nlacons != 0)
+	{
+		/* append after the entries that already exist */
+		slot = v->nlacons;
+		grown = (struct subre *) REALLOC(v->lacons,
+										 (slot + 1) * sizeof(struct subre));
+	}
+	else
+	{
+		/* first one: allocate the unused 0th entry along with entry 1 */
+		slot = 1;				/* skip 0th */
+		grown = (struct subre *) MALLOC(2 * sizeof(struct subre));
+	}
+
+	if (grown == NULL)
+	{
+		ERR(REG_ESPACE);
+		return 0;
+	}
+
+	v->lacons = grown;
+	v->nlacons = slot + 1;
+
+	entry = &grown[slot];
+	entry->begin = begin;
+	entry->end = end;
+	entry->latype = latype;
+	ZAPCNFA(entry->cnfa);
+
+	return slot;
+}
+
+/*
+ * freelacons - free lookaround-constraint subRE vector
+ *
+ * subs has n entries, of which entry 0 is never used.  The compacted NFA
+ * of each of entries 1 .. n-1 is released, then the array itself.
+ */
+static void
+freelacons(struct subre *subs,
+		   int n)
+{
+	int			idx;
+
+	assert(n > 0);
+
+	for (idx = 1; idx < n; idx++)	/* no 0th */
+	{
+		if (!NULLCNFA(subs[idx].cnfa))
+			freecnfa(&subs[idx].cnfa);
+	}
+	FREE(subs);
+}
+
+/*
+ * rfree - free a whole RE (insides of regfree)
+ *
+ * Does nothing if re is NULL or does not carry REMAGIC.  Otherwise the
+ * regex_t is invalidated and detached from its guts first, and then the
+ * guts (color map, subRE tree, lookaround constraints, search NFA) are
+ * released.
+ */
+static void
+rfree(regex_t *re)
+{
+	struct guts *g;
+
+	if (re == NULL || re->re_magic != REMAGIC)
+		return;
+
+	/* detach everything from the regex_t before freeing anything */
+	g = (struct guts *) re->re_guts;
+	re->re_magic = 0;			/* invalidate RE */
+	re->re_guts = NULL;
+	re->re_fns = NULL;
+
+	if (g == NULL)
+		return;
+
+	g->magic = 0;
+	freecm(&g->cmap);
+	if (g->tree != NULL)
+		freesubre((struct vars *) NULL, g->tree);
+	if (g->lacons != NULL)
+		freelacons(g->lacons, g->nlacons);
+	if (!NULLCNFA(g->search))
+		freecnfa(&g->search);
+	FREE(g);
+}
+
+/*
+ * rcancelrequested - check for external request to cancel regex operation
+ *
+ * Return nonzero to fail the operation with error code REG_CANCEL,
+ * zero to keep going
+ *
+ * The current implementation is Postgres-specific: it reports a request
+ * when InterruptPending is set together with either QueryCancelPending or
+ * ProcDiePending.  If we ever get around to splitting the regex code out as
+ * a standalone library, there will need to be some API to let applications
+ * define a callback function for this.
+ */
+static int
+rcancelrequested(void)
+{
+	/* cheap test first: nothing at all is pending */
+	if (!InterruptPending)
+		return 0;
+	if (QueryCancelPending)
+		return 1;
+	return ProcDiePending ? 1 : 0;
+}
+
+/*
+ * rstacktoodeep - check for stack getting dangerously deep
+ *
+ * Return nonzero to fail the operation with error code REG_ETOOBIG,
+ * zero to keep going
+ *
+ * The current implementation is Postgres-specific: it simply passes on the
+ * answer of stack_is_too_deep().  If we ever get around to splitting the
+ * regex code out as a standalone library, there will need to be some API to
+ * let applications define a callback function for this.
+ */
+static int
+rstacktoodeep(void)
+{
+	int			toodeep;
+
+	toodeep = stack_is_too_deep();
+	return toodeep;
+}
+
+#ifdef REG_DEBUG
+
+/*
+ * dump - dump an RE in human-readable form
+ *
+ * Written to f, in this order: complaints about bad magic numbers (if any),
+ * a summary line, the color map, the search NFA if there is one, the
+ * compacted NFA of each lookaround constraint, and the subRE tree.  If the
+ * RE has no guts, only that fact is reported.
+ */
+static void
+dump(regex_t *re,
+	 FILE *f)
+{
+	struct guts *g;
+	int			lano;
+
+	if (re->re_magic != REMAGIC)
+		fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
+				REMAGIC);
+
+	/* without guts there is nothing more to show */
+	if (re->re_guts == NULL)
+	{
+		fprintf(f, "NULL guts!!!\n");
+		return;
+	}
+	g = (struct guts *) re->re_guts;
+	if (g->magic != GUTSMAGIC)
+		fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
+				GUTSMAGIC);
+
+	fprintf(f, "\n\n\n========= DUMP ==========\n");
+	fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
+			(int) re->re_nsub, re->re_info, re->re_csize, g->ntree);
+
+	dumpcolors(&g->cmap, f);
+	if (!NULLCNFA(g->search))
+	{
+		fprintf(f, "\nsearch:\n");
+		dumpcnfa(&g->search, f);
+	}
+
+	/* lookaround constraints are numbered from 1 */
+	lano = 1;
+	while (lano < g->nlacons)
+	{
+		struct subre *lasub = &g->lacons[lano];
+		const char *kind;
+
+		if (lasub->latype == LATYPE_AHEAD_POS)
+			kind = "positive lookahead";
+		else if (lasub->latype == LATYPE_AHEAD_NEG)
+			kind = "negative lookahead";
+		else if (lasub->latype == LATYPE_BEHIND_POS)
+			kind = "positive lookbehind";
+		else if (lasub->latype == LATYPE_BEHIND_NEG)
+			kind = "negative lookbehind";
+		else
+			kind = "???";
+
+		fprintf(f, "\nla%d (%s):\n", lano, kind);
+		dumpcnfa(&lasub->cnfa, f);
+		lano++;
+	}
+
+	fprintf(f, "\n");
+	dumpst(g->tree, f, 0);
+}
+
+/*
+ * dumpst - dump a subRE tree
+ *
+ * Prints "null tree" for a NULL t, else lets stdump() print the tree.
+ * Either way f is flushed afterwards.
+ */
+static void
+dumpst(struct subre *t,
+	   FILE *f,
+	   int nfapresent)			/* is the original NFA still around? */
+{
+	if (t != NULL)
+		stdump(t, f, nfapresent);
+	else
+		fprintf(f, "null tree\n");
+
+	fflush(f);
+}
+
+/*
+ * stdump - recursive guts of dumpst
+ *
+ * Prints one line describing t (followed by its compacted NFA, if it has
+ * one), then does the same for each child in sibling order.
+ */
+static void
+stdump(struct subre *t,
+	   FILE *f,
+	   int nfapresent)			/* is the original NFA still around? */
+{
+	/* flags reported when set, in output order */
+	static const struct
+	{
+		int			bit;
+		const char *label;
+	}			flagtab[] = {
+		{LONGER, " longest"},
+		{SHORTER, " shortest"},
+		{MIXED, " hasmixed"},
+		{CAP, " hascapture"},
+		{BACKR, " hasbackref"},
+		{BRUSE, " isreferenced"}
+	};
+	char		idbuf[50];
+	struct subre *kid;
+	size_t		k;
+
+	fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
+
+	for (k = 0; k < sizeof(flagtab) / sizeof(flagtab[0]); k++)
+	{
+		if (t->flags & flagtab[k].bit)
+			fprintf(f, "%s", flagtab[k].label);
+	}
+	/* INUSE is the odd one out: its absence is what gets reported */
+	if (!(t->flags & INUSE))
+		fprintf(f, " UNUSED");
+
+	if (t->latype != (char) -1)
+		fprintf(f, " latype(%d)", t->latype);
+	if (t->capno != 0)
+		fprintf(f, " capture(%d)", t->capno);
+	if (t->backno != 0)
+		fprintf(f, " backref(%d)", t->backno);
+
+	/* repetition bounds are shown only when they aren't {1,1} */
+	if (t->min != 1 || t->max != 1)
+	{
+		fprintf(f, " {%d,", t->min);
+		if (t->max != DUPINF)
+			fprintf(f, "%d", t->max);
+		fprintf(f, "}");
+	}
+
+	if (nfapresent)
+		fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no);
+
+	if (t->child != NULL)
+	{
+		fprintf(f, " C:%s", stid(t->child, idbuf, sizeof(idbuf)));
+		/* printing second child isn't necessary, but it is often helpful */
+		if (t->child->sibling != NULL)
+			fprintf(f, " C2:%s",
+					stid(t->child->sibling, idbuf, sizeof(idbuf)));
+	}
+	if (t->sibling != NULL)
+		fprintf(f, " S:%s", stid(t->sibling, idbuf, sizeof(idbuf)));
+
+	if (!NULLCNFA(t->cnfa))
+	{
+		fprintf(f, "\n");
+		dumpcnfa(&t->cnfa, f);
+	}
+	fprintf(f, "\n");
+
+	for (kid = t->child; kid != NULL; kid = kid->sibling)
+		stdump(kid, f, nfapresent);
+}
+
+/*
+ * stid - identify a subtree node for dumping
+ *
+ * Formats into buf either t->id in decimal (when it is nonzero) or, for a
+ * node that has no id, the node's address.  Returns buf, or the constant
+ * string "unable" if bufsize is too small for either form.
+ *
+ * The output is bounded by bufsize, and the pointer is passed as void *,
+ * which is what the %p conversion requires.
+ */
+static const char *				/* points to buf or constant string */
+stid(struct subre *t,
+	 char *buf,
+	 size_t bufsize)
+{
+	/* big enough for hex int or decimal t->id? */
+	if (bufsize < sizeof(void *) * 2 + 3 || bufsize < sizeof(t->id) * 3 + 1)
+		return "unable";
+	if (t->id != 0)
+		snprintf(buf, bufsize, "%d", t->id);
+	else
+		snprintf(buf, bufsize, "%p", (void *) t);
+	return buf;
+}
+#endif /* REG_DEBUG */
+
+
+#include "regc_lex.c"
+#include "regc_color.c"
+#include "regc_nfa.c"
+#include "regc_cvec.c"
+#include "regc_pg_locale.c"
+#include "regc_locale.c"
diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c
new file mode 100644
index 0000000..ba1289c
--- /dev/null
+++ b/src/backend/regex/rege_dfa.c
@@ -0,0 +1,1106 @@
+/*
+ * DFA routines
+ * This file is #included by regexec.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results. The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/rege_dfa.c
+ *
+ */
+
+/*
+ * longest - longest-preferred matching engine
+ *
+ * Looks for a match of DFA d that begins at start and ends no later than
+ * stop.
+ *
+ * On success, returns match endpoint address. Returns NULL on no match.
+ * Internal errors also return NULL, with v->err set.
+ *
+ * If hitstopp is not NULL, *hitstopp is first cleared, and is set to 1 in
+ * the cases below where the scan reached v->stop.
+ */
+static chr *
+longest(struct vars *v,
+ struct dfa *d,
+ chr *start, /* where the match should start */
+ chr *stop, /* match must end at or before here */
+ int *hitstopp) /* record whether hit v->stop, if non-NULL */
+{
+ chr *cp;
+ /* the scan loop examines one chr beyond stop, unless stop is end of data */
+ chr *realstop = (stop == v->stop) ? stop : stop + 1;
+ color co;
+ struct sset *css;
+ struct sset *ss;
+ chr *post;
+ int i;
+ struct colormap *cm = d->cm;
+
+ /* prevent "uninitialized variable" warnings */
+ if (hitstopp != NULL)
+ *hitstopp = 0;
+
+ /* if this is a backref to a known string, just match against that */
+ if (d->backno >= 0)
+ {
+ assert((size_t) d->backno < v->nmatch);
+ if (v->pmatch[d->backno].rm_so >= 0)
+ {
+ cp = dfa_backref(v, d, start, start, stop, false);
+ if (cp == v->stop && stop == v->stop && hitstopp != NULL)
+ *hitstopp = 1;
+ return cp;
+ }
+ }
+
+ /* fast path for matchall NFAs */
+ if (d->cnfa->flags & MATCHALL)
+ {
+ size_t nchr = stop - start;
+ size_t maxmatchall = d->cnfa->maxmatchall;
+
+ /* not enough chrs available for even the shortest match */
+ if (nchr < d->cnfa->minmatchall)
+ return NULL;
+ if (maxmatchall == DUPINF)
+ {
+ if (stop == v->stop && hitstopp != NULL)
+ *hitstopp = 1;
+ }
+ else
+ {
+ if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL)
+ *hitstopp = 1;
+ /* more chrs available than can be matched: clamp the endpoint */
+ if (nchr > maxmatchall)
+ return start + maxmatchall;
+ }
+ return stop;
+ }
+
+ /* initialize */
+ css = initialize(v, d, start);
+ if (css == NULL)
+ return NULL;
+ cp = start;
+
+ /* startup */
+ FDEBUG(("+++ startup +++\n"));
+ /* first transition is on the BOS pseudocolor or on the preceding chr */
+ if (cp == v->start)
+ {
+ co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long) co));
+ }
+ else
+ {
+ co = GETCOLOR(cm, *(cp - 1));
+ FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
+ }
+ css = miss(v, d, css, co, cp, start);
+ if (css == NULL)
+ return NULL;
+ css->lastseen = cp;
+
+ /*
+ * This is the main text-scanning loop. It seems worth having two copies
+ * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
+ * builds, when you're not actively tracing.
+ *
+ * Each iteration follows the cached out-pointer for the chr's color,
+ * falling back to miss() when there is none, and stamps the state set
+ * reached with the position just past the chr.
+ */
+#ifdef REG_DEBUG
+ if (v->eflags & REG_FTRACE)
+ {
+ while (cp < realstop)
+ {
+ FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets)));
+ co = GETCOLOR(cm, *cp);
+ FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+ ss = css->outs[co];
+ if (ss == NULL)
+ {
+ ss = miss(v, d, css, co, cp + 1, start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ }
+ }
+ else
+#endif
+ {
+ while (cp < realstop)
+ {
+ co = GETCOLOR(cm, *cp);
+ ss = css->outs[co];
+ if (ss == NULL)
+ {
+ ss = miss(v, d, css, co, cp + 1, start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ }
+ }
+
+ /* a NULL from miss() may mean an internal error rather than a dead end */
+ if (ISERR())
+ return NULL;
+
+ /* shutdown */
+ FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets)));
+ /* if the scan consumed all the data, feed in the EOS pseudocolor too */
+ if (cp == v->stop && stop == v->stop)
+ {
+ if (hitstopp != NULL)
+ *hitstopp = 1;
+ co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long) co));
+ ss = miss(v, d, css, co, cp, start);
+ if (ISERR())
+ return NULL;
+ /* special case: match ended at eol? */
+ if (ss != NULL && (ss->flags & POSTSTATE))
+ return cp;
+ else if (ss != NULL)
+ ss->lastseen = cp; /* to be tidy */
+ }
+
+ /*
+ * find last match, if any: take the latest lastseen among POSTSTATE
+ * state sets, starting from d->lastpost. The match endpoint is one chr
+ * before that position, hence the "post - 1" below.
+ */
+ post = d->lastpost;
+ for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
+ if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
+ (post == NULL || post < ss->lastseen))
+ post = ss->lastseen;
+ if (post != NULL) /* found one */
+ return post - 1;
+
+ return NULL;
+}
+
+/*
+ * shortest - shortest-preferred matching engine
+ *
+ * Looks for a match of DFA d that begins at start and ends somewhere in
+ * the range min .. max.
+ *
+ * On success, returns match endpoint address. Returns NULL on no match.
+ * Internal errors also return NULL, with v->err set.
+ *
+ * If coldp is not NULL, *coldp is first set to NULL and later receives a
+ * "cold start" pointer: start on the backref and matchall fast paths, else
+ * the result of lastcold(). If hitstopp is not NULL, *hitstopp is first
+ * cleared and is set to 1 only when the scan reached v->stop without a
+ * match having been found there.
+ */
+static chr *
+shortest(struct vars *v,
+ struct dfa *d,
+ chr *start, /* where the match should start */
+ chr *min, /* match must end at or after here */
+ chr *max, /* match must end at or before here */
+ chr **coldp, /* store coldstart pointer here, if non-NULL */
+ int *hitstopp) /* record whether hit v->stop, if non-NULL */
+{
+ chr *cp;
+ /* both limits are pushed one chr further, unless they are end of data */
+ chr *realmin = (min == v->stop) ? min : min + 1;
+ chr *realmax = (max == v->stop) ? max : max + 1;
+ color co;
+ struct sset *css;
+ struct sset *ss;
+ struct colormap *cm = d->cm;
+
+ /* prevent "uninitialized variable" warnings */
+ if (coldp != NULL)
+ *coldp = NULL;
+ if (hitstopp != NULL)
+ *hitstopp = 0;
+
+ /* if this is a backref to a known string, just match against that */
+ if (d->backno >= 0)
+ {
+ assert((size_t) d->backno < v->nmatch);
+ if (v->pmatch[d->backno].rm_so >= 0)
+ {
+ cp = dfa_backref(v, d, start, min, max, true);
+ if (cp != NULL && coldp != NULL)
+ *coldp = start;
+ /* there is no case where we should set *hitstopp */
+ return cp;
+ }
+ }
+
+ /* fast path for matchall NFAs */
+ if (d->cnfa->flags & MATCHALL)
+ {
+ size_t nchr = min - start;
+
+ /* even the shortest acceptable endpoint is beyond the maximum length */
+ if (d->cnfa->maxmatchall != DUPINF &&
+ nchr > d->cnfa->maxmatchall)
+ return NULL;
+ /* even the longest acceptable endpoint is short of the minimum length */
+ if ((max - start) < d->cnfa->minmatchall)
+ return NULL;
+ /* advance the endpoint to the minimum length if min is short of it */
+ if (nchr < d->cnfa->minmatchall)
+ min = start + d->cnfa->minmatchall;
+ if (coldp != NULL)
+ *coldp = start;
+ /* there is no case where we should set *hitstopp */
+ return min;
+ }
+
+ /* initialize */
+ css = initialize(v, d, start);
+ if (css == NULL)
+ return NULL;
+ cp = start;
+
+ /* startup */
+ FDEBUG(("--- startup ---\n"));
+ /* first transition is on the BOS pseudocolor or on the preceding chr */
+ if (cp == v->start)
+ {
+ co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long) co));
+ }
+ else
+ {
+ co = GETCOLOR(cm, *(cp - 1));
+ FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
+ }
+ css = miss(v, d, css, co, cp, start);
+ if (css == NULL)
+ return NULL;
+ css->lastseen = cp;
+ ss = css;
+
+ /*
+ * This is the main text-scanning loop. It seems worth having two copies
+ * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
+ * builds, when you're not actively tracing.
+ *
+ * Unlike longest(), the loop stops as soon as a POSTSTATE state set is
+ * reached at or beyond realmin.
+ */
+#ifdef REG_DEBUG
+ if (v->eflags & REG_FTRACE)
+ {
+ while (cp < realmax)
+ {
+ FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets)));
+ co = GETCOLOR(cm, *cp);
+ FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+ ss = css->outs[co];
+ if (ss == NULL)
+ {
+ ss = miss(v, d, css, co, cp + 1, start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ if ((ss->flags & POSTSTATE) && cp >= realmin)
+ break; /* NOTE BREAK OUT */
+ }
+ }
+ else
+#endif
+ {
+ while (cp < realmax)
+ {
+ co = GETCOLOR(cm, *cp);
+ ss = css->outs[co];
+ if (ss == NULL)
+ {
+ ss = miss(v, d, css, co, cp + 1, start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ if ((ss->flags & POSTSTATE) && cp >= realmin)
+ break; /* NOTE BREAK OUT */
+ }
+ }
+
+ /* miss() gave up: no match is possible (or an internal error occurred) */
+ if (ss == NULL)
+ return NULL;
+
+ if (coldp != NULL) /* report last no-progress state set, if any */
+ *coldp = lastcold(v, d);
+
+ if ((ss->flags & POSTSTATE) && cp > min)
+ {
+ /* the scan went one chr past the match end, so step back */
+ assert(cp >= realmin);
+ cp--;
+ }
+ else if (cp == v->stop && max == v->stop)
+ {
+ /* all the data was consumed, so try the EOS pseudocolor as well */
+ co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long) co));
+ ss = miss(v, d, css, co, cp, start);
+ /* match might have ended at eol */
+ if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
+ *hitstopp = 1;
+ }
+
+ if (ss == NULL || !(ss->flags & POSTSTATE))
+ return NULL;
+
+ return cp;
+}
+
+/*
+ * matchuntil - incremental matching engine
+ *
+ * This is meant for use with a search-style NFA (that is, the pattern is
+ * known to act as though it had a leading .*). We determine whether a
+ * match exists starting at v->start and ending at probe. Multiple calls
+ * require only O(N) time not O(N^2) so long as the probe values are
+ * nondecreasing. *lastcss and *lastcp must be initialized to NULL before
+ * starting a series of calls.
+ *
+ * The scan resumes from *lastcp in state set *lastcss; it starts over from
+ * v->start when *lastcp is NULL or lies beyond probe. Both are updated
+ * once the scan loop has finished. On the matchall fast path they are
+ * neither read nor written.
+ *
+ * Returns 1 if a match exists, 0 if not.
+ * Internal errors also return 0, with v->err set.
+ */
+static int
+matchuntil(struct vars *v,
+ struct dfa *d,
+ chr *probe, /* we want to know if a match ends here */
+ struct sset **lastcss, /* state storage across calls */
+ chr **lastcp) /* state storage across calls */
+{
+ chr *cp = *lastcp;
+ color co;
+ struct sset *css = *lastcss;
+ struct sset *ss;
+ struct colormap *cm = d->cm;
+
+ /* fast path for matchall NFAs */
+ if (d->cnfa->flags & MATCHALL)
+ {
+ size_t nchr = probe - v->start;
+
+ if (nchr < d->cnfa->minmatchall)
+ return 0;
+ /* maxmatchall will always be infinity, cf. makesearch() */
+ assert(d->cnfa->maxmatchall == DUPINF);
+ return 1;
+ }
+
+ /* initialize and startup, or restart, if necessary */
+ if (cp == NULL || cp > probe)
+ {
+ cp = v->start;
+ css = initialize(v, d, cp);
+ if (css == NULL)
+ return 0;
+
+ FDEBUG((">>> startup >>>\n"));
+ co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long) co));
+
+ css = miss(v, d, css, co, cp, v->start);
+ if (css == NULL)
+ return 0;
+ css->lastseen = cp;
+ }
+ else if (css == NULL)
+ {
+ /* we previously found that no match is possible beyond *lastcp */
+ return 0;
+ }
+ ss = css;
+
+ /*
+ * This is the main text-scanning loop. It seems worth having two copies
+ * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
+ * builds, when you're not actively tracing.
+ */
+#ifdef REG_DEBUG
+ if (v->eflags & REG_FTRACE)
+ {
+ while (cp < probe)
+ {
+ FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
+ co = GETCOLOR(cm, *cp);
+ FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+ ss = css->outs[co];
+ if (ss == NULL)
+ {
+ ss = miss(v, d, css, co, cp + 1, v->start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ }
+ }
+ else
+#endif
+ {
+ while (cp < probe)
+ {
+ co = GETCOLOR(cm, *cp);
+ ss = css->outs[co];
+ if (ss == NULL)
+ {
+ ss = miss(v, d, css, co, cp + 1, v->start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ }
+ }
+
+ /*
+ * Save the scan state for the next call. A NULL state set is stored if
+ * miss() gave up, which makes later calls whose probe is at or beyond
+ * this cp return 0 immediately.
+ */
+ *lastcss = ss;
+ *lastcp = cp;
+
+ if (ss == NULL)
+ return 0; /* impossible match, or internal error */
+
+ /*
+ * We need to process one more chr, or the EOS symbol, to check match.
+ * The result of this extra step is deliberately not saved in *lastcss.
+ */
+ if (cp < v->stop)
+ {
+ FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
+ co = GETCOLOR(cm, *cp);
+ FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+ ss = css->outs[co];
+ if (ss == NULL)
+ ss = miss(v, d, css, co, cp + 1, v->start);
+ }
+ else
+ {
+ assert(cp == v->stop);
+ co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long) co));
+ ss = miss(v, d, css, co, cp, v->start);
+ }
+
+ if (ss == NULL || !(ss->flags & POSTSTATE))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * dfa_backref - find best match length for a known backref string
+ *
+ * When the backref's referent is already available, we can deliver an exact
+ * answer with considerably less work than running the backref node's NFA.
+ *
+ * The referent is v->pmatch[d->backno]; the allowed number of repetitions
+ * is d->backmin .. d->backmax (DUPINF meaning no upper limit).
+ *
+ * Return match endpoint for longest or shortest valid repeated match,
+ * or NULL if there is no valid match.
+ *
+ * Should be in sync with cbrdissect(), although that has the different task
+ * of checking a match to a predetermined section of the string.
+ */
+static chr *
+dfa_backref(struct vars *v,
+ struct dfa *d,
+ chr *start, /* where the match should start */
+ chr *min, /* match must end at or after here */
+ chr *max, /* match must end at or before here */
+ bool shortest)
+{
+ int n = d->backno;
+ int backmin = d->backmin;
+ int backmax = d->backmax;
+ size_t numreps;
+ size_t minreps;
+ size_t maxreps;
+ size_t brlen;
+ chr *brstring;
+ chr *p;
+
+ /* get the backreferenced string (caller should have checked this) */
+ if (v->pmatch[n].rm_so == -1)
+ return NULL;
+ brstring = v->start + v->pmatch[n].rm_so;
+ brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
+
+ /* special-case zero-length backreference to avoid divide by zero */
+ if (brlen == 0)
+ {
+ /*
+ * matches only a zero-length string, but any number of repetitions
+ * can be considered to be present
+ */
+ if (min == start && backmin <= backmax)
+ return start;
+ return NULL;
+ }
+
+ /*
+ * convert min and max into numbers of possible repetitions of the backref
+ * string, rounding appropriately: minreps is (min - start) / brlen
+ * rounded up, maxreps is (max - start) / brlen rounded down
+ */
+ if (min <= start)
+ minreps = 0;
+ else
+ minreps = (min - start - 1) / brlen + 1;
+ maxreps = (max - start) / brlen;
+
+ /* apply bounds, then see if there is any allowed match length */
+ if (minreps < backmin)
+ minreps = backmin;
+ if (backmax != DUPINF && maxreps > backmax)
+ maxreps = backmax;
+ if (maxreps < minreps)
+ return NULL;
+
+ /* quick exit if zero-repetitions match is valid and preferred */
+ if (shortest && minreps == 0)
+ return start;
+
+ /*
+ * okay, compare the actual string contents, one repetition at a time;
+ * stop at the first mismatch, at maxreps, or (when the shortest match is
+ * wanted) as soon as minreps repetitions have been seen
+ */
+ p = start;
+ numreps = 0;
+ while (numreps < maxreps)
+ {
+ if ((*v->g->compare) (brstring, p, brlen) != 0)
+ break;
+ p += brlen;
+ numreps++;
+ if (shortest && numreps >= minreps)
+ break;
+ }
+
+ /* p now points just past the last repetition that matched */
+ if (numreps >= minreps)
+ return p;
+ return NULL;
+}
+
+/*
+ * lastcold - determine last point at which no progress had been made
+ *
+ * Starts from d->lastnopr (or v->start if that is NULL) and returns the
+ * latest lastseen position among the NOPROGRESS state sets currently in
+ * use, if any of them is later than that.
+ */
+static chr *					/* endpoint, or NULL */
+lastcold(struct vars *v,
+		 struct dfa *d)
+{
+	chr		   *latest = d->lastnopr;
+	int			idx;
+
+	if (latest == NULL)
+		latest = v->start;
+
+	for (idx = 0; idx < d->nssused; idx++)
+	{
+		struct sset *ss = &d->ssets[idx];
+
+		if ((ss->flags & NOPROGRESS) && latest < ss->lastseen)
+			latest = ss->lastseen;
+	}
+
+	return latest;
+}
+
+/*
+ * newdfa - set up a fresh DFA
+ *
+ * The DFA gets room for twice as many state sets as cnfa has states.  If
+ * that is at most FEWSTATES and cnfa has at most FEWCOLORS colors, the
+ * arrays embedded in a struct smalldfa are used: the caller's sml if one
+ * was supplied, else a freshly allocated one.  Otherwise the struct dfa and
+ * each of its four arrays are allocated separately.  d->ismalloced and
+ * d->arraysmalloced record which of these freedfa() has to release.
+ *
+ * Returns NULL (and sets v->err) on failure.
+ */
+static struct dfa *
+newdfa(struct vars *v,
+	   struct cnfa *cnfa,
+	   struct colormap *cm,
+	   struct smalldfa *sml)	/* preallocated space, may be NULL */
+{
+	struct dfa *d;
+	size_t		nss = cnfa->nstates * 2;
+	int			wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
+	bool		ismalloced = false;
+
+	assert(cnfa != NULL && cnfa->nstates != 0);
+
+	if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
+	{
+		assert(wordsper == 1);
+		if (sml == NULL)
+		{
+			sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa));
+			if (sml == NULL)
+			{
+				ERR(REG_ESPACE);
+				return NULL;
+			}
+			ismalloced = true;
+		}
+		d = &sml->dfa;
+		d->ssets = sml->ssets;
+		d->statesarea = sml->statesarea;
+		d->work = &d->statesarea[nss];
+		d->outsarea = sml->outsarea;
+		d->incarea = sml->incarea;
+		d->ismalloced = ismalloced;
+		d->arraysmalloced = false;	/* not separately allocated, anyway */
+	}
+	else
+	{
+		d = (struct dfa *) MALLOC(sizeof(struct dfa));
+		if (d == NULL)
+		{
+			ERR(REG_ESPACE);
+			return NULL;
+		}
+		d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
+		d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
+											sizeof(unsigned));
+		d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
+											  sizeof(struct sset *));
+		d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
+											sizeof(struct arcp));
+		d->ismalloced = true;
+		d->arraysmalloced = true;
+		/* now freedfa() will behave sanely */
+		if (d->ssets == NULL || d->statesarea == NULL ||
+			d->outsarea == NULL || d->incarea == NULL)
+		{
+			freedfa(d);
+			ERR(REG_ESPACE);
+			return NULL;
+		}
+
+		/*
+		 * Point the work area past the state-set bit vectors only now that
+		 * statesarea is known to be non-NULL; indexing from a null pointer
+		 * is undefined behavior.
+		 */
+		d->work = &d->statesarea[nss * wordsper];
+	}
+
+	d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
+	d->nssused = 0;
+	d->nstates = cnfa->nstates;
+	d->ncolors = cnfa->ncolors;
+	d->wordsper = wordsper;
+	d->cnfa = cnfa;
+	d->cm = cm;
+	d->lastpost = NULL;
+	d->lastnopr = NULL;
+	d->search = d->ssets;
+	d->backno = -1;				/* may be set by caller */
+	d->backmin = d->backmax = 0;
+
+	/* initialization of sset fields is done as needed */
+
+	return d;
+}
+
+/*
+ * freedfa - release a DFA set up by newdfa()
+ *
+ * The subsidiary arrays are freed only if they were allocated separately
+ * (arraysmalloced), and the struct itself only if it was malloc'd
+ * (ismalloced).
+ */
+static void
+freedfa(struct dfa *d)
+{
+    int         own_arrays = d->arraysmalloced;
+    int         own_struct = d->ismalloced;
+
+    if (own_arrays)
+    {
+        /* any of these can be NULL if newdfa() failed partway through */
+        if (d->ssets != NULL)
+        {
+            FREE(d->ssets);
+        }
+        if (d->statesarea != NULL)
+        {
+            FREE(d->statesarea);
+        }
+        if (d->outsarea != NULL)
+        {
+            FREE(d->outsarea);
+        }
+        if (d->incarea != NULL)
+        {
+            FREE(d->incarea);
+        }
+    }
+
+    if (own_struct)
+    {
+        FREE(d);
+    }
+}
+
+/*
+ * hash - construct a hash code for a bitvector
+ *
+ * XORs together the n words starting at uv; yields 0 when n is not
+ * positive.  There are probably better ways, but they're more expensive.
+ */
+static unsigned
+hash(unsigned *uv,
+     int n)
+{
+    unsigned    acc = 0;
+
+    while (n-- > 0)
+        acc ^= *uv++;
+    return acc;
+}
+
+/*
+ * initialize - hand-craft a cache entry for startup, otherwise get ready
+ *
+ * Reuses cache slot 0 if it still holds the STARTER state set; otherwise
+ * obtains a vacant entry and builds the starter (just the NFA's pre state)
+ * in it.  In both cases all lastseen marks and the DFA's lastpost/lastnopr
+ * memory are reset.  Returns the starter state set, or NULL on failure.
+ */
+static struct sset *
+initialize(struct vars *v,
+           struct dfa *d,
+           chr *start)
+{
+    struct sset *first;
+    int         k;
+
+    if (d->nssused <= 0 || !(d->ssets[0].flags & STARTER))
+    {
+        /* previous starter is gone (or never existed): (re)build it */
+        first = getvacant(v, d, start, start);
+        if (first == NULL)
+            return NULL;
+        k = 0;
+        while (k < d->wordsper)
+        {
+            first->states[k] = 0;
+            k++;
+        }
+        BSET(first->states, d->cnfa->pre);
+        first->hash = HASH(first->states, d->wordsper);
+        assert(d->cnfa->pre != d->cnfa->post);
+        first->flags = STARTER | LOCKED | NOPROGRESS;
+        /* lastseen dealt with below */
+    }
+    else
+    {
+        /* previous one is still there */
+        first = &d->ssets[0];
+    }
+
+    /* wipe every arrival mark, then stamp the starter */
+    for (k = 0; k < d->nssused; k++)
+        d->ssets[k].lastseen = NULL;
+    first->lastseen = start;    /* maybe untrue, but harmless */
+    d->lastpost = NULL;
+    d->lastnopr = NULL;
+    return first;
+}
+
+/*
+ * miss - handle a stateset cache miss
+ *
+ * css is the current stateset, co is the color of the current input character,
+ * cp points to the character after that (which is where we may need to test
+ * LACONs). start does not affect matching behavior but is needed for pickss'
+ * heuristics about which stateset cache entry to replace.
+ *
+ * Ordinarily, returns the address of the next stateset (the one that is
+ * valid after consuming the input character). Returns NULL if no valid
+ * NFA states remain, ie we have a certain match failure.
+ * Internal errors also return NULL, with v->err set.
+ *
+ * The next stateset is assembled in d->work. If an identical stateset is
+ * already cached it is reused; otherwise an entry is obtained from
+ * getvacant() and filled in. The result is recorded as css->outs[co] only
+ * when no LACON arc had to be tested along the way.
+ */
+static struct sset *
+miss(struct vars *v,
+ struct dfa *d,
+ struct sset *css,
+ color co,
+ chr *cp, /* next chr */
+ chr *start) /* where the attempt got started */
+{
+ struct cnfa *cnfa = d->cnfa;
+ int i;
+ unsigned h; /* hash of the bitmap built in d->work */
+ struct carc *ca;
+ struct sset *p; /* the stateset to return */
+ int ispseudocolor; /* nonzero if co is flagged PSEUDO in the colormap */
+ int ispost; /* new stateset includes cnfa->post */
+ int noprogress; /* every state added so far is CNFA_NOPROGRESS */
+ int gotstate; /* some plain arc matched co */
+ int dolacons; /* another pass over LACON arcs is needed */
+ int sawlacons; /* a LACON arc was tested, so don't cache the link */
+
+ /* for convenience, we can be called even if it might not be a miss */
+ if (css->outs[co] != NULL)
+ {
+ FDEBUG(("hit\n"));
+ return css->outs[co];
+ }
+ FDEBUG(("miss\n"));
+
+ /*
+ * Checking for operation cancel in the inner text search loop seems
+ * unduly expensive. As a compromise, check during cache misses.
+ */
+ if (CANCEL_REQUESTED(v->re))
+ {
+ ERR(REG_CANCEL);
+ return NULL;
+ }
+
+ /*
+ * What set of states would we end up in after consuming the co character?
+ * We first consider PLAIN arcs that consume the character, and then look
+ * to see what LACON arcs could be traversed after consuming it.
+ */
+ for (i = 0; i < d->wordsper; i++)
+ d->work[i] = 0; /* build new stateset bitmap in d->work */
+ ispseudocolor = d->cm->cd[co].flags & PSEUDO;
+ ispost = 0;
+ noprogress = 1;
+ gotstate = 0;
+ for (i = 0; i < d->nstates; i++)
+ if (ISBSET(css->states, i))
+ for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
+ if (ca->co == co ||
+ (ca->co == RAINBOW && !ispseudocolor)) /* RAINBOW arcs never match pseudocolors */
+ {
+ BSET(d->work, ca->to);
+ gotstate = 1;
+ if (ca->to == cnfa->post)
+ ispost = 1;
+ if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
+ noprogress = 0;
+ FDEBUG(("%d -> %d\n", i, ca->to));
+ }
+ if (!gotstate)
+ return NULL; /* character cannot reach any new state */
+ dolacons = (cnfa->flags & HASLACONS);
+ sawlacons = 0;
+ /* outer loop handles transitive closure of reachable-by-LACON states */
+ while (dolacons)
+ {
+ dolacons = 0;
+ for (i = 0; i < d->nstates; i++)
+ if (ISBSET(d->work, i))
+ for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
+ {
+ if (ca->co < cnfa->ncolors)
+ continue; /* not a LACON arc */
+ if (ISBSET(d->work, ca->to))
+ continue; /* arc would be a no-op anyway */
+ sawlacons = 1; /* this LACON affects our result */
+ if (!lacon(v, cnfa, cp, ca->co))
+ {
+ if (ISERR())
+ return NULL;
+ continue; /* LACON arc cannot be traversed */
+ }
+ if (ISERR())
+ return NULL;
+ BSET(d->work, ca->to);
+ dolacons = 1; /* new state added, so rescan */
+ if (ca->to == cnfa->post)
+ ispost = 1;
+ if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
+ noprogress = 0;
+ FDEBUG(("%d :> %d\n", i, ca->to));
+ }
+ }
+ h = HASH(d->work, d->wordsper);
+
+ /* Is this stateset already in the cache? */
+ for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
+ if (HIT(h, d->work, p, d->wordsper))
+ {
+ FDEBUG(("cached c%d\n", (int) (p - d->ssets)));
+ break; /* NOTE BREAK OUT */
+ }
+ if (i == 0) /* loop above ran to completion without a hit */
+ { /* nope, need a new cache entry */
+ p = getvacant(v, d, cp, start);
+ if (p == NULL)
+ return NULL;
+ assert(p != css);
+ for (i = 0; i < d->wordsper; i++)
+ p->states[i] = d->work[i];
+ p->hash = h;
+ p->flags = (ispost) ? POSTSTATE : 0;
+ if (noprogress)
+ p->flags |= NOPROGRESS;
+ /* lastseen to be dealt with by caller */
+ }
+
+ /*
+ * Link new stateset to old, unless a LACON affected the result, in which
+ * case we don't create the link. That forces future transitions across
+ * this same arc (same prior stateset and character color) to come through
+ * miss() again, so that we can recheck the LACON(s), which might or might
+ * not pass since context will be different.
+ */
+ if (!sawlacons)
+ {
+ FDEBUG(("c%d[%d]->c%d\n",
+ (int) (css - d->ssets), co, (int) (p - d->ssets)));
+ css->outs[co] = p;
+ css->inchain[co] = p->ins; /* push this arc onto the front of p's inarc chain */
+ p->ins.ss = css;
+ p->ins.co = co;
+ }
+ return p;
+}
+
+/*
+ * lacon - lookaround-constraint checker for miss()
+ *
+ * co is the pseudo-"color" of the constraint arc; subtracting the parent
+ * cnfa's color count gives the index of the constraint in v->g->lacons.
+ * cp is the position at which the constraint is to be tested.
+ * Returns nonzero if the constraint is satisfied; returns 0 on error
+ * (stack too deep, or the constraint's DFA could not be obtained).
+ */
+static int                      /* predicate: constraint satisfied? */
+lacon(struct vars *v,
+      struct cnfa *pcnfa,       /* parent cnfa */
+      chr *cp,
+      color co)                 /* "color" of the lookaround constraint */
+{
+    int         lanum;
+    struct subre *constraint;
+    struct dfa *ladfa;
+    int         result;
+
+    /* Since this is recursive, it could be driven to stack overflow */
+    if (STACK_TOO_DEEP(v->re))
+    {
+        ERR(REG_ETOOBIG);
+        return 0;
+    }
+
+    lanum = co - pcnfa->ncolors;
+    assert(lanum > 0 && lanum < v->g->nlacons && v->g->lacons != NULL);
+    FDEBUG(("=== testing lacon %d\n", lanum));
+    constraint = &v->g->lacons[lanum];
+    ladfa = getladfa(v, lanum);
+    if (ladfa == NULL)
+        return 0;
+
+    if (!LATYPE_IS_AHEAD(constraint->latype))
+    {
+        /*
+         * Lookbehind.  To avoid doing O(N^2) work when repeatedly testing a
+         * lookbehind constraint in an N-character string, we use
+         * matchuntil() which can cache the DFA state across calls.  We only
+         * need to restart if the probe point decreases, which is not
+         * common.  The NFA we're using is a search NFA, so it doesn't mind
+         * scanning over stuff before the nominal match.
+         */
+        result = matchuntil(v, ladfa, cp,
+                            &v->lblastcss[lanum], &v->lblastcp[lanum]);
+        if (!LATYPE_IS_POS(constraint->latype))
+            result = !result;
+    }
+    else
+    {
+        /* Lookahead: used to use longest(), but shortest() could be much cheaper */
+        chr        *hit = shortest(v, ladfa, cp, cp, v->stop,
+                                   (chr **) NULL, (int *) NULL);
+
+        if (LATYPE_IS_POS(constraint->latype))
+            result = (hit != NULL);
+        else
+            result = (hit == NULL);
+    }
+    FDEBUG(("=== lacon %d satisfied %d\n", lanum, result));
+    return result;
+}
+
+/*
+ * getvacant - get a vacant state set
+ *
+ * This routine clears out the inarcs and outarcs, but does not otherwise
+ * clear the innards of the state set -- that's up to the caller.
+ *
+ * The entry is chosen by pickss(), to which cp and start are passed through.
+ * Before the entry is handed back, its lastseen position is saved into
+ * d->lastpost (if it was a POSTSTATE set) and/or d->lastnopr (if it was a
+ * NOPROGRESS set) when that is later than what is already recorded there.
+ * Returns NULL if pickss() fails.
+ */
+static struct sset *
+getvacant(struct vars *v,
+ struct dfa *d,
+ chr *cp,
+ chr *start)
+{
+ int i;
+ struct sset *ss; /* the entry being vacated */
+ struct sset *p; /* a neighbor: source of an inarc or target of an outarc */
+ struct arcp ap; /* cursor for walking an inarc chain */
+ color co;
+
+ ss = pickss(v, d, cp, start);
+ if (ss == NULL)
+ return NULL;
+ assert(!(ss->flags & LOCKED));
+
+ /* clear out its inarcs, including self-referential ones */
+ ap = ss->ins;
+ while ((p = ap.ss) != NULL)
+ {
+ co = ap.co;
+ FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co));
+ p->outs[co] = NULL;
+ ap = p->inchain[co]; /* fetch next chain link before clearing it */
+ p->inchain[co].ss = NULL; /* paranoia */
+ }
+ ss->ins.ss = NULL;
+
+ /* take it off the inarc chains of the ssets reached by its outarcs */
+ for (i = 0; i < d->ncolors; i++)
+ {
+ p = ss->outs[i];
+ assert(p != ss); /* not self-referential */
+ if (p == NULL)
+ continue; /* NOTE CONTINUE */
+ FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets)));
+ if (p->ins.ss == ss && p->ins.co == i)
+ p->ins = ss->inchain[i]; /* our arc heads the chain: just pop it */
+ else
+ {
+ struct arcp lastap = {NULL, 0}; /* chain element preceding ap */
+
+ assert(p->ins.ss != NULL);
+ for (ap = p->ins; ap.ss != NULL &&
+ !(ap.ss == ss && ap.co == i);
+ ap = ap.ss->inchain[ap.co])
+ lastap = ap;
+ assert(ap.ss != NULL);
+ lastap.ss->inchain[lastap.co] = ss->inchain[i]; /* splice our arc out */
+ }
+ ss->outs[i] = NULL;
+ ss->inchain[i].ss = NULL;
+ }
+
+ /* if ss was a success state, may need to remember location */
+ if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
+ (d->lastpost == NULL || d->lastpost < ss->lastseen))
+ d->lastpost = ss->lastseen;
+
+ /* likewise for a no-progress state */
+ if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
+ (d->lastnopr == NULL || d->lastnopr < ss->lastseen))
+ d->lastnopr = ss->lastseen;
+
+ return ss;
+}
+
+/*
+ * pickss - pick the next stateset to be used
+ *
+ * While the cache has unused slots, the next one is initialized and
+ * returned.  Once it is full, an unlocked entry that has not been seen
+ * recently is chosen for replacement, scanning circularly from d->search.
+ * Returns NULL (with REG_ASSERT recorded) if no entry qualifies.
+ */
+static struct sset *
+pickss(struct vars *v,
+       struct dfa *d,
+       chr *cp,
+       chr *start)
+{
+    struct sset *ss;
+    struct sset *from;
+    struct sset *upto;
+    chr        *ancient;
+    int         slot;
+    int         c;
+    int         pass;
+
+    /* shortcut for cases where cache isn't full */
+    if (d->nssused < d->nssets)
+    {
+        slot = d->nssused;
+        d->nssused++;
+        ss = &d->ssets[slot];
+        FDEBUG(("new c%d\n", slot));
+        /* set up innards */
+        ss->states = &d->statesarea[slot * d->wordsper];
+        ss->flags = 0;
+        ss->ins.ss = NULL;
+        ss->ins.co = WHITE;     /* give it some value */
+        ss->outs = &d->outsarea[slot * d->ncolors];
+        ss->inchain = &d->incarea[slot * d->ncolors];
+        for (c = 0; c < d->ncolors; c++)
+        {
+            ss->outs[c] = NULL;
+            ss->inchain[c].ss = NULL;
+        }
+        return ss;
+    }
+
+    /* look for oldest, or old enough anyway: oldest 33% are expendable */
+    ancient = start;
+    if (cp - start > d->nssets * 2 / 3)
+        ancient = cp - d->nssets * 2 / 3;
+
+    /* pass 0 scans [search, end of cache); pass 1 wraps to [ssets, search) */
+    from = d->search;
+    upto = &d->ssets[d->nssets];
+    for (pass = 0; pass < 2; pass++)
+    {
+        for (ss = from; ss < upto; ss++)
+        {
+            if (ss->flags & LOCKED)
+                continue;
+            if (ss->lastseen != NULL && ss->lastseen >= ancient)
+                continue;       /* seen too recently */
+            d->search = ss + 1;
+            FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
+            return ss;
+        }
+        from = d->ssets;
+        upto = d->search;
+    }
+
+    /* nobody's old enough?!? -- something's really wrong */
+    FDEBUG(("cannot find victim to replace!\n"));
+    ERR(REG_ASSERT);
+    return NULL;
+}
diff --git a/src/backend/regex/regerror.c b/src/backend/regex/regerror.c
new file mode 100644
index 0000000..4a27c25
--- /dev/null
+++ b/src/backend/regex/regerror.c
@@ -0,0 +1,120 @@
+/*
+ * regerror - error-code expansion
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results. The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regerror.c
+ *
+ */
+
+#include "regex/regguts.h"
+
+/* unknown-error explanation; used as a printf format that takes the code */
+static const char unk[] = "*** unknown regex error code 0x%x ***";
+
+/*
+ * struct to map among codes, code names, and explanations
+ *
+ * The table ends with an entry whose code is negative; lookups in
+ * pg_regerror() scan while code >= 0.
+ */
+static const struct rerr
+{
+ int code; /* error code; negative marks the end of the table */
+ const char *name; /* code name, as matched/returned for REG_ATOI/REG_ITOA */
+ const char *explain; /* explanation text returned for a normal error code */
+} rerrs[] =
+
+{
+ /* the actual table is built from regex.h */
+#include "regex/regerrs.h" /* pgrminclude ignore */
+ {
+ -1, "", "oops"
+ }, /* explanation special-cased in code */
+};
+
+/*
+ * pg_regerror - the interface to error numbers
+ *
+ * Produces a text form of errcode in errbuf, truncated if necessary to fit
+ * in errbuf_size bytes (and always NUL-terminated when errbuf_size > 0).
+ * Returns the space the untruncated text needs, including the NUL.
+ *
+ * Two special values of errcode use errbuf as input as well as output:
+ * REG_ATOI looks up the error name found in errbuf and yields its code as
+ * a decimal string ("-1" if the name is unknown); REG_ITOA reads a decimal
+ * code from errbuf and yields its name ("REG_<number>" if unknown).
+ *
+ * convbuf is comfortably large for every format used here; snprintf is
+ * used anyway so that a later format change cannot overrun it.
+ */
+/* ARGSUSED */
+size_t                          /* actual space needed (including NUL) */
+pg_regerror(int errcode,        /* error code, or REG_ATOI or REG_ITOA */
+            const regex_t *preg,    /* associated regex_t (unused at present) */
+            char *errbuf,       /* result buffer (unless errbuf_size==0) */
+            size_t errbuf_size) /* available space in errbuf, can be 0 */
+{
+    const struct rerr *r;
+    const char *msg;
+    char        convbuf[sizeof(unk) + 50];  /* 50 = plenty for int */
+    size_t      len;
+    int         icode;
+
+    switch (errcode)
+    {
+        case REG_ATOI:          /* convert name to number */
+            for (r = rerrs; r->code >= 0; r++)
+                if (strcmp(r->name, errbuf) == 0)
+                    break;
+            /* r is at the sentinel if nothing matched: -1 for unknown */
+            snprintf(convbuf, sizeof(convbuf), "%d", r->code);
+            msg = convbuf;
+            break;
+        case REG_ITOA:          /* convert number to name */
+            icode = atoi(errbuf);   /* not our problem if this fails */
+            for (r = rerrs; r->code >= 0; r++)
+                if (r->code == icode)
+                    break;
+            if (r->code >= 0)
+                msg = r->name;
+            else
+            {                   /* unknown; tell him the number */
+                snprintf(convbuf, sizeof(convbuf), "REG_%u", (unsigned) icode);
+                msg = convbuf;
+            }
+            break;
+        default:                /* a real, normal error code */
+            for (r = rerrs; r->code >= 0; r++)
+                if (r->code == errcode)
+                    break;
+            if (r->code >= 0)
+                msg = r->explain;
+            else
+            {                   /* unknown; say so */
+                snprintf(convbuf, sizeof(convbuf), unk, errcode);
+                msg = convbuf;
+            }
+            break;
+    }
+
+    len = strlen(msg) + 1;      /* space needed, including NUL */
+    if (errbuf_size > 0)
+    {
+        if (errbuf_size > len)
+            strcpy(errbuf, msg);
+        else
+        {                       /* truncate to fit */
+            memcpy(errbuf, msg, errbuf_size - 1);
+            errbuf[errbuf_size - 1] = '\0';
+        }
+    }
+
+    return len;
+}
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
new file mode 100644
index 0000000..9271544
--- /dev/null
+++ b/src/backend/regex/regexec.c
@@ -0,0 +1,1506 @@
+/*
+ * re_*exec and friends - match REs
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results. The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regexec.c
+ *
+ */
+
+#include "regex/regguts.h"
+
+
+
+/* lazy-DFA representation */
+struct arcp
+{ /* "pointer" to an outarc */
+ struct sset *ss; /* state set the outarc leaves from; NULL means no arc */
+ color co; /* color selecting the arc within ss->outs[] / ss->inchain[] */
+};
+
+struct sset
+{ /* state set */
+ unsigned *states; /* pointer to bitvector */
+ unsigned hash; /* hash of bitvector */
+#define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw)) /* a one-word vector is its own hash */
+#define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
+ memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0)) /* does ss hold bitvector bv, whose hash is h? */
+ int flags; /* OR of the bits defined below */
+#define STARTER 01 /* the initial state set */
+#define POSTSTATE 02 /* includes the goal state */
+#define LOCKED 04 /* locked in cache */
+#define NOPROGRESS 010 /* zero-progress state set */
+ struct arcp ins; /* chain of inarcs pointing here */
+ chr *lastseen; /* last entered on arrival here */
+ struct sset **outs; /* outarc vector indexed by color */
+ struct arcp *inchain; /* chain-pointer vector for outarcs */
+};
+
+struct dfa
+{
+ int nssets; /* size of cache */
+ int nssused; /* how many entries occupied yet */
+ int nstates; /* number of states */
+ int ncolors; /* length of outarc and inchain vectors */
+ int wordsper; /* length of state-set bitvectors */
+ struct sset *ssets; /* state-set cache */
+ unsigned *statesarea; /* bitvector storage */
+ unsigned *work; /* pointer to work area within statesarea */
+ struct sset **outsarea; /* outarc-vector storage */
+ struct arcp *incarea; /* inchain storage */
+ struct cnfa *cnfa; /* the compact NFA this DFA was set up for */
+ struct colormap *cm; /* the colormap passed to newdfa() along with it */
+ chr *lastpost; /* location of last cache-flushed success */
+ chr *lastnopr; /* location of last cache-flushed NOPROGRESS */
+ struct sset *search; /* replacement-search-pointer memory */
+ int backno; /* if DFA for a backref, subno it refers to */
+ short backmin; /* min repetitions for backref */
+ short backmax; /* max repetitions for backref */
+ bool ismalloced; /* should this struct dfa be freed? */
+ bool arraysmalloced; /* should its subsidiary arrays be freed? */
+};
+
+#define WORK 1 /* number of work bitvectors needed */
+
+/* setup for non-malloc allocation for small cases */
+#define FEWSTATES 20 /* must be less than UBITS */
+#define FEWCOLORS 15
+struct smalldfa
+{
+ struct dfa dfa; /* must be first */
+ struct sset ssets[FEWSTATES * 2]; /* storage that dfa.ssets points at */
+ unsigned statesarea[FEWSTATES * 2 + WORK]; /* one word per state set, plus the work vector(s) */
+ struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS]; /* storage that dfa.outsarea points at */
+ struct arcp incarea[FEWSTATES * 2 * FEWCOLORS]; /* storage that dfa.incarea points at */
+};
+
+#define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */
+
+
+
+/* internal variables, bundled for easy passing around */
+struct vars
+{
+ regex_t *re; /* the compiled regex being executed */
+ struct guts *g; /* its internals (re->re_guts) */
+ int eflags; /* copies of arguments */
+ size_t nmatch;
+ regmatch_t *pmatch;
+ rm_detail_t *details;
+ chr *start; /* start of string */
+ chr *search_start; /* search start of string */
+ chr *stop; /* just past end of string */
+ int err; /* error code if any (0 none) */
+ struct dfa **subdfas; /* per-tree-subre DFAs */
+ struct dfa **ladfas; /* per-lacon-subre DFAs */
+ struct sset **lblastcss; /* per-lacon-subre lookbehind restart data */
+ chr **lblastcp; /* per-lacon-subre lookbehind restart data */
+ struct smalldfa dfa1; /* preallocated small-DFA space, handed to newdfa() by find() and cfind() */
+ struct smalldfa dfa2; /* second such space, handed to newdfa() by cfind() */
+};
+
+#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
+#define ISERR() VISERR(v) /* same, using the variable named v in scope */
+#define VERR(vv,e) ((vv)->err = ((vv)->err ? (vv)->err : (e))) /* set err to e unless an error is already recorded */
+#define ERR(e) VERR(v, e) /* record an error */
+#define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */
+#define OFF(p) ((p) - v->start) /* offset of p from the start of the string */
+#define LOFF(p) ((long)OFF(p)) /* the same offset as a long, e.g. for %ld in debug output */
+
+
+
+/*
+ * forward declarations
+ *
+ * Everything here is static, so the functions listed under rege_dfa.c must
+ * end up in this same translation unit (presumably rege_dfa.c is #included
+ * later in this file -- not visible from here, confirm).
+ */
+/* === regexec.c === */
+static struct dfa *getsubdfa(struct vars *, struct subre *);
+static struct dfa *getladfa(struct vars *, int);
+static int find(struct vars *, struct cnfa *, struct colormap *);
+static int cfind(struct vars *, struct cnfa *, struct colormap *);
+static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
+static void zapallsubs(regmatch_t *, size_t);
+static void zaptreesubs(struct vars *, struct subre *);
+static void subset(struct vars *, struct subre *, chr *, chr *);
+static int cdissect(struct vars *, struct subre *, chr *, chr *);
+static int ccondissect(struct vars *, struct subre *, chr *, chr *);
+static int crevcondissect(struct vars *, struct subre *, chr *, chr *);
+static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
+static int caltdissect(struct vars *, struct subre *, chr *, chr *);
+static int citerdissect(struct vars *, struct subre *, chr *, chr *);
+static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
+
+/* === rege_dfa.c === */
+static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
+static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
+static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **);
+static chr *dfa_backref(struct vars *, struct dfa *, chr *, chr *, chr *, bool);
+static chr *lastcold(struct vars *, struct dfa *);
+static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
+static void freedfa(struct dfa *);
+static unsigned hash(unsigned *, int);
+static struct sset *initialize(struct vars *, struct dfa *, chr *);
+static struct sset *miss(struct vars *, struct dfa *, struct sset *, color, chr *, chr *);
+static int lacon(struct vars *, struct cnfa *, chr *, color);
+static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *);
+static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
+
+
+/*
+ * pg_regexec - match regular expression
+ *
+ * re is the compiled regex; string/len give the data to search, and
+ * search_start is the offset within it at which searching begins.
+ * details must be non-NULL if the regex was compiled with REG_EXPECT.
+ * pmatch[] has room for nmatch entries; entries beyond those the regex
+ * can set are filled with -1. flags are the execution flags.
+ *
+ * Returns REG_OKAY on a match and REG_NOMATCH on none. Error results
+ * include REG_INVARG (bad arguments), REG_MIXED (chr size mismatch) and
+ * REG_ESPACE (out of memory), plus whatever find()/cfind() report.
+ *
+ * When the regex has backrefs and the caller's array is too small to hold
+ * every subexpression, matching is done in a larger private work array and
+ * the leading nmatch entries are copied back on success.
+ */
+int
+pg_regexec(regex_t *re,
+ const chr *string,
+ size_t len,
+ size_t search_start,
+ rm_detail_t *details,
+ size_t nmatch,
+ regmatch_t pmatch[],
+ int flags)
+{
+ struct vars var;
+ register struct vars *v = &var;
+ int st; /* result code to return */
+ size_t n;
+ size_t i;
+ int backref; /* 1 if the regex has the REG_UBACKREF info bit */
+
+#define LOCALMAT 20
+ regmatch_t mat[LOCALMAT]; /* on-stack work area, used when it is big enough */
+
+#define LOCALDFAS 40
+ struct dfa *subdfas[LOCALDFAS]; /* on-stack subdfas array, used when ntree fits */
+
+ /* sanity checks */
+ if (re == NULL || string == NULL || re->re_magic != REMAGIC)
+ return REG_INVARG;
+ if (re->re_csize != sizeof(chr))
+ return REG_MIXED;
+ if (search_start > len)
+ return REG_NOMATCH;
+
+ /* Initialize locale-dependent support */
+ pg_set_regex_collation(re->re_collation);
+
+ /* setup */
+ v->re = re;
+ v->g = (struct guts *) re->re_guts;
+ if ((v->g->cflags & REG_EXPECT) && details == NULL)
+ return REG_INVARG;
+ if (v->g->info & REG_UIMPOSSIBLE)
+ return REG_NOMATCH;
+ backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
+ v->eflags = flags;
+ if (backref && nmatch <= v->g->nsub)
+ {
+ /* need larger work area */
+ v->nmatch = v->g->nsub + 1;
+ if (v->nmatch <= LOCALMAT)
+ v->pmatch = mat;
+ else
+ v->pmatch = (regmatch_t *) MALLOC(v->nmatch * sizeof(regmatch_t));
+ if (v->pmatch == NULL)
+ return REG_ESPACE;
+ zapallsubs(v->pmatch, v->nmatch);
+ }
+ else
+ {
+ /* we can store results directly in caller's array */
+ v->pmatch = pmatch;
+ /* ensure any extra entries in caller's array are filled with -1 */
+ if (nmatch > 0)
+ zapallsubs(pmatch, nmatch);
+ /* then forget about extra entries, to avoid useless work in find() */
+ if (nmatch > v->g->nsub + 1)
+ nmatch = v->g->nsub + 1;
+ v->nmatch = nmatch;
+ }
+ v->details = details;
+ v->start = (chr *) string;
+ v->search_start = (chr *) string + search_start;
+ v->stop = (chr *) string + len;
+ v->err = 0;
+ v->subdfas = NULL;
+ v->ladfas = NULL;
+ v->lblastcss = NULL;
+ v->lblastcp = NULL;
+ /* below this point, "goto cleanup" will behave sanely */
+
+ assert(v->g->ntree >= 0);
+ n = (size_t) v->g->ntree;
+ if (n <= LOCALDFAS)
+ v->subdfas = subdfas;
+ else
+ {
+ v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
+ if (v->subdfas == NULL)
+ {
+ st = REG_ESPACE;
+ goto cleanup;
+ }
+ }
+ for (i = 0; i < n; i++)
+ v->subdfas[i] = NULL; /* DFAs are created on demand by getsubdfa() */
+
+ assert(v->g->nlacons >= 0);
+ n = (size_t) v->g->nlacons;
+ if (n > 0)
+ {
+ v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
+ if (v->ladfas == NULL)
+ {
+ st = REG_ESPACE;
+ goto cleanup;
+ }
+ for (i = 0; i < n; i++)
+ v->ladfas[i] = NULL; /* likewise created on demand by getladfa() */
+ v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
+ v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
+ if (v->lblastcss == NULL || v->lblastcp == NULL)
+ {
+ st = REG_ESPACE;
+ goto cleanup;
+ }
+ for (i = 0; i < n; i++)
+ {
+ v->lblastcss[i] = NULL;
+ v->lblastcp[i] = NULL;
+ }
+ }
+
+ /* do it */
+ assert(v->g->tree != NULL);
+ if (backref)
+ st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
+ else
+ st = find(v, &v->g->tree->cnfa, &v->g->cmap);
+
+ /* on success, ensure caller's match vector is filled correctly */
+ if (st == REG_OKAY && nmatch > 0)
+ {
+ if (v->pmatch != pmatch)
+ {
+ /* copy portion of match vector over from (larger) work area */
+ assert(nmatch <= v->nmatch);
+ memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t));
+ }
+ if (v->g->cflags & REG_NOSUB)
+ {
+ /* don't expose possibly-partial sub-match results to caller */
+ zapallsubs(pmatch, nmatch);
+ }
+ }
+
+ /* clean up */
+cleanup:
+ if (v->pmatch != pmatch && v->pmatch != mat) /* i.e. the work area was malloc'd */
+ FREE(v->pmatch);
+ if (v->subdfas != NULL)
+ {
+ n = (size_t) v->g->ntree;
+ for (i = 0; i < n; i++)
+ {
+ if (v->subdfas[i] != NULL)
+ freedfa(v->subdfas[i]);
+ }
+ if (v->subdfas != subdfas) /* free the array only if it was malloc'd */
+ FREE(v->subdfas);
+ }
+ if (v->ladfas != NULL)
+ {
+ n = (size_t) v->g->nlacons;
+ for (i = 0; i < n; i++)
+ {
+ if (v->ladfas[i] != NULL)
+ freedfa(v->ladfas[i]);
+ }
+ FREE(v->ladfas);
+ }
+ if (v->lblastcss != NULL)
+ FREE(v->lblastcss);
+ if (v->lblastcp != NULL)
+ FREE(v->lblastcp);
+
+#ifdef REG_DEBUG
+ if (v->eflags & (REG_FTRACE | REG_MTRACE))
+ fflush(stdout);
+#endif
+
+ return st;
+}
+
+/*
+ * getsubdfa - create or re-fetch the DFA for a tree subre node
+ *
+ * DFAs are cached in v->subdfas[], indexed by the node's id, so each one is
+ * built at most once per overall regex execution.  The cleanup step in
+ * pg_regexec() frees them.  Returns NULL if the DFA cannot be created.
+ */
+static struct dfa *
+getsubdfa(struct vars *v,
+          struct subre *t)
+{
+    struct dfa *made;
+
+    /* already built earlier in this execution? */
+    if (v->subdfas[t->id] != NULL)
+        return v->subdfas[t->id];
+
+    made = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
+    if (made == NULL)
+        return NULL;
+
+    /* a backref node's DFA also carries the backref parameters */
+    if (t->op == 'b')
+    {
+        made->backno = t->backno;
+        made->backmin = t->min;
+        made->backmax = t->max;
+    }
+
+    v->subdfas[t->id] = made;
+    return made;
+}
+
+/*
+ * getladfa - create or re-fetch the DFA for a LACON subre node
+ *
+ * Like getsubdfa(), but the cache is v->ladfas[] and n is the index of the
+ * constraint in v->g->lacons[].  Returns NULL if the DFA cannot be created.
+ */
+static struct dfa *
+getladfa(struct vars *v,
+         int n)
+{
+    struct dfa **slot;
+
+    assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
+
+    slot = &v->ladfas[n];
+    if (*slot == NULL)
+    {
+        /* a LACON can't contain a backref, so plain newdfa() setup suffices */
+        *slot = newdfa(v, &v->g->lacons[n].cnfa, &v->g->cmap, DOMALLOC);
+    }
+    return *slot;
+}
+
+/*
+ * find - find a match for the main NFA (no-complications case)
+ *
+ * First runs the search RE (v->g->search) with shortest() to learn whether
+ * any match exists and to bound the range of possible match start points.
+ * Then each start point in that range is tried with a DFA for cnfa until
+ * one yields a match. Sub-matches are filled in by cdissect() only when
+ * v->nmatch > 1.
+ *
+ * Returns REG_OKAY, REG_NOMATCH, or an error code (v->err, or whatever
+ * cdissect() returns).
+ */
+static int
+find(struct vars *v,
+ struct cnfa *cnfa,
+ struct colormap *cm)
+{
+ struct dfa *s; /* DFA for the search RE */
+ struct dfa *d; /* DFA for cnfa */
+ chr *begin; /* match start point being tried */
+ chr *end = NULL; /* match end point, or NULL if none found yet */
+ chr *cold; /* position reported as details->rm_extend.rm_so */
+ chr *open; /* open and close of range of possible starts */
+ chr *close;
+ int hitend; /* filled in by shortest()/longest() */
+ int shorter = (v->g->tree->flags & SHORTER) ? 1 : 0;
+
+ /* first, a shot with the search RE */
+ s = newdfa(v, &v->g->search, cm, &v->dfa1);
+ if (s == NULL)
+ return v->err;
+ MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
+ cold = NULL;
+ close = shortest(v, s, v->search_start, v->search_start, v->stop,
+ &cold, (int *) NULL);
+ freedfa(s); /* done with it; this also lets v->dfa1 be reused below */
+ NOERR();
+ if (v->g->cflags & REG_EXPECT)
+ {
+ assert(v->details != NULL);
+ if (cold != NULL)
+ v->details->rm_extend.rm_so = OFF(cold);
+ else
+ v->details->rm_extend.rm_so = OFF(v->stop);
+ v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
+ }
+ if (close == NULL) /* not found */
+ return REG_NOMATCH;
+ if (v->nmatch == 0) /* found, don't need exact location */
+ return REG_OKAY;
+
+ /* find starting point and match */
+ assert(cold != NULL);
+ open = cold;
+ cold = NULL;
+ MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
+ d = newdfa(v, cnfa, cm, &v->dfa1);
+ if (d == NULL)
+ return v->err;
+ for (begin = open; begin <= close; begin++)
+ {
+ MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
+ if (shorter)
+ end = shortest(v, d, begin, begin, v->stop,
+ (chr **) NULL, &hitend);
+ else
+ end = longest(v, d, begin, v->stop, &hitend);
+ if (ISERR())
+ {
+ freedfa(d);
+ return v->err;
+ }
+ if (hitend && cold == NULL)
+ cold = begin; /* remember the first start point for which hitend was set */
+ if (end != NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ assert(end != NULL); /* search RE succeeded so loop should */
+ freedfa(d);
+
+ /* and pin down details */
+ assert(v->nmatch > 0);
+ v->pmatch[0].rm_so = OFF(begin);
+ v->pmatch[0].rm_eo = OFF(end);
+ if (v->g->cflags & REG_EXPECT)
+ {
+ if (cold != NULL)
+ v->details->rm_extend.rm_so = OFF(cold);
+ else
+ v->details->rm_extend.rm_so = OFF(v->stop);
+ v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
+ }
+ if (v->nmatch == 1) /* no need for submatches */
+ return REG_OKAY;
+
+ /* find submatches */
+ return cdissect(v, v->g->tree, begin, end);
+}
+
+/*
+ * cfind - find a match for the main NFA (with complications)
+ *
+ * Sets up one DFA for the search RE and one for cnfa, lets cfindloop() do
+ * the matching, then frees both.  If the regex was compiled with
+ * REG_EXPECT, the rm_extend detail is filled in from the coldstart
+ * position that cfindloop() reports.  Returns cfindloop()'s result, or
+ * v->err if an error has been recorded.
+ */
+static int
+cfind(struct vars *v,
+      struct cnfa *cnfa,
+      struct colormap *cm)
+{
+    struct dfa *searchdfa;
+    struct dfa *maindfa;
+    chr        *coldstart;
+    int         result;
+
+    searchdfa = newdfa(v, &v->g->search, cm, &v->dfa1);
+    if (searchdfa == NULL)
+        return v->err;
+
+    maindfa = newdfa(v, cnfa, cm, &v->dfa2);
+    if (maindfa == NULL)
+    {
+        freedfa(searchdfa);
+        return v->err;
+    }
+
+    result = cfindloop(v, cnfa, cm, maindfa, searchdfa, &coldstart);
+
+    freedfa(maindfa);
+    freedfa(searchdfa);
+    if (ISERR())
+        return v->err;
+
+    /* nothing more to report unless the caller asked for details */
+    if (!(v->g->cflags & REG_EXPECT))
+        return result;
+
+    assert(v->details != NULL);
+    v->details->rm_extend.rm_so = OFF((coldstart != NULL) ? coldstart : v->stop);
+    v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
+    return result;
+}
+
+/*
+ * cfindloop - the heart of cfind: scan for a match that also passes cdissect
+ */
+static int
+cfindloop(struct vars *v,
+ struct cnfa *cnfa,
+ struct colormap *cm,
+ struct dfa *d, /* DFA for the detailed RE */
+ struct dfa *s, /* DFA for the search RE */
+ chr **coldp) /* where to put coldstart pointer */
+{
+ chr *begin; /* match start point currently being tried */
+ chr *end; /* tentative match end found by DFA d */
+ chr *cold; /* coldstart pointer, stored into *coldp on every exit */
+ chr *open; /* open and close of range of possible starts */
+ chr *close;
+ chr *estart; /* lower bound on match end (passed to shortest) */
+ chr *estop; /* upper bound on match end */
+ int er; /* result code from cdissect */
+ int shorter = v->g->tree->flags & SHORTER; /* nonzero: use shortest() */
+ int hitend; /* flag filled in by shortest()/longest() */
+
+ assert(d != NULL && s != NULL);
+ cold = NULL;
+ close = v->search_start;
+ do
+ {
+ /* Search with the search RE for match range at/beyond "close" */
+ MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
+ close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
+ if (ISERR())
+ {
+ *coldp = cold;
+ return v->err;
+ }
+ if (close == NULL)
+ break; /* no more possible match anywhere */
+ assert(cold != NULL);
+ open = cold;
+ cold = NULL;
+ /* Search for matches starting between "open" and "close" inclusive */
+ MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
+ for (begin = open; begin <= close; begin++)
+ {
+ MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
+ estart = begin;
+ estop = v->stop;
+ for (;;)
+ {
+ /* Here we use the top node's detailed RE */
+ if (shorter)
+ end = shortest(v, d, begin, estart,
+ estop, (chr **) NULL, &hitend);
+ else
+ end = longest(v, d, begin, estop,
+ &hitend);
+ if (ISERR())
+ {
+ *coldp = cold;
+ return v->err;
+ }
+ if (hitend && cold == NULL)
+ cold = begin; /* remember first begin point for which hitend was set */
+ if (end == NULL)
+ break; /* no match with this begin point, try next */
+ MDEBUG(("tentative end %ld\n", LOFF(end)));
+ /* Dissect the potential match to see if it really matches */
+ er = cdissect(v, v->g->tree, begin, end);
+ if (er == REG_OKAY)
+ {
+ if (v->nmatch > 0)
+ {
+ v->pmatch[0].rm_so = OFF(begin);
+ v->pmatch[0].rm_eo = OFF(end);
+ }
+ *coldp = cold;
+ return REG_OKAY;
+ }
+ if (er != REG_NOMATCH)
+ {
+ ERR(er);
+ *coldp = cold;
+ return er;
+ }
+ /* Try next longer/shorter match with same begin point */
+ if (shorter)
+ {
+ if (end == estop)
+ break; /* no more, so try next begin point */
+ estart = end + 1;
+ }
+ else
+ {
+ if (end == begin)
+ break; /* no more, so try next begin point */
+ estop = end - 1;
+ }
+ } /* end loop over endpoint positions */
+ } /* end loop over beginning positions */
+
+ /*
+ * If we get here, there is no possible match starting at or before
+ * "close", so consider matches beyond that. We'll do a fresh search
+ * with the search RE to find a new promising match range.
+ */
+ close++;
+ } while (close < v->stop);
+
+ *coldp = cold;
+ return REG_NOMATCH;
+}
+
+/*
+ * zapallsubs - initialize all subexpression matches to "no match"
+ *
+ * Sets p[1..n-1] to -1/-1; p[0], the overall match, is untouched; n <= 1 is a no-op.
+ */
+static void
+zapallsubs(regmatch_t *p,
+           size_t n)
+{
+    size_t      i;
+
+    for (i = 1; i < n; i++)     /* counting up: n - 1 would wrap when n == 0 */
+    {
+        p[i].rm_so = -1;
+        p[i].rm_eo = -1;
+    }
+}
+
+/*
+ * zaptreesubs - reset capture slots of t and all its descendants to "no match"
+ */
+static void
+zaptreesubs(struct vars *v,
+            struct subre *t)
+{
+    struct subre *kid = t->child;
+    int         capno = t->capno;
+
+    if (capno > 0 && (size_t) capno < v->nmatch)   /* capture with a slot */
+    {
+        v->pmatch[capno].rm_so = -1;
+        v->pmatch[capno].rm_eo = -1;
+    }
+
+    while (kid != NULL)
+    {
+        zaptreesubs(v, kid);
+        kid = kid->sibling;
+    }
+}
+
+/*
+ * subset - record begin..end as the match location of capture sub->capno
+ */
+static void
+subset(struct vars *v,
+       struct subre *sub,
+       chr *begin,
+       chr *end)
+{
+    int         capno = sub->capno;
+
+    assert(capno > 0);
+    if ((size_t) capno < v->nmatch) /* otherwise there is no slot to fill */
+    {
+        MDEBUG(("%d: setting %d = %ld-%ld\n", sub->id, capno, LOFF(begin), LOFF(end)));
+        v->pmatch[capno].rm_so = OFF(begin);
+        v->pmatch[capno].rm_eo = OFF(end);
+    }
+}
+
+/*
+ * cdissect - check backrefs and determine subexpression matches
+ *
+ * cdissect recursively processes a subre tree to check matching of backrefs
+ * and/or identify submatch boundaries for capture nodes. The proposed match
+ * runs from "begin" to "end" (not including "end"), and we are basically
+ * "dissecting" it to see where the submatches are.
+ *
+ * Before calling any level of cdissect, the caller must have run the node's
+ * DFA and found that the proposed substring satisfies the DFA. (We make
+ * the caller do that because in concatenation and iteration nodes, it's
+ * much faster to check all the substrings against the child DFAs before we
+ * recurse.)
+ *
+ * A side-effect of a successful match is to save match locations for
+ * capturing subexpressions in v->pmatch[]. This is a little bit tricky,
+ * so we make the following rules:
+ * 1. Before initial entry to cdissect, all match data must have been
+ * cleared (this is seen to by zapallsubs).
+ * 2. Before any recursive entry to cdissect, the match data for that
+ * subexpression tree must be guaranteed clear (see zaptreesubs).
+ * 3. When returning REG_OKAY, each level of cdissect will have saved
+ * any relevant match locations.
+ * 4. When returning REG_NOMATCH, each level of cdissect will guarantee
+ * that its subexpression match locations are again clear.
+ * 5. No guarantees are made for error cases (i.e., other result codes).
+ * 6. When a level of cdissect abandons a successful sub-match, it will
+ * clear that subtree's match locations with zaptreesubs before trying
+ * any new DFA match or cdissect call for that subtree or any subtree
+ * to its right (that is, any subtree that could have a backref into the
+ * abandoned match).
+ * This may seem overly complicated, but it's difficult to simplify it
+ * because of the provision that match locations must be reset before
+ * any fresh DFA match (a rule that is needed to make dfa_backref safe).
+ * That means it won't work to just reset relevant match locations at the
+ * start of each cdissect level.
+ */
+static int                      /* regexec return code */
+cdissect(struct vars *v,
+         struct subre *t,
+         chr *begin,            /* beginning of relevant substring */
+         chr *end)              /* end of same */
+{
+    int         status;
+
+    assert(t != NULL);
+    MDEBUG(("%d: cdissect %c %ld-%ld\n", t->id, t->op, LOFF(begin), LOFF(end)));
+
+    /* every recursion level polls for a cancel request ... */
+    if (CANCEL_REQUESTED(v->re))
+        return REG_CANCEL;
+    /* ... and for excessive stack depth */
+    if (STACK_TOO_DEEP(v->re))
+        return REG_ETOOBIG;
+
+    switch (t->op)
+    {
+        case '=':               /* terminal node */
+            assert(t->child == NULL);
+            status = REG_OKAY;  /* nothing to do: parent did the work */
+            break;
+        case 'b':               /* back reference */
+            assert(t->child == NULL);
+            status = cbrdissect(v, t, begin, end);
+            break;
+        case '.':               /* concatenation */
+            assert(t->child != NULL);
+            /* a SHORTER first child calls for the reverse scan */
+            status = (t->child->flags & SHORTER) ?
+                crevcondissect(v, t, begin, end) :
+                ccondissect(v, t, begin, end);
+            break;
+        case '|':               /* alternation */
+            assert(t->child != NULL);
+            status = caltdissect(v, t, begin, end);
+            break;
+        case '*':               /* iteration */
+            assert(t->child != NULL);
+            /* a SHORTER child calls for the reverse scan */
+            status = (t->child->flags & SHORTER) ?
+                creviterdissect(v, t, begin, end) :
+                citerdissect(v, t, begin, end);
+            break;
+        case '(':               /* no-op capture node */
+            assert(t->child != NULL);
+            status = cdissect(v, t->child, begin, end);
+            break;
+        default:
+            status = REG_ASSERT;
+            break;
+    }
+
+    /*
+     * A match failure is legitimate only when backrefs lurk below this
+     * node.  Otherwise the caller failed to check the DFA, or the DFA and
+     * the node's innards are inconsistent.
+     */
+    assert(status != REG_NOMATCH || (t->flags & BACKR));
+
+    /*
+     * A node marked as capturing saves the location of a successful match.
+     */
+    if (status == REG_OKAY && t->capno > 0)
+        subset(v, t, begin, end);
+
+    return status;
+}
+
+/*
+ * ccondissect - dissect match for concatenation node, longest midpoint first
+ */
+static int /* regexec return code */
+ccondissect(struct vars *v,
+ struct subre *t,
+ chr *begin, /* beginning of relevant substring */
+ chr *end) /* end of same */
+{
+ struct subre *left = t->child; /* first operand */
+ struct subre *right = left->sibling; /* second and last operand */
+ struct dfa *d; /* DFA for left */
+ struct dfa *d2; /* DFA for right */
+ chr *mid; /* tentative split: left gets begin..mid, right mid..end */
+ int er; /* result code from recursive cdissect calls */
+
+ assert(t->op == '.');
+ assert(left != NULL && left->cnfa.nstates > 0);
+ assert(right != NULL && right->cnfa.nstates > 0);
+ assert(right->sibling == NULL);
+ assert(!(left->flags & SHORTER));
+
+ d = getsubdfa(v, left);
+ NOERR();
+ d2 = getsubdfa(v, right);
+ NOERR();
+ MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
+
+ /* pick a tentative midpoint: the longest match of left within begin..end */
+ mid = longest(v, d, begin, end, (int *) NULL);
+ NOERR();
+ if (mid == NULL)
+ return REG_NOMATCH;
+ MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
+
+ /* iterate until satisfaction or failure */
+ for (;;)
+ {
+ /* try this midpoint on for size: right's DFA must reach exactly "end" */
+ if (longest(v, d2, mid, end, (int *) NULL) == end)
+ {
+ er = cdissect(v, left, begin, mid);
+ if (er == REG_OKAY)
+ {
+ er = cdissect(v, right, mid, end);
+ if (er == REG_OKAY)
+ {
+ /* satisfaction */
+ MDEBUG(("%d: successful\n", t->id));
+ return REG_OKAY;
+ }
+ /* Reset left's matches (right should have done so itself) */
+ zaptreesubs(v, left);
+ }
+ if (er != REG_NOMATCH)
+ return er; /* hard error from either cdissect call */
+ }
+ NOERR();
+
+ /* that midpoint didn't work, find a new one */
+ if (mid == begin)
+ {
+ /* all possibilities exhausted */
+ MDEBUG(("%d: no midpoint\n", t->id));
+ return REG_NOMATCH;
+ }
+ mid = longest(v, d, begin, mid - 1, (int *) NULL); /* next shorter left match */
+ NOERR();
+ if (mid == NULL)
+ {
+ /* failed to find a new one */
+ MDEBUG(("%d: failed midpoint\n", t->id));
+ return REG_NOMATCH;
+ }
+ MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
+ }
+
+ /* can't get here */
+ return REG_ASSERT;
+}
+
+/*
+ * crevcondissect - dissect match for concatenation node, shortest-first
+ */
+static int /* regexec return code */
+crevcondissect(struct vars *v,
+ struct subre *t,
+ chr *begin, /* beginning of relevant substring */
+ chr *end) /* end of same */
+{
+ struct subre *left = t->child; /* first operand (flagged SHORTER) */
+ struct subre *right = left->sibling; /* second and last operand */
+ struct dfa *d; /* DFA for left */
+ struct dfa *d2; /* DFA for right */
+ chr *mid; /* tentative split: left gets begin..mid, right mid..end */
+ int er; /* result code from recursive cdissect calls */
+
+ assert(t->op == '.');
+ assert(left != NULL && left->cnfa.nstates > 0);
+ assert(right != NULL && right->cnfa.nstates > 0);
+ assert(right->sibling == NULL);
+ assert(left->flags & SHORTER);
+
+ d = getsubdfa(v, left);
+ NOERR();
+ d2 = getsubdfa(v, right);
+ NOERR();
+ MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
+
+ /* pick a tentative midpoint: the shortest match of left within begin..end */
+ mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
+ NOERR();
+ if (mid == NULL)
+ return REG_NOMATCH;
+ MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
+
+ /* iterate until satisfaction or failure */
+ for (;;)
+ {
+ /* try this midpoint on for size: right's DFA must reach exactly "end" */
+ if (longest(v, d2, mid, end, (int *) NULL) == end)
+ {
+ er = cdissect(v, left, begin, mid);
+ if (er == REG_OKAY)
+ {
+ er = cdissect(v, right, mid, end);
+ if (er == REG_OKAY)
+ {
+ /* satisfaction */
+ MDEBUG(("%d: successful\n", t->id));
+ return REG_OKAY;
+ }
+ /* Reset left's matches (right should have done so itself) */
+ zaptreesubs(v, left);
+ }
+ if (er != REG_NOMATCH)
+ return er; /* hard error from either cdissect call */
+ }
+ NOERR();
+
+ /* that midpoint didn't work, find a new one */
+ if (mid == end)
+ {
+ /* all possibilities exhausted */
+ MDEBUG(("%d: no midpoint\n", t->id));
+ return REG_NOMATCH;
+ }
+ mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL); /* next longer */
+ NOERR();
+ if (mid == NULL)
+ {
+ /* failed to find a new one */
+ MDEBUG(("%d: failed midpoint\n", t->id));
+ return REG_NOMATCH;
+ }
+ MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
+ }
+
+ /* can't get here */
+ return REG_ASSERT;
+}
+
+/*
+ * cbrdissect - dissect match for backref node
+ *
+ * Succeeds when begin..end consists of an allowed number (t->min..t->max,
+ * where a max of DUPINF means unbounded) of copies of the text currently
+ * recorded in v->pmatch[] for capture number t->backno.
+ *
+ * dfa_backref() might have verified the match already, but that is not
+ * known for sure, so it is checked here regardless.
+ */
+static int                      /* regexec return code */
+cbrdissect(struct vars *v,
+           struct subre *t,
+           chr *begin,          /* beginning of relevant substring */
+           chr *end)            /* end of same */
+{
+    int         refno = t->backno;
+    int         lo = t->min;
+    int         hi = t->max;
+    size_t      reflen;
+    size_t      targetlen;
+    size_t      reps;
+    chr        *ref;
+    chr        *cursor;
+
+    assert(t != NULL);
+    assert(t->op == 'b');
+    assert(refno >= 0);
+    assert((size_t) refno < v->nmatch);
+
+    MDEBUG(("%d: cbrdissect %d{%d-%d} %ld-%ld\n", t->id, refno, lo, hi,
+            LOFF(begin), LOFF(end)));
+
+    /* get the backreferenced string */
+    /* (a capture still marked "no match" cannot be matched by a backref) */
+    if (v->pmatch[refno].rm_so == -1)
+        return REG_NOMATCH;
+    ref = v->start + v->pmatch[refno].rm_so;
+    reflen = v->pmatch[refno].rm_eo - v->pmatch[refno].rm_so;
+
+    /* special cases for zero-length strings */
+    if (reflen == 0)
+    {
+        /*
+         * only a zero-length target can match, but then any number of
+         * repetitions can be considered to be present
+         */
+        if (begin != end || lo > hi)
+            return REG_NOMATCH;
+        MDEBUG(("%d: backref matched trivially\n", t->id));
+        return REG_OKAY;
+    }
+    if (begin == end)
+    {
+        /* an empty target matches only if zero repetitions are okay */
+        if (lo != 0)
+            return REG_NOMATCH;
+        MDEBUG(("%d: backref matched trivially\n", t->id));
+        return REG_OKAY;
+    }
+
+    /*
+     * the target length must be a whole number of copies of the reference
+     * string, and that number must be an allowed repetition count
+     */
+    assert(end > begin);
+    targetlen = end - begin;
+    if (targetlen % reflen != 0)
+        return REG_NOMATCH;
+    reps = targetlen / reflen;
+    if (reps < lo || (reps > hi && hi != DUPINF))
+        return REG_NOMATCH;
+
+    /* okay, compare the actual string contents, */
+    /* one reference-sized piece of the target at a time */
+    for (cursor = begin; reps > 0; reps--, cursor += reflen)
+    {
+        if ((*v->g->compare) (ref, cursor, reflen) != 0)
+            return REG_NOMATCH;
+    }
+
+    MDEBUG(("%d: backref matched\n", t->id));
+    return REG_OKAY;
+}
+
+/*
+ * caltdissect - dissect match for alternation node
+ *
+ * Alternatives are tried in order; the first whose DFA matches exactly
+ * begin..end and whose cdissect result is not REG_NOMATCH decides the outcome.
+ */
+static int                      /* regexec return code */
+caltdissect(struct vars *v,
+            struct subre *t,
+            chr *begin,         /* beginning of relevant substring */
+            chr *end)           /* end of same */
+{
+    struct subre *alt = t->child;
+    struct dfa *altdfa;
+    int         rc;
+
+    assert(t->op == '|');
+    /* there should be at least 2 alternatives */
+    assert(alt != NULL && alt->sibling != NULL);
+
+    for (; alt != NULL; alt = alt->sibling)
+    {
+        assert(alt->cnfa.nstates > 0);
+
+        MDEBUG(("%d: caltdissect %ld-%ld\n", alt->id, LOFF(begin), LOFF(end)));
+
+        altdfa = getsubdfa(v, alt);
+        NOERR();
+        if (longest(v, altdfa, begin, end, (int *) NULL) == end)
+        {
+            MDEBUG(("%d: caltdissect matched\n", alt->id));
+            rc = cdissect(v, alt, begin, end);
+            if (rc != REG_NOMATCH)
+                return rc;
+        }
+        NOERR();
+    }
+
+    return REG_NOMATCH;
+}
+
+/*
+ * citerdissect - dissect match for iteration node, longest sub-matches first
+ */
+static int /* regexec return code */
+citerdissect(struct vars *v,
+ struct subre *t,
+ chr *begin, /* beginning of relevant substring */
+ chr *end) /* end of same */
+{
+ struct dfa *d; /* DFA for the child node */
+ chr **endpts; /* endpts[k] = end of k'th sub-match; endpts[0] = begin */
+ chr *limit; /* upper bound on the end of the next sub-match to find */
+ int min_matches; /* t->min, raised to 1 if it is not positive */
+ size_t max_matches; /* most sub-matches considered; sizes endpts[] */
+ int nverified; /* sub-matches 1..nverified have passed cdissect */
+ int k; /* number of the sub-match currently being worked on */
+ int i; /* loop index over sub-matches being verified */
+ int er; /* result code from cdissect */
+
+ assert(t->op == '*');
+ assert(t->child != NULL && t->child->cnfa.nstates > 0);
+ assert(!(t->child->flags & SHORTER));
+ assert(begin <= end);
+
+ MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
+
+ /*
+ * For the moment, assume the minimum number of matches is 1. If zero
+ * matches are allowed, and the target string is empty, we are allowed to
+ * match regardless of the contents of the iter node --- but we would
+ * prefer to match once, so that capturing parens get set. (An example of
+ * the concern here is a pattern like "()*\1", which historically this
+ * code has allowed to succeed.) Therefore, we deal with the zero-matches
+ * case at the bottom, after failing to find any other way to match.
+ */
+ min_matches = t->min;
+ if (min_matches <= 0)
+ min_matches = 1;
+
+ /*
+ * We need workspace to track the endpoints of each sub-match. Normally
+ * we consider only nonzero-length sub-matches, so there can be at most
+ * end-begin of them. However, if min is larger than that, we will also
+ * consider zero-length sub-matches in order to find enough matches.
+ *
+ * For convenience, endpts[0] contains the "begin" pointer and we store
+ * sub-match endpoints in endpts[1..max_matches].
+ */
+ max_matches = end - begin;
+ if (max_matches > t->max && t->max != DUPINF)
+ max_matches = t->max;
+ if (max_matches < min_matches)
+ max_matches = min_matches;
+ endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
+ if (endpts == NULL)
+ return REG_ESPACE;
+ endpts[0] = begin;
+
+ d = getsubdfa(v, t->child);
+ if (ISERR())
+ {
+ FREE(endpts);
+ return v->err;
+ }
+
+ /*
+ * Our strategy is to first find a set of sub-match endpoints that are
+ * valid according to the child node's DFA, and then recursively dissect
+ * each sub-match to confirm validity. If any validity check fails,
+ * backtrack that sub-match and try again. And, when we next try for a
+ * validity check, we need not recheck any successfully verified
+ * sub-matches that we didn't move the endpoints of. nverified remembers
+ * how many sub-matches are currently known okay.
+ */
+
+ /* initialize to consider first sub-match */
+ nverified = 0;
+ k = 1;
+ limit = end;
+
+ /* iterate until satisfaction or failure */
+ while (k > 0)
+ {
+ /* try to find an endpoint for the k'th sub-match */
+ endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
+ if (ISERR())
+ {
+ FREE(endpts);
+ return v->err;
+ }
+ if (endpts[k] == NULL)
+ {
+ /* no match possible, so see if we can shorten previous one */
+ k--;
+ goto backtrack;
+ }
+ MDEBUG(("%d: working endpoint %d: %ld\n",
+ t->id, k, LOFF(endpts[k])));
+
+ /* k'th sub-match can no longer be considered verified */
+ if (nverified >= k)
+ nverified = k - 1;
+
+ if (endpts[k] != end)
+ {
+ /* haven't reached end yet, try another iteration if allowed */
+ if (k >= max_matches)
+ {
+ /* must try to shorten some previous match */
+ k--;
+ goto backtrack;
+ }
+
+ /* reject zero-length match unless necessary to achieve min */
+ if (endpts[k] == endpts[k - 1] &&
+ (k >= min_matches || min_matches - k < end - endpts[k]))
+ goto backtrack;
+
+ k++;
+ limit = end;
+ continue;
+ }
+
+ /*
+ * We've identified a way to divide the string into k sub-matches that
+ * works so far as the child DFA can tell. If k is an allowed number
+ * of matches, start the slow part: recurse to verify each sub-match.
+ * We always have k <= max_matches, needn't check that.
+ */
+ if (k < min_matches)
+ goto backtrack;
+
+ MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
+
+ for (i = nverified + 1; i <= k; i++)
+ {
+ /* zap any match data from a non-last iteration */
+ zaptreesubs(v, t->child);
+ er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
+ if (er == REG_OKAY)
+ {
+ nverified = i;
+ continue;
+ }
+ if (er == REG_NOMATCH)
+ break;
+ /* oops, something failed */
+ FREE(endpts);
+ return er;
+ }
+
+ if (i > k)
+ {
+ /* satisfaction */
+ MDEBUG(("%d: successful\n", t->id));
+ FREE(endpts);
+ return REG_OKAY;
+ }
+
+ /* i'th match failed to verify, so backtrack it */
+ k = i;
+
+backtrack:
+
+ /*
+ * Must consider shorter versions of the k'th sub-match. However,
+ * we'll only ask for a zero-length match if necessary.
+ */
+ while (k > 0)
+ {
+ chr *prev_end = endpts[k - 1];
+
+ if (endpts[k] > prev_end)
+ {
+ limit = endpts[k] - 1;
+ if (limit > prev_end ||
+ (k < min_matches && min_matches - k >= end - prev_end))
+ {
+ /* break out of backtrack loop, continue the outer one */
+ break;
+ }
+ }
+ /* can't shorten k'th sub-match any more, consider previous one */
+ k--;
+ }
+ }
+
+ /* all possibilities exhausted */
+ FREE(endpts);
+
+ /*
+ * Now consider the possibility that we can match to a zero-length string
+ * by using zero repetitions.
+ */
+ if (t->min == 0 && begin == end)
+ {
+ MDEBUG(("%d: allowing zero matches\n", t->id));
+ return REG_OKAY;
+ }
+
+ MDEBUG(("%d: failed\n", t->id));
+ return REG_NOMATCH;
+}
+
+/*
+ * creviterdissect - dissect match for iteration node, shortest-first
+ */
+static int /* regexec return code */
+creviterdissect(struct vars *v,
+ struct subre *t,
+ chr *begin, /* beginning of relevant substring */
+ chr *end) /* end of same */
+{
+ struct dfa *d; /* DFA for the child node */
+ chr **endpts; /* endpts[k] = end of k'th sub-match; endpts[0] = begin */
+ chr *limit; /* lower bound on the end of the next sub-match to find */
+ int min_matches; /* t->min, raised to 1 if it is not positive */
+ size_t max_matches; /* most sub-matches considered; sizes endpts[] */
+ int nverified; /* sub-matches 1..nverified have passed cdissect */
+ int k; /* number of the sub-match currently being worked on */
+ int i; /* loop index over sub-matches being verified */
+ int er; /* result code from cdissect */
+
+ assert(t->op == '*');
+ assert(t->child != NULL && t->child->cnfa.nstates > 0);
+ assert(t->child->flags & SHORTER);
+ assert(begin <= end);
+
+ MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
+
+ /*
+ * If zero matches are allowed, and target string is empty, just declare
+ * victory. OTOH, if target string isn't empty, zero matches can't work
+ * so we pretend the min is 1.
+ */
+ min_matches = t->min;
+ if (min_matches <= 0)
+ {
+ if (begin == end)
+ {
+ MDEBUG(("%d: allowing zero matches\n", t->id));
+ return REG_OKAY;
+ }
+ min_matches = 1;
+ }
+
+ /*
+ * We need workspace to track the endpoints of each sub-match. Normally
+ * we consider only nonzero-length sub-matches, so there can be at most
+ * end-begin of them. However, if min is larger than that, we will also
+ * consider zero-length sub-matches in order to find enough matches.
+ *
+ * For convenience, endpts[0] contains the "begin" pointer and we store
+ * sub-match endpoints in endpts[1..max_matches].
+ */
+ max_matches = end - begin;
+ if (max_matches > t->max && t->max != DUPINF)
+ max_matches = t->max;
+ if (max_matches < min_matches)
+ max_matches = min_matches;
+ endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
+ if (endpts == NULL)
+ return REG_ESPACE;
+ endpts[0] = begin;
+
+ d = getsubdfa(v, t->child);
+ if (ISERR())
+ {
+ FREE(endpts);
+ return v->err;
+ }
+
+ /*
+ * Our strategy is to first find a set of sub-match endpoints that are
+ * valid according to the child node's DFA, and then recursively dissect
+ * each sub-match to confirm validity. If any validity check fails,
+ * backtrack that sub-match and try again. And, when we next try for a
+ * validity check, we need not recheck any successfully verified
+ * sub-matches that we didn't move the endpoints of. nverified remembers
+ * how many sub-matches are currently known okay.
+ */
+
+ /* initialize to consider first sub-match */
+ nverified = 0;
+ k = 1;
+ limit = begin;
+
+ /* iterate until satisfaction or failure */
+ while (k > 0)
+ {
+ /* disallow zero-length match unless necessary to achieve min */
+ if (limit == endpts[k - 1] &&
+ limit != end &&
+ (k >= min_matches || min_matches - k < end - limit))
+ limit++;
+
+ /* if this is the last allowed sub-match, it must reach to the end */
+ if (k >= max_matches)
+ limit = end;
+
+ /* try to find an endpoint for the k'th sub-match */
+ endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
+ (chr **) NULL, (int *) NULL);
+ if (ISERR())
+ {
+ FREE(endpts);
+ return v->err;
+ }
+ if (endpts[k] == NULL)
+ {
+ /* no match possible, so see if we can lengthen previous one */
+ k--;
+ goto backtrack;
+ }
+ MDEBUG(("%d: working endpoint %d: %ld\n",
+ t->id, k, LOFF(endpts[k])));
+
+ /* k'th sub-match can no longer be considered verified */
+ if (nverified >= k)
+ nverified = k - 1;
+
+ if (endpts[k] != end)
+ {
+ /* haven't reached end yet, try another iteration if allowed */
+ if (k >= max_matches)
+ {
+ /* must try to lengthen some previous match */
+ k--;
+ goto backtrack;
+ }
+
+ k++;
+ limit = endpts[k - 1];
+ continue;
+ }
+
+ /*
+ * We've identified a way to divide the string into k sub-matches that
+ * works so far as the child DFA can tell. If k is an allowed number
+ * of matches, start the slow part: recurse to verify each sub-match.
+ * We always have k <= max_matches, needn't check that.
+ */
+ if (k < min_matches)
+ goto backtrack;
+
+ MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
+
+ for (i = nverified + 1; i <= k; i++)
+ {
+ /* zap any match data from a non-last iteration */
+ zaptreesubs(v, t->child);
+ er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
+ if (er == REG_OKAY)
+ {
+ nverified = i;
+ continue;
+ }
+ if (er == REG_NOMATCH)
+ break;
+ /* oops, something failed */
+ FREE(endpts);
+ return er;
+ }
+
+ if (i > k)
+ {
+ /* satisfaction */
+ MDEBUG(("%d: successful\n", t->id));
+ FREE(endpts);
+ return REG_OKAY;
+ }
+
+ /* i'th match failed to verify, so backtrack it */
+ k = i;
+
+backtrack:
+
+ /*
+ * Must consider longer versions of the k'th sub-match.
+ */
+ while (k > 0)
+ {
+ if (endpts[k] < end)
+ {
+ limit = endpts[k] + 1;
+ /* break out of backtrack loop, continue the outer one */
+ break;
+ }
+ /* can't lengthen k'th sub-match any more, consider previous one */
+ k--;
+ }
+ }
+
+ /* all possibilities exhausted */
+ MDEBUG(("%d: failed\n", t->id));
+ FREE(endpts);
+ return REG_NOMATCH;
+}
+
+
+
+#include "rege_dfa.c"
diff --git a/src/backend/regex/regexport.c b/src/backend/regex/regexport.c
new file mode 100644
index 0000000..04bbe0e
--- /dev/null
+++ b/src/backend/regex/regexport.c
@@ -0,0 +1,293 @@
+/*-------------------------------------------------------------------------
+ *
+ * regexport.c
+ * Functions for exporting info about a regex's NFA
+ *
+ * In this implementation, the NFA defines a necessary but not sufficient
+ * condition for a string to match the regex: that is, there can be strings
+ * that match the NFA but don't match the full regex, but not vice versa.
+ * Thus, for example, it is okay for the functions below to treat lookaround
+ * constraints as no-ops, since they merely constrain the string some more.
+ *
+ * Notice that these functions return info into caller-provided arrays
+ * rather than doing their own malloc's. This simplifies the APIs by
+ * eliminating a class of error conditions, and in the case of colors
+ * allows the caller to decide how big is too big to bother with.
+ *
+ *
+ * Portions Copyright (c) 2013-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1998, 1999 Henry Spencer
+ *
+ * IDENTIFICATION
+ * src/backend/regex/regexport.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "regex/regguts.h"
+
+#include "regex/regexport.h"
+
+
+/*
+ * Report the state count (nstates) of the regex's "search" cnfa.
+ */
+int
+pg_reg_getnumstates(const regex_t *regex)
+{
+    const struct guts *g;
+
+    assert(regex != NULL && regex->re_magic == REMAGIC);
+    g = (const struct guts *) regex->re_guts;
+
+    return g->search.nstates;
+}
+
+/*
+ * Report the initial state (the "pre" field) of the regex's "search" cnfa.
+ */
+int
+pg_reg_getinitialstate(const regex_t *regex)
+{
+    const struct guts *g;
+
+    assert(regex != NULL && regex->re_magic == REMAGIC);
+    g = (const struct guts *) regex->re_guts;
+
+    return g->search.pre;
+}
+
+/*
+ * Report the final state (the "post" field) of the regex's "search" cnfa.
+ */
+int
+pg_reg_getfinalstate(const regex_t *regex)
+{
+    const struct guts *g;
+
+    assert(regex != NULL && regex->re_magic == REMAGIC);
+    g = (const struct guts *) regex->re_guts;
+
+    return g->search.post;
+}
+
+/*
+ * pg_reg_getnumoutarcs() and pg_reg_getoutarcs() mask the existence of LACON
+ * arcs from the caller, treating any LACON as being automatically satisfied.
+ * Since the output representation does not support arcs that consume no
+ * character when traversed, we have to recursively traverse LACON arcs here,
+ * and report whatever normal arcs are reachable by traversing LACON arcs.
+ * Note that this wouldn't work if it were possible to reach the final state
+ * via LACON traversal, but the regex library never builds NFAs that have
+ * LACON arcs leading directly to the final state. (This is because the
+ * regex executor is designed to consume one character beyond the nominal
+ * match end --- possibly an EOS indicator --- so there is always a set of
+ * ordinary arcs leading to the final state.)
+ *
+ * traverse_lacons is a recursive subroutine used by both exported functions
+ * to count and then emit the reachable regular arcs. *arcs_count is
+ * incremented by the number of reachable arcs, and as many as will fit in
+ * arcs_len (possibly 0) are emitted into arcs[].
+ */
+static void
+traverse_lacons(struct cnfa *cnfa, int st,
+                int *arcs_count,
+                regex_arc_t *arcs, int arcs_len)
+{
+    struct carc *arc;
+
+    /*
+     * This function recurses, so in theory it could overflow the stack.
+     * In practice the check mostly serves as a backstop in case the regex
+     * compiler failed to remove a loop of LACON arcs.
+     */
+    check_stack_depth();
+
+    for (arc = cnfa->states[st]; arc->co != COLORLESS; arc++)
+    {
+        if (arc->co >= cnfa->ncolors)
+        {
+            /* LACON arc: assume it's satisfied and look through it ... */
+            /* ... after asserting it doesn't lead directly to post state */
+            Assert(arc->to != cnfa->post);
+
+            traverse_lacons(cnfa, arc->to, arcs_count, arcs, arcs_len);
+        }
+        else
+        {
+            /* Ordinary arc: always counted, stored only while room remains */
+            int         slot = (*arcs_count)++;
+
+            if (slot < arcs_len)
+            {
+                arcs[slot].co = arc->co;
+                arcs[slot].to = arc->to;
+            }
+        }
+    }
+}
+
+/*
+ * Count the outgoing NFA arcs of state "st" (0 if st is not a valid state).
+ */
+int
+pg_reg_getnumoutarcs(const regex_t *regex, int st)
+{
+    struct cnfa *nfa;
+    int         total = 0;
+
+    assert(regex != NULL && regex->re_magic == REMAGIC);
+    nfa = &((struct guts *) regex->re_guts)->search;
+
+    /* counting pass only: a zero-length output array stores nothing */
+    if (st >= 0 && st < nfa->nstates)
+        traverse_lacons(nfa, st, &total, NULL, 0);
+
+    return total;
+}
+
+/*
+ * Store the outgoing NFA arcs of state "st" into arcs[], which has room
+ * for arcs_len entries.  Arcs that do not fit are not returned, so size the
+ * array from pg_reg_getnumoutarcs().  Nothing is stored if st is not a
+ * valid state or arcs_len is not positive.
+ */
+void
+pg_reg_getoutarcs(const regex_t *regex, int st,
+                  regex_arc_t *arcs, int arcs_len)
+{
+    struct cnfa *nfa;
+    int         seen = 0;
+
+    assert(regex != NULL && regex->re_magic == REMAGIC);
+    nfa = &((struct guts *) regex->re_guts)->search;
+
+    /* "seen" only tells traverse_lacons which slot comes next */
+    if (arcs_len > 0 && st >= 0 && st < nfa->nstates)
+        traverse_lacons(nfa, st, &seen, arcs, arcs_len);
+}
+
+/*
+ * Report the total number of colors: the colormap's "max" field plus one.
+ */
+int
+pg_reg_getnumcolors(const regex_t *regex)
+{
+    const struct guts *g;
+
+    assert(regex != NULL && regex->re_magic == REMAGIC);
+    g = (const struct guts *) regex->re_guts;
+
+    return g->cmap.max + 1;
+}
+
+/*
+ * Report whether color "co" stands for beginning of line/string.
+ *
+ * (More refined handling of pseudocolors may be needed at some point, but
+ * this will do for now.)
+ */
+int
+pg_reg_colorisbegin(const regex_t *regex, int co)
+{
+    const struct guts *g;
+
+    assert(regex != NULL && regex->re_magic == REMAGIC);
+    g = (const struct guts *) regex->re_guts;
+
+    /* the search cnfa's bos[] array holds the two colors that qualify */
+    if (co != g->search.bos[0] && co != g->search.bos[1])
+        return false;
+    return true;
+}
+
+/*
+ * Report whether color "co" stands for end of line/string.
+ */
+int
+pg_reg_colorisend(const regex_t *regex, int co)
+{
+    const struct guts *g;
+
+    assert(regex != NULL && regex->re_magic == REMAGIC);
+    g = (const struct guts *) regex->re_guts;
+
+    /* the search cnfa's eos[] array holds the two colors that qualify */
+    if (co != g->search.eos[0] && co != g->search.eos[1])
+        return false;
+    return true;
+}
+
+/*
+ * Report how many member chrs color number "co" has.
+ *
+ * The result is -1 when the color number is invalid, when it names a
+ * special color (WHITE, RAINBOW, or a pseudocolor), or when the number of
+ * members is uncertain.  Callers must not try to extract the members of a
+ * color for which -1 is returned.
+ */
+int
+pg_reg_getnumcharacters(const regex_t *regex, int co)
+{
+    struct colormap *cmap;
+
+    assert(regex != NULL && regex->re_magic == REMAGIC);
+    cmap = &((struct guts *) regex->re_guts)->cmap;
+
+    /*
+     * "co <= 0" rejects WHITE and RAINBOW; the PSEUDO flag identifies
+     * pseudocolors (BOS etc).  The range test comes first so that cd[] is
+     * never indexed with an out-of-range color.
+     */
+    if (co <= 0 || co > cmap->max || (cmap->cd[co].flags & PSEUDO))
+        return -1;
+
+    /*
+     * A color that appears anywhere in the high colormap has an uncertain
+     * member count.  The specific chrs behind each such entry could in
+     * principle be determined, but that would be expensive (particularly
+     * if character class tests are required) and doesn't seem worth it.
+     */
+    if (cmap->cd[co].nuchrs != 0)
+        return -1;
+    return cmap->cd[co].nschrs; /* the known number of member chrs */
+}
+
+/*
+ * Copy the member chrs of color number "co" into chars[], storing at most
+ * chars_len of them; chars_len should be at least what
+ * pg_reg_getnumcharacters() reported, else not all chars will be returned.
+ *
+ * WHITE, RAINBOW, and pseudocolors are not supported: nothing is stored.
+ *
+ * Caution: this is a relatively expensive operation.
+ */
+void
+pg_reg_getcharacters(const regex_t *regex, int co,
+                     pg_wchar *chars, int chars_len)
+{
+    struct colormap *cmap;
+    chr         c;
+    int         nstored = 0;
+
+    assert(regex != NULL && regex->re_magic == REMAGIC);
+    cmap = &((struct guts *) regex->re_guts)->cmap;
+
+    if (co <= 0 || co > cmap->max || chars_len <= 0)
+        return;
+    if (cmap->cd[co].flags & PSEUDO)
+        return;
+
+    /*
+     * Only the low character map is examined; there should not be any
+     * matching entries in the high map.
+     */
+    for (c = CHR_MIN; c <= MAX_SIMPLE_CHR; c++)
+    {
+        if (cmap->locolormap[c - CHR_MIN] != co)
+            continue;
+        chars[nstored++] = c;
+        if (nstored == chars_len)
+            break;              /* the caller's array is full */
+    }
+}
diff --git a/src/backend/regex/regfree.c b/src/backend/regex/regfree.c
new file mode 100644
index 0000000..ae17ae7
--- /dev/null
+++ b/src/backend/regex/regfree.c
@@ -0,0 +1,54 @@
+/*
+ * regfree - free an RE
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results. The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regfree.c
+ *
+ *
+ * You might think that this could be incorporated into regcomp.c, and
+ * that would be a reasonable idea... except that this is a generic
+ * function (with a generic name), applicable to all compiled REs
+ * regardless of the size of their characters, whereas the stuff in
+ * regcomp.c gets compiled once per character size.
+ */
+
+#include "regex/regguts.h"
+
+
+/*
+ * pg_regfree - release a compiled RE
+ *
+ * This generic entry point just dispatches to the RE-specific free function
+ * recorded in the regex; a NULL argument is accepted and ignored.
+ */
+void
+pg_regfree(regex_t *re)
+{
+	if (re != NULL)
+		(*((struct fns *) re->re_fns)->free) (re);
+}
diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c
new file mode 100644
index 0000000..4a7794c
--- /dev/null
+++ b/src/backend/regex/regprefix.c
@@ -0,0 +1,268 @@
+/*-------------------------------------------------------------------------
+ *
+ * regprefix.c
+ * Extract a common prefix, if any, from a compiled regex.
+ *
+ *
+ * Portions Copyright (c) 2012-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1998, 1999 Henry Spencer
+ *
+ * IDENTIFICATION
+ * src/backend/regex/regprefix.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "regex/regguts.h"
+
+
+/*
+ * forward declaration of the file-local helper that walks the cNFA
+ */
+static int findprefix(struct cnfa *cnfa, struct colormap *cm,
+ chr *string, size_t *slength);
+
+
+/*
+ * pg_regprefix - report the fixed prefix, if any, implied by a compiled regex
+ *
+ * The result is one of:
+ *	REG_NOMATCH: strings matching the regex share no common prefix
+ *	REG_PREFIX: every string matching the regex begins with the prefix
+ *	REG_EXACT: every string matching the regex must equal the result
+ *	or a REG_XXX error code
+ *
+ * For REG_PREFIX and REG_EXACT, *string receives a malloc'd array holding
+ * the prefix or exact value, and *slength its length (counted in chrs, not
+ * bytes!).  Otherwise, if both pointers were supplied, they are NULL and 0.
+ *
+ * Complex cases (such as lookaround constraints) are not analyzed exactly,
+ * so the answer may be optimistic: some strings matching the reported
+ * prefix or exact-match string might not satisfy the regex.  The converse
+ * should never happen, though: any string that does satisfy the regex
+ * matches the reported prefix or exact-match string.
+ */
+int
+pg_regprefix(regex_t *re,
+			 chr **string,
+			 size_t *slength)
+{
+	struct guts *reguts;
+	struct cnfa *search;
+	chr		   *buf;
+	int			result;
+
+	/* sanity checks; the outputs are preset for the failure cases */
+	if (string == NULL || slength == NULL)
+		return REG_INVARG;
+	*string = NULL;
+	*slength = 0;
+	if (re == NULL || re->re_magic != REMAGIC)
+		return REG_INVARG;
+	if (re->re_csize != sizeof(chr))
+		return REG_MIXED;
+
+	/* Initialize locale-dependent support */
+	pg_set_regex_collation(re->re_collation);
+
+	/* setup; a regex flagged REG_UIMPOSSIBLE is reported as REG_NOMATCH */
+	reguts = (struct guts *) re->re_guts;
+	if ((reguts->info & REG_UIMPOSSIBLE) != 0)
+		return REG_NOMATCH;
+
+	/*
+	 * Only the search NFA of the topmost regex tree node is considered.
+	 * Constraints such as backrefs are therefore not fully applied, which
+	 * the function's API spec permits.
+	 */
+	assert(reguts->tree != NULL);
+	search = &reguts->tree->cnfa;
+
+	/* matchall NFAs never have a fixed prefix */
+	if ((search->flags & MATCHALL) != 0)
+		return REG_NOMATCH;
+
+	/*
+	 * A correct NFA should never contain exit-free loops, so the traversal
+	 * should not be able to revisit a state; nstates chrs of output space
+	 * is therefore always enough.
+	 */
+	buf = (chr *) MALLOC(search->nstates * sizeof(chr));
+	if (buf == NULL)
+		return REG_ESPACE;
+
+	result = findprefix(search, &reguts->cmap, buf, slength);
+	assert(*slength <= search->nstates);
+
+	/* Hand the buffer to the caller only when it holds a real answer */
+	if (result == REG_PREFIX || result == REG_EXACT)
+		*string = buf;
+	else
+	{
+		FREE(buf);
+		*slength = 0;
+	}
+
+	return result;
+}
+
+/*
+ * findprefix - extract common prefix from cNFA (cm is its colormap);
+ * the return value is REG_NOMATCH, REG_PREFIX, or REG_EXACT.
+ * Results are returned into the preallocated chr array string[], with
+ * *slength (which must be preset to zero) incremented for each chr.
+ */
+static int /* regprefix return code */
+findprefix(struct cnfa *cnfa,
+ struct colormap *cm,
+ chr *string,
+ size_t *slength)
+{
+ int st; /* state currently being examined */
+ int nextst; /* unique successor state, or -1 if none/ambiguous */
+ color thiscolor; /* sole plain outarc color of st, or COLORLESS */
+ chr c; /* candidate member chr of thiscolor */
+ struct carc *ca; /* iterates over the outarcs of st */
+
+ /*
+ * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
+ * anchored left. If we have both BOS and BOL, they must go to the same
+ * next state. (Each state's outarc list ends with a COLORLESS entry.)
+ */
+ st = cnfa->pre;
+ nextst = -1;
+ for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+ {
+ if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+ {
+ if (nextst == -1)
+ nextst = ca->to;
+ else if (nextst != ca->to)
+ return REG_NOMATCH;
+ }
+ else
+ return REG_NOMATCH;
+ }
+ if (nextst == -1) /* "pre" state has no outarcs at all */
+ return REG_NOMATCH;
+
+ /*
+ * Scan through successive states, stopping as soon as we find one with
+ * more than one acceptable transition character (either multiple colors
+ * on out-arcs, or a color with more than one member chr).
+ *
+ * We could find a state with multiple out-arcs that are all labeled with
+ * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
+ * In that case we add the chr "c" to the output string but then exit the
+ * loop with nextst == -1. This leaves a little bit on the table: if the
+ * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
+ * to the prefix. But chasing multiple parallel state chains doesn't seem
+ * worth the trouble.
+ */
+ do
+ {
+ st = nextst;
+ nextst = -1;
+ thiscolor = COLORLESS;
+ for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+ {
+ /* We can ignore BOS/BOL arcs */
+ if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+ continue;
+
+ /*
+ * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs
+ * and LACONs
+ */
+ if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
+ ca->co == RAINBOW || ca->co >= cnfa->ncolors)
+ {
+ thiscolor = COLORLESS;
+ break;
+ }
+ if (thiscolor == COLORLESS)
+ {
+ /* First plain outarc */
+ thiscolor = ca->co;
+ nextst = ca->to;
+ }
+ else if (thiscolor == ca->co)
+ {
+ /* Another plain outarc for same color */
+ nextst = -1;
+ }
+ else
+ {
+ /* More than one plain outarc color terminates the search */
+ thiscolor = COLORLESS;
+ break;
+ }
+ }
+ /* Done if we didn't find exactly one color on plain outarcs */
+ if (thiscolor == COLORLESS)
+ break;
+ /* The color must be a singleton */
+ if (cm->cd[thiscolor].nschrs != 1)
+ break;
+ /* Must not have any high-color-map entries */
+ if (cm->cd[thiscolor].nuchrs != 0)
+ break;
+
+ /*
+ * Identify the color's sole member chr and add it to the prefix
+ * string. In general the colormap data structure doesn't provide a
+ * way to find color member chrs, except by trying GETCOLOR() on each
+ * possible chr value, which won't do at all. However, for the cases
+ * we care about it should be sufficient to test the "firstchr" value,
+ * that is the first chr ever added to the color. There are cases
+ * where this might no longer be a member of the color (so we do need
+ * to test), but none of them are likely to arise for a character that
+ * is a member of a common prefix. If we do hit such a corner case,
+ * we just fall out without adding anything to the prefix string.
+ */
+ c = cm->cd[thiscolor].firstchr;
+ if (GETCOLOR(cm, c) != thiscolor)
+ break;
+
+ string[(*slength)++] = c;
+
+ /* Advance to next state, but only if we have a unique next state */
+ } while (nextst != -1);
+
+ /*
+ * If we ended at a state that only has EOS/EOL outarcs leading to the
+ * "post" state, then we have an exact-match string. Note this is true
+ * even if the string is of zero length.
+ */
+ nextst = -1; /* reused: common target of st's EOS/EOL arcs, if any */
+ for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+ {
+ if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+ {
+ if (nextst == -1)
+ nextst = ca->to;
+ else if (nextst != ca->to)
+ {
+ nextst = -1;
+ break;
+ }
+ }
+ else
+ {
+ nextst = -1;
+ break;
+ }
+ }
+ if (nextst == cnfa->post)
+ return REG_EXACT;
+
+ /*
+ * Otherwise, if we were unable to identify any prefix characters, say
+ * NOMATCH --- the pattern is anchored left, but doesn't specify any
+ * particular first character.
+ */
+ if (*slength > 0)
+ return REG_PREFIX;
+
+ return REG_NOMATCH;
+}