diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
commit | 46651ce6fe013220ed397add242004d764fc0153 (patch) | |
tree | 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/regex | |
parent | Initial commit. (diff) | |
download | postgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip |
Adding upstream version 14.5. (refs: upstream/14.5, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/regex')
-rw-r--r-- | src/backend/regex/COPYRIGHT | 84 | ||||
-rw-r--r-- | src/backend/regex/Makefile | 29 | ||||
-rw-r--r-- | src/backend/regex/README | 440 | ||||
-rw-r--r-- | src/backend/regex/regc_color.c | 1186 | ||||
-rw-r--r-- | src/backend/regex/regc_cvec.c | 138 | ||||
-rw-r--r-- | src/backend/regex/regc_lex.c | 1039 | ||||
-rw-r--r-- | src/backend/regex/regc_locale.c | 771 | ||||
-rw-r--r-- | src/backend/regex/regc_nfa.c | 3824 | ||||
-rw-r--r-- | src/backend/regex/regc_pg_locale.c | 944 | ||||
-rw-r--r-- | src/backend/regex/regcomp.c | 2582 | ||||
-rw-r--r-- | src/backend/regex/rege_dfa.c | 1106 | ||||
-rw-r--r-- | src/backend/regex/regerror.c | 120 | ||||
-rw-r--r-- | src/backend/regex/regexec.c | 1494 | ||||
-rw-r--r-- | src/backend/regex/regexport.c | 293 | ||||
-rw-r--r-- | src/backend/regex/regfree.c | 54 | ||||
-rw-r--r-- | src/backend/regex/regprefix.c | 268 |
16 files changed, 14372 insertions, 0 deletions
diff --git a/src/backend/regex/COPYRIGHT b/src/backend/regex/COPYRIGHT new file mode 100644 index 0000000..e50cfb1 --- /dev/null +++ b/src/backend/regex/COPYRIGHT @@ -0,0 +1,84 @@ +This regular expression package was originally developed by Henry Spencer. +It bears the following copyright notice: + +********************************************************************** + +Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + +Development of this software was funded, in part, by Cray Research Inc., +UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics +Corporation, none of whom are responsible for the results. The author +thanks all of them. + +Redistribution and use in source and binary forms -- with or without +modification -- are permitted for any purpose, provided that +redistributions in source form retain this entire copyright notice and +indicate the origin and nature of any modifications. + +I'd appreciate being given credit for this package in the documentation +of software which uses it, but that is not a requirement. + +THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +********************************************************************** + +PostgreSQL adopted the code out of Tcl 8.4.1. 
Portions of regc_locale.c +and re_syntax.n were developed by Tcl developers other than Henry; these +files bear the Tcl copyright and license notice: + +********************************************************************** + +This software is copyrighted by the Regents of the University of +California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState +Corporation and other parties. The following terms apply to all files +associated with the software unless explicitly disclaimed in +individual files. + +The authors hereby grant permission to use, copy, modify, distribute, +and license this software and its documentation for any purpose, provided +that existing copyright notices are retained in all copies and that this +notice is included verbatim in any distributions. No written agreement, +license, or royalty fee is required for any of the authorized uses. +Modifications to this software may be copyrighted by their authors +and need not follow the licensing terms described here, provided that +the new terms are clearly indicated on the first page of each file where +they apply. + +IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY +FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES +ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY +DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE +IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE +NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR +MODIFICATIONS. + +GOVERNMENT USE: If you are acquiring this software on behalf of the +U.S. 
government, the Government shall have only "Restricted Rights" +in the software and related documentation as defined in the Federal +Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you +are acquiring the software on behalf of the Department of Defense, the +software shall be classified as "Commercial Computer Software" and the +Government shall have only "Restricted Rights" as defined in Clause +252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the +authors grant the U.S. Government and others acting in its behalf +permission to use and distribute the software in accordance with the +terms specified in this license. + +********************************************************************** + +Subsequent modifications to the code by the PostgreSQL project follow +the same license terms as the rest of PostgreSQL. diff --git a/src/backend/regex/Makefile b/src/backend/regex/Makefile new file mode 100644 index 0000000..5210c16 --- /dev/null +++ b/src/backend/regex/Makefile @@ -0,0 +1,29 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for backend/regex +# +# IDENTIFICATION +# src/backend/regex/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/regex +top_builddir = ../../.. 
+include $(top_builddir)/src/Makefile.global + +OBJS = \ + regcomp.o \ + regerror.o \ + regexec.o \ + regexport.o \ + regfree.o \ + regprefix.o + +include $(top_srcdir)/src/backend/common.mk + +# mark inclusion dependencies between .c files explicitly +regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c \ + regc_locale.c regc_pg_locale.c + +regexec.o: regexec.c rege_dfa.c diff --git a/src/backend/regex/README b/src/backend/regex/README new file mode 100644 index 0000000..e4b0836 --- /dev/null +++ b/src/backend/regex/README @@ -0,0 +1,440 @@ +Implementation notes about Henry Spencer's regex library +======================================================== + +If Henry ever had any internals documentation, he didn't publish it. +So this file is an attempt to reverse-engineer some docs. + +General source-file layout +-------------------------- + +There are six separately-compilable source files, five of which expose +exactly one exported function apiece: + regcomp.c: pg_regcomp + regexec.c: pg_regexec + regerror.c: pg_regerror + regfree.c: pg_regfree + regprefix.c: pg_regprefix +(The pg_ prefixes were added by the Postgres project to distinguish this +library version from any similar one that might be present on a particular +system. They'd need to be removed or replaced in any standalone version +of the library.) + +The sixth file, regexport.c, exposes multiple functions that allow extraction +of info about a compiled regex (see regexport.h). + +There are additional source files regc_*.c that are #include'd in regcomp, +and similarly additional source files rege_*.c that are #include'd in +regexec. This was done to avoid exposing internal symbols globally; +all functions not meant to be part of the library API are static. + +(Actually the above is a lie in one respect: there are two more global +symbols, pg_set_regex_collation and pg_reg_getcolor in regcomp. 
These are +not meant to be part of the API, but they have to be global because both +regcomp and regexec call them. It'd be better to get rid of +pg_set_regex_collation, as well as the static variables it sets, in favor of +keeping the needed locale state in the regex structs. We have not done this +yet for lack of a design for how to add application-specific state to the +structs.) + +What's where in src/backend/regex/: + +regcomp.c Top-level regex compilation code +regc_color.c Color map management +regc_cvec.c Character vector (cvec) management +regc_lex.c Lexer +regc_nfa.c NFA handling +regc_locale.c Application-specific locale code from Tcl project +regc_pg_locale.c Postgres-added application-specific locale code +regexec.c Top-level regex execution code +rege_dfa.c DFA creation and execution +regerror.c pg_regerror: generate text for a regex error code +regfree.c pg_regfree: API to free a no-longer-needed regex_t +regexport.c Functions for extracting info from a regex_t +regprefix.c Code for extracting a common prefix from a regex_t + +The locale-specific code is concerned primarily with case-folding and with +expanding locale-specific character classes, such as [[:alnum:]]. It +really needs refactoring if this is ever to become a standalone library. + +The header files for the library are in src/include/regex/: + +regcustom.h Customizes library for particular application +regerrs.h Error message list +regex.h Exported API +regexport.h Exported API for regexport.c +regguts.h Internals declarations + + +DFAs, NFAs, and all that +------------------------ + +This library is a hybrid DFA/NFA regex implementation. (If you've never +heard either of those terms, get thee to a first-year comp sci textbook.) +It might not be clear at first glance what that really means and how it +relates to what you'll see in the code. 
Here's what really happens: + +* Initial parsing of a regex generates an NFA representation, with number +of states approximately proportional to the length of the regexp. + +* The NFA is then optimized into a "compact NFA" representation, which is +basically the same idea but without fields that are not going to be needed +at runtime. It is simplified too: the compact format only allows "plain" +and "LACON" arc types. The cNFA representation is what is passed from +regcomp to regexec. + +* Unlike traditional NFA-based regex engines, we do not execute directly +from the NFA representation, as that would require backtracking and so be +very slow in some cases. Rather, we execute a DFA, which ideally can +process an input string in linear time (O(M) for M characters of input) +without backtracking. Each state of the DFA corresponds to a set of +states of the NFA, that is all the states that the NFA might have been in +upon reaching the current point in the input string. Therefore, an NFA +with N states might require as many as 2^N states in the corresponding +DFA, which could easily require unreasonable amounts of memory. We deal +with this by materializing states of the DFA lazily (only when needed) and +keeping them in a limited-size cache. The possible need to build the same +state of the DFA repeatedly makes this approach not truly O(M) time, but +in the worst case as much as O(M*N). That's still far better than the +worst case for a backtracking NFA engine. + +If that were the end of it, we'd just say this is a DFA engine, with the +use of NFAs being merely an implementation detail. However, a DFA engine +cannot handle some important regex features such as capturing parens and +back-references. If the parser finds that a regex uses these features +(collectively called "messy cases" in the code), then we have to use +NFA-style backtracking search after all. 
+ +When using the NFA mode, the representation constructed by the parser +consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are +either plain regular expressions (which are executed as DFAs in the manner +described above) or back-references (which try to match the input to some +previous substring). Non-leaf nodes are capture nodes (which save the +location of the substring currently matching their child node), +concatenation, alternation, or iteration nodes. At execution time, the +executor recursively scans the tree. At concatenation, alternation, or +iteration nodes, it considers each possible alternative way of matching the +input string, that is each place where the string could be split for a +concatenation or iteration, or each child node for an alternation. It +tries the next alternative if the match fails according to the child nodes. +This is exactly the sort of backtracking search done by a traditional NFA +regex engine. If there are many tree levels it can get very slow. + +But all is not lost: we can still be smarter than the average pure NFA +engine. To do this, each subre node has an associated DFA, which +represents what the node could possibly match insofar as a mathematically +pure regex can describe that, which basically means "no backrefs". +Before we perform any search of possible alternative sub-matches, we run +the DFA to see if it thinks the proposed substring could possibly match. +If not, we can reject the match immediately without iterating through many +possibilities. + +As an example, consider the regex "(a[bc]+)\1". The compiled +representation will have a top-level concatenation subre node. Its first +child is a plain DFA node for "a[bc]+" (which is marked as being a capture +node). The concatenation's second child is a backref node for \1. +The DFA associated with the concatenation node will be "a[bc]+a[bc]+", +where the backref has been replaced by a copy of the DFA for its referent +expression. 
When executed, the concatenation node will have to search for +a possible division of the input string that allows its two child nodes to +each match their part of the string (and although this specific case can +only succeed when the division is at the middle, the code does not know +that, nor would it be true in general). However, we can first run the DFA +and quickly reject any input that doesn't start with an "a" and contain +one more "a" plus some number of b's and c's. If the DFA doesn't match, +there is no need to recurse to the two child nodes for each possible +string division point. In many cases, this prefiltering makes the search +run much faster than a pure NFA engine could do. It is this behavior that +justifies using the phrase "hybrid DFA/NFA engine" to describe Spencer's +library. + +It's perhaps worth noting that separate capture subre nodes are a rarity: +normally, we just mark a subre as capturing and that's it. However, it's +legal to write a regex like "((x))" in which the same substring has to be +captured by multiple sets of parentheses. Since a subre has room for only +one "capno" field, a single subre can't handle that. We handle such cases +by wrapping the base subre (which captures the innermost parens) in a +no-op capture node, or even more than one for "(((x)))" etc. This is a +little bit inefficient because we end up with multiple identical NFAs, +but since the case is pointless and infrequent, it's not worth working +harder. + + +Colors and colormapping +----------------------- + +In many common regex patterns, there are large numbers of characters that +can be treated alike by the execution engine. A simple example is the +pattern "[[:alpha:]][[:alnum:]]*" for an identifier. Basically the engine +only needs to care whether an input symbol is a letter, a digit, or other. 
+We could build the NFA or DFA with a separate arc for each possible letter +and digit, but that's very wasteful of space and not so cheap to execute +either, especially when dealing with Unicode which can have thousands of +letters. Instead, the parser builds a "color map" that maps each possible +input symbol to a "color", or equivalence class. The NFA or DFA +representation then has arcs labeled with colors, not specific input +symbols. At execution, the first thing the executor does with each input +symbol is to look up its color in the color map, and then everything else +works from the color only. + +To build the colormap, we start by assigning every possible input symbol +the color WHITE, which means "other" (that is, at the end of parsing, the +symbols that are still WHITE are those not explicitly referenced anywhere +in the regex). When we see a simple literal character or a bracket +expression in the regex, we want to assign that character, or all the +characters represented by the bracket expression, a unique new color that +can be used to label the NFA arc corresponding to the state transition for +matching this character or bracket expression. The basic idea is: +first, change the color assigned to a character to some new value; +second, run through all the existing arcs in the partially-built NFA, +and for each one referencing the character's old color, add a parallel +arc referencing its new color (this keeps the reassignment from changing +the semantics of what we already built); and third, add a new arc with +the character's new color to the current pair of NFA states, denoting +that seeing this character allows the state transition to be made. + +This is complicated a bit by not wanting to create more colors +(equivalence classes) than absolutely necessary. 
In particular, if a +bracket expression mentions two characters that had the same color before, +they should still share the same color after we process the bracket, since +there is still not a need to distinguish them. But we do need to +distinguish them from other characters that previously had the same color +yet are not listed in the bracket expression. To mechanize this, the code +has a concept of "parent colors" and "subcolors", where a color's subcolor +is the new color that we are giving to any characters of that color while +parsing the current atom. (The word "parent" is a bit unfortunate here, +because it suggests a long-lived relationship, but a subcolor link really +only lasts for the duration of parsing a single atom.) In other words, +a subcolor link means that we are in process of splitting the parent color +into two colors (equivalence classes), depending on whether or not each +member character should be included by the current regex atom. + +As an example, suppose we have the regex "a\d\wx". Initially all possible +character codes are labeled WHITE (color 0). To parse the atom "a", we +create a new color (1), update "a"'s color map entry to 1, and create an +arc labeled 1 between the first two states of the NFA. Now we see \d, +which is really a bracket expression containing the digits "0"-"9". +First we process "0", which is currently WHITE, so we create a new color +(2), update "0"'s color map entry to 2, and create an arc labeled 2 +between the second and third states of the NFA. We also mark color WHITE +as having the subcolor 2, which means that future relabelings of WHITE +characters should also select 2 as the new color. Thus, when we process +"1", we won't create a new color but re-use 2. We update "1"'s color map +entry to 2, and then find that we don't need a new arc because there is +already one labeled 2 between the second and third states of the NFA. 
+Similarly for the other 8 digits, so there will be only one arc labeled 2 +between NFA states 2 and 3 for all members of this bracket expression. +At completion of processing of the bracket expression, we call okcolors() +which breaks all the existing parent/subcolor links; there is no longer a +marker saying that WHITE characters should be relabeled 2. (Note: +actually, we did the same creation and clearing of a subcolor link for the +primitive atom "a", but it didn't do anything very interesting.) Now we +come to the "\w" bracket expression, which for simplicity assume expands +to just "[a-z0-9]". We process "a", but observe that it is already the +sole member of its color 1. This means there is no need to subdivide that +equivalence class more finely, so we do not create any new color. We just +make an arc labeled 1 between the third and fourth NFA states. Next we +process "b", which is WHITE and far from the only WHITE character, so we +create a new color (3), link that as WHITE's subcolor, relabel "b" as +color 3, and make an arc labeled 3. As we process "c" through "z", each +is relabeled from WHITE to 3, but no new arc is needed. Now we come to +"0", which is not the only member of its color 2, so we suppose that a new +color is needed and create color 4. We link 4 as subcolor of 2, relabel +"0" as color 4 in the map, and add an arc for color 4. Next "1" through +"9" are similarly relabeled as color 4, with no additional arcs needed. +Having finished the bracket expression, we call okcolors(), which breaks +the subcolor links. okcolors() further observes that we have removed +every member of color 2 (the previous color of the digit characters). +Therefore, it runs through the partial NFA built so far and relabels arcs +labeled 2 to color 4; in particular the arc from NFA state 2 to state 3 is +relabeled color 4. Then it frees up color 2, since we have no more use +for that color. 
We now have an NFA in which transitions for digits are +consistently labeled with color 4. Last, we come to the atom "x". +"x" is currently labeled with color 3, and it's not the only member of +that color, so we realize that we now need to distinguish "x" from other +letters when we did not before. We create a new color, which might have +been 5 but instead we recycle the unused color 2. "x" is relabeled 2 in +the color map and 2 is linked as the subcolor of 3, and we add an arc for +2 between states 4 and 5 of the NFA. Now we call okcolors(), which breaks +the subcolor link between colors 3 and 2 and notices that both colors are +nonempty. Therefore, it also runs through the existing NFA arcs and adds +an additional arc labeled 2 wherever there is an arc labeled 3; this +action ensures that characters of color 2 (i.e., "x") will still be +considered as allowing any transitions they did before. We are now done +parsing the regex, and we have these final color assignments: + color 1: "a" + color 2: "x" + color 3: other letters + color 4: digits +and the NFA has these arcs: + states 1 -> 2 on color 1 (hence, "a" only) + states 2 -> 3 on color 4 (digits) + states 3 -> 4 on colors 1, 3, 4, and 2 (covering all \w characters) + states 4 -> 5 on color 2 ("x" only) +which can be seen to be a correct representation of the regex. + +There is one more complexity, which is how to handle ".", that is a +match-anything atom. We used to do that by generating a "rainbow" +of arcs of all live colors between the two NFA states before and after +the dot. That's expensive in itself when there are lots of colors, +and it also typically adds lots of follow-on arc-splitting work for the +color splitting logic. Now we handle this case by generating a single arc +labeled with the special color RAINBOW, meaning all colors. Such arcs +never need to be split, so they help keep NFAs small in this common case. +(Note: this optimization doesn't help in REG_NLSTOP mode, where "." 
is +not supposed to match newline. In that case we still handle "." by +generating an almost-rainbow of all colors except newline's color.) + +Given this summary, we can see we need the following operations for +colors: + +* A fast way to look up the current color assignment for any character + code. (This is needed during both parsing and execution, while the + remaining operations are needed only during parsing.) +* A way to alter the color assignment for any given character code. +* We must track the number of characters currently assigned to each + color, so that we can detect empty and singleton colors. +* We must track all existing NFA arcs of a given color, so that we + can relabel them at need, or add parallel arcs of a new color when + an existing color has to be subdivided. + +The last two of these are handled with the "struct colordesc" array and +the "colorchain" links in NFA arc structs. + +Ideally, we'd do the first two operations using a simple linear array +storing the current color assignment for each character code. +Unfortunately, that's not terribly workable for large charsets such as +Unicode. Our solution is to divide the color map into two parts. A simple +linear array is used for character codes up to MAX_SIMPLE_CHR, which can be +chosen large enough to include all popular characters (so that the +significantly-slower code paths about to be described are seldom invoked). +Characters above that need be considered at compile time only if they +appear explicitly in the regex pattern. We store each such mentioned +character or character range as an entry in the "colormaprange" array in +the colormap. (Overlapping ranges are split into unique subranges, so that +each range in the finished list needs only a single color that describes +all its characters.) When mapping a character above MAX_SIMPLE_CHR to a +color at runtime, we search this list of ranges explicitly. 
+ +That's still not quite enough, though, because of locale-dependent +character classes such as [[:alpha:]]. In Unicode locales these classes +may have thousands of entries that are above MAX_SIMPLE_CHR, and we +certainly don't want to be searching large colormaprange arrays at runtime. +Nor do we even want to spend the time to initialize cvec structures that +exhaustively describe all of those characters. Our solution is to compute +exact per-character colors at regex compile time only up to MAX_SIMPLE_CHR. +For characters above that, we apply the <ctype.h> or <wctype.h> lookup +functions at runtime for each locale-dependent character class used in the +regex pattern, constructing a bitmap that describes which classes the +runtime character belongs to. The per-character-range data structure +mentioned above actually holds, for each range, a separate color entry +for each possible combination of character class properties. That is, +the color map for characters above MAX_SIMPLE_CHR is really a 2-D array, +whose rows correspond to high characters or character ranges that are +explicitly mentioned in the regex pattern, and whose columns correspond +to sets of the locale-dependent character classes that are used in the +regex. + +As an example, given the pattern '\w\u1234[\U0001D100-\U0001D1FF]' +(and supposing that MAX_SIMPLE_CHR is less than 0x1234), we will need +a high color map with three rows. One row is for the single character +U+1234 (represented as a single-element range), one is for the range +U+1D100..U+1D1FF, and the other row represents all remaining high +characters. The color map has two columns, one for characters that +satisfy iswalnum() and one for those that don't. + +We build this color map in parallel with scanning the regex. 
Each time +we detect a new explicit high character (or range) or a locale-dependent +character class, we split existing entry(s) in the high color map so that +characters we need to be able to distinguish will have distinct entries +that can be given separate colors. Often, though, single entries in the +high color map will represent very large sets of characters. + +If there are both explicit high characters/ranges and locale-dependent +character classes, we may have entries in the high color map array that +have non-WHITE colors but don't actually represent any real characters. +(For example, in a row representing a singleton range, only one of the +columns could possibly be a live entry; it's the one matching the actual +locale properties for that single character.) We don't currently make +any effort to reclaim such colors. In principle it could be done, but +it's not clear that it's worth the trouble. + + +Detailed semantics of an NFA +---------------------------- + +When trying to read dumped-out NFAs, it's helpful to know these facts: + +State 0 (additionally marked with "@" in dumpnfa's output) is always the +goal state, and state 1 (additionally marked with ">") is the start state. +(The code refers to these as the post state and pre state respectively.) + +The possible arc types are: + + PLAIN arcs, which specify matching of any character of a given "color" + (see above). These are dumped as "[color_number]->to_state". + In addition there can be "rainbow" PLAIN arcs, which are dumped as + "[*]->to_state". + + EMPTY arcs, which specify a no-op transition to another state. These + are dumped as "->to_state". + + AHEAD constraints, which represent a "next character must be of this + color" constraint. AHEAD differs from a PLAIN arc in that the input + character is not consumed when crossing the arc. These are dumped as + ">color_number>->to_state", or possibly ">*>->to_state". 
+ + BEHIND constraints, which represent a "previous character must be of + this color" constraint, which likewise consumes no input. These are + dumped as "<color_number<->to_state", or possibly "<*<->to_state". + + '^' arcs, which specify a beginning-of-input constraint. These are + dumped as "^0->to_state" or "^1->to_state" for beginning-of-string and + beginning-of-line constraints respectively. + + '$' arcs, which specify an end-of-input constraint. These are dumped + as "$0->to_state" or "$1->to_state" for end-of-string and end-of-line + constraints respectively. + + LACON constraints, which represent "(?=re)", "(?!re)", "(?<=re)", and + "(?<!re)" constraints, i.e. the input starting/ending at this point must + match (or not match) a given sub-RE, but the matching input is not + consumed. These are dumped as ":subtree_number:->to_state". + +If you see anything else (especially any question marks) in the display of +an arc, it's dumpnfa() trying to tell you that there's something fishy +about the arc; see the source code. + +The regex executor can only handle PLAIN and LACON transitions. The regex +optimize() function is responsible for transforming the parser's output +to get rid of all the other arc types. In particular, ^ and $ arcs that +are not dropped as impossible will always end up adjacent to the pre or +post state respectively, and then will be converted into PLAIN arcs that +mention the special "colors" for BOS, BOL, EOS, or EOL. + +To decide whether a thus-transformed NFA matches a given substring of the +input string, the executor essentially follows these rules: +1. Start the NFA "looking at" the character *before* the given substring, +or if the substring is at the start of the input, prepend an imaginary BOS +character instead. +2. Run the NFA until it has consumed the character *after* the given +substring, or an imaginary following EOS character if the substring is at +the end of the input. +3. 
If the NFA is (or can be) in the goal state at this point, it matches. + +This definition is necessary to support regexes that begin or end with +constraints such as \m and \M, which imply requirements on the adjacent +character if any. The executor implements that by checking if the +adjacent character (or BOS/BOL/EOS/EOL pseudo-character) is of the +right color, and it does that in the same loop that checks characters +within the match. + +So one can mentally execute an untransformed NFA by taking ^ and $ as +ordinary constraints that match at start and end of input; but plain +arcs out of the start state should be taken as matches for the character +before the target substring, and similarly, plain arcs leading to the +post state are matches for the character after the target substring. +After the optimize() transformation, there are explicit arcs mentioning +BOS/BOL/EOS/EOL adjacent to the pre-state and post-state. So a finished +NFA for a pattern without anchors or adjacent-character constraints will +have pre-state outarcs for RAINBOW (all possible character colors) as well +as BOS and BOL, and likewise post-state inarcs for RAINBOW, EOS, and EOL. diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c new file mode 100644 index 0000000..30bda0e --- /dev/null +++ b/src/backend/regex/regc_color.c @@ -0,0 +1,1186 @@ +/* + * colorings of characters + * This file is #included by regcomp.c. + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. 
+ * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * src/backend/regex/regc_color.c + * + * + * Note that there are some incestuous relationships between this code and + * NFA arc maintenance, which perhaps ought to be cleaned up sometime. 
+ */ + + + +#define CISERR() VISERR(cm->v) +#define CERR(e) VERR(cm->v, (e)) + + + +/* + * initcm - set up new colormap + */ +static void +initcm(struct vars *v, + struct colormap *cm) +{ + struct colordesc *cd; + + cm->magic = CMMAGIC; + cm->v = v; + + cm->ncds = NINLINECDS; + cm->cd = cm->cdspace; + cm->max = 0; + cm->free = 0; + + cd = cm->cd; /* cm->cd[WHITE] */ + cd->nschrs = MAX_SIMPLE_CHR - CHR_MIN + 1; + cd->nuchrs = 1; + cd->sub = NOSUB; + cd->arcs = NULL; + cd->firstchr = CHR_MIN; + cd->flags = 0; + + cm->locolormap = (color *) + MALLOC((MAX_SIMPLE_CHR - CHR_MIN + 1) * sizeof(color)); + if (cm->locolormap == NULL) + { + CERR(REG_ESPACE); + cm->cmranges = NULL; /* prevent failure during freecm */ + cm->hicolormap = NULL; + return; + } + /* this memset relies on WHITE being zero: */ + memset(cm->locolormap, WHITE, + (MAX_SIMPLE_CHR - CHR_MIN + 1) * sizeof(color)); + + memset(cm->classbits, 0, sizeof(cm->classbits)); + cm->numcmranges = 0; + cm->cmranges = NULL; + cm->maxarrayrows = 4; /* arbitrary initial allocation */ + cm->hiarrayrows = 1; /* but we have only one row/col initially */ + cm->hiarraycols = 1; + cm->hicolormap = (color *) MALLOC(cm->maxarrayrows * sizeof(color)); + if (cm->hicolormap == NULL) + { + CERR(REG_ESPACE); + return; + } + /* initialize the "all other characters" row to WHITE */ + cm->hicolormap[0] = WHITE; +} + +/* + * freecm - free dynamically-allocated things in a colormap + */ +static void +freecm(struct colormap *cm) +{ + cm->magic = 0; + if (cm->cd != cm->cdspace) + FREE(cm->cd); + if (cm->locolormap != NULL) + FREE(cm->locolormap); + if (cm->cmranges != NULL) + FREE(cm->cmranges); + if (cm->hicolormap != NULL) + FREE(cm->hicolormap); +} + +/* + * pg_reg_getcolor - slow case of GETCOLOR() + */ +color +pg_reg_getcolor(struct colormap *cm, chr c) +{ + int rownum, + colnum, + low, + high; + + /* Should not be used for chrs in the locolormap */ + assert(c > MAX_SIMPLE_CHR); + + /* + * Find which row it's in. 
The colormapranges are in order, so we can use + * binary search. + */ + rownum = 0; /* if no match, use array row zero */ + low = 0; + high = cm->numcmranges; + while (low < high) + { + int middle = low + (high - low) / 2; + const colormaprange *cmr = &cm->cmranges[middle]; + + if (c < cmr->cmin) + high = middle; + else if (c > cmr->cmax) + low = middle + 1; + else + { + rownum = cmr->rownum; /* found a match */ + break; + } + } + + /* + * Find which column it's in --- this is all locale-dependent. + */ + if (cm->hiarraycols > 1) + { + colnum = cclass_column_index(cm, c); + return cm->hicolormap[rownum * cm->hiarraycols + colnum]; + } + else + { + /* fast path if no relevant cclasses */ + return cm->hicolormap[rownum]; + } +} + +/* + * maxcolor - report largest color number in use + */ +static color +maxcolor(struct colormap *cm) +{ + if (CISERR()) + return COLORLESS; + + return (color) cm->max; +} + +/* + * newcolor - find a new color (must be assigned at once) + * Beware: may relocate the colordescs. 
+ */ +static color /* COLORLESS for error */ +newcolor(struct colormap *cm) +{ + struct colordesc *cd; + size_t n; + + if (CISERR()) + return COLORLESS; + + if (cm->free != 0) + { + assert(cm->free > 0); + assert((size_t) cm->free < cm->ncds); + cd = &cm->cd[cm->free]; + assert(UNUSEDCOLOR(cd)); + assert(cd->arcs == NULL); + cm->free = cd->sub; + } + else if (cm->max < cm->ncds - 1) + { + cm->max++; + cd = &cm->cd[cm->max]; + } + else + { + /* oops, must allocate more */ + struct colordesc *newCd; + + if (cm->max == MAX_COLOR) + { + CERR(REG_ECOLORS); + return COLORLESS; /* too many colors */ + } + + n = cm->ncds * 2; + if (n > MAX_COLOR + 1) + n = MAX_COLOR + 1; + if (cm->cd == cm->cdspace) + { + newCd = (struct colordesc *) MALLOC(n * sizeof(struct colordesc)); + if (newCd != NULL) + memcpy(VS(newCd), VS(cm->cdspace), cm->ncds * + sizeof(struct colordesc)); + } + else + newCd = (struct colordesc *) + REALLOC(cm->cd, n * sizeof(struct colordesc)); + if (newCd == NULL) + { + CERR(REG_ESPACE); + return COLORLESS; + } + cm->cd = newCd; + cm->ncds = n; + assert(cm->max < cm->ncds - 1); + cm->max++; + cd = &cm->cd[cm->max]; + } + + cd->nschrs = 0; + cd->nuchrs = 0; + cd->sub = NOSUB; + cd->arcs = NULL; + cd->firstchr = CHR_MIN; /* in case never set otherwise */ + cd->flags = 0; + + return (color) (cd - cm->cd); +} + +/* + * freecolor - free a color (must have no arcs or subcolor) + */ +static void +freecolor(struct colormap *cm, + color co) +{ + struct colordesc *cd = &cm->cd[co]; + color pco, + nco; /* for freelist scan */ + + assert(co >= 0); + if (co == WHITE) + return; + + assert(cd->arcs == NULL); + assert(cd->sub == NOSUB); + assert(cd->nschrs == 0); + assert(cd->nuchrs == 0); + cd->flags = FREECOL; + + if ((size_t) co == cm->max) + { + while (cm->max > WHITE && UNUSEDCOLOR(&cm->cd[cm->max])) + cm->max--; + assert(cm->free >= 0); + while ((size_t) cm->free > cm->max) + cm->free = cm->cd[cm->free].sub; + if (cm->free > 0) + { + assert(cm->free < cm->max); + pco = 
cm->free; + nco = cm->cd[pco].sub; + while (nco > 0) + if ((size_t) nco > cm->max) + { + /* take this one out of freelist */ + nco = cm->cd[nco].sub; + cm->cd[pco].sub = nco; + } + else + { + assert(nco < cm->max); + pco = nco; + nco = cm->cd[pco].sub; + } + } + } + else + { + cd->sub = cm->free; + cm->free = (color) (cd - cm->cd); + } +} + +/* + * pseudocolor - allocate a false color, to be managed by other means + */ +static color +pseudocolor(struct colormap *cm) +{ + color co; + struct colordesc *cd; + + co = newcolor(cm); + if (CISERR()) + return COLORLESS; + cd = &cm->cd[co]; + cd->nschrs = 0; + cd->nuchrs = 1; /* pretend it is in the upper map */ + cd->sub = NOSUB; + cd->arcs = NULL; + cd->firstchr = CHR_MIN; + cd->flags = PSEUDO; + return co; +} + +/* + * subcolor - allocate a new subcolor (if necessary) to this chr + * + * This works only for chrs that map into the low color map. + */ +static color +subcolor(struct colormap *cm, chr c) +{ + color co; /* current color of c */ + color sco; /* new subcolor */ + + assert(c <= MAX_SIMPLE_CHR); + + co = cm->locolormap[c - CHR_MIN]; + sco = newsub(cm, co); + if (CISERR()) + return COLORLESS; + assert(sco != COLORLESS); + + if (co == sco) /* already in an open subcolor */ + return co; /* rest is redundant */ + cm->cd[co].nschrs--; + if (cm->cd[sco].nschrs == 0) + cm->cd[sco].firstchr = c; + cm->cd[sco].nschrs++; + cm->locolormap[c - CHR_MIN] = sco; + return sco; +} + +/* + * subcolorhi - allocate a new subcolor (if necessary) to this colormap entry + * + * This is the same processing as subcolor(), but for entries in the high + * colormap, which do not necessarily correspond to exactly one chr code. 
+ */ +static color +subcolorhi(struct colormap *cm, color *pco) +{ + color co; /* current color of entry */ + color sco; /* new subcolor */ + + co = *pco; + sco = newsub(cm, co); + if (CISERR()) + return COLORLESS; + assert(sco != COLORLESS); + + if (co == sco) /* already in an open subcolor */ + return co; /* rest is redundant */ + cm->cd[co].nuchrs--; + cm->cd[sco].nuchrs++; + *pco = sco; + return sco; +} + +/* + * newsub - allocate a new subcolor (if necessary) for a color + */ +static color +newsub(struct colormap *cm, + color co) +{ + color sco; /* new subcolor */ + + sco = cm->cd[co].sub; + if (sco == NOSUB) + { /* color has no open subcolor */ + /* optimization: singly-referenced color need not be subcolored */ + if ((cm->cd[co].nschrs + cm->cd[co].nuchrs) == 1) + return co; + sco = newcolor(cm); /* must create subcolor */ + if (sco == COLORLESS) + { + assert(CISERR()); + return COLORLESS; + } + cm->cd[co].sub = sco; + cm->cd[sco].sub = sco; /* open subcolor points to self */ + } + assert(sco != NOSUB); + + return sco; +} + +/* + * newhicolorrow - get a new row in the hicolormap, cloning it from oldrow + * + * Returns array index of new row. Note the array might move. 
+ */ +static int +newhicolorrow(struct colormap *cm, + int oldrow) +{ + int newrow = cm->hiarrayrows; + color *newrowptr; + int i; + + /* Assign a fresh array row index, enlarging storage if needed */ + if (newrow >= cm->maxarrayrows) + { + color *newarray; + + if (cm->maxarrayrows >= INT_MAX / (cm->hiarraycols * 2)) + { + CERR(REG_ESPACE); + return 0; + } + newarray = (color *) REALLOC(cm->hicolormap, + cm->maxarrayrows * 2 * + cm->hiarraycols * sizeof(color)); + if (newarray == NULL) + { + CERR(REG_ESPACE); + return 0; + } + cm->hicolormap = newarray; + cm->maxarrayrows *= 2; + } + cm->hiarrayrows++; + + /* Copy old row data */ + newrowptr = &cm->hicolormap[newrow * cm->hiarraycols]; + memcpy(newrowptr, + &cm->hicolormap[oldrow * cm->hiarraycols], + cm->hiarraycols * sizeof(color)); + + /* Increase color reference counts to reflect new colormap entries */ + for (i = 0; i < cm->hiarraycols; i++) + cm->cd[newrowptr[i]].nuchrs++; + + return newrow; +} + +/* + * newhicolorcols - create a new set of columns in the high colormap + * + * Essentially, extends the 2-D array to the right with a copy of itself. 
+ */ +static void +newhicolorcols(struct colormap *cm) +{ + color *newarray; + int r, + c; + + if (cm->hiarraycols >= INT_MAX / (cm->maxarrayrows * 2)) + { + CERR(REG_ESPACE); + return; + } + newarray = (color *) REALLOC(cm->hicolormap, + cm->maxarrayrows * + cm->hiarraycols * 2 * sizeof(color)); + if (newarray == NULL) + { + CERR(REG_ESPACE); + return; + } + cm->hicolormap = newarray; + + /* Duplicate existing columns to the right, and increase ref counts */ + /* Must work backwards in the array because we realloc'd in place */ + for (r = cm->hiarrayrows - 1; r >= 0; r--) + { + color *oldrowptr = &newarray[r * cm->hiarraycols]; + color *newrowptr = &newarray[r * cm->hiarraycols * 2]; + color *newrowptr2 = newrowptr + cm->hiarraycols; + + for (c = 0; c < cm->hiarraycols; c++) + { + color co = oldrowptr[c]; + + newrowptr[c] = newrowptr2[c] = co; + cm->cd[co].nuchrs++; + } + } + + cm->hiarraycols *= 2; +} + +/* + * subcolorcvec - allocate new subcolors to cvec members, fill in arcs + * + * For each chr "c" represented by the cvec, do the equivalent of + * newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp); + * + * Note that in typical cases, many of the subcolors are the same. + * While newarc() would discard duplicate arc requests, we can save + * some cycles by not calling it repetitively to begin with. This is + * mechanized with the "lastsubcolor" state variable. 
+ */ +static void +subcolorcvec(struct vars *v, + struct cvec *cv, + struct state *lp, + struct state *rp) +{ + struct colormap *cm = v->cm; + color lastsubcolor = COLORLESS; + chr ch, + from, + to; + const chr *p; + int i; + + /* ordinary characters */ + for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) + { + ch = *p; + subcoloronechr(v, ch, lp, rp, &lastsubcolor); + NOERR(); + } + + /* and the ranges */ + for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) + { + from = *p; + to = *(p + 1); + if (from <= MAX_SIMPLE_CHR) + { + /* deal with simple chars one at a time */ + chr lim = (to <= MAX_SIMPLE_CHR) ? to : MAX_SIMPLE_CHR; + + while (from <= lim) + { + color sco = subcolor(cm, from); + + NOERR(); + if (sco != lastsubcolor) + { + newarc(v->nfa, PLAIN, sco, lp, rp); + NOERR(); + lastsubcolor = sco; + } + from++; + } + } + /* deal with any part of the range that's above MAX_SIMPLE_CHR */ + if (from < to) + subcoloronerange(v, from, to, lp, rp, &lastsubcolor); + else if (from == to) + subcoloronechr(v, from, lp, rp, &lastsubcolor); + NOERR(); + } + + /* and deal with cclass if any */ + if (cv->cclasscode >= 0) + { + int classbit; + color *pco; + int r, + c; + + /* Enlarge array if we don't have a column bit assignment for cclass */ + if (cm->classbits[cv->cclasscode] == 0) + { + cm->classbits[cv->cclasscode] = cm->hiarraycols; + newhicolorcols(cm); + NOERR(); + } + /* Apply subcolorhi() and make arc for each entry in relevant cols */ + classbit = cm->classbits[cv->cclasscode]; + pco = cm->hicolormap; + for (r = 0; r < cm->hiarrayrows; r++) + { + for (c = 0; c < cm->hiarraycols; c++) + { + if (c & classbit) + { + color sco = subcolorhi(cm, pco); + + NOERR(); + /* add the arc if needed */ + if (sco != lastsubcolor) + { + newarc(v->nfa, PLAIN, sco, lp, rp); + NOERR(); + lastsubcolor = sco; + } + } + pco++; + } + } + } +} + +/* + * subcoloronechr - do subcolorcvec's work for a singleton chr + * + * We could just let subcoloronerange do this, but it's a bit more 
efficient + * if we exploit the single-chr case. Also, callers find it useful for this + * to be able to handle both low and high chr codes. + */ +static void +subcoloronechr(struct vars *v, + chr ch, + struct state *lp, + struct state *rp, + color *lastsubcolor) +{ + struct colormap *cm = v->cm; + colormaprange *newranges; + int numnewranges; + colormaprange *oldrange; + int oldrangen; + int newrow; + + /* Easy case for low chr codes */ + if (ch <= MAX_SIMPLE_CHR) + { + color sco = subcolor(cm, ch); + + NOERR(); + if (sco != *lastsubcolor) + { + newarc(v->nfa, PLAIN, sco, lp, rp); + *lastsubcolor = sco; + } + return; + } + + /* + * Potentially, we could need two more colormapranges than we have now, if + * the given chr is in the middle of some existing range. + */ + newranges = (colormaprange *) + MALLOC((cm->numcmranges + 2) * sizeof(colormaprange)); + if (newranges == NULL) + { + CERR(REG_ESPACE); + return; + } + numnewranges = 0; + + /* Ranges before target are unchanged */ + for (oldrange = cm->cmranges, oldrangen = 0; + oldrangen < cm->numcmranges; + oldrange++, oldrangen++) + { + if (oldrange->cmax >= ch) + break; + newranges[numnewranges++] = *oldrange; + } + + /* Match target chr against current range */ + if (oldrangen >= cm->numcmranges || oldrange->cmin > ch) + { + /* chr does not belong to any existing range, make a new one */ + newranges[numnewranges].cmin = ch; + newranges[numnewranges].cmax = ch; + /* row state should be cloned from the "all others" row */ + newranges[numnewranges].rownum = newrow = newhicolorrow(cm, 0); + numnewranges++; + } + else if (oldrange->cmin == oldrange->cmax) + { + /* we have an existing singleton range matching the chr */ + newranges[numnewranges++] = *oldrange; + newrow = oldrange->rownum; + /* we've now fully processed this old range */ + oldrange++, oldrangen++; + } + else + { + /* chr is a subset of this existing range, must split it */ + if (ch > oldrange->cmin) + { + /* emit portion of old range before chr */ + 
newranges[numnewranges].cmin = oldrange->cmin; + newranges[numnewranges].cmax = ch - 1; + newranges[numnewranges].rownum = oldrange->rownum; + numnewranges++; + } + /* emit chr as singleton range, initially cloning from range */ + newranges[numnewranges].cmin = ch; + newranges[numnewranges].cmax = ch; + newranges[numnewranges].rownum = newrow = + newhicolorrow(cm, oldrange->rownum); + numnewranges++; + if (ch < oldrange->cmax) + { + /* emit portion of old range after chr */ + newranges[numnewranges].cmin = ch + 1; + newranges[numnewranges].cmax = oldrange->cmax; + /* must clone the row if we are making two new ranges from old */ + newranges[numnewranges].rownum = + (ch > oldrange->cmin) ? newhicolorrow(cm, oldrange->rownum) : + oldrange->rownum; + numnewranges++; + } + /* we've now fully processed this old range */ + oldrange++, oldrangen++; + } + + /* Update colors in newrow and create arcs as needed */ + subcoloronerow(v, newrow, lp, rp, lastsubcolor); + + /* Ranges after target are unchanged */ + for (; oldrangen < cm->numcmranges; oldrange++, oldrangen++) + { + newranges[numnewranges++] = *oldrange; + } + + /* Assert our original space estimate was adequate */ + assert(numnewranges <= (cm->numcmranges + 2)); + + /* And finally, store back the updated list of ranges */ + if (cm->cmranges != NULL) + FREE(cm->cmranges); + cm->cmranges = newranges; + cm->numcmranges = numnewranges; +} + +/* + * subcoloronerange - do subcolorcvec's work for a high range + */ +static void +subcoloronerange(struct vars *v, + chr from, + chr to, + struct state *lp, + struct state *rp, + color *lastsubcolor) +{ + struct colormap *cm = v->cm; + colormaprange *newranges; + int numnewranges; + colormaprange *oldrange; + int oldrangen; + int newrow; + + /* Caller should take care of non-high-range cases */ + assert(from > MAX_SIMPLE_CHR); + assert(from < to); + + /* + * Potentially, if we have N non-adjacent ranges, we could need as many as + * 2N+1 result ranges (consider case where new 
range spans 'em all). + */ + newranges = (colormaprange *) + MALLOC((cm->numcmranges * 2 + 1) * sizeof(colormaprange)); + if (newranges == NULL) + { + CERR(REG_ESPACE); + return; + } + numnewranges = 0; + + /* Ranges before target are unchanged */ + for (oldrange = cm->cmranges, oldrangen = 0; + oldrangen < cm->numcmranges; + oldrange++, oldrangen++) + { + if (oldrange->cmax >= from) + break; + newranges[numnewranges++] = *oldrange; + } + + /* + * Deal with ranges that (partially) overlap the target. As we process + * each such range, increase "from" to remove the dealt-with characters + * from the target range. + */ + while (oldrangen < cm->numcmranges && oldrange->cmin <= to) + { + if (from < oldrange->cmin) + { + /* Handle portion of new range that corresponds to no old range */ + newranges[numnewranges].cmin = from; + newranges[numnewranges].cmax = oldrange->cmin - 1; + /* row state should be cloned from the "all others" row */ + newranges[numnewranges].rownum = newrow = newhicolorrow(cm, 0); + numnewranges++; + /* Update colors in newrow and create arcs as needed */ + subcoloronerow(v, newrow, lp, rp, lastsubcolor); + /* We've now fully processed the part of new range before old */ + from = oldrange->cmin; + } + + if (from <= oldrange->cmin && to >= oldrange->cmax) + { + /* old range is fully contained in new, process it in-place */ + newranges[numnewranges++] = *oldrange; + newrow = oldrange->rownum; + from = oldrange->cmax + 1; + } + else + { + /* some part of old range does not overlap new range */ + if (from > oldrange->cmin) + { + /* emit portion of old range before new range */ + newranges[numnewranges].cmin = oldrange->cmin; + newranges[numnewranges].cmax = from - 1; + newranges[numnewranges].rownum = oldrange->rownum; + numnewranges++; + } + /* emit common subrange, initially cloning from old range */ + newranges[numnewranges].cmin = from; + newranges[numnewranges].cmax = + (to < oldrange->cmax) ? 
to : oldrange->cmax; + newranges[numnewranges].rownum = newrow = + newhicolorrow(cm, oldrange->rownum); + numnewranges++; + if (to < oldrange->cmax) + { + /* emit portion of old range after new range */ + newranges[numnewranges].cmin = to + 1; + newranges[numnewranges].cmax = oldrange->cmax; + /* must clone the row if we are making two new ranges from old */ + newranges[numnewranges].rownum = + (from > oldrange->cmin) ? newhicolorrow(cm, oldrange->rownum) : + oldrange->rownum; + numnewranges++; + } + from = oldrange->cmax + 1; + } + /* Update colors in newrow and create arcs as needed */ + subcoloronerow(v, newrow, lp, rp, lastsubcolor); + /* we've now fully processed this old range */ + oldrange++, oldrangen++; + } + + if (from <= to) + { + /* Handle portion of new range that corresponds to no old range */ + newranges[numnewranges].cmin = from; + newranges[numnewranges].cmax = to; + /* row state should be cloned from the "all others" row */ + newranges[numnewranges].rownum = newrow = newhicolorrow(cm, 0); + numnewranges++; + /* Update colors in newrow and create arcs as needed */ + subcoloronerow(v, newrow, lp, rp, lastsubcolor); + } + + /* Ranges after target are unchanged */ + for (; oldrangen < cm->numcmranges; oldrange++, oldrangen++) + { + newranges[numnewranges++] = *oldrange; + } + + /* Assert our original space estimate was adequate */ + assert(numnewranges <= (cm->numcmranges * 2 + 1)); + + /* And finally, store back the updated list of ranges */ + if (cm->cmranges != NULL) + FREE(cm->cmranges); + cm->cmranges = newranges; + cm->numcmranges = numnewranges; +} + +/* + * subcoloronerow - do subcolorcvec's work for one new row in the high colormap + */ +static void +subcoloronerow(struct vars *v, + int rownum, + struct state *lp, + struct state *rp, + color *lastsubcolor) +{ + struct colormap *cm = v->cm; + color *pco; + int i; + + /* Apply subcolorhi() and make arc for each entry in row */ + pco = &cm->hicolormap[rownum * cm->hiarraycols]; + for (i = 0; i < 
cm->hiarraycols; pco++, i++) + { + color sco = subcolorhi(cm, pco); + + NOERR(); + /* make the arc if needed */ + if (sco != *lastsubcolor) + { + newarc(v->nfa, PLAIN, sco, lp, rp); + NOERR(); + *lastsubcolor = sco; + } + } +} + +/* + * okcolors - promote subcolors to full colors + */ +static void +okcolors(struct nfa *nfa, + struct colormap *cm) +{ + struct colordesc *cd; + struct colordesc *end = CDEND(cm); + struct colordesc *scd; + struct arc *a; + color co; + color sco; + + for (cd = cm->cd, co = 0; cd < end; cd++, co++) + { + sco = cd->sub; + if (UNUSEDCOLOR(cd) || sco == NOSUB) + { + /* has no subcolor, no further action */ + } + else if (sco == co) + { + /* is subcolor, let parent deal with it */ + } + else if (cd->nschrs == 0 && cd->nuchrs == 0) + { + /* + * Parent is now empty, so just change all its arcs to the + * subcolor, then free the parent. + * + * It is not obvious that simply relabeling the arcs like this is + * OK; it appears to risk creating duplicate arcs. We are + * basically relying on the assumption that processing of a + * bracket expression can't create arcs of both a color and its + * subcolor between the bracket's endpoints. 
+ */ + cd->sub = NOSUB; + scd = &cm->cd[sco]; + assert(scd->nschrs > 0 || scd->nuchrs > 0); + assert(scd->sub == sco); + scd->sub = NOSUB; + while ((a = cd->arcs) != NULL) + { + assert(a->co == co); + uncolorchain(cm, a); + a->co = sco; + colorchain(cm, a); + } + freecolor(cm, co); + } + else + { + /* parent's arcs must gain parallel subcolor arcs */ + cd->sub = NOSUB; + scd = &cm->cd[sco]; + assert(scd->nschrs > 0 || scd->nuchrs > 0); + assert(scd->sub == sco); + scd->sub = NOSUB; + for (a = cd->arcs; a != NULL; a = a->colorchain) + { + assert(a->co == co); + newarc(nfa, a->type, sco, a->from, a->to); + } + } + } +} + +/* + * colorchain - add this arc to the color chain of its color + */ +static void +colorchain(struct colormap *cm, + struct arc *a) +{ + struct colordesc *cd = &cm->cd[a->co]; + + assert(a->co >= 0); + if (cd->arcs != NULL) + cd->arcs->colorchainRev = a; + a->colorchain = cd->arcs; + a->colorchainRev = NULL; + cd->arcs = a; +} + +/* + * uncolorchain - delete this arc from the color chain of its color + */ +static void +uncolorchain(struct colormap *cm, + struct arc *a) +{ + struct colordesc *cd = &cm->cd[a->co]; + struct arc *aa = a->colorchainRev; + + assert(a->co >= 0); + if (aa == NULL) + { + assert(cd->arcs == a); + cd->arcs = a->colorchain; + } + else + { + assert(aa->colorchain == a); + aa->colorchain = a->colorchain; + } + if (a->colorchain != NULL) + a->colorchain->colorchainRev = aa; + a->colorchain = NULL; /* paranoia */ + a->colorchainRev = NULL; +} + +/* + * rainbow - add arcs of all full colors (but one) between specified states + * + * If there isn't an exception color, we now generate just a single arc + * labeled RAINBOW, saving lots of arc-munging later on. 
+ */ +static void +rainbow(struct nfa *nfa, + struct colormap *cm, + int type, + color but, /* COLORLESS if no exceptions */ + struct state *from, + struct state *to) +{ + struct colordesc *cd; + struct colordesc *end = CDEND(cm); + color co; + + if (but == COLORLESS) + { + newarc(nfa, type, RAINBOW, from, to); + return; + } + + /* Gotta do it the hard way. Skip subcolors, pseudocolors, and "but" */ + for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++) + if (!UNUSEDCOLOR(cd) && cd->sub != co && co != but && + !(cd->flags & PSEUDO)) + newarc(nfa, type, co, from, to); +} + +/* + * colorcomplement - add arcs of complementary colors + * + * We add arcs of all colors that are not pseudocolors and do not match + * any of the "of" state's PLAIN outarcs. + * + * The calling sequence ought to be reconciled with cloneouts(). + */ +static void +colorcomplement(struct nfa *nfa, + struct colormap *cm, + int type, + struct state *of, + struct state *from, + struct state *to) +{ + struct colordesc *cd; + struct colordesc *end = CDEND(cm); + color co; + struct arc *a; + + assert(of != from); + + /* A RAINBOW arc matches all colors, making the complement empty */ + if (findarc(of, PLAIN, RAINBOW) != NULL) + return; + + /* Otherwise, transiently mark the colors that appear in of's out-arcs */ + for (a = of->outs; a != NULL; a = a->outchain) + { + if (a->type == PLAIN) + { + assert(a->co >= 0); + cd = &cm->cd[a->co]; + assert(!UNUSEDCOLOR(cd)); + cd->flags |= COLMARK; + } + } + + /* Scan colors, clear transient marks, add arcs for unmarked colors */ + for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++) + { + if (cd->flags & COLMARK) + cd->flags &= ~COLMARK; + else if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO)) + newarc(nfa, type, co, from, to); + } +} + + +#ifdef REG_DEBUG + +/* + * dumpcolors - debugging output + */ +static void +dumpcolors(struct colormap *cm, + FILE *f) +{ + struct colordesc *cd; + struct colordesc *end; + color co; + chr c; + + fprintf(f, "max 
%ld\n", (long) cm->max); + end = CDEND(cm); + for (cd = cm->cd + 1, co = 1; cd < end; cd++, co++) /* skip 0 */ + { + if (!UNUSEDCOLOR(cd)) + { + assert(cd->nschrs > 0 || cd->nuchrs > 0); + if (cd->flags & PSEUDO) + fprintf(f, "#%2ld(ps): ", (long) co); + else + fprintf(f, "#%2ld(%2d): ", (long) co, cd->nschrs + cd->nuchrs); + + /* + * Unfortunately, it's hard to do this next bit more efficiently. + */ + for (c = CHR_MIN; c <= MAX_SIMPLE_CHR; c++) + if (GETCOLOR(cm, c) == co) + dumpchr(c, f); + fprintf(f, "\n"); + } + } + /* dump the high colormap if it contains anything interesting */ + if (cm->hiarrayrows > 1 || cm->hiarraycols > 1) + { + int r, + c; + const color *rowptr; + + fprintf(f, "other:\t"); + for (c = 0; c < cm->hiarraycols; c++) + { + fprintf(f, "\t%ld", (long) cm->hicolormap[c]); + } + fprintf(f, "\n"); + for (r = 0; r < cm->numcmranges; r++) + { + dumpchr(cm->cmranges[r].cmin, f); + fprintf(f, ".."); + dumpchr(cm->cmranges[r].cmax, f); + fprintf(f, ":"); + rowptr = &cm->hicolormap[cm->cmranges[r].rownum * cm->hiarraycols]; + for (c = 0; c < cm->hiarraycols; c++) + { + fprintf(f, "\t%ld", (long) rowptr[c]); + } + fprintf(f, "\n"); + } + } +} + +/* + * dumpchr - print a chr + * + * Kind of char-centric but works well enough for debug use. + */ +static void +dumpchr(chr c, + FILE *f) +{ + if (c == '\\') + fprintf(f, "\\\\"); + else if (c > ' ' && c <= '~') + putc((char) c, f); + else + fprintf(f, "\\u%04lx", (long) c); +} + +#endif /* REG_DEBUG */ diff --git a/src/backend/regex/regc_cvec.c b/src/backend/regex/regc_cvec.c new file mode 100644 index 0000000..1030621 --- /dev/null +++ b/src/backend/regex/regc_cvec.c @@ -0,0 +1,138 @@ +/* + * Utility functions for handling cvecs + * This file is #included by regcomp.c. + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. 
+ * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * src/backend/regex/regc_cvec.c + * + */ + +/* + * Notes: + * Only (selected) functions in _this_ file should treat the chr arrays + * of a cvec as non-constant. + */ + +/* + * newcvec - allocate a new cvec + */ +static struct cvec * +newcvec(int nchrs, /* to hold this many chrs... */ + int nranges) /* ... 
and this many ranges */ +{ + size_t nc = (size_t) nchrs + (size_t) nranges * 2; + size_t n = sizeof(struct cvec) + nc * sizeof(chr); + struct cvec *cv = (struct cvec *) MALLOC(n); + + if (cv == NULL) + return NULL; + cv->chrspace = nchrs; + cv->chrs = (chr *) (((char *) cv) + sizeof(struct cvec)); + cv->ranges = cv->chrs + nchrs; + cv->rangespace = nranges; + return clearcvec(cv); +} + +/* + * clearcvec - clear a possibly-new cvec + * Returns pointer as convenience. + */ +static struct cvec * +clearcvec(struct cvec *cv) +{ + assert(cv != NULL); + cv->nchrs = 0; + cv->nranges = 0; + cv->cclasscode = -1; + return cv; +} + +/* + * addchr - add a chr to a cvec + */ +static void +addchr(struct cvec *cv, /* character vector */ + chr c) /* character to add */ +{ + assert(cv->nchrs < cv->chrspace); + cv->chrs[cv->nchrs++] = c; +} + +/* + * addrange - add a range to a cvec + */ +static void +addrange(struct cvec *cv, /* character vector */ + chr from, /* first character of range */ + chr to) /* last character of range */ +{ + assert(cv->nranges < cv->rangespace); + cv->ranges[cv->nranges * 2] = from; + cv->ranges[cv->nranges * 2 + 1] = to; + cv->nranges++; +} + +/* + * getcvec - get a transient cvec, initialized to empty + * + * The returned cvec is valid only until the next call of getcvec, which + * typically will recycle the space. Callers should *not* free the cvec + * explicitly; it will be cleaned up when the struct vars is destroyed. + * + * This is typically used while interpreting bracket expressions. In that + * usage the cvec is only needed momentarily until we build arcs from it, + * so transientness is a convenient behavior. + */ +static struct cvec * +getcvec(struct vars *v, /* context */ + int nchrs, /* to hold this many chrs... */ + int nranges) /* ... 
and this many ranges */ +{ + /* recycle existing transient cvec if large enough */ + if (v->cv != NULL && nchrs <= v->cv->chrspace && + nranges <= v->cv->rangespace) + return clearcvec(v->cv); + + /* nope, make a new one */ + if (v->cv != NULL) + freecvec(v->cv); + v->cv = newcvec(nchrs, nranges); + if (v->cv == NULL) + ERR(REG_ESPACE); + + return v->cv; +} + +/* + * freecvec - free a cvec + */ +static void +freecvec(struct cvec *cv) +{ + FREE(cv); +} diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c new file mode 100644 index 0000000..7673dab --- /dev/null +++ b/src/backend/regex/regc_lex.c @@ -0,0 +1,1039 @@ +/* + * lexical analyzer + * This file is #included by regcomp.c. + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * src/backend/regex/regc_lex.c + * + */ + +/* scanning macros (know about v) */ +#define ATEOS() (v->now >= v->stop) +#define HAVE(n) (v->stop - v->now >= (n)) +#define NEXT1(c) (!ATEOS() && *v->now == CHR(c)) +#define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b)) +#define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \ + *(v->now+1) == CHR(b) && \ + *(v->now+2) == CHR(c)) +#define SET(c) (v->nexttype = (c)) +#define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n)) +#define RET(c) return (SET(c), 1) +#define RETV(c, n) return (SETV(c, n), 1) +#define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */ +#define LASTTYPE(t) (v->lasttype == (t)) + +/* lexical contexts */ +#define L_ERE 1 /* mainline ERE/ARE */ +#define L_BRE 2 /* mainline BRE */ +#define L_Q 3 /* REG_QUOTE */ +#define L_EBND 4 /* ERE/ARE bound */ +#define L_BBND 5 /* BRE bound */ +#define L_BRACK 6 /* brackets */ +#define L_CEL 7 /* collating element */ +#define L_ECL 8 /* equivalence class */ +#define L_CCL 9 /* character class */ +#define INTOCON(c) (v->lexcon = (c)) +#define INCON(con) (v->lexcon == (con)) + +/* construct pointer past end of chr array */ +#define ENDOF(array) ((array) + sizeof(array)/sizeof(chr)) + +/* + * lexstart - set up lexical stuff, scan leading options + */ +static void +lexstart(struct vars *v) +{ + prefixes(v); /* may turn on new type bits etc. 
*/ + NOERR(); + + if (v->cflags & REG_QUOTE) + { + assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE))); + INTOCON(L_Q); + } + else if (v->cflags & REG_EXTENDED) + { + assert(!(v->cflags & REG_QUOTE)); + INTOCON(L_ERE); + } + else + { + assert(!(v->cflags & (REG_QUOTE | REG_ADVF))); + INTOCON(L_BRE); + } + + v->nexttype = EMPTY; /* remember we were at the start */ + next(v); /* set up the first token */ +} + +/* + * prefixes - implement various special prefixes + */ +static void +prefixes(struct vars *v) +{ + /* literal string doesn't get any of this stuff */ + if (v->cflags & REG_QUOTE) + return; + + /* initial "***" gets special things */ + if (HAVE(4) && NEXT3('*', '*', '*')) + switch (*(v->now + 3)) + { + case CHR('?'): /* "***?" error, msg shows version */ + ERR(REG_BADPAT); + return; /* proceed no further */ + break; + case CHR('='): /* "***=" shifts to literal string */ + NOTE(REG_UNONPOSIX); + v->cflags |= REG_QUOTE; + v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE); + v->now += 4; + return; /* and there can be no more prefixes */ + break; + case CHR(':'): /* "***:" shifts to AREs */ + NOTE(REG_UNONPOSIX); + v->cflags |= REG_ADVANCED; + v->now += 4; + break; + default: /* otherwise *** is just an error */ + ERR(REG_BADRPT); + return; + break; + } + + /* BREs and EREs don't get embedded options */ + if ((v->cflags & REG_ADVANCED) != REG_ADVANCED) + return; + + /* embedded options (AREs only) */ + if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) + { + NOTE(REG_UNONPOSIX); + v->now += 2; + for (; !ATEOS() && iscalpha(*v->now); v->now++) + switch (*v->now) + { + case CHR('b'): /* BREs (but why???) 
*/ + v->cflags &= ~(REG_ADVANCED | REG_QUOTE); + break; + case CHR('c'): /* case sensitive */ + v->cflags &= ~REG_ICASE; + break; + case CHR('e'): /* plain EREs */ + v->cflags |= REG_EXTENDED; + v->cflags &= ~(REG_ADVF | REG_QUOTE); + break; + case CHR('i'): /* case insensitive */ + v->cflags |= REG_ICASE; + break; + case CHR('m'): /* Perloid synonym for n */ + case CHR('n'): /* \n affects ^ $ . [^ */ + v->cflags |= REG_NEWLINE; + break; + case CHR('p'): /* ~Perl, \n affects . [^ */ + v->cflags |= REG_NLSTOP; + v->cflags &= ~REG_NLANCH; + break; + case CHR('q'): /* literal string */ + v->cflags |= REG_QUOTE; + v->cflags &= ~REG_ADVANCED; + break; + case CHR('s'): /* single line, \n ordinary */ + v->cflags &= ~REG_NEWLINE; + break; + case CHR('t'): /* tight syntax */ + v->cflags &= ~REG_EXPANDED; + break; + case CHR('w'): /* weird, \n affects ^ $ only */ + v->cflags &= ~REG_NLSTOP; + v->cflags |= REG_NLANCH; + break; + case CHR('x'): /* expanded syntax */ + v->cflags |= REG_EXPANDED; + break; + default: + ERR(REG_BADOPT); + return; + } + if (!NEXT1(')')) + { + ERR(REG_BADOPT); + return; + } + v->now++; + if (v->cflags & REG_QUOTE) + v->cflags &= ~(REG_EXPANDED | REG_NEWLINE); + } +} + +/* + * next - get next token + */ +static int /* 1 normal, 0 failure */ +next(struct vars *v) +{ + chr c; + + /* errors yield an infinite sequence of failures */ + if (ISERR()) + return 0; /* the error has set nexttype to EOS */ + + /* remember flavor of last token */ + v->lasttype = v->nexttype; + + /* REG_BOSONLY */ + if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY)) + { + /* at start of a REG_BOSONLY RE */ + RETV(SBEGIN, 0); /* same as \A */ + } + + /* skip white space etc. 
if appropriate (not in literal or []) */ + if (v->cflags & REG_EXPANDED) + switch (v->lexcon) + { + case L_ERE: + case L_BRE: + case L_EBND: + case L_BBND: + skip(v); + break; + } + + /* handle EOS, depending on context */ + if (ATEOS()) + { + switch (v->lexcon) + { + case L_ERE: + case L_BRE: + case L_Q: + RET(EOS); + break; + case L_EBND: + case L_BBND: + FAILW(REG_EBRACE); + break; + case L_BRACK: + case L_CEL: + case L_ECL: + case L_CCL: + FAILW(REG_EBRACK); + break; + } + assert(NOTREACHED); + } + + /* okay, time to actually get a character */ + c = *v->now++; + + /* deal with the easy contexts, punt EREs to code below */ + switch (v->lexcon) + { + case L_BRE: /* punt BREs to separate function */ + return brenext(v, c); + break; + case L_ERE: /* see below */ + break; + case L_Q: /* literal strings are easy */ + RETV(PLAIN, c); + break; + case L_BBND: /* bounds are fairly simple */ + case L_EBND: + switch (c) + { + case CHR('0'): + case CHR('1'): + case CHR('2'): + case CHR('3'): + case CHR('4'): + case CHR('5'): + case CHR('6'): + case CHR('7'): + case CHR('8'): + case CHR('9'): + RETV(DIGIT, (chr) DIGITVAL(c)); + break; + case CHR(','): + RET(','); + break; + case CHR('}'): /* ERE bound ends with } */ + if (INCON(L_EBND)) + { + INTOCON(L_ERE); + if ((v->cflags & REG_ADVF) && NEXT1('?')) + { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('}', 0); + } + RETV('}', 1); + } + else + FAILW(REG_BADBR); + break; + case CHR('\\'): /* BRE bound ends with \} */ + if (INCON(L_BBND) && NEXT1('}')) + { + v->now++; + INTOCON(L_BRE); + RETV('}', 1); + } + else + FAILW(REG_BADBR); + break; + default: + FAILW(REG_BADBR); + break; + } + assert(NOTREACHED); + break; + case L_BRACK: /* brackets are not too hard */ + switch (c) + { + case CHR(']'): + if (LASTTYPE('[')) + RETV(PLAIN, c); + else + { + INTOCON((v->cflags & REG_EXTENDED) ? 
+ L_ERE : L_BRE); + RET(']'); + } + break; + case CHR('\\'): + NOTE(REG_UBBS); + if (!(v->cflags & REG_ADVF)) + RETV(PLAIN, c); + NOTE(REG_UNONPOSIX); + if (ATEOS()) + FAILW(REG_EESCAPE); + if (!lexescape(v)) + return 0; + switch (v->nexttype) + { /* not all escapes okay here */ + case PLAIN: + case CCLASSS: + case CCLASSC: + return 1; + break; + } + /* not one of the acceptable escapes */ + FAILW(REG_EESCAPE); + break; + case CHR('-'): + if (LASTTYPE('[') || NEXT1(']')) + RETV(PLAIN, c); + else + RETV(RANGE, c); + break; + case CHR('['): + if (ATEOS()) + FAILW(REG_EBRACK); + switch (*v->now++) + { + case CHR('.'): + INTOCON(L_CEL); + /* might or might not be locale-specific */ + RET(COLLEL); + break; + case CHR('='): + INTOCON(L_ECL); + NOTE(REG_ULOCALE); + RET(ECLASS); + break; + case CHR(':'): + INTOCON(L_CCL); + NOTE(REG_ULOCALE); + RET(CCLASS); + break; + default: /* oops */ + v->now--; + RETV(PLAIN, c); + break; + } + assert(NOTREACHED); + break; + default: + RETV(PLAIN, c); + break; + } + assert(NOTREACHED); + break; + case L_CEL: /* collating elements are easy */ + if (c == CHR('.') && NEXT1(']')) + { + v->now++; + INTOCON(L_BRACK); + RETV(END, '.'); + } + else + RETV(PLAIN, c); + break; + case L_ECL: /* ditto equivalence classes */ + if (c == CHR('=') && NEXT1(']')) + { + v->now++; + INTOCON(L_BRACK); + RETV(END, '='); + } + else + RETV(PLAIN, c); + break; + case L_CCL: /* ditto character classes */ + if (c == CHR(':') && NEXT1(']')) + { + v->now++; + INTOCON(L_BRACK); + RETV(END, ':'); + } + else + RETV(PLAIN, c); + break; + default: + assert(NOTREACHED); + break; + } + + /* that got rid of everything except EREs and AREs */ + assert(INCON(L_ERE)); + + /* deal with EREs and AREs, except for backslashes */ + switch (c) + { + case CHR('|'): + RET('|'); + break; + case CHR('*'): + if ((v->cflags & REG_ADVF) && NEXT1('?')) + { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('*', 0); + } + RETV('*', 1); + break; + case CHR('+'): + if ((v->cflags & REG_ADVF) && 
NEXT1('?')) + { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('+', 0); + } + RETV('+', 1); + break; + case CHR('?'): + if ((v->cflags & REG_ADVF) && NEXT1('?')) + { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('?', 0); + } + RETV('?', 1); + break; + case CHR('{'): /* bounds start or plain character */ + if (v->cflags & REG_EXPANDED) + skip(v); + if (ATEOS() || !iscdigit(*v->now)) + { + NOTE(REG_UBRACES); + NOTE(REG_UUNSPEC); + RETV(PLAIN, c); + } + else + { + NOTE(REG_UBOUNDS); + INTOCON(L_EBND); + RET('{'); + } + assert(NOTREACHED); + break; + case CHR('('): /* parenthesis, or advanced extension */ + if ((v->cflags & REG_ADVF) && NEXT1('?')) + { + NOTE(REG_UNONPOSIX); + v->now++; + if (ATEOS()) + FAILW(REG_BADRPT); + switch (*v->now++) + { + case CHR(':'): /* non-capturing paren */ + RETV('(', 0); + break; + case CHR('#'): /* comment */ + while (!ATEOS() && *v->now != CHR(')')) + v->now++; + if (!ATEOS()) + v->now++; + assert(v->nexttype == v->lasttype); + return next(v); + break; + case CHR('='): /* positive lookahead */ + NOTE(REG_ULOOKAROUND); + RETV(LACON, LATYPE_AHEAD_POS); + break; + case CHR('!'): /* negative lookahead */ + NOTE(REG_ULOOKAROUND); + RETV(LACON, LATYPE_AHEAD_NEG); + break; + case CHR('<'): + if (ATEOS()) + FAILW(REG_BADRPT); + switch (*v->now++) + { + case CHR('='): /* positive lookbehind */ + NOTE(REG_ULOOKAROUND); + RETV(LACON, LATYPE_BEHIND_POS); + break; + case CHR('!'): /* negative lookbehind */ + NOTE(REG_ULOOKAROUND); + RETV(LACON, LATYPE_BEHIND_NEG); + break; + default: + FAILW(REG_BADRPT); + break; + } + assert(NOTREACHED); + break; + default: + FAILW(REG_BADRPT); + break; + } + assert(NOTREACHED); + } + if (v->cflags & REG_NOSUB) + RETV('(', 0); /* all parens non-capturing */ + else + RETV('(', 1); + break; + case CHR(')'): + if (LASTTYPE('(')) + NOTE(REG_UUNSPEC); + RETV(')', c); + break; + case CHR('['): /* easy except for [[:<:]] and [[:>:]] */ + if (HAVE(6) && *(v->now + 0) == CHR('[') && + *(v->now + 1) == CHR(':') && + (*(v->now + 2) 
== CHR('<') || + *(v->now + 2) == CHR('>')) && + *(v->now + 3) == CHR(':') && + *(v->now + 4) == CHR(']') && + *(v->now + 5) == CHR(']')) + { + c = *(v->now + 2); + v->now += 6; + NOTE(REG_UNONPOSIX); + RET((c == CHR('<')) ? '<' : '>'); + } + INTOCON(L_BRACK); + if (NEXT1('^')) + { + v->now++; + RETV('[', 0); + } + RETV('[', 1); + break; + case CHR('.'): + RET('.'); + break; + case CHR('^'): + RET('^'); + break; + case CHR('$'): + RET('$'); + break; + case CHR('\\'): /* mostly punt backslashes to code below */ + if (ATEOS()) + FAILW(REG_EESCAPE); + break; + default: /* ordinary character */ + RETV(PLAIN, c); + break; + } + + /* ERE/ARE backslash handling; backslash already eaten */ + assert(!ATEOS()); + if (!(v->cflags & REG_ADVF)) + { /* only AREs have non-trivial escapes */ + if (iscalnum(*v->now)) + { + NOTE(REG_UBSALNUM); + NOTE(REG_UUNSPEC); + } + RETV(PLAIN, *v->now++); + } + return lexescape(v); +} + +/* + * lexescape - parse an ARE backslash escape (backslash already eaten) + * + * This is used for ARE backslashes both normally and inside bracket + * expressions. In the latter case, not all escape types are allowed, + * but the caller must reject unwanted ones after we return. 
+ */ +static int +lexescape(struct vars *v) +{ + chr c; + static const chr alert[] = { + CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') + }; + static const chr esc[] = { + CHR('E'), CHR('S'), CHR('C') + }; + const chr *save; + + assert(v->cflags & REG_ADVF); + + assert(!ATEOS()); + c = *v->now++; + if (!iscalnum(c)) + RETV(PLAIN, c); + + NOTE(REG_UNONPOSIX); + switch (c) + { + case CHR('a'): + RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007'))); + break; + case CHR('A'): + RETV(SBEGIN, 0); + break; + case CHR('b'): + RETV(PLAIN, CHR('\b')); + break; + case CHR('B'): + RETV(PLAIN, CHR('\\')); + break; + case CHR('c'): + NOTE(REG_UUNPORT); + if (ATEOS()) + FAILW(REG_EESCAPE); + RETV(PLAIN, (chr) (*v->now++ & 037)); + break; + case CHR('d'): + NOTE(REG_ULOCALE); + RETV(CCLASSS, CC_DIGIT); + break; + case CHR('D'): + NOTE(REG_ULOCALE); + RETV(CCLASSC, CC_DIGIT); + break; + case CHR('e'): + NOTE(REG_UUNPORT); + RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033'))); + break; + case CHR('f'): + RETV(PLAIN, CHR('\f')); + break; + case CHR('m'): + RET('<'); + break; + case CHR('M'): + RET('>'); + break; + case CHR('n'): + RETV(PLAIN, CHR('\n')); + break; + case CHR('r'): + RETV(PLAIN, CHR('\r')); + break; + case CHR('s'): + NOTE(REG_ULOCALE); + RETV(CCLASSS, CC_SPACE); + break; + case CHR('S'): + NOTE(REG_ULOCALE); + RETV(CCLASSC, CC_SPACE); + break; + case CHR('t'): + RETV(PLAIN, CHR('\t')); + break; + case CHR('u'): + c = lexdigits(v, 16, 4, 4); + if (ISERR() || !CHR_IS_IN_RANGE(c)) + FAILW(REG_EESCAPE); + RETV(PLAIN, c); + break; + case CHR('U'): + c = lexdigits(v, 16, 8, 8); + if (ISERR() || !CHR_IS_IN_RANGE(c)) + FAILW(REG_EESCAPE); + RETV(PLAIN, c); + break; + case CHR('v'): + RETV(PLAIN, CHR('\v')); + break; + case CHR('w'): + NOTE(REG_ULOCALE); + RETV(CCLASSS, CC_WORD); + break; + case CHR('W'): + NOTE(REG_ULOCALE); + RETV(CCLASSC, CC_WORD); + break; + case CHR('x'): + NOTE(REG_UUNPORT); + c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */ + if 
(ISERR() || !CHR_IS_IN_RANGE(c)) + FAILW(REG_EESCAPE); + RETV(PLAIN, c); + break; + case CHR('y'): + NOTE(REG_ULOCALE); + RETV(WBDRY, 0); + break; + case CHR('Y'): + NOTE(REG_ULOCALE); + RETV(NWBDRY, 0); + break; + case CHR('Z'): + RETV(SEND, 0); + break; + case CHR('1'): + case CHR('2'): + case CHR('3'): + case CHR('4'): + case CHR('5'): + case CHR('6'): + case CHR('7'): + case CHR('8'): + case CHR('9'): + save = v->now; + v->now--; /* put first digit back */ + c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ + if (ISERR()) + FAILW(REG_EESCAPE); + /* ugly heuristic (first test is "exactly 1 digit?") */ + if (v->now == save || ((int) c > 0 && (int) c <= v->nsubexp)) + { + NOTE(REG_UBACKREF); + RETV(BACKREF, c); + } + /* oops, doesn't look like it's a backref after all... */ + v->now = save; + /* and fall through into octal number */ + /* FALLTHROUGH */ + case CHR('0'): + NOTE(REG_UUNPORT); + v->now--; /* put first digit back */ + c = lexdigits(v, 8, 1, 3); + if (ISERR()) + FAILW(REG_EESCAPE); + if (c > 0xff) + { + /* out of range, so we handled one digit too much */ + v->now--; + c >>= 3; + } + RETV(PLAIN, c); + break; + default: + assert(iscalpha(c)); + FAILW(REG_EESCAPE); /* unknown alphabetic escape */ + break; + } + assert(NOTREACHED); +} + +/* + * lexdigits - slurp up digits and return chr value + * + * This does not account for overflow; callers should range-check the result + * if maxlen is large enough to make that possible. 
+ */ +static chr /* chr value; errors signalled via ERR */ +lexdigits(struct vars *v, + int base, + int minlen, + int maxlen) +{ + uchr n; /* unsigned to avoid overflow misbehavior */ + int len; + chr c; + int d; + const uchr ub = (uchr) base; + + n = 0; + for (len = 0; len < maxlen && !ATEOS(); len++) + { + c = *v->now++; + switch (c) + { + case CHR('0'): + case CHR('1'): + case CHR('2'): + case CHR('3'): + case CHR('4'): + case CHR('5'): + case CHR('6'): + case CHR('7'): + case CHR('8'): + case CHR('9'): + d = DIGITVAL(c); + break; + case CHR('a'): + case CHR('A'): + d = 10; + break; + case CHR('b'): + case CHR('B'): + d = 11; + break; + case CHR('c'): + case CHR('C'): + d = 12; + break; + case CHR('d'): + case CHR('D'): + d = 13; + break; + case CHR('e'): + case CHR('E'): + d = 14; + break; + case CHR('f'): + case CHR('F'): + d = 15; + break; + default: + v->now--; /* oops, not a digit at all */ + d = -1; + break; + } + + if (d >= base) + { /* not a plausible digit */ + v->now--; + d = -1; + } + if (d < 0) + break; /* NOTE BREAK OUT */ + n = n * ub + (uchr) d; + } + if (len < minlen) + ERR(REG_EESCAPE); + + return (chr) n; +} + +/* + * brenext - get next BRE token + * + * This is much like EREs except for all the stupid backslashes and the + * context-dependency of some things. + */ +static int /* 1 normal, 0 failure */ +brenext(struct vars *v, + chr c) +{ + switch (c) + { + case CHR('*'): + if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^')) + RETV(PLAIN, c); + RETV('*', 1); + break; + case CHR('['): + if (HAVE(6) && *(v->now + 0) == CHR('[') && + *(v->now + 1) == CHR(':') && + (*(v->now + 2) == CHR('<') || + *(v->now + 2) == CHR('>')) && + *(v->now + 3) == CHR(':') && + *(v->now + 4) == CHR(']') && + *(v->now + 5) == CHR(']')) + { + c = *(v->now + 2); + v->now += 6; + NOTE(REG_UNONPOSIX); + RET((c == CHR('<')) ? 
'<' : '>'); + } + INTOCON(L_BRACK); + if (NEXT1('^')) + { + v->now++; + RETV('[', 0); + } + RETV('[', 1); + break; + case CHR('.'): + RET('.'); + break; + case CHR('^'): + if (LASTTYPE(EMPTY)) + RET('^'); + if (LASTTYPE('(')) + { + NOTE(REG_UUNSPEC); + RET('^'); + } + RETV(PLAIN, c); + break; + case CHR('$'): + if (v->cflags & REG_EXPANDED) + skip(v); + if (ATEOS()) + RET('$'); + if (NEXT2('\\', ')')) + { + NOTE(REG_UUNSPEC); + RET('$'); + } + RETV(PLAIN, c); + break; + case CHR('\\'): + break; /* see below */ + default: + RETV(PLAIN, c); + break; + } + + assert(c == CHR('\\')); + + if (ATEOS()) + FAILW(REG_EESCAPE); + + c = *v->now++; + switch (c) + { + case CHR('{'): + INTOCON(L_BBND); + NOTE(REG_UBOUNDS); + RET('{'); + break; + case CHR('('): + RETV('(', 1); + break; + case CHR(')'): + RETV(')', c); + break; + case CHR('<'): + NOTE(REG_UNONPOSIX); + RET('<'); + break; + case CHR('>'): + NOTE(REG_UNONPOSIX); + RET('>'); + break; + case CHR('1'): + case CHR('2'): + case CHR('3'): + case CHR('4'): + case CHR('5'): + case CHR('6'): + case CHR('7'): + case CHR('8'): + case CHR('9'): + NOTE(REG_UBACKREF); + RETV(BACKREF, (chr) DIGITVAL(c)); + break; + default: + if (iscalnum(c)) + { + NOTE(REG_UBSALNUM); + NOTE(REG_UUNSPEC); + } + RETV(PLAIN, c); + break; + } + + assert(NOTREACHED); + return 0; +} + +/* + * skip - skip white space and comments in expanded form + */ +static void +skip(struct vars *v) +{ + const chr *start = v->now; + + assert(v->cflags & REG_EXPANDED); + + for (;;) + { + while (!ATEOS() && iscspace(*v->now)) + v->now++; + if (ATEOS() || *v->now != CHR('#')) + break; /* NOTE BREAK OUT */ + assert(NEXT1('#')); + while (!ATEOS() && *v->now != CHR('\n')) + v->now++; + /* leave the newline to be picked up by the iscspace loop */ + } + + if (v->now != start) + NOTE(REG_UNONPOSIX); +} + +/* + * newline - return the chr for a newline + * + * This helps confine use of CHR to this source file. 
+ */ +static chr +newline(void) +{ + return CHR('\n'); +} + +/* + * chrnamed - return the chr known by a given (chr string) name + * + * The code is a bit clumsy, but this routine gets only such specialized + * use that it hardly matters. + */ +static chr +chrnamed(struct vars *v, + const chr *startp, /* start of name */ + const chr *endp, /* just past end of name */ + chr lastresort) /* what to return if name lookup fails */ +{ + chr c; + int errsave; + int e; + struct cvec *cv; + + errsave = v->err; + v->err = 0; + c = element(v, startp, endp); + e = v->err; + v->err = errsave; + + if (e != 0) + return lastresort; + + cv = range(v, c, c, 0); + if (cv->nchrs == 0) + return lastresort; + return cv->chrs[0]; +} diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c new file mode 100644 index 0000000..b5f3a73 --- /dev/null +++ b/src/backend/regex/regc_locale.c @@ -0,0 +1,771 @@ +/* + * regc_locale.c -- + * + * This file contains locale-specific regexp routines. + * This file is #included by regcomp.c. + * + * Copyright (c) 1998 by Scriptics Corporation. + * + * This software is copyrighted by the Regents of the University of + * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState + * Corporation and other parties. The following terms apply to all files + * associated with the software unless explicitly disclaimed in + * individual files. + * + * The authors hereby grant permission to use, copy, modify, distribute, + * and license this software and its documentation for any purpose, provided + * that existing copyright notices are retained in all copies and that this + * notice is included verbatim in any distributions. No written agreement, + * license, or royalty fee is required for any of the authorized uses. 
+ * Modifications to this software may be copyrighted by their authors + * and need not follow the licensing terms described here, provided that + * the new terms are clearly indicated on the first page of each file where + * they apply. + * + * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY + * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY + * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE + * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE + * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR + * MODIFICATIONS. + * + * GOVERNMENT USE: If you are acquiring this software on behalf of the + * U.S. government, the Government shall have only "Restricted Rights" + * in the software and related documentation as defined in the Federal + * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you + * are acquiring the software on behalf of the Department of Defense, the + * software shall be classified as "Commercial Computer Software" and the + * Government shall have only "Restricted Rights" as defined in Clause + * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the + * authors grant the U.S. Government and others acting in its behalf + * permission to use and distribute the software in accordance with the + * terms specified in this license. 
+ * + * src/backend/regex/regc_locale.c + */ + +/* ASCII character-name table */ + +static const struct cname +{ + const char *name; + const char code; +} cnames[] = + +{ + { + "NUL", '\0' + }, + { + "SOH", '\001' + }, + { + "STX", '\002' + }, + { + "ETX", '\003' + }, + { + "EOT", '\004' + }, + { + "ENQ", '\005' + }, + { + "ACK", '\006' + }, + { + "BEL", '\007' + }, + { + "alert", '\007' + }, + { + "BS", '\010' + }, + { + "backspace", '\b' + }, + { + "HT", '\011' + }, + { + "tab", '\t' + }, + { + "LF", '\012' + }, + { + "newline", '\n' + }, + { + "VT", '\013' + }, + { + "vertical-tab", '\v' + }, + { + "FF", '\014' + }, + { + "form-feed", '\f' + }, + { + "CR", '\015' + }, + { + "carriage-return", '\r' + }, + { + "SO", '\016' + }, + { + "SI", '\017' + }, + { + "DLE", '\020' + }, + { + "DC1", '\021' + }, + { + "DC2", '\022' + }, + { + "DC3", '\023' + }, + { + "DC4", '\024' + }, + { + "NAK", '\025' + }, + { + "SYN", '\026' + }, + { + "ETB", '\027' + }, + { + "CAN", '\030' + }, + { + "EM", '\031' + }, + { + "SUB", '\032' + }, + { + "ESC", '\033' + }, + { + "IS4", '\034' + }, + { + "FS", '\034' + }, + { + "IS3", '\035' + }, + { + "GS", '\035' + }, + { + "IS2", '\036' + }, + { + "RS", '\036' + }, + { + "IS1", '\037' + }, + { + "US", '\037' + }, + { + "space", ' ' + }, + { + "exclamation-mark", '!' + }, + { + "quotation-mark", '"' + }, + { + "number-sign", '#' + }, + { + "dollar-sign", '$' + }, + { + "percent-sign", '%' + }, + { + "ampersand", '&' + }, + { + "apostrophe", '\'' + }, + { + "left-parenthesis", '(' + }, + { + "right-parenthesis", ')' + }, + { + "asterisk", '*' + }, + { + "plus-sign", '+' + }, + { + "comma", ',' + }, + { + "hyphen", '-' + }, + { + "hyphen-minus", '-' + }, + { + "period", '.' + }, + { + "full-stop", '.' 
+ }, + { + "slash", '/' + }, + { + "solidus", '/' + }, + { + "zero", '0' + }, + { + "one", '1' + }, + { + "two", '2' + }, + { + "three", '3' + }, + { + "four", '4' + }, + { + "five", '5' + }, + { + "six", '6' + }, + { + "seven", '7' + }, + { + "eight", '8' + }, + { + "nine", '9' + }, + { + "colon", ':' + }, + { + "semicolon", ';' + }, + { + "less-than-sign", '<' + }, + { + "equals-sign", '=' + }, + { + "greater-than-sign", '>' + }, + { + "question-mark", '?' + }, + { + "commercial-at", '@' + }, + { + "left-square-bracket", '[' + }, + { + "backslash", '\\' + }, + { + "reverse-solidus", '\\' + }, + { + "right-square-bracket", ']' + }, + { + "circumflex", '^' + }, + { + "circumflex-accent", '^' + }, + { + "underscore", '_' + }, + { + "low-line", '_' + }, + { + "grave-accent", '`' + }, + { + "left-brace", '{' + }, + { + "left-curly-bracket", '{' + }, + { + "vertical-line", '|' + }, + { + "right-brace", '}' + }, + { + "right-curly-bracket", '}' + }, + { + "tilde", '~' + }, + { + "DEL", '\177' + }, + { + NULL, 0 + } +}; + +/* + * The following array defines the valid character class names. + * The entries must match enum char_classes in regguts.h. + */ +static const char *const classNames[NUM_CCLASSES + 1] = { + "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", + "lower", "print", "punct", "space", "upper", "xdigit", "word", + NULL +}; + +/* + * We do not use the hard-wired Unicode classification tables that Tcl does. + * This is because (a) we need to deal with other encodings besides Unicode, + * and (b) we want to track the behavior of the libc locale routines as + * closely as possible. For example, it wouldn't be unreasonable for a + * locale to not consider every Unicode letter as a letter. So we build + * character classification cvecs by asking libc, even for Unicode. 
+ */ + + +/* + * element - map collating-element name to chr + */ +static chr +element(struct vars *v, /* context */ + const chr *startp, /* points to start of name */ + const chr *endp) /* points just past end of name */ +{ + const struct cname *cn; + size_t len; + + /* generic: one-chr names stand for themselves */ + assert(startp < endp); + len = endp - startp; + if (len == 1) + return *startp; + + NOTE(REG_ULOCALE); + + /* search table */ + for (cn = cnames; cn->name != NULL; cn++) + { + if (strlen(cn->name) == len && + pg_char_and_wchar_strncmp(cn->name, startp, len) == 0) + { + break; /* NOTE BREAK OUT */ + } + } + if (cn->name != NULL) + return CHR(cn->code); + + /* couldn't find it */ + ERR(REG_ECOLLATE); + return 0; +} + +/* + * range - supply cvec for a range, including legality check + */ +static struct cvec * +range(struct vars *v, /* context */ + chr a, /* range start */ + chr b, /* range end, might equal a */ + int cases) /* case-independent? */ +{ + int nchrs; + struct cvec *cv; + chr c, + cc; + + if (a != b && !before(a, b)) + { + ERR(REG_ERANGE); + return NULL; + } + + if (!cases) + { /* easy version */ + cv = getcvec(v, 0, 1); + NOERRN(); + addrange(cv, a, b); + return cv; + } + + /* + * When case-independent, it's hard to decide when cvec ranges are usable, + * so for now at least, we won't try. We use a range for the originally + * specified chrs and then add on any case-equivalents that are outside + * that range as individual chrs. + * + * To ensure sane behavior if someone specifies a very large range, limit + * the allocation size to 100000 chrs (arbitrary) and check for overrun + * inside the loop below. 
+ */ + nchrs = b - a + 1; + if (nchrs <= 0 || nchrs > 100000) + nchrs = 100000; + + cv = getcvec(v, nchrs, 1); + NOERRN(); + addrange(cv, a, b); + + for (c = a; c <= b; c++) + { + cc = pg_wc_tolower(c); + if (cc != c && + (before(cc, a) || before(b, cc))) + { + if (cv->nchrs >= cv->chrspace) + { + ERR(REG_ETOOBIG); + return NULL; + } + addchr(cv, cc); + } + cc = pg_wc_toupper(c); + if (cc != c && + (before(cc, a) || before(b, cc))) + { + if (cv->nchrs >= cv->chrspace) + { + ERR(REG_ETOOBIG); + return NULL; + } + addchr(cv, cc); + } + if (CANCEL_REQUESTED(v->re)) + { + ERR(REG_CANCEL); + return NULL; + } + } + + return cv; +} + +/* + * before - is chr x before chr y, for purposes of range legality? + */ +static int /* predicate */ +before(chr x, chr y) +{ + if (x < y) + return 1; + return 0; +} + +/* + * eclass - supply cvec for an equivalence class + * Must include case counterparts on request. + */ +static struct cvec * +eclass(struct vars *v, /* context */ + chr c, /* Collating element representing the + * equivalence class. */ + int cases) /* all cases? */ +{ + struct cvec *cv; + + /* crude fake equivalence class for testing */ + if ((v->cflags & REG_FAKE) && c == 'x') + { + cv = getcvec(v, 4, 0); + addchr(cv, CHR('x')); + addchr(cv, CHR('y')); + if (cases) + { + addchr(cv, CHR('X')); + addchr(cv, CHR('Y')); + } + return cv; + } + + /* otherwise, none */ + if (cases) + return allcases(v, c); + cv = getcvec(v, 1, 0); + assert(cv != NULL); + addchr(cv, c); + return cv; +} + +/* + * lookupcclass - lookup a character class identified by name + * + * On failure, sets an error code in *v; the result is then garbage. + */ +static enum char_classes +lookupcclass(struct vars *v, /* context (for returning errors) */ + const chr *startp, /* where the name starts */ + const chr *endp) /* just past the end of the name */ +{ + size_t len; + const char *const *namePtr; + int i; + + /* + * Map the name to the corresponding enumerated value. 
+ */ + len = endp - startp; + for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++) + { + if (strlen(*namePtr) == len && + pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) + return (enum char_classes) i; + } + + ERR(REG_ECTYPE); + return (enum char_classes) 0; +} + +/* + * cclasscvec - supply cvec for a character class + * + * Must include case counterparts if "cases" is true. + * + * The returned cvec might be either a transient cvec gotten from getcvec(), + * or a permanently cached one from pg_ctype_get_cache(). This is okay + * because callers are not supposed to explicitly free the result either way. + */ +static struct cvec * +cclasscvec(struct vars *v, /* context */ + enum char_classes cclasscode, /* class to build a cvec for */ + int cases) /* case-independent? */ +{ + struct cvec *cv = NULL; + + /* + * Remap lower and upper to alpha if the match is case insensitive. + */ + + if (cases && + (cclasscode == CC_LOWER || + cclasscode == CC_UPPER)) + cclasscode = CC_ALPHA; + + /* + * Now compute the character class contents. For classes that are based + * on the behavior of a <wctype.h> or <ctype.h> function, we use + * pg_ctype_get_cache so that we can cache the results. Other classes + * have definitions that are hard-wired here, and for those we just + * construct a transient cvec on the fly. + * + * NB: keep this code in sync with cclass_column_index(), below. 
+ */ + + switch (cclasscode) + { + case CC_PRINT: + cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode); + break; + case CC_ALNUM: + cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode); + break; + case CC_ALPHA: + cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode); + break; + case CC_WORD: + cv = pg_ctype_get_cache(pg_wc_isword, cclasscode); + break; + case CC_ASCII: + /* hard-wired meaning */ + cv = getcvec(v, 0, 1); + if (cv) + addrange(cv, 0, 0x7f); + break; + case CC_BLANK: + /* hard-wired meaning */ + cv = getcvec(v, 2, 0); + addchr(cv, '\t'); + addchr(cv, ' '); + break; + case CC_CNTRL: + /* hard-wired meaning */ + cv = getcvec(v, 0, 2); + addrange(cv, 0x0, 0x1f); + addrange(cv, 0x7f, 0x9f); + break; + case CC_DIGIT: + cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode); + break; + case CC_PUNCT: + cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode); + break; + case CC_XDIGIT: + + /* + * It's not clear how to define this in non-western locales, and + * even less clear that there's any particular use in trying. So + * just hard-wire the meaning. 
+ */ + cv = getcvec(v, 0, 3); + if (cv) + { + addrange(cv, '0', '9'); + addrange(cv, 'a', 'f'); + addrange(cv, 'A', 'F'); + } + break; + case CC_SPACE: + cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode); + break; + case CC_LOWER: + cv = pg_ctype_get_cache(pg_wc_islower, cclasscode); + break; + case CC_UPPER: + cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode); + break; + case CC_GRAPH: + cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode); + break; + } + + /* If cv is NULL now, the reason must be "out of memory" */ + if (cv == NULL) + ERR(REG_ESPACE); + return cv; +} + +/* + * cclass_column_index - get appropriate high colormap column index for chr + */ +static int +cclass_column_index(struct colormap *cm, chr c) +{ + int colnum = 0; + + /* Shouldn't go through all these pushups for simple chrs */ + assert(c > MAX_SIMPLE_CHR); + + /* + * Note: we should not see requests to consider cclasses that are not + * treated as locale-specific by cclasscvec(), above. + */ + if (cm->classbits[CC_PRINT] && pg_wc_isprint(c)) + colnum |= cm->classbits[CC_PRINT]; + if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c)) + colnum |= cm->classbits[CC_ALNUM]; + if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c)) + colnum |= cm->classbits[CC_ALPHA]; + if (cm->classbits[CC_WORD] && pg_wc_isword(c)) + colnum |= cm->classbits[CC_WORD]; + assert(cm->classbits[CC_ASCII] == 0); + assert(cm->classbits[CC_BLANK] == 0); + assert(cm->classbits[CC_CNTRL] == 0); + if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c)) + colnum |= cm->classbits[CC_DIGIT]; + if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c)) + colnum |= cm->classbits[CC_PUNCT]; + assert(cm->classbits[CC_XDIGIT] == 0); + if (cm->classbits[CC_SPACE] && pg_wc_isspace(c)) + colnum |= cm->classbits[CC_SPACE]; + if (cm->classbits[CC_LOWER] && pg_wc_islower(c)) + colnum |= cm->classbits[CC_LOWER]; + if (cm->classbits[CC_UPPER] && pg_wc_isupper(c)) + colnum |= cm->classbits[CC_UPPER]; + if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c)) + colnum |= 
cm->classbits[CC_GRAPH]; + + return colnum; +} + +/* + * allcases - supply cvec for all case counterparts of a chr (including itself) + * + * This is a shortcut, preferably an efficient one, for simple characters; + * messy cases are done via range(). + */ +static struct cvec * +allcases(struct vars *v, /* context */ + chr c) /* character to get case equivs of */ +{ + struct cvec *cv; + chr lc, + uc; + + lc = pg_wc_tolower(c); + uc = pg_wc_toupper(c); + + cv = getcvec(v, 2, 0); + addchr(cv, lc); + if (lc != uc) + addchr(cv, uc); + return cv; +} + +/* + * cmp - chr-substring compare + * + * Backrefs need this. It should preferably be efficient. + * Note that it does not need to report anything except equal/unequal. + * Note also that the length is exact, and the comparison should not + * stop at embedded NULs! + */ +static int /* 0 for equal, nonzero for unequal */ +cmp(const chr *x, const chr *y, /* strings to compare */ + size_t len) /* exact length of comparison */ +{ + return memcmp(VS(x), VS(y), len * sizeof(chr)); +} + +/* + * casecmp - case-independent chr-substring compare + * + * REG_ICASE backrefs need this. It should preferably be efficient. + * Note that it does not need to report anything except equal/unequal. + * Note also that the length is exact, and the comparison should not + * stop at embedded NULs! + */ +static int /* 0 for equal, nonzero for unequal */ +casecmp(const chr *x, const chr *y, /* strings to compare */ + size_t len) /* exact length of comparison */ +{ + for (; len > 0; len--, x++, y++) + { + if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y))) + return 1; + } + return 0; +} diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c new file mode 100644 index 0000000..0e93c74 --- /dev/null +++ b/src/backend/regex/regc_nfa.c @@ -0,0 +1,3824 @@ +/* + * NFA utilities. + * This file is #included by regcomp.c. + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. 
+ * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * src/backend/regex/regc_nfa.c + * + * + * One or two things that technically ought to be in here + * are actually in color.c, thanks to some incestuous relationships in + * the color chains. 
+ */ + +#define NISERR() VISERR(nfa->v) +#define NERR(e) VERR(nfa->v, (e)) + + +/* + * newnfa - set up an NFA + */ +static struct nfa * /* the NFA, or NULL */ +newnfa(struct vars *v, + struct colormap *cm, + struct nfa *parent) /* NULL if primary NFA */ +{ + struct nfa *nfa; + + nfa = (struct nfa *) MALLOC(sizeof(struct nfa)); + if (nfa == NULL) + { + ERR(REG_ESPACE); + return NULL; + } + + /* Make the NFA minimally valid, so freenfa() will behave sanely */ + nfa->states = NULL; + nfa->slast = NULL; + nfa->freestates = NULL; + nfa->freearcs = NULL; + nfa->lastsb = NULL; + nfa->lastab = NULL; + nfa->lastsbused = 0; + nfa->lastabused = 0; + nfa->nstates = 0; + nfa->cm = cm; + nfa->v = v; + nfa->bos[0] = nfa->bos[1] = COLORLESS; + nfa->eos[0] = nfa->eos[1] = COLORLESS; + nfa->flags = 0; + nfa->minmatchall = nfa->maxmatchall = -1; + nfa->parent = parent; /* Precedes newfstate so parent is valid. */ + + /* Create required infrastructure */ + nfa->post = newfstate(nfa, '@'); /* number 0 */ + nfa->pre = newfstate(nfa, '>'); /* number 1 */ + nfa->init = newstate(nfa); /* may become invalid later */ + nfa->final = newstate(nfa); + if (ISERR()) + { + freenfa(nfa); + return NULL; + } + rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init); + newarc(nfa, '^', 1, nfa->pre, nfa->init); + newarc(nfa, '^', 0, nfa->pre, nfa->init); + rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post); + newarc(nfa, '$', 1, nfa->final, nfa->post); + newarc(nfa, '$', 0, nfa->final, nfa->post); + + if (ISERR()) + { + freenfa(nfa); + return NULL; + } + return nfa; +} + +/* + * freenfa - free an entire NFA + */ +static void +freenfa(struct nfa *nfa) +{ + struct statebatch *sb; + struct statebatch *sbnext; + struct arcbatch *ab; + struct arcbatch *abnext; + + for (sb = nfa->lastsb; sb != NULL; sb = sbnext) + { + sbnext = sb->next; + nfa->v->spaceused -= STATEBATCHSIZE(sb->nstates); + FREE(sb); + } + nfa->lastsb = NULL; + for (ab = nfa->lastab; ab != NULL; ab = abnext) + { + abnext = 
ab->next; + nfa->v->spaceused -= ARCBATCHSIZE(ab->narcs); + FREE(ab); + } + nfa->lastab = NULL; + + nfa->nstates = -1; + FREE(nfa); +} + +/* + * newstate - allocate an NFA state, with zero flag value + */ +static struct state * /* NULL on error */ +newstate(struct nfa *nfa) +{ + struct state *s; + + /* + * This is a handy place to check for operation cancel during regex + * compilation, since no code path will go very long without making a new + * state or arc. + */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return NULL; + } + + /* first, recycle anything that's on the freelist */ + if (nfa->freestates != NULL) + { + s = nfa->freestates; + nfa->freestates = s->next; + } + /* otherwise, is there anything left in the last statebatch? */ + else if (nfa->lastsb != NULL && nfa->lastsbused < nfa->lastsb->nstates) + { + s = &nfa->lastsb->s[nfa->lastsbused++]; + } + /* otherwise, need to allocate a new statebatch */ + else + { + struct statebatch *newSb; + size_t nstates; + + if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE) + { + NERR(REG_ETOOBIG); + return NULL; + } + nstates = (nfa->lastsb != NULL) ? 
nfa->lastsb->nstates * 2 : FIRSTSBSIZE; + if (nstates > MAXSBSIZE) + nstates = MAXSBSIZE; + newSb = (struct statebatch *) MALLOC(STATEBATCHSIZE(nstates)); + if (newSb == NULL) + { + NERR(REG_ESPACE); + return NULL; + } + nfa->v->spaceused += STATEBATCHSIZE(nstates); + newSb->nstates = nstates; + newSb->next = nfa->lastsb; + nfa->lastsb = newSb; + nfa->lastsbused = 1; + s = &newSb->s[0]; + } + + assert(nfa->nstates >= 0); + s->no = nfa->nstates++; + s->flag = 0; + if (nfa->states == NULL) + nfa->states = s; + s->nins = 0; + s->ins = NULL; + s->nouts = 0; + s->outs = NULL; + s->tmp = NULL; + s->next = NULL; + if (nfa->slast != NULL) + { + assert(nfa->slast->next == NULL); + nfa->slast->next = s; + } + s->prev = nfa->slast; + nfa->slast = s; + return s; +} + +/* + * newfstate - allocate an NFA state with a specified flag value + */ +static struct state * /* NULL on error */ +newfstate(struct nfa *nfa, int flag) +{ + struct state *s; + + s = newstate(nfa); + if (s != NULL) + s->flag = (char) flag; + return s; +} + +/* + * dropstate - delete a state's inarcs and outarcs and free it + */ +static void +dropstate(struct nfa *nfa, + struct state *s) +{ + struct arc *a; + + while ((a = s->ins) != NULL) + freearc(nfa, a); + while ((a = s->outs) != NULL) + freearc(nfa, a); + freestate(nfa, s); +} + +/* + * freestate - free a state, which has no in-arcs or out-arcs + */ +static void +freestate(struct nfa *nfa, + struct state *s) +{ + assert(s != NULL); + assert(s->nins == 0 && s->nouts == 0); + + s->no = FREESTATE; + s->flag = 0; + if (s->next != NULL) + s->next->prev = s->prev; + else + { + assert(s == nfa->slast); + nfa->slast = s->prev; + } + if (s->prev != NULL) + s->prev->next = s->next; + else + { + assert(s == nfa->states); + nfa->states = s->next; + } + s->prev = NULL; + s->next = nfa->freestates; /* don't delete it, put it on the free list */ + nfa->freestates = s; +} + +/* + * newarc - set up a new arc within an NFA + * + * This function checks to make sure that no 
duplicate arcs are created. + * In general we never want duplicates. + * + * However: in principle, a RAINBOW arc is redundant with any plain arc + * (unless that arc is for a pseudocolor). But we don't try to recognize + * that redundancy, either here or in allied operations such as moveins(). + * The pseudocolor consideration makes that more costly than it seems worth. + */ +static void +newarc(struct nfa *nfa, + int t, + color co, + struct state *from, + struct state *to) +{ + struct arc *a; + + assert(from != NULL && to != NULL); + + /* + * This is a handy place to check for operation cancel during regex + * compilation, since no code path will go very long without making a new + * state or arc. + */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return; + } + + /* check for duplicate arc, using whichever chain is shorter */ + if (from->nouts <= to->nins) + { + for (a = from->outs; a != NULL; a = a->outchain) + if (a->to == to && a->co == co && a->type == t) + return; + } + else + { + for (a = to->ins; a != NULL; a = a->inchain) + if (a->from == from && a->co == co && a->type == t) + return; + } + + /* no dup, so create the arc */ + createarc(nfa, t, co, from, to); +} + +/* + * createarc - create a new arc within an NFA + * + * This function must *only* be used after verifying that there is no existing + * identical arc (same type/color/from/to). + */ +static void +createarc(struct nfa *nfa, + int t, + color co, + struct state *from, + struct state *to) +{ + struct arc *a; + + a = allocarc(nfa); + if (NISERR()) + return; + assert(a != NULL); + + a->type = t; + a->co = co; + a->to = to; + a->from = from; + + /* + * Put the new arc on the beginning, not the end, of the chains; it's + * simpler here, and freearc() is the same cost either way. See also the + * logic in moveins() and its cohorts, as well as fixempties(). 
+ */ + a->inchain = to->ins; + a->inchainRev = NULL; + if (to->ins) + to->ins->inchainRev = a; + to->ins = a; + a->outchain = from->outs; + a->outchainRev = NULL; + if (from->outs) + from->outs->outchainRev = a; + from->outs = a; + + from->nouts++; + to->nins++; + + if (COLORED(a) && nfa->parent == NULL) + colorchain(nfa->cm, a); +} + +/* + * allocarc - allocate a new arc within an NFA + */ +static struct arc * /* NULL for failure */ +allocarc(struct nfa *nfa) +{ + struct arc *a; + + /* first, recycle anything that's on the freelist */ + if (nfa->freearcs != NULL) + { + a = nfa->freearcs; + nfa->freearcs = a->freechain; + } + /* otherwise, is there anything left in the last arcbatch? */ + else if (nfa->lastab != NULL && nfa->lastabused < nfa->lastab->narcs) + { + a = &nfa->lastab->a[nfa->lastabused++]; + } + /* otherwise, need to allocate a new arcbatch */ + else + { + struct arcbatch *newAb; + size_t narcs; + + if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE) + { + NERR(REG_ETOOBIG); + return NULL; + } + narcs = (nfa->lastab != NULL) ? 
nfa->lastab->narcs * 2 : FIRSTABSIZE; + if (narcs > MAXABSIZE) + narcs = MAXABSIZE; + newAb = (struct arcbatch *) MALLOC(ARCBATCHSIZE(narcs)); + if (newAb == NULL) + { + NERR(REG_ESPACE); + return NULL; + } + nfa->v->spaceused += ARCBATCHSIZE(narcs); + newAb->narcs = narcs; + newAb->next = nfa->lastab; + nfa->lastab = newAb; + nfa->lastabused = 1; + a = &newAb->a[0]; + } + + return a; +} + +/* + * freearc - free an arc + */ +static void +freearc(struct nfa *nfa, + struct arc *victim) +{ + struct state *from = victim->from; + struct state *to = victim->to; + struct arc *predecessor; + + assert(victim->type != 0); + + /* take it off color chain if necessary */ + if (COLORED(victim) && nfa->parent == NULL) + uncolorchain(nfa->cm, victim); + + /* take it off source's out-chain */ + assert(from != NULL); + predecessor = victim->outchainRev; + if (predecessor == NULL) + { + assert(from->outs == victim); + from->outs = victim->outchain; + } + else + { + assert(predecessor->outchain == victim); + predecessor->outchain = victim->outchain; + } + if (victim->outchain != NULL) + { + assert(victim->outchain->outchainRev == victim); + victim->outchain->outchainRev = predecessor; + } + from->nouts--; + + /* take it off target's in-chain */ + assert(to != NULL); + predecessor = victim->inchainRev; + if (predecessor == NULL) + { + assert(to->ins == victim); + to->ins = victim->inchain; + } + else + { + assert(predecessor->inchain == victim); + predecessor->inchain = victim->inchain; + } + if (victim->inchain != NULL) + { + assert(victim->inchain->inchainRev == victim); + victim->inchain->inchainRev = predecessor; + } + to->nins--; + + /* clean up and place on NFA's free list */ + victim->type = 0; + victim->from = NULL; /* precautions... 
*/ + victim->to = NULL; + victim->inchain = NULL; + victim->inchainRev = NULL; + victim->outchain = NULL; + victim->outchainRev = NULL; + victim->freechain = nfa->freearcs; + nfa->freearcs = victim; +} + +/* + * changearcsource - flip an arc to have a different from state + * + * Caller must have verified that there is no pre-existing duplicate arc. + */ +static void +changearcsource(struct arc *a, struct state *newfrom) +{ + struct state *oldfrom = a->from; + struct arc *predecessor; + + assert(oldfrom != newfrom); + + /* take it off old source's out-chain */ + assert(oldfrom != NULL); + predecessor = a->outchainRev; + if (predecessor == NULL) + { + assert(oldfrom->outs == a); + oldfrom->outs = a->outchain; + } + else + { + assert(predecessor->outchain == a); + predecessor->outchain = a->outchain; + } + if (a->outchain != NULL) + { + assert(a->outchain->outchainRev == a); + a->outchain->outchainRev = predecessor; + } + oldfrom->nouts--; + + a->from = newfrom; + + /* prepend it to new source's out-chain */ + a->outchain = newfrom->outs; + a->outchainRev = NULL; + if (newfrom->outs) + newfrom->outs->outchainRev = a; + newfrom->outs = a; + newfrom->nouts++; +} + +/* + * changearctarget - flip an arc to have a different to state + * + * Caller must have verified that there is no pre-existing duplicate arc. 
+ */ +static void +changearctarget(struct arc *a, struct state *newto) +{ + struct state *oldto = a->to; + struct arc *predecessor; + + assert(oldto != newto); + + /* take it off old target's in-chain */ + assert(oldto != NULL); + predecessor = a->inchainRev; + if (predecessor == NULL) + { + assert(oldto->ins == a); + oldto->ins = a->inchain; + } + else + { + assert(predecessor->inchain == a); + predecessor->inchain = a->inchain; + } + if (a->inchain != NULL) + { + assert(a->inchain->inchainRev == a); + a->inchain->inchainRev = predecessor; + } + oldto->nins--; + + a->to = newto; + + /* prepend it to new target's in-chain */ + a->inchain = newto->ins; + a->inchainRev = NULL; + if (newto->ins) + newto->ins->inchainRev = a; + newto->ins = a; + newto->nins++; +} + +/* + * hasnonemptyout - Does state have a non-EMPTY out arc? + */ +static int +hasnonemptyout(struct state *s) +{ + struct arc *a; + + for (a = s->outs; a != NULL; a = a->outchain) + { + if (a->type != EMPTY) + return 1; + } + return 0; +} + +/* + * findarc - find arc, if any, from given source with given type and color + * If there is more than one such arc, the result is random. + */ +static struct arc * +findarc(struct state *s, + int type, + color co) +{ + struct arc *a; + + for (a = s->outs; a != NULL; a = a->outchain) + if (a->type == type && a->co == co) + return a; + return NULL; +} + +/* + * cparc - allocate a new arc within an NFA, copying details from old one + */ +static void +cparc(struct nfa *nfa, + struct arc *oa, + struct state *from, + struct state *to) +{ + newarc(nfa, oa->type, oa->co, from, to); +} + +/* + * sortins - sort the in arcs of a state by from/color/type + */ +static void +sortins(struct nfa *nfa, + struct state *s) +{ + struct arc **sortarray; + struct arc *a; + int n = s->nins; + int i; + + if (n <= 1) + return; /* nothing to do */ + /* make an array of arc pointers ... 
*/ + sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *)); + if (sortarray == NULL) + { + NERR(REG_ESPACE); + return; + } + i = 0; + for (a = s->ins; a != NULL; a = a->inchain) + sortarray[i++] = a; + assert(i == n); + /* ... sort the array */ + qsort(sortarray, n, sizeof(struct arc *), sortins_cmp); + /* ... and rebuild arc list in order */ + /* it seems worth special-casing first and last items to simplify loop */ + a = sortarray[0]; + s->ins = a; + a->inchain = sortarray[1]; + a->inchainRev = NULL; + for (i = 1; i < n - 1; i++) + { + a = sortarray[i]; + a->inchain = sortarray[i + 1]; + a->inchainRev = sortarray[i - 1]; + } + a = sortarray[i]; + a->inchain = NULL; + a->inchainRev = sortarray[i - 1]; + FREE(sortarray); +} + +static int +sortins_cmp(const void *a, const void *b) +{ + const struct arc *aa = *((const struct arc *const *) a); + const struct arc *bb = *((const struct arc *const *) b); + + /* we check the fields in the order they are most likely to be different */ + if (aa->from->no < bb->from->no) + return -1; + if (aa->from->no > bb->from->no) + return 1; + if (aa->co < bb->co) + return -1; + if (aa->co > bb->co) + return 1; + if (aa->type < bb->type) + return -1; + if (aa->type > bb->type) + return 1; + return 0; +} + +/* + * sortouts - sort the out arcs of a state by to/color/type + */ +static void +sortouts(struct nfa *nfa, + struct state *s) +{ + struct arc **sortarray; + struct arc *a; + int n = s->nouts; + int i; + + if (n <= 1) + return; /* nothing to do */ + /* make an array of arc pointers ... */ + sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *)); + if (sortarray == NULL) + { + NERR(REG_ESPACE); + return; + } + i = 0; + for (a = s->outs; a != NULL; a = a->outchain) + sortarray[i++] = a; + assert(i == n); + /* ... sort the array */ + qsort(sortarray, n, sizeof(struct arc *), sortouts_cmp); + /* ... 
and rebuild arc list in order */ + /* it seems worth special-casing first and last items to simplify loop */ + a = sortarray[0]; + s->outs = a; + a->outchain = sortarray[1]; + a->outchainRev = NULL; + for (i = 1; i < n - 1; i++) + { + a = sortarray[i]; + a->outchain = sortarray[i + 1]; + a->outchainRev = sortarray[i - 1]; + } + a = sortarray[i]; + a->outchain = NULL; + a->outchainRev = sortarray[i - 1]; + FREE(sortarray); +} + +static int +sortouts_cmp(const void *a, const void *b) +{ + const struct arc *aa = *((const struct arc *const *) a); + const struct arc *bb = *((const struct arc *const *) b); + + /* we check the fields in the order they are most likely to be different */ + if (aa->to->no < bb->to->no) + return -1; + if (aa->to->no > bb->to->no) + return 1; + if (aa->co < bb->co) + return -1; + if (aa->co > bb->co) + return 1; + if (aa->type < bb->type) + return -1; + if (aa->type > bb->type) + return 1; + return 0; +} + +/* + * Common decision logic about whether to use arc-by-arc operations or + * sort/merge. If there's just a few source arcs we cannot recoup the + * cost of sorting the destination arc list, no matter how large it is. + * Otherwise, limit the number of arc-by-arc comparisons to about 1000 + * (a somewhat arbitrary choice, but the breakeven point would probably + * be machine dependent anyway). + */ +#define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs) \ + ((nsrcarcs) < 4 ? 0 : ((nsrcarcs) > 32 || (ndestarcs) > 32)) + +/* + * moveins - move all in arcs of a state to another state + * + * You might think this could be done better by just updating the + * existing arcs, and you would be right if it weren't for the need + * for duplicate suppression, which makes it easier to just make new + * ones to exploit the suppression built into newarc. + * + * However, if we have a whole lot of arcs to deal with, retail duplicate + * checks become too slow. 
In that case we proceed by sorting and merging + * the arc lists, and then we can indeed just update the arcs in-place. + */ +static void +moveins(struct nfa *nfa, + struct state *oldState, + struct state *newState) +{ + assert(oldState != newState); + + if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) + { + /* With not too many arcs, just do them one at a time */ + struct arc *a; + + while ((a = oldState->ins) != NULL) + { + cparc(nfa, a, a->from, newState); + freearc(nfa, a); + } + } + else + { + /* + * With many arcs, use a sort-merge approach. Note changearctarget() + * will put the arc onto the front of newState's chain, so it does not + * break our walk through the sorted part of the chain. + */ + struct arc *oa; + struct arc *na; + + /* + * Because we bypass newarc() in this code path, we'd better include a + * cancel check. + */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return; + } + + sortins(nfa, oldState); + sortins(nfa, newState); + if (NISERR()) + return; /* might have failed to sort */ + oa = oldState->ins; + na = newState->ins; + while (oa != NULL && na != NULL) + { + struct arc *a = oa; + + switch (sortins_cmp(&oa, &na)) + { + case -1: + /* newState does not have anything matching oa */ + oa = oa->inchain; + + /* + * Rather than doing createarc+freearc, we can just unlink + * and relink the existing arc struct. + */ + changearctarget(a, newState); + break; + case 0: + /* match, advance in both lists */ + oa = oa->inchain; + na = na->inchain; + /* ... 
and drop duplicate arc from oldState */ + freearc(nfa, a); + break; + case +1: + /* advance only na; oa might have a match later */ + na = na->inchain; + break; + default: + assert(NOTREACHED); + } + } + while (oa != NULL) + { + /* newState does not have anything matching oa */ + struct arc *a = oa; + + oa = oa->inchain; + changearctarget(a, newState); + } + } + + assert(oldState->nins == 0); + assert(oldState->ins == NULL); +} + +/* + * copyins - copy in arcs of a state to another state + */ +static void +copyins(struct nfa *nfa, + struct state *oldState, + struct state *newState) +{ + assert(oldState != newState); + + if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) + { + /* With not too many arcs, just do them one at a time */ + struct arc *a; + + for (a = oldState->ins; a != NULL; a = a->inchain) + cparc(nfa, a, a->from, newState); + } + else + { + /* + * With many arcs, use a sort-merge approach. Note that createarc() + * will put new arcs onto the front of newState's chain, so it does + * not break our walk through the sorted part of the chain. + */ + struct arc *oa; + struct arc *na; + + /* + * Because we bypass newarc() in this code path, we'd better include a + * cancel check. 
+ */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return; + } + + sortins(nfa, oldState); + sortins(nfa, newState); + if (NISERR()) + return; /* might have failed to sort */ + oa = oldState->ins; + na = newState->ins; + while (oa != NULL && na != NULL) + { + struct arc *a = oa; + + switch (sortins_cmp(&oa, &na)) + { + case -1: + /* newState does not have anything matching oa */ + oa = oa->inchain; + createarc(nfa, a->type, a->co, a->from, newState); + break; + case 0: + /* match, advance in both lists */ + oa = oa->inchain; + na = na->inchain; + break; + case +1: + /* advance only na; oa might have a match later */ + na = na->inchain; + break; + default: + assert(NOTREACHED); + } + } + while (oa != NULL) + { + /* newState does not have anything matching oa */ + struct arc *a = oa; + + oa = oa->inchain; + createarc(nfa, a->type, a->co, a->from, newState); + } + } +} + +/* + * mergeins - merge a list of inarcs into a state + * + * This is much like copyins, but the source arcs are listed in an array, + * and are not guaranteed unique. It's okay to clobber the array contents. + */ +static void +mergeins(struct nfa *nfa, + struct state *s, + struct arc **arcarray, + int arccount) +{ + struct arc *na; + int i; + int j; + + if (arccount <= 0) + return; + + /* + * Because we bypass newarc() in this code path, we'd better include a + * cancel check. + */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return; + } + + /* Sort existing inarcs as well as proposed new ones */ + sortins(nfa, s); + if (NISERR()) + return; /* might have failed to sort */ + + qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp); + + /* + * arcarray very likely includes dups, so we must eliminate them. (This + * could be folded into the next loop, but it's not worth the trouble.) 
+ */ + j = 0; + for (i = 1; i < arccount; i++) + { + switch (sortins_cmp(&arcarray[j], &arcarray[i])) + { + case -1: + /* non-dup */ + arcarray[++j] = arcarray[i]; + break; + case 0: + /* dup */ + break; + default: + /* trouble */ + assert(NOTREACHED); + } + } + arccount = j + 1; + + /* + * Now merge into s' inchain. Note that createarc() will put new arcs + * onto the front of s's chain, so it does not break our walk through the + * sorted part of the chain. + */ + i = 0; + na = s->ins; + while (i < arccount && na != NULL) + { + struct arc *a = arcarray[i]; + + switch (sortins_cmp(&a, &na)) + { + case -1: + /* s does not have anything matching a */ + createarc(nfa, a->type, a->co, a->from, s); + i++; + break; + case 0: + /* match, advance in both lists */ + i++; + na = na->inchain; + break; + case +1: + /* advance only na; array might have a match later */ + na = na->inchain; + break; + default: + assert(NOTREACHED); + } + } + while (i < arccount) + { + /* s does not have anything matching a */ + struct arc *a = arcarray[i]; + + createarc(nfa, a->type, a->co, a->from, s); + i++; + } +} + +/* + * moveouts - move all out arcs of a state to another state + * + * See comments for moveins() + */ +static void +moveouts(struct nfa *nfa, + struct state *oldState, + struct state *newState) +{ + assert(oldState != newState); + + if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) + { + /* With not too many arcs, just do them one at a time */ + struct arc *a; + + while ((a = oldState->outs) != NULL) + { + cparc(nfa, a, newState, a->to); + freearc(nfa, a); + } + } + else + { + /* + * With many arcs, use a sort-merge approach. Note changearcsource() + * will put the arc onto the front of newState's chain, so it does not + * break our walk through the sorted part of the chain. + */ + struct arc *oa; + struct arc *na; + + /* + * Because we bypass newarc() in this code path, we'd better include a + * cancel check. 
+ */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return; + } + + sortouts(nfa, oldState); + sortouts(nfa, newState); + if (NISERR()) + return; /* might have failed to sort */ + oa = oldState->outs; + na = newState->outs; + while (oa != NULL && na != NULL) + { + struct arc *a = oa; + + switch (sortouts_cmp(&oa, &na)) + { + case -1: + /* newState does not have anything matching oa */ + oa = oa->outchain; + + /* + * Rather than doing createarc+freearc, we can just unlink + * and relink the existing arc struct. + */ + changearcsource(a, newState); + break; + case 0: + /* match, advance in both lists */ + oa = oa->outchain; + na = na->outchain; + /* ... and drop duplicate arc from oldState */ + freearc(nfa, a); + break; + case +1: + /* advance only na; oa might have a match later */ + na = na->outchain; + break; + default: + assert(NOTREACHED); + } + } + while (oa != NULL) + { + /* newState does not have anything matching oa */ + struct arc *a = oa; + + oa = oa->outchain; + changearcsource(a, newState); + } + } + + assert(oldState->nouts == 0); + assert(oldState->outs == NULL); +} + +/* + * copyouts - copy out arcs of a state to another state + * + * oldState is left with all of its arcs; for each of them newState gains an + * arc to the same destination state. In the sort-merge path both states' + * out-chains are first put through sortouts(), and an arc is skipped when + * sortouts_cmp() finds that newState already has a match for it. + */ +static void +copyouts(struct nfa *nfa, + struct state *oldState, + struct state *newState) +{ + assert(oldState != newState); + + if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) + { + /* With not too many arcs, just do them one at a time */ + struct arc *a; + + for (a = oldState->outs; a != NULL; a = a->outchain) + cparc(nfa, a, newState, a->to); + } + else + { + /* + * With many arcs, use a sort-merge approach. Note that createarc() + * will put new arcs onto the front of newState's chain, so it does + * not break our walk through the sorted part of the chain. + */ + struct arc *oa; + struct arc *na; + + /* + * Because we bypass newarc() in this code path, we'd better include a + * cancel check. 
+ */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return; + } + + sortouts(nfa, oldState); + sortouts(nfa, newState); + if (NISERR()) + return; /* might have failed to sort */ + oa = oldState->outs; + na = newState->outs; + while (oa != NULL && na != NULL) + { + struct arc *a = oa; + + switch (sortouts_cmp(&oa, &na)) + { + case -1: + /* newState does not have anything matching oa */ + oa = oa->outchain; + createarc(nfa, a->type, a->co, newState, a->to); + break; + case 0: + /* match, advance in both lists */ + oa = oa->outchain; + na = na->outchain; + break; + case +1: + /* advance only na; oa might have a match later */ + na = na->outchain; + break; + default: + assert(NOTREACHED); + } + } + while (oa != NULL) + { + /* newState does not have anything matching oa */ + struct arc *a = oa; + + oa = oa->outchain; + createarc(nfa, a->type, a->co, newState, a->to); + } + } +} + +/* + * cloneouts - copy out arcs of a state to another state pair, modifying type + * + * This is only used to convert PLAIN arcs to AHEAD/BEHIND arcs, which share + * the same interpretation of "co". It wouldn't be sensible with LACONs. + * + * Every out-arc of "old" must be PLAIN (asserted). For each one, an arc of + * the requested type carrying the same color is made from "from" to "to"; + * the destination of the original arc is not used. + */ +static void +cloneouts(struct nfa *nfa, + struct state *old, + struct state *from, + struct state *to, + int type) +{ + struct arc *a; + + assert(old != from); + assert(type == AHEAD || type == BEHIND); + + for (a = old->outs; a != NULL; a = a->outchain) + { + assert(a->type == PLAIN); + newarc(nfa, type, a->co, from, to); + } +} + +/* + * delsub - delete a sub-NFA, updating subre pointers if necessary + * + * This uses a recursive traversal of the sub-NFA, marking already-seen + * states using their tmp pointer. + * + * rp is marked (tmp pointing to itself) for the duration, so the traversal + * treats it as already in progress: it is neither descended into nor freed. + * If the traversal reports an error we return at once, before the tmp marks + * on rp and lp are cleared. + */ +static void +delsub(struct nfa *nfa, + struct state *lp, /* the sub-NFA goes from here... 
*/ + struct state *rp) /* ...to here, *not* inclusive */ +{ + assert(lp != rp); + + rp->tmp = rp; /* mark end */ + + deltraverse(nfa, lp, lp); + if (NISERR()) + return; /* asserts might not hold after failure */ + assert(lp->nouts == 0 && rp->nins == 0); /* did the job */ + assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */ + + rp->tmp = NULL; /* unmark end */ + lp->tmp = NULL; /* and begin, marked by deltraverse */ +} + +/* + * deltraverse - the recursive heart of delsub + * This routine's basic job is to destroy all out-arcs of the state. + * + * For each out-arc we first recurse into its destination, then free the arc; + * the destination state itself is freed once it has no in-arcs left, unless + * it is marked (tmp != NULL). leftend is passed down unchanged and is + * consulted only by the closing reachability assertion. + */ +static void +deltraverse(struct nfa *nfa, + struct state *leftend, + struct state *s) +{ + struct arc *a; + struct state *to; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(nfa->v->re)) + { + NERR(REG_ETOOBIG); + return; + } + + if (s->nouts == 0) + return; /* nothing to do */ + if (s->tmp != NULL) + return; /* already in progress */ + + s->tmp = s; /* mark as in progress */ + + while ((a = s->outs) != NULL) + { + to = a->to; + deltraverse(nfa, leftend, to); + if (NISERR()) + return; /* asserts might not hold after failure */ + assert(to->nouts == 0 || to->tmp != NULL); + freearc(nfa, a); + if (to->nins == 0 && to->tmp == NULL) + { + assert(to->nouts == 0); + freestate(nfa, to); + } + } + + assert(s->no != FREESTATE); /* we're still here */ + assert(s == leftend || s->nins != 0); /* and still reachable */ + assert(s->nouts == 0); /* but have no outarcs */ + + s->tmp = NULL; /* we're done here */ +} + +/* + * dupnfa - duplicate sub-NFA + * + * Another recursive traversal, this time using tmp to point to duplicates + * as well as mark already-seen states. (You knew there was a reason why + * it's a state pointer, didn't you? 
:-)) + */ +static void +dupnfa(struct nfa *nfa, + struct state *start, /* duplicate of subNFA starting here */ + struct state *stop, /* and stopping here */ + struct state *from, /* stringing duplicate from here */ + struct state *to) /* to here */ +{ + if (start == stop) + { + newarc(nfa, EMPTY, 0, from, to); /* empty sub-NFA: just link from to to */ + return; + } + + stop->tmp = to; /* pre-link stop to its duplicate so the walk ends there */ + duptraverse(nfa, start, from); /* "from" serves as start's duplicate */ + /* done, except for clearing out the tmp pointers */ + + stop->tmp = NULL; + cleartraverse(nfa, start); +} + +/* + * duptraverse - recursive heart of dupnfa + * + * If s already has a tmp link it has been handled (or is the pre-linked stop + * state) and we do nothing. Otherwise s->tmp is set to stmp, or to a fresh + * state when stmp is NULL, and each out-arc of s is copied with cparc() to + * run from s's duplicate to the duplicate of the arc's destination. The + * walk stops as soon as an error is flagged. + */ +static void +duptraverse(struct nfa *nfa, + struct state *s, + struct state *stmp) /* s's duplicate, or NULL */ +{ + struct arc *a; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(nfa->v->re)) + { + NERR(REG_ETOOBIG); + return; + } + + if (s->tmp != NULL) + return; /* already done */ + + s->tmp = (stmp == NULL) ? newstate(nfa) : stmp; + if (s->tmp == NULL) + { + assert(NISERR()); + return; + } + + for (a = s->outs; a != NULL && !NISERR(); a = a->outchain) + { + duptraverse(nfa, a->to, (struct state *) NULL); + if (NISERR()) + break; + assert(a->to->tmp != NULL); + cparc(nfa, a, s->tmp, a->to->tmp); + } +} + +/* + * removeconstraints - remove any constraints in an NFA + * + * Constraint arcs are replaced by empty arcs, essentially treating all + * constraints as automatically satisfied. 
+ */ +static void +removeconstraints(struct nfa *nfa, + struct state *start, /* process subNFA starting here */ + struct state *stop) /* and stopping here */ +{ + if (start == stop) + return; + + stop->tmp = stop; /* mark stop as already visited so the walk ends there */ + removetraverse(nfa, start); + /* done, except for clearing out the tmp pointers */ + + stop->tmp = NULL; + cleartraverse(nfa, start); +} + +/* + * removetraverse - recursive heart of removeconstraints + * + * Each state is visited once (tmp is set to mark it). For every out-arc we + * first recurse into the destination; then a constraint arc ('^', '$', + * AHEAD, BEHIND or LACON) is replaced by an EMPTY arc to the same + * destination, while PLAIN and EMPTY arcs are left alone. The next-arc + * pointer is fetched before the current arc can be freed. Any other arc + * type is reported as REG_ASSERT. + */ +static void +removetraverse(struct nfa *nfa, + struct state *s) +{ + struct arc *a; + struct arc *oa; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(nfa->v->re)) + { + NERR(REG_ETOOBIG); + return; + } + + if (s->tmp != NULL) + return; /* already done */ + + s->tmp = s; + for (a = s->outs; a != NULL && !NISERR(); a = oa) + { + removetraverse(nfa, a->to); + if (NISERR()) + break; + oa = a->outchain; + switch (a->type) + { + case PLAIN: + case EMPTY: + /* nothing to do */ + break; + case AHEAD: + case BEHIND: + case '^': + case '$': + case LACON: + /* replace it */ + newarc(nfa, EMPTY, 0, s, a->to); + freearc(nfa, a); + break; + default: + NERR(REG_ASSERT); + break; + } + } +} + +/* + * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set + * + * Clears s->tmp and recurses along out-arcs, stopping at any state whose tmp + * is already NULL. Note that if the stack-depth check fires, we return with + * the remaining tmp fields still set. + */ +static void +cleartraverse(struct nfa *nfa, + struct state *s) +{ + struct arc *a; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(nfa->v->re)) + { + NERR(REG_ETOOBIG); + return; + } + + if (s->tmp == NULL) + return; + s->tmp = NULL; + + for (a = s->outs; a != NULL; a = a->outchain) + cleartraverse(nfa, a->to); +} + +/* + * single_color_transition - does getting from s1 to s2 cross one PLAIN arc? + * + * If traversing from s1 to s2 requires a single PLAIN match (possibly of any + * of a set of colors), return a state whose outarc list contains only PLAIN + * arcs of those color(s). Otherwise return NULL. 
+ * + * This is used before optimizing the NFA, so there may be EMPTY arcs, which + * we should ignore; the possibility of an EMPTY is why the result state could + * be different from s1. + * + * It's worth troubling to handle multiple parallel PLAIN arcs here because a + * bracket construct such as [abc] might yield either one or several parallel + * PLAIN arcs depending on earlier atoms in the expression. We'd rather that + * that implementation detail not create user-visible performance differences. + */ +static struct state * +single_color_transition(struct state *s1, struct state *s2) +{ + struct arc *a; + + /* Ignore leading EMPTY arc, if any */ + if (s1->nouts == 1 && s1->outs->type == EMPTY) + s1 = s1->outs->to; + /* Likewise for any trailing EMPTY arc */ + if (s2->nins == 1 && s2->ins->type == EMPTY) + s2 = s2->ins->from; + /* Perhaps we could have a single-state loop in between, if so reject */ + if (s1 == s2) + return NULL; + /* s1 must have at least one outarc... */ + if (s1->outs == NULL) + return NULL; + /* ... 
and they must all be PLAIN arcs to s2 */ + for (a = s1->outs; a != NULL; a = a->outchain) + { + if (a->type != PLAIN || a->to != s2) + return NULL; + } + /* OK, return s1 as the possessor of the relevant outarcs */ + return s1; +} + +/* + * specialcolors - fill in special colors for an NFA + * + * A top-level NFA (parent == NULL) obtains four fresh pseudocolors from its + * colormap. A child NFA copies the four values from its parent, each of + * which is asserted to be already assigned (not COLORLESS). + */ +static void +specialcolors(struct nfa *nfa) +{ + /* false colors for BOS, BOL, EOS, EOL */ + if (nfa->parent == NULL) + { + nfa->bos[0] = pseudocolor(nfa->cm); + nfa->bos[1] = pseudocolor(nfa->cm); + nfa->eos[0] = pseudocolor(nfa->cm); + nfa->eos[1] = pseudocolor(nfa->cm); + } + else + { + assert(nfa->parent->bos[0] != COLORLESS); + nfa->bos[0] = nfa->parent->bos[0]; + assert(nfa->parent->bos[1] != COLORLESS); + nfa->bos[1] = nfa->parent->bos[1]; + assert(nfa->parent->eos[0] != COLORLESS); + nfa->eos[0] = nfa->parent->eos[0]; + assert(nfa->parent->eos[1] != COLORLESS); + nfa->eos[1] = nfa->parent->eos[1]; + } +} + +/* + * optimize - optimize an NFA + * + * The main goal of this function is not so much "optimization" (though it + * does try to get rid of useless NFA states) as reducing the NFA to a form + * the regex executor can handle. The executor, and indeed the cNFA format + * that is its input, can only handle PLAIN and LACON arcs. The output of + * the regex parser also includes EMPTY (do-nothing) arcs, as well as + * ^, $, AHEAD, and BEHIND constraint arcs, which we must get rid of here. + * We first get rid of EMPTY arcs and then deal with the constraint arcs. + * The hardest part of either job is to get rid of circular loops of the + * target arc type. We would have to do that in any case, though, as such a + * loop would otherwise allow the executor to cycle through the loop endlessly + * without making any progress in the input string. + * + * The passes run unconditionally in a fixed order (cleanup, fixempties, + * fixconstraintloops, pullback, pushfwd, cleanup); no error status is + * checked between them in this function. The return value is whatever + * analyze() reports for the resulting NFA. + */ +static long /* re_info bits */ +optimize(struct nfa *nfa, + FILE *f) /* for debug output; NULL none */ +{ +#ifdef REG_DEBUG + int verbose = (f != NULL) ? 
1 : 0; + + if (verbose) + fprintf(f, "\ninitial cleanup:\n"); +#endif + cleanup(nfa); /* may simplify situation */ +#ifdef REG_DEBUG + if (verbose) + dumpnfa(nfa, f); + if (verbose) + fprintf(f, "\nempties:\n"); +#endif + fixempties(nfa, f); /* get rid of EMPTY arcs */ +#ifdef REG_DEBUG + if (verbose) + fprintf(f, "\nconstraints:\n"); +#endif + fixconstraintloops(nfa, f); /* get rid of constraint loops */ + pullback(nfa, f); /* pull back constraints backward */ + pushfwd(nfa, f); /* push fwd constraints forward */ +#ifdef REG_DEBUG + if (verbose) + fprintf(f, "\nfinal cleanup:\n"); +#endif + cleanup(nfa); /* final tidying */ +#ifdef REG_DEBUG + if (verbose) + dumpnfa(nfa, f); +#endif + return analyze(nfa); /* and analysis */ +} + +/* + * pullback - pull back constraints backward to eliminate them + * + * Each pass visits every state and calls pull() on each of its '^' and + * BEHIND out-arcs; passes repeat until one makes no progress or an error is + * flagged. Afterwards, '^' arcs leaving nfa->pre are turned into PLAIN arcs + * of the corresponding nfa->bos[] color. + */ +static void +pullback(struct nfa *nfa, + FILE *f) /* for debug output; NULL none */ +{ + struct state *s; + struct state *nexts; + struct arc *a; + struct arc *nexta; + struct state *intermediates; + int progress; + + /* find and pull until there are no more */ + do + { + progress = 0; + for (s = nfa->states; s != NULL && !NISERR(); s = nexts) + { + nexts = s->next; + intermediates = NULL; + for (a = s->outs; a != NULL && !NISERR(); a = nexta) + { + nexta = a->outchain; + if (a->type == '^' || a->type == BEHIND) + if (pull(nfa, a, &intermediates)) + progress = 1; + } + /* clear tmp fields of intermediate states created here */ + while (intermediates != NULL) + { + struct state *ns = intermediates->tmp; + + intermediates->tmp = NULL; + intermediates = ns; + } + /* if s is now useless, get rid of it */ + if ((s->nins == 0 || s->nouts == 0) && !s->flag) + dropstate(nfa, s); + } + if (progress && f != NULL) + dumpnfa(nfa, f); + } while (progress && !NISERR()); + if (NISERR()) + return; + + /* + * Any ^ constraints we were able to pull to the start state can now be + * replaced by PLAIN arcs referencing the BOS or BOL colors. 
There should + * be no other ^ or BEHIND arcs left in the NFA, though we do not check + * that here (compact() will fail if so). + */ + for (a = nfa->pre->outs; a != NULL; a = nexta) + { + nexta = a->outchain; + if (a->type == '^') + { + assert(a->co == 0 || a->co == 1); + newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to); + freearc(nfa, a); + } + } +} + +/* + * pull - pull a back constraint backward past its source state + * + * Returns 1 if successful (which it always is unless the source is the + * start state or we have an internal error), 0 if nothing happened. + * + * A significant property of this function is that it deletes no pre-existing + * states, and no outarcs of the constraint's from state other than the given + * constraint arc. This makes the loops in pullback() safe, at the cost that + * we may leave useless states behind. Therefore, we leave it to pullback() + * to delete such states. + * + * If the from state has multiple back-constraint outarcs, and/or multiple + * compatible constraint inarcs, we only need to create one new intermediate + * state per combination of predecessor and successor states. *intermediates + * points to a list of such intermediate states for this from state (chained + * through their tmp fields). + */ +static int +pull(struct nfa *nfa, + struct arc *con, + struct state **intermediates) +{ + struct state *from = con->from; + struct state *to = con->to; + struct arc *a; + struct arc *nexta; + struct state *s; + + assert(from != to); /* should have gotten rid of this earlier */ + if (from->flag) /* can't pull back beyond start */ + return 0; + if (from->nins == 0) + { /* unreachable */ + freearc(nfa, con); + return 1; + } + + /* + * First, clone from state if necessary to avoid other outarcs. This may + * seem wasteful, but it simplifies the logic, and we'll get rid of the + * clone state again at the bottom. 
+ */ + if (from->nouts > 1) + { + s = newstate(nfa); + if (NISERR()) + return 0; + copyins(nfa, from, s); /* duplicate inarcs */ + cparc(nfa, con, s, to); /* move constraint arc */ + freearc(nfa, con); + if (NISERR()) + return 0; + from = s; + con = from->outs; /* the clone's only outarc is the copied constraint */ + } + assert(from->nouts == 1); + + /* propagate the constraint into the from state's inarcs */ + for (a = from->ins; a != NULL && !NISERR(); a = nexta) + { + nexta = a->inchain; + switch (combine(nfa, con, a)) + { + case INCOMPATIBLE: /* destroy the arc */ + freearc(nfa, a); + break; + case SATISFIED: /* no action needed */ + break; + case COMPATIBLE: /* swap the two arcs, more or less */ + /* need an intermediate state, but might have one already */ + for (s = *intermediates; s != NULL; s = s->tmp) + { + assert(s->nins > 0 && s->nouts > 0); + if (s->ins->from == a->from && s->outs->to == to) + break; + } + if (s == NULL) + { + s = newstate(nfa); + if (NISERR()) + return 0; + s->tmp = *intermediates; + *intermediates = s; + } + cparc(nfa, con, a->from, s); + cparc(nfa, a, s, to); + freearc(nfa, a); + break; + case REPLACEARC: /* replace arc's color */ + newarc(nfa, a->type, con->co, a->from, to); + freearc(nfa, a); + break; + default: + assert(NOTREACHED); + break; + } + } + + /* remaining inarcs, if any, incorporate the constraint */ + moveins(nfa, from, to); + freearc(nfa, con); + /* from state is now useless, but we leave it to pullback() to clean up */ + return 1; +} + +/* + * pushfwd - push forward constraints forward to eliminate them + * + * Each pass visits every state and calls push() on each of its '$' and AHEAD + * in-arcs; passes repeat until one makes no progress or an error is flagged. + * Afterwards, '$' arcs entering nfa->post are turned into PLAIN arcs of the + * corresponding nfa->eos[] color. + */ +static void +pushfwd(struct nfa *nfa, + FILE *f) /* for debug output; NULL none */ +{ + struct state *s; + struct state *nexts; + struct arc *a; + struct arc *nexta; + struct state *intermediates; + int progress; + + /* find and push until there are no more */ + do + { + progress = 0; + for (s = nfa->states; s != NULL && !NISERR(); s = nexts) + { + nexts = s->next; + intermediates = NULL; + for (a = s->ins; a != NULL && !NISERR(); a = nexta) + { + nexta = 
a->inchain; + if (a->type == '$' || a->type == AHEAD) + if (push(nfa, a, &intermediates)) + progress = 1; + } + /* clear tmp fields of intermediate states created here */ + while (intermediates != NULL) + { + struct state *ns = intermediates->tmp; + + intermediates->tmp = NULL; + intermediates = ns; + } + /* if s is now useless, get rid of it */ + if ((s->nins == 0 || s->nouts == 0) && !s->flag) + dropstate(nfa, s); + } + if (progress && f != NULL) + dumpnfa(nfa, f); + } while (progress && !NISERR()); + if (NISERR()) + return; + + /* + * Any $ constraints we were able to push to the post state can now be + * replaced by PLAIN arcs referencing the EOS or EOL colors. There should + * be no other $ or AHEAD arcs left in the NFA, though we do not check + * that here (compact() will fail if so). + */ + for (a = nfa->post->ins; a != NULL; a = nexta) + { + nexta = a->inchain; + if (a->type == '$') + { + assert(a->co == 0 || a->co == 1); + newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to); + freearc(nfa, a); + } + } +} + +/* + * push - push a forward constraint forward past its destination state + * + * Returns 1 if successful (which it always is unless the destination is the + * post state or we have an internal error), 0 if nothing happened. + * + * A significant property of this function is that it deletes no pre-existing + * states, and no inarcs of the constraint's to state other than the given + * constraint arc. This makes the loops in pushfwd() safe, at the cost that + * we may leave useless states behind. Therefore, we leave it to pushfwd() + * to delete such states. + * + * If the to state has multiple forward-constraint inarcs, and/or multiple + * compatible constraint outarcs, we only need to create one new intermediate + * state per combination of predecessor and successor states. *intermediates + * points to a list of such intermediate states for this to state (chained + * through their tmp fields). 
+ */ +static int +push(struct nfa *nfa, + struct arc *con, + struct state **intermediates) +{ + struct state *from = con->from; + struct state *to = con->to; + struct arc *a; + struct arc *nexta; + struct state *s; + + assert(to != from); /* should have gotten rid of this earlier */ + if (to->flag) /* can't push forward beyond end */ + return 0; + if (to->nouts == 0) + { /* dead end */ + freearc(nfa, con); + return 1; + } + + /* + * First, clone to state if necessary to avoid other inarcs. This may + * seem wasteful, but it simplifies the logic, and we'll get rid of the + * clone state again at the bottom. + */ + if (to->nins > 1) + { + s = newstate(nfa); + if (NISERR()) + return 0; + copyouts(nfa, to, s); /* duplicate outarcs */ + cparc(nfa, con, from, s); /* move constraint arc */ + freearc(nfa, con); + if (NISERR()) + return 0; + to = s; + con = to->ins; /* the clone's only inarc is the copied constraint */ + } + assert(to->nins == 1); + + /* propagate the constraint into the to state's outarcs */ + for (a = to->outs; a != NULL && !NISERR(); a = nexta) + { + nexta = a->outchain; + switch (combine(nfa, con, a)) + { + case INCOMPATIBLE: /* destroy the arc */ + freearc(nfa, a); + break; + case SATISFIED: /* no action needed */ + break; + case COMPATIBLE: /* swap the two arcs, more or less */ + /* need an intermediate state, but might have one already */ + for (s = *intermediates; s != NULL; s = s->tmp) + { + assert(s->nins > 0 && s->nouts > 0); + if (s->ins->from == from && s->outs->to == a->to) + break; + } + if (s == NULL) + { + s = newstate(nfa); + if (NISERR()) + return 0; + s->tmp = *intermediates; + *intermediates = s; + } + cparc(nfa, con, s, a->to); + cparc(nfa, a, from, s); + freearc(nfa, a); + break; + case REPLACEARC: /* replace arc's color */ + newarc(nfa, a->type, con->co, from, a->to); + freearc(nfa, a); + break; + default: + assert(NOTREACHED); + break; + } + } + + /* remaining outarcs, if any, incorporate the constraint */ + moveouts(nfa, to, from); + freearc(nfa, con); + /* to state is now useless, 
but we leave it to pushfwd() to clean up */ + return 1; +} + +/* + * combine - constraint lands on an arc, what happens? + * + * #def INCOMPATIBLE 1 // destroys arc + * #def SATISFIED 2 // constraint satisfied + * #def COMPATIBLE 3 // compatible but not satisfied yet + * #def REPLACEARC 4 // replace arc's color with constraint color + * + * "con" is the constraint arc being moved and "a" is the arc it runs into. + * The CA() macro packs the two arc types into one integer so that a single + * switch can dispatch on the pair. + */ +static int +combine(struct nfa *nfa, + struct arc *con, + struct arc *a) +{ +#define CA(ct,at) (((ct)<<CHAR_BIT) | (at)) + + switch (CA(con->type, a->type)) + { + case CA('^', PLAIN): /* newlines are handled separately */ + case CA('$', PLAIN): + return INCOMPATIBLE; + break; + case CA(AHEAD, PLAIN): /* color constraints meet colors */ + case CA(BEHIND, PLAIN): + if (con->co == a->co) + return SATISFIED; + if (con->co == RAINBOW) + { + /* con is satisfied unless arc's color is a pseudocolor */ + if (!(nfa->cm->cd[a->co].flags & PSEUDO)) + return SATISFIED; + } + else if (a->co == RAINBOW) + { + /* con is incompatible if it's for a pseudocolor */ + if (nfa->cm->cd[con->co].flags & PSEUDO) + return INCOMPATIBLE; + /* otherwise, constraint constrains arc to be only its color */ + return REPLACEARC; + } + return INCOMPATIBLE; + break; + case CA('^', '^'): /* collision, similar constraints */ + case CA('$', '$'): + if (con->co == a->co) /* true duplication */ + return SATISFIED; + return INCOMPATIBLE; + break; + case CA(AHEAD, AHEAD): /* collision, similar constraints */ + case CA(BEHIND, BEHIND): + if (con->co == a->co) /* true duplication */ + return SATISFIED; + if (con->co == RAINBOW) + { + /* con is satisfied unless arc's color is a pseudocolor */ + if (!(nfa->cm->cd[a->co].flags & PSEUDO)) + return SATISFIED; + } + else if (a->co == RAINBOW) + { + /* con is incompatible if it's for a pseudocolor */ + if (nfa->cm->cd[con->co].flags & PSEUDO) + return INCOMPATIBLE; + /* otherwise, constraint constrains arc to be only its color */ + return REPLACEARC; + } + return INCOMPATIBLE; + break; + case CA('^', BEHIND): /* collision, 
dissimilar constraints */ + case CA(BEHIND, '^'): + case CA('$', AHEAD): + case CA(AHEAD, '$'): + return INCOMPATIBLE; + break; + case CA('^', '$'): /* constraints passing each other */ + case CA('^', AHEAD): + case CA(BEHIND, '$'): + case CA(BEHIND, AHEAD): + case CA('$', '^'): + case CA('$', BEHIND): + case CA(AHEAD, '^'): + case CA(AHEAD, BEHIND): + case CA('^', LACON): + case CA(BEHIND, LACON): + case CA('$', LACON): + case CA(AHEAD, LACON): + return COMPATIBLE; + break; + } + assert(NOTREACHED); + return INCOMPATIBLE; /* for benefit of blind compilers */ +} + +/* + * fixempties - get rid of EMPTY arcs + * + * Works in phases: (1) fold away unflagged states whose only out-arc is an + * EMPTY; (2) likewise for states whose only in-arc is an EMPTY; (3) for each + * remaining target state, merge in the original non-EMPTY in-arcs of every + * state that reaches it through EMPTY arcs; (4) free all EMPTY arcs; + * (5) drop unflagged states left without in-arcs or without out-arcs. The + * two work arrays are allocated with MALLOC here and released with FREE + * before phase (4); an allocation failure is reported as REG_ESPACE. + */ +static void +fixempties(struct nfa *nfa, + FILE *f) /* for debug output; NULL none */ +{ + struct state *s; + struct state *s2; + struct state *nexts; + struct arc *a; + struct arc *nexta; + int totalinarcs; + struct arc **inarcsorig; + struct arc **arcarray; + int arccount; + int prevnins; + int nskip; + + /* + * First, get rid of any states whose sole out-arc is an EMPTY, since + * they're basically just aliases for their successor. The parsing + * algorithm creates enough of these that it's worth special-casing this. + */ + for (s = nfa->states; s != NULL && !NISERR(); s = nexts) + { + nexts = s->next; + if (s->flag || s->nouts != 1) + continue; + a = s->outs; + assert(a != NULL && a->outchain == NULL); + if (a->type != EMPTY) + continue; + if (s != a->to) + moveins(nfa, s, a->to); + dropstate(nfa, s); + } + + /* + * Similarly, get rid of any state with a single EMPTY in-arc, by folding + * it into its predecessor. 
+ */ + for (s = nfa->states; s != NULL && !NISERR(); s = nexts) + { + nexts = s->next; + /* while we're at it, ensure tmp fields are clear for next step */ + assert(s->tmp == NULL); + if (s->flag || s->nins != 1) + continue; + a = s->ins; + assert(a != NULL && a->inchain == NULL); + if (a->type != EMPTY) + continue; + if (s != a->from) + moveouts(nfa, s, a->from); + dropstate(nfa, s); + } + + if (NISERR()) + return; + + /* + * For each remaining NFA state, find all other states from which it is + * reachable by a chain of one or more EMPTY arcs. Then generate new arcs + * that eliminate the need for each such chain. + * + * We could replace a chain of EMPTY arcs that leads from a "from" state + * to a "to" state either by pushing non-EMPTY arcs forward (linking + * directly from "from"'s predecessors to "to") or by pulling them back + * (linking directly from "from" to "to"'s successors). We choose to + * always do the former; this choice is somewhat arbitrary, but the + * approach below requires that we uniformly do one or the other. + * + * Suppose we have a chain of N successive EMPTY arcs (where N can easily + * approach the size of the NFA). All of the intermediate states must + * have additional inarcs and outarcs, else they'd have been removed by + * the steps above. Assuming their inarcs are mostly not empties, we will + * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one + * state in the chain must be duplicated to lead to all its successor + * states as well. So there is no hope of doing less than O(N^2) work; + * however, we should endeavor to keep the big-O cost from being even + * worse than that, which it can easily become without care. In + * particular, suppose we were to copy all S1's inarcs forward to S2, and + * then also to S3, and then later we consider pushing S2's inarcs forward + * to S3. If we include the arcs already copied from S1 in that, we'd be + * doing O(N^3) work. 
(The duplicate-arc elimination built into newarc() + * and its cohorts would get rid of the extra arcs, but not without cost.) + * + * We can avoid this cost by treating only arcs that existed at the start + * of this phase as candidates to be pushed forward. To identify those, + * we remember the first inarc each state had to start with. We rely on + * the fact that newarc() and friends put new arcs on the front of their + * to-states' inchains, and that this phase never deletes arcs, so that + * the original arcs must be the last arcs in their to-states' inchains. + * + * So the process here is that, for each state in the NFA, we gather up + * all non-EMPTY inarcs of states that can reach the target state via + * EMPTY arcs. We then sort, de-duplicate, and merge these arcs into the + * target state's inchain. (We can safely use sort-merge for this as long + * as we update each state's original-arcs pointer after we add arcs to + * it; the sort step of mergeins probably changed the order of the old + * arcs.) + * + * Another refinement worth making is that, because we only add non-EMPTY + * arcs during this phase, and all added arcs have the same from-state as + * the non-EMPTY arc they were cloned from, we know ahead of time that any + * states having only EMPTY outarcs will be useless for lack of outarcs + * after we drop the EMPTY arcs. (They cannot gain non-EMPTY outarcs if + * they had none to start with.) So we need not bother to update the + * inchains of such states at all. + */ + + /* Remember the states' first original inarcs */ + /* ... 
and while at it, count how many old inarcs there are altogether */ + inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *)); + if (inarcsorig == NULL) + { + NERR(REG_ESPACE); + return; + } + totalinarcs = 0; + for (s = nfa->states; s != NULL; s = s->next) + { + inarcsorig[s->no] = s->ins; + totalinarcs += s->nins; + } + + /* + * Create a workspace for accumulating the inarcs to be added to the + * current target state. totalinarcs is probably a considerable + * overestimate of the space needed, but the NFA is unlikely to be large + * enough at this point to make it worth being smarter. + */ + arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *)); + if (arcarray == NULL) + { + NERR(REG_ESPACE); + FREE(inarcsorig); + return; + } + + /* And iterate over the target states */ + for (s = nfa->states; s != NULL && !NISERR(); s = s->next) + { + /* Ignore target states without non-EMPTY outarcs, per note above */ + if (!s->flag && !hasnonemptyout(s)) + continue; + + /* Find predecessor states and accumulate their original inarcs */ + arccount = 0; + for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts) + { + /* Add s2's original inarcs to arcarray[], but ignore empties */ + for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain) + { + if (a->type != EMPTY) + arcarray[arccount++] = a; + } + + /* Reset the tmp fields as we walk back */ + nexts = s2->tmp; + s2->tmp = NULL; + } + s->tmp = NULL; + assert(arccount <= totalinarcs); + + /* Remember how many original inarcs this state has */ + prevnins = s->nins; + + /* Add non-duplicate inarcs to target state */ + mergeins(nfa, s, arcarray, arccount); + + /* Now we must update the state's inarcsorig pointer */ + nskip = s->nins - prevnins; + a = s->ins; + while (nskip-- > 0) + a = a->inchain; + inarcsorig[s->no] = a; + } + + FREE(arcarray); + FREE(inarcsorig); + + if (NISERR()) + return; + + /* + * Now remove all the EMPTY arcs, since we don't need them anymore. 
+ */ + for (s = nfa->states; s != NULL; s = s->next) + { + for (a = s->outs; a != NULL; a = nexta) + { + nexta = a->outchain; + if (a->type == EMPTY) + freearc(nfa, a); + } + } + + /* + * And remove any states that have become useless. (This cleanup is not + * very thorough, and would be even less so if we tried to combine it with + * the previous step; but cleanup() will take care of anything we miss.) + */ + for (s = nfa->states; s != NULL; s = nexts) + { + nexts = s->next; + if ((s->nins == 0 || s->nouts == 0) && !s->flag) + dropstate(nfa, s); + } + + if (f != NULL) + dumpnfa(nfa, f); +} + +/* + * emptyreachable - recursively find all states that can reach s by EMPTY arcs + * + * The return value is the last such state found. Its tmp field links back + * to the next-to-last such state, and so on back to s, so that all these + * states can be located without searching the whole NFA. + * + * Since this is only used in fixempties(), we pass in the inarcsorig[] array + * maintained by that function. This lets us skip over all new inarcs, which + * are certainly not EMPTY arcs. + * + * The maximum recursion depth here is equal to the length of the longest + * loop-free chain of EMPTY arcs, which is surely no more than the size of + * the NFA ... but that could still be enough to cause trouble. 
+ */ +static struct state * +emptyreachable(struct nfa *nfa, + struct state *s, + struct state *lastfound, + struct arc **inarcsorig) +{ + struct arc *a; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(nfa->v->re)) + { + NERR(REG_ETOOBIG); + return lastfound; + } + + s->tmp = lastfound; /* link s back to the previously found state */ + lastfound = s; + for (a = inarcsorig[s->no]; a != NULL; a = a->inchain) + { + if (a->type == EMPTY && a->from->tmp == NULL) + lastfound = emptyreachable(nfa, a->from, lastfound, inarcsorig); + } + return lastfound; +} + +/* + * isconstraintarc - detect whether an arc is of a constraint type + * + * Returns 1 for '^', '$', BEHIND, AHEAD and LACON arcs, and 0 for any other + * arc type. + */ +static inline int +isconstraintarc(struct arc *a) +{ + switch (a->type) + { + case '^': + case '$': + case BEHIND: + case AHEAD: + case LACON: + return 1; + } + return 0; +} + +/* + * hasconstraintout - does state have a constraint out arc? + * + * Returns 1 as soon as one out-arc of s satisfies isconstraintarc(), else 0. + */ +static int +hasconstraintout(struct state *s) +{ + struct arc *a; + + for (a = s->outs; a != NULL; a = a->outchain) + { + if (isconstraintarc(a)) + return 1; + } + return 0; +} + +/* + * fixconstraintloops - get rid of loops containing only constraint arcs + * + * A loop of states that contains only constraint arcs is useless, since + * passing around the loop represents no forward progress. Moreover, it + * would cause infinite looping in pullback/pushfwd, so we need to get rid + * of such loops before doing that. + */ +static void +fixconstraintloops(struct nfa *nfa, + FILE *f) /* for debug output; NULL none */ +{ + struct state *s; + struct state *nexts; + struct arc *a; + struct arc *nexta; + int hasconstraints; + + /* + * In the trivial case of a state that loops to itself, we can just drop + * the constraint arc altogether. This is worth special-casing because + * such loops are far more common than loops containing multiple states. + * While we're at it, note whether any constraint arcs survive. 
+ */ + hasconstraints = 0; + for (s = nfa->states; s != NULL && !NISERR(); s = nexts) + { + nexts = s->next; + /* while we're at it, ensure tmp fields are clear for next step */ + assert(s->tmp == NULL); + for (a = s->outs; a != NULL && !NISERR(); a = nexta) + { + nexta = a->outchain; + if (isconstraintarc(a)) + { + if (a->to == s) + freearc(nfa, a); + else + hasconstraints = 1; + } + } + /* If we removed all the outarcs, the state is useless. */ + if (s->nouts == 0 && !s->flag) + dropstate(nfa, s); + } + + /* Nothing to do if no remaining constraint arcs */ + if (NISERR() || !hasconstraints) + return; + + /* + * Starting from each remaining NFA state, search outwards for a + * constraint loop. If we find a loop, break the loop, then start the + * search over. (We could possibly retain some state from the first scan, + * but it would complicate things greatly, and multi-state constraint + * loops are rare enough that it's not worth optimizing the case.) + */ +restart: + for (s = nfa->states; s != NULL && !NISERR(); s = s->next) + { + if (findconstraintloop(nfa, s)) + goto restart; + } + + if (NISERR()) + return; + + /* + * Now remove any states that have become useless. (This cleanup is not + * very thorough, and would be even less so if we tried to combine it with + * the previous step; but cleanup() will take care of anything we miss.) + * + * Because findconstraintloop intentionally doesn't reset all tmp fields, + * we have to clear them after it's done. This is a convenient place to + * do that, too. + */ + for (s = nfa->states; s != NULL; s = nexts) + { + nexts = s->next; + s->tmp = NULL; + if ((s->nins == 0 || s->nouts == 0) && !s->flag) + dropstate(nfa, s); + } + + if (f != NULL) + dumpnfa(nfa, f); +} + +/* + * findconstraintloop - recursively find a loop of constraint arcs + * + * If we find a loop, break it by calling breakconstraintloop(), then + * return 1; otherwise return 0. 
+ * + * State tmp fields are guaranteed all NULL on a success return, because + * breakconstraintloop does that. After a failure return, any state that + * is known not to be part of a loop is marked with s->tmp == s; this allows + * us not to have to re-prove that fact on later calls. (This convention is + * workable because we already eliminated single-state loops.) + * + * Note that the found loop doesn't necessarily include the first state we + * are called on. Any loop reachable from that state will do. + * + * The maximum recursion depth here is one more than the length of the longest + * loop-free chain of constraint arcs, which is surely no more than the size + * of the NFA ... but that could still be enough to cause trouble. + * + * If the stack-depth check fires, REG_ETOOBIG is raised and 1 is returned as + * well, without any loop having been broken and without the tmp fields being + * cleaned up; callers must consult NISERR() to tell the two cases apart. + */ +static int +findconstraintloop(struct nfa *nfa, struct state *s) +{ + struct arc *a; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(nfa->v->re)) + { + NERR(REG_ETOOBIG); + return 1; /* to exit as quickly as possible */ + } + + if (s->tmp != NULL) + { + /* Already proven uninteresting? */ + if (s->tmp == s) + return 0; + /* Found a loop involving s */ + breakconstraintloop(nfa, s); + /* The tmp fields have been cleaned up by breakconstraintloop */ + return 1; + } + for (a = s->outs; a != NULL; a = a->outchain) + { + if (isconstraintarc(a)) + { + struct state *sto = a->to; + + assert(sto != s); + s->tmp = sto; + if (findconstraintloop(nfa, sto)) + return 1; + } + } + + /* + * If we get here, no constraint loop exists leading out from s. Mark it + * with s->tmp == s so we need not rediscover that fact again later. + */ + s->tmp = s; + return 0; +} + +/* + * breakconstraintloop - break a loop of constraint arcs + * + * sinitial is any one member state of the loop. Each loop member's tmp + * field links to its successor within the loop. (Note that this function + * will reset all the tmp fields to NULL.) 
+ * + * We can break the loop by, for any one state S1 in the loop, cloning its + * loop successor state S2 (and possibly following states), and then moving + * all S1->S2 constraint arcs to point to the cloned S2. The cloned S2 should + * copy any non-constraint outarcs of S2. Constraint outarcs should be + * dropped if they point back to S1, else they need to be copied as arcs to + * similarly cloned states S3, S4, etc. In general, each cloned state copies + * non-constraint outarcs, drops constraint outarcs that would lead to itself + * or any earlier cloned state, and sends other constraint outarcs to newly + * cloned states. No cloned state will have any inarcs that aren't constraint + * arcs or do not lead from S1 or earlier-cloned states. It's okay to drop + * constraint back-arcs since they would not take us to any state we've not + * already been in; therefore, no new constraint loop is created. In this way + * we generate a modified NFA that can still represent every useful state + * sequence, but not sequences that represent state loops with no consumption + * of input data. Note that the set of cloned states will certainly include + * all of the loop member states other than S1, and it may also include + * non-loop states that are reachable from S2 via constraint arcs. This is + * important because there is no guarantee that findconstraintloop found a + * maximal loop (and searching for one would be NP-hard, so don't try). + * Frequently the "non-loop states" are actually part of a larger loop that + * we didn't notice, and indeed there may be several overlapping loops. + * This technique ensures convergence in such cases, while considering only + * the originally-found loop does not. + * + * If there is only one S1->S2 constraint arc, then that constraint is + * certainly satisfied when we enter any of the clone states. 
This means that
+ * in the common case where many of the constraint arcs are identically
+ * labeled, we can merge together clone states linked by a similarly-labeled
+ * constraint: if we can get to the first one we can certainly get to the
+ * second, so there's no need to distinguish. This greatly reduces the number
+ * of new states needed, so we preferentially break the given loop at a state
+ * pair where this is true.
+ *
+ * Furthermore, it's fairly common to find that a cloned successor state has
+ * no outarcs, especially if we're a bit aggressive about removing unnecessary
+ * outarcs. If that happens, then there is simply not any interesting state
+ * that can be reached through the predecessor's loop arcs, which means we can
+ * break the loop just by removing those loop arcs, with no new states added.
+ */
+static void
+breakconstraintloop(struct nfa *nfa, struct state *sinitial)
+{
+	struct state *s;
+	struct state *shead;
+	struct state *stail;
+	struct state *sclone;
+	struct state *nexts;
+	struct arc *refarc;
+	struct arc *a;
+	struct arc *nexta;
+
+	/*
+	 * Start by identifying which loop step we want to break at.
+	 * Preferentially this is one with only one constraint arc. (XXX are
+	 * there any other secondary heuristics we want to use here?) Set refarc
+	 * to point to the selected lone constraint arc, if there is one.
+	 */
+	refarc = NULL;
+	s = sinitial;
+	do
+	{
+		/* tmp links each loop member to its successor in the loop */
+		nexts = s->tmp;
+		assert(nexts != s);		/* should not see any one-element loops */
+		/* Once a lone arc has been selected, later steps are only walked past */
+		if (refarc == NULL)
+		{
+			int narcs = 0;
+
+			/* Count the constraint arcs leading from s to its loop successor */
+			for (a = s->outs; a != NULL; a = a->outchain)
+			{
+				if (a->to == nexts && isconstraintarc(a))
+				{
+					refarc = a;
+					narcs++;
+				}
+			}
+			assert(narcs > 0);
+			if (narcs > 1)
+				refarc = NULL;	/* multiple constraint arcs here, no good */
+		}
+		s = nexts;
+	} while (s != sinitial);
+
+	if (refarc)
+	{
+		/* break at the refarc */
+		shead = refarc->from;
+		stail = refarc->to;
+		assert(stail == shead->tmp);
+	}
+	else
+	{
+		/* for lack of a better idea, break after sinitial */
+		shead = sinitial;
+		stail = sinitial->tmp;
+	}
+
+	/*
+	 * Reset the tmp fields so that we can use them for local storage in
+	 * clonesuccessorstates. (findconstraintloop won't mind, since it's just
+	 * going to abandon its search anyway.)
+	 */
+	for (s = nfa->states; s != NULL; s = s->next)
+		s->tmp = NULL;
+
+	/*
+	 * Recursively build clone state(s) as needed.
+	 */
+	sclone = newstate(nfa);
+	if (sclone == NULL)
+	{
+		assert(NISERR());
+		return;
+	}
+
+	/* Both donemap arguments are NULL: this is the outermost call */
+	clonesuccessorstates(nfa, stail, sclone, shead, refarc,
+						 NULL, NULL, nfa->nstates);
+
+	if (NISERR())
+		return;
+
+	/*
+	 * It's possible that sclone has no outarcs at all, in which case it's
+	 * useless. (We don't try extremely hard to get rid of useless states
+	 * here, but this is an easy and fairly common case.)
+	 */
+	if (sclone->nouts == 0)
+	{
+		freestate(nfa, sclone);
+		sclone = NULL;
+	}
+
+	/*
+	 * Move shead's constraint-loop arcs to point to sclone, or just drop them
+	 * if we discovered we don't need sclone.
+	 */
+	for (a = shead->outs; a != NULL; a = nexta)
+	{
+		/* Fetch the chain link first, since a may be freed just below */
+		nexta = a->outchain;
+		if (a->to == stail && isconstraintarc(a))
+		{
+			if (sclone)
+				cparc(nfa, a, shead, sclone);
+			freearc(nfa, a);
+			if (NISERR())
+				break;
+		}
+	}
+}
+
+/*
+ * clonesuccessorstates - create a tree of constraint-arc successor states
+ *
+ * ssource is the state to be cloned, and sclone is the state to copy its
+ * outarcs into. sclone's inarcs, if any, should already be set up.
+ *
+ * spredecessor is the original predecessor state that we are trying to build
+ * successors for (it may not be the immediate predecessor of ssource).
+ * refarc, if not NULL, is the original constraint arc that is known to have
+ * been traversed out of spredecessor to reach the successor(s).
+ *
+ * For each cloned successor state, we transiently create a "donemap" that is
+ * a boolean array showing which source states we've already visited for this
+ * clone state. This prevents infinite recursion as well as useless repeat
+ * visits to the same state subtree (which can add up fast, since typical NFAs
+ * have multiple redundant arc pathways). Each donemap is a char array
+ * indexed by state number. The donemaps are all of the same size "nstates",
+ * which is nfa->nstates as of the start of the recursion. This is enough to
+ * have entries for all pre-existing states, but *not* entries for clone
+ * states created during the recursion. That's okay since we have no need to
+ * mark those.
+ *
+ * curdonemap is NULL when recursing to a new sclone state, or sclone's
+ * donemap when we are recursing without having created a new state (which we
+ * do when we decide we can merge a successor state into the current clone
+ * state). outerdonemap is NULL at the top level and otherwise the parent
+ * clone state's donemap.
+ *
+ * The successor states we create and fill here form a strict tree structure,
+ * with each state having exactly one predecessor, except that the toplevel
+ * state has no inarcs as yet (breakconstraintloop will add its inarcs from
+ * spredecessor after we're done). Thus, we can examine sclone's inarcs back
+ * to the root, plus refarc if any, to identify the set of constraints already
+ * known valid at the current point. This allows us to avoid generating extra
+ * successor states.
+ */
+static void
+clonesuccessorstates(struct nfa *nfa,
+					 struct state *ssource,
+					 struct state *sclone,
+					 struct state *spredecessor,
+					 struct arc *refarc,
+					 char *curdonemap,
+					 char *outerdonemap,
+					 int nstates)
+{
+	char *donemap;
+	struct arc *a;
+
+	/* Since this is recursive, it could be driven to stack overflow */
+	if (STACK_TOO_DEEP(nfa->v->re))
+	{
+		NERR(REG_ETOOBIG);
+		return;
+	}
+
+	/* If this state hasn't already got a donemap, create one */
+	donemap = curdonemap;
+	if (donemap == NULL)
+	{
+		donemap = (char *) MALLOC(nstates * sizeof(char));
+		if (donemap == NULL)
+		{
+			NERR(REG_ESPACE);
+			return;
+		}
+
+		if (outerdonemap != NULL)
+		{
+			/*
+			 * Not at outermost recursion level, so copy the outer level's
+			 * donemap; this ensures that we see states in process of being
+			 * visited at outer levels, or already merged into predecessor
+			 * states, as ones we shouldn't traverse back to.
+			 */
+			memcpy(donemap, outerdonemap, nstates * sizeof(char));
+		}
+		else
+		{
+			/* At outermost level, only spredecessor is off-limits */
+			memset(donemap, 0, nstates * sizeof(char));
+			assert(spredecessor->no < nstates);
+			donemap[spredecessor->no] = 1;
+		}
+	}
+
+	/* Mark ssource as visited in the donemap */
+	assert(ssource->no < nstates);
+	assert(donemap[ssource->no] == 0);
+	donemap[ssource->no] = 1;
+
+	/*
+	 * We proceed by first cloning all of ssource's outarcs, creating new
+	 * clone states as needed but not doing more with them than that. Then in
+	 * a second pass, recurse to process the child clone states. This allows
+	 * us to have only one child clone state per reachable source state, even
+	 * when there are multiple outarcs leading to the same state. Also, when
+	 * we do visit a child state, its set of inarcs is known exactly, which
+	 * makes it safe to apply the constraint-is-already-checked optimization.
+	 * Also, this ensures that we've merged all the states we can into the
+	 * current clone before we recurse to any children, thus possibly saving
+	 * them from making extra images of those states.
+	 *
+	 * While this function runs, child clone states of the current state are
+	 * marked by setting their tmp fields to point to the original state they
+	 * were cloned from. This makes it possible to detect multiple outarcs
+	 * leading to the same state, and also makes it easy to distinguish clone
+	 * states from original states (which will have tmp == NULL).
+	 */
+	for (a = ssource->outs; a != NULL && !NISERR(); a = a->outchain)
+	{
+		struct state *sto = a->to;
+
+		/*
+		 * We do not consider cloning successor states that have no constraint
+		 * outarcs; just link to them as-is. They cannot be part of a
+		 * constraint loop so there is no need to make copies. In particular,
+		 * this rule keeps us from trying to clone the post state, which would
+		 * be a bad idea.
+		 */
+		if (isconstraintarc(a) && hasconstraintout(sto))
+		{
+			struct state *prevclone;
+			int canmerge;
+			struct arc *a2;
+
+			/*
+			 * Back-link constraint arcs must not be followed. Nor is there a
+			 * need to revisit states previously merged into this clone.
+			 */
+			assert(sto->no < nstates);
+			if (donemap[sto->no] != 0)
+				continue;
+
+			/*
+			 * Check whether we already have a child clone state for this
+			 * source state.
+			 */
+			prevclone = NULL;
+			for (a2 = sclone->outs; a2 != NULL; a2 = a2->outchain)
+			{
+				if (a2->to->tmp == sto)
+				{
+					prevclone = a2->to;
+					break;
+				}
+			}
+
+			/*
+			 * If this arc is labeled the same as refarc, or the same as any
+			 * arc we must have traversed to get to sclone, then no additional
+			 * constraints need to be met to get to sto, so we should just
+			 * merge its outarcs into sclone.
+			 */
+			if (refarc && a->type == refarc->type && a->co == refarc->co)
+				canmerge = 1;
+			else
+			{
+				struct state *s;
+
+				canmerge = 0;
+				/*
+				 * Walk back from sclone through each state's first inarc.
+				 * A label is only accepted from a state whose inarc is its
+				 * sole one (nins == 1).
+				 */
+				for (s = sclone; s->ins; s = s->ins->from)
+				{
+					if (s->nins == 1 &&
+						a->type == s->ins->type && a->co == s->ins->co)
+					{
+						canmerge = 1;
+						break;
+					}
+				}
+			}
+
+			if (canmerge)
+			{
+				/*
+				 * We can merge into sclone. If we previously made a child
+				 * clone state, drop it; there's no need to visit it. (This
+				 * can happen if ssource has multiple pathways to sto, and we
+				 * only just now found one that is provably a no-op.)
+				 */
+				if (prevclone)
+					dropstate(nfa, prevclone);	/* kills our outarc, too */
+
+				/* Recurse to merge sto's outarcs into sclone */
+				clonesuccessorstates(nfa,
+									 sto,
+									 sclone,
+									 spredecessor,
+									 refarc,
+									 donemap,
+									 outerdonemap,
+									 nstates);
+				/* sto should now be marked as previously visited */
+				assert(NISERR() || donemap[sto->no] == 1);
+			}
+			else if (prevclone)
+			{
+				/*
+				 * We already have a clone state for this successor, so just
+				 * make another arc to it.
+				 */
+				cparc(nfa, a, sclone, prevclone);
+			}
+			else
+			{
+				/*
+				 * We need to create a new successor clone state.
+				 */
+				struct state *stoclone;
+
+				stoclone = newstate(nfa);
+				if (stoclone == NULL)
+				{
+					assert(NISERR());
+					break;
+				}
+				/* Mark it as to what it's a clone of */
+				stoclone->tmp = sto;
+				/* ... and add the outarc leading to it */
+				cparc(nfa, a, sclone, stoclone);
+			}
+		}
+		else
+		{
+			/*
+			 * Non-constraint outarcs just get copied to sclone, as do outarcs
+			 * leading to states with no constraint outarc.
+			 */
+			cparc(nfa, a, sclone, sto);
+		}
+	}
+
+	/*
+	 * If we are at outer level for this clone state, recurse to all its child
+	 * clone states, clearing their tmp fields as we go. (If we're not
+	 * outermost for sclone, leave this to be done by the outer call level.)
+	 * Note that if we have multiple outarcs leading to the same clone state,
+	 * it will only be recursed-to once.
+	 */
+	if (curdonemap == NULL)
+	{
+		for (a = sclone->outs; a != NULL && !NISERR(); a = a->outchain)
+		{
+			struct state *stoclone = a->to;
+			struct state *sto = stoclone->tmp;
+
+			/* A non-NULL tmp identifies a not-yet-processed child clone */
+			if (sto != NULL)
+			{
+				stoclone->tmp = NULL;
+				/* New clone state: no current donemap, ours becomes the outer one */
+				clonesuccessorstates(nfa,
+									 sto,
+									 stoclone,
+									 spredecessor,
+									 refarc,
+									 NULL,
+									 donemap,
+									 nstates);
+			}
+		}
+
+		/* Don't forget to free sclone's donemap when done with it */
+		FREE(donemap);
+	}
+}
+
+/*
+ * cleanup - clean up NFA after optimizations
+ */
+static void
+cleanup(struct nfa *nfa)
+{
+	struct state *s;
+	struct state *nexts;
+	int n;
+
+	if (NISERR())
+		return;
+
+	/* clear out unreachable or dead-end states */
+	/* use pre to mark reachable, then post to mark can-reach-post */
+	markreachable(nfa, nfa->pre, (struct state *) NULL, nfa->pre);
+	markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
+	/*
+	 * markcanreach only re-marks states that markreachable marked with pre,
+	 * so tmp == post identifies states that passed both traversals.
+	 */
+	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+	{
+		/* fetch the link first, since s may be dropped */
+		nexts = s->next;
+		if (s->tmp != nfa->post && !s->flag)
+			dropstate(nfa, s);
+	}
+	assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
+	cleartraverse(nfa, nfa->pre);
+	assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == NULL);
+	/* the nins==0 (final unreachable) case will be caught later */
+
+	/* renumber surviving states */
+	n = 0;
+	for (s = nfa->states; s != NULL; s = s->next)
+		s->no = n++;
+	nfa->nstates = n;
+}
+
+/*
+ * markreachable - recursive marking of reachable states
+ */
+static void
+markreachable(struct nfa *nfa,
+			  struct state *s,
+			  struct state *okay,	/* consider only states with this mark */
+			  struct state *mark)	/* the value to mark with */
+{
+	struct arc *a;
+
+	/* Since this is recursive, it could be driven to stack overflow */
+	if (STACK_TOO_DEEP(nfa->v->re))
+	{
+		NERR(REG_ETOOBIG);
+		return;
+	}
+
+	/* Skip states that do not currently carry the "okay" mark */
+	if (s->tmp != okay)
+		return;
+	s->tmp = mark;
+
+	/* Continue forward along every outarc */
+	for (a = s->outs; a != NULL; a = a->outchain)
+		markreachable(nfa, a->to, okay, mark);
+}
+
+/*
+ * markcanreach - recursive marking of states which can reach here
+ */
+static void
+markcanreach(struct nfa *nfa,
+			 struct state *s,
+			 struct state *okay,	/* consider only states with this mark */
+			 struct state *mark)	/* the value to mark with */
+{
+	struct arc *a;
+
+	/* Since this is recursive, it could be driven to stack overflow */
+	if (STACK_TOO_DEEP(nfa->v->re))
+	{
+		NERR(REG_ETOOBIG);
+		return;
+	}
+
+	/* Skip states that do not currently carry the "okay" mark */
+	if (s->tmp != okay)
+		return;
+	s->tmp = mark;
+
+	/* Continue backward along every inarc */
+	for (a = s->ins; a != NULL; a = a->inchain)
+		markcanreach(nfa, a->from, okay, mark);
+}
+
+/*
+ * analyze - ascertain potentially-useful facts about an optimized NFA
+ */
+static long						/* re_info bits to be ORed in */
+analyze(struct nfa *nfa)
+{
+	struct arc *a;
+	struct arc *aa;
+
+	if (NISERR())
+		return 0;
+
+	/* Detect whether NFA can't match anything */
+	if (nfa->pre->outs == NULL)
+		return REG_UIMPOSSIBLE;
+
+	/* Detect whether NFA matches all strings (possibly with length bounds) */
+	checkmatchall(nfa);
+
+	/* Detect whether NFA can possibly match a zero-length string */
+	/* (i.e., look for a two-arc path from pre, through one state, to post) */
+	for (a = nfa->pre->outs; a != NULL; a = a->outchain)
+		for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
+			if (aa->to == nfa->post)
+				return REG_UEMPTYMATCH;
+	return 0;
+}
+
+/*
+ * checkmatchall - does the NFA represent no more than a string length test?
+ *
+ * If so, set nfa->minmatchall and nfa->maxmatchall correctly (they are -1
+ * to begin with) and set the MATCHALL bit in nfa->flags.
+ *
+ * To succeed, we require all arcs to be PLAIN RAINBOW arcs, except for those
+ * for pseudocolors (i.e., BOS/BOL/EOS/EOL).
We must be able to reach the
+ * post state via RAINBOW arcs, and if there are any loops in the graph, they
+ * must be loop-to-self arcs, ensuring that each loop iteration consumes
+ * exactly one character. (Longer loops are problematic because they create
+ * non-consecutive possible match lengths; we have no good way to represent
+ * that situation for lengths beyond the DUPINF limit.)
+ *
+ * Pseudocolor arcs complicate things a little. We know that they can only
+ * appear as pre-state outarcs (for BOS/BOL) or post-state inarcs (for
+ * EOS/EOL). There, they must exactly replicate the parallel RAINBOW arcs,
+ * e.g. if the pre state has one RAINBOW outarc to state 2, it must have BOS
+ * and BOL outarcs to state 2, and no others. Missing or extra pseudocolor
+ * arcs can occur, meaning that the NFA involves some constraint on the
+ * adjacent characters, which makes it not a matchall NFA.
+ */
+static void
+checkmatchall(struct nfa *nfa)
+{
+	bool **haspaths;
+	struct state *s;
+	int i;
+
+	/*
+	 * If there are too many states, don't bother trying to detect matchall.
+	 * This limit serves to bound the time and memory we could consume below.
+	 * Note that even if the graph is all-RAINBOW, if there are significantly
+	 * more than DUPINF states then it's likely that there are paths of length
+	 * more than DUPINF, which would force us to fail anyhow. In practice,
+	 * plausible ways of writing a matchall regex with maximum finite path
+	 * length K tend not to have very many more than K states.
+	 */
+	if (nfa->nstates > DUPINF * 2)
+		return;
+
+	/*
+	 * First, scan all the states to verify that only RAINBOW arcs appear,
+	 * plus pseudocolor arcs adjacent to the pre and post states. This lets
+	 * us quickly eliminate most cases that aren't matchall NFAs.
+	 */
+	for (s = nfa->states; s != NULL; s = s->next)
+	{
+		struct arc *a;
+
+		for (a = s->outs; a != NULL; a = a->outchain)
+		{
+			if (a->type != PLAIN)
+				return;			/* any LACONs make it non-matchall */
+			if (a->co != RAINBOW)
+			{
+				if (nfa->cm->cd[a->co].flags & PSEUDO)
+				{
+					/*
+					 * Pseudocolor arc: verify it's in a valid place (this
+					 * seems quite unlikely to fail, but let's be sure).
+					 */
+					if (s == nfa->pre &&
+						(a->co == nfa->bos[0] || a->co == nfa->bos[1]))
+						 /* okay BOS/BOL arc */ ;
+					else if (a->to == nfa->post &&
+							 (a->co == nfa->eos[0] || a->co == nfa->eos[1]))
+						 /* okay EOS/EOL arc */ ;
+					else
+						return; /* unexpected pseudocolor arc */
+					/* We'll check these arcs some more below. */
+				}
+				else
+					return;		/* any other color makes it non-matchall */
+			}
+		}
+		/* Also, assert that the tmp fields are available for use. */
+		assert(s->tmp == NULL);
+	}
+
+	/*
+	 * The next cheapest check we can make is to verify that the BOS/BOL
+	 * outarcs of the pre state reach the same states as its RAINBOW outarcs.
+	 * If they don't, the NFA expresses some constraints on the character
+	 * before the matched string, making it non-matchall. Likewise, the
+	 * EOS/EOL inarcs of the post state must match its RAINBOW inarcs.
+	 */
+	if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) ||
+		!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) ||
+		!check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) ||
+		!check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]))
+		return;
+
+	/*
+	 * Initialize an array of path-length arrays, in which
+	 * checkmatchall_recurse will return per-state results. This lets us
+	 * memo-ize the recursive search and avoid exponential time consumption.
+	 */
+	haspaths = (bool **) MALLOC(nfa->nstates * sizeof(bool *));
+	if (haspaths == NULL)
+		return;					/* fail quietly */
+	/* A NULL entry means the corresponding state has not been visited */
+	memset(haspaths, 0, nfa->nstates * sizeof(bool *));
+
+	/*
+	 * Recursively search the graph for all-RAINBOW paths to the "post" state,
+	 * starting at the "pre" state, and computing the lengths of the paths.
+	 * (Given the preceding checks, there should be at least one such path.
+	 * However we could get back a false result anyway, in case there are
+	 * multi-state loops, paths exceeding DUPINF+1 length, or non-algorithmic
+	 * failures such as ENOMEM.)
+	 */
+	if (checkmatchall_recurse(nfa, nfa->pre, haspaths))
+	{
+		/* The useful result is the path length array for the pre state */
+		bool *haspath = haspaths[nfa->pre->no];
+		int minmatch,
+			maxmatch,
+			morematch;
+
+		assert(haspath != NULL);
+
+		/*
+		 * haspath[] now represents the set of possible path lengths; but we
+		 * want to reduce that to a min and max value, because it doesn't seem
+		 * worth complicating regexec.c to deal with nonconsecutive possible
+		 * match lengths. Find min and max of first run of lengths, then
+		 * verify there are no nonconsecutive lengths.
+		 */
+		/* minmatch: the smallest length that is possible */
+		for (minmatch = 0; minmatch <= DUPINF + 1; minmatch++)
+		{
+			if (haspath[minmatch])
+				break;
+		}
+		assert(minmatch <= DUPINF + 1); /* else checkmatchall_recurse lied */
+		/* maxmatch: the last length in the run that starts at minmatch */
+		for (maxmatch = minmatch; maxmatch < DUPINF + 1; maxmatch++)
+		{
+			if (!haspath[maxmatch + 1])
+				break;
+		}
+		/* any possible length beyond that run means a gap exists */
+		for (morematch = maxmatch + 1; morematch <= DUPINF + 1; morematch++)
+		{
+			if (haspath[morematch])
+			{
+				haspath = NULL; /* fail, there are nonconsecutive lengths */
+				break;
+			}
+		}
+
+		/* haspath doubles as the success flag from here on */
+		if (haspath != NULL)
+		{
+			/*
+			 * Success, so record the info. Here we have a fine point: the
+			 * path length from the pre state includes the pre-to-initial
+			 * transition, so it's one more than the actually matched string
+			 * length. (We avoided counting the final-to-post transition
+			 * within checkmatchall_recurse, but not this one.) This is why
+			 * checkmatchall_recurse allows one more level of path length than
+			 * might seem necessary. This decrement also takes care of
+			 * converting checkmatchall_recurse's definition of "infinity" as
+			 * "DUPINF+1" to our normal representation as "DUPINF".
+			 */
+			assert(minmatch > 0);	/* else pre and post states were adjacent */
+			nfa->minmatchall = minmatch - 1;
+			nfa->maxmatchall = maxmatch - 1;
+			nfa->flags |= MATCHALL;
+		}
+	}
+
+	/* Clean up */
+	/* (entries for states that were never visited are still NULL) */
+	for (i = 0; i < nfa->nstates; i++)
+	{
+		if (haspaths[i] != NULL)
+			FREE(haspaths[i]);
+	}
+	FREE(haspaths);
+}
+
+/*
+ * checkmatchall_recurse - recursive search for checkmatchall
+ *
+ * s is the state to be examined in this recursion level.
+ * haspaths[] is an array of per-state exit path length arrays.
+ *
+ * We return true if the search was performed successfully, false if
+ * we had to fail because of multi-state loops or other internal reasons.
+ * (Because "dead" states that can't reach the post state have been
+ * eliminated, and we already verified that only RAINBOW and matching
+ * pseudocolor arcs exist, every state should have RAINBOW path(s) to
+ * the post state. Hence we take a false result from recursive calls
+ * as meaning that we'd better fail altogether, not just that that
+ * particular state can't reach the post state.)
+ *
+ * On success, we store a malloc'd result array in haspaths[s->no],
+ * showing the possible path lengths from s to the post state.
+ * Each state's haspath[] array is of length DUPINF+2. The entries from
+ * k = 0 to DUPINF are true if there is an all-RAINBOW path of length k
+ * from this state to the string end. haspath[DUPINF+1] is true if all
+ * path lengths >= DUPINF+1 are possible. (Situations that cannot be
+ * represented under these rules cause failure.)
+ *
+ * checkmatchall is responsible for eventually freeing the haspath[] arrays.
+ */
+static bool
+checkmatchall_recurse(struct nfa *nfa, struct state *s, bool **haspaths)
+{
+	bool result = false;
+	bool foundloop = false;
+	bool *haspath;
+	struct arc *a;
+
+	/*
+	 * Since this is recursive, it could be driven to stack overflow. But we
+	 * need not treat that as a hard failure; just deem the NFA non-matchall.
+	 */
+	if (STACK_TOO_DEEP(nfa->v->re))
+		return false;
+
+	/* In case the search takes a long time, check for cancel */
+	if (CANCEL_REQUESTED(nfa->v->re))
+	{
+		NERR(REG_CANCEL);
+		return false;
+	}
+
+	/* Create a haspath array for this state */
+	haspath = (bool *) MALLOC((DUPINF + 2) * sizeof(bool));
+	if (haspath == NULL)
+		return false;			/* again, treat as non-matchall */
+	memset(haspath, 0, (DUPINF + 2) * sizeof(bool));
+
+	/* Mark this state as being visited */
+	assert(s->tmp == NULL);
+	s->tmp = s;
+
+	for (a = s->outs; a != NULL; a = a->outchain)
+	{
+		if (a->co != RAINBOW)
+			continue;			/* ignore pseudocolor arcs */
+		if (a->to == nfa->post)
+		{
+			/* We found an all-RAINBOW path to the post state */
+			result = true;
+
+			/*
+			 * Mark this state as being zero steps away from the string end
+			 * (the transition to the post state isn't counted).
+			 */
+			haspath[0] = true;
+		}
+		else if (a->to == s)
+		{
+			/* We found a cycle of length 1, which we'll deal with below. */
+			foundloop = true;
+		}
+		else if (a->to->tmp != NULL)
+		{
+			/* It's busy, so we found a cycle of length > 1, so fail. */
+			result = false;
+			break;
+		}
+		else
+		{
+			/* Consider paths forward through this to-state. */
+			bool *nexthaspath;
+			int i;
+
+			/* If to-state was not already visited, recurse */
+			if (haspaths[a->to->no] == NULL)
+			{
+				result = checkmatchall_recurse(nfa, a->to, haspaths);
+				/* Fail if any recursive path fails */
+				if (!result)
+					break;
+			}
+			else
+			{
+				/* The previous visit must have found path(s) to the end */
+				result = true;
+			}
+			assert(a->to->tmp == NULL);
+			nexthaspath = haspaths[a->to->no];
+			assert(nexthaspath != NULL);
+
+			/*
+			 * Now, for every path of length i from a->to to the string end,
+			 * there is a path of length i + 1 from s to the string end.
+			 */
+			if (nexthaspath[DUPINF] != nexthaspath[DUPINF + 1])
+			{
+				/*
+				 * a->to has a path of length exactly DUPINF, but not longer;
+				 * or it has paths of all lengths > DUPINF but not one of
+				 * exactly that length. In either case, we cannot represent
+				 * the possible path lengths from s correctly, so fail.
+				 */
+				result = false;
+				break;
+			}
+			/* Merge knowledge of these path lengths into what we have */
+			for (i = 0; i < DUPINF; i++)
+				haspath[i + 1] |= nexthaspath[i];
+			/* Infinity + 1 is still infinity */
+			haspath[DUPINF + 1] |= nexthaspath[DUPINF + 1];
+		}
+	}
+
+	if (result && foundloop)
+	{
+		/*
+		 * If there is a length-1 loop at this state, then find the shortest
+		 * known path length to the end. The loop means that every larger
+		 * path length is possible, too. (It doesn't matter whether any of
+		 * the longer lengths were already known possible.)
+		 */
+		int i;
+
+		/* stop at the first (shortest) length that is known possible */
+		for (i = 0; i <= DUPINF; i++)
+		{
+			if (haspath[i])
+				break;
+		}
+		/* then set every later entry, through the DUPINF+1 slot */
+		for (i++; i <= DUPINF + 1; i++)
+			haspath[i] = true;
+	}
+
+	/* Report out the completed path length map */
+	/* (it is recorded whether or not result is true) */
+	assert(s->no < nfa->nstates);
+	assert(haspaths[s->no] == NULL);
+	haspaths[s->no] = haspath;
+
+	/* Mark state no longer busy */
+	s->tmp = NULL;
+
+	return result;
+}
+
+/*
+ * check_out_colors_match - subroutine for checkmatchall
+ *
+ * Check whether the set of states reachable from s by arcs of color co1
+ * is equivalent to the set reachable by arcs of color co2.
+ * checkmatchall already verified that all of the NFA's arcs are PLAIN,
+ * so we need not examine arc types here.
+ */
+static bool
+check_out_colors_match(struct state *s, color co1, color co2)
+{
+	bool result = true;
+	struct arc *a;
+
+	/*
+	 * To do this in linear time, we assume that the NFA contains no duplicate
+	 * arcs. Run through the out-arcs, marking states reachable by arcs of
+	 * color co1. Run through again, un-marking states reachable by arcs of
+	 * color co2; if we see a not-marked state, we know this co2 arc is
+	 * unmatched. Then run through again, checking for still-marked states,
+	 * and in any case leaving all the tmp fields reset to NULL.
+	 */
+	/* Pass 1: mark the to-state of every co1 arc */
+	for (a = s->outs; a != NULL; a = a->outchain)
+	{
+		if (a->co == co1)
+		{
+			assert(a->to->tmp == NULL);
+			a->to->tmp = a->to;
+		}
+	}
+	/* Pass 2: un-mark the to-state of every co2 arc */
+	for (a = s->outs; a != NULL; a = a->outchain)
+	{
+		if (a->co == co2)
+		{
+			if (a->to->tmp != NULL)
+				a->to->tmp = NULL;
+			else
+				result = false; /* unmatched co2 arc */
+		}
+	}
+	/* Pass 3: any mark still present belongs to an unmatched co1 arc */
+	for (a = s->outs; a != NULL; a = a->outchain)
+	{
+		if (a->co == co1)
+		{
+			if (a->to->tmp != NULL)
+			{
+				result = false; /* unmatched co1 arc */
+				a->to->tmp = NULL;
+			}
+		}
+	}
+	return result;
+}
+
+/*
+ * check_in_colors_match - subroutine for checkmatchall
+ *
+ * Check whether the set of states that can reach s by arcs of color co1
+ * is equivalent to the set that can reach s by arcs of color co2.
+ * checkmatchall already verified that all of the NFA's arcs are PLAIN,
+ * so we need not examine arc types here.
+ */
+static bool
+check_in_colors_match(struct state *s, color co1, color co2)
+{
+	bool result = true;
+	struct arc *a;
+
+	/*
+	 * Identical algorithm to check_out_colors_match, except examine the
+	 * from-states of s' inarcs.
+	 */
+	/* Pass 1: mark the from-state of every co1 inarc */
+	for (a = s->ins; a != NULL; a = a->inchain)
+	{
+		if (a->co == co1)
+		{
+			assert(a->from->tmp == NULL);
+			a->from->tmp = a->from;
+		}
+	}
+	/* Pass 2: un-mark the from-state of every co2 inarc */
+	for (a = s->ins; a != NULL; a = a->inchain)
+	{
+		if (a->co == co2)
+		{
+			if (a->from->tmp != NULL)
+				a->from->tmp = NULL;
+			else
+				result = false; /* unmatched co2 arc */
+		}
+	}
+	/* Pass 3: any mark still present belongs to an unmatched co1 arc */
+	for (a = s->ins; a != NULL; a = a->inchain)
+	{
+		if (a->co == co1)
+		{
+			if (a->from->tmp != NULL)
+			{
+				result = false; /* unmatched co1 arc */
+				a->from->tmp = NULL;
+			}
+		}
+	}
+	return result;
+}
+
+/*
+ * compact - construct the compact representation of an NFA
+ */
+static void
+compact(struct nfa *nfa,
+		struct cnfa *cnfa)
+{
+	struct state *s;
+	struct arc *a;
+	size_t nstates;
+	size_t narcs;
+	struct carc *ca;
+	struct carc *first;
+
+	assert(!NISERR());
+
+	/* Count states and arcs so the three arrays can be sized exactly */
+	nstates = 0;
+	narcs = 0;
+	for (s = nfa->states; s != NULL; s = s->next)
+	{
+		nstates++;
+		narcs += s->nouts + 1;	/* need one extra for endmarker */
+	}
+
+	cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
+	cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
+	cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
+	/* If any allocation failed, release whichever ones succeeded */
+	if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
+	{
+		if (cnfa->stflags != NULL)
+			FREE(cnfa->stflags);
+		if (cnfa->states != NULL)
+			FREE(cnfa->states);
+		if (cnfa->arcs != NULL)
+			FREE(cnfa->arcs);
+		NERR(REG_ESPACE);
+		return;
+	}
+	cnfa->nstates = nstates;
+	cnfa->pre = nfa->pre->no;
+	cnfa->post = nfa->post->no;
+	cnfa->bos[0] = nfa->bos[0];
+	cnfa->bos[1] = nfa->bos[1];
+	cnfa->eos[0] = nfa->eos[0];
+	cnfa->eos[1] = nfa->eos[1];
+	cnfa->ncolors = maxcolor(nfa->cm) + 1;
+	cnfa->flags = nfa->flags;
+	cnfa->minmatchall = nfa->minmatchall;
+	cnfa->maxmatchall = nfa->maxmatchall;
+
+	/* Copy each state's outarcs into one contiguous run of the arcs array */
+	ca = cnfa->arcs;
+	for (s = nfa->states; s != NULL; s = s->next)
+	{
+		assert((size_t) s->no < nstates);
+		cnfa->stflags[s->no] = 0;
+		cnfa->states[s->no] = ca;
+		first = ca;
+		for (a = s->outs; a != NULL; a = a->outchain)
+			switch (a->type)
+			{
+				case PLAIN:
+					ca->co = a->co;
+					ca->to = a->to->no;
+					ca++;
+					break;
+				case LACON:
+					assert(s->no != cnfa->pre);
+					assert(a->co >= 0);
+					/* LACON arcs are stored with color offset by ncolors */
+					ca->co = (color) (cnfa->ncolors + a->co);
+					ca->to = a->to->no;
+					ca++;
+					cnfa->flags |= HASLACONS;
+					break;
+				default:
+					NERR(REG_ASSERT);
+					return;
+			}
+		carcsort(first, ca - first);
+		/* Endmarker entry that terminates this state's arc run */
+		ca->co = COLORLESS;
+		ca->to = 0;
+		ca++;
+	}
+	assert(ca == &cnfa->arcs[narcs]);
+	assert(cnfa->nstates != 0);
+
+	/* mark no-progress states */
+	/* (the pre state itself and every state one outarc away from it) */
+	for (a = nfa->pre->outs; a != NULL; a = a->outchain)
+		cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
+	cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
+}
+
+/*
+ * carcsort - sort compacted-NFA arcs by color
+ */
+static void
+carcsort(struct carc *first, size_t n)
+{
+	/* zero or one entries are already in order */
+	if (n > 1)
+		qsort(first, n, sizeof(struct carc), carc_cmp);
+}
+
+/*
+ * carc_cmp - qsort comparator for carcsort
+ *
+ * Orders entries by color first, then by target state number.
+ */
+static int
+carc_cmp(const void *a, const void *b)
+{
+	const struct carc *aa = (const struct carc *) a;
+	const struct carc *bb = (const struct carc *) b;
+
+	if (aa->co < bb->co)
+		return -1;
+	if (aa->co > bb->co)
+		return +1;
+	if (aa->to < bb->to)
+		return -1;
+	if (aa->to > bb->to)
+		return +1;
+	return 0;
+}
+
+/*
+ * freecnfa - free a compacted NFA
+ */
+static void
+freecnfa(struct cnfa *cnfa)
+{
+	assert(!NULLCNFA(*cnfa));	/* not empty already */
+	/* release the three arrays allocated by compact */
+	FREE(cnfa->stflags);
+	FREE(cnfa->states);
+	FREE(cnfa->arcs);
+	ZAPCNFA(*cnfa);
+}
+
+/*
+ * dumpnfa - dump an NFA in human-readable form
+ */
+static void
+dumpnfa(struct nfa *nfa,
+		FILE *f)
+{
+	/* The body is compiled only under REG_DEBUG; otherwise this is a no-op */
+#ifdef REG_DEBUG
+	struct state *s;
+	int nstates = 0;
+	int narcs = 0;
+
+	fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
+	if (nfa->bos[0] != COLORLESS)
+		fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
+	if (nfa->bos[1] != COLORLESS)
+		fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
+	if (nfa->eos[0] != COLORLESS)
+		fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
+	if (nfa->eos[1] != COLORLESS)
+		fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
+	if (nfa->flags & HASLACONS)
+		fprintf(f, ", haslacons");
+	if (nfa->flags & MATCHALL)
+	{
+		fprintf(f, ", minmatchall %d", nfa->minmatchall);
+		if (nfa->maxmatchall == DUPINF)
+			fprintf(f, ", maxmatchall inf");
+		else
+			fprintf(f, ", maxmatchall %d", nfa->maxmatchall);
+	}
+	fprintf(f, "\n");
+	/* dump every state, totalling states and outarcs as we go */
+	for (s = nfa->states; s != NULL; s = s->next)
+	{
+		dumpstate(s, f);
+		nstates++;
+		narcs += s->nouts;
+	}
+	fprintf(f, "total of %d states, %d arcs\n", nstates, narcs);
+	/* colors are dumped only for an NFA that has no parent */
+	if (nfa->parent == NULL)
+		dumpcolors(nfa->cm, f);
+	fflush(f);
+#endif
+}
+
+#ifdef REG_DEBUG				/* subordinates of dumpnfa */
+
+/*
+ * dumpstate - dump an NFA state in human-readable form
+ */
+static void
+dumpstate(struct state *s,
+		  FILE *f)
+{
+	struct arc *a;
+
+	/* state number, "T" if tmp is set, then the flag character or '.' */
+	fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "",
+			(s->flag) ? s->flag : '.');
+	if (s->prev != NULL && s->prev->next != s)
+		fprintf(f, "\tstate chain bad\n");
+	if (s->nouts == 0)
+		fprintf(f, "\tno out arcs\n");
+	else
+		dumparcs(s, f);
+	/* report any arc on s's in-chain that does not actually point to s */
+	for (a = s->ins; a != NULL; a = a->inchain)
+	{
+		if (a->to != s)
+			fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
+					a->from->no, a->to->no, s->no);
+	}
+	fflush(f);
+}
+
+/*
+ * dumparcs - dump out-arcs in human-readable form
+ */
+static void
+dumparcs(struct state *s,
+		 FILE *f)
+{
+	int pos;
+	struct arc *a;
+
+	/* printing oldest arcs first is usually clearer */
+	/* so walk to the far end of the out-chain, then follow outchainRev back */
+	a = s->outs;
+	assert(a != NULL);
+	while (a->outchain != NULL)
+		a = a->outchain;
+	pos = 1;
+	do
+	{
+		dumparc(a, s, f);
+		/* start a new output line after every fifth arc */
+		if (pos == 5)
+		{
+			fprintf(f, "\n");
+			pos = 1;
+		}
+		else
+			pos++;
+		a = a->outchainRev;
+	} while (a != NULL);
+	if (pos != 1)
+		fprintf(f, "\n");
+}
+
+/*
+ * dumparc - dump one outarc in readable form, including prefixing tab
+ */
+static void
+dumparc(struct arc *a,
+		struct state *s,
+		FILE *f)
+{
+	struct arc *aa;
+
+	fprintf(f, "\t");
+	switch (a->type)
+	{
+		case PLAIN:
+			if (a->co == RAINBOW)
+				fprintf(f, "[*]");
+			else
+				fprintf(f, "[%ld]", (long) a->co);
+			break;
+		case AHEAD:
+			if (a->co == RAINBOW)
+				fprintf(f, ">*>");
+			else
+				fprintf(f, ">%ld>", (long) a->co);
+			break;
+		case BEHIND:
+			if (a->co == RAINBOW)
+				fprintf(f, "<*<");
+			else
+				fprintf(f, "<%ld<", (long) a->co);
+			break;
+		case LACON:
+			fprintf(f, ":%ld:", (long) a->co);
+			break;
+		case '^':
+		case '$':
+			fprintf(f, "%c%d", a->type, (int) a->co);
+			break;
+		case EMPTY:
+			break;
+		default:
+			fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
+			break;
+	}
+	if (a->from != s)
+		fprintf(f, "?%d?", a->from->no);
+	for (aa = a->from->outs; aa != NULL; aa = aa->outchain)
+		if (aa == a)
+			break;				/* NOTE BREAK OUT */
+	if (aa == NULL)
+		fprintf(f, "?!?");		/* missing from out-chain */
+	fprintf(f, "->");
+	if (a->to == NULL)
+	{
+		fprintf(f, "NULL");
+		return;
+	}
+	fprintf(f, "%d", a->to->no);
+	for (aa = a->to->ins; aa != NULL; aa
= aa->inchain) + if (aa == a) + break; /* NOTE BREAK OUT */ + if (aa == NULL) + fprintf(f, "?!?"); /* missing from in-chain */ +} +#endif /* REG_DEBUG */ + +/* + * dumpcnfa - dump a compacted NFA in human-readable form + */ +#ifdef REG_DEBUG +static void +dumpcnfa(struct cnfa *cnfa, + FILE *f) +{ + int st; + + fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post); + if (cnfa->bos[0] != COLORLESS) + fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]); + if (cnfa->bos[1] != COLORLESS) + fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]); + if (cnfa->eos[0] != COLORLESS) + fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]); + if (cnfa->eos[1] != COLORLESS) + fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]); + if (cnfa->flags & HASLACONS) + fprintf(f, ", haslacons"); + if (cnfa->flags & MATCHALL) + { + fprintf(f, ", minmatchall %d", cnfa->minmatchall); + if (cnfa->maxmatchall == DUPINF) + fprintf(f, ", maxmatchall inf"); + else + fprintf(f, ", maxmatchall %d", cnfa->maxmatchall); + } + fprintf(f, "\n"); + for (st = 0; st < cnfa->nstates; st++) + dumpcstate(st, cnfa, f); + fflush(f); +} +#endif + +#ifdef REG_DEBUG /* subordinates of dumpcnfa */ + +/* + * dumpcstate - dump a compacted-NFA state in human-readable form + */ +static void +dumpcstate(int st, + struct cnfa *cnfa, + FILE *f) +{ + struct carc *ca; + int pos; + + fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? 
":" : "."); + pos = 1; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co == RAINBOW) + fprintf(f, "\t[*]->%d", ca->to); + else if (ca->co < cnfa->ncolors) + fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to); + else + fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to); + if (pos == 5) + { + fprintf(f, "\n"); + pos = 1; + } + else + pos++; + } + if (ca == cnfa->states[st] || pos != 1) + fprintf(f, "\n"); + fflush(f); +} + +#endif /* REG_DEBUG */ diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c new file mode 100644 index 0000000..bbbd61c --- /dev/null +++ b/src/backend/regex/regc_pg_locale.c @@ -0,0 +1,944 @@ +/*------------------------------------------------------------------------- + * + * regc_pg_locale.c + * ctype functions adapted to work on pg_wchar (a/k/a chr), + * and functions to cache the results of wholesale ctype probing. + * + * This file is #included by regcomp.c; it's not meant to compile standalone. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/regex/regc_pg_locale.c + * + *------------------------------------------------------------------------- + */ + +#include "catalog/pg_collation.h" +#include "utils/pg_locale.h" + +/* + * To provide as much functionality as possible on a variety of platforms, + * without going so far as to implement everything from scratch, we use + * several implementation strategies depending on the situation: + * + * 1. In C/POSIX collations, we use hard-wired code. We can't depend on + * the <ctype.h> functions since those will obey LC_CTYPE. Note that these + * collations don't give a fig about multibyte characters. + * + * 2. In the "default" collation (which is supposed to obey LC_CTYPE): + * + * 2a. When working in UTF8 encoding, we use the <wctype.h> functions. 
+ * This assumes that every platform uses Unicode codepoints directly + * as the wchar_t representation of Unicode. On some platforms + * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF. + * + * 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar + * values up to 255, and punt for values above that. This is 100% correct + * only in single-byte encodings such as LATINn. However, non-Unicode + * multibyte encodings are mostly Far Eastern character sets for which the + * properties being tested here aren't very relevant for higher code values + * anyway. The difficulty with using the <wctype.h> functions with + * non-Unicode multibyte encodings is that we can have no certainty that + * the platform's wchar_t representation matches what we do in pg_wchar + * conversions. + * + * 3. Other collations are only supported on platforms that HAVE_LOCALE_T. + * Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h> + * functions, under exactly the same cases as #2. + * + * There is one notable difference between cases 2 and 3: in the "default" + * collation we force ASCII letters to follow ASCII upcase/downcase rules, + * while in a non-default collation we just let the library functions do what + * they will. The case where this matters is treatment of I/i in Turkish, + * and the behavior is meant to match the upper()/lower() SQL functions. + * + * We store the active collation setting in static variables. In principle + * it could be passed down to here via the regex library's "struct vars" data + * structure; but that would require somewhat invasive changes in the regex + * library, and right now there's no real benefit to be gained from that. + * + * NB: the coding here assumes pg_wchar is an unsigned type. 
+ */ + +typedef enum +{ + PG_REGEX_LOCALE_C, /* C locale (encoding independent) */ + PG_REGEX_LOCALE_WIDE, /* Use <wctype.h> functions */ + PG_REGEX_LOCALE_1BYTE, /* Use <ctype.h> functions */ + PG_REGEX_LOCALE_WIDE_L, /* Use locale_t <wctype.h> functions */ + PG_REGEX_LOCALE_1BYTE_L, /* Use locale_t <ctype.h> functions */ + PG_REGEX_LOCALE_ICU /* Use ICU uchar.h functions */ +} PG_Locale_Strategy; + +static PG_Locale_Strategy pg_regex_strategy; +static pg_locale_t pg_regex_locale; +static Oid pg_regex_collation; + +/* + * Hard-wired character properties for C locale + */ +#define PG_ISDIGIT 0x01 +#define PG_ISALPHA 0x02 +#define PG_ISALNUM (PG_ISDIGIT | PG_ISALPHA) +#define PG_ISUPPER 0x04 +#define PG_ISLOWER 0x08 +#define PG_ISGRAPH 0x10 +#define PG_ISPRINT 0x20 +#define PG_ISPUNCT 0x40 +#define PG_ISSPACE 0x80 + +static const unsigned char pg_char_properties[128] = { + /* NUL */ 0, + /* ^A */ 0, + /* ^B */ 0, + /* ^C */ 0, + /* ^D */ 0, + /* ^E */ 0, + /* ^F */ 0, + /* ^G */ 0, + /* ^H */ 0, + /* ^I */ PG_ISSPACE, + /* ^J */ PG_ISSPACE, + /* ^K */ PG_ISSPACE, + /* ^L */ PG_ISSPACE, + /* ^M */ PG_ISSPACE, + /* ^N */ 0, + /* ^O */ 0, + /* ^P */ 0, + /* ^Q */ 0, + /* ^R */ 0, + /* ^S */ 0, + /* ^T */ 0, + /* ^U */ 0, + /* ^V */ 0, + /* ^W */ 0, + /* ^X */ 0, + /* ^Y */ 0, + /* ^Z */ 0, + /* ^[ */ 0, + /* ^\ */ 0, + /* ^] */ 0, + /* ^^ */ 0, + /* ^_ */ 0, + /* */ PG_ISPRINT | PG_ISSPACE, + /* ! 
*/ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* " */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* # */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* $ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* % */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* & */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ' */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ( */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ) */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* * */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* + */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* , */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* - */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* . */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* / */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* 0 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 1 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 2 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 3 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 4 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 5 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 6 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 7 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 8 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 9 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* : */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ; */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* < */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* = */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* > */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ? 
*/ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* @ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* A */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* B */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* C */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* D */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* E */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* F */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* G */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* H */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* I */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* J */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* K */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* L */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* M */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* N */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* O */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* P */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* Q */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* R */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* S */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* T */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* U */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* V */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* W */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* X */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* Y */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* Z */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* [ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* \ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ] */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ^ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* _ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ` */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* a */ PG_ISALPHA | PG_ISLOWER | 
PG_ISGRAPH | PG_ISPRINT, + /* b */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* c */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* d */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* e */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* f */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* g */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* h */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* i */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* j */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* k */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* l */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* m */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* n */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* o */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* p */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* q */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* r */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* s */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* t */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* u */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* v */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* w */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* x */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* y */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* z */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* { */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* | */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* } */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ~ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* DEL */ 0 +}; + + +/* + * pg_set_regex_collation: set collation for these functions to obey + * + * This is called when beginning compilation or execution of a regexp. 
+ * Since there's no need for reentrancy of regexp operations, it's okay + * to store the results in static variables. + */ +void +pg_set_regex_collation(Oid collation) +{ + if (lc_ctype_is_c(collation)) + { + /* C/POSIX collations use this path regardless of database encoding */ + pg_regex_strategy = PG_REGEX_LOCALE_C; + pg_regex_locale = 0; + pg_regex_collation = C_COLLATION_OID; + } + else + { + if (collation == DEFAULT_COLLATION_OID) + pg_regex_locale = 0; + else if (OidIsValid(collation)) + { + /* + * NB: pg_newlocale_from_collation will fail if not HAVE_LOCALE_T; + * the case of pg_regex_locale != 0 but not HAVE_LOCALE_T does not + * have to be considered below. + */ + pg_regex_locale = pg_newlocale_from_collation(collation); + } + else + { + /* + * This typically means that the parser could not resolve a + * conflict of implicit collations, so report it that way. + */ + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for regular expression"), + errhint("Use the COLLATE clause to set the collation explicitly."))); + } + + if (pg_regex_locale && !pg_regex_locale->deterministic) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("nondeterministic collations are not supported for regular expressions"))); + +#ifdef USE_ICU + if (pg_regex_locale && pg_regex_locale->provider == COLLPROVIDER_ICU) + pg_regex_strategy = PG_REGEX_LOCALE_ICU; + else +#endif + if (GetDatabaseEncoding() == PG_UTF8) + { + if (pg_regex_locale) + pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + else + pg_regex_strategy = PG_REGEX_LOCALE_WIDE; + } + else + { + if (pg_regex_locale) + pg_regex_strategy = PG_REGEX_LOCALE_1BYTE_L; + else + pg_regex_strategy = PG_REGEX_LOCALE_1BYTE; + } + + pg_regex_collation = collation; + } +} + +static int +pg_wc_isdigit(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISDIGIT)); + case 
PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswdigit((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + isdigit((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswdigit_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + isdigit_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_isdigit(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int +pg_wc_isalpha(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALPHA)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswalpha((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + isalpha((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswalpha_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + isalpha_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_isalpha(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int +pg_wc_isalnum(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALNUM)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswalnum((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + 
return (c <= (pg_wchar) UCHAR_MAX && + isalnum((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswalnum_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + isalnum_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_isalnum(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int +pg_wc_isword(pg_wchar c) +{ + /* We define word characters as alnum class plus underscore */ + if (c == CHR('_')) + return 1; + return pg_wc_isalnum(c); +} + +static int +pg_wc_isupper(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISUPPER)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswupper((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + isupper((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswupper_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + isupper_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_isupper(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int +pg_wc_islower(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISLOWER)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswlower((wint_t) c); + /* FALL THRU */ + case 
PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + islower((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswlower_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + islower_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_islower(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int +pg_wc_isgraph(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISGRAPH)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswgraph((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + isgraph((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswgraph_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + isgraph_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_isgraph(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int +pg_wc_isprint(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPRINT)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswprint((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + isprint((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if 
(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswprint_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + isprint_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_isprint(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int +pg_wc_ispunct(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPUNCT)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswpunct((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + ispunct((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswpunct_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + ispunct_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_ispunct(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int +pg_wc_isspace(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISSPACE)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswspace((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + isspace((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswspace_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case 
PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + isspace_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_isspace(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static pg_wchar +pg_wc_toupper(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + if (c <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) c); + return c; + case PG_REGEX_LOCALE_WIDE: + /* force C behavior for ASCII characters, per comments above */ + if (c <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) c); + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return towupper((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + /* force C behavior for ASCII characters, per comments above */ + if (c <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) c); + if (c <= (pg_wchar) UCHAR_MAX) + return toupper((unsigned char) c); + return c; + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return towupper_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + if (c <= (pg_wchar) UCHAR_MAX) + return toupper_l((unsigned char) c, pg_regex_locale->info.lt); +#endif + return c; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_toupper(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static pg_wchar +pg_wc_tolower(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + if (c <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) c); + return c; + case PG_REGEX_LOCALE_WIDE: + /* force C behavior for ASCII characters, per comments above */ + if (c <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) c); + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return towlower((wint_t) c); + /* FALL THRU 
*/ + case PG_REGEX_LOCALE_1BYTE: + /* force C behavior for ASCII characters, per comments above */ + if (c <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) c); + if (c <= (pg_wchar) UCHAR_MAX) + return tolower((unsigned char) c); + return c; + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return towlower_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + if (c <= (pg_wchar) UCHAR_MAX) + return tolower_l((unsigned char) c, pg_regex_locale->info.lt); +#endif + return c; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_tolower(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + + +/* + * These functions cache the results of probing libc's ctype behavior for + * all character codes of interest in a given encoding/collation. The + * result is provided as a "struct cvec", but notice that the representation + * is a touch different from a cvec created by regc_cvec.c: we allocate the + * chrs[] and ranges[] arrays separately from the struct so that we can + * realloc them larger at need. This is okay since the cvecs made here + * should never be freed by freecvec(). + * + * We use malloc not palloc since we mustn't lose control on out-of-memory; + * the main regex code expects us to return a failure indication instead. 
+ */ + +typedef int (*pg_wc_probefunc) (pg_wchar c); + +typedef struct pg_ctype_cache +{ + pg_wc_probefunc probefunc; /* pg_wc_isalpha or a sibling */ + Oid collation; /* collation this entry is for */ + struct cvec cv; /* cache entry contents */ + struct pg_ctype_cache *next; /* chain link */ +} pg_ctype_cache; + +static pg_ctype_cache *pg_ctype_cache_list = NULL; + +/* + * Add a chr or range to pcc->cv; return false if run out of memory + */ +static bool +store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs) +{ + chr *newchrs; + + if (nchrs > 1) + { + if (pcc->cv.nranges >= pcc->cv.rangespace) + { + pcc->cv.rangespace *= 2; + newchrs = (chr *) realloc(pcc->cv.ranges, + pcc->cv.rangespace * sizeof(chr) * 2); + if (newchrs == NULL) + return false; + pcc->cv.ranges = newchrs; + } + pcc->cv.ranges[pcc->cv.nranges * 2] = chr1; + pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1; + pcc->cv.nranges++; + } + else + { + assert(nchrs == 1); + if (pcc->cv.nchrs >= pcc->cv.chrspace) + { + pcc->cv.chrspace *= 2; + newchrs = (chr *) realloc(pcc->cv.chrs, + pcc->cv.chrspace * sizeof(chr)); + if (newchrs == NULL) + return false; + pcc->cv.chrs = newchrs; + } + pcc->cv.chrs[pcc->cv.nchrs++] = chr1; + } + return true; +} + +/* + * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all + * chrs satisfying the probe function. The active collation is the one + * previously set by pg_set_regex_collation. Return NULL if out of memory. + * + * Note that the result must not be freed or modified by caller. + */ +static struct cvec * +pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) +{ + pg_ctype_cache *pcc; + pg_wchar max_chr; + pg_wchar cur_chr; + int nmatches; + chr *newchrs; + + /* + * Do we already have the answer cached? + */ + for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next) + { + if (pcc->probefunc == probefunc && + pcc->collation == pg_regex_collation) + return &pcc->cv; + } + + /* + * Nope, so initialize some workspace ... 
+ */ + pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache)); + if (pcc == NULL) + return NULL; + pcc->probefunc = probefunc; + pcc->collation = pg_regex_collation; + pcc->cv.nchrs = 0; + pcc->cv.chrspace = 128; + pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr)); + pcc->cv.nranges = 0; + pcc->cv.rangespace = 64; + pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2); + if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL) + goto out_of_memory; + pcc->cv.cclasscode = cclasscode; + + /* + * Decide how many character codes we ought to look through. In general + * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at + * runtime using the "high colormap" mechanism. However, in C locale + * there's no need to go further than 127, and if we only have a 1-byte + * <ctype.h> API there's no need to go further than that can handle. + * + * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the + * output cvec as not having any locale-dependent behavior, since there + * will be no need to do any run-time locale checks. (The #if's here + * would always be true for production values of MAX_SIMPLE_CHR, but it's + * useful to allow it to be small for testing purposes.) + */ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: +#if MAX_SIMPLE_CHR >= 127 + max_chr = (pg_wchar) 127; + pcc->cv.cclasscode = -1; +#else + max_chr = (pg_wchar) MAX_SIMPLE_CHR; +#endif + break; + case PG_REGEX_LOCALE_WIDE: + case PG_REGEX_LOCALE_WIDE_L: + max_chr = (pg_wchar) MAX_SIMPLE_CHR; + break; + case PG_REGEX_LOCALE_1BYTE: + case PG_REGEX_LOCALE_1BYTE_L: +#if MAX_SIMPLE_CHR >= UCHAR_MAX + max_chr = (pg_wchar) UCHAR_MAX; + pcc->cv.cclasscode = -1; +#else + max_chr = (pg_wchar) MAX_SIMPLE_CHR; +#endif + break; + case PG_REGEX_LOCALE_ICU: + max_chr = (pg_wchar) MAX_SIMPLE_CHR; + break; + default: + max_chr = 0; /* can't get here, but keep compiler quiet */ + break; + } + + /* + * And scan 'em ... 
+ */ + nmatches = 0; /* number of consecutive matches */ + + for (cur_chr = 0; cur_chr <= max_chr; cur_chr++) + { + if ((*probefunc) (cur_chr)) + nmatches++; + else if (nmatches > 0) + { + if (!store_match(pcc, cur_chr - nmatches, nmatches)) + goto out_of_memory; + nmatches = 0; + } + } + + if (nmatches > 0) + if (!store_match(pcc, cur_chr - nmatches, nmatches)) + goto out_of_memory; + + /* + * We might have allocated more memory than needed, if so free it + */ + if (pcc->cv.nchrs == 0) + { + free(pcc->cv.chrs); + pcc->cv.chrs = NULL; + pcc->cv.chrspace = 0; + } + else if (pcc->cv.nchrs < pcc->cv.chrspace) + { + newchrs = (chr *) realloc(pcc->cv.chrs, + pcc->cv.nchrs * sizeof(chr)); + if (newchrs == NULL) + goto out_of_memory; + pcc->cv.chrs = newchrs; + pcc->cv.chrspace = pcc->cv.nchrs; + } + if (pcc->cv.nranges == 0) + { + free(pcc->cv.ranges); + pcc->cv.ranges = NULL; + pcc->cv.rangespace = 0; + } + else if (pcc->cv.nranges < pcc->cv.rangespace) + { + newchrs = (chr *) realloc(pcc->cv.ranges, + pcc->cv.nranges * sizeof(chr) * 2); + if (newchrs == NULL) + goto out_of_memory; + pcc->cv.ranges = newchrs; + pcc->cv.rangespace = pcc->cv.nranges; + } + + /* + * Success, link it into cache chain + */ + pcc->next = pg_ctype_cache_list; + pg_ctype_cache_list = pcc; + + return &pcc->cv; + + /* + * Failure, clean up + */ +out_of_memory: + if (pcc->cv.chrs) + free(pcc->cv.chrs); + if (pcc->cv.ranges) + free(pcc->cv.ranges); + free(pcc); + + return NULL; +} diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c new file mode 100644 index 0000000..b735fa6 --- /dev/null +++ b/src/backend/regex/regcomp.c @@ -0,0 +1,2582 @@ +/* + * re_*comp and friends - compile REs + * This file #includes several others (see the bottom). + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. 
+ * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * src/backend/regex/regcomp.c + * + */ + +#include "regex/regguts.h" + +/* + * forward declarations, up here so forward datatypes etc. 
are defined early + */ +/* === regcomp.c === */ +static void moresubs(struct vars *, int); +static int freev(struct vars *, int); +static void makesearch(struct vars *, struct nfa *); +static struct subre *parse(struct vars *, int, int, struct state *, struct state *); +static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int); +static struct subre *parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *); +static void nonword(struct vars *, int, struct state *, struct state *); +static void word(struct vars *, int, struct state *, struct state *); +static void charclass(struct vars *, enum char_classes, + struct state *, struct state *); +static void charclasscomplement(struct vars *, enum char_classes, + struct state *, struct state *); +static int scannum(struct vars *); +static void repeat(struct vars *, struct state *, struct state *, int, int); +static void bracket(struct vars *, struct state *, struct state *); +static void cbracket(struct vars *, struct state *, struct state *); +static void brackpart(struct vars *, struct state *, struct state *, bool *); +static const chr *scanplain(struct vars *); +static void onechr(struct vars *, chr, struct state *, struct state *); +static void optimizebracket(struct vars *, struct state *, struct state *); +static void wordchrs(struct vars *); +static void processlacon(struct vars *, struct state *, struct state *, int, + struct state *, struct state *); +static struct subre *subre(struct vars *, int, int, struct state *, struct state *); +static void freesubre(struct vars *, struct subre *); +static void freesubreandsiblings(struct vars *, struct subre *); +static void freesrnode(struct vars *, struct subre *); +static void optst(struct vars *, struct subre *); +static int numst(struct subre *, int); +static void markst(struct subre *); +static void cleanst(struct vars *); +static long nfatree(struct vars *, struct subre *, FILE *); +static long 
nfanode(struct vars *, struct subre *, int, FILE *); +static int newlacon(struct vars *, struct state *, struct state *, int); +static void freelacons(struct subre *, int); +static void rfree(regex_t *); +static int rcancelrequested(void); +static int rstacktoodeep(void); + +#ifdef REG_DEBUG +static void dump(regex_t *, FILE *); +static void dumpst(struct subre *, FILE *, int); +static void stdump(struct subre *, FILE *, int); +static const char *stid(struct subre *, char *, size_t); +#endif +/* === regc_lex.c === */ +static void lexstart(struct vars *); +static void prefixes(struct vars *); +static int next(struct vars *); +static int lexescape(struct vars *); +static chr lexdigits(struct vars *, int, int, int); +static int brenext(struct vars *, chr); +static void skip(struct vars *); +static chr newline(void); +static chr chrnamed(struct vars *, const chr *, const chr *, chr); + +/* === regc_color.c === */ +static void initcm(struct vars *, struct colormap *); +static void freecm(struct colormap *); +static color maxcolor(struct colormap *); +static color newcolor(struct colormap *); +static void freecolor(struct colormap *, color); +static color pseudocolor(struct colormap *); +static color subcolor(struct colormap *, chr); +static color subcolorhi(struct colormap *, color *); +static color newsub(struct colormap *, color); +static int newhicolorrow(struct colormap *, int); +static void newhicolorcols(struct colormap *); +static void subcolorcvec(struct vars *, struct cvec *, struct state *, struct state *); +static void subcoloronechr(struct vars *, chr, struct state *, struct state *, color *); +static void subcoloronerange(struct vars *, chr, chr, struct state *, struct state *, color *); +static void subcoloronerow(struct vars *, int, struct state *, struct state *, color *); +static void okcolors(struct nfa *, struct colormap *); +static void colorchain(struct colormap *, struct arc *); +static void uncolorchain(struct colormap *, struct arc *); +static 
void rainbow(struct nfa *, struct colormap *, int, color, struct state *, struct state *); +static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *); + +#ifdef REG_DEBUG +static void dumpcolors(struct colormap *, FILE *); +static void dumpchr(chr, FILE *); +#endif +/* === regc_nfa.c === */ +static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *); +static void freenfa(struct nfa *); +static struct state *newstate(struct nfa *); +static struct state *newfstate(struct nfa *, int flag); +static void dropstate(struct nfa *, struct state *); +static void freestate(struct nfa *, struct state *); +static void newarc(struct nfa *, int, color, struct state *, struct state *); +static void createarc(struct nfa *, int, color, struct state *, struct state *); +static struct arc *allocarc(struct nfa *); +static void freearc(struct nfa *, struct arc *); +static void changearcsource(struct arc *, struct state *); +static void changearctarget(struct arc *, struct state *); +static int hasnonemptyout(struct state *); +static struct arc *findarc(struct state *, int, color); +static void cparc(struct nfa *, struct arc *, struct state *, struct state *); +static void sortins(struct nfa *, struct state *); +static int sortins_cmp(const void *, const void *); +static void sortouts(struct nfa *, struct state *); +static int sortouts_cmp(const void *, const void *); +static void moveins(struct nfa *, struct state *, struct state *); +static void copyins(struct nfa *, struct state *, struct state *); +static void mergeins(struct nfa *, struct state *, struct arc **, int); +static void moveouts(struct nfa *, struct state *, struct state *); +static void copyouts(struct nfa *, struct state *, struct state *); +static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int); +static void delsub(struct nfa *, struct state *, struct state *); +static void deltraverse(struct nfa *, struct state *, 
struct state *); +static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *); +static void duptraverse(struct nfa *, struct state *, struct state *); +static void removeconstraints(struct nfa *, struct state *, struct state *); +static void removetraverse(struct nfa *, struct state *); +static void cleartraverse(struct nfa *, struct state *); +static struct state *single_color_transition(struct state *, struct state *); +static void specialcolors(struct nfa *); +static long optimize(struct nfa *, FILE *); +static void pullback(struct nfa *, FILE *); +static int pull(struct nfa *, struct arc *, struct state **); +static void pushfwd(struct nfa *, FILE *); +static int push(struct nfa *, struct arc *, struct state **); + +#define INCOMPATIBLE 1 /* destroys arc */ +#define SATISFIED 2 /* constraint satisfied */ +#define COMPATIBLE 3 /* compatible but not satisfied yet */ +#define REPLACEARC 4 /* replace arc's color with constraint color */ +static int combine(struct nfa *nfa, struct arc *con, struct arc *a); +static void fixempties(struct nfa *, FILE *); +static struct state *emptyreachable(struct nfa *, struct state *, + struct state *, struct arc **); +static int isconstraintarc(struct arc *); +static int hasconstraintout(struct state *); +static void fixconstraintloops(struct nfa *, FILE *); +static int findconstraintloop(struct nfa *, struct state *); +static void breakconstraintloop(struct nfa *, struct state *); +static void clonesuccessorstates(struct nfa *, struct state *, struct state *, + struct state *, struct arc *, + char *, char *, int); +static void cleanup(struct nfa *); +static void markreachable(struct nfa *, struct state *, struct state *, struct state *); +static void markcanreach(struct nfa *, struct state *, struct state *, struct state *); +static long analyze(struct nfa *); +static void checkmatchall(struct nfa *); +static bool checkmatchall_recurse(struct nfa *, struct state *, bool **); +static bool 
check_out_colors_match(struct state *, color, color); +static bool check_in_colors_match(struct state *, color, color); +static void compact(struct nfa *, struct cnfa *); +static void carcsort(struct carc *, size_t); +static int carc_cmp(const void *, const void *); +static void freecnfa(struct cnfa *); +static void dumpnfa(struct nfa *, FILE *); + +#ifdef REG_DEBUG +static void dumpstate(struct state *, FILE *); +static void dumparcs(struct state *, FILE *); +static void dumparc(struct arc *, struct state *, FILE *); +static void dumpcnfa(struct cnfa *, FILE *); +static void dumpcstate(int, struct cnfa *, FILE *); +#endif +/* === regc_cvec.c === */ +static struct cvec *newcvec(int, int); +static struct cvec *clearcvec(struct cvec *); +static void addchr(struct cvec *, chr); +static void addrange(struct cvec *, chr, chr); +static struct cvec *getcvec(struct vars *, int, int); +static void freecvec(struct cvec *); + +/* === regc_pg_locale.c === */ +static int pg_wc_isdigit(pg_wchar c); +static int pg_wc_isalpha(pg_wchar c); +static int pg_wc_isalnum(pg_wchar c); +static int pg_wc_isword(pg_wchar c); +static int pg_wc_isupper(pg_wchar c); +static int pg_wc_islower(pg_wchar c); +static int pg_wc_isgraph(pg_wchar c); +static int pg_wc_isprint(pg_wchar c); +static int pg_wc_ispunct(pg_wchar c); +static int pg_wc_isspace(pg_wchar c); +static pg_wchar pg_wc_toupper(pg_wchar c); +static pg_wchar pg_wc_tolower(pg_wchar c); + +/* === regc_locale.c === */ +static chr element(struct vars *, const chr *, const chr *); +static struct cvec *range(struct vars *, chr, chr, int); +static int before(chr, chr); +static struct cvec *eclass(struct vars *, chr, int); +static enum char_classes lookupcclass(struct vars *, const chr *, const chr *); +static struct cvec *cclasscvec(struct vars *, enum char_classes, int); +static int cclass_column_index(struct colormap *, chr); +static struct cvec *allcases(struct vars *, chr); +static int cmp(const chr *, const chr *, size_t); +static int 
casecmp(const chr *, const chr *, size_t); + + +/* internal variables, bundled for easy passing around */ +struct vars +{ + regex_t *re; + const chr *now; /* scan pointer into string */ + const chr *stop; /* end of string */ + int err; /* error code (0 if none) */ + int cflags; /* copy of compile flags */ + int lasttype; /* type of previous token */ + int nexttype; /* type of next token */ + chr nextvalue; /* value (if any) of next token */ + int lexcon; /* lexical context type (see regc_lex.c) */ + int nsubexp; /* subexpression count */ + struct subre **subs; /* subRE pointer vector */ + size_t nsubs; /* length of vector */ + struct subre *sub10[10]; /* initial vector, enough for most */ + struct nfa *nfa; /* the NFA */ + struct colormap *cm; /* character color map */ + color nlcolor; /* color of newline */ + struct state *wordchrs; /* state in nfa holding word-char outarcs */ + struct subre *tree; /* subexpression tree */ + struct subre *treechain; /* all tree nodes allocated */ + struct subre *treefree; /* any free tree nodes */ + int ntree; /* number of tree nodes, plus one */ + struct cvec *cv; /* interface cvec */ + struct cvec *cv2; /* utility cvec */ + struct subre *lacons; /* lookaround-constraint vector */ + int nlacons; /* size of lacons[]; note that only slots + * numbered 1 .. nlacons-1 are used */ + size_t spaceused; /* approx. space used for compilation */ +}; + +/* parsing macros; most know that `v' is the struct vars pointer */ +#define NEXT() (next(v)) /* advance by one token */ +#define SEE(t) (v->nexttype == (t)) /* is next token this? */ +#define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */ +#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */ +#define ISERR() VISERR(v) +#define VERR(vv,e) ((vv)->nexttype = EOS, \ + (vv)->err = ((vv)->err ? 
(vv)->err : (e))) +#define ERR(e) VERR(v, e) /* record an error */ +#define NOERR() {if (ISERR()) return;} /* if error seen, return */ +#define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */ +#define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */ +#define INSIST(c, e) do { if (!(c)) ERR(e); } while (0) /* error if c false */ +#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */ +#define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y) + +/* token type codes, some also used as NFA arc types */ +#define EMPTY 'n' /* no token present */ +#define EOS 'e' /* end of string */ +#define PLAIN 'p' /* ordinary character */ +#define DIGIT 'd' /* digit (in bound) */ +#define BACKREF 'b' /* back reference */ +#define COLLEL 'I' /* start of [. */ +#define ECLASS 'E' /* start of [= */ +#define CCLASS 'C' /* start of [: */ +#define END 'X' /* end of [. [= [: */ +#define CCLASSS 's' /* char class shorthand escape */ +#define CCLASSC 'c' /* complement char class shorthand escape */ +#define RANGE 'R' /* - within [] which might be range delim. */ +#define LACON 'L' /* lookaround constraint subRE */ +#define AHEAD 'a' /* color-lookahead arc */ +#define BEHIND 'r' /* color-lookbehind arc */ +#define WBDRY 'w' /* word boundary constraint */ +#define NWBDRY 'W' /* non-word-boundary constraint */ +#define SBEGIN 'A' /* beginning of string (even if not BOL) */ +#define SEND 'Z' /* end of string (even if not EOL) */ + +/* is an arc colored, and hence should belong to a color chain? 
*/ +/* the test on "co" eliminates RAINBOW arcs, which we don't bother to chain */ +#define COLORED(a) \ + ((a)->co >= 0 && \ + ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND)) + + +/* static function list */ +static const struct fns functions = { + rfree, /* regfree insides */ + rcancelrequested, /* check for cancel request */ + rstacktoodeep /* check for stack getting dangerously deep */ +}; + + + +/* + * pg_regcomp - compile regular expression + * + * Note: on failure, no resources remain allocated, so pg_regfree() + * need not be applied to re. + */ +int +pg_regcomp(regex_t *re, + const chr *string, + size_t len, + int flags, + Oid collation) +{ + struct vars var; + struct vars *v = &var; + struct guts *g; + int i; + size_t j; + +#ifdef REG_DEBUG + FILE *debug = (flags & REG_PROGRESS) ? stdout : (FILE *) NULL; +#else + FILE *debug = (FILE *) NULL; +#endif + +#define CNOERR() { if (ISERR()) return freev(v, v->err); } + + /* sanity checks */ + + if (re == NULL || string == NULL) + return REG_INVARG; + if ((flags & REG_QUOTE) && + (flags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE))) + return REG_INVARG; + if (!(flags & REG_EXTENDED) && (flags & REG_ADVF)) + return REG_INVARG; + + /* Initialize locale-dependent support */ + pg_set_regex_collation(collation); + + /* initial setup (after which freev() is callable) */ + v->re = re; + v->now = string; + v->stop = v->now + len; + v->err = 0; + v->cflags = flags; + v->nsubexp = 0; + v->subs = v->sub10; + v->nsubs = 10; + for (j = 0; j < v->nsubs; j++) + v->subs[j] = NULL; + v->nfa = NULL; + v->cm = NULL; + v->nlcolor = COLORLESS; + v->wordchrs = NULL; + v->tree = NULL; + v->treechain = NULL; + v->treefree = NULL; + v->cv = NULL; + v->cv2 = NULL; + v->lacons = NULL; + v->nlacons = 0; + v->spaceused = 0; + re->re_magic = REMAGIC; + re->re_info = 0; /* bits get set during parse */ + re->re_csize = sizeof(chr); + re->re_collation = collation; + re->re_guts = NULL; + re->re_fns = VS(&functions); + + /* 
more complex setup, malloced things */ + re->re_guts = VS(MALLOC(sizeof(struct guts))); + if (re->re_guts == NULL) + return freev(v, REG_ESPACE); + g = (struct guts *) re->re_guts; + g->tree = NULL; + initcm(v, &g->cmap); + v->cm = &g->cmap; + g->lacons = NULL; + g->nlacons = 0; + ZAPCNFA(g->search); + v->nfa = newnfa(v, v->cm, (struct nfa *) NULL); + CNOERR(); + /* set up a reasonably-sized transient cvec for getcvec usage */ + v->cv = newcvec(100, 20); + if (v->cv == NULL) + return freev(v, REG_ESPACE); + + /* parsing */ + lexstart(v); /* also handles prefixes */ + if ((v->cflags & REG_NLSTOP) || (v->cflags & REG_NLANCH)) + { + /* assign newline a unique color */ + v->nlcolor = subcolor(v->cm, newline()); + okcolors(v->nfa, v->cm); + } + CNOERR(); + v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final); + assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */ + CNOERR(); + assert(v->tree != NULL); + + /* finish setup of nfa and its subre tree */ + specialcolors(v->nfa); + CNOERR(); +#ifdef REG_DEBUG + if (debug != NULL) + { + fprintf(debug, "\n\n\n========= RAW ==========\n"); + dumpnfa(v->nfa, debug); + dumpst(v->tree, debug, 1); + } +#endif + optst(v, v->tree); + v->ntree = numst(v->tree, 1); + markst(v->tree); + cleanst(v); +#ifdef REG_DEBUG + if (debug != NULL) + { + fprintf(debug, "\n\n\n========= TREE FIXED ==========\n"); + dumpst(v->tree, debug, 1); + } +#endif + + /* build compacted NFAs for tree and lacons */ + re->re_info |= nfatree(v, v->tree, debug); + CNOERR(); + assert(v->nlacons == 0 || v->lacons != NULL); + for (i = 1; i < v->nlacons; i++) + { + struct subre *lasub = &v->lacons[i]; + +#ifdef REG_DEBUG + if (debug != NULL) + fprintf(debug, "\n\n\n========= LA%d ==========\n", i); +#endif + + /* Prepend .* to pattern if it's a lookbehind LACON */ + nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->latype), debug); + } + CNOERR(); + if (v->tree->flags & SHORTER) + NOTE(REG_USHORTEST); + + /* build compacted NFAs for tree, lacons, fast search */ 
+#ifdef REG_DEBUG + if (debug != NULL) + fprintf(debug, "\n\n\n========= SEARCH ==========\n"); +#endif + /* can sacrifice main NFA now, so use it as work area */ + (DISCARD) optimize(v->nfa, debug); + CNOERR(); + makesearch(v, v->nfa); + CNOERR(); + compact(v->nfa, &g->search); + CNOERR(); + + /* looks okay, package it up */ + re->re_nsub = v->nsubexp; + v->re = NULL; /* freev no longer frees re */ + g->magic = GUTSMAGIC; + g->cflags = v->cflags; + g->info = re->re_info; + g->nsub = re->re_nsub; + g->tree = v->tree; + v->tree = NULL; + g->ntree = v->ntree; + g->compare = (v->cflags & REG_ICASE) ? casecmp : cmp; + g->lacons = v->lacons; + v->lacons = NULL; + g->nlacons = v->nlacons; + +#ifdef REG_DEBUG + if (flags & REG_DUMP) + { + dump(re, stdout); + fflush(stdout); + } +#endif + + assert(v->err == 0); + return freev(v, 0); +} + +/* + * moresubs - enlarge subRE vector + */ +static void +moresubs(struct vars *v, + int wanted) /* want enough room for this one */ +{ + struct subre **p; + size_t n; + + assert(wanted > 0 && (size_t) wanted >= v->nsubs); + n = (size_t) wanted * 3 / 2 + 1; + + if (v->subs == v->sub10) + { + p = (struct subre **) MALLOC(n * sizeof(struct subre *)); + if (p != NULL) + memcpy(VS(p), VS(v->subs), + v->nsubs * sizeof(struct subre *)); + } + else + p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *)); + if (p == NULL) + { + ERR(REG_ESPACE); + return; + } + v->subs = p; + for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++) + *p = NULL; + assert(v->nsubs == n); + assert((size_t) wanted < v->nsubs); +} + +/* + * freev - free vars struct's substructures where necessary + * + * Optionally does error-number setting, and always returns error code + * (if any), to make error-handling code terser. 
+ */ +static int +freev(struct vars *v, + int err) +{ + if (v->re != NULL) + rfree(v->re); + if (v->subs != v->sub10) + FREE(v->subs); + if (v->nfa != NULL) + freenfa(v->nfa); + if (v->tree != NULL) + freesubre(v, v->tree); + if (v->treechain != NULL) + cleanst(v); + if (v->cv != NULL) + freecvec(v->cv); + if (v->cv2 != NULL) + freecvec(v->cv2); + if (v->lacons != NULL) + freelacons(v->lacons, v->nlacons); + ERR(err); /* nop if err==0 */ + + return v->err; +} + +/* + * makesearch - turn an NFA into a search NFA (implicit prepend of .*?) + * NFA must have been optimize()d already. + */ +static void +makesearch(struct vars *v, + struct nfa *nfa) +{ + struct arc *a; + struct arc *b; + struct state *pre = nfa->pre; + struct state *s; + struct state *s2; + struct state *slist; + + /* no loops are needed if it's anchored */ + for (a = pre->outs; a != NULL; a = a->outchain) + { + assert(a->type == PLAIN); + if (a->co != nfa->bos[0] && a->co != nfa->bos[1]) + break; + } + if (a != NULL) + { + /* add implicit .* in front */ + rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre); + + /* and ^* and \A* too -- not always necessary, but harmless */ + newarc(nfa, PLAIN, nfa->bos[0], pre, pre); + newarc(nfa, PLAIN, nfa->bos[1], pre, pre); + + /* + * The pattern is still MATCHALL if it was before, but the max match + * length is now infinity. + */ + if (nfa->flags & MATCHALL) + nfa->maxmatchall = DUPINF; + } + + /* + * Now here's the subtle part. Because many REs have no lookback + * constraints, often knowing when you were in the pre state tells you + * little; it's the next state(s) that are informative. But some of them + * may have other inarcs, i.e. it may be possible to make actual progress + * and then return to one of them. We must de-optimize such cases, + * splitting each such state into progress and no-progress states. 
+ */ + + /* first, make a list of the states reachable from pre and elsewhere */ + slist = NULL; + for (a = pre->outs; a != NULL; a = a->outchain) + { + s = a->to; + for (b = s->ins; b != NULL; b = b->inchain) + { + if (b->from != pre) + break; + } + + /* + * We want to mark states as being in the list already by having non + * NULL tmp fields, but we can't just store the old slist value in tmp + * because that doesn't work for the first such state. Instead, the + * first list entry gets its own address in tmp. + */ + if (b != NULL && s->tmp == NULL) + { + s->tmp = (slist != NULL) ? slist : s; + slist = s; + } + } + + /* do the splits */ + for (s = slist; s != NULL; s = s2) + { + s2 = newstate(nfa); + NOERR(); + copyouts(nfa, s, s2); + NOERR(); + for (a = s->ins; a != NULL; a = b) + { + b = a->inchain; + if (a->from != pre) + { + cparc(nfa, a, a->from, s2); + freearc(nfa, a); + } + } + s2 = (s->tmp != s) ? s->tmp : NULL; + s->tmp = NULL; /* clean up while we're at it */ + } +} + +/* + * parse - parse an RE + * + * This is actually just the top level, which parses a bunch of branches + * tied together with '|'. If there's more than one, they appear in the + * tree as the children of a '|' subre. 
+ */
static struct subre *
parse(struct vars *v,
	  int stopper,				/* EOS or ')' */
	  int type,					/* LACON (lookaround subRE) or PLAIN */
	  struct state *init,		/* initial state */
	  struct state *final)		/* final state */
{
	struct subre *altnode;		/* the '|' node on top */
	struct subre *tail = NULL;	/* most recently parsed branch */
	struct subre *branch;
	struct state *left;			/* scaffolding for branch */
	struct state *right;

	assert(stopper == ')' || stopper == EOS);

	altnode = subre(v, '|', LONGER, init, final);
	NOERRN();

	/* one iteration per branch; there is always at least one */
	for (;;)
	{
		left = newstate(v->nfa);
		right = newstate(v->nfa);
		NOERRN();
		EMPTYARC(init, left);
		EMPTYARC(right, final);
		NOERRN();
		branch = parsebranch(v, stopper, type, left, right, 0);
		NOERRN();

		/* append to the child list and fold its flags into the parent */
		if (tail == NULL)
			altnode->child = branch;
		else
			tail->sibling = branch;
		altnode->flags |= UP(altnode->flags | branch->flags);
		tail = branch;

		if (!EAT('|'))
			break;
	}
	assert(SEE(stopper) || SEE(EOS));

	if (!SEE(stopper))
	{
		/* ran off the end while looking for a close paren */
		assert(stopper == ')' && SEE(EOS));
		ERR(REG_EPAREN);
	}

	/* optimize out simple cases */
	if (altnode->child == tail)
	{
		/* only one branch: the '|' node is redundant */
		assert(tail->sibling == NULL);
		freesrnode(v, altnode);
		return tail;
	}

	if (!MESSY(altnode->flags))
	{
		/* no interesting innards: collapse to a plain '=' node */
		freesubreandsiblings(v, altnode->child);
		altnode->child = NULL;
		altnode->op = '=';
	}

	return altnode;
}

/*
 * parsebranch - parse one branch of an RE
 *
 * This mostly manages concatenation, working closely with parseqatom().
 * Concatenated things are bundled up as much as possible, with separate
 * '.' nodes introduced only when necessary due to substructure.
+ */
static struct subre *
parsebranch(struct vars *v,
			int stopper,		/* EOS or ')' */
			int type,			/* LACON (lookaround subRE) or PLAIN */
			struct state *left, /* leftmost state */
			struct state *right,	/* rightmost state */
			int partial)		/* is this only part of a branch? */
{
	struct state *atomstart = left; /* left end of current construct */
	int			nonempty = 0;	/* has any atom been parsed yet? */
	struct subre *result;

	result = subre(v, '=', 0, left, right); /* op '=' is tentative */
	NOERRN();

	for (;;)
	{
		/* a branch ends at '|', at the stopper, or at end of input */
		if (SEE('|') || SEE(stopper) || SEE(EOS))
			break;

		if (nonempty)
		{
			/*
			 * Implicit concatenation: everything built so far is re-aimed
			 * at a fresh state, which becomes the next atom's left end.
			 */
			atomstart = newstate(v->nfa);
			NOERRN();
			moveins(v->nfa, right, atomstart);
		}
		nonempty = 1;

		/* NB, recursion in parseqatom() may swallow rest of branch */
		result = parseqatom(v, stopper, type, atomstart, right, result);
		NOERRN();
	}

	if (!nonempty)
	{
		/* empty branch: just connect the endpoints */
		if (!partial)
			NOTE(REG_UUNSPEC);
		assert(atomstart == left);
		EMPTYARC(left, right);
	}

	return result;
}

/*
 * parseqatom - parse one quantified atom or constraint of an RE
 *
 * The bookkeeping near the end cooperates very closely with parsebranch();
 * in particular, it contains a recursion that can involve parsing the rest
 * of the branch, making this function's name somewhat inaccurate.
 *
 * Usually, the return value is just "top", but in some cases where we
 * have parsed the rest of the branch, we may deem "top" redundant and
 * free it, returning some child subre instead.
+ */ +static struct subre * +parseqatom(struct vars *v, + int stopper, /* EOS or ')' */ + int type, /* LACON (lookaround subRE) or PLAIN */ + struct state *lp, /* left state to hang it on */ + struct state *rp, /* right state to hang it on */ + struct subre *top) /* subtree top */ +{ + struct state *s; /* temporaries for new states */ + struct state *s2; + +#define ARCV(t, val) newarc(v->nfa, t, val, lp, rp) + int m, + n; + struct subre *atom; /* atom's subtree */ + struct subre *t; + int cap; /* capturing parens? */ + int latype; /* lookaround constraint type */ + int subno; /* capturing-parens or backref number */ + int atomtype; + int qprefer; /* quantifier short/long preference */ + int f; + struct subre **atomp; /* where the pointer to atom is */ + + /* initial bookkeeping */ + atom = NULL; + assert(lp->nouts == 0); /* must string new code */ + assert(rp->nins == 0); /* between lp and rp */ + subno = 0; /* just to shut lint up */ + + /* an atom or constraint... */ + atomtype = v->nexttype; + switch (atomtype) + { + /* first, constraints, which end by returning */ + case '^': + ARCV('^', 1); + if (v->cflags & REG_NLANCH) + ARCV(BEHIND, v->nlcolor); + NEXT(); + return top; + break; + case '$': + ARCV('$', 1); + if (v->cflags & REG_NLANCH) + ARCV(AHEAD, v->nlcolor); + NEXT(); + return top; + break; + case SBEGIN: + ARCV('^', 1); /* BOL */ + ARCV('^', 0); /* or BOS */ + NEXT(); + return top; + break; + case SEND: + ARCV('$', 1); /* EOL */ + ARCV('$', 0); /* or EOS */ + NEXT(); + return top; + break; + case '<': + wordchrs(v); + s = newstate(v->nfa); + NOERRN(); + nonword(v, BEHIND, lp, s); + word(v, AHEAD, s, rp); + NEXT(); + return top; + break; + case '>': + wordchrs(v); + s = newstate(v->nfa); + NOERRN(); + word(v, BEHIND, lp, s); + nonword(v, AHEAD, s, rp); + NEXT(); + return top; + break; + case WBDRY: + wordchrs(v); + s = newstate(v->nfa); + NOERRN(); + nonword(v, BEHIND, lp, s); + word(v, AHEAD, s, rp); + s = newstate(v->nfa); + NOERRN(); + word(v, BEHIND, 
lp, s); + nonword(v, AHEAD, s, rp); + NEXT(); + return top; + break; + case NWBDRY: + wordchrs(v); + s = newstate(v->nfa); + NOERRN(); + word(v, BEHIND, lp, s); + word(v, AHEAD, s, rp); + s = newstate(v->nfa); + NOERRN(); + nonword(v, BEHIND, lp, s); + nonword(v, AHEAD, s, rp); + NEXT(); + return top; + break; + case LACON: /* lookaround constraint */ + latype = v->nextvalue; + NEXT(); + s = newstate(v->nfa); + s2 = newstate(v->nfa); + NOERRN(); + t = parse(v, ')', LACON, s, s2); + freesubre(v, t); /* internal structure irrelevant */ + NOERRN(); + assert(SEE(')')); + NEXT(); + processlacon(v, s, s2, latype, lp, rp); + return top; + break; + /* then errors, to get them out of the way */ + case '*': + case '+': + case '?': + case '{': + ERR(REG_BADRPT); + return top; + break; + default: + ERR(REG_ASSERT); + return top; + break; + /* then plain characters, and minor variants on that theme */ + case ')': /* unbalanced paren */ + if ((v->cflags & REG_ADVANCED) != REG_EXTENDED) + { + ERR(REG_EPAREN); + return top; + } + /* legal in EREs due to specification botch */ + NOTE(REG_UPBOTCH); + /* fall through into case PLAIN */ + /* FALLTHROUGH */ + case PLAIN: + onechr(v, v->nextvalue, lp, rp); + okcolors(v->nfa, v->cm); + NOERRN(); + NEXT(); + break; + case '[': + if (v->nextvalue == 1) + bracket(v, lp, rp); + else + cbracket(v, lp, rp); + assert(SEE(']') || ISERR()); + NEXT(); + break; + case CCLASSS: + charclass(v, (enum char_classes) v->nextvalue, lp, rp); + okcolors(v->nfa, v->cm); + NEXT(); + break; + case CCLASSC: + charclasscomplement(v, (enum char_classes) v->nextvalue, lp, rp); + /* charclasscomplement() did okcolors() internally */ + NEXT(); + break; + case '.': + rainbow(v->nfa, v->cm, PLAIN, + (v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS, + lp, rp); + NEXT(); + break; + /* and finally the ugly stuff */ + case '(': /* value flags as capturing or non */ + cap = (type == LACON) ? 
0 : v->nextvalue; + if (cap) + { + v->nsubexp++; + subno = v->nsubexp; + if ((size_t) subno >= v->nsubs) + moresubs(v, subno); + } + else + atomtype = PLAIN; /* something that's not '(' */ + NEXT(); + + /* + * Make separate endpoint states to keep this sub-NFA distinct + * from what surrounds it. We need to be sure that when we + * duplicate the sub-NFA for a backref, we get the right + * states/arcs and no others. In particular, letting a backref + * duplicate the sub-NFA from lp to rp would be quite wrong, + * because we may add quantification superstructure around this + * atom below. (Perhaps we could skip the extra states for + * non-capturing parens, but it seems not worth the trouble.) + */ + s = newstate(v->nfa); + s2 = newstate(v->nfa); + NOERRN(); + /* We may not need these arcs, but keep things connected for now */ + EMPTYARC(lp, s); + EMPTYARC(s2, rp); + NOERRN(); + atom = parse(v, ')', type, s, s2); + assert(SEE(')') || ISERR()); + NEXT(); + NOERRN(); + if (cap) + { + if (atom->capno == 0) + { + /* normal case: just mark the atom as capturing */ + atom->flags |= CAP; + atom->capno = subno; + } + else + { + /* generate no-op wrapper node to handle "((x))" */ + t = subre(v, '(', atom->flags | CAP, s, s2); + NOERRN(); + t->capno = subno; + t->child = atom; + atom = t; + } + assert(v->subs[subno] == NULL); + v->subs[subno] = atom; + } + /* postpone everything else pending possible {0} */ + break; + case BACKREF: /* the Feature From The Black Lagoon */ + INSIST(type != LACON, REG_ESUBREG); + INSIST(v->nextvalue < v->nsubs, REG_ESUBREG); + INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG); + NOERRN(); + assert(v->nextvalue > 0); + atom = subre(v, 'b', BACKR, lp, rp); + NOERRN(); + subno = v->nextvalue; + atom->backno = subno; + EMPTYARC(lp, rp); /* temporarily, so there's something */ + NEXT(); + break; + } + + /* ...and an atom may be followed by a quantifier */ + switch (v->nexttype) + { + case '*': + m = 0; + n = DUPINF; + qprefer = (v->nextvalue) ? 
LONGER : SHORTER; + NEXT(); + break; + case '+': + m = 1; + n = DUPINF; + qprefer = (v->nextvalue) ? LONGER : SHORTER; + NEXT(); + break; + case '?': + m = 0; + n = 1; + qprefer = (v->nextvalue) ? LONGER : SHORTER; + NEXT(); + break; + case '{': + NEXT(); + m = scannum(v); + if (EAT(',')) + { + if (SEE(DIGIT)) + n = scannum(v); + else + n = DUPINF; + if (m > n) + { + ERR(REG_BADBR); + return top; + } + /* {m,n} exercises preference, even if it's {m,m} */ + qprefer = (v->nextvalue) ? LONGER : SHORTER; + } + else + { + n = m; + /* {m} passes operand's preference through */ + qprefer = 0; + } + if (!SEE('}')) + { /* catches errors too */ + ERR(REG_BADBR); + return top; + } + NEXT(); + break; + default: /* no quantifier */ + m = n = 1; + qprefer = 0; + break; + } + + /* annoying special case: {0} or {0,0} cancels everything */ + if (m == 0 && n == 0) + { + /* + * If we had capturing subexpression(s) within the atom, we don't want + * to destroy them, because it's legal (if useless) to back-ref them + * later. Hence, just unlink the atom from lp/rp and then ignore it. + */ + if (atom != NULL && (atom->flags & CAP)) + { + delsub(v->nfa, lp, atom->begin); + delsub(v->nfa, atom->end, rp); + } + else + { + /* Otherwise, we can clean up any subre infrastructure we made */ + if (atom != NULL) + freesubre(v, atom); + delsub(v->nfa, lp, rp); + } + EMPTYARC(lp, rp); + return top; + } + + /* if not a messy case, avoid hard part */ + assert(!MESSY(top->flags)); + f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0); + if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) + { + if (!(m == 1 && n == 1)) + repeat(v, lp, rp, m, n); + if (atom != NULL) + freesubre(v, atom); + top->flags = f; + return top; + } + + /* + * hard part: something messy + * + * That is, capturing parens, back reference, short/long clash, or an atom + * with substructure containing one of those. 
+ */ + + /* now we'll need a subre for the contents even if they're boring */ + if (atom == NULL) + { + atom = subre(v, '=', 0, lp, rp); + NOERRN(); + } + + /* + * For what follows, we need the atom to have its own begin/end states + * that are distinct from lp/rp, so that we can wrap iteration structure + * around it. The parenthesized-atom case above already made suitable + * states (and we don't want to modify a capturing subre, since it's + * already recorded in v->subs[]). Otherwise, we need more states. + */ + if (atom->begin == lp || atom->end == rp) + { + s = newstate(v->nfa); + s2 = newstate(v->nfa); + NOERRN(); + moveouts(v->nfa, lp, s); + moveins(v->nfa, rp, s2); + atom->begin = s; + atom->end = s2; + } + else + { + /* The atom's OK, but we must temporarily disconnect it from lp/rp */ + /* (this removes the EMPTY arcs we made above) */ + delsub(v->nfa, lp, atom->begin); + delsub(v->nfa, atom->end, rp); + } + + /*---------- + * Prepare a general-purpose state skeleton. + * + * In the no-backrefs case, we want this: + * + * [lp] ---> [s] ---prefix---> ---atom---> ---rest---> [rp] + * + * where prefix is some repetitions of atom, and "rest" is the remainder + * of the branch. In the general case we need: + * + * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp] + * + * where the iterator wraps around the atom. + * + * We make the s state here for both cases; s2 is made below if needed + *---------- + */ + s = newstate(v->nfa); /* set up starting state */ + NOERRN(); + EMPTYARC(lp, s); + NOERRN(); + + /* break remaining subRE into x{...} and what follows */ + t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp); + NOERRN(); + t->child = atom; + atomp = &t->child; + + /* + * Here we should recurse to fill t->child->sibling ... but we must + * postpone that to the end. One reason is that t->child may be replaced + * below, and we don't want to worry about its sibling link. 
+ */ + + /* + * Convert top node to a concatenation of the prefix (top->child, covering + * whatever we parsed previously) and remaining (t). Note that the prefix + * could be empty, in which case this concatenation node is unnecessary. + * To keep things simple, we operate in a general way for now, and get rid + * of unnecessary subres below. + */ + assert(top->op == '=' && top->child == NULL); + top->child = subre(v, '=', top->flags, top->begin, lp); + NOERRN(); + top->op = '.'; + top->child->sibling = t; + /* top->flags will get updated later */ + + /* if it's a backref, now is the time to replicate the subNFA */ + if (atomtype == BACKREF) + { + assert(atom->begin->nouts == 1); /* just the EMPTY */ + delsub(v->nfa, atom->begin, atom->end); + assert(v->subs[subno] != NULL); + + /* + * And here's why the recursion got postponed: it must wait until the + * skeleton is filled in, because it may hit a backref that wants to + * copy the filled-in skeleton. + */ + dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end, + atom->begin, atom->end); + NOERRN(); + + /* The backref node's NFA should not enforce any constraints */ + removeconstraints(v->nfa, atom->begin, atom->end); + NOERRN(); + } + + /* + * It's quantifier time. If the atom is just a backref, we'll let it deal + * with quantifiers internally. 
+ */ + if (atomtype == BACKREF) + { + /* special case: backrefs have internal quantifiers */ + EMPTYARC(s, atom->begin); /* empty prefix */ + /* just stuff everything into atom */ + repeat(v, atom->begin, atom->end, m, n); + atom->min = (short) m; + atom->max = (short) n; + atom->flags |= COMBINE(qprefer, atom->flags); + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; + } + else if (m == 1 && n == 1 && + (qprefer == 0 || + (atom->flags & (LONGER | SHORTER | MIXED)) == 0 || + qprefer == (atom->flags & (LONGER | SHORTER | MIXED)))) + { + /* no/vacuous quantifier: done */ + EMPTYARC(s, atom->begin); /* empty prefix */ + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; + } + else if (!(atom->flags & (CAP | BACKR))) + { + /* + * If there's no captures nor backrefs in the atom being repeated, we + * don't really care where the submatches of the iteration are, so we + * don't need an iteration node. Make a plain DFA node instead. + */ + EMPTYARC(s, atom->begin); /* empty prefix */ + repeat(v, atom->begin, atom->end, m, n); + f = COMBINE(qprefer, atom->flags); + t = subre(v, '=', f, atom->begin, atom->end); + NOERRN(); + freesubre(v, atom); + *atomp = t; + /* rest of branch can be strung starting from t->end */ + s2 = t->end; + } + else if (m > 0 && !(atom->flags & BACKR)) + { + /* + * If there's no backrefs involved, we can turn x{m,n} into + * x{m-1,n-1}x, with capturing parens in only the second x. This is + * valid because we only care about capturing matches from the final + * iteration of the quantifier. It's a win because we can implement + * the backref-free left side as a plain DFA node, since we don't + * really care where its submatches are. + */ + dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin); + assert(m >= 1 && m != DUPINF && n >= 1); + repeat(v, s, atom->begin, m - 1, (n == DUPINF) ? 
n : n - 1); + f = COMBINE(qprefer, atom->flags); + t = subre(v, '.', f, s, atom->end); /* prefix and atom */ + NOERRN(); + t->child = subre(v, '=', PREF(f), s, atom->begin); + NOERRN(); + t->child->sibling = atom; + *atomp = t; + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; + } + else + { + /* general case: need an iteration node */ + s2 = newstate(v->nfa); + NOERRN(); + moveouts(v->nfa, atom->end, s2); + NOERRN(); + dupnfa(v->nfa, atom->begin, atom->end, s, s2); + repeat(v, s, s2, m, n); + f = COMBINE(qprefer, atom->flags); + t = subre(v, '*', f, s, s2); + NOERRN(); + t->min = (short) m; + t->max = (short) n; + t->child = atom; + *atomp = t; + /* rest of branch is to be strung from iteration's end state */ + } + + /* and finally, look after that postponed recursion */ + t = top->child->sibling; + if (!(SEE('|') || SEE(stopper) || SEE(EOS))) + { + /* parse all the rest of the branch, and insert in t->child->sibling */ + t->child->sibling = parsebranch(v, stopper, type, s2, rp, 1); + NOERRN(); + assert(SEE('|') || SEE(stopper) || SEE(EOS)); + + /* here's the promised update of the flags */ + t->flags |= COMBINE(t->flags, t->child->sibling->flags); + top->flags |= COMBINE(top->flags, t->flags); + + /* neither t nor top could be directly marked for capture as yet */ + assert(t->capno == 0); + assert(top->capno == 0); + + /* + * At this point both top and t are concatenation (op == '.') subres, + * and we have top->child = prefix of branch, top->child->sibling = t, + * t->child = messy atom (with quantification superstructure if + * needed), t->child->sibling = rest of branch. + * + * If the messy atom was the first thing in the branch, then + * top->child is vacuous and we can get rid of one level of + * concatenation. 
+ */ + assert(top->child->op == '='); + if (top->child->begin == top->child->end) + { + assert(!MESSY(top->child->flags)); + freesubre(v, top->child); + top->child = t->child; + freesrnode(v, t); + } + + /* + * Otherwise, it's possible that t->child is not messy in itself, but + * we considered it messy because its greediness conflicts with what + * preceded it. Then it could be that the combination of t->child and + * the rest of the branch is also not messy, in which case we can get + * rid of the child concatenation by merging t->child and the rest of + * the branch into one plain DFA node. + */ + else if (t->child->op == '=' && + t->child->sibling->op == '=' && + !MESSY(UP(t->child->flags | t->child->sibling->flags))) + { + t->op = '='; + t->flags = COMBINE(t->child->flags, t->child->sibling->flags); + freesubreandsiblings(v, t->child); + t->child = NULL; + } + } + else + { + /* + * There's nothing left in the branch, so we don't need the second + * concatenation node 't'. Just link s2 straight to rp. + */ + EMPTYARC(s2, rp); + top->child->sibling = t->child; + top->flags |= COMBINE(top->flags, top->child->sibling->flags); + freesrnode(v, t); + + /* + * Again, it could be that top->child is vacuous (if the messy atom + * was in fact the only thing in the branch). In that case we need no + * concatenation at all; just replace top with top->child->sibling. + */ + assert(top->child->op == '='); + if (top->child->begin == top->child->end) + { + assert(!MESSY(top->child->flags)); + t = top->child->sibling; + top->child->sibling = NULL; + freesubre(v, top); + top = t; + } + } + + return top; +} + +/* + * nonword - generate arcs for non-word-character ahead or behind + */ +static void +nonword(struct vars *v, + int dir, /* AHEAD or BEHIND */ + struct state *lp, + struct state *rp) +{ + int anchor = (dir == AHEAD) ? 
'$' : '^'; + + assert(dir == AHEAD || dir == BEHIND); + newarc(v->nfa, anchor, 1, lp, rp); + newarc(v->nfa, anchor, 0, lp, rp); + colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp); + /* (no need for special attention to \n) */ +} + +/* + * word - generate arcs for word character ahead or behind + */ +static void +word(struct vars *v, + int dir, /* AHEAD or BEHIND */ + struct state *lp, + struct state *rp) +{ + assert(dir == AHEAD || dir == BEHIND); + cloneouts(v->nfa, v->wordchrs, lp, rp, dir); + /* (no need for special attention to \n) */ +} + +/* + * charclass - generate arcs for a character class + * + * This is used for both atoms (\w and sibling escapes) and for elements + * of bracket expressions. The caller is responsible for calling okcolors() + * at the end of processing the atom or bracket. + */ +static void +charclass(struct vars *v, + enum char_classes cls, + struct state *lp, + struct state *rp) +{ + struct cvec *cv; + + /* obtain possibly-cached cvec for char class */ + NOTE(REG_ULOCALE); + cv = cclasscvec(v, cls, (v->cflags & REG_ICASE)); + NOERR(); + + /* build the arcs; this may cause color splitting */ + subcolorcvec(v, cv, lp, rp); +} + +/* + * charclasscomplement - generate arcs for a complemented character class + * + * This is used for both atoms (\W and sibling escapes) and for elements + * of bracket expressions. In bracket expressions, it is the caller's + * responsibility that there not be any open subcolors when this is called. 
+ */ +static void +charclasscomplement(struct vars *v, + enum char_classes cls, + struct state *lp, + struct state *rp) +{ + struct state *cstate; + struct cvec *cv; + + /* make dummy state to hang temporary arcs on */ + cstate = newstate(v->nfa); + NOERR(); + + /* obtain possibly-cached cvec for char class */ + NOTE(REG_ULOCALE); + cv = cclasscvec(v, cls, (v->cflags & REG_ICASE)); + NOERR(); + + /* build arcs for char class; this may cause color splitting */ + subcolorcvec(v, cv, cstate, cstate); + NOERR(); + + /* clean up any subcolors in the arc set */ + okcolors(v->nfa, v->cm); + NOERR(); + + /* now build output arcs for the complement of the char class */ + colorcomplement(v->nfa, v->cm, PLAIN, cstate, lp, rp); + NOERR(); + + /* clean up dummy state */ + dropstate(v->nfa, cstate); +} + +/* + * scannum - scan a number + */ +static int /* value, <= DUPMAX */ +scannum(struct vars *v) +{ + int n = 0; + + while (SEE(DIGIT) && n < DUPMAX) + { + n = n * 10 + v->nextvalue; + NEXT(); + } + if (SEE(DIGIT) || n > DUPMAX) + { + ERR(REG_BADBR); + return 0; + } + return n; +} + +/* + * repeat - replicate subNFA for quantifiers + * + * The sub-NFA strung from lp to rp is modified to represent m to n + * repetitions of its initial contents. + * + * The duplication sequences used here are chosen carefully so that any + * pointers starting out pointing into the subexpression end up pointing into + * the last occurrence. (Note that it may not be strung between the same + * left and right end states, however!) This used to be important for the + * subRE tree, although the important bits are now handled by the in-line + * code in parse(), and when this is called, it doesn't matter any more. + */ +static void +repeat(struct vars *v, + struct state *lp, + struct state *rp, + int m, + int n) +{ +#define SOME 2 +#define INF 3 +#define PAIR(x, y) ((x)*4 + (y)) +#define REDUCE(x) ( ((x) == DUPINF) ? INF : (((x) > 1) ? 
SOME : (x)) ) + const int rm = REDUCE(m); + const int rn = REDUCE(n); + struct state *s; + struct state *s2; + + switch (PAIR(rm, rn)) + { + case PAIR(0, 0): /* empty string */ + delsub(v->nfa, lp, rp); + EMPTYARC(lp, rp); + break; + case PAIR(0, 1): /* do as x| */ + EMPTYARC(lp, rp); + break; + case PAIR(0, SOME): /* do as x{1,n}| */ + repeat(v, lp, rp, 1, n); + NOERR(); + EMPTYARC(lp, rp); + break; + case PAIR(0, INF): /* loop x around */ + s = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + moveins(v->nfa, rp, s); + EMPTYARC(lp, s); + EMPTYARC(s, rp); + break; + case PAIR(1, 1): /* no action required */ + break; + case PAIR(1, SOME): /* do as x{0,n-1}x = (x{1,n-1}|)x */ + s = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + dupnfa(v->nfa, s, rp, lp, s); + NOERR(); + repeat(v, lp, s, 1, n - 1); + NOERR(); + EMPTYARC(lp, s); + break; + case PAIR(1, INF): /* add loopback arc */ + s = newstate(v->nfa); + s2 = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + moveins(v->nfa, rp, s2); + EMPTYARC(lp, s); + EMPTYARC(s2, rp); + EMPTYARC(s2, s); + break; + case PAIR(SOME, SOME): /* do as x{m-1,n-1}x */ + s = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + dupnfa(v->nfa, s, rp, lp, s); + NOERR(); + repeat(v, lp, s, m - 1, n - 1); + break; + case PAIR(SOME, INF): /* do as x{m-1,}x */ + s = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + dupnfa(v->nfa, s, rp, lp, s); + NOERR(); + repeat(v, lp, s, m - 1, n); + break; + default: + ERR(REG_ASSERT); + break; + } +} + +/* + * bracket - handle non-complemented bracket expression + * + * Also called from cbracket for complemented bracket expressions. + */ +static void +bracket(struct vars *v, + struct state *lp, + struct state *rp) +{ + /* + * We can't process complemented char classes (e.g. \W) immediately while + * scanning the bracket expression, else color bookkeeping gets confused. + * Instead, remember whether we saw any in have_cclassc[], and process + * them at the end. 
+ */ + bool have_cclassc[NUM_CCLASSES]; + bool any_cclassc; + int i; + + memset(have_cclassc, false, sizeof(have_cclassc)); + + assert(SEE('[')); + NEXT(); + while (!SEE(']') && !SEE(EOS)) + brackpart(v, lp, rp, have_cclassc); + assert(SEE(']') || ISERR()); + + /* close up open subcolors from the positive bracket elements */ + okcolors(v->nfa, v->cm); + NOERR(); + + /* now handle any complemented elements */ + any_cclassc = false; + for (i = 0; i < NUM_CCLASSES; i++) + { + if (have_cclassc[i]) + { + charclasscomplement(v, (enum char_classes) i, lp, rp); + NOERR(); + any_cclassc = true; + } + } + + /* + * If we had any complemented elements, see if we can optimize the bracket + * into a rainbow. Since a complemented element is the only way a WHITE + * arc could get into the result, there's no point in checking otherwise. + */ + if (any_cclassc) + optimizebracket(v, lp, rp); +} + +/* + * cbracket - handle complemented bracket expression + * + * We do it by calling bracket() with dummy endpoints, and then complementing + * the result. The alternative would be to invoke rainbow(), and then delete + * arcs as the b.e. is seen... but that gets messy, and is really quite + * infeasible now that rainbow() just puts out one RAINBOW arc. + */ +static void +cbracket(struct vars *v, + struct state *lp, + struct state *rp) +{ + struct state *left = newstate(v->nfa); + struct state *right = newstate(v->nfa); + + NOERR(); + bracket(v, left, right); + + /* in NLSTOP mode, ensure newline is not part of the result set */ + if (v->cflags & REG_NLSTOP) + newarc(v->nfa, PLAIN, v->nlcolor, left, right); + NOERR(); + + assert(lp->nouts == 0); /* all outarcs will be ours */ + + /* + * Easy part of complementing, and all there is to do since the MCCE code + * was removed. Note that the result of colorcomplement() cannot be a + * rainbow, since we don't allow empty brackets; so there's no point in + * calling optimizebracket() again. 
+ */ + colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp); + NOERR(); + dropstate(v->nfa, left); + assert(right->nins == 0); + freestate(v->nfa, right); +} + +/* + * brackpart - handle one item (or range) within a bracket expression + */ +static void +brackpart(struct vars *v, + struct state *lp, + struct state *rp, + bool *have_cclassc) +{ + chr startc; + chr endc; + struct cvec *cv; + enum char_classes cls; + const chr *startp; + const chr *endp; + + /* parse something, get rid of special cases, take shortcuts */ + switch (v->nexttype) + { + case RANGE: /* a-b-c or other botch */ + ERR(REG_ERANGE); + return; + break; + case PLAIN: + startc = v->nextvalue; + NEXT(); + /* shortcut for ordinary chr (not range) */ + if (!SEE(RANGE)) + { + onechr(v, startc, lp, rp); + return; + } + NOERR(); + break; + case COLLEL: + startp = v->now; + endp = scanplain(v); + INSIST(startp < endp, REG_ECOLLATE); + NOERR(); + startc = element(v, startp, endp); + NOERR(); + break; + case ECLASS: + startp = v->now; + endp = scanplain(v); + INSIST(startp < endp, REG_ECOLLATE); + NOERR(); + startc = element(v, startp, endp); + NOERR(); + cv = eclass(v, startc, (v->cflags & REG_ICASE)); + NOERR(); + subcolorcvec(v, cv, lp, rp); + return; + break; + case CCLASS: + startp = v->now; + endp = scanplain(v); + INSIST(startp < endp, REG_ECTYPE); + NOERR(); + cls = lookupcclass(v, startp, endp); + NOERR(); + charclass(v, cls, lp, rp); + return; + break; + case CCLASSS: + charclass(v, (enum char_classes) v->nextvalue, lp, rp); + NEXT(); + return; + break; + case CCLASSC: + /* we cannot call charclasscomplement() immediately */ + have_cclassc[v->nextvalue] = true; + NEXT(); + return; + break; + default: + ERR(REG_ASSERT); + return; + break; + } + + if (SEE(RANGE)) + { + NEXT(); + switch (v->nexttype) + { + case PLAIN: + case RANGE: + endc = v->nextvalue; + NEXT(); + NOERR(); + break; + case COLLEL: + startp = v->now; + endp = scanplain(v); + INSIST(startp < endp, REG_ECOLLATE); + NOERR(); + endc = 
element(v, startp, endp); + NOERR(); + break; + default: + ERR(REG_ERANGE); + return; + break; + } + } + else + endc = startc; + + /* + * Ranges are unportable. Actually, standard C does guarantee that digits + * are contiguous, but making that an exception is just too complicated. + */ + if (startc != endc) + NOTE(REG_UUNPORT); + cv = range(v, startc, endc, (v->cflags & REG_ICASE)); + NOERR(); + subcolorcvec(v, cv, lp, rp); +} + +/* + * scanplain - scan PLAIN contents of [. etc. + * + * Certain bits of trickery in regc_lex.c know that this code does not try + * to look past the final bracket of the [. etc. + */ +static const chr * /* just after end of sequence */ +scanplain(struct vars *v) +{ + const chr *endp; + + assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS)); + NEXT(); + + endp = v->now; + while (SEE(PLAIN)) + { + endp = v->now; + NEXT(); + } + + assert(SEE(END) || ISERR()); + NEXT(); + + return endp; +} + +/* + * onechr - fill in arcs for a plain character, and possible case complements + * This is mostly a shortcut for efficient handling of the common case. + */ +static void +onechr(struct vars *v, + chr c, + struct state *lp, + struct state *rp) +{ + if (!(v->cflags & REG_ICASE)) + { + color lastsubcolor = COLORLESS; + + subcoloronechr(v, c, lp, rp, &lastsubcolor); + return; + } + + /* rats, need general case anyway... */ + subcolorcvec(v, allcases(v, c), lp, rp); +} + +/* + * optimizebracket - see if bracket expression can be converted to RAINBOW + * + * Cases such as "[\s\S]" can produce a set of arcs of all colors, which we + * can replace by a single RAINBOW arc for efficiency. (This might seem + * like a silly way to write ".", but it's seemingly a common locution in + * some other flavors of regex, so take the trouble to support it well.) 
+ */ +static void +optimizebracket(struct vars *v, + struct state *lp, + struct state *rp) +{ + struct colordesc *cd; + struct colordesc *end = CDEND(v->cm); + struct arc *a; + bool israinbow; + + /* + * Scan lp's out-arcs and transiently mark the mentioned colors. We + * expect that all of lp's out-arcs are plain, non-RAINBOW arcs to rp. + * (Note: there shouldn't be any pseudocolors yet, but check anyway.) + */ + for (a = lp->outs; a != NULL; a = a->outchain) + { + assert(a->type == PLAIN); + assert(a->co >= 0); /* i.e. not RAINBOW */ + assert(a->to == rp); + cd = &v->cm->cd[a->co]; + assert(!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO)); + cd->flags |= COLMARK; + } + + /* Scan colors, clear transient marks, check for unmarked live colors */ + israinbow = true; + for (cd = v->cm->cd; cd < end; cd++) + { + if (cd->flags & COLMARK) + cd->flags &= ~COLMARK; + else if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO)) + israinbow = false; + } + + /* Can't do anything if not all colors have arcs */ + if (!israinbow) + return; + + /* OK, drop existing arcs and replace with a rainbow */ + while ((a = lp->outs) != NULL) + freearc(v->nfa, a); + newarc(v->nfa, PLAIN, RAINBOW, lp, rp); +} + +/* + * wordchrs - set up word-chr list for word-boundary stuff, if needed + * + * The list is kept as a bunch of circular arcs on an otherwise-unused state. + * + * Note that this must not be called while we have any open subcolors, + * else construction of the list would confuse color bookkeeping. + * Hence, we can't currently apply a similar optimization in + * charclass[complement](), as those need to be usable within bracket + * expressions. 
+ */ +static void +wordchrs(struct vars *v) +{ + struct state *cstate; + struct cvec *cv; + + if (v->wordchrs != NULL) + return; /* done already */ + + /* make dummy state to hang the cache arcs on */ + cstate = newstate(v->nfa); + NOERR(); + + /* obtain possibly-cached cvec for \w characters */ + NOTE(REG_ULOCALE); + cv = cclasscvec(v, CC_WORD, (v->cflags & REG_ICASE)); + NOERR(); + + /* build the arcs; this may cause color splitting */ + subcolorcvec(v, cv, cstate, cstate); + NOERR(); + + /* close new open subcolors to ensure the cache entry is self-contained */ + okcolors(v->nfa, v->cm); + NOERR(); + + /* success! save the cache pointer */ + v->wordchrs = cstate; +} + +/* + * processlacon - generate the NFA representation of a LACON + * + * In the general case this is just newlacon() + newarc(), but some cases + * can be optimized. + */ +static void +processlacon(struct vars *v, + struct state *begin, /* start of parsed LACON sub-re */ + struct state *end, /* end of parsed LACON sub-re */ + int latype, + struct state *lp, /* left state to hang it on */ + struct state *rp) /* right state to hang it on */ +{ + struct state *s1; + int n; + + /* + * Check for lookaround RE consisting of a single plain color arc (or set + * of arcs); this would typically be a simple chr or a bracket expression. 
+ */ + s1 = single_color_transition(begin, end); + switch (latype) + { + case LATYPE_AHEAD_POS: + /* If lookahead RE is just colorset C, convert to AHEAD(C) */ + if (s1 != NULL) + { + cloneouts(v->nfa, s1, lp, rp, AHEAD); + return; + } + break; + case LATYPE_AHEAD_NEG: + /* If lookahead RE is just colorset C, convert to AHEAD(^C)|$ */ + if (s1 != NULL) + { + colorcomplement(v->nfa, v->cm, AHEAD, s1, lp, rp); + newarc(v->nfa, '$', 1, lp, rp); + newarc(v->nfa, '$', 0, lp, rp); + return; + } + break; + case LATYPE_BEHIND_POS: + /* If lookbehind RE is just colorset C, convert to BEHIND(C) */ + if (s1 != NULL) + { + cloneouts(v->nfa, s1, lp, rp, BEHIND); + return; + } + break; + case LATYPE_BEHIND_NEG: + /* If lookbehind RE is just colorset C, convert to BEHIND(^C)|^ */ + if (s1 != NULL) + { + colorcomplement(v->nfa, v->cm, BEHIND, s1, lp, rp); + newarc(v->nfa, '^', 1, lp, rp); + newarc(v->nfa, '^', 0, lp, rp); + return; + } + break; + default: + assert(NOTREACHED); + } + + /* General case: we need a LACON subre and arc */ + n = newlacon(v, begin, end, latype); + newarc(v->nfa, LACON, n, lp, rp); +} + +/* + * subre - allocate a subre + */ +static struct subre * +subre(struct vars *v, + int op, + int flags, + struct state *begin, + struct state *end) +{ + struct subre *ret = v->treefree; + + /* + * Checking for stack overflow here is sufficient to protect parse() and + * its recursive subroutines. 
+ */ + if (STACK_TOO_DEEP(v->re)) + { + ERR(REG_ETOOBIG); + return NULL; + } + + if (ret != NULL) + v->treefree = ret->child; + else + { + ret = (struct subre *) MALLOC(sizeof(struct subre)); + if (ret == NULL) + { + ERR(REG_ESPACE); + return NULL; + } + ret->chain = v->treechain; + v->treechain = ret; + } + + assert(strchr("=b|.*(", op) != NULL); + + ret->op = op; + ret->flags = flags; + ret->latype = (char) -1; + ret->id = 0; /* will be assigned later */ + ret->capno = 0; + ret->backno = 0; + ret->min = ret->max = 1; + ret->child = NULL; + ret->sibling = NULL; + ret->begin = begin; + ret->end = end; + ZAPCNFA(ret->cnfa); + + return ret; +} + +/* + * freesubre - free a subRE subtree + * + * This frees child node(s) of the given subRE too, + * but not its siblings. + */ +static void +freesubre(struct vars *v, /* might be NULL */ + struct subre *sr) +{ + if (sr == NULL) + return; + + if (sr->child != NULL) + freesubreandsiblings(v, sr->child); + + freesrnode(v, sr); +} + +/* + * freesubreandsiblings - free a subRE subtree + * + * This frees child node(s) of the given subRE too, + * as well as any following siblings. 
+ */ +static void +freesubreandsiblings(struct vars *v, /* might be NULL */ + struct subre *sr) +{ + while (sr != NULL) + { + struct subre *next = sr->sibling; + + freesubre(v, sr); + sr = next; + } +} + +/* + * freesrnode - free one node in a subRE subtree + */ +static void +freesrnode(struct vars *v, /* might be NULL */ + struct subre *sr) +{ + if (sr == NULL) + return; + + if (!NULLCNFA(sr->cnfa)) + freecnfa(&sr->cnfa); + sr->flags = 0; /* in particular, not INUSE */ + sr->child = sr->sibling = NULL; + sr->begin = sr->end = NULL; + + if (v != NULL && v->treechain != NULL) + { + /* we're still parsing, maybe we can reuse the subre */ + sr->child = v->treefree; + v->treefree = sr; + } + else + FREE(sr); +} + +/* + * optst - optimize a subRE subtree + */ +static void +optst(struct vars *v, + struct subre *t) +{ + /* + * DGP (2007-11-13): I assume it was the programmer's intent to eventually + * come back and add code to optimize subRE trees, but the routine coded + * just spends effort traversing the tree and doing nothing. We can do + * nothing with less effort. + */ + return; +} + +/* + * numst - number tree nodes (assigning "id" indexes) + */ +static int /* next number */ +numst(struct subre *t, + int start) /* starting point for subtree numbers */ +{ + int i; + struct subre *t2; + + assert(t != NULL); + + i = start; + t->id = i++; + for (t2 = t->child; t2 != NULL; t2 = t2->sibling) + i = numst(t2, i); + return i; +} + +/* + * markst - mark tree nodes as INUSE + * + * Note: this is a great deal more subtle than it looks. During initial + * parsing of a regex, all subres are linked into the treechain list; + * discarded ones are also linked into the treefree list for possible reuse. + * After we are done creating all subres required for a regex, we run markst() + * then cleanst(), which results in discarding all subres not reachable from + * v->tree. We then clear v->treechain, indicating that subres must be found + * by descending from v->tree. 
This changes the behavior of freesubre(): it + * will henceforth FREE() unwanted subres rather than sticking them into the + * treefree list. (Doing that any earlier would result in dangling links in + * the treechain list.) This all means that freev() will clean up correctly + * if invoked before or after markst()+cleanst(); but it would not work if + * called partway through this state conversion, so we mustn't error out + * in or between these two functions. + */ +static void +markst(struct subre *t) +{ + struct subre *t2; + + assert(t != NULL); + + t->flags |= INUSE; + for (t2 = t->child; t2 != NULL; t2 = t2->sibling) + markst(t2); +} + +/* + * cleanst - free any tree nodes not marked INUSE + */ +static void +cleanst(struct vars *v) +{ + struct subre *t; + struct subre *next; + + for (t = v->treechain; t != NULL; t = next) + { + next = t->chain; + if (!(t->flags & INUSE)) + FREE(t); + } + v->treechain = NULL; + v->treefree = NULL; /* just on general principles */ +} + +/* + * nfatree - turn a subRE subtree into a tree of compacted NFAs + */ +static long /* optimize results from top node */ +nfatree(struct vars *v, + struct subre *t, + FILE *f) /* for debug output */ +{ + struct subre *t2; + + assert(t != NULL && t->begin != NULL); + + for (t2 = t->child; t2 != NULL; t2 = t2->sibling) + (DISCARD) nfatree(v, t2, f); + + return nfanode(v, t, 0, f); +} + +/* + * nfanode - do one NFA for nfatree or lacons + * + * If converttosearch is true, apply makesearch() to the NFA. 
+ */ +static long /* optimize results */ +nfanode(struct vars *v, + struct subre *t, + int converttosearch, + FILE *f) /* for debug output */ +{ + struct nfa *nfa; + long ret = 0; + + assert(t->begin != NULL); + +#ifdef REG_DEBUG + if (f != NULL) + { + char idbuf[50]; + + fprintf(f, "\n\n\n========= TREE NODE %s ==========\n", + stid(t, idbuf, sizeof(idbuf))); + } +#endif + nfa = newnfa(v, v->cm, v->nfa); + NOERRZ(); + dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final); + if (!ISERR()) + specialcolors(nfa); + if (!ISERR()) + ret = optimize(nfa, f); + if (converttosearch && !ISERR()) + makesearch(v, nfa); + if (!ISERR()) + compact(nfa, &t->cnfa); + + freenfa(nfa); + return ret; +} + +/* + * newlacon - allocate a lookaround-constraint subRE + */ +static int /* lacon number */ +newlacon(struct vars *v, + struct state *begin, + struct state *end, + int latype) +{ + int n; + struct subre *newlacons; + struct subre *sub; + + if (v->nlacons == 0) + { + n = 1; /* skip 0th */ + newlacons = (struct subre *) MALLOC(2 * sizeof(struct subre)); + } + else + { + n = v->nlacons; + newlacons = (struct subre *) REALLOC(v->lacons, + (n + 1) * sizeof(struct subre)); + } + if (newlacons == NULL) + { + ERR(REG_ESPACE); + return 0; + } + v->lacons = newlacons; + v->nlacons = n + 1; + sub = &v->lacons[n]; + sub->begin = begin; + sub->end = end; + sub->latype = latype; + ZAPCNFA(sub->cnfa); + return n; +} + +/* + * freelacons - free lookaround-constraint subRE vector + */ +static void +freelacons(struct subre *subs, + int n) +{ + struct subre *sub; + int i; + + assert(n > 0); + for (sub = subs + 1, i = n - 1; i > 0; sub++, i--) /* no 0th */ + if (!NULLCNFA(sub->cnfa)) + freecnfa(&sub->cnfa); + FREE(subs); +} + +/* + * rfree - free a whole RE (insides of regfree) + */ +static void +rfree(regex_t *re) +{ + struct guts *g; + + if (re == NULL || re->re_magic != REMAGIC) + return; + + re->re_magic = 0; /* invalidate RE */ + g = (struct guts *) re->re_guts; + re->re_guts = NULL; + re->re_fns = 
NULL; + if (g != NULL) + { + g->magic = 0; + freecm(&g->cmap); + if (g->tree != NULL) + freesubre((struct vars *) NULL, g->tree); + if (g->lacons != NULL) + freelacons(g->lacons, g->nlacons); + if (!NULLCNFA(g->search)) + freecnfa(&g->search); + FREE(g); + } +} + +/* + * rcancelrequested - check for external request to cancel regex operation + * + * Return nonzero to fail the operation with error code REG_CANCEL, + * zero to keep going + * + * The current implementation is Postgres-specific. If we ever get around + * to splitting the regex code out as a standalone library, there will need + * to be some API to let applications define a callback function for this. + */ +static int +rcancelrequested(void) +{ + return InterruptPending && (QueryCancelPending || ProcDiePending); +} + +/* + * rstacktoodeep - check for stack getting dangerously deep + * + * Return nonzero to fail the operation with error code REG_ETOOBIG, + * zero to keep going + * + * The current implementation is Postgres-specific. If we ever get around + * to splitting the regex code out as a standalone library, there will need + * to be some API to let applications define a callback function for this. 
+ */ +static int +rstacktoodeep(void) +{ + return stack_is_too_deep(); +} + +#ifdef REG_DEBUG + +/* + * dump - dump an RE in human-readable form + */ +static void +dump(regex_t *re, + FILE *f) +{ + struct guts *g; + int i; + + if (re->re_magic != REMAGIC) + fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic, + REMAGIC); + if (re->re_guts == NULL) + { + fprintf(f, "NULL guts!!!\n"); + return; + } + g = (struct guts *) re->re_guts; + if (g->magic != GUTSMAGIC) + fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic, + GUTSMAGIC); + + fprintf(f, "\n\n\n========= DUMP ==========\n"); + fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n", + (int) re->re_nsub, re->re_info, re->re_csize, g->ntree); + + dumpcolors(&g->cmap, f); + if (!NULLCNFA(g->search)) + { + fprintf(f, "\nsearch:\n"); + dumpcnfa(&g->search, f); + } + for (i = 1; i < g->nlacons; i++) + { + struct subre *lasub = &g->lacons[i]; + const char *latype; + + switch (lasub->latype) + { + case LATYPE_AHEAD_POS: + latype = "positive lookahead"; + break; + case LATYPE_AHEAD_NEG: + latype = "negative lookahead"; + break; + case LATYPE_BEHIND_POS: + latype = "positive lookbehind"; + break; + case LATYPE_BEHIND_NEG: + latype = "negative lookbehind"; + break; + default: + latype = "???"; + break; + } + fprintf(f, "\nla%d (%s):\n", i, latype); + dumpcnfa(&lasub->cnfa, f); + } + fprintf(f, "\n"); + dumpst(g->tree, f, 0); +} + +/* + * dumpst - dump a subRE tree + */ +static void +dumpst(struct subre *t, + FILE *f, + int nfapresent) /* is the original NFA still around? */ +{ + if (t == NULL) + fprintf(f, "null tree\n"); + else + stdump(t, f, nfapresent); + fflush(f); +} + +/* + * stdump - recursive guts of dumpst + */ +static void +stdump(struct subre *t, + FILE *f, + int nfapresent) /* is the original NFA still around? */ +{ + char idbuf[50]; + struct subre *t2; + + fprintf(f, "%s. 
`%c'", stid(t, idbuf, sizeof(idbuf)), t->op); + if (t->flags & LONGER) + fprintf(f, " longest"); + if (t->flags & SHORTER) + fprintf(f, " shortest"); + if (t->flags & MIXED) + fprintf(f, " hasmixed"); + if (t->flags & CAP) + fprintf(f, " hascapture"); + if (t->flags & BACKR) + fprintf(f, " hasbackref"); + if (!(t->flags & INUSE)) + fprintf(f, " UNUSED"); + if (t->latype != (char) -1) + fprintf(f, " latype(%d)", t->latype); + if (t->capno != 0) + fprintf(f, " capture(%d)", t->capno); + if (t->backno != 0) + fprintf(f, " backref(%d)", t->backno); + if (t->min != 1 || t->max != 1) + { + fprintf(f, " {%d,", t->min); + if (t->max != DUPINF) + fprintf(f, "%d", t->max); + fprintf(f, "}"); + } + if (nfapresent) + fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no); + if (t->child != NULL) + fprintf(f, " C:%s", stid(t->child, idbuf, sizeof(idbuf))); + /* printing second child isn't necessary, but it is often helpful */ + if (t->child != NULL && t->child->sibling != NULL) + fprintf(f, " C2:%s", stid(t->child->sibling, idbuf, sizeof(idbuf))); + if (t->sibling != NULL) + fprintf(f, " S:%s", stid(t->sibling, idbuf, sizeof(idbuf))); + if (!NULLCNFA(t->cnfa)) + { + fprintf(f, "\n"); + dumpcnfa(&t->cnfa, f); + } + fprintf(f, "\n"); + for (t2 = t->child; t2 != NULL; t2 = t2->sibling) + stdump(t2, f, nfapresent); +} + +/* + * stid - identify a subtree node for dumping + */ +static const char * /* points to buf or constant string */ +stid(struct subre *t, + char *buf, + size_t bufsize) +{ + /* big enough for hex int or decimal t->id? 
*/ + if (bufsize < sizeof(void *) * 2 + 3 || bufsize < sizeof(t->id) * 3 + 1) + return "unable"; + if (t->id != 0) + sprintf(buf, "%d", t->id); + else + sprintf(buf, "%p", t); + return buf; +} +#endif /* REG_DEBUG */ + + +#include "regc_lex.c" +#include "regc_color.c" +#include "regc_nfa.c" +#include "regc_cvec.c" +#include "regc_pg_locale.c" +#include "regc_locale.c" diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c new file mode 100644 index 0000000..ba1289c --- /dev/null +++ b/src/backend/regex/rege_dfa.c @@ -0,0 +1,1106 @@ +/* + * DFA routines + * This file is #included by regexec.c. + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * src/backend/regex/rege_dfa.c + * + */ + +/* + * longest - longest-preferred matching engine + * + * On success, returns match endpoint address. Returns NULL on no match. + * Internal errors also return NULL, with v->err set. + */ +static chr * +longest(struct vars *v, + struct dfa *d, + chr *start, /* where the match should start */ + chr *stop, /* match must end at or before here */ + int *hitstopp) /* record whether hit v->stop, if non-NULL */ +{ + chr *cp; + chr *realstop = (stop == v->stop) ? 
stop : stop + 1; + color co; + struct sset *css; + struct sset *ss; + chr *post; + int i; + struct colormap *cm = d->cm; + + /* prevent "uninitialized variable" warnings */ + if (hitstopp != NULL) + *hitstopp = 0; + + /* if this is a backref to a known string, just match against that */ + if (d->backno >= 0) + { + assert((size_t) d->backno < v->nmatch); + if (v->pmatch[d->backno].rm_so >= 0) + { + cp = dfa_backref(v, d, start, start, stop, false); + if (cp == v->stop && stop == v->stop && hitstopp != NULL) + *hitstopp = 1; + return cp; + } + } + + /* fast path for matchall NFAs */ + if (d->cnfa->flags & MATCHALL) + { + size_t nchr = stop - start; + size_t maxmatchall = d->cnfa->maxmatchall; + + if (nchr < d->cnfa->minmatchall) + return NULL; + if (maxmatchall == DUPINF) + { + if (stop == v->stop && hitstopp != NULL) + *hitstopp = 1; + } + else + { + if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL) + *hitstopp = 1; + if (nchr > maxmatchall) + return start + maxmatchall; + } + return stop; + } + + /* initialize */ + css = initialize(v, d, start); + if (css == NULL) + return NULL; + cp = start; + + /* startup */ + FDEBUG(("+++ startup +++\n")); + if (cp == v->start) + { + co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long) co)); + } + else + { + co = GETCOLOR(cm, *(cp - 1)); + FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co)); + } + css = miss(v, d, css, co, cp, start); + if (css == NULL) + return NULL; + css->lastseen = cp; + + /* + * This is the main text-scanning loop. It seems worth having two copies + * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG + * builds, when you're not actively tracing. 
+ */ +#ifdef REG_DEBUG + if (v->eflags & REG_FTRACE) + { + while (cp < realstop) + { + FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets))); + co = GETCOLOR(cm, *cp); + FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co)); + ss = css->outs[co]; + if (ss == NULL) + { + ss = miss(v, d, css, co, cp + 1, start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + } + } + else +#endif + { + while (cp < realstop) + { + co = GETCOLOR(cm, *cp); + ss = css->outs[co]; + if (ss == NULL) + { + ss = miss(v, d, css, co, cp + 1, start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + } + } + + if (ISERR()) + return NULL; + + /* shutdown */ + FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets))); + if (cp == v->stop && stop == v->stop) + { + if (hitstopp != NULL) + *hitstopp = 1; + co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long) co)); + ss = miss(v, d, css, co, cp, start); + if (ISERR()) + return NULL; + /* special case: match ended at eol? */ + if (ss != NULL && (ss->flags & POSTSTATE)) + return cp; + else if (ss != NULL) + ss->lastseen = cp; /* to be tidy */ + } + + /* find last match, if any */ + post = d->lastpost; + for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--) + if ((ss->flags & POSTSTATE) && post != ss->lastseen && + (post == NULL || post < ss->lastseen)) + post = ss->lastseen; + if (post != NULL) /* found one */ + return post - 1; + + return NULL; +} + +/* + * shortest - shortest-preferred matching engine + * + * On success, returns match endpoint address. Returns NULL on no match. + * Internal errors also return NULL, with v->err set. 
+ */ +static chr * +shortest(struct vars *v, + struct dfa *d, + chr *start, /* where the match should start */ + chr *min, /* match must end at or after here */ + chr *max, /* match must end at or before here */ + chr **coldp, /* store coldstart pointer here, if non-NULL */ + int *hitstopp) /* record whether hit v->stop, if non-NULL */ +{ + chr *cp; + chr *realmin = (min == v->stop) ? min : min + 1; + chr *realmax = (max == v->stop) ? max : max + 1; + color co; + struct sset *css; + struct sset *ss; + struct colormap *cm = d->cm; + + /* prevent "uninitialized variable" warnings */ + if (coldp != NULL) + *coldp = NULL; + if (hitstopp != NULL) + *hitstopp = 0; + + /* if this is a backref to a known string, just match against that */ + if (d->backno >= 0) + { + assert((size_t) d->backno < v->nmatch); + if (v->pmatch[d->backno].rm_so >= 0) + { + cp = dfa_backref(v, d, start, min, max, true); + if (cp != NULL && coldp != NULL) + *coldp = start; + /* there is no case where we should set *hitstopp */ + return cp; + } + } + + /* fast path for matchall NFAs */ + if (d->cnfa->flags & MATCHALL) + { + size_t nchr = min - start; + + if (d->cnfa->maxmatchall != DUPINF && + nchr > d->cnfa->maxmatchall) + return NULL; + if ((max - start) < d->cnfa->minmatchall) + return NULL; + if (nchr < d->cnfa->minmatchall) + min = start + d->cnfa->minmatchall; + if (coldp != NULL) + *coldp = start; + /* there is no case where we should set *hitstopp */ + return min; + } + + /* initialize */ + css = initialize(v, d, start); + if (css == NULL) + return NULL; + cp = start; + + /* startup */ + FDEBUG(("--- startup ---\n")); + if (cp == v->start) + { + co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 
0 : 1]; + FDEBUG(("color %ld\n", (long) co)); + } + else + { + co = GETCOLOR(cm, *(cp - 1)); + FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co)); + } + css = miss(v, d, css, co, cp, start); + if (css == NULL) + return NULL; + css->lastseen = cp; + ss = css; + + /* + * This is the main text-scanning loop. It seems worth having two copies + * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG + * builds, when you're not actively tracing. + */ +#ifdef REG_DEBUG + if (v->eflags & REG_FTRACE) + { + while (cp < realmax) + { + FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets))); + co = GETCOLOR(cm, *cp); + FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co)); + ss = css->outs[co]; + if (ss == NULL) + { + ss = miss(v, d, css, co, cp + 1, start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + if ((ss->flags & POSTSTATE) && cp >= realmin) + break; /* NOTE BREAK OUT */ + } + } + else +#endif + { + while (cp < realmax) + { + co = GETCOLOR(cm, *cp); + ss = css->outs[co]; + if (ss == NULL) + { + ss = miss(v, d, css, co, cp + 1, start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + if ((ss->flags & POSTSTATE) && cp >= realmin) + break; /* NOTE BREAK OUT */ + } + } + + if (ss == NULL) + return NULL; + + if (coldp != NULL) /* report last no-progress state set, if any */ + *coldp = lastcold(v, d); + + if ((ss->flags & POSTSTATE) && cp > min) + { + assert(cp >= realmin); + cp--; + } + else if (cp == v->stop && max == v->stop) + { + co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 
0 : 1]; + FDEBUG(("color %ld\n", (long) co)); + ss = miss(v, d, css, co, cp, start); + /* match might have ended at eol */ + if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL) + *hitstopp = 1; + } + + if (ss == NULL || !(ss->flags & POSTSTATE)) + return NULL; + + return cp; +} + +/* + * matchuntil - incremental matching engine + * + * This is meant for use with a search-style NFA (that is, the pattern is + * known to act as though it had a leading .*). We determine whether a + * match exists starting at v->start and ending at probe. Multiple calls + * require only O(N) time not O(N^2) so long as the probe values are + * nondecreasing. *lastcss and *lastcp must be initialized to NULL before + * starting a series of calls. + * + * Returns 1 if a match exists, 0 if not. + * Internal errors also return 0, with v->err set. + */ +static int +matchuntil(struct vars *v, + struct dfa *d, + chr *probe, /* we want to know if a match ends here */ + struct sset **lastcss, /* state storage across calls */ + chr **lastcp) /* state storage across calls */ +{ + chr *cp = *lastcp; + color co; + struct sset *css = *lastcss; + struct sset *ss; + struct colormap *cm = d->cm; + + /* fast path for matchall NFAs */ + if (d->cnfa->flags & MATCHALL) + { + size_t nchr = probe - v->start; + + if (nchr < d->cnfa->minmatchall) + return 0; + /* maxmatchall will always be infinity, cf. makesearch() */ + assert(d->cnfa->maxmatchall == DUPINF); + return 1; + } + + /* initialize and startup, or restart, if necessary */ + if (cp == NULL || cp > probe) + { + cp = v->start; + css = initialize(v, d, cp); + if (css == NULL) + return 0; + + FDEBUG((">>> startup >>>\n")); + co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 
0 : 1]; + FDEBUG(("color %ld\n", (long) co)); + + css = miss(v, d, css, co, cp, v->start); + if (css == NULL) + return 0; + css->lastseen = cp; + } + else if (css == NULL) + { + /* we previously found that no match is possible beyond *lastcp */ + return 0; + } + ss = css; + + /* + * This is the main text-scanning loop. It seems worth having two copies + * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG + * builds, when you're not actively tracing. + */ +#ifdef REG_DEBUG + if (v->eflags & REG_FTRACE) + { + while (cp < probe) + { + FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets))); + co = GETCOLOR(cm, *cp); + FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co)); + ss = css->outs[co]; + if (ss == NULL) + { + ss = miss(v, d, css, co, cp + 1, v->start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + } + } + else +#endif + { + while (cp < probe) + { + co = GETCOLOR(cm, *cp); + ss = css->outs[co]; + if (ss == NULL) + { + ss = miss(v, d, css, co, cp + 1, v->start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + } + } + + *lastcss = ss; + *lastcp = cp; + + if (ss == NULL) + return 0; /* impossible match, or internal error */ + + /* We need to process one more chr, or the EOS symbol, to check match */ + if (cp < v->stop) + { + FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets))); + co = GETCOLOR(cm, *cp); + FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co)); + ss = css->outs[co]; + if (ss == NULL) + ss = miss(v, d, css, co, cp + 1, v->start); + } + else + { + assert(cp == v->stop); + co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 
0 : 1]; + FDEBUG(("color %ld\n", (long) co)); + ss = miss(v, d, css, co, cp, v->start); + } + + if (ss == NULL || !(ss->flags & POSTSTATE)) + return 0; + + return 1; +} + +/* + * dfa_backref - find best match length for a known backref string + * + * When the backref's referent is already available, we can deliver an exact + * answer with considerably less work than running the backref node's NFA. + * + * Return match endpoint for longest or shortest valid repeated match, + * or NULL if there is no valid match. + * + * Should be in sync with cbrdissect(), although that has the different task + * of checking a match to a predetermined section of the string. + */ +static chr * +dfa_backref(struct vars *v, + struct dfa *d, + chr *start, /* where the match should start */ + chr *min, /* match must end at or after here */ + chr *max, /* match must end at or before here */ + bool shortest) +{ + int n = d->backno; + int backmin = d->backmin; + int backmax = d->backmax; + size_t numreps; + size_t minreps; + size_t maxreps; + size_t brlen; + chr *brstring; + chr *p; + + /* get the backreferenced string (caller should have checked this) */ + if (v->pmatch[n].rm_so == -1) + return NULL; + brstring = v->start + v->pmatch[n].rm_so; + brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; + + /* special-case zero-length backreference to avoid divide by zero */ + if (brlen == 0) + { + /* + * matches only a zero-length string, but any number of repetitions + * can be considered to be present + */ + if (min == start && backmin <= backmax) + return start; + return NULL; + } + + /* + * convert min and max into numbers of possible repetitions of the backref + * string, rounding appropriately + */ + if (min <= start) + minreps = 0; + else + minreps = (min - start - 1) / brlen + 1; + maxreps = (max - start) / brlen; + + /* apply bounds, then see if there is any allowed match length */ + if (minreps < backmin) + minreps = backmin; + if (backmax != DUPINF && maxreps > backmax) + maxreps = 
backmax; + if (maxreps < minreps) + return NULL; + + /* quick exit if zero-repetitions match is valid and preferred */ + if (shortest && minreps == 0) + return start; + + /* okay, compare the actual string contents */ + p = start; + numreps = 0; + while (numreps < maxreps) + { + if ((*v->g->compare) (brstring, p, brlen) != 0) + break; + p += brlen; + numreps++; + if (shortest && numreps >= minreps) + break; + } + + if (numreps >= minreps) + return p; + return NULL; +} + +/* + * lastcold - determine last point at which no progress had been made + */ +static chr * /* endpoint, or NULL */ +lastcold(struct vars *v, + struct dfa *d) +{ + struct sset *ss; + chr *nopr; + int i; + + nopr = d->lastnopr; + if (nopr == NULL) + nopr = v->start; + for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--) + if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen) + nopr = ss->lastseen; + return nopr; +} + +/* + * newdfa - set up a fresh DFA + * + * Returns NULL (and sets v->err) on failure. + */ +static struct dfa * +newdfa(struct vars *v, + struct cnfa *cnfa, + struct colormap *cm, + struct smalldfa *sml) /* preallocated space, may be NULL */ +{ + struct dfa *d; + size_t nss = cnfa->nstates * 2; + int wordsper = (cnfa->nstates + UBITS - 1) / UBITS; + bool ismalloced = false; + + assert(cnfa != NULL && cnfa->nstates != 0); + + if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS) + { + assert(wordsper == 1); + if (sml == NULL) + { + sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa)); + if (sml == NULL) + { + ERR(REG_ESPACE); + return NULL; + } + ismalloced = true; + } + d = &sml->dfa; + d->ssets = sml->ssets; + d->statesarea = sml->statesarea; + d->work = &d->statesarea[nss]; + d->outsarea = sml->outsarea; + d->incarea = sml->incarea; + d->ismalloced = ismalloced; + d->arraysmalloced = false; /* not separately allocated, anyway */ + } + else + { + d = (struct dfa *) MALLOC(sizeof(struct dfa)); + if (d == NULL) + { + ERR(REG_ESPACE); + return NULL; + } + d->ssets = (struct sset *) 
MALLOC(nss * sizeof(struct sset)); + d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper * + sizeof(unsigned)); + d->work = &d->statesarea[nss * wordsper]; + d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors * + sizeof(struct sset *)); + d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors * + sizeof(struct arcp)); + d->ismalloced = true; + d->arraysmalloced = true; + /* now freedfa() will behave sanely */ + if (d->ssets == NULL || d->statesarea == NULL || + d->outsarea == NULL || d->incarea == NULL) + { + freedfa(d); + ERR(REG_ESPACE); + return NULL; + } + } + + d->nssets = (v->eflags & REG_SMALL) ? 7 : nss; + d->nssused = 0; + d->nstates = cnfa->nstates; + d->ncolors = cnfa->ncolors; + d->wordsper = wordsper; + d->cnfa = cnfa; + d->cm = cm; + d->lastpost = NULL; + d->lastnopr = NULL; + d->search = d->ssets; + d->backno = -1; /* may be set by caller */ + d->backmin = d->backmax = 0; + + /* initialization of sset fields is done as needed */ + + return d; +} + +/* + * freedfa - free a DFA + */ +static void +freedfa(struct dfa *d) +{ + if (d->arraysmalloced) + { + if (d->ssets != NULL) + FREE(d->ssets); + if (d->statesarea != NULL) + FREE(d->statesarea); + if (d->outsarea != NULL) + FREE(d->outsarea); + if (d->incarea != NULL) + FREE(d->incarea); + } + + if (d->ismalloced) + FREE(d); +} + +/* + * hash - construct a hash code for a bitvector + * + * There are probably better ways, but they're more expensive. + */ +static unsigned +hash(unsigned *uv, + int n) +{ + int i; + unsigned h; + + h = 0; + for (i = 0; i < n; i++) + h ^= uv[i]; + return h; +} + +/* + * initialize - hand-craft a cache entry for startup, otherwise get ready + */ +static struct sset * +initialize(struct vars *v, + struct dfa *d, + chr *start) +{ + struct sset *ss; + int i; + + /* is previous one still there? 
*/ + if (d->nssused > 0 && (d->ssets[0].flags & STARTER)) + ss = &d->ssets[0]; + else + { /* no, must (re)build it */ + ss = getvacant(v, d, start, start); + if (ss == NULL) + return NULL; + for (i = 0; i < d->wordsper; i++) + ss->states[i] = 0; + BSET(ss->states, d->cnfa->pre); + ss->hash = HASH(ss->states, d->wordsper); + assert(d->cnfa->pre != d->cnfa->post); + ss->flags = STARTER | LOCKED | NOPROGRESS; + /* lastseen dealt with below */ + } + + for (i = 0; i < d->nssused; i++) + d->ssets[i].lastseen = NULL; + ss->lastseen = start; /* maybe untrue, but harmless */ + d->lastpost = NULL; + d->lastnopr = NULL; + return ss; +} + +/* + * miss - handle a stateset cache miss + * + * css is the current stateset, co is the color of the current input character, + * cp points to the character after that (which is where we may need to test + * LACONs). start does not affect matching behavior but is needed for pickss' + * heuristics about which stateset cache entry to replace. + * + * Ordinarily, returns the address of the next stateset (the one that is + * valid after consuming the input character). Returns NULL if no valid + * NFA states remain, ie we have a certain match failure. + * Internal errors also return NULL, with v->err set. + */ +static struct sset * +miss(struct vars *v, + struct dfa *d, + struct sset *css, + color co, + chr *cp, /* next chr */ + chr *start) /* where the attempt got started */ +{ + struct cnfa *cnfa = d->cnfa; + int i; + unsigned h; + struct carc *ca; + struct sset *p; + int ispseudocolor; + int ispost; + int noprogress; + int gotstate; + int dolacons; + int sawlacons; + + /* for convenience, we can be called even if it might not be a miss */ + if (css->outs[co] != NULL) + { + FDEBUG(("hit\n")); + return css->outs[co]; + } + FDEBUG(("miss\n")); + + /* + * Checking for operation cancel in the inner text search loop seems + * unduly expensive. As a compromise, check during cache misses. 
+ */ + if (CANCEL_REQUESTED(v->re)) + { + ERR(REG_CANCEL); + return NULL; + } + + /* + * What set of states would we end up in after consuming the co character? + * We first consider PLAIN arcs that consume the character, and then look + * to see what LACON arcs could be traversed after consuming it. + */ + for (i = 0; i < d->wordsper; i++) + d->work[i] = 0; /* build new stateset bitmap in d->work */ + ispseudocolor = d->cm->cd[co].flags & PSEUDO; + ispost = 0; + noprogress = 1; + gotstate = 0; + for (i = 0; i < d->nstates; i++) + if (ISBSET(css->states, i)) + for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) + if (ca->co == co || + (ca->co == RAINBOW && !ispseudocolor)) + { + BSET(d->work, ca->to); + gotstate = 1; + if (ca->to == cnfa->post) + ispost = 1; + if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS)) + noprogress = 0; + FDEBUG(("%d -> %d\n", i, ca->to)); + } + if (!gotstate) + return NULL; /* character cannot reach any new state */ + dolacons = (cnfa->flags & HASLACONS); + sawlacons = 0; + /* outer loop handles transitive closure of reachable-by-LACON states */ + while (dolacons) + { + dolacons = 0; + for (i = 0; i < d->nstates; i++) + if (ISBSET(d->work, i)) + for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) + { + if (ca->co < cnfa->ncolors) + continue; /* not a LACON arc */ + if (ISBSET(d->work, ca->to)) + continue; /* arc would be a no-op anyway */ + sawlacons = 1; /* this LACON affects our result */ + if (!lacon(v, cnfa, cp, ca->co)) + { + if (ISERR()) + return NULL; + continue; /* LACON arc cannot be traversed */ + } + if (ISERR()) + return NULL; + BSET(d->work, ca->to); + dolacons = 1; + if (ca->to == cnfa->post) + ispost = 1; + if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS)) + noprogress = 0; + FDEBUG(("%d :> %d\n", i, ca->to)); + } + } + h = HASH(d->work, d->wordsper); + + /* Is this stateset already in the cache? 
*/ + for (p = d->ssets, i = d->nssused; i > 0; p++, i--) + if (HIT(h, d->work, p, d->wordsper)) + { + FDEBUG(("cached c%d\n", (int) (p - d->ssets))); + break; /* NOTE BREAK OUT */ + } + if (i == 0) + { /* nope, need a new cache entry */ + p = getvacant(v, d, cp, start); + if (p == NULL) + return NULL; + assert(p != css); + for (i = 0; i < d->wordsper; i++) + p->states[i] = d->work[i]; + p->hash = h; + p->flags = (ispost) ? POSTSTATE : 0; + if (noprogress) + p->flags |= NOPROGRESS; + /* lastseen to be dealt with by caller */ + } + + /* + * Link new stateset to old, unless a LACON affected the result, in which + * case we don't create the link. That forces future transitions across + * this same arc (same prior stateset and character color) to come through + * miss() again, so that we can recheck the LACON(s), which might or might + * not pass since context will be different. + */ + if (!sawlacons) + { + FDEBUG(("c%d[%d]->c%d\n", + (int) (css - d->ssets), co, (int) (p - d->ssets))); + css->outs[co] = p; + css->inchain[co] = p->ins; + p->ins.ss = css; + p->ins.co = co; + } + return p; +} + +/* + * lacon - lookaround-constraint checker for miss() + */ +static int /* predicate: constraint satisfied? 
*/ +lacon(struct vars *v, + struct cnfa *pcnfa, /* parent cnfa */ + chr *cp, + color co) /* "color" of the lookaround constraint */ +{ + int n; + struct subre *sub; + struct dfa *d; + chr *end; + int satisfied; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(v->re)) + { + ERR(REG_ETOOBIG); + return 0; + } + + n = co - pcnfa->ncolors; + assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL); + FDEBUG(("=== testing lacon %d\n", n)); + sub = &v->g->lacons[n]; + d = getladfa(v, n); + if (d == NULL) + return 0; + if (LATYPE_IS_AHEAD(sub->latype)) + { + /* used to use longest() here, but shortest() could be much cheaper */ + end = shortest(v, d, cp, cp, v->stop, + (chr **) NULL, (int *) NULL); + satisfied = LATYPE_IS_POS(sub->latype) ? (end != NULL) : (end == NULL); + } + else + { + /* + * To avoid doing O(N^2) work when repeatedly testing a lookbehind + * constraint in an N-character string, we use matchuntil() which can + * cache the DFA state across calls. We only need to restart if the + * probe point decreases, which is not common. The NFA we're using is + * a search NFA, so it doesn't mind scanning over stuff before the + * nominal match. + */ + satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]); + if (!LATYPE_IS_POS(sub->latype)) + satisfied = !satisfied; + } + FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied)); + return satisfied; +} + +/* + * getvacant - get a vacant state set + * + * This routine clears out the inarcs and outarcs, but does not otherwise + * clear the innards of the state set -- that's up to the caller. 
+ */ +static struct sset * +getvacant(struct vars *v, + struct dfa *d, + chr *cp, + chr *start) +{ + int i; + struct sset *ss; + struct sset *p; + struct arcp ap; + color co; + + ss = pickss(v, d, cp, start); + if (ss == NULL) + return NULL; + assert(!(ss->flags & LOCKED)); + + /* clear out its inarcs, including self-referential ones */ + ap = ss->ins; + while ((p = ap.ss) != NULL) + { + co = ap.co; + FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co)); + p->outs[co] = NULL; + ap = p->inchain[co]; + p->inchain[co].ss = NULL; /* paranoia */ + } + ss->ins.ss = NULL; + + /* take it off the inarc chains of the ssets reached by its outarcs */ + for (i = 0; i < d->ncolors; i++) + { + p = ss->outs[i]; + assert(p != ss); /* not self-referential */ + if (p == NULL) + continue; /* NOTE CONTINUE */ + FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets))); + if (p->ins.ss == ss && p->ins.co == i) + p->ins = ss->inchain[i]; + else + { + struct arcp lastap = {NULL, 0}; + + assert(p->ins.ss != NULL); + for (ap = p->ins; ap.ss != NULL && + !(ap.ss == ss && ap.co == i); + ap = ap.ss->inchain[ap.co]) + lastap = ap; + assert(ap.ss != NULL); + lastap.ss->inchain[lastap.co] = ss->inchain[i]; + } + ss->outs[i] = NULL; + ss->inchain[i].ss = NULL; + } + + /* if ss was a success state, may need to remember location */ + if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost && + (d->lastpost == NULL || d->lastpost < ss->lastseen)) + d->lastpost = ss->lastseen; + + /* likewise for a no-progress state */ + if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr && + (d->lastnopr == NULL || d->lastnopr < ss->lastseen)) + d->lastnopr = ss->lastseen; + + return ss; +} + +/* + * pickss - pick the next stateset to be used + */ +static struct sset * +pickss(struct vars *v, + struct dfa *d, + chr *cp, + chr *start) +{ + int i; + struct sset *ss; + struct sset *end; + chr *ancient; + + /* shortcut for cases where cache isn't full */ + if (d->nssused < 
d->nssets) + { + i = d->nssused; + d->nssused++; + ss = &d->ssets[i]; + FDEBUG(("new c%d\n", i)); + /* set up innards */ + ss->states = &d->statesarea[i * d->wordsper]; + ss->flags = 0; + ss->ins.ss = NULL; + ss->ins.co = WHITE; /* give it some value */ + ss->outs = &d->outsarea[i * d->ncolors]; + ss->inchain = &d->incarea[i * d->ncolors]; + for (i = 0; i < d->ncolors; i++) + { + ss->outs[i] = NULL; + ss->inchain[i].ss = NULL; + } + return ss; + } + + /* look for oldest, or old enough anyway */ + if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */ + ancient = cp - d->nssets * 2 / 3; + else + ancient = start; + for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++) + if ((ss->lastseen == NULL || ss->lastseen < ancient) && + !(ss->flags & LOCKED)) + { + d->search = ss + 1; + FDEBUG(("replacing c%d\n", (int) (ss - d->ssets))); + return ss; + } + for (ss = d->ssets, end = d->search; ss < end; ss++) + if ((ss->lastseen == NULL || ss->lastseen < ancient) && + !(ss->flags & LOCKED)) + { + d->search = ss + 1; + FDEBUG(("replacing c%d\n", (int) (ss - d->ssets))); + return ss; + } + + /* nobody's old enough?!? -- something's really wrong */ + FDEBUG(("cannot find victim to replace!\n")); + ERR(REG_ASSERT); + return NULL; +} diff --git a/src/backend/regex/regerror.c b/src/backend/regex/regerror.c new file mode 100644 index 0000000..4a27c25 --- /dev/null +++ b/src/backend/regex/regerror.c @@ -0,0 +1,120 @@ +/* + * regerror - error-code expansion + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. 
+ * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * src/backend/regex/regerror.c + * + */ + +#include "regex/regguts.h" + +/* unknown-error explanation */ +static const char unk[] = "*** unknown regex error code 0x%x ***"; + +/* struct to map among codes, code names, and explanations */ +static const struct rerr +{ + int code; + const char *name; + const char *explain; +} rerrs[] = + +{ + /* the actual table is built from regex.h */ +#include "regex/regerrs.h" /* pgrminclude ignore */ + { + -1, "", "oops" + }, /* explanation special-cased in code */ +}; + +/* + * pg_regerror - the interface to error numbers + */ +/* ARGSUSED */ +size_t /* actual space needed (including NUL) */ +pg_regerror(int errcode, /* error code, or REG_ATOI or REG_ITOA */ + const regex_t *preg, /* associated regex_t (unused at present) */ + char *errbuf, /* result buffer (unless errbuf_size==0) */ + size_t errbuf_size) /* available space in errbuf, can be 0 */ +{ + const struct rerr *r; + const char *msg; + char convbuf[sizeof(unk) + 50]; /* 50 = plenty for int */ + size_t len; + int icode; + + switch (errcode) + { + case REG_ATOI: /* convert name to number */ + for (r = rerrs; r->code >= 0; r++) + if (strcmp(r->name, errbuf) == 0) + break; + sprintf(convbuf, "%d", r->code); /* -1 for unknown */ + msg = convbuf; + break; + case REG_ITOA: /* convert number to name */ + icode = atoi(errbuf); /* not our problem if this fails */ + for (r = rerrs; r->code >= 0; r++) + if (r->code == icode) + break; + if (r->code >= 0) + msg = r->name; + else + { /* unknown; tell him the number */ + sprintf(convbuf, "REG_%u", (unsigned) icode); + msg = convbuf; + } + break; + default: /* a real, normal error code */ + for (r = rerrs; r->code >= 0; r++) + if (r->code == errcode) + break; + if (r->code >= 0) + msg = r->explain; + else + { /* unknown; say so */ + sprintf(convbuf, unk, errcode); + msg = convbuf; + } + break; + } + + len = strlen(msg) + 1; /* space needed, including NUL */ + if (errbuf_size > 0) + { + if (errbuf_size > len) + strcpy(errbuf, msg); 
+ else + { /* truncate to fit */ + memcpy(errbuf, msg, errbuf_size - 1); + errbuf[errbuf_size - 1] = '\0'; + } + } + + return len; +} diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c new file mode 100644 index 0000000..e72aa8c --- /dev/null +++ b/src/backend/regex/regexec.c @@ -0,0 +1,1494 @@ +/* + * re_*exec and friends - match REs + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * src/backend/regex/regexec.c + * + */ + +#include "regex/regguts.h" + + + +/* lazy-DFA representation */ +struct arcp +{ /* "pointer" to an outarc */ + struct sset *ss; + color co; +}; + +struct sset +{ /* state set */ + unsigned *states; /* pointer to bitvector */ + unsigned hash; /* hash of bitvector */ +#define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw)) +#define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \ + memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0)) + int flags; +#define STARTER 01 /* the initial state set */ +#define POSTSTATE 02 /* includes the goal state */ +#define LOCKED 04 /* locked in cache */ +#define NOPROGRESS 010 /* zero-progress state set */ + struct arcp ins; /* chain of inarcs pointing here */ + chr *lastseen; /* last entered on arrival here */ + struct sset **outs; /* outarc vector indexed by color */ + struct arcp *inchain; /* chain-pointer vector for outarcs */ +}; + +struct dfa +{ + int nssets; /* size of cache */ + int nssused; /* how many entries occupied yet */ + int nstates; /* number of states */ + int ncolors; /* length of outarc and inchain vectors */ + int wordsper; /* length of state-set bitvectors */ + struct sset *ssets; /* state-set cache */ + unsigned *statesarea; /* bitvector storage */ + unsigned *work; /* pointer to work area within statesarea */ + struct sset **outsarea; /* outarc-vector storage */ + struct arcp *incarea; /* inchain storage */ + struct cnfa *cnfa; + struct colormap *cm; + chr *lastpost; /* location of last cache-flushed success */ + chr *lastnopr; /* location of last cache-flushed NOPROGRESS */ + struct sset *search; /* replacement-search-pointer memory */ + int backno; /* if DFA for a backref, subno it refers to */ + short backmin; /* min repetitions for backref */ + short backmax; /* max repetitions for backref */ + bool ismalloced; /* should this struct dfa be freed? */ + bool arraysmalloced; /* should its subsidiary arrays be freed? 
*/ +}; + +#define WORK 1 /* number of work bitvectors needed */ + +/* setup for non-malloc allocation for small cases */ +#define FEWSTATES 20 /* must be less than UBITS */ +#define FEWCOLORS 15 +struct smalldfa +{ + struct dfa dfa; /* must be first */ + struct sset ssets[FEWSTATES * 2]; + unsigned statesarea[FEWSTATES * 2 + WORK]; + struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS]; + struct arcp incarea[FEWSTATES * 2 * FEWCOLORS]; +}; + +#define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */ + + + +/* internal variables, bundled for easy passing around */ +struct vars +{ + regex_t *re; + struct guts *g; + int eflags; /* copies of arguments */ + size_t nmatch; + regmatch_t *pmatch; + rm_detail_t *details; + chr *start; /* start of string */ + chr *search_start; /* search start of string */ + chr *stop; /* just past end of string */ + int err; /* error code if any (0 none) */ + struct dfa **subdfas; /* per-tree-subre DFAs */ + struct dfa **ladfas; /* per-lacon-subre DFAs */ + struct sset **lblastcss; /* per-lacon-subre lookbehind restart data */ + chr **lblastcp; /* per-lacon-subre lookbehind restart data */ + struct smalldfa dfa1; + struct smalldfa dfa2; +}; + +#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */ +#define ISERR() VISERR(v) +#define VERR(vv,e) ((vv)->err = ((vv)->err ? 
(vv)->err : (e))) +#define ERR(e) VERR(v, e) /* record an error */ +#define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */ +#define OFF(p) ((p) - v->start) +#define LOFF(p) ((long)OFF(p)) + + + +/* + * forward declarations + */ +/* === regexec.c === */ +static struct dfa *getsubdfa(struct vars *, struct subre *); +static struct dfa *getladfa(struct vars *, int); +static int find(struct vars *, struct cnfa *, struct colormap *); +static int cfind(struct vars *, struct cnfa *, struct colormap *); +static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **); +static void zapallsubs(regmatch_t *, size_t); +static void zaptreesubs(struct vars *, struct subre *); +static void subset(struct vars *, struct subre *, chr *, chr *); +static int cdissect(struct vars *, struct subre *, chr *, chr *); +static int ccondissect(struct vars *, struct subre *, chr *, chr *); +static int crevcondissect(struct vars *, struct subre *, chr *, chr *); +static int cbrdissect(struct vars *, struct subre *, chr *, chr *); +static int caltdissect(struct vars *, struct subre *, chr *, chr *); +static int citerdissect(struct vars *, struct subre *, chr *, chr *); +static int creviterdissect(struct vars *, struct subre *, chr *, chr *); + +/* === rege_dfa.c === */ +static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *); +static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *); +static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **); +static chr *dfa_backref(struct vars *, struct dfa *, chr *, chr *, chr *, bool); +static chr *lastcold(struct vars *, struct dfa *); +static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *); +static void freedfa(struct dfa *); +static unsigned hash(unsigned *, int); +static struct sset *initialize(struct vars *, struct dfa *, chr *); +static struct sset *miss(struct vars *, struct dfa *, 
struct sset *, color, chr *, chr *); +static int lacon(struct vars *, struct cnfa *, chr *, color); +static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *); +static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *); + + +/* + * pg_regexec - match regular expression + */ +int +pg_regexec(regex_t *re, + const chr *string, + size_t len, + size_t search_start, + rm_detail_t *details, + size_t nmatch, + regmatch_t pmatch[], + int flags) +{ + struct vars var; + register struct vars *v = &var; + int st; + size_t n; + size_t i; + int backref; + +#define LOCALMAT 20 + regmatch_t mat[LOCALMAT]; + +#define LOCALDFAS 40 + struct dfa *subdfas[LOCALDFAS]; + + /* sanity checks */ + if (re == NULL || string == NULL || re->re_magic != REMAGIC) + return REG_INVARG; + if (re->re_csize != sizeof(chr)) + return REG_MIXED; + if (search_start > len) + return REG_NOMATCH; + + /* Initialize locale-dependent support */ + pg_set_regex_collation(re->re_collation); + + /* setup */ + v->re = re; + v->g = (struct guts *) re->re_guts; + if ((v->g->cflags & REG_EXPECT) && details == NULL) + return REG_INVARG; + if (v->g->info & REG_UIMPOSSIBLE) + return REG_NOMATCH; + backref = (v->g->info & REG_UBACKREF) ? 
1 : 0; + v->eflags = flags; + if (v->g->cflags & REG_NOSUB) + nmatch = 0; /* override client */ + v->nmatch = nmatch; + if (backref) + { + /* need work area */ + if (v->g->nsub + 1 <= LOCALMAT) + v->pmatch = mat; + else + v->pmatch = (regmatch_t *) MALLOC((v->g->nsub + 1) * + sizeof(regmatch_t)); + if (v->pmatch == NULL) + return REG_ESPACE; + v->nmatch = v->g->nsub + 1; + } + else + v->pmatch = pmatch; + if (v->nmatch > 0) + zapallsubs(v->pmatch, v->nmatch); + v->details = details; + v->start = (chr *) string; + v->search_start = (chr *) string + search_start; + v->stop = (chr *) string + len; + v->err = 0; + v->subdfas = NULL; + v->ladfas = NULL; + v->lblastcss = NULL; + v->lblastcp = NULL; + /* below this point, "goto cleanup" will behave sanely */ + + assert(v->g->ntree >= 0); + n = (size_t) v->g->ntree; + if (n <= LOCALDFAS) + v->subdfas = subdfas; + else + { + v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *)); + if (v->subdfas == NULL) + { + st = REG_ESPACE; + goto cleanup; + } + } + for (i = 0; i < n; i++) + v->subdfas[i] = NULL; + + assert(v->g->nlacons >= 0); + n = (size_t) v->g->nlacons; + if (n > 0) + { + v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *)); + if (v->ladfas == NULL) + { + st = REG_ESPACE; + goto cleanup; + } + for (i = 0; i < n; i++) + v->ladfas[i] = NULL; + v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *)); + v->lblastcp = (chr **) MALLOC(n * sizeof(chr *)); + if (v->lblastcss == NULL || v->lblastcp == NULL) + { + st = REG_ESPACE; + goto cleanup; + } + for (i = 0; i < n; i++) + { + v->lblastcss[i] = NULL; + v->lblastcp[i] = NULL; + } + } + + /* do it */ + assert(v->g->tree != NULL); + if (backref) + st = cfind(v, &v->g->tree->cnfa, &v->g->cmap); + else + st = find(v, &v->g->tree->cnfa, &v->g->cmap); + + /* copy (portion of) match vector over if necessary */ + if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) + { + zapallsubs(pmatch, nmatch); + n = (nmatch < v->nmatch) ? 
nmatch : v->nmatch; + memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t)); + } + + /* clean up */ +cleanup: + if (v->pmatch != pmatch && v->pmatch != mat) + FREE(v->pmatch); + if (v->subdfas != NULL) + { + n = (size_t) v->g->ntree; + for (i = 0; i < n; i++) + { + if (v->subdfas[i] != NULL) + freedfa(v->subdfas[i]); + } + if (v->subdfas != subdfas) + FREE(v->subdfas); + } + if (v->ladfas != NULL) + { + n = (size_t) v->g->nlacons; + for (i = 0; i < n; i++) + { + if (v->ladfas[i] != NULL) + freedfa(v->ladfas[i]); + } + FREE(v->ladfas); + } + if (v->lblastcss != NULL) + FREE(v->lblastcss); + if (v->lblastcp != NULL) + FREE(v->lblastcp); + +#ifdef REG_DEBUG + if (v->eflags & (REG_FTRACE | REG_MTRACE)) + fflush(stdout); +#endif + + return st; +} + +/* + * getsubdfa - create or re-fetch the DFA for a tree subre node + * + * We only need to create the DFA once per overall regex execution. + * The DFA will be freed by the cleanup step in pg_regexec(). + */ +static struct dfa * +getsubdfa(struct vars *v, + struct subre *t) +{ + struct dfa *d = v->subdfas[t->id]; + + if (d == NULL) + { + d = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC); + if (d == NULL) + return NULL; + /* set up additional info if this is a backref node */ + if (t->op == 'b') + { + d->backno = t->backno; + d->backmin = t->min; + d->backmax = t->max; + } + v->subdfas[t->id] = d; + } + return d; +} + +/* + * getladfa - create or re-fetch the DFA for a LACON subre node + * + * Same as above, but for LACONs. 
+ */ +static struct dfa * +getladfa(struct vars *v, + int n) +{ + assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL); + + if (v->ladfas[n] == NULL) + { + struct subre *sub = &v->g->lacons[n]; + + v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC); + /* a LACON can't contain a backref, so nothing else to do */ + } + return v->ladfas[n]; +} + +/* + * find - find a match for the main NFA (no-complications case) + */ +static int +find(struct vars *v, + struct cnfa *cnfa, + struct colormap *cm) +{ + struct dfa *s; + struct dfa *d; + chr *begin; + chr *end = NULL; + chr *cold; + chr *open; /* open and close of range of possible starts */ + chr *close; + int hitend; + int shorter = (v->g->tree->flags & SHORTER) ? 1 : 0; + + /* first, a shot with the search RE */ + s = newdfa(v, &v->g->search, cm, &v->dfa1); + if (s == NULL) + return v->err; + MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); + cold = NULL; + close = shortest(v, s, v->search_start, v->search_start, v->stop, + &cold, (int *) NULL); + freedfa(s); + NOERR(); + if (v->g->cflags & REG_EXPECT) + { + assert(v->details != NULL); + if (cold != NULL) + v->details->rm_extend.rm_so = OFF(cold); + else + v->details->rm_extend.rm_so = OFF(v->stop); + v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ + } + if (close == NULL) /* not found */ + return REG_NOMATCH; + if (v->nmatch == 0) /* found, don't need exact location */ + return REG_OKAY; + + /* find starting point and match */ + assert(cold != NULL); + open = cold; + cold = NULL; + MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close))); + d = newdfa(v, cnfa, cm, &v->dfa1); + if (d == NULL) + return v->err; + for (begin = open; begin <= close; begin++) + { + MDEBUG(("\nfind trying at %ld\n", LOFF(begin))); + if (shorter) + end = shortest(v, d, begin, begin, v->stop, + (chr **) NULL, &hitend); + else + end = longest(v, d, begin, v->stop, &hitend); + if (ISERR()) + { + freedfa(d); + return v->err; + } + if (hitend && cold == NULL) + cold = 
begin; + if (end != NULL) + break; /* NOTE BREAK OUT */ + } + assert(end != NULL); /* search RE succeeded so loop should */ + freedfa(d); + + /* and pin down details */ + assert(v->nmatch > 0); + v->pmatch[0].rm_so = OFF(begin); + v->pmatch[0].rm_eo = OFF(end); + if (v->g->cflags & REG_EXPECT) + { + if (cold != NULL) + v->details->rm_extend.rm_so = OFF(cold); + else + v->details->rm_extend.rm_so = OFF(v->stop); + v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ + } + if (v->nmatch == 1) /* no need for submatches */ + return REG_OKAY; + + /* find submatches */ + return cdissect(v, v->g->tree, begin, end); +} + +/* + * cfind - find a match for the main NFA (with complications) + */ +static int +cfind(struct vars *v, + struct cnfa *cnfa, + struct colormap *cm) +{ + struct dfa *s; + struct dfa *d; + chr *cold; + int ret; + + s = newdfa(v, &v->g->search, cm, &v->dfa1); + if (s == NULL) + return v->err; + d = newdfa(v, cnfa, cm, &v->dfa2); + if (d == NULL) + { + freedfa(s); + return v->err; + } + + ret = cfindloop(v, cnfa, cm, d, s, &cold); + + freedfa(d); + freedfa(s); + NOERR(); + if (v->g->cflags & REG_EXPECT) + { + assert(v->details != NULL); + if (cold != NULL) + v->details->rm_extend.rm_so = OFF(cold); + else + v->details->rm_extend.rm_so = OFF(v->stop); + v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ + } + return ret; +} + +/* + * cfindloop - the heart of cfind + */ +static int +cfindloop(struct vars *v, + struct cnfa *cnfa, + struct colormap *cm, + struct dfa *d, + struct dfa *s, + chr **coldp) /* where to put coldstart pointer */ +{ + chr *begin; + chr *end; + chr *cold; + chr *open; /* open and close of range of possible starts */ + chr *close; + chr *estart; + chr *estop; + int er; + int shorter = v->g->tree->flags & SHORTER; + int hitend; + + assert(d != NULL && s != NULL); + cold = NULL; + close = v->search_start; + do + { + /* Search with the search RE for match range at/beyond "close" */ + MDEBUG(("\ncsearch at %ld\n", LOFF(close))); + 
close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL); + if (ISERR()) + { + *coldp = cold; + return v->err; + } + if (close == NULL) + break; /* no more possible match anywhere */ + assert(cold != NULL); + open = cold; + cold = NULL; + /* Search for matches starting between "open" and "close" inclusive */ + MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close))); + for (begin = open; begin <= close; begin++) + { + MDEBUG(("\ncfind trying at %ld\n", LOFF(begin))); + estart = begin; + estop = v->stop; + for (;;) + { + /* Here we use the top node's detailed RE */ + if (shorter) + end = shortest(v, d, begin, estart, + estop, (chr **) NULL, &hitend); + else + end = longest(v, d, begin, estop, + &hitend); + if (ISERR()) + { + *coldp = cold; + return v->err; + } + if (hitend && cold == NULL) + cold = begin; + if (end == NULL) + break; /* no match with this begin point, try next */ + MDEBUG(("tentative end %ld\n", LOFF(end))); + /* Dissect the potential match to see if it really matches */ + er = cdissect(v, v->g->tree, begin, end); + if (er == REG_OKAY) + { + if (v->nmatch > 0) + { + v->pmatch[0].rm_so = OFF(begin); + v->pmatch[0].rm_eo = OFF(end); + } + *coldp = cold; + return REG_OKAY; + } + if (er != REG_NOMATCH) + { + ERR(er); + *coldp = cold; + return er; + } + /* Try next longer/shorter match with same begin point */ + if (shorter) + { + if (end == estop) + break; /* no more, so try next begin point */ + estart = end + 1; + } + else + { + if (end == begin) + break; /* no more, so try next begin point */ + estop = end - 1; + } + } /* end loop over endpoint positions */ + } /* end loop over beginning positions */ + + /* + * If we get here, there is no possible match starting at or before + * "close", so consider matches beyond that. We'll do a fresh search + * with the search RE to find a new promising match range. 
+ */ + close++; + } while (close < v->stop); + + *coldp = cold; + return REG_NOMATCH; +} + +/* + * zapallsubs - initialize all subexpression matches to "no match" + * + * Note that p[0], the overall-match location, is not touched. + */ +static void +zapallsubs(regmatch_t *p, + size_t n) +{ + size_t i; + + for (i = n - 1; i > 0; i--) + { + p[i].rm_so = -1; + p[i].rm_eo = -1; + } +} + +/* + * zaptreesubs - initialize subexpressions within subtree to "no match" + */ +static void +zaptreesubs(struct vars *v, + struct subre *t) +{ + int n = t->capno; + struct subre *t2; + + if (n > 0) + { + if ((size_t) n < v->nmatch) + { + v->pmatch[n].rm_so = -1; + v->pmatch[n].rm_eo = -1; + } + } + + for (t2 = t->child; t2 != NULL; t2 = t2->sibling) + zaptreesubs(v, t2); +} + +/* + * subset - set subexpression match data for a successful subre + */ +static void +subset(struct vars *v, + struct subre *sub, + chr *begin, + chr *end) +{ + int n = sub->capno; + + assert(n > 0); + if ((size_t) n >= v->nmatch) + return; + + MDEBUG(("%d: setting %d = %ld-%ld\n", sub->id, n, LOFF(begin), LOFF(end))); + v->pmatch[n].rm_so = OFF(begin); + v->pmatch[n].rm_eo = OFF(end); +} + +/* + * cdissect - check backrefs and determine subexpression matches + * + * cdissect recursively processes a subre tree to check matching of backrefs + * and/or identify submatch boundaries for capture nodes. The proposed match + * runs from "begin" to "end" (not including "end"), and we are basically + * "dissecting" it to see where the submatches are. + * + * Before calling any level of cdissect, the caller must have run the node's + * DFA and found that the proposed substring satisfies the DFA. (We make + * the caller do that because in concatenation and iteration nodes, it's + * much faster to check all the substrings against the child DFAs before we + * recurse.) + * + * A side-effect of a successful match is to save match locations for + * capturing subexpressions in v->pmatch[]. 
This is a little bit tricky, + * so we make the following rules: + * 1. Before initial entry to cdissect, all match data must have been + * cleared (this is seen to by zapallsubs). + * 2. Before any recursive entry to cdissect, the match data for that + * subexpression tree must be guaranteed clear (see zaptreesubs). + * 3. When returning REG_OKAY, each level of cdissect will have saved + * any relevant match locations. + * 4. When returning REG_NOMATCH, each level of cdissect will guarantee + * that its subexpression match locations are again clear. + * 5. No guarantees are made for error cases (i.e., other result codes). + * 6. When a level of cdissect abandons a successful sub-match, it will + * clear that subtree's match locations with zaptreesubs before trying + * any new DFA match or cdissect call for that subtree or any subtree + * to its right (that is, any subtree that could have a backref into the + * abandoned match). + * This may seem overly complicated, but it's difficult to simplify it + * because of the provision that match locations must be reset before + * any fresh DFA match (a rule that is needed to make dfa_backref safe). + * That means it won't work to just reset relevant match locations at the + * start of each cdissect level. + */ +static int /* regexec return code */ +cdissect(struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + int er; + + assert(t != NULL); + MDEBUG(("%d: cdissect %c %ld-%ld\n", t->id, t->op, LOFF(begin), LOFF(end))); + + /* handy place to check for operation cancel */ + if (CANCEL_REQUESTED(v->re)) + return REG_CANCEL; + /* ... 
and stack overrun */ + if (STACK_TOO_DEEP(v->re)) + return REG_ETOOBIG; + + switch (t->op) + { + case '=': /* terminal node */ + assert(t->child == NULL); + er = REG_OKAY; /* no action, parent did the work */ + break; + case 'b': /* back reference */ + assert(t->child == NULL); + er = cbrdissect(v, t, begin, end); + break; + case '.': /* concatenation */ + assert(t->child != NULL); + if (t->child->flags & SHORTER) /* reverse scan */ + er = crevcondissect(v, t, begin, end); + else + er = ccondissect(v, t, begin, end); + break; + case '|': /* alternation */ + assert(t->child != NULL); + er = caltdissect(v, t, begin, end); + break; + case '*': /* iteration */ + assert(t->child != NULL); + if (t->child->flags & SHORTER) /* reverse scan */ + er = creviterdissect(v, t, begin, end); + else + er = citerdissect(v, t, begin, end); + break; + case '(': /* no-op capture node */ + assert(t->child != NULL); + assert(t->capno > 0); + er = cdissect(v, t->child, begin, end); + break; + default: + er = REG_ASSERT; + break; + } + + /* + * We should never have a match failure unless backrefs lurk below; + * otherwise, either caller failed to check the DFA, or there's some + * inconsistency between the DFA and the node's innards. + */ + assert(er != REG_NOMATCH || (t->flags & BACKR)); + + /* + * If this node is marked as capturing, save successful match's location. 
+ */ + if (t->capno > 0 && er == REG_OKAY) + subset(v, t, begin, end); + + return er; +} + +/* + * ccondissect - dissect match for concatenation node + */ +static int /* regexec return code */ +ccondissect(struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct subre *left = t->child; + struct subre *right = left->sibling; + struct dfa *d; + struct dfa *d2; + chr *mid; + int er; + + assert(t->op == '.'); + assert(left != NULL && left->cnfa.nstates > 0); + assert(right != NULL && right->cnfa.nstates > 0); + assert(right->sibling == NULL); + assert(!(left->flags & SHORTER)); + + d = getsubdfa(v, left); + NOERR(); + d2 = getsubdfa(v, right); + NOERR(); + MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end))); + + /* pick a tentative midpoint */ + mid = longest(v, d, begin, end, (int *) NULL); + NOERR(); + if (mid == NULL) + return REG_NOMATCH; + MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid))); + + /* iterate until satisfaction or failure */ + for (;;) + { + /* try this midpoint on for size */ + if (longest(v, d2, mid, end, (int *) NULL) == end) + { + er = cdissect(v, left, begin, mid); + if (er == REG_OKAY) + { + er = cdissect(v, right, mid, end); + if (er == REG_OKAY) + { + /* satisfaction */ + MDEBUG(("%d: successful\n", t->id)); + return REG_OKAY; + } + /* Reset left's matches (right should have done so itself) */ + zaptreesubs(v, left); + } + if (er != REG_NOMATCH) + return er; + } + NOERR(); + + /* that midpoint didn't work, find a new one */ + if (mid == begin) + { + /* all possibilities exhausted */ + MDEBUG(("%d: no midpoint\n", t->id)); + return REG_NOMATCH; + } + mid = longest(v, d, begin, mid - 1, (int *) NULL); + NOERR(); + if (mid == NULL) + { + /* failed to find a new one */ + MDEBUG(("%d: failed midpoint\n", t->id)); + return REG_NOMATCH; + } + MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); + } + + /* can't get here */ + return REG_ASSERT; +} + +/* + 
* crevcondissect - dissect match for concatenation node, shortest-first + */ +static int /* regexec return code */ +crevcondissect(struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct subre *left = t->child; + struct subre *right = left->sibling; + struct dfa *d; + struct dfa *d2; + chr *mid; + int er; + + assert(t->op == '.'); + assert(left != NULL && left->cnfa.nstates > 0); + assert(right != NULL && right->cnfa.nstates > 0); + assert(right->sibling == NULL); + assert(left->flags & SHORTER); + + d = getsubdfa(v, left); + NOERR(); + d2 = getsubdfa(v, right); + NOERR(); + MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end))); + + /* pick a tentative midpoint */ + mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL); + NOERR(); + if (mid == NULL) + return REG_NOMATCH; + MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid))); + + /* iterate until satisfaction or failure */ + for (;;) + { + /* try this midpoint on for size */ + if (longest(v, d2, mid, end, (int *) NULL) == end) + { + er = cdissect(v, left, begin, mid); + if (er == REG_OKAY) + { + er = cdissect(v, right, mid, end); + if (er == REG_OKAY) + { + /* satisfaction */ + MDEBUG(("%d: successful\n", t->id)); + return REG_OKAY; + } + /* Reset left's matches (right should have done so itself) */ + zaptreesubs(v, left); + } + if (er != REG_NOMATCH) + return er; + } + NOERR(); + + /* that midpoint didn't work, find a new one */ + if (mid == end) + { + /* all possibilities exhausted */ + MDEBUG(("%d: no midpoint\n", t->id)); + return REG_NOMATCH; + } + mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL); + NOERR(); + if (mid == NULL) + { + /* failed to find a new one */ + MDEBUG(("%d: failed midpoint\n", t->id)); + return REG_NOMATCH; + } + MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); + } + + /* can't get here */ + return REG_ASSERT; +} + +/* + * cbrdissect - dissect match for 
backref node + * + * The backref match might already have been verified by dfa_backref(), + * but we don't know that for sure so must check it here. + */ +static int /* regexec return code */ +cbrdissect(struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + int n = t->backno; + size_t numreps; + size_t tlen; + size_t brlen; + chr *brstring; + chr *p; + int min = t->min; + int max = t->max; + + assert(t != NULL); + assert(t->op == 'b'); + assert(n >= 0); + assert((size_t) n < v->nmatch); + + MDEBUG(("%d: cbrdissect %d{%d-%d} %ld-%ld\n", t->id, n, min, max, + LOFF(begin), LOFF(end))); + + /* get the backreferenced string */ + if (v->pmatch[n].rm_so == -1) + return REG_NOMATCH; + brstring = v->start + v->pmatch[n].rm_so; + brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; + + /* special cases for zero-length strings */ + if (brlen == 0) + { + /* + * matches only if target is zero length, but any number of + * repetitions can be considered to be present + */ + if (begin == end && min <= max) + { + MDEBUG(("%d: backref matched trivially\n", t->id)); + return REG_OKAY; + } + return REG_NOMATCH; + } + if (begin == end) + { + /* matches only if zero repetitions are okay */ + if (min == 0) + { + MDEBUG(("%d: backref matched trivially\n", t->id)); + return REG_OKAY; + } + return REG_NOMATCH; + } + + /* + * check target length to see if it could possibly be an allowed number of + * repetitions of brstring + */ + assert(end > begin); + tlen = end - begin; + if (tlen % brlen != 0) + return REG_NOMATCH; + numreps = tlen / brlen; + if (numreps < min || (numreps > max && max != DUPINF)) + return REG_NOMATCH; + + /* okay, compare the actual string contents */ + p = begin; + while (numreps-- > 0) + { + if ((*v->g->compare) (brstring, p, brlen) != 0) + return REG_NOMATCH; + p += brlen; + } + + MDEBUG(("%d: backref matched\n", t->id)); + return REG_OKAY; +} + +/* + * caltdissect - dissect match for alternation node + 
*/ +static int /* regexec return code */ +caltdissect(struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + int er; + + assert(t->op == '|'); + + t = t->child; + /* there should be at least 2 alternatives */ + assert(t != NULL && t->sibling != NULL); + + while (t != NULL) + { + assert(t->cnfa.nstates > 0); + + MDEBUG(("%d: caltdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end))); + + d = getsubdfa(v, t); + NOERR(); + if (longest(v, d, begin, end, (int *) NULL) == end) + { + MDEBUG(("%d: caltdissect matched\n", t->id)); + er = cdissect(v, t, begin, end); + if (er != REG_NOMATCH) + return er; + } + NOERR(); + + t = t->sibling; + } + + return REG_NOMATCH; +} + +/* + * citerdissect - dissect match for iteration node + */ +static int /* regexec return code */ +citerdissect(struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->child != NULL && t->child->cnfa.nstates > 0); + assert(!(t->child->flags & SHORTER)); + assert(begin <= end); + + MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end))); + + /* + * For the moment, assume the minimum number of matches is 1. If zero + * matches are allowed, and the target string is empty, we are allowed to + * match regardless of the contents of the iter node --- but we would + * prefer to match once, so that capturing parens get set. (An example of + * the concern here is a pattern like "()*\1", which historically this + * code has allowed to succeed.) Therefore, we deal with the zero-matches + * case at the bottom, after failing to find any other way to match. + */ + min_matches = t->min; + if (min_matches <= 0) + min_matches = 1; + + /* + * We need workspace to track the endpoints of each sub-match. 
Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. + */ + max_matches = end - begin; + if (max_matches > t->max && t->max != DUPINF) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = getsubdfa(v, t->child); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack that sub-match and try again. And, when we next try for a + * validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. 
+ */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = end; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + if (endpts[k] == NULL) + { + /* no match possible, so see if we can shorten previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->id, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + /* must try to shorten some previous match */ + k--; + goto backtrack; + } + + /* reject zero-length match unless necessary to achieve min */ + if (endpts[k] == endpts[k - 1] && + (k >= min_matches || min_matches - k < end - endpts[k])) + goto backtrack; + + k++; + limit = end; + continue; + } + + /* + * We've identified a way to divide the string into k sub-matches that + * works so far as the child DFA can tell. If k is an allowed number + * of matches, start the slow part: recurse to verify each sub-match. + * We always have k <= max_matches, needn't check that. 
+ */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + /* zap any match data from a non-last iteration */ + zaptreesubs(v, t->child); + er = cdissect(v, t->child, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d: successful\n", t->id)); + FREE(endpts); + return REG_OKAY; + } + + /* i'th match failed to verify, so backtrack it */ + k = i; + +backtrack: + + /* + * Must consider shorter versions of the k'th sub-match. However, + * we'll only ask for a zero-length match if necessary. + */ + while (k > 0) + { + chr *prev_end = endpts[k - 1]; + + if (endpts[k] > prev_end) + { + limit = endpts[k] - 1; + if (limit > prev_end || + (k < min_matches && min_matches - k >= end - prev_end)) + { + /* break out of backtrack loop, continue the outer one */ + break; + } + } + /* can't shorten k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted */ + FREE(endpts); + + /* + * Now consider the possibility that we can match to a zero-length string + * by using zero repetitions. 
+ */ + if (t->min == 0 && begin == end) + { + MDEBUG(("%d: allowing zero matches\n", t->id)); + return REG_OKAY; + } + + MDEBUG(("%d: failed\n", t->id)); + return REG_NOMATCH; +} + +/* + * creviterdissect - dissect match for iteration node, shortest-first + */ +static int /* regexec return code */ +creviterdissect(struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->child != NULL && t->child->cnfa.nstates > 0); + assert(t->child->flags & SHORTER); + assert(begin <= end); + + MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end))); + + /* + * If zero matches are allowed, and target string is empty, just declare + * victory. OTOH, if target string isn't empty, zero matches can't work + * so we pretend the min is 1. + */ + min_matches = t->min; + if (min_matches <= 0) + { + if (begin == end) + { + MDEBUG(("%d: allowing zero matches\n", t->id)); + return REG_OKAY; + } + min_matches = 1; + } + + /* + * We need workspace to track the endpoints of each sub-match. Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. 
+ */ + max_matches = end - begin; + if (max_matches > t->max && t->max != DUPINF) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = getsubdfa(v, t->child); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack that sub-match and try again. And, when we next try for a + * validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. + */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = begin; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* disallow zero-length match unless necessary to achieve min */ + if (limit == endpts[k - 1] && + limit != end && + (k >= min_matches || min_matches - k < end - limit)) + limit++; + + /* if this is the last allowed sub-match, it must reach to the end */ + if (k >= max_matches) + limit = end; + + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = shortest(v, d, endpts[k - 1], limit, end, + (chr **) NULL, (int *) NULL); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + if (endpts[k] == NULL) + { + /* no match possible, so see if we can lengthen previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->id, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + /* must try to lengthen some previous 
match */ + k--; + goto backtrack; + } + + k++; + limit = endpts[k - 1]; + continue; + } + + /* + * We've identified a way to divide the string into k sub-matches that + * works so far as the child DFA can tell. If k is an allowed number + * of matches, start the slow part: recurse to verify each sub-match. + * We always have k <= max_matches, needn't check that. + */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + /* zap any match data from a non-last iteration */ + zaptreesubs(v, t->child); + er = cdissect(v, t->child, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d: successful\n", t->id)); + FREE(endpts); + return REG_OKAY; + } + + /* i'th match failed to verify, so backtrack it */ + k = i; + +backtrack: + + /* + * Must consider longer versions of the k'th sub-match. 
+ */ + while (k > 0) + { + if (endpts[k] < end) + { + limit = endpts[k] + 1; + /* break out of backtrack loop, continue the outer one */ + break; + } + /* can't lengthen k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted */ + MDEBUG(("%d: failed\n", t->id)); + FREE(endpts); + return REG_NOMATCH; +} + + + +#include "rege_dfa.c" diff --git a/src/backend/regex/regexport.c b/src/backend/regex/regexport.c new file mode 100644 index 0000000..a493dbe --- /dev/null +++ b/src/backend/regex/regexport.c @@ -0,0 +1,293 @@ +/*------------------------------------------------------------------------- + * + * regexport.c + * Functions for exporting info about a regex's NFA + * + * In this implementation, the NFA defines a necessary but not sufficient + * condition for a string to match the regex: that is, there can be strings + * that match the NFA but don't match the full regex, but not vice versa. + * Thus, for example, it is okay for the functions below to treat lookaround + * constraints as no-ops, since they merely constrain the string some more. + * + * Notice that these functions return info into caller-provided arrays + * rather than doing their own malloc's. This simplifies the APIs by + * eliminating a class of error conditions, and in the case of colors + * allows the caller to decide how big is too big to bother with. + * + * + * Portions Copyright (c) 2013-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1998, 1999 Henry Spencer + * + * IDENTIFICATION + * src/backend/regex/regexport.c + * + *------------------------------------------------------------------------- + */ + +#include "regex/regguts.h" + +#include "regex/regexport.h" + + +/* + * Get total number of NFA states. 
+ */ +int +pg_reg_getnumstates(const regex_t *regex) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + return cnfa->nstates; +} + +/* + * Get initial state of NFA. + */ +int +pg_reg_getinitialstate(const regex_t *regex) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + return cnfa->pre; +} + +/* + * Get final state of NFA. + */ +int +pg_reg_getfinalstate(const regex_t *regex) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + return cnfa->post; +} + +/* + * pg_reg_getnumoutarcs() and pg_reg_getoutarcs() mask the existence of LACON + * arcs from the caller, treating any LACON as being automatically satisfied. + * Since the output representation does not support arcs that consume no + * character when traversed, we have to recursively traverse LACON arcs here, + * and report whatever normal arcs are reachable by traversing LACON arcs. + * Note that this wouldn't work if it were possible to reach the final state + * via LACON traversal, but the regex library never builds NFAs that have + * LACON arcs leading directly to the final state. (This is because the + * regex executor is designed to consume one character beyond the nominal + * match end --- possibly an EOS indicator --- so there is always a set of + * ordinary arcs leading to the final state.) + * + * traverse_lacons is a recursive subroutine used by both exported functions + * to count and then emit the reachable regular arcs. *arcs_count is + * incremented by the number of reachable arcs, and as many as will fit in + * arcs_len (possibly 0) are emitted into arcs[]. 
+ */ +static void +traverse_lacons(struct cnfa *cnfa, int st, + int *arcs_count, + regex_arc_t *arcs, int arcs_len) +{ + struct carc *ca; + + /* + * Since this function recurses, it could theoretically be driven to stack + * overflow. In practice, this is mostly useful to backstop against a + * failure of the regex compiler to remove a loop of LACON arcs. + */ + check_stack_depth(); + + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co < cnfa->ncolors) + { + /* Ordinary arc, so count and possibly emit it */ + int ndx = (*arcs_count)++; + + if (ndx < arcs_len) + { + arcs[ndx].co = ca->co; + arcs[ndx].to = ca->to; + } + } + else + { + /* LACON arc --- assume it's satisfied and recurse... */ + /* ... but first, assert it doesn't lead directly to post state */ + Assert(ca->to != cnfa->post); + + traverse_lacons(cnfa, ca->to, arcs_count, arcs, arcs_len); + } + } +} + +/* + * Get number of outgoing NFA arcs of state number "st". + */ +int +pg_reg_getnumoutarcs(const regex_t *regex, int st) +{ + struct cnfa *cnfa; + int arcs_count; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (st < 0 || st >= cnfa->nstates) + return 0; + arcs_count = 0; + traverse_lacons(cnfa, st, &arcs_count, NULL, 0); + return arcs_count; +} + +/* + * Write array of outgoing NFA arcs of state number "st" into arcs[], + * whose length arcs_len must be at least as long as indicated by + * pg_reg_getnumoutarcs(), else not all arcs will be returned. + */ +void +pg_reg_getoutarcs(const regex_t *regex, int st, + regex_arc_t *arcs, int arcs_len) +{ + struct cnfa *cnfa; + int arcs_count; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (st < 0 || st >= cnfa->nstates || arcs_len <= 0) + return; + arcs_count = 0; + traverse_lacons(cnfa, st, &arcs_count, arcs, arcs_len); +} + +/* + * Get total number of colors. 
+ */ +int +pg_reg_getnumcolors(const regex_t *regex) +{ + struct colormap *cm; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cm = &((struct guts *) regex->re_guts)->cmap; + + return cm->max + 1; +} + +/* + * Check if color is beginning of line/string. + * + * (We might at some point need to offer more refined handling of pseudocolors, + * but this will do for now.) + */ +int +pg_reg_colorisbegin(const regex_t *regex, int co) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (co == cnfa->bos[0] || co == cnfa->bos[1]) + return true; + else + return false; +} + +/* + * Check if color is end of line/string. + */ +int +pg_reg_colorisend(const regex_t *regex, int co) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (co == cnfa->eos[0] || co == cnfa->eos[1]) + return true; + else + return false; +} + +/* + * Get number of member chrs of color number "co". + * + * Note: we return -1 if the color number is invalid, or if it is a special + * color (WHITE, RAINBOW, or a pseudocolor), or if the number of members is + * uncertain. + * Callers should not try to extract the members if -1 is returned. + */ +int +pg_reg_getnumcharacters(const regex_t *regex, int co) +{ + struct colormap *cm; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cm = &((struct guts *) regex->re_guts)->cmap; + + if (co <= 0 || co > cm->max) /* <= 0 rejects WHITE and RAINBOW */ + return -1; + if (cm->cd[co].flags & PSEUDO) /* also pseudocolors (BOS etc) */ + return -1; + + /* + * If the color appears anywhere in the high colormap, treat its number of + * members as uncertain. In principle we could determine all the specific + * chrs corresponding to each such entry, but it would be expensive + * (particularly if character class tests are required) and it doesn't + * seem worth it. 
+ */ + if (cm->cd[co].nuchrs != 0) + return -1; + + /* OK, return the known number of member chrs */ + return cm->cd[co].nschrs; +} + +/* + * Write array of member chrs of color number "co" into chars[], + * whose length chars_len must be at least as long as indicated by + * pg_reg_getnumcharacters(), else not all chars will be returned. + * + * Fetching the members of WHITE, RAINBOW, or a pseudocolor is not supported. + * + * Caution: this is a relatively expensive operation. + */ +void +pg_reg_getcharacters(const regex_t *regex, int co, + pg_wchar *chars, int chars_len) +{ + struct colormap *cm; + chr c; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cm = &((struct guts *) regex->re_guts)->cmap; + + if (co <= 0 || co > cm->max || chars_len <= 0) + return; + if (cm->cd[co].flags & PSEUDO) + return; + + /* + * We need only examine the low character map; there should not be any + * matching entries in the high map. + */ + for (c = CHR_MIN; c <= MAX_SIMPLE_CHR; c++) + { + if (cm->locolormap[c - CHR_MIN] == co) + { + *chars++ = c; + if (--chars_len == 0) + break; + } + } +} diff --git a/src/backend/regex/regfree.c b/src/backend/regex/regfree.c new file mode 100644 index 0000000..ae17ae7 --- /dev/null +++ b/src/backend/regex/regfree.c @@ -0,0 +1,54 @@ +/* + * regfree - free an RE + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. 
+ * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * src/backend/regex/regfree.c + * + * + * You might think that this could be incorporated into regcomp.c, and + * that would be a reasonable idea... except that this is a generic + * function (with a generic name), applicable to all compiled REs + * regardless of the size of their characters, whereas the stuff in + * regcomp.c gets compiled once per character size. + */ + +#include "regex/regguts.h" + + +/* + * pg_regfree - free an RE (generic function, punts to RE-specific function) + * + * Ignoring invocation with NULL is a convenience. + */ +void +pg_regfree(regex_t *re) +{ + if (re == NULL) + return; + (*((struct fns *) re->re_fns)->free) (re); +} diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c new file mode 100644 index 0000000..ec435b6 --- /dev/null +++ b/src/backend/regex/regprefix.c @@ -0,0 +1,268 @@ +/*------------------------------------------------------------------------- + * + * regprefix.c + * Extract a common prefix, if any, from a compiled regex. 
+ * + * + * Portions Copyright (c) 2012-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1998, 1999 Henry Spencer + * + * IDENTIFICATION + * src/backend/regex/regprefix.c + * + *------------------------------------------------------------------------- + */ + +#include "regex/regguts.h" + + +/* + * forward declarations + */ +static int findprefix(struct cnfa *cnfa, struct colormap *cm, + chr *string, size_t *slength); + + +/* + * pg_regprefix - get common prefix for regular expression + * + * Returns one of: + * REG_NOMATCH: there is no common prefix of strings matching the regex + * REG_PREFIX: there is a common prefix of strings matching the regex + * REG_EXACT: all strings satisfying the regex must match the same string + * or a REG_XXX error code + * + * In the non-failure cases, *string is set to a malloc'd string containing + * the common prefix or exact value, of length *slength (measured in chrs + * not bytes!). + * + * This function does not analyze all complex cases (such as lookaround + * constraints) exactly. Therefore it is possible that some strings matching + * the reported prefix or exact-match string do not satisfy the regex. But + * it should never be the case that a string satisfying the regex does not + * match the reported prefix or exact-match string. 
+ */ +int +pg_regprefix(regex_t *re, + chr **string, + size_t *slength) +{ + struct guts *g; + struct cnfa *cnfa; + int st; + + /* sanity checks */ + if (string == NULL || slength == NULL) + return REG_INVARG; + *string = NULL; /* initialize for failure cases */ + *slength = 0; + if (re == NULL || re->re_magic != REMAGIC) + return REG_INVARG; + if (re->re_csize != sizeof(chr)) + return REG_MIXED; + + /* Initialize locale-dependent support */ + pg_set_regex_collation(re->re_collation); + + /* setup */ + g = (struct guts *) re->re_guts; + if (g->info & REG_UIMPOSSIBLE) + return REG_NOMATCH; + + /* + * This implementation considers only the search NFA for the topmost regex + * tree node. Therefore, constraints such as backrefs are not fully + * applied, which is allowed per the function's API spec. + */ + assert(g->tree != NULL); + cnfa = &g->tree->cnfa; + + /* matchall NFAs never have a fixed prefix */ + if (cnfa->flags & MATCHALL) + return REG_NOMATCH; + + /* + * Since a correct NFA should never contain any exit-free loops, it should + * not be possible for our traversal to return to a previously visited NFA + * state. Hence we need at most nstates chrs in the output string. + */ + *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr)); + if (*string == NULL) + return REG_ESPACE; + + /* do it */ + st = findprefix(cnfa, &g->cmap, *string, slength); + + assert(*slength <= cnfa->nstates); + + /* clean up */ + if (st != REG_PREFIX && st != REG_EXACT) + { + FREE(*string); + *string = NULL; + *slength = 0; + } + + return st; +} + +/* + * findprefix - extract common prefix from cNFA + * + * Results are returned into the preallocated chr array string[], with + * *slength (which must be preset to zero) incremented for each chr. 
+ */ +static int /* regprefix return code */ +findprefix(struct cnfa *cnfa, + struct colormap *cm, + chr *string, + size_t *slength) +{ + int st; + int nextst; + color thiscolor; + chr c; + struct carc *ca; + + /* + * The "pre" state must have only BOS/BOL outarcs, else pattern isn't + * anchored left. If we have both BOS and BOL, they must go to the same + * next state. + */ + st = cnfa->pre; + nextst = -1; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) + { + if (nextst == -1) + nextst = ca->to; + else if (nextst != ca->to) + return REG_NOMATCH; + } + else + return REG_NOMATCH; + } + if (nextst == -1) + return REG_NOMATCH; + + /* + * Scan through successive states, stopping as soon as we find one with + * more than one acceptable transition character (either multiple colors + * on out-arcs, or a color with more than one member chr). + * + * We could find a state with multiple out-arcs that are all labeled with + * the same singleton color; this comes from patterns like "^ab(cde|cxy)". + * In that case we add the chr "c" to the output string but then exit the + * loop with nextst == -1. This leaves a little bit on the table: if the + * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added + * to the prefix. But chasing multiple parallel state chains doesn't seem + * worth the trouble. + */ + do + { + st = nextst; + nextst = -1; + thiscolor = COLORLESS; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + /* We can ignore BOS/BOL arcs */ + if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) + continue; + + /* + * ... 
but EOS/EOL arcs terminate the search, as do RAINBOW arcs + * and LACONs + */ + if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] || + ca->co == RAINBOW || ca->co >= cnfa->ncolors) + { + thiscolor = COLORLESS; + break; + } + if (thiscolor == COLORLESS) + { + /* First plain outarc */ + thiscolor = ca->co; + nextst = ca->to; + } + else if (thiscolor == ca->co) + { + /* Another plain outarc for same color */ + nextst = -1; + } + else + { + /* More than one plain outarc color terminates the search */ + thiscolor = COLORLESS; + break; + } + } + /* Done if we didn't find exactly one color on plain outarcs */ + if (thiscolor == COLORLESS) + break; + /* The color must be a singleton */ + if (cm->cd[thiscolor].nschrs != 1) + break; + /* Must not have any high-color-map entries */ + if (cm->cd[thiscolor].nuchrs != 0) + break; + + /* + * Identify the color's sole member chr and add it to the prefix + * string. In general the colormap data structure doesn't provide a + * way to find color member chrs, except by trying GETCOLOR() on each + * possible chr value, which won't do at all. However, for the cases + * we care about it should be sufficient to test the "firstchr" value, + * that is the first chr ever added to the color. There are cases + * where this might no longer be a member of the color (so we do need + * to test), but none of them are likely to arise for a character that + * is a member of a common prefix. If we do hit such a corner case, + * we just fall out without adding anything to the prefix string. + */ + c = cm->cd[thiscolor].firstchr; + if (GETCOLOR(cm, c) != thiscolor) + break; + + string[(*slength)++] = c; + + /* Advance to next state, but only if we have a unique next state */ + } while (nextst != -1); + + /* + * If we ended at a state that only has EOS/EOL outarcs leading to the + * "post" state, then we have an exact-match string. Note this is true + * even if the string is of zero length. 
+ */ + nextst = -1; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) + { + if (nextst == -1) + nextst = ca->to; + else if (nextst != ca->to) + { + nextst = -1; + break; + } + } + else + { + nextst = -1; + break; + } + } + if (nextst == cnfa->post) + return REG_EXACT; + + /* + * Otherwise, if we were unable to identify any prefix characters, say + * NOMATCH --- the pattern is anchored left, but doesn't specify any + * particular first character. + */ + if (*slength > 0) + return REG_PREFIX; + + return REG_NOMATCH; +} |