diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
commit | 46651ce6fe013220ed397add242004d764fc0153 (patch) | |
tree | 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/regex/regexport.c | |
parent | Initial commit. (diff) | |
download | postgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip |
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/regex/regexport.c')
-rw-r--r-- | src/backend/regex/regexport.c | 293 |
1 files changed, 293 insertions, 0 deletions
diff --git a/src/backend/regex/regexport.c b/src/backend/regex/regexport.c new file mode 100644 index 0000000..a493dbe --- /dev/null +++ b/src/backend/regex/regexport.c @@ -0,0 +1,293 @@ +/*------------------------------------------------------------------------- + * + * regexport.c + * Functions for exporting info about a regex's NFA + * + * In this implementation, the NFA defines a necessary but not sufficient + * condition for a string to match the regex: that is, there can be strings + * that match the NFA but don't match the full regex, but not vice versa. + * Thus, for example, it is okay for the functions below to treat lookaround + * constraints as no-ops, since they merely constrain the string some more. + * + * Notice that these functions return info into caller-provided arrays + * rather than doing their own malloc's. This simplifies the APIs by + * eliminating a class of error conditions, and in the case of colors + * allows the caller to decide how big is too big to bother with. + * + * + * Portions Copyright (c) 2013-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1998, 1999 Henry Spencer + * + * IDENTIFICATION + * src/backend/regex/regexport.c + * + *------------------------------------------------------------------------- + */ + +#include "regex/regguts.h" + +#include "regex/regexport.h" + + +/* + * Get total number of NFA states. + */ +int +pg_reg_getnumstates(const regex_t *regex) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + return cnfa->nstates; +} + +/* + * Get initial state of NFA. + */ +int +pg_reg_getinitialstate(const regex_t *regex) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + return cnfa->pre; +} + +/* + * Get final state of NFA. + */ +int +pg_reg_getfinalstate(const regex_t *regex) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + return cnfa->post; +} + +/* + * pg_reg_getnumoutarcs() and pg_reg_getoutarcs() mask the existence of LACON + * arcs from the caller, treating any LACON as being automatically satisfied. + * Since the output representation does not support arcs that consume no + * character when traversed, we have to recursively traverse LACON arcs here, + * and report whatever normal arcs are reachable by traversing LACON arcs. + * Note that this wouldn't work if it were possible to reach the final state + * via LACON traversal, but the regex library never builds NFAs that have + * LACON arcs leading directly to the final state. (This is because the + * regex executor is designed to consume one character beyond the nominal + * match end --- possibly an EOS indicator --- so there is always a set of + * ordinary arcs leading to the final state.) + * + * traverse_lacons is a recursive subroutine used by both exported functions + * to count and then emit the reachable regular arcs. *arcs_count is + * incremented by the number of reachable arcs, and as many as will fit in + * arcs_len (possibly 0) are emitted into arcs[]. + */ +static void +traverse_lacons(struct cnfa *cnfa, int st, + int *arcs_count, + regex_arc_t *arcs, int arcs_len) +{ + struct carc *ca; + + /* + * Since this function recurses, it could theoretically be driven to stack + * overflow. In practice, this is mostly useful to backstop against a + * failure of the regex compiler to remove a loop of LACON arcs. + */ + check_stack_depth(); + + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co < cnfa->ncolors) + { + /* Ordinary arc, so count and possibly emit it */ + int ndx = (*arcs_count)++; + + if (ndx < arcs_len) + { + arcs[ndx].co = ca->co; + arcs[ndx].to = ca->to; + } + } + else + { + /* LACON arc --- assume it's satisfied and recurse... */ + /* ... but first, assert it doesn't lead directly to post state */ + Assert(ca->to != cnfa->post); + + traverse_lacons(cnfa, ca->to, arcs_count, arcs, arcs_len); + } + } +} + +/* + * Get number of outgoing NFA arcs of state number "st". + */ +int +pg_reg_getnumoutarcs(const regex_t *regex, int st) +{ + struct cnfa *cnfa; + int arcs_count; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (st < 0 || st >= cnfa->nstates) + return 0; + arcs_count = 0; + traverse_lacons(cnfa, st, &arcs_count, NULL, 0); + return arcs_count; +} + +/* + * Write array of outgoing NFA arcs of state number "st" into arcs[], + * whose length arcs_len must be at least as long as indicated by + * pg_reg_getnumoutarcs(), else not all arcs will be returned. + */ +void +pg_reg_getoutarcs(const regex_t *regex, int st, + regex_arc_t *arcs, int arcs_len) +{ + struct cnfa *cnfa; + int arcs_count; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (st < 0 || st >= cnfa->nstates || arcs_len <= 0) + return; + arcs_count = 0; + traverse_lacons(cnfa, st, &arcs_count, arcs, arcs_len); +} + +/* + * Get total number of colors. + */ +int +pg_reg_getnumcolors(const regex_t *regex) +{ + struct colormap *cm; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cm = &((struct guts *) regex->re_guts)->cmap; + + return cm->max + 1; +} + +/* + * Check if color is beginning of line/string. + * + * (We might at some point need to offer more refined handling of pseudocolors, + * but this will do for now.) + */ +int +pg_reg_colorisbegin(const regex_t *regex, int co) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (co == cnfa->bos[0] || co == cnfa->bos[1]) + return true; + else + return false; +} + +/* + * Check if color is end of line/string. + */ +int +pg_reg_colorisend(const regex_t *regex, int co) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (co == cnfa->eos[0] || co == cnfa->eos[1]) + return true; + else + return false; +} + +/* + * Get number of member chrs of color number "co". + * + * Note: we return -1 if the color number is invalid, or if it is a special + * color (WHITE, RAINBOW, or a pseudocolor), or if the number of members is + * uncertain. + * Callers should not try to extract the members if -1 is returned. + */ +int +pg_reg_getnumcharacters(const regex_t *regex, int co) +{ + struct colormap *cm; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cm = &((struct guts *) regex->re_guts)->cmap; + + if (co <= 0 || co > cm->max) /* <= 0 rejects WHITE and RAINBOW */ + return -1; + if (cm->cd[co].flags & PSEUDO) /* also pseudocolors (BOS etc) */ + return -1; + + /* + * If the color appears anywhere in the high colormap, treat its number of + * members as uncertain. In principle we could determine all the specific + * chrs corresponding to each such entry, but it would be expensive + * (particularly if character class tests are required) and it doesn't + * seem worth it. + */ + if (cm->cd[co].nuchrs != 0) + return -1; + + /* OK, return the known number of member chrs */ + return cm->cd[co].nschrs; +} + +/* + * Write array of member chrs of color number "co" into chars[], + * whose length chars_len must be at least as long as indicated by + * pg_reg_getnumcharacters(), else not all chars will be returned. + * + * Fetching the members of WHITE, RAINBOW, or a pseudocolor is not supported. + * + * Caution: this is a relatively expensive operation. + */ +void +pg_reg_getcharacters(const regex_t *regex, int co, + pg_wchar *chars, int chars_len) +{ + struct colormap *cm; + chr c; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cm = &((struct guts *) regex->re_guts)->cmap; + + if (co <= 0 || co > cm->max || chars_len <= 0) + return; + if (cm->cd[co].flags & PSEUDO) + return; + + /* + * We need only examine the low character map; there should not be any + * matching entries in the high map. + */ + for (c = CHR_MIN; c <= MAX_SIMPLE_CHR; c++) + { + if (cm->locolormap[c - CHR_MIN] == co) + { + *chars++ = c; + if (--chars_len == 0) + break; + } + } +} |