summaryrefslogtreecommitdiffstats
path: root/src/backend/regex/regexport.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
commit46651ce6fe013220ed397add242004d764fc0153 (patch)
tree6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/regex/regexport.c
parentInitial commit. (diff)
downloadpostgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz
postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/regex/regexport.c')
-rw-r--r--src/backend/regex/regexport.c293
1 files changed, 293 insertions, 0 deletions
diff --git a/src/backend/regex/regexport.c b/src/backend/regex/regexport.c
new file mode 100644
index 0000000..a493dbe
--- /dev/null
+++ b/src/backend/regex/regexport.c
@@ -0,0 +1,293 @@
+/*-------------------------------------------------------------------------
+ *
+ * regexport.c
+ * Functions for exporting info about a regex's NFA
+ *
+ * In this implementation, the NFA defines a necessary but not sufficient
+ * condition for a string to match the regex: that is, there can be strings
+ * that match the NFA but don't match the full regex, but not vice versa.
+ * Thus, for example, it is okay for the functions below to treat lookaround
+ * constraints as no-ops, since they merely constrain the string some more.
+ *
+ * Notice that these functions return info into caller-provided arrays
+ * rather than doing their own malloc's. This simplifies the APIs by
+ * eliminating a class of error conditions, and in the case of colors
+ * allows the caller to decide how big is too big to bother with.
+ *
+ *
+ * Portions Copyright (c) 2013-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1998, 1999 Henry Spencer
+ *
+ * IDENTIFICATION
+ * src/backend/regex/regexport.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "regex/regguts.h"
+
+#include "regex/regexport.h"
+
+
+/*
+ * Get total number of NFA states.
+ */
+int
+pg_reg_getnumstates(const regex_t *regex)
+{
+ struct cnfa *cnfa;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cnfa = &((struct guts *) regex->re_guts)->search;
+
+ return cnfa->nstates;
+}
+
+/*
+ * Get initial state of NFA.
+ */
+int
+pg_reg_getinitialstate(const regex_t *regex)
+{
+ struct cnfa *cnfa;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cnfa = &((struct guts *) regex->re_guts)->search;
+
+ return cnfa->pre;
+}
+
+/*
+ * Get final state of NFA.
+ */
+int
+pg_reg_getfinalstate(const regex_t *regex)
+{
+ struct cnfa *cnfa;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cnfa = &((struct guts *) regex->re_guts)->search;
+
+ return cnfa->post;
+}
+
+/*
+ * pg_reg_getnumoutarcs() and pg_reg_getoutarcs() mask the existence of LACON
+ * arcs from the caller, treating any LACON as being automatically satisfied.
+ * Since the output representation does not support arcs that consume no
+ * character when traversed, we have to recursively traverse LACON arcs here,
+ * and report whatever normal arcs are reachable by traversing LACON arcs.
+ * Note that this wouldn't work if it were possible to reach the final state
+ * via LACON traversal, but the regex library never builds NFAs that have
+ * LACON arcs leading directly to the final state. (This is because the
+ * regex executor is designed to consume one character beyond the nominal
+ * match end --- possibly an EOS indicator --- so there is always a set of
+ * ordinary arcs leading to the final state.)
+ *
+ * traverse_lacons is a recursive subroutine used by both exported functions
+ * to count and then emit the reachable regular arcs. *arcs_count is
+ * incremented by the number of reachable arcs, and as many as will fit in
+ * arcs_len (possibly 0) are emitted into arcs[].
+ */
+static void
+traverse_lacons(struct cnfa *cnfa, int st,
+ int *arcs_count,
+ regex_arc_t *arcs, int arcs_len)
+{
+ struct carc *ca;
+
+ /*
+ * Since this function recurses, it could theoretically be driven to stack
+ * overflow. In practice, this is mostly useful to backstop against a
+ * failure of the regex compiler to remove a loop of LACON arcs.
+ */
+ check_stack_depth();
+
+ for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+ {
+ if (ca->co < cnfa->ncolors)
+ {
+ /* Ordinary arc, so count and possibly emit it */
+ int ndx = (*arcs_count)++;
+
+ if (ndx < arcs_len)
+ {
+ arcs[ndx].co = ca->co;
+ arcs[ndx].to = ca->to;
+ }
+ }
+ else
+ {
+ /* LACON arc --- assume it's satisfied and recurse... */
+ /* ... but first, assert it doesn't lead directly to post state */
+ Assert(ca->to != cnfa->post);
+
+ traverse_lacons(cnfa, ca->to, arcs_count, arcs, arcs_len);
+ }
+ }
+}
+
+/*
+ * Get number of outgoing NFA arcs of state number "st".
+ */
+int
+pg_reg_getnumoutarcs(const regex_t *regex, int st)
+{
+ struct cnfa *cnfa;
+ int arcs_count;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cnfa = &((struct guts *) regex->re_guts)->search;
+
+ if (st < 0 || st >= cnfa->nstates)
+ return 0;
+ arcs_count = 0;
+ traverse_lacons(cnfa, st, &arcs_count, NULL, 0);
+ return arcs_count;
+}
+
+/*
+ * Write array of outgoing NFA arcs of state number "st" into arcs[],
+ * whose length arcs_len must be at least as long as indicated by
+ * pg_reg_getnumoutarcs(), else not all arcs will be returned.
+ */
+void
+pg_reg_getoutarcs(const regex_t *regex, int st,
+ regex_arc_t *arcs, int arcs_len)
+{
+ struct cnfa *cnfa;
+ int arcs_count;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cnfa = &((struct guts *) regex->re_guts)->search;
+
+ if (st < 0 || st >= cnfa->nstates || arcs_len <= 0)
+ return;
+ arcs_count = 0;
+ traverse_lacons(cnfa, st, &arcs_count, arcs, arcs_len);
+}
+
+/*
+ * Get total number of colors.
+ */
+int
+pg_reg_getnumcolors(const regex_t *regex)
+{
+ struct colormap *cm;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cm = &((struct guts *) regex->re_guts)->cmap;
+
+ return cm->max + 1;
+}
+
+/*
+ * Check if color is beginning of line/string.
+ *
+ * (We might at some point need to offer more refined handling of pseudocolors,
+ * but this will do for now.)
+ */
+int
+pg_reg_colorisbegin(const regex_t *regex, int co)
+{
+ struct cnfa *cnfa;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cnfa = &((struct guts *) regex->re_guts)->search;
+
+ if (co == cnfa->bos[0] || co == cnfa->bos[1])
+ return true;
+ else
+ return false;
+}
+
+/*
+ * Check if color is end of line/string.
+ */
+int
+pg_reg_colorisend(const regex_t *regex, int co)
+{
+ struct cnfa *cnfa;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cnfa = &((struct guts *) regex->re_guts)->search;
+
+ if (co == cnfa->eos[0] || co == cnfa->eos[1])
+ return true;
+ else
+ return false;
+}
+
+/*
+ * Get number of member chrs of color number "co".
+ *
+ * Note: we return -1 if the color number is invalid, or if it is a special
+ * color (WHITE, RAINBOW, or a pseudocolor), or if the number of members is
+ * uncertain.
+ * Callers should not try to extract the members if -1 is returned.
+ */
+int
+pg_reg_getnumcharacters(const regex_t *regex, int co)
+{
+ struct colormap *cm;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cm = &((struct guts *) regex->re_guts)->cmap;
+
+ if (co <= 0 || co > cm->max) /* <= 0 rejects WHITE and RAINBOW */
+ return -1;
+ if (cm->cd[co].flags & PSEUDO) /* also pseudocolors (BOS etc) */
+ return -1;
+
+ /*
+ * If the color appears anywhere in the high colormap, treat its number of
+ * members as uncertain. In principle we could determine all the specific
+ * chrs corresponding to each such entry, but it would be expensive
+ * (particularly if character class tests are required) and it doesn't
+ * seem worth it.
+ */
+ if (cm->cd[co].nuchrs != 0)
+ return -1;
+
+ /* OK, return the known number of member chrs */
+ return cm->cd[co].nschrs;
+}
+
+/*
+ * Write array of member chrs of color number "co" into chars[],
+ * whose length chars_len must be at least as long as indicated by
+ * pg_reg_getnumcharacters(), else not all chars will be returned.
+ *
+ * Fetching the members of WHITE, RAINBOW, or a pseudocolor is not supported.
+ *
+ * Caution: this is a relatively expensive operation.
+ */
+void
+pg_reg_getcharacters(const regex_t *regex, int co,
+ pg_wchar *chars, int chars_len)
+{
+ struct colormap *cm;
+ chr c;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cm = &((struct guts *) regex->re_guts)->cmap;
+
+ if (co <= 0 || co > cm->max || chars_len <= 0)
+ return;
+ if (cm->cd[co].flags & PSEUDO)
+ return;
+
+ /*
+ * We need only examine the low character map; there should not be any
+ * matching entries in the high map.
+ */
+ for (c = CHR_MIN; c <= MAX_SIMPLE_CHR; c++)
+ {
+ if (cm->locolormap[c - CHR_MIN] == co)
+ {
+ *chars++ = c;
+ if (--chars_len == 0)
+ break;
+ }
+ }
+}