diff options
Diffstat (limited to 'src/backend/regex/regprefix.c')
-rw-r--r-- | src/backend/regex/regprefix.c | 268 |
1 files changed, 268 insertions, 0 deletions
diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c new file mode 100644 index 0000000..ec435b6 --- /dev/null +++ b/src/backend/regex/regprefix.c @@ -0,0 +1,268 @@ +/*------------------------------------------------------------------------- + * + * regprefix.c + * Extract a common prefix, if any, from a compiled regex. + * + * + * Portions Copyright (c) 2012-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1998, 1999 Henry Spencer + * + * IDENTIFICATION + * src/backend/regex/regprefix.c + * + *------------------------------------------------------------------------- + */ + +#include "regex/regguts.h" + + +/* + * forward declarations + */ +static int findprefix(struct cnfa *cnfa, struct colormap *cm, + chr *string, size_t *slength); + + +/* + * pg_regprefix - get common prefix for regular expression + * + * Returns one of: + * REG_NOMATCH: there is no common prefix of strings matching the regex + * REG_PREFIX: there is a common prefix of strings matching the regex + * REG_EXACT: all strings satisfying the regex must match the same string + * or a REG_XXX error code + * + * In the non-failure cases, *string is set to a malloc'd string containing + * the common prefix or exact value, of length *slength (measured in chrs + * not bytes!). + * + * This function does not analyze all complex cases (such as lookaround + * constraints) exactly. Therefore it is possible that some strings matching + * the reported prefix or exact-match string do not satisfy the regex. But + * it should never be the case that a string satisfying the regex does not + * match the reported prefix or exact-match string. + */ +int +pg_regprefix(regex_t *re, + chr **string, + size_t *slength) +{ + struct guts *g; + struct cnfa *cnfa; + int st; + + /* sanity checks */ + if (string == NULL || slength == NULL) + return REG_INVARG; + *string = NULL; /* initialize for failure cases */ + *slength = 0; + if (re == NULL || re->re_magic != REMAGIC) + return REG_INVARG; + if (re->re_csize != sizeof(chr)) + return REG_MIXED; + + /* Initialize locale-dependent support */ + pg_set_regex_collation(re->re_collation); + + /* setup */ + g = (struct guts *) re->re_guts; + if (g->info & REG_UIMPOSSIBLE) + return REG_NOMATCH; + + /* + * This implementation considers only the search NFA for the topmost regex + * tree node. Therefore, constraints such as backrefs are not fully + * applied, which is allowed per the function's API spec. + */ + assert(g->tree != NULL); + cnfa = &g->tree->cnfa; + + /* matchall NFAs never have a fixed prefix */ + if (cnfa->flags & MATCHALL) + return REG_NOMATCH; + + /* + * Since a correct NFA should never contain any exit-free loops, it should + * not be possible for our traversal to return to a previously visited NFA + * state. Hence we need at most nstates chrs in the output string. + */ + *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr)); + if (*string == NULL) + return REG_ESPACE; + + /* do it */ + st = findprefix(cnfa, &g->cmap, *string, slength); + + assert(*slength <= cnfa->nstates); + + /* clean up */ + if (st != REG_PREFIX && st != REG_EXACT) + { + FREE(*string); + *string = NULL; + *slength = 0; + } + + return st; +} + +/* + * findprefix - extract common prefix from cNFA + * + * Results are returned into the preallocated chr array string[], with + * *slength (which must be preset to zero) incremented for each chr. + */ +static int /* regprefix return code */ +findprefix(struct cnfa *cnfa, + struct colormap *cm, + chr *string, + size_t *slength) +{ + int st; + int nextst; + color thiscolor; + chr c; + struct carc *ca; + + /* + * The "pre" state must have only BOS/BOL outarcs, else pattern isn't + * anchored left. If we have both BOS and BOL, they must go to the same + * next state. + */ + st = cnfa->pre; + nextst = -1; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) + { + if (nextst == -1) + nextst = ca->to; + else if (nextst != ca->to) + return REG_NOMATCH; + } + else + return REG_NOMATCH; + } + if (nextst == -1) + return REG_NOMATCH; + + /* + * Scan through successive states, stopping as soon as we find one with + * more than one acceptable transition character (either multiple colors + * on out-arcs, or a color with more than one member chr). + * + * We could find a state with multiple out-arcs that are all labeled with + * the same singleton color; this comes from patterns like "^ab(cde|cxy)". + * In that case we add the chr "c" to the output string but then exit the + * loop with nextst == -1. This leaves a little bit on the table: if the + * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added + * to the prefix. But chasing multiple parallel state chains doesn't seem + * worth the trouble. + */ + do + { + st = nextst; + nextst = -1; + thiscolor = COLORLESS; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + /* We can ignore BOS/BOL arcs */ + if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) + continue; + + /* + * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs + * and LACONs + */ + if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] || + ca->co == RAINBOW || ca->co >= cnfa->ncolors) + { + thiscolor = COLORLESS; + break; + } + if (thiscolor == COLORLESS) + { + /* First plain outarc */ + thiscolor = ca->co; + nextst = ca->to; + } + else if (thiscolor == ca->co) + { + /* Another plain outarc for same color */ + nextst = -1; + } + else + { + /* More than one plain outarc color terminates the search */ + thiscolor = COLORLESS; + break; + } + } + /* Done if we didn't find exactly one color on plain outarcs */ + if (thiscolor == COLORLESS) + break; + /* The color must be a singleton */ + if (cm->cd[thiscolor].nschrs != 1) + break; + /* Must not have any high-color-map entries */ + if (cm->cd[thiscolor].nuchrs != 0) + break; + + /* + * Identify the color's sole member chr and add it to the prefix + * string. In general the colormap data structure doesn't provide a + * way to find color member chrs, except by trying GETCOLOR() on each + * possible chr value, which won't do at all. However, for the cases + * we care about it should be sufficient to test the "firstchr" value, + * that is the first chr ever added to the color. There are cases + * where this might no longer be a member of the color (so we do need + * to test), but none of them are likely to arise for a character that + * is a member of a common prefix. If we do hit such a corner case, + * we just fall out without adding anything to the prefix string. + */ + c = cm->cd[thiscolor].firstchr; + if (GETCOLOR(cm, c) != thiscolor) + break; + + string[(*slength)++] = c; + + /* Advance to next state, but only if we have a unique next state */ + } while (nextst != -1); + + /* + * If we ended at a state that only has EOS/EOL outarcs leading to the + * "post" state, then we have an exact-match string. Note this is true + * even if the string is of zero length. + */ + nextst = -1; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) + { + if (nextst == -1) + nextst = ca->to; + else if (nextst != ca->to) + { + nextst = -1; + break; + } + } + else + { + nextst = -1; + break; + } + } + if (nextst == cnfa->post) + return REG_EXACT; + + /* + * Otherwise, if we were unable to identify any prefix characters, say + * NOMATCH --- the pattern is anchored left, but doesn't specify any + * particular first character. + */ + if (*slength > 0) + return REG_PREFIX; + + return REG_NOMATCH; +} |