diff options
Diffstat (limited to 'src/test/modules/test_regex/test_regex.c')
-rw-r--r-- | src/test/modules/test_regex/test_regex.c | 773 |
1 files changed, 773 insertions, 0 deletions
diff --git a/src/test/modules/test_regex/test_regex.c b/src/test/modules/test_regex/test_regex.c new file mode 100644 index 0000000..e23a0bd --- /dev/null +++ b/src/test/modules/test_regex/test_regex.c @@ -0,0 +1,773 @@ +/*-------------------------------------------------------------------------- + * + * test_regex.c + * Test harness for the regular expression package. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/test/modules/test_regex/test_regex.c + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "funcapi.h" +#include "miscadmin.h" +#include "regex/regex.h" +#include "utils/array.h" +#include "utils/builtins.h" + +PG_MODULE_MAGIC; + + +/* all the options of interest for regex functions */ +typedef struct test_re_flags +{ + int cflags; /* compile flags for Spencer's regex code */ + int eflags; /* execute flags for Spencer's regex code */ + long info; /* expected re_info bits */ + bool glob; /* do it globally (for each occurrence) */ + bool indices; /* report indices not actual strings */ + bool partial; /* expect partial match */ +} test_re_flags; + +/* cross-call state for test_regex() */ +typedef struct test_regex_ctx +{ + test_re_flags re_flags; /* flags */ + rm_detail_t details; /* "details" from execution */ + text *orig_str; /* data string in original TEXT form */ + int nmatches; /* number of places where pattern matched */ + int npatterns; /* number of capturing subpatterns */ + /* We store start char index and end+1 char index for each match */ + /* so the number of entries in match_locs is nmatches * npatterns * 2 */ + int *match_locs; /* 0-based character indexes */ + int next_match; /* 0-based index of next match to process */ + /* workspace for build_test_match_result() */ + Datum *elems; /* has npatterns+1 elements */ + bool *nulls; /* has npatterns+1 elements */ + pg_wchar *wide_str; /* wide-char version of original string */ + char *conv_buf; /* conversion buffer, if needed */ + int conv_bufsiz; /* size thereof */ +} test_regex_ctx; + +/* Local functions */ +static void test_re_compile(text *text_re, int cflags, Oid collation, + regex_t *result_re); +static void parse_test_flags(test_re_flags *flags, text *opts); +static test_regex_ctx *setup_test_matches(text *orig_str, + regex_t *cpattern, + test_re_flags *flags, + Oid collation, + bool use_subpatterns); +static ArrayType *build_test_info_result(regex_t *cpattern, + test_re_flags *flags); +static ArrayType *build_test_match_result(test_regex_ctx *matchctx); + + +/* + * test_regex(pattern text, string text, flags text) returns setof text[] + * + * This is largely based on regexp.c's regexp_matches, with additions + * for debugging purposes. + */ +PG_FUNCTION_INFO_V1(test_regex); + +Datum +test_regex(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + test_regex_ctx *matchctx; + ArrayType *result_ary; + + if (SRF_IS_FIRSTCALL()) + { + text *pattern = PG_GETARG_TEXT_PP(0); + text *flags = PG_GETARG_TEXT_PP(2); + Oid collation = PG_GET_COLLATION(); + test_re_flags re_flags; + regex_t cpattern; + MemoryContext oldcontext; + + funcctx = SRF_FIRSTCALL_INIT(); + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* Determine options */ + parse_test_flags(&re_flags, flags); + + /* set up the compiled pattern */ + test_re_compile(pattern, re_flags.cflags, collation, &cpattern); + + /* be sure to copy the input string into the multi-call ctx */ + matchctx = setup_test_matches(PG_GETARG_TEXT_P_COPY(1), &cpattern, + &re_flags, + collation, + true); + + /* Pre-create workspace that build_test_match_result needs */ + matchctx->elems = (Datum *) palloc(sizeof(Datum) * + (matchctx->npatterns + 1)); + matchctx->nulls = (bool *) palloc(sizeof(bool) * + (matchctx->npatterns + 1)); + + MemoryContextSwitchTo(oldcontext); + funcctx->user_fctx = (void *) matchctx; + + /* + * Return the first result row, which is info equivalent to Tcl's + * "regexp -about" output + */ + result_ary = build_test_info_result(&cpattern, &re_flags); + + pg_regfree(&cpattern); + + SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary)); + } + else + { + /* Each subsequent row describes one match */ + funcctx = SRF_PERCALL_SETUP(); + matchctx = (test_regex_ctx *) funcctx->user_fctx; + + if (matchctx->next_match < matchctx->nmatches) + { + result_ary = build_test_match_result(matchctx); + matchctx->next_match++; + SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary)); + } + } + + SRF_RETURN_DONE(funcctx); +} + + +/* + * test_re_compile - compile a RE + * + * text_re --- the pattern, expressed as a TEXT object + * cflags --- compile options for the pattern + * collation --- collation to use for LC_CTYPE-dependent behavior + * result_re --- output, compiled RE is stored here + * + * Pattern is given in the database encoding. We internally convert to + * an array of pg_wchar, which is what Spencer's regex package wants. + * + * Caller must eventually pg_regfree the resulting RE to avoid memory leaks. + */ +static void +test_re_compile(text *text_re, int cflags, Oid collation, + regex_t *result_re) +{ + int text_re_len = VARSIZE_ANY_EXHDR(text_re); + char *text_re_val = VARDATA_ANY(text_re); + pg_wchar *pattern; + int pattern_len; + int regcomp_result; + char errMsg[100]; + + /* Convert pattern string to wide characters */ + pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar)); + pattern_len = pg_mb2wchar_with_len(text_re_val, + pattern, + text_re_len); + + regcomp_result = pg_regcomp(result_re, + pattern, + pattern_len, + cflags, + collation); + + pfree(pattern); + + if (regcomp_result != REG_OKAY) + { + /* re didn't compile (no need for pg_regfree, if so) */ + + /* + * Here and in other places in this file, do CHECK_FOR_INTERRUPTS + * before reporting a regex error. This is so that if the regex + * library aborts and returns REG_CANCEL, we don't print an error + * message that implies the regex was invalid. + */ + CHECK_FOR_INTERRUPTS(); + + pg_regerror(regcomp_result, result_re, errMsg, sizeof(errMsg)); + ereport(ERROR, + (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), + errmsg("invalid regular expression: %s", errMsg))); + } +} + +/* + * test_re_execute - execute a RE on pg_wchar data + * + * Returns true on match, false on no match + * Arguments are as for pg_regexec + */ +static bool +test_re_execute(regex_t *re, pg_wchar *data, int data_len, + int start_search, + rm_detail_t *details, + int nmatch, regmatch_t *pmatch, + int eflags) +{ + int regexec_result; + char errMsg[100]; + + /* Initialize match locations in case engine doesn't */ + details->rm_extend.rm_so = -1; + details->rm_extend.rm_eo = -1; + for (int i = 0; i < nmatch; i++) + { + pmatch[i].rm_so = -1; + pmatch[i].rm_eo = -1; + } + + /* Perform RE match and return result */ + regexec_result = pg_regexec(re, + data, + data_len, + start_search, + details, + nmatch, + pmatch, + eflags); + + if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH) + { + /* re failed??? */ + CHECK_FOR_INTERRUPTS(); + pg_regerror(regexec_result, re, errMsg, sizeof(errMsg)); + ereport(ERROR, + (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), + errmsg("regular expression failed: %s", errMsg))); + } + + return (regexec_result == REG_OKAY); +} + + +/* + * parse_test_flags - parse the flags argument + * + * flags --- output argument, filled with desired options + * opts --- TEXT object, or NULL for defaults + */ +static void +parse_test_flags(test_re_flags *flags, text *opts) +{ + /* these defaults must match Tcl's */ + int cflags = REG_ADVANCED; + int eflags = 0; + long info = 0; + + flags->glob = false; + flags->indices = false; + flags->partial = false; + + if (opts) + { + char *opt_p = VARDATA_ANY(opts); + int opt_len = VARSIZE_ANY_EXHDR(opts); + int i; + + for (i = 0; i < opt_len; i++) + { + switch (opt_p[i]) + { + case '-': + /* allowed, no-op */ + break; + case '!': + flags->partial = true; + break; + case '*': + /* test requires Unicode --- ignored here */ + break; + case '0': + flags->indices = true; + break; + + /* These flags correspond to user-exposed RE options: */ + case 'g': /* global match */ + flags->glob = true; + break; + case 'i': /* case insensitive */ + cflags |= REG_ICASE; + break; + case 'n': /* \n affects ^ $ . [^ */ + cflags |= REG_NEWLINE; + break; + case 'p': /* ~Perl, \n affects . [^ */ + cflags |= REG_NLSTOP; + cflags &= ~REG_NLANCH; + break; + case 'w': /* weird, \n affects ^ $ only */ + cflags &= ~REG_NLSTOP; + cflags |= REG_NLANCH; + break; + case 'x': /* expanded syntax */ + cflags |= REG_EXPANDED; + break; + + /* These flags correspond to Tcl's -xflags options: */ + case 'a': + cflags |= REG_ADVF; + break; + case 'b': + cflags &= ~REG_ADVANCED; + break; + case 'c': + + /* + * Tcl calls this TCL_REG_CANMATCH, but it's really + * REG_EXPECT. In this implementation we must also set + * the partial and indices flags, so that + * setup_test_matches and build_test_match_result will + * emit the desired data. (They'll emit more fields than + * Tcl would, but that's fine.) + */ + cflags |= REG_EXPECT; + flags->partial = true; + flags->indices = true; + break; + case 'e': + cflags &= ~REG_ADVANCED; + cflags |= REG_EXTENDED; + break; + case 'q': + cflags &= ~REG_ADVANCED; + cflags |= REG_QUOTE; + break; + case 'o': /* o for opaque */ + cflags |= REG_NOSUB; + break; + case 's': /* s for start */ + cflags |= REG_BOSONLY; + break; + case '+': + cflags |= REG_FAKE; + break; + case ',': + cflags |= REG_PROGRESS; + break; + case '.': + cflags |= REG_DUMP; + break; + case ':': + eflags |= REG_MTRACE; + break; + case ';': + eflags |= REG_FTRACE; + break; + case '^': + eflags |= REG_NOTBOL; + break; + case '$': + eflags |= REG_NOTEOL; + break; + case 't': + cflags |= REG_EXPECT; + break; + case '%': + eflags |= REG_SMALL; + break; + + /* These flags define expected info bits: */ + case 'A': + info |= REG_UBSALNUM; + break; + case 'B': + info |= REG_UBRACES; + break; + case 'E': + info |= REG_UBBS; + break; + case 'H': + info |= REG_ULOOKAROUND; + break; + case 'I': + info |= REG_UIMPOSSIBLE; + break; + case 'L': + info |= REG_ULOCALE; + break; + case 'M': + info |= REG_UUNPORT; + break; + case 'N': + info |= REG_UEMPTYMATCH; + break; + case 'P': + info |= REG_UNONPOSIX; + break; + case 'Q': + info |= REG_UBOUNDS; + break; + case 'R': + info |= REG_UBACKREF; + break; + case 'S': + info |= REG_UUNSPEC; + break; + case 'T': + info |= REG_USHORTEST; + break; + case 'U': + info |= REG_UPBOTCH; + break; + + default: + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid regular expression test option: \"%.*s\"", + pg_mblen(opt_p + i), opt_p + i))); + break; + } + } + } + flags->cflags = cflags; + flags->eflags = eflags; + flags->info = info; +} + +/* + * setup_test_matches --- do the initial matching + * + * To simplify memory management, we do all the matching in one swoop. + * The returned test_regex_ctx contains the locations of all the substrings + * matching the pattern. + */ +static test_regex_ctx * +setup_test_matches(text *orig_str, + regex_t *cpattern, test_re_flags *re_flags, + Oid collation, + bool use_subpatterns) +{ + test_regex_ctx *matchctx = palloc0(sizeof(test_regex_ctx)); + int eml = pg_database_encoding_max_length(); + int orig_len; + pg_wchar *wide_str; + int wide_len; + regmatch_t *pmatch; + int pmatch_len; + int array_len; + int array_idx; + int prev_match_end; + int start_search; + int maxlen = 0; /* largest fetch length in characters */ + + /* save flags */ + matchctx->re_flags = *re_flags; + + /* save original string --- we'll extract result substrings from it */ + matchctx->orig_str = orig_str; + + /* convert string to pg_wchar form for matching */ + orig_len = VARSIZE_ANY_EXHDR(orig_str); + wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1)); + wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len); + + /* do we want to remember subpatterns? */ + if (use_subpatterns && cpattern->re_nsub > 0) + { + matchctx->npatterns = cpattern->re_nsub + 1; + pmatch_len = cpattern->re_nsub + 1; + } + else + { + use_subpatterns = false; + matchctx->npatterns = 1; + pmatch_len = 1; + } + + /* temporary output space for RE package */ + pmatch = palloc(sizeof(regmatch_t) * pmatch_len); + + /* + * the real output space (grown dynamically if needed) + * + * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather + * than at 2^27 + */ + array_len = re_flags->glob ? 255 : 31; + matchctx->match_locs = (int *) palloc(sizeof(int) * array_len); + array_idx = 0; + + /* search for the pattern, perhaps repeatedly */ + prev_match_end = 0; + start_search = 0; + while (test_re_execute(cpattern, wide_str, wide_len, + start_search, + &matchctx->details, + pmatch_len, pmatch, + re_flags->eflags)) + { + /* enlarge output space if needed */ + while (array_idx + matchctx->npatterns * 2 + 1 > array_len) + { + array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */ + if (array_len > MaxAllocSize / sizeof(int)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("too many regular expression matches"))); + matchctx->match_locs = (int *) repalloc(matchctx->match_locs, + sizeof(int) * array_len); + } + + /* save this match's locations */ + for (int i = 0; i < matchctx->npatterns; i++) + { + int so = pmatch[i].rm_so; + int eo = pmatch[i].rm_eo; + + matchctx->match_locs[array_idx++] = so; + matchctx->match_locs[array_idx++] = eo; + if (so >= 0 && eo >= 0 && (eo - so) > maxlen) + maxlen = (eo - so); + } + matchctx->nmatches++; + prev_match_end = pmatch[0].rm_eo; + + /* if not glob, stop after one match */ + if (!re_flags->glob) + break; + + /* + * Advance search position. Normally we start the next search at the + * end of the previous match; but if the match was of zero length, we + * have to advance by one character, or we'd just find the same match + * again. + */ + start_search = prev_match_end; + if (pmatch[0].rm_so == pmatch[0].rm_eo) + start_search++; + if (start_search > wide_len) + break; + } + + /* + * If we had no match, but "partial" and "indices" are set, emit the + * details. + */ + if (matchctx->nmatches == 0 && re_flags->partial && re_flags->indices) + { + /* enlarge output space if needed */ + while (array_idx + matchctx->npatterns * 2 + 1 > array_len) + { + array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */ + if (array_len > MaxAllocSize / sizeof(int)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("too many regular expression matches"))); + matchctx->match_locs = (int *) repalloc(matchctx->match_locs, + sizeof(int) * array_len); + } + + matchctx->match_locs[array_idx++] = matchctx->details.rm_extend.rm_so; + matchctx->match_locs[array_idx++] = matchctx->details.rm_extend.rm_eo; + /* we don't have pmatch data, so emit -1 */ + for (int i = 1; i < matchctx->npatterns; i++) + { + matchctx->match_locs[array_idx++] = -1; + matchctx->match_locs[array_idx++] = -1; + } + matchctx->nmatches++; + } + + Assert(array_idx <= array_len); + + if (eml > 1) + { + int64 maxsiz = eml * (int64) maxlen; + int conv_bufsiz; + + /* + * Make the conversion buffer large enough for any substring of + * interest. + * + * Worst case: assume we need the maximum size (maxlen*eml), but take + * advantage of the fact that the original string length in bytes is + * an upper bound on the byte length of any fetched substring (and we + * know that len+1 is safe to allocate because the varlena header is + * longer than 1 byte). + */ + if (maxsiz > orig_len) + conv_bufsiz = orig_len + 1; + else + conv_bufsiz = maxsiz + 1; /* safe since maxsiz < 2^30 */ + + matchctx->conv_buf = palloc(conv_bufsiz); + matchctx->conv_bufsiz = conv_bufsiz; + matchctx->wide_str = wide_str; + } + else + { + /* No need to keep the wide string if we're in a single-byte charset. */ + pfree(wide_str); + matchctx->wide_str = NULL; + matchctx->conv_buf = NULL; + matchctx->conv_bufsiz = 0; + } + + /* Clean up temp storage */ + pfree(pmatch); + + return matchctx; +} + +/* + * build_test_info_result - build output array describing compiled regexp + * + * This borrows some code from Tcl's TclRegAbout(). + */ +static ArrayType * +build_test_info_result(regex_t *cpattern, test_re_flags *flags) +{ + /* Translation data for flag bits in regex_t.re_info */ + struct infoname + { + int bit; + const char *text; + }; + static const struct infoname infonames[] = { + {REG_UBACKREF, "REG_UBACKREF"}, + {REG_ULOOKAROUND, "REG_ULOOKAROUND"}, + {REG_UBOUNDS, "REG_UBOUNDS"}, + {REG_UBRACES, "REG_UBRACES"}, + {REG_UBSALNUM, "REG_UBSALNUM"}, + {REG_UPBOTCH, "REG_UPBOTCH"}, + {REG_UBBS, "REG_UBBS"}, + {REG_UNONPOSIX, "REG_UNONPOSIX"}, + {REG_UUNSPEC, "REG_UUNSPEC"}, + {REG_UUNPORT, "REG_UUNPORT"}, + {REG_ULOCALE, "REG_ULOCALE"}, + {REG_UEMPTYMATCH, "REG_UEMPTYMATCH"}, + {REG_UIMPOSSIBLE, "REG_UIMPOSSIBLE"}, + {REG_USHORTEST, "REG_USHORTEST"}, + {0, NULL} + }; + const struct infoname *inf; + Datum elems[lengthof(infonames) + 1]; + int nresults = 0; + char buf[80]; + int dims[1]; + int lbs[1]; + + /* Set up results: first, the number of subexpressions */ + snprintf(buf, sizeof(buf), "%d", (int) cpattern->re_nsub); + elems[nresults++] = PointerGetDatum(cstring_to_text(buf)); + + /* Report individual info bit states */ + for (inf = infonames; inf->bit != 0; inf++) + { + if (cpattern->re_info & inf->bit) + { + if (flags->info & inf->bit) + elems[nresults++] = PointerGetDatum(cstring_to_text(inf->text)); + else + { + snprintf(buf, sizeof(buf), "unexpected %s!", inf->text); + elems[nresults++] = PointerGetDatum(cstring_to_text(buf)); + } + } + else + { + if (flags->info & inf->bit) + { + snprintf(buf, sizeof(buf), "missing %s!", inf->text); + elems[nresults++] = PointerGetDatum(cstring_to_text(buf)); + } + } + } + + /* And form an array */ + dims[0] = nresults; + lbs[0] = 1; + /* XXX: this hardcodes assumptions about the text type */ + return construct_md_array(elems, NULL, 1, dims, lbs, + TEXTOID, -1, false, TYPALIGN_INT); +} + +/* + * build_test_match_result - build output array for current match + * + * Note that if the indices flag is set, we don't need any strings, + * just the location data. + */ +static ArrayType * +build_test_match_result(test_regex_ctx *matchctx) +{ + char *buf = matchctx->conv_buf; + Datum *elems = matchctx->elems; + bool *nulls = matchctx->nulls; + bool indices = matchctx->re_flags.indices; + char bufstr[80]; + int dims[1]; + int lbs[1]; + int loc; + int i; + + /* Extract matching substrings from the original string */ + loc = matchctx->next_match * matchctx->npatterns * 2; + for (i = 0; i < matchctx->npatterns; i++) + { + int so = matchctx->match_locs[loc++]; + int eo = matchctx->match_locs[loc++]; + + if (indices) + { + /* Report eo this way for consistency with Tcl */ + snprintf(bufstr, sizeof(bufstr), "%d %d", + so, so < 0 ? eo : eo - 1); + elems[i] = PointerGetDatum(cstring_to_text(bufstr)); + nulls[i] = false; + } + else if (so < 0 || eo < 0) + { + elems[i] = (Datum) 0; + nulls[i] = true; + } + else if (buf) + { + int len = pg_wchar2mb_with_len(matchctx->wide_str + so, + buf, + eo - so); + + Assert(len < matchctx->conv_bufsiz); + elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len)); + nulls[i] = false; + } + else + { + elems[i] = DirectFunctionCall3(text_substr, + PointerGetDatum(matchctx->orig_str), + Int32GetDatum(so + 1), + Int32GetDatum(eo - so)); + nulls[i] = false; + } + } + + /* In EXPECT indices mode, also report the "details" */ + if (indices && (matchctx->re_flags.cflags & REG_EXPECT)) + { + int so = matchctx->details.rm_extend.rm_so; + int eo = matchctx->details.rm_extend.rm_eo; + + snprintf(bufstr, sizeof(bufstr), "%d %d", + so, so < 0 ? eo : eo - 1); + elems[i] = PointerGetDatum(cstring_to_text(bufstr)); + nulls[i] = false; + i++; + } + + /* And form an array */ + dims[0] = i; + lbs[0] = 1; + /* XXX: this hardcodes assumptions about the text type */ + return construct_md_array(elems, nulls, 1, dims, lbs, + TEXTOID, -1, false, TYPALIGN_INT); +} |