/*-------------------------------------------------------------------------- * * test_regex.c * Test harness for the regular expression package. * * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * src/test/modules/test_regex/test_regex.c * * ------------------------------------------------------------------------- */ #include "postgres.h" #include "funcapi.h" #include "miscadmin.h" #include "regex/regex.h" #include "utils/array.h" #include "utils/builtins.h" PG_MODULE_MAGIC; /* all the options of interest for regex functions */ typedef struct test_re_flags { int cflags; /* compile flags for Spencer's regex code */ int eflags; /* execute flags for Spencer's regex code */ long info; /* expected re_info bits */ bool glob; /* do it globally (for each occurrence) */ bool indices; /* report indices not actual strings */ bool partial; /* expect partial match */ } test_re_flags; /* cross-call state for test_regex() */ typedef struct test_regex_ctx { test_re_flags re_flags; /* flags */ rm_detail_t details; /* "details" from execution */ text *orig_str; /* data string in original TEXT form */ int nmatches; /* number of places where pattern matched */ int npatterns; /* number of capturing subpatterns */ /* We store start char index and end+1 char index for each match */ /* so the number of entries in match_locs is nmatches * npatterns * 2 */ int *match_locs; /* 0-based character indexes */ int next_match; /* 0-based index of next match to process */ /* workspace for build_test_match_result() */ Datum *elems; /* has npatterns+1 elements */ bool *nulls; /* has npatterns+1 elements */ pg_wchar *wide_str; /* wide-char version of original string */ char *conv_buf; /* conversion buffer, if needed */ int conv_bufsiz; /* size thereof */ } test_regex_ctx; /* Local functions */ static void test_re_compile(text *text_re, int cflags, Oid collation, regex_t *result_re); static void parse_test_flags(test_re_flags *flags, text *opts); static test_regex_ctx *setup_test_matches(text *orig_str, regex_t *cpattern, test_re_flags *flags, Oid collation, bool use_subpatterns); static ArrayType *build_test_info_result(regex_t *cpattern, test_re_flags *flags); static ArrayType *build_test_match_result(test_regex_ctx *matchctx); /* * test_regex(pattern text, string text, flags text) returns setof text[] * * This is largely based on regexp.c's regexp_matches, with additions * for debugging purposes. */ PG_FUNCTION_INFO_V1(test_regex); Datum test_regex(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; test_regex_ctx *matchctx; ArrayType *result_ary; if (SRF_IS_FIRSTCALL()) { text *pattern = PG_GETARG_TEXT_PP(0); text *flags = PG_GETARG_TEXT_PP(2); Oid collation = PG_GET_COLLATION(); test_re_flags re_flags; regex_t cpattern; MemoryContext oldcontext; funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* Determine options */ parse_test_flags(&re_flags, flags); /* set up the compiled pattern */ test_re_compile(pattern, re_flags.cflags, collation, &cpattern); /* be sure to copy the input string into the multi-call ctx */ matchctx = setup_test_matches(PG_GETARG_TEXT_P_COPY(1), &cpattern, &re_flags, collation, true); /* Pre-create workspace that build_test_match_result needs */ matchctx->elems = (Datum *) palloc(sizeof(Datum) * (matchctx->npatterns + 1)); matchctx->nulls = (bool *) palloc(sizeof(bool) * (matchctx->npatterns + 1)); MemoryContextSwitchTo(oldcontext); funcctx->user_fctx = (void *) matchctx; /* * Return the first result row, which is info equivalent to Tcl's * "regexp -about" output */ result_ary = build_test_info_result(&cpattern, &re_flags); pg_regfree(&cpattern); SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary)); } else { /* Each subsequent row describes one match */ funcctx = SRF_PERCALL_SETUP(); matchctx = (test_regex_ctx *) funcctx->user_fctx; if (matchctx->next_match < matchctx->nmatches) { result_ary = build_test_match_result(matchctx); matchctx->next_match++; SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary)); } } SRF_RETURN_DONE(funcctx); } /* * test_re_compile - compile a RE * * text_re --- the pattern, expressed as a TEXT object * cflags --- compile options for the pattern * collation --- collation to use for LC_CTYPE-dependent behavior * result_re --- output, compiled RE is stored here * * Pattern is given in the database encoding. We internally convert to * an array of pg_wchar, which is what Spencer's regex package wants. * * Caller must eventually pg_regfree the resulting RE to avoid memory leaks. */ static void test_re_compile(text *text_re, int cflags, Oid collation, regex_t *result_re) { int text_re_len = VARSIZE_ANY_EXHDR(text_re); char *text_re_val = VARDATA_ANY(text_re); pg_wchar *pattern; int pattern_len; int regcomp_result; char errMsg[100]; /* Convert pattern string to wide characters */ pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar)); pattern_len = pg_mb2wchar_with_len(text_re_val, pattern, text_re_len); regcomp_result = pg_regcomp(result_re, pattern, pattern_len, cflags, collation); pfree(pattern); if (regcomp_result != REG_OKAY) { /* re didn't compile (no need for pg_regfree, if so) */ /* * Here and in other places in this file, do CHECK_FOR_INTERRUPTS * before reporting a regex error. This is so that if the regex * library aborts and returns REG_CANCEL, we don't print an error * message that implies the regex was invalid. */ CHECK_FOR_INTERRUPTS(); pg_regerror(regcomp_result, result_re, errMsg, sizeof(errMsg)); ereport(ERROR, (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), errmsg("invalid regular expression: %s", errMsg))); } } /* * test_re_execute - execute a RE on pg_wchar data * * Returns true on match, false on no match * Arguments are as for pg_regexec */ static bool test_re_execute(regex_t *re, pg_wchar *data, int data_len, int start_search, rm_detail_t *details, int nmatch, regmatch_t *pmatch, int eflags) { int regexec_result; char errMsg[100]; /* Initialize match locations in case engine doesn't */ details->rm_extend.rm_so = -1; details->rm_extend.rm_eo = -1; for (int i = 0; i < nmatch; i++) { pmatch[i].rm_so = -1; pmatch[i].rm_eo = -1; } /* Perform RE match and return result */ regexec_result = pg_regexec(re, data, data_len, start_search, details, nmatch, pmatch, eflags); if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH) { /* re failed??? */ CHECK_FOR_INTERRUPTS(); pg_regerror(regexec_result, re, errMsg, sizeof(errMsg)); ereport(ERROR, (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), errmsg("regular expression failed: %s", errMsg))); } return (regexec_result == REG_OKAY); } /* * parse_test_flags - parse the flags argument * * flags --- output argument, filled with desired options * opts --- TEXT object, or NULL for defaults */ static void parse_test_flags(test_re_flags *flags, text *opts) { /* these defaults must match Tcl's */ int cflags = REG_ADVANCED; int eflags = 0; long info = 0; flags->glob = false; flags->indices = false; flags->partial = false; if (opts) { char *opt_p = VARDATA_ANY(opts); int opt_len = VARSIZE_ANY_EXHDR(opts); int i; for (i = 0; i < opt_len; i++) { switch (opt_p[i]) { case '-': /* allowed, no-op */ break; case '!': flags->partial = true; break; case '*': /* test requires Unicode --- ignored here */ break; case '0': flags->indices = true; break; /* These flags correspond to user-exposed RE options: */ case 'g': /* global match */ flags->glob = true; break; case 'i': /* case insensitive */ cflags |= REG_ICASE; break; case 'n': /* \n affects ^ $ . [^ */ cflags |= REG_NEWLINE; break; case 'p': /* ~Perl, \n affects . [^ */ cflags |= REG_NLSTOP; cflags &= ~REG_NLANCH; break; case 'w': /* weird, \n affects ^ $ only */ cflags &= ~REG_NLSTOP; cflags |= REG_NLANCH; break; case 'x': /* expanded syntax */ cflags |= REG_EXPANDED; break; /* These flags correspond to Tcl's -xflags options: */ case 'a': cflags |= REG_ADVF; break; case 'b': cflags &= ~REG_ADVANCED; break; case 'c': /* * Tcl calls this TCL_REG_CANMATCH, but it's really * REG_EXPECT. In this implementation we must also set * the partial and indices flags, so that * setup_test_matches and build_test_match_result will * emit the desired data. (They'll emit more fields than * Tcl would, but that's fine.) */ cflags |= REG_EXPECT; flags->partial = true; flags->indices = true; break; case 'e': cflags &= ~REG_ADVANCED; cflags |= REG_EXTENDED; break; case 'q': cflags &= ~REG_ADVANCED; cflags |= REG_QUOTE; break; case 'o': /* o for opaque */ cflags |= REG_NOSUB; break; case 's': /* s for start */ cflags |= REG_BOSONLY; break; case '+': cflags |= REG_FAKE; break; case ',': cflags |= REG_PROGRESS; break; case '.': cflags |= REG_DUMP; break; case ':': eflags |= REG_MTRACE; break; case ';': eflags |= REG_FTRACE; break; case '^': eflags |= REG_NOTBOL; break; case '$': eflags |= REG_NOTEOL; break; case 't': cflags |= REG_EXPECT; break; case '%': eflags |= REG_SMALL; break; /* These flags define expected info bits: */ case 'A': info |= REG_UBSALNUM; break; case 'B': info |= REG_UBRACES; break; case 'E': info |= REG_UBBS; break; case 'H': info |= REG_ULOOKAROUND; break; case 'I': info |= REG_UIMPOSSIBLE; break; case 'L': info |= REG_ULOCALE; break; case 'M': info |= REG_UUNPORT; break; case 'N': info |= REG_UEMPTYMATCH; break; case 'P': info |= REG_UNONPOSIX; break; case 'Q': info |= REG_UBOUNDS; break; case 'R': info |= REG_UBACKREF; break; case 'S': info |= REG_UUNSPEC; break; case 'T': info |= REG_USHORTEST; break; case 'U': info |= REG_UPBOTCH; break; default: ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid regular expression test option: \"%.*s\"", pg_mblen(opt_p + i), opt_p + i))); break; } } } flags->cflags = cflags; flags->eflags = eflags; flags->info = info; } /* * setup_test_matches --- do the initial matching * * To simplify memory management, we do all the matching in one swoop. * The returned test_regex_ctx contains the locations of all the substrings * matching the pattern. */ static test_regex_ctx * setup_test_matches(text *orig_str, regex_t *cpattern, test_re_flags *re_flags, Oid collation, bool use_subpatterns) { test_regex_ctx *matchctx = palloc0(sizeof(test_regex_ctx)); int eml = pg_database_encoding_max_length(); int orig_len; pg_wchar *wide_str; int wide_len; regmatch_t *pmatch; int pmatch_len; int array_len; int array_idx; int prev_match_end; int start_search; int maxlen = 0; /* largest fetch length in characters */ /* save flags */ matchctx->re_flags = *re_flags; /* save original string --- we'll extract result substrings from it */ matchctx->orig_str = orig_str; /* convert string to pg_wchar form for matching */ orig_len = VARSIZE_ANY_EXHDR(orig_str); wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1)); wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len); /* do we want to remember subpatterns? */ if (use_subpatterns && cpattern->re_nsub > 0) { matchctx->npatterns = cpattern->re_nsub + 1; pmatch_len = cpattern->re_nsub + 1; } else { use_subpatterns = false; matchctx->npatterns = 1; pmatch_len = 1; } /* temporary output space for RE package */ pmatch = palloc(sizeof(regmatch_t) * pmatch_len); /* * the real output space (grown dynamically if needed) * * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather * than at 2^27 */ array_len = re_flags->glob ? 255 : 31; matchctx->match_locs = (int *) palloc(sizeof(int) * array_len); array_idx = 0; /* search for the pattern, perhaps repeatedly */ prev_match_end = 0; start_search = 0; while (test_re_execute(cpattern, wide_str, wide_len, start_search, &matchctx->details, pmatch_len, pmatch, re_flags->eflags)) { /* enlarge output space if needed */ while (array_idx + matchctx->npatterns * 2 + 1 > array_len) { array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */ if (array_len > MaxAllocSize / sizeof(int)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("too many regular expression matches"))); matchctx->match_locs = (int *) repalloc(matchctx->match_locs, sizeof(int) * array_len); } /* save this match's locations */ for (int i = 0; i < matchctx->npatterns; i++) { int so = pmatch[i].rm_so; int eo = pmatch[i].rm_eo; matchctx->match_locs[array_idx++] = so; matchctx->match_locs[array_idx++] = eo; if (so >= 0 && eo >= 0 && (eo - so) > maxlen) maxlen = (eo - so); } matchctx->nmatches++; prev_match_end = pmatch[0].rm_eo; /* if not glob, stop after one match */ if (!re_flags->glob) break; /* * Advance search position. Normally we start the next search at the * end of the previous match; but if the match was of zero length, we * have to advance by one character, or we'd just find the same match * again. */ start_search = prev_match_end; if (pmatch[0].rm_so == pmatch[0].rm_eo) start_search++; if (start_search > wide_len) break; } /* * If we had no match, but "partial" and "indices" are set, emit the * details. */ if (matchctx->nmatches == 0 && re_flags->partial && re_flags->indices) { /* enlarge output space if needed */ while (array_idx + matchctx->npatterns * 2 + 1 > array_len) { array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */ if (array_len > MaxAllocSize / sizeof(int)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("too many regular expression matches"))); matchctx->match_locs = (int *) repalloc(matchctx->match_locs, sizeof(int) * array_len); } matchctx->match_locs[array_idx++] = matchctx->details.rm_extend.rm_so; matchctx->match_locs[array_idx++] = matchctx->details.rm_extend.rm_eo; /* we don't have pmatch data, so emit -1 */ for (int i = 1; i < matchctx->npatterns; i++) { matchctx->match_locs[array_idx++] = -1; matchctx->match_locs[array_idx++] = -1; } matchctx->nmatches++; } Assert(array_idx <= array_len); if (eml > 1) { int64 maxsiz = eml * (int64) maxlen; int conv_bufsiz; /* * Make the conversion buffer large enough for any substring of * interest. * * Worst case: assume we need the maximum size (maxlen*eml), but take * advantage of the fact that the original string length in bytes is * an upper bound on the byte length of any fetched substring (and we * know that len+1 is safe to allocate because the varlena header is * longer than 1 byte). */ if (maxsiz > orig_len) conv_bufsiz = orig_len + 1; else conv_bufsiz = maxsiz + 1; /* safe since maxsiz < 2^30 */ matchctx->conv_buf = palloc(conv_bufsiz); matchctx->conv_bufsiz = conv_bufsiz; matchctx->wide_str = wide_str; } else { /* No need to keep the wide string if we're in a single-byte charset. */ pfree(wide_str); matchctx->wide_str = NULL; matchctx->conv_buf = NULL; matchctx->conv_bufsiz = 0; } /* Clean up temp storage */ pfree(pmatch); return matchctx; } /* * build_test_info_result - build output array describing compiled regexp * * This borrows some code from Tcl's TclRegAbout(). */ static ArrayType * build_test_info_result(regex_t *cpattern, test_re_flags *flags) { /* Translation data for flag bits in regex_t.re_info */ struct infoname { int bit; const char *text; }; static const struct infoname infonames[] = { {REG_UBACKREF, "REG_UBACKREF"}, {REG_ULOOKAROUND, "REG_ULOOKAROUND"}, {REG_UBOUNDS, "REG_UBOUNDS"}, {REG_UBRACES, "REG_UBRACES"}, {REG_UBSALNUM, "REG_UBSALNUM"}, {REG_UPBOTCH, "REG_UPBOTCH"}, {REG_UBBS, "REG_UBBS"}, {REG_UNONPOSIX, "REG_UNONPOSIX"}, {REG_UUNSPEC, "REG_UUNSPEC"}, {REG_UUNPORT, "REG_UUNPORT"}, {REG_ULOCALE, "REG_ULOCALE"}, {REG_UEMPTYMATCH, "REG_UEMPTYMATCH"}, {REG_UIMPOSSIBLE, "REG_UIMPOSSIBLE"}, {REG_USHORTEST, "REG_USHORTEST"}, {0, NULL} }; const struct infoname *inf; Datum elems[lengthof(infonames) + 1]; int nresults = 0; char buf[80]; int dims[1]; int lbs[1]; /* Set up results: first, the number of subexpressions */ snprintf(buf, sizeof(buf), "%d", (int) cpattern->re_nsub); elems[nresults++] = PointerGetDatum(cstring_to_text(buf)); /* Report individual info bit states */ for (inf = infonames; inf->bit != 0; inf++) { if (cpattern->re_info & inf->bit) { if (flags->info & inf->bit) elems[nresults++] = PointerGetDatum(cstring_to_text(inf->text)); else { snprintf(buf, sizeof(buf), "unexpected %s!", inf->text); elems[nresults++] = PointerGetDatum(cstring_to_text(buf)); } } else { if (flags->info & inf->bit) { snprintf(buf, sizeof(buf), "missing %s!", inf->text); elems[nresults++] = PointerGetDatum(cstring_to_text(buf)); } } } /* And form an array */ dims[0] = nresults; lbs[0] = 1; /* XXX: this hardcodes assumptions about the text type */ return construct_md_array(elems, NULL, 1, dims, lbs, TEXTOID, -1, false, TYPALIGN_INT); } /* * build_test_match_result - build output array for current match * * Note that if the indices flag is set, we don't need any strings, * just the location data. */ static ArrayType * build_test_match_result(test_regex_ctx *matchctx) { char *buf = matchctx->conv_buf; Datum *elems = matchctx->elems; bool *nulls = matchctx->nulls; bool indices = matchctx->re_flags.indices; char bufstr[80]; int dims[1]; int lbs[1]; int loc; int i; /* Extract matching substrings from the original string */ loc = matchctx->next_match * matchctx->npatterns * 2; for (i = 0; i < matchctx->npatterns; i++) { int so = matchctx->match_locs[loc++]; int eo = matchctx->match_locs[loc++]; if (indices) { /* Report eo this way for consistency with Tcl */ snprintf(bufstr, sizeof(bufstr), "%d %d", so, so < 0 ? eo : eo - 1); elems[i] = PointerGetDatum(cstring_to_text(bufstr)); nulls[i] = false; } else if (so < 0 || eo < 0) { elems[i] = (Datum) 0; nulls[i] = true; } else if (buf) { int len = pg_wchar2mb_with_len(matchctx->wide_str + so, buf, eo - so); Assert(len < matchctx->conv_bufsiz); elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len)); nulls[i] = false; } else { elems[i] = DirectFunctionCall3(text_substr, PointerGetDatum(matchctx->orig_str), Int32GetDatum(so + 1), Int32GetDatum(eo - so)); nulls[i] = false; } } /* In EXPECT indices mode, also report the "details" */ if (indices && (matchctx->re_flags.cflags & REG_EXPECT)) { int so = matchctx->details.rm_extend.rm_so; int eo = matchctx->details.rm_extend.rm_eo; snprintf(bufstr, sizeof(bufstr), "%d %d", so, so < 0 ? eo : eo - 1); elems[i] = PointerGetDatum(cstring_to_text(bufstr)); nulls[i] = false; i++; } /* And form an array */ dims[0] = i; lbs[0] = 1; /* XXX: this hardcodes assumptions about the text type */ return construct_md_array(elems, nulls, 1, dims, lbs, TEXTOID, -1, false, TYPALIGN_INT); }