diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 13:44:03 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 13:44:03 +0000 |
commit | 293913568e6a7a86fd1479e1cff8e2ecb58d6568 (patch) | |
tree | fc3b469a3ec5ab71b36ea97cc7aaddb838423a0c /src/backend/utils/adt/tsvector_op.c | |
parent | Initial commit. (diff) | |
download | postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.tar.xz postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.zip |
Adding upstream version 16.2.upstream/16.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/utils/adt/tsvector_op.c')
-rw-r--r-- | src/backend/utils/adt/tsvector_op.c | 2893 |
1 files changed, 2893 insertions, 0 deletions
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c new file mode 100644 index 0000000..4457c5d --- /dev/null +++ b/src/backend/utils/adt/tsvector_op.c @@ -0,0 +1,2893 @@ +/*------------------------------------------------------------------------- + * + * tsvector_op.c + * operations over tsvector + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/utils/adt/tsvector_op.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <limits.h> + +#include "access/htup_details.h" +#include "catalog/namespace.h" +#include "catalog/pg_type.h" +#include "commands/trigger.h" +#include "executor/spi.h" +#include "funcapi.h" +#include "lib/qunique.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "parser/parse_coerce.h" +#include "tsearch/ts_utils.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/regproc.h" +#include "utils/rel.h" + + +typedef struct +{ + WordEntry *arrb; + WordEntry *arre; + char *values; + char *operand; +} CHKVAL; + + +typedef struct StatEntry +{ + uint32 ndoc; /* zero indicates that we were already here + * while walking through the tree */ + uint32 nentry; + struct StatEntry *left; + struct StatEntry *right; + uint32 lenlexeme; + char lexeme[FLEXIBLE_ARRAY_MEMBER]; +} StatEntry; + +#define STATENTRYHDRSZ (offsetof(StatEntry, lexeme)) + +typedef struct +{ + int32 weight; + + uint32 maxdepth; + + StatEntry **stack; + uint32 stackpos; + + StatEntry *root; +} TSVectorStat; + + +static TSTernaryValue TS_execute_recurse(QueryItem *curitem, void *arg, + uint32 flags, + TSExecuteCallback chkcond); +static bool TS_execute_locations_recurse(QueryItem *curitem, + void *arg, + TSExecuteCallback chkcond, + List **locations); +static int tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len); +static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column); + + +/* + * Order: haspos, len, word, for all positions (pos, weight) + */ +static int +silly_cmp_tsvector(const TSVector a, const TSVector b) +{ + if (VARSIZE(a) < VARSIZE(b)) + return -1; + else if (VARSIZE(a) > VARSIZE(b)) + return 1; + else if (a->size < b->size) + return -1; + else if (a->size > b->size) + return 1; + else + { + WordEntry *aptr = ARRPTR(a); + WordEntry *bptr = ARRPTR(b); + int i = 0; + int res; + + + for (i = 0; i < a->size; i++) + { + if (aptr->haspos != bptr->haspos) + { + return (aptr->haspos > bptr->haspos) ? -1 : 1; + } + else if ((res = tsCompareString(STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) != 0) + { + return res; + } + else if (aptr->haspos) + { + WordEntryPos *ap = POSDATAPTR(a, aptr); + WordEntryPos *bp = POSDATAPTR(b, bptr); + int j; + + if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr)) + return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1; + + for (j = 0; j < POSDATALEN(a, aptr); j++) + { + if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp)) + { + return (WEP_GETPOS(*ap) > WEP_GETPOS(*bp)) ? -1 : 1; + } + else if (WEP_GETWEIGHT(*ap) != WEP_GETWEIGHT(*bp)) + { + return (WEP_GETWEIGHT(*ap) > WEP_GETWEIGHT(*bp)) ? -1 : 1; + } + ap++, bp++; + } + } + + aptr++; + bptr++; + } + } + + return 0; +} + +#define TSVECTORCMPFUNC( type, action, ret ) \ +Datum \ +tsvector_##type(PG_FUNCTION_ARGS) \ +{ \ + TSVector a = PG_GETARG_TSVECTOR(0); \ + TSVector b = PG_GETARG_TSVECTOR(1); \ + int res = silly_cmp_tsvector(a, b); \ + PG_FREE_IF_COPY(a,0); \ + PG_FREE_IF_COPY(b,1); \ + PG_RETURN_##ret( res action 0 ); \ +} \ +/* keep compiler quiet - no extra ; */ \ +extern int no_such_variable + +TSVECTORCMPFUNC(lt, <, BOOL); +TSVECTORCMPFUNC(le, <=, BOOL); +TSVECTORCMPFUNC(eq, ==, BOOL); +TSVECTORCMPFUNC(ge, >=, BOOL); +TSVECTORCMPFUNC(gt, >, BOOL); +TSVECTORCMPFUNC(ne, !=, BOOL); +TSVECTORCMPFUNC(cmp, +, INT32); + +Datum +tsvector_strip(PG_FUNCTION_ARGS) +{ + TSVector in = PG_GETARG_TSVECTOR(0); + TSVector out; + int i, + len = 0; + WordEntry *arrin = ARRPTR(in), + *arrout; + char *cur; + + for (i = 0; i < in->size; i++) + len += arrin[i].len; + + len = CALCDATASIZE(in->size, len); + out = (TSVector) palloc0(len); + SET_VARSIZE(out, len); + out->size = in->size; + arrout = ARRPTR(out); + cur = STRPTR(out); + for (i = 0; i < in->size; i++) + { + memcpy(cur, STRPTR(in) + arrin[i].pos, arrin[i].len); + arrout[i].haspos = 0; + arrout[i].len = arrin[i].len; + arrout[i].pos = cur - STRPTR(out); + cur += arrout[i].len; + } + + PG_FREE_IF_COPY(in, 0); + PG_RETURN_POINTER(out); +} + +Datum +tsvector_length(PG_FUNCTION_ARGS) +{ + TSVector in = PG_GETARG_TSVECTOR(0); + int32 ret = in->size; + + PG_FREE_IF_COPY(in, 0); + PG_RETURN_INT32(ret); +} + +Datum +tsvector_setweight(PG_FUNCTION_ARGS) +{ + TSVector in = PG_GETARG_TSVECTOR(0); + char cw = PG_GETARG_CHAR(1); + TSVector out; + int i, + j; + WordEntry *entry; + WordEntryPos *p; + int w = 0; + + switch (cw) + { + case 'A': + case 'a': + w = 3; + break; + case 'B': + case 'b': + w = 2; + break; + case 'C': + case 'c': + w = 1; + break; + case 'D': + case 'd': + w = 0; + break; + default: + /* internal error */ + elog(ERROR, "unrecognized weight: %d", cw); + } + + out = (TSVector) palloc(VARSIZE(in)); + memcpy(out, in, VARSIZE(in)); + entry = ARRPTR(out); + i = out->size; + while (i--) + { + if ((j = POSDATALEN(out, entry)) != 0) + { + p = POSDATAPTR(out, entry); + while (j--) + { + WEP_SETWEIGHT(*p, w); + p++; + } + } + entry++; + } + + PG_FREE_IF_COPY(in, 0); + PG_RETURN_POINTER(out); +} + +/* + * setweight(tsin tsvector, char_weight "char", lexemes "text"[]) + * + * Assign weight w to elements of tsin that are listed in lexemes. + */ +Datum +tsvector_setweight_by_filter(PG_FUNCTION_ARGS) +{ + TSVector tsin = PG_GETARG_TSVECTOR(0); + char char_weight = PG_GETARG_CHAR(1); + ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2); + + TSVector tsout; + int i, + j, + nlexemes, + weight; + WordEntry *entry; + Datum *dlexemes; + bool *nulls; + + switch (char_weight) + { + case 'A': + case 'a': + weight = 3; + break; + case 'B': + case 'b': + weight = 2; + break; + case 'C': + case 'c': + weight = 1; + break; + case 'D': + case 'd': + weight = 0; + break; + default: + /* internal error */ + elog(ERROR, "unrecognized weight: %c", char_weight); + } + + tsout = (TSVector) palloc(VARSIZE(tsin)); + memcpy(tsout, tsin, VARSIZE(tsin)); + entry = ARRPTR(tsout); + + deconstruct_array_builtin(lexemes, TEXTOID, &dlexemes, &nulls, &nlexemes); + + /* + * Assuming that lexemes array is significantly shorter than tsvector we + * can iterate through lexemes performing binary search of each lexeme + * from lexemes in tsvector. + */ + for (i = 0; i < nlexemes; i++) + { + char *lex; + int lex_len, + lex_pos; + + /* Ignore null array elements, they surely don't match */ + if (nulls[i]) + continue; + + lex = VARDATA(dlexemes[i]); + lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; + lex_pos = tsvector_bsearch(tsout, lex, lex_len); + + if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0) + { + WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos); + + while (j--) + { + WEP_SETWEIGHT(*p, weight); + p++; + } + } + } + + PG_FREE_IF_COPY(tsin, 0); + PG_FREE_IF_COPY(lexemes, 2); + + PG_RETURN_POINTER(tsout); +} + +#define compareEntry(pa, a, pb, b) \ + tsCompareString((pa) + (a)->pos, (a)->len, \ + (pb) + (b)->pos, (b)->len, \ + false) + +/* + * Add positions from src to dest after offsetting them by maxpos. + * Return the number added (might be less than expected due to overflow) + */ +static int32 +add_pos(TSVector src, WordEntry *srcptr, + TSVector dest, WordEntry *destptr, + int32 maxpos) +{ + uint16 *clen = &_POSVECPTR(dest, destptr)->npos; + int i; + uint16 slen = POSDATALEN(src, srcptr), + startlen; + WordEntryPos *spos = POSDATAPTR(src, srcptr), + *dpos = POSDATAPTR(dest, destptr); + + if (!destptr->haspos) + *clen = 0; + + startlen = *clen; + for (i = 0; + i < slen && *clen < MAXNUMPOS && + (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1); + i++) + { + WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i])); + WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos)); + (*clen)++; + } + + if (*clen != startlen) + destptr->haspos = 1; + return *clen - startlen; +} + +/* + * Perform binary search of given lexeme in TSVector. + * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't + * found. + */ +static int +tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len) +{ + WordEntry *arrin = ARRPTR(tsv); + int StopLow = 0, + StopHigh = tsv->size, + StopMiddle, + cmp; + + while (StopLow < StopHigh) + { + StopMiddle = (StopLow + StopHigh) / 2; + + cmp = tsCompareString(lexeme, lexeme_len, + STRPTR(tsv) + arrin[StopMiddle].pos, + arrin[StopMiddle].len, + false); + + if (cmp < 0) + StopHigh = StopMiddle; + else if (cmp > 0) + StopLow = StopMiddle + 1; + else /* found it */ + return StopMiddle; + } + + return -1; +} + +/* + * qsort comparator functions + */ + +static int +compare_int(const void *va, const void *vb) +{ + int a = *((const int *) va); + int b = *((const int *) vb); + + if (a == b) + return 0; + return (a > b) ? 1 : -1; +} + +static int +compare_text_lexemes(const void *va, const void *vb) +{ + Datum a = *((const Datum *) va); + Datum b = *((const Datum *) vb); + char *alex = VARDATA_ANY(a); + int alex_len = VARSIZE_ANY_EXHDR(a); + char *blex = VARDATA_ANY(b); + int blex_len = VARSIZE_ANY_EXHDR(b); + + return tsCompareString(alex, alex_len, blex, blex_len, false); +} + +/* + * Internal routine to delete lexemes from TSVector by array of offsets. + * + * int *indices_to_delete -- array of lexeme offsets to delete (modified here!) + * int indices_count -- size of that array + * + * Returns new TSVector without given lexemes along with their positions + * and weights. + */ +static TSVector +tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete, + int indices_count) +{ + TSVector tsout; + WordEntry *arrin = ARRPTR(tsv), + *arrout; + char *data = STRPTR(tsv), + *dataout; + int i, /* index in arrin */ + j, /* index in arrout */ + k, /* index in indices_to_delete */ + curoff; /* index in dataout area */ + + /* + * Sort the filter array to simplify membership checks below. Also, get + * rid of any duplicate entries, so that we can assume that indices_count + * is exactly equal to the number of lexemes that will be removed. + */ + if (indices_count > 1) + { + qsort(indices_to_delete, indices_count, sizeof(int), compare_int); + indices_count = qunique(indices_to_delete, indices_count, sizeof(int), + compare_int); + } + + /* + * Here we overestimate tsout size, since we don't know how much space is + * used by the deleted lexeme(s). We will set exact size below. + */ + tsout = (TSVector) palloc0(VARSIZE(tsv)); + + /* This count must be correct because STRPTR(tsout) relies on it. */ + tsout->size = tsv->size - indices_count; + + /* + * Copy tsv to tsout, skipping lexemes listed in indices_to_delete. + */ + arrout = ARRPTR(tsout); + dataout = STRPTR(tsout); + curoff = 0; + for (i = j = k = 0; i < tsv->size; i++) + { + /* + * If current i is present in indices_to_delete, skip this lexeme. + * Since indices_to_delete is already sorted, we only need to check + * the current (k'th) entry. + */ + if (k < indices_count && i == indices_to_delete[k]) + { + k++; + continue; + } + + /* Copy lexeme and its positions and weights */ + memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len); + arrout[j].haspos = arrin[i].haspos; + arrout[j].len = arrin[i].len; + arrout[j].pos = curoff; + curoff += arrin[i].len; + if (arrin[i].haspos) + { + int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos) + + sizeof(uint16); + + curoff = SHORTALIGN(curoff); + memcpy(dataout + curoff, + STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len), + len); + curoff += len; + } + + j++; + } + + /* + * k should now be exactly equal to indices_count. If it isn't then the + * caller provided us with indices outside of [0, tsv->size) range and + * estimation of tsout's size is wrong. + */ + Assert(k == indices_count); + + SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff)); + return tsout; +} + +/* + * Delete given lexeme from tsvector. + * Implementation of user-level ts_delete(tsvector, text). + */ +Datum +tsvector_delete_str(PG_FUNCTION_ARGS) +{ + TSVector tsin = PG_GETARG_TSVECTOR(0), + tsout; + text *tlexeme = PG_GETARG_TEXT_PP(1); + char *lexeme = VARDATA_ANY(tlexeme); + int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme), + skip_index; + + if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1) + PG_RETURN_POINTER(tsin); + + tsout = tsvector_delete_by_indices(tsin, &skip_index, 1); + + PG_FREE_IF_COPY(tsin, 0); + PG_FREE_IF_COPY(tlexeme, 1); + PG_RETURN_POINTER(tsout); +} + +/* + * Delete given array of lexemes from tsvector. + * Implementation of user-level ts_delete(tsvector, text[]). + */ +Datum +tsvector_delete_arr(PG_FUNCTION_ARGS) +{ + TSVector tsin = PG_GETARG_TSVECTOR(0), + tsout; + ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1); + int i, + nlex, + skip_count, + *skip_indices; + Datum *dlexemes; + bool *nulls; + + deconstruct_array_builtin(lexemes, TEXTOID, &dlexemes, &nulls, &nlex); + + /* + * In typical use case array of lexemes to delete is relatively small. So + * here we optimize things for that scenario: iterate through lexarr + * performing binary search of each lexeme from lexarr in tsvector. + */ + skip_indices = palloc0(nlex * sizeof(int)); + for (i = skip_count = 0; i < nlex; i++) + { + char *lex; + int lex_len, + lex_pos; + + /* Ignore null array elements, they surely don't match */ + if (nulls[i]) + continue; + + lex = VARDATA(dlexemes[i]); + lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; + lex_pos = tsvector_bsearch(tsin, lex, lex_len); + + if (lex_pos >= 0) + skip_indices[skip_count++] = lex_pos; + } + + tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count); + + pfree(skip_indices); + PG_FREE_IF_COPY(tsin, 0); + PG_FREE_IF_COPY(lexemes, 1); + + PG_RETURN_POINTER(tsout); +} + +/* + * Expand tsvector as table with following columns: + * lexeme: lexeme text + * positions: integer array of lexeme positions + * weights: char array of weights corresponding to positions + */ +Datum +tsvector_unnest(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + TSVector tsin; + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + + funcctx = SRF_FIRSTCALL_INIT(); + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(3); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions", + INT2ARRAYOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights", + TEXTARRAYOID, -1, 0); + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + funcctx->tuple_desc = tupdesc; + + funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0); + + MemoryContextSwitchTo(oldcontext); + } + + funcctx = SRF_PERCALL_SETUP(); + tsin = (TSVector) funcctx->user_fctx; + + if (funcctx->call_cntr < tsin->size) + { + WordEntry *arrin = ARRPTR(tsin); + char *data = STRPTR(tsin); + HeapTuple tuple; + int j, + i = funcctx->call_cntr; + bool nulls[] = {false, false, false}; + Datum values[3]; + + values[0] = PointerGetDatum(cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len)); + + if (arrin[i].haspos) + { + WordEntryPosVector *posv; + Datum *positions; + Datum *weights; + char weight; + + /* + * Internally tsvector stores position and weight in the same + * uint16 (2 bits for weight, 14 for position). Here we extract + * that in two separate arrays. + */ + posv = _POSVECPTR(tsin, arrin + i); + positions = palloc(posv->npos * sizeof(Datum)); + weights = palloc(posv->npos * sizeof(Datum)); + for (j = 0; j < posv->npos; j++) + { + positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j])); + weight = 'D' - WEP_GETWEIGHT(posv->pos[j]); + weights[j] = PointerGetDatum(cstring_to_text_with_len(&weight, + 1)); + } + + values[1] = PointerGetDatum(construct_array_builtin(positions, posv->npos, INT2OID)); + values[2] = PointerGetDatum(construct_array_builtin(weights, posv->npos, TEXTOID)); + } + else + { + nulls[1] = nulls[2] = true; + } + + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + SRF_RETURN_DONE(funcctx); + } +} + +/* + * Convert tsvector to array of lexemes. + */ +Datum +tsvector_to_array(PG_FUNCTION_ARGS) +{ + TSVector tsin = PG_GETARG_TSVECTOR(0); + WordEntry *arrin = ARRPTR(tsin); + Datum *elements; + int i; + ArrayType *array; + + elements = palloc(tsin->size * sizeof(Datum)); + + for (i = 0; i < tsin->size; i++) + { + elements[i] = PointerGetDatum(cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, + arrin[i].len)); + } + + array = construct_array_builtin(elements, tsin->size, TEXTOID); + + pfree(elements); + PG_FREE_IF_COPY(tsin, 0); + PG_RETURN_POINTER(array); +} + +/* + * Build tsvector from array of lexemes. + */ +Datum +array_to_tsvector(PG_FUNCTION_ARGS) +{ + ArrayType *v = PG_GETARG_ARRAYTYPE_P(0); + TSVector tsout; + Datum *dlexemes; + WordEntry *arrout; + bool *nulls; + int nitems, + i, + tslen, + datalen = 0; + char *cur; + + deconstruct_array_builtin(v, TEXTOID, &dlexemes, &nulls, &nitems); + + /* + * Reject nulls and zero length strings (maybe we should just ignore them, + * instead?) + */ + for (i = 0; i < nitems; i++) + { + if (nulls[i]) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("lexeme array may not contain nulls"))); + + if (VARSIZE(dlexemes[i]) - VARHDRSZ == 0) + ereport(ERROR, + (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING), + errmsg("lexeme array may not contain empty strings"))); + } + + /* Sort and de-dup, because this is required for a valid tsvector. */ + if (nitems > 1) + { + qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes); + nitems = qunique(dlexemes, nitems, sizeof(Datum), + compare_text_lexemes); + } + + /* Calculate space needed for surviving lexemes. */ + for (i = 0; i < nitems; i++) + datalen += VARSIZE(dlexemes[i]) - VARHDRSZ; + tslen = CALCDATASIZE(nitems, datalen); + + /* Allocate and fill tsvector. */ + tsout = (TSVector) palloc0(tslen); + SET_VARSIZE(tsout, tslen); + tsout->size = nitems; + + arrout = ARRPTR(tsout); + cur = STRPTR(tsout); + for (i = 0; i < nitems; i++) + { + char *lex = VARDATA(dlexemes[i]); + int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; + + memcpy(cur, lex, lex_len); + arrout[i].haspos = 0; + arrout[i].len = lex_len; + arrout[i].pos = cur - STRPTR(tsout); + cur += lex_len; + } + + PG_FREE_IF_COPY(v, 0); + PG_RETURN_POINTER(tsout); +} + +/* + * ts_filter(): keep only lexemes with given weights in tsvector. + */ +Datum +tsvector_filter(PG_FUNCTION_ARGS) +{ + TSVector tsin = PG_GETARG_TSVECTOR(0), + tsout; + ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1); + WordEntry *arrin = ARRPTR(tsin), + *arrout; + char *datain = STRPTR(tsin), + *dataout; + Datum *dweights; + bool *nulls; + int nweights; + int i, + j; + int cur_pos = 0; + char mask = 0; + + deconstruct_array_builtin(weights, CHAROID, &dweights, &nulls, &nweights); + + for (i = 0; i < nweights; i++) + { + char char_weight; + + if (nulls[i]) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("weight array may not contain nulls"))); + + char_weight = DatumGetChar(dweights[i]); + switch (char_weight) + { + case 'A': + case 'a': + mask = mask | 8; + break; + case 'B': + case 'b': + mask = mask | 4; + break; + case 'C': + case 'c': + mask = mask | 2; + break; + case 'D': + case 'd': + mask = mask | 1; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized weight: \"%c\"", char_weight))); + } + } + + tsout = (TSVector) palloc0(VARSIZE(tsin)); + tsout->size = tsin->size; + arrout = ARRPTR(tsout); + dataout = STRPTR(tsout); + + for (i = j = 0; i < tsin->size; i++) + { + WordEntryPosVector *posvin, + *posvout; + int npos = 0; + int k; + + if (!arrin[i].haspos) + continue; + + posvin = _POSVECPTR(tsin, arrin + i); + posvout = (WordEntryPosVector *) + (dataout + SHORTALIGN(cur_pos + arrin[i].len)); + + for (k = 0; k < posvin->npos; k++) + { + if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k]))) + posvout->pos[npos++] = posvin->pos[k]; + } + + /* if no satisfactory positions found, skip lexeme */ + if (!npos) + continue; + + arrout[j].haspos = true; + arrout[j].len = arrin[i].len; + arrout[j].pos = cur_pos; + + memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len); + posvout->npos = npos; + cur_pos += SHORTALIGN(arrin[i].len); + cur_pos += POSDATALEN(tsout, arrout + j) * sizeof(WordEntryPos) + + sizeof(uint16); + j++; + } + + tsout->size = j; + if (dataout != STRPTR(tsout)) + memmove(STRPTR(tsout), dataout, cur_pos); + + SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos)); + + PG_FREE_IF_COPY(tsin, 0); + PG_RETURN_POINTER(tsout); +} + +Datum +tsvector_concat(PG_FUNCTION_ARGS) +{ + TSVector in1 = PG_GETARG_TSVECTOR(0); + TSVector in2 = PG_GETARG_TSVECTOR(1); + TSVector out; + WordEntry *ptr; + WordEntry *ptr1, + *ptr2; + WordEntryPos *p; + int maxpos = 0, + i, + j, + i1, + i2, + dataoff, + output_bytes, + output_size; + char *data, + *data1, + *data2; + + /* Get max position in in1; we'll need this to offset in2's positions */ + ptr = ARRPTR(in1); + i = in1->size; + while (i--) + { + if ((j = POSDATALEN(in1, ptr)) != 0) + { + p = POSDATAPTR(in1, ptr); + while (j--) + { + if (WEP_GETPOS(*p) > maxpos) + maxpos = WEP_GETPOS(*p); + p++; + } + } + ptr++; + } + + ptr1 = ARRPTR(in1); + ptr2 = ARRPTR(in2); + data1 = STRPTR(in1); + data2 = STRPTR(in2); + i1 = in1->size; + i2 = in2->size; + + /* + * Conservative estimate of space needed. We might need all the data in + * both inputs, and conceivably add a pad byte before position data for + * each item where there was none before. + */ + output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 + i2; + + out = (TSVector) palloc0(output_bytes); + SET_VARSIZE(out, output_bytes); + + /* + * We must make out->size valid so that STRPTR(out) is sensible. We'll + * collapse out any unused space at the end. + */ + out->size = in1->size + in2->size; + + ptr = ARRPTR(out); + data = STRPTR(out); + dataoff = 0; + while (i1 && i2) + { + int cmp = compareEntry(data1, ptr1, data2, ptr2); + + if (cmp < 0) + { /* in1 first */ + ptr->haspos = ptr1->haspos; + ptr->len = ptr1->len; + memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); + ptr->pos = dataoff; + dataoff += ptr1->len; + if (ptr->haspos) + { + dataoff = SHORTALIGN(dataoff); + memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); + dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); + } + + ptr++; + ptr1++; + i1--; + } + else if (cmp > 0) + { /* in2 first */ + ptr->haspos = ptr2->haspos; + ptr->len = ptr2->len; + memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len); + ptr->pos = dataoff; + dataoff += ptr2->len; + if (ptr->haspos) + { + int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + + if (addlen == 0) + ptr->haspos = 0; + else + { + dataoff = SHORTALIGN(dataoff); + dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); + } + } + + ptr++; + ptr2++; + i2--; + } + else + { + ptr->haspos = ptr1->haspos | ptr2->haspos; + ptr->len = ptr1->len; + memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); + ptr->pos = dataoff; + dataoff += ptr1->len; + if (ptr->haspos) + { + if (ptr1->haspos) + { + dataoff = SHORTALIGN(dataoff); + memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); + dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); + if (ptr2->haspos) + dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos); + } + else /* must have ptr2->haspos */ + { + int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + + if (addlen == 0) + ptr->haspos = 0; + else + { + dataoff = SHORTALIGN(dataoff); + dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); + } + } + } + + ptr++; + ptr1++; + ptr2++; + i1--; + i2--; + } + } + + while (i1) + { + ptr->haspos = ptr1->haspos; + ptr->len = ptr1->len; + memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); + ptr->pos = dataoff; + dataoff += ptr1->len; + if (ptr->haspos) + { + dataoff = SHORTALIGN(dataoff); + memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); + dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); + } + + ptr++; + ptr1++; + i1--; + } + + while (i2) + { + ptr->haspos = ptr2->haspos; + ptr->len = ptr2->len; + memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len); + ptr->pos = dataoff; + dataoff += ptr2->len; + if (ptr->haspos) + { + int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + + if (addlen == 0) + ptr->haspos = 0; + else + { + dataoff = SHORTALIGN(dataoff); + dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); + } + } + + ptr++; + ptr2++; + i2--; + } + + /* + * Instead of checking each offset individually, we check for overflow of + * pos fields once at the end. + */ + if (dataoff > MAXSTRPOS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("string is too long for tsvector (%d bytes, max %d bytes)", dataoff, MAXSTRPOS))); + + /* + * Adjust sizes (asserting that we didn't overrun the original estimates) + * and collapse out any unused array entries. + */ + output_size = ptr - ARRPTR(out); + Assert(output_size <= out->size); + out->size = output_size; + if (data != STRPTR(out)) + memmove(STRPTR(out), data, dataoff); + output_bytes = CALCDATASIZE(out->size, dataoff); + Assert(output_bytes <= VARSIZE(out)); + SET_VARSIZE(out, output_bytes); + + PG_FREE_IF_COPY(in1, 0); + PG_FREE_IF_COPY(in2, 1); + PG_RETURN_POINTER(out); +} + +/* + * Compare two strings by tsvector rules. + * + * if prefix = true then it returns zero value iff b has prefix a + */ +int32 +tsCompareString(char *a, int lena, char *b, int lenb, bool prefix) +{ + int cmp; + + if (lena == 0) + { + if (prefix) + cmp = 0; /* empty string is prefix of anything */ + else + cmp = (lenb > 0) ? -1 : 0; + } + else if (lenb == 0) + { + cmp = (lena > 0) ? 1 : 0; + } + else + { + cmp = memcmp(a, b, Min((unsigned int) lena, (unsigned int) lenb)); + + if (prefix) + { + if (cmp == 0 && lena > lenb) + cmp = 1; /* a is longer, so not a prefix of b */ + } + else if (cmp == 0 && lena != lenb) + { + cmp = (lena < lenb) ? -1 : 1; + } + } + + return cmp; +} + +/* + * Check weight info or/and fill 'data' with the required positions + */ +static TSTernaryValue +checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val, + ExecPhraseData *data) +{ + TSTernaryValue result = TS_NO; + + Assert(data == NULL || data->npos == 0); + + if (entry->haspos) + { + WordEntryPosVector *posvec; + + /* + * We can't use the _POSVECPTR macro here because the pointer to the + * tsvector's lexeme storage is already contained in chkval->values. + */ + posvec = (WordEntryPosVector *) + (chkval->values + SHORTALIGN(entry->pos + entry->len)); + + if (val->weight && data) + { + WordEntryPos *posvec_iter = posvec->pos; + WordEntryPos *dptr; + + /* + * Filter position information by weights + */ + dptr = data->pos = palloc(sizeof(WordEntryPos) * posvec->npos); + data->allocated = true; + + /* Is there a position with a matching weight? */ + while (posvec_iter < posvec->pos + posvec->npos) + { + /* If true, append this position to the data->pos */ + if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter))) + { + *dptr = WEP_GETPOS(*posvec_iter); + dptr++; + } + + posvec_iter++; + } + + data->npos = dptr - data->pos; + + if (data->npos > 0) + result = TS_YES; + else + { + pfree(data->pos); + data->pos = NULL; + data->allocated = false; + } + } + else if (val->weight) + { + WordEntryPos *posvec_iter = posvec->pos; + + /* Is there a position with a matching weight? */ + while (posvec_iter < posvec->pos + posvec->npos) + { + if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter))) + { + result = TS_YES; + break; /* no need to go further */ + } + + posvec_iter++; + } + } + else if (data) + { + data->npos = posvec->npos; + data->pos = posvec->pos; + data->allocated = false; + result = TS_YES; + } + else + { + /* simplest case: no weight check, positions not needed */ + result = TS_YES; + } + } + else + { + /* + * Position info is lacking, so if the caller requires it, we can only + * say that maybe there is a match. + * + * Notice, however, that we *don't* check val->weight here. + * Historically, stripped tsvectors are considered to match queries + * whether or not the query has a weight restriction; that's a little + * dubious but we'll preserve the behavior. + */ + if (data) + result = TS_MAYBE; + else + result = TS_YES; + } + + return result; +} + +/* + * TS_execute callback for matching a tsquery operand to plain tsvector data + */ +static TSTernaryValue +checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data) +{ + CHKVAL *chkval = (CHKVAL *) checkval; + WordEntry *StopLow = chkval->arrb; + WordEntry *StopHigh = chkval->arre; + WordEntry *StopMiddle = StopHigh; + TSTernaryValue res = TS_NO; + + /* Loop invariant: StopLow <= val < StopHigh */ + while (StopLow < StopHigh) + { + int difference; + + StopMiddle = StopLow + (StopHigh - StopLow) / 2; + difference = tsCompareString(chkval->operand + val->distance, + val->length, + chkval->values + StopMiddle->pos, + StopMiddle->len, + false); + + if (difference == 0) + { + /* Check weight info & fill 'data' with positions */ + res = checkclass_str(chkval, StopMiddle, val, data); + break; + } + else if (difference > 0) + StopLow = StopMiddle + 1; + else + StopHigh = StopMiddle; + } + + /* + * If it's a prefix search, we should also consider lexemes that the + * search term is a prefix of (which will necessarily immediately follow + * the place we found in the above loop). But we can skip them if there + * was a definite match on the exact term AND the caller doesn't need + * position info. + */ + if (val->prefix && (res != TS_YES || data)) + { + WordEntryPos *allpos = NULL; + int npos = 0, + totalpos = 0; + + /* adjust start position for corner case */ + if (StopLow >= StopHigh) + StopMiddle = StopHigh; + + /* we don't try to re-use any data from the initial match */ + if (data) + { + if (data->allocated) + pfree(data->pos); + data->pos = NULL; + data->allocated = false; + data->npos = 0; + } + res = TS_NO; + + while ((res != TS_YES || data) && + StopMiddle < chkval->arre && + tsCompareString(chkval->operand + val->distance, + val->length, + chkval->values + StopMiddle->pos, + StopMiddle->len, + true) == 0) + { + TSTernaryValue subres; + + subres = checkclass_str(chkval, StopMiddle, val, data); + + if (subres != TS_NO) + { + if (data) + { + /* + * We need to join position information + */ + if (subres == TS_MAYBE) + { + /* + * No position info for this match, so we must report + * MAYBE overall. + */ + res = TS_MAYBE; + /* forget any previous positions */ + npos = 0; + /* don't leak storage */ + if (allpos) + pfree(allpos); + break; + } + + while (npos + data->npos > totalpos) + { + if (totalpos == 0) + { + totalpos = 256; + allpos = palloc(sizeof(WordEntryPos) * totalpos); + } + else + { + totalpos *= 2; + allpos = repalloc(allpos, sizeof(WordEntryPos) * totalpos); + } + } + + memcpy(allpos + npos, data->pos, sizeof(WordEntryPos) * data->npos); + npos += data->npos; + + /* don't leak storage from individual matches */ + if (data->allocated) + pfree(data->pos); + data->pos = NULL; + data->allocated = false; + /* it's important to reset data->npos before next loop */ + data->npos = 0; + } + else + { + /* Don't need positions, just handle YES/MAYBE */ + if (subres == TS_YES || res == TS_NO) + res = subres; + } + } + + StopMiddle++; + } + + if (data && npos > 0) + { + /* Sort and make unique array of found positions */ + data->pos = allpos; + qsort(data->pos, npos, sizeof(WordEntryPos), compareWordEntryPos); + data->npos = qunique(data->pos, npos, sizeof(WordEntryPos), + compareWordEntryPos); + data->allocated = true; + res = TS_YES; + } + } + + return res; +} + +/* + * Compute output position list for a tsquery operator in phrase mode. + * + * Merge the position lists in Ldata and Rdata as specified by "emit", + * returning the result list into *data. The input position lists must be + * sorted and unique, and the output will be as well. + * + * data: pointer to initially-all-zeroes output struct, or NULL + * Ldata, Rdata: input position lists + * emit: bitmask of TSPO_XXX flags + * Loffset: offset to be added to Ldata positions before comparing/outputting + * Roffset: offset to be added to Rdata positions before comparing/outputting + * max_npos: maximum possible required size of output position array + * + * Loffset and Roffset should not be negative, else we risk trying to output + * negative positions, which won't fit into WordEntryPos. + * + * The result is boolean (TS_YES or TS_NO), but for the caller's convenience + * we return it as TSTernaryValue. + * + * Returns TS_YES if any positions were emitted to *data; or if data is NULL, + * returns TS_YES if any positions would have been emitted. + */ +#define TSPO_L_ONLY 0x01 /* emit positions appearing only in L */ +#define TSPO_R_ONLY 0x02 /* emit positions appearing only in R */ +#define TSPO_BOTH 0x04 /* emit positions appearing in both L&R */ + +static TSTernaryValue +TS_phrase_output(ExecPhraseData *data, + ExecPhraseData *Ldata, + ExecPhraseData *Rdata, + int emit, + int Loffset, + int Roffset, + int max_npos) +{ + int Lindex, + Rindex; + + /* Loop until both inputs are exhausted */ + Lindex = Rindex = 0; + while (Lindex < Ldata->npos || Rindex < Rdata->npos) + { + int Lpos, + Rpos; + int output_pos = 0; + + /* + * Fetch current values to compare. WEP_GETPOS() is needed because + * ExecPhraseData->data can point to a tsvector's WordEntryPosVector. + */ + if (Lindex < Ldata->npos) + Lpos = WEP_GETPOS(Ldata->pos[Lindex]) + Loffset; + else + { + /* L array exhausted, so we're done if R_ONLY isn't set */ + if (!(emit & TSPO_R_ONLY)) + break; + Lpos = INT_MAX; + } + if (Rindex < Rdata->npos) + Rpos = WEP_GETPOS(Rdata->pos[Rindex]) + Roffset; + else + { + /* R array exhausted, so we're done if L_ONLY isn't set */ + if (!(emit & TSPO_L_ONLY)) + break; + Rpos = INT_MAX; + } + + /* Merge-join the two input lists */ + if (Lpos < Rpos) + { + /* Lpos is not matched in Rdata, should we output it? */ + if (emit & TSPO_L_ONLY) + output_pos = Lpos; + Lindex++; + } + else if (Lpos == Rpos) + { + /* Lpos and Rpos match ... should we output it? */ + if (emit & TSPO_BOTH) + output_pos = Rpos; + Lindex++; + Rindex++; + } + else /* Lpos > Rpos */ + { + /* Rpos is not matched in Ldata, should we output it? */ + if (emit & TSPO_R_ONLY) + output_pos = Rpos; + Rindex++; + } + + if (output_pos > 0) + { + if (data) + { + /* Store position, first allocating output array if needed */ + if (data->pos == NULL) + { + data->pos = (WordEntryPos *) + palloc(max_npos * sizeof(WordEntryPos)); + data->allocated = true; + } + data->pos[data->npos++] = output_pos; + } + else + { + /* + * Exact positions not needed, so return TS_YES as soon as we + * know there is at least one. + */ + return TS_YES; + } + } + } + + if (data && data->npos > 0) + { + /* Let's assert we didn't overrun the array */ + Assert(data->npos <= max_npos); + return TS_YES; + } + return TS_NO; +} + +/* + * Execute tsquery at or below an OP_PHRASE operator. + * + * This handles tsquery execution at recursion levels where we need to care + * about match locations. + * + * In addition to the same arguments used for TS_execute, the caller may pass + * a preinitialized-to-zeroes ExecPhraseData struct, to be filled with lexeme + * match position info on success. data == NULL if no position data need be + * returned. + * Note: the function assumes data != NULL for operators other than OP_PHRASE. + * This is OK because an outside call always starts from an OP_PHRASE node, + * and all internal recursion cases pass data != NULL. + * + * The detailed semantics of the match data, given that the function returned + * TS_YES (successful match), are: + * + * npos > 0, negate = false: + * query is matched at specified position(s) (and only those positions) + * npos > 0, negate = true: + * query is matched at all positions *except* specified position(s) + * npos = 0, negate = true: + * query is matched at all positions + * npos = 0, negate = false: + * disallowed (this should result in TS_NO or TS_MAYBE, as appropriate) + * + * Successful matches also return a "width" value which is the match width in + * lexemes, less one. Hence, "width" is zero for simple one-lexeme matches, + * and is the sum of the phrase operator distances for phrase matches. Note + * that when width > 0, the listed positions represent the ends of matches not + * the starts. (This unintuitive rule is needed to avoid possibly generating + * negative positions, which wouldn't fit into the WordEntryPos arrays.) + * + * If the TSExecuteCallback function reports that an operand is present + * but fails to provide position(s) for it, we will return TS_MAYBE when + * it is possible but not certain that the query is matched. + * + * When the function returns TS_NO or TS_MAYBE, it must return npos = 0, + * negate = false (which is the state initialized by the caller); but the + * "width" output in such cases is undefined. + */ +static TSTernaryValue +TS_phrase_execute(QueryItem *curitem, void *arg, uint32 flags, + TSExecuteCallback chkcond, + ExecPhraseData *data) +{ + ExecPhraseData Ldata, + Rdata; + TSTernaryValue lmatch, + rmatch; + int Loffset, + Roffset, + maxwidth; + + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + /* ... and let's check for query cancel while we're at it */ + CHECK_FOR_INTERRUPTS(); + + if (curitem->type == QI_VAL) + return chkcond(arg, (QueryOperand *) curitem, data); + + switch (curitem->qoperator.oper) + { + case OP_NOT: + + /* + * We need not touch data->width, since a NOT operation does not + * change the match width. + */ + if (flags & TS_EXEC_SKIP_NOT) + { + /* with SKIP_NOT, report NOT as "match everywhere" */ + Assert(data->npos == 0 && !data->negate); + data->negate = true; + return TS_YES; + } + switch (TS_phrase_execute(curitem + 1, arg, flags, chkcond, data)) + { + case TS_NO: + /* change "match nowhere" to "match everywhere" */ + Assert(data->npos == 0 && !data->negate); + data->negate = true; + return TS_YES; + case TS_YES: + if (data->npos > 0) + { + /* we have some positions, invert negate flag */ + data->negate = !data->negate; + return TS_YES; + } + else if (data->negate) + { + /* change "match everywhere" to "match nowhere" */ + data->negate = false; + return TS_NO; + } + /* Should not get here if result was TS_YES */ + Assert(false); + break; + case TS_MAYBE: + /* match positions are, and remain, uncertain */ + return TS_MAYBE; + } + break; + + case OP_PHRASE: + case OP_AND: + memset(&Ldata, 0, sizeof(Ldata)); + memset(&Rdata, 0, sizeof(Rdata)); + + lmatch = TS_phrase_execute(curitem + curitem->qoperator.left, + arg, flags, chkcond, &Ldata); + if (lmatch == TS_NO) + return TS_NO; + + rmatch = TS_phrase_execute(curitem + 1, + arg, flags, chkcond, &Rdata); + if (rmatch == TS_NO) + return TS_NO; + + /* + * If either operand has no position information, then we can't + * return reliable position data, only a MAYBE result. + */ + if (lmatch == TS_MAYBE || rmatch == TS_MAYBE) + return TS_MAYBE; + + if (curitem->qoperator.oper == OP_PHRASE) + { + /* + * Compute Loffset and Roffset suitable for phrase match, and + * compute overall width of whole phrase match. + */ + Loffset = curitem->qoperator.distance + Rdata.width; + Roffset = 0; + if (data) + data->width = curitem->qoperator.distance + + Ldata.width + Rdata.width; + } + else + { + /* + * For OP_AND, set output width and alignment like OP_OR (see + * comment below) + */ + maxwidth = Max(Ldata.width, Rdata.width); + Loffset = maxwidth - Ldata.width; + Roffset = maxwidth - Rdata.width; + if (data) + data->width = maxwidth; + } + + if (Ldata.negate && Rdata.negate) + { + /* !L & !R: treat as !(L | R) */ + (void) TS_phrase_output(data, &Ldata, &Rdata, + TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY, + Loffset, Roffset, + Ldata.npos + Rdata.npos); + if (data) + data->negate = true; + return TS_YES; + } + else if (Ldata.negate) + { + /* !L & R */ + return TS_phrase_output(data, &Ldata, &Rdata, + TSPO_R_ONLY, + Loffset, Roffset, + Rdata.npos); + } + else if (Rdata.negate) + { + /* L & !R */ + return TS_phrase_output(data, &Ldata, &Rdata, + TSPO_L_ONLY, + Loffset, Roffset, + Ldata.npos); + } + else + { + /* straight AND */ + return TS_phrase_output(data, &Ldata, &Rdata, + TSPO_BOTH, + Loffset, Roffset, + Min(Ldata.npos, Rdata.npos)); + } + + case OP_OR: + memset(&Ldata, 0, sizeof(Ldata)); + memset(&Rdata, 0, sizeof(Rdata)); + + lmatch = TS_phrase_execute(curitem + curitem->qoperator.left, + arg, flags, chkcond, &Ldata); + rmatch = TS_phrase_execute(curitem + 1, + arg, flags, chkcond, &Rdata); + + if (lmatch == TS_NO && rmatch == TS_NO) + return TS_NO; + + /* + * If either operand has no position information, then we can't + * return reliable position data, only a MAYBE result. + */ + if (lmatch == TS_MAYBE || rmatch == TS_MAYBE) + return TS_MAYBE; + + /* + * Cope with undefined output width from failed submatch. (This + * takes less code than trying to ensure that all failure returns + * set data->width to zero.) + */ + if (lmatch == TS_NO) + Ldata.width = 0; + if (rmatch == TS_NO) + Rdata.width = 0; + + /* + * For OP_AND and OP_OR, report the width of the wider of the two + * inputs, and align the narrower input's positions to the right + * end of that width. This rule deals at least somewhat + * reasonably with cases like "x <-> (y | z <-> q)". + */ + maxwidth = Max(Ldata.width, Rdata.width); + Loffset = maxwidth - Ldata.width; + Roffset = maxwidth - Rdata.width; + data->width = maxwidth; + + if (Ldata.negate && Rdata.negate) + { + /* !L | !R: treat as !(L & R) */ + (void) TS_phrase_output(data, &Ldata, &Rdata, + TSPO_BOTH, + Loffset, Roffset, + Min(Ldata.npos, Rdata.npos)); + data->negate = true; + return TS_YES; + } + else if (Ldata.negate) + { + /* !L | R: treat as !(L & !R) */ + (void) TS_phrase_output(data, &Ldata, &Rdata, + TSPO_L_ONLY, + Loffset, Roffset, + Ldata.npos); + data->negate = true; + return TS_YES; + } + else if (Rdata.negate) + { + /* L | !R: treat as !(!L & R) */ + (void) TS_phrase_output(data, &Ldata, &Rdata, + TSPO_R_ONLY, + Loffset, Roffset, + Rdata.npos); + data->negate = true; + return TS_YES; + } + else + { + /* straight OR */ + return TS_phrase_output(data, &Ldata, &Rdata, + TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY, + Loffset, Roffset, + Ldata.npos + Rdata.npos); + } + + default: + elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); + } + + /* not reachable, but keep compiler quiet */ + return TS_NO; +} + + +/* + * Evaluate tsquery boolean expression. + * + * curitem: current tsquery item (initially, the first one) + * arg: opaque value to pass through to callback function + * flags: bitmask of flag bits shown in ts_utils.h + * chkcond: callback function to check whether a primitive value is present + */ +bool +TS_execute(QueryItem *curitem, void *arg, uint32 flags, + TSExecuteCallback chkcond) +{ + /* + * If we get TS_MAYBE from the recursion, return true. We could only see + * that result if the caller passed TS_EXEC_PHRASE_NO_POS, so there's no + * need to check again. + */ + return TS_execute_recurse(curitem, arg, flags, chkcond) != TS_NO; +} + +/* + * Evaluate tsquery boolean expression. + * + * This is the same as TS_execute except that TS_MAYBE is returned as-is. + */ +TSTernaryValue +TS_execute_ternary(QueryItem *curitem, void *arg, uint32 flags, + TSExecuteCallback chkcond) +{ + return TS_execute_recurse(curitem, arg, flags, chkcond); +} + +/* + * TS_execute recursion for operators above any phrase operator. Here we do + * not need to worry about lexeme positions. As soon as we hit an OP_PHRASE + * operator, we pass it off to TS_phrase_execute which does worry. + */ +static TSTernaryValue +TS_execute_recurse(QueryItem *curitem, void *arg, uint32 flags, + TSExecuteCallback chkcond) +{ + TSTernaryValue lmatch; + + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + /* ... and let's check for query cancel while we're at it */ + CHECK_FOR_INTERRUPTS(); + + if (curitem->type == QI_VAL) + return chkcond(arg, (QueryOperand *) curitem, + NULL /* don't need position info */ ); + + switch (curitem->qoperator.oper) + { + case OP_NOT: + if (flags & TS_EXEC_SKIP_NOT) + return TS_YES; + switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond)) + { + case TS_NO: + return TS_YES; + case TS_YES: + return TS_NO; + case TS_MAYBE: + return TS_MAYBE; + } + break; + + case OP_AND: + lmatch = TS_execute_recurse(curitem + curitem->qoperator.left, arg, + flags, chkcond); + if (lmatch == TS_NO) + return TS_NO; + switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond)) + { + case TS_NO: + return TS_NO; + case TS_YES: + return lmatch; + case TS_MAYBE: + return TS_MAYBE; + } + break; + + case OP_OR: + lmatch = TS_execute_recurse(curitem + curitem->qoperator.left, arg, + flags, chkcond); + if (lmatch == TS_YES) + return TS_YES; + switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond)) + { + case TS_NO: + return lmatch; + case TS_YES: + return TS_YES; + case TS_MAYBE: + return TS_MAYBE; + } + break; + + case OP_PHRASE: + + /* + * If we get a MAYBE result, and the caller doesn't want that, + * convert it to NO. It would be more consistent, perhaps, to + * return the result of TS_phrase_execute() verbatim and then + * convert MAYBE results at the top of the recursion. But + * converting at the topmost phrase operator gives results that + * are bug-compatible with the old implementation, so do it like + * this for now. + */ + switch (TS_phrase_execute(curitem, arg, flags, chkcond, NULL)) + { + case TS_NO: + return TS_NO; + case TS_YES: + return TS_YES; + case TS_MAYBE: + return (flags & TS_EXEC_PHRASE_NO_POS) ? TS_MAYBE : TS_NO; + } + break; + + default: + elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); + } + + /* not reachable, but keep compiler quiet */ + return TS_NO; +} + +/* + * Evaluate tsquery and report locations of matching terms. + * + * This is like TS_execute except that it returns match locations not just + * success/failure status. The callback function is required to provide + * position data (we report failure if it doesn't). + * + * On successful match, the result is a List of ExecPhraseData structs, one + * for each AND'ed term or phrase operator in the query. Each struct includes + * a sorted array of lexeme positions matching that term. (Recall that for + * phrase operators, the match includes width+1 lexemes, and the recorded + * position is that of the rightmost lexeme.) + * + * OR subexpressions are handled by union'ing their match locations into a + * single List element, which is valid since any of those locations contains + * a match. However, when some of the OR'ed terms are phrase operators, we + * report the maximum width of any of the OR'ed terms, making such cases + * slightly imprecise in the conservative direction. (For example, if the + * tsquery is "(A <-> B) | C", an occurrence of C in the data would be + * reported as though it includes the lexeme to the left of C.) + * + * Locations of NOT subexpressions are not reported. (Obviously, there can + * be no successful NOT matches at top level, or the match would have failed. + * So this amounts to ignoring NOTs underneath ORs.) + * + * The result is NIL if no match, or if position data was not returned. + * + * Arguments are the same as for TS_execute, although flags is currently + * vestigial since none of the defined bits are sensible here. + */ +List * +TS_execute_locations(QueryItem *curitem, void *arg, + uint32 flags, + TSExecuteCallback chkcond) +{ + List *result; + + /* No flags supported, as yet */ + Assert(flags == TS_EXEC_EMPTY); + if (TS_execute_locations_recurse(curitem, arg, chkcond, &result)) + return result; + return NIL; +} + +/* + * TS_execute_locations recursion for operators above any phrase operator. + * OP_PHRASE subexpressions can be passed off to TS_phrase_execute. + */ +static bool +TS_execute_locations_recurse(QueryItem *curitem, void *arg, + TSExecuteCallback chkcond, + List **locations) +{ + bool lmatch, + rmatch; + List *llocations, + *rlocations; + ExecPhraseData *data; + + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + /* ... and let's check for query cancel while we're at it */ + CHECK_FOR_INTERRUPTS(); + + /* Default locations result is empty */ + *locations = NIL; + + if (curitem->type == QI_VAL) + { + data = palloc0_object(ExecPhraseData); + if (chkcond(arg, (QueryOperand *) curitem, data) == TS_YES) + { + *locations = list_make1(data); + return true; + } + pfree(data); + return false; + } + + switch (curitem->qoperator.oper) + { + case OP_NOT: + if (!TS_execute_locations_recurse(curitem + 1, arg, chkcond, + &llocations)) + return true; /* we don't pass back any locations */ + return false; + + case OP_AND: + if (!TS_execute_locations_recurse(curitem + curitem->qoperator.left, + arg, chkcond, + &llocations)) + return false; + if (!TS_execute_locations_recurse(curitem + 1, + arg, chkcond, + &rlocations)) + return false; + *locations = list_concat(llocations, rlocations); + return true; + + case OP_OR: + lmatch = TS_execute_locations_recurse(curitem + curitem->qoperator.left, + arg, chkcond, + &llocations); + rmatch = TS_execute_locations_recurse(curitem + 1, + arg, chkcond, + &rlocations); + if (lmatch || rmatch) + { + /* + * We generate an AND'able location struct from each + * combination of sub-matches, following the disjunctive law + * (A & B) | (C & D) = (A | C) & (A | D) & (B | C) & (B | D). + * + * However, if either input didn't produce locations (i.e., it + * failed or was a NOT), we must just return the other list. + */ + if (llocations == NIL) + *locations = rlocations; + else if (rlocations == NIL) + *locations = llocations; + else + { + ListCell *ll; + + foreach(ll, llocations) + { + ExecPhraseData *ldata = (ExecPhraseData *) lfirst(ll); + ListCell *lr; + + foreach(lr, rlocations) + { + ExecPhraseData *rdata = (ExecPhraseData *) lfirst(lr); + + data = palloc0_object(ExecPhraseData); + (void) TS_phrase_output(data, ldata, rdata, + TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY, + 0, 0, + ldata->npos + rdata->npos); + /* Report the larger width, as explained above. */ + data->width = Max(ldata->width, rdata->width); + *locations = lappend(*locations, data); + } + } + } + + return true; + } + return false; + + case OP_PHRASE: + /* We can hand this off to TS_phrase_execute */ + data = palloc0_object(ExecPhraseData); + if (TS_phrase_execute(curitem, arg, TS_EXEC_EMPTY, chkcond, + data) == TS_YES) + { + if (!data->negate) + *locations = list_make1(data); + return true; + } + pfree(data); + return false; + + default: + elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); + } + + /* not reachable, but keep compiler quiet */ + return false; +} + +/* + * Detect whether a tsquery boolean expression requires any positive matches + * to values shown in the tsquery. + * + * This is needed to know whether a GIN index search requires full index scan. + * For example, 'x & !y' requires a match of x, so it's sufficient to scan + * entries for x; but 'x | !y' could match rows containing neither x nor y. + */ +bool +tsquery_requires_match(QueryItem *curitem) +{ + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + if (curitem->type == QI_VAL) + return true; + + switch (curitem->qoperator.oper) + { + case OP_NOT: + + /* + * Assume there are no required matches underneath a NOT. For + * some cases with nested NOTs, we could prove there's a required + * match, but it seems unlikely to be worth the trouble. + */ + return false; + + case OP_PHRASE: + + /* + * Treat OP_PHRASE as OP_AND here + */ + case OP_AND: + /* If either side requires a match, we're good */ + if (tsquery_requires_match(curitem + curitem->qoperator.left)) + return true; + else + return tsquery_requires_match(curitem + 1); + + case OP_OR: + /* Both sides must require a match */ + if (tsquery_requires_match(curitem + curitem->qoperator.left)) + return tsquery_requires_match(curitem + 1); + else + return false; + + default: + elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); + } + + /* not reachable, but keep compiler quiet */ + return false; +} + +/* + * boolean operations + */ +Datum +ts_match_qv(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(DirectFunctionCall2(ts_match_vq, + PG_GETARG_DATUM(1), + PG_GETARG_DATUM(0))); +} + +Datum +ts_match_vq(PG_FUNCTION_ARGS) +{ + TSVector val = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY(1); + CHKVAL chkval; + bool result; + + /* empty query matches nothing */ + if (!query->size) + { + PG_FREE_IF_COPY(val, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_BOOL(false); + } + + chkval.arrb = ARRPTR(val); + chkval.arre = chkval.arrb + val->size; + chkval.values = STRPTR(val); + chkval.operand = GETOPERAND(query); + result = TS_execute(GETQUERY(query), + &chkval, + TS_EXEC_EMPTY, + checkcondition_str); + + PG_FREE_IF_COPY(val, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_BOOL(result); +} + +Datum +ts_match_tt(PG_FUNCTION_ARGS) +{ + TSVector vector; + TSQuery query; + bool res; + + vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector, + PG_GETARG_DATUM(0))); + query = DatumGetTSQuery(DirectFunctionCall1(plainto_tsquery, + PG_GETARG_DATUM(1))); + + res = DatumGetBool(DirectFunctionCall2(ts_match_vq, + TSVectorGetDatum(vector), + TSQueryGetDatum(query))); + + pfree(vector); + pfree(query); + + PG_RETURN_BOOL(res); +} + +Datum +ts_match_tq(PG_FUNCTION_ARGS) +{ + TSVector vector; + TSQuery query = PG_GETARG_TSQUERY(1); + bool res; + + vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector, + PG_GETARG_DATUM(0))); + + res = DatumGetBool(DirectFunctionCall2(ts_match_vq, + TSVectorGetDatum(vector), + TSQueryGetDatum(query))); + + pfree(vector); + PG_FREE_IF_COPY(query, 1); + + PG_RETURN_BOOL(res); +} + +/* + * ts_stat statistic function support + */ + + +/* + * Returns the number of positions in value 'wptr' within tsvector 'txt', + * that have a weight equal to one of the weights in 'weight' bitmask. + */ +static int +check_weight(TSVector txt, WordEntry *wptr, int8 weight) +{ + int len = POSDATALEN(txt, wptr); + int num = 0; + WordEntryPos *ptr = POSDATAPTR(txt, wptr); + + while (len--) + { + if (weight & (1 << WEP_GETWEIGHT(*ptr))) + num++; + ptr++; + } + return num; +} + +#define compareStatWord(a,e,t) \ + tsCompareString((a)->lexeme, (a)->lenlexeme, \ + STRPTR(t) + (e)->pos, (e)->len, \ + false) + +static void +insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt, uint32 off) +{ + WordEntry *we = ARRPTR(txt) + off; + StatEntry *node = stat->root, + *pnode = NULL; + int n, + res = 0; + uint32 depth = 1; + + if (stat->weight == 0) + n = (we->haspos) ? POSDATALEN(txt, we) : 1; + else + n = (we->haspos) ? check_weight(txt, we, stat->weight) : 0; + + if (n == 0) + return; /* nothing to insert */ + + while (node) + { + res = compareStatWord(node, we, txt); + + if (res == 0) + { + break; + } + else + { + pnode = node; + node = (res < 0) ? node->left : node->right; + } + depth++; + } + + if (depth > stat->maxdepth) + stat->maxdepth = depth; + + if (node == NULL) + { + node = MemoryContextAlloc(persistentContext, STATENTRYHDRSZ + we->len); + node->left = node->right = NULL; + node->ndoc = 1; + node->nentry = n; + node->lenlexeme = we->len; + memcpy(node->lexeme, STRPTR(txt) + we->pos, node->lenlexeme); + + if (pnode == NULL) + { + stat->root = node; + } + else + { + if (res < 0) + pnode->left = node; + else + pnode->right = node; + } + } + else + { + node->ndoc++; + node->nentry += n; + } +} + +static void +chooseNextStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt, + uint32 low, uint32 high, uint32 offset) +{ + uint32 pos; + uint32 middle = (low + high) >> 1; + + pos = (low + middle) >> 1; + if (low != middle && pos >= offset && pos - offset < txt->size) + insertStatEntry(persistentContext, stat, txt, pos - offset); + pos = (high + middle + 1) >> 1; + if (middle + 1 != high && pos >= offset && pos - offset < txt->size) + insertStatEntry(persistentContext, stat, txt, pos - offset); + + if (low != middle) + chooseNextStatEntry(persistentContext, stat, txt, low, middle, offset); + if (high != middle + 1) + chooseNextStatEntry(persistentContext, stat, txt, middle + 1, high, offset); +} + +/* + * This is written like a custom aggregate function, because the + * original plan was to do just that. Unfortunately, an aggregate function + * can't return a set, so that plan was abandoned. If that limitation is + * lifted in the future, ts_stat could be a real aggregate function so that + * you could use it like this: + * + * SELECT ts_stat(vector_column) FROM vector_table; + * + * where vector_column is a tsvector-type column in vector_table. + */ + +static TSVectorStat * +ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data) +{ + TSVector txt = DatumGetTSVector(data); + uint32 i, + nbit = 0, + offset; + + if (stat == NULL) + { /* Init in first */ + stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat)); + stat->maxdepth = 1; + } + + /* simple check of correctness */ + if (txt == NULL || txt->size == 0) + { + if (txt && txt != (TSVector) DatumGetPointer(data)) + pfree(txt); + return stat; + } + + i = txt->size - 1; + for (; i > 0; i >>= 1) + nbit++; + + nbit = 1 << nbit; + offset = (nbit - txt->size) / 2; + + insertStatEntry(persistentContext, stat, txt, (nbit >> 1) - offset); + chooseNextStatEntry(persistentContext, stat, txt, 0, nbit, offset); + + return stat; +} + +static void +ts_setup_firstcall(FunctionCallInfo fcinfo, FuncCallContext *funcctx, + TSVectorStat *stat) +{ + TupleDesc tupdesc; + MemoryContext oldcontext; + StatEntry *node; + + funcctx->user_fctx = (void *) stat; + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + stat->stack = palloc0(sizeof(StatEntry *) * (stat->maxdepth + 1)); + stat->stackpos = 0; + + node = stat->root; + /* find leftmost value */ + if (node == NULL) + stat->stack[stat->stackpos] = NULL; + else + for (;;) + { + stat->stack[stat->stackpos] = node; + if (node->left) + { + stat->stackpos++; + node = node->left; + } + else + break; + } + Assert(stat->stackpos <= stat->maxdepth); + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + funcctx->tuple_desc = tupdesc; + funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc); + + MemoryContextSwitchTo(oldcontext); +} + +static StatEntry * +walkStatEntryTree(TSVectorStat *stat) +{ + StatEntry *node = stat->stack[stat->stackpos]; + + if (node == NULL) + return NULL; + + if (node->ndoc != 0) + { + /* return entry itself: we already was at left sublink */ + return node; + } + else if (node->right && node->right != stat->stack[stat->stackpos + 1]) + { + /* go on right sublink */ + stat->stackpos++; + node = node->right; + + /* find most-left value */ + for (;;) + { + stat->stack[stat->stackpos] = node; + if (node->left) + { + stat->stackpos++; + node = node->left; + } + else + break; + } + Assert(stat->stackpos <= stat->maxdepth); + } + else + { + /* we already return all left subtree, itself and right subtree */ + if (stat->stackpos == 0) + return NULL; + + stat->stackpos--; + return walkStatEntryTree(stat); + } + + return node; +} + +static Datum +ts_process_call(FuncCallContext *funcctx) +{ + TSVectorStat *st; + StatEntry *entry; + + st = (TSVectorStat *) funcctx->user_fctx; + + entry = walkStatEntryTree(st); + + if (entry != NULL) + { + Datum result; + char *values[3]; + char ndoc[16]; + char nentry[16]; + HeapTuple tuple; + + values[0] = palloc(entry->lenlexeme + 1); + memcpy(values[0], entry->lexeme, entry->lenlexeme); + (values[0])[entry->lenlexeme] = '\0'; + sprintf(ndoc, "%d", entry->ndoc); + values[1] = ndoc; + sprintf(nentry, "%d", entry->nentry); + values[2] = nentry; + + tuple = BuildTupleFromCStrings(funcctx->attinmeta, values); + result = HeapTupleGetDatum(tuple); + + pfree(values[0]); + + /* mark entry as already visited */ + entry->ndoc = 0; + + return result; + } + + return (Datum) 0; +} + +static TSVectorStat * +ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws) +{ + char *query = text_to_cstring(txt); + TSVectorStat *stat; + bool isnull; + Portal portal; + SPIPlanPtr plan; + + if ((plan = SPI_prepare(query, 0, NULL)) == NULL) + /* internal error */ + elog(ERROR, "SPI_prepare(\"%s\") failed", query); + + if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, true)) == NULL) + /* internal error */ + elog(ERROR, "SPI_cursor_open(\"%s\") failed", query); + + SPI_cursor_fetch(portal, true, 100); + + if (SPI_tuptable == NULL || + SPI_tuptable->tupdesc->natts != 1 || + !IsBinaryCoercible(SPI_gettypeid(SPI_tuptable->tupdesc, 1), + TSVECTOROID)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("ts_stat query must return one tsvector column"))); + + stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat)); + stat->maxdepth = 1; + + if (ws) + { + char *buf; + + buf = VARDATA_ANY(ws); + while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws)) + { + if (pg_mblen(buf) == 1) + { + switch (*buf) + { + case 'A': + case 'a': + stat->weight |= 1 << 3; + break; + case 'B': + case 'b': + stat->weight |= 1 << 2; + break; + case 'C': + case 'c': + stat->weight |= 1 << 1; + break; + case 'D': + case 'd': + stat->weight |= 1; + break; + default: + stat->weight |= 0; + } + } + buf += pg_mblen(buf); + } + } + + while (SPI_processed > 0) + { + uint64 i; + + for (i = 0; i < SPI_processed; i++) + { + Datum data = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull); + + if (!isnull) + stat = ts_accum(persistentContext, stat, data); + } + + SPI_freetuptable(SPI_tuptable); + SPI_cursor_fetch(portal, true, 100); + } + + SPI_freetuptable(SPI_tuptable); + SPI_cursor_close(portal); + SPI_freeplan(plan); + pfree(query); + + return stat; +} + +Datum +ts_stat1(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + Datum result; + + if (SRF_IS_FIRSTCALL()) + { + TSVectorStat *stat; + text *txt = PG_GETARG_TEXT_PP(0); + + funcctx = SRF_FIRSTCALL_INIT(); + SPI_connect(); + stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, NULL); + PG_FREE_IF_COPY(txt, 0); + ts_setup_firstcall(fcinfo, funcctx, stat); + SPI_finish(); + } + + funcctx = SRF_PERCALL_SETUP(); + if ((result = ts_process_call(funcctx)) != (Datum) 0) + SRF_RETURN_NEXT(funcctx, result); + SRF_RETURN_DONE(funcctx); +} + +Datum +ts_stat2(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + Datum result; + + if (SRF_IS_FIRSTCALL()) + { + TSVectorStat *stat; + text *txt = PG_GETARG_TEXT_PP(0); + text *ws = PG_GETARG_TEXT_PP(1); + + funcctx = SRF_FIRSTCALL_INIT(); + SPI_connect(); + stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, ws); + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(ws, 1); + ts_setup_firstcall(fcinfo, funcctx, stat); + SPI_finish(); + } + + funcctx = SRF_PERCALL_SETUP(); + if ((result = ts_process_call(funcctx)) != (Datum) 0) + SRF_RETURN_NEXT(funcctx, result); + SRF_RETURN_DONE(funcctx); +} + + +/* + * Triggers for automatic update of a tsvector column from text column(s) + * + * Trigger arguments are either + * name of tsvector col, name of tsconfig to use, name(s) of text col(s) + * name of tsvector col, name of regconfig col, name(s) of text col(s) + * ie, tsconfig can either be specified by name, or indirectly as the + * contents of a regconfig field in the row. If the name is used, it must + * be explicitly schema-qualified. + */ +Datum +tsvector_update_trigger_byid(PG_FUNCTION_ARGS) +{ + return tsvector_update_trigger(fcinfo, false); +} + +Datum +tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS) +{ + return tsvector_update_trigger(fcinfo, true); +} + +static Datum +tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column) +{ + TriggerData *trigdata; + Trigger *trigger; + Relation rel; + HeapTuple rettuple = NULL; + int tsvector_attr_num, + i; + ParsedText prs; + Datum datum; + bool isnull; + text *txt; + Oid cfgId; + bool update_needed; + + /* Check call context */ + if (!CALLED_AS_TRIGGER(fcinfo)) /* internal error */ + elog(ERROR, "tsvector_update_trigger: not fired by trigger manager"); + + trigdata = (TriggerData *) fcinfo->context; + if (!TRIGGER_FIRED_FOR_ROW(trigdata->tg_event)) + elog(ERROR, "tsvector_update_trigger: must be fired for row"); + if (!TRIGGER_FIRED_BEFORE(trigdata->tg_event)) + elog(ERROR, "tsvector_update_trigger: must be fired BEFORE event"); + + if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event)) + { + rettuple = trigdata->tg_trigtuple; + update_needed = true; + } + else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event)) + { + rettuple = trigdata->tg_newtuple; + update_needed = false; /* computed below */ + } + else + elog(ERROR, "tsvector_update_trigger: must be fired for INSERT or UPDATE"); + + trigger = trigdata->tg_trigger; + rel = trigdata->tg_relation; + + if (trigger->tgnargs < 3) + elog(ERROR, "tsvector_update_trigger: arguments must be tsvector_field, ts_config, text_field1, ...)"); + + /* Find the target tsvector column */ + tsvector_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[0]); + if (tsvector_attr_num == SPI_ERROR_NOATTRIBUTE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("tsvector column \"%s\" does not exist", + trigger->tgargs[0]))); + /* This will effectively reject system columns, so no separate test: */ + if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, tsvector_attr_num), + TSVECTOROID)) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("column \"%s\" is not of tsvector type", + trigger->tgargs[0]))); + + /* Find the configuration to use */ + if (config_column) + { + int config_attr_num; + + config_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[1]); + if (config_attr_num == SPI_ERROR_NOATTRIBUTE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("configuration column \"%s\" does not exist", + trigger->tgargs[1]))); + if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, config_attr_num), + REGCONFIGOID)) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("column \"%s\" is not of regconfig type", + trigger->tgargs[1]))); + + datum = SPI_getbinval(rettuple, rel->rd_att, config_attr_num, &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("configuration column \"%s\" must not be null", + trigger->tgargs[1]))); + cfgId = DatumGetObjectId(datum); + } + else + { + List *names; + + names = stringToQualifiedNameList(trigger->tgargs[1], NULL); + /* require a schema so that results are not search path dependent */ + if (list_length(names) < 2) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("text search configuration name \"%s\" must be schema-qualified", + trigger->tgargs[1]))); + cfgId = get_ts_config_oid(names, false); + } + + /* initialize parse state */ + prs.lenwords = 32; + prs.curwords = 0; + prs.pos = 0; + prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); + + /* find all words in indexable column(s) */ + for (i = 2; i < trigger->tgnargs; i++) + { + int numattr; + + numattr = SPI_fnumber(rel->rd_att, trigger->tgargs[i]); + if (numattr == SPI_ERROR_NOATTRIBUTE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column \"%s\" does not exist", + trigger->tgargs[i]))); + if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, numattr), TEXTOID)) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("column \"%s\" is not of a character type", + trigger->tgargs[i]))); + + if (bms_is_member(numattr - FirstLowInvalidHeapAttributeNumber, trigdata->tg_updatedcols)) + update_needed = true; + + datum = SPI_getbinval(rettuple, rel->rd_att, numattr, &isnull); + if (isnull) + continue; + + txt = DatumGetTextPP(datum); + + parsetext(cfgId, &prs, VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt)); + + if (txt != (text *) DatumGetPointer(datum)) + pfree(txt); + } + + if (update_needed) + { + /* make tsvector value */ + datum = TSVectorGetDatum(make_tsvector(&prs)); + isnull = false; + + /* and insert it into tuple */ + rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att, + 1, &tsvector_attr_num, + &datum, &isnull); + + pfree(DatumGetPointer(datum)); + } + + return PointerGetDatum(rettuple); +} |