summaryrefslogtreecommitdiffstats
path: root/src/backend/utils/adt/tsvector_op.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/adt/tsvector_op.c')
-rw-r--r--src/backend/utils/adt/tsvector_op.c2893
1 files changed, 2893 insertions, 0 deletions
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
new file mode 100644
index 0000000..4457c5d
--- /dev/null
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -0,0 +1,2893 @@
+/*-------------------------------------------------------------------------
+ *
+ * tsvector_op.c
+ * operations over tsvector
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/adt/tsvector_op.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "access/htup_details.h"
+#include "catalog/namespace.h"
+#include "catalog/pg_type.h"
+#include "commands/trigger.h"
+#include "executor/spi.h"
+#include "funcapi.h"
+#include "lib/qunique.h"
+#include "mb/pg_wchar.h"
+#include "miscadmin.h"
+#include "parser/parse_coerce.h"
+#include "tsearch/ts_utils.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/regproc.h"
+#include "utils/rel.h"
+
+
+typedef struct
+{
+ WordEntry *arrb;
+ WordEntry *arre;
+ char *values;
+ char *operand;
+} CHKVAL;
+
+
+typedef struct StatEntry
+{
+ uint32 ndoc; /* zero indicates that we were already here
+ * while walking through the tree */
+ uint32 nentry;
+ struct StatEntry *left;
+ struct StatEntry *right;
+ uint32 lenlexeme;
+ char lexeme[FLEXIBLE_ARRAY_MEMBER];
+} StatEntry;
+
+#define STATENTRYHDRSZ (offsetof(StatEntry, lexeme))
+
+typedef struct
+{
+ int32 weight;
+
+ uint32 maxdepth;
+
+ StatEntry **stack;
+ uint32 stackpos;
+
+ StatEntry *root;
+} TSVectorStat;
+
+
+static TSTernaryValue TS_execute_recurse(QueryItem *curitem, void *arg,
+ uint32 flags,
+ TSExecuteCallback chkcond);
+static bool TS_execute_locations_recurse(QueryItem *curitem,
+ void *arg,
+ TSExecuteCallback chkcond,
+ List **locations);
+static int tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len);
+static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
+
+
+/*
+ * Order: haspos, len, word, for all positions (pos, weight)
+ */
+static int
+silly_cmp_tsvector(const TSVector a, const TSVector b)
+{
+ if (VARSIZE(a) < VARSIZE(b))
+ return -1;
+ else if (VARSIZE(a) > VARSIZE(b))
+ return 1;
+ else if (a->size < b->size)
+ return -1;
+ else if (a->size > b->size)
+ return 1;
+ else
+ {
+ WordEntry *aptr = ARRPTR(a);
+ WordEntry *bptr = ARRPTR(b);
+ int i = 0;
+ int res;
+
+
+ for (i = 0; i < a->size; i++)
+ {
+ if (aptr->haspos != bptr->haspos)
+ {
+ return (aptr->haspos > bptr->haspos) ? -1 : 1;
+ }
+ else if ((res = tsCompareString(STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) != 0)
+ {
+ return res;
+ }
+ else if (aptr->haspos)
+ {
+ WordEntryPos *ap = POSDATAPTR(a, aptr);
+ WordEntryPos *bp = POSDATAPTR(b, bptr);
+ int j;
+
+ if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr))
+ return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1;
+
+ for (j = 0; j < POSDATALEN(a, aptr); j++)
+ {
+ if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp))
+ {
+ return (WEP_GETPOS(*ap) > WEP_GETPOS(*bp)) ? -1 : 1;
+ }
+ else if (WEP_GETWEIGHT(*ap) != WEP_GETWEIGHT(*bp))
+ {
+ return (WEP_GETWEIGHT(*ap) > WEP_GETWEIGHT(*bp)) ? -1 : 1;
+ }
+ ap++, bp++;
+ }
+ }
+
+ aptr++;
+ bptr++;
+ }
+ }
+
+ return 0;
+}
+
+#define TSVECTORCMPFUNC( type, action, ret ) \
+Datum \
+tsvector_##type(PG_FUNCTION_ARGS) \
+{ \
+ TSVector a = PG_GETARG_TSVECTOR(0); \
+ TSVector b = PG_GETARG_TSVECTOR(1); \
+ int res = silly_cmp_tsvector(a, b); \
+ PG_FREE_IF_COPY(a,0); \
+ PG_FREE_IF_COPY(b,1); \
+ PG_RETURN_##ret( res action 0 ); \
+} \
+/* keep compiler quiet - no extra ; */ \
+extern int no_such_variable
+
+TSVECTORCMPFUNC(lt, <, BOOL);
+TSVECTORCMPFUNC(le, <=, BOOL);
+TSVECTORCMPFUNC(eq, ==, BOOL);
+TSVECTORCMPFUNC(ge, >=, BOOL);
+TSVECTORCMPFUNC(gt, >, BOOL);
+TSVECTORCMPFUNC(ne, !=, BOOL);
+TSVECTORCMPFUNC(cmp, +, INT32);
+
+Datum
+tsvector_strip(PG_FUNCTION_ARGS)
+{
+ TSVector in = PG_GETARG_TSVECTOR(0);
+ TSVector out;
+ int i,
+ len = 0;
+ WordEntry *arrin = ARRPTR(in),
+ *arrout;
+ char *cur;
+
+ for (i = 0; i < in->size; i++)
+ len += arrin[i].len;
+
+ len = CALCDATASIZE(in->size, len);
+ out = (TSVector) palloc0(len);
+ SET_VARSIZE(out, len);
+ out->size = in->size;
+ arrout = ARRPTR(out);
+ cur = STRPTR(out);
+ for (i = 0; i < in->size; i++)
+ {
+ memcpy(cur, STRPTR(in) + arrin[i].pos, arrin[i].len);
+ arrout[i].haspos = 0;
+ arrout[i].len = arrin[i].len;
+ arrout[i].pos = cur - STRPTR(out);
+ cur += arrout[i].len;
+ }
+
+ PG_FREE_IF_COPY(in, 0);
+ PG_RETURN_POINTER(out);
+}
+
+Datum
+tsvector_length(PG_FUNCTION_ARGS)
+{
+ TSVector in = PG_GETARG_TSVECTOR(0);
+ int32 ret = in->size;
+
+ PG_FREE_IF_COPY(in, 0);
+ PG_RETURN_INT32(ret);
+}
+
+Datum
+tsvector_setweight(PG_FUNCTION_ARGS)
+{
+ TSVector in = PG_GETARG_TSVECTOR(0);
+ char cw = PG_GETARG_CHAR(1);
+ TSVector out;
+ int i,
+ j;
+ WordEntry *entry;
+ WordEntryPos *p;
+ int w = 0;
+
+ switch (cw)
+ {
+ case 'A':
+ case 'a':
+ w = 3;
+ break;
+ case 'B':
+ case 'b':
+ w = 2;
+ break;
+ case 'C':
+ case 'c':
+ w = 1;
+ break;
+ case 'D':
+ case 'd':
+ w = 0;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %d", cw);
+ }
+
+ out = (TSVector) palloc(VARSIZE(in));
+ memcpy(out, in, VARSIZE(in));
+ entry = ARRPTR(out);
+ i = out->size;
+ while (i--)
+ {
+ if ((j = POSDATALEN(out, entry)) != 0)
+ {
+ p = POSDATAPTR(out, entry);
+ while (j--)
+ {
+ WEP_SETWEIGHT(*p, w);
+ p++;
+ }
+ }
+ entry++;
+ }
+
+ PG_FREE_IF_COPY(in, 0);
+ PG_RETURN_POINTER(out);
+}
+
+/*
+ * setweight(tsin tsvector, char_weight "char", lexemes "text"[])
+ *
+ * Assign weight w to elements of tsin that are listed in lexemes.
+ */
+Datum
+tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ char char_weight = PG_GETARG_CHAR(1);
+ ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2);
+
+ TSVector tsout;
+ int i,
+ j,
+ nlexemes,
+ weight;
+ WordEntry *entry;
+ Datum *dlexemes;
+ bool *nulls;
+
+ switch (char_weight)
+ {
+ case 'A':
+ case 'a':
+ weight = 3;
+ break;
+ case 'B':
+ case 'b':
+ weight = 2;
+ break;
+ case 'C':
+ case 'c':
+ weight = 1;
+ break;
+ case 'D':
+ case 'd':
+ weight = 0;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", char_weight);
+ }
+
+ tsout = (TSVector) palloc(VARSIZE(tsin));
+ memcpy(tsout, tsin, VARSIZE(tsin));
+ entry = ARRPTR(tsout);
+
+ deconstruct_array_builtin(lexemes, TEXTOID, &dlexemes, &nulls, &nlexemes);
+
+ /*
+ * Assuming that lexemes array is significantly shorter than tsvector we
+ * can iterate through lexemes performing binary search of each lexeme
+ * from lexemes in tsvector.
+ */
+ for (i = 0; i < nlexemes; i++)
+ {
+ char *lex;
+ int lex_len,
+ lex_pos;
+
+ /* Ignore null array elements, they surely don't match */
+ if (nulls[i])
+ continue;
+
+ lex = VARDATA(dlexemes[i]);
+ lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
+ lex_pos = tsvector_bsearch(tsout, lex, lex_len);
+
+ if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
+ {
+ WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
+
+ while (j--)
+ {
+ WEP_SETWEIGHT(*p, weight);
+ p++;
+ }
+ }
+ }
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(lexemes, 2);
+
+ PG_RETURN_POINTER(tsout);
+}
+
+#define compareEntry(pa, a, pb, b) \
+ tsCompareString((pa) + (a)->pos, (a)->len, \
+ (pb) + (b)->pos, (b)->len, \
+ false)
+
+/*
+ * Add positions from src to dest after offsetting them by maxpos.
+ * Return the number added (might be less than expected due to overflow)
+ */
+static int32
+add_pos(TSVector src, WordEntry *srcptr,
+ TSVector dest, WordEntry *destptr,
+ int32 maxpos)
+{
+ uint16 *clen = &_POSVECPTR(dest, destptr)->npos;
+ int i;
+ uint16 slen = POSDATALEN(src, srcptr),
+ startlen;
+ WordEntryPos *spos = POSDATAPTR(src, srcptr),
+ *dpos = POSDATAPTR(dest, destptr);
+
+ if (!destptr->haspos)
+ *clen = 0;
+
+ startlen = *clen;
+ for (i = 0;
+ i < slen && *clen < MAXNUMPOS &&
+ (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1);
+ i++)
+ {
+ WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i]));
+ WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos));
+ (*clen)++;
+ }
+
+ if (*clen != startlen)
+ destptr->haspos = 1;
+ return *clen - startlen;
+}
+
+/*
+ * Perform binary search of given lexeme in TSVector.
+ * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
+ * found.
+ */
+static int
+tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
+{
+ WordEntry *arrin = ARRPTR(tsv);
+ int StopLow = 0,
+ StopHigh = tsv->size,
+ StopMiddle,
+ cmp;
+
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = (StopLow + StopHigh) / 2;
+
+ cmp = tsCompareString(lexeme, lexeme_len,
+ STRPTR(tsv) + arrin[StopMiddle].pos,
+ arrin[StopMiddle].len,
+ false);
+
+ if (cmp < 0)
+ StopHigh = StopMiddle;
+ else if (cmp > 0)
+ StopLow = StopMiddle + 1;
+ else /* found it */
+ return StopMiddle;
+ }
+
+ return -1;
+}
+
+/*
+ * qsort comparator functions
+ */
+
+static int
+compare_int(const void *va, const void *vb)
+{
+ int a = *((const int *) va);
+ int b = *((const int *) vb);
+
+ if (a == b)
+ return 0;
+ return (a > b) ? 1 : -1;
+}
+
+static int
+compare_text_lexemes(const void *va, const void *vb)
+{
+ Datum a = *((const Datum *) va);
+ Datum b = *((const Datum *) vb);
+ char *alex = VARDATA_ANY(a);
+ int alex_len = VARSIZE_ANY_EXHDR(a);
+ char *blex = VARDATA_ANY(b);
+ int blex_len = VARSIZE_ANY_EXHDR(b);
+
+ return tsCompareString(alex, alex_len, blex, blex_len, false);
+}
+
+/*
+ * Internal routine to delete lexemes from TSVector by array of offsets.
+ *
+ * int *indices_to_delete -- array of lexeme offsets to delete (modified here!)
+ * int indices_count -- size of that array
+ *
+ * Returns new TSVector without given lexemes along with their positions
+ * and weights.
+ */
+static TSVector
+tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
+ int indices_count)
+{
+ TSVector tsout;
+ WordEntry *arrin = ARRPTR(tsv),
+ *arrout;
+ char *data = STRPTR(tsv),
+ *dataout;
+ int i, /* index in arrin */
+ j, /* index in arrout */
+ k, /* index in indices_to_delete */
+ curoff; /* index in dataout area */
+
+ /*
+ * Sort the filter array to simplify membership checks below. Also, get
+ * rid of any duplicate entries, so that we can assume that indices_count
+ * is exactly equal to the number of lexemes that will be removed.
+ */
+ if (indices_count > 1)
+ {
+ qsort(indices_to_delete, indices_count, sizeof(int), compare_int);
+ indices_count = qunique(indices_to_delete, indices_count, sizeof(int),
+ compare_int);
+ }
+
+ /*
+ * Here we overestimate tsout size, since we don't know how much space is
+ * used by the deleted lexeme(s). We will set exact size below.
+ */
+ tsout = (TSVector) palloc0(VARSIZE(tsv));
+
+ /* This count must be correct because STRPTR(tsout) relies on it. */
+ tsout->size = tsv->size - indices_count;
+
+ /*
+ * Copy tsv to tsout, skipping lexemes listed in indices_to_delete.
+ */
+ arrout = ARRPTR(tsout);
+ dataout = STRPTR(tsout);
+ curoff = 0;
+ for (i = j = k = 0; i < tsv->size; i++)
+ {
+ /*
+ * If current i is present in indices_to_delete, skip this lexeme.
+ * Since indices_to_delete is already sorted, we only need to check
+ * the current (k'th) entry.
+ */
+ if (k < indices_count && i == indices_to_delete[k])
+ {
+ k++;
+ continue;
+ }
+
+ /* Copy lexeme and its positions and weights */
+ memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
+ arrout[j].haspos = arrin[i].haspos;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = curoff;
+ curoff += arrin[i].len;
+ if (arrin[i].haspos)
+ {
+ int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos)
+ + sizeof(uint16);
+
+ curoff = SHORTALIGN(curoff);
+ memcpy(dataout + curoff,
+ STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len),
+ len);
+ curoff += len;
+ }
+
+ j++;
+ }
+
+ /*
+ * k should now be exactly equal to indices_count. If it isn't then the
+ * caller provided us with indices outside of [0, tsv->size) range and
+ * estimation of tsout's size is wrong.
+ */
+ Assert(k == indices_count);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
+ return tsout;
+}
+
+/*
+ * Delete given lexeme from tsvector.
+ * Implementation of user-level ts_delete(tsvector, text).
+ */
+Datum
+tsvector_delete_str(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ text *tlexeme = PG_GETARG_TEXT_PP(1);
+ char *lexeme = VARDATA_ANY(tlexeme);
+ int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme),
+ skip_index;
+
+ if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1)
+ PG_RETURN_POINTER(tsin);
+
+ tsout = tsvector_delete_by_indices(tsin, &skip_index, 1);
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(tlexeme, 1);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Delete given array of lexemes from tsvector.
+ * Implementation of user-level ts_delete(tsvector, text[]).
+ */
+Datum
+tsvector_delete_arr(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1);
+ int i,
+ nlex,
+ skip_count,
+ *skip_indices;
+ Datum *dlexemes;
+ bool *nulls;
+
+ deconstruct_array_builtin(lexemes, TEXTOID, &dlexemes, &nulls, &nlex);
+
+ /*
+ * In typical use case array of lexemes to delete is relatively small. So
+ * here we optimize things for that scenario: iterate through lexarr
+ * performing binary search of each lexeme from lexarr in tsvector.
+ */
+ skip_indices = palloc0(nlex * sizeof(int));
+ for (i = skip_count = 0; i < nlex; i++)
+ {
+ char *lex;
+ int lex_len,
+ lex_pos;
+
+ /* Ignore null array elements, they surely don't match */
+ if (nulls[i])
+ continue;
+
+ lex = VARDATA(dlexemes[i]);
+ lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
+ lex_pos = tsvector_bsearch(tsin, lex, lex_len);
+
+ if (lex_pos >= 0)
+ skip_indices[skip_count++] = lex_pos;
+ }
+
+ tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count);
+
+ pfree(skip_indices);
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(lexemes, 1);
+
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Expand tsvector as table with following columns:
+ * lexeme: lexeme text
+ * positions: integer array of lexeme positions
+ * weights: char array of weights corresponding to positions
+ */
+Datum
+tsvector_unnest(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ TSVector tsin;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ MemoryContext oldcontext;
+ TupleDesc tupdesc;
+
+ funcctx = SRF_FIRSTCALL_INIT();
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ tupdesc = CreateTemplateTupleDesc(3);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
+ TEXTOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
+ INT2ARRAYOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
+ TEXTARRAYOID, -1, 0);
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+ funcctx->tuple_desc = tupdesc;
+
+ funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0);
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+ tsin = (TSVector) funcctx->user_fctx;
+
+ if (funcctx->call_cntr < tsin->size)
+ {
+ WordEntry *arrin = ARRPTR(tsin);
+ char *data = STRPTR(tsin);
+ HeapTuple tuple;
+ int j,
+ i = funcctx->call_cntr;
+ bool nulls[] = {false, false, false};
+ Datum values[3];
+
+ values[0] = PointerGetDatum(cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len));
+
+ if (arrin[i].haspos)
+ {
+ WordEntryPosVector *posv;
+ Datum *positions;
+ Datum *weights;
+ char weight;
+
+ /*
+ * Internally tsvector stores position and weight in the same
+ * uint16 (2 bits for weight, 14 for position). Here we extract
+ * that in two separate arrays.
+ */
+ posv = _POSVECPTR(tsin, arrin + i);
+ positions = palloc(posv->npos * sizeof(Datum));
+ weights = palloc(posv->npos * sizeof(Datum));
+ for (j = 0; j < posv->npos; j++)
+ {
+ positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
+ weight = 'D' - WEP_GETWEIGHT(posv->pos[j]);
+ weights[j] = PointerGetDatum(cstring_to_text_with_len(&weight,
+ 1));
+ }
+
+ values[1] = PointerGetDatum(construct_array_builtin(positions, posv->npos, INT2OID));
+ values[2] = PointerGetDatum(construct_array_builtin(weights, posv->npos, TEXTOID));
+ }
+ else
+ {
+ nulls[1] = nulls[2] = true;
+ }
+
+ tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+ SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
+ }
+ else
+ {
+ SRF_RETURN_DONE(funcctx);
+ }
+}
+
+/*
+ * Convert tsvector to array of lexemes.
+ */
+Datum
+tsvector_to_array(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ WordEntry *arrin = ARRPTR(tsin);
+ Datum *elements;
+ int i;
+ ArrayType *array;
+
+ elements = palloc(tsin->size * sizeof(Datum));
+
+ for (i = 0; i < tsin->size; i++)
+ {
+ elements[i] = PointerGetDatum(cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos,
+ arrin[i].len));
+ }
+
+ array = construct_array_builtin(elements, tsin->size, TEXTOID);
+
+ pfree(elements);
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(array);
+}
+
+/*
+ * Build tsvector from array of lexemes.
+ */
+Datum
+array_to_tsvector(PG_FUNCTION_ARGS)
+{
+ ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
+ TSVector tsout;
+ Datum *dlexemes;
+ WordEntry *arrout;
+ bool *nulls;
+ int nitems,
+ i,
+ tslen,
+ datalen = 0;
+ char *cur;
+
+ deconstruct_array_builtin(v, TEXTOID, &dlexemes, &nulls, &nitems);
+
+ /*
+ * Reject nulls and zero length strings (maybe we should just ignore them,
+ * instead?)
+ */
+ for (i = 0; i < nitems; i++)
+ {
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+ errmsg("lexeme array may not contain nulls")));
+
+ if (VARSIZE(dlexemes[i]) - VARHDRSZ == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
+ errmsg("lexeme array may not contain empty strings")));
+ }
+
+ /* Sort and de-dup, because this is required for a valid tsvector. */
+ if (nitems > 1)
+ {
+ qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes);
+ nitems = qunique(dlexemes, nitems, sizeof(Datum),
+ compare_text_lexemes);
+ }
+
+ /* Calculate space needed for surviving lexemes. */
+ for (i = 0; i < nitems; i++)
+ datalen += VARSIZE(dlexemes[i]) - VARHDRSZ;
+ tslen = CALCDATASIZE(nitems, datalen);
+
+ /* Allocate and fill tsvector. */
+ tsout = (TSVector) palloc0(tslen);
+ SET_VARSIZE(tsout, tslen);
+ tsout->size = nitems;
+
+ arrout = ARRPTR(tsout);
+ cur = STRPTR(tsout);
+ for (i = 0; i < nitems; i++)
+ {
+ char *lex = VARDATA(dlexemes[i]);
+ int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
+
+ memcpy(cur, lex, lex_len);
+ arrout[i].haspos = 0;
+ arrout[i].len = lex_len;
+ arrout[i].pos = cur - STRPTR(tsout);
+ cur += lex_len;
+ }
+
+ PG_FREE_IF_COPY(v, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * ts_filter(): keep only lexemes with given weights in tsvector.
+ */
+Datum
+tsvector_filter(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
+ WordEntry *arrin = ARRPTR(tsin),
+ *arrout;
+ char *datain = STRPTR(tsin),
+ *dataout;
+ Datum *dweights;
+ bool *nulls;
+ int nweights;
+ int i,
+ j;
+ int cur_pos = 0;
+ char mask = 0;
+
+ deconstruct_array_builtin(weights, CHAROID, &dweights, &nulls, &nweights);
+
+ for (i = 0; i < nweights; i++)
+ {
+ char char_weight;
+
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+ errmsg("weight array may not contain nulls")));
+
+ char_weight = DatumGetChar(dweights[i]);
+ switch (char_weight)
+ {
+ case 'A':
+ case 'a':
+ mask = mask | 8;
+ break;
+ case 'B':
+ case 'b':
+ mask = mask | 4;
+ break;
+ case 'C':
+ case 'c':
+ mask = mask | 2;
+ break;
+ case 'D':
+ case 'd':
+ mask = mask | 1;
+ break;
+ default:
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized weight: \"%c\"", char_weight)));
+ }
+ }
+
+ tsout = (TSVector) palloc0(VARSIZE(tsin));
+ tsout->size = tsin->size;
+ arrout = ARRPTR(tsout);
+ dataout = STRPTR(tsout);
+
+ for (i = j = 0; i < tsin->size; i++)
+ {
+ WordEntryPosVector *posvin,
+ *posvout;
+ int npos = 0;
+ int k;
+
+ if (!arrin[i].haspos)
+ continue;
+
+ posvin = _POSVECPTR(tsin, arrin + i);
+ posvout = (WordEntryPosVector *)
+ (dataout + SHORTALIGN(cur_pos + arrin[i].len));
+
+ for (k = 0; k < posvin->npos; k++)
+ {
+ if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
+ posvout->pos[npos++] = posvin->pos[k];
+ }
+
+ /* if no satisfactory positions found, skip lexeme */
+ if (!npos)
+ continue;
+
+ arrout[j].haspos = true;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = cur_pos;
+
+ memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
+ posvout->npos = npos;
+ cur_pos += SHORTALIGN(arrin[i].len);
+ cur_pos += POSDATALEN(tsout, arrout + j) * sizeof(WordEntryPos) +
+ sizeof(uint16);
+ j++;
+ }
+
+ tsout->size = j;
+ if (dataout != STRPTR(tsout))
+ memmove(STRPTR(tsout), dataout, cur_pos);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
+Datum
+tsvector_concat(PG_FUNCTION_ARGS)
+{
+ TSVector in1 = PG_GETARG_TSVECTOR(0);
+ TSVector in2 = PG_GETARG_TSVECTOR(1);
+ TSVector out;
+ WordEntry *ptr;
+ WordEntry *ptr1,
+ *ptr2;
+ WordEntryPos *p;
+ int maxpos = 0,
+ i,
+ j,
+ i1,
+ i2,
+ dataoff,
+ output_bytes,
+ output_size;
+ char *data,
+ *data1,
+ *data2;
+
+ /* Get max position in in1; we'll need this to offset in2's positions */
+ ptr = ARRPTR(in1);
+ i = in1->size;
+ while (i--)
+ {
+ if ((j = POSDATALEN(in1, ptr)) != 0)
+ {
+ p = POSDATAPTR(in1, ptr);
+ while (j--)
+ {
+ if (WEP_GETPOS(*p) > maxpos)
+ maxpos = WEP_GETPOS(*p);
+ p++;
+ }
+ }
+ ptr++;
+ }
+
+ ptr1 = ARRPTR(in1);
+ ptr2 = ARRPTR(in2);
+ data1 = STRPTR(in1);
+ data2 = STRPTR(in2);
+ i1 = in1->size;
+ i2 = in2->size;
+
+ /*
+ * Conservative estimate of space needed. We might need all the data in
+ * both inputs, and conceivably add a pad byte before position data for
+ * each item where there was none before.
+ */
+ output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 + i2;
+
+ out = (TSVector) palloc0(output_bytes);
+ SET_VARSIZE(out, output_bytes);
+
+ /*
+ * We must make out->size valid so that STRPTR(out) is sensible. We'll
+ * collapse out any unused space at the end.
+ */
+ out->size = in1->size + in2->size;
+
+ ptr = ARRPTR(out);
+ data = STRPTR(out);
+ dataoff = 0;
+ while (i1 && i2)
+ {
+ int cmp = compareEntry(data1, ptr1, data2, ptr2);
+
+ if (cmp < 0)
+ { /* in1 first */
+ ptr->haspos = ptr1->haspos;
+ ptr->len = ptr1->len;
+ memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
+ ptr->pos = dataoff;
+ dataoff += ptr1->len;
+ if (ptr->haspos)
+ {
+ dataoff = SHORTALIGN(dataoff);
+ memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
+ dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
+ }
+
+ ptr++;
+ ptr1++;
+ i1--;
+ }
+ else if (cmp > 0)
+ { /* in2 first */
+ ptr->haspos = ptr2->haspos;
+ ptr->len = ptr2->len;
+ memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
+ ptr->pos = dataoff;
+ dataoff += ptr2->len;
+ if (ptr->haspos)
+ {
+ int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
+
+ if (addlen == 0)
+ ptr->haspos = 0;
+ else
+ {
+ dataoff = SHORTALIGN(dataoff);
+ dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+ }
+ }
+
+ ptr++;
+ ptr2++;
+ i2--;
+ }
+ else
+ {
+ ptr->haspos = ptr1->haspos | ptr2->haspos;
+ ptr->len = ptr1->len;
+ memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
+ ptr->pos = dataoff;
+ dataoff += ptr1->len;
+ if (ptr->haspos)
+ {
+ if (ptr1->haspos)
+ {
+ dataoff = SHORTALIGN(dataoff);
+ memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
+ dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
+ if (ptr2->haspos)
+ dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos);
+ }
+ else /* must have ptr2->haspos */
+ {
+ int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
+
+ if (addlen == 0)
+ ptr->haspos = 0;
+ else
+ {
+ dataoff = SHORTALIGN(dataoff);
+ dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+ }
+ }
+ }
+
+ ptr++;
+ ptr1++;
+ ptr2++;
+ i1--;
+ i2--;
+ }
+ }
+
+ while (i1)
+ {
+ ptr->haspos = ptr1->haspos;
+ ptr->len = ptr1->len;
+ memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
+ ptr->pos = dataoff;
+ dataoff += ptr1->len;
+ if (ptr->haspos)
+ {
+ dataoff = SHORTALIGN(dataoff);
+ memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
+ dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
+ }
+
+ ptr++;
+ ptr1++;
+ i1--;
+ }
+
+ while (i2)
+ {
+ ptr->haspos = ptr2->haspos;
+ ptr->len = ptr2->len;
+ memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
+ ptr->pos = dataoff;
+ dataoff += ptr2->len;
+ if (ptr->haspos)
+ {
+ int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
+
+ if (addlen == 0)
+ ptr->haspos = 0;
+ else
+ {
+ dataoff = SHORTALIGN(dataoff);
+ dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+ }
+ }
+
+ ptr++;
+ ptr2++;
+ i2--;
+ }
+
+ /*
+ * Instead of checking each offset individually, we check for overflow of
+ * pos fields once at the end.
+ */
+ if (dataoff > MAXSTRPOS)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("string is too long for tsvector (%d bytes, max %d bytes)", dataoff, MAXSTRPOS)));
+
+ /*
+ * Adjust sizes (asserting that we didn't overrun the original estimates)
+ * and collapse out any unused array entries.
+ */
+ output_size = ptr - ARRPTR(out);
+ Assert(output_size <= out->size);
+ out->size = output_size;
+ if (data != STRPTR(out))
+ memmove(STRPTR(out), data, dataoff);
+ output_bytes = CALCDATASIZE(out->size, dataoff);
+ Assert(output_bytes <= VARSIZE(out));
+ SET_VARSIZE(out, output_bytes);
+
+ PG_FREE_IF_COPY(in1, 0);
+ PG_FREE_IF_COPY(in2, 1);
+ PG_RETURN_POINTER(out);
+}
+
+/*
+ * Compare two strings by tsvector rules.
+ *
+ * if prefix = true then it returns zero value iff b has prefix a
+ */
+int32
+tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
+{
+ int cmp;
+
+ if (lena == 0)
+ {
+ if (prefix)
+ cmp = 0; /* empty string is prefix of anything */
+ else
+ cmp = (lenb > 0) ? -1 : 0;
+ }
+ else if (lenb == 0)
+ {
+ cmp = (lena > 0) ? 1 : 0;
+ }
+ else
+ {
+ cmp = memcmp(a, b, Min((unsigned int) lena, (unsigned int) lenb));
+
+ if (prefix)
+ {
+ if (cmp == 0 && lena > lenb)
+ cmp = 1; /* a is longer, so not a prefix of b */
+ }
+ else if (cmp == 0 && lena != lenb)
+ {
+ cmp = (lena < lenb) ? -1 : 1;
+ }
+ }
+
+ return cmp;
+}
+
+/*
+ * Check weight info or/and fill 'data' with the required positions
+ */
+static TSTernaryValue
+checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
+ ExecPhraseData *data)
+{
+ TSTernaryValue result = TS_NO;
+
+ Assert(data == NULL || data->npos == 0);
+
+ if (entry->haspos)
+ {
+ WordEntryPosVector *posvec;
+
+ /*
+ * We can't use the _POSVECPTR macro here because the pointer to the
+ * tsvector's lexeme storage is already contained in chkval->values.
+ */
+ posvec = (WordEntryPosVector *)
+ (chkval->values + SHORTALIGN(entry->pos + entry->len));
+
+ if (val->weight && data)
+ {
+ WordEntryPos *posvec_iter = posvec->pos;
+ WordEntryPos *dptr;
+
+ /*
+ * Filter position information by weights
+ */
+ dptr = data->pos = palloc(sizeof(WordEntryPos) * posvec->npos);
+ data->allocated = true;
+
+ /* Is there a position with a matching weight? */
+ while (posvec_iter < posvec->pos + posvec->npos)
+ {
+ /* If true, append this position to the data->pos */
+ if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
+ {
+ *dptr = WEP_GETPOS(*posvec_iter);
+ dptr++;
+ }
+
+ posvec_iter++;
+ }
+
+ data->npos = dptr - data->pos;
+
+ if (data->npos > 0)
+ result = TS_YES;
+ else
+ {
+ pfree(data->pos);
+ data->pos = NULL;
+ data->allocated = false;
+ }
+ }
+ else if (val->weight)
+ {
+ WordEntryPos *posvec_iter = posvec->pos;
+
+ /* Is there a position with a matching weight? */
+ while (posvec_iter < posvec->pos + posvec->npos)
+ {
+ if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
+ {
+ result = TS_YES;
+ break; /* no need to go further */
+ }
+
+ posvec_iter++;
+ }
+ }
+ else if (data)
+ {
+ data->npos = posvec->npos;
+ data->pos = posvec->pos;
+ data->allocated = false;
+ result = TS_YES;
+ }
+ else
+ {
+ /* simplest case: no weight check, positions not needed */
+ result = TS_YES;
+ }
+ }
+ else
+ {
+ /*
+ * Position info is lacking, so if the caller requires it, we can only
+ * say that maybe there is a match.
+ *
+ * Notice, however, that we *don't* check val->weight here.
+ * Historically, stripped tsvectors are considered to match queries
+ * whether or not the query has a weight restriction; that's a little
+ * dubious but we'll preserve the behavior.
+ */
+ if (data)
+ result = TS_MAYBE;
+ else
+ result = TS_YES;
+ }
+
+ return result;
+}
+
+/*
+ * TS_execute callback for matching a tsquery operand to plain tsvector data
+ */
+static TSTernaryValue
+checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
+{
+ CHKVAL *chkval = (CHKVAL *) checkval;
+ WordEntry *StopLow = chkval->arrb;
+ WordEntry *StopHigh = chkval->arre;
+ WordEntry *StopMiddle = StopHigh;
+ TSTernaryValue res = TS_NO;
+
+ /* Loop invariant: StopLow <= val < StopHigh */
+ while (StopLow < StopHigh)
+ {
+ int difference;
+
+ StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+ difference = tsCompareString(chkval->operand + val->distance,
+ val->length,
+ chkval->values + StopMiddle->pos,
+ StopMiddle->len,
+ false);
+
+ if (difference == 0)
+ {
+ /* Check weight info & fill 'data' with positions */
+ res = checkclass_str(chkval, StopMiddle, val, data);
+ break;
+ }
+ else if (difference > 0)
+ StopLow = StopMiddle + 1;
+ else
+ StopHigh = StopMiddle;
+ }
+
+ /*
+ * If it's a prefix search, we should also consider lexemes that the
+ * search term is a prefix of (which will necessarily immediately follow
+ * the place we found in the above loop). But we can skip them if there
+ * was a definite match on the exact term AND the caller doesn't need
+ * position info.
+ */
+ if (val->prefix && (res != TS_YES || data))
+ {
+ WordEntryPos *allpos = NULL;
+ int npos = 0,
+ totalpos = 0;
+
+ /* adjust start position for corner case */
+ if (StopLow >= StopHigh)
+ StopMiddle = StopHigh;
+
+ /* we don't try to re-use any data from the initial match */
+ if (data)
+ {
+ if (data->allocated)
+ pfree(data->pos);
+ data->pos = NULL;
+ data->allocated = false;
+ data->npos = 0;
+ }
+ res = TS_NO;
+
+ while ((res != TS_YES || data) &&
+ StopMiddle < chkval->arre &&
+ tsCompareString(chkval->operand + val->distance,
+ val->length,
+ chkval->values + StopMiddle->pos,
+ StopMiddle->len,
+ true) == 0)
+ {
+ TSTernaryValue subres;
+
+ subres = checkclass_str(chkval, StopMiddle, val, data);
+
+ if (subres != TS_NO)
+ {
+ if (data)
+ {
+ /*
+ * We need to join position information
+ */
+ if (subres == TS_MAYBE)
+ {
+ /*
+ * No position info for this match, so we must report
+ * MAYBE overall.
+ */
+ res = TS_MAYBE;
+ /* forget any previous positions */
+ npos = 0;
+ /* don't leak storage */
+ if (allpos)
+ pfree(allpos);
+ break;
+ }
+
+ while (npos + data->npos > totalpos)
+ {
+ if (totalpos == 0)
+ {
+ totalpos = 256;
+ allpos = palloc(sizeof(WordEntryPos) * totalpos);
+ }
+ else
+ {
+ totalpos *= 2;
+ allpos = repalloc(allpos, sizeof(WordEntryPos) * totalpos);
+ }
+ }
+
+ memcpy(allpos + npos, data->pos, sizeof(WordEntryPos) * data->npos);
+ npos += data->npos;
+
+ /* don't leak storage from individual matches */
+ if (data->allocated)
+ pfree(data->pos);
+ data->pos = NULL;
+ data->allocated = false;
+ /* it's important to reset data->npos before next loop */
+ data->npos = 0;
+ }
+ else
+ {
+ /* Don't need positions, just handle YES/MAYBE */
+ if (subres == TS_YES || res == TS_NO)
+ res = subres;
+ }
+ }
+
+ StopMiddle++;
+ }
+
+ if (data && npos > 0)
+ {
+ /* Sort and make unique array of found positions */
+ data->pos = allpos;
+ qsort(data->pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
+ data->npos = qunique(data->pos, npos, sizeof(WordEntryPos),
+ compareWordEntryPos);
+ data->allocated = true;
+ res = TS_YES;
+ }
+ }
+
+ return res;
+}
+
+/*
+ * Compute output position list for a tsquery operator in phrase mode.
+ *
+ * Merge the position lists in Ldata and Rdata as specified by "emit",
+ * returning the result list into *data. The input position lists must be
+ * sorted and unique, and the output will be as well.
+ *
+ * data: pointer to initially-all-zeroes output struct, or NULL
+ * Ldata, Rdata: input position lists
+ * emit: bitmask of TSPO_XXX flags
+ * Loffset: offset to be added to Ldata positions before comparing/outputting
+ * Roffset: offset to be added to Rdata positions before comparing/outputting
+ * max_npos: maximum possible required size of output position array
+ *
+ * Loffset and Roffset should not be negative, else we risk trying to output
+ * negative positions, which won't fit into WordEntryPos.
+ *
+ * The result is boolean (TS_YES or TS_NO), but for the caller's convenience
+ * we return it as TSTernaryValue.
+ *
+ * Returns TS_YES if any positions were emitted to *data; or if data is NULL,
+ * returns TS_YES if any positions would have been emitted.
+ */
+#define TSPO_L_ONLY 0x01 /* emit positions appearing only in L */
+#define TSPO_R_ONLY 0x02 /* emit positions appearing only in R */
+#define TSPO_BOTH 0x04 /* emit positions appearing in both L&R */
+
+static TSTernaryValue
+TS_phrase_output(ExecPhraseData *data,
+ ExecPhraseData *Ldata,
+ ExecPhraseData *Rdata,
+ int emit,
+ int Loffset,
+ int Roffset,
+ int max_npos)
+{
+ int Lindex,
+ Rindex;
+
+ /* Loop until both inputs are exhausted */
+ Lindex = Rindex = 0;
+ while (Lindex < Ldata->npos || Rindex < Rdata->npos)
+ {
+ int Lpos,
+ Rpos;
+ int output_pos = 0;
+
+ /*
+ * Fetch current values to compare. WEP_GETPOS() is needed because
+ * ExecPhraseData->data can point to a tsvector's WordEntryPosVector.
+ */
+ if (Lindex < Ldata->npos)
+ Lpos = WEP_GETPOS(Ldata->pos[Lindex]) + Loffset;
+ else
+ {
+ /* L array exhausted, so we're done if R_ONLY isn't set */
+ if (!(emit & TSPO_R_ONLY))
+ break;
+ Lpos = INT_MAX;
+ }
+ if (Rindex < Rdata->npos)
+ Rpos = WEP_GETPOS(Rdata->pos[Rindex]) + Roffset;
+ else
+ {
+ /* R array exhausted, so we're done if L_ONLY isn't set */
+ if (!(emit & TSPO_L_ONLY))
+ break;
+ Rpos = INT_MAX;
+ }
+
+ /* Merge-join the two input lists */
+ if (Lpos < Rpos)
+ {
+ /* Lpos is not matched in Rdata, should we output it? */
+ if (emit & TSPO_L_ONLY)
+ output_pos = Lpos;
+ Lindex++;
+ }
+ else if (Lpos == Rpos)
+ {
+ /* Lpos and Rpos match ... should we output it? */
+ if (emit & TSPO_BOTH)
+ output_pos = Rpos;
+ Lindex++;
+ Rindex++;
+ }
+ else /* Lpos > Rpos */
+ {
+ /* Rpos is not matched in Ldata, should we output it? */
+ if (emit & TSPO_R_ONLY)
+ output_pos = Rpos;
+ Rindex++;
+ }
+
+ if (output_pos > 0)
+ {
+ if (data)
+ {
+ /* Store position, first allocating output array if needed */
+ if (data->pos == NULL)
+ {
+ data->pos = (WordEntryPos *)
+ palloc(max_npos * sizeof(WordEntryPos));
+ data->allocated = true;
+ }
+ data->pos[data->npos++] = output_pos;
+ }
+ else
+ {
+ /*
+ * Exact positions not needed, so return TS_YES as soon as we
+ * know there is at least one.
+ */
+ return TS_YES;
+ }
+ }
+ }
+
+ if (data && data->npos > 0)
+ {
+ /* Let's assert we didn't overrun the array */
+ Assert(data->npos <= max_npos);
+ return TS_YES;
+ }
+ return TS_NO;
+}
+
+/*
+ * Execute tsquery at or below an OP_PHRASE operator.
+ *
+ * This handles tsquery execution at recursion levels where we need to care
+ * about match locations.
+ *
+ * In addition to the same arguments used for TS_execute, the caller may pass
+ * a preinitialized-to-zeroes ExecPhraseData struct, to be filled with lexeme
+ * match position info on success. data == NULL if no position data need be
+ * returned.
+ * Note: the function assumes data != NULL for operators other than OP_PHRASE.
+ * This is OK because an outside call always starts from an OP_PHRASE node,
+ * and all internal recursion cases pass data != NULL.
+ *
+ * The detailed semantics of the match data, given that the function returned
+ * TS_YES (successful match), are:
+ *
+ * npos > 0, negate = false:
+ * query is matched at specified position(s) (and only those positions)
+ * npos > 0, negate = true:
+ * query is matched at all positions *except* specified position(s)
+ * npos = 0, negate = true:
+ * query is matched at all positions
+ * npos = 0, negate = false:
+ * disallowed (this should result in TS_NO or TS_MAYBE, as appropriate)
+ *
+ * Successful matches also return a "width" value which is the match width in
+ * lexemes, less one. Hence, "width" is zero for simple one-lexeme matches,
+ * and is the sum of the phrase operator distances for phrase matches. Note
+ * that when width > 0, the listed positions represent the ends of matches not
+ * the starts. (This unintuitive rule is needed to avoid possibly generating
+ * negative positions, which wouldn't fit into the WordEntryPos arrays.)
+ *
+ * If the TSExecuteCallback function reports that an operand is present
+ * but fails to provide position(s) for it, we will return TS_MAYBE when
+ * it is possible but not certain that the query is matched.
+ *
+ * When the function returns TS_NO or TS_MAYBE, it must return npos = 0,
+ * negate = false (which is the state initialized by the caller); but the
+ * "width" output in such cases is undefined.
+ */
+static TSTernaryValue
+TS_phrase_execute(QueryItem *curitem, void *arg, uint32 flags,
+ TSExecuteCallback chkcond,
+ ExecPhraseData *data)
+{
+ ExecPhraseData Ldata,
+ Rdata;
+ TSTernaryValue lmatch,
+ rmatch;
+ int Loffset,
+ Roffset,
+ maxwidth;
+
+ /* since this function recurses, it could be driven to stack overflow */
+ check_stack_depth();
+
+ /* ... and let's check for query cancel while we're at it */
+ CHECK_FOR_INTERRUPTS();
+
+ if (curitem->type == QI_VAL)
+ return chkcond(arg, (QueryOperand *) curitem, data);
+
+ switch (curitem->qoperator.oper)
+ {
+ case OP_NOT:
+
+ /*
+ * We need not touch data->width, since a NOT operation does not
+ * change the match width.
+ */
+ if (flags & TS_EXEC_SKIP_NOT)
+ {
+ /* with SKIP_NOT, report NOT as "match everywhere" */
+ Assert(data->npos == 0 && !data->negate);
+ data->negate = true;
+ return TS_YES;
+ }
+ switch (TS_phrase_execute(curitem + 1, arg, flags, chkcond, data))
+ {
+ case TS_NO:
+ /* change "match nowhere" to "match everywhere" */
+ Assert(data->npos == 0 && !data->negate);
+ data->negate = true;
+ return TS_YES;
+ case TS_YES:
+ if (data->npos > 0)
+ {
+ /* we have some positions, invert negate flag */
+ data->negate = !data->negate;
+ return TS_YES;
+ }
+ else if (data->negate)
+ {
+ /* change "match everywhere" to "match nowhere" */
+ data->negate = false;
+ return TS_NO;
+ }
+ /* Should not get here if result was TS_YES */
+ Assert(false);
+ break;
+ case TS_MAYBE:
+ /* match positions are, and remain, uncertain */
+ return TS_MAYBE;
+ }
+ break;
+
+ case OP_PHRASE:
+ case OP_AND:
+ memset(&Ldata, 0, sizeof(Ldata));
+ memset(&Rdata, 0, sizeof(Rdata));
+
+ lmatch = TS_phrase_execute(curitem + curitem->qoperator.left,
+ arg, flags, chkcond, &Ldata);
+ if (lmatch == TS_NO)
+ return TS_NO;
+
+ rmatch = TS_phrase_execute(curitem + 1,
+ arg, flags, chkcond, &Rdata);
+ if (rmatch == TS_NO)
+ return TS_NO;
+
+ /*
+ * If either operand has no position information, then we can't
+ * return reliable position data, only a MAYBE result.
+ */
+ if (lmatch == TS_MAYBE || rmatch == TS_MAYBE)
+ return TS_MAYBE;
+
+ if (curitem->qoperator.oper == OP_PHRASE)
+ {
+ /*
+ * Compute Loffset and Roffset suitable for phrase match, and
+ * compute overall width of whole phrase match.
+ */
+ Loffset = curitem->qoperator.distance + Rdata.width;
+ Roffset = 0;
+ if (data)
+ data->width = curitem->qoperator.distance +
+ Ldata.width + Rdata.width;
+ }
+ else
+ {
+ /*
+ * For OP_AND, set output width and alignment like OP_OR (see
+ * comment below)
+ */
+ maxwidth = Max(Ldata.width, Rdata.width);
+ Loffset = maxwidth - Ldata.width;
+ Roffset = maxwidth - Rdata.width;
+ if (data)
+ data->width = maxwidth;
+ }
+
+ if (Ldata.negate && Rdata.negate)
+ {
+ /* !L & !R: treat as !(L | R) */
+ (void) TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
+ Loffset, Roffset,
+ Ldata.npos + Rdata.npos);
+ if (data)
+ data->negate = true;
+ return TS_YES;
+ }
+ else if (Ldata.negate)
+ {
+ /* !L & R */
+ return TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_R_ONLY,
+ Loffset, Roffset,
+ Rdata.npos);
+ }
+ else if (Rdata.negate)
+ {
+ /* L & !R */
+ return TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_L_ONLY,
+ Loffset, Roffset,
+ Ldata.npos);
+ }
+ else
+ {
+ /* straight AND */
+ return TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_BOTH,
+ Loffset, Roffset,
+ Min(Ldata.npos, Rdata.npos));
+ }
+
+ case OP_OR:
+ memset(&Ldata, 0, sizeof(Ldata));
+ memset(&Rdata, 0, sizeof(Rdata));
+
+ lmatch = TS_phrase_execute(curitem + curitem->qoperator.left,
+ arg, flags, chkcond, &Ldata);
+ rmatch = TS_phrase_execute(curitem + 1,
+ arg, flags, chkcond, &Rdata);
+
+ if (lmatch == TS_NO && rmatch == TS_NO)
+ return TS_NO;
+
+ /*
+ * If either operand has no position information, then we can't
+ * return reliable position data, only a MAYBE result.
+ */
+ if (lmatch == TS_MAYBE || rmatch == TS_MAYBE)
+ return TS_MAYBE;
+
+ /*
+ * Cope with undefined output width from failed submatch. (This
+ * takes less code than trying to ensure that all failure returns
+ * set data->width to zero.)
+ */
+ if (lmatch == TS_NO)
+ Ldata.width = 0;
+ if (rmatch == TS_NO)
+ Rdata.width = 0;
+
+ /*
+ * For OP_AND and OP_OR, report the width of the wider of the two
+ * inputs, and align the narrower input's positions to the right
+ * end of that width. This rule deals at least somewhat
+ * reasonably with cases like "x <-> (y | z <-> q)".
+ */
+ maxwidth = Max(Ldata.width, Rdata.width);
+ Loffset = maxwidth - Ldata.width;
+ Roffset = maxwidth - Rdata.width;
+ data->width = maxwidth;
+
+ if (Ldata.negate && Rdata.negate)
+ {
+ /* !L | !R: treat as !(L & R) */
+ (void) TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_BOTH,
+ Loffset, Roffset,
+ Min(Ldata.npos, Rdata.npos));
+ data->negate = true;
+ return TS_YES;
+ }
+ else if (Ldata.negate)
+ {
+ /* !L | R: treat as !(L & !R) */
+ (void) TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_L_ONLY,
+ Loffset, Roffset,
+ Ldata.npos);
+ data->negate = true;
+ return TS_YES;
+ }
+ else if (Rdata.negate)
+ {
+ /* L | !R: treat as !(!L & R) */
+ (void) TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_R_ONLY,
+ Loffset, Roffset,
+ Rdata.npos);
+ data->negate = true;
+ return TS_YES;
+ }
+ else
+ {
+ /* straight OR */
+ return TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
+ Loffset, Roffset,
+ Ldata.npos + Rdata.npos);
+ }
+
+ default:
+ elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
+ }
+
+ /* not reachable, but keep compiler quiet */
+ return TS_NO;
+}
+
+
+/*
+ * Evaluate tsquery boolean expression.
+ *
+ * curitem: current tsquery item (initially, the first one)
+ * arg: opaque value to pass through to callback function
+ * flags: bitmask of flag bits shown in ts_utils.h
+ * chkcond: callback function to check whether a primitive value is present
+ */
+bool
+TS_execute(QueryItem *curitem, void *arg, uint32 flags,
+ TSExecuteCallback chkcond)
+{
+ /*
+ * If we get TS_MAYBE from the recursion, return true. We could only see
+ * that result if the caller passed TS_EXEC_PHRASE_NO_POS, so there's no
+ * need to check again.
+ */
+ return TS_execute_recurse(curitem, arg, flags, chkcond) != TS_NO;
+}
+
+/*
+ * Evaluate tsquery boolean expression.
+ *
+ * This is the same as TS_execute except that TS_MAYBE is returned as-is.
+ */
+TSTernaryValue
+TS_execute_ternary(QueryItem *curitem, void *arg, uint32 flags,
+ TSExecuteCallback chkcond)
+{
+ return TS_execute_recurse(curitem, arg, flags, chkcond);
+}
+
+/*
+ * TS_execute recursion for operators above any phrase operator. Here we do
+ * not need to worry about lexeme positions. As soon as we hit an OP_PHRASE
+ * operator, we pass it off to TS_phrase_execute which does worry.
+ */
+static TSTernaryValue
+TS_execute_recurse(QueryItem *curitem, void *arg, uint32 flags,
+ TSExecuteCallback chkcond)
+{
+ TSTernaryValue lmatch;
+
+ /* since this function recurses, it could be driven to stack overflow */
+ check_stack_depth();
+
+ /* ... and let's check for query cancel while we're at it */
+ CHECK_FOR_INTERRUPTS();
+
+ if (curitem->type == QI_VAL)
+ return chkcond(arg, (QueryOperand *) curitem,
+ NULL /* don't need position info */ );
+
+ switch (curitem->qoperator.oper)
+ {
+ case OP_NOT:
+ if (flags & TS_EXEC_SKIP_NOT)
+ return TS_YES;
+ switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
+ {
+ case TS_NO:
+ return TS_YES;
+ case TS_YES:
+ return TS_NO;
+ case TS_MAYBE:
+ return TS_MAYBE;
+ }
+ break;
+
+ case OP_AND:
+ lmatch = TS_execute_recurse(curitem + curitem->qoperator.left, arg,
+ flags, chkcond);
+ if (lmatch == TS_NO)
+ return TS_NO;
+ switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
+ {
+ case TS_NO:
+ return TS_NO;
+ case TS_YES:
+ return lmatch;
+ case TS_MAYBE:
+ return TS_MAYBE;
+ }
+ break;
+
+ case OP_OR:
+ lmatch = TS_execute_recurse(curitem + curitem->qoperator.left, arg,
+ flags, chkcond);
+ if (lmatch == TS_YES)
+ return TS_YES;
+ switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
+ {
+ case TS_NO:
+ return lmatch;
+ case TS_YES:
+ return TS_YES;
+ case TS_MAYBE:
+ return TS_MAYBE;
+ }
+ break;
+
+ case OP_PHRASE:
+
+ /*
+ * If we get a MAYBE result, and the caller doesn't want that,
+ * convert it to NO. It would be more consistent, perhaps, to
+ * return the result of TS_phrase_execute() verbatim and then
+ * convert MAYBE results at the top of the recursion. But
+ * converting at the topmost phrase operator gives results that
+ * are bug-compatible with the old implementation, so do it like
+ * this for now.
+ */
+ switch (TS_phrase_execute(curitem, arg, flags, chkcond, NULL))
+ {
+ case TS_NO:
+ return TS_NO;
+ case TS_YES:
+ return TS_YES;
+ case TS_MAYBE:
+ return (flags & TS_EXEC_PHRASE_NO_POS) ? TS_MAYBE : TS_NO;
+ }
+ break;
+
+ default:
+ elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
+ }
+
+ /* not reachable, but keep compiler quiet */
+ return TS_NO;
+}
+
+/*
+ * Evaluate tsquery and report locations of matching terms.
+ *
+ * This is like TS_execute except that it returns match locations not just
+ * success/failure status. The callback function is required to provide
+ * position data (we report failure if it doesn't).
+ *
+ * On successful match, the result is a List of ExecPhraseData structs, one
+ * for each AND'ed term or phrase operator in the query. Each struct includes
+ * a sorted array of lexeme positions matching that term. (Recall that for
+ * phrase operators, the match includes width+1 lexemes, and the recorded
+ * position is that of the rightmost lexeme.)
+ *
+ * OR subexpressions are handled by union'ing their match locations into a
+ * single List element, which is valid since any of those locations contains
+ * a match. However, when some of the OR'ed terms are phrase operators, we
+ * report the maximum width of any of the OR'ed terms, making such cases
+ * slightly imprecise in the conservative direction. (For example, if the
+ * tsquery is "(A <-> B) | C", an occurrence of C in the data would be
+ * reported as though it includes the lexeme to the left of C.)
+ *
+ * Locations of NOT subexpressions are not reported. (Obviously, there can
+ * be no successful NOT matches at top level, or the match would have failed.
+ * So this amounts to ignoring NOTs underneath ORs.)
+ *
+ * The result is NIL if no match, or if position data was not returned.
+ *
+ * Arguments are the same as for TS_execute, although flags is currently
+ * vestigial since none of the defined bits are sensible here.
+ */
+List *
+TS_execute_locations(QueryItem *curitem, void *arg,
+ uint32 flags,
+ TSExecuteCallback chkcond)
+{
+ List *result;
+
+ /* No flags supported, as yet */
+ Assert(flags == TS_EXEC_EMPTY);
+ if (TS_execute_locations_recurse(curitem, arg, chkcond, &result))
+ return result;
+ return NIL;
+}
+
+/*
+ * TS_execute_locations recursion for operators above any phrase operator.
+ * OP_PHRASE subexpressions can be passed off to TS_phrase_execute.
+ */
+static bool
+TS_execute_locations_recurse(QueryItem *curitem, void *arg,
+ TSExecuteCallback chkcond,
+ List **locations)
+{
+ bool lmatch,
+ rmatch;
+ List *llocations,
+ *rlocations;
+ ExecPhraseData *data;
+
+ /* since this function recurses, it could be driven to stack overflow */
+ check_stack_depth();
+
+ /* ... and let's check for query cancel while we're at it */
+ CHECK_FOR_INTERRUPTS();
+
+ /* Default locations result is empty */
+ *locations = NIL;
+
+ if (curitem->type == QI_VAL)
+ {
+ data = palloc0_object(ExecPhraseData);
+ if (chkcond(arg, (QueryOperand *) curitem, data) == TS_YES)
+ {
+ *locations = list_make1(data);
+ return true;
+ }
+ pfree(data);
+ return false;
+ }
+
+ switch (curitem->qoperator.oper)
+ {
+ case OP_NOT:
+ if (!TS_execute_locations_recurse(curitem + 1, arg, chkcond,
+ &llocations))
+ return true; /* we don't pass back any locations */
+ return false;
+
+ case OP_AND:
+ if (!TS_execute_locations_recurse(curitem + curitem->qoperator.left,
+ arg, chkcond,
+ &llocations))
+ return false;
+ if (!TS_execute_locations_recurse(curitem + 1,
+ arg, chkcond,
+ &rlocations))
+ return false;
+ *locations = list_concat(llocations, rlocations);
+ return true;
+
+ case OP_OR:
+ lmatch = TS_execute_locations_recurse(curitem + curitem->qoperator.left,
+ arg, chkcond,
+ &llocations);
+ rmatch = TS_execute_locations_recurse(curitem + 1,
+ arg, chkcond,
+ &rlocations);
+ if (lmatch || rmatch)
+ {
+ /*
+ * We generate an AND'able location struct from each
+ * combination of sub-matches, following the disjunctive law
+ * (A & B) | (C & D) = (A | C) & (A | D) & (B | C) & (B | D).
+ *
+ * However, if either input didn't produce locations (i.e., it
+ * failed or was a NOT), we must just return the other list.
+ */
+ if (llocations == NIL)
+ *locations = rlocations;
+ else if (rlocations == NIL)
+ *locations = llocations;
+ else
+ {
+ ListCell *ll;
+
+ foreach(ll, llocations)
+ {
+ ExecPhraseData *ldata = (ExecPhraseData *) lfirst(ll);
+ ListCell *lr;
+
+ foreach(lr, rlocations)
+ {
+ ExecPhraseData *rdata = (ExecPhraseData *) lfirst(lr);
+
+ data = palloc0_object(ExecPhraseData);
+ (void) TS_phrase_output(data, ldata, rdata,
+ TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
+ 0, 0,
+ ldata->npos + rdata->npos);
+ /* Report the larger width, as explained above. */
+ data->width = Max(ldata->width, rdata->width);
+ *locations = lappend(*locations, data);
+ }
+ }
+ }
+
+ return true;
+ }
+ return false;
+
+ case OP_PHRASE:
+ /* We can hand this off to TS_phrase_execute */
+ data = palloc0_object(ExecPhraseData);
+ if (TS_phrase_execute(curitem, arg, TS_EXEC_EMPTY, chkcond,
+ data) == TS_YES)
+ {
+ if (!data->negate)
+ *locations = list_make1(data);
+ return true;
+ }
+ pfree(data);
+ return false;
+
+ default:
+ elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
+ }
+
+ /* not reachable, but keep compiler quiet */
+ return false;
+}
+
+/*
+ * Detect whether a tsquery boolean expression requires any positive matches
+ * to values shown in the tsquery.
+ *
+ * This is needed to know whether a GIN index search requires full index scan.
+ * For example, 'x & !y' requires a match of x, so it's sufficient to scan
+ * entries for x; but 'x | !y' could match rows containing neither x nor y.
+ */
+bool
+tsquery_requires_match(QueryItem *curitem)
+{
+ /* since this function recurses, it could be driven to stack overflow */
+ check_stack_depth();
+
+ if (curitem->type == QI_VAL)
+ return true;
+
+ switch (curitem->qoperator.oper)
+ {
+ case OP_NOT:
+
+ /*
+ * Assume there are no required matches underneath a NOT. For
+ * some cases with nested NOTs, we could prove there's a required
+ * match, but it seems unlikely to be worth the trouble.
+ */
+ return false;
+
+ case OP_PHRASE:
+
+ /*
+ * Treat OP_PHRASE as OP_AND here
+ */
+ case OP_AND:
+ /* If either side requires a match, we're good */
+ if (tsquery_requires_match(curitem + curitem->qoperator.left))
+ return true;
+ else
+ return tsquery_requires_match(curitem + 1);
+
+ case OP_OR:
+ /* Both sides must require a match */
+ if (tsquery_requires_match(curitem + curitem->qoperator.left))
+ return tsquery_requires_match(curitem + 1);
+ else
+ return false;
+
+ default:
+ elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
+ }
+
+ /* not reachable, but keep compiler quiet */
+ return false;
+}
+
+/*
+ * boolean operations
+ */
+Datum
+ts_match_qv(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_DATUM(DirectFunctionCall2(ts_match_vq,
+ PG_GETARG_DATUM(1),
+ PG_GETARG_DATUM(0)));
+}
+
+Datum
+ts_match_vq(PG_FUNCTION_ARGS)
+{
+ TSVector val = PG_GETARG_TSVECTOR(0);
+ TSQuery query = PG_GETARG_TSQUERY(1);
+ CHKVAL chkval;
+ bool result;
+
+ /* empty query matches nothing */
+ if (!query->size)
+ {
+ PG_FREE_IF_COPY(val, 0);
+ PG_FREE_IF_COPY(query, 1);
+ PG_RETURN_BOOL(false);
+ }
+
+ chkval.arrb = ARRPTR(val);
+ chkval.arre = chkval.arrb + val->size;
+ chkval.values = STRPTR(val);
+ chkval.operand = GETOPERAND(query);
+ result = TS_execute(GETQUERY(query),
+ &chkval,
+ TS_EXEC_EMPTY,
+ checkcondition_str);
+
+ PG_FREE_IF_COPY(val, 0);
+ PG_FREE_IF_COPY(query, 1);
+ PG_RETURN_BOOL(result);
+}
+
+Datum
+ts_match_tt(PG_FUNCTION_ARGS)
+{
+ TSVector vector;
+ TSQuery query;
+ bool res;
+
+ vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector,
+ PG_GETARG_DATUM(0)));
+ query = DatumGetTSQuery(DirectFunctionCall1(plainto_tsquery,
+ PG_GETARG_DATUM(1)));
+
+ res = DatumGetBool(DirectFunctionCall2(ts_match_vq,
+ TSVectorGetDatum(vector),
+ TSQueryGetDatum(query)));
+
+ pfree(vector);
+ pfree(query);
+
+ PG_RETURN_BOOL(res);
+}
+
+Datum
+ts_match_tq(PG_FUNCTION_ARGS)
+{
+ TSVector vector;
+ TSQuery query = PG_GETARG_TSQUERY(1);
+ bool res;
+
+ vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector,
+ PG_GETARG_DATUM(0)));
+
+ res = DatumGetBool(DirectFunctionCall2(ts_match_vq,
+ TSVectorGetDatum(vector),
+ TSQueryGetDatum(query)));
+
+ pfree(vector);
+ PG_FREE_IF_COPY(query, 1);
+
+ PG_RETURN_BOOL(res);
+}
+
+/*
+ * ts_stat statistic function support
+ */
+
+
+/*
+ * Returns the number of positions in value 'wptr' within tsvector 'txt',
+ * that have a weight equal to one of the weights in 'weight' bitmask.
+ */
+static int
+check_weight(TSVector txt, WordEntry *wptr, int8 weight)
+{
+ int len = POSDATALEN(txt, wptr);
+ int num = 0;
+ WordEntryPos *ptr = POSDATAPTR(txt, wptr);
+
+ while (len--)
+ {
+ if (weight & (1 << WEP_GETWEIGHT(*ptr)))
+ num++;
+ ptr++;
+ }
+ return num;
+}
+
+#define compareStatWord(a,e,t) \
+ tsCompareString((a)->lexeme, (a)->lenlexeme, \
+ STRPTR(t) + (e)->pos, (e)->len, \
+ false)
+
+static void
+insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt, uint32 off)
+{
+ WordEntry *we = ARRPTR(txt) + off;
+ StatEntry *node = stat->root,
+ *pnode = NULL;
+ int n,
+ res = 0;
+ uint32 depth = 1;
+
+ if (stat->weight == 0)
+ n = (we->haspos) ? POSDATALEN(txt, we) : 1;
+ else
+ n = (we->haspos) ? check_weight(txt, we, stat->weight) : 0;
+
+ if (n == 0)
+ return; /* nothing to insert */
+
+ while (node)
+ {
+ res = compareStatWord(node, we, txt);
+
+ if (res == 0)
+ {
+ break;
+ }
+ else
+ {
+ pnode = node;
+ node = (res < 0) ? node->left : node->right;
+ }
+ depth++;
+ }
+
+ if (depth > stat->maxdepth)
+ stat->maxdepth = depth;
+
+ if (node == NULL)
+ {
+ node = MemoryContextAlloc(persistentContext, STATENTRYHDRSZ + we->len);
+ node->left = node->right = NULL;
+ node->ndoc = 1;
+ node->nentry = n;
+ node->lenlexeme = we->len;
+ memcpy(node->lexeme, STRPTR(txt) + we->pos, node->lenlexeme);
+
+ if (pnode == NULL)
+ {
+ stat->root = node;
+ }
+ else
+ {
+ if (res < 0)
+ pnode->left = node;
+ else
+ pnode->right = node;
+ }
+ }
+ else
+ {
+ node->ndoc++;
+ node->nentry += n;
+ }
+}
+
+static void
+chooseNextStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt,
+ uint32 low, uint32 high, uint32 offset)
+{
+ uint32 pos;
+ uint32 middle = (low + high) >> 1;
+
+ pos = (low + middle) >> 1;
+ if (low != middle && pos >= offset && pos - offset < txt->size)
+ insertStatEntry(persistentContext, stat, txt, pos - offset);
+ pos = (high + middle + 1) >> 1;
+ if (middle + 1 != high && pos >= offset && pos - offset < txt->size)
+ insertStatEntry(persistentContext, stat, txt, pos - offset);
+
+ if (low != middle)
+ chooseNextStatEntry(persistentContext, stat, txt, low, middle, offset);
+ if (high != middle + 1)
+ chooseNextStatEntry(persistentContext, stat, txt, middle + 1, high, offset);
+}
+
+/*
+ * This is written like a custom aggregate function, because the
+ * original plan was to do just that. Unfortunately, an aggregate function
+ * can't return a set, so that plan was abandoned. If that limitation is
+ * lifted in the future, ts_stat could be a real aggregate function so that
+ * you could use it like this:
+ *
+ * SELECT ts_stat(vector_column) FROM vector_table;
+ *
+ * where vector_column is a tsvector-type column in vector_table.
+ */
+
+static TSVectorStat *
+ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data)
+{
+ TSVector txt = DatumGetTSVector(data);
+ uint32 i,
+ nbit = 0,
+ offset;
+
+ if (stat == NULL)
+ { /* Init in first */
+ stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat));
+ stat->maxdepth = 1;
+ }
+
+ /* simple check of correctness */
+ if (txt == NULL || txt->size == 0)
+ {
+ if (txt && txt != (TSVector) DatumGetPointer(data))
+ pfree(txt);
+ return stat;
+ }
+
+ i = txt->size - 1;
+ for (; i > 0; i >>= 1)
+ nbit++;
+
+ nbit = 1 << nbit;
+ offset = (nbit - txt->size) / 2;
+
+ insertStatEntry(persistentContext, stat, txt, (nbit >> 1) - offset);
+ chooseNextStatEntry(persistentContext, stat, txt, 0, nbit, offset);
+
+ return stat;
+}
+
+static void
+ts_setup_firstcall(FunctionCallInfo fcinfo, FuncCallContext *funcctx,
+ TSVectorStat *stat)
+{
+ TupleDesc tupdesc;
+ MemoryContext oldcontext;
+ StatEntry *node;
+
+ funcctx->user_fctx = (void *) stat;
+
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ stat->stack = palloc0(sizeof(StatEntry *) * (stat->maxdepth + 1));
+ stat->stackpos = 0;
+
+ node = stat->root;
+ /* find leftmost value */
+ if (node == NULL)
+ stat->stack[stat->stackpos] = NULL;
+ else
+ for (;;)
+ {
+ stat->stack[stat->stackpos] = node;
+ if (node->left)
+ {
+ stat->stackpos++;
+ node = node->left;
+ }
+ else
+ break;
+ }
+ Assert(stat->stackpos <= stat->maxdepth);
+
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+ funcctx->tuple_desc = tupdesc;
+ funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+static StatEntry *
+walkStatEntryTree(TSVectorStat *stat)
+{
+ StatEntry *node = stat->stack[stat->stackpos];
+
+ if (node == NULL)
+ return NULL;
+
+ if (node->ndoc != 0)
+ {
+ /* return entry itself: we already was at left sublink */
+ return node;
+ }
+ else if (node->right && node->right != stat->stack[stat->stackpos + 1])
+ {
+ /* go on right sublink */
+ stat->stackpos++;
+ node = node->right;
+
+ /* find most-left value */
+ for (;;)
+ {
+ stat->stack[stat->stackpos] = node;
+ if (node->left)
+ {
+ stat->stackpos++;
+ node = node->left;
+ }
+ else
+ break;
+ }
+ Assert(stat->stackpos <= stat->maxdepth);
+ }
+ else
+ {
+ /* we already return all left subtree, itself and right subtree */
+ if (stat->stackpos == 0)
+ return NULL;
+
+ stat->stackpos--;
+ return walkStatEntryTree(stat);
+ }
+
+ return node;
+}
+
+static Datum
+ts_process_call(FuncCallContext *funcctx)
+{
+ TSVectorStat *st;
+ StatEntry *entry;
+
+ st = (TSVectorStat *) funcctx->user_fctx;
+
+ entry = walkStatEntryTree(st);
+
+ if (entry != NULL)
+ {
+ Datum result;
+ char *values[3];
+ char ndoc[16];
+ char nentry[16];
+ HeapTuple tuple;
+
+ values[0] = palloc(entry->lenlexeme + 1);
+ memcpy(values[0], entry->lexeme, entry->lenlexeme);
+ (values[0])[entry->lenlexeme] = '\0';
+ sprintf(ndoc, "%d", entry->ndoc);
+ values[1] = ndoc;
+ sprintf(nentry, "%d", entry->nentry);
+ values[2] = nentry;
+
+ tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
+ result = HeapTupleGetDatum(tuple);
+
+ pfree(values[0]);
+
+ /* mark entry as already visited */
+ entry->ndoc = 0;
+
+ return result;
+ }
+
+ return (Datum) 0;
+}
+
+static TSVectorStat *
+ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws)
+{
+ char *query = text_to_cstring(txt);
+ TSVectorStat *stat;
+ bool isnull;
+ Portal portal;
+ SPIPlanPtr plan;
+
+ if ((plan = SPI_prepare(query, 0, NULL)) == NULL)
+ /* internal error */
+ elog(ERROR, "SPI_prepare(\"%s\") failed", query);
+
+ if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, true)) == NULL)
+ /* internal error */
+ elog(ERROR, "SPI_cursor_open(\"%s\") failed", query);
+
+ SPI_cursor_fetch(portal, true, 100);
+
+ if (SPI_tuptable == NULL ||
+ SPI_tuptable->tupdesc->natts != 1 ||
+ !IsBinaryCoercible(SPI_gettypeid(SPI_tuptable->tupdesc, 1),
+ TSVECTOROID))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("ts_stat query must return one tsvector column")));
+
+ stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat));
+ stat->maxdepth = 1;
+
+ if (ws)
+ {
+ char *buf;
+
+ buf = VARDATA_ANY(ws);
+ while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws))
+ {
+ if (pg_mblen(buf) == 1)
+ {
+ switch (*buf)
+ {
+ case 'A':
+ case 'a':
+ stat->weight |= 1 << 3;
+ break;
+ case 'B':
+ case 'b':
+ stat->weight |= 1 << 2;
+ break;
+ case 'C':
+ case 'c':
+ stat->weight |= 1 << 1;
+ break;
+ case 'D':
+ case 'd':
+ stat->weight |= 1;
+ break;
+ default:
+ stat->weight |= 0;
+ }
+ }
+ buf += pg_mblen(buf);
+ }
+ }
+
+ while (SPI_processed > 0)
+ {
+ uint64 i;
+
+ for (i = 0; i < SPI_processed; i++)
+ {
+ Datum data = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull);
+
+ if (!isnull)
+ stat = ts_accum(persistentContext, stat, data);
+ }
+
+ SPI_freetuptable(SPI_tuptable);
+ SPI_cursor_fetch(portal, true, 100);
+ }
+
+ SPI_freetuptable(SPI_tuptable);
+ SPI_cursor_close(portal);
+ SPI_freeplan(plan);
+ pfree(query);
+
+ return stat;
+}
+
+Datum
+ts_stat1(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ Datum result;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ TSVectorStat *stat;
+ text *txt = PG_GETARG_TEXT_PP(0);
+
+ funcctx = SRF_FIRSTCALL_INIT();
+ SPI_connect();
+ stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, NULL);
+ PG_FREE_IF_COPY(txt, 0);
+ ts_setup_firstcall(fcinfo, funcctx, stat);
+ SPI_finish();
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+ if ((result = ts_process_call(funcctx)) != (Datum) 0)
+ SRF_RETURN_NEXT(funcctx, result);
+ SRF_RETURN_DONE(funcctx);
+}
+
+Datum
+ts_stat2(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ Datum result;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ TSVectorStat *stat;
+ text *txt = PG_GETARG_TEXT_PP(0);
+ text *ws = PG_GETARG_TEXT_PP(1);
+
+ funcctx = SRF_FIRSTCALL_INIT();
+ SPI_connect();
+ stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, ws);
+ PG_FREE_IF_COPY(txt, 0);
+ PG_FREE_IF_COPY(ws, 1);
+ ts_setup_firstcall(fcinfo, funcctx, stat);
+ SPI_finish();
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+ if ((result = ts_process_call(funcctx)) != (Datum) 0)
+ SRF_RETURN_NEXT(funcctx, result);
+ SRF_RETURN_DONE(funcctx);
+}
+
+
+/*
+ * Triggers for automatic update of a tsvector column from text column(s)
+ *
+ * Trigger arguments are either
+ * name of tsvector col, name of tsconfig to use, name(s) of text col(s)
+ * name of tsvector col, name of regconfig col, name(s) of text col(s)
+ * ie, tsconfig can either be specified by name, or indirectly as the
+ * contents of a regconfig field in the row. If the name is used, it must
+ * be explicitly schema-qualified.
+ */
+Datum
+tsvector_update_trigger_byid(PG_FUNCTION_ARGS)
+{
+ return tsvector_update_trigger(fcinfo, false);
+}
+
+Datum
+tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS)
+{
+ return tsvector_update_trigger(fcinfo, true);
+}
+
+static Datum
+tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column)
+{
+ TriggerData *trigdata;
+ Trigger *trigger;
+ Relation rel;
+ HeapTuple rettuple = NULL;
+ int tsvector_attr_num,
+ i;
+ ParsedText prs;
+ Datum datum;
+ bool isnull;
+ text *txt;
+ Oid cfgId;
+ bool update_needed;
+
+ /* Check call context */
+ if (!CALLED_AS_TRIGGER(fcinfo)) /* internal error */
+ elog(ERROR, "tsvector_update_trigger: not fired by trigger manager");
+
+ trigdata = (TriggerData *) fcinfo->context;
+ if (!TRIGGER_FIRED_FOR_ROW(trigdata->tg_event))
+ elog(ERROR, "tsvector_update_trigger: must be fired for row");
+ if (!TRIGGER_FIRED_BEFORE(trigdata->tg_event))
+ elog(ERROR, "tsvector_update_trigger: must be fired BEFORE event");
+
+ if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
+ {
+ rettuple = trigdata->tg_trigtuple;
+ update_needed = true;
+ }
+ else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
+ {
+ rettuple = trigdata->tg_newtuple;
+ update_needed = false; /* computed below */
+ }
+ else
+ elog(ERROR, "tsvector_update_trigger: must be fired for INSERT or UPDATE");
+
+ trigger = trigdata->tg_trigger;
+ rel = trigdata->tg_relation;
+
+ if (trigger->tgnargs < 3)
+ elog(ERROR, "tsvector_update_trigger: arguments must be tsvector_field, ts_config, text_field1, ...)");
+
+ /* Find the target tsvector column */
+ tsvector_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[0]);
+ if (tsvector_attr_num == SPI_ERROR_NOATTRIBUTE)
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_COLUMN),
+ errmsg("tsvector column \"%s\" does not exist",
+ trigger->tgargs[0])));
+ /* This will effectively reject system columns, so no separate test: */
+ if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, tsvector_attr_num),
+ TSVECTOROID))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("column \"%s\" is not of tsvector type",
+ trigger->tgargs[0])));
+
+ /* Find the configuration to use */
+ if (config_column)
+ {
+ int config_attr_num;
+
+ config_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[1]);
+ if (config_attr_num == SPI_ERROR_NOATTRIBUTE)
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_COLUMN),
+ errmsg("configuration column \"%s\" does not exist",
+ trigger->tgargs[1])));
+ if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, config_attr_num),
+ REGCONFIGOID))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("column \"%s\" is not of regconfig type",
+ trigger->tgargs[1])));
+
+ datum = SPI_getbinval(rettuple, rel->rd_att, config_attr_num, &isnull);
+ if (isnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+ errmsg("configuration column \"%s\" must not be null",
+ trigger->tgargs[1])));
+ cfgId = DatumGetObjectId(datum);
+ }
+ else
+ {
+ List *names;
+
+ names = stringToQualifiedNameList(trigger->tgargs[1], NULL);
+ /* require a schema so that results are not search path dependent */
+ if (list_length(names) < 2)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("text search configuration name \"%s\" must be schema-qualified",
+ trigger->tgargs[1])));
+ cfgId = get_ts_config_oid(names, false);
+ }
+
+ /* initialize parse state */
+ prs.lenwords = 32;
+ prs.curwords = 0;
+ prs.pos = 0;
+ prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
+
+ /* find all words in indexable column(s) */
+ for (i = 2; i < trigger->tgnargs; i++)
+ {
+ int numattr;
+
+ numattr = SPI_fnumber(rel->rd_att, trigger->tgargs[i]);
+ if (numattr == SPI_ERROR_NOATTRIBUTE)
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_COLUMN),
+ errmsg("column \"%s\" does not exist",
+ trigger->tgargs[i])));
+ if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, numattr), TEXTOID))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("column \"%s\" is not of a character type",
+ trigger->tgargs[i])));
+
+ if (bms_is_member(numattr - FirstLowInvalidHeapAttributeNumber, trigdata->tg_updatedcols))
+ update_needed = true;
+
+ datum = SPI_getbinval(rettuple, rel->rd_att, numattr, &isnull);
+ if (isnull)
+ continue;
+
+ txt = DatumGetTextPP(datum);
+
+ parsetext(cfgId, &prs, VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt));
+
+ if (txt != (text *) DatumGetPointer(datum))
+ pfree(txt);
+ }
+
+ if (update_needed)
+ {
+ /* make tsvector value */
+ datum = TSVectorGetDatum(make_tsvector(&prs));
+ isnull = false;
+
+ /* and insert it into tuple */
+ rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att,
+ 1, &tsvector_attr_num,
+ &datum, &isnull);
+
+ pfree(DatumGetPointer(datum));
+ }
+
+ return PointerGetDatum(rettuple);
+}