summaryrefslogtreecommitdiffstats
path: root/src/backend/tsearch/to_tsany.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
commit46651ce6fe013220ed397add242004d764fc0153 (patch)
tree6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/tsearch/to_tsany.c
parentInitial commit. (diff)
downloadpostgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz
postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/tsearch/to_tsany.c')
-rw-r--r--src/backend/tsearch/to_tsany.c724
1 files changed, 724 insertions, 0 deletions
diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c
new file mode 100644
index 0000000..f4ddfc0
--- /dev/null
+++ b/src/backend/tsearch/to_tsany.c
@@ -0,0 +1,724 @@
+/*-------------------------------------------------------------------------
+ *
+ * to_tsany.c
+ * to_ts* function definitions
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/to_tsany.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "common/jsonapi.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_utils.h"
+#include "utils/builtins.h"
+#include "utils/jsonfuncs.h"
+
+
+/*
+ * Opaque data structure, which is passed by parse_tsquery() to pushval_morph().
+ */
+typedef struct MorphOpaque
+{
+ Oid cfg_id;
+
+ /*
+ * Single tsquery morph could be parsed into multiple words. When these
+ * words reside in adjacent positions, they are connected using this
+ * operator. Usually, that is OP_PHRASE, which requires word positions of
+ * a complex morph to exactly match the tsvector.
+ */
+ int qoperator;
+} MorphOpaque;
+
+typedef struct TSVectorBuildState
+{
+ ParsedText *prs;
+ Oid cfgId;
+} TSVectorBuildState;
+
+static void add_to_tsvector(void *_state, char *elem_value, int elem_len);
+
+
+Datum
+get_current_ts_config(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_OID(getTSCurrentConfig(true));
+}
+
+/*
+ * to_tsvector
+ */
+static int
+compareWORD(const void *a, const void *b)
+{
+ int res;
+
+ res = tsCompareString(((const ParsedWord *) a)->word, ((const ParsedWord *) a)->len,
+ ((const ParsedWord *) b)->word, ((const ParsedWord *) b)->len,
+ false);
+
+ if (res == 0)
+ {
+ if (((const ParsedWord *) a)->pos.pos == ((const ParsedWord *) b)->pos.pos)
+ return 0;
+
+ res = (((const ParsedWord *) a)->pos.pos > ((const ParsedWord *) b)->pos.pos) ? 1 : -1;
+ }
+
+ return res;
+}
+
+static int
+uniqueWORD(ParsedWord *a, int32 l)
+{
+ ParsedWord *ptr,
+ *res;
+ int tmppos;
+
+ if (l == 1)
+ {
+ tmppos = LIMITPOS(a->pos.pos);
+ a->alen = 2;
+ a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
+ a->pos.apos[0] = 1;
+ a->pos.apos[1] = tmppos;
+ return l;
+ }
+
+ res = a;
+ ptr = a + 1;
+
+ /*
+ * Sort words with its positions
+ */
+ qsort((void *) a, l, sizeof(ParsedWord), compareWORD);
+
+ /*
+ * Initialize first word and its first position
+ */
+ tmppos = LIMITPOS(a->pos.pos);
+ a->alen = 2;
+ a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
+ a->pos.apos[0] = 1;
+ a->pos.apos[1] = tmppos;
+
+ /*
+ * Summarize position information for each word
+ */
+ while (ptr - a < l)
+ {
+ if (!(ptr->len == res->len &&
+ strncmp(ptr->word, res->word, res->len) == 0))
+ {
+ /*
+ * Got a new word, so put it in result
+ */
+ res++;
+ res->len = ptr->len;
+ res->word = ptr->word;
+ tmppos = LIMITPOS(ptr->pos.pos);
+ res->alen = 2;
+ res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen);
+ res->pos.apos[0] = 1;
+ res->pos.apos[1] = tmppos;
+ }
+ else
+ {
+ /*
+ * The word already exists, so adjust position information. But
+ * before we should check size of position's array, max allowed
+ * value for position and uniqueness of position
+ */
+ pfree(ptr->word);
+ if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 &&
+ res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
+ {
+ if (res->pos.apos[0] + 1 >= res->alen)
+ {
+ res->alen *= 2;
+ res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen);
+ }
+ if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
+ {
+ res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos);
+ res->pos.apos[0]++;
+ }
+ }
+ }
+ ptr++;
+ }
+
+ return res + 1 - a;
+}
+
+/*
+ * make value of tsvector, given parsed text
+ *
+ * Note: frees prs->words and subsidiary data.
+ */
+TSVector
+make_tsvector(ParsedText *prs)
+{
+ int i,
+ j,
+ lenstr = 0,
+ totallen;
+ TSVector in;
+ WordEntry *ptr;
+ char *str;
+ int stroff;
+
+ /* Merge duplicate words */
+ if (prs->curwords > 0)
+ prs->curwords = uniqueWORD(prs->words, prs->curwords);
+
+ /* Determine space needed */
+ for (i = 0; i < prs->curwords; i++)
+ {
+ lenstr += prs->words[i].len;
+ if (prs->words[i].alen)
+ {
+ lenstr = SHORTALIGN(lenstr);
+ lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
+ }
+ }
+
+ if (lenstr > MAXSTRPOS)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS)));
+
+ totallen = CALCDATASIZE(prs->curwords, lenstr);
+ in = (TSVector) palloc0(totallen);
+ SET_VARSIZE(in, totallen);
+ in->size = prs->curwords;
+
+ ptr = ARRPTR(in);
+ str = STRPTR(in);
+ stroff = 0;
+ for (i = 0; i < prs->curwords; i++)
+ {
+ ptr->len = prs->words[i].len;
+ ptr->pos = stroff;
+ memcpy(str + stroff, prs->words[i].word, prs->words[i].len);
+ stroff += prs->words[i].len;
+ pfree(prs->words[i].word);
+ if (prs->words[i].alen)
+ {
+ int k = prs->words[i].pos.apos[0];
+ WordEntryPos *wptr;
+
+ if (k > 0xFFFF)
+ elog(ERROR, "positions array too long");
+
+ ptr->haspos = 1;
+ stroff = SHORTALIGN(stroff);
+ *(uint16 *) (str + stroff) = (uint16) k;
+ wptr = POSDATAPTR(in, ptr);
+ for (j = 0; j < k; j++)
+ {
+ WEP_SETWEIGHT(wptr[j], 0);
+ WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
+ }
+ stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
+ pfree(prs->words[i].pos.apos);
+ }
+ else
+ ptr->haspos = 0;
+ ptr++;
+ }
+
+ if (prs->words)
+ pfree(prs->words);
+
+ return in;
+}
+
+Datum
+to_tsvector_byid(PG_FUNCTION_ARGS)
+{
+ Oid cfgId = PG_GETARG_OID(0);
+ text *in = PG_GETARG_TEXT_PP(1);
+ ParsedText prs;
+ TSVector out;
+
+ prs.lenwords = VARSIZE_ANY_EXHDR(in) / 6; /* just estimation of word's
+ * number */
+ if (prs.lenwords < 2)
+ prs.lenwords = 2;
+ prs.curwords = 0;
+ prs.pos = 0;
+ prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
+
+ parsetext(cfgId, &prs, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
+
+ PG_FREE_IF_COPY(in, 1);
+
+ out = make_tsvector(&prs);
+
+ PG_RETURN_TSVECTOR(out);
+}
+
+Datum
+to_tsvector(PG_FUNCTION_ARGS)
+{
+ text *in = PG_GETARG_TEXT_PP(0);
+ Oid cfgId;
+
+ cfgId = getTSCurrentConfig(true);
+ PG_RETURN_DATUM(DirectFunctionCall2(to_tsvector_byid,
+ ObjectIdGetDatum(cfgId),
+ PointerGetDatum(in)));
+}
+
+/*
+ * Worker function for jsonb(_string)_to_tsvector(_byid)
+ */
+static TSVector
+jsonb_to_tsvector_worker(Oid cfgId, Jsonb *jb, uint32 flags)
+{
+ TSVectorBuildState state;
+ ParsedText prs;
+
+ prs.words = NULL;
+ prs.curwords = 0;
+ state.prs = &prs;
+ state.cfgId = cfgId;
+
+ iterate_jsonb_values(jb, flags, &state, add_to_tsvector);
+
+ return make_tsvector(&prs);
+}
+
+Datum
+jsonb_string_to_tsvector_byid(PG_FUNCTION_ARGS)
+{
+ Oid cfgId = PG_GETARG_OID(0);
+ Jsonb *jb = PG_GETARG_JSONB_P(1);
+ TSVector result;
+
+ result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
+ PG_FREE_IF_COPY(jb, 1);
+
+ PG_RETURN_TSVECTOR(result);
+}
+
+Datum
+jsonb_string_to_tsvector(PG_FUNCTION_ARGS)
+{
+ Jsonb *jb = PG_GETARG_JSONB_P(0);
+ Oid cfgId;
+ TSVector result;
+
+ cfgId = getTSCurrentConfig(true);
+ result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
+ PG_FREE_IF_COPY(jb, 0);
+
+ PG_RETURN_TSVECTOR(result);
+}
+
+Datum
+jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
+{
+ Oid cfgId = PG_GETARG_OID(0);
+ Jsonb *jb = PG_GETARG_JSONB_P(1);
+ Jsonb *jbFlags = PG_GETARG_JSONB_P(2);
+ TSVector result;
+ uint32 flags = parse_jsonb_index_flags(jbFlags);
+
+ result = jsonb_to_tsvector_worker(cfgId, jb, flags);
+ PG_FREE_IF_COPY(jb, 1);
+ PG_FREE_IF_COPY(jbFlags, 2);
+
+ PG_RETURN_TSVECTOR(result);
+}
+
+Datum
+jsonb_to_tsvector(PG_FUNCTION_ARGS)
+{
+ Jsonb *jb = PG_GETARG_JSONB_P(0);
+ Jsonb *jbFlags = PG_GETARG_JSONB_P(1);
+ Oid cfgId;
+ TSVector result;
+ uint32 flags = parse_jsonb_index_flags(jbFlags);
+
+ cfgId = getTSCurrentConfig(true);
+ result = jsonb_to_tsvector_worker(cfgId, jb, flags);
+ PG_FREE_IF_COPY(jb, 0);
+ PG_FREE_IF_COPY(jbFlags, 1);
+
+ PG_RETURN_TSVECTOR(result);
+}
+
+/*
+ * Worker function for json(_string)_to_tsvector(_byid)
+ */
+static TSVector
+json_to_tsvector_worker(Oid cfgId, text *json, uint32 flags)
+{
+ TSVectorBuildState state;
+ ParsedText prs;
+
+ prs.words = NULL;
+ prs.curwords = 0;
+ state.prs = &prs;
+ state.cfgId = cfgId;
+
+ iterate_json_values(json, flags, &state, add_to_tsvector);
+
+ return make_tsvector(&prs);
+}
+
+Datum
+json_string_to_tsvector_byid(PG_FUNCTION_ARGS)
+{
+ Oid cfgId = PG_GETARG_OID(0);
+ text *json = PG_GETARG_TEXT_P(1);
+ TSVector result;
+
+ result = json_to_tsvector_worker(cfgId, json, jtiString);
+ PG_FREE_IF_COPY(json, 1);
+
+ PG_RETURN_TSVECTOR(result);
+}
+
+Datum
+json_string_to_tsvector(PG_FUNCTION_ARGS)
+{
+ text *json = PG_GETARG_TEXT_P(0);
+ Oid cfgId;
+ TSVector result;
+
+ cfgId = getTSCurrentConfig(true);
+ result = json_to_tsvector_worker(cfgId, json, jtiString);
+ PG_FREE_IF_COPY(json, 0);
+
+ PG_RETURN_TSVECTOR(result);
+}
+
+Datum
+json_to_tsvector_byid(PG_FUNCTION_ARGS)
+{
+ Oid cfgId = PG_GETARG_OID(0);
+ text *json = PG_GETARG_TEXT_P(1);
+ Jsonb *jbFlags = PG_GETARG_JSONB_P(2);
+ TSVector result;
+ uint32 flags = parse_jsonb_index_flags(jbFlags);
+
+ result = json_to_tsvector_worker(cfgId, json, flags);
+ PG_FREE_IF_COPY(json, 1);
+ PG_FREE_IF_COPY(jbFlags, 2);
+
+ PG_RETURN_TSVECTOR(result);
+}
+
+Datum
+json_to_tsvector(PG_FUNCTION_ARGS)
+{
+ text *json = PG_GETARG_TEXT_P(0);
+ Jsonb *jbFlags = PG_GETARG_JSONB_P(1);
+ Oid cfgId;
+ TSVector result;
+ uint32 flags = parse_jsonb_index_flags(jbFlags);
+
+ cfgId = getTSCurrentConfig(true);
+ result = json_to_tsvector_worker(cfgId, json, flags);
+ PG_FREE_IF_COPY(json, 0);
+ PG_FREE_IF_COPY(jbFlags, 1);
+
+ PG_RETURN_TSVECTOR(result);
+}
+
+/*
+ * Parse lexemes in an element of a json(b) value, add to TSVectorBuildState.
+ */
+static void
+add_to_tsvector(void *_state, char *elem_value, int elem_len)
+{
+ TSVectorBuildState *state = (TSVectorBuildState *) _state;
+ ParsedText *prs = state->prs;
+ int32 prevwords;
+
+ if (prs->words == NULL)
+ {
+ /*
+ * First time through: initialize words array to a reasonable size.
+ * (parsetext() will realloc it bigger as needed.)
+ */
+ prs->lenwords = 16;
+ prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
+ prs->curwords = 0;
+ prs->pos = 0;
+ }
+
+ prevwords = prs->curwords;
+
+ parsetext(state->cfgId, prs, elem_value, elem_len);
+
+ /*
+ * If we extracted any words from this JSON element, advance pos to create
+ * an artificial break between elements. This is because we don't want
+ * phrase searches to think that the last word in this element is adjacent
+ * to the first word in the next one.
+ */
+ if (prs->curwords > prevwords)
+ prs->pos += 1;
+}
+
+
+/*
+ * to_tsquery
+ */
+
+
+/*
+ * This function is used for morph parsing.
+ *
+ * The value is passed to parsetext which will call the right dictionary to
+ * lexize the word. If it turns out to be a stopword, we push a QI_VALSTOP
+ * to the stack.
+ *
+ * All words belonging to the same variant are pushed as an ANDed list,
+ * and different variants are ORed together.
+ */
+static void
+pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
+{
+ int32 count = 0;
+ ParsedText prs;
+ uint32 variant,
+ pos = 0,
+ cntvar = 0,
+ cntpos = 0,
+ cnt = 0;
+ MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque);
+
+ prs.lenwords = 4;
+ prs.curwords = 0;
+ prs.pos = 0;
+ prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
+
+ parsetext(data->cfg_id, &prs, strval, lenval);
+
+ if (prs.curwords > 0)
+ {
+ while (count < prs.curwords)
+ {
+ /*
+ * Were any stop words removed? If so, fill empty positions with
+ * placeholders linked by an appropriate operator.
+ */
+ if (pos > 0 && pos + 1 < prs.words[count].pos.pos)
+ {
+ while (pos + 1 < prs.words[count].pos.pos)
+ {
+ /* put placeholders for each missing stop word */
+ pushStop(state);
+ if (cntpos)
+ pushOperator(state, data->qoperator, 1);
+ cntpos++;
+ pos++;
+ }
+ }
+
+ /* save current word's position */
+ pos = prs.words[count].pos.pos;
+
+ /* Go through all variants obtained from this token */
+ cntvar = 0;
+ while (count < prs.curwords && pos == prs.words[count].pos.pos)
+ {
+ variant = prs.words[count].nvariant;
+
+ /* Push all words belonging to the same variant */
+ cnt = 0;
+ while (count < prs.curwords &&
+ pos == prs.words[count].pos.pos &&
+ variant == prs.words[count].nvariant)
+ {
+ pushValue(state,
+ prs.words[count].word,
+ prs.words[count].len,
+ weight,
+ ((prs.words[count].flags & TSL_PREFIX) || prefix));
+ pfree(prs.words[count].word);
+ if (cnt)
+ pushOperator(state, OP_AND, 0);
+ cnt++;
+ count++;
+ }
+
+ if (cntvar)
+ pushOperator(state, OP_OR, 0);
+ cntvar++;
+ }
+
+ if (cntpos)
+ {
+ /* distance may be useful */
+ pushOperator(state, data->qoperator, 1);
+ }
+
+ cntpos++;
+ }
+
+ pfree(prs.words);
+
+ }
+ else
+ pushStop(state);
+}
+
+Datum
+to_tsquery_byid(PG_FUNCTION_ARGS)
+{
+ text *in = PG_GETARG_TEXT_PP(1);
+ TSQuery query;
+ MorphOpaque data;
+
+ data.cfg_id = PG_GETARG_OID(0);
+
+ /*
+ * Passing OP_PHRASE as a qoperator makes tsquery require matching of word
+ * positions of a complex morph exactly match the tsvector. Also, when
+ * the complex morphs are connected with OP_PHRASE operator, we connect
+ * all their words into the OP_PHRASE sequence.
+ */
+ data.qoperator = OP_PHRASE;
+
+ query = parse_tsquery(text_to_cstring(in),
+ pushval_morph,
+ PointerGetDatum(&data),
+ 0);
+
+ PG_RETURN_TSQUERY(query);
+}
+
+Datum
+to_tsquery(PG_FUNCTION_ARGS)
+{
+ text *in = PG_GETARG_TEXT_PP(0);
+ Oid cfgId;
+
+ cfgId = getTSCurrentConfig(true);
+ PG_RETURN_DATUM(DirectFunctionCall2(to_tsquery_byid,
+ ObjectIdGetDatum(cfgId),
+ PointerGetDatum(in)));
+}
+
+Datum
+plainto_tsquery_byid(PG_FUNCTION_ARGS)
+{
+ text *in = PG_GETARG_TEXT_PP(1);
+ TSQuery query;
+ MorphOpaque data;
+
+ data.cfg_id = PG_GETARG_OID(0);
+
+ /*
+ * parse_tsquery() with P_TSQ_PLAIN flag takes the whole input text as a
+ * single morph. Passing OP_PHRASE as a qoperator makes tsquery require
+ * matching of all words independently on their positions.
+ */
+ data.qoperator = OP_AND;
+
+ query = parse_tsquery(text_to_cstring(in),
+ pushval_morph,
+ PointerGetDatum(&data),
+ P_TSQ_PLAIN);
+
+ PG_RETURN_POINTER(query);
+}
+
+Datum
+plainto_tsquery(PG_FUNCTION_ARGS)
+{
+ text *in = PG_GETARG_TEXT_PP(0);
+ Oid cfgId;
+
+ cfgId = getTSCurrentConfig(true);
+ PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid,
+ ObjectIdGetDatum(cfgId),
+ PointerGetDatum(in)));
+}
+
+
+Datum
+phraseto_tsquery_byid(PG_FUNCTION_ARGS)
+{
+ text *in = PG_GETARG_TEXT_PP(1);
+ TSQuery query;
+ MorphOpaque data;
+
+ data.cfg_id = PG_GETARG_OID(0);
+
+ /*
+ * parse_tsquery() with P_TSQ_PLAIN flag takes the whole input text as a
+ * single morph. Passing OP_PHRASE as a qoperator makes tsquery require
+ * matching of word positions.
+ */
+ data.qoperator = OP_PHRASE;
+
+ query = parse_tsquery(text_to_cstring(in),
+ pushval_morph,
+ PointerGetDatum(&data),
+ P_TSQ_PLAIN);
+
+ PG_RETURN_TSQUERY(query);
+}
+
+Datum
+phraseto_tsquery(PG_FUNCTION_ARGS)
+{
+ text *in = PG_GETARG_TEXT_PP(0);
+ Oid cfgId;
+
+ cfgId = getTSCurrentConfig(true);
+ PG_RETURN_DATUM(DirectFunctionCall2(phraseto_tsquery_byid,
+ ObjectIdGetDatum(cfgId),
+ PointerGetDatum(in)));
+}
+
+Datum
+websearch_to_tsquery_byid(PG_FUNCTION_ARGS)
+{
+ text *in = PG_GETARG_TEXT_PP(1);
+ MorphOpaque data;
+ TSQuery query = NULL;
+
+ data.cfg_id = PG_GETARG_OID(0);
+
+ /*
+ * Passing OP_PHRASE as a qoperator makes tsquery require matching of word
+ * positions of a complex morph exactly match the tsvector. Also, when
+ * the complex morphs are given in quotes, we connect all their words into
+ * the OP_PHRASE sequence.
+ */
+ data.qoperator = OP_PHRASE;
+
+ query = parse_tsquery(text_to_cstring(in),
+ pushval_morph,
+ PointerGetDatum(&data),
+ P_TSQ_WEB);
+
+ PG_RETURN_TSQUERY(query);
+}
+
+Datum
+websearch_to_tsquery(PG_FUNCTION_ARGS)
+{
+ text *in = PG_GETARG_TEXT_PP(0);
+ Oid cfgId;
+
+ cfgId = getTSCurrentConfig(true);
+ PG_RETURN_DATUM(DirectFunctionCall2(websearch_to_tsquery_byid,
+ ObjectIdGetDatum(cfgId),
+ PointerGetDatum(in)));
+
+}