diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
commit | 46651ce6fe013220ed397add242004d764fc0153 (patch) | |
tree | 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/tsearch/to_tsany.c | |
parent | Initial commit. (diff) | |
download | postgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip |
Adding upstream version 14.5. [upstream/14.5] [upstream]
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/tsearch/to_tsany.c')
-rw-r--r-- | src/backend/tsearch/to_tsany.c | 724 |
1 files changed, 724 insertions, 0 deletions
diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c new file mode 100644 index 0000000..f4ddfc0 --- /dev/null +++ b/src/backend/tsearch/to_tsany.c @@ -0,0 +1,724 @@ +/*------------------------------------------------------------------------- + * + * to_tsany.c + * to_ts* function definitions + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/to_tsany.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "common/jsonapi.h" +#include "tsearch/ts_cache.h" +#include "tsearch/ts_utils.h" +#include "utils/builtins.h" +#include "utils/jsonfuncs.h" + + +/* + * Opaque data structure, which is passed by parse_tsquery() to pushval_morph(). + */ +typedef struct MorphOpaque +{ + Oid cfg_id; + + /* + * Single tsquery morph could be parsed into multiple words. When these + * words reside in adjacent positions, they are connected using this + * operator. Usually, that is OP_PHRASE, which requires word positions of + * a complex morph to exactly match the tsvector. + */ + int qoperator; +} MorphOpaque; + +typedef struct TSVectorBuildState +{ + ParsedText *prs; + Oid cfgId; +} TSVectorBuildState; + +static void add_to_tsvector(void *_state, char *elem_value, int elem_len); + + +Datum +get_current_ts_config(PG_FUNCTION_ARGS) +{ + PG_RETURN_OID(getTSCurrentConfig(true)); +} + +/* + * to_tsvector + */ +static int +compareWORD(const void *a, const void *b) +{ + int res; + + res = tsCompareString(((const ParsedWord *) a)->word, ((const ParsedWord *) a)->len, + ((const ParsedWord *) b)->word, ((const ParsedWord *) b)->len, + false); + + if (res == 0) + { + if (((const ParsedWord *) a)->pos.pos == ((const ParsedWord *) b)->pos.pos) + return 0; + + res = (((const ParsedWord *) a)->pos.pos > ((const ParsedWord *) b)->pos.pos) ? 
1 : -1; + } + + return res; +} + +static int +uniqueWORD(ParsedWord *a, int32 l) +{ + ParsedWord *ptr, + *res; + int tmppos; + + if (l == 1) + { + tmppos = LIMITPOS(a->pos.pos); + a->alen = 2; + a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen); + a->pos.apos[0] = 1; + a->pos.apos[1] = tmppos; + return l; + } + + res = a; + ptr = a + 1; + + /* + * Sort words with its positions + */ + qsort((void *) a, l, sizeof(ParsedWord), compareWORD); + + /* + * Initialize first word and its first position + */ + tmppos = LIMITPOS(a->pos.pos); + a->alen = 2; + a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen); + a->pos.apos[0] = 1; + a->pos.apos[1] = tmppos; + + /* + * Summarize position information for each word + */ + while (ptr - a < l) + { + if (!(ptr->len == res->len && + strncmp(ptr->word, res->word, res->len) == 0)) + { + /* + * Got a new word, so put it in result + */ + res++; + res->len = ptr->len; + res->word = ptr->word; + tmppos = LIMITPOS(ptr->pos.pos); + res->alen = 2; + res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen); + res->pos.apos[0] = 1; + res->pos.apos[1] = tmppos; + } + else + { + /* + * The word already exists, so adjust position information. But + * before we should check size of position's array, max allowed + * value for position and uniqueness of position + */ + pfree(ptr->word); + if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 && + res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos)) + { + if (res->pos.apos[0] + 1 >= res->alen) + { + res->alen *= 2; + res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen); + } + if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos)) + { + res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos); + res->pos.apos[0]++; + } + } + } + ptr++; + } + + return res + 1 - a; +} + +/* + * make value of tsvector, given parsed text + * + * Note: frees prs->words and subsidiary data. 
+ */ +TSVector +make_tsvector(ParsedText *prs) +{ + int i, + j, + lenstr = 0, + totallen; + TSVector in; + WordEntry *ptr; + char *str; + int stroff; + + /* Merge duplicate words */ + if (prs->curwords > 0) + prs->curwords = uniqueWORD(prs->words, prs->curwords); + + /* Determine space needed */ + for (i = 0; i < prs->curwords; i++) + { + lenstr += prs->words[i].len; + if (prs->words[i].alen) + { + lenstr = SHORTALIGN(lenstr); + lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos); + } + } + + if (lenstr > MAXSTRPOS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS))); + + totallen = CALCDATASIZE(prs->curwords, lenstr); + in = (TSVector) palloc0(totallen); + SET_VARSIZE(in, totallen); + in->size = prs->curwords; + + ptr = ARRPTR(in); + str = STRPTR(in); + stroff = 0; + for (i = 0; i < prs->curwords; i++) + { + ptr->len = prs->words[i].len; + ptr->pos = stroff; + memcpy(str + stroff, prs->words[i].word, prs->words[i].len); + stroff += prs->words[i].len; + pfree(prs->words[i].word); + if (prs->words[i].alen) + { + int k = prs->words[i].pos.apos[0]; + WordEntryPos *wptr; + + if (k > 0xFFFF) + elog(ERROR, "positions array too long"); + + ptr->haspos = 1; + stroff = SHORTALIGN(stroff); + *(uint16 *) (str + stroff) = (uint16) k; + wptr = POSDATAPTR(in, ptr); + for (j = 0; j < k; j++) + { + WEP_SETWEIGHT(wptr[j], 0); + WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]); + } + stroff += sizeof(uint16) + k * sizeof(WordEntryPos); + pfree(prs->words[i].pos.apos); + } + else + ptr->haspos = 0; + ptr++; + } + + if (prs->words) + pfree(prs->words); + + return in; +} + +Datum +to_tsvector_byid(PG_FUNCTION_ARGS) +{ + Oid cfgId = PG_GETARG_OID(0); + text *in = PG_GETARG_TEXT_PP(1); + ParsedText prs; + TSVector out; + + prs.lenwords = VARSIZE_ANY_EXHDR(in) / 6; /* just estimation of word's + * number */ + if (prs.lenwords < 2) + prs.lenwords = 2; + prs.curwords 
= 0; + prs.pos = 0; + prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); + + parsetext(cfgId, &prs, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in)); + + PG_FREE_IF_COPY(in, 1); + + out = make_tsvector(&prs); + + PG_RETURN_TSVECTOR(out); +} + +Datum +to_tsvector(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_PP(0); + Oid cfgId; + + cfgId = getTSCurrentConfig(true); + PG_RETURN_DATUM(DirectFunctionCall2(to_tsvector_byid, + ObjectIdGetDatum(cfgId), + PointerGetDatum(in))); +} + +/* + * Worker function for jsonb(_string)_to_tsvector(_byid) + */ +static TSVector +jsonb_to_tsvector_worker(Oid cfgId, Jsonb *jb, uint32 flags) +{ + TSVectorBuildState state; + ParsedText prs; + + prs.words = NULL; + prs.curwords = 0; + state.prs = &prs; + state.cfgId = cfgId; + + iterate_jsonb_values(jb, flags, &state, add_to_tsvector); + + return make_tsvector(&prs); +} + +Datum +jsonb_string_to_tsvector_byid(PG_FUNCTION_ARGS) +{ + Oid cfgId = PG_GETARG_OID(0); + Jsonb *jb = PG_GETARG_JSONB_P(1); + TSVector result; + + result = jsonb_to_tsvector_worker(cfgId, jb, jtiString); + PG_FREE_IF_COPY(jb, 1); + + PG_RETURN_TSVECTOR(result); +} + +Datum +jsonb_string_to_tsvector(PG_FUNCTION_ARGS) +{ + Jsonb *jb = PG_GETARG_JSONB_P(0); + Oid cfgId; + TSVector result; + + cfgId = getTSCurrentConfig(true); + result = jsonb_to_tsvector_worker(cfgId, jb, jtiString); + PG_FREE_IF_COPY(jb, 0); + + PG_RETURN_TSVECTOR(result); +} + +Datum +jsonb_to_tsvector_byid(PG_FUNCTION_ARGS) +{ + Oid cfgId = PG_GETARG_OID(0); + Jsonb *jb = PG_GETARG_JSONB_P(1); + Jsonb *jbFlags = PG_GETARG_JSONB_P(2); + TSVector result; + uint32 flags = parse_jsonb_index_flags(jbFlags); + + result = jsonb_to_tsvector_worker(cfgId, jb, flags); + PG_FREE_IF_COPY(jb, 1); + PG_FREE_IF_COPY(jbFlags, 2); + + PG_RETURN_TSVECTOR(result); +} + +Datum +jsonb_to_tsvector(PG_FUNCTION_ARGS) +{ + Jsonb *jb = PG_GETARG_JSONB_P(0); + Jsonb *jbFlags = PG_GETARG_JSONB_P(1); + Oid cfgId; + TSVector result; + uint32 flags = 
parse_jsonb_index_flags(jbFlags); + + cfgId = getTSCurrentConfig(true); + result = jsonb_to_tsvector_worker(cfgId, jb, flags); + PG_FREE_IF_COPY(jb, 0); + PG_FREE_IF_COPY(jbFlags, 1); + + PG_RETURN_TSVECTOR(result); +} + +/* + * Worker function for json(_string)_to_tsvector(_byid) + */ +static TSVector +json_to_tsvector_worker(Oid cfgId, text *json, uint32 flags) +{ + TSVectorBuildState state; + ParsedText prs; + + prs.words = NULL; + prs.curwords = 0; + state.prs = &prs; + state.cfgId = cfgId; + + iterate_json_values(json, flags, &state, add_to_tsvector); + + return make_tsvector(&prs); +} + +Datum +json_string_to_tsvector_byid(PG_FUNCTION_ARGS) +{ + Oid cfgId = PG_GETARG_OID(0); + text *json = PG_GETARG_TEXT_P(1); + TSVector result; + + result = json_to_tsvector_worker(cfgId, json, jtiString); + PG_FREE_IF_COPY(json, 1); + + PG_RETURN_TSVECTOR(result); +} + +Datum +json_string_to_tsvector(PG_FUNCTION_ARGS) +{ + text *json = PG_GETARG_TEXT_P(0); + Oid cfgId; + TSVector result; + + cfgId = getTSCurrentConfig(true); + result = json_to_tsvector_worker(cfgId, json, jtiString); + PG_FREE_IF_COPY(json, 0); + + PG_RETURN_TSVECTOR(result); +} + +Datum +json_to_tsvector_byid(PG_FUNCTION_ARGS) +{ + Oid cfgId = PG_GETARG_OID(0); + text *json = PG_GETARG_TEXT_P(1); + Jsonb *jbFlags = PG_GETARG_JSONB_P(2); + TSVector result; + uint32 flags = parse_jsonb_index_flags(jbFlags); + + result = json_to_tsvector_worker(cfgId, json, flags); + PG_FREE_IF_COPY(json, 1); + PG_FREE_IF_COPY(jbFlags, 2); + + PG_RETURN_TSVECTOR(result); +} + +Datum +json_to_tsvector(PG_FUNCTION_ARGS) +{ + text *json = PG_GETARG_TEXT_P(0); + Jsonb *jbFlags = PG_GETARG_JSONB_P(1); + Oid cfgId; + TSVector result; + uint32 flags = parse_jsonb_index_flags(jbFlags); + + cfgId = getTSCurrentConfig(true); + result = json_to_tsvector_worker(cfgId, json, flags); + PG_FREE_IF_COPY(json, 0); + PG_FREE_IF_COPY(jbFlags, 1); + + PG_RETURN_TSVECTOR(result); +} + +/* + * Parse lexemes in an element of a json(b) value, add to 
TSVectorBuildState. + */ +static void +add_to_tsvector(void *_state, char *elem_value, int elem_len) +{ + TSVectorBuildState *state = (TSVectorBuildState *) _state; + ParsedText *prs = state->prs; + int32 prevwords; + + if (prs->words == NULL) + { + /* + * First time through: initialize words array to a reasonable size. + * (parsetext() will realloc it bigger as needed.) + */ + prs->lenwords = 16; + prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords); + prs->curwords = 0; + prs->pos = 0; + } + + prevwords = prs->curwords; + + parsetext(state->cfgId, prs, elem_value, elem_len); + + /* + * If we extracted any words from this JSON element, advance pos to create + * an artificial break between elements. This is because we don't want + * phrase searches to think that the last word in this element is adjacent + * to the first word in the next one. + */ + if (prs->curwords > prevwords) + prs->pos += 1; +} + + +/* + * to_tsquery + */ + + +/* + * This function is used for morph parsing. + * + * The value is passed to parsetext which will call the right dictionary to + * lexize the word. If it turns out to be a stopword, we push a QI_VALSTOP + * to the stack. + * + * All words belonging to the same variant are pushed as an ANDed list, + * and different variants are ORed together. + */ +static void +pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix) +{ + int32 count = 0; + ParsedText prs; + uint32 variant, + pos = 0, + cntvar = 0, + cntpos = 0, + cnt = 0; + MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque); + + prs.lenwords = 4; + prs.curwords = 0; + prs.pos = 0; + prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); + + parsetext(data->cfg_id, &prs, strval, lenval); + + if (prs.curwords > 0) + { + while (count < prs.curwords) + { + /* + * Were any stop words removed? If so, fill empty positions with + * placeholders linked by an appropriate operator. 
+ */ + if (pos > 0 && pos + 1 < prs.words[count].pos.pos) + { + while (pos + 1 < prs.words[count].pos.pos) + { + /* put placeholders for each missing stop word */ + pushStop(state); + if (cntpos) + pushOperator(state, data->qoperator, 1); + cntpos++; + pos++; + } + } + + /* save current word's position */ + pos = prs.words[count].pos.pos; + + /* Go through all variants obtained from this token */ + cntvar = 0; + while (count < prs.curwords && pos == prs.words[count].pos.pos) + { + variant = prs.words[count].nvariant; + + /* Push all words belonging to the same variant */ + cnt = 0; + while (count < prs.curwords && + pos == prs.words[count].pos.pos && + variant == prs.words[count].nvariant) + { + pushValue(state, + prs.words[count].word, + prs.words[count].len, + weight, + ((prs.words[count].flags & TSL_PREFIX) || prefix)); + pfree(prs.words[count].word); + if (cnt) + pushOperator(state, OP_AND, 0); + cnt++; + count++; + } + + if (cntvar) + pushOperator(state, OP_OR, 0); + cntvar++; + } + + if (cntpos) + { + /* distance may be useful */ + pushOperator(state, data->qoperator, 1); + } + + cntpos++; + } + + pfree(prs.words); + + } + else + pushStop(state); +} + +Datum +to_tsquery_byid(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_PP(1); + TSQuery query; + MorphOpaque data; + + data.cfg_id = PG_GETARG_OID(0); + + /* + * Passing OP_PHRASE as a qoperator makes tsquery require matching of word + * positions of a complex morph exactly match the tsvector. Also, when + * the complex morphs are connected with OP_PHRASE operator, we connect + * all their words into the OP_PHRASE sequence. 
+ */ + data.qoperator = OP_PHRASE; + + query = parse_tsquery(text_to_cstring(in), + pushval_morph, + PointerGetDatum(&data), + 0); + + PG_RETURN_TSQUERY(query); +} + +Datum +to_tsquery(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_PP(0); + Oid cfgId; + + cfgId = getTSCurrentConfig(true); + PG_RETURN_DATUM(DirectFunctionCall2(to_tsquery_byid, + ObjectIdGetDatum(cfgId), + PointerGetDatum(in))); +} + +Datum +plainto_tsquery_byid(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_PP(1); + TSQuery query; + MorphOpaque data; + + data.cfg_id = PG_GETARG_OID(0); + + /* + * parse_tsquery() with P_TSQ_PLAIN flag takes the whole input text as a + * single morph. Passing OP_PHRASE as a qoperator makes tsquery require + * matching of all words independently on their positions. + */ + data.qoperator = OP_AND; + + query = parse_tsquery(text_to_cstring(in), + pushval_morph, + PointerGetDatum(&data), + P_TSQ_PLAIN); + + PG_RETURN_POINTER(query); +} + +Datum +plainto_tsquery(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_PP(0); + Oid cfgId; + + cfgId = getTSCurrentConfig(true); + PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid, + ObjectIdGetDatum(cfgId), + PointerGetDatum(in))); +} + + +Datum +phraseto_tsquery_byid(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_PP(1); + TSQuery query; + MorphOpaque data; + + data.cfg_id = PG_GETARG_OID(0); + + /* + * parse_tsquery() with P_TSQ_PLAIN flag takes the whole input text as a + * single morph. Passing OP_PHRASE as a qoperator makes tsquery require + * matching of word positions. 
+ */ + data.qoperator = OP_PHRASE; + + query = parse_tsquery(text_to_cstring(in), + pushval_morph, + PointerGetDatum(&data), + P_TSQ_PLAIN); + + PG_RETURN_TSQUERY(query); +} + +Datum +phraseto_tsquery(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_PP(0); + Oid cfgId; + + cfgId = getTSCurrentConfig(true); + PG_RETURN_DATUM(DirectFunctionCall2(phraseto_tsquery_byid, + ObjectIdGetDatum(cfgId), + PointerGetDatum(in))); +} + +Datum +websearch_to_tsquery_byid(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_PP(1); + MorphOpaque data; + TSQuery query = NULL; + + data.cfg_id = PG_GETARG_OID(0); + + /* + * Passing OP_PHRASE as a qoperator makes tsquery require matching of word + * positions of a complex morph exactly match the tsvector. Also, when + * the complex morphs are given in quotes, we connect all their words into + * the OP_PHRASE sequence. + */ + data.qoperator = OP_PHRASE; + + query = parse_tsquery(text_to_cstring(in), + pushval_morph, + PointerGetDatum(&data), + P_TSQ_WEB); + + PG_RETURN_TSQUERY(query); +} + +Datum +websearch_to_tsquery(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_PP(0); + Oid cfgId; + + cfgId = getTSCurrentConfig(true); + PG_RETURN_DATUM(DirectFunctionCall2(websearch_to_tsquery_byid, + ObjectIdGetDatum(cfgId), + PointerGetDatum(in))); + +} |