summaryrefslogtreecommitdiffstats
path: root/src/backend/tsearch/ts_parse.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/tsearch/ts_parse.c')
-rw-r--r--src/backend/tsearch/ts_parse.c678
1 files changed, 678 insertions, 0 deletions
diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c
new file mode 100644
index 0000000..a87b442
--- /dev/null
+++ b/src/backend/tsearch/ts_parse.c
@@ -0,0 +1,678 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_parse.c
+ * main parse functions for tsearch
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/ts_parse.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_utils.h"
+
+#define IGNORE_LONGLEXEME 1
+
+/*
+ * Lexize subsystem
+ */
+
/*
 * One raw token emitted by the parser.  The text is NOT null-terminated;
 * lemm points into the parser's buffer and lenlemm gives its byte length.
 * Tokens are chained into singly-linked lists (see ListParsedLex).
 */
typedef struct ParsedLex
{
	int			type;			/* parser-assigned token type; LexizeExec
								 * treats type 0 as end-of-input */
	char	   *lemm;			/* token text (not null-terminated) */
	int			lenlemm;		/* byte length of lemm */
	struct ParsedLex *next;		/* next token in the list, or NULL */
} ParsedLex;
+
/*
 * Simple head/tail singly-linked list of ParsedLex, supporting O(1)
 * append (LPLAddTail) and O(1) head removal (LPLRemoveHead).
 */
typedef struct ListParsedLex
{
	ParsedLex  *head;			/* first element, or NULL if empty */
	ParsedLex  *tail;			/* last element, or NULL if empty */
} ListParsedLex;
+
/*
 * State carried through a lexizing run (see LexizeInit/LexizeExec).
 */
typedef struct
{
	TSConfigCacheEntry *cfg;	/* text search configuration in use */
	Oid			curDictId;		/* dictionary asking for more input, or
								 * InvalidOid when in normal mode */
	int			posDict;		/* next dictionary index to try for the
								 * current token */
	DictSubState dictState;		/* per-call state exchanged with dictionary */
	ParsedLex  *curSub;			/* next token to feed in multi-token mode */
	ListParsedLex towork;		/* current list to work */
	ListParsedLex waste;		/* list of lexemes that already lexized */

	/*
	 * Fields to store the last successful result while a dictionary
	 * (basically a thesaurus, or anything similar that wants to consume
	 * several lexemes) keeps asking for more input.
	 */

	ParsedLex  *lastRes;		/* token at which tmpRes was produced */
	TSLexeme   *tmpRes;			/* stashed partial result, or NULL */
} LexizeData;
+
+static void
+LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
+{
+ ld->cfg = cfg;
+ ld->curDictId = InvalidOid;
+ ld->posDict = 0;
+ ld->towork.head = ld->towork.tail = ld->curSub = NULL;
+ ld->waste.head = ld->waste.tail = NULL;
+ ld->lastRes = NULL;
+ ld->tmpRes = NULL;
+}
+
+static void
+LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
+{
+ if (list->tail)
+ {
+ list->tail->next = newpl;
+ list->tail = newpl;
+ }
+ else
+ list->head = list->tail = newpl;
+ newpl->next = NULL;
+}
+
+static ParsedLex *
+LPLRemoveHead(ListParsedLex *list)
+{
+ ParsedLex *res = list->head;
+
+ if (list->head)
+ list->head = list->head->next;
+
+ if (list->head == NULL)
+ list->tail = NULL;
+
+ return res;
+}
+
+static void
+LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
+{
+ ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
+
+ newpl->type = type;
+ newpl->lemm = lemm;
+ newpl->lenlemm = lenlemm;
+ LPLAddTail(&ld->towork, newpl);
+ ld->curSub = ld->towork.tail;
+}
+
+static void
+RemoveHead(LexizeData *ld)
+{
+ LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
+
+ ld->posDict = 0;
+}
+
+static void
+setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
+{
+ if (correspondLexem)
+ {
+ *correspondLexem = ld->waste.head;
+ }
+ else
+ {
+ ParsedLex *tmp,
+ *ptr = ld->waste.head;
+
+ while (ptr)
+ {
+ tmp = ptr->next;
+ pfree(ptr);
+ ptr = tmp;
+ }
+ }
+ ld->waste.head = ld->waste.tail = NULL;
+}
+
+static void
+moveToWaste(LexizeData *ld, ParsedLex *stop)
+{
+ bool go = true;
+
+ while (ld->towork.head && go)
+ {
+ if (ld->towork.head == stop)
+ {
+ ld->curSub = stop->next;
+ go = false;
+ }
+ RemoveHead(ld);
+ }
+}
+
+static void
+setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
+{
+ if (ld->tmpRes)
+ {
+ TSLexeme *ptr;
+
+ for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
+ pfree(ptr->lexeme);
+ pfree(ld->tmpRes);
+ }
+ ld->tmpRes = res;
+ ld->lastRes = lex;
+}
+
/*
 * LexizeExec - run accumulated tokens (ld->towork) through the configured
 * dictionaries and return the next array of normalized lexemes, or NULL
 * when nothing more can be produced from the tokens seen so far.
 *
 * If correspondLexem is non-NULL, *correspondLexem receives the list of
 * raw tokens consumed while producing the result (caller must free them);
 * otherwise those tokens are freed here (see setCorrLex).
 *
 * Two modes, selected by ld->curDictId:
 *	- InvalidOid: normal mode, each token is offered to its type's
 *	  dictionary list in order;
 *	- valid OID: that dictionary asked for more input (getnext), so we
 *	  keep feeding it subsequent tokens until it yields or gives up.
 */
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
	int			i;
	ListDictionary *map;
	TSDictionaryCacheEntry *dict;
	TSLexeme   *res;

	if (ld->curDictId == InvalidOid)
	{
		/*
		 * usual mode: dictionary wants only one word, but we should keep in
		 * mind that we should go through all stack
		 */

		while (ld->towork.head)
		{
			ParsedLex  *curVal = ld->towork.head;
			char	   *curValLemm = curVal->lemm;
			int			curValLenLemm = curVal->lenlemm;

			/*
			 * NB: map may point past cfg->map if type >= lenmap, but it is
			 * not dereferenced until after the range check below
			 * (short-circuit evaluation).
			 */
			map = ld->cfg->map + curVal->type;

			if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
			{
				/* skip this type of lexeme */
				RemoveHead(ld);
				continue;
			}

			/* try each dictionary mapped for this token type, in order */
			for (i = ld->posDict; i < map->len; i++)
			{
				dict = lookup_ts_dictionary_cache(map->dictIds[i]);

				ld->dictState.isend = ld->dictState.getnext = false;
				ld->dictState.private_state = NULL;
				res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
																 PointerGetDatum(dict->dictData),
																 PointerGetDatum(curValLemm),
																 Int32GetDatum(curValLenLemm),
																 PointerGetDatum(&ld->dictState)));

				if (ld->dictState.getnext)
				{
					/*
					 * dictionary wants next word, so setup and store current
					 * position and go to multiword mode
					 */

					ld->curDictId = DatumGetObjectId(map->dictIds[i]);
					ld->posDict = i + 1;
					ld->curSub = curVal->next;
					if (res)
						setNewTmpRes(ld, curVal, res);
					return LexizeExec(ld, correspondLexem);
				}

				if (!res)		/* dictionary doesn't know this lexeme */
					continue;

				if (res->flags & TSL_FILTER)
				{
					/* filtering dictionary: replace the token text and let
					 * the remaining dictionaries see the replacement */
					curValLemm = res->lexeme;
					curValLenLemm = strlen(res->lexeme);
					continue;
				}

				/* dictionary recognized the token: hand back its lexemes */
				RemoveHead(ld);
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/* no dictionary recognized this token; drop it and go on */
			RemoveHead(ld);
		}
	}
	else
	{							/* curDictId is valid */
		dict = lookup_ts_dictionary_cache(ld->curDictId);

		/*
		 * Dictionary ld->curDictId asks us about following words
		 */

		while (ld->curSub)
		{
			ParsedLex  *curVal = ld->curSub;

			map = ld->cfg->map + curVal->type;

			if (curVal->type != 0)
			{
				bool		dictExists = false;

				if (curVal->type >= ld->cfg->lenmap || map->len == 0)
				{
					/* skip this type of lexeme */
					ld->curSub = curVal->next;
					continue;
				}

				/*
				 * We should be sure that current type of lexeme is recognized
				 * by our dictionary: we just check is it exist in list of
				 * dictionaries ?
				 */
				for (i = 0; i < map->len && !dictExists; i++)
					if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
						dictExists = true;

				if (!dictExists)
				{
					/*
					 * Dictionary can't work with current type of lexeme,
					 * return to basic mode and redo all stored lexemes
					 */
					ld->curDictId = InvalidOid;
					return LexizeExec(ld, correspondLexem);
				}
			}

			/* type 0 marks end of input for the dictionary */
			ld->dictState.isend = (curVal->type == 0);
			ld->dictState.getnext = false;

			res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
															 PointerGetDatum(dict->dictData),
															 PointerGetDatum(curVal->lemm),
															 Int32GetDatum(curVal->lenlemm),
															 PointerGetDatum(&ld->dictState)));

			if (ld->dictState.getnext)
			{
				/* Dictionary wants one more */
				ld->curSub = curVal->next;
				if (res)
					setNewTmpRes(ld, curVal, res);
				continue;
			}

			if (res || ld->tmpRes)
			{
				/*
				 * Dictionary normalizes lexemes, so we remove from stack all
				 * used lexemes, return to basic mode and redo end of stack
				 * (if it exists)
				 */
				if (res)
				{
					moveToWaste(ld, ld->curSub);
				}
				else
				{
					/* fall back to the last stashed partial result */
					res = ld->tmpRes;
					moveToWaste(ld, ld->lastRes);
				}

				/* reset to initial state */
				ld->curDictId = InvalidOid;
				ld->posDict = 0;
				ld->lastRes = NULL;
				ld->tmpRes = NULL;
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/*
			 * Dict don't want next lexem and didn't recognize anything, redo
			 * from ld->towork.head
			 */
			ld->curDictId = InvalidOid;
			return LexizeExec(ld, correspondLexem);
		}
	}

	/* out of tokens: clean up (or return) the waste list and report done */
	setCorrLex(ld, correspondLexem);
	return NULL;
}
+
/*
 * Parse string and lexize words.
 *
 * prs will be filled in.  The caller must have initialized prs (words
 * array allocated, lenwords/curwords/pos set); this routine appends the
 * normalized lexemes of buf[0..buflen) to prs->words, advancing prs->pos
 * per produced lexeme array (plus TSL_ADDPOS adjustments).
 */
void
parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
{
	int			type,
				lenlemm;
	char	   *lemm = NULL;
	LexizeData	ldata;
	TSLexeme   *norms;
	TSConfigCacheEntry *cfg;
	TSParserCacheEntry *prsobj;
	void	   *prsdata;

	cfg = lookup_ts_config_cache(cfgId);
	prsobj = lookup_ts_parser_cache(cfg->prsId);

	/* let the parser create its private state for this text */
	prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
													 PointerGetDatum(buf),
													 Int32GetDatum(buflen)));

	LexizeInit(&ldata, cfg);

	do
	{
		/* fetch the next raw token; type <= 0 means end of text */
		type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
										   PointerGetDatum(prsdata),
										   PointerGetDatum(&lemm),
										   PointerGetDatum(&lenlemm)));

		if (type > 0 && lenlemm >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			/* skip over-length words with a notice instead of failing */
			ereport(NOTICE,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
			continue;
#else
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
#endif
		}

		LexizeAddLemm(&ldata, type, lemm, lenlemm);

		/* drain every lexeme array the lexizer can currently produce */
		while ((norms = LexizeExec(&ldata, NULL)) != NULL)
		{
			TSLexeme   *ptr = norms;

			prs->pos++;			/* set pos */

			while (ptr->lexeme)
			{
				/* double the words array when full */
				if (prs->curwords == prs->lenwords)
				{
					prs->lenwords *= 2;
					prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
				}

				if (ptr->flags & TSL_ADDPOS)
					prs->pos++;
				prs->words[prs->curwords].len = strlen(ptr->lexeme);
				prs->words[prs->curwords].word = ptr->lexeme;
				prs->words[prs->curwords].nvariant = ptr->nvariant;
				prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
				prs->words[prs->curwords].alen = 0;
				prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
				ptr++;
				prs->curwords++;
			}
			pfree(norms);
		}
	} while (type > 0);

	/* let the parser release its private state */
	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
+
+/*
+ * Headline framework
+ */
+
+/* Add a word to prs->words[] */
+static void
+hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
+{
+ if (prs->curwords >= prs->lenwords)
+ {
+ prs->lenwords *= 2;
+ prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
+ }
+ memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
+ prs->words[prs->curwords].type = (uint8) type;
+ prs->words[prs->curwords].len = buflen;
+ prs->words[prs->curwords].word = palloc(buflen);
+ memcpy(prs->words[prs->curwords].word, buf, buflen);
+ prs->curwords++;
+}
+
+/*
+ * Add pos and matching-query-item data to the just-added word.
+ * Here, buf/buflen represent a processed lexeme, not raw token text.
+ *
+ * If the query contains more than one matching item, we replicate
+ * the last-added word so that each item can be pointed to. The
+ * duplicate entries are marked with repeated = 1.
+ */
+static void
+hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
+{
+ int i;
+ QueryItem *item = GETQUERY(query);
+ HeadlineWordEntry *word;
+
+ while (prs->curwords + query->size >= prs->lenwords)
+ {
+ prs->lenwords *= 2;
+ prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
+ }
+
+ word = &(prs->words[prs->curwords - 1]);
+ word->pos = LIMITPOS(pos);
+ for (i = 0; i < query->size; i++)
+ {
+ if (item->type == QI_VAL &&
+ tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
+ buf, buflen, item->qoperand.prefix) == 0)
+ {
+ if (word->item)
+ {
+ memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
+ prs->words[prs->curwords].item = &item->qoperand;
+ prs->words[prs->curwords].repeated = 1;
+ prs->curwords++;
+ }
+ else
+ word->item = &item->qoperand;
+ }
+ item++;
+ }
+}
+
+static void
+addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
+{
+ ParsedLex *tmplexs;
+ TSLexeme *ptr;
+ int32 savedpos;
+
+ while (lexs)
+ {
+ if (lexs->type > 0)
+ hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
+
+ ptr = norms;
+ savedpos = prs->vectorpos;
+ while (ptr && ptr->lexeme)
+ {
+ if (ptr->flags & TSL_ADDPOS)
+ savedpos++;
+ hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
+ ptr++;
+ }
+
+ tmplexs = lexs->next;
+ pfree(lexs);
+ lexs = tmplexs;
+ }
+
+ if (norms)
+ {
+ ptr = norms;
+ while (ptr->lexeme)
+ {
+ if (ptr->flags & TSL_ADDPOS)
+ prs->vectorpos++;
+ pfree(ptr->lexeme);
+ ptr++;
+ }
+ pfree(norms);
+ }
+}
+
/*
 * Parse and lexize buf[0..buflen) for headline generation, filling
 * prs->words with raw tokens annotated by matches against "query".
 * Structure parallels parsetext(), but the raw tokens are kept (via the
 * correspondLexem output of LexizeExec) instead of only the normalized
 * lexemes.
 */
void
hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
{
	int			type,
				lenlemm;
	char	   *lemm = NULL;
	LexizeData	ldata;
	TSLexeme   *norms;
	ParsedLex  *lexs;
	TSConfigCacheEntry *cfg;
	TSParserCacheEntry *prsobj;
	void	   *prsdata;

	cfg = lookup_ts_config_cache(cfgId);
	prsobj = lookup_ts_parser_cache(cfg->prsId);

	/* let the parser create its private state for this text */
	prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
													 PointerGetDatum(buf),
													 Int32GetDatum(buflen)));

	LexizeInit(&ldata, cfg);

	do
	{
		/* fetch the next raw token; type <= 0 means end of text */
		type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
										   PointerGetDatum(prsdata),
										   PointerGetDatum(&lemm),
										   PointerGetDatum(&lenlemm)));

		if (type > 0 && lenlemm >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			/* skip over-length words with a notice instead of failing */
			ereport(NOTICE,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
			continue;
#else
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
#endif
		}

		LexizeAddLemm(&ldata, type, lemm, lenlemm);

		/*
		 * Drain the lexizer; even when no normalized lexemes come back
		 * (norms == NULL) the consumed raw tokens (lexs) must still be
		 * emitted and freed.
		 */
		do
		{
			if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
			{
				prs->vectorpos++;
				addHLParsedLex(prs, query, lexs, norms);
			}
			else
				addHLParsedLex(prs, query, lexs, NULL);
		} while (norms);
	} while (type > 0);

	/* let the parser release its private state */
	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
+
/*
 * Generate the headline, as a text object, from HeadlineParsedText.
 *
 * Words marked "in" are emitted (wrapped in startsel/stopsel when
 * selected); fragments are separated by fragdelim.  Entries with
 * repeated = 1 are duplicates created by hlfinditem and are skipped.
 * The word storage of non-emitted entries is freed here.
 */
text *
generateHeadline(HeadlineParsedText *prs)
{
	text	   *out;
	char	   *ptr;				/* write position within out */
	int			len = 128;			/* current allocated size of out */
	int			numfragments = 0;
	int16		infrag = 0;			/* currently inside a fragment? */

	HeadlineWordEntry *wrd = prs->words;

	out = (text *) palloc(len);
	ptr = ((char *) out) + VARHDRSZ;

	while (wrd - prs->words < prs->curwords)
	{
		/*
		 * Grow the buffer until the worst case for this word (word text
		 * plus both selection markers plus a fragment delimiter) fits.
		 */
		while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
		{
			int			dist = ptr - ((char *) out);

			len *= 2;
			out = (text *) repalloc(out, len);
			ptr = ((char *) out) + dist;	/* re-anchor after repalloc */
		}

		if (wrd->in && !wrd->repeated)
		{
			if (!infrag)
			{

				/* start of a new fragment */
				infrag = 1;
				numfragments++;
				/* add a fragment delimiter if this is after the first one */
				if (numfragments > 1)
				{
					memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
					ptr += prs->fragdelimlen;
				}
			}
			if (wrd->replace)
			{
				/* replaced word is rendered as a single space */
				*ptr = ' ';
				ptr++;
			}
			else if (!wrd->skip)
			{
				if (wrd->selected)
				{
					memcpy(ptr, prs->startsel, prs->startsellen);
					ptr += prs->startsellen;
				}
				memcpy(ptr, wrd->word, wrd->len);
				ptr += wrd->len;
				if (wrd->selected)
				{
					memcpy(ptr, prs->stopsel, prs->stopsellen);
					ptr += prs->stopsellen;
				}
			}
		}
		else if (!wrd->repeated)
		{
			/* word not in the headline: close any open fragment, free it */
			if (infrag)
				infrag = 0;
			pfree(wrd->word);
		}

		wrd++;
	}

	SET_VARSIZE(out, ptr - ((char *) out));
	return out;
}