1 files changed, 678 insertions, 0 deletions
diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c
new file mode 100644
index 0000000..a87b442
--- /dev/null
+++ b/src/backend/tsearch/ts_parse.c
@@ -0,0 +1,678 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_parse.c
+ *		main parse functions for tsearch
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/tsearch/ts_parse.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_utils.h"
+
+#define IGNORE_LONGLEXEME	1	/* if defined, over-long tokens get a NOTICE and are skipped, not an ERROR */
+
+/*
+ * Lexize subsystem
+ */
+/* One token obtained from the parser, kept as a node of a singly linked list. */
+typedef struct ParsedLex
+{
+	int			type;			/* token type as returned by the prstoken call */
+	char	   *lemm;			/* token text; stored by pointer, not copied */
+	int			lenlemm;		/* length reported by the parser for lemm */
+	struct ParsedLex *next;		/* next node; NULL for the last queued node */
+} ParsedLex;
+/* Singly linked queue of ParsedLex nodes; head and tail are both NULL when it is empty. */
+typedef struct ListParsedLex
+{
+	ParsedLex  *head;			/* first node; removed by LPLRemoveHead() */
+	ParsedLex  *tail;			/* last node; LPLAddTail() appends after it */
+} ListParsedLex;
+/* Working state of the lexize loop; it is carried across LexizeExec() calls. */
+typedef struct
+{
+	TSConfigCacheEntry *cfg;	/* configuration whose token-type map is consulted */
+	Oid			curDictId;		/* InvalidOid, or the dictionary that asked for more words */
+	int			posDict;		/* index in map->dictIds at which the head token resumes */
+	DictSubState dictState;		/* sub-state handed to each lexize call */
+	ParsedLex  *curSub;			/* next token to offer while in multiword mode */
+	ListParsedLex towork;		/* tokens that still have to be lexized */
+	ListParsedLex waste;		/* tokens that have already been lexized */
+
+	/*
+	 * fields to store the last variant to lexize (basically for a thesaurus
+	 * or a similar dictionary, which wants several lexemes)
+	 */
+
+	ParsedLex  *lastRes;		/* token at which tmpRes was produced */
+	TSLexeme   *tmpRes;			/* last result saved while the dictionary wanted more */
+} LexizeData;
+/* Put ld into its initial state for configuration cfg: empty lists, no pending result. */
+static void
+LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
+{
+	ld->cfg = cfg;
+	ld->curDictId = InvalidOid;	/* no dictionary is waiting for further tokens */
+	ld->posDict = 0;
+	ld->towork.head = ld->towork.tail = ld->curSub = NULL;
+	ld->waste.head = ld->waste.tail = NULL;
+	ld->lastRes = NULL;
+	ld->tmpRes = NULL;
+}
+/* Append newpl to list; newpl becomes the tail and its next pointer is cleared. */
+static void
+LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
+{
+	if (list->tail)
+	{
+		list->tail->next = newpl;
+		list->tail = newpl;
+	}
+	else
+		list->head = list->tail = newpl;	/* no tail yet: newpl is the only node */
+	newpl->next = NULL;
+}
+/* Detach the first node of list and return it, or return NULL if the list is empty. */
+static ParsedLex *
+LPLRemoveHead(ListParsedLex *list)
+{
+	ParsedLex  *res = list->head;
+
+	if (list->head)
+		list->head = list->head->next;
+	/* keep tail consistent once the last node is gone (or the list was empty) */
+	if (list->head == NULL)
+		list->tail = NULL;
+	/* res->next is left as it was; LPLAddTail() resets it when res is re-queued */
+	return res;
+}
+/* Queue a new token at the end of ld->towork and point ld->curSub at it. */
+static void
+LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
+{
+	ParsedLex  *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
+	/* lemm is stored by pointer; the text itself is not copied */
+	newpl->type = type;
+	newpl->lemm = lemm;
+	newpl->lenlemm = lenlemm;
+	LPLAddTail(&ld->towork, newpl);
+	ld->curSub = ld->towork.tail;
+}
+/* Move the first token of ld->towork (which must exist) to the end of ld->waste. */
+static void
+RemoveHead(LexizeData *ld)
+{
+	LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
+	/* the next head token is tried from the first dictionary of its map entry */
+	ld->posDict = 0;
+}
+/* Hand the waste list to the caller through *correspondLexem, or free it if that is NULL. */
+static void
+setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
+{
+	if (correspondLexem)
+	{
+		*correspondLexem = ld->waste.head;	/* caller now holds these nodes */
+	}
+	else
+	{
+		ParsedLex  *tmp,
+				   *ptr = ld->waste.head;
+		/* free the list nodes only; the lemm strings are not touched */
+		while (ptr)
+		{
+			tmp = ptr->next;
+			pfree(ptr);
+			ptr = tmp;
+		}
+	}
+	ld->waste.head = ld->waste.tail = NULL;	/* either way the waste list is now empty */
+}
+/* Move tokens from ld->towork to ld->waste, stopping after stop (all of them if not found). */
+static void
+moveToWaste(LexizeData *ld, ParsedLex *stop)
+{
+	bool		go = true;
+
+	while (ld->towork.head && go)
+	{
+		if (ld->towork.head == stop)
+		{
+			ld->curSub = stop->next;	/* read before RemoveHead() clears stop->next */
+			go = false;
+		}
+		RemoveHead(ld);
+	}
+}
+/* Remember res as the pending result produced at token lex, freeing any earlier one. */
+static void
+setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
+{
+	if (ld->tmpRes)
+	{
+		TSLexeme   *ptr;
+		/* the array ends at the first entry whose lexeme is NULL */
+		for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
+			pfree(ptr->lexeme);
+		pfree(ld->tmpRes);
+	}
+	ld->tmpRes = res;
+	ld->lastRes = lex;
+}
+/* Run dictionaries over the queued tokens; return the next lexeme array, or NULL if none. */
+static TSLexeme *
+LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
+{
+	int			i;
+	ListDictionary *map;
+	TSDictionaryCacheEntry *dict;
+	TSLexeme   *res;
+
+	if (ld->curDictId == InvalidOid)
+	{
+		/*
+		 * usual mode: each dictionary wants only one word, but we must still
+		 * work through every token queued in towork
+		 */
+
+		while (ld->towork.head)
+		{
+			ParsedLex  *curVal = ld->towork.head;
+			char	   *curValLemm = curVal->lemm;
+			int			curValLenLemm = curVal->lenlemm;
+
+			map = ld->cfg->map + curVal->type;
+			/* map is dereferenced only after the type range check (|| short-circuits) */
+			if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
+			{
+				/* skip this type of lexeme */
+				RemoveHead(ld);
+				continue;
+			}
+			/* resume at posDict; it is nonzero only after a multiword attempt fell back */
+			for (i = ld->posDict; i < map->len; i++)
+			{
+				dict = lookup_ts_dictionary_cache(map->dictIds[i]);
+				/* each dictionary starts from a fresh sub-state */
+				ld->dictState.isend = ld->dictState.getnext = false;
+				ld->dictState.private_state = NULL;
+				res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
+																 PointerGetDatum(dict->dictData),
+																 PointerGetDatum(curValLemm),
+																 Int32GetDatum(curValLenLemm),
+																 PointerGetDatum(&ld->dictState)));
+
+				if (ld->dictState.getnext)
+				{
+					/*
+					 * dictionary wants next word, so setup and store current
+					 * position and go to multiword mode
+					 */
+
+					ld->curDictId = DatumGetObjectId(map->dictIds[i]);
+					ld->posDict = i + 1;
+					ld->curSub = curVal->next;
+					if (res)
+						setNewTmpRes(ld, curVal, res);
+					return LexizeExec(ld, correspondLexem);
+				}
+
+				if (!res)		/* dictionary doesn't know this lexeme */
+					continue;
+				/* filtering result: feed its lexeme to the remaining dictionaries */
+				if (res->flags & TSL_FILTER)
+				{
+					curValLemm = res->lexeme;
+					curValLenLemm = strlen(res->lexeme);
+					continue;
+				}
+				/* accepted: the token is consumed and res goes back to the caller */
+				RemoveHead(ld);
+				setCorrLex(ld, correspondLexem);
+				return res;
+			}
+			/* no dictionary in the map returned a final result: move the token to waste */
+			RemoveHead(ld);
+		}
+	}
+	else
+	{							/* curDictId is valid */
+		dict = lookup_ts_dictionary_cache(ld->curDictId);
+
+		/*
+		 * Dictionary ld->curDictId asks us about following words
+		 */
+
+		while (ld->curSub)
+		{
+			ParsedLex  *curVal = ld->curSub;
+
+			map = ld->cfg->map + curVal->type;
+
+			if (curVal->type != 0)
+			{
+				bool		dictExists = false;
+
+				if (curVal->type >= ld->cfg->lenmap || map->len == 0)
+				{
+					/* skip this type of lexeme */
+					ld->curSub = curVal->next;
+					continue;
+				}
+
+				/*
+				 * Make sure our dictionary handles this token type: check
+				 * that it appears in the list of dictionaries mapped to the
+				 * type.
+				 */
+				for (i = 0; i < map->len && !dictExists; i++)
+					if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
+						dictExists = true;
+
+				if (!dictExists)
+				{
+					/*
+					 * Dictionary can't work with current type of lexeme,
+					 * return to basic mode and redo all stored lexemes
+					 */
+					ld->curDictId = InvalidOid;
+					return LexizeExec(ld, correspondLexem);
+				}
+			}
+			/* a type-0 token is offered to the dictionary with isend set */
+			ld->dictState.isend = (curVal->type == 0);
+			ld->dictState.getnext = false;
+
+			res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
+															 PointerGetDatum(dict->dictData),
+															 PointerGetDatum(curVal->lemm),
+															 Int32GetDatum(curVal->lenlemm),
+															 PointerGetDatum(&ld->dictState)));
+
+			if (ld->dictState.getnext)
+			{
+				/* Dictionary wants one more */
+				ld->curSub = curVal->next;
+				if (res)
+					setNewTmpRes(ld, curVal, res);
+				continue;
+			}
+
+			if (res || ld->tmpRes)
+			{
+				/*
+				 * Dictionary normalizes lexemes, so we remove from stack all
+				 * used lexemes, return to basic mode and redo end of stack
+				 * (if it exists)
+				 */
+				if (res)
+				{
+					moveToWaste(ld, ld->curSub);	/* consume up to the current token */
+				}
+				else
+				{
+					res = ld->tmpRes;
+					moveToWaste(ld, ld->lastRes);	/* consume up to the token that gave tmpRes */
+				}
+
+				/* reset to initial state */
+				ld->curDictId = InvalidOid;
+				ld->posDict = 0;
+				ld->lastRes = NULL;
+				ld->tmpRes = NULL;
+				setCorrLex(ld, correspondLexem);
+				return res;
+			}
+
+			/*
+			 * Dict does not want another lexeme and did not recognize
+			 * anything: redo from ld->towork.head with the next dictionary
+			 */
+			ld->curDictId = InvalidOid;
+			return LexizeExec(ld, correspondLexem);
+		}
+	}
+	/* nothing to return for now: hand over or free whatever is in the waste list */
+	setCorrLex(ld, correspondLexem);
+	return NULL;
+}
+
+/*
+ * Tokenize buf/buflen with the parser of configuration cfgId and run every
+ * token through LexizeExec(), appending the resulting lexemes to
+ * prs->words[] (grown by doubling); prs->pos counts the positions assigned.
+ */
+void
+parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
+{
+	int			type,
+				lenlemm;
+	char	   *lemm = NULL;
+	LexizeData	ldata;
+	TSLexeme   *norms;
+	TSConfigCacheEntry *cfg;
+	TSParserCacheEntry *prsobj;
+	void	   *prsdata;
+
+	cfg = lookup_ts_config_cache(cfgId);
+	prsobj = lookup_ts_parser_cache(cfg->prsId);
+
+	prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
+													 PointerGetDatum(buf),
+													 Int32GetDatum(buflen)));
+
+	LexizeInit(&ldata, cfg);
+	/* fetch tokens until the parser returns a type <= 0 */
+	do
+	{
+		type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
+										   PointerGetDatum(prsdata),
+										   PointerGetDatum(&lemm),
+										   PointerGetDatum(&lenlemm)));
+
+		if (type > 0 && lenlemm >= MAXSTRLEN)
+		{
+#ifdef IGNORE_LONGLEXEME
+			ereport(NOTICE,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("word is too long to be indexed"),
+					 errdetail("Words longer than %d characters are ignored.",
+							   MAXSTRLEN)));
+			continue;			/* token is not queued; the loop test still sees type > 0 */
+#else
+			ereport(ERROR,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("word is too long to be indexed"),
+					 errdetail("Words longer than %d characters are ignored.",
+							   MAXSTRLEN)));
+#endif
+		}
+		/* the final token (type <= 0) is queued too, before the loop test ends the loop */
+		LexizeAddLemm(&ldata, type, lemm, lenlemm);
+		/* drain every result that is ready; passing NULL makes LexizeExec free used tokens */
+		while ((norms = LexizeExec(&ldata, NULL)) != NULL)
+		{
+			TSLexeme   *ptr = norms;
+
+			prs->pos++;			/* set pos */
+
+			while (ptr->lexeme)
+			{
+				if (prs->curwords == prs->lenwords)
+				{
+					prs->lenwords *= 2;
+					prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
+				}
+
+				if (ptr->flags & TSL_ADDPOS)
+					prs->pos++;
+				prs->words[prs->curwords].len = strlen(ptr->lexeme);
+				prs->words[prs->curwords].word = ptr->lexeme;
+				prs->words[prs->curwords].nvariant = ptr->nvariant;
+				prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
+				prs->words[prs->curwords].alen = 0;
+				prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
+				ptr++;
+				prs->curwords++;
+			}
+			pfree(norms);		/* array only; the lexeme strings stay referenced by prs->words[] */
+		}
+	} while (type > 0);
+
+	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
+}
+
+/*
+ * Headline framework
+ */
+
+/* Append a zeroed entry to prs->words[] holding the token type and a copy of buf/buflen */
+static void
+hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
+{
+	if (prs->curwords >= prs->lenwords)
+	{
+		prs->lenwords *= 2;
+		prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
+	}
+	memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
+	prs->words[prs->curwords].type = (uint8) type;
+	prs->words[prs->curwords].len = buflen;
+	prs->words[prs->curwords].word = palloc(buflen);	/* exactly buflen bytes, no terminator */
+	memcpy(prs->words[prs->curwords].word, buf, buflen);
+	prs->curwords++;
+}
+
+/*
+ * Add pos and matching-query-item data to the just-added word.
+ * Here, buf/buflen represent a processed lexeme, not raw token text.
+ *
+ * If the query contains more than one matching item, we replicate
+ * the last-added word so that each item can be pointed to.  The
+ * duplicate entries are marked with repeated = 1.
+ */
+static void
+hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
+{
+	int			i;
+	QueryItem  *item = GETQUERY(query);
+	HeadlineWordEntry *word;
+	/* make room up front for as many duplicates as the query has items */
+	while (prs->curwords + query->size >= prs->lenwords)
+	{
+		prs->lenwords *= 2;
+		prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
+	}
+	/* taken after the resize loop, so it stays valid below; assumes prs->curwords > 0 */
+	word = &(prs->words[prs->curwords - 1]);
+	word->pos = LIMITPOS(pos);
+	for (i = 0; i < query->size; i++)
+	{
+		if (item->type == QI_VAL &&
+			tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
+							buf, buflen, item->qoperand.prefix) == 0)
+		{
+			if (word->item)
+			{
+				memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
+				prs->words[prs->curwords].item = &item->qoperand;
+				prs->words[prs->curwords].repeated = 1;
+				prs->curwords++;
+			}
+			else
+				word->item = &item->qoperand;	/* first match stays on the word itself */
+		}
+		item++;
+	}
+}
+/* Record the tokens in lexs, with their lexemes norms (may be NULL), in prs; frees both. */
+static void
+addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
+{
+	ParsedLex  *tmplexs;
+	TSLexeme   *ptr;
+	int32		savedpos;
+
+	while (lexs)
+	{
+		if (lexs->type > 0)
+			hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
+		/* every token of the group is matched against the whole lexeme array */
+		ptr = norms;
+		savedpos = prs->vectorpos;
+		while (ptr && ptr->lexeme)
+		{
+			if (ptr->flags & TSL_ADDPOS)
+				savedpos++;
+			hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
+			ptr++;
+		}
+
+		tmplexs = lexs->next;
+		pfree(lexs);
+		lexs = tmplexs;
+	}
+	/* advance vectorpos once per TSL_ADDPOS entry, then release the lexeme array */
+	if (norms)
+	{
+		ptr = norms;
+		while (ptr->lexeme)
+		{
+			if (ptr->flags & TSL_ADDPOS)
+				prs->vectorpos++;
+			pfree(ptr->lexeme);
+			ptr++;
+		}
+		pfree(norms);
+	}
+}
+/* Like parsetext(), but for headlines: tokens are kept in prs and matched against query. */
+void
+hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
+{
+	int			type,
+				lenlemm;
+	char	   *lemm = NULL;
+	LexizeData	ldata;
+	TSLexeme   *norms;
+	ParsedLex  *lexs;
+	TSConfigCacheEntry *cfg;
+	TSParserCacheEntry *prsobj;
+	void	   *prsdata;
+
+	cfg = lookup_ts_config_cache(cfgId);
+	prsobj = lookup_ts_parser_cache(cfg->prsId);
+
+	prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
+													 PointerGetDatum(buf),
+													 Int32GetDatum(buflen)));
+
+	LexizeInit(&ldata, cfg);
+	/* fetch tokens until the parser returns a type <= 0 */
+	do
+	{
+		type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
+										   PointerGetDatum(prsdata),
+										   PointerGetDatum(&lemm),
+										   PointerGetDatum(&lenlemm)));
+
+		if (type > 0 && lenlemm >= MAXSTRLEN)
+		{
+#ifdef IGNORE_LONGLEXEME
+			ereport(NOTICE,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("word is too long to be indexed"),
+					 errdetail("Words longer than %d characters are ignored.",
+							   MAXSTRLEN)));
+			continue;			/* token is not queued; the loop test still sees type > 0 */
+#else
+			ereport(ERROR,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("word is too long to be indexed"),
+					 errdetail("Words longer than %d characters are ignored.",
+							   MAXSTRLEN)));
+#endif
+		}
+
+		LexizeAddLemm(&ldata, type, lemm, lenlemm);
+		/* LexizeExec sets lexs to the consumed tokens even when it returns NULL */
+		do
+		{
+			if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
+			{
+				prs->vectorpos++;	/* one position per returned result */
+				addHLParsedLex(prs, query, lexs, norms);
+			}
+			else
+				addHLParsedLex(prs, query, lexs, NULL);
+		} while (norms);
+	} while (type > 0);
+
+	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
+}
+
+/*
+ * Generate the headline, as a palloc-ed text, from prs; frees words left out of it.
+ */
+text *
+generateHeadline(HeadlineParsedText *prs)
+{
+	text	   *out;
+	char	   *ptr;
+	int			len = 128;
+	int			numfragments = 0;
+	int16		infrag = 0;
+
+	HeadlineWordEntry *wrd = prs->words;
+
+	out = (text *) palloc(len);
+	ptr = ((char *) out) + VARHDRSZ;
+	/* ptr is the write cursor; out is doubled until word plus all markers would fit */
+	while (wrd - prs->words < prs->curwords)
+	{
+		while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
+		{
+			int			dist = ptr - ((char *) out);
+
+			len *= 2;
+			out = (text *) repalloc(out, len);
+			ptr = ((char *) out) + dist;
+		}
+		/* entries flagged repeated are skipped by both branches below */
+		if (wrd->in && !wrd->repeated)
+		{
+			if (!infrag)
+			{
+
+				/* start of a new fragment */
+				infrag = 1;
+				numfragments++;
+				/* add a fragment delimiter if this is after the first one */
+				if (numfragments > 1)
+				{
+					memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
+					ptr += prs->fragdelimlen;
+				}
+			}
+			if (wrd->replace)
+			{
+				*ptr = ' ';		/* a single space is written in place of the word */
+				ptr++;
+			}
+			else if (!wrd->skip)
+			{
+				if (wrd->selected)
+				{
+					memcpy(ptr, prs->startsel, prs->startsellen);
+					ptr += prs->startsellen;
+				}
+				memcpy(ptr, wrd->word, wrd->len);
+				ptr += wrd->len;
+				if (wrd->selected)
+				{
+					memcpy(ptr, prs->stopsel, prs->stopsellen);
+					ptr += prs->stopsellen;
+				}
+			}
+		}
+		else if (!wrd->repeated)
+		{
+			if (infrag)
+				infrag = 0;
+			pfree(wrd->word);	/* word is not part of the headline */
+		}
+
+		wrd++;
+	}
+	/* the stored size counts everything written so far, header included */
+	SET_VARSIZE(out, ptr - ((char *) out));
+	return out;
+}