summaryrefslogtreecommitdiffstats
path: root/src/backend/tsearch/dict_thesaurus.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
commit46651ce6fe013220ed397add242004d764fc0153 (patch)
tree6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/tsearch/dict_thesaurus.c
parentInitial commit. (diff)
downloadpostgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz
postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/tsearch/dict_thesaurus.c')
-rw-r--r--src/backend/tsearch/dict_thesaurus.c877
1 files changed, 877 insertions, 0 deletions
diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c
new file mode 100644
index 0000000..a95ed08
--- /dev/null
+++ b/src/backend/tsearch/dict_thesaurus.c
@@ -0,0 +1,877 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_thesaurus.c
+ * Thesaurus dictionary: phrase to phrase substitution
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/dict_thesaurus.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/namespace.h"
+#include "commands/defrem.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+#include "utils/builtins.h"
+#include "utils/regproc.h"
+
+
+/*
+ * Temporary we use TSLexeme.flags for inner use...
+ */
+#define DT_USEASIS 0x1000
+
+typedef struct LexemeInfo
+{
+ uint32 idsubst; /* entry's number in DictThesaurus->subst */
+ uint16 posinsubst; /* pos info in entry */
+ uint16 tnvariant; /* total num lexemes in one variant */
+ struct LexemeInfo *nextentry;
+ struct LexemeInfo *nextvariant;
+} LexemeInfo;
+
+typedef struct
+{
+ char *lexeme;
+ LexemeInfo *entries;
+} TheLexeme;
+
+typedef struct
+{
+ uint16 lastlexeme; /* number lexemes to substitute */
+ uint16 reslen;
+ TSLexeme *res; /* prepared substituted result */
+} TheSubstitute;
+
+typedef struct
+{
+ /* subdictionary to normalize lexemes */
+ Oid subdictOid;
+ TSDictionaryCacheEntry *subdict;
+
+ /* Array to search lexeme by exact match */
+ TheLexeme *wrds;
+ int nwrds; /* current number of words */
+ int ntwrds; /* allocated array length */
+
+ /*
+ * Storage of substituted result, n-th element is for n-th expression
+ */
+ TheSubstitute *subst;
+ int nsubst;
+} DictThesaurus;
+
+
+static void
+newLexeme(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 posinsubst)
+{
+ TheLexeme *ptr;
+
+ if (d->nwrds >= d->ntwrds)
+ {
+ if (d->ntwrds == 0)
+ {
+ d->ntwrds = 16;
+ d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
+ }
+ else
+ {
+ d->ntwrds *= 2;
+ d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
+ }
+ }
+
+ ptr = d->wrds + d->nwrds;
+ d->nwrds++;
+
+ ptr->lexeme = palloc(e - b + 1);
+
+ memcpy(ptr->lexeme, b, e - b);
+ ptr->lexeme[e - b] = '\0';
+
+ ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
+
+ ptr->entries->nextentry = NULL;
+ ptr->entries->idsubst = idsubst;
+ ptr->entries->posinsubst = posinsubst;
+}
+
+static void
+addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
+{
+ static int nres = 0;
+ static int ntres = 0;
+ TheSubstitute *ptr;
+
+ if (nwrd == 0)
+ {
+ nres = ntres = 0;
+
+ if (idsubst >= d->nsubst)
+ {
+ if (d->nsubst == 0)
+ {
+ d->nsubst = 16;
+ d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
+ }
+ else
+ {
+ d->nsubst *= 2;
+ d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
+ }
+ }
+ }
+
+ ptr = d->subst + idsubst;
+
+ ptr->lastlexeme = posinsubst - 1;
+
+ if (nres + 1 >= ntres)
+ {
+ if (ntres == 0)
+ {
+ ntres = 2;
+ ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
+ }
+ else
+ {
+ ntres *= 2;
+ ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
+ }
+ }
+
+ ptr->res[nres].lexeme = palloc(e - b + 1);
+ memcpy(ptr->res[nres].lexeme, b, e - b);
+ ptr->res[nres].lexeme[e - b] = '\0';
+
+ ptr->res[nres].nvariant = nwrd;
+ if (useasis)
+ ptr->res[nres].flags = DT_USEASIS;
+ else
+ ptr->res[nres].flags = 0;
+
+ ptr->res[++nres].lexeme = NULL;
+}
+
+#define TR_WAITLEX 1
+#define TR_INLEX 2
+#define TR_WAITSUBS 3
+#define TR_INSUBS 4
+
+static void
+thesaurusRead(const char *filename, DictThesaurus *d)
+{
+ tsearch_readline_state trst;
+ uint32 idsubst = 0;
+ bool useasis = false;
+ char *line;
+
+ filename = get_tsearch_config_filename(filename, "ths");
+ if (!tsearch_readline_begin(&trst, filename))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("could not open thesaurus file \"%s\": %m",
+ filename)));
+
+ while ((line = tsearch_readline(&trst)) != NULL)
+ {
+ char *ptr;
+ int state = TR_WAITLEX;
+ char *beginwrd = NULL;
+ uint32 posinsubst = 0;
+ uint32 nwrd = 0;
+
+ ptr = line;
+
+ /* is it a comment? */
+ while (*ptr && t_isspace(ptr))
+ ptr += pg_mblen(ptr);
+
+ if (t_iseq(ptr, '#') || *ptr == '\0' ||
+ t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
+ {
+ pfree(line);
+ continue;
+ }
+
+ while (*ptr)
+ {
+ if (state == TR_WAITLEX)
+ {
+ if (t_iseq(ptr, ':'))
+ {
+ if (posinsubst == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("unexpected delimiter")));
+ state = TR_WAITSUBS;
+ }
+ else if (!t_isspace(ptr))
+ {
+ beginwrd = ptr;
+ state = TR_INLEX;
+ }
+ }
+ else if (state == TR_INLEX)
+ {
+ if (t_iseq(ptr, ':'))
+ {
+ newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
+ state = TR_WAITSUBS;
+ }
+ else if (t_isspace(ptr))
+ {
+ newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
+ state = TR_WAITLEX;
+ }
+ }
+ else if (state == TR_WAITSUBS)
+ {
+ if (t_iseq(ptr, '*'))
+ {
+ useasis = true;
+ state = TR_INSUBS;
+ beginwrd = ptr + pg_mblen(ptr);
+ }
+ else if (t_iseq(ptr, '\\'))
+ {
+ useasis = false;
+ state = TR_INSUBS;
+ beginwrd = ptr + pg_mblen(ptr);
+ }
+ else if (!t_isspace(ptr))
+ {
+ useasis = false;
+ beginwrd = ptr;
+ state = TR_INSUBS;
+ }
+ }
+ else if (state == TR_INSUBS)
+ {
+ if (t_isspace(ptr))
+ {
+ if (ptr == beginwrd)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("unexpected end of line or lexeme")));
+ addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
+ state = TR_WAITSUBS;
+ }
+ }
+ else
+ elog(ERROR, "unrecognized thesaurus state: %d", state);
+
+ ptr += pg_mblen(ptr);
+ }
+
+ if (state == TR_INSUBS)
+ {
+ if (ptr == beginwrd)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("unexpected end of line or lexeme")));
+ addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
+ }
+
+ idsubst++;
+
+ if (!(nwrd && posinsubst))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("unexpected end of line")));
+
+ if (nwrd != (uint16) nwrd || posinsubst != (uint16) posinsubst)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("too many lexemes in thesaurus entry")));
+
+ pfree(line);
+ }
+
+ d->nsubst = idsubst;
+
+ tsearch_readline_end(&trst);
+}
+
+static TheLexeme *
+addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
+{
+ if (*nnw >= *tnm)
+ {
+ *tnm *= 2;
+ newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
+ }
+
+ newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
+
+ if (lexeme && lexeme->lexeme)
+ {
+ newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme);
+ newwrds[*nnw].entries->tnvariant = tnvariant;
+ }
+ else
+ {
+ newwrds[*nnw].lexeme = NULL;
+ newwrds[*nnw].entries->tnvariant = 1;
+ }
+
+ newwrds[*nnw].entries->idsubst = src->idsubst;
+ newwrds[*nnw].entries->posinsubst = src->posinsubst;
+
+ newwrds[*nnw].entries->nextentry = NULL;
+
+ (*nnw)++;
+ return newwrds;
+}
+
+static int
+cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b)
+{
+ if (a == NULL || b == NULL)
+ return 0;
+
+ if (a->idsubst == b->idsubst)
+ {
+ if (a->posinsubst == b->posinsubst)
+ {
+ if (a->tnvariant == b->tnvariant)
+ return 0;
+
+ return (a->tnvariant > b->tnvariant) ? 1 : -1;
+ }
+
+ return (a->posinsubst > b->posinsubst) ? 1 : -1;
+ }
+
+ return (a->idsubst > b->idsubst) ? 1 : -1;
+}
+
+static int
+cmpLexeme(const TheLexeme *a, const TheLexeme *b)
+{
+ if (a->lexeme == NULL)
+ {
+ if (b->lexeme == NULL)
+ return 0;
+ else
+ return 1;
+ }
+ else if (b->lexeme == NULL)
+ return -1;
+
+ return strcmp(a->lexeme, b->lexeme);
+}
+
+static int
+cmpLexemeQ(const void *a, const void *b)
+{
+ return cmpLexeme((const TheLexeme *) a, (const TheLexeme *) b);
+}
+
+static int
+cmpTheLexeme(const void *a, const void *b)
+{
+ const TheLexeme *la = (const TheLexeme *) a;
+ const TheLexeme *lb = (const TheLexeme *) b;
+ int res;
+
+ if ((res = cmpLexeme(la, lb)) != 0)
+ return res;
+
+ return -cmpLexemeInfo(la->entries, lb->entries);
+}
+
+static void
+compileTheLexeme(DictThesaurus *d)
+{
+ int i,
+ nnw = 0,
+ tnm = 16;
+ TheLexeme *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
+ *ptrwrds;
+
+ for (i = 0; i < d->nwrds; i++)
+ {
+ TSLexeme *ptr;
+
+ if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */
+ newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
+ else
+ {
+ ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
+ PointerGetDatum(d->subdict->dictData),
+ PointerGetDatum(d->wrds[i].lexeme),
+ Int32GetDatum(strlen(d->wrds[i].lexeme)),
+ PointerGetDatum(NULL)));
+
+ if (!ptr)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
+ d->wrds[i].lexeme,
+ d->wrds[i].entries->idsubst + 1)));
+ else if (!(ptr->lexeme))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
+ d->wrds[i].lexeme,
+ d->wrds[i].entries->idsubst + 1),
+ errhint("Use \"?\" to represent a stop word within a sample phrase.")));
+ else
+ {
+ while (ptr->lexeme)
+ {
+ TSLexeme *remptr = ptr + 1;
+ int tnvar = 1;
+ int curvar = ptr->nvariant;
+
+ /* compute n words in one variant */
+ while (remptr->lexeme)
+ {
+ if (remptr->nvariant != (remptr - 1)->nvariant)
+ break;
+ tnvar++;
+ remptr++;
+ }
+
+ remptr = ptr;
+ while (remptr->lexeme && remptr->nvariant == curvar)
+ {
+ newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
+ remptr++;
+ }
+
+ ptr = remptr;
+ }
+ }
+ }
+
+ pfree(d->wrds[i].lexeme);
+ pfree(d->wrds[i].entries);
+ }
+
+ if (d->wrds)
+ pfree(d->wrds);
+ d->wrds = newwrds;
+ d->nwrds = nnw;
+ d->ntwrds = tnm;
+
+ if (d->nwrds > 1)
+ {
+ qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
+
+ /* uniq */
+ newwrds = d->wrds;
+ ptrwrds = d->wrds + 1;
+ while (ptrwrds - d->wrds < d->nwrds)
+ {
+ if (cmpLexeme(ptrwrds, newwrds) == 0)
+ {
+ if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
+ {
+ ptrwrds->entries->nextentry = newwrds->entries;
+ newwrds->entries = ptrwrds->entries;
+ }
+ else
+ pfree(ptrwrds->entries);
+
+ if (ptrwrds->lexeme)
+ pfree(ptrwrds->lexeme);
+ }
+ else
+ {
+ newwrds++;
+ *newwrds = *ptrwrds;
+ }
+
+ ptrwrds++;
+ }
+
+ d->nwrds = newwrds - d->wrds + 1;
+ d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
+ }
+}
+
+static void
+compileTheSubstitute(DictThesaurus *d)
+{
+ int i;
+
+ for (i = 0; i < d->nsubst; i++)
+ {
+ TSLexeme *rem = d->subst[i].res,
+ *outptr,
+ *inptr;
+ int n = 2;
+
+ outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
+ outptr->lexeme = NULL;
+ inptr = rem;
+
+ while (inptr && inptr->lexeme)
+ {
+ TSLexeme *lexized,
+ tmplex[2];
+
+ if (inptr->flags & DT_USEASIS)
+ { /* do not lexize */
+ tmplex[0] = *inptr;
+ tmplex[0].flags = 0;
+ tmplex[1].lexeme = NULL;
+ lexized = tmplex;
+ }
+ else
+ {
+ lexized = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
+ PointerGetDatum(d->subdict->dictData),
+ PointerGetDatum(inptr->lexeme),
+ Int32GetDatum(strlen(inptr->lexeme)),
+ PointerGetDatum(NULL)));
+ }
+
+ if (lexized && lexized->lexeme)
+ {
+ int toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
+
+ while (lexized->lexeme)
+ {
+ if (outptr - d->subst[i].res + 1 >= n)
+ {
+ int diff = outptr - d->subst[i].res;
+
+ n *= 2;
+ d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
+ outptr = d->subst[i].res + diff;
+ }
+
+ *outptr = *lexized;
+ outptr->lexeme = pstrdup(lexized->lexeme);
+
+ outptr++;
+ lexized++;
+ }
+
+ if (toset > 0)
+ d->subst[i].res[toset].flags |= TSL_ADDPOS;
+ }
+ else if (lexized)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
+ inptr->lexeme, i + 1)));
+ }
+ else
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
+ inptr->lexeme, i + 1)));
+ }
+
+ if (inptr->lexeme)
+ pfree(inptr->lexeme);
+ inptr++;
+ }
+
+ if (outptr == d->subst[i].res)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("thesaurus substitute phrase is empty (rule %d)",
+ i + 1)));
+
+ d->subst[i].reslen = outptr - d->subst[i].res;
+
+ pfree(rem);
+ }
+}
+
+Datum
+thesaurus_init(PG_FUNCTION_ARGS)
+{
+ List *dictoptions = (List *) PG_GETARG_POINTER(0);
+ DictThesaurus *d;
+ char *subdictname = NULL;
+ bool fileloaded = false;
+ ListCell *l;
+
+ d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
+
+ foreach(l, dictoptions)
+ {
+ DefElem *defel = (DefElem *) lfirst(l);
+
+ if (strcmp(defel->defname, "dictfile") == 0)
+ {
+ if (fileloaded)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("multiple DictFile parameters")));
+ thesaurusRead(defGetString(defel), d);
+ fileloaded = true;
+ }
+ else if (strcmp(defel->defname, "dictionary") == 0)
+ {
+ if (subdictname)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("multiple Dictionary parameters")));
+ subdictname = pstrdup(defGetString(defel));
+ }
+ else
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized Thesaurus parameter: \"%s\"",
+ defel->defname)));
+ }
+ }
+
+ if (!fileloaded)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("missing DictFile parameter")));
+ if (!subdictname)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("missing Dictionary parameter")));
+
+ d->subdictOid = get_ts_dict_oid(stringToQualifiedNameList(subdictname), false);
+ d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
+
+ compileTheLexeme(d);
+ compileTheSubstitute(d);
+
+ PG_RETURN_POINTER(d);
+}
+
+static LexemeInfo *
+findTheLexeme(DictThesaurus *d, char *lexeme)
+{
+ TheLexeme key,
+ *res;
+
+ if (d->nwrds == 0)
+ return NULL;
+
+ key.lexeme = lexeme;
+ key.entries = NULL;
+
+ res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
+
+ if (res == NULL)
+ return NULL;
+ return res->entries;
+}
+
+static bool
+matchIdSubst(LexemeInfo *stored, uint32 idsubst)
+{
+ bool res = true;
+
+ if (stored)
+ {
+ res = false;
+
+ for (; stored; stored = stored->nextvariant)
+ if (stored->idsubst == idsubst)
+ {
+ res = true;
+ break;
+ }
+ }
+
+ return res;
+}
+
+static LexemeInfo *
+findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn)
+{
+ for (;;)
+ {
+ int i;
+ LexemeInfo *ptr = newin[0];
+
+ for (i = 0; i < newn; i++)
+ {
+ while (newin[i] && newin[i]->idsubst < ptr->idsubst)
+ newin[i] = newin[i]->nextentry;
+
+ if (newin[i] == NULL)
+ return in;
+
+ if (newin[i]->idsubst > ptr->idsubst)
+ {
+ ptr = newin[i];
+ i = -1;
+ continue;
+ }
+
+ while (newin[i]->idsubst == ptr->idsubst)
+ {
+ if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
+ {
+ ptr = newin[i];
+ break;
+ }
+
+ newin[i] = newin[i]->nextentry;
+ if (newin[i] == NULL)
+ return in;
+ }
+
+ if (newin[i]->idsubst != ptr->idsubst)
+ {
+ ptr = newin[i];
+ i = -1;
+ continue;
+ }
+ }
+
+ if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
+ { /* found */
+
+ ptr->nextvariant = in;
+ in = ptr;
+ }
+
+ /* step forward */
+ for (i = 0; i < newn; i++)
+ newin[i] = newin[i]->nextentry;
+ }
+}
+
+static TSLexeme *
+copyTSLexeme(TheSubstitute *ts)
+{
+ TSLexeme *res;
+ uint16 i;
+
+ res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));
+ for (i = 0; i < ts->reslen; i++)
+ {
+ res[i] = ts->res[i];
+ res[i].lexeme = pstrdup(ts->res[i].lexeme);
+ }
+
+ res[ts->reslen].lexeme = NULL;
+
+ return res;
+}
+
+static TSLexeme *
+checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres)
+{
+ *moreres = false;
+ while (info)
+ {
+ Assert(info->idsubst < d->nsubst);
+ if (info->nextvariant)
+ *moreres = true;
+ if (d->subst[info->idsubst].lastlexeme == curpos)
+ return copyTSLexeme(d->subst + info->idsubst);
+ info = info->nextvariant;
+ }
+
+ return NULL;
+}
+
+Datum
+thesaurus_lexize(PG_FUNCTION_ARGS)
+{
+ DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
+ DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
+ TSLexeme *res = NULL;
+ LexemeInfo *stored,
+ *info = NULL;
+ uint16 curpos = 0;
+ bool moreres = false;
+
+ if (PG_NARGS() != 4 || dstate == NULL)
+ elog(ERROR, "forbidden call of thesaurus or nested call");
+
+ if (dstate->isend)
+ PG_RETURN_POINTER(NULL);
+ stored = (LexemeInfo *) dstate->private_state;
+
+ if (stored)
+ curpos = stored->posinsubst + 1;
+
+ if (!d->subdict->isvalid)
+ d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
+
+ res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
+ PointerGetDatum(d->subdict->dictData),
+ PG_GETARG_DATUM(1),
+ PG_GETARG_DATUM(2),
+ PointerGetDatum(NULL)));
+
+ if (res && res->lexeme)
+ {
+ TSLexeme *ptr = res,
+ *basevar;
+
+ while (ptr->lexeme)
+ {
+ uint16 nv = ptr->nvariant;
+ uint16 i,
+ nlex = 0;
+ LexemeInfo **infos;
+
+ basevar = ptr;
+ while (ptr->lexeme && nv == ptr->nvariant)
+ {
+ nlex++;
+ ptr++;
+ }
+
+ infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
+ for (i = 0; i < nlex; i++)
+ if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
+ break;
+
+ if (i < nlex)
+ {
+ /* no chance to find */
+ pfree(infos);
+ continue;
+ }
+
+ info = findVariant(info, stored, curpos, infos, nlex);
+ }
+ }
+ else if (res)
+ { /* stop-word */
+ LexemeInfo *infos = findTheLexeme(d, NULL);
+
+ info = findVariant(NULL, stored, curpos, &infos, 1);
+ }
+ else
+ {
+ info = NULL; /* word isn't recognized */
+ }
+
+ dstate->private_state = (void *) info;
+
+ if (!info)
+ {
+ dstate->getnext = false;
+ PG_RETURN_POINTER(NULL);
+ }
+
+ if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
+ {
+ dstate->getnext = moreres;
+ PG_RETURN_POINTER(res);
+ }
+
+ dstate->getnext = true;
+
+ PG_RETURN_POINTER(NULL);
+}