diff options
Diffstat (limited to 'src/backend/tsearch/dict_thesaurus.c')
-rw-r--r-- | src/backend/tsearch/dict_thesaurus.c | 877 |
1 files changed, 877 insertions, 0 deletions
diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c new file mode 100644 index 0000000..a95ed08 --- /dev/null +++ b/src/backend/tsearch/dict_thesaurus.c @@ -0,0 +1,877 @@ +/*------------------------------------------------------------------------- + * + * dict_thesaurus.c + * Thesaurus dictionary: phrase to phrase substitution + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/dict_thesaurus.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/namespace.h" +#include "commands/defrem.h" +#include "tsearch/ts_cache.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_utils.h" +#include "utils/builtins.h" +#include "utils/regproc.h" + + +/* + * Temporary we use TSLexeme.flags for inner use... + */ +#define DT_USEASIS 0x1000 + +typedef struct LexemeInfo +{ + uint32 idsubst; /* entry's number in DictThesaurus->subst */ + uint16 posinsubst; /* pos info in entry */ + uint16 tnvariant; /* total num lexemes in one variant */ + struct LexemeInfo *nextentry; + struct LexemeInfo *nextvariant; +} LexemeInfo; + +typedef struct +{ + char *lexeme; + LexemeInfo *entries; +} TheLexeme; + +typedef struct +{ + uint16 lastlexeme; /* number lexemes to substitute */ + uint16 reslen; + TSLexeme *res; /* prepared substituted result */ +} TheSubstitute; + +typedef struct +{ + /* subdictionary to normalize lexemes */ + Oid subdictOid; + TSDictionaryCacheEntry *subdict; + + /* Array to search lexeme by exact match */ + TheLexeme *wrds; + int nwrds; /* current number of words */ + int ntwrds; /* allocated array length */ + + /* + * Storage of substituted result, n-th element is for n-th expression + */ + TheSubstitute *subst; + int nsubst; +} DictThesaurus; + + +static void +newLexeme(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 posinsubst) +{ + TheLexeme *ptr; + + if (d->nwrds >= d->ntwrds) + { + if (d->ntwrds == 0) + { + d->ntwrds = 16; + d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds); + } + else + { + d->ntwrds *= 2; + d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds); + } + } + + ptr = d->wrds + d->nwrds; + d->nwrds++; + + ptr->lexeme = palloc(e - b + 1); + + memcpy(ptr->lexeme, b, e - b); + ptr->lexeme[e - b] = '\0'; + + ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo)); + + ptr->entries->nextentry = NULL; + ptr->entries->idsubst = idsubst; + ptr->entries->posinsubst = posinsubst; +} + +static void +addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis) +{ + static int nres = 0; + static int ntres = 0; + TheSubstitute *ptr; + + if (nwrd == 0) + { + nres = ntres = 0; + + if (idsubst >= d->nsubst) + { + if (d->nsubst == 0) + { + d->nsubst = 16; + d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst); + } + else + { + d->nsubst *= 2; + d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst); + } + } + } + + ptr = d->subst + idsubst; + + ptr->lastlexeme = posinsubst - 1; + + if (nres + 1 >= ntres) + { + if (ntres == 0) + { + ntres = 2; + ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres); + } + else + { + ntres *= 2; + ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres); + } + } + + ptr->res[nres].lexeme = palloc(e - b + 1); + memcpy(ptr->res[nres].lexeme, b, e - b); + ptr->res[nres].lexeme[e - b] = '\0'; + + ptr->res[nres].nvariant = nwrd; + if (useasis) + ptr->res[nres].flags = DT_USEASIS; + else + ptr->res[nres].flags = 0; + + ptr->res[++nres].lexeme = NULL; +} + +#define TR_WAITLEX 1 +#define TR_INLEX 2 +#define TR_WAITSUBS 3 +#define TR_INSUBS 4 + +static void +thesaurusRead(const char *filename, DictThesaurus *d) +{ + tsearch_readline_state trst; + uint32 idsubst = 0; + bool useasis = false; + char *line; + + filename = get_tsearch_config_filename(filename, "ths"); + if (!tsearch_readline_begin(&trst, filename)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("could not open thesaurus file \"%s\": %m", + filename))); + + while ((line = tsearch_readline(&trst)) != NULL) + { + char *ptr; + int state = TR_WAITLEX; + char *beginwrd = NULL; + uint32 posinsubst = 0; + uint32 nwrd = 0; + + ptr = line; + + /* is it a comment? */ + while (*ptr && t_isspace(ptr)) + ptr += pg_mblen(ptr); + + if (t_iseq(ptr, '#') || *ptr == '\0' || + t_iseq(ptr, '\n') || t_iseq(ptr, '\r')) + { + pfree(line); + continue; + } + + while (*ptr) + { + if (state == TR_WAITLEX) + { + if (t_iseq(ptr, ':')) + { + if (posinsubst == 0) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("unexpected delimiter"))); + state = TR_WAITSUBS; + } + else if (!t_isspace(ptr)) + { + beginwrd = ptr; + state = TR_INLEX; + } + } + else if (state == TR_INLEX) + { + if (t_iseq(ptr, ':')) + { + newLexeme(d, beginwrd, ptr, idsubst, posinsubst++); + state = TR_WAITSUBS; + } + else if (t_isspace(ptr)) + { + newLexeme(d, beginwrd, ptr, idsubst, posinsubst++); + state = TR_WAITLEX; + } + } + else if (state == TR_WAITSUBS) + { + if (t_iseq(ptr, '*')) + { + useasis = true; + state = TR_INSUBS; + beginwrd = ptr + pg_mblen(ptr); + } + else if (t_iseq(ptr, '\\')) + { + useasis = false; + state = TR_INSUBS; + beginwrd = ptr + pg_mblen(ptr); + } + else if (!t_isspace(ptr)) + { + useasis = false; + beginwrd = ptr; + state = TR_INSUBS; + } + } + else if (state == TR_INSUBS) + { + if (t_isspace(ptr)) + { + if (ptr == beginwrd) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("unexpected end of line or lexeme"))); + addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis); + state = TR_WAITSUBS; + } + } + else + elog(ERROR, "unrecognized thesaurus state: %d", state); + + ptr += pg_mblen(ptr); + } + + if (state == TR_INSUBS) + { + if (ptr == beginwrd) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("unexpected end of line or lexeme"))); + addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis); + } + + idsubst++; + + if (!(nwrd && posinsubst)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("unexpected end of line"))); + + if (nwrd != (uint16) nwrd || posinsubst != (uint16) posinsubst) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("too many lexemes in thesaurus entry"))); + + pfree(line); + } + + d->nsubst = idsubst; + + tsearch_readline_end(&trst); +} + +static TheLexeme * +addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant) +{ + if (*nnw >= *tnm) + { + *tnm *= 2; + newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm); + } + + newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo)); + + if (lexeme && lexeme->lexeme) + { + newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme); + newwrds[*nnw].entries->tnvariant = tnvariant; + } + else + { + newwrds[*nnw].lexeme = NULL; + newwrds[*nnw].entries->tnvariant = 1; + } + + newwrds[*nnw].entries->idsubst = src->idsubst; + newwrds[*nnw].entries->posinsubst = src->posinsubst; + + newwrds[*nnw].entries->nextentry = NULL; + + (*nnw)++; + return newwrds; +} + +static int +cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b) +{ + if (a == NULL || b == NULL) + return 0; + + if (a->idsubst == b->idsubst) + { + if (a->posinsubst == b->posinsubst) + { + if (a->tnvariant == b->tnvariant) + return 0; + + return (a->tnvariant > b->tnvariant) ? 1 : -1; + } + + return (a->posinsubst > b->posinsubst) ? 1 : -1; + } + + return (a->idsubst > b->idsubst) ? 1 : -1; +} + +static int +cmpLexeme(const TheLexeme *a, const TheLexeme *b) +{ + if (a->lexeme == NULL) + { + if (b->lexeme == NULL) + return 0; + else + return 1; + } + else if (b->lexeme == NULL) + return -1; + + return strcmp(a->lexeme, b->lexeme); +} + +static int +cmpLexemeQ(const void *a, const void *b) +{ + return cmpLexeme((const TheLexeme *) a, (const TheLexeme *) b); +} + +static int +cmpTheLexeme(const void *a, const void *b) +{ + const TheLexeme *la = (const TheLexeme *) a; + const TheLexeme *lb = (const TheLexeme *) b; + int res; + + if ((res = cmpLexeme(la, lb)) != 0) + return res; + + return -cmpLexemeInfo(la->entries, lb->entries); +} + +static void +compileTheLexeme(DictThesaurus *d) +{ + int i, + nnw = 0, + tnm = 16; + TheLexeme *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm), + *ptrwrds; + + for (i = 0; i < d->nwrds; i++) + { + TSLexeme *ptr; + + if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */ + newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0); + else + { + ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize), + PointerGetDatum(d->subdict->dictData), + PointerGetDatum(d->wrds[i].lexeme), + Int32GetDatum(strlen(d->wrds[i].lexeme)), + PointerGetDatum(NULL))); + + if (!ptr) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)", + d->wrds[i].lexeme, + d->wrds[i].entries->idsubst + 1))); + else if (!(ptr->lexeme)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)", + d->wrds[i].lexeme, + d->wrds[i].entries->idsubst + 1), + errhint("Use \"?\" to represent a stop word within a sample phrase."))); + else + { + while (ptr->lexeme) + { + TSLexeme *remptr = ptr + 1; + int tnvar = 1; + int curvar = ptr->nvariant; + + /* compute n words in one variant */ + while (remptr->lexeme) + { + if (remptr->nvariant != (remptr - 1)->nvariant) + break; + tnvar++; + remptr++; + } + + remptr = ptr; + while (remptr->lexeme && remptr->nvariant == curvar) + { + newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar); + remptr++; + } + + ptr = remptr; + } + } + } + + pfree(d->wrds[i].lexeme); + pfree(d->wrds[i].entries); + } + + if (d->wrds) + pfree(d->wrds); + d->wrds = newwrds; + d->nwrds = nnw; + d->ntwrds = tnm; + + if (d->nwrds > 1) + { + qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme); + + /* uniq */ + newwrds = d->wrds; + ptrwrds = d->wrds + 1; + while (ptrwrds - d->wrds < d->nwrds) + { + if (cmpLexeme(ptrwrds, newwrds) == 0) + { + if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries)) + { + ptrwrds->entries->nextentry = newwrds->entries; + newwrds->entries = ptrwrds->entries; + } + else + pfree(ptrwrds->entries); + + if (ptrwrds->lexeme) + pfree(ptrwrds->lexeme); + } + else + { + newwrds++; + *newwrds = *ptrwrds; + } + + ptrwrds++; + } + + d->nwrds = newwrds - d->wrds + 1; + d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds); + } +} + +static void +compileTheSubstitute(DictThesaurus *d) +{ + int i; + + for (i = 0; i < d->nsubst; i++) + { + TSLexeme *rem = d->subst[i].res, + *outptr, + *inptr; + int n = 2; + + outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n); + outptr->lexeme = NULL; + inptr = rem; + + while (inptr && inptr->lexeme) + { + TSLexeme *lexized, + tmplex[2]; + + if (inptr->flags & DT_USEASIS) + { /* do not lexize */ + tmplex[0] = *inptr; + tmplex[0].flags = 0; + tmplex[1].lexeme = NULL; + lexized = tmplex; + } + else + { + lexized = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize), + PointerGetDatum(d->subdict->dictData), + PointerGetDatum(inptr->lexeme), + Int32GetDatum(strlen(inptr->lexeme)), + PointerGetDatum(NULL))); + } + + if (lexized && lexized->lexeme) + { + int toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1; + + while (lexized->lexeme) + { + if (outptr - d->subst[i].res + 1 >= n) + { + int diff = outptr - d->subst[i].res; + + n *= 2; + d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n); + outptr = d->subst[i].res + diff; + } + + *outptr = *lexized; + outptr->lexeme = pstrdup(lexized->lexeme); + + outptr++; + lexized++; + } + + if (toset > 0) + d->subst[i].res[toset].flags |= TSL_ADDPOS; + } + else if (lexized) + { + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)", + inptr->lexeme, i + 1))); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)", + inptr->lexeme, i + 1))); + } + + if (inptr->lexeme) + pfree(inptr->lexeme); + inptr++; + } + + if (outptr == d->subst[i].res) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("thesaurus substitute phrase is empty (rule %d)", + i + 1))); + + d->subst[i].reslen = outptr - d->subst[i].res; + + pfree(rem); + } +} + +Datum +thesaurus_init(PG_FUNCTION_ARGS) +{ + List *dictoptions = (List *) PG_GETARG_POINTER(0); + DictThesaurus *d; + char *subdictname = NULL; + bool fileloaded = false; + ListCell *l; + + d = (DictThesaurus *) palloc0(sizeof(DictThesaurus)); + + foreach(l, dictoptions) + { + DefElem *defel = (DefElem *) lfirst(l); + + if (strcmp(defel->defname, "dictfile") == 0) + { + if (fileloaded) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("multiple DictFile parameters"))); + thesaurusRead(defGetString(defel), d); + fileloaded = true; + } + else if (strcmp(defel->defname, "dictionary") == 0) + { + if (subdictname) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("multiple Dictionary parameters"))); + subdictname = pstrdup(defGetString(defel)); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized Thesaurus parameter: \"%s\"", + defel->defname))); + } + } + + if (!fileloaded) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("missing DictFile parameter"))); + if (!subdictname) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("missing Dictionary parameter"))); + + d->subdictOid = get_ts_dict_oid(stringToQualifiedNameList(subdictname), false); + d->subdict = lookup_ts_dictionary_cache(d->subdictOid); + + compileTheLexeme(d); + compileTheSubstitute(d); + + PG_RETURN_POINTER(d); +} + +static LexemeInfo * +findTheLexeme(DictThesaurus *d, char *lexeme) +{ + TheLexeme key, + *res; + + if (d->nwrds == 0) + return NULL; + + key.lexeme = lexeme; + key.entries = NULL; + + res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ); + + if (res == NULL) + return NULL; + return res->entries; +} + +static bool +matchIdSubst(LexemeInfo *stored, uint32 idsubst) +{ + bool res = true; + + if (stored) + { + res = false; + + for (; stored; stored = stored->nextvariant) + if (stored->idsubst == idsubst) + { + res = true; + break; + } + } + + return res; +} + +static LexemeInfo * +findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn) +{ + for (;;) + { + int i; + LexemeInfo *ptr = newin[0]; + + for (i = 0; i < newn; i++) + { + while (newin[i] && newin[i]->idsubst < ptr->idsubst) + newin[i] = newin[i]->nextentry; + + if (newin[i] == NULL) + return in; + + if (newin[i]->idsubst > ptr->idsubst) + { + ptr = newin[i]; + i = -1; + continue; + } + + while (newin[i]->idsubst == ptr->idsubst) + { + if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn) + { + ptr = newin[i]; + break; + } + + newin[i] = newin[i]->nextentry; + if (newin[i] == NULL) + return in; + } + + if (newin[i]->idsubst != ptr->idsubst) + { + ptr = newin[i]; + i = -1; + continue; + } + } + + if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst))) + { /* found */ + + ptr->nextvariant = in; + in = ptr; + } + + /* step forward */ + for (i = 0; i < newn; i++) + newin[i] = newin[i]->nextentry; + } +} + +static TSLexeme * +copyTSLexeme(TheSubstitute *ts) +{ + TSLexeme *res; + uint16 i; + + res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1)); + for (i = 0; i < ts->reslen; i++) + { + res[i] = ts->res[i]; + res[i].lexeme = pstrdup(ts->res[i].lexeme); + } + + res[ts->reslen].lexeme = NULL; + + return res; +} + +static TSLexeme * +checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres) +{ + *moreres = false; + while (info) + { + Assert(info->idsubst < d->nsubst); + if (info->nextvariant) + *moreres = true; + if (d->subst[info->idsubst].lastlexeme == curpos) + return copyTSLexeme(d->subst + info->idsubst); + info = info->nextvariant; + } + + return NULL; +} + +Datum +thesaurus_lexize(PG_FUNCTION_ARGS) +{ + DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0); + DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3); + TSLexeme *res = NULL; + LexemeInfo *stored, + *info = NULL; + uint16 curpos = 0; + bool moreres = false; + + if (PG_NARGS() != 4 || dstate == NULL) + elog(ERROR, "forbidden call of thesaurus or nested call"); + + if (dstate->isend) + PG_RETURN_POINTER(NULL); + stored = (LexemeInfo *) dstate->private_state; + + if (stored) + curpos = stored->posinsubst + 1; + + if (!d->subdict->isvalid) + d->subdict = lookup_ts_dictionary_cache(d->subdictOid); + + res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize), + PointerGetDatum(d->subdict->dictData), + PG_GETARG_DATUM(1), + PG_GETARG_DATUM(2), + PointerGetDatum(NULL))); + + if (res && res->lexeme) + { + TSLexeme *ptr = res, + *basevar; + + while (ptr->lexeme) + { + uint16 nv = ptr->nvariant; + uint16 i, + nlex = 0; + LexemeInfo **infos; + + basevar = ptr; + while (ptr->lexeme && nv == ptr->nvariant) + { + nlex++; + ptr++; + } + + infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex); + for (i = 0; i < nlex; i++) + if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL) + break; + + if (i < nlex) + { + /* no chance to find */ + pfree(infos); + continue; + } + + info = findVariant(info, stored, curpos, infos, nlex); + } + } + else if (res) + { /* stop-word */ + LexemeInfo *infos = findTheLexeme(d, NULL); + + info = findVariant(NULL, stored, curpos, &infos, 1); + } + else + { + info = NULL; /* word isn't recognized */ + } + + dstate->private_state = (void *) info; + + if (!info) + { + dstate->getnext = false; + PG_RETURN_POINTER(NULL); + } + + if ((res = checkMatch(d, info, curpos, &moreres)) != NULL) + { + dstate->getnext = moreres; + PG_RETURN_POINTER(res); + } + + dstate->getnext = true; + + PG_RETURN_POINTER(NULL); +} |