diff options
Diffstat (limited to 'src/backend/tsearch/ts_parse.c')
-rw-r--r-- | src/backend/tsearch/ts_parse.c | 678 |
1 files changed, 678 insertions, 0 deletions
diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c new file mode 100644 index 0000000..a87b442 --- /dev/null +++ b/src/backend/tsearch/ts_parse.c @@ -0,0 +1,678 @@ +/*------------------------------------------------------------------------- + * + * ts_parse.c + * main parse functions for tsearch + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/ts_parse.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "tsearch/ts_cache.h" +#include "tsearch/ts_utils.h" + +#define IGNORE_LONGLEXEME 1 + +/* + * Lexize subsystem + */ + +typedef struct ParsedLex +{ + int type; + char *lemm; + int lenlemm; + struct ParsedLex *next; +} ParsedLex; + +typedef struct ListParsedLex +{ + ParsedLex *head; + ParsedLex *tail; +} ListParsedLex; + +typedef struct +{ + TSConfigCacheEntry *cfg; + Oid curDictId; + int posDict; + DictSubState dictState; + ParsedLex *curSub; + ListParsedLex towork; /* current list to work */ + ListParsedLex waste; /* list of lexemes that already lexized */ + + /* + * fields to store last variant to lexize (basically, thesaurus or similar + * to, which wants several lexemes + */ + + ParsedLex *lastRes; + TSLexeme *tmpRes; +} LexizeData; + +static void +LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg) +{ + ld->cfg = cfg; + ld->curDictId = InvalidOid; + ld->posDict = 0; + ld->towork.head = ld->towork.tail = ld->curSub = NULL; + ld->waste.head = ld->waste.tail = NULL; + ld->lastRes = NULL; + ld->tmpRes = NULL; +} + +static void +LPLAddTail(ListParsedLex *list, ParsedLex *newpl) +{ + if (list->tail) + { + list->tail->next = newpl; + list->tail = newpl; + } + else + list->head = list->tail = newpl; + newpl->next = NULL; +} + +static ParsedLex * +LPLRemoveHead(ListParsedLex *list) +{ + ParsedLex *res = list->head; + + if (list->head) + list->head = list->head->next; + + if (list->head == NULL) + list->tail = NULL; + + return res; +} + +static void +LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm) +{ + ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex)); + + newpl->type = type; + newpl->lemm = lemm; + newpl->lenlemm = lenlemm; + LPLAddTail(&ld->towork, newpl); + ld->curSub = ld->towork.tail; +} + +static void +RemoveHead(LexizeData *ld) +{ + LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork)); + + ld->posDict = 0; +} + +static void +setCorrLex(LexizeData *ld, ParsedLex **correspondLexem) +{ + if (correspondLexem) + { + *correspondLexem = ld->waste.head; + } + else + { + ParsedLex *tmp, + *ptr = ld->waste.head; + + while (ptr) + { + tmp = ptr->next; + pfree(ptr); + ptr = tmp; + } + } + ld->waste.head = ld->waste.tail = NULL; +} + +static void +moveToWaste(LexizeData *ld, ParsedLex *stop) +{ + bool go = true; + + while (ld->towork.head && go) + { + if (ld->towork.head == stop) + { + ld->curSub = stop->next; + go = false; + } + RemoveHead(ld); + } +} + +static void +setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res) +{ + if (ld->tmpRes) + { + TSLexeme *ptr; + + for (ptr = ld->tmpRes; ptr->lexeme; ptr++) + pfree(ptr->lexeme); + pfree(ld->tmpRes); + } + ld->tmpRes = res; + ld->lastRes = lex; +} + +static TSLexeme * +LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) +{ + int i; + ListDictionary *map; + TSDictionaryCacheEntry *dict; + TSLexeme *res; + + if (ld->curDictId == InvalidOid) + { + /* + * usual mode: dictionary wants only one word, but we should keep in + * mind that we should go through all stack + */ + + while (ld->towork.head) + { + ParsedLex *curVal = ld->towork.head; + char *curValLemm = curVal->lemm; + int curValLenLemm = curVal->lenlemm; + + map = ld->cfg->map + curVal->type; + + if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0) + { + /* skip this type of lexeme */ + RemoveHead(ld); + continue; + } + + for (i = ld->posDict; i < map->len; i++) + { + dict = lookup_ts_dictionary_cache(map->dictIds[i]); + + ld->dictState.isend = ld->dictState.getnext = false; + ld->dictState.private_state = NULL; + res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize), + PointerGetDatum(dict->dictData), + PointerGetDatum(curValLemm), + Int32GetDatum(curValLenLemm), + PointerGetDatum(&ld->dictState))); + + if (ld->dictState.getnext) + { + /* + * dictionary wants next word, so setup and store current + * position and go to multiword mode + */ + + ld->curDictId = DatumGetObjectId(map->dictIds[i]); + ld->posDict = i + 1; + ld->curSub = curVal->next; + if (res) + setNewTmpRes(ld, curVal, res); + return LexizeExec(ld, correspondLexem); + } + + if (!res) /* dictionary doesn't know this lexeme */ + continue; + + if (res->flags & TSL_FILTER) + { + curValLemm = res->lexeme; + curValLenLemm = strlen(res->lexeme); + continue; + } + + RemoveHead(ld); + setCorrLex(ld, correspondLexem); + return res; + } + + RemoveHead(ld); + } + } + else + { /* curDictId is valid */ + dict = lookup_ts_dictionary_cache(ld->curDictId); + + /* + * Dictionary ld->curDictId asks us about following words + */ + + while (ld->curSub) + { + ParsedLex *curVal = ld->curSub; + + map = ld->cfg->map + curVal->type; + + if (curVal->type != 0) + { + bool dictExists = false; + + if (curVal->type >= ld->cfg->lenmap || map->len == 0) + { + /* skip this type of lexeme */ + ld->curSub = curVal->next; + continue; + } + + /* + * We should be sure that current type of lexeme is recognized + * by our dictionary: we just check is it exist in list of + * dictionaries ? + */ + for (i = 0; i < map->len && !dictExists; i++) + if (ld->curDictId == DatumGetObjectId(map->dictIds[i])) + dictExists = true; + + if (!dictExists) + { + /* + * Dictionary can't work with current type of lexeme, + * return to basic mode and redo all stored lexemes + */ + ld->curDictId = InvalidOid; + return LexizeExec(ld, correspondLexem); + } + } + + ld->dictState.isend = (curVal->type == 0); + ld->dictState.getnext = false; + + res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize), + PointerGetDatum(dict->dictData), + PointerGetDatum(curVal->lemm), + Int32GetDatum(curVal->lenlemm), + PointerGetDatum(&ld->dictState))); + + if (ld->dictState.getnext) + { + /* Dictionary wants one more */ + ld->curSub = curVal->next; + if (res) + setNewTmpRes(ld, curVal, res); + continue; + } + + if (res || ld->tmpRes) + { + /* + * Dictionary normalizes lexemes, so we remove from stack all + * used lexemes, return to basic mode and redo end of stack + * (if it exists) + */ + if (res) + { + moveToWaste(ld, ld->curSub); + } + else + { + res = ld->tmpRes; + moveToWaste(ld, ld->lastRes); + } + + /* reset to initial state */ + ld->curDictId = InvalidOid; + ld->posDict = 0; + ld->lastRes = NULL; + ld->tmpRes = NULL; + setCorrLex(ld, correspondLexem); + return res; + } + + /* + * Dict don't want next lexem and didn't recognize anything, redo + * from ld->towork.head + */ + ld->curDictId = InvalidOid; + return LexizeExec(ld, correspondLexem); + } + } + + setCorrLex(ld, correspondLexem); + return NULL; +} + +/* + * Parse string and lexize words. + * + * prs will be filled in. + */ +void +parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen) +{ + int type, + lenlemm; + char *lemm = NULL; + LexizeData ldata; + TSLexeme *norms; + TSConfigCacheEntry *cfg; + TSParserCacheEntry *prsobj; + void *prsdata; + + cfg = lookup_ts_config_cache(cfgId); + prsobj = lookup_ts_parser_cache(cfg->prsId); + + prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart, + PointerGetDatum(buf), + Int32GetDatum(buflen))); + + LexizeInit(&ldata, cfg); + + do + { + type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken), + PointerGetDatum(prsdata), + PointerGetDatum(&lemm), + PointerGetDatum(&lenlemm))); + + if (type > 0 && lenlemm >= MAXSTRLEN) + { +#ifdef IGNORE_LONGLEXEME + ereport(NOTICE, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("word is too long to be indexed"), + errdetail("Words longer than %d characters are ignored.", + MAXSTRLEN))); + continue; +#else + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("word is too long to be indexed"), + errdetail("Words longer than %d characters are ignored.", + MAXSTRLEN))); +#endif + } + + LexizeAddLemm(&ldata, type, lemm, lenlemm); + + while ((norms = LexizeExec(&ldata, NULL)) != NULL) + { + TSLexeme *ptr = norms; + + prs->pos++; /* set pos */ + + while (ptr->lexeme) + { + if (prs->curwords == prs->lenwords) + { + prs->lenwords *= 2; + prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord)); + } + + if (ptr->flags & TSL_ADDPOS) + prs->pos++; + prs->words[prs->curwords].len = strlen(ptr->lexeme); + prs->words[prs->curwords].word = ptr->lexeme; + prs->words[prs->curwords].nvariant = ptr->nvariant; + prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX; + prs->words[prs->curwords].alen = 0; + prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos); + ptr++; + prs->curwords++; + } + pfree(norms); + } + } while (type > 0); + + FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata)); +} + +/* + * Headline framework + */ + +/* Add a word to prs->words[] */ +static void +hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type) +{ + if (prs->curwords >= prs->lenwords) + { + prs->lenwords *= 2; + prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry)); + } + memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry)); + prs->words[prs->curwords].type = (uint8) type; + prs->words[prs->curwords].len = buflen; + prs->words[prs->curwords].word = palloc(buflen); + memcpy(prs->words[prs->curwords].word, buf, buflen); + prs->curwords++; +} + +/* + * Add pos and matching-query-item data to the just-added word. + * Here, buf/buflen represent a processed lexeme, not raw token text. + * + * If the query contains more than one matching item, we replicate + * the last-added word so that each item can be pointed to. The + * duplicate entries are marked with repeated = 1. + */ +static void +hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen) +{ + int i; + QueryItem *item = GETQUERY(query); + HeadlineWordEntry *word; + + while (prs->curwords + query->size >= prs->lenwords) + { + prs->lenwords *= 2; + prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry)); + } + + word = &(prs->words[prs->curwords - 1]); + word->pos = LIMITPOS(pos); + for (i = 0; i < query->size; i++) + { + if (item->type == QI_VAL && + tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length, + buf, buflen, item->qoperand.prefix) == 0) + { + if (word->item) + { + memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry)); + prs->words[prs->curwords].item = &item->qoperand; + prs->words[prs->curwords].repeated = 1; + prs->curwords++; + } + else + word->item = &item->qoperand; + } + item++; + } +} + +static void +addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms) +{ + ParsedLex *tmplexs; + TSLexeme *ptr; + int32 savedpos; + + while (lexs) + { + if (lexs->type > 0) + hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type); + + ptr = norms; + savedpos = prs->vectorpos; + while (ptr && ptr->lexeme) + { + if (ptr->flags & TSL_ADDPOS) + savedpos++; + hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme)); + ptr++; + } + + tmplexs = lexs->next; + pfree(lexs); + lexs = tmplexs; + } + + if (norms) + { + ptr = norms; + while (ptr->lexeme) + { + if (ptr->flags & TSL_ADDPOS) + prs->vectorpos++; + pfree(ptr->lexeme); + ptr++; + } + pfree(norms); + } +} + +void +hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen) +{ + int type, + lenlemm; + char *lemm = NULL; + LexizeData ldata; + TSLexeme *norms; + ParsedLex *lexs; + TSConfigCacheEntry *cfg; + TSParserCacheEntry *prsobj; + void *prsdata; + + cfg = lookup_ts_config_cache(cfgId); + prsobj = lookup_ts_parser_cache(cfg->prsId); + + prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart), + PointerGetDatum(buf), + Int32GetDatum(buflen))); + + LexizeInit(&ldata, cfg); + + do + { + type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken), + PointerGetDatum(prsdata), + PointerGetDatum(&lemm), + PointerGetDatum(&lenlemm))); + + if (type > 0 && lenlemm >= MAXSTRLEN) + { +#ifdef IGNORE_LONGLEXEME + ereport(NOTICE, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("word is too long to be indexed"), + errdetail("Words longer than %d characters are ignored.", + MAXSTRLEN))); + continue; +#else + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("word is too long to be indexed"), + errdetail("Words longer than %d characters are ignored.", + MAXSTRLEN))); +#endif + } + + LexizeAddLemm(&ldata, type, lemm, lenlemm); + + do + { + if ((norms = LexizeExec(&ldata, &lexs)) != NULL) + { + prs->vectorpos++; + addHLParsedLex(prs, query, lexs, norms); + } + else + addHLParsedLex(prs, query, lexs, NULL); + } while (norms); + } while (type > 0); + + FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata)); +} + +/* + * Generate the headline, as a text object, from HeadlineParsedText. + */ +text * +generateHeadline(HeadlineParsedText *prs) +{ + text *out; + char *ptr; + int len = 128; + int numfragments = 0; + int16 infrag = 0; + + HeadlineWordEntry *wrd = prs->words; + + out = (text *) palloc(len); + ptr = ((char *) out) + VARHDRSZ; + + while (wrd - prs->words < prs->curwords) + { + while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len) + { + int dist = ptr - ((char *) out); + + len *= 2; + out = (text *) repalloc(out, len); + ptr = ((char *) out) + dist; + } + + if (wrd->in && !wrd->repeated) + { + if (!infrag) + { + + /* start of a new fragment */ + infrag = 1; + numfragments++; + /* add a fragment delimiter if this is after the first one */ + if (numfragments > 1) + { + memcpy(ptr, prs->fragdelim, prs->fragdelimlen); + ptr += prs->fragdelimlen; + } + } + if (wrd->replace) + { + *ptr = ' '; + ptr++; + } + else if (!wrd->skip) + { + if (wrd->selected) + { + memcpy(ptr, prs->startsel, prs->startsellen); + ptr += prs->startsellen; + } + memcpy(ptr, wrd->word, wrd->len); + ptr += wrd->len; + if (wrd->selected) + { + memcpy(ptr, prs->stopsel, prs->stopsellen); + ptr += prs->stopsellen; + } + } + } + else if (!wrd->repeated) + { + if (infrag) + infrag = 0; + pfree(wrd->word); + } + + wrd++; + } + + SET_VARSIZE(out, ptr - ((char *) out)); + return out; +} |