From 46651ce6fe013220ed397add242004d764fc0153 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 4 May 2024 14:15:05 +0200 Subject: Adding upstream version 14.5. Signed-off-by: Daniel Baumann --- src/backend/tsearch/Makefile | 54 + src/backend/tsearch/dict.c | 89 + src/backend/tsearch/dict_ispell.c | 148 ++ src/backend/tsearch/dict_simple.c | 105 + src/backend/tsearch/dict_synonym.c | 241 ++ src/backend/tsearch/dict_thesaurus.c | 877 +++++++ src/backend/tsearch/dicts/hunspell_sample.affix | 23 + .../tsearch/dicts/hunspell_sample_long.affix | 53 + .../tsearch/dicts/hunspell_sample_long.dict | 11 + .../tsearch/dicts/hunspell_sample_num.affix | 33 + src/backend/tsearch/dicts/hunspell_sample_num.dict | 9 + src/backend/tsearch/dicts/ispell_sample.affix | 26 + src/backend/tsearch/dicts/ispell_sample.dict | 8 + src/backend/tsearch/dicts/synonym_sample.syn | 5 + src/backend/tsearch/dicts/thesaurus_sample.ths | 17 + src/backend/tsearch/regis.c | 257 ++ src/backend/tsearch/spell.c | 2617 +++++++++++++++++++ src/backend/tsearch/to_tsany.c | 724 ++++++ src/backend/tsearch/ts_locale.c | 325 +++ src/backend/tsearch/ts_parse.c | 667 +++++ src/backend/tsearch/ts_selfuncs.c | 453 ++++ src/backend/tsearch/ts_typanalyze.c | 536 ++++ src/backend/tsearch/ts_utils.c | 146 ++ src/backend/tsearch/wparser.c | 549 ++++ src/backend/tsearch/wparser_def.c | 2634 ++++++++++++++++++++ 25 files changed, 10607 insertions(+) create mode 100644 src/backend/tsearch/Makefile create mode 100644 src/backend/tsearch/dict.c create mode 100644 src/backend/tsearch/dict_ispell.c create mode 100644 src/backend/tsearch/dict_simple.c create mode 100644 src/backend/tsearch/dict_synonym.c create mode 100644 src/backend/tsearch/dict_thesaurus.c create mode 100644 src/backend/tsearch/dicts/hunspell_sample.affix create mode 100644 src/backend/tsearch/dicts/hunspell_sample_long.affix create mode 100644 src/backend/tsearch/dicts/hunspell_sample_long.dict create mode 100644 
src/backend/tsearch/dicts/hunspell_sample_num.affix create mode 100644 src/backend/tsearch/dicts/hunspell_sample_num.dict create mode 100644 src/backend/tsearch/dicts/ispell_sample.affix create mode 100644 src/backend/tsearch/dicts/ispell_sample.dict create mode 100644 src/backend/tsearch/dicts/synonym_sample.syn create mode 100644 src/backend/tsearch/dicts/thesaurus_sample.ths create mode 100644 src/backend/tsearch/regis.c create mode 100644 src/backend/tsearch/spell.c create mode 100644 src/backend/tsearch/to_tsany.c create mode 100644 src/backend/tsearch/ts_locale.c create mode 100644 src/backend/tsearch/ts_parse.c create mode 100644 src/backend/tsearch/ts_selfuncs.c create mode 100644 src/backend/tsearch/ts_typanalyze.c create mode 100644 src/backend/tsearch/ts_utils.c create mode 100644 src/backend/tsearch/wparser.c create mode 100644 src/backend/tsearch/wparser_def.c (limited to 'src/backend/tsearch') diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile new file mode 100644 index 0000000..cdb259e --- /dev/null +++ b/src/backend/tsearch/Makefile @@ -0,0 +1,54 @@ +#------------------------------------------------------------------------- +# +# Makefile for backend/tsearch +# +# Copyright (c) 2006-2021, PostgreSQL Global Development Group +# +# src/backend/tsearch/Makefile +# +#------------------------------------------------------------------------- +subdir = src/backend/tsearch +top_builddir = ../../.. 
+include $(top_builddir)/src/Makefile.global + +DICTDIR=tsearch_data + +# List of dictionaries files +DICTFILES=synonym_sample.syn thesaurus_sample.ths \ + hunspell_sample.affix \ + ispell_sample.affix ispell_sample.dict \ + hunspell_sample_long.affix hunspell_sample_long.dict \ + hunspell_sample_num.affix hunspell_sample_num.dict + +# Local paths to dictionaries files +DICTFILES_PATH=$(addprefix dicts/,$(DICTFILES)) + +OBJS = \ + dict.o \ + dict_ispell.o \ + dict_simple.o \ + dict_synonym.o \ + dict_thesaurus.o \ + regis.o \ + spell.o \ + to_tsany.o \ + ts_locale.o \ + ts_parse.o \ + ts_selfuncs.o \ + ts_typanalyze.o \ + ts_utils.o \ + wparser.o \ + wparser_def.o + +include $(top_srcdir)/src/backend/common.mk + +.PHONY: install-data +install-data: $(DICTFILES_PATH) installdirs + $(INSTALL_DATA) $(addprefix $(srcdir)/,$(DICTFILES_PATH)) '$(DESTDIR)$(datadir)/$(DICTDIR)/' + +installdirs: + $(MKDIR_P) '$(DESTDIR)$(datadir)' '$(DESTDIR)$(datadir)/$(DICTDIR)' + +.PHONY: uninstall-data +uninstall-data: + rm -rf $(addprefix '$(DESTDIR)$(datadir)/$(DICTDIR)/',$(DICTFILES)) diff --git a/src/backend/tsearch/dict.c b/src/backend/tsearch/dict.c new file mode 100644 index 0000000..1e1ccda --- /dev/null +++ b/src/backend/tsearch/dict.c @@ -0,0 +1,89 @@ +/*------------------------------------------------------------------------- + * + * dict.c + * Standard interface to dictionary + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/dict.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/pg_type.h" +#include "tsearch/ts_cache.h" +#include "tsearch/ts_utils.h" +#include "utils/builtins.h" + + +/* + * Lexize one word by dictionary, mostly debug function + */ +Datum +ts_lexize(PG_FUNCTION_ARGS) +{ + Oid dictId = PG_GETARG_OID(0); + text *in = PG_GETARG_TEXT_PP(1); + ArrayType *a; + TSDictionaryCacheEntry *dict; + TSLexeme *res, 
+ *ptr; + Datum *da; + DictSubState dstate = {false, false, NULL}; + + dict = lookup_ts_dictionary_cache(dictId); + + res = (TSLexeme *) DatumGetPointer(FunctionCall4(&dict->lexize, + PointerGetDatum(dict->dictData), + PointerGetDatum(VARDATA_ANY(in)), + Int32GetDatum(VARSIZE_ANY_EXHDR(in)), + PointerGetDatum(&dstate))); + + if (dstate.getnext) + { + dstate.isend = true; + ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&dict->lexize, + PointerGetDatum(dict->dictData), + PointerGetDatum(VARDATA_ANY(in)), + Int32GetDatum(VARSIZE_ANY_EXHDR(in)), + PointerGetDatum(&dstate))); + if (ptr != NULL) + res = ptr; + } + + if (!res) + PG_RETURN_NULL(); + + ptr = res; + while (ptr->lexeme) + ptr++; + da = (Datum *) palloc(sizeof(Datum) * (ptr - res)); + ptr = res; + while (ptr->lexeme) + { + da[ptr - res] = CStringGetTextDatum(ptr->lexeme); + ptr++; + } + + a = construct_array(da, + ptr - res, + TEXTOID, + -1, + false, + TYPALIGN_INT); + + ptr = res; + while (ptr->lexeme) + { + pfree(DatumGetPointer(da[ptr - res])); + pfree(ptr->lexeme); + ptr++; + } + pfree(res); + pfree(da); + + PG_RETURN_POINTER(a); +} diff --git a/src/backend/tsearch/dict_ispell.c b/src/backend/tsearch/dict_ispell.c new file mode 100644 index 0000000..d93f601 --- /dev/null +++ b/src/backend/tsearch/dict_ispell.c @@ -0,0 +1,148 @@ +/*------------------------------------------------------------------------- + * + * dict_ispell.c + * Ispell dictionary interface + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/dict_ispell.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "commands/defrem.h" +#include "tsearch/dicts/spell.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_utils.h" +#include "utils/builtins.h" + + +typedef struct +{ + StopList stoplist; + IspellDict obj; +} DictISpell; + +Datum +dispell_init(PG_FUNCTION_ARGS) +{ + List *dictoptions = 
(List *) PG_GETARG_POINTER(0); + DictISpell *d; + bool affloaded = false, + dictloaded = false, + stoploaded = false; + ListCell *l; + + d = (DictISpell *) palloc0(sizeof(DictISpell)); + + NIStartBuild(&(d->obj)); + + foreach(l, dictoptions) + { + DefElem *defel = (DefElem *) lfirst(l); + + if (strcmp(defel->defname, "dictfile") == 0) + { + if (dictloaded) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("multiple DictFile parameters"))); + NIImportDictionary(&(d->obj), + get_tsearch_config_filename(defGetString(defel), + "dict")); + dictloaded = true; + } + else if (strcmp(defel->defname, "afffile") == 0) + { + if (affloaded) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("multiple AffFile parameters"))); + NIImportAffixes(&(d->obj), + get_tsearch_config_filename(defGetString(defel), + "affix")); + affloaded = true; + } + else if (strcmp(defel->defname, "stopwords") == 0) + { + if (stoploaded) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("multiple StopWords parameters"))); + readstoplist(defGetString(defel), &(d->stoplist), lowerstr); + stoploaded = true; + } + else + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized Ispell parameter: \"%s\"", + defel->defname))); + } + } + + if (affloaded && dictloaded) + { + NISortDictionary(&(d->obj)); + NISortAffixes(&(d->obj)); + } + else if (!affloaded) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("missing AffFile parameter"))); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("missing DictFile parameter"))); + } + + NIFinishBuild(&(d->obj)); + + PG_RETURN_POINTER(d); +} + +Datum +dispell_lexize(PG_FUNCTION_ARGS) +{ + DictISpell *d = (DictISpell *) PG_GETARG_POINTER(0); + char *in = (char *) PG_GETARG_POINTER(1); + int32 len = PG_GETARG_INT32(2); + char *txt; + TSLexeme *res; + TSLexeme *ptr, + *cptr; + + if (len <= 0) + PG_RETURN_POINTER(NULL); + + 
txt = lowerstr_with_len(in, len); + res = NINormalizeWord(&(d->obj), txt); + + if (res == NULL) + PG_RETURN_POINTER(NULL); + + cptr = res; + for (ptr = cptr; ptr->lexeme; ptr++) + { + if (searchstoplist(&(d->stoplist), ptr->lexeme)) + { + pfree(ptr->lexeme); + ptr->lexeme = NULL; + } + else + { + if (cptr != ptr) + memcpy(cptr, ptr, sizeof(TSLexeme)); + cptr++; + } + } + cptr->lexeme = NULL; + + PG_RETURN_POINTER(res); +} diff --git a/src/backend/tsearch/dict_simple.c b/src/backend/tsearch/dict_simple.c new file mode 100644 index 0000000..9cd4b6b --- /dev/null +++ b/src/backend/tsearch/dict_simple.c @@ -0,0 +1,105 @@ +/*------------------------------------------------------------------------- + * + * dict_simple.c + * Simple dictionary: just lowercase and check for stopword + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/dict_simple.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "commands/defrem.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_utils.h" +#include "utils/builtins.h" + + +typedef struct +{ + StopList stoplist; + bool accept; +} DictSimple; + + +Datum +dsimple_init(PG_FUNCTION_ARGS) +{ + List *dictoptions = (List *) PG_GETARG_POINTER(0); + DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple)); + bool stoploaded = false, + acceptloaded = false; + ListCell *l; + + d->accept = true; /* default */ + + foreach(l, dictoptions) + { + DefElem *defel = (DefElem *) lfirst(l); + + if (strcmp(defel->defname, "stopwords") == 0) + { + if (stoploaded) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("multiple StopWords parameters"))); + readstoplist(defGetString(defel), &d->stoplist, lowerstr); + stoploaded = true; + } + else if (strcmp(defel->defname, "accept") == 0) + { + if (acceptloaded) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("multiple Accept 
parameters"))); + d->accept = defGetBoolean(defel); + acceptloaded = true; + } + else + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized simple dictionary parameter: \"%s\"", + defel->defname))); + } + } + + PG_RETURN_POINTER(d); +} + +Datum +dsimple_lexize(PG_FUNCTION_ARGS) +{ + DictSimple *d = (DictSimple *) PG_GETARG_POINTER(0); + char *in = (char *) PG_GETARG_POINTER(1); + int32 len = PG_GETARG_INT32(2); + char *txt; + TSLexeme *res; + + txt = lowerstr_with_len(in, len); + + if (*txt == '\0' || searchstoplist(&(d->stoplist), txt)) + { + /* reject as stopword */ + pfree(txt); + res = palloc0(sizeof(TSLexeme) * 2); + PG_RETURN_POINTER(res); + } + else if (d->accept) + { + /* accept */ + res = palloc0(sizeof(TSLexeme) * 2); + res[0].lexeme = txt; + PG_RETURN_POINTER(res); + } + else + { + /* report as unrecognized */ + pfree(txt); + PG_RETURN_POINTER(NULL); + } +} diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c new file mode 100644 index 0000000..ed885ca --- /dev/null +++ b/src/backend/tsearch/dict_synonym.c @@ -0,0 +1,241 @@ +/*------------------------------------------------------------------------- + * + * dict_synonym.c + * Synonym dictionary: replace word by its synonym + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/dict_synonym.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "commands/defrem.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_utils.h" +#include "utils/builtins.h" + +typedef struct +{ + char *in; + char *out; + int outlen; + uint16 flags; +} Syn; + +typedef struct +{ + int len; /* length of syn array */ + Syn *syn; + bool case_sensitive; +} DictSyn; + +/* + * Finds the next whitespace-delimited word within the 'in' string. 
+ * Returns a pointer to the first character of the word, and a pointer + * to the next byte after the last character in the word (in *end). + * Character '*' at the end of word will not be treated as word + * character if flags is not null. + */ +static char * +findwrd(char *in, char **end, uint16 *flags) +{ + char *start; + char *lastchar; + + /* Skip leading spaces */ + while (*in && t_isspace(in)) + in += pg_mblen(in); + + /* Return NULL on empty lines */ + if (*in == '\0') + { + *end = NULL; + return NULL; + } + + lastchar = start = in; + + /* Find end of word */ + while (*in && !t_isspace(in)) + { + lastchar = in; + in += pg_mblen(in); + } + + if (in - lastchar == 1 && t_iseq(lastchar, '*') && flags) + { + *flags = TSL_PREFIX; + *end = lastchar; + } + else + { + if (flags) + *flags = 0; + *end = in; + } + + return start; +} + +static int +compareSyn(const void *a, const void *b) +{ + return strcmp(((const Syn *) a)->in, ((const Syn *) b)->in); +} + + +Datum +dsynonym_init(PG_FUNCTION_ARGS) +{ + List *dictoptions = (List *) PG_GETARG_POINTER(0); + DictSyn *d; + ListCell *l; + char *filename = NULL; + bool case_sensitive = false; + tsearch_readline_state trst; + char *starti, + *starto, + *end = NULL; + int cur = 0; + char *line = NULL; + uint16 flags = 0; + + foreach(l, dictoptions) + { + DefElem *defel = (DefElem *) lfirst(l); + + if (strcmp(defel->defname, "synonyms") == 0) + filename = defGetString(defel); + else if (strcmp(defel->defname, "casesensitive") == 0) + case_sensitive = defGetBoolean(defel); + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized synonym parameter: \"%s\"", + defel->defname))); + } + + if (!filename) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("missing Synonyms parameter"))); + + filename = get_tsearch_config_filename(filename, "syn"); + + if (!tsearch_readline_begin(&trst, filename)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("could not open 
synonym file \"%s\": %m", + filename))); + + d = (DictSyn *) palloc0(sizeof(DictSyn)); + + while ((line = tsearch_readline(&trst)) != NULL) + { + starti = findwrd(line, &end, NULL); + if (!starti) + { + /* Empty line */ + goto skipline; + } + if (*end == '\0') + { + /* A line with only one word. Ignore silently. */ + goto skipline; + } + *end = '\0'; + + starto = findwrd(end + 1, &end, &flags); + if (!starto) + { + /* A line with only one word (+whitespace). Ignore silently. */ + goto skipline; + } + *end = '\0'; + + /* + * starti now points to the first word, and starto to the second word + * on the line, with a \0 terminator at the end of both words. + */ + + if (cur >= d->len) + { + if (d->len == 0) + { + d->len = 64; + d->syn = (Syn *) palloc(sizeof(Syn) * d->len); + } + else + { + d->len *= 2; + d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len); + } + } + + if (case_sensitive) + { + d->syn[cur].in = pstrdup(starti); + d->syn[cur].out = pstrdup(starto); + } + else + { + d->syn[cur].in = lowerstr(starti); + d->syn[cur].out = lowerstr(starto); + } + + d->syn[cur].outlen = strlen(starto); + d->syn[cur].flags = flags; + + cur++; + +skipline: + pfree(line); + } + + tsearch_readline_end(&trst); + + d->len = cur; + qsort(d->syn, d->len, sizeof(Syn), compareSyn); + + d->case_sensitive = case_sensitive; + + PG_RETURN_POINTER(d); +} + +Datum +dsynonym_lexize(PG_FUNCTION_ARGS) +{ + DictSyn *d = (DictSyn *) PG_GETARG_POINTER(0); + char *in = (char *) PG_GETARG_POINTER(1); + int32 len = PG_GETARG_INT32(2); + Syn key, + *found; + TSLexeme *res; + + /* note: d->len test protects against Solaris bsearch-of-no-items bug */ + if (len <= 0 || d->len <= 0) + PG_RETURN_POINTER(NULL); + + if (d->case_sensitive) + key.in = pnstrdup(in, len); + else + key.in = lowerstr_with_len(in, len); + + key.out = NULL; + + found = (Syn *) bsearch(&key, d->syn, d->len, sizeof(Syn), compareSyn); + pfree(key.in); + + if (!found) + PG_RETURN_POINTER(NULL); + + res = palloc0(sizeof(TSLexeme) * 
2); + res[0].lexeme = pnstrdup(found->out, found->outlen); + res[0].flags = found->flags; + + PG_RETURN_POINTER(res); +} diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c new file mode 100644 index 0000000..a95ed08 --- /dev/null +++ b/src/backend/tsearch/dict_thesaurus.c @@ -0,0 +1,877 @@ +/*------------------------------------------------------------------------- + * + * dict_thesaurus.c + * Thesaurus dictionary: phrase to phrase substitution + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/dict_thesaurus.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/namespace.h" +#include "commands/defrem.h" +#include "tsearch/ts_cache.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_utils.h" +#include "utils/builtins.h" +#include "utils/regproc.h" + + +/* + * Temporary we use TSLexeme.flags for inner use... 
+ */ +#define DT_USEASIS 0x1000 + +typedef struct LexemeInfo +{ + uint32 idsubst; /* entry's number in DictThesaurus->subst */ + uint16 posinsubst; /* pos info in entry */ + uint16 tnvariant; /* total num lexemes in one variant */ + struct LexemeInfo *nextentry; + struct LexemeInfo *nextvariant; +} LexemeInfo; + +typedef struct +{ + char *lexeme; + LexemeInfo *entries; +} TheLexeme; + +typedef struct +{ + uint16 lastlexeme; /* number lexemes to substitute */ + uint16 reslen; + TSLexeme *res; /* prepared substituted result */ +} TheSubstitute; + +typedef struct +{ + /* subdictionary to normalize lexemes */ + Oid subdictOid; + TSDictionaryCacheEntry *subdict; + + /* Array to search lexeme by exact match */ + TheLexeme *wrds; + int nwrds; /* current number of words */ + int ntwrds; /* allocated array length */ + + /* + * Storage of substituted result, n-th element is for n-th expression + */ + TheSubstitute *subst; + int nsubst; +} DictThesaurus; + + +static void +newLexeme(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 posinsubst) +{ + TheLexeme *ptr; + + if (d->nwrds >= d->ntwrds) + { + if (d->ntwrds == 0) + { + d->ntwrds = 16; + d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds); + } + else + { + d->ntwrds *= 2; + d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds); + } + } + + ptr = d->wrds + d->nwrds; + d->nwrds++; + + ptr->lexeme = palloc(e - b + 1); + + memcpy(ptr->lexeme, b, e - b); + ptr->lexeme[e - b] = '\0'; + + ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo)); + + ptr->entries->nextentry = NULL; + ptr->entries->idsubst = idsubst; + ptr->entries->posinsubst = posinsubst; +} + +static void +addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis) +{ + static int nres = 0; + static int ntres = 0; + TheSubstitute *ptr; + + if (nwrd == 0) + { + nres = ntres = 0; + + if (idsubst >= d->nsubst) + { + if (d->nsubst == 0) + { + d->nsubst = 16; + d->subst = 
(TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst); + } + else + { + d->nsubst *= 2; + d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst); + } + } + } + + ptr = d->subst + idsubst; + + ptr->lastlexeme = posinsubst - 1; + + if (nres + 1 >= ntres) + { + if (ntres == 0) + { + ntres = 2; + ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres); + } + else + { + ntres *= 2; + ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres); + } + } + + ptr->res[nres].lexeme = palloc(e - b + 1); + memcpy(ptr->res[nres].lexeme, b, e - b); + ptr->res[nres].lexeme[e - b] = '\0'; + + ptr->res[nres].nvariant = nwrd; + if (useasis) + ptr->res[nres].flags = DT_USEASIS; + else + ptr->res[nres].flags = 0; + + ptr->res[++nres].lexeme = NULL; +} + +#define TR_WAITLEX 1 +#define TR_INLEX 2 +#define TR_WAITSUBS 3 +#define TR_INSUBS 4 + +static void +thesaurusRead(const char *filename, DictThesaurus *d) +{ + tsearch_readline_state trst; + uint32 idsubst = 0; + bool useasis = false; + char *line; + + filename = get_tsearch_config_filename(filename, "ths"); + if (!tsearch_readline_begin(&trst, filename)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("could not open thesaurus file \"%s\": %m", + filename))); + + while ((line = tsearch_readline(&trst)) != NULL) + { + char *ptr; + int state = TR_WAITLEX; + char *beginwrd = NULL; + uint32 posinsubst = 0; + uint32 nwrd = 0; + + ptr = line; + + /* is it a comment? 
*/ + while (*ptr && t_isspace(ptr)) + ptr += pg_mblen(ptr); + + if (t_iseq(ptr, '#') || *ptr == '\0' || + t_iseq(ptr, '\n') || t_iseq(ptr, '\r')) + { + pfree(line); + continue; + } + + while (*ptr) + { + if (state == TR_WAITLEX) + { + if (t_iseq(ptr, ':')) + { + if (posinsubst == 0) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("unexpected delimiter"))); + state = TR_WAITSUBS; + } + else if (!t_isspace(ptr)) + { + beginwrd = ptr; + state = TR_INLEX; + } + } + else if (state == TR_INLEX) + { + if (t_iseq(ptr, ':')) + { + newLexeme(d, beginwrd, ptr, idsubst, posinsubst++); + state = TR_WAITSUBS; + } + else if (t_isspace(ptr)) + { + newLexeme(d, beginwrd, ptr, idsubst, posinsubst++); + state = TR_WAITLEX; + } + } + else if (state == TR_WAITSUBS) + { + if (t_iseq(ptr, '*')) + { + useasis = true; + state = TR_INSUBS; + beginwrd = ptr + pg_mblen(ptr); + } + else if (t_iseq(ptr, '\\')) + { + useasis = false; + state = TR_INSUBS; + beginwrd = ptr + pg_mblen(ptr); + } + else if (!t_isspace(ptr)) + { + useasis = false; + beginwrd = ptr; + state = TR_INSUBS; + } + } + else if (state == TR_INSUBS) + { + if (t_isspace(ptr)) + { + if (ptr == beginwrd) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("unexpected end of line or lexeme"))); + addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis); + state = TR_WAITSUBS; + } + } + else + elog(ERROR, "unrecognized thesaurus state: %d", state); + + ptr += pg_mblen(ptr); + } + + if (state == TR_INSUBS) + { + if (ptr == beginwrd) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("unexpected end of line or lexeme"))); + addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis); + } + + idsubst++; + + if (!(nwrd && posinsubst)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("unexpected end of line"))); + + if (nwrd != (uint16) nwrd || posinsubst != (uint16) posinsubst) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("too many lexemes in 
thesaurus entry"))); + + pfree(line); + } + + d->nsubst = idsubst; + + tsearch_readline_end(&trst); +} + +static TheLexeme * +addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant) +{ + if (*nnw >= *tnm) + { + *tnm *= 2; + newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm); + } + + newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo)); + + if (lexeme && lexeme->lexeme) + { + newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme); + newwrds[*nnw].entries->tnvariant = tnvariant; + } + else + { + newwrds[*nnw].lexeme = NULL; + newwrds[*nnw].entries->tnvariant = 1; + } + + newwrds[*nnw].entries->idsubst = src->idsubst; + newwrds[*nnw].entries->posinsubst = src->posinsubst; + + newwrds[*nnw].entries->nextentry = NULL; + + (*nnw)++; + return newwrds; +} + +static int +cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b) +{ + if (a == NULL || b == NULL) + return 0; + + if (a->idsubst == b->idsubst) + { + if (a->posinsubst == b->posinsubst) + { + if (a->tnvariant == b->tnvariant) + return 0; + + return (a->tnvariant > b->tnvariant) ? 1 : -1; + } + + return (a->posinsubst > b->posinsubst) ? 1 : -1; + } + + return (a->idsubst > b->idsubst) ? 
1 : -1; +} + +static int +cmpLexeme(const TheLexeme *a, const TheLexeme *b) +{ + if (a->lexeme == NULL) + { + if (b->lexeme == NULL) + return 0; + else + return 1; + } + else if (b->lexeme == NULL) + return -1; + + return strcmp(a->lexeme, b->lexeme); +} + +static int +cmpLexemeQ(const void *a, const void *b) +{ + return cmpLexeme((const TheLexeme *) a, (const TheLexeme *) b); +} + +static int +cmpTheLexeme(const void *a, const void *b) +{ + const TheLexeme *la = (const TheLexeme *) a; + const TheLexeme *lb = (const TheLexeme *) b; + int res; + + if ((res = cmpLexeme(la, lb)) != 0) + return res; + + return -cmpLexemeInfo(la->entries, lb->entries); +} + +static void +compileTheLexeme(DictThesaurus *d) +{ + int i, + nnw = 0, + tnm = 16; + TheLexeme *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm), + *ptrwrds; + + for (i = 0; i < d->nwrds; i++) + { + TSLexeme *ptr; + + if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */ + newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0); + else + { + ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize), + PointerGetDatum(d->subdict->dictData), + PointerGetDatum(d->wrds[i].lexeme), + Int32GetDatum(strlen(d->wrds[i].lexeme)), + PointerGetDatum(NULL))); + + if (!ptr) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)", + d->wrds[i].lexeme, + d->wrds[i].entries->idsubst + 1))); + else if (!(ptr->lexeme)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)", + d->wrds[i].lexeme, + d->wrds[i].entries->idsubst + 1), + errhint("Use \"?\" to represent a stop word within a sample phrase."))); + else + { + while (ptr->lexeme) + { + TSLexeme *remptr = ptr + 1; + int tnvar = 1; + int curvar = ptr->nvariant; + + /* compute n words in one variant */ + while (remptr->lexeme) + { + if (remptr->nvariant != (remptr - 1)->nvariant) 
+ break; + tnvar++; + remptr++; + } + + remptr = ptr; + while (remptr->lexeme && remptr->nvariant == curvar) + { + newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar); + remptr++; + } + + ptr = remptr; + } + } + } + + pfree(d->wrds[i].lexeme); + pfree(d->wrds[i].entries); + } + + if (d->wrds) + pfree(d->wrds); + d->wrds = newwrds; + d->nwrds = nnw; + d->ntwrds = tnm; + + if (d->nwrds > 1) + { + qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme); + + /* uniq */ + newwrds = d->wrds; + ptrwrds = d->wrds + 1; + while (ptrwrds - d->wrds < d->nwrds) + { + if (cmpLexeme(ptrwrds, newwrds) == 0) + { + if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries)) + { + ptrwrds->entries->nextentry = newwrds->entries; + newwrds->entries = ptrwrds->entries; + } + else + pfree(ptrwrds->entries); + + if (ptrwrds->lexeme) + pfree(ptrwrds->lexeme); + } + else + { + newwrds++; + *newwrds = *ptrwrds; + } + + ptrwrds++; + } + + d->nwrds = newwrds - d->wrds + 1; + d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds); + } +} + +static void +compileTheSubstitute(DictThesaurus *d) +{ + int i; + + for (i = 0; i < d->nsubst; i++) + { + TSLexeme *rem = d->subst[i].res, + *outptr, + *inptr; + int n = 2; + + outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n); + outptr->lexeme = NULL; + inptr = rem; + + while (inptr && inptr->lexeme) + { + TSLexeme *lexized, + tmplex[2]; + + if (inptr->flags & DT_USEASIS) + { /* do not lexize */ + tmplex[0] = *inptr; + tmplex[0].flags = 0; + tmplex[1].lexeme = NULL; + lexized = tmplex; + } + else + { + lexized = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize), + PointerGetDatum(d->subdict->dictData), + PointerGetDatum(inptr->lexeme), + Int32GetDatum(strlen(inptr->lexeme)), + PointerGetDatum(NULL))); + } + + if (lexized && lexized->lexeme) + { + int toset = (lexized->lexeme && outptr != d->subst[i].res) ? 
(outptr - d->subst[i].res) : -1; + + while (lexized->lexeme) + { + if (outptr - d->subst[i].res + 1 >= n) + { + int diff = outptr - d->subst[i].res; + + n *= 2; + d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n); + outptr = d->subst[i].res + diff; + } + + *outptr = *lexized; + outptr->lexeme = pstrdup(lexized->lexeme); + + outptr++; + lexized++; + } + + if (toset > 0) + d->subst[i].res[toset].flags |= TSL_ADDPOS; + } + else if (lexized) + { + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)", + inptr->lexeme, i + 1))); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)", + inptr->lexeme, i + 1))); + } + + if (inptr->lexeme) + pfree(inptr->lexeme); + inptr++; + } + + if (outptr == d->subst[i].res) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("thesaurus substitute phrase is empty (rule %d)", + i + 1))); + + d->subst[i].reslen = outptr - d->subst[i].res; + + pfree(rem); + } +} + +Datum +thesaurus_init(PG_FUNCTION_ARGS) +{ + List *dictoptions = (List *) PG_GETARG_POINTER(0); + DictThesaurus *d; + char *subdictname = NULL; + bool fileloaded = false; + ListCell *l; + + d = (DictThesaurus *) palloc0(sizeof(DictThesaurus)); + + foreach(l, dictoptions) + { + DefElem *defel = (DefElem *) lfirst(l); + + if (strcmp(defel->defname, "dictfile") == 0) + { + if (fileloaded) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("multiple DictFile parameters"))); + thesaurusRead(defGetString(defel), d); + fileloaded = true; + } + else if (strcmp(defel->defname, "dictionary") == 0) + { + if (subdictname) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("multiple Dictionary parameters"))); + subdictname = pstrdup(defGetString(defel)); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + 
errmsg("unrecognized Thesaurus parameter: \"%s\"", + defel->defname))); + } + } + + if (!fileloaded) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("missing DictFile parameter"))); + if (!subdictname) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("missing Dictionary parameter"))); + + d->subdictOid = get_ts_dict_oid(stringToQualifiedNameList(subdictname), false); + d->subdict = lookup_ts_dictionary_cache(d->subdictOid); + + compileTheLexeme(d); + compileTheSubstitute(d); + + PG_RETURN_POINTER(d); +} + +static LexemeInfo * +findTheLexeme(DictThesaurus *d, char *lexeme) +{ + TheLexeme key, + *res; + + if (d->nwrds == 0) + return NULL; + + key.lexeme = lexeme; + key.entries = NULL; + + res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ); + + if (res == NULL) + return NULL; + return res->entries; +} + +static bool +matchIdSubst(LexemeInfo *stored, uint32 idsubst) +{ + bool res = true; + + if (stored) + { + res = false; + + for (; stored; stored = stored->nextvariant) + if (stored->idsubst == idsubst) + { + res = true; + break; + } + } + + return res; +} + +static LexemeInfo * +findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn) +{ + for (;;) + { + int i; + LexemeInfo *ptr = newin[0]; + + for (i = 0; i < newn; i++) + { + while (newin[i] && newin[i]->idsubst < ptr->idsubst) + newin[i] = newin[i]->nextentry; + + if (newin[i] == NULL) + return in; + + if (newin[i]->idsubst > ptr->idsubst) + { + ptr = newin[i]; + i = -1; + continue; + } + + while (newin[i]->idsubst == ptr->idsubst) + { + if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn) + { + ptr = newin[i]; + break; + } + + newin[i] = newin[i]->nextentry; + if (newin[i] == NULL) + return in; + } + + if (newin[i]->idsubst != ptr->idsubst) + { + ptr = newin[i]; + i = -1; + continue; + } + } + + if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst))) + { 
/* found */ + + ptr->nextvariant = in; + in = ptr; + } + + /* step forward */ + for (i = 0; i < newn; i++) + newin[i] = newin[i]->nextentry; + } +} + +static TSLexeme * +copyTSLexeme(TheSubstitute *ts) +{ + TSLexeme *res; + uint16 i; + + res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1)); + for (i = 0; i < ts->reslen; i++) + { + res[i] = ts->res[i]; + res[i].lexeme = pstrdup(ts->res[i].lexeme); + } + + res[ts->reslen].lexeme = NULL; + + return res; +} + +static TSLexeme * +checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres) +{ + *moreres = false; + while (info) + { + Assert(info->idsubst < d->nsubst); + if (info->nextvariant) + *moreres = true; + if (d->subst[info->idsubst].lastlexeme == curpos) + return copyTSLexeme(d->subst + info->idsubst); + info = info->nextvariant; + } + + return NULL; +} + +Datum +thesaurus_lexize(PG_FUNCTION_ARGS) +{ + DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0); + DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3); + TSLexeme *res = NULL; + LexemeInfo *stored, + *info = NULL; + uint16 curpos = 0; + bool moreres = false; + + if (PG_NARGS() != 4 || dstate == NULL) + elog(ERROR, "forbidden call of thesaurus or nested call"); + + if (dstate->isend) + PG_RETURN_POINTER(NULL); + stored = (LexemeInfo *) dstate->private_state; + + if (stored) + curpos = stored->posinsubst + 1; + + if (!d->subdict->isvalid) + d->subdict = lookup_ts_dictionary_cache(d->subdictOid); + + res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize), + PointerGetDatum(d->subdict->dictData), + PG_GETARG_DATUM(1), + PG_GETARG_DATUM(2), + PointerGetDatum(NULL))); + + if (res && res->lexeme) + { + TSLexeme *ptr = res, + *basevar; + + while (ptr->lexeme) + { + uint16 nv = ptr->nvariant; + uint16 i, + nlex = 0; + LexemeInfo **infos; + + basevar = ptr; + while (ptr->lexeme && nv == ptr->nvariant) + { + nlex++; + ptr++; + } + + infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex); + for (i = 0; i < 
nlex; i++) + if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL) + break; + + if (i < nlex) + { + /* no chance to find */ + pfree(infos); + continue; + } + + info = findVariant(info, stored, curpos, infos, nlex); + } + } + else if (res) + { /* stop-word */ + LexemeInfo *infos = findTheLexeme(d, NULL); + + info = findVariant(NULL, stored, curpos, &infos, 1); + } + else + { + info = NULL; /* word isn't recognized */ + } + + dstate->private_state = (void *) info; + + if (!info) + { + dstate->getnext = false; + PG_RETURN_POINTER(NULL); + } + + if ((res = checkMatch(d, info, curpos, &moreres)) != NULL) + { + dstate->getnext = moreres; + PG_RETURN_POINTER(res); + } + + dstate->getnext = true; + + PG_RETURN_POINTER(NULL); +} diff --git a/src/backend/tsearch/dicts/hunspell_sample.affix b/src/backend/tsearch/dicts/hunspell_sample.affix new file mode 100644 index 0000000..9a64513 --- /dev/null +++ b/src/backend/tsearch/dicts/hunspell_sample.affix @@ -0,0 +1,23 @@ +COMPOUNDFLAG Z +ONLYINCOMPOUND L + +PFX B Y 1 +PFX B 0 re . + +PFX U N 1 +PFX U 0 un . + +SFX J Y 1 +SFX J 0 INGS [^E] + +SFX G Y 1 +SFX G 0 ING [^E] + +SFX S Y 1 +SFX S 0 S [^SXZHY] + +SFX A Y 1 +SFX A Y IES [^AEIOU]Y + +SFX \ N 1 +SFX \ 0 Y/L [^Y] diff --git a/src/backend/tsearch/dicts/hunspell_sample_long.affix b/src/backend/tsearch/dicts/hunspell_sample_long.affix new file mode 100644 index 0000000..d5df7a3 --- /dev/null +++ b/src/backend/tsearch/dicts/hunspell_sample_long.affix @@ -0,0 +1,53 @@ +FLAG long + +AF 11 +AF cZ #1 +AF cL #2 +AF sGsJpUsS #3 +AF sSpB #4 +AF cZsS #5 +AF sScZs\sE #6 +AF sA #7 +AF CaCp #8 +AF CcCp #9 +AF sD #10 +AF sB #11 + +COMPOUNDFLAG cZ +COMPOUNDBEGIN Ca +COMPOUNDMIDDLE Cb +COMPOUNDEND Cc +COMPOUNDPERMITFLAG Cp +ONLYINCOMPOUND cL + +PFX pB Y 1 +PFX pB 0 re . + +PFX pU N 1 +PFX pU 0 un . 
+ +SFX sJ Y 1 +SFX sJ 0 INGS [^E] + +SFX sG Y 1 +SFX sG 0 ING [^E] + +SFX sS Y 1 +SFX sS 0 S [^SXZHY] + +SFX sA Y 1 +SFX sA Y IES [^AEIOU]Y{1} + +SFX sB Y 1 +SFX sB 0 ED K{1} + +# Affixes with compound flags +SFX s\ N 1 +SFX s\ 0 Y/2 [^Y] + +SFX sE N 1 +SFX sE 0 S/2 [^S] + +# Check duplicate affixes +SFX sD N 1 +SFX sD 0 S/2 [^S] diff --git a/src/backend/tsearch/dicts/hunspell_sample_long.dict b/src/backend/tsearch/dicts/hunspell_sample_long.dict new file mode 100644 index 0000000..370c27a --- /dev/null +++ b/src/backend/tsearch/dicts/hunspell_sample_long.dict @@ -0,0 +1,11 @@ +book/3 +book/11 +booking/4 +footballklubber +foot/5 +football/1 +ball/6 +klubber/1 +sky/7 +ex-/8 +machina/9 diff --git a/src/backend/tsearch/dicts/hunspell_sample_num.affix b/src/backend/tsearch/dicts/hunspell_sample_num.affix new file mode 100644 index 0000000..0c4766a --- /dev/null +++ b/src/backend/tsearch/dicts/hunspell_sample_num.affix @@ -0,0 +1,33 @@ +FLAG num + +COMPOUNDFLAG 101 +ONLYINCOMPOUND 102 + +PFX 201 Y 1 +PFX 201 0 re . + +PFX 202 N 1 +PFX 202 0 un . 
+ +SFX 301 Y 1 +SFX 301 0 INGS [^E] + +SFX 302 Y 1 +SFX 302 0 ING [^E] + +SFX 303 Y 1 +SFX 303 0 S [^SXZHY] + +# Remove ED suffix from lexeme for base words with K ending +SFX 306 Y 1 +SFX 306 0 ED K{1} + +# Just add Y to lexeme for base words with Y ending +SFX 307 Y 1 +SFX 307 Y 0 Y* + +SFX 304 Y 1 +SFX 304 Y IES [^AEIOU]Y + +SFX 305 N 1 +SFX 305 0 Y/102 [^Y] diff --git a/src/backend/tsearch/dicts/hunspell_sample_num.dict b/src/backend/tsearch/dicts/hunspell_sample_num.dict new file mode 100644 index 0000000..fbc321d --- /dev/null +++ b/src/backend/tsearch/dicts/hunspell_sample_num.dict @@ -0,0 +1,9 @@ +book/302,301,202,303 +book/306 +booking/303,201 +footballklubber +foot/101,303 +football/101 +ball/303,101,305 +klubber/101 +sky/304,307 diff --git a/src/backend/tsearch/dicts/ispell_sample.affix b/src/backend/tsearch/dicts/ispell_sample.affix new file mode 100644 index 0000000..f29004f --- /dev/null +++ b/src/backend/tsearch/dicts/ispell_sample.affix @@ -0,0 +1,26 @@ +compoundwords controlled Z + +prefixes + +flag *B: + . > RE # As in enter > reenter + +flag U: + . 
> UN # As in natural > unnatural + +suffixes + +flag *J: + [^E] > INGS # As in cross > crossings + +flag *G: + [^E] > ING # As in cross > crossing + +flag *S: + [^SXZHY] > S # As in bat > bats + +flag *A: + [^AEIOU]Y > -Y,IES # As in imply > implies + +flag ~\\: + [^Y] > Y #~ advarsel > advarsely- diff --git a/src/backend/tsearch/dicts/ispell_sample.dict b/src/backend/tsearch/dicts/ispell_sample.dict new file mode 100644 index 0000000..44df196 --- /dev/null +++ b/src/backend/tsearch/dicts/ispell_sample.dict @@ -0,0 +1,8 @@ +book/GJUS +booking/SB +footballklubber +foot/ZS +football/Z +ball/SZ\ +klubber/Z +sky/A diff --git a/src/backend/tsearch/dicts/synonym_sample.syn b/src/backend/tsearch/dicts/synonym_sample.syn new file mode 100644 index 0000000..3ecbcf9 --- /dev/null +++ b/src/backend/tsearch/dicts/synonym_sample.syn @@ -0,0 +1,5 @@ +postgres pgsql +postgresql pgsql +postgre pgsql +gogle googl +indices index* diff --git a/src/backend/tsearch/dicts/thesaurus_sample.ths b/src/backend/tsearch/dicts/thesaurus_sample.ths new file mode 100644 index 0000000..718f54a --- /dev/null +++ b/src/backend/tsearch/dicts/thesaurus_sample.ths @@ -0,0 +1,17 @@ +# +# Theasurus config file. Character ':' separates string from replacement, eg +# sample-words : substitute-words +# +# Any substitute-word can be marked by preceding '*' character, +# which means do not lexize this word +# Docs: http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary + +one two three : *123 +one two : *12 +one : *1 +two : *2 + +supernovae stars : *sn +supernovae : *sn +booking tickets : order invitation cards +booking ? 
tickets : order invitation Cards diff --git a/src/backend/tsearch/regis.c b/src/backend/tsearch/regis.c new file mode 100644 index 0000000..8001717 --- /dev/null +++ b/src/backend/tsearch/regis.c @@ -0,0 +1,257 @@ +/*------------------------------------------------------------------------- + * + * regis.c + * Fast regex subset + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/regis.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "tsearch/dicts/regis.h" +#include "tsearch/ts_locale.h" + +#define RS_IN_ONEOF 1 +#define RS_IN_ONEOF_IN 2 +#define RS_IN_NONEOF 3 +#define RS_IN_WAIT 4 + + +/* + * Test whether a regex is of the subset supported here. + * Keep this in sync with RS_compile! + */ +bool +RS_isRegis(const char *str) +{ + int state = RS_IN_WAIT; + const char *c = str; + + while (*c) + { + if (state == RS_IN_WAIT) + { + if (t_isalpha(c)) + /* okay */ ; + else if (t_iseq(c, '[')) + state = RS_IN_ONEOF; + else + return false; + } + else if (state == RS_IN_ONEOF) + { + if (t_iseq(c, '^')) + state = RS_IN_NONEOF; + else if (t_isalpha(c)) + state = RS_IN_ONEOF_IN; + else + return false; + } + else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) + { + if (t_isalpha(c)) + /* okay */ ; + else if (t_iseq(c, ']')) + state = RS_IN_WAIT; + else + return false; + } + else + elog(ERROR, "internal error in RS_isRegis: state %d", state); + c += pg_mblen(c); + } + + return (state == RS_IN_WAIT); +} + +static RegisNode * +newRegisNode(RegisNode *prev, int len) +{ + RegisNode *ptr; + + ptr = (RegisNode *) palloc0(RNHDRSZ + len + 1); + if (prev) + prev->next = ptr; + return ptr; +} + +void +RS_compile(Regis *r, bool issuffix, const char *str) +{ + int len = strlen(str); + int state = RS_IN_WAIT; + const char *c = str; + RegisNode *ptr = NULL; + + memset(r, 0, sizeof(Regis)); + r->issuffix = (issuffix) ? 
1 : 0; + + while (*c) + { + if (state == RS_IN_WAIT) + { + if (t_isalpha(c)) + { + if (ptr) + ptr = newRegisNode(ptr, len); + else + ptr = r->node = newRegisNode(NULL, len); + COPYCHAR(ptr->data, c); + ptr->type = RSF_ONEOF; + ptr->len = pg_mblen(c); + } + else if (t_iseq(c, '[')) + { + if (ptr) + ptr = newRegisNode(ptr, len); + else + ptr = r->node = newRegisNode(NULL, len); + ptr->type = RSF_ONEOF; + state = RS_IN_ONEOF; + } + else /* shouldn't get here */ + elog(ERROR, "invalid regis pattern: \"%s\"", str); + } + else if (state == RS_IN_ONEOF) + { + if (t_iseq(c, '^')) + { + ptr->type = RSF_NONEOF; + state = RS_IN_NONEOF; + } + else if (t_isalpha(c)) + { + COPYCHAR(ptr->data, c); + ptr->len = pg_mblen(c); + state = RS_IN_ONEOF_IN; + } + else /* shouldn't get here */ + elog(ERROR, "invalid regis pattern: \"%s\"", str); + } + else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) + { + if (t_isalpha(c)) + { + COPYCHAR(ptr->data + ptr->len, c); + ptr->len += pg_mblen(c); + } + else if (t_iseq(c, ']')) + state = RS_IN_WAIT; + else /* shouldn't get here */ + elog(ERROR, "invalid regis pattern: \"%s\"", str); + } + else + elog(ERROR, "internal error in RS_compile: state %d", state); + c += pg_mblen(c); + } + + if (state != RS_IN_WAIT) /* shouldn't get here */ + elog(ERROR, "invalid regis pattern: \"%s\"", str); + + ptr = r->node; + while (ptr) + { + r->nchar++; + ptr = ptr->next; + } +} + +void +RS_free(Regis *r) +{ + RegisNode *ptr = r->node, + *tmp; + + while (ptr) + { + tmp = ptr->next; + pfree(ptr); + ptr = tmp; + } + + r->node = NULL; +} + +static bool +mb_strchr(char *str, char *c) +{ + int clen, + plen, + i; + char *ptr = str; + bool res = false; + + clen = pg_mblen(c); + while (*ptr && !res) + { + plen = pg_mblen(ptr); + if (plen == clen) + { + i = plen; + res = true; + while (i--) + if (*(ptr + i) != *(c + i)) + { + res = false; + break; + } + } + + ptr += plen; + } + + return res; +} + +bool +RS_execute(Regis *r, char *str) +{ + RegisNode *ptr = r->node; 
+ char *c = str; + int len = 0; + + while (*c) + { + len++; + c += pg_mblen(c); + } + + if (len < r->nchar) + return 0; + + c = str; + if (r->issuffix) + { + len -= r->nchar; + while (len-- > 0) + c += pg_mblen(c); + } + + + while (ptr) + { + switch (ptr->type) + { + case RSF_ONEOF: + if (!mb_strchr((char *) ptr->data, c)) + return false; + break; + case RSF_NONEOF: + if (mb_strchr((char *) ptr->data, c)) + return false; + break; + default: + elog(ERROR, "unrecognized regis node type: %d", ptr->type); + } + ptr = ptr->next; + c += pg_mblen(c); + } + + return true; +} diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c new file mode 100644 index 0000000..ebc8960 --- /dev/null +++ b/src/backend/tsearch/spell.c @@ -0,0 +1,2617 @@ +/*------------------------------------------------------------------------- + * + * spell.c + * Normalizing word with ISpell + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * Ispell dictionary + * ----------------- + * + * Rules of dictionaries are defined in two files with .affix and .dict + * extensions. They are used by spell checker programs Ispell and Hunspell. + * + * An .affix file declares morphological rules to get a basic form of words. + * The format of an .affix file has different structure for Ispell and Hunspell + * dictionaries. The Hunspell format is more complicated. But when an .affix + * file is imported and compiled, it is stored in the same structure AffixNode. + * + * A .dict file stores a list of basic forms of words with references to + * affix rules. The format of a .dict file has the same structure for Ispell + * and Hunspell dictionaries. + * + * Compilation of a dictionary + * --------------------------- + * + * A compiled dictionary is stored in the IspellDict structure. Compilation of + * a dictionary is divided into the several steps: + * - NIImportDictionary() - stores each word of a .dict file in the + * temporary Spell field. 
+ * - NIImportAffixes() - stores affix rules of an .affix file in the + * Affix field (not temporary) if an .affix file has the Ispell format. + * -> NIImportOOAffixes() - stores affix rules if an .affix file has the + * Hunspell format. The AffixData field is initialized if AF parameter + * is defined. + * - NISortDictionary() - builds a prefix tree (Trie) from the words list + * and stores it in the Dictionary field. The words list is got from the + * Spell field. The AffixData field is initialized if AF parameter is not + * defined. + * - NISortAffixes(): + * - builds a list of compound affixes from the affix list and stores it + * in the CompoundAffix. + * - builds prefix trees (Trie) from the affix list for prefixes and suffixes + * and stores them in Suffix and Prefix fields. + * The affix list is got from the Affix field. + * + * Memory management + * ----------------- + * + * The IspellDict structure has the Spell field which is used only in compile + * time. The Spell field stores a words list. It can take a lot of memory. + * Therefore when a dictionary is compiled this field is cleared by + * NIFinishBuild(). + * + * All resources which should cleared by NIFinishBuild() is initialized using + * tmpalloc() and tmpalloc0(). + * + * IDENTIFICATION + * src/backend/tsearch/spell.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "catalog/pg_collation.h" +#include "tsearch/dicts/spell.h" +#include "tsearch/ts_locale.h" +#include "utils/memutils.h" + + +/* + * Initialization requires a lot of memory that's not needed + * after the initialization is done. During initialization, + * CurrentMemoryContext is the long-lived memory context associated + * with the dictionary cache entry. We keep the short-lived stuff + * in the Conf->buildCxt context. 
+ */ +#define tmpalloc(sz) MemoryContextAlloc(Conf->buildCxt, (sz)) +#define tmpalloc0(sz) MemoryContextAllocZero(Conf->buildCxt, (sz)) + +/* + * Prepare for constructing an ISpell dictionary. + * + * The IspellDict struct is assumed to be zeroed when allocated. + */ +void +NIStartBuild(IspellDict *Conf) +{ + /* + * The temp context is a child of CurTransactionContext, so that it will + * go away automatically on error. + */ + Conf->buildCxt = AllocSetContextCreate(CurTransactionContext, + "Ispell dictionary init context", + ALLOCSET_DEFAULT_SIZES); +} + +/* + * Clean up when dictionary construction is complete. + */ +void +NIFinishBuild(IspellDict *Conf) +{ + /* Release no-longer-needed temp memory */ + MemoryContextDelete(Conf->buildCxt); + /* Just for cleanliness, zero the now-dangling pointers */ + Conf->buildCxt = NULL; + Conf->Spell = NULL; + Conf->firstfree = NULL; + Conf->CompoundAffixFlags = NULL; +} + + +/* + * "Compact" palloc: allocate without extra palloc overhead. + * + * Since we have no need to free the ispell data items individually, there's + * not much value in the per-chunk overhead normally consumed by palloc. + * Getting rid of it is helpful since ispell can allocate a lot of small nodes. + * + * We currently pre-zero all data allocated this way, even though some of it + * doesn't need that. The cpalloc and cpalloc0 macros are just documentation + * to indicate which allocations actually require zeroing. + */ +#define COMPACT_ALLOC_CHUNK 8192 /* amount to get from palloc at once */ +#define COMPACT_MAX_REQ 1024 /* must be < COMPACT_ALLOC_CHUNK */ + +static void * +compact_palloc0(IspellDict *Conf, size_t size) +{ + void *result; + + /* Should only be called during init */ + Assert(Conf->buildCxt != NULL); + + /* No point in this for large chunks */ + if (size > COMPACT_MAX_REQ) + return palloc0(size); + + /* Keep everything maxaligned */ + size = MAXALIGN(size); + + /* Need more space? 
*/ + if (size > Conf->avail) + { + Conf->firstfree = palloc0(COMPACT_ALLOC_CHUNK); + Conf->avail = COMPACT_ALLOC_CHUNK; + } + + result = (void *) Conf->firstfree; + Conf->firstfree += size; + Conf->avail -= size; + + return result; +} + +#define cpalloc(size) compact_palloc0(Conf, size) +#define cpalloc0(size) compact_palloc0(Conf, size) + +static char * +cpstrdup(IspellDict *Conf, const char *str) +{ + char *res = cpalloc(strlen(str) + 1); + + strcpy(res, str); + return res; +} + + +/* + * Apply lowerstr(), producing a temporary result (in the buildCxt). + */ +static char * +lowerstr_ctx(IspellDict *Conf, const char *src) +{ + MemoryContext saveCtx; + char *dst; + + saveCtx = MemoryContextSwitchTo(Conf->buildCxt); + dst = lowerstr(src); + MemoryContextSwitchTo(saveCtx); + + return dst; +} + +#define MAX_NORM 1024 +#define MAXNORMLEN 256 + +#define STRNCMP(s,p) strncmp( (s), (p), strlen(p) ) +#define GETWCHAR(W,L,N,T) ( ((const uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] ) +#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T ) + +static char *VoidString = ""; + +static int +cmpspell(const void *s1, const void *s2) +{ + return strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word); +} + +static int +cmpspellaffix(const void *s1, const void *s2) +{ + return strcmp((*(SPELL *const *) s1)->p.flag, + (*(SPELL *const *) s2)->p.flag); +} + +static int +cmpcmdflag(const void *f1, const void *f2) +{ + CompoundAffixFlag *fv1 = (CompoundAffixFlag *) f1, + *fv2 = (CompoundAffixFlag *) f2; + + Assert(fv1->flagMode == fv2->flagMode); + + if (fv1->flagMode == FM_NUM) + { + if (fv1->flag.i == fv2->flag.i) + return 0; + + return (fv1->flag.i > fv2->flag.i) ? 
1 : -1; + } + + return strcmp(fv1->flag.s, fv2->flag.s); +} + +static char * +findchar(char *str, int c) +{ + while (*str) + { + if (t_iseq(str, c)) + return str; + str += pg_mblen(str); + } + + return NULL; +} + +static char * +findchar2(char *str, int c1, int c2) +{ + while (*str) + { + if (t_iseq(str, c1) || t_iseq(str, c2)) + return str; + str += pg_mblen(str); + } + + return NULL; +} + + +/* backward string compare for suffix tree operations */ +static int +strbcmp(const unsigned char *s1, const unsigned char *s2) +{ + int l1 = strlen((const char *) s1) - 1, + l2 = strlen((const char *) s2) - 1; + + while (l1 >= 0 && l2 >= 0) + { + if (s1[l1] < s2[l2]) + return -1; + if (s1[l1] > s2[l2]) + return 1; + l1--; + l2--; + } + if (l1 < l2) + return -1; + if (l1 > l2) + return 1; + + return 0; +} + +static int +strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count) +{ + int l1 = strlen((const char *) s1) - 1, + l2 = strlen((const char *) s2) - 1, + l = count; + + while (l1 >= 0 && l2 >= 0 && l > 0) + { + if (s1[l1] < s2[l2]) + return -1; + if (s1[l1] > s2[l2]) + return 1; + l1--; + l2--; + l--; + } + if (l == 0) + return 0; + if (l1 < l2) + return -1; + if (l1 > l2) + return 1; + return 0; +} + +/* + * Compares affixes. + * First compares the type of an affix. Prefixes should go before affixes. + * If types are equal then compares replaceable string. + */ +static int +cmpaffix(const void *s1, const void *s2) +{ + const AFFIX *a1 = (const AFFIX *) s1; + const AFFIX *a2 = (const AFFIX *) s2; + + if (a1->type < a2->type) + return -1; + if (a1->type > a2->type) + return 1; + if (a1->type == FF_PREFIX) + return strcmp(a1->repl, a2->repl); + else + return strbcmp((const unsigned char *) a1->repl, + (const unsigned char *) a2->repl); +} + +/* + * Gets an affix flag from the set of affix flags (sflagset). + * + * Several flags can be stored in a single string. Flags can be represented by: + * - 1 character (FM_CHAR). A character may be Unicode. 
+ * - 2 characters (FM_LONG). A character may be Unicode. + * - numbers from 1 to 65000 (FM_NUM). + * + * Depending on the flagMode an affix string can have the following format: + * - FM_CHAR: ABCD + * Here we have 4 flags: A, B, C and D + * - FM_LONG: ABCDE* + * Here we have 3 flags: AB, CD and E* + * - FM_NUM: 200,205,50 + * Here we have 3 flags: 200, 205 and 50 + * + * Conf: current dictionary. + * sflagset: the set of affix flags. Returns a reference to the start of a next + * affix flag. + * sflag: returns an affix flag from sflagset. + */ +static void +getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) +{ + int32 s; + char *next, + *sbuf = *sflagset; + int maxstep; + bool stop = false; + bool met_comma = false; + + maxstep = (Conf->flagMode == FM_LONG) ? 2 : 1; + + while (**sflagset) + { + switch (Conf->flagMode) + { + case FM_LONG: + case FM_CHAR: + COPYCHAR(sflag, *sflagset); + sflag += pg_mblen(*sflagset); + + /* Go to start of the next flag */ + *sflagset += pg_mblen(*sflagset); + + /* Check if we get all characters of flag */ + maxstep--; + stop = (maxstep == 0); + break; + case FM_NUM: + s = strtol(*sflagset, &next, 10); + if (*sflagset == next || errno == ERANGE) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid affix flag \"%s\"", *sflagset))); + if (s < 0 || s > FLAGNUM_MAXSIZE) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("affix flag \"%s\" is out of range", + *sflagset))); + sflag += sprintf(sflag, "%0d", s); + + /* Go to start of the next flag */ + *sflagset = next; + while (**sflagset) + { + if (t_isdigit(*sflagset)) + { + if (!met_comma) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid affix flag \"%s\"", + *sflagset))); + break; + } + else if (t_iseq(*sflagset, ',')) + { + if (met_comma) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid affix flag \"%s\"", + *sflagset))); + met_comma = true; + } + else if (!t_isspace(*sflagset)) + { 
+ ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid character in affix flag \"%s\"", + *sflagset))); + } + + *sflagset += pg_mblen(*sflagset); + } + stop = true; + break; + default: + elog(ERROR, "unrecognized type of Conf->flagMode: %d", + Conf->flagMode); + } + + if (stop) + break; + } + + if (Conf->flagMode == FM_LONG && maxstep > 0) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid affix flag \"%s\" with \"long\" flag value", + sbuf))); + + *sflag = '\0'; +} + +/* + * Checks if the affix set Conf->AffixData[affix] contains affixflag. + * Conf->AffixData[affix] does not contain affixflag if this flag is not used + * actually by the .dict file. + * + * Conf: current dictionary. + * affix: index of the Conf->AffixData array. + * affixflag: the affix flag. + * + * Returns true if the string Conf->AffixData[affix] contains affixflag, + * otherwise returns false. + */ +static bool +IsAffixFlagInUse(IspellDict *Conf, int affix, const char *affixflag) +{ + char *flagcur; + char flag[BUFSIZ]; + + if (*affixflag == 0) + return true; + + Assert(affix < Conf->nAffixData); + + flagcur = Conf->AffixData[affix]; + + while (*flagcur) + { + getNextFlagFromString(Conf, &flagcur, flag); + /* Compare first affix flag in flagcur with affixflag */ + if (strcmp(flag, affixflag) == 0) + return true; + } + + /* Could not find affixflag */ + return false; +} + +/* + * Adds the new word into the temporary array Spell. + * + * Conf: current dictionary. + * word: new word. + * flag: set of affix flags. Single flag can be get by getNextFlagFromString(). 
+ */ +static void +NIAddSpell(IspellDict *Conf, const char *word, const char *flag) +{ + if (Conf->nspell >= Conf->mspell) + { + if (Conf->mspell) + { + Conf->mspell *= 2; + Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *)); + } + else + { + Conf->mspell = 1024 * 20; + Conf->Spell = (SPELL **) tmpalloc(Conf->mspell * sizeof(SPELL *)); + } + } + Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1); + strcpy(Conf->Spell[Conf->nspell]->word, word); + Conf->Spell[Conf->nspell]->p.flag = (*flag != '\0') + ? cpstrdup(Conf, flag) : VoidString; + Conf->nspell++; +} + +/* + * Imports dictionary into the temporary array Spell. + * + * Note caller must already have applied get_tsearch_config_filename. + * + * Conf: current dictionary. + * filename: path to the .dict file. + */ +void +NIImportDictionary(IspellDict *Conf, const char *filename) +{ + tsearch_readline_state trst; + char *line; + + if (!tsearch_readline_begin(&trst, filename)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("could not open dictionary file \"%s\": %m", + filename))); + + while ((line = tsearch_readline(&trst)) != NULL) + { + char *s, + *pstr; + + /* Set of affix flags */ + const char *flag; + + /* Extract flag from the line */ + flag = NULL; + if ((s = findchar(line, '/'))) + { + *s++ = '\0'; + flag = s; + while (*s) + { + /* we allow only single encoded flags for faster works */ + if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s)) + s++; + else + { + *s = '\0'; + break; + } + } + } + else + flag = ""; + + /* Remove trailing spaces */ + s = line; + while (*s) + { + if (t_isspace(s)) + { + *s = '\0'; + break; + } + s += pg_mblen(s); + } + pstr = lowerstr_ctx(Conf, line); + + NIAddSpell(Conf, pstr, flag); + pfree(pstr); + + pfree(line); + } + tsearch_readline_end(&trst); +} + +/* + * Searches a basic form of word in the prefix tree. This word was generated + * using an affix rule. 
This rule may not be presented in an affix set of + * a basic form of word. + * + * For example, we have the entry in the .dict file: + * meter/GMD + * + * The affix rule with the flag S: + * SFX S y ies [^aeiou]y + * is not presented here. + * + * The affix rule with the flag M: + * SFX M 0 's . + * is presented here. + * + * Conf: current dictionary. + * word: basic form of word. + * affixflag: affix flag, by which a basic form of word was generated. + * flag: compound flag used to compare with StopMiddle->compoundflag. + * + * Returns 1 if the word was found in the prefix tree, else returns 0. + */ +static int +FindWord(IspellDict *Conf, const char *word, const char *affixflag, int flag) +{ + SPNode *node = Conf->Dictionary; + SPNodeData *StopLow, + *StopHigh, + *StopMiddle; + const uint8 *ptr = (const uint8 *) word; + + flag &= FF_COMPOUNDFLAGMASK; + + while (node && *ptr) + { + StopLow = node->data; + StopHigh = node->data + node->length; + while (StopLow < StopHigh) + { + StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); + if (StopMiddle->val == *ptr) + { + if (*(ptr + 1) == '\0' && StopMiddle->isword) + { + if (flag == 0) + { + /* + * The word can be formed only with another word. And + * in the flag parameter there is not a sign that we + * search compound words. + */ + if (StopMiddle->compoundflag & FF_COMPOUNDONLY) + return 0; + } + else if ((flag & StopMiddle->compoundflag) == 0) + return 0; + + /* + * Check if this affix rule is presented in the affix set + * with index StopMiddle->affix. 
+ */ + if (IsAffixFlagInUse(Conf, StopMiddle->affix, affixflag)) + return 1; + } + node = StopMiddle->node; + ptr++; + break; + } + else if (StopMiddle->val < *ptr) + StopLow = StopMiddle + 1; + else + StopHigh = StopMiddle; + } + if (StopLow >= StopHigh) + break; + } + return 0; +} + +/* + * Context reset/delete callback for a regular expression used in an affix + */ +static void +regex_affix_deletion_callback(void *arg) +{ + aff_regex_struct *pregex = (aff_regex_struct *) arg; + + pg_regfree(&(pregex->regex)); +} + +/* + * Adds a new affix rule to the Affix field. + * + * Conf: current dictionary. + * flag: affix flag ('\' in the below example). + * flagflags: set of flags from the flagval field for this affix rule. This set + * is listed after '/' character in the added string (repl). + * + * For example L flag in the hunspell_sample.affix: + * SFX \ 0 Y/L [^Y] + * + * mask: condition for search ('[^Y]' in the above example). + * find: stripping characters from beginning (at prefix) or end (at suffix) + * of the word ('0' in the above example, 0 means that there is not + * stripping character). + * repl: adding string after stripping ('Y' in the above example). + * type: FF_SUFFIX or FF_PREFIX. 
+ */ +static void +NIAddAffix(IspellDict *Conf, const char *flag, char flagflags, const char *mask, + const char *find, const char *repl, int type) +{ + AFFIX *Affix; + + if (Conf->naffixes >= Conf->maffixes) + { + if (Conf->maffixes) + { + Conf->maffixes *= 2; + Conf->Affix = (AFFIX *) repalloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX)); + } + else + { + Conf->maffixes = 16; + Conf->Affix = (AFFIX *) palloc(Conf->maffixes * sizeof(AFFIX)); + } + } + + Affix = Conf->Affix + Conf->naffixes; + + /* This affix rule can be applied for words with any ending */ + if (strcmp(mask, ".") == 0 || *mask == '\0') + { + Affix->issimple = 1; + Affix->isregis = 0; + } + /* This affix rule will use regis to search word ending */ + else if (RS_isRegis(mask)) + { + Affix->issimple = 0; + Affix->isregis = 1; + RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX), + *mask ? mask : VoidString); + } + /* This affix rule will use regex_t to search word ending */ + else + { + int masklen; + int wmasklen; + int err; + pg_wchar *wmask; + char *tmask; + aff_regex_struct *pregex; + + Affix->issimple = 0; + Affix->isregis = 0; + tmask = (char *) tmpalloc(strlen(mask) + 3); + if (type == FF_SUFFIX) + sprintf(tmask, "%s$", mask); + else + sprintf(tmask, "^%s", mask); + + masklen = strlen(tmask); + wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar)); + wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen); + + /* + * The regex engine stores its stuff using malloc not palloc, so we + * must arrange to explicitly clean up the regex when the dictionary's + * context is cleared. That means the regex_t has to stay in a fixed + * location within the context; we can't keep it directly in the AFFIX + * struct, since we may sort and resize the array of AFFIXes. 
+ */ + Affix->reg.pregex = pregex = palloc(sizeof(aff_regex_struct)); + + err = pg_regcomp(&(pregex->regex), wmask, wmasklen, + REG_ADVANCED | REG_NOSUB, + DEFAULT_COLLATION_OID); + if (err) + { + char errstr[100]; + + pg_regerror(err, &(pregex->regex), errstr, sizeof(errstr)); + ereport(ERROR, + (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), + errmsg("invalid regular expression: %s", errstr))); + } + + pregex->mcallback.func = regex_affix_deletion_callback; + pregex->mcallback.arg = (void *) pregex; + MemoryContextRegisterResetCallback(CurrentMemoryContext, + &pregex->mcallback); + } + + Affix->flagflags = flagflags; + if ((Affix->flagflags & FF_COMPOUNDONLY) || (Affix->flagflags & FF_COMPOUNDPERMITFLAG)) + { + if ((Affix->flagflags & FF_COMPOUNDFLAG) == 0) + Affix->flagflags |= FF_COMPOUNDFLAG; + } + Affix->flag = cpstrdup(Conf, flag); + Affix->type = type; + + Affix->find = (find && *find) ? cpstrdup(Conf, find) : VoidString; + if ((Affix->replen = strlen(repl)) > 0) + Affix->repl = cpstrdup(Conf, repl); + else + Affix->repl = VoidString; + Conf->naffixes++; +} + +/* Parsing states for parse_affentry() and friends */ +#define PAE_WAIT_MASK 0 +#define PAE_INMASK 1 +#define PAE_WAIT_FIND 2 +#define PAE_INFIND 3 +#define PAE_WAIT_REPL 4 +#define PAE_INREPL 5 +#define PAE_WAIT_TYPE 6 +#define PAE_WAIT_FLAG 7 + +/* + * Parse next space-separated field of an .affix file line. + * + * *str is the input pointer (will be advanced past field) + * next is where to copy the field value to, with null termination + * + * The buffer at "next" must be of size BUFSIZ; we truncate the input to fit. + * + * Returns true if we found a field, false if not. 
+ */ +static bool +get_nextfield(char **str, char *next) +{ + int state = PAE_WAIT_MASK; + int avail = BUFSIZ; + + while (**str) + { + if (state == PAE_WAIT_MASK) + { + if (t_iseq(*str, '#')) + return false; + else if (!t_isspace(*str)) + { + int clen = pg_mblen(*str); + + if (clen < avail) + { + COPYCHAR(next, *str); + next += clen; + avail -= clen; + } + state = PAE_INMASK; + } + } + else /* state == PAE_INMASK */ + { + if (t_isspace(*str)) + { + *next = '\0'; + return true; + } + else + { + int clen = pg_mblen(*str); + + if (clen < avail) + { + COPYCHAR(next, *str); + next += clen; + avail -= clen; + } + } + } + *str += pg_mblen(*str); + } + + *next = '\0'; + + return (state == PAE_INMASK); /* OK if we got a nonempty field */ +} + +/* + * Parses entry of an .affix file of MySpell or Hunspell format. + * + * An .affix file entry has the following format: + * - header + * + * - fields after header: + * + * + * str is the input line + * field values are returned to type etc, which must be buffers of size BUFSIZ. + * + * Returns number of fields found; any omitted fields are set to empty strings. 
+ */ +static int +parse_ooaffentry(char *str, char *type, char *flag, char *find, + char *repl, char *mask) +{ + int state = PAE_WAIT_TYPE; + int fields_read = 0; + bool valid = false; + + *type = *flag = *find = *repl = *mask = '\0'; + + while (*str) + { + switch (state) + { + case PAE_WAIT_TYPE: + valid = get_nextfield(&str, type); + state = PAE_WAIT_FLAG; + break; + case PAE_WAIT_FLAG: + valid = get_nextfield(&str, flag); + state = PAE_WAIT_FIND; + break; + case PAE_WAIT_FIND: + valid = get_nextfield(&str, find); + state = PAE_WAIT_REPL; + break; + case PAE_WAIT_REPL: + valid = get_nextfield(&str, repl); + state = PAE_WAIT_MASK; + break; + case PAE_WAIT_MASK: + valid = get_nextfield(&str, mask); + state = -1; /* force loop exit */ + break; + default: + elog(ERROR, "unrecognized state in parse_ooaffentry: %d", + state); + break; + } + if (valid) + fields_read++; + else + break; /* early EOL */ + if (state < 0) + break; /* got all fields */ + } + + return fields_read; +} + +/* + * Parses entry of an .affix file of Ispell format + * + * An .affix file entry has the following format: + * > [-,] + */ +static bool +parse_affentry(char *str, char *mask, char *find, char *repl) +{ + int state = PAE_WAIT_MASK; + char *pmask = mask, + *pfind = find, + *prepl = repl; + + *mask = *find = *repl = '\0'; + + while (*str) + { + if (state == PAE_WAIT_MASK) + { + if (t_iseq(str, '#')) + return false; + else if (!t_isspace(str)) + { + COPYCHAR(pmask, str); + pmask += pg_mblen(str); + state = PAE_INMASK; + } + } + else if (state == PAE_INMASK) + { + if (t_iseq(str, '>')) + { + *pmask = '\0'; + state = PAE_WAIT_FIND; + } + else if (!t_isspace(str)) + { + COPYCHAR(pmask, str); + pmask += pg_mblen(str); + } + } + else if (state == PAE_WAIT_FIND) + { + if (t_iseq(str, '-')) + { + state = PAE_INFIND; + } + else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ ) + { + COPYCHAR(prepl, str); + prepl += pg_mblen(str); + state = PAE_INREPL; + } + else if (!t_isspace(str)) + 
ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("syntax error"))); + } + else if (state == PAE_INFIND) + { + if (t_iseq(str, ',')) + { + *pfind = '\0'; + state = PAE_WAIT_REPL; + } + else if (t_isalpha(str)) + { + COPYCHAR(pfind, str); + pfind += pg_mblen(str); + } + else if (!t_isspace(str)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("syntax error"))); + } + else if (state == PAE_WAIT_REPL) + { + if (t_iseq(str, '-')) + { + break; /* void repl */ + } + else if (t_isalpha(str)) + { + COPYCHAR(prepl, str); + prepl += pg_mblen(str); + state = PAE_INREPL; + } + else if (!t_isspace(str)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("syntax error"))); + } + else if (state == PAE_INREPL) + { + if (t_iseq(str, '#')) + { + *prepl = '\0'; + break; + } + else if (t_isalpha(str)) + { + COPYCHAR(prepl, str); + prepl += pg_mblen(str); + } + else if (!t_isspace(str)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("syntax error"))); + } + else + elog(ERROR, "unrecognized state in parse_affentry: %d", state); + + str += pg_mblen(str); + } + + *pmask = *pfind = *prepl = '\0'; + + return (*mask && (*find || *repl)); +} + +/* + * Sets a Hunspell options depending on flag type. + */ +static void +setCompoundAffixFlagValue(IspellDict *Conf, CompoundAffixFlag *entry, + char *s, uint32 val) +{ + if (Conf->flagMode == FM_NUM) + { + char *next; + int i; + + i = strtol(s, &next, 10); + if (s == next || errno == ERANGE) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid affix flag \"%s\"", s))); + if (i < 0 || i > FLAGNUM_MAXSIZE) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("affix flag \"%s\" is out of range", s))); + + entry->flag.i = i; + } + else + entry->flag.s = cpstrdup(Conf, s); + + entry->flagMode = Conf->flagMode; + entry->value = val; +} + +/* + * Sets up a correspondence for the affix parameter with the affix flag. + * + * Conf: current dictionary. 
+ * s: affix flag in string. + * val: affix parameter. + */ +static void +addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val) +{ + CompoundAffixFlag *newValue; + char sbuf[BUFSIZ]; + char *sflag; + int clen; + + while (*s && t_isspace(s)) + s += pg_mblen(s); + + if (!*s) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("syntax error"))); + + /* Get flag without \n */ + sflag = sbuf; + while (*s && !t_isspace(s) && *s != '\n') + { + clen = pg_mblen(s); + COPYCHAR(sflag, s); + sflag += clen; + s += clen; + } + *sflag = '\0'; + + /* Resize array or allocate memory for array CompoundAffixFlag */ + if (Conf->nCompoundAffixFlag >= Conf->mCompoundAffixFlag) + { + if (Conf->mCompoundAffixFlag) + { + Conf->mCompoundAffixFlag *= 2; + Conf->CompoundAffixFlags = (CompoundAffixFlag *) + repalloc((void *) Conf->CompoundAffixFlags, + Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag)); + } + else + { + Conf->mCompoundAffixFlag = 10; + Conf->CompoundAffixFlags = (CompoundAffixFlag *) + tmpalloc(Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag)); + } + } + + newValue = Conf->CompoundAffixFlags + Conf->nCompoundAffixFlag; + + setCompoundAffixFlagValue(Conf, newValue, sbuf, val); + + Conf->usecompound = true; + Conf->nCompoundAffixFlag++; +} + +/* + * Returns a set of affix parameters which correspondence to the set of affix + * flags s. 
+ */ +static int +getCompoundAffixFlagValue(IspellDict *Conf, char *s) +{ + uint32 flag = 0; + CompoundAffixFlag *found, + key; + char sflag[BUFSIZ]; + char *flagcur; + + if (Conf->nCompoundAffixFlag == 0) + return 0; + + flagcur = s; + while (*flagcur) + { + getNextFlagFromString(Conf, &flagcur, sflag); + setCompoundAffixFlagValue(Conf, &key, sflag, 0); + + found = (CompoundAffixFlag *) + bsearch(&key, (void *) Conf->CompoundAffixFlags, + Conf->nCompoundAffixFlag, sizeof(CompoundAffixFlag), + cmpcmdflag); + if (found != NULL) + flag |= found->value; + } + + return flag; +} + +/* + * Returns a flag set using the s parameter. + * + * If Conf->useFlagAliases is true then the s parameter is index of the + * Conf->AffixData array and function returns its entry. + * Else function returns the s parameter. + */ +static char * +getAffixFlagSet(IspellDict *Conf, char *s) +{ + if (Conf->useFlagAliases && *s != '\0') + { + int curaffix; + char *end; + + curaffix = strtol(s, &end, 10); + if (s == end || errno == ERANGE) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid affix alias \"%s\"", s))); + + if (curaffix > 0 && curaffix < Conf->nAffixData) + + /* + * Do not subtract 1 from curaffix because empty string was added + * in NIImportOOAffixes + */ + return Conf->AffixData[curaffix]; + else if (curaffix > Conf->nAffixData) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid affix alias \"%s\"", s))); + return VoidString; + } + else + return s; +} + +/* + * Import an affix file that follows MySpell or Hunspell format. + * + * Conf: current dictionary. + * filename: path to the .affix file. 
 */
static void
NIImportOOAffixes(IspellDict *Conf, const char *filename)
{
    char        type[BUFSIZ],
               *ptype = NULL;
    char        sflag[BUFSIZ];
    char        mask[BUFSIZ],
               *pmask;
    char        find[BUFSIZ],
               *pfind;
    char        repl[BUFSIZ],
               *prepl;
    bool        isSuffix = false;
    int         naffix = 0,
                curaffix = 0;
    int         sflaglen = 0;
    char        flagflags = 0;
    tsearch_readline_state trst;
    char       *recoded;

    /* read file to find any flag */
    Conf->usecompound = false;
    Conf->useFlagAliases = false;
    Conf->flagMode = FM_CHAR;

    if (!tsearch_readline_begin(&trst, filename))
        ereport(ERROR,
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
                 errmsg("could not open affix file \"%s\": %m",
                        filename)));

    /*
     * First pass: collect COMPOUND* options and the FLAG mode, which must be
     * known before any PFX/SFX entry can be interpreted.
     */
    while ((recoded = tsearch_readline(&trst)) != NULL)
    {
        if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
        {
            pfree(recoded);
            continue;
        }

        if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
            addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
                                      FF_COMPOUNDFLAG);
        else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
            addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
                                      FF_COMPOUNDBEGIN);
        else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
            addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
                                      FF_COMPOUNDLAST);
        /* COMPOUNDLAST and COMPOUNDEND are synonyms */
        else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
            addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
                                      FF_COMPOUNDLAST);
        else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
            addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
                                      FF_COMPOUNDMIDDLE);
        else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
            addCompoundAffixFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
                                      FF_COMPOUNDONLY);
        else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
            addCompoundAffixFlagValue(Conf,
                                      recoded + strlen("COMPOUNDPERMITFLAG"),
                                      FF_COMPOUNDPERMITFLAG);
        else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
            addCompoundAffixFlagValue(Conf,
                                      recoded + strlen("COMPOUNDFORBIDFLAG"),
                                      FF_COMPOUNDFORBIDFLAG);
        else if (STRNCMP(recoded, "FLAG") == 0)
        {
            char       *s = recoded + strlen("FLAG");

            while (*s && t_isspace(s))
                s += pg_mblen(s);

            if (*s)
            {
                if (STRNCMP(s, "long") == 0)
                    Conf->flagMode = FM_LONG;
                else if (STRNCMP(s, "num") == 0)
                    Conf->flagMode = FM_NUM;
                else if (STRNCMP(s, "default") != 0)
                    ereport(ERROR,
                            (errcode(ERRCODE_CONFIG_FILE_ERROR),
                             errmsg("Ispell dictionary supports only "
                                    "\"default\", \"long\", "
                                    "and \"num\" flag values")));
            }
        }

        pfree(recoded);
    }
    tsearch_readline_end(&trst);

    /* Sort the compound flags so getCompoundAffixFlagValue can bsearch */
    if (Conf->nCompoundAffixFlag > 1)
        qsort((void *) Conf->CompoundAffixFlags, Conf->nCompoundAffixFlag,
              sizeof(CompoundAffixFlag), cmpcmdflag);

    /* Second pass: parse AF aliases and the PFX/SFX rules themselves */
    if (!tsearch_readline_begin(&trst, filename))
        ereport(ERROR,
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
                 errmsg("could not open affix file \"%s\": %m",
                        filename)));

    while ((recoded = tsearch_readline(&trst)) != NULL)
    {
        int         fields_read;

        if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
            goto nextline;

        fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);

        if (ptype)
            pfree(ptype);
        ptype = lowerstr_ctx(Conf, type);

        /* First try to parse AF parameter (alias compression) */
        if (STRNCMP(ptype, "af") == 0)
        {
            /* First line is the number of aliases */
            if (!Conf->useFlagAliases)
            {
                Conf->useFlagAliases = true;
                naffix = atoi(sflag);
                if (naffix <= 0)
                    ereport(ERROR,
                            (errcode(ERRCODE_CONFIG_FILE_ERROR),
                             errmsg("invalid number of flag vector aliases")));

                /* Also reserve place for empty flag set */
                naffix++;

                Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
                Conf->lenAffixData = Conf->nAffixData = naffix;

                /* Add empty flag set into AffixData */
                Conf->AffixData[curaffix] = VoidString;
                curaffix++;
            }
            /* Other lines are aliases */
            else
            {
                if (curaffix < naffix)
                {
                    Conf->AffixData[curaffix] = cpstrdup(Conf, sflag);
                    curaffix++;
                }
                else
                    ereport(ERROR,
                            (errcode(ERRCODE_CONFIG_FILE_ERROR),
                             errmsg("number of aliases exceeds specified number %d",
                                    naffix - 1)));
            }
            goto nextline;
        }
        /* Else try to parse prefixes and suffixes */
        if (fields_read < 4 ||
            (STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
            goto nextline;

        /* Flag length must match the declared flag mode */
        sflaglen = strlen(sflag);
        if (sflaglen == 0
            || (sflaglen > 1 && Conf->flagMode == FM_CHAR)
            || (sflaglen > 2 && Conf->flagMode == FM_LONG))
            goto nextline;

        /*--------
         * Affix header. For example:
         * SFX \ N 1
         *--------
         */
        if (fields_read == 4)
        {
            isSuffix = (STRNCMP(ptype, "sfx") == 0);
            if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
                flagflags = FF_CROSSPRODUCT;
            else
                flagflags = 0;
        }
        /*--------
         * Affix fields. For example:
         * SFX \   0	Y/L [^Y]
         *--------
         */
        else
        {
            char       *ptr;
            int         aflg = 0;

            /* Get flags after '/' (flags are case sensitive) */
            if ((ptr = strchr(repl, '/')) != NULL)
                aflg |= getCompoundAffixFlagValue(Conf,
                                                  getAffixFlagSet(Conf,
                                                                  ptr + 1));
            /* Get lowercased version of string before '/' */
            prepl = lowerstr_ctx(Conf, repl);
            if ((ptr = strchr(prepl, '/')) != NULL)
                *ptr = '\0';
            pfind = lowerstr_ctx(Conf, find);
            pmask = lowerstr_ctx(Conf, mask);
            /* "0" is Hunspell notation for an empty find/repl string */
            if (t_iseq(find, '0'))
                *pfind = '\0';
            if (t_iseq(repl, '0'))
                *prepl = '\0';

            NIAddAffix(Conf, sflag, flagflags | aflg, pmask, pfind, prepl,
                       isSuffix ? FF_SUFFIX : FF_PREFIX);
            pfree(prepl);
            pfree(pfind);
            pfree(pmask);
        }

nextline:
        pfree(recoded);
    }

    tsearch_readline_end(&trst);
    if (ptype)
        pfree(ptype);
}

/*
 * import affixes
 *
 * Note caller must already have applied get_tsearch_config_filename
 *
 * This function is responsible for parsing ispell ("old format") affix files.
 * If we realize that the file contains new-format commands, we pass off the
 * work to NIImportOOAffixes(), which will re-read the whole file.
 */
void
NIImportAffixes(IspellDict *Conf, const char *filename)
{
    char       *pstr = NULL;
    char        flag[BUFSIZ];
    char        mask[BUFSIZ];
    char        find[BUFSIZ];
    char        repl[BUFSIZ];
    char       *s;
    bool        suffixes = false;
    bool        prefixes = false;
    char        flagflags = 0;
    tsearch_readline_state trst;
    bool        oldformat = false;
    char       *recoded = NULL;

    if (!tsearch_readline_begin(&trst, filename))
        ereport(ERROR,
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
                 errmsg("could not open affix file \"%s\": %m",
                        filename)));

    Conf->usecompound = false;
    Conf->useFlagAliases = false;
    Conf->flagMode = FM_CHAR;

    while ((recoded = tsearch_readline(&trst)) != NULL)
    {
        /* Keep a lowercased copy for keyword matching */
        pstr = lowerstr(recoded);

        /* Skip comments and empty lines */
        if (*pstr == '#' || *pstr == '\n')
            goto nextline;

        if (STRNCMP(pstr, "compoundwords") == 0)
        {
            /* Find case-insensitive L flag in non-lowercased string */
            s = findchar2(recoded, 'l', 'L');
            if (s)
            {
                /* Skip the rest of the token, then whitespace after it */
                while (*s && !t_isspace(s))
                    s += pg_mblen(s);
                while (*s && t_isspace(s))
                    s += pg_mblen(s);

                if (*s && pg_mblen(s) == 1)
                {
                    addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG);
                    Conf->usecompound = true;
                }
                oldformat = true;
                goto nextline;
            }
        }
        if (STRNCMP(pstr, "suffixes") == 0)
        {
            suffixes = true;
            prefixes = false;
            oldformat = true;
            goto nextline;
        }
        if (STRNCMP(pstr, "prefixes") == 0)
        {
            suffixes = false;
            prefixes = true;
            oldformat = true;
            goto nextline;
        }
        if (STRNCMP(pstr, "flag") == 0)
        {
            s = recoded + 4;    /* we need non-lowercased string */
            flagflags = 0;

            while (*s && t_isspace(s))
                s += pg_mblen(s);

            /* '*' means cross-product; '~' means compound-only */
            if (*s == '*')
            {
                flagflags |= FF_CROSSPRODUCT;
                s++;
            }
            else if (*s == '~')
            {
                flagflags |= FF_COMPOUNDONLY;
                s++;
            }

            if (*s == '\\')
                s++;

            /*
             * An old-format flag is a single ASCII character; we expect it to
             * be followed by EOL, whitespace, or ':'.  Otherwise this is a
             * new-format flag command.
             */
            if (*s && pg_mblen(s) == 1)
            {
                COPYCHAR(flag, s);
                flag[1] = '\0';

                s++;
                if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' ||
                    t_isspace(s))
                {
                    oldformat = true;
                    goto nextline;
                }
            }
            goto isnewformat;
        }
        /* These keywords only appear in MySpell/Hunspell files */
        if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 ||
            STRNCMP(recoded, "COMPOUNDMIN") == 0 ||
            STRNCMP(recoded, "PFX") == 0 ||
            STRNCMP(recoded, "SFX") == 0)
            goto isnewformat;

        if ((!suffixes) && (!prefixes))
            goto nextline;

        if (!parse_affentry(pstr, mask, find, repl))
            goto nextline;

        NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);

nextline:
        pfree(recoded);
        pfree(pstr);
    }
    tsearch_readline_end(&trst);
    return;

isnewformat:
    /* Mixing old- and new-style commands in one file is an error */
    if (oldformat)
        ereport(ERROR,
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
                 errmsg("affix file contains both old-style and new-style commands")));
    tsearch_readline_end(&trst);

    NIImportOOAffixes(Conf, filename);
}

/*
 * Merges two affix flag sets and stores a new affix flag set into
 * Conf->AffixData.
 *
 * Returns index of a new affix flag set.
+ */ +static int +MergeAffix(IspellDict *Conf, int a1, int a2) +{ + char **ptr; + + Assert(a1 < Conf->nAffixData && a2 < Conf->nAffixData); + + /* Do not merge affix flags if one of affix flags is empty */ + if (*Conf->AffixData[a1] == '\0') + return a2; + else if (*Conf->AffixData[a2] == '\0') + return a1; + + while (Conf->nAffixData + 1 >= Conf->lenAffixData) + { + Conf->lenAffixData *= 2; + Conf->AffixData = (char **) repalloc(Conf->AffixData, + sizeof(char *) * Conf->lenAffixData); + } + + ptr = Conf->AffixData + Conf->nAffixData; + if (Conf->flagMode == FM_NUM) + { + *ptr = cpalloc(strlen(Conf->AffixData[a1]) + + strlen(Conf->AffixData[a2]) + + 1 /* comma */ + 1 /* \0 */ ); + sprintf(*ptr, "%s,%s", Conf->AffixData[a1], Conf->AffixData[a2]); + } + else + { + *ptr = cpalloc(strlen(Conf->AffixData[a1]) + + strlen(Conf->AffixData[a2]) + + 1 /* \0 */ ); + sprintf(*ptr, "%s%s", Conf->AffixData[a1], Conf->AffixData[a2]); + } + ptr++; + *ptr = NULL; + Conf->nAffixData++; + + return Conf->nAffixData - 1; +} + +/* + * Returns a set of affix parameters which correspondence to the set of affix + * flags with the given index. + */ +static uint32 +makeCompoundFlags(IspellDict *Conf, int affix) +{ + Assert(affix < Conf->nAffixData); + + return (getCompoundAffixFlagValue(Conf, Conf->AffixData[affix]) & + FF_COMPOUNDFLAGMASK); +} + +/* + * Makes a prefix tree for the given level. + * + * Conf: current dictionary. + * low: lower index of the Conf->Spell array. + * high: upper index of the Conf->Spell array. + * level: current prefix tree level. 
 */
static SPNode *
mkSPNode(IspellDict *Conf, int low, int high, int level)
{
    int         i;
    int         nchar = 0;
    char        lastchar = '\0';
    SPNode     *rs;
    SPNodeData *data;
    int         lownew = low;

    /*
     * Count distinct characters at this level; Spell[] is sorted, so equal
     * characters are adjacent and a simple last-seen comparison suffices.
     */
    for (i = low; i < high; i++)
        if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level])
        {
            nchar++;
            lastchar = Conf->Spell[i]->word[level];
        }

    if (!nchar)
        return NULL;

    rs = (SPNode *) cpalloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
    rs->length = nchar;
    data = rs->data;

    lastchar = '\0';
    for (i = low; i < high; i++)
        if (Conf->Spell[i]->p.d.len > level)
        {
            if (lastchar != Conf->Spell[i]->word[level])
            {
                if (lastchar)
                {
                    /* Next level of the prefix tree */
                    data->node = mkSPNode(Conf, lownew, i, level + 1);
                    lownew = i;
                    data++;
                }
                lastchar = Conf->Spell[i]->word[level];
            }
            data->val = ((uint8 *) (Conf->Spell[i]->word))[level];
            /* Word ends exactly here: record it and its affix flags */
            if (Conf->Spell[i]->p.d.len == level + 1)
            {
                bool        clearCompoundOnly = false;

                if (data->isword && data->affix != Conf->Spell[i]->p.d.affix)
                {
                    /*
                     * MergeAffix called a few times. If one of word is
                     * allowed to be in compound word and another isn't, then
                     * clear FF_COMPOUNDONLY flag.
                     */

                    clearCompoundOnly = (FF_COMPOUNDONLY & data->compoundflag
                                         & makeCompoundFlags(Conf, Conf->Spell[i]->p.d.affix))
                        ? false : true;
                    data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix);
                }
                else
                    data->affix = Conf->Spell[i]->p.d.affix;
                data->isword = 1;

                data->compoundflag = makeCompoundFlags(Conf, data->affix);

                /* FF_COMPOUNDONLY implies FF_COMPOUNDFLAG */
                if ((data->compoundflag & FF_COMPOUNDONLY) &&
                    (data->compoundflag & FF_COMPOUNDFLAG) == 0)
                    data->compoundflag |= FF_COMPOUNDFLAG;

                if (clearCompoundOnly)
                    data->compoundflag &= ~FF_COMPOUNDONLY;
            }
        }

    /* Next level of the prefix tree */
    data->node = mkSPNode(Conf, lownew, high, level + 1);

    return rs;
}

/*
 * Builds the Conf->Dictionary tree and AffixData from the imported dictionary
 * and affixes.
+ */ +void +NISortDictionary(IspellDict *Conf) +{ + int i; + int naffix; + int curaffix; + + /* compress affixes */ + + /* + * If we use flag aliases then we need to use Conf->AffixData filled in + * the NIImportOOAffixes(). + */ + if (Conf->useFlagAliases) + { + for (i = 0; i < Conf->nspell; i++) + { + char *end; + + if (*Conf->Spell[i]->p.flag != '\0') + { + curaffix = strtol(Conf->Spell[i]->p.flag, &end, 10); + if (Conf->Spell[i]->p.flag == end || errno == ERANGE) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid affix alias \"%s\"", + Conf->Spell[i]->p.flag))); + if (curaffix < 0 || curaffix >= Conf->nAffixData) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid affix alias \"%s\"", + Conf->Spell[i]->p.flag))); + if (*end != '\0' && !t_isdigit(end) && !t_isspace(end)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid affix alias \"%s\"", + Conf->Spell[i]->p.flag))); + } + else + { + /* + * If Conf->Spell[i]->p.flag is empty, then get empty value of + * Conf->AffixData (0 index). + */ + curaffix = 0; + } + + Conf->Spell[i]->p.d.affix = curaffix; + Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word); + } + } + /* Otherwise fill Conf->AffixData here */ + else + { + /* Count the number of different flags used in the dictionary */ + qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), + cmpspellaffix); + + naffix = 0; + for (i = 0; i < Conf->nspell; i++) + { + if (i == 0 || + strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag) != 0) + naffix++; + } + + /* + * Fill in Conf->AffixData with the affixes that were used in the + * dictionary. Replace textual flag-field of Conf->Spell entries with + * indexes into Conf->AffixData array. 
+ */ + Conf->AffixData = (char **) palloc0(naffix * sizeof(char *)); + + curaffix = -1; + for (i = 0; i < Conf->nspell; i++) + { + if (i == 0 || + strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix]) != 0) + { + curaffix++; + Assert(curaffix < naffix); + Conf->AffixData[curaffix] = cpstrdup(Conf, + Conf->Spell[i]->p.flag); + } + + Conf->Spell[i]->p.d.affix = curaffix; + Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word); + } + + Conf->lenAffixData = Conf->nAffixData = naffix; + } + + /* Start build a prefix tree */ + qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell); + Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0); +} + +/* + * Makes a prefix tree for the given level using the repl string of an affix + * rule. Affixes with empty replace string do not include in the prefix tree. + * This affixes are included by mkVoidAffix(). + * + * Conf: current dictionary. + * low: lower index of the Conf->Affix array. + * high: upper index of the Conf->Affix array. + * level: current prefix tree level. + * type: FF_SUFFIX or FF_PREFIX. 
 */
static AffixNode *
mkANode(IspellDict *Conf, int low, int high, int level, int type)
{
    int         i;
    int         nchar = 0;
    uint8       lastchar = '\0';
    AffixNode  *rs;
    AffixNodeData *data;
    int         lownew = low;
    int         naff;
    AFFIX     **aff;

    /*
     * Count distinct characters at this level; Affix[] is sorted, so a
     * last-seen comparison is sufficient.  GETCHAR reads repl from the
     * appropriate end depending on prefix/suffix type.
     */
    for (i = low; i < high; i++)
        if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
        {
            nchar++;
            lastchar = GETCHAR(Conf->Affix + i, level, type);
        }

    if (!nchar)
        return NULL;

    /* Scratch list of affixes that terminate at this node */
    aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1));
    naff = 0;

    rs = (AffixNode *) cpalloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
    rs->length = nchar;
    data = rs->data;

    lastchar = '\0';
    for (i = low; i < high; i++)
        if (Conf->Affix[i].replen > level)
        {
            if (lastchar != GETCHAR(Conf->Affix + i, level, type))
            {
                if (lastchar)
                {
                    /* Next level of the prefix tree */
                    data->node = mkANode(Conf, lownew, i, level + 1, type);
                    if (naff)
                    {
                        /* Flush collected affixes into the finished node */
                        data->naff = naff;
                        data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
                        memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
                        naff = 0;
                    }
                    data++;
                    lownew = i;
                }
                lastchar = GETCHAR(Conf->Affix + i, level, type);
            }
            data->val = GETCHAR(Conf->Affix + i, level, type);
            if (Conf->Affix[i].replen == level + 1)
            {                   /* affix stopped */
                aff[naff++] = Conf->Affix + i;
            }
        }

    /* Next level of the prefix tree */
    data->node = mkANode(Conf, lownew, high, level + 1, type);
    if (naff)
    {
        data->naff = naff;
        data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
        memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
        naff = 0;
    }

    pfree(aff);

    return rs;
}

/*
 * Makes the root void node in the prefix tree. The root void node is created
 * for affixes which have empty replace string ("repl" field).
 */
static void
mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
{
    int         i,
                cnt = 0;
    /* Suffixes live in [startsuffix, naffixes); prefixes in [0, startsuffix) */
    int         start = (issuffix) ? startsuffix : 0;
    int         end = (issuffix) ? Conf->naffixes : startsuffix;
    AffixNode  *Affix = (AffixNode *) palloc0(ANHRDSZ + sizeof(AffixNodeData));

    Affix->length = 1;
    Affix->isvoid = 1;

    /* Push the void node on top of the existing tree */
    if (issuffix)
    {
        Affix->data->node = Conf->Suffix;
        Conf->Suffix = Affix;
    }
    else
    {
        Affix->data->node = Conf->Prefix;
        Conf->Prefix = Affix;
    }

    /* Count affixes with empty replace string */
    for (i = start; i < end; i++)
        if (Conf->Affix[i].replen == 0)
            cnt++;

    /* There is not affixes with empty replace string */
    if (cnt == 0)
        return;

    Affix->data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * cnt);
    Affix->data->naff = (uint32) cnt;

    cnt = 0;
    for (i = start; i < end; i++)
        if (Conf->Affix[i].replen == 0)
        {
            Affix->data->aff[cnt] = Conf->Affix + i;
            cnt++;
        }
}

/*
 * Checks if the affixflag is used by dictionary. Conf->AffixData does not
 * contain affixflag if this flag is not used actually by the .dict file.
 *
 * Conf: current dictionary.
 * affixflag: affix flag.
 *
 * Returns true if the Conf->AffixData array contains affixflag, otherwise
 * returns false.
 */
static bool
isAffixInUse(IspellDict *Conf, char *affixflag)
{
    int         i;

    /* Linear scan over all flag sets referenced by dictionary words */
    for (i = 0; i < Conf->nAffixData; i++)
        if (IsAffixFlagInUse(Conf, i, affixflag))
            return true;

    return false;
}

/*
 * Builds Conf->Prefix and Conf->Suffix trees from the imported affixes.
+ */ +void +NISortAffixes(IspellDict *Conf) +{ + AFFIX *Affix; + size_t i; + CMPDAffix *ptr; + int firstsuffix = Conf->naffixes; + + if (Conf->naffixes == 0) + return; + + /* Store compound affixes in the Conf->CompoundAffix array */ + if (Conf->naffixes > 1) + qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix); + Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes); + ptr->affix = NULL; + + for (i = 0; i < Conf->naffixes; i++) + { + Affix = &(((AFFIX *) Conf->Affix)[i]); + if (Affix->type == FF_SUFFIX && i < firstsuffix) + firstsuffix = i; + + if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 && + isAffixInUse(Conf, Affix->flag)) + { + bool issuffix = (Affix->type == FF_SUFFIX); + + if (ptr == Conf->CompoundAffix || + issuffix != (ptr - 1)->issuffix || + strbncmp((const unsigned char *) (ptr - 1)->affix, + (const unsigned char *) Affix->repl, + (ptr - 1)->len)) + { + /* leave only unique and minimal suffixes */ + ptr->affix = Affix->repl; + ptr->len = Affix->replen; + ptr->issuffix = issuffix; + ptr++; + } + } + } + ptr->affix = NULL; + Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1)); + + /* Start build a prefix tree */ + Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX); + Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX); + mkVoidAffix(Conf, true, firstsuffix); + mkVoidAffix(Conf, false, firstsuffix); +} + +static AffixNodeData * +FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type) +{ + AffixNodeData *StopLow, + *StopHigh, + *StopMiddle; + uint8 symbol; + + if (node->isvoid) + { /* search void affixes */ + if (node->data->naff) + return node->data; + node = node->data->node; + } + + while (node && *level < wrdlen) + { + StopLow = node->data; + StopHigh = node->data + node->length; + while (StopLow < StopHigh) + { + StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); + 
symbol = GETWCHAR(word, wrdlen, *level, type); + + if (StopMiddle->val == symbol) + { + (*level)++; + if (StopMiddle->naff) + return StopMiddle; + node = StopMiddle->node; + break; + } + else if (StopMiddle->val < symbol) + StopLow = StopMiddle + 1; + else + StopHigh = StopMiddle; + } + if (StopLow >= StopHigh) + break; + } + return NULL; +} + +static char * +CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen) +{ + /* + * Check compound allow flags + */ + + if (flagflags == 0) + { + if (Affix->flagflags & FF_COMPOUNDONLY) + return NULL; + } + else if (flagflags & FF_COMPOUNDBEGIN) + { + if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG) + return NULL; + if ((Affix->flagflags & FF_COMPOUNDBEGIN) == 0) + if (Affix->type == FF_SUFFIX) + return NULL; + } + else if (flagflags & FF_COMPOUNDMIDDLE) + { + if ((Affix->flagflags & FF_COMPOUNDMIDDLE) == 0 || + (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)) + return NULL; + } + else if (flagflags & FF_COMPOUNDLAST) + { + if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG) + return NULL; + if ((Affix->flagflags & FF_COMPOUNDLAST) == 0) + if (Affix->type == FF_PREFIX) + return NULL; + } + + /* + * make replace pattern of affix + */ + if (Affix->type == FF_SUFFIX) + { + strcpy(newword, word); + strcpy(newword + len - Affix->replen, Affix->find); + if (baselen) /* store length of non-changed part of word */ + *baselen = len - Affix->replen; + } + else + { + /* + * if prefix is an all non-changed part's length then all word + * contains only prefix and suffix, so out + */ + if (baselen && *baselen + strlen(Affix->find) <= Affix->replen) + return NULL; + strcpy(newword, Affix->find); + strcat(newword, word + Affix->replen); + } + + /* + * check resulting word + */ + if (Affix->issimple) + return newword; + else if (Affix->isregis) + { + if (RS_execute(&(Affix->reg.regis), newword)) + return newword; + } + else + { + pg_wchar *data; + size_t data_len; + int newword_len; + + /* Convert data string to 
wide characters */ + newword_len = strlen(newword); + data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar)); + data_len = pg_mb2wchar_with_len(newword, data, newword_len); + + if (pg_regexec(&(Affix->reg.pregex->regex), data, data_len, + 0, NULL, 0, NULL, 0) == REG_OKAY) + { + pfree(data); + return newword; + } + pfree(data); + } + + return NULL; +} + +static int +addToResult(char **forms, char **cur, char *word) +{ + if (cur - forms >= MAX_NORM - 1) + return 0; + if (forms == cur || strcmp(word, *(cur - 1)) != 0) + { + *cur = pstrdup(word); + *(cur + 1) = NULL; + return 1; + } + + return 0; +} + +static char ** +NormalizeSubWord(IspellDict *Conf, char *word, int flag) +{ + AffixNodeData *suffix = NULL, + *prefix = NULL; + int slevel = 0, + plevel = 0; + int wrdlen = strlen(word), + swrdlen; + char **forms; + char **cur; + char newword[2 * MAXNORMLEN] = ""; + char pnewword[2 * MAXNORMLEN] = ""; + AffixNode *snode = Conf->Suffix, + *pnode; + int i, + j; + + if (wrdlen > MAXNORMLEN) + return NULL; + cur = forms = (char **) palloc(MAX_NORM * sizeof(char *)); + *cur = NULL; + + + /* Check that the word itself is normal form */ + if (FindWord(Conf, word, VoidString, flag)) + { + *cur = pstrdup(word); + cur++; + *cur = NULL; + } + + /* Find all other NORMAL forms of the 'word' (check only prefix) */ + pnode = Conf->Prefix; + plevel = 0; + while (pnode) + { + prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX); + if (!prefix) + break; + for (j = 0; j < prefix->naff; j++) + { + if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL)) + { + /* prefix success */ + if (FindWord(Conf, newword, prefix->aff[j]->flag, flag)) + cur += addToResult(forms, cur, newword); + } + } + pnode = prefix->node; + } + + /* + * Find all other NORMAL forms of the 'word' (check suffix and then + * prefix) + */ + while (snode) + { + int baselen = 0; + + /* find possible suffix */ + suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX); + if (!suffix) + break; 
+ /* foreach suffix check affix */ + for (i = 0; i < suffix->naff; i++) + { + if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen)) + { + /* suffix success */ + if (FindWord(Conf, newword, suffix->aff[i]->flag, flag)) + cur += addToResult(forms, cur, newword); + + /* now we will look changed word with prefixes */ + pnode = Conf->Prefix; + plevel = 0; + swrdlen = strlen(newword); + while (pnode) + { + prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX); + if (!prefix) + break; + for (j = 0; j < prefix->naff; j++) + { + if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen)) + { + /* prefix success */ + char *ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ? + VoidString : prefix->aff[j]->flag; + + if (FindWord(Conf, pnewword, ff, flag)) + cur += addToResult(forms, cur, pnewword); + } + } + pnode = prefix->node; + } + } + } + + snode = suffix->node; + } + + if (cur == forms) + { + pfree(forms); + return NULL; + } + return forms; +} + +typedef struct SplitVar +{ + int nstem; + int lenstem; + char **stem; + struct SplitVar *next; +} SplitVar; + +static int +CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len, bool CheckInPlace) +{ + bool issuffix; + + /* in case CompoundAffix is null: */ + if (*ptr == NULL) + return -1; + + if (CheckInPlace) + { + while ((*ptr)->affix) + { + if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0) + { + len = (*ptr)->len; + issuffix = (*ptr)->issuffix; + (*ptr)++; + return (issuffix) ? len : 0; + } + (*ptr)++; + } + } + else + { + char *affbegin; + + while ((*ptr)->affix) + { + if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL) + { + len = (*ptr)->len + (affbegin - word); + issuffix = (*ptr)->issuffix; + (*ptr)++; + return (issuffix) ? 
				len : 0;
		}
			(*ptr)++;
		}
	}
	return -1;
}

/*
 * Make a copy of a SplitVar list node.
 *
 * If 's' is NULL, a fresh SplitVar with room for 16 stems is returned;
 * otherwise the stem array is copied, duplicating each stem string only
 * when makedup is nonzero (callers that retain ownership pass 0).
 */
static SplitVar *
CopyVar(SplitVar *s, int makedup)
{
	SplitVar   *v = (SplitVar *) palloc(sizeof(SplitVar));

	v->next = NULL;
	if (s)
	{
		int			i;

		/* clone the existing stem array, sharing or duplicating strings */
		v->lenstem = s->lenstem;
		v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
		v->nstem = s->nstem;
		for (i = 0; i < s->nstem; i++)
			v->stem[i] = (makedup) ? pstrdup(s->stem[i]) : s->stem[i];
	}
	else
	{
		/* start with a small empty array; AddStem grows it as needed */
		v->lenstem = 16;
		v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
		v->nstem = 0;
	}
	return v;
}

/*
 * Append one stem string to a SplitVar, doubling the array when full.
 * The pointer is stored as-is; ownership passes to the SplitVar.
 */
static void
AddStem(SplitVar *v, char *word)
{
	if (v->nstem >= v->lenstem)
	{
		v->lenstem *= 2;
		v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem);
	}

	v->stem[v->nstem] = word;
	v->nstem++;
}

static SplitVar *
SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int wordlen, int startpos, int minpos)
{
	SplitVar   *var = NULL;
	SPNodeData *StopLow,
			   *StopHigh,
			   *StopMiddle = NULL;
	SPNode	   *node = (snode) ? snode : Conf->Dictionary;
	int			level = (snode) ? minpos : startpos;	/* recursive
														 * minpos==level */
	int			lenaff;
	CMPDAffix  *caff;
	char	   *notprobed;
	int			compoundflag = 0;

	/* one flag byte per input position: 1 = still worth probing */
	notprobed = (char *) palloc(wordlen);
	memset(notprobed, 1, wordlen);
	var = CopyVar(orig, 1);

	while (level < wordlen)
	{
		/* find word with epenthetic or/and compound affix */
		caff = Conf->CompoundAffix;
		while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ?
true : false)) >= 0) + { + /* + * there is one of compound affixes, so check word for existings + */ + char buf[MAXNORMLEN]; + char **subres; + + lenaff = level - startpos + lenaff; + + if (!notprobed[startpos + lenaff - 1]) + continue; + + if (level + lenaff - 1 <= minpos) + continue; + + if (lenaff >= MAXNORMLEN) + continue; /* skip too big value */ + if (lenaff > 0) + memcpy(buf, word + startpos, lenaff); + buf[lenaff] = '\0'; + + if (level == 0) + compoundflag = FF_COMPOUNDBEGIN; + else if (level == wordlen - 1) + compoundflag = FF_COMPOUNDLAST; + else + compoundflag = FF_COMPOUNDMIDDLE; + subres = NormalizeSubWord(Conf, buf, compoundflag); + if (subres) + { + /* Yes, it was a word from dictionary */ + SplitVar *new = CopyVar(var, 0); + SplitVar *ptr = var; + char **sptr = subres; + + notprobed[startpos + lenaff - 1] = 0; + + while (*sptr) + { + AddStem(new, *sptr); + sptr++; + } + pfree(subres); + + while (ptr->next) + ptr = ptr->next; + ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff); + + pfree(new->stem); + pfree(new); + } + } + + if (!node) + break; + + StopLow = node->data; + StopHigh = node->data + node->length; + while (StopLow < StopHigh) + { + StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); + if (StopMiddle->val == ((uint8 *) (word))[level]) + break; + else if (StopMiddle->val < ((uint8 *) (word))[level]) + StopLow = StopMiddle + 1; + else + StopHigh = StopMiddle; + } + + if (StopLow < StopHigh) + { + if (startpos == 0) + compoundflag = FF_COMPOUNDBEGIN; + else if (level == wordlen - 1) + compoundflag = FF_COMPOUNDLAST; + else + compoundflag = FF_COMPOUNDMIDDLE; + + /* find infinitive */ + if (StopMiddle->isword && + (StopMiddle->compoundflag & compoundflag) && + notprobed[level]) + { + /* ok, we found full compoundallowed word */ + if (level > minpos) + { + /* and its length more than minimal */ + if (wordlen == level + 1) + { + /* well, it was last word */ + AddStem(var, pnstrdup(word + startpos, 
wordlen - startpos)); + pfree(notprobed); + return var; + } + else + { + /* then we will search more big word at the same point */ + SplitVar *ptr = var; + + while (ptr->next) + ptr = ptr->next; + ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level); + /* we can find next word */ + level++; + AddStem(var, pnstrdup(word + startpos, level - startpos)); + node = Conf->Dictionary; + startpos = level; + continue; + } + } + } + node = StopMiddle->node; + } + else + node = NULL; + level++; + } + + AddStem(var, pnstrdup(word + startpos, wordlen - startpos)); + pfree(notprobed); + return var; +} + +static void +addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant) +{ + if (*lres == NULL) + *lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme)); + + if (*lcur - *lres < MAX_NORM - 1) + { + (*lcur)->lexeme = word; + (*lcur)->flags = flags; + (*lcur)->nvariant = NVariant; + (*lcur)++; + (*lcur)->lexeme = NULL; + } +} + +TSLexeme * +NINormalizeWord(IspellDict *Conf, char *word) +{ + char **res; + TSLexeme *lcur = NULL, + *lres = NULL; + uint16 NVariant = 1; + + res = NormalizeSubWord(Conf, word, 0); + + if (res) + { + char **ptr = res; + + while (*ptr && (lcur - lres) < MAX_NORM) + { + addNorm(&lres, &lcur, *ptr, 0, NVariant++); + ptr++; + } + pfree(res); + } + + if (Conf->usecompound) + { + int wordlen = strlen(word); + SplitVar *ptr, + *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1); + int i; + + while (var) + { + if (var->nstem > 1) + { + char **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDLAST); + + if (subres) + { + char **subptr = subres; + + while (*subptr) + { + for (i = 0; i < var->nstem - 1; i++) + { + addNorm(&lres, &lcur, (subptr == subres) ? 
var->stem[i] : pstrdup(var->stem[i]), 0, NVariant); + } + + addNorm(&lres, &lcur, *subptr, 0, NVariant); + subptr++; + NVariant++; + } + + pfree(subres); + var->stem[0] = NULL; + pfree(var->stem[var->nstem - 1]); + } + } + + for (i = 0; i < var->nstem && var->stem[i]; i++) + pfree(var->stem[i]); + ptr = var->next; + pfree(var->stem); + pfree(var); + var = ptr; + } + } + + return lres; +} diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c new file mode 100644 index 0000000..f4ddfc0 --- /dev/null +++ b/src/backend/tsearch/to_tsany.c @@ -0,0 +1,724 @@ +/*------------------------------------------------------------------------- + * + * to_tsany.c + * to_ts* function definitions + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/to_tsany.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "common/jsonapi.h" +#include "tsearch/ts_cache.h" +#include "tsearch/ts_utils.h" +#include "utils/builtins.h" +#include "utils/jsonfuncs.h" + + +/* + * Opaque data structure, which is passed by parse_tsquery() to pushval_morph(). + */ +typedef struct MorphOpaque +{ + Oid cfg_id; + + /* + * Single tsquery morph could be parsed into multiple words. When these + * words reside in adjacent positions, they are connected using this + * operator. Usually, that is OP_PHRASE, which requires word positions of + * a complex morph to exactly match the tsvector. 
+ */ + int qoperator; +} MorphOpaque; + +typedef struct TSVectorBuildState +{ + ParsedText *prs; + Oid cfgId; +} TSVectorBuildState; + +static void add_to_tsvector(void *_state, char *elem_value, int elem_len); + + +Datum +get_current_ts_config(PG_FUNCTION_ARGS) +{ + PG_RETURN_OID(getTSCurrentConfig(true)); +} + +/* + * to_tsvector + */ +static int +compareWORD(const void *a, const void *b) +{ + int res; + + res = tsCompareString(((const ParsedWord *) a)->word, ((const ParsedWord *) a)->len, + ((const ParsedWord *) b)->word, ((const ParsedWord *) b)->len, + false); + + if (res == 0) + { + if (((const ParsedWord *) a)->pos.pos == ((const ParsedWord *) b)->pos.pos) + return 0; + + res = (((const ParsedWord *) a)->pos.pos > ((const ParsedWord *) b)->pos.pos) ? 1 : -1; + } + + return res; +} + +static int +uniqueWORD(ParsedWord *a, int32 l) +{ + ParsedWord *ptr, + *res; + int tmppos; + + if (l == 1) + { + tmppos = LIMITPOS(a->pos.pos); + a->alen = 2; + a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen); + a->pos.apos[0] = 1; + a->pos.apos[1] = tmppos; + return l; + } + + res = a; + ptr = a + 1; + + /* + * Sort words with its positions + */ + qsort((void *) a, l, sizeof(ParsedWord), compareWORD); + + /* + * Initialize first word and its first position + */ + tmppos = LIMITPOS(a->pos.pos); + a->alen = 2; + a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen); + a->pos.apos[0] = 1; + a->pos.apos[1] = tmppos; + + /* + * Summarize position information for each word + */ + while (ptr - a < l) + { + if (!(ptr->len == res->len && + strncmp(ptr->word, res->word, res->len) == 0)) + { + /* + * Got a new word, so put it in result + */ + res++; + res->len = ptr->len; + res->word = ptr->word; + tmppos = LIMITPOS(ptr->pos.pos); + res->alen = 2; + res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen); + res->pos.apos[0] = 1; + res->pos.apos[1] = tmppos; + } + else + { + /* + * The word already exists, so adjust position information. 
But + * before we should check size of position's array, max allowed + * value for position and uniqueness of position + */ + pfree(ptr->word); + if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 && + res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos)) + { + if (res->pos.apos[0] + 1 >= res->alen) + { + res->alen *= 2; + res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen); + } + if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos)) + { + res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos); + res->pos.apos[0]++; + } + } + } + ptr++; + } + + return res + 1 - a; +} + +/* + * make value of tsvector, given parsed text + * + * Note: frees prs->words and subsidiary data. + */ +TSVector +make_tsvector(ParsedText *prs) +{ + int i, + j, + lenstr = 0, + totallen; + TSVector in; + WordEntry *ptr; + char *str; + int stroff; + + /* Merge duplicate words */ + if (prs->curwords > 0) + prs->curwords = uniqueWORD(prs->words, prs->curwords); + + /* Determine space needed */ + for (i = 0; i < prs->curwords; i++) + { + lenstr += prs->words[i].len; + if (prs->words[i].alen) + { + lenstr = SHORTALIGN(lenstr); + lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos); + } + } + + if (lenstr > MAXSTRPOS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS))); + + totallen = CALCDATASIZE(prs->curwords, lenstr); + in = (TSVector) palloc0(totallen); + SET_VARSIZE(in, totallen); + in->size = prs->curwords; + + ptr = ARRPTR(in); + str = STRPTR(in); + stroff = 0; + for (i = 0; i < prs->curwords; i++) + { + ptr->len = prs->words[i].len; + ptr->pos = stroff; + memcpy(str + stroff, prs->words[i].word, prs->words[i].len); + stroff += prs->words[i].len; + pfree(prs->words[i].word); + if (prs->words[i].alen) + { + int k = prs->words[i].pos.apos[0]; + WordEntryPos *wptr; + 
			if (k > 0xFFFF)
				elog(ERROR, "positions array too long");

			/* write position count, then the position entries */
			ptr->haspos = 1;
			stroff = SHORTALIGN(stroff);
			*(uint16 *) (str + stroff) = (uint16) k;
			wptr = POSDATAPTR(in, ptr);
			for (j = 0; j < k; j++)
			{
				WEP_SETWEIGHT(wptr[j], 0);
				WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
			}
			stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
			pfree(prs->words[i].pos.apos);
		}
		else
			ptr->haspos = 0;
		ptr++;
	}

	if (prs->words)
		pfree(prs->words);

	return in;
}

/*
 * to_tsvector(config oid, text) --- parse text with the given configuration
 * and build a tsvector from the result.
 */
Datum
to_tsvector_byid(PG_FUNCTION_ARGS)
{
	Oid			cfgId = PG_GETARG_OID(0);
	text	   *in = PG_GETARG_TEXT_PP(1);
	ParsedText	prs;
	TSVector	out;

	prs.lenwords = VARSIZE_ANY_EXHDR(in) / 6;	/* just estimation of word's
												 * number */
	if (prs.lenwords < 2)
		prs.lenwords = 2;
	prs.curwords = 0;
	prs.pos = 0;
	prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);

	parsetext(cfgId, &prs, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));

	PG_FREE_IF_COPY(in, 1);

	out = make_tsvector(&prs);

	PG_RETURN_TSVECTOR(out);
}

/* to_tsvector(text) using the current default configuration. */
Datum
to_tsvector(PG_FUNCTION_ARGS)
{
	text	   *in = PG_GETARG_TEXT_PP(0);
	Oid			cfgId;

	cfgId = getTSCurrentConfig(true);
	PG_RETURN_DATUM(DirectFunctionCall2(to_tsvector_byid,
										ObjectIdGetDatum(cfgId),
										PointerGetDatum(in)));
}

/*
 * Worker function for jsonb(_string)_to_tsvector(_byid)
 *
 * Iterates over the jsonb values selected by 'flags' and feeds each one
 * through add_to_tsvector, then builds the final tsvector.
 */
static TSVector
jsonb_to_tsvector_worker(Oid cfgId, Jsonb *jb, uint32 flags)
{
	TSVectorBuildState state;
	ParsedText	prs;

	prs.words = NULL;			/* lazily allocated in add_to_tsvector */
	prs.curwords = 0;
	state.prs = &prs;
	state.cfgId = cfgId;

	iterate_jsonb_values(jb, flags, &state, add_to_tsvector);

	return make_tsvector(&prs);
}

/* jsonb string values -> tsvector, with explicit configuration OID */
Datum
jsonb_string_to_tsvector_byid(PG_FUNCTION_ARGS)
{
	Oid			cfgId = PG_GETARG_OID(0);
	Jsonb	   *jb = PG_GETARG_JSONB_P(1);
	TSVector	result;

	result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
	PG_FREE_IF_COPY(jb, 1);

	PG_RETURN_TSVECTOR(result);
}

/* jsonb string values -> tsvector, using the current default configuration */
Datum
jsonb_string_to_tsvector(PG_FUNCTION_ARGS)
{
	Jsonb	   *jb =
PG_GETARG_JSONB_P(0); + Oid cfgId; + TSVector result; + + cfgId = getTSCurrentConfig(true); + result = jsonb_to_tsvector_worker(cfgId, jb, jtiString); + PG_FREE_IF_COPY(jb, 0); + + PG_RETURN_TSVECTOR(result); +} + +Datum +jsonb_to_tsvector_byid(PG_FUNCTION_ARGS) +{ + Oid cfgId = PG_GETARG_OID(0); + Jsonb *jb = PG_GETARG_JSONB_P(1); + Jsonb *jbFlags = PG_GETARG_JSONB_P(2); + TSVector result; + uint32 flags = parse_jsonb_index_flags(jbFlags); + + result = jsonb_to_tsvector_worker(cfgId, jb, flags); + PG_FREE_IF_COPY(jb, 1); + PG_FREE_IF_COPY(jbFlags, 2); + + PG_RETURN_TSVECTOR(result); +} + +Datum +jsonb_to_tsvector(PG_FUNCTION_ARGS) +{ + Jsonb *jb = PG_GETARG_JSONB_P(0); + Jsonb *jbFlags = PG_GETARG_JSONB_P(1); + Oid cfgId; + TSVector result; + uint32 flags = parse_jsonb_index_flags(jbFlags); + + cfgId = getTSCurrentConfig(true); + result = jsonb_to_tsvector_worker(cfgId, jb, flags); + PG_FREE_IF_COPY(jb, 0); + PG_FREE_IF_COPY(jbFlags, 1); + + PG_RETURN_TSVECTOR(result); +} + +/* + * Worker function for json(_string)_to_tsvector(_byid) + */ +static TSVector +json_to_tsvector_worker(Oid cfgId, text *json, uint32 flags) +{ + TSVectorBuildState state; + ParsedText prs; + + prs.words = NULL; + prs.curwords = 0; + state.prs = &prs; + state.cfgId = cfgId; + + iterate_json_values(json, flags, &state, add_to_tsvector); + + return make_tsvector(&prs); +} + +Datum +json_string_to_tsvector_byid(PG_FUNCTION_ARGS) +{ + Oid cfgId = PG_GETARG_OID(0); + text *json = PG_GETARG_TEXT_P(1); + TSVector result; + + result = json_to_tsvector_worker(cfgId, json, jtiString); + PG_FREE_IF_COPY(json, 1); + + PG_RETURN_TSVECTOR(result); +} + +Datum +json_string_to_tsvector(PG_FUNCTION_ARGS) +{ + text *json = PG_GETARG_TEXT_P(0); + Oid cfgId; + TSVector result; + + cfgId = getTSCurrentConfig(true); + result = json_to_tsvector_worker(cfgId, json, jtiString); + PG_FREE_IF_COPY(json, 0); + + PG_RETURN_TSVECTOR(result); +} + +Datum +json_to_tsvector_byid(PG_FUNCTION_ARGS) +{ + Oid cfgId = 
	PG_GETARG_OID(0);
	text	   *json = PG_GETARG_TEXT_P(1);
	Jsonb	   *jbFlags = PG_GETARG_JSONB_P(2);
	TSVector	result;
	uint32		flags = parse_jsonb_index_flags(jbFlags);

	result = json_to_tsvector_worker(cfgId, json, flags);
	PG_FREE_IF_COPY(json, 1);
	PG_FREE_IF_COPY(jbFlags, 2);

	PG_RETURN_TSVECTOR(result);
}

/* json_to_tsvector(json, flags) using the current default configuration */
Datum
json_to_tsvector(PG_FUNCTION_ARGS)
{
	text	   *json = PG_GETARG_TEXT_P(0);
	Jsonb	   *jbFlags = PG_GETARG_JSONB_P(1);
	Oid			cfgId;
	TSVector	result;
	uint32		flags = parse_jsonb_index_flags(jbFlags);

	cfgId = getTSCurrentConfig(true);
	result = json_to_tsvector_worker(cfgId, json, flags);
	PG_FREE_IF_COPY(json, 0);
	PG_FREE_IF_COPY(jbFlags, 1);

	PG_RETURN_TSVECTOR(result);
}

/*
 * Parse lexemes in an element of a json(b) value, add to TSVectorBuildState.
 */
static void
add_to_tsvector(void *_state, char *elem_value, int elem_len)
{
	TSVectorBuildState *state = (TSVectorBuildState *) _state;
	ParsedText *prs = state->prs;
	int32		prevwords;

	if (prs->words == NULL)
	{
		/*
		 * First time through: initialize words array to a reasonable size.
		 * (parsetext() will realloc it bigger as needed.)
		 */
		prs->lenwords = 16;
		prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
		prs->curwords = 0;
		prs->pos = 0;
	}

	prevwords = prs->curwords;

	parsetext(state->cfgId, prs, elem_value, elem_len);

	/*
	 * If we extracted any words from this JSON element, advance pos to create
	 * an artificial break between elements.  This is because we don't want
	 * phrase searches to think that the last word in this element is adjacent
	 * to the first word in the next one.
	 */
	if (prs->curwords > prevwords)
		prs->pos += 1;
}


/*
 * to_tsquery
 */


/*
 * This function is used for morph parsing.
 *
 * The value is passed to parsetext which will call the right dictionary to
 * lexize the word. If it turns out to be a stopword, we push a QI_VALSTOP
 * to the stack.
 *
 * All words belonging to the same variant are pushed as an ANDed list,
 * and different variants are ORed together.
 */
static void
pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
{
	int32		count = 0;
	ParsedText	prs;
	uint32		variant,
				pos = 0,		/* position of the previous word pushed */
				cntvar = 0,		/* number of variants seen at this position */
				cntpos = 0,		/* number of positions pushed so far */
				cnt = 0;		/* number of words pushed for this variant */
	MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque);

	/* lexize the token; the words array grows as needed inside parsetext */
	prs.lenwords = 4;
	prs.curwords = 0;
	prs.pos = 0;
	prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);

	parsetext(data->cfg_id, &prs, strval, lenval);

	if (prs.curwords > 0)
	{
		while (count < prs.curwords)
		{
			/*
			 * Were any stop words removed? If so, fill empty positions with
			 * placeholders linked by an appropriate operator.
			 */
			if (pos > 0 && pos + 1 < prs.words[count].pos.pos)
			{
				while (pos + 1 < prs.words[count].pos.pos)
				{
					/* put placeholders for each missing stop word */
					pushStop(state);
					if (cntpos)
						pushOperator(state, data->qoperator, 1);
					cntpos++;
					pos++;
				}
			}

			/* save current word's position */
			pos = prs.words[count].pos.pos;

			/* Go through all variants obtained from this token */
			cntvar = 0;
			while (count < prs.curwords && pos == prs.words[count].pos.pos)
			{
				variant = prs.words[count].nvariant;

				/* Push all words belonging to the same variant */
				cnt = 0;
				while (count < prs.curwords &&
					   pos == prs.words[count].pos.pos &&
					   variant == prs.words[count].nvariant)
				{
					pushValue(state,
							  prs.words[count].word,
							  prs.words[count].len,
							  weight,
							  ((prs.words[count].flags & TSL_PREFIX) || prefix));
					pfree(prs.words[count].word);
					if (cnt)
						pushOperator(state, OP_AND, 0);
					cnt++;
					count++;
				}

				if (cntvar)
					pushOperator(state, OP_OR, 0);
				cntvar++;
			}

			if (cntpos)
			{
				/* distance may be useful */
				pushOperator(state, data->qoperator, 1);
			}

			cntpos++;
		}

		pfree(prs.words);

	}
	else
		pushStop(state);		/* whole token lexized to nothing */
}

Datum
to_tsquery_byid(PG_FUNCTION_ARGS)
{
	text	   *in = PG_GETARG_TEXT_PP(1);
	TSQuery		query;
	MorphOpaque data;

	data.cfg_id = PG_GETARG_OID(0);

	/*
	 * Passing OP_PHRASE as a qoperator makes tsquery require matching of word
	 * positions of a complex morph exactly match the tsvector.  Also, when
	 * the complex morphs are connected with OP_PHRASE operator, we connect
	 * all their words into the OP_PHRASE sequence.
	 */
	data.qoperator = OP_PHRASE;

	query = parse_tsquery(text_to_cstring(in),
						  pushval_morph,
						  PointerGetDatum(&data),
						  0);

	PG_RETURN_TSQUERY(query);
}

/* to_tsquery(text) using the current default configuration */
Datum
to_tsquery(PG_FUNCTION_ARGS)
{
	text	   *in = PG_GETARG_TEXT_PP(0);
	Oid			cfgId;

	cfgId = getTSCurrentConfig(true);
	PG_RETURN_DATUM(DirectFunctionCall2(to_tsquery_byid,
										ObjectIdGetDatum(cfgId),
										PointerGetDatum(in)));
}

Datum
plainto_tsquery_byid(PG_FUNCTION_ARGS)
{
	text	   *in = PG_GETARG_TEXT_PP(1);
	TSQuery		query;
	MorphOpaque data;

	data.cfg_id = PG_GETARG_OID(0);

	/*
	 * parse_tsquery() with P_TSQ_PLAIN flag takes the whole input text as a
	 * single morph.  Passing OP_AND as a qoperator makes tsquery require
	 * matching of all words independently on their positions.
	 */
	data.qoperator = OP_AND;

	query = parse_tsquery(text_to_cstring(in),
						  pushval_morph,
						  PointerGetDatum(&data),
						  P_TSQ_PLAIN);

	PG_RETURN_POINTER(query);
}

/* plainto_tsquery(text) using the current default configuration */
Datum
plainto_tsquery(PG_FUNCTION_ARGS)
{
	text	   *in = PG_GETARG_TEXT_PP(0);
	Oid			cfgId;

	cfgId = getTSCurrentConfig(true);
	PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid,
										ObjectIdGetDatum(cfgId),
										PointerGetDatum(in)));
}


Datum
phraseto_tsquery_byid(PG_FUNCTION_ARGS)
{
	text	   *in = PG_GETARG_TEXT_PP(1);
	TSQuery		query;
	MorphOpaque data;

	data.cfg_id = PG_GETARG_OID(0);

	/*
	 * parse_tsquery() with P_TSQ_PLAIN flag takes the whole input text as a
	 * single morph.  Passing OP_PHRASE as a qoperator makes tsquery require
	 * matching of word positions.
+ */ + data.qoperator = OP_PHRASE; + + query = parse_tsquery(text_to_cstring(in), + pushval_morph, + PointerGetDatum(&data), + P_TSQ_PLAIN); + + PG_RETURN_TSQUERY(query); +} + +Datum +phraseto_tsquery(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_PP(0); + Oid cfgId; + + cfgId = getTSCurrentConfig(true); + PG_RETURN_DATUM(DirectFunctionCall2(phraseto_tsquery_byid, + ObjectIdGetDatum(cfgId), + PointerGetDatum(in))); +} + +Datum +websearch_to_tsquery_byid(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_PP(1); + MorphOpaque data; + TSQuery query = NULL; + + data.cfg_id = PG_GETARG_OID(0); + + /* + * Passing OP_PHRASE as a qoperator makes tsquery require matching of word + * positions of a complex morph exactly match the tsvector. Also, when + * the complex morphs are given in quotes, we connect all their words into + * the OP_PHRASE sequence. + */ + data.qoperator = OP_PHRASE; + + query = parse_tsquery(text_to_cstring(in), + pushval_morph, + PointerGetDatum(&data), + P_TSQ_WEB); + + PG_RETURN_TSQUERY(query); +} + +Datum +websearch_to_tsquery(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_PP(0); + Oid cfgId; + + cfgId = getTSCurrentConfig(true); + PG_RETURN_DATUM(DirectFunctionCall2(websearch_to_tsquery_byid, + ObjectIdGetDatum(cfgId), + PointerGetDatum(in))); + +} diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c new file mode 100644 index 0000000..f918cc8 --- /dev/null +++ b/src/backend/tsearch/ts_locale.c @@ -0,0 +1,325 @@ +/*------------------------------------------------------------------------- + * + * ts_locale.c + * locale compatibility layer for tsearch + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/ts_locale.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/pg_collation.h" +#include "common/string.h" +#include "storage/fd.h" +#include "tsearch/ts_locale.h" 
#include "tsearch/ts_public.h"

static void tsearch_readline_callback(void *arg);


/*
 * The reason these functions use a 3-wchar_t output buffer, not 2 as you
 * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
 * getting from char2wchar() is UTF16 not UTF32.  A single input character
 * may therefore produce a surrogate pair rather than just one wchar_t;
 * we also need room for a trailing null.  When we do get a surrogate pair,
 * we pass just the first code to iswdigit() etc, so that these functions will
 * always return false for characters outside the Basic Multilingual Plane.
 */
#define WC_BUF_LEN	3

/*
 * t_isdigit --- multibyte-aware isdigit() for the character starting at ptr.
 *
 * Single-byte characters and C-locale ctype take the fast path through the
 * plain <ctype.h> macro; otherwise the character is converted to wide form.
 */
int
t_isdigit(const char *ptr)
{
	int			clen = pg_mblen(ptr);
	wchar_t		character[WC_BUF_LEN];
	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
	pg_locale_t mylocale = 0;	/* TODO */

	if (clen == 1 || lc_ctype_is_c(collation))
		return isdigit(TOUCHAR(ptr));

	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);

	return iswdigit((wint_t) character[0]);
}

/* t_isspace --- multibyte-aware isspace(); same structure as t_isdigit */
int
t_isspace(const char *ptr)
{
	int			clen = pg_mblen(ptr);
	wchar_t		character[WC_BUF_LEN];
	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
	pg_locale_t mylocale = 0;	/* TODO */

	if (clen == 1 || lc_ctype_is_c(collation))
		return isspace(TOUCHAR(ptr));

	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);

	return iswspace((wint_t) character[0]);
}

/* t_isalpha --- multibyte-aware isalpha(); same structure as t_isdigit */
int
t_isalpha(const char *ptr)
{
	int			clen = pg_mblen(ptr);
	wchar_t		character[WC_BUF_LEN];
	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
	pg_locale_t mylocale = 0;	/* TODO */

	if (clen == 1 || lc_ctype_is_c(collation))
		return isalpha(TOUCHAR(ptr));

	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);

	return iswalpha((wint_t) character[0]);
}

/* t_isprint --- multibyte-aware isprint(); same structure as t_isdigit */
int
t_isprint(const char *ptr)
{
	int			clen = pg_mblen(ptr);
	wchar_t		character[WC_BUF_LEN];
	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
	pg_locale_t mylocale = 0;	/* TODO */

	if (clen == 1 ||
lc_ctype_is_c(collation)) + return isprint(TOUCHAR(ptr)); + + char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); + + return iswprint((wint_t) character[0]); +} + + +/* + * Set up to read a file using tsearch_readline(). This facility is + * better than just reading the file directly because it provides error + * context pointing to the specific line where a problem is detected. + * + * Expected usage is: + * + * tsearch_readline_state trst; + * + * if (!tsearch_readline_begin(&trst, filename)) + * ereport(ERROR, + * (errcode(ERRCODE_CONFIG_FILE_ERROR), + * errmsg("could not open stop-word file \"%s\": %m", + * filename))); + * while ((line = tsearch_readline(&trst)) != NULL) + * process line; + * tsearch_readline_end(&trst); + * + * Note that the caller supplies the ereport() for file open failure; + * this is so that a custom message can be provided. The filename string + * passed to tsearch_readline_begin() must remain valid through + * tsearch_readline_end(). + */ +bool +tsearch_readline_begin(tsearch_readline_state *stp, + const char *filename) +{ + if ((stp->fp = AllocateFile(filename, "r")) == NULL) + return false; + stp->filename = filename; + stp->lineno = 0; + initStringInfo(&stp->buf); + stp->curline = NULL; + /* Setup error traceback support for ereport() */ + stp->cb.callback = tsearch_readline_callback; + stp->cb.arg = (void *) stp; + stp->cb.previous = error_context_stack; + error_context_stack = &stp->cb; + return true; +} + +/* + * Read the next line from a tsearch data file (expected to be in UTF-8), and + * convert it to database encoding if needed. The returned string is palloc'd. + * NULL return means EOF. 
+ */ +char * +tsearch_readline(tsearch_readline_state *stp) +{ + char *recoded; + + /* Advance line number to use in error reports */ + stp->lineno++; + + /* Clear curline, it's no longer relevant */ + if (stp->curline) + { + if (stp->curline != stp->buf.data) + pfree(stp->curline); + stp->curline = NULL; + } + + /* Collect next line, if there is one */ + if (!pg_get_line_buf(stp->fp, &stp->buf)) + return NULL; + + /* Validate the input as UTF-8, then convert to DB encoding if needed */ + recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8); + + /* Save the correctly-encoded string for possible error reports */ + stp->curline = recoded; /* might be equal to buf.data */ + + /* + * We always return a freshly pstrdup'd string. This is clearly necessary + * if pg_any_to_server() returned buf.data, and we need a second copy even + * if encoding conversion did occur. The caller is entitled to pfree the + * returned string at any time, which would leave curline pointing to + * recycled storage, causing problems if an error occurs after that point. + * (It's preferable to return the result of pstrdup instead of the output + * of pg_any_to_server, because the conversion result tends to be + * over-allocated. Since callers might save the result string directly + * into a long-lived dictionary structure, we don't want it to be a larger + * palloc chunk than necessary. We'll reclaim the conversion result on + * the next call.) 
+ */ + return pstrdup(recoded); +} + +/* + * Close down after reading a file with tsearch_readline() + */ +void +tsearch_readline_end(tsearch_readline_state *stp) +{ + /* Suppress use of curline in any error reported below */ + if (stp->curline) + { + if (stp->curline != stp->buf.data) + pfree(stp->curline); + stp->curline = NULL; + } + + /* Release other resources */ + pfree(stp->buf.data); + FreeFile(stp->fp); + + /* Pop the error context stack */ + error_context_stack = stp->cb.previous; +} + +/* + * Error context callback for errors occurring while reading a tsearch + * configuration file. + */ +static void +tsearch_readline_callback(void *arg) +{ + tsearch_readline_state *stp = (tsearch_readline_state *) arg; + + /* + * We can't include the text of the config line for errors that occur + * during tsearch_readline() itself. The major cause of such errors is + * encoding violations, and we daren't try to print error messages + * containing badly-encoded data. + */ + if (stp->curline) + errcontext("line %d of configuration file \"%s\": \"%s\"", + stp->lineno, + stp->filename, + stp->curline); + else + errcontext("line %d of configuration file \"%s\"", + stp->lineno, + stp->filename); +} + + +/* + * lowerstr --- fold null-terminated string to lower case + * + * Returned string is palloc'd + */ +char * +lowerstr(const char *str) +{ + return lowerstr_with_len(str, strlen(str)); +} + +/* + * lowerstr_with_len --- fold string to lower case + * + * Input string need not be null-terminated. + * + * Returned string is palloc'd + */ +char * +lowerstr_with_len(const char *str, int len) +{ + char *out; + Oid collation = DEFAULT_COLLATION_OID; /* TODO */ + pg_locale_t mylocale = 0; /* TODO */ + + if (len == 0) + return pstrdup(""); + + /* + * Use wide char code only when max encoding length > 1 and ctype != C. + * Some operating systems fail with multi-byte encodings and a C locale. + * Also, for a C locale there is no need to process as multibyte. 
From + * backend/utils/adt/oracle_compat.c Teodor + */ + if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation)) + { + wchar_t *wstr, + *wptr; + int wlen; + + /* + * alloc number of wchar_t for worst case, len contains number of + * bytes >= number of characters and alloc 1 wchar_t for 0, because + * wchar2char wants zero-terminated string + */ + wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1)); + + wlen = char2wchar(wstr, len + 1, str, len, mylocale); + Assert(wlen <= len); + + while (*wptr) + { + *wptr = towlower((wint_t) *wptr); + wptr++; + } + + /* + * Alloc result string for worst case + '\0' + */ + len = pg_database_encoding_max_length() * wlen + 1; + out = (char *) palloc(len); + + wlen = wchar2char(out, wstr, len, mylocale); + + pfree(wstr); + + if (wlen < 0) + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("conversion from wchar_t to server encoding failed: %m"))); + Assert(wlen < len); + } + else + { + const char *ptr = str; + char *outptr; + + outptr = out = (char *) palloc(sizeof(char) * (len + 1)); + while ((ptr - str) < len && *ptr) + { + *outptr++ = tolower(TOUCHAR(ptr)); + ptr++; + } + *outptr = '\0'; + } + + return out; +} diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c new file mode 100644 index 0000000..92d95b4 --- /dev/null +++ b/src/backend/tsearch/ts_parse.c @@ -0,0 +1,667 @@ +/*------------------------------------------------------------------------- + * + * ts_parse.c + * main parse functions for tsearch + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/ts_parse.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "tsearch/ts_cache.h" +#include "tsearch/ts_utils.h" + +#define IGNORE_LONGLEXEME 1 + +/* + * Lexize subsystem + */ + +typedef struct ParsedLex +{ + int type; + char *lemm; + int lenlemm; + struct 
ParsedLex  *next;
+} ParsedLex;
+
+/* Singly-linked FIFO of ParsedLex items, with O(1) append via tail */
+typedef struct ListParsedLex
+{
+	ParsedLex  *head;
+	ParsedLex  *tail;
+} ListParsedLex;
+
+/* State carried across LexizeExec() calls while lexizing one document */
+typedef struct
+{
+	TSConfigCacheEntry *cfg;
+	Oid			curDictId;
+	int			posDict;
+	DictSubState dictState;
+	ParsedLex  *curSub;
+	ListParsedLex towork;		/* current list to work */
+	ListParsedLex waste;		/* list of lexemes that already lexized */
+
+	/*
+	 * fields to store the last result variant while lexizing (used by
+	 * thesaurus-like dictionaries, which want to consume several lexemes)
+	 */
+
+	ParsedLex  *lastRes;
+	TSLexeme   *tmpRes;
+} LexizeData;
+
+/* Initialize lexizer state for the given text search configuration */
+static void
+LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
+{
+	ld->cfg = cfg;
+	ld->curDictId = InvalidOid;
+	ld->posDict = 0;
+	ld->towork.head = ld->towork.tail = ld->curSub = NULL;
+	ld->waste.head = ld->waste.tail = NULL;
+	ld->lastRes = NULL;
+	ld->tmpRes = NULL;
+}
+
+/* Append newpl at the tail of list */
+static void
+LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
+{
+	if (list->tail)
+	{
+		list->tail->next = newpl;
+		list->tail = newpl;
+	}
+	else
+		list->head = list->tail = newpl;
+	newpl->next = NULL;
+}
+
+/* Detach and return the head of list (NULL if the list is empty) */
+static ParsedLex *
+LPLRemoveHead(ListParsedLex *list)
+{
+	ParsedLex  *res = list->head;
+
+	if (list->head)
+		list->head = list->head->next;
+
+	if (list->head == NULL)
+		list->tail = NULL;
+
+	return res;
+}
+
+/* Queue one token from the parser onto the to-work list */
+static void
+LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
+{
+	ParsedLex  *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
+
+	newpl->type = type;
+	newpl->lemm = lemm;
+	newpl->lenlemm = lenlemm;
+	LPLAddTail(&ld->towork, newpl);
+	ld->curSub = ld->towork.tail;
+}
+
+/* Move the first to-work lexeme onto the waste list and reset posDict */
+static void
+RemoveHead(LexizeData *ld)
+{
+	LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
+
+	ld->posDict = 0;
+}
+
+/*
+ * Hand the accumulated waste list to the caller (if it asked for it via
+ * correspondLexem), otherwise free it; in either case clear the list.
+ */
+static void
+setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
+{
+	if (correspondLexem)
+	{
+		*correspondLexem = ld->waste.head;
+	}
+	else
+	{
+		ParsedLex  *tmp,
+				   *ptr = ld->waste.head;
+
+		while (ptr)
+		{
+			tmp = ptr->next;
+			pfree(ptr);
+			ptr = tmp;
+		}
+	}
+	ld->waste.head = ld->waste.tail = NULL;
+}
+
+static void
+moveToWaste(LexizeData *ld, ParsedLex *stop) +{ + bool go = true; + + while (ld->towork.head && go) + { + if (ld->towork.head == stop) + { + ld->curSub = stop->next; + go = false; + } + RemoveHead(ld); + } +} + +static void +setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res) +{ + if (ld->tmpRes) + { + TSLexeme *ptr; + + for (ptr = ld->tmpRes; ptr->lexeme; ptr++) + pfree(ptr->lexeme); + pfree(ld->tmpRes); + } + ld->tmpRes = res; + ld->lastRes = lex; +} + +static TSLexeme * +LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) +{ + int i; + ListDictionary *map; + TSDictionaryCacheEntry *dict; + TSLexeme *res; + + if (ld->curDictId == InvalidOid) + { + /* + * usual mode: dictionary wants only one word, but we should keep in + * mind that we should go through all stack + */ + + while (ld->towork.head) + { + ParsedLex *curVal = ld->towork.head; + char *curValLemm = curVal->lemm; + int curValLenLemm = curVal->lenlemm; + + map = ld->cfg->map + curVal->type; + + if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0) + { + /* skip this type of lexeme */ + RemoveHead(ld); + continue; + } + + for (i = ld->posDict; i < map->len; i++) + { + dict = lookup_ts_dictionary_cache(map->dictIds[i]); + + ld->dictState.isend = ld->dictState.getnext = false; + ld->dictState.private_state = NULL; + res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize), + PointerGetDatum(dict->dictData), + PointerGetDatum(curValLemm), + Int32GetDatum(curValLenLemm), + PointerGetDatum(&ld->dictState))); + + if (ld->dictState.getnext) + { + /* + * dictionary wants next word, so setup and store current + * position and go to multiword mode + */ + + ld->curDictId = DatumGetObjectId(map->dictIds[i]); + ld->posDict = i + 1; + ld->curSub = curVal->next; + if (res) + setNewTmpRes(ld, curVal, res); + return LexizeExec(ld, correspondLexem); + } + + if (!res) /* dictionary doesn't know this lexeme */ + continue; + + if (res->flags & TSL_FILTER) + { + curValLemm = 
res->lexeme; + curValLenLemm = strlen(res->lexeme); + continue; + } + + RemoveHead(ld); + setCorrLex(ld, correspondLexem); + return res; + } + + RemoveHead(ld); + } + } + else + { /* curDictId is valid */ + dict = lookup_ts_dictionary_cache(ld->curDictId); + + /* + * Dictionary ld->curDictId asks us about following words + */ + + while (ld->curSub) + { + ParsedLex *curVal = ld->curSub; + + map = ld->cfg->map + curVal->type; + + if (curVal->type != 0) + { + bool dictExists = false; + + if (curVal->type >= ld->cfg->lenmap || map->len == 0) + { + /* skip this type of lexeme */ + ld->curSub = curVal->next; + continue; + } + + /* + * We should be sure that current type of lexeme is recognized + * by our dictionary: we just check is it exist in list of + * dictionaries ? + */ + for (i = 0; i < map->len && !dictExists; i++) + if (ld->curDictId == DatumGetObjectId(map->dictIds[i])) + dictExists = true; + + if (!dictExists) + { + /* + * Dictionary can't work with current type of lexeme, + * return to basic mode and redo all stored lexemes + */ + ld->curDictId = InvalidOid; + return LexizeExec(ld, correspondLexem); + } + } + + ld->dictState.isend = (curVal->type == 0) ? 
true : false; + ld->dictState.getnext = false; + + res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize), + PointerGetDatum(dict->dictData), + PointerGetDatum(curVal->lemm), + Int32GetDatum(curVal->lenlemm), + PointerGetDatum(&ld->dictState))); + + if (ld->dictState.getnext) + { + /* Dictionary wants one more */ + ld->curSub = curVal->next; + if (res) + setNewTmpRes(ld, curVal, res); + continue; + } + + if (res || ld->tmpRes) + { + /* + * Dictionary normalizes lexemes, so we remove from stack all + * used lexemes, return to basic mode and redo end of stack + * (if it exists) + */ + if (res) + { + moveToWaste(ld, ld->curSub); + } + else + { + res = ld->tmpRes; + moveToWaste(ld, ld->lastRes); + } + + /* reset to initial state */ + ld->curDictId = InvalidOid; + ld->posDict = 0; + ld->lastRes = NULL; + ld->tmpRes = NULL; + setCorrLex(ld, correspondLexem); + return res; + } + + /* + * Dict don't want next lexem and didn't recognize anything, redo + * from ld->towork.head + */ + ld->curDictId = InvalidOid; + return LexizeExec(ld, correspondLexem); + } + } + + setCorrLex(ld, correspondLexem); + return NULL; +} + +/* + * Parse string and lexize words. + * + * prs will be filled in. 
+ */ +void +parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen) +{ + int type, + lenlemm; + char *lemm = NULL; + LexizeData ldata; + TSLexeme *norms; + TSConfigCacheEntry *cfg; + TSParserCacheEntry *prsobj; + void *prsdata; + + cfg = lookup_ts_config_cache(cfgId); + prsobj = lookup_ts_parser_cache(cfg->prsId); + + prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart, + PointerGetDatum(buf), + Int32GetDatum(buflen))); + + LexizeInit(&ldata, cfg); + + do + { + type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken), + PointerGetDatum(prsdata), + PointerGetDatum(&lemm), + PointerGetDatum(&lenlemm))); + + if (type > 0 && lenlemm >= MAXSTRLEN) + { +#ifdef IGNORE_LONGLEXEME + ereport(NOTICE, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("word is too long to be indexed"), + errdetail("Words longer than %d characters are ignored.", + MAXSTRLEN))); + continue; +#else + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("word is too long to be indexed"), + errdetail("Words longer than %d characters are ignored.", + MAXSTRLEN))); +#endif + } + + LexizeAddLemm(&ldata, type, lemm, lenlemm); + + while ((norms = LexizeExec(&ldata, NULL)) != NULL) + { + TSLexeme *ptr = norms; + + prs->pos++; /* set pos */ + + while (ptr->lexeme) + { + if (prs->curwords == prs->lenwords) + { + prs->lenwords *= 2; + prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord)); + } + + if (ptr->flags & TSL_ADDPOS) + prs->pos++; + prs->words[prs->curwords].len = strlen(ptr->lexeme); + prs->words[prs->curwords].word = ptr->lexeme; + prs->words[prs->curwords].nvariant = ptr->nvariant; + prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX; + prs->words[prs->curwords].alen = 0; + prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos); + ptr++; + prs->curwords++; + } + pfree(norms); + } + } while (type > 0); + + FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata)); +} + +/* + * Headline framework + */ +static void 
+hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
+{
+	/* grow the words array by doubling until the new entry fits */
+	while (prs->curwords >= prs->lenwords)
+	{
+		prs->lenwords *= 2;
+		prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
+	}
+	memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
+	prs->words[prs->curwords].type = (uint8) type;
+	prs->words[prs->curwords].len = buflen;
+	/* copy the token text; buf is not NUL-terminated, so copy by length */
+	prs->words[prs->curwords].word = palloc(buflen);
+	memcpy(prs->words[prs->curwords].word, buf, buflen);
+	prs->curwords++;
+}
+
+static void
+hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
+{
+	int			i;
+	QueryItem  *item = GETQUERY(query);
+	HeadlineWordEntry *word;
+
+	/* reserve room for up to query->size duplicate entries added below */
+	while (prs->curwords + query->size >= prs->lenwords)
+	{
+		prs->lenwords *= 2;
+		prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
+	}
+
+	/* mark the most recently added word with its position and query match */
+	word = &(prs->words[prs->curwords - 1]);
+	word->pos = LIMITPOS(pos);
+	for (i = 0; i < query->size; i++)
+	{
+		if (item->type == QI_VAL &&
+			tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
+							buf, buflen, item->qoperand.prefix) == 0)
+		{
+			if (word->item)
+			{
+				/*
+				 * Word already matched another query operand: clone the
+				 * entry and flag the copy as repeated.
+				 */
+				memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
+				prs->words[prs->curwords].item = &item->qoperand;
+				prs->words[prs->curwords].repeated = 1;
+				prs->curwords++;
+			}
+			else
+				word->item = &item->qoperand;
+		}
+		item++;
+	}
+}
+
+static void
+addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
+{
+	ParsedLex  *tmplexs;
+	TSLexeme   *ptr;
+	int32		savedpos;
+
+	/* add each raw token, then match its normalized lexemes to the query */
+	while (lexs)
+	{
+		if (lexs->type > 0)
+			hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
+
+		ptr = norms;
+		savedpos = prs->vectorpos;
+		while (ptr && ptr->lexeme)
+		{
+			if (ptr->flags & TSL_ADDPOS)
+				savedpos++;
+			hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
+			ptr++;
+		}
+
+		tmplexs = lexs->next;
+		pfree(lexs);
+		lexs = tmplexs;
+ } + + if (norms) + { + ptr = norms; + while (ptr->lexeme) + { + if (ptr->flags & TSL_ADDPOS) + prs->vectorpos++; + pfree(ptr->lexeme); + ptr++; + } + pfree(norms); + } +} + +void +hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen) +{ + int type, + lenlemm; + char *lemm = NULL; + LexizeData ldata; + TSLexeme *norms; + ParsedLex *lexs; + TSConfigCacheEntry *cfg; + TSParserCacheEntry *prsobj; + void *prsdata; + + cfg = lookup_ts_config_cache(cfgId); + prsobj = lookup_ts_parser_cache(cfg->prsId); + + prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart), + PointerGetDatum(buf), + Int32GetDatum(buflen))); + + LexizeInit(&ldata, cfg); + + do + { + type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken), + PointerGetDatum(prsdata), + PointerGetDatum(&lemm), + PointerGetDatum(&lenlemm))); + + if (type > 0 && lenlemm >= MAXSTRLEN) + { +#ifdef IGNORE_LONGLEXEME + ereport(NOTICE, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("word is too long to be indexed"), + errdetail("Words longer than %d characters are ignored.", + MAXSTRLEN))); + continue; +#else + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("word is too long to be indexed"), + errdetail("Words longer than %d characters are ignored.", + MAXSTRLEN))); +#endif + } + + LexizeAddLemm(&ldata, type, lemm, lenlemm); + + do + { + if ((norms = LexizeExec(&ldata, &lexs)) != NULL) + { + prs->vectorpos++; + addHLParsedLex(prs, query, lexs, norms); + } + else + addHLParsedLex(prs, query, lexs, NULL); + } while (norms); + + } while (type > 0); + + FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata)); +} + +text * +generateHeadline(HeadlineParsedText *prs) +{ + text *out; + char *ptr; + int len = 128; + int numfragments = 0; + int16 infrag = 0; + + HeadlineWordEntry *wrd = prs->words; + + out = (text *) palloc(len); + ptr = ((char *) out) + VARHDRSZ; + + while (wrd - prs->words < prs->curwords) + { + while (wrd->len + prs->stopsellen + 
prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len) + { + int dist = ptr - ((char *) out); + + len *= 2; + out = (text *) repalloc(out, len); + ptr = ((char *) out) + dist; + } + + if (wrd->in && !wrd->repeated) + { + if (!infrag) + { + + /* start of a new fragment */ + infrag = 1; + numfragments++; + /* add a fragment delimiter if this is after the first one */ + if (numfragments > 1) + { + memcpy(ptr, prs->fragdelim, prs->fragdelimlen); + ptr += prs->fragdelimlen; + } + + } + if (wrd->replace) + { + *ptr = ' '; + ptr++; + } + else if (!wrd->skip) + { + if (wrd->selected) + { + memcpy(ptr, prs->startsel, prs->startsellen); + ptr += prs->startsellen; + } + memcpy(ptr, wrd->word, wrd->len); + ptr += wrd->len; + if (wrd->selected) + { + memcpy(ptr, prs->stopsel, prs->stopsellen); + ptr += prs->stopsellen; + } + } + } + else if (!wrd->repeated) + { + if (infrag) + infrag = 0; + pfree(wrd->word); + } + + wrd++; + } + + SET_VARSIZE(out, ptr - ((char *) out)); + return out; +} diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c new file mode 100644 index 0000000..be2546a --- /dev/null +++ b/src/backend/tsearch/ts_selfuncs.c @@ -0,0 +1,453 @@ +/*------------------------------------------------------------------------- + * + * ts_selfuncs.c + * Selectivity estimation functions for text search operators. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/ts_selfuncs.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "catalog/pg_statistic.h" +#include "catalog/pg_type.h" +#include "miscadmin.h" +#include "nodes/nodes.h" +#include "tsearch/ts_type.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/selfuncs.h" +#include "utils/syscache.h" + + +/* + * The default text search selectivity is chosen to be small enough to + * encourage indexscans for typical table densities. See selfuncs.h and + * DEFAULT_EQ_SEL for details. + */ +#define DEFAULT_TS_MATCH_SEL 0.005 + +/* lookup table type for binary searching through MCELEMs */ +typedef struct +{ + text *element; + float4 frequency; +} TextFreq; + +/* type of keys for bsearch'ing through an array of TextFreqs */ +typedef struct +{ + char *lexeme; + int length; +} LexemeKey; + +static Selectivity tsquerysel(VariableStatData *vardata, Datum constval); +static Selectivity mcelem_tsquery_selec(TSQuery query, + Datum *mcelem, int nmcelem, + float4 *numbers, int nnumbers); +static Selectivity tsquery_opr_selec(QueryItem *item, char *operand, + TextFreq *lookup, int length, float4 minfreq); +static int compare_lexeme_textfreq(const void *e1, const void *e2); + +#define tsquery_opr_selec_no_stats(query) \ + tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0) + + +/* + * tsmatchsel -- Selectivity of "@@" + * + * restriction selectivity function for tsvector @@ tsquery and + * tsquery @@ tsvector + */ +Datum +tsmatchsel(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + +#ifdef NOT_USED + Oid operator = PG_GETARG_OID(1); +#endif + List *args = (List *) PG_GETARG_POINTER(2); + int varRelid = PG_GETARG_INT32(3); + VariableStatData vardata; + Node *other; + bool varonleft; + 
Selectivity selec;
+
+	/*
+	 * If expression is not variable = something or something = variable, then
+	 * punt and return a default estimate.
+	 */
+	if (!get_restriction_variable(root, args, varRelid,
+								  &vardata, &other, &varonleft))
+		PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
+
+	/*
+	 * Can't do anything useful if the something is not a constant, either.
+	 */
+	if (!IsA(other, Const))
+	{
+		ReleaseVariableStats(vardata);
+		PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
+	}
+
+	/*
+	 * The "@@" operator is strict, so we can cope with NULL right away
+	 */
+	if (((Const *) other)->constisnull)
+	{
+		ReleaseVariableStats(vardata);
+		PG_RETURN_FLOAT8(0.0);
+	}
+
+	/*
+	 * OK, there's a Var and a Const we're dealing with here.  We need the
+	 * Const to be a TSQuery, else we can't do anything useful.  We have to
+	 * check this because the Var might be the TSQuery not the TSVector.
+	 */
+	if (((Const *) other)->consttype == TSQUERYOID)
+	{
+		/* tsvector @@ tsquery or the other way around */
+		Assert(vardata.vartype == TSVECTOROID);
+
+		selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
+	}
+	else
+	{
+		/* If we can't see the query structure, must punt */
+		selec = DEFAULT_TS_MATCH_SEL;
+	}
+
+	ReleaseVariableStats(vardata);
+
+	CLAMP_PROBABILITY(selec);
+
+	PG_RETURN_FLOAT8((float8) selec);
+}
+
+
+/*
+ * tsmatchjoinsel -- join selectivity of "@@"
+ *
+ * join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
+ */
+Datum
+tsmatchjoinsel(PG_FUNCTION_ARGS)
+{
+	/* for the moment we just punt */
+	PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
+}
+
+
+/*
+ * @@ selectivity for tsvector var vs tsquery constant
+ */
+static Selectivity
+tsquerysel(VariableStatData *vardata, Datum constval)
+{
+	Selectivity selec;
+	TSQuery		query;
+
+	/* The caller made sure the const is a TSQuery, so get it now */
+	query = DatumGetTSQuery(constval);
+
+	/* Empty query matches nothing */
+	if (query->size == 0)
+		return (Selectivity) 0.0;
+
+	/* If we have statistics for the column, estimate from them */
+	if (HeapTupleIsValid(vardata->statsTuple))
+ { + Form_pg_statistic stats; + AttStatsSlot sslot; + + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + + /* MCELEM will be an array of TEXT elements for a tsvector column */ + if (get_attstatsslot(&sslot, vardata->statsTuple, + STATISTIC_KIND_MCELEM, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS)) + { + /* + * There is a most-common-elements slot for the tsvector Var, so + * use that. + */ + selec = mcelem_tsquery_selec(query, sslot.values, sslot.nvalues, + sslot.numbers, sslot.nnumbers); + free_attstatsslot(&sslot); + } + else + { + /* No most-common-elements info, so do without */ + selec = tsquery_opr_selec_no_stats(query); + } + + /* + * MCE stats count only non-null rows, so adjust for null rows. + */ + selec *= (1.0 - stats->stanullfrac); + } + else + { + /* No stats at all, so do without */ + selec = tsquery_opr_selec_no_stats(query); + /* we assume no nulls here, so no stanullfrac correction */ + } + + return selec; +} + +/* + * Extract data from the pg_statistic arrays into useful format. + */ +static Selectivity +mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, + float4 *numbers, int nnumbers) +{ + float4 minfreq; + TextFreq *lookup; + Selectivity selec; + int i; + + /* + * There should be two more Numbers than Values, because the last two + * cells are taken for minimal and maximal frequency. Punt if not. + * + * (Note: the MCELEM statistics slot definition allows for a third extra + * number containing the frequency of nulls, but we're not expecting that + * to appear for a tsvector column.) + */ + if (nnumbers != nmcelem + 2) + return tsquery_opr_selec_no_stats(query); + + /* + * Transpose the data into a single array so we can use bsearch(). + */ + lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem); + for (i = 0; i < nmcelem; i++) + { + /* + * The text Datums came from an array, so it cannot be compressed or + * stored out-of-line -- it's safe to use VARSIZE_ANY*. 
+ */ + Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i])); + lookup[i].element = (text *) DatumGetPointer(mcelem[i]); + lookup[i].frequency = numbers[i]; + } + + /* + * Grab the lowest frequency. compute_tsvector_stats() stored it for us in + * the one before the last cell of the Numbers array. See ts_typanalyze.c + */ + minfreq = numbers[nnumbers - 2]; + + selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup, + nmcelem, minfreq); + + pfree(lookup); + + return selec; +} + +/* + * Traverse the tsquery in preorder, calculating selectivity as: + * + * selec(left_oper) * selec(right_oper) in AND & PHRASE nodes, + * + * selec(left_oper) + selec(right_oper) - + * selec(left_oper) * selec(right_oper) in OR nodes, + * + * 1 - select(oper) in NOT nodes + * + * histogram-based estimation in prefix VAL nodes + * + * freq[val] in exact VAL nodes, if the value is in MCELEM + * min(freq[MCELEM]) / 2 in VAL nodes, if it is not + * + * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use + * binary search for determining freq[MCELEM]. + * + * If we don't have stats for the tsvector, we still use this logic, + * except we use default estimates for VAL nodes. This case is signaled + * by lookup == NULL. + */ +static Selectivity +tsquery_opr_selec(QueryItem *item, char *operand, + TextFreq *lookup, int length, float4 minfreq) +{ + Selectivity selec; + + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + if (item->type == QI_VAL) + { + QueryOperand *oper = (QueryOperand *) item; + LexemeKey key; + + /* + * Prepare the key for bsearch(). + */ + key.lexeme = operand + oper->distance; + key.length = oper->length; + + if (oper->prefix) + { + /* Prefix match, ie the query item is lexeme:* */ + Selectivity matched, + allmces; + int i, + n_matched; + + /* + * Our strategy is to scan through the MCELEM list and combine the + * frequencies of the ones that match the prefix. 
We then + * extrapolate the fraction of matching MCELEMs to the remaining + * rows, assuming that the MCELEMs are representative of the whole + * lexeme population in this respect. (Compare + * histogram_selectivity().) Note that these are most common + * elements not most common values, so they're not mutually + * exclusive. We treat occurrences as independent events. + * + * This is only a good plan if we have a pretty fair number of + * MCELEMs available; we set the threshold at 100. If no stats or + * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4. + */ + if (lookup == NULL || length < 100) + return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4); + + matched = allmces = 0; + n_matched = 0; + for (i = 0; i < length; i++) + { + TextFreq *t = lookup + i; + int tlen = VARSIZE_ANY_EXHDR(t->element); + + if (tlen >= key.length && + strncmp(key.lexeme, VARDATA_ANY(t->element), + key.length) == 0) + { + matched += t->frequency - matched * t->frequency; + n_matched++; + } + allmces += t->frequency - allmces * t->frequency; + } + + /* Clamp to ensure sanity in the face of roundoff error */ + CLAMP_PROBABILITY(matched); + CLAMP_PROBABILITY(allmces); + + selec = matched + (1.0 - allmces) * ((double) n_matched / length); + + /* + * In any case, never believe that a prefix match has selectivity + * less than we would assign for a non-MCELEM lexeme. This + * preserves the property that "word:*" should be estimated to + * match at least as many rows as "word" would be. + */ + selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / 2), selec); + } + else + { + /* Regular exact lexeme match */ + TextFreq *searchres; + + /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */ + if (lookup == NULL) + return (Selectivity) DEFAULT_TS_MATCH_SEL; + + searchres = (TextFreq *) bsearch(&key, lookup, length, + sizeof(TextFreq), + compare_lexeme_textfreq); + + if (searchres) + { + /* + * The element is in MCELEM. 
Return precise selectivity (or + * at least as precise as ANALYZE could find out). + */ + selec = searchres->frequency; + } + else + { + /* + * The element is not in MCELEM. Punt, but assume that the + * selectivity cannot be more than minfreq / 2. + */ + selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2); + } + } + } + else + { + /* Current TSQuery node is an operator */ + Selectivity s1, + s2; + + switch (item->qoperator.oper) + { + case OP_NOT: + selec = 1.0 - tsquery_opr_selec(item + 1, operand, + lookup, length, minfreq); + break; + + case OP_PHRASE: + case OP_AND: + s1 = tsquery_opr_selec(item + 1, operand, + lookup, length, minfreq); + s2 = tsquery_opr_selec(item + item->qoperator.left, operand, + lookup, length, minfreq); + selec = s1 * s2; + break; + + case OP_OR: + s1 = tsquery_opr_selec(item + 1, operand, + lookup, length, minfreq); + s2 = tsquery_opr_selec(item + item->qoperator.left, operand, + lookup, length, minfreq); + selec = s1 + s2 - s1 * s2; + break; + + default: + elog(ERROR, "unrecognized operator: %d", item->qoperator.oper); + selec = 0; /* keep compiler quiet */ + break; + } + } + + /* Clamp intermediate results to stay sane despite roundoff error */ + CLAMP_PROBABILITY(selec); + + return selec; +} + +/* + * bsearch() comparator for a lexeme (non-NULL terminated string with length) + * and a TextFreq. Use length, then byte-for-byte comparison, because that's + * how ANALYZE code sorted data before storing it in a statistic tuple. + * See ts_typanalyze.c for details. 
+ */
+static int
+compare_lexeme_textfreq(const void *e1, const void *e2)
+{
+	const LexemeKey *key = (const LexemeKey *) e1;
+	const TextFreq *t = (const TextFreq *) e2;
+	int			len1,
+				len2;
+
+	/* both lengths are byte counts of non-NUL-terminated lexemes */
+	len1 = key->length;
+	len2 = VARSIZE_ANY_EXHDR(t->element);
+
+	/* Compare lengths first, possibly avoiding a strncmp call */
+	if (len1 > len2)
+		return 1;
+	else if (len1 < len2)
+		return -1;
+
+	/* Fall back on byte-for-byte comparison */
+	return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
+}
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
new file mode 100644
index 0000000..56eeb6f
--- /dev/null
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -0,0 +1,536 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_typanalyze.c
+ *	  functions for gathering statistics from tsvector columns
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/tsearch/ts_typanalyze.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/pg_collation.h"
+#include "catalog/pg_operator.h"
+#include "commands/vacuum.h"
+#include "common/hashfn.h"
+#include "tsearch/ts_type.h"
+#include "utils/builtins.h"
+
+
+/* A hash key for lexemes */
+typedef struct
+{
+	char	   *lexeme;			/* lexeme (not NULL terminated!) */
+	int			length;			/* its length in bytes */
+} LexemeHashKey;
+
+/* A hash table entry for the Lossy Counting algorithm */
+typedef struct
+{
+	LexemeHashKey key;			/* This is 'e' from the LC algorithm. */
+	int			frequency;		/* This is 'f'. */
+	int			delta;			/* And this is 'delta'.
*/ +} TrackItem; + +static void compute_tsvector_stats(VacAttrStats *stats, + AnalyzeAttrFetchFunc fetchfunc, + int samplerows, + double totalrows); +static void prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current); +static uint32 lexeme_hash(const void *key, Size keysize); +static int lexeme_match(const void *key1, const void *key2, Size keysize); +static int lexeme_compare(const void *key1, const void *key2); +static int trackitem_compare_frequencies_desc(const void *e1, const void *e2, + void *arg); +static int trackitem_compare_lexemes(const void *e1, const void *e2, + void *arg); + + +/* + * ts_typanalyze -- a custom typanalyze function for tsvector columns + */ +Datum +ts_typanalyze(PG_FUNCTION_ARGS) +{ + VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0); + Form_pg_attribute attr = stats->attr; + + /* If the attstattarget column is negative, use the default value */ + /* NB: it is okay to scribble on stats->attr since it's a copy */ + if (attr->attstattarget < 0) + attr->attstattarget = default_statistics_target; + + stats->compute_stats = compute_tsvector_stats; + /* see comment about the choice of minrows in commands/analyze.c */ + stats->minrows = 300 * attr->attstattarget; + + PG_RETURN_BOOL(true); +} + +/* + * compute_tsvector_stats() -- compute statistics for a tsvector column + * + * This functions computes statistics that are useful for determining @@ + * operations' selectivity, along with the fraction of non-null rows and + * average width. + * + * Instead of finding the most common values, as we do for most datatypes, + * we're looking for the most common lexemes. This is more useful, because + * there most probably won't be any two rows with the same tsvector and thus + * the notion of a MCV is a bit bogus with this datatype. With a list of the + * most common lexemes we can do a better job at figuring out @@ selectivity. 
+ *
+ * For the same reasons we assume that tsvector columns are unique when
+ * determining the number of distinct values.
+ *
+ * The algorithm used is Lossy Counting, as proposed in the paper "Approximate
+ * frequency counts over data streams" by G. S. Manku and R. Motwani, in
+ * Proceedings of the 28th International Conference on Very Large Data Bases,
+ * Hong Kong, China, August 2002, section 4.2. The paper is available at
+ * http://www.vldb.org/conf/2002/S10P03.pdf
+ *
+ * The Lossy Counting (aka LC) algorithm goes like this:
+ * Let s be the threshold frequency for an item (the minimum frequency we
+ * are interested in) and epsilon the error margin for the frequency. Let D
+ * be a set of triples (e, f, delta), where e is an element value, f is that
+ * element's frequency (actually, its current occurrence count) and delta is
+ * the maximum error in f. We start with D empty and process the elements in
+ * batches of size w. (The batch size is also known as "bucket size" and is
+ * equal to 1/epsilon.) Let the current batch number be b_current, starting
+ * with 1. For each element e we either increment its f count, if it's
+ * already in D, or insert a new triple into D with values (e, 1, b_current
+ * - 1). After processing each batch we prune D, by removing from it all
+ * elements with f + delta <= b_current. After the algorithm finishes we
+ * suppress all elements from D that do not satisfy f >= (s - epsilon) * N,
+ * where N is the total number of elements in the input. We emit the
+ * remaining elements with estimated frequency f/N. The LC paper proves
+ * that this algorithm finds all elements with true frequency at least s,
+ * and that no frequency is overestimated or is underestimated by more than
+ * epsilon. Furthermore, given reasonable assumptions about the input
+ * distribution, the required table size is no more than about 7 times w.
+ *
+ * We set s to be the estimated frequency of the K'th word in a natural
+ * language's frequency table, where K is the target number of entries in
+ * the MCELEM array plus an arbitrary constant, meant to reflect the fact
+ * that the most common words in any language would usually be stopwords
+ * so we will not actually see them in the input. We assume that the
+ * distribution of word frequencies (including the stopwords) follows Zipf's
+ * law with an exponent of 1.
+ *
+ * Assuming Zipfian distribution, the frequency of the K'th word is equal
+ * to 1/(K * H(W)) where H(n) is 1/2 + 1/3 + ... + 1/n and W is the number of
+ * words in the language. Putting W as one million, we get roughly 0.07/K.
+ * Assuming top 10 words are stopwords gives s = 0.07/(K + 10). We set
+ * epsilon = s/10, which gives bucket width w = (K + 10)/0.007 and
+ * maximum expected hashtable size of about 1000 * (K + 10).
+ *
+ * Note: in the above discussion, s, epsilon, and f/N are in terms of a
+ * lexeme's frequency as a fraction of all lexemes seen in the input.
+ * However, what we actually want to store in the finished pg_statistic
+ * entry is each lexeme's frequency as a fraction of all rows that it occurs
+ * in. Assuming that the input tsvectors are correctly constructed, no
+ * lexeme occurs more than once per tsvector, so the final count f is a
+ * correct estimate of the number of input tsvectors it occurs in, and we
+ * need only change the divisor from N to nonnull_cnt to get the number we
+ * want.
+ */
+static void
+compute_tsvector_stats(VacAttrStats *stats,
+					   AnalyzeAttrFetchFunc fetchfunc,
+					   int samplerows,
+					   double totalrows)
+{
+	int			num_mcelem;
+	int			null_cnt = 0;
+	double		total_width = 0;
+
+	/* This is D from the LC algorithm. */
+	HTAB	   *lexemes_tab;
+	HASHCTL		hash_ctl;
+	HASH_SEQ_STATUS scan_status;
+
+	/* This is the current bucket number from the LC algorithm */
+	int			b_current;
+
+	/* This is 'w' from the LC algorithm */
+	int			bucket_width;
+	int			vector_no,
+				lexeme_no;
+	LexemeHashKey hash_key;
+	TrackItem  *item;
+
+	/*
+	 * We want statistics_target * 10 lexemes in the MCELEM array. This
+	 * multiplier is pretty arbitrary, but is meant to reflect the fact that
+	 * the number of individual lexeme values tracked in pg_statistic ought to
+	 * be more than the number of values for a simple scalar column.
+	 */
+	num_mcelem = stats->attr->attstattarget * 10;
+
+	/*
+	 * We set bucket width equal to (num_mcelem + 10) / 0.007 as per the
+	 * comment above.
+	 */
+	bucket_width = (num_mcelem + 10) * 1000 / 7;
+
+	/*
+	 * Create the hashtable. It will be in local memory, so we don't need to
+	 * worry about overflowing the initial size. Also we don't need to pay any
+	 * attention to locking and memory management.
+	 */
+	hash_ctl.keysize = sizeof(LexemeHashKey);
+	hash_ctl.entrysize = sizeof(TrackItem);
+	hash_ctl.hash = lexeme_hash;
+	hash_ctl.match = lexeme_match;
+	hash_ctl.hcxt = CurrentMemoryContext;
+	lexemes_tab = hash_create("Analyzed lexemes table",
+							  num_mcelem,
+							  &hash_ctl,
+							  HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);
+
+	/* Initialize counters. */
+	b_current = 1;
+	lexeme_no = 0;
+
+	/* Loop over the tsvectors. */
+	for (vector_no = 0; vector_no < samplerows; vector_no++)
+	{
+		Datum		value;
+		bool		isnull;
+		TSVector	vector;
+		WordEntry  *curentryptr;
+		char	   *lexemesptr;
+		int			j;
+
+		vacuum_delay_point();
+
+		value = fetchfunc(stats, vector_no, &isnull);
+
+		/*
+		 * Check for null/nonnull; nulls only contribute to the null count.
+		 */
+		if (isnull)
+		{
+			null_cnt++;
+			continue;
+		}
+
+		/*
+		 * Add up widths for average-width calculation. Since it's a
+		 * tsvector, we know it's varlena. As in the regular
+		 * compute_minimal_stats function, we use the toasted width for this
+		 * calculation.
+		 */
+		total_width += VARSIZE_ANY(DatumGetPointer(value));
+
+		/*
+		 * Now detoast the tsvector if needed.
+		 */
+		vector = DatumGetTSVector(value);
+
+		/*
+		 * We loop through the lexemes in the tsvector and add them to our
+		 * tracking hashtable.
+		 */
+		lexemesptr = STRPTR(vector);
+		curentryptr = ARRPTR(vector);
+		for (j = 0; j < vector->size; j++)
+		{
+			bool		found;
+
+			/*
+			 * Construct a hash key. The key points into the (detoasted)
+			 * tsvector value at this point, but if a new entry is created, we
+			 * make a copy of it. This way we can free the tsvector value
+			 * once we've processed all its lexemes.
+			 */
+			hash_key.lexeme = lexemesptr + curentryptr->pos;
+			hash_key.length = curentryptr->len;
+
+			/* Lookup current lexeme in hashtable, adding it if new */
+			item = (TrackItem *) hash_search(lexemes_tab,
+											 (const void *) &hash_key,
+											 HASH_ENTER, &found);
+
+			if (found)
+			{
+				/* The lexeme is already on the tracking list */
+				item->frequency++;
+			}
+			else
+			{
+				/* Initialize new tracking list element */
+				item->frequency = 1;
+				item->delta = b_current - 1;
+
+				item->key.lexeme = palloc(hash_key.length);
+				memcpy(item->key.lexeme, hash_key.lexeme, hash_key.length);
+			}
+
+			/* lexeme_no is the number of elements processed (i.e., N) */
+			lexeme_no++;
+
+			/* We prune the D structure after processing each bucket */
+			if (lexeme_no % bucket_width == 0)
+			{
+				prune_lexemes_hashtable(lexemes_tab, b_current);
+				b_current++;
+			}
+
+			/* Advance to the next WordEntry in the tsvector */
+			curentryptr++;
+		}
+
+		/* If the vector was toasted, free the detoasted copy. */
+		if (TSVectorGetDatum(vector) != value)
+			pfree(vector);
+	}
+
+	/* We can only compute real stats if we found some non-null values. */
+	if (null_cnt < samplerows)
+	{
+		int			nonnull_cnt = samplerows - null_cnt;
+		int			i;
+		TrackItem **sort_table;
+		int			track_len;
+		int			cutoff_freq;
+		int			minfreq,
+					maxfreq;
+
+		stats->stats_valid = true;
+		/* Do the simple null-frac and average width stats */
+		stats->stanullfrac = (double) null_cnt / (double) samplerows;
+		stats->stawidth = total_width / (double) nonnull_cnt;
+
+		/* Assume it's a unique column (see notes above) */
+		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
+
+		/*
+		 * Construct an array of the interesting hashtable items, that is,
+		 * those meeting the cutoff frequency (s - epsilon)*N. Also identify
+		 * the minimum and maximum frequencies among these items.
+		 *
+		 * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
+		 * frequency is 9*N / bucket_width.
+		 */
+		cutoff_freq = 9 * lexeme_no / bucket_width;
+
+		i = hash_get_num_entries(lexemes_tab); /* surely enough space */
+		sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i);
+
+		hash_seq_init(&scan_status, lexemes_tab);
+		track_len = 0;
+		minfreq = lexeme_no;
+		maxfreq = 0;
+		while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
+		{
+			if (item->frequency > cutoff_freq)
+			{
+				sort_table[track_len++] = item;
+				minfreq = Min(minfreq, item->frequency);
+				maxfreq = Max(maxfreq, item->frequency);
+			}
+		}
+		Assert(track_len <= i);
+
+		/* emit some statistics for debug purposes */
+		elog(DEBUG3, "tsvector_stats: target # mces = %d, bucket width = %d, "
+			 "# lexemes = %d, hashtable size = %d, usable entries = %d",
+			 num_mcelem, bucket_width, lexeme_no, i, track_len);
+
+		/*
+		 * If we obtained more lexemes than we really want, get rid of those
+		 * with least frequencies. The easiest way is to qsort the array into
+		 * descending frequency order and truncate the array.
+		 */
+		if (num_mcelem < track_len)
+		{
+			qsort_interruptible(sort_table, track_len, sizeof(TrackItem *),
+								trackitem_compare_frequencies_desc, NULL);
+			/* reset minfreq to the smallest frequency we're keeping */
+			minfreq = sort_table[num_mcelem - 1]->frequency;
+		}
+		else
+			num_mcelem = track_len;
+
+		/* Generate MCELEM slot entry */
+		if (num_mcelem > 0)
+		{
+			MemoryContext old_context;
+			Datum	   *mcelem_values;
+			float4	   *mcelem_freqs;
+
+			/*
+			 * We want to store statistics sorted on the lexeme value using
+			 * first length, then byte-for-byte comparison. The reason for
+			 * doing length comparison first is that we don't care about the
+			 * ordering so long as it's consistent, and comparing lengths
+			 * first gives us a chance to avoid a strncmp() call.
+			 *
+			 * This is different from what we do with scalar statistics --
+			 * they get sorted on frequencies. The rationale is that we
+			 * usually search through most common elements looking for a
+			 * specific value, so we can grab its frequency. When values are
+			 * presorted we can employ binary search for that. See
+			 * ts_selfuncs.c for a real usage scenario.
+			 */
+			qsort_interruptible(sort_table, num_mcelem, sizeof(TrackItem *),
+								trackitem_compare_lexemes, NULL);
+
+			/* Must copy the target values into anl_context */
+			old_context = MemoryContextSwitchTo(stats->anl_context);
+
+			/*
+			 * We sorted statistics on the lexeme value, but we want to be
+			 * able to find out the minimal and maximal frequency without
+			 * going through all the values. We keep those two extra
+			 * frequencies in two extra cells in mcelem_freqs.
+			 *
+			 * (Note: the MCELEM statistics slot definition allows for a third
+			 * extra number containing the frequency of nulls, but we don't
+			 * create that for a tsvector column, since null elements aren't
+			 * possible.)
+			 */
+			mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
+			mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4));
+
+			/*
+			 * See comments above about use of nonnull_cnt as the divisor for
+			 * the final frequency estimates.
+			 */
+			for (i = 0; i < num_mcelem; i++)
+			{
+				TrackItem  *item = sort_table[i];
+
+				mcelem_values[i] =
+					PointerGetDatum(cstring_to_text_with_len(item->key.lexeme,
+															 item->key.length));
+				mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt;
+			}
+			mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
+			mcelem_freqs[i] = (double) maxfreq / (double) nonnull_cnt;
+			MemoryContextSwitchTo(old_context);
+
+			stats->stakind[0] = STATISTIC_KIND_MCELEM;
+			stats->staop[0] = TextEqualOperator;
+			stats->stacoll[0] = DEFAULT_COLLATION_OID;
+			stats->stanumbers[0] = mcelem_freqs;
+			/* See above comment about two extra frequency fields */
+			stats->numnumbers[0] = num_mcelem + 2;
+			stats->stavalues[0] = mcelem_values;
+			stats->numvalues[0] = num_mcelem;
+			/* We are storing text values */
+			stats->statypid[0] = TEXTOID;
+			stats->statyplen[0] = -1;	/* typlen, -1 for varlena */
+			stats->statypbyval[0] = false;
+			stats->statypalign[0] = 'i';
+		}
+	}
+	else
+	{
+		/* We found only nulls; assume the column is entirely null */
+		stats->stats_valid = true;
+		stats->stanullfrac = 1.0;
+		stats->stawidth = 0;		/* "unknown" */
+		stats->stadistinct = 0.0;	/* "unknown" */
+	}
+
+	/*
+	 * We don't need to bother cleaning up any of our temporary palloc's. The
+	 * hashtable should also go away, as it used a child memory context.
+	 */
+}
+
+/*
+ * A function to prune the D structure from the Lossy Counting algorithm.
+ * Consult compute_tsvector_stats() for wider explanation.
+ */
+static void
+prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current)
+{
+	HASH_SEQ_STATUS scan_status;
+	TrackItem  *item;
+
+	hash_seq_init(&scan_status, lexemes_tab);
+	while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
+	{
+		if (item->frequency + item->delta <= b_current)
+		{
+			char	   *lexeme = item->key.lexeme;
+
+			if (hash_search(lexemes_tab, (const void *) &item->key,
+							HASH_REMOVE, NULL) == NULL)
+				elog(ERROR, "hash table corrupted");
+			pfree(lexeme);
+		}
+	}
+}
+
+/*
+ * Hash functions for lexemes. They are strings, but not NUL-terminated,
+ * so we need a special hash function.
+ */
+static uint32
+lexeme_hash(const void *key, Size keysize)
+{
+	const LexemeHashKey *l = (const LexemeHashKey *) key;
+
+	return DatumGetUInt32(hash_any((const unsigned char *) l->lexeme,
+								   l->length));
+}
+
+/*
+ * Matching function for lexemes, to be used in hashtable lookups.
+ */
+static int
+lexeme_match(const void *key1, const void *key2, Size keysize)
+{
+	/* The keysize parameter is superfluous, the keys store their lengths */
+	return lexeme_compare(key1, key2);
+}
+
+/*
+ * Comparison function for lexemes.
+ */
+static int
+lexeme_compare(const void *key1, const void *key2)
+{
+	const LexemeHashKey *d1 = (const LexemeHashKey *) key1;
+	const LexemeHashKey *d2 = (const LexemeHashKey *) key2;
+
+	/* First, compare by length */
+	if (d1->length > d2->length)
+		return 1;
+	else if (d1->length < d2->length)
+		return -1;
+	/* Lengths are equal, do a byte-by-byte comparison */
+	return strncmp(d1->lexeme, d2->lexeme, d1->length);
+}
+
+/*
+ * Comparator for sorting TrackItems on frequencies (descending sort)
+ */
+static int
+trackitem_compare_frequencies_desc(const void *e1, const void *e2, void *arg)
+{
+	const TrackItem *const *t1 = (const TrackItem *const *) e1;
+	const TrackItem *const *t2 = (const TrackItem *const *) e2;
+
+	return (*t2)->frequency - (*t1)->frequency;
+}
+
+/*
+ * Comparator for sorting TrackItems on lexemes
+ */
+static int
+trackitem_compare_lexemes(const void *e1, const void *e2, void *arg)
+{
+	const TrackItem *const *t1 = (const TrackItem *const *) e1;
+	const TrackItem *const *t2 = (const TrackItem *const *) e2;
+
+	return lexeme_compare(&(*t1)->key, &(*t2)->key);
+}
diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c
new file mode 100644
index 0000000..ed16a2e
--- /dev/null
+++ b/src/backend/tsearch/ts_utils.c
@@ -0,0 +1,146 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_utils.c
+ *		various support functions
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/tsearch/ts_utils.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <ctype.h>
+
+#include "miscadmin.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+
+
+/*
+ * Given the base name and extension of a tsearch config file, return
+ * its full path name. The base name is assumed to be user-supplied,
+ * and is checked to prevent pathname attacks. 
The extension is assumed
+ * to be safe.
+ *
+ * The result is a palloc'd string.
+ */
+char *
+get_tsearch_config_filename(const char *basename,
+							const char *extension)
+{
+	char		sharepath[MAXPGPATH];
+	char	   *result;
+
+	/*
+	 * We limit the basename to contain a-z, 0-9, and underscores. This may
+	 * be overly restrictive, but we don't want to allow access to anything
+	 * outside the tsearch_data directory, so for instance '/' *must* be
+	 * rejected, and on some platforms '\' and ':' are risky as well. Allowing
+	 * uppercase might result in incompatible behavior between case-sensitive
+	 * and case-insensitive filesystems, and non-ASCII characters create other
+	 * interesting risks, so on the whole a tight policy seems best.
+	 */
+	if (strspn(basename, "abcdefghijklmnopqrstuvwxyz0123456789_") != strlen(basename))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("invalid text search configuration file name \"%s\"",
+						basename)));
+
+	get_share_path(my_exec_path, sharepath);
+	result = palloc(MAXPGPATH);
+	snprintf(result, MAXPGPATH, "%s/tsearch_data/%s.%s",
+			 sharepath, basename, extension);
+
+	return result;
+}
+
+/*
+ * Reads a stop-word file. Each word is run through 'wordop'
+ * function, if given. wordop may either modify the input in-place,
+ * or palloc a new version.
+ */
+void
+readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *))
+{
+	char	  **stop = NULL;
+
+	s->len = 0;
+	if (fname && *fname)
+	{
+		char	   *filename = get_tsearch_config_filename(fname, "stop");
+		tsearch_readline_state trst;
+		char	   *line;
+		int			reallen = 0;
+
+		if (!tsearch_readline_begin(&trst, filename))
+			ereport(ERROR,
+					(errcode(ERRCODE_CONFIG_FILE_ERROR),
+					 errmsg("could not open stop-word file \"%s\": %m",
+							filename)));
+
+		while ((line = tsearch_readline(&trst)) != NULL)
+		{
+			char	   *pbuf = line;
+
+			/* Truncate the word at the first whitespace character */
+			while (*pbuf && !t_isspace(pbuf))
+				pbuf += pg_mblen(pbuf);
+			*pbuf = '\0';
+
+			/* Skip empty lines */
+			if (*line == '\0')
+			{
+				pfree(line);
+				continue;
+			}
+
+			if (s->len >= reallen)
+			{
+				if (reallen == 0)
+				{
+					reallen = 64;
+					stop = (char **) palloc(sizeof(char *) * reallen);
+				}
+				else
+				{
+					reallen *= 2;
+					stop = (char **) repalloc((void *) stop,
+											  sizeof(char *) * reallen);
+				}
+			}
+
+			if (wordop)
+			{
+				stop[s->len] = wordop(line);
+				if (stop[s->len] != line)
+					pfree(line);
+			}
+			else
+				stop[s->len] = line;
+
+			(s->len)++;
+		}
+
+		tsearch_readline_end(&trst);
+		pfree(filename);
+	}
+
+	s->stop = stop;
+
+	/* Sort to allow binary searching */
+	if (s->stop && s->len > 0)
+		qsort(s->stop, s->len, sizeof(char *), pg_qsort_strcmp);
+}
+
+bool
+searchstoplist(StopList *s, char *key)
+{
+	return (s->stop && s->len > 0 &&
+			bsearch(&key, s->stop, s->len,
+					sizeof(char *), pg_qsort_strcmp)) ?
true : false;
+}
diff --git a/src/backend/tsearch/wparser.c b/src/backend/tsearch/wparser.c
new file mode 100644
index 0000000..71882dc
--- /dev/null
+++ b/src/backend/tsearch/wparser.c
@@ -0,0 +1,549 @@
+/*-------------------------------------------------------------------------
+ *
+ * wparser.c
+ *		Standard interface to word parser
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/tsearch/wparser.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/namespace.h"
+#include "catalog/pg_type.h"
+#include "commands/defrem.h"
+#include "common/jsonapi.h"
+#include "funcapi.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_utils.h"
+#include "utils/builtins.h"
+#include "utils/jsonfuncs.h"
+#include "utils/varlena.h"
+
+/****** SQL-level interface ******/
+
+typedef struct
+{
+	int			cur;
+	LexDescr   *list;
+} TSTokenTypeStorage;
+
+/* state for ts_headline_json_* */
+typedef struct HeadlineJsonState
+{
+	HeadlineParsedText *prs;
+	TSConfigCacheEntry *cfg;
+	TSParserCacheEntry *prsobj;
+	TSQuery		query;
+	List	   *prsoptions;
+	bool		transformed;
+} HeadlineJsonState;
+
+static text *headline_json_value(void *_state, char *elem_value, int elem_len);
+
+static void
+tt_setup_firstcall(FuncCallContext *funcctx, Oid prsid)
+{
+	TupleDesc	tupdesc;
+	MemoryContext oldcontext;
+	TSTokenTypeStorage *st;
+	TSParserCacheEntry *prs = lookup_ts_parser_cache(prsid);
+
+	if (!OidIsValid(prs->lextypeOid))
+		elog(ERROR, "method lextype isn't defined for text search parser %u",
+			 prsid);
+
+	oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+	st = (TSTokenTypeStorage *) palloc(sizeof(TSTokenTypeStorage));
+	st->cur = 0;
+	/* lextype takes one dummy argument */
+	st->list = (LexDescr *) DatumGetPointer(OidFunctionCall1(prs->lextypeOid,
+															 (Datum) 0));
+	funcctx->user_fctx = (void *) st;
+
+	tupdesc = CreateTemplateTupleDesc(3);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 1, "tokid",
+					   INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 2, "alias",
+					   TEXTOID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 3, "description",
+					   TEXTOID, -1, 0);
+
+	funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
+	MemoryContextSwitchTo(oldcontext);
+}
+
+static Datum
+tt_process_call(FuncCallContext *funcctx)
+{
+	TSTokenTypeStorage *st;
+
+	st = (TSTokenTypeStorage *) funcctx->user_fctx;
+	if (st->list && st->list[st->cur].lexid)
+	{
+		Datum		result;
+		char	   *values[3];
+		char		txtid[16];
+		HeapTuple	tuple;
+
+		sprintf(txtid, "%d", st->list[st->cur].lexid);
+		values[0] = txtid;
+		values[1] = st->list[st->cur].alias;
+		values[2] = st->list[st->cur].descr;
+
+		tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
+		result = HeapTupleGetDatum(tuple);
+
+		pfree(values[1]);
+		pfree(values[2]);
+		st->cur++;
+		return result;
+	}
+	return (Datum) 0;
+}
+
+Datum
+ts_token_type_byid(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	Datum		result;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		funcctx = SRF_FIRSTCALL_INIT();
+		tt_setup_firstcall(funcctx, PG_GETARG_OID(0));
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	if ((result = tt_process_call(funcctx)) != (Datum) 0)
+		SRF_RETURN_NEXT(funcctx, result);
+	SRF_RETURN_DONE(funcctx);
+}
+
+Datum
+ts_token_type_byname(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	Datum		result;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		text	   *prsname = PG_GETARG_TEXT_PP(0);
+		Oid			prsId;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+		prsId = get_ts_parser_oid(textToQualifiedNameList(prsname), false);
+		tt_setup_firstcall(funcctx, prsId);
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	if ((result = tt_process_call(funcctx)) != (Datum) 0)
+		SRF_RETURN_NEXT(funcctx, result);
+	SRF_RETURN_DONE(funcctx);
+}
+
+typedef struct
+{
+	int			type;
+	char	   *lexeme;
+} LexemeEntry;
+
+typedef struct
+{
+	int			cur;
+	int			len;
+	LexemeEntry *list;
+} PrsStorage;
+
+
+static void
+prs_setup_firstcall(FuncCallContext *funcctx, Oid prsid, text *txt)
+{
+	TupleDesc	tupdesc;
+	MemoryContext oldcontext;
+	PrsStorage *st;
+	TSParserCacheEntry *prs = lookup_ts_parser_cache(prsid);
+	char	   *lex = NULL;
+	int			llen = 0,
+				type = 0;
+	void	   *prsdata;
+
+	oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+	st = (PrsStorage *) palloc(sizeof(PrsStorage));
+	st->cur = 0;
+	st->len = 16;
+	st->list = (LexemeEntry *) palloc(sizeof(LexemeEntry) * st->len);
+
+	prsdata = (void *) DatumGetPointer(FunctionCall2(&prs->prsstart,
+													 PointerGetDatum(VARDATA_ANY(txt)),
+													 Int32GetDatum(VARSIZE_ANY_EXHDR(txt))));
+
+	while ((type = DatumGetInt32(FunctionCall3(&prs->prstoken,
+											   PointerGetDatum(prsdata),
+											   PointerGetDatum(&lex),
+											   PointerGetDatum(&llen)))) != 0)
+	{
+		if (st->cur >= st->len)
+		{
+			st->len = 2 * st->len;
+			st->list = (LexemeEntry *) repalloc(st->list, sizeof(LexemeEntry) * st->len);
+		}
+		st->list[st->cur].lexeme = palloc(llen + 1);
+		memcpy(st->list[st->cur].lexeme, lex, llen);
+		st->list[st->cur].lexeme[llen] = '\0';
+		st->list[st->cur].type = type;
+		st->cur++;
+	}
+
+	FunctionCall1(&prs->prsend, PointerGetDatum(prsdata));
+
+	st->len = st->cur;
+	st->cur = 0;
+
+	funcctx->user_fctx = (void *) st;
+	tupdesc = CreateTemplateTupleDesc(2);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 1, "tokid",
+					   INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 2, "token",
+					   TEXTOID, -1, 0);
+
+	funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
+	MemoryContextSwitchTo(oldcontext);
+}
+
+static Datum
+prs_process_call(FuncCallContext *funcctx)
+{
+	PrsStorage *st;
+
+	st = (PrsStorage *) funcctx->user_fctx;
+	if (st->cur < st->len)
+	{
+		Datum		result;
+		char	   *values[2];
+		char		tid[16];
+		HeapTuple	tuple;
+
+		values[0] = tid;
+		sprintf(tid, "%d", st->list[st->cur].type);
+		values[1] = st->list[st->cur].lexeme;
+		tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
+		result = HeapTupleGetDatum(tuple);
+
+		pfree(values[1]);
+		st->cur++;
+		return result;
+	}
+	return (Datum) 0;
+}
+
+Datum
+ts_parse_byid(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	Datum		result;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		text	   *txt = PG_GETARG_TEXT_PP(1);
+
+		funcctx = SRF_FIRSTCALL_INIT();
+		prs_setup_firstcall(funcctx, PG_GETARG_OID(0), txt);
+		PG_FREE_IF_COPY(txt, 1);
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	if ((result = prs_process_call(funcctx)) != (Datum) 0)
+		SRF_RETURN_NEXT(funcctx, result);
+	SRF_RETURN_DONE(funcctx);
+}
+
+Datum
+ts_parse_byname(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	Datum		result;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		text	   *prsname = PG_GETARG_TEXT_PP(0);
+		text	   *txt = PG_GETARG_TEXT_PP(1);
+		Oid			prsId;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+		prsId = get_ts_parser_oid(textToQualifiedNameList(prsname), false);
+		prs_setup_firstcall(funcctx, prsId, txt);
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	if ((result = prs_process_call(funcctx)) != (Datum) 0)
+		SRF_RETURN_NEXT(funcctx, result);
+	SRF_RETURN_DONE(funcctx);
+}
+
+Datum
+ts_headline_byid_opt(PG_FUNCTION_ARGS)
+{
+	Oid			tsconfig = PG_GETARG_OID(0);
+	text	   *in = PG_GETARG_TEXT_PP(1);
+	TSQuery		query = PG_GETARG_TSQUERY(2);
+	text	   *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ? PG_GETARG_TEXT_PP(3) : NULL;
+	HeadlineParsedText prs;
+	List	   *prsoptions;
+	text	   *out;
+	TSConfigCacheEntry *cfg;
+	TSParserCacheEntry *prsobj;
+
+	cfg = lookup_ts_config_cache(tsconfig);
+	prsobj = lookup_ts_parser_cache(cfg->prsId);
+
+	if (!OidIsValid(prsobj->headlineOid))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("text search parser does not support headline creation")));
+
+	memset(&prs, 0, sizeof(HeadlineParsedText));
+	prs.lenwords = 32;
+	prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords);
+
+	hlparsetext(cfg->cfgId, &prs, query,
+				VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
+
+	if (opt)
+		prsoptions = deserialize_deflist(PointerGetDatum(opt));
+	else
+		prsoptions = NIL;
+
+	FunctionCall3(&(prsobj->prsheadline),
+				  PointerGetDatum(&prs),
+				  PointerGetDatum(prsoptions),
+				  PointerGetDatum(query));
+
+	out = generateHeadline(&prs);
+
+	PG_FREE_IF_COPY(in, 1);
+	PG_FREE_IF_COPY(query, 2);
+	if (opt)
+		PG_FREE_IF_COPY(opt, 3);
+	pfree(prs.words);
+	pfree(prs.startsel);
+	pfree(prs.stopsel);
+
+	PG_RETURN_POINTER(out);
+}
+
+Datum
+ts_headline_byid(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_byid_opt,
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1),
+										PG_GETARG_DATUM(2)));
+}
+
+Datum
+ts_headline(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_byid_opt,
+										ObjectIdGetDatum(getTSCurrentConfig(true)),
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1)));
+}
+
+Datum
+ts_headline_opt(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_byid_opt,
+										ObjectIdGetDatum(getTSCurrentConfig(true)),
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1),
+										PG_GETARG_DATUM(2)));
+}
+
+Datum
+ts_headline_jsonb_byid_opt(PG_FUNCTION_ARGS)
+{
+	Oid			tsconfig = PG_GETARG_OID(0);
+	Jsonb	   *jb = PG_GETARG_JSONB_P(1);
+	TSQuery		query = PG_GETARG_TSQUERY(2);
+	text	   *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ? PG_GETARG_TEXT_P(3) : NULL;
+	Jsonb	   *out;
+	JsonTransformStringValuesAction action = (JsonTransformStringValuesAction) headline_json_value;
+	HeadlineParsedText prs;
+	HeadlineJsonState *state = palloc0(sizeof(HeadlineJsonState));
+
+	memset(&prs, 0, sizeof(HeadlineParsedText));
+	prs.lenwords = 32;
+	prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords);
+
+	state->prs = &prs;
+	state->cfg = lookup_ts_config_cache(tsconfig);
+	state->prsobj = lookup_ts_parser_cache(state->cfg->prsId);
+	state->query = query;
+	if (opt)
+		state->prsoptions = deserialize_deflist(PointerGetDatum(opt));
+	else
+		state->prsoptions = NIL;
+
+	if (!OidIsValid(state->prsobj->headlineOid))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("text search parser does not support headline creation")));
+
+	out = transform_jsonb_string_values(jb, state, action);
+
+	PG_FREE_IF_COPY(jb, 1);
+	PG_FREE_IF_COPY(query, 2);
+	if (opt)
+		PG_FREE_IF_COPY(opt, 3);
+
+	pfree(prs.words);
+
+	if (state->transformed)
+	{
+		pfree(prs.startsel);
+		pfree(prs.stopsel);
+	}
+
+	PG_RETURN_JSONB_P(out);
+}
+
+Datum
+ts_headline_jsonb(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_jsonb_byid_opt,
+										ObjectIdGetDatum(getTSCurrentConfig(true)),
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1)));
+}
+
+Datum
+ts_headline_jsonb_byid(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_jsonb_byid_opt,
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1),
+										PG_GETARG_DATUM(2)));
+}
+
+Datum
+ts_headline_jsonb_opt(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_jsonb_byid_opt,
+										ObjectIdGetDatum(getTSCurrentConfig(true)),
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1),
+										PG_GETARG_DATUM(2)));
+}
+
+Datum
+ts_headline_json_byid_opt(PG_FUNCTION_ARGS)
+{
+	Oid			tsconfig = PG_GETARG_OID(0);
+	text	   *json = PG_GETARG_TEXT_P(1);
+	TSQuery		query = PG_GETARG_TSQUERY(2);
+	text	   *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ? PG_GETARG_TEXT_P(3) : NULL;
+	text	   *out;
+	JsonTransformStringValuesAction action = (JsonTransformStringValuesAction) headline_json_value;
+
+	HeadlineParsedText prs;
+	HeadlineJsonState *state = palloc0(sizeof(HeadlineJsonState));
+
+	memset(&prs, 0, sizeof(HeadlineParsedText));
+	prs.lenwords = 32;
+	prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords);
+
+	state->prs = &prs;
+	state->cfg = lookup_ts_config_cache(tsconfig);
+	state->prsobj = lookup_ts_parser_cache(state->cfg->prsId);
+	state->query = query;
+	if (opt)
+		state->prsoptions = deserialize_deflist(PointerGetDatum(opt));
+	else
+		state->prsoptions = NIL;
+
+	if (!OidIsValid(state->prsobj->headlineOid))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("text search parser does not support headline creation")));
+
+	out = transform_json_string_values(json, state, action);
+
+	PG_FREE_IF_COPY(json, 1);
+	PG_FREE_IF_COPY(query, 2);
+	if (opt)
+		PG_FREE_IF_COPY(opt, 3);
+	pfree(prs.words);
+
+	if (state->transformed)
+	{
+		pfree(prs.startsel);
+		pfree(prs.stopsel);
+	}
+
+	PG_RETURN_TEXT_P(out);
+}
+
+Datum
+ts_headline_json(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_json_byid_opt,
+										ObjectIdGetDatum(getTSCurrentConfig(true)),
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1)));
+}
+
+Datum
+ts_headline_json_byid(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_json_byid_opt,
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1),
+										PG_GETARG_DATUM(2)));
+}
+
+Datum
+ts_headline_json_opt(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_json_byid_opt,
+										ObjectIdGetDatum(getTSCurrentConfig(true)),
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1),
+										PG_GETARG_DATUM(2)));
+}
+
+
+/*
+ * Return headline in text form, generated from a json(b) element
+ */
+static text *
+headline_json_value(void *_state, char *elem_value, int elem_len)
+{
+	HeadlineJsonState *state = (HeadlineJsonState *) _state;
+
+	HeadlineParsedText
*prs = state->prs;
+	TSConfigCacheEntry *cfg = state->cfg;
+	TSParserCacheEntry *prsobj = state->prsobj;
+	TSQuery		query = state->query;
+	List	   *prsoptions = state->prsoptions;
+
+	prs->curwords = 0;
+	hlparsetext(cfg->cfgId, prs, query, elem_value, elem_len);
+	FunctionCall3(&(prsobj->prsheadline),
+				  PointerGetDatum(prs),
+				  PointerGetDatum(prsoptions),
+				  PointerGetDatum(query));
+
+	state->transformed = true;
+	return generateHeadline(prs);
+}
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
new file mode 100644
index 0000000..559dff6
--- /dev/null
+++ b/src/backend/tsearch/wparser_def.c
@@ -0,0 +1,2634 @@
+/*-------------------------------------------------------------------------
+ *
+ * wparser_def.c
+ *		Default text search parser
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/tsearch/wparser_def.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "catalog/pg_collation.h"
+#include "commands/defrem.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+#include "tsearch/ts_type.h"
+#include "tsearch/ts_utils.h"
+#include "utils/builtins.h"
+
+
+/* Define this to enable tracing of parser behavior */
+/* #define WPARSER_TRACE */
+
+
+/* Output token categories */
+
+#define ASCIIWORD		1
+#define WORD_T			2
+#define NUMWORD			3
+#define EMAIL			4
+#define URL_T			5
+#define HOST			6
+#define SCIENTIFIC		7
+#define VERSIONNUMBER	8
+#define NUMPARTHWORD	9
+#define PARTHWORD		10
+#define ASCIIPARTHWORD	11
+#define SPACE			12
+#define TAG_T			13
+#define PROTOCOL		14
+#define NUMHWORD		15
+#define ASCIIHWORD		16
+#define HWORD			17
+#define URLPATH			18
+#define FILEPATH		19
+#define DECIMAL_T		20
+#define SIGNEDINT		21
+#define UNSIGNEDINT		22
+#define XMLENTITY		23
+
+#define LASTNUM			23
+
+static const char *const tok_alias[] = {
+	"",
+	"asciiword",
+	"word",
+	"numword",
+	"email",
+	"url",
+	
"host",
+	"sfloat",
+	"version",
+	"hword_numpart",
+	"hword_part",
+	"hword_asciipart",
+	"blank",
+	"tag",
+	"protocol",
+	"numhword",
+	"asciihword",
+	"hword",
+	"url_path",
+	"file",
+	"float",
+	"int",
+	"uint",
+	"entity"
+};
+
+static const char *const lex_descr[] = {
+	"",
+	"Word, all ASCII",
+	"Word, all letters",
+	"Word, letters and digits",
+	"Email address",
+	"URL",
+	"Host",
+	"Scientific notation",
+	"Version number",
+	"Hyphenated word part, letters and digits",
+	"Hyphenated word part, all letters",
+	"Hyphenated word part, all ASCII",
+	"Space symbols",
+	"XML tag",
+	"Protocol head",
+	"Hyphenated word, letters and digits",
+	"Hyphenated word, all ASCII",
+	"Hyphenated word, all letters",
+	"URL path",
+	"File or path name",
+	"Decimal notation",
+	"Signed integer",
+	"Unsigned integer",
+	"XML entity"
+};
+
+
+/* Parser states */
+
+typedef enum
+{
+	TPS_Base = 0,
+	TPS_InNumWord,
+	TPS_InAsciiWord,
+	TPS_InWord,
+	TPS_InUnsignedInt,
+	TPS_InSignedIntFirst,
+	TPS_InSignedInt,
+	TPS_InSpace,
+	TPS_InUDecimalFirst,
+	TPS_InUDecimal,
+	TPS_InDecimalFirst,
+	TPS_InDecimal,
+	TPS_InVerVersion,
+	TPS_InSVerVersion,
+	TPS_InVersionFirst,
+	TPS_InVersion,
+	TPS_InMantissaFirst,
+	TPS_InMantissaSign,
+	TPS_InMantissa,
+	TPS_InXMLEntityFirst,
+	TPS_InXMLEntity,
+	TPS_InXMLEntityNumFirst,
+	TPS_InXMLEntityNum,
+	TPS_InXMLEntityHexNumFirst,
+	TPS_InXMLEntityHexNum,
+	TPS_InXMLEntityEnd,
+	TPS_InTagFirst,
+	TPS_InXMLBegin,
+	TPS_InTagCloseFirst,
+	TPS_InTagName,
+	TPS_InTagBeginEnd,
+	TPS_InTag,
+	TPS_InTagEscapeK,
+	TPS_InTagEscapeKK,
+	TPS_InTagBackSleshed,
+	TPS_InTagEnd,
+	TPS_InCommentFirst,
+	TPS_InCommentLast,
+	TPS_InComment,
+	TPS_InCloseCommentFirst,
+	TPS_InCloseCommentLast,
+	TPS_InCommentEnd,
+	TPS_InHostFirstDomain,
+	TPS_InHostDomainSecond,
+	TPS_InHostDomain,
+	TPS_InPortFirst,
+	TPS_InPort,
+	TPS_InHostFirstAN,
+	TPS_InHost,
+	TPS_InEmail,
+	TPS_InFileFirst,
+	TPS_InFileTwiddle,
+	TPS_InPathFirst,
+	TPS_InPathFirstFirst,
+	TPS_InPathSecond,
+	TPS_InFile,
+	TPS_InFileNext,
+	TPS_InURLPathFirst,
+	TPS_InURLPathStart,
+	TPS_InURLPath,
+	TPS_InFURL,
+	TPS_InProtocolFirst,
+	TPS_InProtocolSecond,
+	TPS_InProtocolEnd,
+	TPS_InHyphenAsciiWordFirst,
+	TPS_InHyphenAsciiWord,
+	TPS_InHyphenWordFirst,
+	TPS_InHyphenWord,
+	TPS_InHyphenNumWordFirst,
+	TPS_InHyphenNumWord,
+	TPS_InHyphenDigitLookahead,
+	TPS_InParseHyphen,
+	TPS_InParseHyphenHyphen,
+	TPS_InHyphenWordPart,
+	TPS_InHyphenAsciiWordPart,
+	TPS_InHyphenNumWordPart,
+	TPS_InHyphenUnsignedInt,
+	TPS_Null					/* last state (fake value) */
+} TParserState;
+
+/* forward declaration */
+struct TParser;
+
+typedef int (*TParserCharTest) (struct TParser *);	/* any p_is* functions
+													 * except p_iseq */
+typedef void (*TParserSpecial) (struct TParser *);	/* special handler for
+													 * special cases... */
+
+typedef struct
+{
+	TParserCharTest isclass;
+	char		c;
+	uint16		flags;
+	TParserState tostate;
+	int			type;
+	TParserSpecial special;
+} TParserStateActionItem;
+
+/* Flag bits in TParserStateActionItem.flags */
+#define A_NEXT		0x0000
+#define A_BINGO		0x0001
+#define A_POP		0x0002
+#define A_PUSH		0x0004
+#define A_RERUN		0x0008
+#define A_CLEAR		0x0010
+#define A_MERGE		0x0020
+#define A_CLRALL	0x0040
+
+typedef struct TParserPosition
+{
+	int			posbyte;		/* position of parser in bytes */
+	int			poschar;		/* position of parser in characters */
+	int			charlen;		/* length of current char */
+	int			lenbytetoken;	/* length of token-so-far in bytes */
+	int			lenchartoken;	/* and in chars */
+	TParserState state;
+	struct TParserPosition *prev;
+	const TParserStateActionItem *pushedAtAction;
+} TParserPosition;
+
+typedef struct TParser
+{
+	/* string and position information */
+	char	   *str;			/* multibyte string */
+	int			lenstr;			/* length of mbstring */
+	wchar_t    *wstr;			/* wide character string */
+	pg_wchar   *pgwstr;			/* wide character string for C-locale */
+	bool		usewide;
+
+	/* State of parse */
+	int			charmaxlen;
+	TParserPosition *state;
+	bool		ignore;
+	bool		wanthost;
+
+	/* silly char */
+	char		c;
+
+	/* output: current token and its classification */
+	char	   *token;
+	int			lenbytetoken;
+	int			lenchartoken;
+	int			type;
+} TParser;
+
+
+/* forward decls here */
+static bool TParserGet(TParser *prs);
+
+
+static TParserPosition *
+newTParserPosition(TParserPosition *prev)
+{
+	TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
+
+	if (prev)
+		memcpy(res, prev, sizeof(TParserPosition));
+	else
+		memset(res, 0, sizeof(TParserPosition));
+
+	res->prev = prev;
+
+	res->pushedAtAction = NULL;
+
+	return res;
+}
+
+static TParser *
+TParserInit(char *str, int len)
+{
+	TParser    *prs = (TParser *) palloc0(sizeof(TParser));
+
+	prs->charmaxlen = pg_database_encoding_max_length();
+	prs->str = str;
+	prs->lenstr = len;
+
+	/*
+	 * Use wide char code only when max encoding length > 1.
+	 */
+	if (prs->charmaxlen > 1)
+	{
+		Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
+		pg_locale_t mylocale = 0;	/* TODO */
+
+		prs->usewide = true;
+		if (lc_ctype_is_c(collation))
+		{
+			/*
+			 * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
+			 * be different from sizeof(wchar_t)
+			 */
+			prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
+			pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
+		}
+		else
+		{
+			prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
+			char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
+					   mylocale);
+		}
+	}
+	else
+		prs->usewide = false;
+
+	prs->state = newTParserPosition(NULL);
+	prs->state->state = TPS_Base;
+
+#ifdef WPARSER_TRACE
+	fprintf(stderr, "parsing \"%.*s\"\n", len, str);
+#endif
+
+	return prs;
+}
+
+/*
+ * As an alternative to a full TParserInit one can create a
+ * TParserCopy which basically is a regular TParser without a private
+ * copy of the string - instead it uses the one from another TParser.
+ * This is useful because at some places TParsers are created + * recursively and the repeated copying around of the strings can + * cause major inefficiency if the source string is long. + * The new parser starts parsing at the original's current position. + * + * Obviously one must not close the original TParser before the copy. + */ +static TParser * +TParserCopyInit(const TParser *orig) +{ + TParser *prs = (TParser *) palloc0(sizeof(TParser)); + + prs->charmaxlen = orig->charmaxlen; + prs->str = orig->str + orig->state->posbyte; + prs->lenstr = orig->lenstr - orig->state->posbyte; + prs->usewide = orig->usewide; + + if (orig->pgwstr) + prs->pgwstr = orig->pgwstr + orig->state->poschar; + if (orig->wstr) + prs->wstr = orig->wstr + orig->state->poschar; + + prs->state = newTParserPosition(NULL); + prs->state->state = TPS_Base; + +#ifdef WPARSER_TRACE + fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str); +#endif + + return prs; +} + + +static void +TParserClose(TParser *prs) +{ + while (prs->state) + { + TParserPosition *ptr = prs->state->prev; + + pfree(prs->state); + prs->state = ptr; + } + + if (prs->wstr) + pfree(prs->wstr); + if (prs->pgwstr) + pfree(prs->pgwstr); + +#ifdef WPARSER_TRACE + fprintf(stderr, "closing parser\n"); +#endif + pfree(prs); +} + +/* + * Close a parser created with TParserCopyInit + */ +static void +TParserCopyClose(TParser *prs) +{ + while (prs->state) + { + TParserPosition *ptr = prs->state->prev; + + pfree(prs->state); + prs->state = ptr; + } + +#ifdef WPARSER_TRACE + fprintf(stderr, "closing parser copy\n"); +#endif + pfree(prs); +} + + +/* + * Character-type support functions, equivalent to is* macros, but + * working with any possible encodings and locales. Notes: + * - with multibyte encoding and C-locale isw* function may fail + * or give wrong result. + * - multibyte encoding and C-locale often are used for + * Asian languages. + * - if locale is C then we use pgwstr instead of wstr. 
+ */ + +#define p_iswhat(type, nonascii) \ + \ +static int \ +p_is##type(TParser *prs) \ +{ \ + Assert(prs->state); \ + if (prs->usewide) \ + { \ + if (prs->pgwstr) \ + { \ + unsigned int c = *(prs->pgwstr + prs->state->poschar); \ + if (c > 0x7f) \ + return nonascii; \ + return is##type(c); \ + } \ + return isw##type(*(prs->wstr + prs->state->poschar)); \ + } \ + return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \ +} \ + \ +static int \ +p_isnot##type(TParser *prs) \ +{ \ + return !p_is##type(prs); \ +} + +/* + * In C locale with a multibyte encoding, any non-ASCII symbol is considered + * an alpha character, but not a member of other char classes. + */ +p_iswhat(alnum, 1) +p_iswhat(alpha, 1) +p_iswhat(digit, 0) +p_iswhat(lower, 0) +p_iswhat(print, 0) +p_iswhat(punct, 0) +p_iswhat(space, 0) +p_iswhat(upper, 0) +p_iswhat(xdigit, 0) + +/* p_iseq should be used only for ascii symbols */ + +static int +p_iseq(TParser *prs, char c) +{ + Assert(prs->state); + return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0; +} + +static int +p_isEOF(TParser *prs) +{ + Assert(prs->state); + return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0; +} + +static int +p_iseqC(TParser *prs) +{ + return p_iseq(prs, prs->c); +} + +static int +p_isneC(TParser *prs) +{ + return !p_iseq(prs, prs->c); +} + +static int +p_isascii(TParser *prs) +{ + return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0; +} + +static int +p_isasclet(TParser *prs) +{ + return (p_isascii(prs) && p_isalpha(prs)) ? 
1 : 0; +} + +static int +p_isurlchar(TParser *prs) +{ + char ch; + + /* no non-ASCII need apply */ + if (prs->state->charlen != 1) + return 0; + ch = *(prs->str + prs->state->posbyte); + /* no spaces or control characters */ + if (ch <= 0x20 || ch >= 0x7F) + return 0; + /* reject characters disallowed by RFC 3986 */ + switch (ch) + { + case '"': + case '<': + case '>': + case '\\': + case '^': + case '`': + case '{': + case '|': + case '}': + return 0; + } + return 1; +} + + +/* deliberately suppress unused-function complaints for the above */ +void _make_compiler_happy(void); +void +_make_compiler_happy(void) +{ + p_isalnum(NULL); + p_isnotalnum(NULL); + p_isalpha(NULL); + p_isnotalpha(NULL); + p_isdigit(NULL); + p_isnotdigit(NULL); + p_islower(NULL); + p_isnotlower(NULL); + p_isprint(NULL); + p_isnotprint(NULL); + p_ispunct(NULL); + p_isnotpunct(NULL); + p_isspace(NULL); + p_isnotspace(NULL); + p_isupper(NULL); + p_isnotupper(NULL); + p_isxdigit(NULL); + p_isnotxdigit(NULL); + p_isEOF(NULL); + p_iseqC(NULL); + p_isneC(NULL); +} + + +static void +SpecialTags(TParser *prs) +{ + switch (prs->state->lenchartoken) + { + case 8: /* token, "ignore = false; + break; + case 7: /*