summaryrefslogtreecommitdiffstats
path: root/src/backend/tsearch
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
commit46651ce6fe013220ed397add242004d764fc0153 (patch)
tree6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/tsearch
parentInitial commit. (diff)
downloadpostgresql-14-upstream.tar.xz
postgresql-14-upstream.zip
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/tsearch')
-rw-r--r--src/backend/tsearch/Makefile54
-rw-r--r--src/backend/tsearch/dict.c89
-rw-r--r--src/backend/tsearch/dict_ispell.c148
-rw-r--r--src/backend/tsearch/dict_simple.c105
-rw-r--r--src/backend/tsearch/dict_synonym.c241
-rw-r--r--src/backend/tsearch/dict_thesaurus.c877
-rw-r--r--src/backend/tsearch/dicts/hunspell_sample.affix23
-rw-r--r--src/backend/tsearch/dicts/hunspell_sample_long.affix53
-rw-r--r--src/backend/tsearch/dicts/hunspell_sample_long.dict11
-rw-r--r--src/backend/tsearch/dicts/hunspell_sample_num.affix33
-rw-r--r--src/backend/tsearch/dicts/hunspell_sample_num.dict9
-rw-r--r--src/backend/tsearch/dicts/ispell_sample.affix26
-rw-r--r--src/backend/tsearch/dicts/ispell_sample.dict8
-rw-r--r--src/backend/tsearch/dicts/synonym_sample.syn5
-rw-r--r--src/backend/tsearch/dicts/thesaurus_sample.ths17
-rw-r--r--src/backend/tsearch/regis.c257
-rw-r--r--src/backend/tsearch/spell.c2617
-rw-r--r--src/backend/tsearch/to_tsany.c724
-rw-r--r--src/backend/tsearch/ts_locale.c325
-rw-r--r--src/backend/tsearch/ts_parse.c667
-rw-r--r--src/backend/tsearch/ts_selfuncs.c453
-rw-r--r--src/backend/tsearch/ts_typanalyze.c536
-rw-r--r--src/backend/tsearch/ts_utils.c146
-rw-r--r--src/backend/tsearch/wparser.c549
-rw-r--r--src/backend/tsearch/wparser_def.c2634
25 files changed, 10607 insertions, 0 deletions
diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile
new file mode 100644
index 0000000..cdb259e
--- /dev/null
+++ b/src/backend/tsearch/Makefile
@@ -0,0 +1,54 @@
+#-------------------------------------------------------------------------
+#
+# Makefile for backend/tsearch
+#
+# Copyright (c) 2006-2021, PostgreSQL Global Development Group
+#
+# src/backend/tsearch/Makefile
+#
+#-------------------------------------------------------------------------
+subdir = src/backend/tsearch
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+# Subdirectory of $(datadir) into which the sample dictionary files go
+DICTDIR=tsearch_data
+
+# List of dictionaries files
+DICTFILES=synonym_sample.syn thesaurus_sample.ths \
+ hunspell_sample.affix \
+ ispell_sample.affix ispell_sample.dict \
+ hunspell_sample_long.affix hunspell_sample_long.dict \
+ hunspell_sample_num.affix hunspell_sample_num.dict
+
+# Local paths to dictionaries files
+DICTFILES_PATH=$(addprefix dicts/,$(DICTFILES))
+
+OBJS = \
+ dict.o \
+ dict_ispell.o \
+ dict_simple.o \
+ dict_synonym.o \
+ dict_thesaurus.o \
+ regis.o \
+ spell.o \
+ to_tsany.o \
+ ts_locale.o \
+ ts_parse.o \
+ ts_selfuncs.o \
+ ts_typanalyze.o \
+ wparser.o \
+ wparser_def.o
+
+# common.mk supplies the standard build/clean rules for backend subdirs
+include $(top_srcdir)/src/backend/common.mk
+
+# Copy the sample dictionary files into the installed data directory.
+.PHONY: install-data
+install-data: $(DICTFILES_PATH) installdirs
+ $(INSTALL_DATA) $(addprefix $(srcdir)/,$(DICTFILES_PATH)) '$(DESTDIR)$(datadir)/$(DICTDIR)/'
+
+# NOTE(review): installdirs is not marked .PHONY here --- presumably it is
+# declared phony by an included makefile; verify against common.mk.
+installdirs:
+ $(MKDIR_P) '$(DESTDIR)$(datadir)' '$(DESTDIR)$(datadir)/$(DICTDIR)'
+
+# addprefix prepends the quoted directory to each file name individually,
+# so each word expands to 'dir/'file, which the shell concatenates safely.
+.PHONY: uninstall-data
+uninstall-data:
+ rm -rf $(addprefix '$(DESTDIR)$(datadir)/$(DICTDIR)/',$(DICTFILES))
diff --git a/src/backend/tsearch/dict.c b/src/backend/tsearch/dict.c
new file mode 100644
index 0000000..1e1ccda
--- /dev/null
+++ b/src/backend/tsearch/dict.c
@@ -0,0 +1,89 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict.c
+ * Standard interface to dictionary
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/dict.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/pg_type.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_utils.h"
+#include "utils/builtins.h"
+
+
+/*
+ * Lexize one word by dictionary, mostly debug function
+ */
+Datum
+ts_lexize(PG_FUNCTION_ARGS)
+{
+ Oid dictId = PG_GETARG_OID(0);
+ text *in = PG_GETARG_TEXT_PP(1);
+ ArrayType *a;
+ TSDictionaryCacheEntry *dict;
+ TSLexeme *res,
+ *ptr;
+ Datum *da;
+ /* fresh per-call lexize state: {isend, getnext, private_state} */
+ DictSubState dstate = {false, false, NULL};
+
+ dict = lookup_ts_dictionary_cache(dictId);
+
+ /* first lexize call on the input word */
+ res = (TSLexeme *) DatumGetPointer(FunctionCall4(&dict->lexize,
+ PointerGetDatum(dict->dictData),
+ PointerGetDatum(VARDATA_ANY(in)),
+ Int32GetDatum(VARSIZE_ANY_EXHDR(in)),
+ PointerGetDatum(&dstate)));
+
+ if (dstate.getnext)
+ {
+ /*
+ * The dictionary asked to be called again (it keeps state across
+ * calls, e.g. for multi-word matching).  Call once more with isend
+ * set so it can emit what it has; keep the new result if non-NULL.
+ */
+ dstate.isend = true;
+ ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&dict->lexize,
+ PointerGetDatum(dict->dictData),
+ PointerGetDatum(VARDATA_ANY(in)),
+ Int32GetDatum(VARSIZE_ANY_EXHDR(in)),
+ PointerGetDatum(&dstate)));
+ if (ptr != NULL)
+ res = ptr;
+ }
+
+ /* NULL result means the word was not recognized at all */
+ if (!res)
+ PG_RETURN_NULL();
+
+ /* count lexemes in the NULL-terminated result array */
+ ptr = res;
+ while (ptr->lexeme)
+ ptr++;
+ da = (Datum *) palloc(sizeof(Datum) * (ptr - res));
+ ptr = res;
+ while (ptr->lexeme)
+ {
+ da[ptr - res] = CStringGetTextDatum(ptr->lexeme);
+ ptr++;
+ }
+
+ /* build a one-dimensional text[] from the collected lexemes */
+ a = construct_array(da,
+ ptr - res,
+ TEXTOID,
+ -1,
+ false,
+ TYPALIGN_INT);
+
+ /* construct_array copied the data, so free our working storage */
+ ptr = res;
+ while (ptr->lexeme)
+ {
+ pfree(DatumGetPointer(da[ptr - res]));
+ pfree(ptr->lexeme);
+ ptr++;
+ }
+ pfree(res);
+ pfree(da);
+
+ PG_RETURN_POINTER(a);
+}
diff --git a/src/backend/tsearch/dict_ispell.c b/src/backend/tsearch/dict_ispell.c
new file mode 100644
index 0000000..d93f601
--- /dev/null
+++ b/src/backend/tsearch/dict_ispell.c
@@ -0,0 +1,148 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_ispell.c
+ * Ispell dictionary interface
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/dict_ispell.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "commands/defrem.h"
+#include "tsearch/dicts/spell.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+#include "utils/builtins.h"
+
+
+/* Private state for the Ispell dictionary template */
+typedef struct
+{
+ StopList stoplist; /* stop words removed from lexize output */
+ IspellDict obj; /* compiled affix + dictionary data */
+} DictISpell;
+
+/*
+ * dispell_init --- build an Ispell dictionary from its CREATE TEXT SEARCH
+ * DICTIONARY options.  DictFile and AffFile are required; StopWords is
+ * optional.  Duplicate or unrecognized parameters raise an error.
+ */
+Datum
+dispell_init(PG_FUNCTION_ARGS)
+{
+ List *dictoptions = (List *) PG_GETARG_POINTER(0);
+ DictISpell *d;
+ bool affloaded = false,
+ dictloaded = false,
+ stoploaded = false;
+ ListCell *l;
+
+ d = (DictISpell *) palloc0(sizeof(DictISpell));
+
+ /* set up temporary working state for dictionary compilation */
+ NIStartBuild(&(d->obj));
+
+ foreach(l, dictoptions)
+ {
+ DefElem *defel = (DefElem *) lfirst(l);
+
+ if (strcmp(defel->defname, "dictfile") == 0)
+ {
+ if (dictloaded)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("multiple DictFile parameters")));
+ NIImportDictionary(&(d->obj),
+ get_tsearch_config_filename(defGetString(defel),
+ "dict"));
+ dictloaded = true;
+ }
+ else if (strcmp(defel->defname, "afffile") == 0)
+ {
+ if (affloaded)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("multiple AffFile parameters")));
+ NIImportAffixes(&(d->obj),
+ get_tsearch_config_filename(defGetString(defel),
+ "affix"));
+ affloaded = true;
+ }
+ else if (strcmp(defel->defname, "stopwords") == 0)
+ {
+ if (stoploaded)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("multiple StopWords parameters")));
+ readstoplist(defGetString(defel), &(d->stoplist), lowerstr);
+ stoploaded = true;
+ }
+ else
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized Ispell parameter: \"%s\"",
+ defel->defname)));
+ }
+ }
+
+ /* both files are mandatory; sort only when both were loaded */
+ if (affloaded && dictloaded)
+ {
+ NISortDictionary(&(d->obj));
+ NISortAffixes(&(d->obj));
+ }
+ else if (!affloaded)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("missing AffFile parameter")));
+ }
+ else
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("missing DictFile parameter")));
+ }
+
+ /* release build-time temporary storage */
+ NIFinishBuild(&(d->obj));
+
+ PG_RETURN_POINTER(d);
+}
+
+/*
+ * dispell_lexize --- normalize one word through the Ispell machinery,
+ * then filter stop words out of the result in place.
+ */
+Datum
+dispell_lexize(PG_FUNCTION_ARGS)
+{
+ DictISpell *d = (DictISpell *) PG_GETARG_POINTER(0);
+ char *in = (char *) PG_GETARG_POINTER(1);
+ int32 len = PG_GETARG_INT32(2);
+ char *txt;
+ TSLexeme *res;
+ TSLexeme *ptr,
+ *cptr;
+
+ /* empty input is not recognized */
+ if (len <= 0)
+ PG_RETURN_POINTER(NULL);
+
+ txt = lowerstr_with_len(in, len);
+ res = NINormalizeWord(&(d->obj), txt);
+
+ if (res == NULL)
+ PG_RETURN_POINTER(NULL);
+
+ /*
+ * Compact the result array, dropping any lexeme found in the stop
+ * list; cptr trails ptr and receives the survivors.
+ */
+ cptr = res;
+ for (ptr = cptr; ptr->lexeme; ptr++)
+ {
+ if (searchstoplist(&(d->stoplist), ptr->lexeme))
+ {
+ pfree(ptr->lexeme);
+ ptr->lexeme = NULL;
+ }
+ else
+ {
+ if (cptr != ptr)
+ memcpy(cptr, ptr, sizeof(TSLexeme));
+ cptr++;
+ }
+ }
+ /* re-terminate the compacted array */
+ cptr->lexeme = NULL;
+
+ PG_RETURN_POINTER(res);
+}
diff --git a/src/backend/tsearch/dict_simple.c b/src/backend/tsearch/dict_simple.c
new file mode 100644
index 0000000..9cd4b6b
--- /dev/null
+++ b/src/backend/tsearch/dict_simple.c
@@ -0,0 +1,105 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_simple.c
+ * Simple dictionary: just lowercase and check for stopword
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/dict_simple.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "commands/defrem.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+#include "utils/builtins.h"
+
+
+/* Private state for the simple dictionary template */
+typedef struct
+{
+ StopList stoplist; /* words to reject as stop words */
+ bool accept; /* accept (true) or pass through (false) non-stopwords */
+} DictSimple;
+
+
+/*
+ * dsimple_init --- parse the simple dictionary's options: optional
+ * StopWords file and optional Accept boolean (default true).
+ */
+Datum
+dsimple_init(PG_FUNCTION_ARGS)
+{
+ List *dictoptions = (List *) PG_GETARG_POINTER(0);
+ DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
+ bool stoploaded = false,
+ acceptloaded = false;
+ ListCell *l;
+
+ d->accept = true; /* default */
+
+ foreach(l, dictoptions)
+ {
+ DefElem *defel = (DefElem *) lfirst(l);
+
+ if (strcmp(defel->defname, "stopwords") == 0)
+ {
+ if (stoploaded)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("multiple StopWords parameters")));
+ readstoplist(defGetString(defel), &d->stoplist, lowerstr);
+ stoploaded = true;
+ }
+ else if (strcmp(defel->defname, "accept") == 0)
+ {
+ if (acceptloaded)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("multiple Accept parameters")));
+ d->accept = defGetBoolean(defel);
+ acceptloaded = true;
+ }
+ else
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized simple dictionary parameter: \"%s\"",
+ defel->defname)));
+ }
+ }
+
+ PG_RETURN_POINTER(d);
+}
+
+/*
+ * dsimple_lexize --- lowercase the word, then: return an empty result for
+ * stop words, the lowercased lexeme if Accept, or NULL (unrecognized)
+ * otherwise so later dictionaries in the configuration get a chance.
+ */
+Datum
+dsimple_lexize(PG_FUNCTION_ARGS)
+{
+ DictSimple *d = (DictSimple *) PG_GETARG_POINTER(0);
+ char *in = (char *) PG_GETARG_POINTER(1);
+ int32 len = PG_GETARG_INT32(2);
+ char *txt;
+ TSLexeme *res;
+
+ txt = lowerstr_with_len(in, len);
+
+ if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
+ {
+ /* reject as stopword: non-NULL result with zero lexemes */
+ pfree(txt);
+ res = palloc0(sizeof(TSLexeme) * 2);
+ PG_RETURN_POINTER(res);
+ }
+ else if (d->accept)
+ {
+ /* accept: one lexeme (txt ownership passes to the result) */
+ res = palloc0(sizeof(TSLexeme) * 2);
+ res[0].lexeme = txt;
+ PG_RETURN_POINTER(res);
+ }
+ else
+ {
+ /* report as unrecognized */
+ pfree(txt);
+ PG_RETURN_POINTER(NULL);
+ }
+}
diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c
new file mode 100644
index 0000000..ed885ca
--- /dev/null
+++ b/src/backend/tsearch/dict_synonym.c
@@ -0,0 +1,241 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_synonym.c
+ * Synonym dictionary: replace word by its synonym
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/dict_synonym.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "commands/defrem.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+#include "utils/builtins.h"
+
+/* One input-word -> output-word mapping loaded from the synonym file */
+typedef struct
+{
+ char *in; /* input word (lowercased unless case_sensitive) */
+ char *out; /* replacement word */
+ int outlen; /* strlen of out */
+ uint16 flags; /* TSL_PREFIX if out ended with '*' */
+} Syn;
+
+/* Private state for the synonym dictionary template */
+typedef struct
+{
+ int len; /* length of syn array */
+ Syn *syn; /* sorted by in, for bsearch */
+ bool case_sensitive;
+} DictSyn;
+
+/*
+ * Finds the next whitespace-delimited word within the 'in' string.
+ * Returns a pointer to the first character of the word, and a pointer
+ * to the next byte after the last character in the word (in *end).
+ * Character '*' at the end of word will not be treated as word
+ * character if flags is not null.
+ */
+static char *
+findwrd(char *in, char **end, uint16 *flags)
+{
+ char *start;
+ char *lastchar;
+
+ /* Skip leading spaces */
+ while (*in && t_isspace(in))
+ in += pg_mblen(in);
+
+ /* Return NULL on empty lines */
+ if (*in == '\0')
+ {
+ *end = NULL;
+ return NULL;
+ }
+
+ lastchar = start = in;
+
+ /* Find end of word; lastchar tracks the start of the final character */
+ while (*in && !t_isspace(in))
+ {
+ lastchar = in;
+ in += pg_mblen(in);
+ }
+
+ /* a trailing single-byte '*' marks a prefix match when flags is wanted */
+ if (in - lastchar == 1 && t_iseq(lastchar, '*') && flags)
+ {
+ *flags = TSL_PREFIX;
+ *end = lastchar; /* exclude the '*' from the word */
+ }
+ else
+ {
+ if (flags)
+ *flags = 0;
+ *end = in;
+ }
+
+ return start;
+}
+
+/* qsort/bsearch comparator for Syn entries, ordered by the input word */
+static int
+compareSyn(const void *a, const void *b)
+{
+ return strcmp(((const Syn *) a)->in, ((const Syn *) b)->in);
+}
+
+
+/*
+ * dsynonym_init --- read the synonym file (Synonyms parameter, required)
+ * into a Syn array sorted for binary search.  Lines with fewer than two
+ * words are silently ignored; CaseSensitive controls lowercasing.
+ */
+Datum
+dsynonym_init(PG_FUNCTION_ARGS)
+{
+ List *dictoptions = (List *) PG_GETARG_POINTER(0);
+ DictSyn *d;
+ ListCell *l;
+ char *filename = NULL;
+ bool case_sensitive = false;
+ tsearch_readline_state trst;
+ char *starti,
+ *starto,
+ *end = NULL;
+ int cur = 0;
+ char *line = NULL;
+ uint16 flags = 0;
+
+ foreach(l, dictoptions)
+ {
+ DefElem *defel = (DefElem *) lfirst(l);
+
+ if (strcmp(defel->defname, "synonyms") == 0)
+ filename = defGetString(defel);
+ else if (strcmp(defel->defname, "casesensitive") == 0)
+ case_sensitive = defGetBoolean(defel);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized synonym parameter: \"%s\"",
+ defel->defname)));
+ }
+
+ if (!filename)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("missing Synonyms parameter")));
+
+ filename = get_tsearch_config_filename(filename, "syn");
+
+ if (!tsearch_readline_begin(&trst, filename))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("could not open synonym file \"%s\": %m",
+ filename)));
+
+ d = (DictSyn *) palloc0(sizeof(DictSyn));
+
+ while ((line = tsearch_readline(&trst)) != NULL)
+ {
+ starti = findwrd(line, &end, NULL);
+ if (!starti)
+ {
+ /* Empty line */
+ goto skipline;
+ }
+ if (*end == '\0')
+ {
+ /* A line with only one word. Ignore silently. */
+ goto skipline;
+ }
+ *end = '\0';
+
+ /* second word may carry a trailing '*' => prefix flag */
+ starto = findwrd(end + 1, &end, &flags);
+ if (!starto)
+ {
+ /* A line with only one word (+whitespace). Ignore silently. */
+ goto skipline;
+ }
+ *end = '\0';
+
+ /*
+ * starti now points to the first word, and starto to the second word
+ * on the line, with a \0 terminator at the end of both words.
+ */
+
+ /* grow the array geometrically as needed */
+ if (cur >= d->len)
+ {
+ if (d->len == 0)
+ {
+ d->len = 64;
+ d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
+ }
+ else
+ {
+ d->len *= 2;
+ d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
+ }
+ }
+
+ if (case_sensitive)
+ {
+ d->syn[cur].in = pstrdup(starti);
+ d->syn[cur].out = pstrdup(starto);
+ }
+ else
+ {
+ d->syn[cur].in = lowerstr(starti);
+ d->syn[cur].out = lowerstr(starto);
+ }
+
+ d->syn[cur].outlen = strlen(starto);
+ d->syn[cur].flags = flags;
+
+ cur++;
+
+skipline:
+ pfree(line);
+ }
+
+ tsearch_readline_end(&trst);
+
+ /* shrink len to the count actually used, then sort for bsearch */
+ d->len = cur;
+ qsort(d->syn, d->len, sizeof(Syn), compareSyn);
+
+ d->case_sensitive = case_sensitive;
+
+ PG_RETURN_POINTER(d);
+}
+
+/*
+ * dsynonym_lexize --- look the word up in the sorted Syn array and return
+ * its replacement, or NULL if not found (word unrecognized).
+ */
+Datum
+dsynonym_lexize(PG_FUNCTION_ARGS)
+{
+ DictSyn *d = (DictSyn *) PG_GETARG_POINTER(0);
+ char *in = (char *) PG_GETARG_POINTER(1);
+ int32 len = PG_GETARG_INT32(2);
+ Syn key,
+ *found;
+ TSLexeme *res;
+
+ /* note: d->len test protects against Solaris bsearch-of-no-items bug */
+ if (len <= 0 || d->len <= 0)
+ PG_RETURN_POINTER(NULL);
+
+ /* normalize the key the same way the file entries were normalized */
+ if (d->case_sensitive)
+ key.in = pnstrdup(in, len);
+ else
+ key.in = lowerstr_with_len(in, len);
+
+ key.out = NULL;
+
+ found = (Syn *) bsearch(&key, d->syn, d->len, sizeof(Syn), compareSyn);
+ pfree(key.in);
+
+ if (!found)
+ PG_RETURN_POINTER(NULL);
+
+ /* single-lexeme result carrying the stored flags (e.g. TSL_PREFIX) */
+ res = palloc0(sizeof(TSLexeme) * 2);
+ res[0].lexeme = pnstrdup(found->out, found->outlen);
+ res[0].flags = found->flags;
+
+ PG_RETURN_POINTER(res);
+}
diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c
new file mode 100644
index 0000000..a95ed08
--- /dev/null
+++ b/src/backend/tsearch/dict_thesaurus.c
@@ -0,0 +1,877 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_thesaurus.c
+ * Thesaurus dictionary: phrase to phrase substitution
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/dict_thesaurus.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/namespace.h"
+#include "commands/defrem.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+#include "utils/builtins.h"
+#include "utils/regproc.h"
+
+
+/*
+ * Temporarily, we use TSLexeme.flags for internal purposes...
+ */
+#define DT_USEASIS 0x1000
+
+/* Occurrence of a lexeme within one thesaurus substitution rule */
+typedef struct LexemeInfo
+{
+ uint32 idsubst; /* entry's number in DictThesaurus->subst */
+ uint16 posinsubst; /* pos info in entry */
+ uint16 tnvariant; /* total num lexemes in one variant */
+ struct LexemeInfo *nextentry; /* next occurrence of the same lexeme */
+ struct LexemeInfo *nextvariant; /* chain of candidate matches (lexize) */
+} LexemeInfo;
+
+/* A normalized lexeme plus all rules it participates in */
+typedef struct
+{
+ char *lexeme;
+ LexemeInfo *entries;
+} TheLexeme;
+
+/* Compiled right-hand side of one thesaurus rule */
+typedef struct
+{
+ uint16 lastlexeme; /* number lexemes to substitute */
+ uint16 reslen;
+ TSLexeme *res; /* prepared substituted result */
+} TheSubstitute;
+
+/* Private state for the thesaurus dictionary template */
+typedef struct
+{
+ /* subdictionary to normalize lexemes */
+ Oid subdictOid;
+ TSDictionaryCacheEntry *subdict;
+
+ /* Array to search lexeme by exact match */
+ TheLexeme *wrds;
+ int nwrds; /* current number of words */
+ int ntwrds; /* allocated array length */
+
+ /*
+ * Storage of substituted result, n-th element is for n-th expression
+ */
+ TheSubstitute *subst;
+ int nsubst;
+} DictThesaurus;
+
+
+/*
+ * Append the raw (not yet normalized) lexeme text [b, e) to d->wrds,
+ * tagged with the rule number and position it occurred at.
+ */
+static void
+newLexeme(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 posinsubst)
+{
+ TheLexeme *ptr;
+
+ /* grow the array geometrically */
+ if (d->nwrds >= d->ntwrds)
+ {
+ if (d->ntwrds == 0)
+ {
+ d->ntwrds = 16;
+ d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
+ }
+ else
+ {
+ d->ntwrds *= 2;
+ d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
+ }
+ }
+
+ ptr = d->wrds + d->nwrds;
+ d->nwrds++;
+
+ /* copy the text with a terminating NUL */
+ ptr->lexeme = palloc(e - b + 1);
+
+ memcpy(ptr->lexeme, b, e - b);
+ ptr->lexeme[e - b] = '\0';
+
+ ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
+
+ ptr->entries->nextentry = NULL;
+ ptr->entries->idsubst = idsubst;
+ ptr->entries->posinsubst = posinsubst;
+}
+
+/*
+ * Append one substitute word [b, e) to rule idsubst's result array.
+ * nwrd == 0 marks the start of a new rule and resets the static growth
+ * counters --- NOTE(review): static state makes this non-reentrant; it is
+ * only called from the single-threaded file-reading loop.
+ */
+static void
+addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
+{
+ static int nres = 0;
+ static int ntres = 0;
+ TheSubstitute *ptr;
+
+ if (nwrd == 0)
+ {
+ nres = ntres = 0;
+
+ /* grow the subst array so index idsubst is valid */
+ if (idsubst >= d->nsubst)
+ {
+ if (d->nsubst == 0)
+ {
+ d->nsubst = 16;
+ d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
+ }
+ else
+ {
+ d->nsubst *= 2;
+ d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
+ }
+ }
+ }
+
+ ptr = d->subst + idsubst;
+
+ ptr->lastlexeme = posinsubst - 1;
+
+ /* keep room for the word plus the NULL terminator */
+ if (nres + 1 >= ntres)
+ {
+ if (ntres == 0)
+ {
+ ntres = 2;
+ ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
+ }
+ else
+ {
+ ntres *= 2;
+ ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
+ }
+ }
+
+ ptr->res[nres].lexeme = palloc(e - b + 1);
+ memcpy(ptr->res[nres].lexeme, b, e - b);
+ ptr->res[nres].lexeme[e - b] = '\0';
+
+ ptr->res[nres].nvariant = nwrd;
+ /* DT_USEASIS: skip subdictionary normalization later */
+ if (useasis)
+ ptr->res[nres].flags = DT_USEASIS;
+ else
+ ptr->res[nres].flags = 0;
+
+ /* advance and keep the array NULL-terminated */
+ ptr->res[++nres].lexeme = NULL;
+}
+
+#define TR_WAITLEX 1
+#define TR_INLEX 2
+#define TR_WAITSUBS 3
+#define TR_INSUBS 4
+
+/*
+ * Parse the thesaurus file.  Each non-comment line has the form
+ *   sample words ... : substitute words ...
+ * driven by a small state machine: TR_WAITLEX/TR_INLEX scan the sample
+ * phrase (left of ':'), TR_WAITSUBS/TR_INSUBS scan the substitute phrase.
+ */
+static void
+thesaurusRead(const char *filename, DictThesaurus *d)
+{
+ tsearch_readline_state trst;
+ uint32 idsubst = 0;
+ bool useasis = false;
+ char *line;
+
+ filename = get_tsearch_config_filename(filename, "ths");
+ if (!tsearch_readline_begin(&trst, filename))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("could not open thesaurus file \"%s\": %m",
+ filename)));
+
+ while ((line = tsearch_readline(&trst)) != NULL)
+ {
+ char *ptr;
+ int state = TR_WAITLEX;
+ char *beginwrd = NULL;
+ uint32 posinsubst = 0;
+ uint32 nwrd = 0;
+
+ ptr = line;
+
+ /* is it a comment? */
+ while (*ptr && t_isspace(ptr))
+ ptr += pg_mblen(ptr);
+
+ if (t_iseq(ptr, '#') || *ptr == '\0' ||
+ t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
+ {
+ pfree(line);
+ continue;
+ }
+
+ while (*ptr)
+ {
+ if (state == TR_WAITLEX)
+ {
+ /* waiting for a sample word or the ':' delimiter */
+ if (t_iseq(ptr, ':'))
+ {
+ if (posinsubst == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("unexpected delimiter")));
+ state = TR_WAITSUBS;
+ }
+ else if (!t_isspace(ptr))
+ {
+ beginwrd = ptr;
+ state = TR_INLEX;
+ }
+ }
+ else if (state == TR_INLEX)
+ {
+ /* inside a sample word; ':' or whitespace ends it */
+ if (t_iseq(ptr, ':'))
+ {
+ newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
+ state = TR_WAITSUBS;
+ }
+ else if (t_isspace(ptr))
+ {
+ newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
+ state = TR_WAITLEX;
+ }
+ }
+ else if (state == TR_WAITSUBS)
+ {
+ /* '*' = emit as-is (skip subdictionary); '\' = escape */
+ if (t_iseq(ptr, '*'))
+ {
+ useasis = true;
+ state = TR_INSUBS;
+ beginwrd = ptr + pg_mblen(ptr);
+ }
+ else if (t_iseq(ptr, '\\'))
+ {
+ useasis = false;
+ state = TR_INSUBS;
+ beginwrd = ptr + pg_mblen(ptr);
+ }
+ else if (!t_isspace(ptr))
+ {
+ useasis = false;
+ beginwrd = ptr;
+ state = TR_INSUBS;
+ }
+ }
+ else if (state == TR_INSUBS)
+ {
+ if (t_isspace(ptr))
+ {
+ if (ptr == beginwrd)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("unexpected end of line or lexeme")));
+ addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
+ state = TR_WAITSUBS;
+ }
+ }
+ else
+ elog(ERROR, "unrecognized thesaurus state: %d", state);
+
+ ptr += pg_mblen(ptr);
+ }
+
+ /* flush a substitute word left pending at end of line */
+ if (state == TR_INSUBS)
+ {
+ if (ptr == beginwrd)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("unexpected end of line or lexeme")));
+ addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
+ }
+
+ idsubst++;
+
+ /* every rule needs at least one sample and one substitute word */
+ if (!(nwrd && posinsubst))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("unexpected end of line")));
+
+ /* counters are stored in uint16 fields; reject overflow */
+ if (nwrd != (uint16) nwrd || posinsubst != (uint16) posinsubst)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("too many lexemes in thesaurus entry")));
+
+ pfree(line);
+ }
+
+ d->nsubst = idsubst;
+
+ tsearch_readline_end(&trst);
+}
+
+/*
+ * Append one subdictionary-normalized lexeme (or the stop-word marker when
+ * lexeme is NULL) to the growable newwrds array, copying rule info from
+ * src.  Returns the (possibly reallocated) array pointer.
+ */
+static TheLexeme *
+addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
+{
+ if (*nnw >= *tnm)
+ {
+ *tnm *= 2;
+ newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
+ }
+
+ newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
+
+ if (lexeme && lexeme->lexeme)
+ {
+ newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme);
+ newwrds[*nnw].entries->tnvariant = tnvariant;
+ }
+ else
+ {
+ /* NULL lexeme represents a stop word ("?" in the sample phrase) */
+ newwrds[*nnw].lexeme = NULL;
+ newwrds[*nnw].entries->tnvariant = 1;
+ }
+
+ newwrds[*nnw].entries->idsubst = src->idsubst;
+ newwrds[*nnw].entries->posinsubst = src->posinsubst;
+
+ newwrds[*nnw].entries->nextentry = NULL;
+
+ (*nnw)++;
+ return newwrds;
+}
+
+/*
+ * Three-way compare of LexemeInfo by (idsubst, posinsubst, tnvariant);
+ * NULL arguments compare equal.
+ */
+static int
+cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b)
+{
+ if (a == NULL || b == NULL)
+ return 0;
+
+ if (a->idsubst == b->idsubst)
+ {
+ if (a->posinsubst == b->posinsubst)
+ {
+ if (a->tnvariant == b->tnvariant)
+ return 0;
+
+ return (a->tnvariant > b->tnvariant) ? 1 : -1;
+ }
+
+ return (a->posinsubst > b->posinsubst) ? 1 : -1;
+ }
+
+ return (a->idsubst > b->idsubst) ? 1 : -1;
+}
+
+/*
+ * Compare two TheLexeme by lexeme string; a NULL lexeme (stop-word marker)
+ * sorts after all real strings.
+ */
+static int
+cmpLexeme(const TheLexeme *a, const TheLexeme *b)
+{
+ if (a->lexeme == NULL)
+ {
+ if (b->lexeme == NULL)
+ return 0;
+ else
+ return 1;
+ }
+ else if (b->lexeme == NULL)
+ return -1;
+
+ return strcmp(a->lexeme, b->lexeme);
+}
+
+/* bsearch-compatible wrapper around cmpLexeme */
+static int
+cmpLexemeQ(const void *a, const void *b)
+{
+ return cmpLexeme((const TheLexeme *) a, (const TheLexeme *) b);
+}
+
+/*
+ * qsort comparator: order by lexeme text, then by entry info descending
+ * (note the negated cmpLexemeInfo), so duplicates group together.
+ */
+static int
+cmpTheLexeme(const void *a, const void *b)
+{
+ const TheLexeme *la = (const TheLexeme *) a;
+ const TheLexeme *lb = (const TheLexeme *) b;
+ int res;
+
+ if ((res = cmpLexeme(la, lb)) != 0)
+ return res;
+
+ return -cmpLexemeInfo(la->entries, lb->entries);
+}
+
+/*
+ * Normalize every raw sample-phrase word through the subdictionary,
+ * expanding each normalization variant into its own TheLexeme entry;
+ * then sort and de-duplicate, chaining duplicate entries' LexemeInfo
+ * lists together.  "?" is kept as a stop-word placeholder.
+ */
+static void
+compileTheLexeme(DictThesaurus *d)
+{
+ int i,
+ nnw = 0,
+ tnm = 16;
+ TheLexeme *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
+ *ptrwrds;
+
+ for (i = 0; i < d->nwrds; i++)
+ {
+ TSLexeme *ptr;
+
+ if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */
+ newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
+ else
+ {
+ ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
+ PointerGetDatum(d->subdict->dictData),
+ PointerGetDatum(d->wrds[i].lexeme),
+ Int32GetDatum(strlen(d->wrds[i].lexeme)),
+ PointerGetDatum(NULL)));
+
+ if (!ptr)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
+ d->wrds[i].lexeme,
+ d->wrds[i].entries->idsubst + 1)));
+ else if (!(ptr->lexeme))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
+ d->wrds[i].lexeme,
+ d->wrds[i].entries->idsubst + 1),
+ errhint("Use \"?\" to represent a stop word within a sample phrase.")));
+ else
+ {
+ /* walk the result, one normalization variant at a time */
+ while (ptr->lexeme)
+ {
+ TSLexeme *remptr = ptr + 1;
+ int tnvar = 1;
+ int curvar = ptr->nvariant;
+
+ /* compute n words in one variant */
+ while (remptr->lexeme)
+ {
+ if (remptr->nvariant != (remptr - 1)->nvariant)
+ break;
+ tnvar++;
+ remptr++;
+ }
+
+ /* add every lexeme of the current variant */
+ remptr = ptr;
+ while (remptr->lexeme && remptr->nvariant == curvar)
+ {
+ newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
+ remptr++;
+ }
+
+ ptr = remptr;
+ }
+ }
+ }
+
+ pfree(d->wrds[i].lexeme);
+ pfree(d->wrds[i].entries);
+ }
+
+ /* replace the raw word array with the compiled one */
+ if (d->wrds)
+ pfree(d->wrds);
+ d->wrds = newwrds;
+ d->nwrds = nnw;
+ d->ntwrds = tnm;
+
+ if (d->nwrds > 1)
+ {
+ qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
+
+ /* uniq: merge equal lexemes, linking their LexemeInfo entries */
+ newwrds = d->wrds;
+ ptrwrds = d->wrds + 1;
+ while (ptrwrds - d->wrds < d->nwrds)
+ {
+ if (cmpLexeme(ptrwrds, newwrds) == 0)
+ {
+ if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
+ {
+ ptrwrds->entries->nextentry = newwrds->entries;
+ newwrds->entries = ptrwrds->entries;
+ }
+ else
+ pfree(ptrwrds->entries);
+
+ if (ptrwrds->lexeme)
+ pfree(ptrwrds->lexeme);
+ }
+ else
+ {
+ newwrds++;
+ *newwrds = *ptrwrds;
+ }
+
+ ptrwrds++;
+ }
+
+ d->nwrds = newwrds - d->wrds + 1;
+ d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
+ }
+}
+
+/*
+ * Normalize every substitute phrase through the subdictionary (except
+ * words flagged DT_USEASIS, which are emitted verbatim), rebuilding each
+ * rule's res array.  Errors out if any substitute word is unrecognized
+ * or a stop word, or if a rule's result ends up empty.
+ */
+static void
+compileTheSubstitute(DictThesaurus *d)
+{
+ int i;
+
+ for (i = 0; i < d->nsubst; i++)
+ {
+ TSLexeme *rem = d->subst[i].res,
+ *outptr,
+ *inptr;
+ int n = 2;
+
+ outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
+ outptr->lexeme = NULL;
+ inptr = rem;
+
+ while (inptr && inptr->lexeme)
+ {
+ TSLexeme *lexized,
+ tmplex[2];
+
+ if (inptr->flags & DT_USEASIS)
+ { /* do not lexize */
+ tmplex[0] = *inptr;
+ tmplex[0].flags = 0;
+ tmplex[1].lexeme = NULL;
+ lexized = tmplex;
+ }
+ else
+ {
+ lexized = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
+ PointerGetDatum(d->subdict->dictData),
+ PointerGetDatum(inptr->lexeme),
+ Int32GetDatum(strlen(inptr->lexeme)),
+ PointerGetDatum(NULL)));
+ }
+
+ if (lexized && lexized->lexeme)
+ {
+ /* remember where this word's output begins, to set TSL_ADDPOS */
+ int toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
+
+ while (lexized->lexeme)
+ {
+ /* grow output, re-anchoring outptr after repalloc */
+ if (outptr - d->subst[i].res + 1 >= n)
+ {
+ int diff = outptr - d->subst[i].res;
+
+ n *= 2;
+ d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
+ outptr = d->subst[i].res + diff;
+ }
+
+ *outptr = *lexized;
+ outptr->lexeme = pstrdup(lexized->lexeme);
+
+ outptr++;
+ lexized++;
+ }
+
+ /* each word after the first starts at a new position */
+ if (toset > 0)
+ d->subst[i].res[toset].flags |= TSL_ADDPOS;
+ }
+ else if (lexized)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
+ inptr->lexeme, i + 1)));
+ }
+ else
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
+ inptr->lexeme, i + 1)));
+ }
+
+ if (inptr->lexeme)
+ pfree(inptr->lexeme);
+ inptr++;
+ }
+
+ if (outptr == d->subst[i].res)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("thesaurus substitute phrase is empty (rule %d)",
+ i + 1)));
+
+ d->subst[i].reslen = outptr - d->subst[i].res;
+
+ pfree(rem);
+ }
+}
+
+/*
+ * thesaurus_init --- parse options (DictFile and Dictionary, both
+ * required), read the thesaurus file, then compile sample and substitute
+ * phrases through the subdictionary.
+ */
+Datum
+thesaurus_init(PG_FUNCTION_ARGS)
+{
+ List *dictoptions = (List *) PG_GETARG_POINTER(0);
+ DictThesaurus *d;
+ char *subdictname = NULL;
+ bool fileloaded = false;
+ ListCell *l;
+
+ d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
+
+ foreach(l, dictoptions)
+ {
+ DefElem *defel = (DefElem *) lfirst(l);
+
+ if (strcmp(defel->defname, "dictfile") == 0)
+ {
+ if (fileloaded)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("multiple DictFile parameters")));
+ thesaurusRead(defGetString(defel), d);
+ fileloaded = true;
+ }
+ else if (strcmp(defel->defname, "dictionary") == 0)
+ {
+ if (subdictname)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("multiple Dictionary parameters")));
+ subdictname = pstrdup(defGetString(defel));
+ }
+ else
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized Thesaurus parameter: \"%s\"",
+ defel->defname)));
+ }
+ }
+
+ if (!fileloaded)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("missing DictFile parameter")));
+ if (!subdictname)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("missing Dictionary parameter")));
+
+ /* resolve and cache the subdictionary used for normalization */
+ d->subdictOid = get_ts_dict_oid(stringToQualifiedNameList(subdictname), false);
+ d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
+
+ compileTheLexeme(d);
+ compileTheSubstitute(d);
+
+ PG_RETURN_POINTER(d);
+}
+
+/*
+ * Binary-search the compiled lexeme array; returns the entry list for
+ * the given lexeme (NULL matches the stop-word marker), or NULL if the
+ * lexeme is not present.
+ */
+static LexemeInfo *
+findTheLexeme(DictThesaurus *d, char *lexeme)
+{
+ TheLexeme key,
+ *res;
+
+ if (d->nwrds == 0)
+ return NULL;
+
+ key.lexeme = lexeme;
+ key.entries = NULL;
+
+ res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
+
+ if (res == NULL)
+ return NULL;
+ return res->entries;
+}
+
+/*
+ * Does idsubst occur in the stored variant chain?  An empty chain
+ * (stored == NULL) counts as a match.
+ */
+static bool
+matchIdSubst(LexemeInfo *stored, uint32 idsubst)
+{
+ bool res = true;
+
+ if (stored)
+ {
+ res = false;
+
+ for (; stored; stored = stored->nextvariant)
+ if (stored->idsubst == idsubst)
+ {
+ res = true;
+ break;
+ }
+ }
+
+ return res;
+}
+
+/*
+ * Merge-walk the newn sorted entry lists in newin, collecting rules in
+ * which every list has an entry at position curpos with tnvariant ==
+ * newn, and that are consistent with the previously stored chain.
+ * Matches are pushed onto the returned variant chain (prepended to in).
+ * Terminates by returning when any list is exhausted.
+ */
+static LexemeInfo *
+findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn)
+{
+ for (;;)
+ {
+ int i;
+ LexemeInfo *ptr = newin[0];
+
+ /* align all lists on a common idsubst; restart (i = -1) on overshoot */
+ for (i = 0; i < newn; i++)
+ {
+ while (newin[i] && newin[i]->idsubst < ptr->idsubst)
+ newin[i] = newin[i]->nextentry;
+
+ if (newin[i] == NULL)
+ return in;
+
+ if (newin[i]->idsubst > ptr->idsubst)
+ {
+ ptr = newin[i];
+ i = -1;
+ continue;
+ }
+
+ while (newin[i]->idsubst == ptr->idsubst)
+ {
+ if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
+ {
+ ptr = newin[i];
+ break;
+ }
+
+ newin[i] = newin[i]->nextentry;
+ if (newin[i] == NULL)
+ return in;
+ }
+
+ if (newin[i]->idsubst != ptr->idsubst)
+ {
+ ptr = newin[i];
+ i = -1;
+ continue;
+ }
+ }
+
+ if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
+ { /* found */
+
+ ptr->nextvariant = in;
+ in = ptr;
+ }
+
+ /* step forward */
+ for (i = 0; i < newn; i++)
+ newin[i] = newin[i]->nextentry;
+ }
+}
+
+/*
+ * Deep-copy a substitute rule's result into a freshly palloc'd,
+ * NULL-terminated TSLexeme array.
+ */
+static TSLexeme *
+copyTSLexeme(TheSubstitute *ts)
+{
+ TSLexeme *res;
+ uint16 i;
+
+ res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));
+ for (i = 0; i < ts->reslen; i++)
+ {
+ res[i] = ts->res[i];
+ res[i].lexeme = pstrdup(ts->res[i].lexeme);
+ }
+
+ res[ts->reslen].lexeme = NULL;
+
+ return res;
+}
+
+/*
+ * Scan the candidate-variant chain for a rule whose sample phrase ends
+ * exactly at curpos; return its copied substitute, or NULL.  *moreres is
+ * set if additional candidates remain beyond the first chain element.
+ */
+static TSLexeme *
+checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres)
+{
+ *moreres = false;
+ while (info)
+ {
+ Assert(info->idsubst < d->nsubst);
+ if (info->nextvariant)
+ *moreres = true;
+ if (d->subst[info->idsubst].lastlexeme == curpos)
+ return copyTSLexeme(d->subst + info->idsubst);
+ info = info->nextvariant;
+ }
+
+ return NULL;
+}
+
+/*
+ * thesaurus_lexize --- stateful lexize.  Candidate rule matches accumulate
+ * across calls in dstate->private_state; dstate->getnext asks the caller
+ * for more words, and a completed phrase returns its substitute.
+ */
+Datum
+thesaurus_lexize(PG_FUNCTION_ARGS)
+{
+ DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
+ DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
+ TSLexeme *res = NULL;
+ LexemeInfo *stored,
+ *info = NULL;
+ uint16 curpos = 0;
+ bool moreres = false;
+
+ /* thesaurus requires the 4-argument stateful protocol */
+ if (PG_NARGS() != 4 || dstate == NULL)
+ elog(ERROR, "forbidden call of thesaurus or nested call");
+
+ if (dstate->isend)
+ PG_RETURN_POINTER(NULL);
+ stored = (LexemeInfo *) dstate->private_state;
+
+ /* resume position within the sample phrase, if mid-match */
+ if (stored)
+ curpos = stored->posinsubst + 1;
+
+ /* subdictionary cache entry may have been invalidated; reload */
+ if (!d->subdict->isvalid)
+ d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
+
+ /* normalize the incoming word through the subdictionary */
+ res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
+ PointerGetDatum(d->subdict->dictData),
+ PG_GETARG_DATUM(1),
+ PG_GETARG_DATUM(2),
+ PointerGetDatum(NULL)));
+
+ if (res && res->lexeme)
+ {
+ TSLexeme *ptr = res,
+ *basevar;
+
+ /* try each normalization variant of the word */
+ while (ptr->lexeme)
+ {
+ uint16 nv = ptr->nvariant;
+ uint16 i,
+ nlex = 0;
+ LexemeInfo **infos;
+
+ basevar = ptr;
+ while (ptr->lexeme && nv == ptr->nvariant)
+ {
+ nlex++;
+ ptr++;
+ }
+
+ /* every lexeme of the variant must exist in the thesaurus */
+ infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
+ for (i = 0; i < nlex; i++)
+ if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
+ break;
+
+ if (i < nlex)
+ {
+ /* no chance to find */
+ pfree(infos);
+ continue;
+ }
+
+ info = findVariant(info, stored, curpos, infos, nlex);
+ }
+ }
+ else if (res)
+ { /* stop-word: matches the "?" placeholder entries */
+ LexemeInfo *infos = findTheLexeme(d, NULL);
+
+ info = findVariant(NULL, stored, curpos, &infos, 1);
+ }
+ else
+ {
+ info = NULL; /* word isn't recognized */
+ }
+
+ dstate->private_state = (void *) info;
+
+ if (!info)
+ {
+ /* no candidate rules survive; stop asking for more words */
+ dstate->getnext = false;
+ PG_RETURN_POINTER(NULL);
+ }
+
+ if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
+ {
+ /* a rule completed here; keep going only if longer matches remain */
+ dstate->getnext = moreres;
+ PG_RETURN_POINTER(res);
+ }
+
+ /* partial match in progress; request the next word */
+ dstate->getnext = true;
+
+ PG_RETURN_POINTER(NULL);
+}
diff --git a/src/backend/tsearch/dicts/hunspell_sample.affix b/src/backend/tsearch/dicts/hunspell_sample.affix
new file mode 100644
index 0000000..9a64513
--- /dev/null
+++ b/src/backend/tsearch/dicts/hunspell_sample.affix
@@ -0,0 +1,23 @@
+COMPOUNDFLAG Z
+ONLYINCOMPOUND L
+
+PFX B Y 1
+PFX B 0 re .
+
+PFX U N 1
+PFX U 0 un .
+
+SFX J Y 1
+SFX J 0 INGS [^E]
+
+SFX G Y 1
+SFX G 0 ING [^E]
+
+SFX S Y 1
+SFX S 0 S [^SXZHY]
+
+SFX A Y 1
+SFX A Y IES [^AEIOU]Y
+
+SFX \ N 1
+SFX \ 0 Y/L [^Y]
diff --git a/src/backend/tsearch/dicts/hunspell_sample_long.affix b/src/backend/tsearch/dicts/hunspell_sample_long.affix
new file mode 100644
index 0000000..d5df7a3
--- /dev/null
+++ b/src/backend/tsearch/dicts/hunspell_sample_long.affix
@@ -0,0 +1,53 @@
+FLAG long
+
+AF 11
+AF cZ #1
+AF cL #2
+AF sGsJpUsS #3
+AF sSpB #4
+AF cZsS #5
+AF sScZs\sE #6
+AF sA #7
+AF CaCp #8
+AF CcCp #9
+AF sD #10
+AF sB #11
+
+COMPOUNDFLAG cZ
+COMPOUNDBEGIN Ca
+COMPOUNDMIDDLE Cb
+COMPOUNDEND Cc
+COMPOUNDPERMITFLAG Cp
+ONLYINCOMPOUND cL
+
+PFX pB Y 1
+PFX pB 0 re .
+
+PFX pU N 1
+PFX pU 0 un .
+
+SFX sJ Y 1
+SFX sJ 0 INGS [^E]
+
+SFX sG Y 1
+SFX sG 0 ING [^E]
+
+SFX sS Y 1
+SFX sS 0 S [^SXZHY]
+
+SFX sA Y 1
+SFX sA Y IES [^AEIOU]Y{1}
+
+SFX sB Y 1
+SFX sB 0 ED K{1}
+
+# Affixes with compound flags
+SFX s\ N 1
+SFX s\ 0 Y/2 [^Y]
+
+SFX sE N 1
+SFX sE 0 S/2 [^S]
+
+# Check duplicate affixes
+SFX sD N 1
+SFX sD 0 S/2 [^S]
diff --git a/src/backend/tsearch/dicts/hunspell_sample_long.dict b/src/backend/tsearch/dicts/hunspell_sample_long.dict
new file mode 100644
index 0000000..370c27a
--- /dev/null
+++ b/src/backend/tsearch/dicts/hunspell_sample_long.dict
@@ -0,0 +1,11 @@
+book/3
+book/11
+booking/4
+footballklubber
+foot/5
+football/1
+ball/6
+klubber/1
+sky/7
+ex-/8
+machina/9
diff --git a/src/backend/tsearch/dicts/hunspell_sample_num.affix b/src/backend/tsearch/dicts/hunspell_sample_num.affix
new file mode 100644
index 0000000..0c4766a
--- /dev/null
+++ b/src/backend/tsearch/dicts/hunspell_sample_num.affix
@@ -0,0 +1,33 @@
+FLAG num
+
+COMPOUNDFLAG 101
+ONLYINCOMPOUND 102
+
+PFX 201 Y 1
+PFX 201 0 re .
+
+PFX 202 N 1
+PFX 202 0 un .
+
+SFX 301 Y 1
+SFX 301 0 INGS [^E]
+
+SFX 302 Y 1
+SFX 302 0 ING [^E]
+
+SFX 303 Y 1
+SFX 303 0 S [^SXZHY]
+
+# Remove ED suffix from lexeme for base words with K ending
+SFX 306 Y 1
+SFX 306 0 ED K{1}
+
+# Just add Y to lexeme for base words with Y ending
+SFX 307 Y 1
+SFX 307 Y 0 Y*
+
+SFX 304 Y 1
+SFX 304 Y IES [^AEIOU]Y
+
+SFX 305 N 1
+SFX 305 0 Y/102 [^Y]
diff --git a/src/backend/tsearch/dicts/hunspell_sample_num.dict b/src/backend/tsearch/dicts/hunspell_sample_num.dict
new file mode 100644
index 0000000..fbc321d
--- /dev/null
+++ b/src/backend/tsearch/dicts/hunspell_sample_num.dict
@@ -0,0 +1,9 @@
+book/302,301,202,303
+book/306
+booking/303,201
+footballklubber
+foot/101,303
+football/101
+ball/303,101,305
+klubber/101
+sky/304,307
diff --git a/src/backend/tsearch/dicts/ispell_sample.affix b/src/backend/tsearch/dicts/ispell_sample.affix
new file mode 100644
index 0000000..f29004f
--- /dev/null
+++ b/src/backend/tsearch/dicts/ispell_sample.affix
@@ -0,0 +1,26 @@
+compoundwords controlled Z
+
+prefixes
+
+flag *B:
+ . > RE # As in enter > reenter
+
+flag U:
+ . > UN # As in natural > unnatural
+
+suffixes
+
+flag *J:
+ [^E] > INGS # As in cross > crossings
+
+flag *G:
+ [^E] > ING # As in cross > crossing
+
+flag *S:
+ [^SXZHY] > S # As in bat > bats
+
+flag *A:
+ [^AEIOU]Y > -Y,IES # As in imply > implies
+
+flag ~\\:
+ [^Y] > Y #~ advarsel > advarsely-
diff --git a/src/backend/tsearch/dicts/ispell_sample.dict b/src/backend/tsearch/dicts/ispell_sample.dict
new file mode 100644
index 0000000..44df196
--- /dev/null
+++ b/src/backend/tsearch/dicts/ispell_sample.dict
@@ -0,0 +1,8 @@
+book/GJUS
+booking/SB
+footballklubber
+foot/ZS
+football/Z
+ball/SZ\
+klubber/Z
+sky/A
diff --git a/src/backend/tsearch/dicts/synonym_sample.syn b/src/backend/tsearch/dicts/synonym_sample.syn
new file mode 100644
index 0000000..3ecbcf9
--- /dev/null
+++ b/src/backend/tsearch/dicts/synonym_sample.syn
@@ -0,0 +1,5 @@
+postgres pgsql
+postgresql pgsql
+postgre pgsql
+gogle googl
+indices index*
diff --git a/src/backend/tsearch/dicts/thesaurus_sample.ths b/src/backend/tsearch/dicts/thesaurus_sample.ths
new file mode 100644
index 0000000..718f54a
--- /dev/null
+++ b/src/backend/tsearch/dicts/thesaurus_sample.ths
@@ -0,0 +1,17 @@
+#
+# Theasurus config file. Character ':' separates string from replacement, eg
+# sample-words : substitute-words
+#
+# Any substitute-word can be marked by preceding '*' character,
+# which means do not lexize this word
+# Docs: http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary
+
+one two three : *123
+one two : *12
+one : *1
+two : *2
+
+supernovae stars : *sn
+supernovae : *sn
+booking tickets : order invitation cards
+booking ? tickets : order invitation Cards
diff --git a/src/backend/tsearch/regis.c b/src/backend/tsearch/regis.c
new file mode 100644
index 0000000..8001717
--- /dev/null
+++ b/src/backend/tsearch/regis.c
@@ -0,0 +1,257 @@
+/*-------------------------------------------------------------------------
+ *
+ * regis.c
+ * Fast regex subset
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/regis.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "tsearch/dicts/regis.h"
+#include "tsearch/ts_locale.h"
+
+#define RS_IN_ONEOF 1
+#define RS_IN_ONEOF_IN 2
+#define RS_IN_NONEOF 3
+#define RS_IN_WAIT 4
+
+
+/*
+ * Test whether a regex is of the subset supported here.
+ * Keep this in sync with RS_compile!
+ */
+bool
+RS_isRegis(const char *str)
+{
+ int state = RS_IN_WAIT;
+ const char *c = str;
+
+ while (*c)
+ {
+ if (state == RS_IN_WAIT)
+ {
+ if (t_isalpha(c))
+ /* okay */ ;
+ else if (t_iseq(c, '['))
+ state = RS_IN_ONEOF;
+ else
+ return false;
+ }
+ else if (state == RS_IN_ONEOF)
+ {
+ if (t_iseq(c, '^'))
+ state = RS_IN_NONEOF;
+ else if (t_isalpha(c))
+ state = RS_IN_ONEOF_IN;
+ else
+ return false;
+ }
+ else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
+ {
+ if (t_isalpha(c))
+ /* okay */ ;
+ else if (t_iseq(c, ']'))
+ state = RS_IN_WAIT;
+ else
+ return false;
+ }
+ else
+ elog(ERROR, "internal error in RS_isRegis: state %d", state);
+ c += pg_mblen(c);
+ }
+
+ return (state == RS_IN_WAIT);
+}
+
+static RegisNode *
+newRegisNode(RegisNode *prev, int len)
+{
+ RegisNode *ptr;
+
+ ptr = (RegisNode *) palloc0(RNHDRSZ + len + 1);
+ if (prev)
+ prev->next = ptr;
+ return ptr;
+}
+
+void
+RS_compile(Regis *r, bool issuffix, const char *str)
+{
+ int len = strlen(str);
+ int state = RS_IN_WAIT;
+ const char *c = str;
+ RegisNode *ptr = NULL;
+
+ memset(r, 0, sizeof(Regis));
+ r->issuffix = (issuffix) ? 1 : 0;
+
+ while (*c)
+ {
+ if (state == RS_IN_WAIT)
+ {
+ if (t_isalpha(c))
+ {
+ if (ptr)
+ ptr = newRegisNode(ptr, len);
+ else
+ ptr = r->node = newRegisNode(NULL, len);
+ COPYCHAR(ptr->data, c);
+ ptr->type = RSF_ONEOF;
+ ptr->len = pg_mblen(c);
+ }
+ else if (t_iseq(c, '['))
+ {
+ if (ptr)
+ ptr = newRegisNode(ptr, len);
+ else
+ ptr = r->node = newRegisNode(NULL, len);
+ ptr->type = RSF_ONEOF;
+ state = RS_IN_ONEOF;
+ }
+ else /* shouldn't get here */
+ elog(ERROR, "invalid regis pattern: \"%s\"", str);
+ }
+ else if (state == RS_IN_ONEOF)
+ {
+ if (t_iseq(c, '^'))
+ {
+ ptr->type = RSF_NONEOF;
+ state = RS_IN_NONEOF;
+ }
+ else if (t_isalpha(c))
+ {
+ COPYCHAR(ptr->data, c);
+ ptr->len = pg_mblen(c);
+ state = RS_IN_ONEOF_IN;
+ }
+ else /* shouldn't get here */
+ elog(ERROR, "invalid regis pattern: \"%s\"", str);
+ }
+ else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
+ {
+ if (t_isalpha(c))
+ {
+ COPYCHAR(ptr->data + ptr->len, c);
+ ptr->len += pg_mblen(c);
+ }
+ else if (t_iseq(c, ']'))
+ state = RS_IN_WAIT;
+ else /* shouldn't get here */
+ elog(ERROR, "invalid regis pattern: \"%s\"", str);
+ }
+ else
+ elog(ERROR, "internal error in RS_compile: state %d", state);
+ c += pg_mblen(c);
+ }
+
+ if (state != RS_IN_WAIT) /* shouldn't get here */
+ elog(ERROR, "invalid regis pattern: \"%s\"", str);
+
+ ptr = r->node;
+ while (ptr)
+ {
+ r->nchar++;
+ ptr = ptr->next;
+ }
+}
+
+void
+RS_free(Regis *r)
+{
+ RegisNode *ptr = r->node,
+ *tmp;
+
+ while (ptr)
+ {
+ tmp = ptr->next;
+ pfree(ptr);
+ ptr = tmp;
+ }
+
+ r->node = NULL;
+}
+
+static bool
+mb_strchr(char *str, char *c)
+{
+ int clen,
+ plen,
+ i;
+ char *ptr = str;
+ bool res = false;
+
+ clen = pg_mblen(c);
+ while (*ptr && !res)
+ {
+ plen = pg_mblen(ptr);
+ if (plen == clen)
+ {
+ i = plen;
+ res = true;
+ while (i--)
+ if (*(ptr + i) != *(c + i))
+ {
+ res = false;
+ break;
+ }
+ }
+
+ ptr += plen;
+ }
+
+ return res;
+}
+
+bool
+RS_execute(Regis *r, char *str)
+{
+ RegisNode *ptr = r->node;
+ char *c = str;
+ int len = 0;
+
+ while (*c)
+ {
+ len++;
+ c += pg_mblen(c);
+ }
+
+ if (len < r->nchar)
+ return 0;
+
+ c = str;
+ if (r->issuffix)
+ {
+ len -= r->nchar;
+ while (len-- > 0)
+ c += pg_mblen(c);
+ }
+
+
+ while (ptr)
+ {
+ switch (ptr->type)
+ {
+ case RSF_ONEOF:
+ if (!mb_strchr((char *) ptr->data, c))
+ return false;
+ break;
+ case RSF_NONEOF:
+ if (mb_strchr((char *) ptr->data, c))
+ return false;
+ break;
+ default:
+ elog(ERROR, "unrecognized regis node type: %d", ptr->type);
+ }
+ ptr = ptr->next;
+ c += pg_mblen(c);
+ }
+
+ return true;
+}
diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c
new file mode 100644
index 0000000..ebc8960
--- /dev/null
+++ b/src/backend/tsearch/spell.c
@@ -0,0 +1,2617 @@
+/*-------------------------------------------------------------------------
+ *
+ * spell.c
+ * Normalizing word with ISpell
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ * Ispell dictionary
+ * -----------------
+ *
+ * Rules of dictionaries are defined in two files with .affix and .dict
+ * extensions. They are used by spell checker programs Ispell and Hunspell.
+ *
+ * An .affix file declares morphological rules to get a basic form of words.
+ * The format of an .affix file has different structure for Ispell and Hunspell
+ * dictionaries. The Hunspell format is more complicated. But when an .affix
+ * file is imported and compiled, it is stored in the same structure AffixNode.
+ *
+ * A .dict file stores a list of basic forms of words with references to
+ * affix rules. The format of a .dict file has the same structure for Ispell
+ * and Hunspell dictionaries.
+ *
+ * Compilation of a dictionary
+ * ---------------------------
+ *
+ * A compiled dictionary is stored in the IspellDict structure. Compilation of
+ * a dictionary is divided into the several steps:
+ * - NIImportDictionary() - stores each word of a .dict file in the
+ * temporary Spell field.
+ * - NIImportAffixes() - stores affix rules of an .affix file in the
+ * Affix field (not temporary) if an .affix file has the Ispell format.
+ * -> NIImportOOAffixes() - stores affix rules if an .affix file has the
+ * Hunspell format. The AffixData field is initialized if AF parameter
+ * is defined.
+ * - NISortDictionary() - builds a prefix tree (Trie) from the words list
+ * and stores it in the Dictionary field. The words list is got from the
+ * Spell field. The AffixData field is initialized if AF parameter is not
+ * defined.
+ * - NISortAffixes():
+ * - builds a list of compound affixes from the affix list and stores it
+ * in the CompoundAffix.
+ * - builds prefix trees (Trie) from the affix list for prefixes and suffixes
+ * and stores them in Suffix and Prefix fields.
+ * The affix list is got from the Affix field.
+ *
+ * Memory management
+ * -----------------
+ *
+ * The IspellDict structure has the Spell field which is used only in compile
+ * time. The Spell field stores a words list. It can take a lot of memory.
+ * Therefore when a dictionary is compiled this field is cleared by
+ * NIFinishBuild().
+ *
+ * All resources which should cleared by NIFinishBuild() is initialized using
+ * tmpalloc() and tmpalloc0().
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/spell.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "catalog/pg_collation.h"
+#include "tsearch/dicts/spell.h"
+#include "tsearch/ts_locale.h"
+#include "utils/memutils.h"
+
+
+/*
+ * Initialization requires a lot of memory that's not needed
+ * after the initialization is done. During initialization,
+ * CurrentMemoryContext is the long-lived memory context associated
+ * with the dictionary cache entry. We keep the short-lived stuff
+ * in the Conf->buildCxt context.
+ */
+#define tmpalloc(sz) MemoryContextAlloc(Conf->buildCxt, (sz))
+#define tmpalloc0(sz) MemoryContextAllocZero(Conf->buildCxt, (sz))
+
+/*
+ * Prepare for constructing an ISpell dictionary.
+ *
+ * The IspellDict struct is assumed to be zeroed when allocated.
+ */
+void
+NIStartBuild(IspellDict *Conf)
+{
+ /*
+ * The temp context is a child of CurTransactionContext, so that it will
+ * go away automatically on error.
+ */
+ Conf->buildCxt = AllocSetContextCreate(CurTransactionContext,
+ "Ispell dictionary init context",
+ ALLOCSET_DEFAULT_SIZES);
+}
+
+/*
+ * Clean up when dictionary construction is complete.
+ */
+void
+NIFinishBuild(IspellDict *Conf)
+{
+ /* Release no-longer-needed temp memory */
+ MemoryContextDelete(Conf->buildCxt);
+ /* Just for cleanliness, zero the now-dangling pointers */
+ Conf->buildCxt = NULL;
+ Conf->Spell = NULL;
+ Conf->firstfree = NULL;
+ Conf->CompoundAffixFlags = NULL;
+}
+
+
+/*
+ * "Compact" palloc: allocate without extra palloc overhead.
+ *
+ * Since we have no need to free the ispell data items individually, there's
+ * not much value in the per-chunk overhead normally consumed by palloc.
+ * Getting rid of it is helpful since ispell can allocate a lot of small nodes.
+ *
+ * We currently pre-zero all data allocated this way, even though some of it
+ * doesn't need that. The cpalloc and cpalloc0 macros are just documentation
+ * to indicate which allocations actually require zeroing.
+ */
+#define COMPACT_ALLOC_CHUNK 8192 /* amount to get from palloc at once */
+#define COMPACT_MAX_REQ 1024 /* must be < COMPACT_ALLOC_CHUNK */
+
+static void *
+compact_palloc0(IspellDict *Conf, size_t size)
+{
+ void *result;
+
+ /* Should only be called during init */
+ Assert(Conf->buildCxt != NULL);
+
+ /* No point in this for large chunks */
+ if (size > COMPACT_MAX_REQ)
+ return palloc0(size);
+
+ /* Keep everything maxaligned */
+ size = MAXALIGN(size);
+
+ /* Need more space? */
+ if (size > Conf->avail)
+ {
+ Conf->firstfree = palloc0(COMPACT_ALLOC_CHUNK);
+ Conf->avail = COMPACT_ALLOC_CHUNK;
+ }
+
+ result = (void *) Conf->firstfree;
+ Conf->firstfree += size;
+ Conf->avail -= size;
+
+ return result;
+}
+
+#define cpalloc(size) compact_palloc0(Conf, size)
+#define cpalloc0(size) compact_palloc0(Conf, size)
+
+static char *
+cpstrdup(IspellDict *Conf, const char *str)
+{
+ char *res = cpalloc(strlen(str) + 1);
+
+ strcpy(res, str);
+ return res;
+}
+
+
+/*
+ * Apply lowerstr(), producing a temporary result (in the buildCxt).
+ */
+static char *
+lowerstr_ctx(IspellDict *Conf, const char *src)
+{
+ MemoryContext saveCtx;
+ char *dst;
+
+ saveCtx = MemoryContextSwitchTo(Conf->buildCxt);
+ dst = lowerstr(src);
+ MemoryContextSwitchTo(saveCtx);
+
+ return dst;
+}
+
+#define MAX_NORM 1024
+#define MAXNORMLEN 256
+
+#define STRNCMP(s,p) strncmp( (s), (p), strlen(p) )
+#define GETWCHAR(W,L,N,T) ( ((const uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
+#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )
+
+static char *VoidString = "";
+
+static int
+cmpspell(const void *s1, const void *s2)
+{
+ return strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word);
+}
+
+static int
+cmpspellaffix(const void *s1, const void *s2)
+{
+ return strcmp((*(SPELL *const *) s1)->p.flag,
+ (*(SPELL *const *) s2)->p.flag);
+}
+
+static int
+cmpcmdflag(const void *f1, const void *f2)
+{
+ CompoundAffixFlag *fv1 = (CompoundAffixFlag *) f1,
+ *fv2 = (CompoundAffixFlag *) f2;
+
+ Assert(fv1->flagMode == fv2->flagMode);
+
+ if (fv1->flagMode == FM_NUM)
+ {
+ if (fv1->flag.i == fv2->flag.i)
+ return 0;
+
+ return (fv1->flag.i > fv2->flag.i) ? 1 : -1;
+ }
+
+ return strcmp(fv1->flag.s, fv2->flag.s);
+}
+
+static char *
+findchar(char *str, int c)
+{
+ while (*str)
+ {
+ if (t_iseq(str, c))
+ return str;
+ str += pg_mblen(str);
+ }
+
+ return NULL;
+}
+
+static char *
+findchar2(char *str, int c1, int c2)
+{
+ while (*str)
+ {
+ if (t_iseq(str, c1) || t_iseq(str, c2))
+ return str;
+ str += pg_mblen(str);
+ }
+
+ return NULL;
+}
+
+
+/* backward string compare for suffix tree operations */
+static int
+strbcmp(const unsigned char *s1, const unsigned char *s2)
+{
+ int l1 = strlen((const char *) s1) - 1,
+ l2 = strlen((const char *) s2) - 1;
+
+ while (l1 >= 0 && l2 >= 0)
+ {
+ if (s1[l1] < s2[l2])
+ return -1;
+ if (s1[l1] > s2[l2])
+ return 1;
+ l1--;
+ l2--;
+ }
+ if (l1 < l2)
+ return -1;
+ if (l1 > l2)
+ return 1;
+
+ return 0;
+}
+
+static int
+strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
+{
+ int l1 = strlen((const char *) s1) - 1,
+ l2 = strlen((const char *) s2) - 1,
+ l = count;
+
+ while (l1 >= 0 && l2 >= 0 && l > 0)
+ {
+ if (s1[l1] < s2[l2])
+ return -1;
+ if (s1[l1] > s2[l2])
+ return 1;
+ l1--;
+ l2--;
+ l--;
+ }
+ if (l == 0)
+ return 0;
+ if (l1 < l2)
+ return -1;
+ if (l1 > l2)
+ return 1;
+ return 0;
+}
+
+/*
+ * Compares affixes.
+ * First compares the type of an affix. Prefixes should go before affixes.
+ * If types are equal then compares replaceable string.
+ */
+static int
+cmpaffix(const void *s1, const void *s2)
+{
+ const AFFIX *a1 = (const AFFIX *) s1;
+ const AFFIX *a2 = (const AFFIX *) s2;
+
+ if (a1->type < a2->type)
+ return -1;
+ if (a1->type > a2->type)
+ return 1;
+ if (a1->type == FF_PREFIX)
+ return strcmp(a1->repl, a2->repl);
+ else
+ return strbcmp((const unsigned char *) a1->repl,
+ (const unsigned char *) a2->repl);
+}
+
+/*
+ * Gets an affix flag from the set of affix flags (sflagset).
+ *
+ * Several flags can be stored in a single string. Flags can be represented by:
+ * - 1 character (FM_CHAR). A character may be Unicode.
+ * - 2 characters (FM_LONG). A character may be Unicode.
+ * - numbers from 1 to 65000 (FM_NUM).
+ *
+ * Depending on the flagMode an affix string can have the following format:
+ * - FM_CHAR: ABCD
+ * Here we have 4 flags: A, B, C and D
+ * - FM_LONG: ABCDE*
+ * Here we have 3 flags: AB, CD and E*
+ * - FM_NUM: 200,205,50
+ * Here we have 3 flags: 200, 205 and 50
+ *
+ * Conf: current dictionary.
+ * sflagset: the set of affix flags. Returns a reference to the start of a next
+ * affix flag.
+ * sflag: returns an affix flag from sflagset.
+ */
+static void
+getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
+{
+ int32 s;
+ char *next,
+ *sbuf = *sflagset;
+ int maxstep;
+ bool stop = false;
+ bool met_comma = false;
+
+ maxstep = (Conf->flagMode == FM_LONG) ? 2 : 1;
+
+ while (**sflagset)
+ {
+ switch (Conf->flagMode)
+ {
+ case FM_LONG:
+ case FM_CHAR:
+ COPYCHAR(sflag, *sflagset);
+ sflag += pg_mblen(*sflagset);
+
+ /* Go to start of the next flag */
+ *sflagset += pg_mblen(*sflagset);
+
+ /* Check if we get all characters of flag */
+ maxstep--;
+ stop = (maxstep == 0);
+ break;
+ case FM_NUM:
+ s = strtol(*sflagset, &next, 10);
+ if (*sflagset == next || errno == ERANGE)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid affix flag \"%s\"", *sflagset)));
+ if (s < 0 || s > FLAGNUM_MAXSIZE)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("affix flag \"%s\" is out of range",
+ *sflagset)));
+ sflag += sprintf(sflag, "%0d", s);
+
+ /* Go to start of the next flag */
+ *sflagset = next;
+ while (**sflagset)
+ {
+ if (t_isdigit(*sflagset))
+ {
+ if (!met_comma)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid affix flag \"%s\"",
+ *sflagset)));
+ break;
+ }
+ else if (t_iseq(*sflagset, ','))
+ {
+ if (met_comma)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid affix flag \"%s\"",
+ *sflagset)));
+ met_comma = true;
+ }
+ else if (!t_isspace(*sflagset))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid character in affix flag \"%s\"",
+ *sflagset)));
+ }
+
+ *sflagset += pg_mblen(*sflagset);
+ }
+ stop = true;
+ break;
+ default:
+ elog(ERROR, "unrecognized type of Conf->flagMode: %d",
+ Conf->flagMode);
+ }
+
+ if (stop)
+ break;
+ }
+
+ if (Conf->flagMode == FM_LONG && maxstep > 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid affix flag \"%s\" with \"long\" flag value",
+ sbuf)));
+
+ *sflag = '\0';
+}
+
+/*
+ * Checks if the affix set Conf->AffixData[affix] contains affixflag.
+ * Conf->AffixData[affix] does not contain affixflag if this flag is not used
+ * actually by the .dict file.
+ *
+ * Conf: current dictionary.
+ * affix: index of the Conf->AffixData array.
+ * affixflag: the affix flag.
+ *
+ * Returns true if the string Conf->AffixData[affix] contains affixflag,
+ * otherwise returns false.
+ */
+static bool
+IsAffixFlagInUse(IspellDict *Conf, int affix, const char *affixflag)
+{
+ char *flagcur;
+ char flag[BUFSIZ];
+
+ if (*affixflag == 0)
+ return true;
+
+ Assert(affix < Conf->nAffixData);
+
+ flagcur = Conf->AffixData[affix];
+
+ while (*flagcur)
+ {
+ getNextFlagFromString(Conf, &flagcur, flag);
+ /* Compare first affix flag in flagcur with affixflag */
+ if (strcmp(flag, affixflag) == 0)
+ return true;
+ }
+
+ /* Could not find affixflag */
+ return false;
+}
+
+/*
+ * Adds the new word into the temporary array Spell.
+ *
+ * Conf: current dictionary.
+ * word: new word.
+ * flag: set of affix flags. Single flag can be get by getNextFlagFromString().
+ */
+static void
+NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
+{
+ if (Conf->nspell >= Conf->mspell)
+ {
+ if (Conf->mspell)
+ {
+ Conf->mspell *= 2;
+ Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *));
+ }
+ else
+ {
+ Conf->mspell = 1024 * 20;
+ Conf->Spell = (SPELL **) tmpalloc(Conf->mspell * sizeof(SPELL *));
+ }
+ }
+ Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
+ strcpy(Conf->Spell[Conf->nspell]->word, word);
+ Conf->Spell[Conf->nspell]->p.flag = (*flag != '\0')
+ ? cpstrdup(Conf, flag) : VoidString;
+ Conf->nspell++;
+}
+
+/*
+ * Imports dictionary into the temporary array Spell.
+ *
+ * Note caller must already have applied get_tsearch_config_filename.
+ *
+ * Conf: current dictionary.
+ * filename: path to the .dict file.
+ */
+void
+NIImportDictionary(IspellDict *Conf, const char *filename)
+{
+ tsearch_readline_state trst;
+ char *line;
+
+ if (!tsearch_readline_begin(&trst, filename))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("could not open dictionary file \"%s\": %m",
+ filename)));
+
+ while ((line = tsearch_readline(&trst)) != NULL)
+ {
+ char *s,
+ *pstr;
+
+ /* Set of affix flags */
+ const char *flag;
+
+ /* Extract flag from the line */
+ flag = NULL;
+ if ((s = findchar(line, '/')))
+ {
+ *s++ = '\0';
+ flag = s;
+ while (*s)
+ {
+ /* we allow only single encoded flags for faster works */
+ if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
+ s++;
+ else
+ {
+ *s = '\0';
+ break;
+ }
+ }
+ }
+ else
+ flag = "";
+
+ /* Remove trailing spaces */
+ s = line;
+ while (*s)
+ {
+ if (t_isspace(s))
+ {
+ *s = '\0';
+ break;
+ }
+ s += pg_mblen(s);
+ }
+ pstr = lowerstr_ctx(Conf, line);
+
+ NIAddSpell(Conf, pstr, flag);
+ pfree(pstr);
+
+ pfree(line);
+ }
+ tsearch_readline_end(&trst);
+}
+
+/*
+ * Searches a basic form of word in the prefix tree. This word was generated
+ * using an affix rule. This rule may not be presented in an affix set of
+ * a basic form of word.
+ *
+ * For example, we have the entry in the .dict file:
+ * meter/GMD
+ *
+ * The affix rule with the flag S:
+ * SFX S y ies [^aeiou]y
+ * is not presented here.
+ *
+ * The affix rule with the flag M:
+ * SFX M 0 's .
+ * is presented here.
+ *
+ * Conf: current dictionary.
+ * word: basic form of word.
+ * affixflag: affix flag, by which a basic form of word was generated.
+ * flag: compound flag used to compare with StopMiddle->compoundflag.
+ *
+ * Returns 1 if the word was found in the prefix tree, else returns 0.
+ */
+static int
+FindWord(IspellDict *Conf, const char *word, const char *affixflag, int flag)
+{
+ SPNode *node = Conf->Dictionary;
+ SPNodeData *StopLow,
+ *StopHigh,
+ *StopMiddle;
+ const uint8 *ptr = (const uint8 *) word;
+
+ flag &= FF_COMPOUNDFLAGMASK;
+
+ while (node && *ptr)
+ {
+ StopLow = node->data;
+ StopHigh = node->data + node->length;
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+ if (StopMiddle->val == *ptr)
+ {
+ if (*(ptr + 1) == '\0' && StopMiddle->isword)
+ {
+ if (flag == 0)
+ {
+ /*
+ * The word can be formed only with another word. And
+ * in the flag parameter there is not a sign that we
+ * search compound words.
+ */
+ if (StopMiddle->compoundflag & FF_COMPOUNDONLY)
+ return 0;
+ }
+ else if ((flag & StopMiddle->compoundflag) == 0)
+ return 0;
+
+ /*
+ * Check if this affix rule is presented in the affix set
+ * with index StopMiddle->affix.
+ */
+ if (IsAffixFlagInUse(Conf, StopMiddle->affix, affixflag))
+ return 1;
+ }
+ node = StopMiddle->node;
+ ptr++;
+ break;
+ }
+ else if (StopMiddle->val < *ptr)
+ StopLow = StopMiddle + 1;
+ else
+ StopHigh = StopMiddle;
+ }
+ if (StopLow >= StopHigh)
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Context reset/delete callback for a regular expression used in an affix
+ */
+static void
+regex_affix_deletion_callback(void *arg)
+{
+ aff_regex_struct *pregex = (aff_regex_struct *) arg;
+
+ pg_regfree(&(pregex->regex));
+}
+
+/*
+ * Adds a new affix rule to the Affix field.
+ *
+ * Conf: current dictionary.
+ * flag: affix flag ('\' in the below example).
+ * flagflags: set of flags from the flagval field for this affix rule. This set
+ * is listed after '/' character in the added string (repl).
+ *
+ * For example L flag in the hunspell_sample.affix:
+ * SFX \ 0 Y/L [^Y]
+ *
+ * mask: condition for search ('[^Y]' in the above example).
+ * find: stripping characters from beginning (at prefix) or end (at suffix)
+ * of the word ('0' in the above example, 0 means that there is not
+ * stripping character).
+ * repl: adding string after stripping ('Y' in the above example).
+ * type: FF_SUFFIX or FF_PREFIX.
+ */
+static void
+NIAddAffix(IspellDict *Conf, const char *flag, char flagflags, const char *mask,
+ const char *find, const char *repl, int type)
+{
+ AFFIX *Affix;
+
+ if (Conf->naffixes >= Conf->maffixes)
+ {
+ if (Conf->maffixes)
+ {
+ Conf->maffixes *= 2;
+ Conf->Affix = (AFFIX *) repalloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX));
+ }
+ else
+ {
+ Conf->maffixes = 16;
+ Conf->Affix = (AFFIX *) palloc(Conf->maffixes * sizeof(AFFIX));
+ }
+ }
+
+ Affix = Conf->Affix + Conf->naffixes;
+
+ /* This affix rule can be applied for words with any ending */
+ if (strcmp(mask, ".") == 0 || *mask == '\0')
+ {
+ Affix->issimple = 1;
+ Affix->isregis = 0;
+ }
+ /* This affix rule will use regis to search word ending */
+ else if (RS_isRegis(mask))
+ {
+ Affix->issimple = 0;
+ Affix->isregis = 1;
+ RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX),
+ *mask ? mask : VoidString);
+ }
+ /* This affix rule will use regex_t to search word ending */
+ else
+ {
+ int masklen;
+ int wmasklen;
+ int err;
+ pg_wchar *wmask;
+ char *tmask;
+ aff_regex_struct *pregex;
+
+ Affix->issimple = 0;
+ Affix->isregis = 0;
+ tmask = (char *) tmpalloc(strlen(mask) + 3);
+ if (type == FF_SUFFIX)
+ sprintf(tmask, "%s$", mask);
+ else
+ sprintf(tmask, "^%s", mask);
+
+ masklen = strlen(tmask);
+ wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
+ wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
+
+ /*
+ * The regex engine stores its stuff using malloc not palloc, so we
+ * must arrange to explicitly clean up the regex when the dictionary's
+ * context is cleared. That means the regex_t has to stay in a fixed
+ * location within the context; we can't keep it directly in the AFFIX
+ * struct, since we may sort and resize the array of AFFIXes.
+ */
+ Affix->reg.pregex = pregex = palloc(sizeof(aff_regex_struct));
+
+ err = pg_regcomp(&(pregex->regex), wmask, wmasklen,
+ REG_ADVANCED | REG_NOSUB,
+ DEFAULT_COLLATION_OID);
+ if (err)
+ {
+ char errstr[100];
+
+ pg_regerror(err, &(pregex->regex), errstr, sizeof(errstr));
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
+ errmsg("invalid regular expression: %s", errstr)));
+ }
+
+ pregex->mcallback.func = regex_affix_deletion_callback;
+ pregex->mcallback.arg = (void *) pregex;
+ MemoryContextRegisterResetCallback(CurrentMemoryContext,
+ &pregex->mcallback);
+ }
+
+ Affix->flagflags = flagflags;
+ if ((Affix->flagflags & FF_COMPOUNDONLY) || (Affix->flagflags & FF_COMPOUNDPERMITFLAG))
+ {
+ if ((Affix->flagflags & FF_COMPOUNDFLAG) == 0)
+ Affix->flagflags |= FF_COMPOUNDFLAG;
+ }
+ Affix->flag = cpstrdup(Conf, flag);
+ Affix->type = type;
+
+ Affix->find = (find && *find) ? cpstrdup(Conf, find) : VoidString;
+ if ((Affix->replen = strlen(repl)) > 0)
+ Affix->repl = cpstrdup(Conf, repl);
+ else
+ Affix->repl = VoidString;
+ Conf->naffixes++;
+}
+
+/* Parsing states for parse_affentry() and friends */
+#define PAE_WAIT_MASK 0
+#define PAE_INMASK 1
+#define PAE_WAIT_FIND 2
+#define PAE_INFIND 3
+#define PAE_WAIT_REPL 4
+#define PAE_INREPL 5
+#define PAE_WAIT_TYPE 6
+#define PAE_WAIT_FLAG 7
+
+/*
+ * Parse next space-separated field of an .affix file line.
+ *
+ * *str is the input pointer (will be advanced past field)
+ * next is where to copy the field value to, with null termination
+ *
+ * The buffer at "next" must be of size BUFSIZ; we truncate the input to fit.
+ *
+ * Returns true if we found a field, false if not.
+ */
+static bool
+get_nextfield(char **str, char *next)
+{
+ int state = PAE_WAIT_MASK;
+ int avail = BUFSIZ;
+
+ while (**str)
+ {
+ if (state == PAE_WAIT_MASK)
+ {
+ if (t_iseq(*str, '#'))
+ return false;
+ else if (!t_isspace(*str))
+ {
+ int clen = pg_mblen(*str);
+
+ if (clen < avail)
+ {
+ COPYCHAR(next, *str);
+ next += clen;
+ avail -= clen;
+ }
+ state = PAE_INMASK;
+ }
+ }
+ else /* state == PAE_INMASK */
+ {
+ if (t_isspace(*str))
+ {
+ *next = '\0';
+ return true;
+ }
+ else
+ {
+ int clen = pg_mblen(*str);
+
+ if (clen < avail)
+ {
+ COPYCHAR(next, *str);
+ next += clen;
+ avail -= clen;
+ }
+ }
+ }
+ *str += pg_mblen(*str);
+ }
+
+ *next = '\0';
+
+ return (state == PAE_INMASK); /* OK if we got a nonempty field */
+}
+
+/*
+ * Parses entry of an .affix file of MySpell or Hunspell format.
+ *
+ * An .affix file entry has the following format:
+ * - header
+ * <type> <flag> <cross_flag> <flag_count>
+ * - fields after header:
+ * <type> <flag> <find> <replace> <mask>
+ *
+ * str is the input line
+ * field values are returned to type etc, which must be buffers of size BUFSIZ.
+ *
+ * Returns number of fields found; any omitted fields are set to empty strings.
+ */
+static int
+parse_ooaffentry(char *str, char *type, char *flag, char *find,
+ char *repl, char *mask)
+{
+ int state = PAE_WAIT_TYPE;
+ int fields_read = 0;
+ bool valid = false;
+
+ *type = *flag = *find = *repl = *mask = '\0';
+
+ while (*str)
+ {
+ switch (state)
+ {
+ case PAE_WAIT_TYPE:
+ valid = get_nextfield(&str, type);
+ state = PAE_WAIT_FLAG;
+ break;
+ case PAE_WAIT_FLAG:
+ valid = get_nextfield(&str, flag);
+ state = PAE_WAIT_FIND;
+ break;
+ case PAE_WAIT_FIND:
+ valid = get_nextfield(&str, find);
+ state = PAE_WAIT_REPL;
+ break;
+ case PAE_WAIT_REPL:
+ valid = get_nextfield(&str, repl);
+ state = PAE_WAIT_MASK;
+ break;
+ case PAE_WAIT_MASK:
+ valid = get_nextfield(&str, mask);
+ state = -1; /* force loop exit */
+ break;
+ default:
+ elog(ERROR, "unrecognized state in parse_ooaffentry: %d",
+ state);
+ break;
+ }
+ if (valid)
+ fields_read++;
+ else
+ break; /* early EOL */
+ if (state < 0)
+ break; /* got all fields */
+ }
+
+ return fields_read;
+}
+
+/*
+ * Parses entry of an .affix file of Ispell format
+ *
+ * An .affix file entry has the following format:
+ * <mask> > [-<find>,]<replace>
+ */
+static bool
+parse_affentry(char *str, char *mask, char *find, char *repl)
+{
+	int			state = PAE_WAIT_MASK;
+	char	   *pmask = mask,
+			   *pfind = find,
+			   *prepl = repl;
+
+	*mask = *find = *repl = '\0';
+
+	/*
+	 * Six-state scanner over "<mask> > [-<find>,]<replace>".  Each output
+	 * buffer is filled character-by-character as its state is active; a '#'
+	 * starts a comment (line rejected while reading the mask, terminates the
+	 * replace string otherwise).
+	 */
+	while (*str)
+	{
+		if (state == PAE_WAIT_MASK)
+		{
+			/* Skipping leading whitespace before the mask */
+			if (t_iseq(str, '#'))
+				return false;
+			else if (!t_isspace(str))
+			{
+				COPYCHAR(pmask, str);
+				pmask += pg_mblen(str);
+				state = PAE_INMASK;
+			}
+		}
+		else if (state == PAE_INMASK)
+		{
+			/* Accumulating mask chars until the '>' separator */
+			if (t_iseq(str, '>'))
+			{
+				*pmask = '\0';
+				state = PAE_WAIT_FIND;
+			}
+			else if (!t_isspace(str))
+			{
+				COPYCHAR(pmask, str);
+				pmask += pg_mblen(str);
+			}
+		}
+		else if (state == PAE_WAIT_FIND)
+		{
+			/* After '>': '-' introduces a find string, else replace begins */
+			if (t_iseq(str, '-'))
+			{
+				state = PAE_INFIND;
+			}
+			else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
+			{
+				COPYCHAR(prepl, str);
+				prepl += pg_mblen(str);
+				state = PAE_INREPL;
+			}
+			else if (!t_isspace(str))
+				ereport(ERROR,
+						(errcode(ERRCODE_CONFIG_FILE_ERROR),
+						 errmsg("syntax error")));
+		}
+		else if (state == PAE_INFIND)
+		{
+			/* Accumulating find chars until the ',' separator */
+			if (t_iseq(str, ','))
+			{
+				*pfind = '\0';
+				state = PAE_WAIT_REPL;
+			}
+			else if (t_isalpha(str))
+			{
+				COPYCHAR(pfind, str);
+				pfind += pg_mblen(str);
+			}
+			else if (!t_isspace(str))
+				ereport(ERROR,
+						(errcode(ERRCODE_CONFIG_FILE_ERROR),
+						 errmsg("syntax error")));
+		}
+		else if (state == PAE_WAIT_REPL)
+		{
+			/* After ',': '-' means an explicitly empty replace string */
+			if (t_iseq(str, '-'))
+			{
+				break;			/* void repl */
+			}
+			else if (t_isalpha(str))
+			{
+				COPYCHAR(prepl, str);
+				prepl += pg_mblen(str);
+				state = PAE_INREPL;
+			}
+			else if (!t_isspace(str))
+				ereport(ERROR,
+						(errcode(ERRCODE_CONFIG_FILE_ERROR),
+						 errmsg("syntax error")));
+		}
+		else if (state == PAE_INREPL)
+		{
+			/* Accumulating replace chars until comment or EOL */
+			if (t_iseq(str, '#'))
+			{
+				*prepl = '\0';
+				break;
+			}
+			else if (t_isalpha(str))
+			{
+				COPYCHAR(prepl, str);
+				prepl += pg_mblen(str);
+			}
+			else if (!t_isspace(str))
+				ereport(ERROR,
+						(errcode(ERRCODE_CONFIG_FILE_ERROR),
+						 errmsg("syntax error")));
+		}
+		else
+			elog(ERROR, "unrecognized state in parse_affentry: %d", state);
+
+		str += pg_mblen(str);
+	}
+
+	/* Terminate whichever buffers are still open */
+	*pmask = *pfind = *prepl = '\0';
+
+	/* Entry is valid only with a nonempty mask and at least find or repl */
+	return (*mask && (*find || *repl));
+}
+
+/*
+ * Sets a Hunspell option depending on the flag type.
+ */
+static void
+setCompoundAffixFlagValue(IspellDict *Conf, CompoundAffixFlag *entry,
+						  char *s, uint32 val)
+{
+	/*
+	 * Fill *entry from the textual flag s and the affix parameter val.  In
+	 * FM_NUM mode the flag is parsed as a decimal number; otherwise the
+	 * string itself is stored (copied into dictionary context memory).
+	 */
+	if (Conf->flagMode == FM_NUM)
+	{
+		char	   *next;
+		int			i;
+
+		/*
+		 * Clear errno before strtol: strtol only sets errno on failure, so a
+		 * stale ERANGE from an earlier call would otherwise make us reject a
+		 * perfectly valid flag.
+		 */
+		errno = 0;
+		i = strtol(s, &next, 10);
+		if (s == next || errno == ERANGE)
+			ereport(ERROR,
+					(errcode(ERRCODE_CONFIG_FILE_ERROR),
+					 errmsg("invalid affix flag \"%s\"", s)));
+		if (i < 0 || i > FLAGNUM_MAXSIZE)
+			ereport(ERROR,
+					(errcode(ERRCODE_CONFIG_FILE_ERROR),
+					 errmsg("affix flag \"%s\" is out of range", s)));
+
+		entry->flag.i = i;
+	}
+	else
+		entry->flag.s = cpstrdup(Conf, s);
+
+	entry->flagMode = Conf->flagMode;
+	entry->value = val;
+}
+
+/*
+ * Sets up a correspondence for the affix parameter with the affix flag.
+ *
+ * Conf: current dictionary.
+ * s: affix flag in string.
+ * val: affix parameter.
+ */
+static void
+addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
+{
+	CompoundAffixFlag *newValue;
+	char		sbuf[BUFSIZ];
+	char	   *sflag;
+	int			clen;
+
+	/* Skip leading whitespace before the flag */
+	while (*s && t_isspace(s))
+		s += pg_mblen(s);
+
+	if (!*s)
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("syntax error")));
+
+	/* Get flag without \n */
+	sflag = sbuf;
+	while (*s && !t_isspace(s) && *s != '\n')
+	{
+		clen = pg_mblen(s);
+		COPYCHAR(sflag, s);
+		sflag += clen;
+		s += clen;
+	}
+	*sflag = '\0';
+
+	/* Resize array or allocate memory for array CompoundAffixFlag */
+	if (Conf->nCompoundAffixFlag >= Conf->mCompoundAffixFlag)
+	{
+		if (Conf->mCompoundAffixFlag)
+		{
+			/* Grow geometrically to amortize repeated appends */
+			Conf->mCompoundAffixFlag *= 2;
+			Conf->CompoundAffixFlags = (CompoundAffixFlag *)
+				repalloc((void *) Conf->CompoundAffixFlags,
+						 Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
+		}
+		else
+		{
+			Conf->mCompoundAffixFlag = 10;
+			Conf->CompoundAffixFlags = (CompoundAffixFlag *)
+				tmpalloc(Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
+		}
+	}
+
+	newValue = Conf->CompoundAffixFlags + Conf->nCompoundAffixFlag;
+
+	setCompoundAffixFlagValue(Conf, newValue, sbuf, val);
+
+	/* Presence of any compound flag enables compound-word processing */
+	Conf->usecompound = true;
+	Conf->nCompoundAffixFlag++;
+}
+
+/*
+ * Returns the set of affix parameters that corresponds to the set of affix
+ * flags s.
+ */
+static int
+getCompoundAffixFlagValue(IspellDict *Conf, char *s)
+{
+	uint32		flag = 0;
+	CompoundAffixFlag *found,
+				key;
+	char		sflag[BUFSIZ];
+	char	   *flagcur;
+
+	/* Nothing registered: no compound parameters can match */
+	if (Conf->nCompoundAffixFlag == 0)
+		return 0;
+
+	/*
+	 * Walk each individual flag in s, look it up in the (sorted)
+	 * CompoundAffixFlags array, and OR together the matching parameters.
+	 */
+	flagcur = s;
+	while (*flagcur)
+	{
+		getNextFlagFromString(Conf, &flagcur, sflag);
+		setCompoundAffixFlagValue(Conf, &key, sflag, 0);
+
+		found = (CompoundAffixFlag *)
+			bsearch(&key, (void *) Conf->CompoundAffixFlags,
+					Conf->nCompoundAffixFlag, sizeof(CompoundAffixFlag),
+					cmpcmdflag);
+		if (found != NULL)
+			flag |= found->value;
+	}
+
+	return flag;
+}
+
+/*
+ * Returns a flag set using the s parameter.
+ *
+ * If Conf->useFlagAliases is true then the s parameter is index of the
+ * Conf->AffixData array and function returns its entry.
+ * Else function returns the s parameter.
+ */
+static char *
+getAffixFlagSet(IspellDict *Conf, char *s)
+{
+	/*
+	 * With flag aliases active, s is a decimal index into Conf->AffixData;
+	 * otherwise s already is the flag set and is returned as-is.
+	 */
+	if (Conf->useFlagAliases && *s != '\0')
+	{
+		int			curaffix;
+		char	   *end;
+
+		/*
+		 * Clear errno before strtol: strtol only sets errno on failure, so a
+		 * stale ERANGE from an earlier call would otherwise make us reject a
+		 * valid alias number.
+		 */
+		errno = 0;
+		curaffix = strtol(s, &end, 10);
+		if (s == end || errno == ERANGE)
+			ereport(ERROR,
+					(errcode(ERRCODE_CONFIG_FILE_ERROR),
+					 errmsg("invalid affix alias \"%s\"", s)));
+
+		if (curaffix > 0 && curaffix < Conf->nAffixData)
+
+			/*
+			 * Do not subtract 1 from curaffix because empty string was added
+			 * in NIImportOOAffixes
+			 */
+			return Conf->AffixData[curaffix];
+		else if (curaffix > Conf->nAffixData)
+			ereport(ERROR,
+					(errcode(ERRCODE_CONFIG_FILE_ERROR),
+					 errmsg("invalid affix alias \"%s\"", s)));
+		/* curaffix == 0 or == nAffixData fall through to the empty set */
+		return VoidString;
+	}
+	else
+		return s;
+}
+
+/*
+ * Import an affix file that follows MySpell or Hunspell format.
+ *
+ * Conf: current dictionary.
+ * filename: path to the .affix file.
+ */
+static void
+NIImportOOAffixes(IspellDict *Conf, const char *filename)
+{
+	char		type[BUFSIZ],
+			   *ptype = NULL;
+	char		sflag[BUFSIZ];
+	char		mask[BUFSIZ],
+			   *pmask;
+	char		find[BUFSIZ],
+			   *pfind;
+	char		repl[BUFSIZ],
+			   *prepl;
+	bool		isSuffix = false;
+	int			naffix = 0,
+				curaffix = 0;
+	int			sflaglen = 0;
+	char		flagflags = 0;
+	tsearch_readline_state trst;
+	char	   *recoded;
+
+	/*
+	 * First pass: read the whole file once to collect compound-related flags
+	 * and the FLAG mode, which must be known before affix entries can be
+	 * parsed in the second pass.
+	 */
+	/* read file to find any flag */
+	Conf->usecompound = false;
+	Conf->useFlagAliases = false;
+	Conf->flagMode = FM_CHAR;
+
+	if (!tsearch_readline_begin(&trst, filename))
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("could not open affix file \"%s\": %m",
+						filename)));
+
+	while ((recoded = tsearch_readline(&trst)) != NULL)
+	{
+		/* Ignore blank and comment lines */
+		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
+		{
+			pfree(recoded);
+			continue;
+		}
+
+		if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
+			addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
+									  FF_COMPOUNDFLAG);
+		else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
+			addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
+									  FF_COMPOUNDBEGIN);
+		else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
+			addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
+									  FF_COMPOUNDLAST);
+		/* COMPOUNDLAST and COMPOUNDEND are synonyms */
+		else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
+			addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
+									  FF_COMPOUNDLAST);
+		else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
+			addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
+									  FF_COMPOUNDMIDDLE);
+		else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
+			addCompoundAffixFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
+									  FF_COMPOUNDONLY);
+		else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
+			addCompoundAffixFlagValue(Conf,
+									  recoded + strlen("COMPOUNDPERMITFLAG"),
+									  FF_COMPOUNDPERMITFLAG);
+		else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
+			addCompoundAffixFlagValue(Conf,
+									  recoded + strlen("COMPOUNDFORBIDFLAG"),
+									  FF_COMPOUNDFORBIDFLAG);
+		else if (STRNCMP(recoded, "FLAG") == 0)
+		{
+			char	   *s = recoded + strlen("FLAG");
+
+			while (*s && t_isspace(s))
+				s += pg_mblen(s);
+
+			if (*s)
+			{
+				if (STRNCMP(s, "long") == 0)
+					Conf->flagMode = FM_LONG;
+				else if (STRNCMP(s, "num") == 0)
+					Conf->flagMode = FM_NUM;
+				else if (STRNCMP(s, "default") != 0)
+					ereport(ERROR,
+							(errcode(ERRCODE_CONFIG_FILE_ERROR),
+							 errmsg("Ispell dictionary supports only "
+									"\"default\", \"long\", "
+									"and \"num\" flag values")));
+			}
+		}
+
+		pfree(recoded);
+	}
+	tsearch_readline_end(&trst);
+
+	/* Sort so that getCompoundAffixFlagValue() can bsearch the array */
+	if (Conf->nCompoundAffixFlag > 1)
+		qsort((void *) Conf->CompoundAffixFlags, Conf->nCompoundAffixFlag,
+			  sizeof(CompoundAffixFlag), cmpcmdflag);
+
+	/*
+	 * Second pass: re-read the file and import AF aliases plus PFX/SFX
+	 * headers and rule lines.
+	 */
+	if (!tsearch_readline_begin(&trst, filename))
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("could not open affix file \"%s\": %m",
+						filename)));
+
+	while ((recoded = tsearch_readline(&trst)) != NULL)
+	{
+		int			fields_read;
+
+		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
+			goto nextline;
+
+		fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
+
+		if (ptype)
+			pfree(ptype);
+		ptype = lowerstr_ctx(Conf, type);
+
+		/* First try to parse AF parameter (alias compression) */
+		if (STRNCMP(ptype, "af") == 0)
+		{
+			/* First line is the number of aliases */
+			if (!Conf->useFlagAliases)
+			{
+				Conf->useFlagAliases = true;
+				naffix = atoi(sflag);
+				if (naffix <= 0)
+					ereport(ERROR,
+							(errcode(ERRCODE_CONFIG_FILE_ERROR),
+							 errmsg("invalid number of flag vector aliases")));
+
+				/* Also reserve place for empty flag set */
+				naffix++;
+
+				Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
+				Conf->lenAffixData = Conf->nAffixData = naffix;
+
+				/* Add empty flag set into AffixData */
+				Conf->AffixData[curaffix] = VoidString;
+				curaffix++;
+			}
+			/* Other lines are aliases */
+			else
+			{
+				if (curaffix < naffix)
+				{
+					Conf->AffixData[curaffix] = cpstrdup(Conf, sflag);
+					curaffix++;
+				}
+				else
+					ereport(ERROR,
+							(errcode(ERRCODE_CONFIG_FILE_ERROR),
+							 errmsg("number of aliases exceeds specified number %d",
+									naffix - 1)));
+			}
+			goto nextline;
+		}
+		/* Else try to parse prefixes and suffixes */
+		if (fields_read < 4 ||
+			(STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
+			goto nextline;
+
+		/* Flag length must match the declared flag mode */
+		sflaglen = strlen(sflag);
+		if (sflaglen == 0
+			|| (sflaglen > 1 && Conf->flagMode == FM_CHAR)
+			|| (sflaglen > 2 && Conf->flagMode == FM_LONG))
+			goto nextline;
+
+		/*--------
+		 * Affix header. For example:
+		 * SFX \ N 1
+		 *--------
+		 */
+		if (fields_read == 4)
+		{
+			isSuffix = (STRNCMP(ptype, "sfx") == 0);
+			if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
+				flagflags = FF_CROSSPRODUCT;
+			else
+				flagflags = 0;
+		}
+		/*--------
+		 * Affix fields. For example:
+		 * SFX \   0	Y/L [^Y]
+		 *--------
+		 */
+		else
+		{
+			char	   *ptr;
+			int			aflg = 0;
+
+			/* Get flags after '/' (flags are case sensitive) */
+			if ((ptr = strchr(repl, '/')) != NULL)
+				aflg |= getCompoundAffixFlagValue(Conf,
+												  getAffixFlagSet(Conf,
+																  ptr + 1));
+			/* Get lowercased version of string before '/' */
+			prepl = lowerstr_ctx(Conf, repl);
+			if ((ptr = strchr(prepl, '/')) != NULL)
+				*ptr = '\0';
+			pfind = lowerstr_ctx(Conf, find);
+			pmask = lowerstr_ctx(Conf, mask);
+			/* A literal "0" field means the empty string */
+			if (t_iseq(find, '0'))
+				*pfind = '\0';
+			if (t_iseq(repl, '0'))
+				*prepl = '\0';
+
+			NIAddAffix(Conf, sflag, flagflags | aflg, pmask, pfind, prepl,
+					   isSuffix ? FF_SUFFIX : FF_PREFIX);
+			pfree(prepl);
+			pfree(pfind);
+			pfree(pmask);
+		}
+
+nextline:
+		pfree(recoded);
+	}
+
+	tsearch_readline_end(&trst);
+	if (ptype)
+		pfree(ptype);
+}
+
+/*
+ * import affixes
+ *
+ * Note caller must already have applied get_tsearch_config_filename
+ *
+ * This function is responsible for parsing ispell ("old format") affix files.
+ * If we realize that the file contains new-format commands, we pass off the
+ * work to NIImportOOAffixes(), which will re-read the whole file.
+ */
+void
+NIImportAffixes(IspellDict *Conf, const char *filename)
+{
+	char	   *pstr = NULL;
+	char		flag[BUFSIZ];
+	char		mask[BUFSIZ];
+	char		find[BUFSIZ];
+	char		repl[BUFSIZ];
+	char	   *s;
+	bool		suffixes = false;
+	bool		prefixes = false;
+	char		flagflags = 0;
+	tsearch_readline_state trst;
+	bool		oldformat = false;
+	char	   *recoded = NULL;
+
+	if (!tsearch_readline_begin(&trst, filename))
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("could not open affix file \"%s\": %m",
+						filename)));
+
+	Conf->usecompound = false;
+	Conf->useFlagAliases = false;
+	Conf->flagMode = FM_CHAR;
+
+	while ((recoded = tsearch_readline(&trst)) != NULL)
+	{
+		pstr = lowerstr(recoded);
+
+		/* Skip comments and empty lines */
+		if (*pstr == '#' || *pstr == '\n')
+			goto nextline;
+
+		if (STRNCMP(pstr, "compoundwords") == 0)
+		{
+			/* Find case-insensitive L flag in non-lowercased string */
+			s = findchar2(recoded, 'l', 'L');
+			if (s)
+			{
+				/* Skip the rest of the "l..." word, then whitespace */
+				while (*s && !t_isspace(s))
+					s += pg_mblen(s);
+				while (*s && t_isspace(s))
+					s += pg_mblen(s);
+
+				if (*s && pg_mblen(s) == 1)
+				{
+					addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG);
+					Conf->usecompound = true;
+				}
+				oldformat = true;
+				goto nextline;
+			}
+		}
+		if (STRNCMP(pstr, "suffixes") == 0)
+		{
+			/* Subsequent rules define suffixes until "prefixes" appears */
+			suffixes = true;
+			prefixes = false;
+			oldformat = true;
+			goto nextline;
+		}
+		if (STRNCMP(pstr, "prefixes") == 0)
+		{
+			suffixes = false;
+			prefixes = true;
+			oldformat = true;
+			goto nextline;
+		}
+		if (STRNCMP(pstr, "flag") == 0)
+		{
+			s = recoded + 4;	/* we need non-lowercased string */
+			flagflags = 0;
+
+			while (*s && t_isspace(s))
+				s += pg_mblen(s);
+
+			/* '*' = cross-product flag, '~' = compound-only flag */
+			if (*s == '*')
+			{
+				flagflags |= FF_CROSSPRODUCT;
+				s++;
+			}
+			else if (*s == '~')
+			{
+				flagflags |= FF_COMPOUNDONLY;
+				s++;
+			}
+
+			if (*s == '\\')
+				s++;
+
+			/*
+			 * An old-format flag is a single ASCII character; we expect it to
+			 * be followed by EOL, whitespace, or ':'.  Otherwise this is a
+			 * new-format flag command.
+			 */
+			if (*s && pg_mblen(s) == 1)
+			{
+				COPYCHAR(flag, s);
+				flag[1] = '\0';
+
+				s++;
+				if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' ||
+					t_isspace(s))
+				{
+					oldformat = true;
+					goto nextline;
+				}
+			}
+			goto isnewformat;
+		}
+		/* These keywords appear only in MySpell/Hunspell format files */
+		if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 ||
+			STRNCMP(recoded, "COMPOUNDMIN") == 0 ||
+			STRNCMP(recoded, "PFX") == 0 ||
+			STRNCMP(recoded, "SFX") == 0)
+			goto isnewformat;
+
+		/* Rule lines before any "suffixes"/"prefixes" header are ignored */
+		if ((!suffixes) && (!prefixes))
+			goto nextline;
+
+		if (!parse_affentry(pstr, mask, find, repl))
+			goto nextline;
+
+		NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
+
+nextline:
+		pfree(recoded);
+		pfree(pstr);
+	}
+	tsearch_readline_end(&trst);
+	return;
+
+isnewformat:
+	/* Mixing both dialects in one file is an error */
+	if (oldformat)
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("affix file contains both old-style and new-style commands")));
+	tsearch_readline_end(&trst);
+
+	/* Hand the whole file over to the Hunspell/MySpell parser */
+	NIImportOOAffixes(Conf, filename);
+}
+
+/*
+ * Merges two affix flag sets and stores a new affix flag set into
+ * Conf->AffixData.
+ *
+ * Returns index of a new affix flag set.
+ */
+static int
+MergeAffix(IspellDict *Conf, int a1, int a2)
+{
+	char	  **ptr;
+
+	Assert(a1 < Conf->nAffixData && a2 < Conf->nAffixData);
+
+	/* Do not merge affix flags if one of affix flags is empty */
+	if (*Conf->AffixData[a1] == '\0')
+		return a2;
+	else if (*Conf->AffixData[a2] == '\0')
+		return a1;
+
+	/* Grow AffixData geometrically; +1 leaves room for a NULL sentinel */
+	while (Conf->nAffixData + 1 >= Conf->lenAffixData)
+	{
+		Conf->lenAffixData *= 2;
+		Conf->AffixData = (char **) repalloc(Conf->AffixData,
+											 sizeof(char *) * Conf->lenAffixData);
+	}
+
+	/* Concatenate the two flag strings (comma-separated in FM_NUM mode) */
+	ptr = Conf->AffixData + Conf->nAffixData;
+	if (Conf->flagMode == FM_NUM)
+	{
+		*ptr = cpalloc(strlen(Conf->AffixData[a1]) +
+					   strlen(Conf->AffixData[a2]) +
+					   1 /* comma */ + 1 /* \0 */ );
+		sprintf(*ptr, "%s,%s", Conf->AffixData[a1], Conf->AffixData[a2]);
+	}
+	else
+	{
+		*ptr = cpalloc(strlen(Conf->AffixData[a1]) +
+					   strlen(Conf->AffixData[a2]) +
+					   1 /* \0 */ );
+		sprintf(*ptr, "%s%s", Conf->AffixData[a1], Conf->AffixData[a2]);
+	}
+	ptr++;
+	*ptr = NULL;
+	Conf->nAffixData++;
+
+	return Conf->nAffixData - 1;
+}
+
+/*
+ * Returns the set of affix parameters that corresponds to the set of affix
+ * flags with the given index.
+ */
+static uint32
+makeCompoundFlags(IspellDict *Conf, int affix)
+{
+	uint32		flags;
+
+	Assert(affix < Conf->nAffixData);
+
+	/* Resolve the flag string for this affix set, keep compound bits only */
+	flags = getCompoundAffixFlagValue(Conf, Conf->AffixData[affix]);
+	return flags & FF_COMPOUNDFLAGMASK;
+}
+
+/*
+ * Makes a prefix tree for the given level.
+ *
+ * Conf: current dictionary.
+ * low: lower index of the Conf->Spell array.
+ * high: upper index of the Conf->Spell array.
+ * level: current prefix tree level.
+ */
+static SPNode *
+mkSPNode(IspellDict *Conf, int low, int high, int level)
+{
+	int			i;
+	int			nchar = 0;
+	char		lastchar = '\0';
+	SPNode	   *rs;
+	SPNodeData *data;
+	int			lownew = low;
+
+	/*
+	 * First pass: count distinct characters at this level among Spell
+	 * entries [low, high).  The array is sorted, so equal characters are
+	 * adjacent.
+	 */
+	for (i = low; i < high; i++)
+		if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level])
+		{
+			nchar++;
+			lastchar = Conf->Spell[i]->word[level];
+		}
+
+	if (!nchar)
+		return NULL;
+
+	rs = (SPNode *) cpalloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
+	rs->length = nchar;
+	data = rs->data;
+
+	/*
+	 * Second pass: fill one SPNodeData per distinct character, recursing for
+	 * each character's sub-range to build the next level.
+	 */
+	lastchar = '\0';
+	for (i = low; i < high; i++)
+		if (Conf->Spell[i]->p.d.len > level)
+		{
+			if (lastchar != Conf->Spell[i]->word[level])
+			{
+				if (lastchar)
+				{
+					/* Next level of the prefix tree */
+					data->node = mkSPNode(Conf, lownew, i, level + 1);
+					lownew = i;
+					data++;
+				}
+				lastchar = Conf->Spell[i]->word[level];
+			}
+			data->val = ((uint8 *) (Conf->Spell[i]->word))[level];
+			if (Conf->Spell[i]->p.d.len == level + 1)
+			{
+				bool		clearCompoundOnly = false;
+
+				if (data->isword && data->affix != Conf->Spell[i]->p.d.affix)
+				{
+					/*
+					 * MergeAffix called a few times. If one of word is
+					 * allowed to be in compound word and another isn't, then
+					 * clear FF_COMPOUNDONLY flag.
+					 */
+
+					clearCompoundOnly = (FF_COMPOUNDONLY & data->compoundflag
+										 & makeCompoundFlags(Conf, Conf->Spell[i]->p.d.affix))
+						? false : true;
+					data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix);
+				}
+				else
+					data->affix = Conf->Spell[i]->p.d.affix;
+				data->isword = 1;
+
+				data->compoundflag = makeCompoundFlags(Conf, data->affix);
+
+				/* COMPOUNDONLY without COMPOUNDFLAG implies COMPOUNDFLAG */
+				if ((data->compoundflag & FF_COMPOUNDONLY) &&
+					(data->compoundflag & FF_COMPOUNDFLAG) == 0)
+					data->compoundflag |= FF_COMPOUNDFLAG;
+
+				if (clearCompoundOnly)
+					data->compoundflag &= ~FF_COMPOUNDONLY;
+			}
+		}
+
+	/* Next level of the prefix tree */
+	data->node = mkSPNode(Conf, lownew, high, level + 1);
+
+	return rs;
+}
+
+/*
+ * Builds the Conf->Dictionary tree and AffixData from the imported dictionary
+ * and affixes.
+ */
+void
+NISortDictionary(IspellDict *Conf)
+{
+	int			i;
+	int			naffix;
+	int			curaffix;
+
+	/* compress affixes */
+
+	/*
+	 * If we use flag aliases then we need to use Conf->AffixData filled in
+	 * the NIImportOOAffixes().
+	 */
+	if (Conf->useFlagAliases)
+	{
+		for (i = 0; i < Conf->nspell; i++)
+		{
+			char	   *end;
+
+			if (*Conf->Spell[i]->p.flag != '\0')
+			{
+				/*
+				 * Clear errno before strtol: strtol only sets errno on
+				 * failure, so a stale ERANGE from an earlier call would
+				 * otherwise make us reject a valid alias number.
+				 */
+				errno = 0;
+				curaffix = strtol(Conf->Spell[i]->p.flag, &end, 10);
+				if (Conf->Spell[i]->p.flag == end || errno == ERANGE)
+					ereport(ERROR,
+							(errcode(ERRCODE_CONFIG_FILE_ERROR),
+							 errmsg("invalid affix alias \"%s\"",
+									Conf->Spell[i]->p.flag)));
+				if (curaffix < 0 || curaffix >= Conf->nAffixData)
+					ereport(ERROR,
+							(errcode(ERRCODE_CONFIG_FILE_ERROR),
+							 errmsg("invalid affix alias \"%s\"",
+									Conf->Spell[i]->p.flag)));
+				/* Trailing garbage after the number is also an error */
+				if (*end != '\0' && !t_isdigit(end) && !t_isspace(end))
+					ereport(ERROR,
+							(errcode(ERRCODE_CONFIG_FILE_ERROR),
+							 errmsg("invalid affix alias \"%s\"",
+									Conf->Spell[i]->p.flag)));
+			}
+			else
+			{
+				/*
+				 * If Conf->Spell[i]->p.flag is empty, then get empty value of
+				 * Conf->AffixData (0 index).
+				 */
+				curaffix = 0;
+			}
+
+			Conf->Spell[i]->p.d.affix = curaffix;
+			Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
+		}
+	}
+	/* Otherwise fill Conf->AffixData here */
+	else
+	{
+		/* Count the number of different flags used in the dictionary */
+		qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *),
+			  cmpspellaffix);
+
+		naffix = 0;
+		for (i = 0; i < Conf->nspell; i++)
+		{
+			if (i == 0 ||
+				strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag) != 0)
+				naffix++;
+		}
+
+		/*
+		 * Fill in Conf->AffixData with the affixes that were used in the
+		 * dictionary. Replace textual flag-field of Conf->Spell entries with
+		 * indexes into Conf->AffixData array.
+		 */
+		Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
+
+		curaffix = -1;
+		for (i = 0; i < Conf->nspell; i++)
+		{
+			if (i == 0 ||
+				strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix]) != 0)
+			{
+				curaffix++;
+				Assert(curaffix < naffix);
+				Conf->AffixData[curaffix] = cpstrdup(Conf,
+													 Conf->Spell[i]->p.flag);
+			}
+
+			Conf->Spell[i]->p.d.affix = curaffix;
+			Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
+		}
+
+		Conf->lenAffixData = Conf->nAffixData = naffix;
+	}
+
+	/* Start build a prefix tree */
+	qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
+	Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
+}
+
+/*
+ * Makes a prefix tree for the given level using the repl string of an affix
+ * rule.  Affixes with an empty replace string are not included in this
+ * prefix tree; those are handled by mkVoidAffix() instead.
+ *
+ * Conf: current dictionary.
+ * low: lower index of the Conf->Affix array.
+ * high: upper index of the Conf->Affix array.
+ * level: current prefix tree level.
+ * type: FF_SUFFIX or FF_PREFIX.
+ */
+static AffixNode *
+mkANode(IspellDict *Conf, int low, int high, int level, int type)
+{
+	int			i;
+	int			nchar = 0;
+	uint8		lastchar = '\0';
+	AffixNode  *rs;
+	AffixNodeData *data;
+	int			lownew = low;
+	int			naff;
+	AFFIX	  **aff;
+
+	/*
+	 * First pass: count distinct characters at this level among Affix
+	 * entries [low, high); the array is sorted so equal chars are adjacent.
+	 */
+	for (i = low; i < high; i++)
+		if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
+		{
+			nchar++;
+			lastchar = GETCHAR(Conf->Affix + i, level, type);
+		}
+
+	if (!nchar)
+		return NULL;
+
+	/* Scratch array collecting affixes that terminate at this level */
+	aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1));
+	naff = 0;
+
+	rs = (AffixNode *) cpalloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
+	rs->length = nchar;
+	data = rs->data;
+
+	/* Second pass: fill a node entry per distinct character, recursing */
+	lastchar = '\0';
+	for (i = low; i < high; i++)
+		if (Conf->Affix[i].replen > level)
+		{
+			if (lastchar != GETCHAR(Conf->Affix + i, level, type))
+			{
+				if (lastchar)
+				{
+					/* Next level of the prefix tree */
+					data->node = mkANode(Conf, lownew, i, level + 1, type);
+					if (naff)
+					{
+						/* Attach affixes collected for the previous char */
+						data->naff = naff;
+						data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
+						memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
+						naff = 0;
+					}
+					data++;
+					lownew = i;
+				}
+				lastchar = GETCHAR(Conf->Affix + i, level, type);
+			}
+			data->val = GETCHAR(Conf->Affix + i, level, type);
+			if (Conf->Affix[i].replen == level + 1)
+			{					/* affix stopped */
+				aff[naff++] = Conf->Affix + i;
+			}
+		}
+
+	/* Next level of the prefix tree */
+	data->node = mkANode(Conf, lownew, high, level + 1, type);
+	if (naff)
+	{
+		data->naff = naff;
+		data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
+		memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
+		naff = 0;
+	}
+
+	pfree(aff);
+
+	return rs;
+}
+
+/*
+ * Makes the root void node in the prefix tree. The root void node is created
+ * for affixes which have empty replace string ("repl" field).
+ */
+static void
+mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
+{
+	int			i,
+				cnt = 0;
+	/* Prefixes occupy [0, startsuffix), suffixes [startsuffix, naffixes) */
+	int			start = (issuffix) ? startsuffix : 0;
+	int			end = (issuffix) ? Conf->naffixes : startsuffix;
+	AffixNode  *Affix = (AffixNode *) palloc0(ANHRDSZ + sizeof(AffixNodeData));
+
+	Affix->length = 1;
+	Affix->isvoid = 1;
+
+	/* Push the void node in front of the existing tree root */
+	if (issuffix)
+	{
+		Affix->data->node = Conf->Suffix;
+		Conf->Suffix = Affix;
+	}
+	else
+	{
+		Affix->data->node = Conf->Prefix;
+		Conf->Prefix = Affix;
+	}
+
+	/* Count affixes with empty replace string */
+	for (i = start; i < end; i++)
+		if (Conf->Affix[i].replen == 0)
+			cnt++;
+
+	/* There are no affixes with an empty replace string */
+	if (cnt == 0)
+		return;
+
+	Affix->data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * cnt);
+	Affix->data->naff = (uint32) cnt;
+
+	/* Second pass: store pointers to each empty-replace affix */
+	cnt = 0;
+	for (i = start; i < end; i++)
+		if (Conf->Affix[i].replen == 0)
+		{
+			Affix->data->aff[cnt] = Conf->Affix + i;
+			cnt++;
+		}
+}
+
+/*
+ * Checks if the affixflag is used by dictionary. Conf->AffixData does not
+ * contain affixflag if this flag is not used actually by the .dict file.
+ *
+ * Conf: current dictionary.
+ * affixflag: affix flag.
+ *
+ * Returns true if the Conf->AffixData array contains affixflag, otherwise
+ * returns false.
+ */
+static bool
+isAffixInUse(IspellDict *Conf, char *affixflag)
+{
+	int			idx;
+
+	/* Linear scan of every stored affix-flag set for the given flag */
+	for (idx = 0; idx < Conf->nAffixData; idx++)
+	{
+		if (IsAffixFlagInUse(Conf, idx, affixflag))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Builds Conf->Prefix and Conf->Suffix trees from the imported affixes.
+ */
+void
+NISortAffixes(IspellDict *Conf)
+{
+	AFFIX	   *Affix;
+	size_t		i;			/* NOTE(review): compared against int firstsuffix below — confirm signedness is harmless */
+	CMPDAffix  *ptr;
+	int			firstsuffix = Conf->naffixes;
+
+	if (Conf->naffixes == 0)
+		return;
+
+	/* Store compound affixes in the Conf->CompoundAffix array */
+	if (Conf->naffixes > 1)
+		qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
+	Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes);
+	ptr->affix = NULL;
+
+	for (i = 0; i < Conf->naffixes; i++)
+	{
+		Affix = &(((AFFIX *) Conf->Affix)[i]);
+		/* After sorting, prefixes precede suffixes; record the boundary */
+		if (Affix->type == FF_SUFFIX && i < firstsuffix)
+			firstsuffix = i;
+
+		if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
+			isAffixInUse(Conf, Affix->flag))
+		{
+			bool		issuffix = (Affix->type == FF_SUFFIX);
+
+			if (ptr == Conf->CompoundAffix ||
+				issuffix != (ptr - 1)->issuffix ||
+				strbncmp((const unsigned char *) (ptr - 1)->affix,
+						 (const unsigned char *) Affix->repl,
+						 (ptr - 1)->len))
+			{
+				/* leave only unique and minimal suffixes */
+				ptr->affix = Affix->repl;
+				ptr->len = Affix->replen;
+				ptr->issuffix = issuffix;
+				ptr++;
+			}
+		}
+	}
+	/* NULL-terminate and trim the array to its final size */
+	ptr->affix = NULL;
+	Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
+
+	/* Start build a prefix tree */
+	Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
+	Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
+	mkVoidAffix(Conf, true, firstsuffix);
+	mkVoidAffix(Conf, false, firstsuffix);
+}
+
+/*
+ * Walks the affix prefix tree matching characters of word (from the front
+ * for prefixes, from the back for suffixes, per GETWCHAR/type).  Starting at
+ * *level, returns the first node holding affixes and updates *level to the
+ * match depth; returns NULL when no further match exists.
+ */
+static AffixNodeData *
+FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
+{
+	AffixNodeData *StopLow,
+			   *StopHigh,
+			   *StopMiddle;
+	uint8		symbol;
+
+	if (node->isvoid)
+	{							/* search void affixes */
+		if (node->data->naff)
+			return node->data;
+		node = node->data->node;
+	}
+
+	while (node && *level < wrdlen)
+	{
+		/* Binary search for the current character among the node's entries */
+		StopLow = node->data;
+		StopHigh = node->data + node->length;
+		while (StopLow < StopHigh)
+		{
+			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+			symbol = GETWCHAR(word, wrdlen, *level, type);
+
+			if (StopMiddle->val == symbol)
+			{
+				(*level)++;
+				if (StopMiddle->naff)
+					return StopMiddle;
+				node = StopMiddle->node;
+				break;
+			}
+			else if (StopMiddle->val < symbol)
+				StopLow = StopMiddle + 1;
+			else
+				StopHigh = StopMiddle;
+		}
+		/* Character not found at this level: no (further) match */
+		if (StopLow >= StopHigh)
+			break;
+	}
+	return NULL;
+}
+
+/*
+ * Applies affix rule Affix in reverse to word (length len), writing the
+ * candidate base form into newword.  flagflags carries the compound-position
+ * context (0 = standalone word).  Returns newword if the affix is admissible
+ * here and the reconstructed word matches the affix's condition mask
+ * (checked via regis or regex), else NULL.  *baselen, if given, receives the
+ * length of the unchanged part for a suffix and is consulted for a prefix.
+ */
+static char *
+CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
+{
+	/*
+	 * Check compound allow flags
+	 */
+
+	if (flagflags == 0)
+	{
+		/* standalone word: compound-only affixes are not applicable */
+		if (Affix->flagflags & FF_COMPOUNDONLY)
+			return NULL;
+	}
+	else if (flagflags & FF_COMPOUNDBEGIN)
+	{
+		if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
+			return NULL;
+		if ((Affix->flagflags & FF_COMPOUNDBEGIN) == 0)
+			if (Affix->type == FF_SUFFIX)
+				return NULL;
+	}
+	else if (flagflags & FF_COMPOUNDMIDDLE)
+	{
+		if ((Affix->flagflags & FF_COMPOUNDMIDDLE) == 0 ||
+			(Affix->flagflags & FF_COMPOUNDFORBIDFLAG))
+			return NULL;
+	}
+	else if (flagflags & FF_COMPOUNDLAST)
+	{
+		if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
+			return NULL;
+		if ((Affix->flagflags & FF_COMPOUNDLAST) == 0)
+			if (Affix->type == FF_PREFIX)
+				return NULL;
+	}
+
+	/*
+	 * make replace pattern of affix
+	 */
+	if (Affix->type == FF_SUFFIX)
+	{
+		/* strip the suffix's repl from the tail, restore its find string */
+		strcpy(newword, word);
+		strcpy(newword + len - Affix->replen, Affix->find);
+		if (baselen)			/* store length of non-changed part of word */
+			*baselen = len - Affix->replen;
+	}
+	else
+	{
+		/*
+		 * if prefix is an all non-changed part's length then all word
+		 * contains only prefix and suffix, so out
+		 */
+		if (baselen && *baselen + strlen(Affix->find) <= Affix->replen)
+			return NULL;
+		strcpy(newword, Affix->find);
+		strcat(newword, word + Affix->replen);
+	}
+
+	/*
+	 * check resulting word
+	 */
+	if (Affix->issimple)
+		return newword;
+	else if (Affix->isregis)
+	{
+		/* lightweight "regis" matcher for simple character-class masks */
+		if (RS_execute(&(Affix->reg.regis), newword))
+			return newword;
+	}
+	else
+	{
+		pg_wchar   *data;
+		size_t		data_len;
+		int			newword_len;
+
+		/* Convert data string to wide characters */
+		newword_len = strlen(newword);
+		data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
+		data_len = pg_mb2wchar_with_len(newword, data, newword_len);
+
+		if (pg_regexec(&(Affix->reg.pregex->regex), data, data_len,
+					   0, NULL, 0, NULL, 0) == REG_OKAY)
+		{
+			pfree(data);
+			return newword;
+		}
+		pfree(data);
+	}
+
+	return NULL;
+}
+
+/*
+ * Appends a copy of word to the NULL-terminated forms array at *cur, unless
+ * the array is full or word duplicates the previous entry.  Returns the
+ * number of entries added (0 or 1).
+ */
+static int
+addToResult(char **forms, char **cur, char *word)
+{
+	/* Never overflow the fixed-size result array */
+	if (cur - forms >= MAX_NORM - 1)
+		return 0;
+
+	/* Skip exact duplicates of the most recently added form */
+	if (cur != forms && strcmp(word, *(cur - 1)) == 0)
+		return 0;
+
+	*cur = pstrdup(word);
+	*(cur + 1) = NULL;
+	return 1;
+}
+
+/*
+ * Returns a NULL-terminated palloc'd array of normal forms for word under
+ * compound context flag, or NULL if none found.  Tries, in order: the word
+ * itself, prefix-only removal, suffix removal, and suffix+prefix removal,
+ * accepting each candidate only if FindWord() confirms it in the dictionary.
+ */
+static char **
+NormalizeSubWord(IspellDict *Conf, char *word, int flag)
+{
+	AffixNodeData *suffix = NULL,
+			   *prefix = NULL;
+	int			slevel = 0,
+				plevel = 0;
+	int			wrdlen = strlen(word),
+				swrdlen;
+	char	  **forms;
+	char	  **cur;
+	char		newword[2 * MAXNORMLEN] = "";
+	char		pnewword[2 * MAXNORMLEN] = "";
+	AffixNode  *snode = Conf->Suffix,
+			   *pnode;
+	int			i,
+				j;
+
+	if (wrdlen > MAXNORMLEN)
+		return NULL;
+	cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
+	*cur = NULL;
+
+
+	/* Check that the word itself is normal form */
+	if (FindWord(Conf, word, VoidString, flag))
+	{
+		*cur = pstrdup(word);
+		cur++;
+		*cur = NULL;
+	}
+
+	/* Find all other NORMAL forms of the 'word' (check only prefix) */
+	pnode = Conf->Prefix;
+	plevel = 0;
+	while (pnode)
+	{
+		prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
+		if (!prefix)
+			break;
+		for (j = 0; j < prefix->naff; j++)
+		{
+			if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL))
+			{
+				/* prefix success */
+				if (FindWord(Conf, newword, prefix->aff[j]->flag, flag))
+					cur += addToResult(forms, cur, newword);
+			}
+		}
+		pnode = prefix->node;
+	}
+
+	/*
+	 * Find all other NORMAL forms of the 'word' (check suffix and then
+	 * prefix)
+	 */
+	while (snode)
+	{
+		int			baselen = 0;
+
+		/* find possible suffix */
+		suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
+		if (!suffix)
+			break;
+		/* foreach suffix check affix */
+		for (i = 0; i < suffix->naff; i++)
+		{
+			if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen))
+			{
+				/* suffix success */
+				if (FindWord(Conf, newword, suffix->aff[i]->flag, flag))
+					cur += addToResult(forms, cur, newword);
+
+				/* now we will look changed word with prefixes */
+				pnode = Conf->Prefix;
+				plevel = 0;
+				swrdlen = strlen(newword);
+				while (pnode)
+				{
+					prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
+					if (!prefix)
+						break;
+					for (j = 0; j < prefix->naff; j++)
+					{
+						if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen))
+						{
+							/* prefix success */
+							/* cross-product pair: no flag re-check needed */
+							char	   *ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
+							VoidString : prefix->aff[j]->flag;
+
+							if (FindWord(Conf, pnewword, ff, flag))
+								cur += addToResult(forms, cur, pnewword);
+						}
+					}
+					pnode = prefix->node;
+				}
+			}
+		}
+
+		snode = suffix->node;
+	}
+
+	/* No forms found: free the empty array and report failure with NULL */
+	if (cur == forms)
+	{
+		pfree(forms);
+		return NULL;
+	}
+	return forms;
+}
+
+/* One candidate decomposition of a compound word */
+typedef struct SplitVar
+{
+	int			nstem;		/* number of stems collected so far */
+	int			lenstem;	/* allocated length of stem[] */
+	char	  **stem;		/* array of stem strings */
+	struct SplitVar *next;	/* next variant in singly-linked list */
+} SplitVar;
+
+/*
+ * Scans the CompoundAffix array (resuming from *ptr) for a compound affix
+ * matching word.  With CheckInPlace the affix must match at the start of
+ * word; otherwise it may occur anywhere (strstr).  Returns the matched
+ * length for a suffix, 0 for a prefix, or -1 when no entry matches; *ptr is
+ * left just past the matched entry so the caller can resume the scan.
+ */
+static int
+CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len, bool CheckInPlace)
+{
+	bool		issuffix;
+
+	/* in case CompoundAffix is null: */
+	if (*ptr == NULL)
+		return -1;
+
+	if (CheckInPlace)
+	{
+		while ((*ptr)->affix)
+		{
+			if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
+			{
+				len = (*ptr)->len;
+				issuffix = (*ptr)->issuffix;
+				(*ptr)++;
+				return (issuffix) ? len : 0;
+			}
+			(*ptr)++;
+		}
+	}
+	else
+	{
+		char	   *affbegin;
+
+		while ((*ptr)->affix)
+		{
+			if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
+			{
+				/* include the offset of the occurrence in the length */
+				len = (*ptr)->len + (affbegin - word);
+				issuffix = (*ptr)->issuffix;
+				(*ptr)++;
+				return (issuffix) ? len : 0;
+			}
+			(*ptr)++;
+		}
+	}
+	return -1;
+}
+
+/*
+ * Allocate a new SplitVar.  When 's' is supplied, its stems are copied
+ * (string contents duplicated if makedup is true, otherwise pointers are
+ * shared); when 's' is NULL, an empty variant with room for 16 stems is
+ * created.  The 'next' link is always reset to NULL.
+ */
+static SplitVar *
+CopyVar(SplitVar *s, int makedup)
+{
+	SplitVar   *v = (SplitVar *) palloc(sizeof(SplitVar));
+
+	v->next = NULL;
+	if (s)
+	{
+		int			i;
+
+		v->lenstem = s->lenstem;
+		v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
+		v->nstem = s->nstem;
+		for (i = 0; i < s->nstem; i++)
+			v->stem[i] = (makedup) ? pstrdup(s->stem[i]) : s->stem[i];
+	}
+	else
+	{
+		v->lenstem = 16;
+		v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
+		v->nstem = 0;
+	}
+	return v;
+}
+
+/*
+ * Append 'word' to v's stem array, doubling the array when full.
+ * The pointer is stored as-is; the string is not copied.
+ */
+static void
+AddStem(SplitVar *v, char *word)
+{
+	if (v->nstem >= v->lenstem)
+	{
+		v->lenstem *= 2;
+		v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem);
+	}
+
+	v->stem[v->nstem] = word;
+	v->nstem++;
+}
+
+/*
+ * Recursively enumerate ways to split word[startpos..wordlen-1] into
+ * dictionary words joined by compound affixes.  'orig', if given, carries
+ * the stems accepted so far; the result is a linked list of SplitVar
+ * variants (there is always at least one, holding the unsplit remainder).
+ *
+ * NOTE(review): the recursion and the interplay of startpos/minpos/level
+ * are intricate; the comments below describe only what the visible control
+ * flow establishes.
+ */
+static SplitVar *
+SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int wordlen, int startpos, int minpos)
+{
+	SplitVar   *var = NULL;
+	SPNodeData *StopLow,
+			   *StopHigh,
+			   *StopMiddle = NULL;
+	SPNode	   *node = (snode) ? snode : Conf->Dictionary;
+	int			level = (snode) ? minpos : startpos;	/* recursive
+														 * minpos==level */
+	int			lenaff;
+	CMPDAffix  *caff;
+	char	   *notprobed;
+	int			compoundflag = 0;
+
+	/*
+	 * notprobed[] flags, one per input byte: cleared once a compound break
+	 * ending at that position succeeded, so the same break isn't probed
+	 * again (see the tests on notprobed[] below).
+	 */
+	notprobed = (char *) palloc(wordlen);
+	memset(notprobed, 1, wordlen);
+	var = CopyVar(orig, 1);
+
+	while (level < wordlen)
+	{
+		/* find word with epenthetic or/and compound affix */
+		caff = Conf->CompoundAffix;
+		while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) >= 0)
+		{
+			/*
+			 * there is one of compound affixes, so check word for existings
+			 */
+			char		buf[MAXNORMLEN];
+			char	  **subres;
+
+			lenaff = level - startpos + lenaff;
+
+			if (!notprobed[startpos + lenaff - 1])
+				continue;
+
+			if (level + lenaff - 1 <= minpos)
+				continue;
+
+			if (lenaff >= MAXNORMLEN)
+				continue;		/* skip too big value */
+			if (lenaff > 0)
+				memcpy(buf, word + startpos, lenaff);
+			buf[lenaff] = '\0';
+
+			if (level == 0)
+				compoundflag = FF_COMPOUNDBEGIN;
+			else if (level == wordlen - 1)
+				compoundflag = FF_COMPOUNDLAST;
+			else
+				compoundflag = FF_COMPOUNDMIDDLE;
+			subres = NormalizeSubWord(Conf, buf, compoundflag);
+			if (subres)
+			{
+				/* Yes, it was a word from dictionary */
+				SplitVar   *new = CopyVar(var, 0);
+				SplitVar   *ptr = var;
+				char	  **sptr = subres;
+
+				notprobed[startpos + lenaff - 1] = 0;
+
+				while (*sptr)
+				{
+					AddStem(new, *sptr);
+					sptr++;
+				}
+				pfree(subres);
+
+				/* recurse on the remainder, appending variants at the tail */
+				while (ptr->next)
+					ptr = ptr->next;
+				ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff);
+
+				pfree(new->stem);
+				pfree(new);
+			}
+		}
+
+		if (!node)
+			break;
+
+		/* binary-search this trie node for the current byte of 'word' */
+		StopLow = node->data;
+		StopHigh = node->data + node->length;
+		while (StopLow < StopHigh)
+		{
+			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+			if (StopMiddle->val == ((uint8 *) (word))[level])
+				break;
+			else if (StopMiddle->val < ((uint8 *) (word))[level])
+				StopLow = StopMiddle + 1;
+			else
+				StopHigh = StopMiddle;
+		}
+
+		if (StopLow < StopHigh)
+		{
+			if (startpos == 0)
+				compoundflag = FF_COMPOUNDBEGIN;
+			else if (level == wordlen - 1)
+				compoundflag = FF_COMPOUNDLAST;
+			else
+				compoundflag = FF_COMPOUNDMIDDLE;
+
+			/* find infinitive */
+			if (StopMiddle->isword &&
+				(StopMiddle->compoundflag & compoundflag) &&
+				notprobed[level])
+			{
+				/* ok, we found full compoundallowed word */
+				if (level > minpos)
+				{
+					/* and its length more than minimal */
+					if (wordlen == level + 1)
+					{
+						/* well, it was last word */
+						AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
+						pfree(notprobed);
+						return var;
+					}
+					else
+					{
+						/* then we will search more big word at the same point */
+						SplitVar   *ptr = var;
+
+						while (ptr->next)
+							ptr = ptr->next;
+						ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
+						/* we can find next word */
+						level++;
+						AddStem(var, pnstrdup(word + startpos, level - startpos));
+						node = Conf->Dictionary;
+						startpos = level;
+						continue;
+					}
+				}
+			}
+			node = StopMiddle->node;
+		}
+		else
+			node = NULL;
+		level++;
+	}
+
+	/* whatever is left becomes the final stem of this variant */
+	AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
+	pfree(notprobed);
+	return var;
+}
+
+/*
+ * Append one TSLexeme to the output array, allocating it (MAX_NORM
+ * entries) on first use.  Maintains a NULL 'lexeme' terminator after the
+ * last entry; entries beyond MAX_NORM - 1 are silently dropped.
+ */
+static void
+addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
+{
+	if (*lres == NULL)
+		*lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme));
+
+	if (*lcur - *lres < MAX_NORM - 1)
+	{
+		(*lcur)->lexeme = word;
+		(*lcur)->flags = flags;
+		(*lcur)->nvariant = NVariant;
+		(*lcur)++;
+		(*lcur)->lexeme = NULL;
+	}
+}
+
+/*
+ * Return all normal forms of 'word' as a NULL-terminated TSLexeme array,
+ * or NULL if none were found.  First the whole word is normalized via
+ * NormalizeSubWord(); then, if the dictionary supports compound words
+ * (Conf->usecompound), every multi-stem splitting variant contributes its
+ * stems under a distinct nvariant number.
+ */
+TSLexeme *
+NINormalizeWord(IspellDict *Conf, char *word)
+{
+	char	  **res;
+	TSLexeme   *lcur = NULL,
+			   *lres = NULL;
+	uint16		NVariant = 1;
+
+	res = NormalizeSubWord(Conf, word, 0);
+
+	if (res)
+	{
+		char	  **ptr = res;
+
+		while (*ptr && (lcur - lres) < MAX_NORM)
+		{
+			addNorm(&lres, &lcur, *ptr, 0, NVariant++);
+			ptr++;
+		}
+		pfree(res);
+	}
+
+	if (Conf->usecompound)
+	{
+		int			wordlen = strlen(word);
+		SplitVar   *ptr,
+				   *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
+		int			i;
+
+		while (var)
+		{
+			if (var->nstem > 1)
+			{
+				char	  **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDLAST);
+
+				if (subres)
+				{
+					char	  **subptr = subres;
+
+					while (*subptr)
+					{
+						for (i = 0; i < var->nstem - 1; i++)
+						{
+							addNorm(&lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant);
+						}
+
+						addNorm(&lres, &lcur, *subptr, 0, NVariant);
+						subptr++;
+						NVariant++;
+					}
+
+					pfree(subres);
+					/* stem[0] was handed off to the result array above;
+					 * clear it so the cleanup loop below skips it */
+					var->stem[0] = NULL;
+					pfree(var->stem[var->nstem - 1]);
+				}
+			}
+
+			for (i = 0; i < var->nstem && var->stem[i]; i++)
+				pfree(var->stem[i]);
+			ptr = var->next;
+			pfree(var->stem);
+			pfree(var);
+			var = ptr;
+		}
+	}
+
+	return lres;
+}
diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c
new file mode 100644
index 0000000..f4ddfc0
--- /dev/null
+++ b/src/backend/tsearch/to_tsany.c
@@ -0,0 +1,724 @@
+/*-------------------------------------------------------------------------
+ *
+ * to_tsany.c
+ * to_ts* function definitions
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/to_tsany.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "common/jsonapi.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_utils.h"
+#include "utils/builtins.h"
+#include "utils/jsonfuncs.h"
+
+
+/*
+ * Opaque data structure, which is passed by parse_tsquery() to pushval_morph().
+ */
+typedef struct MorphOpaque
+{
+	Oid			cfg_id;			/* text search configuration to parse with */
+
+	/*
+	 * Single tsquery morph could be parsed into multiple words. When these
+	 * words reside in adjacent positions, they are connected using this
+	 * operator. Usually, that is OP_PHRASE, which requires word positions of
+	 * a complex morph to exactly match the tsvector.
+	 */
+	int			qoperator;
+} MorphOpaque;
+
+/* State threaded through iterate_json(b)_values() to add_to_tsvector() */
+typedef struct TSVectorBuildState
+{
+	ParsedText *prs;			/* words accumulated so far */
+	Oid			cfgId;			/* text search configuration to use */
+} TSVectorBuildState;
+
+static void add_to_tsvector(void *_state, char *elem_value, int elem_len);
+
+
+/*
+ * Return the session's current default text search configuration OID.
+ * (The 'true' argument presumably makes getTSCurrentConfig raise an error
+ * when no configuration is set -- confirm in ts_cache.c.)
+ */
+Datum
+get_current_ts_config(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_OID(getTSCurrentConfig(true));
+}
+
+/*
+ * to_tsvector
+ */
+/*
+ * qsort comparator for ParsedWord: primary key is the word text (compared
+ * via tsCompareString in non-prefix mode); ties are broken by position, so
+ * duplicate words end up adjacent with ascending positions.
+ */
+static int
+compareWORD(const void *a, const void *b)
+{
+	int			res;
+
+	res = tsCompareString(((const ParsedWord *) a)->word, ((const ParsedWord *) a)->len,
+						  ((const ParsedWord *) b)->word, ((const ParsedWord *) b)->len,
+						  false);
+
+	if (res == 0)
+	{
+		if (((const ParsedWord *) a)->pos.pos == ((const ParsedWord *) b)->pos.pos)
+			return 0;
+
+		res = (((const ParsedWord *) a)->pos.pos > ((const ParsedWord *) b)->pos.pos) ? 1 : -1;
+	}
+
+	return res;
+}
+
+/*
+ * Merge duplicate entries of a[] (length l) in place, collecting each
+ * distinct word's positions into a palloc'd apos array.  apos[0] holds the
+ * number of positions; actual positions start at apos[1].  The array is
+ * sorted first so duplicates are adjacent.  Returns the new entry count.
+ */
+static int
+uniqueWORD(ParsedWord *a, int32 l)
+{
+	ParsedWord *ptr,
+			   *res;
+	int			tmppos;
+
+	if (l == 1)
+	{
+		tmppos = LIMITPOS(a->pos.pos);
+		a->alen = 2;
+		a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
+		a->pos.apos[0] = 1;
+		a->pos.apos[1] = tmppos;
+		return l;
+	}
+
+	res = a;
+	ptr = a + 1;
+
+	/*
+	 * Sort words with its positions
+	 */
+	qsort((void *) a, l, sizeof(ParsedWord), compareWORD);
+
+	/*
+	 * Initialize first word and its first position
+	 */
+	tmppos = LIMITPOS(a->pos.pos);
+	a->alen = 2;
+	a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
+	a->pos.apos[0] = 1;
+	a->pos.apos[1] = tmppos;
+
+	/*
+	 * Summarize position information for each word
+	 */
+	while (ptr - a < l)
+	{
+		if (!(ptr->len == res->len &&
+			  strncmp(ptr->word, res->word, res->len) == 0))
+		{
+			/*
+			 * Got a new word, so put it in result
+			 */
+			res++;
+			res->len = ptr->len;
+			res->word = ptr->word;
+			tmppos = LIMITPOS(ptr->pos.pos);
+			res->alen = 2;
+			res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen);
+			res->pos.apos[0] = 1;
+			res->pos.apos[1] = tmppos;
+		}
+		else
+		{
+			/*
+			 * The word already exists, so adjust position information. But
+			 * before we should check size of position's array, max allowed
+			 * value for position and uniqueness of position
+			 */
+			pfree(ptr->word);
+			if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 &&
+				res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
+			{
+				if (res->pos.apos[0] + 1 >= res->alen)
+				{
+					res->alen *= 2;
+					res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen);
+				}
+				if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
+				{
+					res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos);
+					res->pos.apos[0]++;
+				}
+			}
+		}
+		ptr++;
+	}
+
+	return res + 1 - a;
+}
+
+/*
+ * make value of tsvector, given parsed text
+ *
+ * Merges duplicate words, computes the required storage size (erroring
+ * out if the string area would exceed MAXSTRPOS), then packs WordEntry
+ * headers, word bytes, and optional position arrays into one palloc0'd
+ * TSVector.
+ *
+ * Note: frees prs->words and subsidiary data.
+ */
+TSVector
+make_tsvector(ParsedText *prs)
+{
+	int			i,
+				j,
+				lenstr = 0,
+				totallen;
+	TSVector	in;
+	WordEntry  *ptr;
+	char	   *str;
+	int			stroff;
+
+	/* Merge duplicate words */
+	if (prs->curwords > 0)
+		prs->curwords = uniqueWORD(prs->words, prs->curwords);
+
+	/* Determine space needed */
+	for (i = 0; i < prs->curwords; i++)
+	{
+		lenstr += prs->words[i].len;
+		if (prs->words[i].alen)
+		{
+			/* position data is uint16-aligned after the word bytes */
+			lenstr = SHORTALIGN(lenstr);
+			lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
+		}
+	}
+
+	if (lenstr > MAXSTRPOS)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS)));
+
+	totallen = CALCDATASIZE(prs->curwords, lenstr);
+	in = (TSVector) palloc0(totallen);
+	SET_VARSIZE(in, totallen);
+	in->size = prs->curwords;
+
+	ptr = ARRPTR(in);
+	str = STRPTR(in);
+	stroff = 0;
+	for (i = 0; i < prs->curwords; i++)
+	{
+		ptr->len = prs->words[i].len;
+		ptr->pos = stroff;
+		memcpy(str + stroff, prs->words[i].word, prs->words[i].len);
+		stroff += prs->words[i].len;
+		pfree(prs->words[i].word);
+		if (prs->words[i].alen)
+		{
+			/* apos[0] is the number of positions (see uniqueWORD) */
+			int			k = prs->words[i].pos.apos[0];
+			WordEntryPos *wptr;
+
+			if (k > 0xFFFF)
+				elog(ERROR, "positions array too long");
+
+			ptr->haspos = 1;
+			stroff = SHORTALIGN(stroff);
+			*(uint16 *) (str + stroff) = (uint16) k;
+			wptr = POSDATAPTR(in, ptr);
+			for (j = 0; j < k; j++)
+			{
+				WEP_SETWEIGHT(wptr[j], 0);
+				WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
+			}
+			stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
+			pfree(prs->words[i].pos.apos);
+		}
+		else
+			ptr->haspos = 0;
+		ptr++;
+	}
+
+	if (prs->words)
+		pfree(prs->words);
+
+	return in;
+}
+
+/*
+ * Parse text 'in' with the text search configuration given by OID and
+ * build a tsvector from the resulting words.
+ */
+Datum
+to_tsvector_byid(PG_FUNCTION_ARGS)
+{
+	Oid			cfgId = PG_GETARG_OID(0);
+	text	   *in = PG_GETARG_TEXT_PP(1);
+	ParsedText	prs;
+	TSVector	out;
+
+	prs.lenwords = VARSIZE_ANY_EXHDR(in) / 6;	/* just estimation of word's
+												 * number */
+	if (prs.lenwords < 2)
+		prs.lenwords = 2;
+	prs.curwords = 0;
+	prs.pos = 0;
+	prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
+
+	parsetext(cfgId, &prs, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
+
+	PG_FREE_IF_COPY(in, 1);
+
+	out = make_tsvector(&prs);
+
+	PG_RETURN_TSVECTOR(out);
+}
+
+/* Single-argument form: uses the session's default TS configuration */
+Datum
+to_tsvector(PG_FUNCTION_ARGS)
+{
+	text	   *in = PG_GETARG_TEXT_PP(0);
+	Oid			cfgId;
+
+	cfgId = getTSCurrentConfig(true);
+	PG_RETURN_DATUM(DirectFunctionCall2(to_tsvector_byid,
+										ObjectIdGetDatum(cfgId),
+										PointerGetDatum(in)));
+}
+
+/*
+ * Worker function for jsonb(_string)_to_tsvector(_byid)
+ */
+/*
+ * Worker function for jsonb(_string)_to_tsvector(_byid)
+ *
+ * 'flags' selects which jsonb item types are indexed (cf. jtiString and
+ * parse_jsonb_index_flags); each extracted value is fed through
+ * add_to_tsvector().
+ */
+static TSVector
+jsonb_to_tsvector_worker(Oid cfgId, Jsonb *jb, uint32 flags)
+{
+	TSVectorBuildState state;
+	ParsedText	prs;
+
+	prs.words = NULL;
+	prs.curwords = 0;
+	state.prs = &prs;
+	state.cfgId = cfgId;
+
+	iterate_jsonb_values(jb, flags, &state, add_to_tsvector);
+
+	return make_tsvector(&prs);
+}
+
+/* Index only the string values of 'jb', using the given configuration */
+Datum
+jsonb_string_to_tsvector_byid(PG_FUNCTION_ARGS)
+{
+	Oid			cfgId = PG_GETARG_OID(0);
+	Jsonb	   *jb = PG_GETARG_JSONB_P(1);
+	TSVector	result;
+
+	result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
+	PG_FREE_IF_COPY(jb, 1);
+
+	PG_RETURN_TSVECTOR(result);
+}
+
+/* As above, with the session's default TS configuration */
+Datum
+jsonb_string_to_tsvector(PG_FUNCTION_ARGS)
+{
+	Jsonb	   *jb = PG_GETARG_JSONB_P(0);
+	Oid			cfgId;
+	TSVector	result;
+
+	cfgId = getTSCurrentConfig(true);
+	result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
+	PG_FREE_IF_COPY(jb, 0);
+
+	PG_RETURN_TSVECTOR(result);
+}
+
+/*
+ * Index selected value types of 'jb'; the third (jsonb) argument is
+ * decoded into flag bits by parse_jsonb_index_flags().
+ */
+Datum
+jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
+{
+	Oid			cfgId = PG_GETARG_OID(0);
+	Jsonb	   *jb = PG_GETARG_JSONB_P(1);
+	Jsonb	   *jbFlags = PG_GETARG_JSONB_P(2);
+	TSVector	result;
+	uint32		flags = parse_jsonb_index_flags(jbFlags);
+
+	result = jsonb_to_tsvector_worker(cfgId, jb, flags);
+	PG_FREE_IF_COPY(jb, 1);
+	PG_FREE_IF_COPY(jbFlags, 2);
+
+	PG_RETURN_TSVECTOR(result);
+}
+
+/* As above, with the session's default TS configuration */
+Datum
+jsonb_to_tsvector(PG_FUNCTION_ARGS)
+{
+	Jsonb	   *jb = PG_GETARG_JSONB_P(0);
+	Jsonb	   *jbFlags = PG_GETARG_JSONB_P(1);
+	Oid			cfgId;
+	TSVector	result;
+	uint32		flags = parse_jsonb_index_flags(jbFlags);
+
+	cfgId = getTSCurrentConfig(true);
+	result = jsonb_to_tsvector_worker(cfgId, jb, flags);
+	PG_FREE_IF_COPY(jb, 0);
+	PG_FREE_IF_COPY(jbFlags, 1);
+
+	PG_RETURN_TSVECTOR(result);
+}
+
+/*
+ * Worker function for json(_string)_to_tsvector(_byid)
+ */
+/*
+ * Worker function for json(_string)_to_tsvector(_byid)
+ *
+ * Same as jsonb_to_tsvector_worker(), but iterates over a json value
+ * given as text.
+ */
+static TSVector
+json_to_tsvector_worker(Oid cfgId, text *json, uint32 flags)
+{
+	TSVectorBuildState state;
+	ParsedText	prs;
+
+	prs.words = NULL;
+	prs.curwords = 0;
+	state.prs = &prs;
+	state.cfgId = cfgId;
+
+	iterate_json_values(json, flags, &state, add_to_tsvector);
+
+	return make_tsvector(&prs);
+}
+
+/* Index only the string values of 'json', using the given configuration */
+Datum
+json_string_to_tsvector_byid(PG_FUNCTION_ARGS)
+{
+	Oid			cfgId = PG_GETARG_OID(0);
+	text	   *json = PG_GETARG_TEXT_P(1);
+	TSVector	result;
+
+	result = json_to_tsvector_worker(cfgId, json, jtiString);
+	PG_FREE_IF_COPY(json, 1);
+
+	PG_RETURN_TSVECTOR(result);
+}
+
+/* As above, with the session's default TS configuration */
+Datum
+json_string_to_tsvector(PG_FUNCTION_ARGS)
+{
+	text	   *json = PG_GETARG_TEXT_P(0);
+	Oid			cfgId;
+	TSVector	result;
+
+	cfgId = getTSCurrentConfig(true);
+	result = json_to_tsvector_worker(cfgId, json, jtiString);
+	PG_FREE_IF_COPY(json, 0);
+
+	PG_RETURN_TSVECTOR(result);
+}
+
+/*
+ * Index selected value types of 'json'; the third (jsonb) argument is
+ * decoded into flag bits by parse_jsonb_index_flags().
+ */
+Datum
+json_to_tsvector_byid(PG_FUNCTION_ARGS)
+{
+	Oid			cfgId = PG_GETARG_OID(0);
+	text	   *json = PG_GETARG_TEXT_P(1);
+	Jsonb	   *jbFlags = PG_GETARG_JSONB_P(2);
+	TSVector	result;
+	uint32		flags = parse_jsonb_index_flags(jbFlags);
+
+	result = json_to_tsvector_worker(cfgId, json, flags);
+	PG_FREE_IF_COPY(json, 1);
+	PG_FREE_IF_COPY(jbFlags, 2);
+
+	PG_RETURN_TSVECTOR(result);
+}
+
+/* As above, with the session's default TS configuration */
+Datum
+json_to_tsvector(PG_FUNCTION_ARGS)
+{
+	text	   *json = PG_GETARG_TEXT_P(0);
+	Jsonb	   *jbFlags = PG_GETARG_JSONB_P(1);
+	Oid			cfgId;
+	TSVector	result;
+	uint32		flags = parse_jsonb_index_flags(jbFlags);
+
+	cfgId = getTSCurrentConfig(true);
+	result = json_to_tsvector_worker(cfgId, json, flags);
+	PG_FREE_IF_COPY(json, 0);
+	PG_FREE_IF_COPY(jbFlags, 1);
+
+	PG_RETURN_TSVECTOR(result);
+}
+
+/*
+ * Parse lexemes in an element of a json(b) value, add to TSVectorBuildState.
+ */
+/*
+ * Parse lexemes in an element of a json(b) value, add to TSVectorBuildState.
+ *
+ * elem_value/elem_len give one scalar value extracted by the iterate_*
+ * machinery; it is run through parsetext() like ordinary document text.
+ */
+static void
+add_to_tsvector(void *_state, char *elem_value, int elem_len)
+{
+	TSVectorBuildState *state = (TSVectorBuildState *) _state;
+	ParsedText *prs = state->prs;
+	int32		prevwords;
+
+	if (prs->words == NULL)
+	{
+		/*
+		 * First time through: initialize words array to a reasonable size.
+		 * (parsetext() will realloc it bigger as needed.)
+		 */
+		prs->lenwords = 16;
+		prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
+		prs->curwords = 0;
+		prs->pos = 0;
+	}
+
+	prevwords = prs->curwords;
+
+	parsetext(state->cfgId, prs, elem_value, elem_len);
+
+	/*
+	 * If we extracted any words from this JSON element, advance pos to create
+	 * an artificial break between elements. This is because we don't want
+	 * phrase searches to think that the last word in this element is adjacent
+	 * to the first word in the next one.
+	 */
+	if (prs->curwords > prevwords)
+		prs->pos += 1;
+}
+
+
+/*
+ * to_tsquery
+ */
+
+
+/*
+ * This function is used for morph parsing.
+ *
+ * The value is passed to parsetext which will call the right dictionary to
+ * lexize the word. If it turns out to be a stopword, we push a QI_VALSTOP
+ * to the stack.
+ *
+ * All words belonging to the same variant are pushed as an ANDed list,
+ * and different variants are ORed together.
+ */
+static void
+pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
+{
+	int32		count = 0;
+	ParsedText	prs;
+	uint32		variant,
+				pos = 0,
+				cntvar = 0,
+				cntpos = 0,
+				cnt = 0;
+	MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque);
+
+	prs.lenwords = 4;
+	prs.curwords = 0;
+	prs.pos = 0;
+	prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
+
+	parsetext(data->cfg_id, &prs, strval, lenval);
+
+	if (prs.curwords > 0)
+	{
+		while (count < prs.curwords)
+		{
+			/*
+			 * Were any stop words removed? If so, fill empty positions with
+			 * placeholders linked by an appropriate operator.
+			 */
+			if (pos > 0 && pos + 1 < prs.words[count].pos.pos)
+			{
+				while (pos + 1 < prs.words[count].pos.pos)
+				{
+					/* put placeholders for each missing stop word */
+					pushStop(state);
+					if (cntpos)
+						pushOperator(state, data->qoperator, 1);
+					cntpos++;
+					pos++;
+				}
+			}
+
+			/* save current word's position */
+			pos = prs.words[count].pos.pos;
+
+			/* Go through all variants obtained from this token */
+			cntvar = 0;
+			while (count < prs.curwords && pos == prs.words[count].pos.pos)
+			{
+				variant = prs.words[count].nvariant;
+
+				/* Push all words belonging to the same variant */
+				cnt = 0;
+				while (count < prs.curwords &&
+					   pos == prs.words[count].pos.pos &&
+					   variant == prs.words[count].nvariant)
+				{
+					pushValue(state,
+							  prs.words[count].word,
+							  prs.words[count].len,
+							  weight,
+							  ((prs.words[count].flags & TSL_PREFIX) || prefix));
+					pfree(prs.words[count].word);
+					if (cnt)
+						pushOperator(state, OP_AND, 0);
+					cnt++;
+					count++;
+				}
+
+				if (cntvar)
+					pushOperator(state, OP_OR, 0);
+				cntvar++;
+			}
+
+			if (cntpos)
+			{
+				/* distance may be useful */
+				pushOperator(state, data->qoperator, 1);
+			}
+
+			cntpos++;
+		}
+
+		pfree(prs.words);
+
+	}
+	else
+		/* no lexemes produced at all (e.g. only stopwords): push one
+		 * placeholder */
+		pushStop(state);
+}
+
+/* Parse 'in' as a tsquery using the configuration given by OID */
+Datum
+to_tsquery_byid(PG_FUNCTION_ARGS)
+{
+	text	   *in = PG_GETARG_TEXT_PP(1);
+	TSQuery		query;
+	MorphOpaque data;
+
+	data.cfg_id = PG_GETARG_OID(0);
+
+	/*
+	 * Passing OP_PHRASE as a qoperator makes tsquery require matching of word
+	 * positions of a complex morph exactly match the tsvector.  Also, when
+	 * the complex morphs are connected with OP_PHRASE operator, we connect
+	 * all their words into the OP_PHRASE sequence.
+	 */
+	data.qoperator = OP_PHRASE;
+
+	query = parse_tsquery(text_to_cstring(in),
+						  pushval_morph,
+						  PointerGetDatum(&data),
+						  0);
+
+	PG_RETURN_TSQUERY(query);
+}
+
+/* Single-argument form: uses the session's default TS configuration */
+Datum
+to_tsquery(PG_FUNCTION_ARGS)
+{
+	text	   *in = PG_GETARG_TEXT_PP(0);
+	Oid			cfgId;
+
+	cfgId = getTSCurrentConfig(true);
+	PG_RETURN_DATUM(DirectFunctionCall2(to_tsquery_byid,
+										ObjectIdGetDatum(cfgId),
+										PointerGetDatum(in)));
+}
+
+/* Treat 'in' as plain text and build an AND-query over its lexemes */
+Datum
+plainto_tsquery_byid(PG_FUNCTION_ARGS)
+{
+	text	   *in = PG_GETARG_TEXT_PP(1);
+	TSQuery		query;
+	MorphOpaque data;
+
+	data.cfg_id = PG_GETARG_OID(0);
+
+	/*
+	 * parse_tsquery() with P_TSQ_PLAIN flag takes the whole input text as a
+	 * single morph.  Passing OP_AND as a qoperator makes tsquery require
+	 * matching of all words independently of their positions.
+	 */
+	data.qoperator = OP_AND;
+
+	query = parse_tsquery(text_to_cstring(in),
+						  pushval_morph,
+						  PointerGetDatum(&data),
+						  P_TSQ_PLAIN);
+
+	PG_RETURN_POINTER(query);
+}
+
+/* Single-argument form: uses the session's default TS configuration */
+Datum
+plainto_tsquery(PG_FUNCTION_ARGS)
+{
+	text	   *in = PG_GETARG_TEXT_PP(0);
+	Oid			cfgId;
+
+	cfgId = getTSCurrentConfig(true);
+	PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid,
+										ObjectIdGetDatum(cfgId),
+										PointerGetDatum(in)));
+}
+
+
+/* Treat 'in' as plain text and build a phrase query over its lexemes */
+Datum
+phraseto_tsquery_byid(PG_FUNCTION_ARGS)
+{
+	text	   *in = PG_GETARG_TEXT_PP(1);
+	TSQuery		query;
+	MorphOpaque data;
+
+	data.cfg_id = PG_GETARG_OID(0);
+
+	/*
+	 * parse_tsquery() with P_TSQ_PLAIN flag takes the whole input text as a
+	 * single morph.  Passing OP_PHRASE as a qoperator makes tsquery require
+	 * matching of word positions.
+	 */
+	data.qoperator = OP_PHRASE;
+
+	query = parse_tsquery(text_to_cstring(in),
+						  pushval_morph,
+						  PointerGetDatum(&data),
+						  P_TSQ_PLAIN);
+
+	PG_RETURN_TSQUERY(query);
+}
+
+/* Single-argument form: uses the session's default TS configuration */
+Datum
+phraseto_tsquery(PG_FUNCTION_ARGS)
+{
+	text	   *in = PG_GETARG_TEXT_PP(0);
+	Oid			cfgId;
+
+	cfgId = getTSCurrentConfig(true);
+	PG_RETURN_DATUM(DirectFunctionCall2(phraseto_tsquery_byid,
+										ObjectIdGetDatum(cfgId),
+										PointerGetDatum(in)));
+}
+
+/* Parse 'in' using web-search syntax (P_TSQ_WEB) with the given config */
+Datum
+websearch_to_tsquery_byid(PG_FUNCTION_ARGS)
+{
+	text	   *in = PG_GETARG_TEXT_PP(1);
+	MorphOpaque data;
+	TSQuery		query = NULL;
+
+	data.cfg_id = PG_GETARG_OID(0);
+
+	/*
+	 * Passing OP_PHRASE as a qoperator makes tsquery require matching of word
+	 * positions of a complex morph exactly match the tsvector.  Also, when
+	 * the complex morphs are given in quotes, we connect all their words into
+	 * the OP_PHRASE sequence.
+	 */
+	data.qoperator = OP_PHRASE;
+
+	query = parse_tsquery(text_to_cstring(in),
+						  pushval_morph,
+						  PointerGetDatum(&data),
+						  P_TSQ_WEB);
+
+	PG_RETURN_TSQUERY(query);
+}
+
+/* Single-argument form: uses the session's default TS configuration */
+Datum
+websearch_to_tsquery(PG_FUNCTION_ARGS)
+{
+	text	   *in = PG_GETARG_TEXT_PP(0);
+	Oid			cfgId;
+
+	cfgId = getTSCurrentConfig(true);
+	PG_RETURN_DATUM(DirectFunctionCall2(websearch_to_tsquery_byid,
+										ObjectIdGetDatum(cfgId),
+										PointerGetDatum(in)));
+
+}
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
new file mode 100644
index 0000000..f918cc8
--- /dev/null
+++ b/src/backend/tsearch/ts_locale.c
@@ -0,0 +1,325 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_locale.c
+ * locale compatibility layer for tsearch
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/ts_locale.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/pg_collation.h"
+#include "common/string.h"
+#include "storage/fd.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+
+static void tsearch_readline_callback(void *arg);
+
+
+/*
+ * The reason these functions use a 3-wchar_t output buffer, not 2 as you
+ * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
+ * getting from char2wchar() is UTF16 not UTF32. A single input character
+ * may therefore produce a surrogate pair rather than just one wchar_t;
+ * we also need room for a trailing null. When we do get a surrogate pair,
+ * we pass just the first code to iswdigit() etc, so that these functions will
+ * always return false for characters outside the Basic Multilingual Plane.
+ */
+#define WC_BUF_LEN 3
+
+/*
+ * Multibyte-aware isdigit() for the character starting at 'ptr'.
+ * Single-byte characters (or a C ctype locale) use plain isdigit();
+ * otherwise the character is converted to wide char(s) and tested with
+ * iswdigit() on the first code unit (see WC_BUF_LEN comment above).
+ */
+int
+t_isdigit(const char *ptr)
+{
+	int			clen = pg_mblen(ptr);
+	wchar_t		character[WC_BUF_LEN];
+	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
+	pg_locale_t mylocale = 0;	/* TODO */
+
+	if (clen == 1 || lc_ctype_is_c(collation))
+		return isdigit(TOUCHAR(ptr));
+
+	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
+
+	return iswdigit((wint_t) character[0]);
+}
+
+/* Multibyte-aware isspace(); same structure as t_isdigit() */
+int
+t_isspace(const char *ptr)
+{
+	int			clen = pg_mblen(ptr);
+	wchar_t		character[WC_BUF_LEN];
+	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
+	pg_locale_t mylocale = 0;	/* TODO */
+
+	if (clen == 1 || lc_ctype_is_c(collation))
+		return isspace(TOUCHAR(ptr));
+
+	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
+
+	return iswspace((wint_t) character[0]);
+}
+
+/* Multibyte-aware isalpha(); same structure as t_isdigit() */
+int
+t_isalpha(const char *ptr)
+{
+	int			clen = pg_mblen(ptr);
+	wchar_t		character[WC_BUF_LEN];
+	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
+	pg_locale_t mylocale = 0;	/* TODO */
+
+	if (clen == 1 || lc_ctype_is_c(collation))
+		return isalpha(TOUCHAR(ptr));
+
+	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
+
+	return iswalpha((wint_t) character[0]);
+}
+
+/* Multibyte-aware isprint(); same structure as t_isdigit() */
+int
+t_isprint(const char *ptr)
+{
+	int			clen = pg_mblen(ptr);
+	wchar_t		character[WC_BUF_LEN];
+	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
+	pg_locale_t mylocale = 0;	/* TODO */
+
+	if (clen == 1 || lc_ctype_is_c(collation))
+		return isprint(TOUCHAR(ptr));
+
+	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
+
+	return iswprint((wint_t) character[0]);
+}
+
+
+/*
+ * Set up to read a file using tsearch_readline(). This facility is
+ * better than just reading the file directly because it provides error
+ * context pointing to the specific line where a problem is detected.
+ *
+ * Expected usage is:
+ *
+ * tsearch_readline_state trst;
+ *
+ * if (!tsearch_readline_begin(&trst, filename))
+ * ereport(ERROR,
+ * (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ * errmsg("could not open stop-word file \"%s\": %m",
+ * filename)));
+ * while ((line = tsearch_readline(&trst)) != NULL)
+ * process line;
+ * tsearch_readline_end(&trst);
+ *
+ * Note that the caller supplies the ereport() for file open failure;
+ * this is so that a custom message can be provided. The filename string
+ * passed to tsearch_readline_begin() must remain valid through
+ * tsearch_readline_end().
+ */
+bool
+tsearch_readline_begin(tsearch_readline_state *stp,
+					   const char *filename)
+{
+	/* caller reports the open failure itself; see the comment above */
+	if ((stp->fp = AllocateFile(filename, "r")) == NULL)
+		return false;
+	stp->filename = filename;
+	stp->lineno = 0;			/* incremented before each read */
+	initStringInfo(&stp->buf);
+	stp->curline = NULL;
+	/* Setup error traceback support for ereport() */
+	stp->cb.callback = tsearch_readline_callback;
+	stp->cb.arg = (void *) stp;
+	stp->cb.previous = error_context_stack;
+	error_context_stack = &stp->cb;
+	return true;
+}
+
+/*
+ * Read the next line from a tsearch data file (expected to be in UTF-8), and
+ * convert it to database encoding if needed. The returned string is palloc'd.
+ * NULL return means EOF.
+ */
+char *
+tsearch_readline(tsearch_readline_state *stp)
+{
+	char	   *recoded;
+
+	/* Advance line number to use in error reports */
+	stp->lineno++;
+
+	/*
+	 * Release the previous line's conversion result, if any; it's no longer
+	 * relevant for error context.
+	 */
+	if (stp->curline)
+	{
+		if (stp->curline != stp->buf.data)
+			pfree(stp->curline);
+		stp->curline = NULL;
+	}
+
+	/* Collect next line, if there is one */
+	if (!pg_get_line_buf(stp->fp, &stp->buf))
+		return NULL;
+
+	/* Validate the input as UTF-8, then convert to DB encoding if needed */
+	recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
+
+	/* Save the correctly-encoded string for possible error reports */
+	stp->curline = recoded;		/* might be equal to buf.data */
+
+	/*
+	 * We always return a freshly pstrdup'd string.  This is clearly necessary
+	 * if pg_any_to_server() returned buf.data, and we need a second copy even
+	 * if encoding conversion did occur.  The caller is entitled to pfree the
+	 * returned string at any time, which would leave curline pointing to
+	 * recycled storage, causing problems if an error occurs after that point.
+	 * (It's preferable to return the result of pstrdup instead of the output
+	 * of pg_any_to_server, because the conversion result tends to be
+	 * over-allocated.  Since callers might save the result string directly
+	 * into a long-lived dictionary structure, we don't want it to be a larger
+	 * palloc chunk than necessary.  We'll reclaim the conversion result on
+	 * the next call.)
+	 */
+	return pstrdup(recoded);
+}
+
+/*
+ * Close down after reading a file with tsearch_readline()
+ */
+void
+tsearch_readline_end(tsearch_readline_state *stp)
+{
+	/* Suppress use of curline in any error reported below */
+	if (stp->curline)
+	{
+		if (stp->curline != stp->buf.data)
+			pfree(stp->curline);
+		stp->curline = NULL;
+	}
+
+	/* Release other resources */
+	pfree(stp->buf.data);		/* the line buffer */
+	FreeFile(stp->fp);
+
+	/* Pop the error context stack */
+	error_context_stack = stp->cb.previous;
+}
+
+/*
+ * Error context callback for errors occurring while reading a tsearch
+ * configuration file.
+ */
+static void
+tsearch_readline_callback(void *arg)
+{
+	tsearch_readline_state *stp = (tsearch_readline_state *) arg;
+
+	/*
+	 * We can't include the text of the config line for errors that occur
+	 * during tsearch_readline() itself.  The major cause of such errors is
+	 * encoding violations, and we daren't try to print error messages
+	 * containing badly-encoded data.
+	 */
+	if (stp->curline)
+		errcontext("line %d of configuration file \"%s\": \"%s\"",
+				   stp->lineno,
+				   stp->filename,
+				   stp->curline);
+	else
+		/* lineno was pre-incremented in tsearch_readline(), so it already
+		 * names the line being processed */
+		errcontext("line %d of configuration file \"%s\"",
+				   stp->lineno,
+				   stp->filename);
+}
+
+
+/*
+ * lowerstr --- fold null-terminated string to lower case
+ *
+ * Returned string is palloc'd
+ */
+char *
+lowerstr(const char *str)
+{
+	/* simple convenience wrapper around lowerstr_with_len() */
+	return lowerstr_with_len(str, strlen(str));
+}
+
+/*
+ * lowerstr_with_len --- fold string to lower case
+ *
+ * Input string need not be null-terminated.
+ *
+ * Returned string is palloc'd
+ */
+char *
+lowerstr_with_len(const char *str, int len)
+{
+	char	   *out;
+	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
+	pg_locale_t mylocale = 0;	/* TODO */
+
+	if (len == 0)
+		return pstrdup("");
+
+	/*
+	 * Use wide char code only when max encoding length > 1 and ctype != C.
+	 * Some operating systems fail with multi-byte encodings and a C locale.
+	 * Also, for a C locale there is no need to process as multibyte.  From
+	 * backend/utils/adt/oracle_compat.c Teodor
+	 */
+	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
+	{
+		wchar_t    *wstr,
+				   *wptr;
+		int			wlen;
+
+		/*
+		 * alloc number of wchar_t for worst case, len contains number of
+		 * bytes >= number of characters and alloc 1 wchar_t for 0, because
+		 * wchar2char wants zero-terminated string
+		 */
+		wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
+
+		wlen = char2wchar(wstr, len + 1, str, len, mylocale);
+		Assert(wlen <= len);
+
+		/* lowercase in wide-character space */
+		while (*wptr)
+		{
+			*wptr = towlower((wint_t) *wptr);
+			wptr++;
+		}
+
+		/*
+		 * Alloc result string for worst case + '\0'
+		 */
+		len = pg_database_encoding_max_length() * wlen + 1;
+		out = (char *) palloc(len);
+
+		wlen = wchar2char(out, wstr, len, mylocale);
+
+		pfree(wstr);
+
+		if (wlen < 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+					 errmsg("conversion from wchar_t to server encoding failed: %m")));
+		Assert(wlen < len);
+	}
+	else
+	{
+		/*
+		 * Single-byte path: plain tolower() over at most 'len' bytes,
+		 * stopping early at an embedded NUL.
+		 */
+		const char *ptr = str;
+		char	   *outptr;
+
+		outptr = out = (char *) palloc(sizeof(char) * (len + 1));
+		while ((ptr - str) < len && *ptr)
+		{
+			*outptr++ = tolower(TOUCHAR(ptr));
+			ptr++;
+		}
+		*outptr = '\0';
+	}
+
+	return out;
+}
diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c
new file mode 100644
index 0000000..92d95b4
--- /dev/null
+++ b/src/backend/tsearch/ts_parse.c
@@ -0,0 +1,667 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_parse.c
+ * main parse functions for tsearch
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/ts_parse.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_utils.h"
+
/* If defined, over-long words raise a NOTICE and are skipped, not an ERROR */
#define IGNORE_LONGLEXEME	1

/*
 * Lexize subsystem
 */

/* One raw token from the parser, kept on a singly-linked list */
typedef struct ParsedLex
{
	int			type;			/* parser-assigned token type (0 = end of
								 * input) */
	char	   *lemm;			/* token text (length given separately; may
								 * not be null-terminated) */
	int			lenlemm;		/* byte length of lemm */
	struct ParsedLex *next;
} ParsedLex;

/* Head/tail pointers of a ParsedLex list */
typedef struct ListParsedLex
{
	ParsedLex  *head;
	ParsedLex  *tail;
} ListParsedLex;

/* State carried across LexizeExec() calls */
typedef struct
{
	TSConfigCacheEntry *cfg;	/* text search configuration in use */
	Oid			curDictId;		/* dictionary currently consuming multiple
								 * words, or InvalidOid in normal mode */
	int			posDict;		/* index of next dictionary to try for the
								 * current head lexeme */
	DictSubState dictState;		/* per-call state exchanged with the
								 * dictionary's lexize function */
	ParsedLex  *curSub;			/* next lexeme to feed in multiword mode */
	ListParsedLex towork;		/* current list to work */
	ListParsedLex waste;		/* list of lexemes that already lexized */

	/*
	 * Fields to store the last successful result while lexizing in
	 * multiword mode (basically for thesaurus-like dictionaries, which want
	 * to consume several lexemes).
	 */
	ParsedLex  *lastRes;
	TSLexeme   *tmpRes;
} LexizeData;
+
+static void
+LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
+{
+ ld->cfg = cfg;
+ ld->curDictId = InvalidOid;
+ ld->posDict = 0;
+ ld->towork.head = ld->towork.tail = ld->curSub = NULL;
+ ld->waste.head = ld->waste.tail = NULL;
+ ld->lastRes = NULL;
+ ld->tmpRes = NULL;
+}
+
+static void
+LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
+{
+ if (list->tail)
+ {
+ list->tail->next = newpl;
+ list->tail = newpl;
+ }
+ else
+ list->head = list->tail = newpl;
+ newpl->next = NULL;
+}
+
+static ParsedLex *
+LPLRemoveHead(ListParsedLex *list)
+{
+ ParsedLex *res = list->head;
+
+ if (list->head)
+ list->head = list->head->next;
+
+ if (list->head == NULL)
+ list->tail = NULL;
+
+ return res;
+}
+
+static void
+LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
+{
+ ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
+
+ newpl->type = type;
+ newpl->lemm = lemm;
+ newpl->lenlemm = lenlemm;
+ LPLAddTail(&ld->towork, newpl);
+ ld->curSub = ld->towork.tail;
+}
+
/*
 * RemoveHead - move the head of the "towork" list onto the "waste" list,
 * and restart the dictionary scan (posDict) for the new head lexeme.
 */
static void
RemoveHead(LexizeData *ld)
{
	LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));

	ld->posDict = 0;
}
+
+static void
+setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
+{
+ if (correspondLexem)
+ {
+ *correspondLexem = ld->waste.head;
+ }
+ else
+ {
+ ParsedLex *tmp,
+ *ptr = ld->waste.head;
+
+ while (ptr)
+ {
+ tmp = ptr->next;
+ pfree(ptr);
+ ptr = tmp;
+ }
+ }
+ ld->waste.head = ld->waste.tail = NULL;
+}
+
+static void
+moveToWaste(LexizeData *ld, ParsedLex *stop)
+{
+ bool go = true;
+
+ while (ld->towork.head && go)
+ {
+ if (ld->towork.head == stop)
+ {
+ ld->curSub = stop->next;
+ go = false;
+ }
+ RemoveHead(ld);
+ }
+}
+
+static void
+setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
+{
+ if (ld->tmpRes)
+ {
+ TSLexeme *ptr;
+
+ for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
+ pfree(ptr->lexeme);
+ pfree(ld->tmpRes);
+ }
+ ld->tmpRes = res;
+ ld->lastRes = lex;
+}
+
/*
 * LexizeExec - run the configuration's dictionary stack over the queued
 * lexemes.
 *
 * Returns the next palloc'd array of normalized lexemes (terminated by an
 * entry with lexeme == NULL), or NULL when the "towork" list is exhausted.
 * If correspondLexem is non-NULL, *correspondLexem receives the list of raw
 * input lexemes that produced this result (caller frees); otherwise those
 * nodes are freed here via setCorrLex().
 *
 * Operates in two modes: when ld->curDictId is InvalidOid we are in the
 * usual one-word-at-a-time mode; otherwise a multiword dictionary (e.g. a
 * thesaurus) has asked for more input and we keep feeding it words starting
 * at ld->curSub.  Mode switches restart processing via tail recursion.
 */
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
	int			i;
	ListDictionary *map;
	TSDictionaryCacheEntry *dict;
	TSLexeme   *res;

	if (ld->curDictId == InvalidOid)
	{
		/*
		 * usual mode: dictionary wants only one word, but we should keep in
		 * mind that we should go through all stack
		 */

		while (ld->towork.head)
		{
			ParsedLex  *curVal = ld->towork.head;
			char	   *curValLemm = curVal->lemm;
			int			curValLenLemm = curVal->lenlemm;

			map = ld->cfg->map + curVal->type;

			if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
			{
				/* skip this type of lexeme */
				RemoveHead(ld);
				continue;
			}

			/* try each dictionary mapped for this token type, in order */
			for (i = ld->posDict; i < map->len; i++)
			{
				dict = lookup_ts_dictionary_cache(map->dictIds[i]);

				ld->dictState.isend = ld->dictState.getnext = false;
				ld->dictState.private_state = NULL;
				res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
																 PointerGetDatum(dict->dictData),
																 PointerGetDatum(curValLemm),
																 Int32GetDatum(curValLenLemm),
																 PointerGetDatum(&ld->dictState)));

				if (ld->dictState.getnext)
				{
					/*
					 * dictionary wants next word, so setup and store current
					 * position and go to multiword mode
					 */

					ld->curDictId = DatumGetObjectId(map->dictIds[i]);
					ld->posDict = i + 1;
					ld->curSub = curVal->next;
					if (res)
						setNewTmpRes(ld, curVal, res);
					return LexizeExec(ld, correspondLexem);
				}

				if (!res)		/* dictionary doesn't know this lexeme */
					continue;

				if (res->flags & TSL_FILTER)
				{
					/*
					 * filtering dictionary: replace the lexeme text and let
					 * the rest of the stack see the filtered form
					 */
					curValLemm = res->lexeme;
					curValLenLemm = strlen(res->lexeme);
					continue;
				}

				/* dictionary recognized the lexeme: emit its result */
				RemoveHead(ld);
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/* no dictionary recognized this lexeme; drop it and go on */
			RemoveHead(ld);
		}
	}
	else
	{							/* curDictId is valid */
		dict = lookup_ts_dictionary_cache(ld->curDictId);

		/*
		 * Dictionary ld->curDictId asks us about following words
		 */

		while (ld->curSub)
		{
			ParsedLex  *curVal = ld->curSub;

			map = ld->cfg->map + curVal->type;

			if (curVal->type != 0)
			{
				bool		dictExists = false;

				if (curVal->type >= ld->cfg->lenmap || map->len == 0)
				{
					/* skip this type of lexeme */
					ld->curSub = curVal->next;
					continue;
				}

				/*
				 * Make sure the current lexeme type is recognized by our
				 * dictionary: check whether the dictionary appears in this
				 * type's dictionary list.
				 */
				for (i = 0; i < map->len && !dictExists; i++)
					if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
						dictExists = true;

				if (!dictExists)
				{
					/*
					 * Dictionary can't work with current type of lexeme,
					 * return to basic mode and redo all stored lexemes
					 */
					ld->curDictId = InvalidOid;
					return LexizeExec(ld, correspondLexem);
				}
			}

			/* type == 0 marks end of input; tell the dictionary so */
			ld->dictState.isend = (curVal->type == 0) ? true : false;
			ld->dictState.getnext = false;

			res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
															 PointerGetDatum(dict->dictData),
															 PointerGetDatum(curVal->lemm),
															 Int32GetDatum(curVal->lenlemm),
															 PointerGetDatum(&ld->dictState)));

			if (ld->dictState.getnext)
			{
				/* Dictionary wants one more */
				ld->curSub = curVal->next;
				if (res)
					setNewTmpRes(ld, curVal, res);
				continue;
			}

			if (res || ld->tmpRes)
			{
				/*
				 * Dictionary normalizes lexemes, so we remove from stack all
				 * used lexemes, return to basic mode and redo end of stack
				 * (if it exists)
				 */
				if (res)
				{
					/* full match up to (and including) the current lexeme */
					moveToWaste(ld, ld->curSub);
				}
				else
				{
					/* fall back to the last stored partial result */
					res = ld->tmpRes;
					moveToWaste(ld, ld->lastRes);
				}

				/* reset to initial state */
				ld->curDictId = InvalidOid;
				ld->posDict = 0;
				ld->lastRes = NULL;
				ld->tmpRes = NULL;
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/*
			 * Dictionary didn't want a next lexeme and didn't recognize
			 * anything; redo from ld->towork.head in basic mode.
			 */
			ld->curDictId = InvalidOid;
			return LexizeExec(ld, correspondLexem);
		}
	}

	/* nothing left to produce */
	setCorrLex(ld, correspondLexem);
	return NULL;
}
+
/*
 * Parse string and lexize words.
 *
 * prs will be filled in: one ParsedWord per normalized lexeme, with
 * positions assigned as tokens are consumed.
 *
 * NOTE(review): assumes the caller preallocated prs->words with room for
 * prs->lenwords entries and initialized prs->curwords/prs->pos -- confirm
 * against callers.
 */
void
parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
{
	int			type,
				lenlemm;
	char	   *lemm = NULL;
	LexizeData	ldata;
	TSLexeme   *norms;
	TSConfigCacheEntry *cfg;
	TSParserCacheEntry *prsobj;
	void	   *prsdata;

	cfg = lookup_ts_config_cache(cfgId);
	prsobj = lookup_ts_parser_cache(cfg->prsId);

	/* let the parser set up its scan state over buf */
	prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
													 PointerGetDatum(buf),
													 Int32GetDatum(buflen)));

	LexizeInit(&ldata, cfg);

	do
	{
		/* fetch the next raw token; type <= 0 means end of input */
		type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
										   PointerGetDatum(prsdata),
										   PointerGetDatum(&lemm),
										   PointerGetDatum(&lenlemm)));

		if (type > 0 && lenlemm >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			ereport(NOTICE,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
			continue;
#else
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
#endif
		}

		LexizeAddLemm(&ldata, type, lemm, lenlemm);

		/* drain every normalized-lexeme array now available */
		while ((norms = LexizeExec(&ldata, NULL)) != NULL)
		{
			TSLexeme   *ptr = norms;

			prs->pos++;			/* set pos */

			while (ptr->lexeme)
			{
				if (prs->curwords == prs->lenwords)
				{
					/* double the words array when full */
					prs->lenwords *= 2;
					prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
				}

				/* TSL_ADDPOS: this variant starts a new position */
				if (ptr->flags & TSL_ADDPOS)
					prs->pos++;
				prs->words[prs->curwords].len = strlen(ptr->lexeme);
				prs->words[prs->curwords].word = ptr->lexeme;
				prs->words[prs->curwords].nvariant = ptr->nvariant;
				prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
				prs->words[prs->curwords].alen = 0;
				prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
				ptr++;
				prs->curwords++;
			}
			pfree(norms);
		}
	} while (type > 0);

	/* let the parser release its scan state */
	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
+
+/*
+ * Headline framework
+ */
+static void
+hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
+{
+ while (prs->curwords >= prs->lenwords)
+ {
+ prs->lenwords *= 2;
+ prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
+ }
+ memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
+ prs->words[prs->curwords].type = (uint8) type;
+ prs->words[prs->curwords].len = buflen;
+ prs->words[prs->curwords].word = palloc(buflen);
+ memcpy(prs->words[prs->curwords].word, buf, buflen);
+ prs->curwords++;
+}
+
/*
 * hlfinditem - link the most recently added headline word to any matching
 * query items.
 *
 * pos is the lexeme position to record on the word; buf/buflen is the
 * normalized lexeme compared against every QI_VAL item of query.  If the
 * word already matched an earlier item, each further match appends a copy
 * of the word flagged "repeated" so one input word can carry several query
 * item links.
 */
static void
hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
{
	int			i;
	QueryItem  *item = GETQUERY(query);
	HeadlineWordEntry *word;

	/* ensure room for the worst case: one duplicate per query item */
	while (prs->curwords + query->size >= prs->lenwords)
	{
		prs->lenwords *= 2;
		prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
	}

	/* the word of interest is the one most recently added by hladdword */
	word = &(prs->words[prs->curwords - 1]);
	word->pos = LIMITPOS(pos);
	for (i = 0; i < query->size; i++)
	{
		if (item->type == QI_VAL &&
			tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
							buf, buflen, item->qoperand.prefix) == 0)
		{
			if (word->item)
			{
				/* already linked: append a "repeated" copy for this match */
				memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
				prs->words[prs->curwords].item = &item->qoperand;
				prs->words[prs->curwords].repeated = 1;
				prs->curwords++;
			}
			else
				word->item = &item->qoperand;
		}
		item++;
	}
}
+
/*
 * addHLParsedLex - emit raw lexemes into the headline word array, mark
 * query matches, and release the consumed inputs.
 *
 * lexs is the list of raw input lexemes to append (freed here).  norms is
 * the corresponding array of normalized lexemes used for query matching,
 * or NULL when nothing was recognized; it and its lexeme strings are freed
 * here, and prs->vectorpos is advanced for TSL_ADDPOS variants.
 */
static void
addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
{
	ParsedLex  *tmplexs;
	TSLexeme   *ptr;
	int32		savedpos;

	while (lexs)
	{
		/* type 0 marks end-of-input and produces no visible word */
		if (lexs->type > 0)
			hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);

		/* match every normalized variant against the query */
		ptr = norms;
		savedpos = prs->vectorpos;
		while (ptr && ptr->lexeme)
		{
			if (ptr->flags & TSL_ADDPOS)
				savedpos++;
			hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
			ptr++;
		}

		tmplexs = lexs->next;
		pfree(lexs);
		lexs = tmplexs;
	}

	if (norms)
	{
		ptr = norms;
		while (ptr->lexeme)
		{
			/* commit the extra positions the dictionary added */
			if (ptr->flags & TSL_ADDPOS)
				prs->vectorpos++;
			pfree(ptr->lexeme);
			ptr++;
		}
		pfree(norms);
	}
}
+
/*
 * hlparsetext - parse and lexize buf for headline generation.
 *
 * Like parsetext(), but fills a HeadlineParsedText with every raw word
 * (not just recognized ones) and links words that match items of query,
 * so generateHeadline() can later pick fragments to highlight.
 *
 * NOTE(review): assumes prs->words/prs->lenwords/prs->curwords and
 * prs->vectorpos were initialized by the caller -- confirm against callers.
 */
void
hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
{
	int			type,
				lenlemm;
	char	   *lemm = NULL;
	LexizeData	ldata;
	TSLexeme   *norms;
	ParsedLex  *lexs;
	TSConfigCacheEntry *cfg;
	TSParserCacheEntry *prsobj;
	void	   *prsdata;

	cfg = lookup_ts_config_cache(cfgId);
	prsobj = lookup_ts_parser_cache(cfg->prsId);

	/* let the parser set up its scan state over buf */
	prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
													 PointerGetDatum(buf),
													 Int32GetDatum(buflen)));

	LexizeInit(&ldata, cfg);

	do
	{
		/* fetch the next raw token; type <= 0 means end of input */
		type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
										   PointerGetDatum(prsdata),
										   PointerGetDatum(&lemm),
										   PointerGetDatum(&lenlemm)));

		if (type > 0 && lenlemm >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			ereport(NOTICE,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
			continue;
#else
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
#endif
		}

		LexizeAddLemm(&ldata, type, lemm, lenlemm);

		/*
		 * Drain results; here we also collect the raw lexemes (lexs) so
		 * unrecognized words still appear in the headline.
		 */
		do
		{
			if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
			{
				prs->vectorpos++;
				addHLParsedLex(prs, query, lexs, norms);
			}
			else
				addHLParsedLex(prs, query, lexs, NULL);
		} while (norms);

	} while (type > 0);

	/* let the parser release its scan state */
	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
+
/*
 * generateHeadline - assemble the final headline text from the marked-up
 * word array produced by hlparsetext() and a headline-selection function.
 *
 * Words with "in" set are emitted (wrapped in startsel/stopsel when
 * "selected"); fragments are separated by fragdelim; "repeated" duplicates
 * created by hlfinditem are skipped.  Returns a palloc'd text value.
 * NB: frees the word text of every non-repeated word it does not output.
 */
text *
generateHeadline(HeadlineParsedText *prs)
{
	text	   *out;
	char	   *ptr;
	int			len = 128;
	int			numfragments = 0;
	int16		infrag = 0;

	HeadlineWordEntry *wrd = prs->words;

	out = (text *) palloc(len);
	ptr = ((char *) out) + VARHDRSZ;

	while (wrd - prs->words < prs->curwords)
	{
		/* grow the buffer until the worst-case addition for this word fits */
		while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
		{
			int			dist = ptr - ((char *) out);

			len *= 2;
			out = (text *) repalloc(out, len);
			ptr = ((char *) out) + dist;
		}

		if (wrd->in && !wrd->repeated)
		{
			if (!infrag)
			{

				/* start of a new fragment */
				infrag = 1;
				numfragments++;
				/* add a fragment delimiter if this is after the first one */
				if (numfragments > 1)
				{
					memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
					ptr += prs->fragdelimlen;
				}

			}
			if (wrd->replace)
			{
				/* replaced word: emit a single space in its stead */
				*ptr = ' ';
				ptr++;
			}
			else if (!wrd->skip)
			{
				if (wrd->selected)
				{
					memcpy(ptr, prs->startsel, prs->startsellen);
					ptr += prs->startsellen;
				}
				memcpy(ptr, wrd->word, wrd->len);
				ptr += wrd->len;
				if (wrd->selected)
				{
					memcpy(ptr, prs->stopsel, prs->stopsellen);
					ptr += prs->stopsellen;
				}
			}
		}
		else if (!wrd->repeated)
		{
			/* word not in headline: close any open fragment, free its text */
			if (infrag)
				infrag = 0;
			pfree(wrd->word);
		}

		wrd++;
	}

	SET_VARSIZE(out, ptr - ((char *) out));
	return out;
}
diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c
new file mode 100644
index 0000000..be2546a
--- /dev/null
+++ b/src/backend/tsearch/ts_selfuncs.c
@@ -0,0 +1,453 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_selfuncs.c
+ * Selectivity estimation functions for text search operators.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/ts_selfuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "catalog/pg_statistic.h"
+#include "catalog/pg_type.h"
+#include "miscadmin.h"
+#include "nodes/nodes.h"
+#include "tsearch/ts_type.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
+#include "utils/syscache.h"
+
+
/*
 * The default text search selectivity is chosen to be small enough to
 * encourage indexscans for typical table densities.  See selfuncs.h and
 * DEFAULT_EQ_SEL for details.
 */
#define DEFAULT_TS_MATCH_SEL 0.005

/* lookup table type for binary searching through MCELEMs */
typedef struct
{
	text	   *element;		/* lexeme text, from the MCELEM values array */
	float4		frequency;		/* its frequency, from the numbers array */
} TextFreq;

/* type of keys for bsearch'ing through an array of TextFreqs */
typedef struct
{
	char	   *lexeme;			/* lexeme text (not null-terminated) */
	int			length;			/* its length in bytes */
} LexemeKey;

static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
static Selectivity mcelem_tsquery_selec(TSQuery query,
										Datum *mcelem, int nmcelem,
										float4 *numbers, int nnumbers);
static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
									 TextFreq *lookup, int length, float4 minfreq);
static int	compare_lexeme_textfreq(const void *e1, const void *e2);

/* estimate a query's selectivity when no MCELEM statistics are available */
#define tsquery_opr_selec_no_stats(query) \
	tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0)
+
+
/*
 * tsmatchsel -- Selectivity of "@@"
 *
 * restriction selectivity function for tsvector @@ tsquery and
 * tsquery @@ tsvector
 */
Datum
tsmatchsel(PG_FUNCTION_ARGS)
{
	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);

#ifdef NOT_USED
	Oid			operator = PG_GETARG_OID(1);
#endif
	List	   *args = (List *) PG_GETARG_POINTER(2);
	int			varRelid = PG_GETARG_INT32(3);
	VariableStatData vardata;
	Node	   *other;
	bool		varonleft;
	Selectivity selec;

	/*
	 * If expression is not "variable @@ something" or "something @@
	 * variable", then punt and return a default estimate.
	 */
	if (!get_restriction_variable(root, args, varRelid,
								  &vardata, &other, &varonleft))
		PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);

	/*
	 * Can't do anything useful if the something is not a constant, either.
	 */
	if (!IsA(other, Const))
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
	}

	/*
	 * The "@@" operator is strict, so we can cope with a NULL constant right
	 * away: it matches nothing.
	 */
	if (((Const *) other)->constisnull)
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(0.0);
	}

	/*
	 * OK, there's a Var and a Const we're dealing with here.  We need the
	 * Const to be a TSQuery, else we can't do anything useful.  We have to
	 * check this because the Var might be the TSQuery not the TSVector.
	 */
	if (((Const *) other)->consttype == TSQUERYOID)
	{
		/* tsvector @@ tsquery or the other way around */
		Assert(vardata.vartype == TSVECTOROID);

		selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
	}
	else
	{
		/* If we can't see the query structure, must punt */
		selec = DEFAULT_TS_MATCH_SEL;
	}

	ReleaseVariableStats(vardata);

	CLAMP_PROBABILITY(selec);

	PG_RETURN_FLOAT8((float8) selec);
}
+
+
/*
 * tsmatchjoinsel -- join selectivity of "@@"
 *
 * join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
 */
Datum
tsmatchjoinsel(PG_FUNCTION_ARGS)
{
	/* for the moment we just punt and return the default estimate */
	PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
}
+
+
/*
 * @@ selectivity for tsvector var vs tsquery constant
 *
 * Uses the column's MCELEM statistics when available; otherwise falls back
 * to default per-node estimates via tsquery_opr_selec_no_stats().
 */
static Selectivity
tsquerysel(VariableStatData *vardata, Datum constval)
{
	Selectivity selec;
	TSQuery		query;

	/* The caller made sure the const is a TSQuery, so get it now */
	query = DatumGetTSQuery(constval);

	/* Empty query matches nothing */
	if (query->size == 0)
		return (Selectivity) 0.0;

	if (HeapTupleIsValid(vardata->statsTuple))
	{
		Form_pg_statistic stats;
		AttStatsSlot sslot;

		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);

		/* MCELEM will be an array of TEXT elements for a tsvector column */
		if (get_attstatsslot(&sslot, vardata->statsTuple,
							 STATISTIC_KIND_MCELEM, InvalidOid,
							 ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
		{
			/*
			 * There is a most-common-elements slot for the tsvector Var, so
			 * use that.
			 */
			selec = mcelem_tsquery_selec(query, sslot.values, sslot.nvalues,
										 sslot.numbers, sslot.nnumbers);
			free_attstatsslot(&sslot);
		}
		else
		{
			/* No most-common-elements info, so do without */
			selec = tsquery_opr_selec_no_stats(query);
		}

		/*
		 * MCE stats count only non-null rows, so adjust for null rows.
		 */
		selec *= (1.0 - stats->stanullfrac);
	}
	else
	{
		/* No stats at all, so do without */
		selec = tsquery_opr_selec_no_stats(query);
		/* we assume no nulls here, so no stanullfrac correction */
	}

	return selec;
}
+
/*
 * Extract data from the pg_statistic arrays into useful format, then
 * compute the query's selectivity by walking its tree against the
 * most-common-elements lookup table.
 */
static Selectivity
mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
					 float4 *numbers, int nnumbers)
{
	float4		minfreq;
	TextFreq   *lookup;
	Selectivity selec;
	int			i;

	/*
	 * There should be two more Numbers than Values, because the last two
	 * cells are taken for minimal and maximal frequency.  Punt if not.
	 *
	 * (Note: the MCELEM statistics slot definition allows for a third extra
	 * number containing the frequency of nulls, but we're not expecting that
	 * to appear for a tsvector column.)
	 */
	if (nnumbers != nmcelem + 2)
		return tsquery_opr_selec_no_stats(query);

	/*
	 * Transpose the data into a single array so we can use bsearch().
	 */
	lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
	for (i = 0; i < nmcelem; i++)
	{
		/*
		 * The text Datums came from an array, so it cannot be compressed or
		 * stored out-of-line -- it's safe to use VARSIZE_ANY*.
		 */
		Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i]));
		lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
		lookup[i].frequency = numbers[i];
	}

	/*
	 * Grab the lowest frequency. compute_tsvector_stats() stored it for us in
	 * the one before the last cell of the Numbers array. See ts_typanalyze.c
	 */
	minfreq = numbers[nnumbers - 2];

	selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
							  nmcelem, minfreq);

	pfree(lookup);

	return selec;
}
+
/*
 * Traverse the tsquery in preorder, calculating selectivity as:
 *
 *	 selec(left_oper) * selec(right_oper) in AND & PHRASE nodes,
 *
 *	 selec(left_oper) + selec(right_oper) -
 *		selec(left_oper) * selec(right_oper) in OR nodes,
 *
 *	 1 - select(oper) in NOT nodes
 *
 *	 histogram-based estimation in prefix VAL nodes
 *
 *	 freq[val] in exact VAL nodes, if the value is in MCELEM
 *	 min(freq[MCELEM]) / 2 in VAL nodes, if it is not
 *
 * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
 * binary search for determining freq[MCELEM].
 *
 * If we don't have stats for the tsvector, we still use this logic,
 * except we use default estimates for VAL nodes.  This case is signaled
 * by lookup == NULL.
 */
static Selectivity
tsquery_opr_selec(QueryItem *item, char *operand,
				  TextFreq *lookup, int length, float4 minfreq)
{
	Selectivity selec;

	/* since this function recurses, it could be driven to stack overflow */
	check_stack_depth();

	if (item->type == QI_VAL)
	{
		QueryOperand *oper = (QueryOperand *) item;
		LexemeKey	key;

		/*
		 * Prepare the key for bsearch().
		 */
		key.lexeme = operand + oper->distance;
		key.length = oper->length;

		if (oper->prefix)
		{
			/* Prefix match, ie the query item is lexeme:* */
			Selectivity matched,
						allmces;
			int			i,
						n_matched;

			/*
			 * Our strategy is to scan through the MCELEM list and combine the
			 * frequencies of the ones that match the prefix.  We then
			 * extrapolate the fraction of matching MCELEMs to the remaining
			 * rows, assuming that the MCELEMs are representative of the whole
			 * lexeme population in this respect.  (Compare
			 * histogram_selectivity().)  Note that these are most common
			 * elements not most common values, so they're not mutually
			 * exclusive.  We treat occurrences as independent events.
			 *
			 * This is only a good plan if we have a pretty fair number of
			 * MCELEMs available; we set the threshold at 100.  If no stats or
			 * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
			 */
			if (lookup == NULL || length < 100)
				return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);

			matched = allmces = 0;
			n_matched = 0;
			for (i = 0; i < length; i++)
			{
				TextFreq   *t = lookup + i;
				int			tlen = VARSIZE_ANY_EXHDR(t->element);

				if (tlen >= key.length &&
					strncmp(key.lexeme, VARDATA_ANY(t->element),
							key.length) == 0)
				{
					/* "independent events" accumulation: P(A or B) */
					matched += t->frequency - matched * t->frequency;
					n_matched++;
				}
				allmces += t->frequency - allmces * t->frequency;
			}

			/* Clamp to ensure sanity in the face of roundoff error */
			CLAMP_PROBABILITY(matched);
			CLAMP_PROBABILITY(allmces);

			selec = matched + (1.0 - allmces) * ((double) n_matched / length);

			/*
			 * In any case, never believe that a prefix match has selectivity
			 * less than we would assign for a non-MCELEM lexeme.  This
			 * preserves the property that "word:*" should be estimated to
			 * match at least as many rows as "word" would be.
			 */
			selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / 2), selec);
		}
		else
		{
			/* Regular exact lexeme match */
			TextFreq   *searchres;

			/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
			if (lookup == NULL)
				return (Selectivity) DEFAULT_TS_MATCH_SEL;

			searchres = (TextFreq *) bsearch(&key, lookup, length,
											 sizeof(TextFreq),
											 compare_lexeme_textfreq);

			if (searchres)
			{
				/*
				 * The element is in MCELEM.  Return precise selectivity (or
				 * at least as precise as ANALYZE could find out).
				 */
				selec = searchres->frequency;
			}
			else
			{
				/*
				 * The element is not in MCELEM.  Punt, but assume that the
				 * selectivity cannot be more than minfreq / 2.
				 */
				selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
			}
		}
	}
	else
	{
		/* Current TSQuery node is an operator */
		Selectivity s1,
					s2;

		switch (item->qoperator.oper)
		{
			case OP_NOT:
				selec = 1.0 - tsquery_opr_selec(item + 1, operand,
												lookup, length, minfreq);
				break;

			case OP_PHRASE:
			case OP_AND:
				/* item+1 is the right subtree; item+left is the left one */
				s1 = tsquery_opr_selec(item + 1, operand,
									   lookup, length, minfreq);
				s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
									   lookup, length, minfreq);
				selec = s1 * s2;
				break;

			case OP_OR:
				s1 = tsquery_opr_selec(item + 1, operand,
									   lookup, length, minfreq);
				s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
									   lookup, length, minfreq);
				selec = s1 + s2 - s1 * s2;
				break;

			default:
				elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
				selec = 0;		/* keep compiler quiet */
				break;
		}
	}

	/* Clamp intermediate results to stay sane despite roundoff error */
	CLAMP_PROBABILITY(selec);

	return selec;
}
+
+/*
+ * bsearch() comparator for a lexeme (non-NULL terminated string with length)
+ * and a TextFreq. Use length, then byte-for-byte comparison, because that's
+ * how ANALYZE code sorted data before storing it in a statistic tuple.
+ * See ts_typanalyze.c for details.
+ */
+static int
+compare_lexeme_textfreq(const void *e1, const void *e2)
+{
+ const LexemeKey *key = (const LexemeKey *) e1;
+ const TextFreq *t = (const TextFreq *) e2;
+ int len1,
+ len2;
+
+ len1 = key->length;
+ len2 = VARSIZE_ANY_EXHDR(t->element);
+
+ /* Compare lengths first, possibly avoiding a strncmp call */
+ if (len1 > len2)
+ return 1;
+ else if (len1 < len2)
+ return -1;
+
+ /* Fall back on byte-for-byte comparison */
+ return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
+}
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
new file mode 100644
index 0000000..56eeb6f
--- /dev/null
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -0,0 +1,536 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_typanalyze.c
+ * functions for gathering statistics from tsvector columns
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/ts_typanalyze.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/pg_collation.h"
+#include "catalog/pg_operator.h"
+#include "commands/vacuum.h"
+#include "common/hashfn.h"
+#include "tsearch/ts_type.h"
+#include "utils/builtins.h"
+
+
+/* A hash key for lexemes */
+typedef struct
+{
+	char	   *lexeme;			/* lexeme (not NULL terminated!) */
+	int			length;			/* its length in bytes */
+} LexemeHashKey;
+
+/* A hash table entry for the Lossy Counting algorithm */
+typedef struct
+{
+	LexemeHashKey key;			/* This is 'e' from the LC algorithm. */
+	int			frequency;		/* This is 'f': current occurrence count. */
+	int			delta;			/* And this is 'delta': max error in 'f'. */
+} TrackItem;
+
+/* Forward declarations of local support routines (defined below) */
+static void compute_tsvector_stats(VacAttrStats *stats,
+								   AnalyzeAttrFetchFunc fetchfunc,
+								   int samplerows,
+								   double totalrows);
+static void prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current);
+static uint32 lexeme_hash(const void *key, Size keysize);
+static int	lexeme_match(const void *key1, const void *key2, Size keysize);
+static int	lexeme_compare(const void *key1, const void *key2);
+static int	trackitem_compare_frequencies_desc(const void *e1, const void *e2,
+											   void *arg);
+static int	trackitem_compare_lexemes(const void *e1, const void *e2,
+									  void *arg);
+
+
+/*
+ * ts_typanalyze -- a custom typanalyze function for tsvector columns
+ */
+Datum
+ts_typanalyze(PG_FUNCTION_ARGS)
+{
+	VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
+	Form_pg_attribute attr = stats->attr;
+
+	/* If the attstattarget column is negative, use the default value */
+	/* NB: it is okay to scribble on stats->attr since it's a copy */
+	if (attr->attstattarget < 0)
+		attr->attstattarget = default_statistics_target;
+
+	/* Install our custom stats collector; ANALYZE invokes it later */
+	stats->compute_stats = compute_tsvector_stats;
+	/* see comment about the choice of minrows in commands/analyze.c */
+	stats->minrows = 300 * attr->attstattarget;
+
+	PG_RETURN_BOOL(true);
+}
+
+/*
+ * compute_tsvector_stats() -- compute statistics for a tsvector column
+ *
+ * This function computes statistics that are useful for determining @@
+ * operations' selectivity, along with the fraction of non-null rows and
+ * average width.
+ *
+ * Instead of finding the most common values, as we do for most datatypes,
+ * we're looking for the most common lexemes. This is more useful, because
+ * there most probably won't be any two rows with the same tsvector and thus
+ * the notion of a MCV is a bit bogus with this datatype. With a list of the
+ * most common lexemes we can do a better job at figuring out @@ selectivity.
+ *
+ * For the same reasons we assume that tsvector columns are unique when
+ * determining the number of distinct values.
+ *
+ * The algorithm used is Lossy Counting, as proposed in the paper "Approximate
+ * frequency counts over data streams" by G. S. Manku and R. Motwani, in
+ * Proceedings of the 28th International Conference on Very Large Data Bases,
+ * Hong Kong, China, August 2002, section 4.2. The paper is available at
+ * http://www.vldb.org/conf/2002/S10P03.pdf
+ *
+ * The Lossy Counting (aka LC) algorithm goes like this:
+ * Let s be the threshold frequency for an item (the minimum frequency we
+ * are interested in) and epsilon the error margin for the frequency. Let D
+ * be a set of triples (e, f, delta), where e is an element value, f is that
+ * element's frequency (actually, its current occurrence count) and delta is
+ * the maximum error in f. We start with D empty and process the elements in
+ * batches of size w. (The batch size is also known as "bucket size" and is
+ * equal to 1/epsilon.) Let the current batch number be b_current, starting
+ * with 1. For each element e we either increment its f count, if it's
+ * already in D, or insert a new triple into D with values (e, 1, b_current
+ * - 1). After processing each batch we prune D, by removing from it all
+ * elements with f + delta <= b_current. After the algorithm finishes we
+ * suppress all elements from D that do not satisfy f >= (s - epsilon) * N,
+ * where N is the total number of elements in the input. We emit the
+ * remaining elements with estimated frequency f/N. The LC paper proves
+ * that this algorithm finds all elements with true frequency at least s,
+ * and that no frequency is overestimated or is underestimated by more than
+ * epsilon. Furthermore, given reasonable assumptions about the input
+ * distribution, the required table size is no more than about 7 times w.
+ *
+ * We set s to be the estimated frequency of the K'th word in a natural
+ * language's frequency table, where K is the target number of entries in
+ * the MCELEM array plus an arbitrary constant, meant to reflect the fact
+ * that the most common words in any language would usually be stopwords
+ * so we will not actually see them in the input. We assume that the
+ * distribution of word frequencies (including the stopwords) follows Zipf's
+ * law with an exponent of 1.
+ *
+ * Assuming Zipfian distribution, the frequency of the K'th word is equal
+ * to 1/(K * H(W)) where H(n) is 1/2 + 1/3 + ... + 1/n and W is the number of
+ * words in the language. Putting W as one million, we get roughly 0.07/K.
+ * Assuming top 10 words are stopwords gives s = 0.07/(K + 10). We set
+ * epsilon = s/10, which gives bucket width w = (K + 10)/0.007 and
+ * maximum expected hashtable size of about 1000 * (K + 10).
+ *
+ * Note: in the above discussion, s, epsilon, and f/N are in terms of a
+ * lexeme's frequency as a fraction of all lexemes seen in the input.
+ * However, what we actually want to store in the finished pg_statistic
+ * entry is each lexeme's frequency as a fraction of all rows that it occurs
+ * in. Assuming that the input tsvectors are correctly constructed, no
+ * lexeme occurs more than once per tsvector, so the final count f is a
+ * correct estimate of the number of input tsvectors it occurs in, and we
+ * need only change the divisor from N to nonnull_cnt to get the number we
+ * want.
+ */
+static void
+compute_tsvector_stats(VacAttrStats *stats,
+					   AnalyzeAttrFetchFunc fetchfunc,
+					   int samplerows,
+					   double totalrows)
+{
+	int			num_mcelem;
+	int			null_cnt = 0;
+	double		total_width = 0;
+
+	/* This is D from the LC algorithm. */
+	HTAB	   *lexemes_tab;
+	HASHCTL		hash_ctl;
+	HASH_SEQ_STATUS scan_status;
+
+	/* This is the current bucket number from the LC algorithm */
+	int			b_current;
+
+	/* This is 'w' from the LC algorithm */
+	int			bucket_width;
+	int			vector_no,
+				lexeme_no;
+	LexemeHashKey hash_key;
+	TrackItem  *item;
+
+	/*
+	 * We want statistics_target * 10 lexemes in the MCELEM array. This
+	 * multiplier is pretty arbitrary, but is meant to reflect the fact that
+	 * the number of individual lexeme values tracked in pg_statistic ought to
+	 * be more than the number of values for a simple scalar column.
+	 */
+	num_mcelem = stats->attr->attstattarget * 10;
+
+	/*
+	 * We set bucket width equal to (num_mcelem + 10) / 0.007 as per the
+	 * comment above.
+	 */
+	bucket_width = (num_mcelem + 10) * 1000 / 7;
+
+	/*
+	 * Create the hashtable. It will be in local memory, so we don't need to
+	 * worry about overflowing the initial size. Also we don't need to pay any
+	 * attention to locking and memory management.
+	 */
+	hash_ctl.keysize = sizeof(LexemeHashKey);
+	hash_ctl.entrysize = sizeof(TrackItem);
+	hash_ctl.hash = lexeme_hash;
+	hash_ctl.match = lexeme_match;
+	hash_ctl.hcxt = CurrentMemoryContext;
+	lexemes_tab = hash_create("Analyzed lexemes table",
+							  num_mcelem,
+							  &hash_ctl,
+							  HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);
+
+	/* Initialize counters. */
+	b_current = 1;
+	lexeme_no = 0;
+
+	/* Loop over the tsvectors. */
+	for (vector_no = 0; vector_no < samplerows; vector_no++)
+	{
+		Datum		value;
+		bool		isnull;
+		TSVector	vector;
+		WordEntry  *curentryptr;
+		char	   *lexemesptr;
+		int			j;
+
+		vacuum_delay_point();
+
+		value = fetchfunc(stats, vector_no, &isnull);
+
+		/*
+		 * Check for null/nonnull.
+		 */
+		if (isnull)
+		{
+			null_cnt++;
+			continue;
+		}
+
+		/*
+		 * Add up widths for average-width calculation. Since it's a
+		 * tsvector, we know it's varlena. As in the regular
+		 * compute_minimal_stats function, we use the toasted width for this
+		 * calculation.
+		 */
+		total_width += VARSIZE_ANY(DatumGetPointer(value));
+
+		/*
+		 * Now detoast the tsvector if needed.
+		 */
+		vector = DatumGetTSVector(value);
+
+		/*
+		 * We loop through the lexemes in the tsvector and add them to our
+		 * tracking hashtable.
+		 */
+		lexemesptr = STRPTR(vector);
+		curentryptr = ARRPTR(vector);
+		for (j = 0; j < vector->size; j++)
+		{
+			bool		found;
+
+			/*
+			 * Construct a hash key. The key points into the (detoasted)
+			 * tsvector value at this point, but if a new entry is created, we
+			 * make a copy of it. This way we can free the tsvector value
+			 * once we've processed all its lexemes.
+			 */
+			hash_key.lexeme = lexemesptr + curentryptr->pos;
+			hash_key.length = curentryptr->len;
+
+			/* Lookup current lexeme in hashtable, adding it if new */
+			item = (TrackItem *) hash_search(lexemes_tab,
+											 (const void *) &hash_key,
+											 HASH_ENTER, &found);
+
+			if (found)
+			{
+				/* The lexeme is already on the tracking list */
+				item->frequency++;
+			}
+			else
+			{
+				/* Initialize new tracking list element */
+				item->frequency = 1;
+				item->delta = b_current - 1;
+
+				/* Make our own copy so the key survives pfree of the vector */
+				item->key.lexeme = palloc(hash_key.length);
+				memcpy(item->key.lexeme, hash_key.lexeme, hash_key.length);
+			}
+
+			/* lexeme_no is the number of elements processed (ie N) */
+			lexeme_no++;
+
+			/* We prune the D structure after processing each bucket */
+			if (lexeme_no % bucket_width == 0)
+			{
+				prune_lexemes_hashtable(lexemes_tab, b_current);
+				b_current++;
+			}
+
+			/* Advance to the next WordEntry in the tsvector */
+			curentryptr++;
+		}
+
+		/* If the vector was toasted, free the detoasted copy. */
+		if (TSVectorGetDatum(vector) != value)
+			pfree(vector);
+	}
+
+	/* We can only compute real stats if we found some non-null values. */
+	if (null_cnt < samplerows)
+	{
+		int			nonnull_cnt = samplerows - null_cnt;
+		int			i;
+		TrackItem **sort_table;
+		int			track_len;
+		int			cutoff_freq;
+		int			minfreq,
+					maxfreq;
+
+		stats->stats_valid = true;
+		/* Do the simple null-frac and average width stats */
+		stats->stanullfrac = (double) null_cnt / (double) samplerows;
+		stats->stawidth = total_width / (double) nonnull_cnt;
+
+		/* Assume it's a unique column (see notes above) */
+		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
+
+		/*
+		 * Construct an array of the interesting hashtable items, that is,
+		 * those meeting the cutoff frequency (s - epsilon)*N. Also identify
+		 * the minimum and maximum frequencies among these items.
+		 *
+		 * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
+		 * frequency is 9*N / bucket_width.
+		 */
+		cutoff_freq = 9 * lexeme_no / bucket_width;
+
+		i = hash_get_num_entries(lexemes_tab);	/* surely enough space */
+		sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i);
+
+		hash_seq_init(&scan_status, lexemes_tab);
+		track_len = 0;
+		minfreq = lexeme_no;
+		maxfreq = 0;
+		while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
+		{
+			if (item->frequency > cutoff_freq)
+			{
+				sort_table[track_len++] = item;
+				minfreq = Min(minfreq, item->frequency);
+				maxfreq = Max(maxfreq, item->frequency);
+			}
+		}
+		Assert(track_len <= i);
+
+		/* emit some statistics for debug purposes */
+		elog(DEBUG3, "tsvector_stats: target # mces = %d, bucket width = %d, "
+			 "# lexemes = %d, hashtable size = %d, usable entries = %d",
+			 num_mcelem, bucket_width, lexeme_no, i, track_len);
+
+		/*
+		 * If we obtained more lexemes than we really want, get rid of those
+		 * with least frequencies. The easiest way is to qsort the array into
+		 * descending frequency order and truncate the array.
+		 */
+		if (num_mcelem < track_len)
+		{
+			qsort_interruptible(sort_table, track_len, sizeof(TrackItem *),
+								trackitem_compare_frequencies_desc, NULL);
+			/* reset minfreq to the smallest frequency we're keeping */
+			minfreq = sort_table[num_mcelem - 1]->frequency;
+		}
+		else
+			num_mcelem = track_len;
+
+		/* Generate MCELEM slot entry */
+		if (num_mcelem > 0)
+		{
+			MemoryContext old_context;
+			Datum	   *mcelem_values;
+			float4	   *mcelem_freqs;
+
+			/*
+			 * We want to store statistics sorted on the lexeme value using
+			 * first length, then byte-for-byte comparison. The reason for
+			 * doing length comparison first is that we don't care about the
+			 * ordering so long as it's consistent, and comparing lengths
+			 * first gives us a chance to avoid a strncmp() call.
+			 *
+			 * This is different from what we do with scalar statistics --
+			 * they get sorted on frequencies. The rationale is that we
+			 * usually search through most common elements looking for a
+			 * specific value, so we can grab its frequency. When values are
+			 * presorted we can employ binary search for that. See
+			 * ts_selfuncs.c for a real usage scenario.
+			 */
+			qsort_interruptible(sort_table, num_mcelem, sizeof(TrackItem *),
+								trackitem_compare_lexemes, NULL);
+
+			/* Must copy the target values into anl_context */
+			old_context = MemoryContextSwitchTo(stats->anl_context);
+
+			/*
+			 * We sorted statistics on the lexeme value, but we want to be
+			 * able to find out the minimal and maximal frequency without
+			 * going through all the values. We keep those two extra
+			 * frequencies in two extra cells in mcelem_freqs.
+			 *
+			 * (Note: the MCELEM statistics slot definition allows for a third
+			 * extra number containing the frequency of nulls, but we don't
+			 * create that for a tsvector column, since null elements aren't
+			 * possible.)
+			 */
+			mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
+			mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4));
+
+			/*
+			 * See comments above about use of nonnull_cnt as the divisor for
+			 * the final frequency estimates.
+			 */
+			for (i = 0; i < num_mcelem; i++)
+			{
+				TrackItem  *item = sort_table[i];
+
+				mcelem_values[i] =
+					PointerGetDatum(cstring_to_text_with_len(item->key.lexeme,
+															 item->key.length));
+				mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt;
+			}
+			/* The two extra cells: global min and max element frequency */
+			mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
+			mcelem_freqs[i] = (double) maxfreq / (double) nonnull_cnt;
+			MemoryContextSwitchTo(old_context);
+
+			stats->stakind[0] = STATISTIC_KIND_MCELEM;
+			stats->staop[0] = TextEqualOperator;
+			stats->stacoll[0] = DEFAULT_COLLATION_OID;
+			stats->stanumbers[0] = mcelem_freqs;
+			/* See above comment about two extra frequency fields */
+			stats->numnumbers[0] = num_mcelem + 2;
+			stats->stavalues[0] = mcelem_values;
+			stats->numvalues[0] = num_mcelem;
+			/* We are storing text values */
+			stats->statypid[0] = TEXTOID;
+			stats->statyplen[0] = -1;	/* typlen, -1 for varlena */
+			stats->statypbyval[0] = false;
+			stats->statypalign[0] = 'i';
+		}
+	}
+	else
+	{
+		/* We found only nulls; assume the column is entirely null */
+		stats->stats_valid = true;
+		stats->stanullfrac = 1.0;
+		stats->stawidth = 0;	/* "unknown" */
+		stats->stadistinct = 0.0;	/* "unknown" */
+	}
+
+	/*
+	 * We don't need to bother cleaning up any of our temporary palloc's. The
+	 * hashtable should also go away, as it used a child memory context.
+	 */
+}
+
+/*
+ * A function to prune the D structure from the Lossy Counting algorithm.
+ * Consult compute_tsvector_stats() for wider explanation.
+ */
+static void
+prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current)
+{
+	HASH_SEQ_STATUS scan_status;
+	TrackItem  *item;
+
+	hash_seq_init(&scan_status, lexemes_tab);
+	while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
+	{
+		/* Drop entries whose max possible true count is within the bucket */
+		if (item->frequency + item->delta <= b_current)
+		{
+			/* Save the lexeme pointer: the entry is freed by HASH_REMOVE */
+			char	   *lexeme = item->key.lexeme;
+
+			if (hash_search(lexemes_tab, (const void *) &item->key,
+							HASH_REMOVE, NULL) == NULL)
+				elog(ERROR, "hash table corrupted");
+			pfree(lexeme);
+		}
+	}
+}
+
+/*
+ * Hash functions for lexemes. They are strings, but not NULL terminated,
+ * so we need a special hash function.
+ */
+static uint32
+lexeme_hash(const void *key, Size keysize)
+{
+	const LexemeHashKey *l = (const LexemeHashKey *) key;
+
+	/* Hash exactly 'length' bytes; the string is not NUL-terminated */
+	return DatumGetUInt32(hash_any((const unsigned char *) l->lexeme,
+								   l->length));
+}
+
+/*
+ * Matching function for lexemes, to be used in hashtable lookups.
+ */
+static int
+lexeme_match(const void *key1, const void *key2, Size keysize)
+{
+	/* The keysize parameter is superfluous, the keys store their lengths */
+	return lexeme_compare(key1, key2);
+}
+
+/*
+ * Comparison function for lexemes.
+ */
+static int
+lexeme_compare(const void *key1, const void *key2)
+{
+	const LexemeHashKey *d1 = (const LexemeHashKey *) key1;
+	const LexemeHashKey *d2 = (const LexemeHashKey *) key2;
+
+	/* First, compare by length */
+	if (d1->length > d2->length)
+		return 1;
+	else if (d1->length < d2->length)
+		return -1;
+	/* Lengths are equal, do a byte-by-byte comparison */
+	return strncmp(d1->lexeme, d2->lexeme, d1->length);
+}
+
+/*
+ * Comparator for sorting TrackItems on frequencies (descending sort)
+ */
+static int
+trackitem_compare_frequencies_desc(const void *e1, const void *e2, void *arg)
+{
+	const TrackItem *const *t1 = (const TrackItem *const *) e1;
+	const TrackItem *const *t2 = (const TrackItem *const *) e2;
+
+	/* t2 - t1 gives descending order; counts are small, no overflow risk */
+	return (*t2)->frequency - (*t1)->frequency;
+}
+
+/*
+ * Comparator for sorting TrackItems on lexemes
+ */
+static int
+trackitem_compare_lexemes(const void *e1, const void *e2, void *arg)
+{
+	const TrackItem *const *t1 = (const TrackItem *const *) e1;
+	const TrackItem *const *t2 = (const TrackItem *const *) e2;
+
+	/* Delegate to the length-then-bytes lexeme ordering */
+	return lexeme_compare(&(*t1)->key, &(*t2)->key);
+}
diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c
new file mode 100644
index 0000000..ed16a2e
--- /dev/null
+++ b/src/backend/tsearch/ts_utils.c
@@ -0,0 +1,146 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_utils.c
+ * various support functions
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/ts_utils.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <ctype.h>
+
+#include "miscadmin.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+
+
+/*
+ * Given the base name and extension of a tsearch config file, return
+ * its full path name. The base name is assumed to be user-supplied,
+ * and is checked to prevent pathname attacks. The extension is assumed
+ * to be safe.
+ *
+ * The result is a palloc'd string.
+ */
+char *
+get_tsearch_config_filename(const char *basename,
+							const char *extension)
+{
+	char		sharepath[MAXPGPATH];
+	char	   *result;
+
+	/*
+	 * We limit the basename to contain a-z, 0-9, and underscores. This may
+	 * be overly restrictive, but we don't want to allow access to anything
+	 * outside the tsearch_data directory, so for instance '/' *must* be
+	 * rejected, and on some platforms '\' and ':' are risky as well. Allowing
+	 * uppercase might result in incompatible behavior between case-sensitive
+	 * and case-insensitive filesystems, and non-ASCII characters create other
+	 * interesting risks, so on the whole a tight policy seems best.
+	 */
+	if (strspn(basename, "abcdefghijklmnopqrstuvwxyz0123456789_") != strlen(basename))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("invalid text search configuration file name \"%s\"",
+						basename)));
+
+	/* Build $SHAREDIR/tsearch_data/<basename>.<extension> */
+	get_share_path(my_exec_path, sharepath);
+	result = palloc(MAXPGPATH);
+	snprintf(result, MAXPGPATH, "%s/tsearch_data/%s.%s",
+			 sharepath, basename, extension);
+
+	return result;
+}
+
+/*
+ * Reads a stop-word file. Each word is run through 'wordop'
+ * function, if given. wordop may either modify the input in-place,
+ * or palloc a new version.
+ */
+void
+readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *))
+{
+	char	  **stop = NULL;
+
+	s->len = 0;
+	/* A missing or empty filename yields an empty stop list */
+	if (fname && *fname)
+	{
+		char	   *filename = get_tsearch_config_filename(fname, "stop");
+		tsearch_readline_state trst;
+		char	   *line;
+		int			reallen = 0;
+
+		if (!tsearch_readline_begin(&trst, filename))
+			ereport(ERROR,
+					(errcode(ERRCODE_CONFIG_FILE_ERROR),
+					 errmsg("could not open stop-word file \"%s\": %m",
+							filename)));
+
+		while ((line = tsearch_readline(&trst)) != NULL)
+		{
+			char	   *pbuf = line;
+
+			/* Trim trailing space */
+			while (*pbuf && !t_isspace(pbuf))
+				pbuf += pg_mblen(pbuf);
+			*pbuf = '\0';
+
+			/* Skip empty lines */
+			if (*line == '\0')
+			{
+				pfree(line);
+				continue;
+			}
+
+			/* Grow the array by doubling when it fills up */
+			if (s->len >= reallen)
+			{
+				if (reallen == 0)
+				{
+					reallen = 64;
+					stop = (char **) palloc(sizeof(char *) * reallen);
+				}
+				else
+				{
+					reallen *= 2;
+					stop = (char **) repalloc((void *) stop,
+											  sizeof(char *) * reallen);
+				}
+			}
+
+			if (wordop)
+			{
+				/* wordop may modify in place or palloc a new string */
+				stop[s->len] = wordop(line);
+				if (stop[s->len] != line)
+					pfree(line);
+			}
+			else
+				stop[s->len] = line;
+
+			(s->len)++;
+		}
+
+		tsearch_readline_end(&trst);
+		pfree(filename);
+	}
+
+	s->stop = stop;
+
+	/* Sort to allow binary searching */
+	if (s->stop && s->len > 0)
+		qsort(s->stop, s->len, sizeof(char *), pg_qsort_strcmp);
+}
+
+/* Return true if 'key' is present in the (sorted) stop list */
+bool
+searchstoplist(StopList *s, char *key)
+{
+	return (s->stop && s->len > 0 &&
+			bsearch(&key, s->stop, s->len,
+					sizeof(char *), pg_qsort_strcmp)) ? true : false;
+}
diff --git a/src/backend/tsearch/wparser.c b/src/backend/tsearch/wparser.c
new file mode 100644
index 0000000..71882dc
--- /dev/null
+++ b/src/backend/tsearch/wparser.c
@@ -0,0 +1,549 @@
+/*-------------------------------------------------------------------------
+ *
+ * wparser.c
+ * Standard interface to word parser
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/wparser.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/namespace.h"
+#include "catalog/pg_type.h"
+#include "commands/defrem.h"
+#include "common/jsonapi.h"
+#include "funcapi.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_utils.h"
+#include "utils/builtins.h"
+#include "utils/jsonfuncs.h"
+#include "utils/varlena.h"
+
+/******sql-level interface******/
+
+/* Per-call state for the ts_token_type_* set-returning functions */
+typedef struct
+{
+	int			cur;			/* index of next list entry to emit */
+	LexDescr   *list;			/* array returned by the parser's lextype method */
+} TSTokenTypeStorage;
+
+/* state for ts_headline_json_* */
+typedef struct HeadlineJsonState
+{
+	HeadlineParsedText *prs;	/* shared parse buffer, reset per string value */
+	TSConfigCacheEntry *cfg;	/* text search configuration */
+	TSParserCacheEntry *prsobj; /* parser, must support headline creation */
+	TSQuery		query;			/* query to highlight */
+	List	   *prsoptions;		/* options deserialized from the opt argument */
+	bool		transformed;	/* true once headline_json_value has run */
+} HeadlineJsonState;
+
+static text *headline_json_value(void *_state, char *elem_value, int elem_len);
+
+/* First-call setup for the token-type SRFs: fetch lextype list, build tupdesc */
+static void
+tt_setup_firstcall(FuncCallContext *funcctx, Oid prsid)
+{
+	TupleDesc	tupdesc;
+	MemoryContext oldcontext;
+	TSTokenTypeStorage *st;
+	TSParserCacheEntry *prs = lookup_ts_parser_cache(prsid);
+
+	if (!OidIsValid(prs->lextypeOid))
+		elog(ERROR, "method lextype isn't defined for text search parser %u",
+			 prsid);
+
+	/* Allocate state in the multi-call context so it survives between calls */
+	oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+	st = (TSTokenTypeStorage *) palloc(sizeof(TSTokenTypeStorage));
+	st->cur = 0;
+	/* lextype takes one dummy argument */
+	st->list = (LexDescr *) DatumGetPointer(OidFunctionCall1(prs->lextypeOid,
+															 (Datum) 0));
+	funcctx->user_fctx = (void *) st;
+
+	tupdesc = CreateTemplateTupleDesc(3);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 1, "tokid",
+					   INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 2, "alias",
+					   TEXTOID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 3, "description",
+					   TEXTOID, -1, 0);
+
+	funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
+	MemoryContextSwitchTo(oldcontext);
+}
+
+/* Emit the next (tokid, alias, description) tuple, or 0 when exhausted */
+static Datum
+tt_process_call(FuncCallContext *funcctx)
+{
+	TSTokenTypeStorage *st;
+
+	st = (TSTokenTypeStorage *) funcctx->user_fctx;
+	/* lexid == 0 terminates the list returned by lextype */
+	if (st->list && st->list[st->cur].lexid)
+	{
+		Datum		result;
+		char	   *values[3];
+		char		txtid[16];
+		HeapTuple	tuple;
+
+		sprintf(txtid, "%d", st->list[st->cur].lexid);
+		values[0] = txtid;
+		values[1] = st->list[st->cur].alias;
+		values[2] = st->list[st->cur].descr;
+
+		tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
+		result = HeapTupleGetDatum(tuple);
+
+		pfree(values[1]);
+		pfree(values[2]);
+		st->cur++;
+		return result;
+	}
+	return (Datum) 0;
+}
+
+/* SRF: list token types of the parser identified by OID */
+Datum
+ts_token_type_byid(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	Datum		result;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		funcctx = SRF_FIRSTCALL_INIT();
+		tt_setup_firstcall(funcctx, PG_GETARG_OID(0));
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	if ((result = tt_process_call(funcctx)) != (Datum) 0)
+		SRF_RETURN_NEXT(funcctx, result);
+	SRF_RETURN_DONE(funcctx);
+}
+
+/* SRF: list token types of the parser identified by (qualified) name */
+Datum
+ts_token_type_byname(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	Datum		result;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		text	   *prsname = PG_GETARG_TEXT_PP(0);
+		Oid			prsId;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+		prsId = get_ts_parser_oid(textToQualifiedNameList(prsname), false);
+		tt_setup_firstcall(funcctx, prsId);
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	if ((result = tt_process_call(funcctx)) != (Datum) 0)
+		SRF_RETURN_NEXT(funcctx, result);
+	SRF_RETURN_DONE(funcctx);
+}
+
+/* One parsed token: its type id and a NUL-terminated copy of its text */
+typedef struct
+{
+	int			type;
+	char	   *lexeme;
+} LexemeEntry;
+
+/* Per-call state for the ts_parse_* set-returning functions */
+typedef struct
+{
+	int			cur;			/* index of next entry to emit */
+	int			len;			/* number of entries in list */
+	LexemeEntry *list;
+} PrsStorage;
+
+
+/* First-call setup for ts_parse_*: tokenize txt eagerly into an array */
+static void
+prs_setup_firstcall(FuncCallContext *funcctx, Oid prsid, text *txt)
+{
+	TupleDesc	tupdesc;
+	MemoryContext oldcontext;
+	PrsStorage *st;
+	TSParserCacheEntry *prs = lookup_ts_parser_cache(prsid);
+	char	   *lex = NULL;
+	int			llen = 0,
+				type = 0;
+	void	   *prsdata;
+
+	oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+	st = (PrsStorage *) palloc(sizeof(PrsStorage));
+	st->cur = 0;
+	st->len = 16;
+	st->list = (LexemeEntry *) palloc(sizeof(LexemeEntry) * st->len);
+
+	prsdata = (void *) DatumGetPointer(FunctionCall2(&prs->prsstart,
+													 PointerGetDatum(VARDATA_ANY(txt)),
+													 Int32GetDatum(VARSIZE_ANY_EXHDR(txt))));
+
+	/* prstoken returns 0 when the input is exhausted */
+	while ((type = DatumGetInt32(FunctionCall3(&prs->prstoken,
+											   PointerGetDatum(prsdata),
+											   PointerGetDatum(&lex),
+											   PointerGetDatum(&llen)))) != 0)
+	{
+		if (st->cur >= st->len)
+		{
+			st->len = 2 * st->len;
+			st->list = (LexemeEntry *) repalloc(st->list, sizeof(LexemeEntry) * st->len);
+		}
+		/* Copy the token text; the parser's buffer is not ours to keep */
+		st->list[st->cur].lexeme = palloc(llen + 1);
+		memcpy(st->list[st->cur].lexeme, lex, llen);
+		st->list[st->cur].lexeme[llen] = '\0';
+		st->list[st->cur].type = type;
+		st->cur++;
+	}
+
+	FunctionCall1(&prs->prsend, PointerGetDatum(prsdata));
+
+	st->len = st->cur;
+	st->cur = 0;
+
+	funcctx->user_fctx = (void *) st;
+	tupdesc = CreateTemplateTupleDesc(2);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 1, "tokid",
+					   INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 2, "token",
+					   TEXTOID, -1, 0);
+
+	funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
+	MemoryContextSwitchTo(oldcontext);
+}
+
+/* Emit the next (tokid, token) tuple, or 0 when the token list is exhausted */
+static Datum
+prs_process_call(FuncCallContext *funcctx)
+{
+	PrsStorage *st;
+
+	st = (PrsStorage *) funcctx->user_fctx;
+	if (st->cur < st->len)
+	{
+		Datum		result;
+		char	   *values[2];
+		char		tid[16];
+		HeapTuple	tuple;
+
+		values[0] = tid;
+		sprintf(tid, "%d", st->list[st->cur].type);
+		values[1] = st->list[st->cur].lexeme;
+		tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
+		result = HeapTupleGetDatum(tuple);
+
+		pfree(values[1]);
+		st->cur++;
+		return result;
+	}
+	return (Datum) 0;
+}
+
+/* SRF: parse text with the parser identified by OID */
+Datum
+ts_parse_byid(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	Datum		result;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		text	   *txt = PG_GETARG_TEXT_PP(1);
+
+		funcctx = SRF_FIRSTCALL_INIT();
+		prs_setup_firstcall(funcctx, PG_GETARG_OID(0), txt);
+		PG_FREE_IF_COPY(txt, 1);
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	if ((result = prs_process_call(funcctx)) != (Datum) 0)
+		SRF_RETURN_NEXT(funcctx, result);
+	SRF_RETURN_DONE(funcctx);
+}
+
+/* SRF: parse text with the parser identified by (qualified) name */
+Datum
+ts_parse_byname(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	Datum		result;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		text	   *prsname = PG_GETARG_TEXT_PP(0);
+		text	   *txt = PG_GETARG_TEXT_PP(1);
+		Oid			prsId;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+		prsId = get_ts_parser_oid(textToQualifiedNameList(prsname), false);
+		prs_setup_firstcall(funcctx, prsId, txt);
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	if ((result = prs_process_call(funcctx)) != (Datum) 0)
+		SRF_RETURN_NEXT(funcctx, result);
+	SRF_RETURN_DONE(funcctx);
+}
+
+/* Workhorse for plain-text ts_headline: config OID, text, query, options */
+Datum
+ts_headline_byid_opt(PG_FUNCTION_ARGS)
+{
+	Oid			tsconfig = PG_GETARG_OID(0);
+	text	   *in = PG_GETARG_TEXT_PP(1);
+	TSQuery		query = PG_GETARG_TSQUERY(2);
+	/* 4th argument (options) is optional; also wrapped by 3-arg variants */
+	text	   *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ? PG_GETARG_TEXT_PP(3) : NULL;
+	HeadlineParsedText prs;
+	List	   *prsoptions;
+	text	   *out;
+	TSConfigCacheEntry *cfg;
+	TSParserCacheEntry *prsobj;
+
+	cfg = lookup_ts_config_cache(tsconfig);
+	prsobj = lookup_ts_parser_cache(cfg->prsId);
+
+	if (!OidIsValid(prsobj->headlineOid))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("text search parser does not support headline creation")));
+
+	memset(&prs, 0, sizeof(HeadlineParsedText));
+	prs.lenwords = 32;
+	prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords);
+
+	hlparsetext(cfg->cfgId, &prs, query,
+				VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
+
+	if (opt)
+		prsoptions = deserialize_deflist(PointerGetDatum(opt));
+	else
+		prsoptions = NIL;
+
+	/* Let the parser's headline method mark up the parsed words */
+	FunctionCall3(&(prsobj->prsheadline),
+				  PointerGetDatum(&prs),
+				  PointerGetDatum(prsoptions),
+				  PointerGetDatum(query));
+
+	out = generateHeadline(&prs);
+
+	PG_FREE_IF_COPY(in, 1);
+	PG_FREE_IF_COPY(query, 2);
+	if (opt)
+		PG_FREE_IF_COPY(opt, 3);
+	pfree(prs.words);
+	pfree(prs.startsel);
+	pfree(prs.stopsel);
+
+	PG_RETURN_POINTER(out);
+}
+
+/* ts_headline with explicit config OID, no options */
+Datum
+ts_headline_byid(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_byid_opt,
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1),
+										PG_GETARG_DATUM(2)));
+}
+
+/* ts_headline using the session's default text search configuration */
+Datum
+ts_headline(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_byid_opt,
+										ObjectIdGetDatum(getTSCurrentConfig(true)),
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1)));
+}
+
+/* ts_headline using the default configuration, with an options string */
+Datum
+ts_headline_opt(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_byid_opt,
+										ObjectIdGetDatum(getTSCurrentConfig(true)),
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1),
+										PG_GETARG_DATUM(2)));
+}
+
+/* Workhorse for jsonb ts_headline: highlights every string value in jb */
+Datum
+ts_headline_jsonb_byid_opt(PG_FUNCTION_ARGS)
+{
+	Oid			tsconfig = PG_GETARG_OID(0);
+	Jsonb	   *jb = PG_GETARG_JSONB_P(1);
+	TSQuery		query = PG_GETARG_TSQUERY(2);
+	text	   *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ? PG_GETARG_TEXT_P(3) : NULL;
+	Jsonb	   *out;
+	JsonTransformStringValuesAction action = (JsonTransformStringValuesAction) headline_json_value;
+	HeadlineParsedText prs;
+	HeadlineJsonState *state = palloc0(sizeof(HeadlineJsonState));
+
+	memset(&prs, 0, sizeof(HeadlineParsedText));
+	prs.lenwords = 32;
+	prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords);
+
+	state->prs = &prs;
+	state->cfg = lookup_ts_config_cache(tsconfig);
+	state->prsobj = lookup_ts_parser_cache(state->cfg->prsId);
+	state->query = query;
+	if (opt)
+		state->prsoptions = deserialize_deflist(PointerGetDatum(opt));
+	else
+		state->prsoptions = NIL;
+
+	if (!OidIsValid(state->prsobj->headlineOid))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("text search parser does not support headline creation")));
+
+	/* headline_json_value is invoked once per string value in the jsonb */
+	out = transform_jsonb_string_values(jb, state, action);
+
+	PG_FREE_IF_COPY(jb, 1);
+	PG_FREE_IF_COPY(query, 2);
+	if (opt)
+		PG_FREE_IF_COPY(opt, 3);
+
+	pfree(prs.words);
+
+	/* startsel/stopsel are only allocated if at least one value was parsed */
+	if (state->transformed)
+	{
+		pfree(prs.startsel);
+		pfree(prs.stopsel);
+	}
+
+	PG_RETURN_JSONB_P(out);
+}
+
+/* jsonb ts_headline using the session's default configuration */
+Datum
+ts_headline_jsonb(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_jsonb_byid_opt,
+										ObjectIdGetDatum(getTSCurrentConfig(true)),
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1)));
+}
+
+/* jsonb ts_headline with explicit config OID, no options */
+Datum
+ts_headline_jsonb_byid(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_jsonb_byid_opt,
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1),
+										PG_GETARG_DATUM(2)));
+}
+
+/* jsonb ts_headline using the default configuration, with options */
+Datum
+ts_headline_jsonb_opt(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_jsonb_byid_opt,
+										ObjectIdGetDatum(getTSCurrentConfig(true)),
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1),
+										PG_GETARG_DATUM(2)));
+}
+
+/* Workhorse for json ts_headline: highlights every string value in json */
+Datum
+ts_headline_json_byid_opt(PG_FUNCTION_ARGS)
+{
+	Oid			tsconfig = PG_GETARG_OID(0);
+	text	   *json = PG_GETARG_TEXT_P(1);
+	TSQuery		query = PG_GETARG_TSQUERY(2);
+	text	   *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ? PG_GETARG_TEXT_P(3) : NULL;
+	text	   *out;
+	JsonTransformStringValuesAction action = (JsonTransformStringValuesAction) headline_json_value;
+
+	HeadlineParsedText prs;
+	HeadlineJsonState *state = palloc0(sizeof(HeadlineJsonState));
+
+	memset(&prs, 0, sizeof(HeadlineParsedText));
+	prs.lenwords = 32;
+	prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords);
+
+	state->prs = &prs;
+	state->cfg = lookup_ts_config_cache(tsconfig);
+	state->prsobj = lookup_ts_parser_cache(state->cfg->prsId);
+	state->query = query;
+	if (opt)
+		state->prsoptions = deserialize_deflist(PointerGetDatum(opt));
+	else
+		state->prsoptions = NIL;
+
+	if (!OidIsValid(state->prsobj->headlineOid))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("text search parser does not support headline creation")));
+
+	/* headline_json_value is invoked once per string value in the json */
+	out = transform_json_string_values(json, state, action);
+
+	PG_FREE_IF_COPY(json, 1);
+	PG_FREE_IF_COPY(query, 2);
+	if (opt)
+		PG_FREE_IF_COPY(opt, 3);
+	pfree(prs.words);
+
+	/* startsel/stopsel are only allocated if at least one value was parsed */
+	if (state->transformed)
+	{
+		pfree(prs.startsel);
+		pfree(prs.stopsel);
+	}
+
+	PG_RETURN_TEXT_P(out);
+}
+
+/* json ts_headline using the session's default configuration */
+Datum
+ts_headline_json(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_json_byid_opt,
+										ObjectIdGetDatum(getTSCurrentConfig(true)),
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1)));
+}
+
+/* json ts_headline with explicit config OID, no options */
+Datum
+ts_headline_json_byid(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_json_byid_opt,
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1),
+										PG_GETARG_DATUM(2)));
+}
+
+/* json ts_headline using the default configuration, with options */
+Datum
+ts_headline_json_opt(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_json_byid_opt,
+										ObjectIdGetDatum(getTSCurrentConfig(true)),
+										PG_GETARG_DATUM(0),
+										PG_GETARG_DATUM(1),
+										PG_GETARG_DATUM(2)));
+}
+
+
+/*
+ * Return headline in text form, generated from a json(b) element
+ */
+static text *
+headline_json_value(void *_state, char *elem_value, int elem_len)
+{
+	HeadlineJsonState *state = (HeadlineJsonState *) _state;
+
+	HeadlineParsedText *prs = state->prs;
+	TSConfigCacheEntry *cfg = state->cfg;
+	TSParserCacheEntry *prsobj = state->prsobj;
+	TSQuery		query = state->query;
+	List	   *prsoptions = state->prsoptions;
+
+	/* Reset the shared parse buffer; it is reused across string values */
+	prs->curwords = 0;
+	hlparsetext(cfg->cfgId, prs, query, elem_value, elem_len);
+	FunctionCall3(&(prsobj->prsheadline),
+				  PointerGetDatum(prs),
+				  PointerGetDatum(prsoptions),
+				  PointerGetDatum(query));
+
+	/* Tell the caller that startsel/stopsel were allocated and need freeing */
+	state->transformed = true;
+	return generateHeadline(prs);
+}
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
new file mode 100644
index 0000000..559dff6
--- /dev/null
+++ b/src/backend/tsearch/wparser_def.c
@@ -0,0 +1,2634 @@
+/*-------------------------------------------------------------------------
+ *
+ * wparser_def.c
+ * Default text search parser
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/wparser_def.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "catalog/pg_collation.h"
+#include "commands/defrem.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+#include "tsearch/ts_type.h"
+#include "tsearch/ts_utils.h"
+#include "utils/builtins.h"
+
+
+/* Define me to enable tracing of parser behavior */
+/* #define WPARSER_TRACE */
+
+
+/* Output token categories */
+
+#define ASCIIWORD 1
+#define WORD_T 2
+#define NUMWORD 3
+#define EMAIL 4
+#define URL_T 5
+#define HOST 6
+#define SCIENTIFIC 7
+#define VERSIONNUMBER 8
+#define NUMPARTHWORD 9
+#define PARTHWORD 10
+#define ASCIIPARTHWORD 11
+#define SPACE 12
+#define TAG_T 13
+#define PROTOCOL 14
+#define NUMHWORD 15
+#define ASCIIHWORD 16
+#define HWORD 17
+#define URLPATH 18
+#define FILEPATH 19
+#define DECIMAL_T 20
+#define SIGNEDINT 21
+#define UNSIGNEDINT 22
+#define XMLENTITY 23
+
+#define LASTNUM 23
+
+static const char *const tok_alias[] = {
+ "",
+ "asciiword",
+ "word",
+ "numword",
+ "email",
+ "url",
+ "host",
+ "sfloat",
+ "version",
+ "hword_numpart",
+ "hword_part",
+ "hword_asciipart",
+ "blank",
+ "tag",
+ "protocol",
+ "numhword",
+ "asciihword",
+ "hword",
+ "url_path",
+ "file",
+ "float",
+ "int",
+ "uint",
+ "entity"
+};
+
+static const char *const lex_descr[] = {
+ "",
+ "Word, all ASCII",
+ "Word, all letters",
+ "Word, letters and digits",
+ "Email address",
+ "URL",
+ "Host",
+ "Scientific notation",
+ "Version number",
+ "Hyphenated word part, letters and digits",
+ "Hyphenated word part, all letters",
+ "Hyphenated word part, all ASCII",
+ "Space symbols",
+ "XML tag",
+ "Protocol head",
+ "Hyphenated word, letters and digits",
+ "Hyphenated word, all ASCII",
+ "Hyphenated word, all letters",
+ "URL path",
+ "File or path name",
+ "Decimal notation",
+ "Signed integer",
+ "Unsigned integer",
+ "XML entity"
+};
+
+
+/* Parser states */
+
+typedef enum
+{
+ TPS_Base = 0,
+ TPS_InNumWord,
+ TPS_InAsciiWord,
+ TPS_InWord,
+ TPS_InUnsignedInt,
+ TPS_InSignedIntFirst,
+ TPS_InSignedInt,
+ TPS_InSpace,
+ TPS_InUDecimalFirst,
+ TPS_InUDecimal,
+ TPS_InDecimalFirst,
+ TPS_InDecimal,
+ TPS_InVerVersion,
+ TPS_InSVerVersion,
+ TPS_InVersionFirst,
+ TPS_InVersion,
+ TPS_InMantissaFirst,
+ TPS_InMantissaSign,
+ TPS_InMantissa,
+ TPS_InXMLEntityFirst,
+ TPS_InXMLEntity,
+ TPS_InXMLEntityNumFirst,
+ TPS_InXMLEntityNum,
+ TPS_InXMLEntityHexNumFirst,
+ TPS_InXMLEntityHexNum,
+ TPS_InXMLEntityEnd,
+ TPS_InTagFirst,
+ TPS_InXMLBegin,
+ TPS_InTagCloseFirst,
+ TPS_InTagName,
+ TPS_InTagBeginEnd,
+ TPS_InTag,
+ TPS_InTagEscapeK,
+ TPS_InTagEscapeKK,
+ TPS_InTagBackSleshed,
+ TPS_InTagEnd,
+ TPS_InCommentFirst,
+ TPS_InCommentLast,
+ TPS_InComment,
+ TPS_InCloseCommentFirst,
+ TPS_InCloseCommentLast,
+ TPS_InCommentEnd,
+ TPS_InHostFirstDomain,
+ TPS_InHostDomainSecond,
+ TPS_InHostDomain,
+ TPS_InPortFirst,
+ TPS_InPort,
+ TPS_InHostFirstAN,
+ TPS_InHost,
+ TPS_InEmail,
+ TPS_InFileFirst,
+ TPS_InFileTwiddle,
+ TPS_InPathFirst,
+ TPS_InPathFirstFirst,
+ TPS_InPathSecond,
+ TPS_InFile,
+ TPS_InFileNext,
+ TPS_InURLPathFirst,
+ TPS_InURLPathStart,
+ TPS_InURLPath,
+ TPS_InFURL,
+ TPS_InProtocolFirst,
+ TPS_InProtocolSecond,
+ TPS_InProtocolEnd,
+ TPS_InHyphenAsciiWordFirst,
+ TPS_InHyphenAsciiWord,
+ TPS_InHyphenWordFirst,
+ TPS_InHyphenWord,
+ TPS_InHyphenNumWordFirst,
+ TPS_InHyphenNumWord,
+ TPS_InHyphenDigitLookahead,
+ TPS_InParseHyphen,
+ TPS_InParseHyphenHyphen,
+ TPS_InHyphenWordPart,
+ TPS_InHyphenAsciiWordPart,
+ TPS_InHyphenNumWordPart,
+ TPS_InHyphenUnsignedInt,
+ TPS_Null /* last state (fake value) */
+} TParserState;
+
+/* forward declaration */
+struct TParser;
+
+typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
+ * except p_iseq */
+typedef void (*TParserSpecial) (struct TParser *); /* special handler for
+ * special cases... */
+
+typedef struct
+{
+ TParserCharTest isclass;
+ char c;
+ uint16 flags;
+ TParserState tostate;
+ int type;
+ TParserSpecial special;
+} TParserStateActionItem;
+
+/* Flag bits in TParserStateActionItem.flags */
+#define A_NEXT 0x0000
+#define A_BINGO 0x0001
+#define A_POP 0x0002
+#define A_PUSH 0x0004
+#define A_RERUN 0x0008
+#define A_CLEAR 0x0010
+#define A_MERGE 0x0020
+#define A_CLRALL 0x0040
+
+typedef struct TParserPosition
+{
+ int posbyte; /* position of parser in bytes */
+ int poschar; /* position of parser in characters */
+ int charlen; /* length of current char */
+ int lenbytetoken; /* length of token-so-far in bytes */
+ int lenchartoken; /* and in chars */
+ TParserState state;
+ struct TParserPosition *prev;
+ const TParserStateActionItem *pushedAtAction;
+} TParserPosition;
+
+typedef struct TParser
+{
+ /* string and position information */
+ char *str; /* multibyte string */
+ int lenstr; /* length of mbstring */
+ wchar_t *wstr; /* wide character string */
+ pg_wchar *pgwstr; /* wide character string for C-locale */
+ bool usewide;
+
+ /* State of parse */
+ int charmaxlen;
+ TParserPosition *state;
+ bool ignore;
+ bool wanthost;
+
+ /* silly char */
+ char c;
+
+ /* out */
+ char *token;
+ int lenbytetoken;
+ int lenchartoken;
+ int type;
+} TParser;
+
+
+/* forward decls here */
+static bool TParserGet(TParser *prs);
+
+
+static TParserPosition *
+newTParserPosition(TParserPosition *prev)
+{
+ TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
+
+ if (prev)
+ memcpy(res, prev, sizeof(TParserPosition));
+ else
+ memset(res, 0, sizeof(TParserPosition));
+
+ res->prev = prev;
+
+ res->pushedAtAction = NULL;
+
+ return res;
+}
+
+static TParser *
+TParserInit(char *str, int len)
+{
+ TParser *prs = (TParser *) palloc0(sizeof(TParser));
+
+ prs->charmaxlen = pg_database_encoding_max_length();
+ prs->str = str;
+ prs->lenstr = len;
+
+ /*
+ * Use wide char code only when max encoding length > 1.
+ */
+ if (prs->charmaxlen > 1)
+ {
+ Oid collation = DEFAULT_COLLATION_OID; /* TODO */
+ pg_locale_t mylocale = 0; /* TODO */
+
+ prs->usewide = true;
+ if (lc_ctype_is_c(collation))
+ {
+ /*
+ * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
+ * be different from sizeof(wchar_t)
+ */
+ prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
+ pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
+ }
+ else
+ {
+ prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
+ char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
+ mylocale);
+ }
+ }
+ else
+ prs->usewide = false;
+
+ prs->state = newTParserPosition(NULL);
+ prs->state->state = TPS_Base;
+
+#ifdef WPARSER_TRACE
+ fprintf(stderr, "parsing \"%.*s\"\n", len, str);
+#endif
+
+ return prs;
+}
+
+/*
+ * As an alternative to a full TParserInit one can create a
+ * TParserCopy which basically is a regular TParser without a private
+ * copy of the string - instead it uses the one from another TParser.
+ * This is useful because at some places TParsers are created
+ * recursively and the repeated copying around of the strings can
+ * cause major inefficiency if the source string is long.
+ * The new parser starts parsing at the original's current position.
+ *
+ * Obviously one must not close the original TParser before the copy.
+ */
+static TParser *
+TParserCopyInit(const TParser *orig)
+{
+ TParser *prs = (TParser *) palloc0(sizeof(TParser));
+
+ prs->charmaxlen = orig->charmaxlen;
+ prs->str = orig->str + orig->state->posbyte;
+ prs->lenstr = orig->lenstr - orig->state->posbyte;
+ prs->usewide = orig->usewide;
+
+ if (orig->pgwstr)
+ prs->pgwstr = orig->pgwstr + orig->state->poschar;
+ if (orig->wstr)
+ prs->wstr = orig->wstr + orig->state->poschar;
+
+ prs->state = newTParserPosition(NULL);
+ prs->state->state = TPS_Base;
+
+#ifdef WPARSER_TRACE
+ fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
+#endif
+
+ return prs;
+}
+
+
+static void
+TParserClose(TParser *prs)
+{
+ while (prs->state)
+ {
+ TParserPosition *ptr = prs->state->prev;
+
+ pfree(prs->state);
+ prs->state = ptr;
+ }
+
+ if (prs->wstr)
+ pfree(prs->wstr);
+ if (prs->pgwstr)
+ pfree(prs->pgwstr);
+
+#ifdef WPARSER_TRACE
+ fprintf(stderr, "closing parser\n");
+#endif
+ pfree(prs);
+}
+
+/*
+ * Close a parser created with TParserCopyInit
+ */
+static void
+TParserCopyClose(TParser *prs)
+{
+ while (prs->state)
+ {
+ TParserPosition *ptr = prs->state->prev;
+
+ pfree(prs->state);
+ prs->state = ptr;
+ }
+
+#ifdef WPARSER_TRACE
+ fprintf(stderr, "closing parser copy\n");
+#endif
+ pfree(prs);
+}
+
+
+/*
+ * Character-type support functions, equivalent to is* macros, but
+ * working with any possible encodings and locales. Notes:
+ * - with a multibyte encoding and C locale, the isw* functions may fail
+ * or give wrong results.
+ * - multibyte encoding and C-locale often are used for
+ * Asian languages.
+ * - if locale is C then we use pgwstr instead of wstr.
+ */
+
+#define p_iswhat(type, nonascii) \
+ \
+static int \
+p_is##type(TParser *prs) \
+{ \
+ Assert(prs->state); \
+ if (prs->usewide) \
+ { \
+ if (prs->pgwstr) \
+ { \
+ unsigned int c = *(prs->pgwstr + prs->state->poschar); \
+ if (c > 0x7f) \
+ return nonascii; \
+ return is##type(c); \
+ } \
+ return isw##type(*(prs->wstr + prs->state->poschar)); \
+ } \
+ return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
+} \
+ \
+static int \
+p_isnot##type(TParser *prs) \
+{ \
+ return !p_is##type(prs); \
+}
+
+/*
+ * In C locale with a multibyte encoding, any non-ASCII symbol is considered
+ * an alpha character, but not a member of other char classes.
+ */
+p_iswhat(alnum, 1)
+p_iswhat(alpha, 1)
+p_iswhat(digit, 0)
+p_iswhat(lower, 0)
+p_iswhat(print, 0)
+p_iswhat(punct, 0)
+p_iswhat(space, 0)
+p_iswhat(upper, 0)
+p_iswhat(xdigit, 0)
+
+/* p_iseq should be used only for ascii symbols */
+
+static int
+p_iseq(TParser *prs, char c)
+{
+ Assert(prs->state);
+ return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
+}
+
+static int
+p_isEOF(TParser *prs)
+{
+ Assert(prs->state);
+ return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
+}
+
+static int
+p_iseqC(TParser *prs)
+{
+ return p_iseq(prs, prs->c);
+}
+
+static int
+p_isneC(TParser *prs)
+{
+ return !p_iseq(prs, prs->c);
+}
+
+static int
+p_isascii(TParser *prs)
+{
+ return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
+}
+
+static int
+p_isasclet(TParser *prs)
+{
+ return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
+}
+
+static int
+p_isurlchar(TParser *prs)
+{
+ char ch;
+
+ /* no non-ASCII need apply */
+ if (prs->state->charlen != 1)
+ return 0;
+ ch = *(prs->str + prs->state->posbyte);
+ /* no spaces or control characters */
+ if (ch <= 0x20 || ch >= 0x7F)
+ return 0;
+ /* reject characters disallowed by RFC 3986 */
+ switch (ch)
+ {
+ case '"':
+ case '<':
+ case '>':
+ case '\\':
+ case '^':
+ case '`':
+ case '{':
+ case '|':
+ case '}':
+ return 0;
+ }
+ return 1;
+}
+
+
+/* deliberately suppress unused-function complaints for the above */
+void _make_compiler_happy(void);
+void
+_make_compiler_happy(void)
+{
+ p_isalnum(NULL);
+ p_isnotalnum(NULL);
+ p_isalpha(NULL);
+ p_isnotalpha(NULL);
+ p_isdigit(NULL);
+ p_isnotdigit(NULL);
+ p_islower(NULL);
+ p_isnotlower(NULL);
+ p_isprint(NULL);
+ p_isnotprint(NULL);
+ p_ispunct(NULL);
+ p_isnotpunct(NULL);
+ p_isspace(NULL);
+ p_isnotspace(NULL);
+ p_isupper(NULL);
+ p_isnotupper(NULL);
+ p_isxdigit(NULL);
+ p_isnotxdigit(NULL);
+ p_isEOF(NULL);
+ p_iseqC(NULL);
+ p_isneC(NULL);
+}
+
+
+static void
+SpecialTags(TParser *prs)
+{
+ switch (prs->state->lenchartoken)
+ {
+ case 8: /* </script */
+ if (pg_strncasecmp(prs->token, "</script", 8) == 0)
+ prs->ignore = false;
+ break;
+ case 7: /* <script || </style */
+ if (pg_strncasecmp(prs->token, "</style", 7) == 0)
+ prs->ignore = false;
+ else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
+ prs->ignore = true;
+ break;
+ case 6: /* <style */
+ if (pg_strncasecmp(prs->token, "<style", 6) == 0)
+ prs->ignore = true;
+ break;
+ default:
+ break;
+ }
+}
+
+static void
+SpecialFURL(TParser *prs)
+{
+ prs->wanthost = true;
+ prs->state->posbyte -= prs->state->lenbytetoken;
+ prs->state->poschar -= prs->state->lenchartoken;
+}
+
+static void
+SpecialHyphen(TParser *prs)
+{
+ prs->state->posbyte -= prs->state->lenbytetoken;
+ prs->state->poschar -= prs->state->lenchartoken;
+}
+
+static void
+SpecialVerVersion(TParser *prs)
+{
+ prs->state->posbyte -= prs->state->lenbytetoken;
+ prs->state->poschar -= prs->state->lenchartoken;
+ prs->state->lenbytetoken = 0;
+ prs->state->lenchartoken = 0;
+}
+
+static int
+p_isstophost(TParser *prs)
+{
+ if (prs->wanthost)
+ {
+ prs->wanthost = false;
+ return 1;
+ }
+ return 0;
+}
+
+static int
+p_isignore(TParser *prs)
+{
+ return (prs->ignore) ? 1 : 0;
+}
+
+static int
+p_ishost(TParser *prs)
+{
+ TParser *tmpprs = TParserCopyInit(prs);
+ int res = 0;
+
+ tmpprs->wanthost = true;
+
+ if (TParserGet(tmpprs) && tmpprs->type == HOST)
+ {
+ prs->state->posbyte += tmpprs->lenbytetoken;
+ prs->state->poschar += tmpprs->lenchartoken;
+ prs->state->lenbytetoken += tmpprs->lenbytetoken;
+ prs->state->lenchartoken += tmpprs->lenchartoken;
+ prs->state->charlen = tmpprs->state->charlen;
+ res = 1;
+ }
+ TParserCopyClose(tmpprs);
+
+ return res;
+}
+
+static int
+p_isURLPath(TParser *prs)
+{
+ TParser *tmpprs = TParserCopyInit(prs);
+ int res = 0;
+
+ tmpprs->state = newTParserPosition(tmpprs->state);
+ tmpprs->state->state = TPS_InURLPathFirst;
+
+ if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
+ {
+ prs->state->posbyte += tmpprs->lenbytetoken;
+ prs->state->poschar += tmpprs->lenchartoken;
+ prs->state->lenbytetoken += tmpprs->lenbytetoken;
+ prs->state->lenchartoken += tmpprs->lenchartoken;
+ prs->state->charlen = tmpprs->state->charlen;
+ res = 1;
+ }
+ TParserCopyClose(tmpprs);
+
+ return res;
+}
+
+/*
+ * Returns true if the current character has zero display length or
+ * is a special sign used in several languages. Such characters are
+ * not word-breakers, even though they are not alphabetic either.
+ * At the beginning of a word they are not considered part of it.
+ */
+static int
+p_isspecial(TParser *prs)
+{
+ /*
+ * pg_dsplen could return -1 which means error or control character
+ */
+ if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
+ return 1;
+
+ /*
+	 * Unicode characters in the 'Mark, Spacing Combining' category: such
+	 * characters are not alpha, yet they do not act as word breakers.
+	 * Check this only in the UTF-8 encoding, because other encodings that
+	 * could contain them aren't supported by postgres.
+ */
+ if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
+ {
+ static const pg_wchar strange_letter[] = {
+ /*
+ * use binary search, so elements should be ordered
+ */
+ 0x0903, /* DEVANAGARI SIGN VISARGA */
+ 0x093E, /* DEVANAGARI VOWEL SIGN AA */
+ 0x093F, /* DEVANAGARI VOWEL SIGN I */
+ 0x0940, /* DEVANAGARI VOWEL SIGN II */
+ 0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
+ 0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
+ 0x094B, /* DEVANAGARI VOWEL SIGN O */
+ 0x094C, /* DEVANAGARI VOWEL SIGN AU */
+ 0x0982, /* BENGALI SIGN ANUSVARA */
+ 0x0983, /* BENGALI SIGN VISARGA */
+ 0x09BE, /* BENGALI VOWEL SIGN AA */
+ 0x09BF, /* BENGALI VOWEL SIGN I */
+ 0x09C0, /* BENGALI VOWEL SIGN II */
+ 0x09C7, /* BENGALI VOWEL SIGN E */
+ 0x09C8, /* BENGALI VOWEL SIGN AI */
+ 0x09CB, /* BENGALI VOWEL SIGN O */
+ 0x09CC, /* BENGALI VOWEL SIGN AU */
+ 0x09D7, /* BENGALI AU LENGTH MARK */
+ 0x0A03, /* GURMUKHI SIGN VISARGA */
+ 0x0A3E, /* GURMUKHI VOWEL SIGN AA */
+ 0x0A3F, /* GURMUKHI VOWEL SIGN I */
+ 0x0A40, /* GURMUKHI VOWEL SIGN II */
+ 0x0A83, /* GUJARATI SIGN VISARGA */
+ 0x0ABE, /* GUJARATI VOWEL SIGN AA */
+ 0x0ABF, /* GUJARATI VOWEL SIGN I */
+ 0x0AC0, /* GUJARATI VOWEL SIGN II */
+ 0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
+ 0x0ACB, /* GUJARATI VOWEL SIGN O */
+ 0x0ACC, /* GUJARATI VOWEL SIGN AU */
+ 0x0B02, /* ORIYA SIGN ANUSVARA */
+ 0x0B03, /* ORIYA SIGN VISARGA */
+ 0x0B3E, /* ORIYA VOWEL SIGN AA */
+ 0x0B40, /* ORIYA VOWEL SIGN II */
+ 0x0B47, /* ORIYA VOWEL SIGN E */
+ 0x0B48, /* ORIYA VOWEL SIGN AI */
+ 0x0B4B, /* ORIYA VOWEL SIGN O */
+ 0x0B4C, /* ORIYA VOWEL SIGN AU */
+ 0x0B57, /* ORIYA AU LENGTH MARK */
+ 0x0BBE, /* TAMIL VOWEL SIGN AA */
+ 0x0BBF, /* TAMIL VOWEL SIGN I */
+ 0x0BC1, /* TAMIL VOWEL SIGN U */
+ 0x0BC2, /* TAMIL VOWEL SIGN UU */
+ 0x0BC6, /* TAMIL VOWEL SIGN E */
+ 0x0BC7, /* TAMIL VOWEL SIGN EE */
+ 0x0BC8, /* TAMIL VOWEL SIGN AI */
+ 0x0BCA, /* TAMIL VOWEL SIGN O */
+ 0x0BCB, /* TAMIL VOWEL SIGN OO */
+ 0x0BCC, /* TAMIL VOWEL SIGN AU */
+ 0x0BD7, /* TAMIL AU LENGTH MARK */
+ 0x0C01, /* TELUGU SIGN CANDRABINDU */
+ 0x0C02, /* TELUGU SIGN ANUSVARA */
+ 0x0C03, /* TELUGU SIGN VISARGA */
+ 0x0C41, /* TELUGU VOWEL SIGN U */
+ 0x0C42, /* TELUGU VOWEL SIGN UU */
+ 0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
+ 0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
+ 0x0C82, /* KANNADA SIGN ANUSVARA */
+ 0x0C83, /* KANNADA SIGN VISARGA */
+ 0x0CBE, /* KANNADA VOWEL SIGN AA */
+ 0x0CC0, /* KANNADA VOWEL SIGN II */
+ 0x0CC1, /* KANNADA VOWEL SIGN U */
+ 0x0CC2, /* KANNADA VOWEL SIGN UU */
+ 0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
+ 0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
+ 0x0CC7, /* KANNADA VOWEL SIGN EE */
+ 0x0CC8, /* KANNADA VOWEL SIGN AI */
+ 0x0CCA, /* KANNADA VOWEL SIGN O */
+ 0x0CCB, /* KANNADA VOWEL SIGN OO */
+ 0x0CD5, /* KANNADA LENGTH MARK */
+ 0x0CD6, /* KANNADA AI LENGTH MARK */
+ 0x0D02, /* MALAYALAM SIGN ANUSVARA */
+ 0x0D03, /* MALAYALAM SIGN VISARGA */
+ 0x0D3E, /* MALAYALAM VOWEL SIGN AA */
+ 0x0D3F, /* MALAYALAM VOWEL SIGN I */
+ 0x0D40, /* MALAYALAM VOWEL SIGN II */
+ 0x0D46, /* MALAYALAM VOWEL SIGN E */
+ 0x0D47, /* MALAYALAM VOWEL SIGN EE */
+ 0x0D48, /* MALAYALAM VOWEL SIGN AI */
+ 0x0D4A, /* MALAYALAM VOWEL SIGN O */
+ 0x0D4B, /* MALAYALAM VOWEL SIGN OO */
+ 0x0D4C, /* MALAYALAM VOWEL SIGN AU */
+ 0x0D57, /* MALAYALAM AU LENGTH MARK */
+ 0x0D82, /* SINHALA SIGN ANUSVARAYA */
+ 0x0D83, /* SINHALA SIGN VISARGAYA */
+ 0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
+ 0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
+ 0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
+ 0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
+ 0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
+ 0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
+ 0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
+ 0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
+ 0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
+ * AELA-PILLA */
+ 0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
+ 0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
+ 0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
+ 0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
+ 0x0F3E, /* TIBETAN SIGN YAR TSHES */
+ 0x0F3F, /* TIBETAN SIGN MAR TSHES */
+ 0x0F7F, /* TIBETAN SIGN RNAM BCAD */
+ 0x102B, /* MYANMAR VOWEL SIGN TALL AA */
+ 0x102C, /* MYANMAR VOWEL SIGN AA */
+ 0x1031, /* MYANMAR VOWEL SIGN E */
+ 0x1038, /* MYANMAR SIGN VISARGA */
+ 0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
+ 0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
+ 0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
+ 0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
+ 0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
+ 0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
+ 0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
+ 0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
+ 0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
+ 0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
+ 0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
+ 0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
+ 0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
+ 0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
+ 0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
+ 0x1084, /* MYANMAR VOWEL SIGN SHAN E */
+ 0x1087, /* MYANMAR SIGN SHAN TONE-2 */
+ 0x1088, /* MYANMAR SIGN SHAN TONE-3 */
+ 0x1089, /* MYANMAR SIGN SHAN TONE-5 */
+ 0x108A, /* MYANMAR SIGN SHAN TONE-6 */
+ 0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
+ 0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
+ 0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
+ 0x17B6, /* KHMER VOWEL SIGN AA */
+ 0x17BE, /* KHMER VOWEL SIGN OE */
+ 0x17BF, /* KHMER VOWEL SIGN YA */
+ 0x17C0, /* KHMER VOWEL SIGN IE */
+ 0x17C1, /* KHMER VOWEL SIGN E */
+ 0x17C2, /* KHMER VOWEL SIGN AE */
+ 0x17C3, /* KHMER VOWEL SIGN AI */
+ 0x17C4, /* KHMER VOWEL SIGN OO */
+ 0x17C5, /* KHMER VOWEL SIGN AU */
+ 0x17C7, /* KHMER SIGN REAHMUK */
+ 0x17C8, /* KHMER SIGN YUUKALEAPINTU */
+ 0x1923, /* LIMBU VOWEL SIGN EE */
+ 0x1924, /* LIMBU VOWEL SIGN AI */
+ 0x1925, /* LIMBU VOWEL SIGN OO */
+ 0x1926, /* LIMBU VOWEL SIGN AU */
+ 0x1929, /* LIMBU SUBJOINED LETTER YA */
+ 0x192A, /* LIMBU SUBJOINED LETTER RA */
+ 0x192B, /* LIMBU SUBJOINED LETTER WA */
+ 0x1930, /* LIMBU SMALL LETTER KA */
+ 0x1931, /* LIMBU SMALL LETTER NGA */
+ 0x1933, /* LIMBU SMALL LETTER TA */
+ 0x1934, /* LIMBU SMALL LETTER NA */
+ 0x1935, /* LIMBU SMALL LETTER PA */
+ 0x1936, /* LIMBU SMALL LETTER MA */
+ 0x1937, /* LIMBU SMALL LETTER RA */
+ 0x1938, /* LIMBU SMALL LETTER LA */
+ 0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
+ 0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
+ 0x19B2, /* NEW TAI LUE VOWEL SIGN II */
+ 0x19B3, /* NEW TAI LUE VOWEL SIGN U */
+ 0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
+ 0x19B5, /* NEW TAI LUE VOWEL SIGN E */
+ 0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
+ 0x19B7, /* NEW TAI LUE VOWEL SIGN O */
+ 0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
+ 0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
+ 0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
+ 0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
+ 0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
+ 0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
+ 0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
+ 0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
+ 0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
+ 0x19C8, /* NEW TAI LUE TONE MARK-1 */
+ 0x19C9, /* NEW TAI LUE TONE MARK-2 */
+ 0x1A19, /* BUGINESE VOWEL SIGN E */
+ 0x1A1A, /* BUGINESE VOWEL SIGN O */
+ 0x1A1B, /* BUGINESE VOWEL SIGN AE */
+ 0x1B04, /* BALINESE SIGN BISAH */
+ 0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
+ 0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
+ 0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
+ 0x1B3E, /* BALINESE VOWEL SIGN TALING */
+ 0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
+ 0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
+ 0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
+ 0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
+ 0x1B44, /* BALINESE ADEG ADEG */
+ 0x1B82, /* SUNDANESE SIGN PANGWISAD */
+ 0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
+ 0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
+ 0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
+ 0x1BAA, /* SUNDANESE SIGN PAMAAEH */
+ 0x1C24, /* LEPCHA SUBJOINED LETTER YA */
+ 0x1C25, /* LEPCHA SUBJOINED LETTER RA */
+ 0x1C26, /* LEPCHA VOWEL SIGN AA */
+ 0x1C27, /* LEPCHA VOWEL SIGN I */
+ 0x1C28, /* LEPCHA VOWEL SIGN O */
+ 0x1C29, /* LEPCHA VOWEL SIGN OO */
+ 0x1C2A, /* LEPCHA VOWEL SIGN U */
+ 0x1C2B, /* LEPCHA VOWEL SIGN UU */
+ 0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
+ 0x1C35, /* LEPCHA CONSONANT SIGN KANG */
+ 0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
+ 0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
+ 0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
+ 0xA880, /* SAURASHTRA SIGN ANUSVARA */
+ 0xA881, /* SAURASHTRA SIGN VISARGA */
+ 0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
+ 0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
+ 0xA8B6, /* SAURASHTRA VOWEL SIGN I */
+ 0xA8B7, /* SAURASHTRA VOWEL SIGN II */
+ 0xA8B8, /* SAURASHTRA VOWEL SIGN U */
+ 0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
+ 0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
+ 0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
+ 0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
+ 0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
+ 0xA8BE, /* SAURASHTRA VOWEL SIGN E */
+ 0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
+ 0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
+ 0xA8C1, /* SAURASHTRA VOWEL SIGN O */
+ 0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
+ 0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
+ 0xA952, /* REJANG CONSONANT SIGN H */
+ 0xA953, /* REJANG VIRAMA */
+ 0xAA2F, /* CHAM VOWEL SIGN O */
+ 0xAA30, /* CHAM VOWEL SIGN AI */
+ 0xAA33, /* CHAM CONSONANT SIGN YA */
+ 0xAA34, /* CHAM CONSONANT SIGN RA */
+ 0xAA4D /* CHAM CONSONANT SIGN FINAL H */
+ };
+ const pg_wchar *StopLow = strange_letter,
+ *StopHigh = strange_letter + lengthof(strange_letter),
+ *StopMiddle;
+ pg_wchar c;
+
+ if (prs->pgwstr)
+ c = *(prs->pgwstr + prs->state->poschar);
+ else
+ c = (pg_wchar) *(prs->wstr + prs->state->poschar);
+
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+ if (*StopMiddle == c)
+ return 1;
+ else if (*StopMiddle < c)
+ StopLow = StopMiddle + 1;
+ else
+ StopHigh = StopMiddle;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Table of state/action of parser
+ */
+
+static const TParserStateActionItem actionTPS_Base[] = {
+ {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
+ {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
+ {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
+ {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
+ {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
+ {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
+ {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
+ {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
+ {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
+};
+
+
+static const TParserStateActionItem actionTPS_InNumWord[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
+ {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+ {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+ {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
+ {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
+ {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InAsciiWord[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
+ {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
+ {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
+ {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
+ {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
+ {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
+ {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
+ {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
+ {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InWord[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
+ {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
+ {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
+ {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
+ {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
+ {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
+ {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
+ {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
+ {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+ {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+ {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
+ {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+ {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
+ {NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InSignedInt[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
+ {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
+ {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
+ {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InSpace[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
+ {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
+ {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
+ {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
+ {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
+ {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
+ {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
+ {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
+ {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+ {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
+ {NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InUDecimal[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
+ {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
+ {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
+ {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
+ {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+ {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
+ {NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+/*
+ * Numeric token states: decimal fractions (1.5), dotted version numbers
+ * (1.2.3) and scientific-notation mantissas (1.5e+3).  In every action
+ * array the items are tested in order and the first predicate that
+ * matches wins; the trailing NULL-predicate entry is the default action.
+ */
+static const TParserStateActionItem actionTPS_InDecimal[] = {
+	{p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
+	{p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
+	{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
+	{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
+	{NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
+};
+
+/* just saw '.' after a decimal; a digit here re-runs as a version part */
+static const TParserStateActionItem actionTPS_InVerVersion[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InSVerVersion[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
+	{NULL, 0, A_NEXT, TPS_Null, 0, NULL}
+};
+
+
+static const TParserStateActionItem actionTPS_InVersionFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InVersion[] = {
+	{p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
+	{p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
+	{NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
+};
+
+/* just saw 'e'/'E' after digits; expect digits or a sign then digits */
+static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
+	{p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
+	{p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InMantissaSign[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InMantissa[] = {
+	{p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
+	{NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
+};
+
+/*
+ * XML/HTML entity states: named entities (&name;), decimal character
+ * references (&#123;) and hexadecimal ones (&#xAB; / &#XAB;).
+ */
+static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
+	{p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
+	{p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
+	{p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InXMLEntity[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
+	{p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
+	{p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
+	{p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
+	{p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
+	{p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
+	{p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
+	{p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
+	{p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+/* terminating ';' consumed: emit the whole run as XMLENTITY */
+static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
+	{NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
+};
+
+/*
+ * Tag states: opening/closing XML-or-HTML tags (including <br/> style
+ * self-closing tags, quoted attribute values and backslash escapes
+ * inside them), <?xml ...?> declarations, <!DOCTYPE ...> and
+ * <!-- ... --> comments.  Both tags and comments are emitted as TAG_T.
+ */
+static const TParserStateActionItem actionTPS_InTagFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
+	{p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
+	{p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
+	{p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
+	{p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
+	{p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InXMLBegin[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	/* <?xml ... */
+	/* XXX do we want states for the m and l?  Right now this accepts <?xZ */
+	{p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InTagName[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	/* <br/> case */
+	{p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
+	{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
+	{p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
+	{p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+/* inside a tag body: accept attribute text until '>' or a quote */
+static const TParserStateActionItem actionTPS_InTag[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
+	{p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
+	{p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
+	{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
+	{p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+/* inside a single-quoted attribute value */
+static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
+	{p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
+	{NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
+};
+
+/* inside a double-quoted attribute value */
+static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
+	{p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
+	{NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
+};
+
+/* character after a backslash: merge it back into the quoted value */
+static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{NULL, 0, A_MERGE, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InTagEnd[] = {
+	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InCommentFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
+	/* <!DOCTYPE ...> */
+	{p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
+	{p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InCommentLast[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InComment[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
+	{NULL, 0, A_NEXT, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
+	{NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
+	{p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
+	{NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InCommentEnd[] = {
+	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
+};
+
+/*
+ * Hostname states: dotted domain names, optional :port suffixes and
+ * user@host email addresses.  HOST is emitted when the input stops
+ * looking like a hostname; p_isstophost hands off to URL-path parsing.
+ */
+static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
+	{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
+	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
+	{p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
+	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
+	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InHostDomain[] = {
+	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
+	{p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
+	{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
+	{p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
+	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
+	{p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
+	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
+	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
+	/*
+	 * NOTE(review): this p_isdigit entry is unreachable dead data - the
+	 * p_isdigit entry a few lines above always matches first, since the
+	 * scan in TParserGet takes the first matching predicate.
+	 */
+	{p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
+	{p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
+	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InPortFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InPort[] = {
+	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
+	{p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
+	{p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
+	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
+};
+
+/* char after '-' or '_' in a host label: must be alphanumeric */
+static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
+	{p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InHost[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
+	{p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
+	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
+	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
+	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
+	{p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+/* just saw '@': a following host part makes the whole thing an EMAIL */
+static const TParserStateActionItem actionTPS_InEmail[] = {
+	{p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
+	{p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+/*
+ * File/path states: recognize Unix-style file names and paths such as
+ * /usr/local/foo.bar, ./relative, ../up and ~user forms; emitted as
+ * FILEPATH.
+ */
+static const TParserStateActionItem actionTPS_InFileFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
+	{p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
+	{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
+	{p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+/* just saw '~' (home-directory shorthand) */
+static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
+	{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
+	{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+/* just saw '.' at start of a path component ("." or "..") */
+static const TParserStateActionItem actionTPS_InPathFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
+	{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
+	{p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
+	{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
+	{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+/* just saw ".." */
+static const TParserStateActionItem actionTPS_InPathSecond[] = {
+	{p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
+	{p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
+	/*
+	 * NOTE(review): this second '/' entry is unreachable dead data - the
+	 * '/' entry directly above always matches first (first-match-wins
+	 * scan in TParserGet).
+	 */
+	{p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
+	{p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InFile[] = {
+	{p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
+	{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
+	{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
+	{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
+	{p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
+	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
+	{NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
+};
+
+/* char after '.' inside a file name (extension must start alnum or '_') */
+static const TParserStateActionItem actionTPS_InFileNext[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
+	{p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
+	{p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+/*
+ * URL-path and protocol states: the path part following a host
+ * (URLPATH), full URLs via the SpecialFURL handler (URL_T), and the
+ * "scheme://" prefix (PROTOCOL).
+ */
+static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL},
+};
+
+static const TParserStateActionItem actionTPS_InURLPathStart[] = {
+	{NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InURLPath[] = {
+	{p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
+	{p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
+	{NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
+};
+
+/* host followed by '/': if a URL path follows, emit the lot as URL_T */
+static const TParserStateActionItem actionTPS_InFURL[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+/* expecting the "//" after "scheme:" */
+static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
+	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
+};
+
+/*
+ * Hyphenated-word states.  First a whole hyphenated compound is
+ * collected and emitted (ASCIIHWORD/HWORD/NUMHWORD, via SpecialHyphen);
+ * then TPS_InParseHyphen re-walks the same text to emit each hyphen-
+ * separated part as its own token (ASCIIPARTHWORD/PARTHWORD/
+ * NUMPARTHWORD), with the '-' separators emitted as SPACE.
+ */
+static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
+	{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
+	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
+	{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
+	{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
+	{p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+	{p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
+	{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
+};
+
+static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InHyphenWord[] = {
+	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
+	{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
+	{p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+	{p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
+	{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
+};
+
+static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
+	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
+	{p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+	{p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+	{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
+	{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
+};
+
+/* digits seen: decide between a numeric word and plain digits */
+static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
+	{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+	{p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+/* second pass over a matched compound: emit the individual parts */
+static const TParserStateActionItem actionTPS_InParseHyphen[] = {
+	{p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
+	{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
+	{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
+	{p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
+	{p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
+	{NULL, 0, A_RERUN, TPS_Base, 0, NULL}
+};
+
+/* the '-' between parts is reported as a SPACE token */
+static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
+	{p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
+	{p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
+	{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
+	{p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
+	{NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
+	{p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
+	{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
+	{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
+	{p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
+	{NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
+	{p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
+	{p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
+	{p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
+	{NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
+};
+
+static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
+	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+	{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
+	{p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
+	{p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
+	{NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+
+/*
+ * main table of per-state parser actions
+ *
+ * Each TParserState is mapped to its action array; the redundant "state"
+ * member exists only so TParserGet can Assert that the table's order
+ * matches the enum, and state_name exists only for WPARSER_TRACE output.
+ */
+typedef struct
+{
+	const TParserStateActionItem *action;	/* the actual state info */
+	TParserState state;			/* only for Assert crosscheck */
+#ifdef WPARSER_TRACE
+	const char *state_name;		/* only for debug printout */
+#endif
+} TParserStateAction;
+
+#ifdef WPARSER_TRACE
+#define TPARSERSTATEACTION(state) \
+	{ CppConcat(action,state), state, CppAsString(state) }
+#else
+#define TPARSERSTATEACTION(state) \
+	{ CppConcat(action,state), state }
+#endif
+
+/*
+ * order must be the same as in typedef enum {} TParserState!!
+ *
+ * (Actions[] is indexed directly by TParserState; the Assert in
+ * TParserGet cross-checks each entry's stored state against its index.)
+ */
+
+static const TParserStateAction Actions[] = {
+	TPARSERSTATEACTION(TPS_Base),
+	TPARSERSTATEACTION(TPS_InNumWord),
+	TPARSERSTATEACTION(TPS_InAsciiWord),
+	TPARSERSTATEACTION(TPS_InWord),
+	TPARSERSTATEACTION(TPS_InUnsignedInt),
+	TPARSERSTATEACTION(TPS_InSignedIntFirst),
+	TPARSERSTATEACTION(TPS_InSignedInt),
+	TPARSERSTATEACTION(TPS_InSpace),
+	TPARSERSTATEACTION(TPS_InUDecimalFirst),
+	TPARSERSTATEACTION(TPS_InUDecimal),
+	TPARSERSTATEACTION(TPS_InDecimalFirst),
+	TPARSERSTATEACTION(TPS_InDecimal),
+	TPARSERSTATEACTION(TPS_InVerVersion),
+	TPARSERSTATEACTION(TPS_InSVerVersion),
+	TPARSERSTATEACTION(TPS_InVersionFirst),
+	TPARSERSTATEACTION(TPS_InVersion),
+	TPARSERSTATEACTION(TPS_InMantissaFirst),
+	TPARSERSTATEACTION(TPS_InMantissaSign),
+	TPARSERSTATEACTION(TPS_InMantissa),
+	TPARSERSTATEACTION(TPS_InXMLEntityFirst),
+	TPARSERSTATEACTION(TPS_InXMLEntity),
+	TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
+	TPARSERSTATEACTION(TPS_InXMLEntityNum),
+	TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
+	TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
+	TPARSERSTATEACTION(TPS_InXMLEntityEnd),
+	TPARSERSTATEACTION(TPS_InTagFirst),
+	TPARSERSTATEACTION(TPS_InXMLBegin),
+	TPARSERSTATEACTION(TPS_InTagCloseFirst),
+	TPARSERSTATEACTION(TPS_InTagName),
+	TPARSERSTATEACTION(TPS_InTagBeginEnd),
+	TPARSERSTATEACTION(TPS_InTag),
+	TPARSERSTATEACTION(TPS_InTagEscapeK),
+	TPARSERSTATEACTION(TPS_InTagEscapeKK),
+	TPARSERSTATEACTION(TPS_InTagBackSleshed),
+	TPARSERSTATEACTION(TPS_InTagEnd),
+	TPARSERSTATEACTION(TPS_InCommentFirst),
+	TPARSERSTATEACTION(TPS_InCommentLast),
+	TPARSERSTATEACTION(TPS_InComment),
+	TPARSERSTATEACTION(TPS_InCloseCommentFirst),
+	TPARSERSTATEACTION(TPS_InCloseCommentLast),
+	TPARSERSTATEACTION(TPS_InCommentEnd),
+	TPARSERSTATEACTION(TPS_InHostFirstDomain),
+	TPARSERSTATEACTION(TPS_InHostDomainSecond),
+	TPARSERSTATEACTION(TPS_InHostDomain),
+	TPARSERSTATEACTION(TPS_InPortFirst),
+	TPARSERSTATEACTION(TPS_InPort),
+	TPARSERSTATEACTION(TPS_InHostFirstAN),
+	TPARSERSTATEACTION(TPS_InHost),
+	TPARSERSTATEACTION(TPS_InEmail),
+	TPARSERSTATEACTION(TPS_InFileFirst),
+	TPARSERSTATEACTION(TPS_InFileTwiddle),
+	TPARSERSTATEACTION(TPS_InPathFirst),
+	TPARSERSTATEACTION(TPS_InPathFirstFirst),
+	TPARSERSTATEACTION(TPS_InPathSecond),
+	TPARSERSTATEACTION(TPS_InFile),
+	TPARSERSTATEACTION(TPS_InFileNext),
+	TPARSERSTATEACTION(TPS_InURLPathFirst),
+	TPARSERSTATEACTION(TPS_InURLPathStart),
+	TPARSERSTATEACTION(TPS_InURLPath),
+	TPARSERSTATEACTION(TPS_InFURL),
+	TPARSERSTATEACTION(TPS_InProtocolFirst),
+	TPARSERSTATEACTION(TPS_InProtocolSecond),
+	TPARSERSTATEACTION(TPS_InProtocolEnd),
+	TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
+	TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
+	TPARSERSTATEACTION(TPS_InHyphenWordFirst),
+	TPARSERSTATEACTION(TPS_InHyphenWord),
+	TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
+	TPARSERSTATEACTION(TPS_InHyphenNumWord),
+	TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
+	TPARSERSTATEACTION(TPS_InParseHyphen),
+	TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
+	TPARSERSTATEACTION(TPS_InHyphenWordPart),
+	TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
+	TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
+	TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
+};
+
+
+/*
+ * TParserGet - extract the next token from prs->str
+ *
+ * Drives the state machine defined by Actions[], keeping a stack of
+ * TParserPosition entries in prs->state so tentative matches can be
+ * pushed, popped, cleared or merged.  For each character the current
+ * state's action items are scanned in order and the first item whose
+ * predicate matches is applied.  On a match (A_BINGO) prs->token,
+ * prs->lenbytetoken, prs->lenchartoken and prs->type describe the token
+ * and true is returned; false means the input is exhausted.
+ */
+static bool
+TParserGet(TParser *prs)
+{
+	const TParserStateActionItem *item = NULL;
+
+	Assert(prs->state);
+
+	if (prs->state->posbyte >= prs->lenstr)
+		return false;
+
+	prs->token = prs->str + prs->state->posbyte;
+	prs->state->pushedAtAction = NULL;
+
+	/* look at string */
+	while (prs->state->posbyte <= prs->lenstr)
+	{
+		/* charlen == 0 marks the artificial EOF position */
+		if (prs->state->posbyte == prs->lenstr)
+			prs->state->charlen = 0;
+		else
+			prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
+				pg_mblen(prs->str + prs->state->posbyte);
+
+		Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
+		Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
+		Assert(Actions[prs->state->state].state == prs->state->state);
+
+		if (prs->state->pushedAtAction)
+		{
+			/* After a POP, pick up at the next test */
+			item = prs->state->pushedAtAction + 1;
+			prs->state->pushedAtAction = NULL;
+		}
+		else
+		{
+			item = Actions[prs->state->state].action;
+			Assert(item != NULL);
+		}
+
+		/* find action by character class (first match wins) */
+		while (item->isclass)
+		{
+			prs->c = item->c;
+			if (item->isclass(prs) != 0)
+				break;
+			item++;
+		}
+
+#ifdef WPARSER_TRACE
+		{
+			TParserPosition *ptr;
+
+			fprintf(stderr, "state ");
+			/* indent according to stack depth */
+			for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
+				fprintf(stderr, "  ");
+			fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
+			if (prs->state->posbyte < prs->lenstr)
+				fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
+			else
+				fprintf(stderr, "at EOF");
+			fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
+					(int) (item - Actions[prs->state->state].action),
+					(item->flags & A_BINGO) ? " BINGO" : "",
+					(item->flags & A_POP) ? " POP" : "",
+					(item->flags & A_PUSH) ? " PUSH" : "",
+					(item->flags & A_RERUN) ? " RERUN" : "",
+					(item->flags & A_CLEAR) ? " CLEAR" : "",
+					(item->flags & A_MERGE) ? " MERGE" : "",
+					(item->flags & A_CLRALL) ? " CLRALL" : "",
+					(item->tostate != TPS_Null) ? " tostate " : "",
+					(item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
+					(item->type > 0) ? " type " : "",
+					tok_alias[item->type]);
+		}
+#endif
+
+		/* call special handler if exists */
+		if (item->special)
+			item->special(prs);
+
+		/* BINGO, token is found */
+		if (item->flags & A_BINGO)
+		{
+			Assert(item->type > 0);
+			prs->lenbytetoken = prs->state->lenbytetoken;
+			prs->lenchartoken = prs->state->lenchartoken;
+			prs->state->lenbytetoken = prs->state->lenchartoken = 0;
+			prs->type = item->type;
+		}
+
+		/* do various actions by flags */
+		if (item->flags & A_POP)
+		{						/* pop stored state in stack */
+			TParserPosition *ptr = prs->state->prev;
+
+			pfree(prs->state);
+			prs->state = ptr;
+			Assert(prs->state);
+		}
+		else if (item->flags & A_PUSH)
+		{						/* push (store) state in stack */
+			prs->state->pushedAtAction = item;	/* remember where we push */
+			prs->state = newTParserPosition(prs->state);
+		}
+		else if (item->flags & A_CLEAR)
+		{						/* clear previous pushed state */
+			TParserPosition *ptr;
+
+			Assert(prs->state->prev);
+			ptr = prs->state->prev->prev;
+			pfree(prs->state->prev);
+			prs->state->prev = ptr;
+		}
+		else if (item->flags & A_CLRALL)
+		{						/* clear all previous pushed state */
+			TParserPosition *ptr;
+
+			while (prs->state->prev)
+			{
+				ptr = prs->state->prev->prev;
+				pfree(prs->state->prev);
+				prs->state->prev = ptr;
+			}
+		}
+		else if (item->flags & A_MERGE)
+		{						/* merge posinfo with current and pushed state */
+			TParserPosition *ptr = prs->state;
+
+			Assert(prs->state->prev);
+			prs->state = prs->state->prev;
+
+			prs->state->posbyte = ptr->posbyte;
+			prs->state->poschar = ptr->poschar;
+			prs->state->charlen = ptr->charlen;
+			prs->state->lenbytetoken = ptr->lenbytetoken;
+			prs->state->lenchartoken = ptr->lenchartoken;
+			pfree(ptr);
+		}
+
+		/* set new state if pointed */
+		if (item->tostate != TPS_Null)
+			prs->state->state = item->tostate;
+
+		/* check for go away */
+		if ((item->flags & A_BINGO) ||
+			(prs->state->posbyte >= prs->lenstr &&
+			 (item->flags & A_RERUN) == 0))
+			break;
+
+		/* go to beginning of loop if we should rerun or we just restore state */
+		if (item->flags & (A_RERUN | A_POP))
+			continue;
+
+		/* move forward */
+		if (prs->state->charlen)
+		{
+			prs->state->posbyte += prs->state->charlen;
+			prs->state->lenbytetoken += prs->state->charlen;
+			prs->state->poschar++;
+			prs->state->lenchartoken++;
+		}
+	}
+
+	return (item && (item->flags & A_BINGO)) ? true : false;
+}
+
+/*
+ * prsd_lextype - SQL-callable: describe the token types this parser
+ * can return.  Returns a palloc'd LexDescr array for lexids 1..LASTNUM,
+ * terminated by an entry with lexid 0.
+ */
+Datum
+prsd_lextype(PG_FUNCTION_ARGS)
+{
+	LexDescr   *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
+	int			i;
+
+	for (i = 1; i <= LASTNUM; i++)
+	{
+		descr[i - 1].lexid = i;
+		descr[i - 1].alias = pstrdup(tok_alias[i]);
+		descr[i - 1].descr = pstrdup(lex_descr[i]);
+	}
+
+	/* terminator entry */
+	descr[LASTNUM].lexid = 0;
+
+	PG_RETURN_POINTER(descr);
+}
+
+/*
+ * prsd_start - SQL-callable: set up parsing of the given string
+ * (pointer + byte length) and return the opaque TParser handle.
+ */
+Datum
+prsd_start(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
+}
+
+/*
+ * prsd_nexttoken - SQL-callable: return the next token.
+ *
+ * Sets *t to the token start within the input string and *tlen to its
+ * byte length, and returns the token's type id; returns 0 when no more
+ * tokens remain.
+ */
+Datum
+prsd_nexttoken(PG_FUNCTION_ARGS)
+{
+	TParser    *p = (TParser *) PG_GETARG_POINTER(0);
+	char	  **t = (char **) PG_GETARG_POINTER(1);
+	int		   *tlen = (int *) PG_GETARG_POINTER(2);
+
+	if (!TParserGet(p))
+		PG_RETURN_INT32(0);
+
+	*t = p->token;
+	*tlen = p->lenbytetoken;
+
+	PG_RETURN_INT32(p->type);
+}
+
+/*
+ * prsd_end - SQL-callable: release the TParser created by prsd_start.
+ */
+Datum
+prsd_end(PG_FUNCTION_ARGS)
+{
+	TParser    *p = (TParser *) PG_GETARG_POINTER(0);
+
+	TParserClose(p);
+	PG_RETURN_VOID();
+}
+
+
+/*
+ * ts_headline support begins here
+ */
+
+/* token type classification macros */
+#define LEAVETOKEN(x)	( (x)==SPACE )
+#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
+#define ENDPUNCTOKEN(x) ( (x)==SPACE )
+
+#define TS_IDIGNORE(x)	( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
+#define HLIDREPLACE(x)	( (x)==TAG_T )
+#define HLIDSKIP(x)		( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
+#define XMLHLIDSKIP(x)	( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
+#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
+#define NOENDTOKEN(x)	( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
+
+/*
+ * Macros useful in headline selection.  These rely on availability of
+ * "HeadlineParsedText *prs" describing some text, and "int shortword"
+ * describing the "short word" length parameter.
+ */
+
+/* Interesting words are non-repeated search terms */
+#define INTERESTINGWORD(j) \
+	(prs->words[j].item && !prs->words[j].repeated)
+
+/* Don't want to end at a non-word or a short word, unless interesting */
+#define BADENDPOINT(j) \
+	((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
+	 !INTERESTINGWORD(j))
+
+typedef struct
+{
+	/* one cover (well, really one fragment) for mark_hl_fragments */
+	int32		startpos;		/* fragment's starting word index */
+	int32		endpos;			/* ending word index (inclusive) */
+	int32		poslen;			/* number of interesting words */
+	int32		curlen;			/* total number of words */
+	bool		chosen;			/* chosen? */
+	bool		excluded;		/* excluded? */
+} CoverPos;
+
+typedef struct
+{
+	/* callback data for checkcondition_HL */
+	HeadlineWordEntry *words;	/* array of headline words to scan */
+	int			len;			/* number of entries in words[] */
+} hlCheck;
+
+
+/*
+ * TS_execute callback for matching a tsquery operand to headline words
+ *
+ * Returns TS_YES if the operand "val" occurs among checkval->words,
+ * else TS_NO.  When "data" is supplied, also collects the (ascending)
+ * word positions of the matches into data->pos for phrase matching.
+ */
+static TSTernaryValue
+checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
+{
+	hlCheck    *checkval = (hlCheck *) opaque;
+	int			i;
+
+	/* scan words array for matching items */
+	for (i = 0; i < checkval->len; i++)
+	{
+		if (checkval->words[i].item == val)
+		{
+			/* if data == NULL, don't need to report positions */
+			if (!data)
+				return TS_YES;
+
+			if (!data->pos)
+			{
+				/* first match: allocate the position array (worst case: all match) */
+				data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
+				data->allocated = true;
+				data->npos = 1;
+				data->pos[0] = checkval->words[i].pos;
+			}
+			else if (data->pos[data->npos - 1] < checkval->words[i].pos)
+			{
+				/* append only strictly increasing positions (skip duplicates) */
+				data->pos[data->npos++] = checkval->words[i].pos;
+			}
+		}
+	}
+
+	if (data && data->npos > 0)
+		return TS_YES;
+
+	return TS_NO;
+}
+
+/*
+ * hlFirstIndex: find first index >= pos containing any word used in query
+ *
+ * A word is "used in the query" when its item pointer is non-NULL.
+ * Returns -1 if no such index
+ */
+static int
+hlFirstIndex(HeadlineParsedText *prs, int pos)
+{
+	int			i;
+
+	for (i = pos; i < prs->curwords; i++)
+	{
+		if (prs->words[i].item != NULL)
+			return i;
+	}
+	return -1;
+}
+
+/*
+ * hlCover: try to find a substring of prs' word list that satisfies query
+ *
+ * At entry, *p must be the first word index to consider (initialize this
+ * to zero, or to the next index after a previous successful search).
+ * We will consider all substrings starting at or after that word, and
+ * containing no more than max_cover words.  (We need a length limit to
+ * keep this from taking O(N^2) time for a long document with many query
+ * words but few complete matches.  Actually, since checkcondition_HL is
+ * roughly O(N) in the length of the substring being checked, it's even
+ * worse than that.)
+ *
+ * On success, sets *p to first word index and *q to last word index of the
+ * cover substring, and returns true.
+ *
+ * The result is a minimal cover, in the sense that both *p and *q will be
+ * words used in the query.
+ */
+static bool
+hlCover(HeadlineParsedText *prs, TSQuery query, int max_cover,
+		int *p, int *q)
+{
+	int			pmin,
+				pmax,
+				nextpmin,
+				nextpmax;
+	hlCheck		ch;				/* callback data for checkcondition_HL */
+
+	/*
+	 * We look for the earliest, shortest substring of prs->words that
+	 * satisfies the query.  Both the pmin and pmax indices must be words
+	 * appearing in the query; there's no point in trying endpoints in between
+	 * such points.
+	 */
+	pmin = hlFirstIndex(prs, *p);
+	while (pmin >= 0)
+	{
+		/* This useless assignment just keeps stupider compilers quiet */
+		nextpmin = -1;
+		/* Consider substrings starting at pmin */
+		ch.words = &(prs->words[pmin]);
+		/* Consider the length-one substring first, then longer substrings */
+		pmax = pmin;
+		do
+		{
+			/* Try to match query against pmin .. pmax substring */
+			ch.len = pmax - pmin + 1;
+			if (TS_execute(GETQUERY(query), &ch,
+						   TS_EXEC_EMPTY, checkcondition_HL))
+			{
+				*p = pmin;
+				*q = pmax;
+				return true;
+			}
+			/* Nope, so advance pmax to next feasible endpoint */
+			nextpmax = hlFirstIndex(prs, pmax + 1);
+
+			/*
+			 * If this is our first advance past pmin, then the result is also
+			 * the next feasible value of pmin; remember it to save a
+			 * redundant search.
+			 */
+			if (pmax == pmin)
+				nextpmin = nextpmax;
+			pmax = nextpmax;
+		}
+		while (pmax >= 0 && pmax - pmin < max_cover);
+		/* No luck here, so try next feasible startpoint */
+		pmin = nextpmin;
+	}
+	return false;
+}
+
+/*
+ * Apply suitable highlight marking to words selected by headline selector
+ *
+ * The words from startpos to endpos inclusive are marked per highlightall.
+ * NOTE(review): the selected/replace/skip/in flags are presumably consumed
+ * by the headline output generator elsewhere in this file — verify there.
+ */
+static void
+mark_fragment(HeadlineParsedText *prs, bool highlightall,
+			  int startpos, int endpos)
+{
+	int			i;
+
+	for (i = startpos; i <= endpos; i++)
+	{
+		/* words matched to the query get highlighted */
+		if (prs->words[i].item)
+			prs->words[i].selected = 1;
+		if (!highlightall)
+		{
+			/* normal mode: some token types are replaced or skipped */
+			if (HLIDREPLACE(prs->words[i].type))
+				prs->words[i].replace = 1;
+			else if (HLIDSKIP(prs->words[i].type))
+				prs->words[i].skip = 1;
+		}
+		else
+		{
+			/* whole-document mode: a smaller set of types is skipped */
+			if (XMLHLIDSKIP(prs->words[i].type))
+				prs->words[i].skip = 1;
+		}
+
+		/* include the word in output unless it's a repeated token */
+		prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
+	}
+}
+
+/*
+ * split a cover substring into fragments not longer than max_words
+ *
+ * At entry, *startpos and *endpos are the (remaining) bounds of the cover
+ * substring.  They are updated to hold the bounds of the next fragment.
+ *
+ * *curlen and *poslen are set to the fragment's length, in words and
+ * interesting words respectively.
+ */
+static void
+get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
+				  int *curlen, int *poslen, int max_words)
+{
+	int			i;
+
+	/*
+	 * Objective: select a fragment of words between startpos and endpos such
+	 * that it has at most max_words and both ends have query words. If the
+	 * startpos and endpos are the endpoints of the cover and the cover has
+	 * fewer words than max_words, then this function should just return the
+	 * cover
+	 */
+	/* first move startpos to an item */
+	for (i = *startpos; i <= *endpos; i++)
+	{
+		*startpos = i;
+		if (INTERESTINGWORD(i))
+			break;
+	}
+	/* cut endpos to have only max_words */
+	*curlen = 0;
+	*poslen = 0;
+	/* count real words and query words until the word budget is spent */
+	for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
+	{
+		if (!NONWORDTOKEN(prs->words[i].type))
+			*curlen += 1;
+		if (INTERESTINGWORD(i))
+			*poslen += 1;
+	}
+	/* if the cover was cut then move back endpos to a query item */
+	/* (note: i is now one past the last counted word) */
+	if (*endpos > i)
+	{
+		*endpos = i;
+		/* walk backwards until the fragment again ends on a query word */
+		for (i = *endpos; i >= *startpos; i--)
+		{
+			*endpos = i;
+			if (INTERESTINGWORD(i))
+				break;
+			if (!NONWORDTOKEN(prs->words[i].type))
+				*curlen -= 1;
+		}
+	}
+}
+
+/*
+ * Headline selector used when MaxFragments > 0
+ *
+ * Collects all query covers, splits them into fragments of at most
+ * max_words, then greedily picks up to max_fragments non-overlapping
+ * fragments (preferring more query words, then fewer total words),
+ * stretching each chosen fragment toward max_words before marking it.
+ *
+ * Note: in this mode, highlightall is disregarded for phrase selection;
+ * it only controls presentation details.
+ */
+static void
+mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
+				  int shortword, int min_words,
+				  int max_words, int max_fragments, int max_cover)
+{
+	int32		poslen,
+				curlen,
+				i,
+				f,
+				num_f = 0;
+	int32		stretch,
+				maxstretch,
+				posmarker;
+
+	int32		startpos = 0,
+				endpos = 0,
+				p = 0,
+				q = 0;
+
+	int32		numcovers = 0,
+				maxcovers = 32;		/* initial allocation; doubled as needed */
+
+	int32		minI,
+				minwords,
+				maxitems;
+	CoverPos   *covers;
+
+	covers = palloc(maxcovers * sizeof(CoverPos));
+
+	/* get all covers */
+	while (hlCover(prs, query, max_cover, &p, &q))
+	{
+		startpos = p;
+		endpos = q;
+
+		/*
+		 * Break the cover into smaller fragments such that each fragment has
+		 * at most max_words. Also ensure that each end of each fragment is a
+		 * query word. This will allow us to stretch the fragment in either
+		 * direction
+		 */
+
+		while (startpos <= endpos)
+		{
+			get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
+			/* grow the covers array if it's full */
+			if (numcovers >= maxcovers)
+			{
+				maxcovers *= 2;
+				covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
+			}
+			covers[numcovers].startpos = startpos;
+			covers[numcovers].endpos = endpos;
+			covers[numcovers].curlen = curlen;
+			covers[numcovers].poslen = poslen;
+			covers[numcovers].chosen = false;
+			covers[numcovers].excluded = false;
+			numcovers++;
+			/* continue fragmenting the rest of this cover */
+			startpos = endpos + 1;
+			endpos = q;
+		}
+
+		/* move p to generate the next cover */
+		p++;
+	}
+
+	/* choose best covers */
+	for (f = 0; f < max_fragments; f++)
+	{
+		maxitems = 0;
+		minwords = PG_INT32_MAX;
+		minI = -1;
+
+		/*
+		 * Choose the cover that contains max items. In case of tie choose the
+		 * one with smaller number of words.
+		 */
+		for (i = 0; i < numcovers; i++)
+		{
+			if (!covers[i].chosen && !covers[i].excluded &&
+				(maxitems < covers[i].poslen ||
+				 (maxitems == covers[i].poslen &&
+				  minwords > covers[i].curlen)))
+			{
+				maxitems = covers[i].poslen;
+				minwords = covers[i].curlen;
+				minI = i;
+			}
+		}
+		/* if a cover was found mark it */
+		if (minI >= 0)
+		{
+			covers[minI].chosen = true;
+			/* adjust the size of cover */
+			startpos = covers[minI].startpos;
+			endpos = covers[minI].endpos;
+			curlen = covers[minI].curlen;
+			/* stretch the cover if cover size is lower than max_words */
+			if (curlen < max_words)
+			{
+				/* divide the stretch on both sides of cover */
+				maxstretch = (max_words - curlen) / 2;
+
+				/*
+				 * first stretch the startpos stop stretching if 1. we hit the
+				 * beginning of document 2. exceed maxstretch 3. we hit an
+				 * already marked fragment
+				 */
+				stretch = 0;
+				posmarker = startpos;
+				for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
+				{
+					if (!NONWORDTOKEN(prs->words[i].type))
+					{
+						curlen++;
+						stretch++;
+					}
+					posmarker = i;
+				}
+				/* cut back startpos till we find a good endpoint */
+				for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
+				{
+					if (!NONWORDTOKEN(prs->words[i].type))
+						curlen--;
+				}
+				startpos = i;
+				/* now stretch the endpos as much as possible */
+				posmarker = endpos;
+				for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
+				{
+					if (!NONWORDTOKEN(prs->words[i].type))
+						curlen++;
+					posmarker = i;
+				}
+				/* cut back endpos till we find a good endpoint */
+				for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
+				{
+					if (!NONWORDTOKEN(prs->words[i].type))
+						curlen--;
+				}
+				endpos = i;
+			}
+			covers[minI].startpos = startpos;
+			covers[minI].endpos = endpos;
+			covers[minI].curlen = curlen;
+			/* Mark the chosen fragments (covers) */
+			mark_fragment(prs, highlightall, startpos, endpos);
+			num_f++;
+			/* Exclude covers overlapping this one from future consideration */
+			for (i = 0; i < numcovers; i++)
+			{
+				if (i != minI &&
+					((covers[i].startpos >= startpos &&
+					  covers[i].startpos <= endpos) ||
+					 (covers[i].endpos >= startpos &&
+					  covers[i].endpos <= endpos) ||
+					 (covers[i].startpos < startpos &&
+					  covers[i].endpos > endpos)))
+					covers[i].excluded = true;
+			}
+		}
+		else
+			break;				/* no selectable covers remain */
+	}
+
+	/* show the first min_words words if we have not marked anything */
+	if (num_f <= 0)
+	{
+		startpos = endpos = curlen = 0;
+		for (i = 0; i < prs->curwords && curlen < min_words; i++)
+		{
+			if (!NONWORDTOKEN(prs->words[i].type))
+				curlen++;
+			endpos = i;
+		}
+		mark_fragment(prs, highlightall, startpos, endpos);
+	}
+
+	pfree(covers);
+}
+
+/*
+ * Headline selector used when MaxFragments == 0
+ *
+ * Examines every query cover and selects the single best headline window
+ * of between min_words and max_words words, then marks it.  If
+ * highlightall is set, the whole document becomes the headline instead.
+ */
+static void
+mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
+			  int shortword, int min_words, int max_words, int max_cover)
+{
+	int			p = 0,
+				q = 0;
+	/* best headline found so far; bestlen = -1 means none yet */
+	int			bestb = -1,
+				beste = -1;
+	int			bestlen = -1;
+	bool		bestcover = false;
+	/* candidate headline under consideration */
+	int			pose,
+				posb,
+				poslen,
+				curlen;
+	bool		poscover;
+	int			i;
+
+	if (!highlightall)
+	{
+		/* examine all covers, select a headline using the best one */
+		while (hlCover(prs, query, max_cover, &p, &q))
+		{
+			/*
+			 * Count words (curlen) and interesting words (poslen) within
+			 * cover, but stop once we reach max_words. This step doesn't
+			 * consider whether that's a good stopping point. posb and pose
+			 * are set to the start and end indexes of the possible headline.
+			 */
+			curlen = 0;
+			poslen = 0;
+			posb = pose = p;
+			for (i = p; i <= q && curlen < max_words; i++)
+			{
+				if (!NONWORDTOKEN(prs->words[i].type))
+					curlen++;
+				if (INTERESTINGWORD(i))
+					poslen++;
+				pose = i;
+			}
+
+			if (curlen < max_words)
+			{
+				/*
+				 * We have room to lengthen the headline, so search forward
+				 * until it's full or we find a good stopping point. We'll
+				 * reconsider the word at "q", then move forward.
+				 */
+				for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
+				{
+					/* words beyond the cover add to the counts */
+					if (i > q)
+					{
+						if (!NONWORDTOKEN(prs->words[i].type))
+							curlen++;
+						if (INTERESTINGWORD(i))
+							poslen++;
+					}
+					pose = i;
+					if (BADENDPOINT(i))
+						continue;
+					if (curlen >= min_words)
+						break;
+				}
+				if (curlen < min_words)
+				{
+					/*
+					 * Reached end of text and our headline is still shorter
+					 * than min_words, so try to extend it to the left.
+					 */
+					for (i = p - 1; i >= 0; i--)
+					{
+						if (!NONWORDTOKEN(prs->words[i].type))
+							curlen++;
+						if (INTERESTINGWORD(i))
+							poslen++;
+						if (curlen >= max_words)
+							break;
+						if (BADENDPOINT(i))
+							continue;
+						if (curlen >= min_words)
+							break;
+					}
+					posb = (i >= 0) ? i : 0;
+				}
+			}
+			else
+			{
+				/*
+				 * Can't make headline longer, so consider making it shorter
+				 * if needed to avoid a bad endpoint.
+				 */
+				if (i > q)
+					i = q;
+				for (; curlen > min_words; i--)
+				{
+					if (!BADENDPOINT(i))
+						break;
+					if (!NONWORDTOKEN(prs->words[i].type))
+						curlen--;
+					if (INTERESTINGWORD(i))
+						poslen--;
+					pose = i - 1;
+				}
+			}
+
+			/*
+			 * Check whether the proposed headline includes the original
+			 * cover; it might not if we trimmed it due to max_words.
+			 */
+			poscover = (posb <= p && pose >= q);
+
+			/*
+			 * Adopt this headline if it's better than the last one, giving
+			 * highest priority to headlines including the cover, then to
+			 * headlines with more interesting words, then to headlines with
+			 * good stopping points.  (Since bestlen is initially -1, we will
+			 * certainly adopt the first headline.)
+			 */
+			if (poscover > bestcover ||
+				(poscover == bestcover && poslen > bestlen) ||
+				(poscover == bestcover && poslen == bestlen &&
+				 !BADENDPOINT(pose) && BADENDPOINT(beste)))
+			{
+				bestb = posb;
+				beste = pose;
+				bestlen = poslen;
+				bestcover = poscover;
+			}
+
+			/* move p to generate the next cover */
+			p++;
+		}
+
+		/*
+		 * If we found nothing acceptable, select min_words words starting at
+		 * the beginning.
+		 */
+		if (bestlen < 0)
+		{
+			curlen = 0;
+			pose = 0;
+			for (i = 0; i < prs->curwords && curlen < min_words; i++)
+			{
+				if (!NONWORDTOKEN(prs->words[i].type))
+					curlen++;
+				pose = i;
+			}
+			bestb = 0;
+			beste = pose;
+		}
+	}
+	else
+	{
+		/* highlightall mode: headline is whole document */
+		bestb = 0;
+		beste = prs->curwords - 1;
+	}
+
+	/* mark the winning window for output */
+	mark_fragment(prs, highlightall, bestb, beste);
+}
+
+/*
+ * Default parser's prsheadline function
+ *
+ * Parses the user-supplied option list, validates it, and dispatches to
+ * mark_hl_words (MaxFragments == 0) or mark_hl_fragments (> 0) to mark
+ * which words of the parsed text form the headline.  Also fills in the
+ * start/stop/delimiter strings and their lengths for the caller.
+ */
+Datum
+prsd_headline(PG_FUNCTION_ARGS)
+{
+	HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
+	List	   *prsoptions = (List *) PG_GETARG_POINTER(1);
+	TSQuery		query = PG_GETARG_TSQUERY(2);
+
+	/* default option values: */
+	int			min_words = 15;
+	int			max_words = 35;
+	int			shortword = 3;
+	int			max_fragments = 0;
+	bool		highlightall = false;
+	int			max_cover;
+	ListCell   *l;
+
+	/* Extract configuration option values */
+	prs->startsel = NULL;
+	prs->stopsel = NULL;
+	prs->fragdelim = NULL;
+	foreach(l, prsoptions)
+	{
+		DefElem    *defel = (DefElem *) lfirst(l);
+		char	   *val = defGetString(defel);
+
+		/* option names are matched case-insensitively */
+		if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
+			max_words = pg_strtoint32(val);
+		else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
+			min_words = pg_strtoint32(val);
+		else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
+			shortword = pg_strtoint32(val);
+		else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
+			max_fragments = pg_strtoint32(val);
+		else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
+			prs->startsel = pstrdup(val);
+		else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
+			prs->stopsel = pstrdup(val);
+		else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
+			prs->fragdelim = pstrdup(val);
+		else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
+			/* accept the usual boolean spellings */
+			highlightall = (pg_strcasecmp(val, "1") == 0 ||
+							pg_strcasecmp(val, "on") == 0 ||
+							pg_strcasecmp(val, "true") == 0 ||
+							pg_strcasecmp(val, "t") == 0 ||
+							pg_strcasecmp(val, "y") == 0 ||
+							pg_strcasecmp(val, "yes") == 0);
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized headline parameter: \"%s\"",
+							defel->defname)));
+	}
+
+	/*
+	 * We might eventually make max_cover a user-settable parameter, but for
+	 * now, just compute a reasonable value based on max_words and
+	 * max_fragments.
+	 */
+	max_cover = Max(max_words * 10, 100);
+	if (max_fragments > 0)
+		max_cover *= max_fragments;
+
+	/* in HighlightAll mode these parameters are ignored */
+	if (!highlightall)
+	{
+		if (min_words >= max_words)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("MinWords should be less than MaxWords")));
+		if (min_words <= 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("MinWords should be positive")));
+		if (shortword < 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("ShortWord should be >= 0")));
+		if (max_fragments < 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("MaxFragments should be >= 0")));
+	}
+
+	/* Apply appropriate headline selector */
+	if (max_fragments == 0)
+		mark_hl_words(prs, query, highlightall, shortword,
+					  min_words, max_words, max_cover);
+	else
+		mark_hl_fragments(prs, query, highlightall, shortword,
+						  min_words, max_words, max_fragments, max_cover);
+
+	/* Fill in default values for string options */
+	if (!prs->startsel)
+		prs->startsel = pstrdup("<b>");
+	if (!prs->stopsel)
+		prs->stopsel = pstrdup("</b>");
+	if (!prs->fragdelim)
+		prs->fragdelim = pstrdup(" ... ");
+
+	/* Caller will need these lengths, too */
+	prs->startsellen = strlen(prs->startsel);
+	prs->stopsellen = strlen(prs->stopsel);
+	prs->fragdelimlen = strlen(prs->fragdelim);
+
+	PG_RETURN_POINTER(prs);
+}