summaryrefslogtreecommitdiffstats
path: root/src/backend/tsearch/spell.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/tsearch/spell.c')
-rw-r--r--src/backend/tsearch/spell.c2617
1 files changed, 2617 insertions, 0 deletions
diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c
new file mode 100644
index 0000000..ebc8960
--- /dev/null
+++ b/src/backend/tsearch/spell.c
@@ -0,0 +1,2617 @@
+/*-------------------------------------------------------------------------
+ *
+ * spell.c
+ * Normalizing word with ISpell
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ * Ispell dictionary
+ * -----------------
+ *
+ * Rules of dictionaries are defined in two files with .affix and .dict
+ * extensions. They are used by spell checker programs Ispell and Hunspell.
+ *
+ * An .affix file declares morphological rules to get a basic form of words.
+ * The format of an .affix file has different structure for Ispell and Hunspell
+ * dictionaries. The Hunspell format is more complicated. But when an .affix
+ * file is imported and compiled, it is stored in the same structure AffixNode.
+ *
+ * A .dict file stores a list of basic forms of words with references to
+ * affix rules. The format of a .dict file has the same structure for Ispell
+ * and Hunspell dictionaries.
+ *
+ * Compilation of a dictionary
+ * ---------------------------
+ *
+ * A compiled dictionary is stored in the IspellDict structure. Compilation of
+ * a dictionary is divided into the several steps:
+ * - NIImportDictionary() - stores each word of a .dict file in the
+ * temporary Spell field.
+ * - NIImportAffixes() - stores affix rules of an .affix file in the
+ * Affix field (not temporary) if an .affix file has the Ispell format.
+ * -> NIImportOOAffixes() - stores affix rules if an .affix file has the
+ * Hunspell format. The AffixData field is initialized if AF parameter
+ * is defined.
+ * - NISortDictionary() - builds a prefix tree (Trie) from the words list
+ * and stores it in the Dictionary field. The words list is got from the
+ * Spell field. The AffixData field is initialized if AF parameter is not
+ * defined.
+ * - NISortAffixes():
+ * - builds a list of compound affixes from the affix list and stores it
+ * in the CompoundAffix.
+ * - builds prefix trees (Trie) from the affix list for prefixes and suffixes
+ * and stores them in Suffix and Prefix fields.
+ * The affix list is got from the Affix field.
+ *
+ * Memory management
+ * -----------------
+ *
+ * The IspellDict structure has the Spell field which is used only in compile
+ * time. The Spell field stores a words list. It can take a lot of memory.
+ * Therefore when a dictionary is compiled this field is cleared by
+ * NIFinishBuild().
+ *
+ * All resources which should cleared by NIFinishBuild() is initialized using
+ * tmpalloc() and tmpalloc0().
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/spell.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "catalog/pg_collation.h"
+#include "tsearch/dicts/spell.h"
+#include "tsearch/ts_locale.h"
+#include "utils/memutils.h"
+
+
+/*
+ * Initialization requires a lot of memory that's not needed
+ * after the initialization is done. During initialization,
+ * CurrentMemoryContext is the long-lived memory context associated
+ * with the dictionary cache entry. We keep the short-lived stuff
+ * in the Conf->buildCxt context.
+ */
+#define tmpalloc(sz) MemoryContextAlloc(Conf->buildCxt, (sz))
+#define tmpalloc0(sz) MemoryContextAllocZero(Conf->buildCxt, (sz))
+
+/*
+ * Prepare for constructing an ISpell dictionary.
+ *
+ * The IspellDict struct is assumed to be zeroed when allocated.
+ */
+void
+NIStartBuild(IspellDict *Conf)
+{
+ /*
+ * The temp context is a child of CurTransactionContext, so that it will
+ * go away automatically on error.
+ */
+ Conf->buildCxt = AllocSetContextCreate(CurTransactionContext,
+ "Ispell dictionary init context",
+ ALLOCSET_DEFAULT_SIZES);
+}
+
+/*
+ * Clean up when dictionary construction is complete.
+ */
+void
+NIFinishBuild(IspellDict *Conf)
+{
+ /* Release no-longer-needed temp memory */
+ MemoryContextDelete(Conf->buildCxt);
+ /* Just for cleanliness, zero the now-dangling pointers */
+ Conf->buildCxt = NULL;
+ Conf->Spell = NULL;
+ Conf->firstfree = NULL;
+ Conf->CompoundAffixFlags = NULL;
+}
+
+
+/*
+ * "Compact" palloc: allocate without extra palloc overhead.
+ *
+ * Since we have no need to free the ispell data items individually, there's
+ * not much value in the per-chunk overhead normally consumed by palloc.
+ * Getting rid of it is helpful since ispell can allocate a lot of small nodes.
+ *
+ * We currently pre-zero all data allocated this way, even though some of it
+ * doesn't need that. The cpalloc and cpalloc0 macros are just documentation
+ * to indicate which allocations actually require zeroing.
+ */
+#define COMPACT_ALLOC_CHUNK 8192 /* amount to get from palloc at once */
+#define COMPACT_MAX_REQ 1024 /* must be < COMPACT_ALLOC_CHUNK */
+
+static void *
+compact_palloc0(IspellDict *Conf, size_t size)
+{
+ void *result;
+
+ /* Should only be called during init */
+ Assert(Conf->buildCxt != NULL);
+
+ /* No point in this for large chunks */
+ if (size > COMPACT_MAX_REQ)
+ return palloc0(size);
+
+ /* Keep everything maxaligned */
+ size = MAXALIGN(size);
+
+ /* Need more space? */
+ if (size > Conf->avail)
+ {
+ Conf->firstfree = palloc0(COMPACT_ALLOC_CHUNK);
+ Conf->avail = COMPACT_ALLOC_CHUNK;
+ }
+
+ result = (void *) Conf->firstfree;
+ Conf->firstfree += size;
+ Conf->avail -= size;
+
+ return result;
+}
+
+#define cpalloc(size) compact_palloc0(Conf, size)
+#define cpalloc0(size) compact_palloc0(Conf, size)
+
+static char *
+cpstrdup(IspellDict *Conf, const char *str)
+{
+ char *res = cpalloc(strlen(str) + 1);
+
+ strcpy(res, str);
+ return res;
+}
+
+
+/*
+ * Apply lowerstr(), producing a temporary result (in the buildCxt).
+ */
+static char *
+lowerstr_ctx(IspellDict *Conf, const char *src)
+{
+ MemoryContext saveCtx;
+ char *dst;
+
+ saveCtx = MemoryContextSwitchTo(Conf->buildCxt);
+ dst = lowerstr(src);
+ MemoryContextSwitchTo(saveCtx);
+
+ return dst;
+}
+
+#define MAX_NORM 1024
+#define MAXNORMLEN 256
+
+#define STRNCMP(s,p) strncmp( (s), (p), strlen(p) )
+#define GETWCHAR(W,L,N,T) ( ((const uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
+#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )
+
+static char *VoidString = "";
+
+static int
+cmpspell(const void *s1, const void *s2)
+{
+ return strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word);
+}
+
+static int
+cmpspellaffix(const void *s1, const void *s2)
+{
+ return strcmp((*(SPELL *const *) s1)->p.flag,
+ (*(SPELL *const *) s2)->p.flag);
+}
+
+static int
+cmpcmdflag(const void *f1, const void *f2)
+{
+ CompoundAffixFlag *fv1 = (CompoundAffixFlag *) f1,
+ *fv2 = (CompoundAffixFlag *) f2;
+
+ Assert(fv1->flagMode == fv2->flagMode);
+
+ if (fv1->flagMode == FM_NUM)
+ {
+ if (fv1->flag.i == fv2->flag.i)
+ return 0;
+
+ return (fv1->flag.i > fv2->flag.i) ? 1 : -1;
+ }
+
+ return strcmp(fv1->flag.s, fv2->flag.s);
+}
+
+static char *
+findchar(char *str, int c)
+{
+ while (*str)
+ {
+ if (t_iseq(str, c))
+ return str;
+ str += pg_mblen(str);
+ }
+
+ return NULL;
+}
+
+static char *
+findchar2(char *str, int c1, int c2)
+{
+ while (*str)
+ {
+ if (t_iseq(str, c1) || t_iseq(str, c2))
+ return str;
+ str += pg_mblen(str);
+ }
+
+ return NULL;
+}
+
+
+/* backward string compare for suffix tree operations */
+static int
+strbcmp(const unsigned char *s1, const unsigned char *s2)
+{
+ int l1 = strlen((const char *) s1) - 1,
+ l2 = strlen((const char *) s2) - 1;
+
+ while (l1 >= 0 && l2 >= 0)
+ {
+ if (s1[l1] < s2[l2])
+ return -1;
+ if (s1[l1] > s2[l2])
+ return 1;
+ l1--;
+ l2--;
+ }
+ if (l1 < l2)
+ return -1;
+ if (l1 > l2)
+ return 1;
+
+ return 0;
+}
+
+static int
+strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
+{
+ int l1 = strlen((const char *) s1) - 1,
+ l2 = strlen((const char *) s2) - 1,
+ l = count;
+
+ while (l1 >= 0 && l2 >= 0 && l > 0)
+ {
+ if (s1[l1] < s2[l2])
+ return -1;
+ if (s1[l1] > s2[l2])
+ return 1;
+ l1--;
+ l2--;
+ l--;
+ }
+ if (l == 0)
+ return 0;
+ if (l1 < l2)
+ return -1;
+ if (l1 > l2)
+ return 1;
+ return 0;
+}
+
+/*
+ * Compares affixes.
+ * First compares the type of an affix. Prefixes should go before affixes.
+ * If types are equal then compares replaceable string.
+ */
+static int
+cmpaffix(const void *s1, const void *s2)
+{
+ const AFFIX *a1 = (const AFFIX *) s1;
+ const AFFIX *a2 = (const AFFIX *) s2;
+
+ if (a1->type < a2->type)
+ return -1;
+ if (a1->type > a2->type)
+ return 1;
+ if (a1->type == FF_PREFIX)
+ return strcmp(a1->repl, a2->repl);
+ else
+ return strbcmp((const unsigned char *) a1->repl,
+ (const unsigned char *) a2->repl);
+}
+
+/*
+ * Gets an affix flag from the set of affix flags (sflagset).
+ *
+ * Several flags can be stored in a single string. Flags can be represented by:
+ * - 1 character (FM_CHAR). A character may be Unicode.
+ * - 2 characters (FM_LONG). A character may be Unicode.
+ * - numbers from 1 to 65000 (FM_NUM).
+ *
+ * Depending on the flagMode an affix string can have the following format:
+ * - FM_CHAR: ABCD
+ * Here we have 4 flags: A, B, C and D
+ * - FM_LONG: ABCDE*
+ * Here we have 3 flags: AB, CD and E*
+ * - FM_NUM: 200,205,50
+ * Here we have 3 flags: 200, 205 and 50
+ *
+ * Conf: current dictionary.
+ * sflagset: the set of affix flags. Returns a reference to the start of a next
+ * affix flag.
+ * sflag: returns an affix flag from sflagset.
+ */
+static void
+getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
+{
+ int32 s;
+ char *next,
+ *sbuf = *sflagset;
+ int maxstep;
+ bool stop = false;
+ bool met_comma = false;
+
+ maxstep = (Conf->flagMode == FM_LONG) ? 2 : 1;
+
+ while (**sflagset)
+ {
+ switch (Conf->flagMode)
+ {
+ case FM_LONG:
+ case FM_CHAR:
+ COPYCHAR(sflag, *sflagset);
+ sflag += pg_mblen(*sflagset);
+
+ /* Go to start of the next flag */
+ *sflagset += pg_mblen(*sflagset);
+
+ /* Check if we get all characters of flag */
+ maxstep--;
+ stop = (maxstep == 0);
+ break;
+ case FM_NUM:
+ s = strtol(*sflagset, &next, 10);
+ if (*sflagset == next || errno == ERANGE)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid affix flag \"%s\"", *sflagset)));
+ if (s < 0 || s > FLAGNUM_MAXSIZE)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("affix flag \"%s\" is out of range",
+ *sflagset)));
+ sflag += sprintf(sflag, "%0d", s);
+
+ /* Go to start of the next flag */
+ *sflagset = next;
+ while (**sflagset)
+ {
+ if (t_isdigit(*sflagset))
+ {
+ if (!met_comma)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid affix flag \"%s\"",
+ *sflagset)));
+ break;
+ }
+ else if (t_iseq(*sflagset, ','))
+ {
+ if (met_comma)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid affix flag \"%s\"",
+ *sflagset)));
+ met_comma = true;
+ }
+ else if (!t_isspace(*sflagset))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid character in affix flag \"%s\"",
+ *sflagset)));
+ }
+
+ *sflagset += pg_mblen(*sflagset);
+ }
+ stop = true;
+ break;
+ default:
+ elog(ERROR, "unrecognized type of Conf->flagMode: %d",
+ Conf->flagMode);
+ }
+
+ if (stop)
+ break;
+ }
+
+ if (Conf->flagMode == FM_LONG && maxstep > 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid affix flag \"%s\" with \"long\" flag value",
+ sbuf)));
+
+ *sflag = '\0';
+}
+
+/*
+ * Checks if the affix set Conf->AffixData[affix] contains affixflag.
+ * Conf->AffixData[affix] does not contain affixflag if this flag is not used
+ * actually by the .dict file.
+ *
+ * Conf: current dictionary.
+ * affix: index of the Conf->AffixData array.
+ * affixflag: the affix flag.
+ *
+ * Returns true if the string Conf->AffixData[affix] contains affixflag,
+ * otherwise returns false.
+ */
+static bool
+IsAffixFlagInUse(IspellDict *Conf, int affix, const char *affixflag)
+{
+ char *flagcur;
+ char flag[BUFSIZ];
+
+ if (*affixflag == 0)
+ return true;
+
+ Assert(affix < Conf->nAffixData);
+
+ flagcur = Conf->AffixData[affix];
+
+ while (*flagcur)
+ {
+ getNextFlagFromString(Conf, &flagcur, flag);
+ /* Compare first affix flag in flagcur with affixflag */
+ if (strcmp(flag, affixflag) == 0)
+ return true;
+ }
+
+ /* Could not find affixflag */
+ return false;
+}
+
+/*
+ * Adds the new word into the temporary array Spell.
+ *
+ * Conf: current dictionary.
+ * word: new word.
+ * flag: set of affix flags. Single flag can be get by getNextFlagFromString().
+ */
+static void
+NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
+{
+ if (Conf->nspell >= Conf->mspell)
+ {
+ if (Conf->mspell)
+ {
+ Conf->mspell *= 2;
+ Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *));
+ }
+ else
+ {
+ Conf->mspell = 1024 * 20;
+ Conf->Spell = (SPELL **) tmpalloc(Conf->mspell * sizeof(SPELL *));
+ }
+ }
+ Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
+ strcpy(Conf->Spell[Conf->nspell]->word, word);
+ Conf->Spell[Conf->nspell]->p.flag = (*flag != '\0')
+ ? cpstrdup(Conf, flag) : VoidString;
+ Conf->nspell++;
+}
+
+/*
+ * Imports dictionary into the temporary array Spell.
+ *
+ * Note caller must already have applied get_tsearch_config_filename.
+ *
+ * Conf: current dictionary.
+ * filename: path to the .dict file.
+ */
+void
+NIImportDictionary(IspellDict *Conf, const char *filename)
+{
+ tsearch_readline_state trst;
+ char *line;
+
+ if (!tsearch_readline_begin(&trst, filename))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("could not open dictionary file \"%s\": %m",
+ filename)));
+
+ while ((line = tsearch_readline(&trst)) != NULL)
+ {
+ char *s,
+ *pstr;
+
+ /* Set of affix flags */
+ const char *flag;
+
+ /* Extract flag from the line */
+ flag = NULL;
+ if ((s = findchar(line, '/')))
+ {
+ *s++ = '\0';
+ flag = s;
+ while (*s)
+ {
+ /* we allow only single encoded flags for faster works */
+ if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
+ s++;
+ else
+ {
+ *s = '\0';
+ break;
+ }
+ }
+ }
+ else
+ flag = "";
+
+ /* Remove trailing spaces */
+ s = line;
+ while (*s)
+ {
+ if (t_isspace(s))
+ {
+ *s = '\0';
+ break;
+ }
+ s += pg_mblen(s);
+ }
+ pstr = lowerstr_ctx(Conf, line);
+
+ NIAddSpell(Conf, pstr, flag);
+ pfree(pstr);
+
+ pfree(line);
+ }
+ tsearch_readline_end(&trst);
+}
+
+/*
+ * Searches a basic form of word in the prefix tree. This word was generated
+ * using an affix rule. This rule may not be presented in an affix set of
+ * a basic form of word.
+ *
+ * For example, we have the entry in the .dict file:
+ * meter/GMD
+ *
+ * The affix rule with the flag S:
+ * SFX S y ies [^aeiou]y
+ * is not presented here.
+ *
+ * The affix rule with the flag M:
+ * SFX M 0 's .
+ * is presented here.
+ *
+ * Conf: current dictionary.
+ * word: basic form of word.
+ * affixflag: affix flag, by which a basic form of word was generated.
+ * flag: compound flag used to compare with StopMiddle->compoundflag.
+ *
+ * Returns 1 if the word was found in the prefix tree, else returns 0.
+ */
+static int
+FindWord(IspellDict *Conf, const char *word, const char *affixflag, int flag)
+{
+ SPNode *node = Conf->Dictionary;
+ SPNodeData *StopLow,
+ *StopHigh,
+ *StopMiddle;
+ const uint8 *ptr = (const uint8 *) word;
+
+ flag &= FF_COMPOUNDFLAGMASK;
+
+ while (node && *ptr)
+ {
+ StopLow = node->data;
+ StopHigh = node->data + node->length;
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+ if (StopMiddle->val == *ptr)
+ {
+ if (*(ptr + 1) == '\0' && StopMiddle->isword)
+ {
+ if (flag == 0)
+ {
+ /*
+ * The word can be formed only with another word. And
+ * in the flag parameter there is not a sign that we
+ * search compound words.
+ */
+ if (StopMiddle->compoundflag & FF_COMPOUNDONLY)
+ return 0;
+ }
+ else if ((flag & StopMiddle->compoundflag) == 0)
+ return 0;
+
+ /*
+ * Check if this affix rule is presented in the affix set
+ * with index StopMiddle->affix.
+ */
+ if (IsAffixFlagInUse(Conf, StopMiddle->affix, affixflag))
+ return 1;
+ }
+ node = StopMiddle->node;
+ ptr++;
+ break;
+ }
+ else if (StopMiddle->val < *ptr)
+ StopLow = StopMiddle + 1;
+ else
+ StopHigh = StopMiddle;
+ }
+ if (StopLow >= StopHigh)
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Context reset/delete callback for a regular expression used in an affix
+ */
+static void
+regex_affix_deletion_callback(void *arg)
+{
+ aff_regex_struct *pregex = (aff_regex_struct *) arg;
+
+ pg_regfree(&(pregex->regex));
+}
+
+/*
+ * Adds a new affix rule to the Affix field.
+ *
+ * Conf: current dictionary.
+ * flag: affix flag ('\' in the below example).
+ * flagflags: set of flags from the flagval field for this affix rule. This set
+ * is listed after '/' character in the added string (repl).
+ *
+ * For example L flag in the hunspell_sample.affix:
+ * SFX \ 0 Y/L [^Y]
+ *
+ * mask: condition for search ('[^Y]' in the above example).
+ * find: stripping characters from beginning (at prefix) or end (at suffix)
+ * of the word ('0' in the above example, 0 means that there is not
+ * stripping character).
+ * repl: adding string after stripping ('Y' in the above example).
+ * type: FF_SUFFIX or FF_PREFIX.
+ */
+static void
+NIAddAffix(IspellDict *Conf, const char *flag, char flagflags, const char *mask,
+ const char *find, const char *repl, int type)
+{
+ AFFIX *Affix;
+
+ if (Conf->naffixes >= Conf->maffixes)
+ {
+ if (Conf->maffixes)
+ {
+ Conf->maffixes *= 2;
+ Conf->Affix = (AFFIX *) repalloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX));
+ }
+ else
+ {
+ Conf->maffixes = 16;
+ Conf->Affix = (AFFIX *) palloc(Conf->maffixes * sizeof(AFFIX));
+ }
+ }
+
+ Affix = Conf->Affix + Conf->naffixes;
+
+ /* This affix rule can be applied for words with any ending */
+ if (strcmp(mask, ".") == 0 || *mask == '\0')
+ {
+ Affix->issimple = 1;
+ Affix->isregis = 0;
+ }
+ /* This affix rule will use regis to search word ending */
+ else if (RS_isRegis(mask))
+ {
+ Affix->issimple = 0;
+ Affix->isregis = 1;
+ RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX),
+ *mask ? mask : VoidString);
+ }
+ /* This affix rule will use regex_t to search word ending */
+ else
+ {
+ int masklen;
+ int wmasklen;
+ int err;
+ pg_wchar *wmask;
+ char *tmask;
+ aff_regex_struct *pregex;
+
+ Affix->issimple = 0;
+ Affix->isregis = 0;
+ tmask = (char *) tmpalloc(strlen(mask) + 3);
+ if (type == FF_SUFFIX)
+ sprintf(tmask, "%s$", mask);
+ else
+ sprintf(tmask, "^%s", mask);
+
+ masklen = strlen(tmask);
+ wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
+ wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
+
+ /*
+ * The regex engine stores its stuff using malloc not palloc, so we
+ * must arrange to explicitly clean up the regex when the dictionary's
+ * context is cleared. That means the regex_t has to stay in a fixed
+ * location within the context; we can't keep it directly in the AFFIX
+ * struct, since we may sort and resize the array of AFFIXes.
+ */
+ Affix->reg.pregex = pregex = palloc(sizeof(aff_regex_struct));
+
+ err = pg_regcomp(&(pregex->regex), wmask, wmasklen,
+ REG_ADVANCED | REG_NOSUB,
+ DEFAULT_COLLATION_OID);
+ if (err)
+ {
+ char errstr[100];
+
+ pg_regerror(err, &(pregex->regex), errstr, sizeof(errstr));
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
+ errmsg("invalid regular expression: %s", errstr)));
+ }
+
+ pregex->mcallback.func = regex_affix_deletion_callback;
+ pregex->mcallback.arg = (void *) pregex;
+ MemoryContextRegisterResetCallback(CurrentMemoryContext,
+ &pregex->mcallback);
+ }
+
+ Affix->flagflags = flagflags;
+ if ((Affix->flagflags & FF_COMPOUNDONLY) || (Affix->flagflags & FF_COMPOUNDPERMITFLAG))
+ {
+ if ((Affix->flagflags & FF_COMPOUNDFLAG) == 0)
+ Affix->flagflags |= FF_COMPOUNDFLAG;
+ }
+ Affix->flag = cpstrdup(Conf, flag);
+ Affix->type = type;
+
+ Affix->find = (find && *find) ? cpstrdup(Conf, find) : VoidString;
+ if ((Affix->replen = strlen(repl)) > 0)
+ Affix->repl = cpstrdup(Conf, repl);
+ else
+ Affix->repl = VoidString;
+ Conf->naffixes++;
+}
+
+/* Parsing states for parse_affentry() and friends */
+#define PAE_WAIT_MASK 0
+#define PAE_INMASK 1
+#define PAE_WAIT_FIND 2
+#define PAE_INFIND 3
+#define PAE_WAIT_REPL 4
+#define PAE_INREPL 5
+#define PAE_WAIT_TYPE 6
+#define PAE_WAIT_FLAG 7
+
+/*
+ * Parse next space-separated field of an .affix file line.
+ *
+ * *str is the input pointer (will be advanced past field)
+ * next is where to copy the field value to, with null termination
+ *
+ * The buffer at "next" must be of size BUFSIZ; we truncate the input to fit.
+ *
+ * Returns true if we found a field, false if not.
+ */
+static bool
+get_nextfield(char **str, char *next)
+{
+ int state = PAE_WAIT_MASK;
+ int avail = BUFSIZ;
+
+ while (**str)
+ {
+ if (state == PAE_WAIT_MASK)
+ {
+ if (t_iseq(*str, '#'))
+ return false;
+ else if (!t_isspace(*str))
+ {
+ int clen = pg_mblen(*str);
+
+ if (clen < avail)
+ {
+ COPYCHAR(next, *str);
+ next += clen;
+ avail -= clen;
+ }
+ state = PAE_INMASK;
+ }
+ }
+ else /* state == PAE_INMASK */
+ {
+ if (t_isspace(*str))
+ {
+ *next = '\0';
+ return true;
+ }
+ else
+ {
+ int clen = pg_mblen(*str);
+
+ if (clen < avail)
+ {
+ COPYCHAR(next, *str);
+ next += clen;
+ avail -= clen;
+ }
+ }
+ }
+ *str += pg_mblen(*str);
+ }
+
+ *next = '\0';
+
+ return (state == PAE_INMASK); /* OK if we got a nonempty field */
+}
+
+/*
+ * Parses entry of an .affix file of MySpell or Hunspell format.
+ *
+ * An .affix file entry has the following format:
+ * - header
+ * <type> <flag> <cross_flag> <flag_count>
+ * - fields after header:
+ * <type> <flag> <find> <replace> <mask>
+ *
+ * str is the input line
+ * field values are returned to type etc, which must be buffers of size BUFSIZ.
+ *
+ * Returns number of fields found; any omitted fields are set to empty strings.
+ */
+static int
+parse_ooaffentry(char *str, char *type, char *flag, char *find,
+ char *repl, char *mask)
+{
+ int state = PAE_WAIT_TYPE;
+ int fields_read = 0;
+ bool valid = false;
+
+ *type = *flag = *find = *repl = *mask = '\0';
+
+ while (*str)
+ {
+ switch (state)
+ {
+ case PAE_WAIT_TYPE:
+ valid = get_nextfield(&str, type);
+ state = PAE_WAIT_FLAG;
+ break;
+ case PAE_WAIT_FLAG:
+ valid = get_nextfield(&str, flag);
+ state = PAE_WAIT_FIND;
+ break;
+ case PAE_WAIT_FIND:
+ valid = get_nextfield(&str, find);
+ state = PAE_WAIT_REPL;
+ break;
+ case PAE_WAIT_REPL:
+ valid = get_nextfield(&str, repl);
+ state = PAE_WAIT_MASK;
+ break;
+ case PAE_WAIT_MASK:
+ valid = get_nextfield(&str, mask);
+ state = -1; /* force loop exit */
+ break;
+ default:
+ elog(ERROR, "unrecognized state in parse_ooaffentry: %d",
+ state);
+ break;
+ }
+ if (valid)
+ fields_read++;
+ else
+ break; /* early EOL */
+ if (state < 0)
+ break; /* got all fields */
+ }
+
+ return fields_read;
+}
+
+/*
+ * Parses entry of an .affix file of Ispell format
+ *
+ * An .affix file entry has the following format:
+ * <mask> > [-<find>,]<replace>
+ */
+static bool
+parse_affentry(char *str, char *mask, char *find, char *repl)
+{
+ int state = PAE_WAIT_MASK;
+ char *pmask = mask,
+ *pfind = find,
+ *prepl = repl;
+
+ *mask = *find = *repl = '\0';
+
+ while (*str)
+ {
+ if (state == PAE_WAIT_MASK)
+ {
+ if (t_iseq(str, '#'))
+ return false;
+ else if (!t_isspace(str))
+ {
+ COPYCHAR(pmask, str);
+ pmask += pg_mblen(str);
+ state = PAE_INMASK;
+ }
+ }
+ else if (state == PAE_INMASK)
+ {
+ if (t_iseq(str, '>'))
+ {
+ *pmask = '\0';
+ state = PAE_WAIT_FIND;
+ }
+ else if (!t_isspace(str))
+ {
+ COPYCHAR(pmask, str);
+ pmask += pg_mblen(str);
+ }
+ }
+ else if (state == PAE_WAIT_FIND)
+ {
+ if (t_iseq(str, '-'))
+ {
+ state = PAE_INFIND;
+ }
+ else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
+ {
+ COPYCHAR(prepl, str);
+ prepl += pg_mblen(str);
+ state = PAE_INREPL;
+ }
+ else if (!t_isspace(str))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("syntax error")));
+ }
+ else if (state == PAE_INFIND)
+ {
+ if (t_iseq(str, ','))
+ {
+ *pfind = '\0';
+ state = PAE_WAIT_REPL;
+ }
+ else if (t_isalpha(str))
+ {
+ COPYCHAR(pfind, str);
+ pfind += pg_mblen(str);
+ }
+ else if (!t_isspace(str))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("syntax error")));
+ }
+ else if (state == PAE_WAIT_REPL)
+ {
+ if (t_iseq(str, '-'))
+ {
+ break; /* void repl */
+ }
+ else if (t_isalpha(str))
+ {
+ COPYCHAR(prepl, str);
+ prepl += pg_mblen(str);
+ state = PAE_INREPL;
+ }
+ else if (!t_isspace(str))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("syntax error")));
+ }
+ else if (state == PAE_INREPL)
+ {
+ if (t_iseq(str, '#'))
+ {
+ *prepl = '\0';
+ break;
+ }
+ else if (t_isalpha(str))
+ {
+ COPYCHAR(prepl, str);
+ prepl += pg_mblen(str);
+ }
+ else if (!t_isspace(str))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("syntax error")));
+ }
+ else
+ elog(ERROR, "unrecognized state in parse_affentry: %d", state);
+
+ str += pg_mblen(str);
+ }
+
+ *pmask = *pfind = *prepl = '\0';
+
+ return (*mask && (*find || *repl));
+}
+
+/*
+ * Sets a Hunspell options depending on flag type.
+ */
+static void
+setCompoundAffixFlagValue(IspellDict *Conf, CompoundAffixFlag *entry,
+ char *s, uint32 val)
+{
+ if (Conf->flagMode == FM_NUM)
+ {
+ char *next;
+ int i;
+
+ i = strtol(s, &next, 10);
+ if (s == next || errno == ERANGE)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid affix flag \"%s\"", s)));
+ if (i < 0 || i > FLAGNUM_MAXSIZE)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("affix flag \"%s\" is out of range", s)));
+
+ entry->flag.i = i;
+ }
+ else
+ entry->flag.s = cpstrdup(Conf, s);
+
+ entry->flagMode = Conf->flagMode;
+ entry->value = val;
+}
+
+/*
+ * Sets up a correspondence for the affix parameter with the affix flag.
+ *
+ * Conf: current dictionary.
+ * s: affix flag in string.
+ * val: affix parameter.
+ */
+static void
+addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
+{
+ CompoundAffixFlag *newValue;
+ char sbuf[BUFSIZ];
+ char *sflag;
+ int clen;
+
+ while (*s && t_isspace(s))
+ s += pg_mblen(s);
+
+ if (!*s)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("syntax error")));
+
+ /* Get flag without \n */
+ sflag = sbuf;
+ while (*s && !t_isspace(s) && *s != '\n')
+ {
+ clen = pg_mblen(s);
+ COPYCHAR(sflag, s);
+ sflag += clen;
+ s += clen;
+ }
+ *sflag = '\0';
+
+ /* Resize array or allocate memory for array CompoundAffixFlag */
+ if (Conf->nCompoundAffixFlag >= Conf->mCompoundAffixFlag)
+ {
+ if (Conf->mCompoundAffixFlag)
+ {
+ Conf->mCompoundAffixFlag *= 2;
+ Conf->CompoundAffixFlags = (CompoundAffixFlag *)
+ repalloc((void *) Conf->CompoundAffixFlags,
+ Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
+ }
+ else
+ {
+ Conf->mCompoundAffixFlag = 10;
+ Conf->CompoundAffixFlags = (CompoundAffixFlag *)
+ tmpalloc(Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
+ }
+ }
+
+ newValue = Conf->CompoundAffixFlags + Conf->nCompoundAffixFlag;
+
+ setCompoundAffixFlagValue(Conf, newValue, sbuf, val);
+
+ Conf->usecompound = true;
+ Conf->nCompoundAffixFlag++;
+}
+
+/*
+ * Returns a set of affix parameters which correspondence to the set of affix
+ * flags s.
+ */
+static int
+getCompoundAffixFlagValue(IspellDict *Conf, char *s)
+{
+ uint32 flag = 0;
+ CompoundAffixFlag *found,
+ key;
+ char sflag[BUFSIZ];
+ char *flagcur;
+
+ if (Conf->nCompoundAffixFlag == 0)
+ return 0;
+
+ flagcur = s;
+ while (*flagcur)
+ {
+ getNextFlagFromString(Conf, &flagcur, sflag);
+ setCompoundAffixFlagValue(Conf, &key, sflag, 0);
+
+ found = (CompoundAffixFlag *)
+ bsearch(&key, (void *) Conf->CompoundAffixFlags,
+ Conf->nCompoundAffixFlag, sizeof(CompoundAffixFlag),
+ cmpcmdflag);
+ if (found != NULL)
+ flag |= found->value;
+ }
+
+ return flag;
+}
+
+/*
+ * Returns a flag set using the s parameter.
+ *
+ * If Conf->useFlagAliases is true then the s parameter is index of the
+ * Conf->AffixData array and function returns its entry.
+ * Else function returns the s parameter.
+ */
+static char *
+getAffixFlagSet(IspellDict *Conf, char *s)
+{
+ if (Conf->useFlagAliases && *s != '\0')
+ {
+ int curaffix;
+ char *end;
+
+ curaffix = strtol(s, &end, 10);
+ if (s == end || errno == ERANGE)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid affix alias \"%s\"", s)));
+
+ if (curaffix > 0 && curaffix < Conf->nAffixData)
+
+ /*
+ * Do not subtract 1 from curaffix because empty string was added
+ * in NIImportOOAffixes
+ */
+ return Conf->AffixData[curaffix];
+ else if (curaffix > Conf->nAffixData)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid affix alias \"%s\"", s)));
+ return VoidString;
+ }
+ else
+ return s;
+}
+
+/*
+ * Import an affix file that follows MySpell or Hunspell format.
+ *
+ * Conf: current dictionary.
+ * filename: path to the .affix file.
+ */
+static void
+NIImportOOAffixes(IspellDict *Conf, const char *filename)
+{
+ char type[BUFSIZ],
+ *ptype = NULL;
+ char sflag[BUFSIZ];
+ char mask[BUFSIZ],
+ *pmask;
+ char find[BUFSIZ],
+ *pfind;
+ char repl[BUFSIZ],
+ *prepl;
+ bool isSuffix = false;
+ int naffix = 0,
+ curaffix = 0;
+ int sflaglen = 0;
+ char flagflags = 0;
+ tsearch_readline_state trst;
+ char *recoded;
+
+ /* read file to find any flag */
+ Conf->usecompound = false;
+ Conf->useFlagAliases = false;
+ Conf->flagMode = FM_CHAR;
+
+ if (!tsearch_readline_begin(&trst, filename))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("could not open affix file \"%s\": %m",
+ filename)));
+
+ while ((recoded = tsearch_readline(&trst)) != NULL)
+ {
+ if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
+ {
+ pfree(recoded);
+ continue;
+ }
+
+ if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
+ addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
+ FF_COMPOUNDFLAG);
+ else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
+ addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
+ FF_COMPOUNDBEGIN);
+ else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
+ addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
+ FF_COMPOUNDLAST);
+ /* COMPOUNDLAST and COMPOUNDEND are synonyms */
+ else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
+ addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
+ FF_COMPOUNDLAST);
+ else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
+ addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
+ FF_COMPOUNDMIDDLE);
+ else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
+ addCompoundAffixFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
+ FF_COMPOUNDONLY);
+ else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
+ addCompoundAffixFlagValue(Conf,
+ recoded + strlen("COMPOUNDPERMITFLAG"),
+ FF_COMPOUNDPERMITFLAG);
+ else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
+ addCompoundAffixFlagValue(Conf,
+ recoded + strlen("COMPOUNDFORBIDFLAG"),
+ FF_COMPOUNDFORBIDFLAG);
+ else if (STRNCMP(recoded, "FLAG") == 0)
+ {
+ char *s = recoded + strlen("FLAG");
+
+ while (*s && t_isspace(s))
+ s += pg_mblen(s);
+
+ if (*s)
+ {
+ if (STRNCMP(s, "long") == 0)
+ Conf->flagMode = FM_LONG;
+ else if (STRNCMP(s, "num") == 0)
+ Conf->flagMode = FM_NUM;
+ else if (STRNCMP(s, "default") != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("Ispell dictionary supports only "
+ "\"default\", \"long\", "
+ "and \"num\" flag values")));
+ }
+ }
+
+ pfree(recoded);
+ }
+ tsearch_readline_end(&trst);
+
+ if (Conf->nCompoundAffixFlag > 1)
+ qsort((void *) Conf->CompoundAffixFlags, Conf->nCompoundAffixFlag,
+ sizeof(CompoundAffixFlag), cmpcmdflag);
+
+ if (!tsearch_readline_begin(&trst, filename))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("could not open affix file \"%s\": %m",
+ filename)));
+
+ while ((recoded = tsearch_readline(&trst)) != NULL)
+ {
+ int fields_read;
+
+ if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
+ goto nextline;
+
+ fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
+
+ if (ptype)
+ pfree(ptype);
+ ptype = lowerstr_ctx(Conf, type);
+
+ /* First try to parse AF parameter (alias compression) */
+ if (STRNCMP(ptype, "af") == 0)
+ {
+ /* First line is the number of aliases */
+ if (!Conf->useFlagAliases)
+ {
+ Conf->useFlagAliases = true;
+ naffix = atoi(sflag);
+ if (naffix <= 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid number of flag vector aliases")));
+
+ /* Also reserve place for empty flag set */
+ naffix++;
+
+ Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
+ Conf->lenAffixData = Conf->nAffixData = naffix;
+
+ /* Add empty flag set into AffixData */
+ Conf->AffixData[curaffix] = VoidString;
+ curaffix++;
+ }
+ /* Other lines are aliases */
+ else
+ {
+ if (curaffix < naffix)
+ {
+ Conf->AffixData[curaffix] = cpstrdup(Conf, sflag);
+ curaffix++;
+ }
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("number of aliases exceeds specified number %d",
+ naffix - 1)));
+ }
+ goto nextline;
+ }
+ /* Else try to parse prefixes and suffixes */
+ if (fields_read < 4 ||
+ (STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
+ goto nextline;
+
+ sflaglen = strlen(sflag);
+ if (sflaglen == 0
+ || (sflaglen > 1 && Conf->flagMode == FM_CHAR)
+ || (sflaglen > 2 && Conf->flagMode == FM_LONG))
+ goto nextline;
+
+ /*--------
+ * Affix header. For example:
+ * SFX \ N 1
+ *--------
+ */
+ if (fields_read == 4)
+ {
+ isSuffix = (STRNCMP(ptype, "sfx") == 0);
+ if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
+ flagflags = FF_CROSSPRODUCT;
+ else
+ flagflags = 0;
+ }
+ /*--------
+ * Affix fields. For example:
+ * SFX \ 0 Y/L [^Y]
+ *--------
+ */
+ else
+ {
+ char *ptr;
+ int aflg = 0;
+
+ /* Get flags after '/' (flags are case sensitive) */
+ if ((ptr = strchr(repl, '/')) != NULL)
+ aflg |= getCompoundAffixFlagValue(Conf,
+ getAffixFlagSet(Conf,
+ ptr + 1));
+ /* Get lowercased version of string before '/' */
+ prepl = lowerstr_ctx(Conf, repl);
+ if ((ptr = strchr(prepl, '/')) != NULL)
+ *ptr = '\0';
+ pfind = lowerstr_ctx(Conf, find);
+ pmask = lowerstr_ctx(Conf, mask);
+ if (t_iseq(find, '0'))
+ *pfind = '\0';
+ if (t_iseq(repl, '0'))
+ *prepl = '\0';
+
+ NIAddAffix(Conf, sflag, flagflags | aflg, pmask, pfind, prepl,
+ isSuffix ? FF_SUFFIX : FF_PREFIX);
+ pfree(prepl);
+ pfree(pfind);
+ pfree(pmask);
+ }
+
+nextline:
+ pfree(recoded);
+ }
+
+ tsearch_readline_end(&trst);
+ if (ptype)
+ pfree(ptype);
+}
+
+/*
+ * import affixes
+ *
+ * Note caller must already have applied get_tsearch_config_filename
+ *
+ * This function is responsible for parsing ispell ("old format") affix files.
+ * If we realize that the file contains new-format commands, we pass off the
+ * work to NIImportOOAffixes(), which will re-read the whole file.
+ */
+void
+NIImportAffixes(IspellDict *Conf, const char *filename)
+{
+ char *pstr = NULL;
+ char flag[BUFSIZ];
+ char mask[BUFSIZ];
+ char find[BUFSIZ];
+ char repl[BUFSIZ];
+ char *s;
+ bool suffixes = false;
+ bool prefixes = false;
+ char flagflags = 0;
+ tsearch_readline_state trst;
+ bool oldformat = false;
+ char *recoded = NULL;
+
+ if (!tsearch_readline_begin(&trst, filename))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("could not open affix file \"%s\": %m",
+ filename)));
+
+ Conf->usecompound = false;
+ Conf->useFlagAliases = false;
+ Conf->flagMode = FM_CHAR;
+
+ while ((recoded = tsearch_readline(&trst)) != NULL)
+ {
+ pstr = lowerstr(recoded);
+
+ /* Skip comments and empty lines */
+ if (*pstr == '#' || *pstr == '\n')
+ goto nextline;
+
+ if (STRNCMP(pstr, "compoundwords") == 0)
+ {
+ /* Find case-insensitive L flag in non-lowercased string */
+ s = findchar2(recoded, 'l', 'L');
+ if (s)
+ {
+ while (*s && !t_isspace(s))
+ s += pg_mblen(s);
+ while (*s && t_isspace(s))
+ s += pg_mblen(s);
+
+ if (*s && pg_mblen(s) == 1)
+ {
+ addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG);
+ Conf->usecompound = true;
+ }
+ oldformat = true;
+ goto nextline;
+ }
+ }
+ if (STRNCMP(pstr, "suffixes") == 0)
+ {
+ suffixes = true;
+ prefixes = false;
+ oldformat = true;
+ goto nextline;
+ }
+ if (STRNCMP(pstr, "prefixes") == 0)
+ {
+ suffixes = false;
+ prefixes = true;
+ oldformat = true;
+ goto nextline;
+ }
+ if (STRNCMP(pstr, "flag") == 0)
+ {
+ s = recoded + 4; /* we need non-lowercased string */
+ flagflags = 0;
+
+ while (*s && t_isspace(s))
+ s += pg_mblen(s);
+
+ if (*s == '*')
+ {
+ flagflags |= FF_CROSSPRODUCT;
+ s++;
+ }
+ else if (*s == '~')
+ {
+ flagflags |= FF_COMPOUNDONLY;
+ s++;
+ }
+
+ if (*s == '\\')
+ s++;
+
+ /*
+ * An old-format flag is a single ASCII character; we expect it to
+ * be followed by EOL, whitespace, or ':'. Otherwise this is a
+ * new-format flag command.
+ */
+ if (*s && pg_mblen(s) == 1)
+ {
+ COPYCHAR(flag, s);
+ flag[1] = '\0';
+
+ s++;
+ if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' ||
+ t_isspace(s))
+ {
+ oldformat = true;
+ goto nextline;
+ }
+ }
+ goto isnewformat;
+ }
+ if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 ||
+ STRNCMP(recoded, "COMPOUNDMIN") == 0 ||
+ STRNCMP(recoded, "PFX") == 0 ||
+ STRNCMP(recoded, "SFX") == 0)
+ goto isnewformat;
+
+ if ((!suffixes) && (!prefixes))
+ goto nextline;
+
+ if (!parse_affentry(pstr, mask, find, repl))
+ goto nextline;
+
+ NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
+
+nextline:
+ pfree(recoded);
+ pfree(pstr);
+ }
+ tsearch_readline_end(&trst);
+ return;
+
+isnewformat:
+ if (oldformat)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("affix file contains both old-style and new-style commands")));
+ tsearch_readline_end(&trst);
+
+ NIImportOOAffixes(Conf, filename);
+}
+
+/*
+ * Merges two affix flag sets and stores a new affix flag set into
+ * Conf->AffixData.
+ *
+ * Returns index of a new affix flag set.
+ */
+static int
+MergeAffix(IspellDict *Conf, int a1, int a2)
+{
+ char **ptr;
+
+ Assert(a1 < Conf->nAffixData && a2 < Conf->nAffixData);
+
+ /* Do not merge affix flags if one of affix flags is empty */
+ if (*Conf->AffixData[a1] == '\0')
+ return a2;
+ else if (*Conf->AffixData[a2] == '\0')
+ return a1;
+
+ while (Conf->nAffixData + 1 >= Conf->lenAffixData)
+ {
+ Conf->lenAffixData *= 2;
+ Conf->AffixData = (char **) repalloc(Conf->AffixData,
+ sizeof(char *) * Conf->lenAffixData);
+ }
+
+ ptr = Conf->AffixData + Conf->nAffixData;
+ if (Conf->flagMode == FM_NUM)
+ {
+ *ptr = cpalloc(strlen(Conf->AffixData[a1]) +
+ strlen(Conf->AffixData[a2]) +
+ 1 /* comma */ + 1 /* \0 */ );
+ sprintf(*ptr, "%s,%s", Conf->AffixData[a1], Conf->AffixData[a2]);
+ }
+ else
+ {
+ *ptr = cpalloc(strlen(Conf->AffixData[a1]) +
+ strlen(Conf->AffixData[a2]) +
+ 1 /* \0 */ );
+ sprintf(*ptr, "%s%s", Conf->AffixData[a1], Conf->AffixData[a2]);
+ }
+ ptr++;
+ *ptr = NULL;
+ Conf->nAffixData++;
+
+ return Conf->nAffixData - 1;
+}
+
+/*
+ * Returns a set of affix parameters which correspondence to the set of affix
+ * flags with the given index.
+ */
+static uint32
+makeCompoundFlags(IspellDict *Conf, int affix)
+{
+ Assert(affix < Conf->nAffixData);
+
+ return (getCompoundAffixFlagValue(Conf, Conf->AffixData[affix]) &
+ FF_COMPOUNDFLAGMASK);
+}
+
+/*
+ * Makes a prefix tree for the given level.
+ *
+ * Conf: current dictionary.
+ * low: lower index of the Conf->Spell array.
+ * high: upper index of the Conf->Spell array.
+ * level: current prefix tree level.
+ */
+static SPNode *
+mkSPNode(IspellDict *Conf, int low, int high, int level)
+{
+ int i;
+ int nchar = 0;
+ char lastchar = '\0';
+ SPNode *rs;
+ SPNodeData *data;
+ int lownew = low;
+
+ for (i = low; i < high; i++)
+ if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level])
+ {
+ nchar++;
+ lastchar = Conf->Spell[i]->word[level];
+ }
+
+ if (!nchar)
+ return NULL;
+
+ rs = (SPNode *) cpalloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
+ rs->length = nchar;
+ data = rs->data;
+
+ lastchar = '\0';
+ for (i = low; i < high; i++)
+ if (Conf->Spell[i]->p.d.len > level)
+ {
+ if (lastchar != Conf->Spell[i]->word[level])
+ {
+ if (lastchar)
+ {
+ /* Next level of the prefix tree */
+ data->node = mkSPNode(Conf, lownew, i, level + 1);
+ lownew = i;
+ data++;
+ }
+ lastchar = Conf->Spell[i]->word[level];
+ }
+ data->val = ((uint8 *) (Conf->Spell[i]->word))[level];
+ if (Conf->Spell[i]->p.d.len == level + 1)
+ {
+ bool clearCompoundOnly = false;
+
+ if (data->isword && data->affix != Conf->Spell[i]->p.d.affix)
+ {
+ /*
+ * MergeAffix called a few times. If one of word is
+ * allowed to be in compound word and another isn't, then
+ * clear FF_COMPOUNDONLY flag.
+ */
+
+ clearCompoundOnly = (FF_COMPOUNDONLY & data->compoundflag
+ & makeCompoundFlags(Conf, Conf->Spell[i]->p.d.affix))
+ ? false : true;
+ data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix);
+ }
+ else
+ data->affix = Conf->Spell[i]->p.d.affix;
+ data->isword = 1;
+
+ data->compoundflag = makeCompoundFlags(Conf, data->affix);
+
+ if ((data->compoundflag & FF_COMPOUNDONLY) &&
+ (data->compoundflag & FF_COMPOUNDFLAG) == 0)
+ data->compoundflag |= FF_COMPOUNDFLAG;
+
+ if (clearCompoundOnly)
+ data->compoundflag &= ~FF_COMPOUNDONLY;
+ }
+ }
+
+ /* Next level of the prefix tree */
+ data->node = mkSPNode(Conf, lownew, high, level + 1);
+
+ return rs;
+}
+
+/*
+ * Builds the Conf->Dictionary tree and AffixData from the imported dictionary
+ * and affixes.
+ */
+void
+NISortDictionary(IspellDict *Conf)
+{
+ int i;
+ int naffix;
+ int curaffix;
+
+ /* compress affixes */
+
+ /*
+ * If we use flag aliases then we need to use Conf->AffixData filled in
+ * the NIImportOOAffixes().
+ */
+ if (Conf->useFlagAliases)
+ {
+ for (i = 0; i < Conf->nspell; i++)
+ {
+ char *end;
+
+ if (*Conf->Spell[i]->p.flag != '\0')
+ {
+ curaffix = strtol(Conf->Spell[i]->p.flag, &end, 10);
+ if (Conf->Spell[i]->p.flag == end || errno == ERANGE)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid affix alias \"%s\"",
+ Conf->Spell[i]->p.flag)));
+ if (curaffix < 0 || curaffix >= Conf->nAffixData)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid affix alias \"%s\"",
+ Conf->Spell[i]->p.flag)));
+ if (*end != '\0' && !t_isdigit(end) && !t_isspace(end))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("invalid affix alias \"%s\"",
+ Conf->Spell[i]->p.flag)));
+ }
+ else
+ {
+ /*
+ * If Conf->Spell[i]->p.flag is empty, then get empty value of
+ * Conf->AffixData (0 index).
+ */
+ curaffix = 0;
+ }
+
+ Conf->Spell[i]->p.d.affix = curaffix;
+ Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
+ }
+ }
+ /* Otherwise fill Conf->AffixData here */
+ else
+ {
+ /* Count the number of different flags used in the dictionary */
+ qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *),
+ cmpspellaffix);
+
+ naffix = 0;
+ for (i = 0; i < Conf->nspell; i++)
+ {
+ if (i == 0 ||
+ strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag) != 0)
+ naffix++;
+ }
+
+ /*
+ * Fill in Conf->AffixData with the affixes that were used in the
+ * dictionary. Replace textual flag-field of Conf->Spell entries with
+ * indexes into Conf->AffixData array.
+ */
+ Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
+
+ curaffix = -1;
+ for (i = 0; i < Conf->nspell; i++)
+ {
+ if (i == 0 ||
+ strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix]) != 0)
+ {
+ curaffix++;
+ Assert(curaffix < naffix);
+ Conf->AffixData[curaffix] = cpstrdup(Conf,
+ Conf->Spell[i]->p.flag);
+ }
+
+ Conf->Spell[i]->p.d.affix = curaffix;
+ Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
+ }
+
+ Conf->lenAffixData = Conf->nAffixData = naffix;
+ }
+
+ /* Start build a prefix tree */
+ qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
+ Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
+}
+
+/*
+ * Makes a prefix tree for the given level using the repl string of an affix
+ * rule. Affixes with empty replace string do not include in the prefix tree.
+ * This affixes are included by mkVoidAffix().
+ *
+ * Conf: current dictionary.
+ * low: lower index of the Conf->Affix array.
+ * high: upper index of the Conf->Affix array.
+ * level: current prefix tree level.
+ * type: FF_SUFFIX or FF_PREFIX.
+ */
+static AffixNode *
+mkANode(IspellDict *Conf, int low, int high, int level, int type)
+{
+ int i;
+ int nchar = 0;
+ uint8 lastchar = '\0';
+ AffixNode *rs;
+ AffixNodeData *data;
+ int lownew = low;
+ int naff;
+ AFFIX **aff;
+
+ for (i = low; i < high; i++)
+ if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
+ {
+ nchar++;
+ lastchar = GETCHAR(Conf->Affix + i, level, type);
+ }
+
+ if (!nchar)
+ return NULL;
+
+ aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1));
+ naff = 0;
+
+ rs = (AffixNode *) cpalloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
+ rs->length = nchar;
+ data = rs->data;
+
+ lastchar = '\0';
+ for (i = low; i < high; i++)
+ if (Conf->Affix[i].replen > level)
+ {
+ if (lastchar != GETCHAR(Conf->Affix + i, level, type))
+ {
+ if (lastchar)
+ {
+ /* Next level of the prefix tree */
+ data->node = mkANode(Conf, lownew, i, level + 1, type);
+ if (naff)
+ {
+ data->naff = naff;
+ data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
+ memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
+ naff = 0;
+ }
+ data++;
+ lownew = i;
+ }
+ lastchar = GETCHAR(Conf->Affix + i, level, type);
+ }
+ data->val = GETCHAR(Conf->Affix + i, level, type);
+ if (Conf->Affix[i].replen == level + 1)
+ { /* affix stopped */
+ aff[naff++] = Conf->Affix + i;
+ }
+ }
+
+ /* Next level of the prefix tree */
+ data->node = mkANode(Conf, lownew, high, level + 1, type);
+ if (naff)
+ {
+ data->naff = naff;
+ data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
+ memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
+ naff = 0;
+ }
+
+ pfree(aff);
+
+ return rs;
+}
+
+/*
+ * Makes the root void node in the prefix tree. The root void node is created
+ * for affixes which have empty replace string ("repl" field).
+ */
+static void
+mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
+{
+ int i,
+ cnt = 0;
+ int start = (issuffix) ? startsuffix : 0;
+ int end = (issuffix) ? Conf->naffixes : startsuffix;
+ AffixNode *Affix = (AffixNode *) palloc0(ANHRDSZ + sizeof(AffixNodeData));
+
+ Affix->length = 1;
+ Affix->isvoid = 1;
+
+ if (issuffix)
+ {
+ Affix->data->node = Conf->Suffix;
+ Conf->Suffix = Affix;
+ }
+ else
+ {
+ Affix->data->node = Conf->Prefix;
+ Conf->Prefix = Affix;
+ }
+
+ /* Count affixes with empty replace string */
+ for (i = start; i < end; i++)
+ if (Conf->Affix[i].replen == 0)
+ cnt++;
+
+ /* There is not affixes with empty replace string */
+ if (cnt == 0)
+ return;
+
+ Affix->data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * cnt);
+ Affix->data->naff = (uint32) cnt;
+
+ cnt = 0;
+ for (i = start; i < end; i++)
+ if (Conf->Affix[i].replen == 0)
+ {
+ Affix->data->aff[cnt] = Conf->Affix + i;
+ cnt++;
+ }
+}
+
+/*
+ * Checks if the affixflag is used by dictionary. Conf->AffixData does not
+ * contain affixflag if this flag is not used actually by the .dict file.
+ *
+ * Conf: current dictionary.
+ * affixflag: affix flag.
+ *
+ * Returns true if the Conf->AffixData array contains affixflag, otherwise
+ * returns false.
+ */
+static bool
+isAffixInUse(IspellDict *Conf, char *affixflag)
+{
+ int i;
+
+ for (i = 0; i < Conf->nAffixData; i++)
+ if (IsAffixFlagInUse(Conf, i, affixflag))
+ return true;
+
+ return false;
+}
+
+/*
+ * Builds Conf->Prefix and Conf->Suffix trees from the imported affixes.
+ */
+void
+NISortAffixes(IspellDict *Conf)
+{
+ AFFIX *Affix;
+ size_t i;
+ CMPDAffix *ptr;
+ int firstsuffix = Conf->naffixes;
+
+ if (Conf->naffixes == 0)
+ return;
+
+ /* Store compound affixes in the Conf->CompoundAffix array */
+ if (Conf->naffixes > 1)
+ qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
+ Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes);
+ ptr->affix = NULL;
+
+ for (i = 0; i < Conf->naffixes; i++)
+ {
+ Affix = &(((AFFIX *) Conf->Affix)[i]);
+ if (Affix->type == FF_SUFFIX && i < firstsuffix)
+ firstsuffix = i;
+
+ if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
+ isAffixInUse(Conf, Affix->flag))
+ {
+ bool issuffix = (Affix->type == FF_SUFFIX);
+
+ if (ptr == Conf->CompoundAffix ||
+ issuffix != (ptr - 1)->issuffix ||
+ strbncmp((const unsigned char *) (ptr - 1)->affix,
+ (const unsigned char *) Affix->repl,
+ (ptr - 1)->len))
+ {
+ /* leave only unique and minimal suffixes */
+ ptr->affix = Affix->repl;
+ ptr->len = Affix->replen;
+ ptr->issuffix = issuffix;
+ ptr++;
+ }
+ }
+ }
+ ptr->affix = NULL;
+ Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
+
+ /* Start build a prefix tree */
+ Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
+ Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
+ mkVoidAffix(Conf, true, firstsuffix);
+ mkVoidAffix(Conf, false, firstsuffix);
+}
+
+static AffixNodeData *
+FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
+{
+ AffixNodeData *StopLow,
+ *StopHigh,
+ *StopMiddle;
+ uint8 symbol;
+
+ if (node->isvoid)
+ { /* search void affixes */
+ if (node->data->naff)
+ return node->data;
+ node = node->data->node;
+ }
+
+ while (node && *level < wrdlen)
+ {
+ StopLow = node->data;
+ StopHigh = node->data + node->length;
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+ symbol = GETWCHAR(word, wrdlen, *level, type);
+
+ if (StopMiddle->val == symbol)
+ {
+ (*level)++;
+ if (StopMiddle->naff)
+ return StopMiddle;
+ node = StopMiddle->node;
+ break;
+ }
+ else if (StopMiddle->val < symbol)
+ StopLow = StopMiddle + 1;
+ else
+ StopHigh = StopMiddle;
+ }
+ if (StopLow >= StopHigh)
+ break;
+ }
+ return NULL;
+}
+
+static char *
+CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
+{
+ /*
+ * Check compound allow flags
+ */
+
+ if (flagflags == 0)
+ {
+ if (Affix->flagflags & FF_COMPOUNDONLY)
+ return NULL;
+ }
+ else if (flagflags & FF_COMPOUNDBEGIN)
+ {
+ if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
+ return NULL;
+ if ((Affix->flagflags & FF_COMPOUNDBEGIN) == 0)
+ if (Affix->type == FF_SUFFIX)
+ return NULL;
+ }
+ else if (flagflags & FF_COMPOUNDMIDDLE)
+ {
+ if ((Affix->flagflags & FF_COMPOUNDMIDDLE) == 0 ||
+ (Affix->flagflags & FF_COMPOUNDFORBIDFLAG))
+ return NULL;
+ }
+ else if (flagflags & FF_COMPOUNDLAST)
+ {
+ if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
+ return NULL;
+ if ((Affix->flagflags & FF_COMPOUNDLAST) == 0)
+ if (Affix->type == FF_PREFIX)
+ return NULL;
+ }
+
+ /*
+ * make replace pattern of affix
+ */
+ if (Affix->type == FF_SUFFIX)
+ {
+ strcpy(newword, word);
+ strcpy(newword + len - Affix->replen, Affix->find);
+ if (baselen) /* store length of non-changed part of word */
+ *baselen = len - Affix->replen;
+ }
+ else
+ {
+ /*
+ * if prefix is an all non-changed part's length then all word
+ * contains only prefix and suffix, so out
+ */
+ if (baselen && *baselen + strlen(Affix->find) <= Affix->replen)
+ return NULL;
+ strcpy(newword, Affix->find);
+ strcat(newword, word + Affix->replen);
+ }
+
+ /*
+ * check resulting word
+ */
+ if (Affix->issimple)
+ return newword;
+ else if (Affix->isregis)
+ {
+ if (RS_execute(&(Affix->reg.regis), newword))
+ return newword;
+ }
+ else
+ {
+ pg_wchar *data;
+ size_t data_len;
+ int newword_len;
+
+ /* Convert data string to wide characters */
+ newword_len = strlen(newword);
+ data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
+ data_len = pg_mb2wchar_with_len(newword, data, newword_len);
+
+ if (pg_regexec(&(Affix->reg.pregex->regex), data, data_len,
+ 0, NULL, 0, NULL, 0) == REG_OKAY)
+ {
+ pfree(data);
+ return newword;
+ }
+ pfree(data);
+ }
+
+ return NULL;
+}
+
+static int
+addToResult(char **forms, char **cur, char *word)
+{
+ if (cur - forms >= MAX_NORM - 1)
+ return 0;
+ if (forms == cur || strcmp(word, *(cur - 1)) != 0)
+ {
+ *cur = pstrdup(word);
+ *(cur + 1) = NULL;
+ return 1;
+ }
+
+ return 0;
+}
+
+static char **
+NormalizeSubWord(IspellDict *Conf, char *word, int flag)
+{
+ AffixNodeData *suffix = NULL,
+ *prefix = NULL;
+ int slevel = 0,
+ plevel = 0;
+ int wrdlen = strlen(word),
+ swrdlen;
+ char **forms;
+ char **cur;
+ char newword[2 * MAXNORMLEN] = "";
+ char pnewword[2 * MAXNORMLEN] = "";
+ AffixNode *snode = Conf->Suffix,
+ *pnode;
+ int i,
+ j;
+
+ if (wrdlen > MAXNORMLEN)
+ return NULL;
+ cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
+ *cur = NULL;
+
+
+ /* Check that the word itself is normal form */
+ if (FindWord(Conf, word, VoidString, flag))
+ {
+ *cur = pstrdup(word);
+ cur++;
+ *cur = NULL;
+ }
+
+ /* Find all other NORMAL forms of the 'word' (check only prefix) */
+ pnode = Conf->Prefix;
+ plevel = 0;
+ while (pnode)
+ {
+ prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
+ if (!prefix)
+ break;
+ for (j = 0; j < prefix->naff; j++)
+ {
+ if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL))
+ {
+ /* prefix success */
+ if (FindWord(Conf, newword, prefix->aff[j]->flag, flag))
+ cur += addToResult(forms, cur, newword);
+ }
+ }
+ pnode = prefix->node;
+ }
+
+ /*
+ * Find all other NORMAL forms of the 'word' (check suffix and then
+ * prefix)
+ */
+ while (snode)
+ {
+ int baselen = 0;
+
+ /* find possible suffix */
+ suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
+ if (!suffix)
+ break;
+ /* foreach suffix check affix */
+ for (i = 0; i < suffix->naff; i++)
+ {
+ if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen))
+ {
+ /* suffix success */
+ if (FindWord(Conf, newword, suffix->aff[i]->flag, flag))
+ cur += addToResult(forms, cur, newword);
+
+ /* now we will look changed word with prefixes */
+ pnode = Conf->Prefix;
+ plevel = 0;
+ swrdlen = strlen(newword);
+ while (pnode)
+ {
+ prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
+ if (!prefix)
+ break;
+ for (j = 0; j < prefix->naff; j++)
+ {
+ if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen))
+ {
+ /* prefix success */
+ char *ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
+ VoidString : prefix->aff[j]->flag;
+
+ if (FindWord(Conf, pnewword, ff, flag))
+ cur += addToResult(forms, cur, pnewword);
+ }
+ }
+ pnode = prefix->node;
+ }
+ }
+ }
+
+ snode = suffix->node;
+ }
+
+ if (cur == forms)
+ {
+ pfree(forms);
+ return NULL;
+ }
+ return forms;
+}
+
+typedef struct SplitVar
+{
+ int nstem;
+ int lenstem;
+ char **stem;
+ struct SplitVar *next;
+} SplitVar;
+
+static int
+CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len, bool CheckInPlace)
+{
+ bool issuffix;
+
+ /* in case CompoundAffix is null: */
+ if (*ptr == NULL)
+ return -1;
+
+ if (CheckInPlace)
+ {
+ while ((*ptr)->affix)
+ {
+ if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
+ {
+ len = (*ptr)->len;
+ issuffix = (*ptr)->issuffix;
+ (*ptr)++;
+ return (issuffix) ? len : 0;
+ }
+ (*ptr)++;
+ }
+ }
+ else
+ {
+ char *affbegin;
+
+ while ((*ptr)->affix)
+ {
+ if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
+ {
+ len = (*ptr)->len + (affbegin - word);
+ issuffix = (*ptr)->issuffix;
+ (*ptr)++;
+ return (issuffix) ? len : 0;
+ }
+ (*ptr)++;
+ }
+ }
+ return -1;
+}
+
+static SplitVar *
+CopyVar(SplitVar *s, int makedup)
+{
+ SplitVar *v = (SplitVar *) palloc(sizeof(SplitVar));
+
+ v->next = NULL;
+ if (s)
+ {
+ int i;
+
+ v->lenstem = s->lenstem;
+ v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
+ v->nstem = s->nstem;
+ for (i = 0; i < s->nstem; i++)
+ v->stem[i] = (makedup) ? pstrdup(s->stem[i]) : s->stem[i];
+ }
+ else
+ {
+ v->lenstem = 16;
+ v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
+ v->nstem = 0;
+ }
+ return v;
+}
+
+static void
+AddStem(SplitVar *v, char *word)
+{
+ if (v->nstem >= v->lenstem)
+ {
+ v->lenstem *= 2;
+ v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem);
+ }
+
+ v->stem[v->nstem] = word;
+ v->nstem++;
+}
+
+static SplitVar *
+SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int wordlen, int startpos, int minpos)
+{
+ SplitVar *var = NULL;
+ SPNodeData *StopLow,
+ *StopHigh,
+ *StopMiddle = NULL;
+ SPNode *node = (snode) ? snode : Conf->Dictionary;
+ int level = (snode) ? minpos : startpos; /* recursive
+ * minpos==level */
+ int lenaff;
+ CMPDAffix *caff;
+ char *notprobed;
+ int compoundflag = 0;
+
+ notprobed = (char *) palloc(wordlen);
+ memset(notprobed, 1, wordlen);
+ var = CopyVar(orig, 1);
+
+ while (level < wordlen)
+ {
+ /* find word with epenthetic or/and compound affix */
+ caff = Conf->CompoundAffix;
+ while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) >= 0)
+ {
+ /*
+ * there is one of compound affixes, so check word for existings
+ */
+ char buf[MAXNORMLEN];
+ char **subres;
+
+ lenaff = level - startpos + lenaff;
+
+ if (!notprobed[startpos + lenaff - 1])
+ continue;
+
+ if (level + lenaff - 1 <= minpos)
+ continue;
+
+ if (lenaff >= MAXNORMLEN)
+ continue; /* skip too big value */
+ if (lenaff > 0)
+ memcpy(buf, word + startpos, lenaff);
+ buf[lenaff] = '\0';
+
+ if (level == 0)
+ compoundflag = FF_COMPOUNDBEGIN;
+ else if (level == wordlen - 1)
+ compoundflag = FF_COMPOUNDLAST;
+ else
+ compoundflag = FF_COMPOUNDMIDDLE;
+ subres = NormalizeSubWord(Conf, buf, compoundflag);
+ if (subres)
+ {
+ /* Yes, it was a word from dictionary */
+ SplitVar *new = CopyVar(var, 0);
+ SplitVar *ptr = var;
+ char **sptr = subres;
+
+ notprobed[startpos + lenaff - 1] = 0;
+
+ while (*sptr)
+ {
+ AddStem(new, *sptr);
+ sptr++;
+ }
+ pfree(subres);
+
+ while (ptr->next)
+ ptr = ptr->next;
+ ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff);
+
+ pfree(new->stem);
+ pfree(new);
+ }
+ }
+
+ if (!node)
+ break;
+
+ StopLow = node->data;
+ StopHigh = node->data + node->length;
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+ if (StopMiddle->val == ((uint8 *) (word))[level])
+ break;
+ else if (StopMiddle->val < ((uint8 *) (word))[level])
+ StopLow = StopMiddle + 1;
+ else
+ StopHigh = StopMiddle;
+ }
+
+ if (StopLow < StopHigh)
+ {
+ if (startpos == 0)
+ compoundflag = FF_COMPOUNDBEGIN;
+ else if (level == wordlen - 1)
+ compoundflag = FF_COMPOUNDLAST;
+ else
+ compoundflag = FF_COMPOUNDMIDDLE;
+
+ /* find infinitive */
+ if (StopMiddle->isword &&
+ (StopMiddle->compoundflag & compoundflag) &&
+ notprobed[level])
+ {
+ /* ok, we found full compoundallowed word */
+ if (level > minpos)
+ {
+ /* and its length more than minimal */
+ if (wordlen == level + 1)
+ {
+ /* well, it was last word */
+ AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
+ pfree(notprobed);
+ return var;
+ }
+ else
+ {
+ /* then we will search more big word at the same point */
+ SplitVar *ptr = var;
+
+ while (ptr->next)
+ ptr = ptr->next;
+ ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
+ /* we can find next word */
+ level++;
+ AddStem(var, pnstrdup(word + startpos, level - startpos));
+ node = Conf->Dictionary;
+ startpos = level;
+ continue;
+ }
+ }
+ }
+ node = StopMiddle->node;
+ }
+ else
+ node = NULL;
+ level++;
+ }
+
+ AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
+ pfree(notprobed);
+ return var;
+}
+
+static void
+addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
+{
+ if (*lres == NULL)
+ *lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme));
+
+ if (*lcur - *lres < MAX_NORM - 1)
+ {
+ (*lcur)->lexeme = word;
+ (*lcur)->flags = flags;
+ (*lcur)->nvariant = NVariant;
+ (*lcur)++;
+ (*lcur)->lexeme = NULL;
+ }
+}
+
+TSLexeme *
+NINormalizeWord(IspellDict *Conf, char *word)
+{
+ char **res;
+ TSLexeme *lcur = NULL,
+ *lres = NULL;
+ uint16 NVariant = 1;
+
+ res = NormalizeSubWord(Conf, word, 0);
+
+ if (res)
+ {
+ char **ptr = res;
+
+ while (*ptr && (lcur - lres) < MAX_NORM)
+ {
+ addNorm(&lres, &lcur, *ptr, 0, NVariant++);
+ ptr++;
+ }
+ pfree(res);
+ }
+
+ if (Conf->usecompound)
+ {
+ int wordlen = strlen(word);
+ SplitVar *ptr,
+ *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
+ int i;
+
+ while (var)
+ {
+ if (var->nstem > 1)
+ {
+ char **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDLAST);
+
+ if (subres)
+ {
+ char **subptr = subres;
+
+ while (*subptr)
+ {
+ for (i = 0; i < var->nstem - 1; i++)
+ {
+ addNorm(&lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant);
+ }
+
+ addNorm(&lres, &lcur, *subptr, 0, NVariant);
+ subptr++;
+ NVariant++;
+ }
+
+ pfree(subres);
+ var->stem[0] = NULL;
+ pfree(var->stem[var->nstem - 1]);
+ }
+ }
+
+ for (i = 0; i < var->nstem && var->stem[i]; i++)
+ pfree(var->stem[i]);
+ ptr = var->next;
+ pfree(var->stem);
+ pfree(var);
+ var = ptr;
+ }
+ }
+
+ return lres;
+}