summary refs log tree commit diff stats
path: root/src/backend/snowball/dict_snowball.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/snowball/dict_snowball.c')
-rw-r--r-- src/backend/snowball/dict_snowball.c 331
1 files changed, 331 insertions, 0 deletions
diff --git a/src/backend/snowball/dict_snowball.c b/src/backend/snowball/dict_snowball.c
new file mode 100644
index 0000000..8c25f3e
--- /dev/null
+++ b/src/backend/snowball/dict_snowball.c
@@ -0,0 +1,331 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_snowball.c
+ * Snowball dictionary
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/snowball/dict_snowball.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "commands/defrem.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+
+/* Some platforms define MAXINT and/or MININT, causing conflicts */
+#ifdef MAXINT
+#undef MAXINT
+#endif
+#ifdef MININT
+#undef MININT
+#endif
+
+/* Now we can include the original Snowball header.h */
+#include "snowball/libstemmer/header.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_basque.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_catalan.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_english.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_french.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_german.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
+#include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
+#include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
+#include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
+#include "snowball/libstemmer/stem_KOI8_R_russian.h"
+#include "snowball/libstemmer/stem_UTF_8_arabic.h"
+#include "snowball/libstemmer/stem_UTF_8_armenian.h"
+#include "snowball/libstemmer/stem_UTF_8_basque.h"
+#include "snowball/libstemmer/stem_UTF_8_catalan.h"
+#include "snowball/libstemmer/stem_UTF_8_danish.h"
+#include "snowball/libstemmer/stem_UTF_8_dutch.h"
+#include "snowball/libstemmer/stem_UTF_8_english.h"
+#include "snowball/libstemmer/stem_UTF_8_finnish.h"
+#include "snowball/libstemmer/stem_UTF_8_french.h"
+#include "snowball/libstemmer/stem_UTF_8_german.h"
+#include "snowball/libstemmer/stem_UTF_8_greek.h"
+#include "snowball/libstemmer/stem_UTF_8_hindi.h"
+#include "snowball/libstemmer/stem_UTF_8_hungarian.h"
+#include "snowball/libstemmer/stem_UTF_8_indonesian.h"
+#include "snowball/libstemmer/stem_UTF_8_irish.h"
+#include "snowball/libstemmer/stem_UTF_8_italian.h"
+#include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
+#include "snowball/libstemmer/stem_UTF_8_nepali.h"
+#include "snowball/libstemmer/stem_UTF_8_norwegian.h"
+#include "snowball/libstemmer/stem_UTF_8_porter.h"
+#include "snowball/libstemmer/stem_UTF_8_portuguese.h"
+#include "snowball/libstemmer/stem_UTF_8_romanian.h"
+#include "snowball/libstemmer/stem_UTF_8_russian.h"
+#include "snowball/libstemmer/stem_UTF_8_serbian.h"
+#include "snowball/libstemmer/stem_UTF_8_spanish.h"
+#include "snowball/libstemmer/stem_UTF_8_swedish.h"
+#include "snowball/libstemmer/stem_UTF_8_tamil.h"
+#include "snowball/libstemmer/stem_UTF_8_turkish.h"
+#include "snowball/libstemmer/stem_UTF_8_yiddish.h"
+
+PG_MODULE_MAGIC; /* PostgreSQL loadable-module magic block (macro defined outside this file) */
+
+PG_FUNCTION_INFO_V1(dsnowball_init); /* registers dsnowball_init, defined below, as an SQL-callable C function */
+
+PG_FUNCTION_INFO_V1(dsnowball_lexize); /* registers dsnowball_lexize, defined below, as an SQL-callable C function */
+
+/*
+ * stemmer_module
+ *		One row of the table of available Snowball stemmers.
+ *
+ * Rows are built positionally by the STEMMER_MODULE() macro, so the order of
+ * the fields below must not change.
+ */
+typedef struct stemmer_module stemmer_module;
+
+struct stemmer_module
+{
+	const char *name;			/* language name; compared case-insensitively */
+	pg_enc		enc;			/* PostgreSQL encoding ID this stemmer is for */
+	struct SN_env *(*create) (void);	/* creates the stemmer environment */
+	void		(*close) (struct SN_env *); /* the stemmer's *_close_env routine;
+											 * not called in this file */
+	int			(*stem) (struct SN_env *);	/* stems the word currently loaded
+											 * into the environment */
+};
+
+/*
+ * STEMMER_MODULE
+ *		Expand to one positional stemmer_module initializer.
+ *
+ * lang   - stemmer name; stringified for the name field and pasted into the
+ *          Snowball symbol names
+ * pgenc  - PostgreSQL code for the encoding
+ * snenc  - Snowball's name for the encoding, pasted into the symbol names
+ *
+ * The three function fields resolve to <lang>_<snenc>_create_env,
+ * <lang>_<snenc>_close_env and <lang>_<snenc>_stem.
+ */
+#define STEMMER_MODULE(lang, pgenc, snenc) \
+	{ \
+		#lang, \
+		pgenc, \
+		lang##_##snenc##_create_env, \
+		lang##_##snenc##_close_env, \
+		lang##_##snenc##_stem \
+	}
+
+static const stemmer_module stemmer_modules[] =
+{
+ /*
+ * Stemmers from the Snowball distribution, one entry per (language,
+ * encoding) pair. locate_stem_module() scans this table front to back
+ * and takes the first matching entry, so the order of entries matters.
+ */
+ STEMMER_MODULE(basque, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(catalan, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
+ STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
+ STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
+ STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
+ STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
+ STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
+ STEMMER_MODULE(basque, PG_UTF8, UTF_8),
+ STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
+ STEMMER_MODULE(danish, PG_UTF8, UTF_8),
+ STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
+ STEMMER_MODULE(english, PG_UTF8, UTF_8),
+ STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
+ STEMMER_MODULE(french, PG_UTF8, UTF_8),
+ STEMMER_MODULE(german, PG_UTF8, UTF_8),
+ STEMMER_MODULE(greek, PG_UTF8, UTF_8),
+ STEMMER_MODULE(hindi, PG_UTF8, UTF_8),
+ STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
+ STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
+ STEMMER_MODULE(irish, PG_UTF8, UTF_8),
+ STEMMER_MODULE(italian, PG_UTF8, UTF_8),
+ STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
+ STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
+ STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
+ STEMMER_MODULE(porter, PG_UTF8, UTF_8),
+ STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
+ STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
+ STEMMER_MODULE(russian, PG_UTF8, UTF_8),
+ STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
+ STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
+ STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
+ STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
+ STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
+ STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),
+
+ /*
+ * An entry tagged PG_SQL_ASCII is accepted by locate_stem_module() for
+ * every server encoding; this one maps "english" onto the ISO_8859_1
+ * stemmer. It is listed after the encoding-specific english entries so
+ * that those are found first when they match the database encoding.
+ */
+ STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
+
+ {NULL, 0, NULL, NULL, NULL} /* list end marker: a NULL name stops the scan */
+};
+
+
+/*
+ * DictSnowball
+ *		Per-dictionary state: allocated and returned by dsnowball_init(),
+ *		received as the first argument of dsnowball_lexize().
+ */
+typedef struct DictSnowball DictSnowball;
+
+struct DictSnowball
+{
+	struct SN_env *z;			/* stemmer environment made by the module's
+								 * create() */
+	StopList	stoplist;		/* stop words read for the "stopwords" option */
+	bool		needrecode;		/* true when a UTF8 stemmer was chosen as a
+								 * fallback: recode before and after stem() */
+	int			(*stem) (struct SN_env *z); /* stemming routine of the
+											 * selected module */
+
+	/*
+	 * Snowball holds on to memory it allocated from one call to the next,
+	 * so the stemmer must run in a context that outlives a single call.
+	 * The init function runs in a long-lived context, so it simply records
+	 * CurrentMemoryContext here and lexize switches to it around stem().
+	 */
+	MemoryContext dictCtx;
+};
+
+
+/*
+ * locate_stem_module
+ *		Choose the stemmer for language "lang" and record it in *d.
+ *
+ * The module table is scanned twice:
+ *	 pass 0 - exact match: an entry whose encoding equals the database
+ *			  encoding, or a PG_SQL_ASCII entry (treated as valid for any
+ *			  server encoding).  No recoding is needed.
+ *	 pass 1 - fallback: the UTF8 stemmer for the language; the caller must
+ *			  then recode text to and from UTF8 around the stem call.
+ *
+ * On a match d->stem, d->z and d->needrecode are filled in.  If neither
+ * pass matches, an ERROR with ERRCODE_UNDEFINED_OBJECT is raised.
+ */
+static void
+locate_stem_module(DictSnowball *d, const char *lang)
+{
+	int			pass;
+
+	for (pass = 0; pass < 2; pass++)
+	{
+		const bool	utf8_fallback = (pass == 1);
+		const stemmer_module *mod;
+
+		for (mod = stemmer_modules; mod->name != NULL; mod++)
+		{
+			bool		enc_ok;
+
+			if (utf8_fallback)
+				enc_ok = (mod->enc == PG_UTF8);
+			else
+				enc_ok = (mod->enc == PG_SQL_ASCII ||
+						  mod->enc == GetDatabaseEncoding());
+
+			if (!enc_ok || pg_strcasecmp(mod->name, lang) != 0)
+				continue;
+
+			/* first matching entry wins */
+			d->stem = mod->stem;
+			d->z = mod->create();
+			d->needrecode = utf8_fallback;
+			return;
+		}
+	}
+
+	ereport(ERROR,
+			(errcode(ERRCODE_UNDEFINED_OBJECT),
+			 errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
+					lang, GetDatabaseEncodingName())));
+}
+
+/*
+ * dsnowball_init
+ *		Build a DictSnowball from the dictionary's option list.
+ *
+ * Argument 0 is a List of DefElem options.  Recognized option names:
+ *	 "stopwords" - stop-word list name, loaded with readstoplist()
+ *	 "language"  - stemmer language, resolved by locate_stem_module()
+ * Each may appear at most once, and "language" is mandatory.  Any other
+ * option, a repeated option, or a missing language raises an ERROR with
+ * ERRCODE_INVALID_PARAMETER_VALUE.
+ *
+ * Returns a pointer to the palloc'd DictSnowball.
+ */
+Datum
+dsnowball_init(PG_FUNCTION_ARGS)
+{
+	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
+	DictSnowball *d = (DictSnowball *) palloc0(sizeof(DictSnowball));
+	bool		stoploaded = false;
+	ListCell   *cell;
+
+	foreach(cell, dictoptions)
+	{
+		DefElem    *opt = (DefElem *) lfirst(cell);
+		bool		is_stopwords = (strcmp(opt->defname, "stopwords") == 0);
+		bool		is_language = (!is_stopwords &&
+								   strcmp(opt->defname, "language") == 0);
+
+		/* reject anything that is neither of the two known options */
+		if (!is_stopwords && !is_language)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized Snowball parameter: \"%s\"",
+							opt->defname)));
+
+		if (is_stopwords)
+		{
+			if (stoploaded)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("multiple StopWords parameters")));
+			readstoplist(defGetString(opt), &d->stoplist, lowerstr);
+			stoploaded = true;
+			continue;
+		}
+
+		/* "language": a non-NULL d->stem means one was already given */
+		if (d->stem != NULL)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("multiple Language parameters")));
+		locate_stem_module(d, defGetString(opt));
+	}
+
+	if (d->stem == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("missing Language parameter")));
+
+	/* remember the current context; lexize runs the stemmer inside it */
+	d->dictCtx = CurrentMemoryContext;
+
+	PG_RETURN_POINTER(d);
+}
+
+Datum
+dsnowball_lexize(PG_FUNCTION_ARGS)
+{
+ DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
+ char *in = (char *) PG_GETARG_POINTER(1);
+ int32 len = PG_GETARG_INT32(2);
+ char *txt = lowerstr_with_len(in, len);
+ TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
+
+ if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
+ {
+ pfree(txt);
+ }
+ else
+ {
+ MemoryContext saveCtx;
+
+ /*
+ * recode to utf8 if stemmer is utf8 and doesn't match server encoding
+ */
+ if (d->needrecode)
+ {
+ char *recoded;
+
+ recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
+ if (recoded != txt)
+ {
+ pfree(txt);
+ txt = recoded;
+ }
+ }
+
+ /* see comment about d->dictCtx */
+ saveCtx = MemoryContextSwitchTo(d->dictCtx);
+ SN_set_current(d->z, strlen(txt), (symbol *) txt);
+ d->stem(d->z);
+ MemoryContextSwitchTo(saveCtx);
+
+ if (d->z->p && d->z->l)
+ {
+ txt = repalloc(txt, d->z->l + 1);
+ memcpy(txt, d->z->p, d->z->l);
+ txt[d->z->l] = '\0';
+ }
+
+ /* back recode if needed */
+ if (d->needrecode)
+ {
+ char *recoded;
+
+ recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
+ if (recoded != txt)
+ {
+ pfree(txt);
+ txt = recoded;
+ }
+ }
+
+ res->lexeme = txt;
+ }
+
+ PG_RETURN_POINTER(res);
+}