summary refs log tree commit diff stats
path: root/contrib/snowball/include
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
commit 133a45c109da5310add55824db21af5239951f93 (patch)
tree ba6ac4c0a950a0dda56451944315d66409923918 /contrib/snowball/include
parent Initial commit. (diff)
download rspamd-133a45c109da5310add55824db21af5239951f93.tar.xz
rspamd-133a45c109da5310add55824db21af5239951f93.zip
Adding upstream version 3.8.1. upstream/3.8.1 upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'contrib/snowball/include')
-rw-r--r-- contrib/snowball/include/libstemmer.h 78
1 files changed, 78 insertions, 0 deletions
diff --git a/contrib/snowball/include/libstemmer.h b/contrib/snowball/include/libstemmer.h
new file mode 100644
index 0000000..98051e1
--- /dev/null
+++ b/contrib/snowball/include/libstemmer.h
@@ -0,0 +1,78 @@
+
+/* Make header file work when included from C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct sb_stemmer; /* incomplete type here: only used through pointers below */
+typedef unsigned char sb_symbol; /* unit of the words given to / returned by sb_stemmer_stem() */
+
+/* FIXME - should be able to get a version number for each stemming
+ * algorithm (which will be incremented each time the output changes). */
+
+/** Returns an array of the names of the available stemming algorithms.
+ * Note that these are the canonical names - aliases (i.e., other names for
+ * the same algorithm) will not be included in the list.
+ * The list is terminated with a null pointer.
+ *
+ * The list must not be modified in any way.
+ */
+const char ** sb_stemmer_list(void);
+
+/** Create a new stemmer object, using the specified algorithm, for the
+ * specified character encoding.
+ *
+ * All algorithms will usually be available in UTF-8, but may also be
+ * available in other character encodings.
+ *
+ * @param algorithm The algorithm name. This is either the English
+ * name of the algorithm, or the 2- or 3-letter ISO 639 code for the
+ * language. Note that case is significant in this parameter - the
+ * value should be supplied in lower case.
+ *
+ * @param charenc The character encoding. NULL may be passed as
+ * this value, in which case UTF-8 encoding will be assumed. Otherwise,
+ * the argument may be one of "UTF_8", "ISO_8859_1" (i.e. Latin 1),
+ * "ISO_8859_2" (i.e. Latin 2) or "KOI8_R" (Russian). Note that case is
+ * significant in this parameter.
+ *
+ * @return NULL if the specified algorithm is not recognised, or the
+ * algorithm is not available for the requested encoding. Otherwise,
+ * returns a pointer to a newly created stemmer for the requested algorithm.
+ * The returned pointer must be deleted by calling sb_stemmer_delete().
+ *
+ * @note NULL will also be returned if an out of memory error occurs.
+ */
+struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc);
+
+/** Delete a stemmer object.
+ *
+ * This frees all resources allocated for the stemmer. After calling
+ * this function, the supplied stemmer must no longer be used in any way.
+ *
+ * It is safe to pass a null pointer to this function - this will have
+ * no effect.
+ */
+void sb_stemmer_delete(struct sb_stemmer * stemmer);
+
+/** Stem a word.
+ *
+ * The return value is owned by the stemmer - it must not be freed or
+ * modified, and it will become invalid when the stemmer is called again,
+ * or if the stemmer is freed.
+ *
+ * The length of the return value can be obtained using sb_stemmer_length().
+ * NOTE(review): size is presumably the length of word in sb_symbol units - confirm.
+ * If an out-of-memory error occurs, this will return NULL.
+ */
+const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer,
+ const sb_symbol * word, int size);
+
+/** Get the length of the result of the last word stemmed with sb_stemmer_stem().
+ * This should not be called before sb_stemmer_stem() has been called.
+ */
+int sb_stemmer_length(struct sb_stemmer * stemmer);
+
+#ifdef __cplusplus
+}
+#endif