diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
commit | 133a45c109da5310add55824db21af5239951f93 (patch) | |
tree | ba6ac4c0a950a0dda56451944315d66409923918 /contrib/snowball/include | |
parent | Initial commit. (diff) | |
download | rspamd-133a45c109da5310add55824db21af5239951f93.tar.xz rspamd-133a45c109da5310add55824db21af5239951f93.zip |
Adding upstream version 3.8.1.upstream/3.8.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'contrib/snowball/include')
-rw-r--r-- | contrib/snowball/include/libstemmer.h | 78 |
1 files changed, 78 insertions, 0 deletions
diff --git a/contrib/snowball/include/libstemmer.h b/contrib/snowball/include/libstemmer.h new file mode 100644 index 0000000..98051e1 --- /dev/null +++ b/contrib/snowball/include/libstemmer.h @@ -0,0 +1,78 @@ + +/* Make header file work when included from C++ */ +#ifdef __cplusplus +extern "C" { +#endif + +struct sb_stemmer; +typedef unsigned char sb_symbol; + +/* FIXME - should be able to get a version number for each stemming + * algorithm (which will be incremented each time the output changes). */ + +/** Returns an array of the names of the available stemming algorithms. + * Note that these are the canonical names - aliases (ie, other names for + * the same algorithm) will not be included in the list. + * The list is terminated with a null pointer. + * + * The list must not be modified in any way. + */ +const char ** sb_stemmer_list(void); + +/** Create a new stemmer object, using the specified algorithm, for the + * specified character encoding. + * + * All algorithms will usually be available in UTF-8, but may also be + * available in other character encodings. + * + * @param algorithm The algorithm name. This is either the english + * name of the algorithm, or the 2 or 3 letter ISO 639 codes for the + * language. Note that case is significant in this parameter - the + * value should be supplied in lower case. + * + * @param charenc The character encoding. NULL may be passed as + * this value, in which case UTF-8 encoding will be assumed. Otherwise, + * the argument may be one of "UTF_8", "ISO_8859_1" (i.e. Latin 1), + * "ISO_8859_2" (i.e. Latin 2) or "KOI8_R" (Russian). Note that case is + * significant in this parameter. + * + * @return NULL if the specified algorithm is not recognised, or the + * algorithm is not available for the requested encoding. Otherwise, + * returns a pointer to a newly created stemmer for the requested algorithm. + * The returned pointer must be deleted by calling sb_stemmer_delete(). + * + * @note NULL will also be returned if an out of memory error occurs. + */ +struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc); + +/** Delete a stemmer object. + * + * This frees all resources allocated for the stemmer. After calling + * this function, the supplied stemmer may no longer be used in any way. + * + * It is safe to pass a null pointer to this function - this will have + * no effect. + */ +void sb_stemmer_delete(struct sb_stemmer * stemmer); + +/** Stem a word. + * + * The return value is owned by the stemmer - it must not be freed or + * modified, and it will become invalid when the stemmer is called again, + * or if the stemmer is freed. + * + * The length of the return value can be obtained using sb_stemmer_length(). + * + * If an out-of-memory error occurs, this will return NULL. + */ +const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, + const sb_symbol * word, int size); + +/** Get the length of the result of the last stemmed word. + * This should not be called before sb_stemmer_stem() has been called. + */ +int sb_stemmer_length(struct sb_stemmer * stemmer); + +#ifdef __cplusplus +} +#endif |