summaryrefslogtreecommitdiffstats
path: root/src/libstat/tokenizers/tokenizers.h
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/libstat/tokenizers/tokenizers.h100
1 files changed, 100 insertions, 0 deletions
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
new file mode 100644
index 0000000..d696364
--- /dev/null
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TOKENIZERS_H
+#define TOKENIZERS_H
+
+#include "config.h"
+#include "mem_pool.h"
+#include "fstring.h"
+#include "rspamd.h"
+#include "stat_api.h"
+
+#include <unicode/utext.h>
+
+#define RSPAMD_DEFAULT_TOKENIZER "osb"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct rspamd_tokenizer_runtime;
+struct rspamd_stat_ctx;
+
+/* Common tokenizer structure */
+struct rspamd_stat_tokenizer {
+ gchar *name;
+
+ gpointer (*get_config)(rspamd_mempool_t *pool,
+ struct rspamd_tokenizer_config *cf, gsize *len);
+
+ gint (*tokenize_func)(struct rspamd_stat_ctx *ctx,
+ struct rspamd_task *task,
+ GArray *words,
+ gboolean is_utf,
+ const gchar *prefix,
+ GPtrArray *result);
+};
+
+enum rspamd_tokenize_type {
+ RSPAMD_TOKENIZE_UTF = 0,
+ RSPAMD_TOKENIZE_RAW,
+ RSPAMD_TOKENIZE_UNICODE
+};
+
+/* Compare two token nodes */
+gint token_node_compare_func(gconstpointer a, gconstpointer b);
+
+
+/* Tokenize text into array of words (rspamd_stat_token_t type) */
+GArray *rspamd_tokenize_text(const gchar *text, gsize len,
+ const UText *utxt,
+ enum rspamd_tokenize_type how,
+ struct rspamd_config *cfg,
+ GList *exceptions,
+ guint64 *hash,
+ GArray *cur_words,
+ rspamd_mempool_t *pool);
+
+/* OSB tokenize function */
+gint rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
+ struct rspamd_task *task,
+ GArray *words,
+ gboolean is_utf,
+ const gchar *prefix,
+ GPtrArray *result);
+
+gpointer rspamd_tokenizer_osb_get_config(rspamd_mempool_t *pool,
+ struct rspamd_tokenizer_config *cf,
+ gsize *len);
+
+struct rspamd_lang_detector;
+
+void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool);
+
+void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool);
+
+void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
+ const gchar *language,
+ struct rspamd_lang_detector *lang_detector);
+
+void rspamd_tokenize_meta_words(struct rspamd_task *task);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif