path: root/src/libstat/tokenizers/tokenizers.c
Diffstat (limited to 'src/libstat/tokenizers/tokenizers.c')
-rw-r--r--  src/libstat/tokenizers/tokenizers.c  955
1 file changed, 955 insertions(+), 0 deletions(-)
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
new file mode 100644
index 0000000..ee7234d
--- /dev/null
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -0,0 +1,955 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Common tokenization functions
+ */
+
+#include "rspamd.h"
+#include "tokenizers.h"
+#include "stat_internal.h"
+#include "contrib/mumhash/mum.h"
+#include "libmime/lang_detection.h"
+#include "libstemmer.h"
+
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+#include <unicode/uiter.h>
+#include <unicode/ubrk.h>
+#include <unicode/ucnv.h>
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+#include <unicode/unorm2.h>
+#endif
+
+#include <math.h>
+
+typedef gboolean (*token_get_function)(rspamd_stat_token_t *buf, gchar const **pos,
+ rspamd_stat_token_t *token,
+ GList **exceptions, gsize *rl, gboolean check_signature);
+
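+/*
+ * Lookup table over all 256 byte values: a non-zero entry marks the byte as
+ * a word delimiter (ASCII whitespace and most punctuation); all other bytes
+ * are treated as part of a word by the raw tokenizer below.
+ */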
+const gchar t_delimiters[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+ 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
+ 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0};
+
+/* Get the next word from the specified rspamd_stat_token_t buffer */
+static gboolean
+rspamd_tokenizer_get_word_raw(rspamd_stat_token_t *buf,
+ gchar const **cur, rspamd_stat_token_t *token,
+ GList **exceptions, gsize *rl, gboolean unused)
+{
+ gsize remain, pos;
+ const gchar *p;
+ struct rspamd_process_exception *ex = NULL;
+
+ if (buf == NULL) {
+ return FALSE;
+ }
+
+ g_assert(cur != NULL);
+
+ if (exceptions != NULL && *exceptions != NULL) {
+ ex = (*exceptions)->data;
+ }
+
+ if (token->original.begin == NULL || *cur == NULL) {
+ if (ex != NULL) {
+ if (ex->pos == 0) {
+ token->original.begin = buf->original.begin + ex->len;
+ token->original.len = ex->len;
+ token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+ }
+ else {
+ token->original.begin = buf->original.begin;
+ token->original.len = 0;
+ }
+ }
+ else {
+ token->original.begin = buf->original.begin;
+ token->original.len = 0;
+ }
+ *cur = token->original.begin;
+ }
+
+ token->original.len = 0;
+
+ pos = *cur - buf->original.begin;
+ if (pos >= buf->original.len) {
+ return FALSE;
+ }
+
+ remain = buf->original.len - pos;
+ p = *cur;
+
+	/* Skip leading delimiter symbols */
+ do {
+ if (ex != NULL && ex->pos == pos) {
+ /* Go to the next exception */
+ *exceptions = g_list_next(*exceptions);
+ *cur = p + ex->len;
+ return TRUE;
+ }
+ pos++;
+ p++;
+ remain--;
+ } while (remain > 0 && t_delimiters[(guchar) *p]);
+
+ token->original.begin = p;
+
+ while (remain > 0 && !t_delimiters[(guchar) *p]) {
+ if (ex != NULL && ex->pos == pos) {
+ *exceptions = g_list_next(*exceptions);
+ *cur = p + ex->len;
+ return TRUE;
+ }
+ token->original.len++;
+ pos++;
+ remain--;
+ p++;
+ }
+
+ if (remain == 0) {
+ return FALSE;
+ }
+
+ if (rl) {
+ *rl = token->original.len;
+ }
+
+ token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
+
+ *cur = p;
+
+ return TRUE;
+}
+
+static inline gboolean
+rspamd_tokenize_check_limit(gboolean decay,
+ guint word_decay,
+ guint nwords,
+ guint64 *hv,
+ guint64 *prob,
+ const rspamd_stat_token_t *token,
+ gssize remain,
+ gssize total)
+{
+ static const gdouble avg_word_len = 6.0;
+
+ if (!decay) {
+ if (token->original.len >= sizeof(guint64)) {
+ guint64 tmp;
+ memcpy(&tmp, token->original.begin, sizeof(tmp));
+ *hv = mum_hash_step(*hv, tmp);
+ }
+
+ /* Check for decay */
+ if (word_decay > 0 && nwords > word_decay && remain < (gssize) total) {
+ /* Start decay */
+ gdouble decay_prob;
+
+ *hv = mum_hash_finish(*hv);
+
+			/* We assume that an average word is 6 symbols long */
+ decay_prob = (gdouble) word_decay / ((total - (remain)) / avg_word_len) * 10;
+ decay_prob = floor(decay_prob) / 10.0;
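+			/*
+			 * Example: with word_decay = 200 and 12000 bytes processed so
+			 * far, the estimated word count is 12000 / 6 = 2000, so
+			 * decay_prob = floor(200 / 2000 * 10) / 10 = 0.1, i.e. roughly
+			 * every tenth subsequent word is kept.
+			 */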
+
+ if (decay_prob >= 1.0) {
+ *prob = G_MAXUINT64;
+ }
+ else {
+ *prob = (guint64) (decay_prob * (double) G_MAXUINT64);
+ }
+
+ return TRUE;
+ }
+ }
+ else {
+ /* Decaying probability */
+ /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
+ *hv = (*hv) * 2862933555777941757ULL + 3037000493ULL;
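+		/*
+		 * The LCG state is used as a uniform 64-bit value: the word is
+		 * dropped when *hv > *prob, so each word survives with probability
+		 * of roughly prob / 2^64 as computed above.
+		 */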
+
+ if (*hv > *prob) {
+ return TRUE;
+ }
+ }
+
+ return FALSE;
+}
+
+static inline gboolean
+rspamd_utf_word_valid(const guchar *text, const guchar *end,
+ gint32 start, gint32 finish)
+{
+ const guchar *st = text + start, *fin = text + finish;
+ UChar32 c;
+
+ if (st >= end || fin > end || st >= fin) {
+ return FALSE;
+ }
+
+ U8_NEXT(text, start, finish, c);
+
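+	/*
+	 * u_isJavaIDPart() is true for letters, digits, combining marks,
+	 * currency symbols and connector punctuation, so a word is considered
+	 * valid if its first code point is identifier-like.
+	 */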
+ if (u_isJavaIDPart(c)) {
+ return TRUE;
+ }
+
+ return FALSE;
+}
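+
+/* Advance the exceptions list: step `cur` forward and refresh `ex` (NULL at the end) */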
+#define SHIFT_EX \
+ do { \
+ cur = g_list_next(cur); \
+ if (cur) { \
+ ex = (struct rspamd_process_exception *) cur->data; \
+ } \
+ else { \
+ ex = NULL; \
+ } \
+ } while (0)
+
+static inline void
+rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res)
+{
+ rspamd_stat_token_t token;
+
+ memset(&token, 0, sizeof(token));
+
+ if (ex->type == RSPAMD_EXCEPTION_GENERIC) {
+ token.original.begin = "!!EX!!";
+ token.original.len = sizeof("!!EX!!") - 1;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+
+ g_array_append_val(res, token);
+ token.flags = 0;
+ }
+ else if (ex->type == RSPAMD_EXCEPTION_URL) {
+ struct rspamd_url *uri;
+
+ uri = ex->ptr;
+
+ if (uri && uri->tldlen > 0) {
+ token.original.begin = rspamd_url_tld_unsafe(uri);
+ token.original.len = uri->tldlen;
+ }
+ else {
+ token.original.begin = "!!EX!!";
+ token.original.len = sizeof("!!EX!!") - 1;
+ }
+
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+ g_array_append_val(res, token);
+ token.flags = 0;
+ }
+}
+
+
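+/*
+ * Split `text` of `len` bytes into words, appending them to `cur_words` if
+ * it is non-NULL or to a freshly allocated GArray otherwise.  In UTF mode a
+ * pre-opened `utxt` is segmented with an ICU word break iterator, while raw
+ * mode falls back to the t_delimiters table above.  `exceptions` marks byte
+ * ranges (URLs etc.) that are replaced by placeholder tokens, and `hash`,
+ * if non-NULL, receives a mum_hash digest of the tokenized words.
+ */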
+GArray *
+rspamd_tokenize_text(const gchar *text, gsize len,
+ const UText *utxt,
+ enum rspamd_tokenize_type how,
+ struct rspamd_config *cfg,
+ GList *exceptions,
+ guint64 *hash,
+ GArray *cur_words,
+ rspamd_mempool_t *pool)
+{
+ rspamd_stat_token_t token, buf;
+ const gchar *pos = NULL;
+ gsize l = 0;
+ GArray *res;
+ GList *cur = exceptions;
+ guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
+ guint64 hv = 0;
+ gboolean decay = FALSE, long_text_mode = FALSE;
+ guint64 prob = 0;
+ static UBreakIterator *bi = NULL;
+ static const gsize long_text_limit = 1 * 1024 * 1024;
+ static const ev_tstamp max_exec_time = 0.2; /* 200 ms */
+ ev_tstamp start;
+
+ if (text == NULL) {
+ return cur_words;
+ }
+
+ if (len > long_text_limit) {
+ /*
+ * In this mode we do additional checks to avoid performance issues
+ */
+ long_text_mode = TRUE;
+ start = ev_time();
+ }
+
+ buf.original.begin = text;
+ buf.original.len = len;
+ buf.flags = 0;
+
+ memset(&token, 0, sizeof(token));
+
+ if (cfg != NULL) {
+ min_len = cfg->min_word_len;
+ max_len = cfg->max_word_len;
+ word_decay = cfg->words_decay;
+ initial_size = word_decay * 2;
+ }
+
+ if (!cur_words) {
+ res = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t),
+ initial_size);
+ }
+ else {
+ res = cur_words;
+ }
+
+ if (G_UNLIKELY(how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
+ while (rspamd_tokenizer_get_word_raw(&buf, &pos, &token, &cur, &l, FALSE)) {
+ if (l == 0 || (min_len > 0 && l < min_len) ||
+ (max_len > 0 && l > max_len)) {
+ token.original.begin = pos;
+ continue;
+ }
+
+ if (token.original.len > 0 &&
+ rspamd_tokenize_check_limit(decay, word_decay, res->len,
+ &hv, &prob, &token, pos - text, len)) {
+ if (!decay) {
+ decay = TRUE;
+ }
+ else {
+ token.original.begin = pos;
+ continue;
+ }
+ }
+
+ if (long_text_mode) {
+ if ((res->len + 1) % 16 == 0) {
+ ev_tstamp now = ev_time();
+
+ if (now - start > max_exec_time) {
+ msg_warn_pool_check(
+						"too much time has been spent on tokenization:"
+ " %.1f ms, limit is %.1f ms; %d words added so far",
+ (now - start) * 1e3, max_exec_time * 1e3,
+ res->len);
+
+ goto end;
+ }
+ }
+ }
+
+ g_array_append_val(res, token);
+
+ if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) {
+				/* Due to a bug in glib with large arrays */
+ msg_err_pool_check(
+					"too many words found: %d, stopping tokenization to avoid DoS",
+ res->len);
+
+ goto end;
+ }
+
+ token.original.begin = pos;
+ }
+ }
+ else {
+ /* UTF8 boundaries */
+ UErrorCode uc_err = U_ZERO_ERROR;
+ int32_t last, p;
+ struct rspamd_process_exception *ex = NULL;
+
+ if (bi == NULL) {
+ bi = ubrk_open(UBRK_WORD, NULL, NULL, 0, &uc_err);
+
+ g_assert(U_SUCCESS(uc_err));
+ }
+
+ ubrk_setUText(bi, (UText *) utxt, &uc_err);
+ last = ubrk_first(bi);
+ p = last;
+
+ if (cur) {
+ ex = (struct rspamd_process_exception *) cur->data;
+ }
+
+ while (p != UBRK_DONE) {
+ start_over:
+ token.original.len = 0;
+
+ if (p > last) {
+ if (ex && cur) {
+ /* Check exception */
+ if (ex->pos >= last && ex->pos <= p) {
+ /* We have an exception within boundary */
+ /* First, start to drain exceptions from the start */
+ while (cur && ex->pos <= last) {
+ /* We have an exception at the beginning, skip those */
+ last += ex->len;
+ rspamd_tokenize_exception(ex, res);
+
+ if (last > p) {
+ /* Exception spread over the boundaries */
+ while (last > p && p != UBRK_DONE) {
+ gint32 old_p = p;
+ p = ubrk_next(bi);
+
+ if (p != UBRK_DONE && p <= old_p) {
+ msg_warn_pool_check(
+									"tokenization moved backwards to position %d"
+									" from %d (%d backward), likely a libicu bug!",
+ (gint) (p), (gint) (old_p), old_p - p);
+
+ goto end;
+ }
+ }
+
+ /* We need to reset our scan with new p and last */
+ SHIFT_EX;
+ goto start_over;
+ }
+
+ SHIFT_EX;
+ }
+
+ /* Now, we can have an exception within boundary again */
+ if (cur && ex->pos >= last && ex->pos <= p) {
+ /* Append the first part */
+ if (rspamd_utf_word_valid(text, text + len, last,
+ ex->pos)) {
+ token.original.begin = text + last;
+ token.original.len = ex->pos - last;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
+ RSPAMD_STAT_TOKEN_FLAG_UTF;
+ }
+
+ /* Process the current exception */
+ last += ex->len + (ex->pos - last);
+
+ rspamd_tokenize_exception(ex, res);
+
+ if (last > p) {
+ /* Exception spread over the boundaries */
+ while (last > p && p != UBRK_DONE) {
+ gint32 old_p = p;
+ p = ubrk_next(bi);
+ if (p != UBRK_DONE && p <= old_p) {
+ msg_warn_pool_check(
+									"tokenization moved backwards to position %d"
+									" from %d (%d backward), likely a libicu bug!",
+ (gint) (p), (gint) (old_p), old_p - p);
+
+ goto end;
+ }
+ }
+ /* We need to reset our scan with new p and last */
+ SHIFT_EX;
+ goto start_over;
+ }
+
+ SHIFT_EX;
+ }
+ else if (p > last) {
+ if (rspamd_utf_word_valid(text, text + len, last, p)) {
+ token.original.begin = text + last;
+ token.original.len = p - last;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
+ RSPAMD_STAT_TOKEN_FLAG_UTF;
+ }
+ }
+ }
+ else if (ex->pos < last) {
+ /* Forward exceptions list */
+ while (cur && ex->pos <= last) {
+ /* We have an exception at the beginning, skip those */
+ SHIFT_EX;
+ }
+
+ if (rspamd_utf_word_valid(text, text + len, last, p)) {
+ token.original.begin = text + last;
+ token.original.len = p - last;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
+ RSPAMD_STAT_TOKEN_FLAG_UTF;
+ }
+ }
+ else {
+ /* No exceptions within boundary */
+ if (rspamd_utf_word_valid(text, text + len, last, p)) {
+ token.original.begin = text + last;
+ token.original.len = p - last;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
+ RSPAMD_STAT_TOKEN_FLAG_UTF;
+ }
+ }
+ }
+ else {
+ if (rspamd_utf_word_valid(text, text + len, last, p)) {
+ token.original.begin = text + last;
+ token.original.len = p - last;
+ token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
+ RSPAMD_STAT_TOKEN_FLAG_UTF;
+ }
+ }
+
+ if (token.original.len > 0 &&
+ rspamd_tokenize_check_limit(decay, word_decay, res->len,
+ &hv, &prob, &token, p, len)) {
+ if (!decay) {
+ decay = TRUE;
+ }
+ else {
+ token.flags |= RSPAMD_STAT_TOKEN_FLAG_SKIPPED;
+ }
+ }
+ }
+
+ if (token.original.len > 0) {
+ /* Additional check for number of words */
+ if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) {
+				/* Due to a bug in glib with large arrays */
+				msg_err("too many words found: %d, stopping tokenization to avoid DoS",
+ res->len);
+
+ goto end;
+ }
+
+ g_array_append_val(res, token);
+ }
+
+ /* Also check for long text mode */
+ if (long_text_mode) {
+ /* Check time each 128 words added */
+ const int words_check_mask = 0x7F;
+
+ if ((res->len & words_check_mask) == words_check_mask) {
+ ev_tstamp now = ev_time();
+
+ if (now - start > max_exec_time) {
+ msg_warn_pool_check(
+						"too much time has been spent on tokenization:"
+ " %.1f ms, limit is %.1f ms; %d words added so far",
+ (now - start) * 1e3, max_exec_time * 1e3,
+ res->len);
+
+ goto end;
+ }
+ }
+ }
+
+ last = p;
+ p = ubrk_next(bi);
+
+ if (p != UBRK_DONE && p <= last) {
+			msg_warn_pool_check("tokenization moved backwards to position %d"
+								" from %d (%d backward), likely a libicu bug!",
+ (gint) (p), (gint) (last), last - p);
+
+ goto end;
+ }
+ }
+ }
+
+end:
+ if (!decay) {
+ hv = mum_hash_finish(hv);
+ }
+
+ if (hash) {
+ *hash = hv;
+ }
+
+ return res;
+}
+
+#undef SHIFT_EX
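+
+/*
+ * Illustrative use (a sketch, not a canonical example from this codebase):
+ * tokenize a plain ASCII buffer in raw mode, assuming `task` is a valid
+ * rspamd task providing the config and the memory pool:
+ *
+ *   guint64 h = 0;
+ *   GArray *words = rspamd_tokenize_text(buf, buflen, NULL,
+ *       RSPAMD_TOKENIZE_RAW, task->cfg, NULL, &h, NULL, task->task_pool);
+ *   (words now holds rspamd_stat_token_t entries pointing into buf)
+ */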
+
+static void
+rspamd_add_metawords_from_str(const gchar *beg, gsize len,
+ struct rspamd_task *task)
+{
+ UText utxt = UTEXT_INITIALIZER;
+ UErrorCode uc_err = U_ZERO_ERROR;
+ guint i = 0;
+ UChar32 uc;
+ gboolean valid_utf = TRUE;
+
+ while (i < len) {
+ U8_NEXT(beg, i, len, uc);
+
+ if (((gint32) uc) < 0) {
+ valid_utf = FALSE;
+ break;
+ }
+
+#if U_ICU_VERSION_MAJOR_NUM < 50
+ if (u_isalpha(uc)) {
+ gint32 sc = ublock_getCode(uc);
+
+ if (sc == UBLOCK_THAI) {
+ valid_utf = FALSE;
+					msg_info_task("enabling workaround for Thai characters on old libicu");
+ break;
+ }
+ }
+#endif
+ }
+
+ if (valid_utf) {
+ utext_openUTF8(&utxt,
+ beg,
+ len,
+ &uc_err);
+
+ task->meta_words = rspamd_tokenize_text(beg, len,
+ &utxt, RSPAMD_TOKENIZE_UTF,
+ task->cfg, NULL, NULL,
+ task->meta_words,
+ task->task_pool);
+
+ utext_close(&utxt);
+ }
+ else {
+ task->meta_words = rspamd_tokenize_text(beg, len,
+ NULL, RSPAMD_TOKENIZE_RAW,
+ task->cfg, NULL, NULL, task->meta_words,
+ task->task_pool);
+ }
+}
+
+void rspamd_tokenize_meta_words(struct rspamd_task *task)
+{
+ guint i = 0;
+ rspamd_stat_token_t *tok;
+
+ if (MESSAGE_FIELD(task, subject)) {
+ rspamd_add_metawords_from_str(MESSAGE_FIELD(task, subject),
+ strlen(MESSAGE_FIELD(task, subject)), task);
+ }
+
+ if (MESSAGE_FIELD(task, from_mime) && MESSAGE_FIELD(task, from_mime)->len > 0) {
+ struct rspamd_email_address *addr;
+
+ addr = g_ptr_array_index(MESSAGE_FIELD(task, from_mime), 0);
+
+ if (addr->name) {
+ rspamd_add_metawords_from_str(addr->name, strlen(addr->name), task);
+ }
+ }
+
+ if (task->meta_words != NULL) {
+ const gchar *language = NULL;
+
+ if (MESSAGE_FIELD(task, text_parts) &&
+ MESSAGE_FIELD(task, text_parts)->len > 0) {
+ struct rspamd_mime_text_part *tp = g_ptr_array_index(
+ MESSAGE_FIELD(task, text_parts), 0);
+
+ if (tp->language) {
+ language = tp->language;
+ }
+ }
+
+ rspamd_normalize_words(task->meta_words, task->task_pool);
+ rspamd_stem_words(task->meta_words, task->task_pool, language,
+ task->lang_det);
+
+ for (i = 0; i < task->meta_words->len; i++) {
+ tok = &g_array_index(task->meta_words, rspamd_stat_token_t, i);
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER;
+ }
+ }
+}
+
+static inline void
+rspamd_uchars_to_ucs32(const UChar *src, gsize srclen,
+ rspamd_stat_token_t *tok,
+ rspamd_mempool_t *pool)
+{
+ UChar32 *dest, t, *d;
+ gint32 i = 0;
+
+ dest = rspamd_mempool_alloc(pool, srclen * sizeof(UChar32));
+ d = dest;
+
+ while (i < srclen) {
+ U16_NEXT_UNSAFE(src, i, t);
+
+ if (u_isgraph(t)) {
+ UCharCategory cat;
+
+ cat = u_charType(t);
+#if U_ICU_VERSION_MAJOR_NUM >= 57
+ if (u_hasBinaryProperty(t, UCHAR_EMOJI)) {
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_EMOJI;
+ }
+#endif
+
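+			/*
+			 * The category range covers letters, combining marks and
+			 * numbers; together with the explicit cases below this keeps
+			 * word-like code points and folds them to lower case.
+			 */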
+ if ((cat >= U_UPPERCASE_LETTER && cat <= U_OTHER_NUMBER) ||
+ cat == U_CONNECTOR_PUNCTUATION ||
+ cat == U_MATH_SYMBOL ||
+ cat == U_CURRENCY_SYMBOL) {
+ *d++ = u_tolower(t);
+ }
+ }
+ else {
+			/* Non-graphic character: treat as an invisible space */
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
+ }
+ }
+
+ tok->unicode.begin = dest;
+ tok->unicode.len = d - dest;
+}
+
+static inline void
+rspamd_ucs32_to_normalised(rspamd_stat_token_t *tok,
+ rspamd_mempool_t *pool)
+{
+ guint i, doff = 0;
+ gsize utflen = 0;
+ gchar *dest;
+ UChar32 t;
+
+ for (i = 0; i < tok->unicode.len; i++) {
+ utflen += U8_LENGTH(tok->unicode.begin[i]);
+ }
+
+ dest = rspamd_mempool_alloc(pool, utflen + 1);
+
+ for (i = 0; i < tok->unicode.len; i++) {
+ t = tok->unicode.begin[i];
+ U8_APPEND_UNSAFE(dest, doff, t);
+ }
+
+ g_assert(doff <= utflen);
+ dest[doff] = '\0';
+
+ tok->normalized.len = doff;
+ tok->normalized.begin = dest;
+}
+
+void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool)
+{
+ UErrorCode uc_err = U_ZERO_ERROR;
+ UConverter *utf8_converter;
+	UChar tmpbuf[1024]; /* Assume that no word is longer than this buffer */
+ gsize ulen;
+
+ utf8_converter = rspamd_get_utf8_converter();
+
+ if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
+ ulen = ucnv_toUChars(utf8_converter,
+ tmpbuf,
+ G_N_ELEMENTS(tmpbuf),
+ tok->original.begin,
+ tok->original.len,
+ &uc_err);
+
+		/* Now check whether the word needs to be normalised */
+ if (!U_SUCCESS(uc_err)) {
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
+ tok->unicode.begin = NULL;
+ tok->unicode.len = 0;
+ tok->normalized.begin = NULL;
+ tok->normalized.len = 0;
+ }
+ else {
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+ const UNormalizer2 *norm = rspamd_get_unicode_normalizer();
+ gint32 end;
+
+ /* We can now check if we need to decompose */
+ end = unorm2_spanQuickCheckYes(norm, tmpbuf, ulen, &uc_err);
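+			/*
+			 * unorm2_spanQuickCheckYes() returns the length of the prefix
+			 * that is already in the normalised form; if it spans the
+			 * whole string, no further normalisation pass is needed.
+			 */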
+
+ if (!U_SUCCESS(uc_err)) {
+ rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool);
+ tok->normalized.begin = NULL;
+ tok->normalized.len = 0;
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
+ }
+ else {
+ if (end == ulen) {
+ /* Already normalised, just lowercase */
+ rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool);
+ rspamd_ucs32_to_normalised(tok, pool);
+ }
+ else {
+ /* Perform normalization */
+ UChar normbuf[1024];
+
+ g_assert(end < G_N_ELEMENTS(normbuf));
+ /* First part */
+ memcpy(normbuf, tmpbuf, end * sizeof(UChar));
+ /* Second part */
+ ulen = unorm2_normalizeSecondAndAppend(norm,
+ normbuf, end,
+ G_N_ELEMENTS(normbuf),
+ tmpbuf + end,
+ ulen - end,
+ &uc_err);
+
+ if (!U_SUCCESS(uc_err)) {
+ if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
+ msg_warn_pool_check("cannot normalise text '%*s': %s",
+ (gint) tok->original.len, tok->original.begin,
+ u_errorName(uc_err));
+ rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool);
+ rspamd_ucs32_to_normalised(tok, pool);
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
+ }
+ }
+ else {
+ /* Copy normalised back */
+ rspamd_uchars_to_ucs32(normbuf, ulen, tok, pool);
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED;
+ rspamd_ucs32_to_normalised(tok, pool);
+ }
+ }
+ }
+#else
+ /* Legacy version with no unorm2 interface */
+ rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool);
+ rspamd_ucs32_to_normalised(tok, pool);
+#endif
+ }
+ }
+ else {
+ if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+ /* Simple lowercase */
+ gchar *dest;
+
+ dest = rspamd_mempool_alloc(pool, tok->original.len + 1);
+ rspamd_strlcpy(dest, tok->original.begin, tok->original.len + 1);
+ rspamd_str_lc(dest, tok->original.len);
+ tok->normalized.len = tok->original.len;
+ tok->normalized.begin = dest;
+ }
+ }
+}
+
+void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool)
+{
+ rspamd_stat_token_t *tok;
+ guint i;
+
+ for (i = 0; i < words->len; i++) {
+ tok = &g_array_index(words, rspamd_stat_token_t, i);
+ rspamd_normalize_single_word(tok, pool);
+ }
+}
+
+void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
+ const gchar *language,
+ struct rspamd_lang_detector *lang_detector)
+{
+ static GHashTable *stemmers = NULL;
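+	/* Per-language cache of sb_stemmer instances; GINT_TO_POINTER(-1) is a
+	 * negative-cache marker for languages without a stemmer */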
+ struct sb_stemmer *stem = NULL;
+ guint i;
+ rspamd_stat_token_t *tok;
+ gchar *dest;
+ gsize dlen;
+
+ if (!stemmers) {
+ stemmers = g_hash_table_new(rspamd_strcase_hash,
+ rspamd_strcase_equal);
+ }
+
+ if (language && language[0] != '\0') {
+ stem = g_hash_table_lookup(stemmers, language);
+
+ if (stem == NULL) {
+ stem = sb_stemmer_new(language, "UTF_8");
+
+ if (stem == NULL) {
+ msg_debug_pool(
+					"cannot create stemmer for language %s",
+ language);
+ g_hash_table_insert(stemmers, g_strdup(language),
+ GINT_TO_POINTER(-1));
+ }
+ else {
+ g_hash_table_insert(stemmers, g_strdup(language),
+ stem);
+ }
+ }
+ else if (stem == GINT_TO_POINTER(-1)) {
+ /* Negative cache */
+ stem = NULL;
+ }
+ }
+ for (i = 0; i < words->len; i++) {
+ tok = &g_array_index(words, rspamd_stat_token_t, i);
+
+ if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
+ if (stem) {
+ const gchar *stemmed = NULL;
+
+ stemmed = sb_stemmer_stem(stem,
+ tok->normalized.begin, tok->normalized.len);
+
+ dlen = sb_stemmer_length(stem);
+
+ if (stemmed != NULL && dlen > 0) {
+ dest = rspamd_mempool_alloc(pool, dlen);
+ memcpy(dest, stemmed, dlen);
+ tok->stemmed.len = dlen;
+ tok->stemmed.begin = dest;
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED;
+ }
+ else {
+ /* Fallback */
+ tok->stemmed.len = tok->normalized.len;
+ tok->stemmed.begin = tok->normalized.begin;
+ }
+ }
+ else {
+ tok->stemmed.len = tok->normalized.len;
+ tok->stemmed.begin = tok->normalized.begin;
+ }
+
+ if (tok->stemmed.len > 0 && lang_detector != NULL &&
+ rspamd_language_detector_is_stop_word(lang_detector, tok->stemmed.begin, tok->stemmed.len)) {
+ tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD;
+ }
+ }
+ else {
+ if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+ /* Raw text, lowercase */
+ tok->stemmed.len = tok->normalized.len;
+ tok->stemmed.begin = tok->normalized.begin;
+ }
+ }
+ }
+}
\ No newline at end of file