author:    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-10 21:30:40 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-10 21:30:40 +0000
commit:    133a45c109da5310add55824db21af5239951f93 (patch)
tree:      ba6ac4c0a950a0dda56451944315d66409923918 /src/libstat/tokenizers
parent:    Initial commit. (diff)
download:  rspamd-133a45c109da5310add55824db21af5239951f93.tar.xz
           rspamd-133a45c109da5310add55824db21af5239951f93.zip
Adding upstream version 3.8.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- | src/libstat/tokenizers/osb.c        | 424
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 955
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 100
3 files changed, 1479 insertions(+), 0 deletions(-)
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
new file mode 100644
index 0000000..d871c7a
--- /dev/null
+++ b/src/libstat/tokenizers/osb.c
@@ -0,0 +1,424 @@
/*-
 * Copyright 2016 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * OSB tokenizer
 */


#include "tokenizers.h"
#include "stat_internal.h"
#include "libmime/lang_detection.h"

/* Size for features pipe */
#define DEFAULT_FEATURE_WINDOW_SIZE 5
#define DEFAULT_OSB_VERSION 2

static const int primes[] = {
    1,
    7,
    3,
    13,
    5,
    29,
    11,
    51,
    23,
    101,
    47,
    203,
    97,
    407,
    197,
    817,
    397,
    1637,
    797,
    3277,
};

static const guchar osb_tokenizer_magic[] = {'o', 's', 'b', 't', 'o', 'k', 'v', '2'};

enum rspamd_osb_hash_type {
    RSPAMD_OSB_HASH_COMPAT = 0,
    RSPAMD_OSB_HASH_XXHASH,
    RSPAMD_OSB_HASH_SIPHASH
};

struct rspamd_osb_tokenizer_config {
    guchar magic[8];
    gshort version;
    gshort window_size;
    enum rspamd_osb_hash_type ht;
    guint64 seed;
    rspamd_sipkey_t sk;
};

/*
 * Return default config
 */
static struct rspamd_osb_tokenizer_config *
rspamd_tokenizer_osb_default_config(void)
{
    static struct rspamd_osb_tokenizer_config def;

    if (memcmp(def.magic, osb_tokenizer_magic, sizeof(osb_tokenizer_magic)) != 0) {
        memset(&def, 0, sizeof(def));
        memcpy(def.magic, osb_tokenizer_magic, sizeof(osb_tokenizer_magic));
        def.version = DEFAULT_OSB_VERSION;
        def.window_size = DEFAULT_FEATURE_WINDOW_SIZE;
        def.ht = RSPAMD_OSB_HASH_XXHASH;
        def.seed = 0xdeadbabe;
    }

    return &def;
}

static struct rspamd_osb_tokenizer_config *
rspamd_tokenizer_osb_config_from_ucl(rspamd_mempool_t *pool,
                                     const ucl_object_t *obj)
{
    const ucl_object_t *elt;
    struct rspamd_osb_tokenizer_config *cf, *def;
    guchar *key = NULL;
    gsize keylen;


    if (pool != NULL) {
        cf = rspamd_mempool_alloc0(pool, sizeof(*cf));
    }
    else {
        cf = g_malloc0(sizeof(*cf));
    }

    /* Use default config */
    def = rspamd_tokenizer_osb_default_config();
    memcpy(cf, def, sizeof(*cf));

    elt = ucl_object_lookup(obj, "hash");
    if (elt != NULL && ucl_object_type(elt) == UCL_STRING) {
        if (g_ascii_strncasecmp(ucl_object_tostring(elt), "xxh", 3) == 0) {
            cf->ht = RSPAMD_OSB_HASH_XXHASH;
            elt = ucl_object_lookup(obj, "seed");
            if (elt != NULL && ucl_object_type(elt) == UCL_INT) {
                cf->seed = ucl_object_toint(elt);
            }
        }
        else if (g_ascii_strncasecmp(ucl_object_tostring(elt), "sip", 3) == 0) {
            cf->ht = RSPAMD_OSB_HASH_SIPHASH;
            elt = ucl_object_lookup(obj, "key");

            if (elt != NULL && ucl_object_type(elt) == UCL_STRING) {
                key = rspamd_decode_base32(ucl_object_tostring(elt),
                                           0, &keylen, RSPAMD_BASE32_DEFAULT);
                if (keylen < sizeof(rspamd_sipkey_t)) {
                    msg_warn("siphash key is too short: %z", keylen);
                    g_free(key);
                }
                else {
                    memcpy(cf->sk, key, sizeof(cf->sk));
                    g_free(key);
                }
            }
            else {
                msg_warn_pool("siphash cannot be used without key");
            }
        }
    }
    else {
        elt = ucl_object_lookup(obj, "compat");
        if (elt != NULL && ucl_object_toboolean(elt)) {
            cf->ht = RSPAMD_OSB_HASH_COMPAT;
        }
    }

    elt = ucl_object_lookup(obj, "window");
    if (elt != NULL && ucl_object_type(elt) == UCL_INT) {
        cf->window_size = ucl_object_toint(elt);
        if (cf->window_size > DEFAULT_FEATURE_WINDOW_SIZE * 4) {
            msg_err_pool("too large window size: %d", cf->window_size);
            cf->window_size = DEFAULT_FEATURE_WINDOW_SIZE;
        }
    }

    return cf;
}

gpointer
rspamd_tokenizer_osb_get_config(rspamd_mempool_t *pool,
                                struct rspamd_tokenizer_config *cf,
                                gsize *len)
{
    struct rspamd_osb_tokenizer_config *osb_cf, *def;

    if (cf != NULL && cf->opts != NULL) {
        osb_cf = rspamd_tokenizer_osb_config_from_ucl(pool, cf->opts);
    }
    else {
        def = rspamd_tokenizer_osb_default_config();
        osb_cf = rspamd_mempool_alloc(pool, sizeof(*osb_cf));
        memcpy(osb_cf, def, sizeof(*osb_cf));
        /* Do not write sipkey to statfile */
    }

    if (osb_cf->ht == RSPAMD_OSB_HASH_SIPHASH) {
        msg_info_pool("siphash key is not stored into statfiles, so you'd "
                      "need to keep it inside the configuration");
    }

    memset(osb_cf->sk, 0, sizeof(osb_cf->sk));

    if (len != NULL) {
        *len = sizeof(*osb_cf);
    }

    return osb_cf;
}

#if 0
gboolean
rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
        gpointer ptr, gsize len)
{
    struct rspamd_osb_tokenizer_config *osb_cf, *test_cf;
    gboolean ret = FALSE;

    test_cf = rt->config;
    g_assert (test_cf != NULL);

    if (len == sizeof (*osb_cf)) {
        osb_cf = ptr;

        if (memcmp (osb_cf, osb_tokenizer_magic, sizeof (osb_tokenizer_magic)) != 0) {
            ret = test_cf->ht == RSPAMD_OSB_HASH_COMPAT;
        }
        else {
            if (osb_cf->version == DEFAULT_OSB_VERSION) {
                /* We can compare them directly now */
                ret = (memcmp (osb_cf, test_cf, sizeof (*osb_cf)
                        - sizeof (osb_cf->sk))) == 0;
            }
        }
    }
    else {
        /* We are compatible now merely with fallback config */
        if (test_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
            ret = TRUE;
        }
    }

    return ret;
}

gboolean
rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool,
        struct rspamd_tokenizer_runtime *rt,
        gpointer ptr, gsize len)
{
    struct rspamd_osb_tokenizer_config *osb_cf;

    if (ptr == NULL || len == 0) {
        osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, rt->tkcf->opts);

        if (osb_cf->ht != RSPAMD_OSB_HASH_COMPAT) {
            /* Trying to load incompatible configuration */
            msg_err_pool ("cannot load tokenizer configuration from a legacy "
                    "statfile; maybe you have forgotten to set 'compat' option"
                    " in the tokenizer configuration");

            return FALSE;
        }
    }
    else {
        g_assert (len == sizeof (*osb_cf));
        osb_cf = ptr;
    }

    rt->config = osb_cf;
    rt->conf_len = sizeof (*osb_cf);

    return TRUE;
}

gboolean
rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt)
{
    struct rspamd_osb_tokenizer_config *osb_cf = rt->config;

    return (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT);
}
#endif

struct token_pipe_entry {
    guint64 h;
    rspamd_stat_token_t *t;
};

gint rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
                          struct rspamd_task *task,
                          GArray *words,
                          gboolean is_utf,
                          const gchar *prefix,
                          GPtrArray *result)
{
    rspamd_token_t *new_tok = NULL;
    rspamd_stat_token_t *token;
    struct rspamd_osb_tokenizer_config *osb_cf;
    guint64 cur, seed;
    struct token_pipe_entry *hashpipe;
    guint32 h1, h2;
    gsize token_size;
    guint processed = 0, i, w, window_size, token_flags = 0;

    if (words == NULL) {
        return FALSE;
    }

    osb_cf = ctx->tkcf;
    window_size = osb_cf->window_size;

    if (prefix) {
        seed = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64,
                                                   prefix, strlen(prefix), osb_cf->seed);
    }
    else {
        seed = osb_cf->seed;
    }

    hashpipe = g_alloca(window_size * sizeof(hashpipe[0]));
    for (i = 0; i < window_size; i++) {
        hashpipe[i].h = 0xfe;
        hashpipe[i].t = NULL;
    }

    token_size = sizeof(rspamd_token_t) +
                 sizeof(gdouble) * ctx->statfiles->len;
    g_assert(token_size > 0);

    for (w = 0; w < words->len; w++) {
        token = &g_array_index(words, rspamd_stat_token_t, w);
        token_flags = token->flags;
        const gchar *begin;
        gsize len;

        if (token->flags &
            (RSPAMD_STAT_TOKEN_FLAG_STOP_WORD | RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) {
            /* Skip stop/skipped words */
            continue;
        }

        if (token->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
            begin = token->stemmed.begin;
            len = token->stemmed.len;
        }
        else {
            begin = token->original.begin;
            len = token->original.len;
        }

        if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
            rspamd_ftok_t ftok;

            ftok.begin = begin;
            ftok.len = len;
            cur = rspamd_fstrhash_lc(&ftok, is_utf);
        }
        else {
            /* We know that the words are normalized */
            if (osb_cf->ht == RSPAMD_OSB_HASH_XXHASH) {
                cur = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64,
                                                          begin, len, osb_cf->seed);
            }
            else {
                rspamd_cryptobox_siphash((guchar *) &cur, begin,
                                         len, osb_cf->sk);

                if (prefix) {
                    cur ^= seed;
                }
            }
        }

        if (token_flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) {
            new_tok = rspamd_mempool_alloc0(task->task_pool, token_size);
            new_tok->flags = token_flags;
            new_tok->t1 = token;
            new_tok->t2 = token;
            new_tok->data = cur;
            new_tok->window_idx = 0;
            g_ptr_array_add(result, new_tok);

            continue;
        }

#define ADD_TOKEN                                                                       \
    do {                                                                                \
        new_tok = rspamd_mempool_alloc0(task->task_pool, token_size);                   \
        new_tok->flags = token_flags;                                                   \
        new_tok->t1 = hashpipe[0].t;                                                    \
        new_tok->t2 = hashpipe[i].t;                                                    \
        if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {                                     \
            h1 = ((guint32) hashpipe[0].h) * primes[0] +                                \
                 ((guint32) hashpipe[i].h) * primes[i << 1];                            \
            h2 = ((guint32) hashpipe[0].h) * primes[1] +                                \
                 ((guint32) hashpipe[i].h) * primes[(i << 1) - 1];                      \
            memcpy((guchar *) &new_tok->data, &h1, sizeof(h1));                         \
            memcpy(((guchar *) &new_tok->data) + sizeof(h1), &h2, sizeof(h2));          \
        }                                                                               \
        else {                                                                          \
            new_tok->data = hashpipe[0].h * primes[0] + hashpipe[i].h * primes[i << 1]; \
        }                                                                               \
        new_tok->window_idx = i;                                                        \
        g_ptr_array_add(result, new_tok);                                               \
    } while (0)

        if (processed < window_size) {
            /* Just fill a hashpipe */
            ++processed;
            hashpipe[window_size - processed].h = cur;
            hashpipe[window_size - processed].t = token;
        }
        else {
            /* Shift hashpipe */
            for (i = window_size - 1; i > 0; i--) {
                hashpipe[i] = hashpipe[i - 1];
            }
            hashpipe[0].h = cur;
            hashpipe[0].t = token;

            processed++;

            for (i = 1; i < window_size; i++) {
                if (!(hashpipe[i].t->flags & RSPAMD_STAT_TOKEN_FLAG_EXCEPTION)) {
                    ADD_TOKEN;
                }
            }
        }
    }

    if (processed > 1 && processed <= window_size) {
        processed--;
        memmove(hashpipe, &hashpipe[window_size - processed],
                processed * sizeof(hashpipe[0]));

        for (i = 1; i < processed; i++) {
            ADD_TOKEN;
        }
    }

#undef ADD_TOKEN

    return TRUE;
}
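The hashpipe logic above is the core of OSB ("orthogonal sparse bigrams"): the newest word hash sits in slot 0 and is paired with every older slot in the window, each pair weighted by a distance-specific prime so that adjacent and gapped co-occurrences hash to different features. Below is a minimal standalone sketch of that pairing, assuming a toy FNV-1a hash and made-up sample words; it is not the rspamd API, just the technique in isolation.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define WINDOW 5

/* Toy FNV-1a hash, standing in for the xxhash/siphash used by osb.c */
static uint64_t toy_hash(const char *s)
{
    uint64_t h = 1469598103934665603ULL;

    while (*s) {
        h ^= (unsigned char) *s++;
        h *= 1099511628211ULL;
    }

    return h;
}

int main(void)
{
    /* Same distance-dependent primes idea as the primes[] table above */
    static const uint64_t primes[] = {1, 7, 3, 13, 5, 29, 11, 51, 23, 101};
    const char *words[] = {"cheap", "pills", "online", "buy", "now", "fast"};
    uint64_t hashpipe[WINDOW];
    size_t n = sizeof(words) / sizeof(words[0]);

    for (size_t i = 0; i < WINDOW; i++) {
        hashpipe[i] = 0xfe; /* same sentinel value as the real hashpipe */
    }

    for (size_t w = 0; w < n; w++) {
        /* Shift the pipe and push the newest word hash into slot 0 */
        memmove(&hashpipe[1], &hashpipe[0], (WINDOW - 1) * sizeof(uint64_t));
        hashpipe[0] = toy_hash(words[w]);

        if (w + 1 < WINDOW) {
            continue; /* window not filled yet */
        }

        /* Pair slot 0 with each older slot; i doubles as the distance */
        for (size_t i = 1; i < WINDOW; i++) {
            uint64_t feature = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1];

            printf("w=%zu i=%zu feature=%016llx\n", w, i,
                   (unsigned long long) feature);
        }
    }

    return 0;
}

Each printed feature corresponds to one ADD_TOKEN invocation in the real code, which additionally records the token pointers (t1, t2) and supports a split 32-bit compatibility hash mode.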
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
new file mode 100644
index 0000000..ee7234d
--- /dev/null
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -0,0 +1,955 @@
/*
 * Copyright 2023 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Common tokenization functions
 */

#include "rspamd.h"
#include "tokenizers.h"
#include "stat_internal.h"
#include "contrib/mumhash/mum.h"
#include "libmime/lang_detection.h"
#include "libstemmer.h"

#include <unicode/utf8.h>
#include <unicode/uchar.h>
#include <unicode/uiter.h>
#include <unicode/ubrk.h>
#include <unicode/ucnv.h>
#if U_ICU_VERSION_MAJOR_NUM >= 44
#include <unicode/unorm2.h>
#endif

#include <math.h>

typedef gboolean (*token_get_function)(rspamd_stat_token_t *buf, gchar const **pos,
                                       rspamd_stat_token_t *token,
                                       GList **exceptions, gsize *rl, gboolean check_signature);

const gchar t_delimiters[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
    1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
    1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
    1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0};

/* Get next word from specified f_str_t buf */
static gboolean
rspamd_tokenizer_get_word_raw(rspamd_stat_token_t *buf,
                              gchar const **cur, rspamd_stat_token_t *token,
                              GList **exceptions, gsize *rl, gboolean unused)
{
    gsize remain, pos;
    const gchar *p;
    struct rspamd_process_exception *ex = NULL;

    if (buf == NULL) {
        return FALSE;
    }

    g_assert(cur != NULL);

    if (exceptions != NULL && *exceptions != NULL) {
        ex = (*exceptions)->data;
    }

    if (token->original.begin == NULL || *cur == NULL) {
        if (ex != NULL) {
            if (ex->pos == 0) {
                token->original.begin = buf->original.begin + ex->len;
                token->original.len = ex->len;
                token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
            }
            else {
                token->original.begin = buf->original.begin;
                token->original.len = 0;
            }
        }
        else {
            token->original.begin = buf->original.begin;
            token->original.len = 0;
        }
        *cur = token->original.begin;
    }

    token->original.len = 0;

    pos = *cur - buf->original.begin;
    if (pos >= buf->original.len) {
        return FALSE;
    }

    remain = buf->original.len - pos;
    p = *cur;

    /* Skip non delimiters symbols */
    do {
        if (ex != NULL && ex->pos == pos) {
            /* Go to the next exception */
            *exceptions = g_list_next(*exceptions);
            *cur = p + ex->len;
            return TRUE;
        }
        pos++;
        p++;
        remain--;
    } while (remain > 0 && t_delimiters[(guchar) *p]);

    token->original.begin = p;

    while (remain > 0 && !t_delimiters[(guchar) *p]) {
        if (ex != NULL && ex->pos == pos) {
            *exceptions = g_list_next(*exceptions);
            *cur = p + ex->len;
            return TRUE;
        }
        token->original.len++;
        pos++;
        remain--;
        p++;
    }

    if (remain == 0) {
        return FALSE;
    }

    if (rl) {
        *rl = token->original.len;
    }

    token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;

    *cur = p;

    return TRUE;
}

static inline gboolean
rspamd_tokenize_check_limit(gboolean decay,
                            guint word_decay,
                            guint nwords,
                            guint64 *hv,
                            guint64 *prob,
                            const rspamd_stat_token_t *token,
                            gssize remain,
                            gssize total)
{
    static const gdouble avg_word_len = 6.0;

    if (!decay) {
        if (token->original.len >= sizeof(guint64)) {
            guint64 tmp;
            memcpy(&tmp, token->original.begin, sizeof(tmp));
            *hv = mum_hash_step(*hv, tmp);
        }

        /* Check for decay */
        if (word_decay > 0 && nwords > word_decay && remain < (gssize) total) {
            /* Start decay */
            gdouble decay_prob;

            *hv = mum_hash_finish(*hv);

            /* We assume that word is 6 symbols length in average */
            decay_prob = (gdouble) word_decay / ((total - (remain)) / avg_word_len) * 10;
            decay_prob = floor(decay_prob) / 10.0;

            if (decay_prob >= 1.0) {
                *prob = G_MAXUINT64;
            }
            else {
                *prob = (guint64) (decay_prob * (double) G_MAXUINT64);
            }

            return TRUE;
        }
    }
    else {
        /* Decaying probability */
        /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
        *hv = (*hv) * 2862933555777941757ULL + 3037000493ULL;

        if (*hv > *prob) {
            return TRUE;
        }
    }

    return FALSE;
}
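Once words_decay is exceeded, rspamd_tokenize_check_limit computes a keep probability scaled to the full guint64 range; afterwards, the decay branch steps a 64-bit LCG per word, and a word survives only when the state does not exceed that threshold (a TRUE return means "skip this one"). A self-contained sketch of the same sampling scheme, using the LCG constants from the code above but an arbitrary keep probability and initial state:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    double keep_prob = 0.25; /* keep roughly a quarter of further words */
    uint64_t threshold = (uint64_t) (keep_prob * (double) UINT64_MAX);
    uint64_t hv = 0x0123456789abcdefULL; /* arbitrary running state */
    unsigned kept = 0, total = 100000;

    for (unsigned i = 0; i < total; i++) {
        /* LCG64: x[n] = a * x[n - 1] + b mod 2^64, constants as above */
        hv = hv * 2862933555777941757ULL + 3037000493ULL;

        if (hv <= threshold) {
            kept++; /* the real code skips the word when hv > prob */
        }
    }

    printf("kept %u of %u (~%.1f%%)\n", kept, total, 100.0 * kept / total);

    return 0;
}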
static inline gboolean
rspamd_utf_word_valid(const guchar *text, const guchar *end,
                      gint32 start, gint32 finish)
{
    const guchar *st = text + start, *fin = text + finish;
    UChar32 c;

    if (st >= end || fin > end || st >= fin) {
        return FALSE;
    }

    U8_NEXT(text, start, finish, c);

    if (u_isJavaIDPart(c)) {
        return TRUE;
    }

    return FALSE;
}
#define SHIFT_EX                                                    \
    do {                                                            \
        cur = g_list_next(cur);                                     \
        if (cur) {                                                  \
            ex = (struct rspamd_process_exception *) cur->data;     \
        }                                                           \
        else {                                                      \
            ex = NULL;                                              \
        }                                                           \
    } while (0)

static inline void
rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res)
{
    rspamd_stat_token_t token;

    memset(&token, 0, sizeof(token));

    if (ex->type == RSPAMD_EXCEPTION_GENERIC) {
        token.original.begin = "!!EX!!";
        token.original.len = sizeof("!!EX!!") - 1;
        token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;

        g_array_append_val(res, token);
        token.flags = 0;
    }
    else if (ex->type == RSPAMD_EXCEPTION_URL) {
        struct rspamd_url *uri;

        uri = ex->ptr;

        if (uri && uri->tldlen > 0) {
            token.original.begin = rspamd_url_tld_unsafe(uri);
            token.original.len = uri->tldlen;
        }
        else {
            token.original.begin = "!!EX!!";
            token.original.len = sizeof("!!EX!!") - 1;
        }

        token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
        g_array_append_val(res, token);
        token.flags = 0;
    }
}


GArray *
rspamd_tokenize_text(const gchar *text, gsize len,
                     const UText *utxt,
                     enum rspamd_tokenize_type how,
                     struct rspamd_config *cfg,
                     GList *exceptions,
                     guint64 *hash,
                     GArray *cur_words,
                     rspamd_mempool_t *pool)
{
    rspamd_stat_token_t token, buf;
    const gchar *pos = NULL;
    gsize l = 0;
    GArray *res;
    GList *cur = exceptions;
    guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
    guint64 hv = 0;
    gboolean decay = FALSE, long_text_mode = FALSE;
    guint64 prob = 0;
    static UBreakIterator *bi = NULL;
    static const gsize long_text_limit = 1 * 1024 * 1024;
    static const ev_tstamp max_exec_time = 0.2; /* 200 ms */
    ev_tstamp start;

    if (text == NULL) {
        return cur_words;
    }

    if (len > long_text_limit) {
        /*
         * In this mode we do additional checks to avoid performance issues
         */
        long_text_mode = TRUE;
        start = ev_time();
    }

    buf.original.begin = text;
    buf.original.len = len;
    buf.flags = 0;

    memset(&token, 0, sizeof(token));

    if (cfg != NULL) {
        min_len = cfg->min_word_len;
        max_len = cfg->max_word_len;
        word_decay = cfg->words_decay;
        initial_size = word_decay * 2;
    }

    if (!cur_words) {
        res = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t),
                                initial_size);
    }
    else {
        res = cur_words;
    }

    if (G_UNLIKELY(how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
        while (rspamd_tokenizer_get_word_raw(&buf, &pos, &token, &cur, &l, FALSE)) {
            if (l == 0 || (min_len > 0 && l < min_len) ||
                (max_len > 0 && l > max_len)) {
                token.original.begin = pos;
                continue;
            }

            if (token.original.len > 0 &&
                rspamd_tokenize_check_limit(decay, word_decay, res->len,
                                            &hv, &prob, &token, pos - text, len)) {
                if (!decay) {
                    decay = TRUE;
                }
                else {
                    token.original.begin = pos;
                    continue;
                }
            }

            if (long_text_mode) {
                if ((res->len + 1) % 16 == 0) {
                    ev_tstamp now = ev_time();

                    if (now - start > max_exec_time) {
                        msg_warn_pool_check(
                            "too long time has been spent on tokenization:"
                            " %.1f ms, limit is %.1f ms; %d words added so far",
                            (now - start) * 1e3, max_exec_time * 1e3,
                            res->len);

                        goto end;
                    }
                }
            }

            g_array_append_val(res, token);

            if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) {
                /* Due to bug in glib ! */
                msg_err_pool_check(
                    "too many words found: %d, stop tokenization to avoid DoS",
                    res->len);

                goto end;
            }

            token.original.begin = pos;
        }
    }
    else {
        /* UTF8 boundaries */
        UErrorCode uc_err = U_ZERO_ERROR;
        int32_t last, p;
        struct rspamd_process_exception *ex = NULL;

        if (bi == NULL) {
            bi = ubrk_open(UBRK_WORD, NULL, NULL, 0, &uc_err);

            g_assert(U_SUCCESS(uc_err));
        }

        ubrk_setUText(bi, (UText *) utxt, &uc_err);
        last = ubrk_first(bi);
        p = last;

        if (cur) {
            ex = (struct rspamd_process_exception *) cur->data;
        }

        while (p != UBRK_DONE) {
        start_over:
            token.original.len = 0;

            if (p > last) {
                if (ex && cur) {
                    /* Check exception */
                    if (ex->pos >= last && ex->pos <= p) {
                        /* We have an exception within boundary */
                        /* First, start to drain exceptions from the start */
                        while (cur && ex->pos <= last) {
                            /* We have an exception at the beginning, skip those */
                            last += ex->len;
                            rspamd_tokenize_exception(ex, res);

                            if (last > p) {
                                /* Exception spread over the boundaries */
                                while (last > p && p != UBRK_DONE) {
                                    gint32 old_p = p;
                                    p = ubrk_next(bi);

                                    if (p != UBRK_DONE && p <= old_p) {
                                        msg_warn_pool_check(
                                            "tokenization reversed back on position %d,"
                                            "%d new position (%d backward), likely libicu bug!",
                                            (gint) (p), (gint) (old_p), old_p - p);

                                        goto end;
                                    }
                                }

                                /* We need to reset our scan with new p and last */
                                SHIFT_EX;
                                goto start_over;
                            }

                            SHIFT_EX;
                        }

                        /* Now, we can have an exception within boundary again */
                        if (cur && ex->pos >= last && ex->pos <= p) {
                            /* Append the first part */
                            if (rspamd_utf_word_valid(text, text + len, last,
                                                      ex->pos)) {
                                token.original.begin = text + last;
                                token.original.len = ex->pos - last;
                                token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
                                              RSPAMD_STAT_TOKEN_FLAG_UTF;
                            }

                            /* Process the current exception */
                            last += ex->len + (ex->pos - last);

                            rspamd_tokenize_exception(ex, res);

                            if (last > p) {
                                /* Exception spread over the boundaries */
                                while (last > p && p != UBRK_DONE) {
                                    gint32 old_p = p;
                                    p = ubrk_next(bi);
                                    if (p != UBRK_DONE && p <= old_p) {
                                        msg_warn_pool_check(
                                            "tokenization reversed back on position %d,"
                                            "%d new position (%d backward), likely libicu bug!",
                                            (gint) (p), (gint) (old_p), old_p - p);

                                        goto end;
                                    }
                                }
                                /* We need to reset our scan with new p and last */
                                SHIFT_EX;
                                goto start_over;
                            }

                            SHIFT_EX;
                        }
                        else if (p > last) {
                            if (rspamd_utf_word_valid(text, text + len, last, p)) {
                                token.original.begin = text + last;
                                token.original.len = p - last;
                                token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
                                              RSPAMD_STAT_TOKEN_FLAG_UTF;
                            }
                        }
                    }
                    else if (ex->pos < last) {
                        /* Forward exceptions list */
                        while (cur && ex->pos <= last) {
                            /* We have an exception at the beginning, skip those */
                            SHIFT_EX;
                        }

                        if (rspamd_utf_word_valid(text, text + len, last, p)) {
                            token.original.begin = text + last;
                            token.original.len = p - last;
                            token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
                                          RSPAMD_STAT_TOKEN_FLAG_UTF;
                        }
                    }
                    else {
                        /* No exceptions within boundary */
                        if (rspamd_utf_word_valid(text, text + len, last, p)) {
                            token.original.begin = text + last;
                            token.original.len = p - last;
                            token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
                                          RSPAMD_STAT_TOKEN_FLAG_UTF;
                        }
                    }
                }
                else {
                    if (rspamd_utf_word_valid(text, text + len, last, p)) {
                        token.original.begin = text + last;
                        token.original.len = p - last;
                        token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
                                      RSPAMD_STAT_TOKEN_FLAG_UTF;
                    }
                }

                if (token.original.len > 0 &&
                    rspamd_tokenize_check_limit(decay, word_decay, res->len,
                                                &hv, &prob, &token, p, len)) {
                    if (!decay) {
                        decay = TRUE;
                    }
                    else {
                        token.flags |= RSPAMD_STAT_TOKEN_FLAG_SKIPPED;
                    }
                }
            }

            if (token.original.len > 0) {
                /* Additional check for number of words */
                if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) {
                    /* Due to bug in glib ! */
                    msg_err("too many words found: %d, stop tokenization to avoid DoS",
                            res->len);

                    goto end;
                }

                g_array_append_val(res, token);
            }

            /* Also check for long text mode */
            if (long_text_mode) {
                /* Check time each 128 words added */
                const int words_check_mask = 0x7F;

                if ((res->len & words_check_mask) == words_check_mask) {
                    ev_tstamp now = ev_time();

                    if (now - start > max_exec_time) {
                        msg_warn_pool_check(
                            "too long time has been spent on tokenization:"
                            " %.1f ms, limit is %.1f ms; %d words added so far",
                            (now - start) * 1e3, max_exec_time * 1e3,
                            res->len);

                        goto end;
                    }
                }
            }

            last = p;
            p = ubrk_next(bi);

            if (p != UBRK_DONE && p <= last) {
                msg_warn_pool_check("tokenization reversed back on position %d,"
                                    "%d new position (%d backward), likely libicu bug!",
                                    (gint) (p), (gint) (last), last - p);

                goto end;
            }
        }
    }

end:
    if (!decay) {
        hv = mum_hash_finish(hv);
    }

    if (hash) {
        *hash = hv;
    }

    return res;
}

#undef SHIFT_EX
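The UTF branch of rspamd_tokenize_text above drives ICU's UBreakIterator over a UText and treats each pair of consecutive boundaries as a candidate word. A hedged sketch of the same loop against the plain UChar API follows (self-contained, link with icu-uc; the sample text is pure ASCII, so the final UChar-to-char copy is a deliberate simplification):

/* build: cc example.c $(pkg-config --cflags --libs icu-uc) */
#include <stdio.h>
#include <unicode/ubrk.h>
#include <unicode/ustring.h>

int main(void)
{
    UErrorCode uc_err = U_ZERO_ERROR;
    const char *text = "Mail tokenization, word by word";
    UChar utext[128];
    int32_t ulen = 0;

    u_strFromUTF8(utext, 128, &ulen, text, -1, &uc_err);

    UBreakIterator *bi = ubrk_open(UBRK_WORD, "en", utext, ulen, &uc_err);

    if (U_FAILURE(uc_err)) {
        return 1;
    }

    /* Each [last, p) span between boundaries is a candidate token */
    int32_t last = ubrk_first(bi);

    for (int32_t p = ubrk_next(bi); p != UBRK_DONE; last = p, p = ubrk_next(bi)) {
        if (ubrk_getRuleStatus(bi) == UBRK_WORD_NONE) {
            continue; /* spaces and punctuation between words */
        }

        char out[64];
        u_austrncpy(out, utext + last, p - last);
        out[p - last] = '\0'; /* safe only because the sample is ASCII */
        printf("token: %s\n", out);
    }

    ubrk_close(bi);

    return 0;
}

The production code instead keeps a single cached iterator, retargets it per call with ubrk_setUText, and layers the exceptions list, decay sampling, and time limits on top of this loop.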
static void
rspamd_add_metawords_from_str(const gchar *beg, gsize len,
                              struct rspamd_task *task)
{
    UText utxt = UTEXT_INITIALIZER;
    UErrorCode uc_err = U_ZERO_ERROR;
    guint i = 0;
    UChar32 uc;
    gboolean valid_utf = TRUE;

    while (i < len) {
        U8_NEXT(beg, i, len, uc);

        if (((gint32) uc) < 0) {
            valid_utf = FALSE;
            break;
        }

#if U_ICU_VERSION_MAJOR_NUM < 50
        if (u_isalpha(uc)) {
            gint32 sc = ublock_getCode(uc);

            if (sc == UBLOCK_THAI) {
                valid_utf = FALSE;
                msg_info_task("enable workaround for Thai characters for old libicu");
                break;
            }
        }
#endif
    }

    if (valid_utf) {
        utext_openUTF8(&utxt,
                       beg,
                       len,
                       &uc_err);

        task->meta_words = rspamd_tokenize_text(beg, len,
                                                &utxt, RSPAMD_TOKENIZE_UTF,
                                                task->cfg, NULL, NULL,
                                                task->meta_words,
                                                task->task_pool);

        utext_close(&utxt);
    }
    else {
        task->meta_words = rspamd_tokenize_text(beg, len,
                                                NULL, RSPAMD_TOKENIZE_RAW,
                                                task->cfg, NULL, NULL, task->meta_words,
                                                task->task_pool);
    }
}

void rspamd_tokenize_meta_words(struct rspamd_task *task)
{
    guint i = 0;
    rspamd_stat_token_t *tok;

    if (MESSAGE_FIELD(task, subject)) {
        rspamd_add_metawords_from_str(MESSAGE_FIELD(task, subject),
                                      strlen(MESSAGE_FIELD(task, subject)), task);
    }

    if (MESSAGE_FIELD(task, from_mime) && MESSAGE_FIELD(task, from_mime)->len > 0) {
        struct rspamd_email_address *addr;

        addr = g_ptr_array_index(MESSAGE_FIELD(task, from_mime), 0);

        if (addr->name) {
            rspamd_add_metawords_from_str(addr->name, strlen(addr->name), task);
        }
    }

    if (task->meta_words != NULL) {
        const gchar *language = NULL;

        if (MESSAGE_FIELD(task, text_parts) &&
            MESSAGE_FIELD(task, text_parts)->len > 0) {
            struct rspamd_mime_text_part *tp = g_ptr_array_index(
                MESSAGE_FIELD(task, text_parts), 0);

            if (tp->language) {
                language = tp->language;
            }
        }

        rspamd_normalize_words(task->meta_words, task->task_pool);
        rspamd_stem_words(task->meta_words, task->task_pool, language,
                          task->lang_det);

        for (i = 0; i < task->meta_words->len; i++) {
            tok = &g_array_index(task->meta_words, rspamd_stat_token_t, i);
            tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER;
        }
    }
}

static inline void
rspamd_uchars_to_ucs32(const UChar *src, gsize srclen,
                       rspamd_stat_token_t *tok,
                       rspamd_mempool_t *pool)
{
    UChar32 *dest, t, *d;
    gint32 i = 0;

    dest = rspamd_mempool_alloc(pool, srclen * sizeof(UChar32));
    d = dest;

    while (i < srclen) {
        U16_NEXT_UNSAFE(src, i, t);

        if (u_isgraph(t)) {
            UCharCategory cat;

            cat = u_charType(t);
#if U_ICU_VERSION_MAJOR_NUM >= 57
            if (u_hasBinaryProperty(t, UCHAR_EMOJI)) {
                tok->flags |= RSPAMD_STAT_TOKEN_FLAG_EMOJI;
            }
#endif

            if ((cat >= U_UPPERCASE_LETTER && cat <= U_OTHER_NUMBER) ||
                cat == U_CONNECTOR_PUNCTUATION ||
                cat == U_MATH_SYMBOL ||
                cat == U_CURRENCY_SYMBOL) {
                *d++ = u_tolower(t);
            }
        }
        else {
            /* Invisible spaces ! */
            tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
        }
    }

    tok->unicode.begin = dest;
    tok->unicode.len = d - dest;
}

static inline void
rspamd_ucs32_to_normalised(rspamd_stat_token_t *tok,
                           rspamd_mempool_t *pool)
{
    guint i, doff = 0;
    gsize utflen = 0;
    gchar *dest;
    UChar32 t;

    for (i = 0; i < tok->unicode.len; i++) {
        utflen += U8_LENGTH(tok->unicode.begin[i]);
    }

    dest = rspamd_mempool_alloc(pool, utflen + 1);

    for (i = 0; i < tok->unicode.len; i++) {
        t = tok->unicode.begin[i];
        U8_APPEND_UNSAFE(dest, doff, t);
    }

    g_assert(doff <= utflen);
    dest[doff] = '\0';

    tok->normalized.len = doff;
    tok->normalized.begin = dest;
}

void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool)
{
    UErrorCode uc_err = U_ZERO_ERROR;
    UConverter *utf8_converter;
    UChar tmpbuf[1024]; /* Assume that we have no longer words... */
    gsize ulen;

    utf8_converter = rspamd_get_utf8_converter();

    if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
        ulen = ucnv_toUChars(utf8_converter,
                             tmpbuf,
                             G_N_ELEMENTS(tmpbuf),
                             tok->original.begin,
                             tok->original.len,
                             &uc_err);

        /* Now, we need to understand if we need to normalise the word */
        if (!U_SUCCESS(uc_err)) {
            tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
            tok->unicode.begin = NULL;
            tok->unicode.len = 0;
            tok->normalized.begin = NULL;
            tok->normalized.len = 0;
        }
        else {
#if U_ICU_VERSION_MAJOR_NUM >= 44
            const UNormalizer2 *norm = rspamd_get_unicode_normalizer();
            gint32 end;

            /* We can now check if we need to decompose */
            end = unorm2_spanQuickCheckYes(norm, tmpbuf, ulen, &uc_err);

            if (!U_SUCCESS(uc_err)) {
                rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool);
                tok->normalized.begin = NULL;
                tok->normalized.len = 0;
                tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
            }
            else {
                if (end == ulen) {
                    /* Already normalised, just lowercase */
                    rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool);
                    rspamd_ucs32_to_normalised(tok, pool);
                }
                else {
                    /* Perform normalization */
                    UChar normbuf[1024];

                    g_assert(end < G_N_ELEMENTS(normbuf));
                    /* First part */
                    memcpy(normbuf, tmpbuf, end * sizeof(UChar));
                    /* Second part */
                    ulen = unorm2_normalizeSecondAndAppend(norm,
                                                           normbuf, end,
                                                           G_N_ELEMENTS(normbuf),
                                                           tmpbuf + end,
                                                           ulen - end,
                                                           &uc_err);

                    if (!U_SUCCESS(uc_err)) {
                        if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
                            msg_warn_pool_check("cannot normalise text '%*s': %s",
                                                (gint) tok->original.len, tok->original.begin,
                                                u_errorName(uc_err));
                            rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool);
                            rspamd_ucs32_to_normalised(tok, pool);
                            tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
                        }
                    }
                    else {
                        /* Copy normalised back */
                        rspamd_uchars_to_ucs32(normbuf, ulen, tok, pool);
                        tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED;
                        rspamd_ucs32_to_normalised(tok, pool);
                    }
                }
            }
#else
            /* Legacy version with no unorm2 interface */
            rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool);
            rspamd_ucs32_to_normalised(tok, pool);
#endif
        }
    }
    else {
        if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
            /* Simple lowercase */
            gchar *dest;

            dest = rspamd_mempool_alloc(pool, tok->original.len + 1);
            rspamd_strlcpy(dest, tok->original.begin, tok->original.len + 1);
            rspamd_str_lc(dest, tok->original.len);
            tok->normalized.len = tok->original.len;
            tok->normalized.begin = dest;
        }
    }
}

void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool)
{
    rspamd_stat_token_t *tok;
    guint i;

    for (i = 0; i < words->len; i++) {
        tok = &g_array_index(words, rspamd_stat_token_t, i);
        rspamd_normalize_single_word(tok, pool);
    }
}

void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
                       const gchar *language,
                       struct rspamd_lang_detector *lang_detector)
{
    static GHashTable *stemmers = NULL;
    struct sb_stemmer *stem = NULL;
    guint i;
    rspamd_stat_token_t *tok;
    gchar *dest;
    gsize dlen;

    if (!stemmers) {
        stemmers = g_hash_table_new(rspamd_strcase_hash,
                                    rspamd_strcase_equal);
    }

    if (language && language[0] != '\0') {
        stem = g_hash_table_lookup(stemmers, language);

        if (stem == NULL) {

            stem = sb_stemmer_new(language, "UTF_8");

            if (stem == NULL) {
                msg_debug_pool(
                    "cannot create lemmatizer for %s language",
                    language);
                g_hash_table_insert(stemmers, g_strdup(language),
                                    GINT_TO_POINTER(-1));
            }
            else {
                g_hash_table_insert(stemmers, g_strdup(language),
                                    stem);
            }
        }
        else if (stem == GINT_TO_POINTER(-1)) {
            /* Negative cache */
            stem = NULL;
        }
    }
    for (i = 0; i < words->len; i++) {
        tok = &g_array_index(words, rspamd_stat_token_t, i);

        if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
            if (stem) {
                const gchar *stemmed = NULL;

                stemmed = sb_stemmer_stem(stem,
                                          tok->normalized.begin, tok->normalized.len);

                dlen = sb_stemmer_length(stem);

                if (stemmed != NULL && dlen > 0) {
                    dest = rspamd_mempool_alloc(pool, dlen);
                    memcpy(dest, stemmed, dlen);
                    tok->stemmed.len = dlen;
                    tok->stemmed.begin = dest;
                    tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED;
                }
                else {
                    /* Fallback */
                    tok->stemmed.len = tok->normalized.len;
                    tok->stemmed.begin = tok->normalized.begin;
                }
            }
            else {
                tok->stemmed.len = tok->normalized.len;
                tok->stemmed.begin = tok->normalized.begin;
            }

            if (tok->stemmed.len > 0 && lang_detector != NULL &&
                rspamd_language_detector_is_stop_word(lang_detector, tok->stemmed.begin, tok->stemmed.len)) {
                tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD;
            }
        }
        else {
            if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
                /* Raw text, lowercase */
                tok->stemmed.len = tok->normalized.len;
                tok->stemmed.begin = tok->normalized.begin;
            }
        }
    }
}
\ No newline at end of file
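rspamd_stem_words above caches one Snowball stemmer per language (with a negative-cache entry for languages libstemmer does not know) and falls back to the normalised form whenever stemming fails. Stripped of the cache and the mempool, the underlying libstemmer calls look like this (a minimal sketch; the sample language and words are arbitrary):

#include <stdio.h>
#include <string.h>
#include "libstemmer.h"

int main(void)
{
    struct sb_stemmer *stem = sb_stemmer_new("english", "UTF_8");
    const char *words[] = {"tokenizers", "normalised", "stemming"};

    if (stem == NULL) {
        return 1; /* unknown language or algorithm */
    }

    for (size_t i = 0; i < sizeof(words) / sizeof(words[0]); i++) {
        const sb_symbol *out = sb_stemmer_stem(stem,
                                               (const sb_symbol *) words[i],
                                               (int) strlen(words[i]));

        /* sb_stemmer_length() reports the stem length, as in rspamd_stem_words */
        printf("%s -> %.*s\n", words[i],
               sb_stemmer_length(stem), (const char *) out);
    }

    sb_stemmer_delete(stem);

    return 0;
}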
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
new file mode 100644
index 0000000..d696364
--- /dev/null
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -0,0 +1,100 @@
/*
 * Copyright 2023 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef TOKENIZERS_H
#define TOKENIZERS_H

#include "config.h"
#include "mem_pool.h"
#include "fstring.h"
#include "rspamd.h"
#include "stat_api.h"

#include <unicode/utext.h>

#define RSPAMD_DEFAULT_TOKENIZER "osb"

#ifdef __cplusplus
extern "C" {
#endif

struct rspamd_tokenizer_runtime;
struct rspamd_stat_ctx;

/* Common tokenizer structure */
struct rspamd_stat_tokenizer {
    gchar *name;

    gpointer (*get_config)(rspamd_mempool_t *pool,
                           struct rspamd_tokenizer_config *cf, gsize *len);

    gint (*tokenize_func)(struct rspamd_stat_ctx *ctx,
                          struct rspamd_task *task,
                          GArray *words,
                          gboolean is_utf,
                          const gchar *prefix,
                          GPtrArray *result);
};

enum rspamd_tokenize_type {
    RSPAMD_TOKENIZE_UTF = 0,
    RSPAMD_TOKENIZE_RAW,
    RSPAMD_TOKENIZE_UNICODE
};

/* Compare two token nodes */
gint token_node_compare_func(gconstpointer a, gconstpointer b);


/* Tokenize text into array of words (rspamd_stat_token_t type) */
GArray *rspamd_tokenize_text(const gchar *text, gsize len,
                             const UText *utxt,
                             enum rspamd_tokenize_type how,
                             struct rspamd_config *cfg,
                             GList *exceptions,
                             guint64 *hash,
                             GArray *cur_words,
                             rspamd_mempool_t *pool);

/* OSB tokenize function */
gint rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
                          struct rspamd_task *task,
                          GArray *words,
                          gboolean is_utf,
                          const gchar *prefix,
                          GPtrArray *result);

gpointer rspamd_tokenizer_osb_get_config(rspamd_mempool_t *pool,
                                         struct rspamd_tokenizer_config *cf,
                                         gsize *len);

struct rspamd_lang_detector;

void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool);

void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool);

void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
                       const gchar *language,
                       struct rspamd_lang_detector *lang_detector);

void rspamd_tokenize_meta_words(struct rspamd_task *task);

#ifdef __cplusplus
}
#endif

#endif
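For completeness, a hedged sketch of how a caller inside rspamd might drive the raw tokenizer declared in this header (tokenize_example is a hypothetical helper; it assumes an existing mempool and passes NULL for the config and exceptions, which the implementation above permits):

#include "tokenizers.h"

static void tokenize_example(rspamd_mempool_t *pool)
{
    const gchar *text = "Simple example of raw tokenization";
    guint64 hash = 0;

    /* utxt == NULL forces the byte-level delimiter path */
    GArray *words = rspamd_tokenize_text(text, strlen(text),
                                         NULL, RSPAMD_TOKENIZE_RAW,
                                         NULL /* cfg */, NULL /* exceptions */,
                                         &hash, NULL /* cur_words */, pool);

    for (guint i = 0; i < words->len; i++) {
        rspamd_stat_token_t *tok = &g_array_index(words, rspamd_stat_token_t, i);

        /* tok->original is a non-owning view into `text` */
        g_message("token %u: %.*s", i, (gint) tok->original.len,
                  tok->original.begin);
    }

    g_array_free(words, TRUE);
}

Normalisation and stemming are separate passes: a real caller would follow up with rspamd_normalize_words() and rspamd_stem_words() before handing the array to rspamd_tokenizer_osb(), mirroring what rspamd_tokenize_meta_words does above.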