Adding upstream version 3.8.1. (refs: upstream/3.8.1, upstream)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
commit: 133a45c109da5310add55824db21af5239951f93 (patch)
tree: ba6ac4c0a950a0dda56451944315d66409923918 /src/libstat/tokenizers/osb.c
parent: Initial commit. (diff)
download: rspamd-upstream.tar.xz
rspamd-upstream.zip
1 files changed, 424 insertions, 0 deletions
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
new file mode 100644
index 0000000..d871c7a
--- /dev/null
+++ b/src/libstat/tokenizers/osb.c
@@ -0,0 +1,424 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * OSB tokenizer
+ */
+
+
+#include "tokenizers.h"
+#include "stat_internal.h"
+#include "libmime/lang_detection.h"
+
+/*
+ * Size for features pipe: the window size stored in the default config and
+ * the value a rejected "window" option falls back to.
+ */
+#define DEFAULT_FEATURE_WINDOW_SIZE 5
+/* Version number stored in the default tokenizer config */
+#define DEFAULT_OSB_VERSION 2
+
+/*
+ * Multipliers used by ADD_TOKEN when combining the newest word hash with the
+ * hash found at window position i: entries 0 and 1 scale the newest hash,
+ * entries (i << 1) and (i << 1) - 1 scale the older one.
+ * Despite the name, not every entry is a prime number (e.g. 1 and 51).
+ * NOTE(review): the table holds 20 entries, so index (i << 1) is in range
+ * only for i <= 9, i.e. for window sizes up to 10.
+ */
+static const int primes[] = {
+	1,
+	7,
+	3,
+	13,
+	5,
+	29,
+	11,
+	51,
+	23,
+	101,
+	47,
+	203,
+	97,
+	407,
+	197,
+	817,
+	397,
+	1637,
+	797,
+	3277,
+};
+
+/*
+ * Eight-byte signature ("osbtokv2") copied into the `magic` field of the
+ * default config; its presence there also marks the default as initialised.
+ */
+static const guchar osb_tokenizer_magic[] = {'o', 's', 'b', 't', 'o', 'k', 'v', '2'};
+
+/* Word hashing scheme selected by the tokenizer config (`ht` field) */
+enum rspamd_osb_hash_type {
+	/* Words hashed with rspamd_fstrhash_lc; pairs combined as two 32-bit halves */
+	RSPAMD_OSB_HASH_COMPAT = 0,
+	/* Words hashed with XXHASH64 using the configured `seed` */
+	RSPAMD_OSB_HASH_XXHASH,
+	/* Words hashed with rspamd_cryptobox_siphash using the key `sk` */
+	RSPAMD_OSB_HASH_SIPHASH
+};
+
+/*
+ * OSB tokenizer configuration.
+ * Instances are copied with memcpy() and handed out as an opaque blob of
+ * sizeof(struct) bytes by rspamd_tokenizer_osb_get_config(), so the field
+ * order and types are part of the stored format.
+ */
+struct rspamd_osb_tokenizer_config {
+	guchar magic[8];              /* signature, see osb_tokenizer_magic */
+	gshort version;               /* DEFAULT_OSB_VERSION in the default config */
+	gshort window_size;           /* number of slots in the token pipe */
+	enum rspamd_osb_hash_type ht; /* word hashing scheme */
+	guint64 seed;                 /* seed for the XXHASH64 word and prefix hashes */
+	rspamd_sipkey_t sk;           /* siphash key; zeroed by rspamd_tokenizer_osb_get_config() */
+};
+
+/*
+ * Return the shared default configuration.
+ *
+ * The returned pointer refers to function-local static storage which is
+ * filled in on first use; a matching magic signature marks it as already
+ * set up, so later calls return it unchanged.
+ */
+static struct rspamd_osb_tokenizer_config *
+rspamd_tokenizer_osb_default_config(void)
+{
+	static struct rspamd_osb_tokenizer_config defaults;
+	const gsize magic_len = sizeof(osb_tokenizer_magic);
+
+	if (memcmp(defaults.magic, osb_tokenizer_magic, magic_len) == 0) {
+		/* Initialised by an earlier call */
+		return &defaults;
+	}
+
+	memset(&defaults, 0, sizeof(defaults));
+	defaults.seed = 0xdeadbabe;
+	defaults.ht = RSPAMD_OSB_HASH_XXHASH;
+	defaults.window_size = DEFAULT_FEATURE_WINDOW_SIZE;
+	defaults.version = DEFAULT_OSB_VERSION;
+	/* The signature goes in last: it doubles as the "initialised" marker */
+	memcpy(defaults.magic, osb_tokenizer_magic, magic_len);
+
+	return &defaults;
+}
+
+/*
+ * Build a tokenizer configuration from the UCL options object `obj`.
+ *
+ * The result starts as a copy of the default config and is then overridden by
+ * the recognised keys:
+ *   "hash"   - string; a value starting with "xxh" selects XXHASH (with an
+ *              optional integer "seed"), one starting with "sip" selects
+ *              SIPHASH (with a base32 encoded "key")
+ *   "compat" - boolean, consulted only when "hash" is absent or not a string
+ *   "window" - integer window size; values outside the supported range are
+ *              rejected and DEFAULT_FEATURE_WINDOW_SIZE is used instead
+ *
+ * Storage comes from `pool` when it is non-NULL, otherwise from g_malloc0().
+ * Returns the newly allocated config.
+ */
+static struct rspamd_osb_tokenizer_config *
+rspamd_tokenizer_osb_config_from_ucl(rspamd_mempool_t *pool,
+									 const ucl_object_t *obj)
+{
+	const ucl_object_t *elt;
+	struct rspamd_osb_tokenizer_config *cf, *def;
+	guchar *key = NULL;
+	gsize keylen = 0;
+	/*
+	 * The tokenizer indexes primes[i << 1] for every i < window_size, so the
+	 * largest usable window is half of the table length.
+	 */
+	const gint64 max_window = (gint64) (sizeof(primes) / sizeof(primes[0])) / 2;
+
+
+	if (pool != NULL) {
+		cf = rspamd_mempool_alloc0(pool, sizeof(*cf));
+	}
+	else {
+		cf = g_malloc0(sizeof(*cf));
+	}
+
+	/* Use default config */
+	def = rspamd_tokenizer_osb_default_config();
+	memcpy(cf, def, sizeof(*cf));
+
+	elt = ucl_object_lookup(obj, "hash");
+	if (elt != NULL && ucl_object_type(elt) == UCL_STRING) {
+		if (g_ascii_strncasecmp(ucl_object_tostring(elt), "xxh", 3) == 0) {
+			cf->ht = RSPAMD_OSB_HASH_XXHASH;
+			elt = ucl_object_lookup(obj, "seed");
+			if (elt != NULL && ucl_object_type(elt) == UCL_INT) {
+				cf->seed = ucl_object_toint(elt);
+			}
+		}
+		else if (g_ascii_strncasecmp(ucl_object_tostring(elt), "sip", 3) == 0) {
+			cf->ht = RSPAMD_OSB_HASH_SIPHASH;
+			elt = ucl_object_lookup(obj, "key");
+
+			if (elt != NULL && ucl_object_type(elt) == UCL_STRING) {
+				const gchar *key_str = ucl_object_tostring(elt);
+
+				/*
+				 * Hand the decoder the real length of the encoded key so
+				 * the whole string is decoded (0 used to be passed here).
+				 */
+				key = rspamd_decode_base32(key_str,
+										   key_str != NULL ? strlen(key_str) : 0,
+										   &keylen, RSPAMD_BASE32_DEFAULT);
+
+				if (key != NULL && keylen >= sizeof(rspamd_sipkey_t)) {
+					memcpy(cf->sk, key, sizeof(cf->sk));
+				}
+				else {
+					/* Undecodable or short key: sk keeps the default value */
+					msg_warn("siphash key is too short: %z", keylen);
+				}
+
+				g_free(key);
+			}
+			else {
+				msg_warn_pool("siphash cannot be used without key");
+			}
+		}
+	}
+	else {
+		elt = ucl_object_lookup(obj, "compat");
+		if (elt != NULL && ucl_object_toboolean(elt)) {
+			cf->ht = RSPAMD_OSB_HASH_COMPAT;
+		}
+	}
+
+	elt = ucl_object_lookup(obj, "window");
+	if (elt != NULL && ucl_object_type(elt) == UCL_INT) {
+		/* Validate the full-width value before narrowing it to gshort */
+		gint64 window = ucl_object_toint(elt);
+
+		if (window < 1) {
+			msg_err_pool("too small window size: %d", (gint) window);
+			cf->window_size = DEFAULT_FEATURE_WINDOW_SIZE;
+		}
+		else if (window > max_window) {
+			msg_err_pool("too large window size: %d", (gint) window);
+			cf->window_size = DEFAULT_FEATURE_WINDOW_SIZE;
+		}
+		else {
+			cf->window_size = (gshort) window;
+		}
+	}
+
+	return cf;
+}
+
+/*
+ * Produce the tokenizer configuration blob.
+ *
+ * When `cf` carries UCL options the config is parsed from them, otherwise a
+ * copy of the default config is returned. In both cases the memory comes from
+ * `pool` when it is non-NULL and from the GLib allocator otherwise.
+ *
+ * The siphash key is always zeroed in the returned config, so it is never
+ * part of the blob. If `len` is non-NULL it receives the blob size in bytes.
+ */
+gpointer
+rspamd_tokenizer_osb_get_config(rspamd_mempool_t *pool,
+								struct rspamd_tokenizer_config *cf,
+								gsize *len)
+{
+	struct rspamd_osb_tokenizer_config *osb_cf, *def;
+
+	if (cf != NULL && cf->opts != NULL) {
+		osb_cf = rspamd_tokenizer_osb_config_from_ucl(pool, cf->opts);
+	}
+	else {
+		def = rspamd_tokenizer_osb_default_config();
+
+		/* Same NULL-pool fallback as rspamd_tokenizer_osb_config_from_ucl() */
+		if (pool != NULL) {
+			osb_cf = rspamd_mempool_alloc(pool, sizeof(*osb_cf));
+		}
+		else {
+			osb_cf = g_malloc0(sizeof(*osb_cf));
+		}
+
+		memcpy(osb_cf, def, sizeof(*osb_cf));
+	}
+
+	if (osb_cf->ht == RSPAMD_OSB_HASH_SIPHASH) {
+		msg_info_pool("siphash key is not stored into statfiles, so you'd "
+					  "need to keep it inside the configuration");
+	}
+
+	/* Do not write sipkey to statfile */
+	memset(osb_cf->sk, 0, sizeof(osb_cf->sk));
+
+	if (len != NULL) {
+		*len = sizeof(*osb_cf);
+	}
+
+	return osb_cf;
+}
+
+#if 0
+/*
+ * NOTE: this function sits inside an `#if 0` region and is not compiled.
+ *
+ * Check whether a stored config blob (`ptr`, `len`) is compatible with the
+ * runtime config `rt->config`:
+ *  - blob of the expected size without the magic signature: compatible only
+ *    when the runtime uses the COMPAT hash;
+ *  - blob of the expected size with the signature and the current version:
+ *    compatible when all bytes except the trailing siphash key match;
+ *  - blob of any other size: compatible only with the COMPAT hash.
+ */
+gboolean
+rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
+			gpointer ptr, gsize len)
+{
+	struct rspamd_osb_tokenizer_config *osb_cf, *test_cf;
+	gboolean ret = FALSE;
+
+	test_cf = rt->config;
+	g_assert (test_cf != NULL);
+
+	if (len == sizeof (*osb_cf)) {
+		osb_cf = ptr;
+
+		/* The blob starts with the magic field, so compare its first bytes */
+		if (memcmp (osb_cf, osb_tokenizer_magic, sizeof (osb_tokenizer_magic)) != 0) {
+			ret = test_cf->ht == RSPAMD_OSB_HASH_COMPAT;
+		}
+		else {
+			if (osb_cf->version == DEFAULT_OSB_VERSION) {
+				/* We can compare them directly now */
+				/* The length excludes sizeof(sk): the key is the last field */
+				ret = (memcmp (osb_cf, test_cf, sizeof (*osb_cf)
+						- sizeof (osb_cf->sk))) == 0;
+			}
+		}
+	}
+	else {
+		/* We are compatible now merely with fallback config */
+		if (test_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
+			ret = TRUE;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * NOTE: this function sits inside an `#if 0` region and is not compiled.
+ *
+ * Attach a tokenizer config to the runtime `rt`.
+ * With no stored blob (`ptr` NULL or `len` 0) the config is parsed from
+ * `rt->tkcf->opts` and accepted only if it selects the COMPAT hash; otherwise
+ * an error is logged and FALSE is returned. With a stored blob, `ptr` is used
+ * directly and `len` is asserted to equal the config struct size.
+ * On success sets `rt->config` and `rt->conf_len` and returns TRUE.
+ */
+gboolean
+rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool,
+		struct rspamd_tokenizer_runtime *rt,
+		gpointer ptr, gsize len)
+{
+	struct rspamd_osb_tokenizer_config *osb_cf;
+
+	if (ptr == NULL || len == 0) {
+		osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, rt->tkcf->opts);
+
+		if (osb_cf->ht != RSPAMD_OSB_HASH_COMPAT) {
+			/* Trying to load incompatible configuration */
+			msg_err_pool ("cannot load tokenizer configuration from a legacy "
+					"statfile; maybe you have forgotten to set 'compat' option"
+					" in the tokenizer configuration");
+
+			return FALSE;
+		}
+	}
+	else {
+		g_assert (len == sizeof (*osb_cf));
+		osb_cf = ptr;
+	}
+
+	rt->config = osb_cf;
+	rt->conf_len = sizeof (*osb_cf);
+
+	return TRUE;
+}
+
+/*
+ * NOTE: this function sits inside an `#if 0` region and is not compiled.
+ *
+ * Return TRUE when the runtime config `rt->config` selects the COMPAT hash.
+ */
+gboolean
+rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt)
+{
+	struct rspamd_osb_tokenizer_config *osb_cf = rt->config;
+
+	return (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT);
+}
+#endif
+
+/* One slot of the tokenizer's sliding window (the "hashpipe") */
+struct token_pipe_entry {
+	guint64 h;              /* hash of the word held in this slot */
+	rspamd_stat_token_t *t; /* the word itself; NULL until the slot is filled */
+};
+
+/*
+ * OSB tokenizer: turn a sequence of words into statistical tokens.
+ *
+ * Every usable word is hashed according to the configured hash type. Words
+ * flagged as unigrams produce one token each. All other words are pushed into
+ * a sliding window ("hashpipe") of `window_size` slots; once the window is
+ * full, each new word is paired with every older word in the window and one
+ * token is emitted per pair.
+ *
+ * ctx    - statistics context; `ctx->tkcf` is the tokenizer config and
+ *          `ctx->statfiles->len` sizes the per-token values array
+ * task   - tokens are allocated from `task->task_pool`
+ * words  - GArray of rspamd_stat_token_t; NULL makes the function return FALSE
+ * is_utf - forwarded to rspamd_fstrhash_lc() in COMPAT mode
+ * prefix - optional string hashed into a seed
+ *          NOTE(review): that seed is only applied (XORed) in SIPHASH mode;
+ *          XXHASH and COMPAT hashes ignore the prefix - confirm intended
+ * result - GPtrArray receiving the produced rspamd_token_t pointers
+ *
+ * Returns TRUE unless `words` is NULL.
+ */
+gint rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
+						  struct rspamd_task *task,
+						  GArray *words,
+						  gboolean is_utf,
+						  const gchar *prefix,
+						  GPtrArray *result)
+{
+	rspamd_token_t *new_tok = NULL;
+	rspamd_stat_token_t *token;
+	struct rspamd_osb_tokenizer_config *osb_cf;
+	guint64 cur, seed;
+	struct token_pipe_entry *hashpipe;
+	guint32 h1, h2;
+	gsize token_size;
+	guint processed = 0, i, w, window_size, token_flags = 0;
+	/* ADD_TOKEN reads primes[i << 1] for every i < window_size */
+	const guint max_window = (guint) (sizeof(primes) / sizeof(primes[0])) / 2;
+
+	if (words == NULL) {
+		return FALSE;
+	}
+
+	osb_cf = ctx->tkcf;
+
+	/*
+	 * The window size drives a stack allocation, the shift loop and the index
+	 * into primes[], so an out-of-range value (zero, negative or too large
+	 * for the table) must not be used as is: fall back to the default.
+	 */
+	if (osb_cf->window_size < 1 || (guint) osb_cf->window_size > max_window) {
+		window_size = DEFAULT_FEATURE_WINDOW_SIZE;
+	}
+	else {
+		window_size = osb_cf->window_size;
+	}
+
+	if (prefix) {
+		seed = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64,
+												   prefix, strlen(prefix), osb_cf->seed);
+	}
+	else {
+		seed = osb_cf->seed;
+	}
+
+	/* The pipe lives on the stack; every slot starts empty */
+	hashpipe = g_alloca(window_size * sizeof(hashpipe[0]));
+	for (i = 0; i < window_size; i++) {
+		hashpipe[i].h = 0xfe;
+		hashpipe[i].t = NULL;
+	}
+
+	/* Each token is followed by one gdouble per statfile */
+	token_size = sizeof(rspamd_token_t) +
+				 sizeof(gdouble) * ctx->statfiles->len;
+	g_assert(token_size > 0);
+
+	for (w = 0; w < words->len; w++) {
+		token = &g_array_index(words, rspamd_stat_token_t, w);
+		token_flags = token->flags;
+		const gchar *begin;
+		gsize len;
+
+		if (token->flags &
+			(RSPAMD_STAT_TOKEN_FLAG_STOP_WORD | RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) {
+			/* Skip stop/skipped words */
+			continue;
+		}
+
+		/* Text words are hashed in their stemmed form, others as they are */
+		if (token->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+			begin = token->stemmed.begin;
+			len = token->stemmed.len;
+		}
+		else {
+			begin = token->original.begin;
+			len = token->original.len;
+		}
+
+		if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
+			rspamd_ftok_t ftok;
+
+			ftok.begin = begin;
+			ftok.len = len;
+			cur = rspamd_fstrhash_lc(&ftok, is_utf);
+		}
+		else {
+			/* We know that the words are normalized */
+			if (osb_cf->ht == RSPAMD_OSB_HASH_XXHASH) {
+				cur = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64,
+														  begin, len, osb_cf->seed);
+			}
+			else {
+				rspamd_cryptobox_siphash((guchar *) &cur, begin,
+										 len, osb_cf->sk);
+
+				if (prefix) {
+					cur ^= seed;
+				}
+			}
+		}
+
+		if (token_flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) {
+			/* Unigrams bypass the window: one token referring to itself */
+			new_tok = rspamd_mempool_alloc0(task->task_pool, token_size);
+			new_tok->flags = token_flags;
+			new_tok->t1 = token;
+			new_tok->t2 = token;
+			new_tok->data = cur;
+			new_tok->window_idx = 0;
+			g_ptr_array_add(result, new_tok);
+
+			continue;
+		}
+
+/*
+ * Emit one token pairing the newest pipe entry (slot 0) with slot `i`.
+ * Relies on the locals `i`, `hashpipe`, `token_flags`, `token_size`, `h1`,
+ * `h2`, `new_tok`, `osb_cf`, `task` and `result` of the enclosing function.
+ * COMPAT mode stores two 32-bit combinations side by side in `data`; the
+ * other modes store a single 64-bit combination.
+ */
+#define ADD_TOKEN                                                                       \
+	do {                                                                                \
+		new_tok = rspamd_mempool_alloc0(task->task_pool, token_size);                   \
+		new_tok->flags = token_flags;                                                   \
+		new_tok->t1 = hashpipe[0].t;                                                    \
+		new_tok->t2 = hashpipe[i].t;                                                    \
+		if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {                                     \
+			h1 = ((guint32) hashpipe[0].h) * primes[0] +                                \
+				 ((guint32) hashpipe[i].h) * primes[i << 1];                            \
+			h2 = ((guint32) hashpipe[0].h) * primes[1] +                                \
+				 ((guint32) hashpipe[i].h) * primes[(i << 1) - 1];                      \
+			memcpy((guchar *) &new_tok->data, &h1, sizeof(h1));                         \
+			memcpy(((guchar *) &new_tok->data) + sizeof(h1), &h2, sizeof(h2));          \
+		}                                                                               \
+		else {                                                                          \
+			new_tok->data = hashpipe[0].h * primes[0] + hashpipe[i].h * primes[i << 1]; \
+		}                                                                               \
+		new_tok->window_idx = i;                                                        \
+		g_ptr_array_add(result, new_tok);                                               \
+	} while (0)
+
+		if (processed < window_size) {
+			/* Just fill a hashpipe */
+			/* Slots are filled from the end, so the newest word has the lowest index */
+			++processed;
+			hashpipe[window_size - processed].h = cur;
+			hashpipe[window_size - processed].t = token;
+		}
+		else {
+			/* Shift hashpipe */
+			for (i = window_size - 1; i > 0; i--) {
+				hashpipe[i] = hashpipe[i - 1];
+			}
+			hashpipe[0].h = cur;
+			hashpipe[0].t = token;
+
+			processed++;
+
+			/* Pair the new word with each older one, except exception words */
+			for (i = 1; i < window_size; i++) {
+				if (!(hashpipe[i].t->flags & RSPAMD_STAT_TOKEN_FLAG_EXCEPTION)) {
+					ADD_TOKEN;
+				}
+			}
+		}
+	}
+
+	/*
+	 * Input that never overflowed the window has produced no pairs so far:
+	 * move the filled slots to the front and pair them up once.
+	 * NOTE(review): after the decrement the most recently stored slot (at
+	 * index window_size - processed - 1) is left out of the move, and
+	 * `token_flags` holds the flags of the last word visited, even a skipped
+	 * one. Changing either alters the generated tokens - confirm before
+	 * touching.
+	 */
+	if (processed > 1 && processed <= window_size) {
+		processed--;
+		memmove(hashpipe, &hashpipe[window_size - processed],
+				processed * sizeof(hashpipe[0]));
+
+		for (i = 1; i < processed; i++) {
+			ADD_TOKEN;
+		}
+	}
+
+#undef ADD_TOKEN
+
+	return TRUE;
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 21:30:40 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 21:30:40 +0000
commit	133a45c109da5310add55824db21af5239951f93 (patch)
tree	ba6ac4c0a950a0dda56451944315d66409923918 /src/libstat/tokenizers/osb.c
parent	Initial commit. (diff)
download	rspamd-upstream.tar.xz rspamd-upstream.zip