diff options
Diffstat (limited to '')
-rw-r--r-- | src/libstat/tokenizers/osb.c | 424 |
1 files changed, 424 insertions, 0 deletions
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c new file mode 100644 index 0000000..d871c7a --- /dev/null +++ b/src/libstat/tokenizers/osb.c @@ -0,0 +1,424 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * OSB tokenizer + */ + + +#include "tokenizers.h" +#include "stat_internal.h" +#include "libmime/lang_detection.h" + +/* Size for features pipe */ +#define DEFAULT_FEATURE_WINDOW_SIZE 5 +#define DEFAULT_OSB_VERSION 2 + +static const int primes[] = { + 1, + 7, + 3, + 13, + 5, + 29, + 11, + 51, + 23, + 101, + 47, + 203, + 97, + 407, + 197, + 817, + 397, + 1637, + 797, + 3277, +}; + +static const guchar osb_tokenizer_magic[] = {'o', 's', 'b', 't', 'o', 'k', 'v', '2'}; + +enum rspamd_osb_hash_type { + RSPAMD_OSB_HASH_COMPAT = 0, + RSPAMD_OSB_HASH_XXHASH, + RSPAMD_OSB_HASH_SIPHASH +}; + +struct rspamd_osb_tokenizer_config { + guchar magic[8]; + gshort version; + gshort window_size; + enum rspamd_osb_hash_type ht; + guint64 seed; + rspamd_sipkey_t sk; +}; + +/* + * Return default config + */ +static struct rspamd_osb_tokenizer_config * +rspamd_tokenizer_osb_default_config(void) +{ + static struct rspamd_osb_tokenizer_config def; + + if (memcmp(def.magic, osb_tokenizer_magic, sizeof(osb_tokenizer_magic)) != 0) { + memset(&def, 0, sizeof(def)); + memcpy(def.magic, osb_tokenizer_magic, sizeof(osb_tokenizer_magic)); + def.version = DEFAULT_OSB_VERSION; + def.window_size = DEFAULT_FEATURE_WINDOW_SIZE; + def.ht = RSPAMD_OSB_HASH_XXHASH; + def.seed = 0xdeadbabe; + } + + return &def; +} + +static struct rspamd_osb_tokenizer_config * +rspamd_tokenizer_osb_config_from_ucl(rspamd_mempool_t *pool, + const ucl_object_t *obj) +{ + const ucl_object_t *elt; + struct rspamd_osb_tokenizer_config *cf, *def; + guchar *key = NULL; + gsize keylen; + + + if (pool != NULL) { + cf = rspamd_mempool_alloc0(pool, sizeof(*cf)); + } + else { + cf = g_malloc0(sizeof(*cf)); + } + + /* Use default config */ + def = rspamd_tokenizer_osb_default_config(); + memcpy(cf, def, sizeof(*cf)); + + elt = ucl_object_lookup(obj, "hash"); + if (elt != NULL && ucl_object_type(elt) == UCL_STRING) { + if (g_ascii_strncasecmp(ucl_object_tostring(elt), "xxh", 3) == 0) { + cf->ht = RSPAMD_OSB_HASH_XXHASH; + elt = ucl_object_lookup(obj, "seed"); + if (elt != NULL && ucl_object_type(elt) == UCL_INT) { + cf->seed = ucl_object_toint(elt); + } + } + else if (g_ascii_strncasecmp(ucl_object_tostring(elt), "sip", 3) == 0) { + cf->ht = RSPAMD_OSB_HASH_SIPHASH; + elt = ucl_object_lookup(obj, "key"); + + if (elt != NULL && ucl_object_type(elt) == UCL_STRING) { + key = rspamd_decode_base32(ucl_object_tostring(elt), + 0, &keylen, RSPAMD_BASE32_DEFAULT); + if (keylen < sizeof(rspamd_sipkey_t)) { + msg_warn("siphash key is too short: %z", keylen); + g_free(key); + } + else { + memcpy(cf->sk, key, sizeof(cf->sk)); + g_free(key); + } + } + else { + msg_warn_pool("siphash cannot be used without key"); + } + } + } + else { + elt = ucl_object_lookup(obj, "compat"); + if (elt != NULL && ucl_object_toboolean(elt)) { + cf->ht = RSPAMD_OSB_HASH_COMPAT; + } + } + + elt = ucl_object_lookup(obj, "window"); + if (elt != NULL && ucl_object_type(elt) == UCL_INT) { + cf->window_size = ucl_object_toint(elt); + if (cf->window_size > DEFAULT_FEATURE_WINDOW_SIZE * 4) { + msg_err_pool("too large window size: %d", cf->window_size); + cf->window_size = DEFAULT_FEATURE_WINDOW_SIZE; + } + } + + return cf; +} + +gpointer +rspamd_tokenizer_osb_get_config(rspamd_mempool_t *pool, + struct rspamd_tokenizer_config *cf, + gsize *len) +{ + struct rspamd_osb_tokenizer_config *osb_cf, *def; + + if (cf != NULL && cf->opts != NULL) { + osb_cf = rspamd_tokenizer_osb_config_from_ucl(pool, cf->opts); + } + else { + def = rspamd_tokenizer_osb_default_config(); + osb_cf = rspamd_mempool_alloc(pool, sizeof(*osb_cf)); + memcpy(osb_cf, def, sizeof(*osb_cf)); + /* Do not write sipkey to statfile */ + } + + if (osb_cf->ht == RSPAMD_OSB_HASH_SIPHASH) { + msg_info_pool("siphash key is not stored into statfiles, so you'd " + "need to keep it inside the configuration"); + } + + memset(osb_cf->sk, 0, sizeof(osb_cf->sk)); + + if (len != NULL) { + *len = sizeof(*osb_cf); + } + + return osb_cf; +} + +#if 0 +gboolean +rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt, + gpointer ptr, gsize len) +{ + struct rspamd_osb_tokenizer_config *osb_cf, *test_cf; + gboolean ret = FALSE; + + test_cf = rt->config; + g_assert (test_cf != NULL); + + if (len == sizeof (*osb_cf)) { + osb_cf = ptr; + + if (memcmp (osb_cf, osb_tokenizer_magic, sizeof (osb_tokenizer_magic)) != 0) { + ret = test_cf->ht == RSPAMD_OSB_HASH_COMPAT; + } + else { + if (osb_cf->version == DEFAULT_OSB_VERSION) { + /* We can compare them directly now */ + ret = (memcmp (osb_cf, test_cf, sizeof (*osb_cf) + - sizeof (osb_cf->sk))) == 0; + } + } + } + else { + /* We are compatible now merely with fallback config */ + if (test_cf->ht == RSPAMD_OSB_HASH_COMPAT) { + ret = TRUE; + } + } + + return ret; +} + +gboolean +rspamd_tokenizer_osb_load_config (rspamd_mempool_t *pool, + struct rspamd_tokenizer_runtime *rt, + gpointer ptr, gsize len) +{ + struct rspamd_osb_tokenizer_config *osb_cf; + + if (ptr == NULL || len == 0) { + osb_cf = rspamd_tokenizer_osb_config_from_ucl (pool, rt->tkcf->opts); + + if (osb_cf->ht != RSPAMD_OSB_HASH_COMPAT) { + /* Trying to load incompatible configuration */ + msg_err_pool ("cannot load tokenizer configuration from a legacy " + "statfile; maybe you have forgotten to set 'compat' option" + " in the tokenizer configuration"); + + return FALSE; + } + } + else { + g_assert (len == sizeof (*osb_cf)); + osb_cf = ptr; + } + + rt->config = osb_cf; + rt->conf_len = sizeof (*osb_cf); + + return TRUE; +} + +gboolean +rspamd_tokenizer_osb_is_compat (struct rspamd_tokenizer_runtime *rt) +{ + struct rspamd_osb_tokenizer_config *osb_cf = rt->config; + + return (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT); +} +#endif + +struct token_pipe_entry { + guint64 h; + rspamd_stat_token_t *t; +}; + +gint rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx, + struct rspamd_task *task, + GArray *words, + gboolean is_utf, + const gchar *prefix, + GPtrArray *result) +{ + rspamd_token_t *new_tok = NULL; + rspamd_stat_token_t *token; + struct rspamd_osb_tokenizer_config *osb_cf; + guint64 cur, seed; + struct token_pipe_entry *hashpipe; + guint32 h1, h2; + gsize token_size; + guint processed = 0, i, w, window_size, token_flags = 0; + + if (words == NULL) { + return FALSE; + } + + osb_cf = ctx->tkcf; + window_size = osb_cf->window_size; + + if (prefix) { + seed = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64, + prefix, strlen(prefix), osb_cf->seed); + } + else { + seed = osb_cf->seed; + } + + hashpipe = g_alloca(window_size * sizeof(hashpipe[0])); + for (i = 0; i < window_size; i++) { + hashpipe[i].h = 0xfe; + hashpipe[i].t = NULL; + } + + token_size = sizeof(rspamd_token_t) + + sizeof(gdouble) * ctx->statfiles->len; + g_assert(token_size > 0); + + for (w = 0; w < words->len; w++) { + token = &g_array_index(words, rspamd_stat_token_t, w); + token_flags = token->flags; + const gchar *begin; + gsize len; + + if (token->flags & + (RSPAMD_STAT_TOKEN_FLAG_STOP_WORD | RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) { + /* Skip stop/skipped words */ + continue; + } + + if (token->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { + begin = token->stemmed.begin; + len = token->stemmed.len; + } + else { + begin = token->original.begin; + len = token->original.len; + } + + if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { + rspamd_ftok_t ftok; + + ftok.begin = begin; + ftok.len = len; + cur = rspamd_fstrhash_lc(&ftok, is_utf); + } + else { + /* We know that the words are normalized */ + if (osb_cf->ht == RSPAMD_OSB_HASH_XXHASH) { + cur = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64, + begin, len, osb_cf->seed); + } + else { + rspamd_cryptobox_siphash((guchar *) &cur, begin, + len, osb_cf->sk); + + if (prefix) { + cur ^= seed; + } + } + } + + if (token_flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) { + new_tok = rspamd_mempool_alloc0(task->task_pool, token_size); + new_tok->flags = token_flags; + new_tok->t1 = token; + new_tok->t2 = token; + new_tok->data = cur; + new_tok->window_idx = 0; + g_ptr_array_add(result, new_tok); + + continue; + } + +#define ADD_TOKEN \ + do { \ + new_tok = rspamd_mempool_alloc0(task->task_pool, token_size); \ + new_tok->flags = token_flags; \ + new_tok->t1 = hashpipe[0].t; \ + new_tok->t2 = hashpipe[i].t; \ + if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { \ + h1 = ((guint32) hashpipe[0].h) * primes[0] + \ + ((guint32) hashpipe[i].h) * primes[i << 1]; \ + h2 = ((guint32) hashpipe[0].h) * primes[1] + \ + ((guint32) hashpipe[i].h) * primes[(i << 1) - 1]; \ + memcpy((guchar *) &new_tok->data, &h1, sizeof(h1)); \ + memcpy(((guchar *) &new_tok->data) + sizeof(h1), &h2, sizeof(h2)); \ + } \ + else { \ + new_tok->data = hashpipe[0].h * primes[0] + hashpipe[i].h * primes[i << 1]; \ + } \ + new_tok->window_idx = i; \ + g_ptr_array_add(result, new_tok); \ + } while (0) + + if (processed < window_size) { + /* Just fill a hashpipe */ + ++processed; + hashpipe[window_size - processed].h = cur; + hashpipe[window_size - processed].t = token; + } + else { + /* Shift hashpipe */ + for (i = window_size - 1; i > 0; i--) { + hashpipe[i] = hashpipe[i - 1]; + } + hashpipe[0].h = cur; + hashpipe[0].t = token; + + processed++; + + for (i = 1; i < window_size; i++) { + if (!(hashpipe[i].t->flags & RSPAMD_STAT_TOKEN_FLAG_EXCEPTION)) { + ADD_TOKEN; + } + } + } + } + + if (processed > 1 && processed <= window_size) { + processed--; + memmove(hashpipe, &hashpipe[window_size - processed], + processed * sizeof(hashpipe[0])); + + for (i = 1; i < processed; i++) { + ADD_TOKEN; + } + } + +#undef ADD_TOKEN + + return TRUE; +} |