diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
commit | 133a45c109da5310add55824db21af5239951f93 (patch) | |
tree | ba6ac4c0a950a0dda56451944315d66409923918 /src/libserver/re_cache.c | |
parent | Initial commit. (diff) | |
download | rspamd-133a45c109da5310add55824db21af5239951f93.tar.xz rspamd-133a45c109da5310add55824db21af5239951f93.zip |
Adding upstream version 3.8.1. (refs: upstream/3.8.1, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/libserver/re_cache.c')
-rw-r--r-- | src/libserver/re_cache.c | 2712 |
1 files changed, 2712 insertions, 0 deletions
diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c new file mode 100644 index 0000000..d51dba6 --- /dev/null +++ b/src/libserver/re_cache.c @@ -0,0 +1,2712 @@ +/* + * Copyright 2024 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "libmime/message.h" +#include "re_cache.h" +#include "cryptobox.h" +#include "ref.h" +#include "libserver/url.h" +#include "libserver/task.h" +#include "libserver/cfg_file.h" +#include "libutil/util.h" +#include "libutil/regexp.h" +#include "lua/lua_common.h" +#include "libstat/stat_api.h" +#include "contrib/uthash/utlist.h" + +#include "khash.h" + +#ifdef WITH_HYPERSCAN +#include "hs.h" +#include "hyperscan_tools.h" +#endif + +#include "unix-std.h" +#include <signal.h> +#include <stdalign.h> +#include <math.h> +#include "contrib/libev/ev.h" + +#ifndef WITH_PCRE2 +#include <pcre.h> +#else +#include <pcre2.h> +#endif + +#include "contrib/fastutf8/fastutf8.h" + +#ifdef HAVE_SYS_WAIT_H +#include <sys/wait.h> +#endif + +#define msg_err_re_cache(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "re_cache", cache->hash, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_re_cache(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "re_cache", cache->hash, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_re_cache(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "re_cache", cache->hash, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +#define msg_debug_re_task(...) 
/*
 * A group of regexps that share the same input type and type-specific data
 * (for example the same header name). Classes are stored in
 * rspamd_re_cache::re_classes keyed by `id`.
 */
struct rspamd_re_class {
	/* Class key: fast hash of `type` and `type_data`, see rspamd_re_cache_class_id() */
	guint64 id;
	/* Kind of input the regexps of this class are matched against */
	enum rspamd_re_type type;
	/* Set once any regexp added to the class carries RSPAMD_REGEXP_FLAG_UTF */
	gboolean has_utf8;
	/* Private copy of the caller supplied type data; freed with the class */
	gpointer type_data;
	/* Length of `type_data` in bytes (0 when there is no data) */
	gsize type_len;
	/* Regexp id -> regexp; the table holds one reference on each value */
	GHashTable *re;
	/*
	 * Temporary hash state: allocated with posix_memalign() and released
	 * with free() inside rspamd_re_cache_init(), NULL otherwise
	 */
	rspamd_cryptobox_hash_state_t *st;

	/* Hex digest of the class, filled by rspamd_re_cache_init(); NUL terminated */
	gchar hash[rspamd_cryptobox_HASHBYTES + 1];

#ifdef WITH_HYPERSCAN
	/* Compiled hyperscan database for this class (released by rspamd_hyperscan_free) */
	rspamd_hyperscan_t *hs_db;
	/* Scratch space passed to hs_scan() for this class */
	hs_scratch_t *hs_scratch;
	/* Cache ids of the regexps included in the hyperscan database */
	gint *hs_ids;
	/* Number of elements in `hs_ids` */
	guint nhs;
#endif
};
khash_t(selectors_results_hash) * sel_cache; + struct rspamd_re_cache *cache; + struct rspamd_re_cache_stat stat; + gboolean has_hs; +}; + +static GQuark +rspamd_re_cache_quark(void) +{ + return g_quark_from_static_string("re_cache"); +} + +static guint64 +rspamd_re_cache_class_id(enum rspamd_re_type type, + gconstpointer type_data, + gsize datalen) +{ + rspamd_cryptobox_fast_hash_state_t st; + + rspamd_cryptobox_fast_hash_init(&st, 0xdeadbabe); + rspamd_cryptobox_fast_hash_update(&st, &type, sizeof(type)); + + if (datalen > 0) { + rspamd_cryptobox_fast_hash_update(&st, type_data, datalen); + } + + return rspamd_cryptobox_fast_hash_final(&st); +} + +static void +rspamd_re_cache_destroy(struct rspamd_re_cache *cache) +{ + GHashTableIter it; + gpointer k, v; + struct rspamd_re_class *re_class; + gchar *skey; + gint sref; + + g_assert(cache != NULL); + g_hash_table_iter_init(&it, cache->re_classes); + + while (g_hash_table_iter_next(&it, &k, &v)) { + re_class = v; + g_hash_table_iter_steal(&it); + g_hash_table_unref(re_class->re); + + if (re_class->type_data) { + g_free(re_class->type_data); + } + +#ifdef WITH_HYPERSCAN + if (re_class->hs_db) { + rspamd_hyperscan_free(re_class->hs_db, false); + } + if (re_class->hs_scratch) { + hs_free_scratch(re_class->hs_scratch); + } + if (re_class->hs_ids) { + g_free(re_class->hs_ids); + } +#endif + g_free(re_class); + } + + if (cache->L) { + kh_foreach(cache->selectors, skey, sref, { + luaL_unref(cache->L, LUA_REGISTRYINDEX, sref); + g_free(skey); + }); + + struct rspamd_re_cache_elt *elt; + guint i; + + PTR_ARRAY_FOREACH(cache->re, i, elt) + { + if (elt->lua_cbref != -1) { + luaL_unref(cache->L, LUA_REGISTRYINDEX, elt->lua_cbref); + } + } + } + + kh_destroy(lua_selectors_hash, cache->selectors); + + g_hash_table_unref(cache->re_classes); + g_ptr_array_free(cache->re, TRUE); + g_free(cache); +} + +static void +rspamd_re_cache_elt_dtor(gpointer e) +{ + struct rspamd_re_cache_elt *elt = e; + + rspamd_regexp_unref(elt->re); + 
g_free(elt); +} + +struct rspamd_re_cache * +rspamd_re_cache_new(void) +{ + struct rspamd_re_cache *cache; + + cache = g_malloc0(sizeof(*cache)); + cache->re_classes = g_hash_table_new(g_int64_hash, g_int64_equal); + cache->nre = 0; + cache->re = g_ptr_array_new_full(256, rspamd_re_cache_elt_dtor); + cache->selectors = kh_init(lua_selectors_hash); +#ifdef WITH_HYPERSCAN + cache->hyperscan_loaded = RSPAMD_HYPERSCAN_UNKNOWN; +#endif + REF_INIT_RETAIN(cache, rspamd_re_cache_destroy); + + return cache; +} + +enum rspamd_hyperscan_status +rspamd_re_cache_is_hs_loaded(struct rspamd_re_cache *cache) +{ + g_assert(cache != NULL); + +#ifdef WITH_HYPERSCAN + return cache->hyperscan_loaded; +#else + return RSPAMD_HYPERSCAN_UNSUPPORTED; +#endif +} + +rspamd_regexp_t * +rspamd_re_cache_add(struct rspamd_re_cache *cache, + rspamd_regexp_t *re, + enum rspamd_re_type type, + gconstpointer type_data, gsize datalen, + gint lua_cbref) +{ + guint64 class_id; + struct rspamd_re_class *re_class; + rspamd_regexp_t *nre; + struct rspamd_re_cache_elt *elt; + + g_assert(cache != NULL); + g_assert(re != NULL); + + class_id = rspamd_re_cache_class_id(type, type_data, datalen); + re_class = g_hash_table_lookup(cache->re_classes, &class_id); + + if (re_class == NULL) { + re_class = g_malloc0(sizeof(*re_class)); + re_class->id = class_id; + re_class->type_len = datalen; + re_class->type = type; + re_class->re = g_hash_table_new_full(rspamd_regexp_hash, + rspamd_regexp_equal, NULL, (GDestroyNotify) rspamd_regexp_unref); + + if (datalen > 0) { + re_class->type_data = g_malloc0(datalen); + memcpy(re_class->type_data, type_data, datalen); + } + + g_hash_table_insert(cache->re_classes, &re_class->id, re_class); + } + + if ((nre = g_hash_table_lookup(re_class->re, rspamd_regexp_get_id(re))) == NULL) { + /* + * We set re id based on the global position in the cache + */ + elt = g_malloc0(sizeof(*elt)); + /* One ref for re_class */ + nre = rspamd_regexp_ref(re); + rspamd_regexp_set_cache_id(re, 
cache->nre++); + /* One ref for cache */ + elt->re = rspamd_regexp_ref(re); + g_ptr_array_add(cache->re, elt); + rspamd_regexp_set_class(re, re_class); + elt->lua_cbref = lua_cbref; + + g_hash_table_insert(re_class->re, rspamd_regexp_get_id(nre), nre); + } + + if (rspamd_regexp_get_flags(re) & RSPAMD_REGEXP_FLAG_UTF) { + re_class->has_utf8 = TRUE; + } + + return nre; +} + +void rspamd_re_cache_replace(struct rspamd_re_cache *cache, + rspamd_regexp_t *what, + rspamd_regexp_t *with) +{ + guint64 re_id; + struct rspamd_re_class *re_class; + rspamd_regexp_t *src; + struct rspamd_re_cache_elt *elt; + + g_assert(cache != NULL); + g_assert(what != NULL); + g_assert(with != NULL); + + re_class = rspamd_regexp_get_class(what); + + if (re_class != NULL) { + re_id = rspamd_regexp_get_cache_id(what); + + g_assert(re_id != RSPAMD_INVALID_ID); + src = g_hash_table_lookup(re_class->re, rspamd_regexp_get_id(what)); + elt = g_ptr_array_index(cache->re, re_id); + g_assert(elt != NULL); + g_assert(src != NULL); + + rspamd_regexp_set_cache_id(what, RSPAMD_INVALID_ID); + rspamd_regexp_set_class(what, NULL); + rspamd_regexp_set_cache_id(with, re_id); + rspamd_regexp_set_class(with, re_class); + /* + * On calling of this function, we actually unref old re (what) + */ + g_hash_table_insert(re_class->re, + rspamd_regexp_get_id(what), + rspamd_regexp_ref(with)); + + rspamd_regexp_unref(elt->re); + elt->re = rspamd_regexp_ref(with); + /* XXX: do not touch match type here */ + } +} + +static gint +rspamd_re_cache_sort_func(gconstpointer a, gconstpointer b) +{ + struct rspamd_re_cache_elt *const *re1 = a, *const *re2 = b; + + return rspamd_regexp_cmp(rspamd_regexp_get_id((*re1)->re), + rspamd_regexp_get_id((*re2)->re)); +} + +void rspamd_re_cache_init(struct rspamd_re_cache *cache, struct rspamd_config *cfg) +{ + guint i, fl; + GHashTableIter it; + gpointer k, v; + struct rspamd_re_class *re_class; + rspamd_cryptobox_hash_state_t st_global; + rspamd_regexp_t *re; + struct rspamd_re_cache_elt 
*elt; + guchar hash_out[rspamd_cryptobox_HASHBYTES]; + + g_assert(cache != NULL); + + rspamd_cryptobox_hash_init(&st_global, NULL, 0); + /* Resort all regexps */ + g_ptr_array_sort(cache->re, rspamd_re_cache_sort_func); + + for (i = 0; i < cache->re->len; i++) { + elt = g_ptr_array_index(cache->re, i); + re = elt->re; + re_class = rspamd_regexp_get_class(re); + g_assert(re_class != NULL); + rspamd_regexp_set_cache_id(re, i); + + if (re_class->st == NULL) { + (void) !posix_memalign((void **) &re_class->st, RSPAMD_ALIGNOF(rspamd_cryptobox_hash_state_t), + sizeof(*re_class->st)); + g_assert(re_class->st != NULL); + rspamd_cryptobox_hash_init(re_class->st, NULL, 0); + } + + /* Update hashes */ + /* Id of re class */ + rspamd_cryptobox_hash_update(re_class->st, (gpointer) &re_class->id, + sizeof(re_class->id)); + rspamd_cryptobox_hash_update(&st_global, (gpointer) &re_class->id, + sizeof(re_class->id)); + /* Id of re expression */ + rspamd_cryptobox_hash_update(re_class->st, rspamd_regexp_get_id(re), + rspamd_cryptobox_HASHBYTES); + rspamd_cryptobox_hash_update(&st_global, rspamd_regexp_get_id(re), + rspamd_cryptobox_HASHBYTES); + /* PCRE flags */ + fl = rspamd_regexp_get_pcre_flags(re); + rspamd_cryptobox_hash_update(re_class->st, (const guchar *) &fl, + sizeof(fl)); + rspamd_cryptobox_hash_update(&st_global, (const guchar *) &fl, + sizeof(fl)); + /* Rspamd flags */ + fl = rspamd_regexp_get_flags(re); + rspamd_cryptobox_hash_update(re_class->st, (const guchar *) &fl, + sizeof(fl)); + rspamd_cryptobox_hash_update(&st_global, (const guchar *) &fl, + sizeof(fl)); + /* Limit of hits */ + fl = rspamd_regexp_get_maxhits(re); + rspamd_cryptobox_hash_update(re_class->st, (const guchar *) &fl, + sizeof(fl)); + rspamd_cryptobox_hash_update(&st_global, (const guchar *) &fl, + sizeof(fl)); + /* Numeric order */ + rspamd_cryptobox_hash_update(re_class->st, (const guchar *) &i, + sizeof(i)); + rspamd_cryptobox_hash_update(&st_global, (const guchar *) &i, + sizeof(i)); + } + + 
rspamd_cryptobox_hash_final(&st_global, hash_out); + rspamd_snprintf(cache->hash, sizeof(cache->hash), "%*xs", + (gint) rspamd_cryptobox_HASHBYTES, hash_out); + + /* Now finalize all classes */ + g_hash_table_iter_init(&it, cache->re_classes); + + while (g_hash_table_iter_next(&it, &k, &v)) { + re_class = v; + + if (re_class->st) { + /* + * We finally update all classes with the number of expressions + * in the cache to ensure that if even a single re has been changed + * we won't be broken due to id mismatch + */ + rspamd_cryptobox_hash_update(re_class->st, + (gpointer) &cache->re->len, + sizeof(cache->re->len)); + rspamd_cryptobox_hash_final(re_class->st, hash_out); + rspamd_snprintf(re_class->hash, sizeof(re_class->hash), "%*xs", + (gint) rspamd_cryptobox_HASHBYTES, hash_out); + free(re_class->st); /* Due to posix_memalign */ + re_class->st = NULL; + } + } + + cache->L = cfg->lua_state; + +#ifdef WITH_HYPERSCAN + const gchar *platform = "generic"; + rspamd_fstring_t *features = rspamd_fstring_new(); + + cache->disable_hyperscan = cfg->disable_hyperscan; + + g_assert(hs_populate_platform(&cache->plt) == HS_SUCCESS); + + /* Now decode what we do have */ + switch (cache->plt.tune) { + case HS_TUNE_FAMILY_HSW: + platform = "haswell"; + break; + case HS_TUNE_FAMILY_SNB: + platform = "sandy"; + break; + case HS_TUNE_FAMILY_BDW: + platform = "broadwell"; + break; + case HS_TUNE_FAMILY_IVB: + platform = "ivy"; + break; + default: + break; + } + + if (cache->plt.cpu_features & HS_CPU_FEATURES_AVX2) { + features = rspamd_fstring_append(features, "AVX2", 4); + } + + hs_set_allocator(g_malloc, g_free); + + msg_info_re_cache("loaded hyperscan engine with cpu tune '%s' and features '%V'", + platform, features); + + rspamd_fstring_free(features); +#endif +} + +struct rspamd_re_runtime * +rspamd_re_cache_runtime_new(struct rspamd_re_cache *cache) +{ + struct rspamd_re_runtime *rt; + g_assert(cache != NULL); + + rt = g_malloc0(sizeof(*rt) + NBYTES(cache->nre) + cache->nre); + 
rt->cache = cache; + REF_RETAIN(cache); + rt->checked = ((guchar *) rt) + sizeof(*rt); + rt->results = rt->checked + NBYTES(cache->nre); + rt->stat.regexp_total = cache->nre; +#ifdef WITH_HYPERSCAN + rt->has_hs = cache->hyperscan_loaded; +#endif + + return rt; +} + +const struct rspamd_re_cache_stat * +rspamd_re_cache_get_stat(struct rspamd_re_runtime *rt) +{ + g_assert(rt != NULL); + + return &rt->stat; +} + +static gboolean +rspamd_re_cache_check_lua_condition(struct rspamd_task *task, + rspamd_regexp_t *re, + const guchar *in, gsize len, + goffset start, goffset end, + gint lua_cbref) +{ + lua_State *L = (lua_State *) task->cfg->lua_state; + GError *err = NULL; + struct rspamd_lua_text __attribute__((unused)) * t; + gint text_pos; + + if (G_LIKELY(lua_cbref == -1)) { + return TRUE; + } + + t = lua_new_text(L, in, len, FALSE); + text_pos = lua_gettop(L); + + if (!rspamd_lua_universal_pcall(L, lua_cbref, + G_STRLOC, 1, "utii", &err, + "rspamd{task}", task, + text_pos, start, end)) { + msg_warn_task("cannot call for re_cache_check_lua_condition for re %s: %e", + rspamd_regexp_get_pattern(re), err); + g_error_free(err); + lua_settop(L, text_pos - 1); + + return TRUE; + } + + gboolean res = lua_toboolean(L, -1); + + lua_settop(L, text_pos - 1); + + return res; +} + +static guint +rspamd_re_cache_process_pcre(struct rspamd_re_runtime *rt, + rspamd_regexp_t *re, struct rspamd_task *task, + const guchar *in, gsize len, + gboolean is_raw, + gint lua_cbref) +{ + guint r = 0; + const gchar *start = NULL, *end = NULL; + guint max_hits = rspamd_regexp_get_maxhits(re); + guint64 id = rspamd_regexp_get_cache_id(re); + gdouble t1 = NAN, t2, pr; + const gdouble slow_time = 1e8; + + if (in == NULL) { + return rt->results[id]; + } + + if (len == 0) { + return rt->results[id]; + } + + if (rt->cache->max_re_data > 0 && len > rt->cache->max_re_data) { + len = rt->cache->max_re_data; + } + + r = rt->results[id]; + + if (max_hits == 0 || r < max_hits) { + pr = 
rspamd_random_double_fast(); + + if (pr > 0.9) { + t1 = rspamd_get_ticks(TRUE); + } + + while (rspamd_regexp_search(re, + in, + len, + &start, + &end, + is_raw, + NULL)) { + if (rspamd_re_cache_check_lua_condition(task, re, in, len, + start - (const gchar *) in, end - (const gchar *) in, lua_cbref)) { + r++; + msg_debug_re_task("found regexp /%s/, total hits: %d", + rspamd_regexp_get_pattern(re), r); + } + + if (max_hits > 0 && r >= max_hits) { + break; + } + } + + rt->results[id] += r; + rt->stat.regexp_checked++; + rt->stat.bytes_scanned_pcre += len; + rt->stat.bytes_scanned += len; + + if (r > 0) { + rt->stat.regexp_matched += r; + } + + if (!isnan(t1)) { + t2 = rspamd_get_ticks(TRUE); + + if (t2 - t1 > slow_time) { + rspamd_symcache_enable_profile(task); + msg_info_task("regexp '%16s' took %.0f ticks to execute", + rspamd_regexp_get_pattern(re), t2 - t1); + } + } + } + + return r; +} + +#ifdef WITH_HYPERSCAN +struct rspamd_re_hyperscan_cbdata { + struct rspamd_re_runtime *rt; + const guchar **ins; + const guint *lens; + guint count; + rspamd_regexp_t *re; + struct rspamd_task *task; +}; + +static gint +rspamd_re_cache_hyperscan_cb(unsigned int id, + unsigned long long from, + unsigned long long to, + unsigned int flags, + void *ud) +{ + struct rspamd_re_hyperscan_cbdata *cbdata = ud; + struct rspamd_re_runtime *rt; + struct rspamd_re_cache_elt *cache_elt; + guint ret, maxhits, i, processed; + struct rspamd_task *task; + + rt = cbdata->rt; + task = cbdata->task; + cache_elt = g_ptr_array_index(rt->cache->re, id); + maxhits = rspamd_regexp_get_maxhits(cache_elt->re); + + if (cache_elt->match_type == RSPAMD_RE_CACHE_HYPERSCAN) { + if (rspamd_re_cache_check_lua_condition(task, cache_elt->re, + cbdata->ins[0], cbdata->lens[0], from, to, cache_elt->lua_cbref)) { + ret = 1; + setbit(rt->checked, id); + + if (maxhits == 0 || rt->results[id] < maxhits) { + rt->results[id] += ret; + rt->stat.regexp_matched++; + } + msg_debug_re_task("found regexp /%s/ using hyperscan 
only, total hits: %d", + rspamd_regexp_get_pattern(cache_elt->re), rt->results[id]); + } + } + else { + if (!isset(rt->checked, id)) { + + processed = 0; + + for (i = 0; i < cbdata->count; i++) { + rspamd_re_cache_process_pcre(rt, + cache_elt->re, + cbdata->task, + cbdata->ins[i], + cbdata->lens[i], + FALSE, + cache_elt->lua_cbref); + setbit(rt->checked, id); + + processed += cbdata->lens[i]; + + if (processed >= to) { + break; + } + } + } + } + + return 0; +} +#endif + +static guint +rspamd_re_cache_process_regexp_data(struct rspamd_re_runtime *rt, + rspamd_regexp_t *re, struct rspamd_task *task, + const guchar **in, guint *lens, + guint count, + gboolean is_raw, + gboolean *processed_hyperscan) +{ + + guint64 re_id; + guint ret = 0; + guint i; + struct rspamd_re_cache_elt *cache_elt; + + re_id = rspamd_regexp_get_cache_id(re); + + if (count == 0 || in == NULL) { + /* We assume this as absence of the specified data */ + setbit(rt->checked, re_id); + rt->results[re_id] = ret; + return ret; + } + + cache_elt = (struct rspamd_re_cache_elt *) g_ptr_array_index(rt->cache->re, re_id); + +#ifndef WITH_HYPERSCAN + for (i = 0; i < count; i++) { + ret = rspamd_re_cache_process_pcre(rt, + re, + task, + in[i], + lens[i], + is_raw, + cache_elt->lua_cbref); + rt->results[re_id] = ret; + } + + setbit(rt->checked, re_id); +#else + struct rspamd_re_class *re_class; + struct rspamd_re_hyperscan_cbdata cbdata; + + cache_elt = g_ptr_array_index(rt->cache->re, re_id); + re_class = rspamd_regexp_get_class(re); + + if (rt->cache->disable_hyperscan || cache_elt->match_type == RSPAMD_RE_CACHE_PCRE || + !rt->has_hs || (is_raw && re_class->has_utf8)) { + for (i = 0; i < count; i++) { + ret = rspamd_re_cache_process_pcre(rt, + re, + task, + in[i], + lens[i], + is_raw, + cache_elt->lua_cbref); + } + + setbit(rt->checked, re_id); + } + else { + for (i = 0; i < count; i++) { + /* For Hyperscan we can probably safely disable all those limits */ +#if 0 + if (rt->cache->max_re_data > 0 && lens[i] 
> rt->cache->max_re_data) { + lens[i] = rt->cache->max_re_data; + } +#endif + rt->stat.bytes_scanned += lens[i]; + } + + g_assert(re_class->hs_scratch != NULL); + g_assert(re_class->hs_db != NULL); + + /* Go through hyperscan API */ + for (i = 0; i < count; i++) { + cbdata.ins = &in[i]; + cbdata.re = re; + cbdata.rt = rt; + cbdata.lens = &lens[i]; + cbdata.count = 1; + cbdata.task = task; + + if ((hs_scan(rspamd_hyperscan_get_database(re_class->hs_db), + in[i], lens[i], 0, + re_class->hs_scratch, + rspamd_re_cache_hyperscan_cb, &cbdata)) != HS_SUCCESS) { + ret = 0; + } + else { + ret = rt->results[re_id]; + *processed_hyperscan = TRUE; + } + } + } +#endif + + return ret; +} + +static void +rspamd_re_cache_finish_class(struct rspamd_task *task, + struct rspamd_re_runtime *rt, + struct rspamd_re_class *re_class, + const gchar *class_name) +{ +#ifdef WITH_HYPERSCAN + guint i; + guint64 re_id; + guint found = 0; + + /* Set all bits that are not checked and included in hyperscan to 1 */ + for (i = 0; i < re_class->nhs; i++) { + re_id = re_class->hs_ids[i]; + + if (!isset(rt->checked, re_id)) { + g_assert(rt->results[re_id] == 0); + rt->results[re_id] = 0; + setbit(rt->checked, re_id); + } + else { + found++; + } + } + + msg_debug_re_task("finished hyperscan for class %s; %d " + "matches found; %d hyperscan supported regexps; %d total regexps", + class_name, found, re_class->nhs, (gint) g_hash_table_size(re_class->re)); +#endif +} + +static gboolean +rspamd_re_cache_process_selector(struct rspamd_task *task, + struct rspamd_re_runtime *rt, + const gchar *name, + guchar ***svec, + guint **lenvec, + guint *n) +{ + gint ref; + khiter_t k; + lua_State *L; + gint err_idx, ret; + struct rspamd_task **ptask; + gboolean result = FALSE; + struct rspamd_re_cache *cache = rt->cache; + struct rspamd_re_selector_result *sr; + + L = cache->L; + k = kh_get(lua_selectors_hash, cache->selectors, (gchar *) name); + + if (k == kh_end(cache->selectors)) { + msg_err_task("cannot find 
selector %s, not registered", name); + + return FALSE; + } + + ref = kh_value(cache->selectors, k); + + /* First, search for the cached result */ + if (rt->sel_cache) { + k = kh_get(selectors_results_hash, rt->sel_cache, ref); + + if (k != kh_end(rt->sel_cache)) { + sr = &kh_value(rt->sel_cache, k); + + *svec = sr->scvec; + *lenvec = sr->lenvec; + *n = sr->cnt; + + return TRUE; + } + } + else { + rt->sel_cache = kh_init(selectors_results_hash); + } + + lua_pushcfunction(L, &rspamd_lua_traceback); + err_idx = lua_gettop(L); + + lua_rawgeti(L, LUA_REGISTRYINDEX, ref); + ptask = lua_newuserdata(L, sizeof(*ptask)); + *ptask = task; + rspamd_lua_setclass(L, "rspamd{task}", -1); + + if ((ret = lua_pcall(L, 1, 1, err_idx)) != 0) { + msg_err_task("call to selector %s " + "failed (%d): %s", + name, ret, + lua_tostring(L, -1)); + } + else { + struct rspamd_lua_text *txt; + gsize slen; + const gchar *sel_data; + + if (lua_type(L, -1) != LUA_TTABLE) { + txt = lua_check_text_or_string(L, -1); + + + if (txt) { + msg_debug_re_cache("re selector %s returned 1 element", name); + sel_data = txt->start; + slen = txt->len; + *n = 1; + *svec = g_malloc(sizeof(guchar *)); + *lenvec = g_malloc(sizeof(guint)); + (*svec)[0] = g_malloc(slen); + memcpy((*svec)[0], sel_data, slen); + (*lenvec)[0] = slen; + result = TRUE; + } + else { + msg_debug_re_cache("re selector %s returned NULL", name); + } + } + else { + *n = rspamd_lua_table_size(L, -1); + + msg_debug_re_cache("re selector %s returned %d elements", name, *n); + + if (*n > 0) { + *svec = g_malloc(sizeof(guchar *) * (*n)); + *lenvec = g_malloc(sizeof(guint) * (*n)); + + for (int i = 0; i < *n; i++) { + lua_rawgeti(L, -1, i + 1); + + txt = lua_check_text_or_string(L, -1); + if (txt && txt->len > 0) { + sel_data = txt->start; + slen = txt->len; + (*svec)[i] = g_malloc(slen); + memcpy((*svec)[i], sel_data, slen); + } + else { + /* A hack to avoid malloc(0) */ + sel_data = ""; + slen = 0; + (*svec)[i] = g_malloc(1); + memcpy((*svec)[i], 
sel_data, 1); + } + + (*lenvec)[i] = slen; + lua_pop(L, 1); + } + } + + /* Empty table is also a valid result */ + result = TRUE; + } + } + + lua_settop(L, err_idx - 1); + + if (result) { + k = kh_put(selectors_results_hash, rt->sel_cache, ref, &ret); + sr = &kh_value(rt->sel_cache, k); + + sr->cnt = *n; + sr->scvec = *svec; + sr->lenvec = *lenvec; + } + + return result; +} + +static inline guint +rspamd_process_words_vector(GArray *words, + const guchar **scvec, + guint *lenvec, + struct rspamd_re_class *re_class, + guint cnt, + gboolean *raw) +{ + guint j; + rspamd_stat_token_t *tok; + + if (words) { + for (j = 0; j < words->len; j++) { + tok = &g_array_index(words, rspamd_stat_token_t, j); + + if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { + if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) { + if (!re_class->has_utf8) { + *raw = TRUE; + } + else { + continue; /* Skip */ + } + } + } + else { + continue; /* Skip non text */ + } + + if (re_class->type == RSPAMD_RE_RAWWORDS) { + if (tok->original.len > 0) { + scvec[cnt] = tok->original.begin; + lenvec[cnt++] = tok->original.len; + } + } + else if (re_class->type == RSPAMD_RE_WORDS) { + if (tok->normalized.len > 0) { + scvec[cnt] = tok->normalized.begin; + lenvec[cnt++] = tok->normalized.len; + } + } + else { + /* Stemmed words */ + if (tok->stemmed.len > 0) { + scvec[cnt] = tok->stemmed.begin; + lenvec[cnt++] = tok->stemmed.len; + } + } + } + } + + return cnt; +} + +static guint +rspamd_re_cache_process_headers_list(struct rspamd_task *task, + struct rspamd_re_runtime *rt, + rspamd_regexp_t *re, + struct rspamd_re_class *re_class, + struct rspamd_mime_header *rh, + gboolean is_strong, + gboolean *processed_hyperscan) +{ + const guchar **scvec, *in; + gboolean raw = FALSE; + guint *lenvec; + struct rspamd_mime_header *cur; + guint cnt = 0, i = 0, ret = 0; + + DL_COUNT(rh, cur, cnt); + + scvec = g_malloc(sizeof(*scvec) * cnt); + lenvec = g_malloc(sizeof(*lenvec) * cnt); + + DL_FOREACH(rh, cur) + { + + if (is_strong && 
strcmp(cur->name, re_class->type_data) != 0) { + /* Skip a different case */ + continue; + } + + if (re_class->type == RSPAMD_RE_RAWHEADER) { + in = (const guchar *) cur->value; + lenvec[i] = strlen(cur->value); + + if (rspamd_fast_utf8_validate(in, lenvec[i]) != 0) { + raw = TRUE; + } + } + else { + in = (const guchar *) cur->decoded; + /* Validate input^W^WNo need to validate as it is already valid */ + if (!in) { + lenvec[i] = 0; + scvec[i] = (guchar *) ""; + continue; + } + + lenvec[i] = strlen(in); + } + + scvec[i] = in; + + i++; + } + + if (i > 0) { + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, i, raw, processed_hyperscan); + msg_debug_re_task("checking header %s regexp: %s=%*s -> %d", + re_class->type_data, + rspamd_regexp_get_pattern(re), + (int) lenvec[0], scvec[0], ret); + } + + g_free(scvec); + g_free(lenvec); + + return ret; +} + +/* + * Calculates the specified regexp for the specified class if it's not calculated + */ +static guint +rspamd_re_cache_exec_re(struct rspamd_task *task, + struct rspamd_re_runtime *rt, + rspamd_regexp_t *re, + struct rspamd_re_class *re_class, + gboolean is_strong) +{ + guint ret = 0, i, re_id; + struct rspamd_mime_header *rh; + const gchar *in; + const guchar **scvec = NULL; + guint *lenvec = NULL; + gboolean raw = FALSE, processed_hyperscan = FALSE; + struct rspamd_mime_text_part *text_part; + struct rspamd_mime_part *mime_part; + struct rspamd_url *url; + guint len = 0, cnt = 0; + const gchar *class_name; + + class_name = rspamd_re_cache_type_to_string(re_class->type); + msg_debug_re_task("start check re type: %s: /%s/", + class_name, + rspamd_regexp_get_pattern(re)); + re_id = rspamd_regexp_get_cache_id(re); + + switch (re_class->type) { + case RSPAMD_RE_HEADER: + case RSPAMD_RE_RAWHEADER: + /* Get list of specified headers */ + rh = rspamd_message_get_header_array(task, + re_class->type_data, FALSE); + + if (rh) { + ret = rspamd_re_cache_process_headers_list(task, rt, re, + re_class, rh, 
is_strong, &processed_hyperscan); + msg_debug_re_task("checked header(%s) regexp: %s -> %d", + (const char *) re_class->type_data, + rspamd_regexp_get_pattern(re), + ret); + } + break; + case RSPAMD_RE_ALLHEADER: + raw = TRUE; + in = MESSAGE_FIELD(task, raw_headers_content).begin; + len = MESSAGE_FIELD(task, raw_headers_content).len; + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, (const guchar **) &in, &len, 1, raw, &processed_hyperscan); + msg_debug_re_task("checked allheader regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + break; + case RSPAMD_RE_MIMEHEADER: + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, mime_part) + { + if (mime_part->parent_part == NULL || + !IS_PART_MULTIPART(mime_part->parent_part) || + IS_PART_MESSAGE(mime_part)) { + /* We filter parts that have no multipart parent or are a messages here */ + continue; + } + rh = rspamd_message_get_header_from_hash(mime_part->raw_headers, + re_class->type_data, FALSE); + + if (rh) { + ret += rspamd_re_cache_process_headers_list(task, rt, re, + re_class, rh, is_strong, &processed_hyperscan); + } + msg_debug_re_task("checked mime header(%s) regexp: %s -> %d", + (const char *) re_class->type_data, + rspamd_regexp_get_pattern(re), + ret); + } + break; + case RSPAMD_RE_MIME: + case RSPAMD_RE_RAWMIME: + /* Iterate through text parts */ + if (MESSAGE_FIELD(task, text_parts)->len > 0) { + cnt = MESSAGE_FIELD(task, text_parts)->len; + scvec = g_malloc(sizeof(*scvec) * cnt); + lenvec = g_malloc(sizeof(*lenvec) * cnt); + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) + { + /* Select data for regexp */ + if (re_class->type == RSPAMD_RE_RAWMIME) { + if (text_part->raw.len == 0) { + len = 0; + in = ""; + } + else { + in = text_part->raw.begin; + len = text_part->raw.len; + } + + raw = TRUE; + } + else { + /* Skip empty parts */ + if (IS_TEXT_PART_EMPTY(text_part)) { + len = 0; + in = ""; + } + else { + /* Check raw flags */ + if (!IS_TEXT_PART_UTF(text_part)) { + raw = TRUE; 
+ } + + in = text_part->utf_content.begin; + len = text_part->utf_content.len; + } + } + + scvec[i] = (guchar *) in; + lenvec[i] = len; + } + + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, cnt, raw, &processed_hyperscan); + msg_debug_re_task("checked mime regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + g_free(scvec); + g_free(lenvec); + } + break; + case RSPAMD_RE_URL: + cnt = kh_size(MESSAGE_FIELD(task, urls)); + + if (cnt > 0) { + scvec = g_malloc(sizeof(*scvec) * cnt); + lenvec = g_malloc(sizeof(*lenvec) * cnt); + i = 0; + raw = FALSE; + + kh_foreach_key(MESSAGE_FIELD(task, urls), url, { + if ((url->protocol & PROTOCOL_MAILTO)) { + continue; + } + in = url->string; + len = url->urllen; + + if (len > 0 && !(url->flags & RSPAMD_URL_FLAG_IMAGE)) { + scvec[i] = (guchar *) in; + lenvec[i++] = len; + } + }); + + /* URL regexps do not include emails, that's why the code below is commented */ +#if 0 + g_hash_table_iter_init (&it, MESSAGE_FIELD (task, emails)); + + while (g_hash_table_iter_next (&it, &k, &v)) { + url = v; + in = url->string; + len = url->urllen; + + if (len > 0 && !(url->flags & RSPAMD_URL_FLAG_IMAGE)) { + scvec[i] = (guchar *) in; + lenvec[i++] = len; + } + } +#endif + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, i, raw, &processed_hyperscan); + msg_debug_re_task("checked url regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + g_free(scvec); + g_free(lenvec); + } + break; + case RSPAMD_RE_EMAIL: + cnt = kh_size(MESSAGE_FIELD(task, urls)); + + if (cnt > 0) { + scvec = g_malloc(sizeof(*scvec) * cnt); + lenvec = g_malloc(sizeof(*lenvec) * cnt); + i = 0; + raw = FALSE; + + kh_foreach_key(MESSAGE_FIELD(task, urls), url, { + if (!(url->protocol & PROTOCOL_MAILTO)) { + continue; + } + if (url->userlen == 0 || url->hostlen == 0) { + continue; + } + + in = rspamd_url_user_unsafe(url); + len = url->userlen + 1 + url->hostlen; + scvec[i] = (guchar *) in; + lenvec[i++] = len; + }); + + ret 
= rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, i, raw, &processed_hyperscan); + msg_debug_re_task("checked email regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + g_free(scvec); + g_free(lenvec); + } + break; + case RSPAMD_RE_BODY: + raw = TRUE; + in = task->msg.begin; + len = task->msg.len; + + ret = rspamd_re_cache_process_regexp_data(rt, re, task, + (const guchar **) &in, &len, 1, raw, &processed_hyperscan); + msg_debug_re_task("checked rawbody regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + break; + case RSPAMD_RE_SABODY: + /* According to SA docs: + * The 'body' in this case is the textual parts of the message body; + * any non-text MIME parts are stripped, and the message decoded from + * Quoted-Printable or Base-64-encoded format if necessary. The message + * Subject header is considered part of the body and becomes the first + * paragraph when running the rules. All HTML tags and line breaks will + * be removed before matching. + */ + cnt = MESSAGE_FIELD(task, text_parts)->len + 1; + scvec = g_malloc(sizeof(*scvec) * cnt); + lenvec = g_malloc(sizeof(*lenvec) * cnt); + + /* + * Body rules also include the Subject as the first line + * of the body content. 
+ */ + + rh = rspamd_message_get_header_array(task, "Subject", FALSE); + + if (rh) { + scvec[0] = (guchar *) rh->decoded; + lenvec[0] = strlen(rh->decoded); + } + else { + scvec[0] = (guchar *) ""; + lenvec[0] = 0; + } + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) + { + if (text_part->utf_stripped_content) { + scvec[i + 1] = (guchar *) text_part->utf_stripped_content->data; + lenvec[i + 1] = text_part->utf_stripped_content->len; + + if (!IS_TEXT_PART_UTF(text_part)) { + raw = TRUE; + } + } + else { + scvec[i + 1] = (guchar *) ""; + lenvec[i + 1] = 0; + } + } + + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, cnt, raw, &processed_hyperscan); + msg_debug_re_task("checked sa body regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + g_free(scvec); + g_free(lenvec); + break; + case RSPAMD_RE_SARAWBODY: + /* According to SA docs: + * The 'raw body' of a message is the raw data inside all textual + * parts. The text will be decoded from base64 or quoted-printable + * encoding, but HTML tags and line breaks will still be present. + * Multiline expressions will need to be used to match strings that are + * broken by line breaks. 
+ */ + if (MESSAGE_FIELD(task, text_parts)->len > 0) { + cnt = MESSAGE_FIELD(task, text_parts)->len; + scvec = g_malloc(sizeof(*scvec) * cnt); + lenvec = g_malloc(sizeof(*lenvec) * cnt); + + for (i = 0; i < cnt; i++) { + text_part = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), i); + + if (text_part->parsed.len > 0) { + scvec[i] = (guchar *) text_part->parsed.begin; + lenvec[i] = text_part->parsed.len; + + if (!IS_TEXT_PART_UTF(text_part)) { + raw = TRUE; + } + } + else { + scvec[i] = (guchar *) ""; + lenvec[i] = 0; + } + } + + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, cnt, raw, &processed_hyperscan); + msg_debug_re_task("checked sa rawbody regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + g_free(scvec); + g_free(lenvec); + } + break; + case RSPAMD_RE_WORDS: + case RSPAMD_RE_STEMWORDS: + case RSPAMD_RE_RAWWORDS: + if (MESSAGE_FIELD(task, text_parts)->len > 0) { + cnt = 0; + raw = FALSE; + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) + { + if (text_part->utf_words) { + cnt += text_part->utf_words->len; + } + } + + if (task->meta_words && task->meta_words->len > 0) { + cnt += task->meta_words->len; + } + + if (cnt > 0) { + scvec = g_malloc(sizeof(*scvec) * cnt); + lenvec = g_malloc(sizeof(*lenvec) * cnt); + + cnt = 0; + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) + { + if (text_part->utf_words) { + cnt = rspamd_process_words_vector(text_part->utf_words, + scvec, lenvec, re_class, cnt, &raw); + } + } + + if (task->meta_words) { + cnt = rspamd_process_words_vector(task->meta_words, + scvec, lenvec, re_class, cnt, &raw); + } + + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, cnt, raw, &processed_hyperscan); + + msg_debug_re_task("checked sa words regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + g_free(scvec); + g_free(lenvec); + } + } + break; + case RSPAMD_RE_SELECTOR: + if (rspamd_re_cache_process_selector(task, rt, + re_class->type_data, + 
(guchar ***) &scvec, + &lenvec, &cnt)) { + + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, cnt, raw, &processed_hyperscan); + msg_debug_re_task("checked selector(%s) regexp: %s -> %d", + re_class->type_data, + rspamd_regexp_get_pattern(re), ret); + + /* Do not free vectors as they are managed by rt->sel_cache */ + } + break; + case RSPAMD_RE_MAX: + msg_err_task("regexp of class invalid has been called: %s", + rspamd_regexp_get_pattern(re)); + break; + } + +#if WITH_HYPERSCAN + if (processed_hyperscan) { + rspamd_re_cache_finish_class(task, rt, re_class, class_name); + } +#endif + + setbit(rt->checked, re_id); + + return rt->results[re_id]; +} + +gint rspamd_re_cache_process(struct rspamd_task *task, + rspamd_regexp_t *re, + enum rspamd_re_type type, + gconstpointer type_data, + gsize datalen, + gboolean is_strong) +{ + guint64 re_id; + struct rspamd_re_class *re_class; + struct rspamd_re_cache *cache; + struct rspamd_re_runtime *rt; + + g_assert(task != NULL); + rt = task->re_rt; + g_assert(rt != NULL); + g_assert(re != NULL); + + cache = rt->cache; + re_id = rspamd_regexp_get_cache_id(re); + + if (re_id == RSPAMD_INVALID_ID || re_id > cache->nre) { + msg_err_task("re '%s' has no valid id for the cache", + rspamd_regexp_get_pattern(re)); + return 0; + } + + if (isset(rt->checked, re_id)) { + /* Fast path */ + rt->stat.regexp_fast_cached++; + return rt->results[re_id]; + } + else { + /* Slow path */ + re_class = rspamd_regexp_get_class(re); + + if (re_class == NULL) { + msg_err_task("cannot find re class for regexp '%s'", + rspamd_regexp_get_pattern(re)); + return 0; + } + + return rspamd_re_cache_exec_re(task, rt, re, re_class, + is_strong); + } + + return 0; +} + +int rspamd_re_cache_process_ffi(void *ptask, + void *pre, + int type, + void *type_data, + int is_strong) +{ + struct rspamd_lua_regexp **lua_re = pre; + struct rspamd_task **real_task = ptask; + gsize typelen = 0; + + if (type_data) { + typelen = strlen(type_data); + } + + 
return rspamd_re_cache_process(*real_task, (*lua_re)->re, + type, type_data, typelen, is_strong); +} + +void rspamd_re_cache_runtime_destroy(struct rspamd_re_runtime *rt) +{ + g_assert(rt != NULL); + + if (rt->sel_cache) { + struct rspamd_re_selector_result sr; + + kh_foreach_value(rt->sel_cache, sr, { + for (guint i = 0; i < sr.cnt; i++) { + g_free((gpointer) sr.scvec[i]); + } + + g_free(sr.scvec); + g_free(sr.lenvec); + }); + kh_destroy(selectors_results_hash, rt->sel_cache); + } + + REF_RELEASE(rt->cache); + g_free(rt); +} + +void rspamd_re_cache_unref(struct rspamd_re_cache *cache) +{ + if (cache) { + REF_RELEASE(cache); + } +} + +struct rspamd_re_cache * +rspamd_re_cache_ref(struct rspamd_re_cache *cache) +{ + if (cache) { + REF_RETAIN(cache); + } + + return cache; +} + +guint rspamd_re_cache_set_limit(struct rspamd_re_cache *cache, guint limit) +{ + guint old; + + g_assert(cache != NULL); + + old = cache->max_re_data; + cache->max_re_data = limit; + + return old; +} + +const gchar * +rspamd_re_cache_type_to_string(enum rspamd_re_type type) +{ + const gchar *ret = "unknown"; + + switch (type) { + case RSPAMD_RE_HEADER: + ret = "header"; + break; + case RSPAMD_RE_RAWHEADER: + ret = "raw header"; + break; + case RSPAMD_RE_MIMEHEADER: + ret = "mime header"; + break; + case RSPAMD_RE_ALLHEADER: + ret = "all headers"; + break; + case RSPAMD_RE_MIME: + ret = "part"; + break; + case RSPAMD_RE_RAWMIME: + ret = "raw part"; + break; + case RSPAMD_RE_BODY: + ret = "rawbody"; + break; + case RSPAMD_RE_URL: + ret = "url"; + break; + case RSPAMD_RE_EMAIL: + ret = "email"; + break; + case RSPAMD_RE_SABODY: + ret = "sa body"; + break; + case RSPAMD_RE_SARAWBODY: + ret = "sa raw body"; + break; + case RSPAMD_RE_SELECTOR: + ret = "selector"; + break; + case RSPAMD_RE_WORDS: + ret = "words"; + break; + case RSPAMD_RE_RAWWORDS: + ret = "raw_words"; + break; + case RSPAMD_RE_STEMWORDS: + ret = "stem_words"; + break; + case RSPAMD_RE_MAX: + default: + ret = "invalid class"; + 
break; + } + + return ret; +} + +enum rspamd_re_type +rspamd_re_cache_type_from_string(const char *str) +{ + enum rspamd_re_type ret; + guint64 h; + + /* + * To optimize this function, we apply hash to input string and + * pre-select it from the values + */ + + if (str != NULL) { + h = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64, + str, strlen(str), 0xdeadbabe); + + switch (h) { + case G_GUINT64_CONSTANT(0x298b9c8a58887d44): /* header */ + ret = RSPAMD_RE_HEADER; + break; + case G_GUINT64_CONSTANT(0x467bfb5cd7ddf890): /* rawheader */ + ret = RSPAMD_RE_RAWHEADER; + break; + case G_GUINT64_CONSTANT(0xda081341fb600389): /* mime */ + ret = RSPAMD_RE_MIME; + break; + case G_GUINT64_CONSTANT(0xc35831e067a8221d): /* rawmime */ + ret = RSPAMD_RE_RAWMIME; + break; + case G_GUINT64_CONSTANT(0xc625e13dbe636de2): /* body */ + case G_GUINT64_CONSTANT(0xCCDEBA43518F721C): /* message */ + ret = RSPAMD_RE_BODY; + break; + case G_GUINT64_CONSTANT(0x286edbe164c791d2): /* url */ + case G_GUINT64_CONSTANT(0x7D9ACDF6685661A1): /* uri */ + ret = RSPAMD_RE_URL; + break; + case G_GUINT64_CONSTANT(0x7e232b0f60b571be): /* email */ + ret = RSPAMD_RE_EMAIL; + break; + case G_GUINT64_CONSTANT(0x796d62205a8778c7): /* allheader */ + ret = RSPAMD_RE_ALLHEADER; + break; + case G_GUINT64_CONSTANT(0xa3c6c153b3b00a5e): /* mimeheader */ + ret = RSPAMD_RE_MIMEHEADER; + break; + case G_GUINT64_CONSTANT(0x7794501506e604e9): /* sabody */ + ret = RSPAMD_RE_SABODY; + break; + case G_GUINT64_CONSTANT(0x28828962E7D2A05F): /* sarawbody */ + ret = RSPAMD_RE_SARAWBODY; + break; + default: + ret = RSPAMD_RE_MAX; + break; + } + } + else { + ret = RSPAMD_RE_MAX; + } + + return ret; +} + +#ifdef WITH_HYPERSCAN +static gchar * +rspamd_re_cache_hs_pattern_from_pcre(rspamd_regexp_t *re) +{ + /* + * Workaround for bug in ragel 7.0.0.11 + * https://github.com/intel/hyperscan/issues/133 + */ + const gchar *pat = rspamd_regexp_get_pattern(re); + guint flags = rspamd_regexp_get_flags(re), esc_flags = 
RSPAMD_REGEXP_ESCAPE_RE; + gchar *escaped; + gsize esc_len; + + if (flags & RSPAMD_REGEXP_FLAG_UTF) { + esc_flags |= RSPAMD_REGEXP_ESCAPE_UTF; + } + + escaped = rspamd_str_regexp_escape(pat, strlen(pat), &esc_len, esc_flags); + + return escaped; +} + +static gboolean +rspamd_re_cache_is_finite(struct rspamd_re_cache *cache, + rspamd_regexp_t *re, gint flags, gdouble max_time) +{ + pid_t cld; + gint status; + struct timespec ts; + hs_compile_error_t *hs_errors; + hs_database_t *test_db; + gdouble wait_time; + const gint max_tries = 10; + gint tries = 0, rc; + void (*old_hdl)(int); + + wait_time = max_time / max_tries; + /* We need to restore SIGCHLD processing */ + old_hdl = signal(SIGCHLD, SIG_DFL); + cld = fork(); + + if (cld == 0) { + /* Try to compile pattern */ + + gchar *pat = rspamd_re_cache_hs_pattern_from_pcre(re); + + if (hs_compile(pat, + flags | HS_FLAG_PREFILTER, + HS_MODE_BLOCK, + &cache->plt, + &test_db, + &hs_errors) != HS_SUCCESS) { + + msg_info_re_cache("cannot compile (prefilter mode) '%s' to hyperscan: '%s'", + pat, + hs_errors != NULL ? 
hs_errors->message : "unknown error"); + + hs_free_compile_error(hs_errors); + g_free(pat); + + exit(EXIT_FAILURE); + } + + g_free(pat); + exit(EXIT_SUCCESS); + } + else if (cld > 0) { + double_to_ts(wait_time, &ts); + + while ((rc = waitpid(cld, &status, WNOHANG)) == 0 && tries++ < max_tries) { + (void) nanosleep(&ts, NULL); + } + + /* Child has been terminated */ + if (rc > 0) { + /* Forget about SIGCHLD after this point */ + signal(SIGCHLD, old_hdl); + + if (WIFEXITED(status) && WEXITSTATUS(status) == EXIT_SUCCESS) { + return TRUE; + } + else { + msg_err_re_cache( + "cannot approximate %s to hyperscan", + rspamd_regexp_get_pattern(re)); + + return FALSE; + } + } + else { + /* We consider that as timeout */ + kill(cld, SIGKILL); + g_assert(waitpid(cld, &status, 0) != -1); + msg_err_re_cache( + "cannot approximate %s to hyperscan: timeout waiting", + rspamd_regexp_get_pattern(re)); + signal(SIGCHLD, old_hdl); + } + } + else { + msg_err_re_cache( + "cannot approximate %s to hyperscan: fork failed: %s", + rspamd_regexp_get_pattern(re), strerror(errno)); + signal(SIGCHLD, old_hdl); + } + + return FALSE; +} +#endif + +#ifdef WITH_HYPERSCAN +struct rspamd_re_cache_hs_compile_cbdata { + GHashTableIter it; + struct rspamd_re_cache *cache; + const char *cache_dir; + gdouble max_time; + gboolean silent; + guint total; + void (*cb)(guint ncompiled, GError *err, void *cbd); + void *cbd; +}; + +static void +rspamd_re_cache_compile_err(EV_P_ ev_timer *w, GError *err, + struct rspamd_re_cache_hs_compile_cbdata *cbdata, bool is_fatal) +{ + cbdata->cb(cbdata->total, err, cbdata->cbd); + + if (is_fatal) { + ev_timer_stop(EV_A_ w); + g_free(w); + g_free(cbdata); + } + else { + /* Continue compilation */ + ev_timer_again(EV_A_ w); + } + g_error_free(err); +} + +static void +rspamd_re_cache_compile_timer_cb(EV_P_ ev_timer *w, int revents) +{ + struct rspamd_re_cache_hs_compile_cbdata *cbdata = + (struct rspamd_re_cache_hs_compile_cbdata *) w->data; + GHashTableIter cit; + gpointer k, 
v; + struct rspamd_re_class *re_class; + gchar path[PATH_MAX], npath[PATH_MAX]; + hs_database_t *test_db; + gint fd, i, n, *hs_ids = NULL, pcre_flags, re_flags; + rspamd_cryptobox_fast_hash_state_t crc_st; + guint64 crc; + rspamd_regexp_t *re; + hs_compile_error_t *hs_errors = NULL; + guint *hs_flags = NULL; + const hs_expr_ext_t **hs_exts = NULL; + gchar **hs_pats = NULL; + gchar *hs_serialized = NULL; + gsize serialized_len; + struct iovec iov[7]; + struct rspamd_re_cache *cache; + GError *err; + pid_t our_pid = getpid(); + + cache = cbdata->cache; + + if (!g_hash_table_iter_next(&cbdata->it, &k, &v)) { + /* All done */ + ev_timer_stop(EV_A_ w); + cbdata->cb(cbdata->total, NULL, cbdata->cbd); + g_free(w); + g_free(cbdata); + + return; + } + + re_class = v; + rspamd_snprintf(path, sizeof(path), "%s%c%s.hs", cbdata->cache_dir, + G_DIR_SEPARATOR, re_class->hash); + + if (rspamd_re_cache_is_valid_hyperscan_file(cache, path, TRUE, TRUE, NULL)) { + + fd = open(path, O_RDONLY, 00600); + + /* Read number of regexps */ + g_assert(fd != -1); + g_assert(lseek(fd, RSPAMD_HS_MAGIC_LEN + sizeof(cache->plt), SEEK_SET) != -1); + g_assert(read(fd, &n, sizeof(n)) == sizeof(n)); + close(fd); + + if (re_class->type_len > 0) { + if (!cbdata->silent) { + msg_info_re_cache( + "skip already valid class %s(%*s) to cache %6s, %d regexps", + rspamd_re_cache_type_to_string(re_class->type), + (gint) re_class->type_len - 1, + re_class->type_data, + re_class->hash, + n); + } + } + else { + if (!cbdata->silent) { + msg_info_re_cache( + "skip already valid class %s to cache %6s, %d regexps", + rspamd_re_cache_type_to_string(re_class->type), + re_class->hash, + n); + } + } + + ev_timer_again(EV_A_ w); + return; + } + + rspamd_snprintf(path, sizeof(path), "%s%c%s%P-XXXXXXXXXX", cbdata->cache_dir, + G_DIR_SEPARATOR, re_class->hash, our_pid); + fd = g_mkstemp_full(path, O_CREAT | O_TRUNC | O_EXCL | O_WRONLY, 00600); + + if (fd == -1) { + err = g_error_new(rspamd_re_cache_quark(), errno, + "cannot 
open file %s: %s", path, strerror(errno)); + rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false); + return; + } + + g_hash_table_iter_init(&cit, re_class->re); + n = g_hash_table_size(re_class->re); + hs_flags = g_new0(guint, n); + hs_ids = g_new0(guint, n); + hs_pats = g_new0(char *, n); + hs_exts = g_new0(const hs_expr_ext_t *, n); + i = 0; + + while (g_hash_table_iter_next(&cit, &k, &v)) { + re = v; + + pcre_flags = rspamd_regexp_get_pcre_flags(re); + re_flags = rspamd_regexp_get_flags(re); + + if (re_flags & RSPAMD_REGEXP_FLAG_PCRE_ONLY) { + /* Do not try to compile bad regexp */ + msg_info_re_cache( + "do not try compile %s to hyperscan as it is PCRE only", + rspamd_regexp_get_pattern(re)); + continue; + } + + hs_flags[i] = 0; + hs_exts[i] = NULL; +#ifndef WITH_PCRE2 + if (pcre_flags & PCRE_FLAG(UTF8)) { + hs_flags[i] |= HS_FLAG_UTF8; + } +#else + if (pcre_flags & PCRE_FLAG(UTF)) { + hs_flags[i] |= HS_FLAG_UTF8; + } +#endif + if (pcre_flags & PCRE_FLAG(CASELESS)) { + hs_flags[i] |= HS_FLAG_CASELESS; + } + if (pcre_flags & PCRE_FLAG(MULTILINE)) { + hs_flags[i] |= HS_FLAG_MULTILINE; + } + if (pcre_flags & PCRE_FLAG(DOTALL)) { + hs_flags[i] |= HS_FLAG_DOTALL; + } + + + if (re_flags & RSPAMD_REGEXP_FLAG_LEFTMOST) { + hs_flags[i] |= HS_FLAG_SOM_LEFTMOST; + } + else if (rspamd_regexp_get_maxhits(re) == 1) { + hs_flags[i] |= HS_FLAG_SINGLEMATCH; + } + + gchar *pat = rspamd_re_cache_hs_pattern_from_pcre(re); + + if (hs_compile(pat, + hs_flags[i], + HS_MODE_BLOCK, + &cache->plt, + &test_db, + &hs_errors) != HS_SUCCESS) { + msg_info_re_cache("cannot compile '%s' to hyperscan: '%s', try prefilter match", + pat, + hs_errors != NULL ? 
hs_errors->message : "unknown error"); + hs_free_compile_error(hs_errors); + + /* The approximation operation might take a significant + * amount of time, so we need to check if it's finite + */ + if (rspamd_re_cache_is_finite(cache, re, hs_flags[i], cbdata->max_time)) { + hs_flags[i] |= HS_FLAG_PREFILTER; + hs_ids[i] = rspamd_regexp_get_cache_id(re); + hs_pats[i] = pat; + i++; + } + else { + g_free(pat); /* Avoid leak */ + } + } + else { + hs_ids[i] = rspamd_regexp_get_cache_id(re); + hs_pats[i] = pat; + i++; + hs_free_database(test_db); + } + } + /* Adjust real re number */ + n = i; + +#define CLEANUP_ALLOCATED(is_err) \ + do { \ + g_free(hs_flags); \ + g_free(hs_ids); \ + for (guint j = 0; j < i; j++) { \ + g_free(hs_pats[j]); \ + } \ + g_free(hs_pats); \ + g_free(hs_exts); \ + if (is_err) { \ + close(fd); \ + unlink(path); \ + if (hs_errors) hs_free_compile_error(hs_errors); \ + } \ + } while (0) + + if (n > 0) { + /* Create the hs tree */ + hs_errors = NULL; + if (hs_compile_ext_multi((const char **) hs_pats, + hs_flags, + hs_ids, + hs_exts, + n, + HS_MODE_BLOCK, + &cache->plt, + &test_db, + &hs_errors) != HS_SUCCESS) { + + err = g_error_new(rspamd_re_cache_quark(), EINVAL, + "cannot create tree of regexp when processing '%s': %s", + hs_pats[hs_errors->expression], hs_errors->message); + CLEANUP_ALLOCATED(true); + rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false); + + return; + } + + if (hs_serialize_database(test_db, &hs_serialized, + &serialized_len) != HS_SUCCESS) { + err = g_error_new(rspamd_re_cache_quark(), + errno, + "cannot serialize tree of regexp for %s", + re_class->hash); + + CLEANUP_ALLOCATED(true); + hs_free_database(test_db); + rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false); + return; + } + + hs_free_database(test_db); + + /* + * Magic - 8 bytes + * Platform - sizeof (platform) + * n - number of regexps + * n * <regexp ids> + * n * <regexp flags> + * crc - 8 bytes checksum + * <hyperscan blob> + */ + 
rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe); + /* IDs -> Flags -> Hs blob */ + rspamd_cryptobox_fast_hash_update(&crc_st, + hs_ids, sizeof(*hs_ids) * n); + rspamd_cryptobox_fast_hash_update(&crc_st, + hs_flags, sizeof(*hs_flags) * n); + rspamd_cryptobox_fast_hash_update(&crc_st, + hs_serialized, serialized_len); + crc = rspamd_cryptobox_fast_hash_final(&crc_st); + + + iov[0].iov_base = (void *) rspamd_hs_magic; + iov[0].iov_len = RSPAMD_HS_MAGIC_LEN; + iov[1].iov_base = &cache->plt; + iov[1].iov_len = sizeof(cache->plt); + iov[2].iov_base = &n; + iov[2].iov_len = sizeof(n); + iov[3].iov_base = hs_ids; + iov[3].iov_len = sizeof(*hs_ids) * n; + iov[4].iov_base = hs_flags; + iov[4].iov_len = sizeof(*hs_flags) * n; + iov[5].iov_base = &crc; + iov[5].iov_len = sizeof(crc); + iov[6].iov_base = hs_serialized; + iov[6].iov_len = serialized_len; + + if (writev(fd, iov, G_N_ELEMENTS(iov)) == -1) { + err = g_error_new(rspamd_re_cache_quark(), + errno, + "cannot serialize tree of regexp to %s: %s", + path, strerror(errno)); + + CLEANUP_ALLOCATED(true); + g_free(hs_serialized); + + rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false); + return; + } + + if (re_class->type_len > 0) { + msg_info_re_cache( + "compiled class %s(%*s) to cache %6s, %d/%d regexps", + rspamd_re_cache_type_to_string(re_class->type), + (gint) re_class->type_len - 1, + re_class->type_data, + re_class->hash, + n, + (gint) g_hash_table_size(re_class->re)); + } + else { + msg_info_re_cache( + "compiled class %s to cache %6s, %d/%d regexps", + rspamd_re_cache_type_to_string(re_class->type), + re_class->hash, + n, + (gint) g_hash_table_size(re_class->re)); + } + + cbdata->total += n; + CLEANUP_ALLOCATED(false); + + /* Now rename temporary file to the new .hs file */ + rspamd_snprintf(npath, sizeof(npath), "%s%c%s.hs", cbdata->cache_dir, + G_DIR_SEPARATOR, re_class->hash); + + if (rename(path, npath) == -1) { + err = g_error_new(rspamd_re_cache_quark(), + errno, + "cannot rename %s to %s: %s", + 
path, npath, strerror(errno)); + unlink(path); + close(fd); + + rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false); + return; + } + + close(fd); + } + else { + err = g_error_new(rspamd_re_cache_quark(), + errno, + "no suitable regular expressions %s (%d original): " + "remove temporary file %s", + rspamd_re_cache_type_to_string(re_class->type), + (gint) g_hash_table_size(re_class->re), + path); + + CLEANUP_ALLOCATED(true); + rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false); + + return; + } + + /* Continue process */ + ev_timer_again(EV_A_ w); +} + +#endif + +gint rspamd_re_cache_compile_hyperscan(struct rspamd_re_cache *cache, + const char *cache_dir, + gdouble max_time, + gboolean silent, + struct ev_loop *event_loop, + void (*cb)(guint ncompiled, GError *err, void *cbd), + void *cbd) +{ + g_assert(cache != NULL); + g_assert(cache_dir != NULL); + +#ifndef WITH_HYPERSCAN + return -1; +#else + static ev_timer *timer; + static const ev_tstamp timer_interval = 0.1; + struct rspamd_re_cache_hs_compile_cbdata *cbdata; + + cbdata = g_malloc0(sizeof(*cbdata)); + g_hash_table_iter_init(&cbdata->it, cache->re_classes); + cbdata->cache = cache; + cbdata->cache_dir = cache_dir; + cbdata->cb = cb; + cbdata->cbd = cbd; + cbdata->max_time = max_time; + cbdata->silent = silent; + cbdata->total = 0; + timer = g_malloc0(sizeof(*timer)); + timer->data = (void *) cbdata; /* static */ + + ev_timer_init(timer, rspamd_re_cache_compile_timer_cb, + timer_interval, timer_interval); + ev_timer_start(event_loop, timer); + + return 0; +#endif +} + +gboolean +rspamd_re_cache_is_valid_hyperscan_file(struct rspamd_re_cache *cache, + const char *path, gboolean silent, gboolean try_load, GError **err) +{ + g_assert(cache != NULL); + g_assert(path != NULL); + +#ifndef WITH_HYPERSCAN + return FALSE; +#else + gint fd, n, ret; + guchar magicbuf[RSPAMD_HS_MAGIC_LEN]; + const guchar *mb; + GHashTableIter it; + gpointer k, v; + struct rspamd_re_class *re_class; + gsize len; + const gchar 
*hash_pos; + hs_platform_info_t test_plt; + hs_database_t *test_db = NULL; + guchar *map, *p, *end; + rspamd_cryptobox_fast_hash_state_t crc_st; + guint64 crc, valid_crc; + + len = strlen(path); + + if (len < sizeof(rspamd_cryptobox_HASHBYTES + 3)) { + if (!silent) { + msg_err_re_cache("cannot open hyperscan cache file %s: too short filename", + path); + } + g_set_error(err, rspamd_re_cache_quark(), 0, + "too short filename"); + + return FALSE; + } + + if (memcmp(path + len - 3, ".hs", 3) != 0) { + if (!silent) { + msg_err_re_cache("cannot open hyperscan cache file %s: not ending with .hs", + path); + } + g_set_error(err, rspamd_re_cache_quark(), 0, + "not ending with .hs"); + return FALSE; + } + + hash_pos = path + len - 3 - (sizeof(re_class->hash) - 1); + g_hash_table_iter_init(&it, cache->re_classes); + + while (g_hash_table_iter_next(&it, &k, &v)) { + re_class = v; + + if (memcmp(hash_pos, re_class->hash, sizeof(re_class->hash) - 1) == 0) { + /* Open file and check magic */ + gssize r; + + fd = open(path, O_RDONLY); + + if (fd == -1) { + if (errno != ENOENT || !silent) { + msg_err_re_cache("cannot open hyperscan cache file %s: %s", + path, strerror(errno)); + } + g_set_error(err, rspamd_re_cache_quark(), 0, + "%s", + strerror(errno)); + return FALSE; + } + + if ((r = read(fd, magicbuf, sizeof(magicbuf))) != sizeof(magicbuf)) { + if (r == -1) { + msg_err_re_cache("cannot read magic from hyperscan " + "cache file %s: %s", + path, strerror(errno)); + g_set_error(err, rspamd_re_cache_quark(), 0, + "cannot read magic: %s", + strerror(errno)); + } + else { + msg_err_re_cache("truncated read magic from hyperscan " + "cache file %s: %z, %z wanted", + path, r, (gsize) sizeof(magicbuf)); + g_set_error(err, rspamd_re_cache_quark(), 0, + "truncated read magic %zd, %zd wanted", + r, (gsize) sizeof(magicbuf)); + } + + close(fd); + return FALSE; + } + + mb = rspamd_hs_magic; + + if (memcmp(magicbuf, mb, sizeof(magicbuf)) != 0) { + msg_err_re_cache("cannot open hyperscan cache 
file %s: " + "bad magic ('%*xs', '%*xs' expected)", + path, (int) RSPAMD_HS_MAGIC_LEN, magicbuf, + (int) RSPAMD_HS_MAGIC_LEN, mb); + + close(fd); + g_set_error(err, rspamd_re_cache_quark(), 0, "invalid magic"); + return FALSE; + } + + if ((r = read(fd, &test_plt, sizeof(test_plt))) != sizeof(test_plt)) { + if (r == -1) { + msg_err_re_cache("cannot read platform data from hyperscan " + "cache file %s: %s", + path, strerror(errno)); + } + else { + msg_err_re_cache("truncated read platform data from hyperscan " + "cache file %s: %z, %z wanted", + path, r, (gsize) sizeof(magicbuf)); + } + + g_set_error(err, rspamd_re_cache_quark(), 0, + "cannot read platform data: %s", strerror(errno)); + + close(fd); + return FALSE; + } + + if (test_plt.cpu_features != cache->plt.cpu_features) { + msg_err_re_cache("cannot open hyperscan cache file %s: " + "compiled for a different platform", + path); + g_set_error(err, rspamd_re_cache_quark(), 0, + "compiled for a different platform"); + + close(fd); + return FALSE; + } + + close(fd); + + if (try_load) { + map = rspamd_file_xmap(path, PROT_READ, &len, TRUE); + + if (map == NULL) { + msg_err_re_cache("cannot mmap hyperscan cache file %s: " + "%s", + path, strerror(errno)); + g_set_error(err, rspamd_re_cache_quark(), 0, + "mmap error: %s", strerror(errno)); + return FALSE; + } + + p = map + RSPAMD_HS_MAGIC_LEN + sizeof(test_plt); + end = map + len; + memcpy(&n, p, sizeof(n)); + p += sizeof(gint); + + if (n <= 0 || 2 * n * sizeof(gint) + /* IDs + flags */ + sizeof(guint64) + /* crc */ + RSPAMD_HS_MAGIC_LEN + /* header */ + sizeof(cache->plt) > + len) { + /* Some wrong amount of regexps */ + msg_err_re_cache("bad number of expressions in %s: %d", + path, n); + g_set_error(err, rspamd_re_cache_quark(), 0, + "bad number of expressions: %d", n); + munmap(map, len); + return FALSE; + } + + /* + * Magic - 8 bytes + * Platform - sizeof (platform) + * n - number of regexps + * n * <regexp ids> + * n * <regexp flags> + * crc - 8 bytes checksum + 
* <hyperscan blob> + */ + + memcpy(&crc, p + n * 2 * sizeof(gint), sizeof(crc)); + rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe); + /* IDs */ + rspamd_cryptobox_fast_hash_update(&crc_st, p, n * sizeof(gint)); + /* Flags */ + rspamd_cryptobox_fast_hash_update(&crc_st, p + n * sizeof(gint), + n * sizeof(gint)); + /* HS database */ + p += n * sizeof(gint) * 2 + sizeof(guint64); + rspamd_cryptobox_fast_hash_update(&crc_st, p, end - p); + valid_crc = rspamd_cryptobox_fast_hash_final(&crc_st); + + if (crc != valid_crc) { + msg_warn_re_cache("outdated or invalid hs database in %s: " + "crc read %xL, crc expected %xL", + path, crc, valid_crc); + g_set_error(err, rspamd_re_cache_quark(), 0, + "outdated or invalid hs database, crc check failure"); + munmap(map, len); + + return FALSE; + } + + if ((ret = hs_deserialize_database(p, end - p, &test_db)) != HS_SUCCESS) { + msg_err_re_cache("bad hs database in %s: %d", path, ret); + g_set_error(err, rspamd_re_cache_quark(), 0, + "deserialize error: %d", ret); + munmap(map, len); + + return FALSE; + } + + hs_free_database(test_db); + munmap(map, len); + } + /* XXX: add crc check */ + + return TRUE; + } + } + + if (!silent) { + msg_warn_re_cache("unknown hyperscan cache file %s", path); + } + + g_set_error(err, rspamd_re_cache_quark(), 0, + "unknown hyperscan file"); + + return FALSE; +#endif +} + + +enum rspamd_hyperscan_status +rspamd_re_cache_load_hyperscan(struct rspamd_re_cache *cache, + const char *cache_dir, bool try_load) +{ + g_assert(cache != NULL); + g_assert(cache_dir != NULL); + +#ifndef WITH_HYPERSCAN + return RSPAMD_HYPERSCAN_UNSUPPORTED; +#else + gchar path[PATH_MAX]; + gint fd, i, n, *hs_ids = NULL, *hs_flags = NULL, total = 0, ret; + GHashTableIter it; + gpointer k, v; + guint8 *map, *p; + struct rspamd_re_class *re_class; + struct rspamd_re_cache_elt *elt; + struct stat st; + gboolean has_valid = FALSE, all_valid = FALSE; + + g_hash_table_iter_init(&it, cache->re_classes); + + while 
(g_hash_table_iter_next(&it, &k, &v)) { + re_class = v; + rspamd_snprintf(path, sizeof(path), "%s%c%s.hs", cache_dir, + G_DIR_SEPARATOR, re_class->hash); + + if (rspamd_re_cache_is_valid_hyperscan_file(cache, path, try_load, FALSE, NULL)) { + msg_debug_re_cache("load hyperscan database from '%s'", + re_class->hash); + + fd = open(path, O_RDONLY); + + /* Read number of regexps */ + g_assert(fd != -1); + fstat(fd, &st); + + map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0); + + if (map == MAP_FAILED) { + if (!try_load) { + msg_err_re_cache("cannot mmap %s: %s", path, strerror(errno)); + } + else { + msg_debug_re_cache("cannot mmap %s: %s", path, strerror(errno)); + } + + close(fd); + all_valid = FALSE; + continue; + } + + close(fd); + p = map + RSPAMD_HS_MAGIC_LEN + sizeof(cache->plt); + n = *(gint *) p; + + if (n <= 0 || 2 * n * sizeof(gint) + /* IDs + flags */ + sizeof(guint64) + /* crc */ + RSPAMD_HS_MAGIC_LEN + /* header */ + sizeof(cache->plt) > + (gsize) st.st_size) { + /* Some wrong amount of regexps */ + if (!try_load) { + msg_err_re_cache("bad number of expressions in %s: %d", + path, n); + } + else { + msg_debug_re_cache("bad number of expressions in %s: %d", + path, n); + } + + munmap(map, st.st_size); + all_valid = FALSE; + continue; + } + + total += n; + p += sizeof(n); + hs_ids = g_malloc(n * sizeof(*hs_ids)); + memcpy(hs_ids, p, n * sizeof(*hs_ids)); + p += n * sizeof(*hs_ids); + hs_flags = g_malloc(n * sizeof(*hs_flags)); + memcpy(hs_flags, p, n * sizeof(*hs_flags)); + + /* Skip crc */ + p += n * sizeof(*hs_ids) + sizeof(guint64); + + /* Cleanup */ + if (re_class->hs_scratch != NULL) { + hs_free_scratch(re_class->hs_scratch); + } + + if (re_class->hs_db != NULL) { + rspamd_hyperscan_free(re_class->hs_db, false); + } + + if (re_class->hs_ids) { + g_free(re_class->hs_ids); + } + + re_class->hs_ids = NULL; + re_class->hs_scratch = NULL; + re_class->hs_db = NULL; + munmap(map, st.st_size); + + re_class->hs_db = rspamd_hyperscan_maybe_load(path, p 
- map); + if (re_class->hs_db == NULL) { + if (!try_load) { + msg_err_re_cache("bad hs database in %s", path); + } + else { + msg_debug_re_cache("bad hs database in %s", path); + } + g_free(hs_ids); + g_free(hs_flags); + + re_class->hs_ids = NULL; + re_class->hs_scratch = NULL; + re_class->hs_db = NULL; + all_valid = FALSE; + + continue; + } + + if ((ret = hs_alloc_scratch(rspamd_hyperscan_get_database(re_class->hs_db), + &re_class->hs_scratch)) != HS_SUCCESS) { + if (!try_load) { + msg_err_re_cache("bad hs database in %s; error code: %d", path, ret); + } + else { + msg_debug_re_cache("bad hs database in %s; error code: %d", path, ret); + } + g_free(hs_ids); + g_free(hs_flags); + + rspamd_hyperscan_free(re_class->hs_db, true); + re_class->hs_ids = NULL; + re_class->hs_scratch = NULL; + re_class->hs_db = NULL; + all_valid = FALSE; + + continue; + } + + /* + * Now find hyperscan elts that are successfully compiled and + * specify that they should be matched using hyperscan + */ + for (i = 0; i < n; i++) { + g_assert((gint) cache->re->len > hs_ids[i] && hs_ids[i] >= 0); + elt = g_ptr_array_index(cache->re, hs_ids[i]); + + if (hs_flags[i] & HS_FLAG_PREFILTER) { + elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN_PRE; + } + else { + elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN; + } + } + + re_class->hs_ids = hs_ids; + g_free(hs_flags); + re_class->nhs = n; + + if (!has_valid) { + has_valid = TRUE; + all_valid = TRUE; + } + } + else { + if (!try_load) { + msg_err_re_cache("invalid hyperscan hash file '%s'", + path); + } + else { + msg_debug_re_cache("invalid hyperscan hash file '%s'", + path); + } + all_valid = FALSE; + continue; + } + } + + if (has_valid) { + if (all_valid) { + msg_info_re_cache("full hyperscan database of %d regexps has been loaded", total); + cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_FULL; + } + else { + msg_info_re_cache("partial hyperscan database of %d regexps has been loaded", total); + cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_PARTIAL; + 
} + } + else { + msg_info_re_cache("hyperscan database has NOT been loaded; no valid expressions"); + cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOAD_ERROR; + } + + + return cache->hyperscan_loaded; +#endif +} + +void rspamd_re_cache_add_selector(struct rspamd_re_cache *cache, + const gchar *sname, + gint ref) +{ + khiter_t k; + + k = kh_get(lua_selectors_hash, cache->selectors, (gchar *) sname); + + if (k == kh_end(cache->selectors)) { + gchar *cpy = g_strdup(sname); + gint res; + + k = kh_put(lua_selectors_hash, cache->selectors, cpy, &res); + + kh_value(cache->selectors, k) = ref; + } + else { + msg_warn_re_cache("replacing selector with name %s", sname); + + if (cache->L) { + luaL_unref(cache->L, LUA_REGISTRYINDEX, kh_value(cache->selectors, k)); + } + + kh_value(cache->selectors, k) = ref; + } +} |