diff options
Diffstat (limited to 'src/libserver/composites')
-rw-r--r-- | src/libserver/composites/composites.cxx | 989 | ||||
-rw-r--r-- | src/libserver/composites/composites.h | 64 | ||||
-rw-r--r-- | src/libserver/composites/composites_internal.hxx | 112 | ||||
-rw-r--r-- | src/libserver/composites/composites_manager.cxx | 330 |
4 files changed, 1495 insertions, 0 deletions
diff --git a/src/libserver/composites/composites.cxx b/src/libserver/composites/composites.cxx new file mode 100644 index 0000000..aa231a3 --- /dev/null +++ b/src/libserver/composites/composites.cxx @@ -0,0 +1,989 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "logger.h" +#include "expression.h" +#include "task.h" +#include "utlist.h" +#include "scan_result.h" +#include "composites.h" + +#include <cmath> +#include <vector> +#include <variant> +#include "libutil/cxx/util.hxx" +#include "contrib/ankerl/unordered_dense.h" + +#include "composites_internal.hxx" + +#define msg_err_composites(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "composites", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_composites(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "composites", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_composites(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "composites", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +#define msg_debug_composites(...) 
rspamd_conditional_debug_fast(NULL, task->from_addr, \ + rspamd_composites_log_id, "composites", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(composites) + + +namespace rspamd::composites { +static rspamd_expression_atom_t *rspamd_composite_expr_parse(const gchar *line, gsize len, + rspamd_mempool_t *pool, + gpointer ud, GError **err); +static gdouble rspamd_composite_expr_process(void *ud, rspamd_expression_atom_t *atom); +static gint rspamd_composite_expr_priority(rspamd_expression_atom_t *atom); +static void rspamd_composite_expr_destroy(rspamd_expression_atom_t *atom); +static void composites_foreach_callback(gpointer key, gpointer value, void *data); + +const struct rspamd_atom_subr composite_expr_subr = { + .parse = rspamd::composites::rspamd_composite_expr_parse, + .process = rspamd::composites::rspamd_composite_expr_process, + .priority = rspamd::composites::rspamd_composite_expr_priority, + .destroy = rspamd::composites::rspamd_composite_expr_destroy}; +}// namespace rspamd::composites + +namespace rspamd::composites { + +static constexpr const double epsilon = 0.00001; + +struct symbol_remove_data { + const char *sym; + struct rspamd_composite *comp; + GNode *parent; + std::uint8_t action; +}; + +struct composites_data { + struct rspamd_task *task; + struct rspamd_composite *composite; + struct rspamd_scan_result *metric_res; + ankerl::unordered_dense::map<std::string_view, + std::vector<symbol_remove_data>> + symbols_to_remove; + std::vector<bool> checked; + + explicit composites_data(struct rspamd_task *task, struct rspamd_scan_result *mres) + : task(task), composite(nullptr), metric_res(mres) + { + checked.resize(rspamd_composites_manager_nelts(task->cfg->composites_manager) * 2, + false); + } +}; + +struct rspamd_composite_option_match { + rspamd_regexp_t *re; + std::string match; + + explicit rspamd_composite_option_match(const char *start, std::size_t len) noexcept + : re(nullptr), match(start, len) + { + } + + 
explicit rspamd_composite_option_match(rspamd_regexp_t *re) noexcept + : re(rspamd_regexp_ref(re)) + { + } + + /* Copy constructor: takes an own reference on the regexp, or copies the plain pattern */ + rspamd_composite_option_match(const rspamd_composite_option_match &other) noexcept + { + if (other.re) { + re = rspamd_regexp_ref(other.re); + } + else { + match = other.match; + re = nullptr; + } + } + /* Copy assignment: drops the currently held regexp (if any) and takes over the state of other */ + rspamd_composite_option_match &operator=(const rspamd_composite_option_match &other) noexcept + { + if (this == &other) { + /* + * Self-assignment guard: unref followed by ref on the very same pointer + * could drop the only reference before it is taken again + */ + return *this; + } + + if (other.re) { + if (re) { + rspamd_regexp_unref(re); + } + re = rspamd_regexp_ref(other.re); + } + else { + if (re) { + rspamd_regexp_unref(re); + } + re = nullptr; + match = other.match; + } + + return *this; + } + + /* Move constructor: steals the regexp reference (other is left with nullptr) or moves the plain pattern */ + rspamd_composite_option_match(rspamd_composite_option_match &&other) noexcept + { + if (other.re) { + re = other.re; + other.re = nullptr; + } + else { + re = nullptr; + match = std::move(other.match); + } + } + /* Move assignment: drops the currently held regexp (if any) and steals the state of other */ + rspamd_composite_option_match &operator=(rspamd_composite_option_match &&other) noexcept + { + if (this == &other) { + /* Self-move guard: otherwise our own reference is dropped and the pointer is reset to nullptr */ + return *this; + } + + if (other.re) { + if (re) { + rspamd_regexp_unref(re); + } + re = other.re; + other.re = nullptr; + } + else { + if (re) { + rspamd_regexp_unref(re); + } + re = nullptr; + match = std::move(other.match); + } + + return *this; + } + + ~rspamd_composite_option_match() + { + if (re) { + rspamd_regexp_unref(re); + } + } + + /* Returns true if data matches this option: regexp search when a regexp is set, exact string comparison otherwise */ + auto match_opt(const std::string_view &data) const -> bool + { + if (re) { + return rspamd_regexp_search(re, + data.data(), data.size(), + nullptr, nullptr, false, nullptr); + } + else { + return data == match; + } + } + + /* Returns the pattern for logging: the regexp pattern when a regexp is set, the plain string otherwise */ + auto get_pat() const -> std::string_view + { + if (re) { + return std::string_view(rspamd_regexp_get_pattern(re)); + } + else { + return match; + } + } +}; + +enum class rspamd_composite_atom_type { + ATOM_UNKNOWN, + ATOM_COMPOSITE, + ATOM_PLAIN +}; + +struct rspamd_composite_atom { + std::string symbol; + std::string_view norm_symbol; + rspamd_composite_atom_type comp_type = rspamd_composite_atom_type::ATOM_UNKNOWN; + const struct rspamd_composite *ncomp = nullptr; /* underlying composite */ +
std::vector<rspamd_composite_option_match> opts; +}; + +enum rspamd_composite_action : std::uint8_t { + RSPAMD_COMPOSITE_UNTOUCH = 0, + RSPAMD_COMPOSITE_REMOVE_SYMBOL = (1u << 0), + RSPAMD_COMPOSITE_REMOVE_WEIGHT = (1u << 1), + RSPAMD_COMPOSITE_REMOVE_FORCED = (1u << 2) +}; + +static GQuark +rspamd_composites_quark(void) +{ + return g_quark_from_static_string("composites"); +} + +static auto +rspamd_composite_atom_dtor(void *ptr) +{ + auto *atom = reinterpret_cast<rspamd_composite_atom *>(ptr); + + delete atom; +} + +static rspamd_expression_atom_t * +rspamd_composite_expr_parse(const gchar *line, gsize len, + rspamd_mempool_t *pool, + gpointer ud, GError **err) +{ + gsize clen = 0; + const gchar *p, *end; + enum composite_expr_state { + comp_state_read_symbol = 0, + comp_state_read_obrace, + comp_state_read_option, + comp_state_read_regexp, + comp_state_read_regexp_end, + comp_state_read_comma, + comp_state_read_ebrace, + comp_state_read_end + } state = comp_state_read_symbol; + + end = line + len; + p = line; + + /* Find length of the atom using a reduced state machine */ + while (p < end) { + if (state == comp_state_read_end) { + break; + } + + switch (state) { + case comp_state_read_symbol: + clen = rspamd_memcspn(p, "[; \t()><!|&\n", len); + p += clen; + + if (*p == '[') { + state = comp_state_read_obrace; + } + else { + state = comp_state_read_end; + } + break; + case comp_state_read_obrace: + p++; + + if (*p == '/') { + p++; + state = comp_state_read_regexp; + } + else { + state = comp_state_read_option; + } + break; + case comp_state_read_regexp: + if (*p == '\\' && p + 1 < end) { + /* Escaping */ + p++; + } + else if (*p == '/') { + /* End of regexp, possible flags */ + state = comp_state_read_regexp_end; + } + p++; + break; + case comp_state_read_option: + case comp_state_read_regexp_end: + if (*p == ',') { + p++; + state = comp_state_read_comma; + } + else if (*p == ']') { + state = comp_state_read_ebrace; + } + else { + p++; + } + break; + case 
comp_state_read_comma: + if (!g_ascii_isspace(*p)) { + if (*p == '/') { + /* + * Skip the starting slash exactly as it is done after '[': + * otherwise it is taken for the closing slash and any ',' or ']' + * inside the regexp terminates the option prematurely + */ + p++; + state = comp_state_read_regexp; + } + else if (*p == ']') { + state = comp_state_read_ebrace; + } + else { + state = comp_state_read_option; + } + } + else { + /* Skip spaces after comma */ + p++; + } + break; + case comp_state_read_ebrace: + p++; + state = comp_state_read_end; + break; + case comp_state_read_end: + g_assert_not_reached(); + } + } + + if (state != comp_state_read_end) { + g_set_error(err, rspamd_composites_quark(), 100, "invalid composite: %s;" + "parser stopped in state %d", + line, state); + return NULL; + } + + /* The first pass only measured the atom: remember its length and restart from the beginning */ + clen = p - line; + p = line; + state = comp_state_read_symbol; + + auto *atom = new rspamd_composite_atom; + auto *res = rspamd_mempool_alloc0_type(pool, rspamd_expression_atom_t); + res->len = clen; + res->str = line; + + /* Full state machine to fill a composite atom */ + const gchar *opt_start = nullptr; + + while (p < end) { + if (state == comp_state_read_end) { + break; + } + + switch (state) { + case comp_state_read_symbol: { + clen = rspamd_memcspn(p, "[; \t()><!|&\n", len); + p += clen; + + if (*p == '[') { + state = comp_state_read_obrace; + } + else { + state = comp_state_read_end; + } + + atom->symbol = std::string{line, clen}; + /* The normalised symbol starts at the first alphanumeric character of the symbol */ + auto norm_start = std::find_if(atom->symbol.begin(), atom->symbol.end(), + [](char c) { return g_ascii_isalnum(c); }); + if (norm_start == atom->symbol.end()) { + msg_err_pool("invalid composite atom: %s", atom->symbol.c_str()); + } + atom->norm_symbol = make_string_view_from_it(norm_start, atom->symbol.end()); + break; + } + case comp_state_read_obrace: + p++; + + if (*p == '/') { + opt_start = p; + p++; /* Starting slash */ + state = comp_state_read_regexp; + } + else { + state = comp_state_read_option; + opt_start = p; + } + + break; + case comp_state_read_regexp: + if (*p == '\\' && p + 1 < end) { + /* Escaping */ + p++; + } + else if (*p == '/') { + /* End of regexp, possible flags */ + state = comp_state_read_regexp_end; +
} + p++; + break; + case comp_state_read_option: + if (*p == ',' || *p == ']') { + /* Plain match, copy option to ensure string_view validity */ + gint opt_len = p - opt_start; + auto *opt_buf = rspamd_mempool_alloc_buffer(pool, opt_len + 1); + rspamd_strlcpy(opt_buf, opt_start, opt_len + 1); + opt_buf = g_strstrip(opt_buf); + atom->opts.emplace_back(opt_buf, strlen(opt_buf)); + + if (*p == ',') { + p++; + state = comp_state_read_comma; + } + else { + state = comp_state_read_ebrace; + } + } + else { + p++; + } + break; + case comp_state_read_regexp_end: + if (*p == ',' || *p == ']') { + /* The regexp text spans from the starting slash up to (not including) the delimiter */ + auto opt_len = p - opt_start; + rspamd_regexp_t *re; + GError *re_err = nullptr; + + re = rspamd_regexp_new_len(opt_start, opt_len, nullptr, &re_err); + + if (re == nullptr) { + msg_err_pool("cannot create regexp from string %*s: %e", + opt_len, opt_start, re_err); + + g_error_free(re_err); + } + else { + /* The option takes its own reference, so the local one is dropped */ + atom->opts.emplace_back(re); + rspamd_regexp_unref(re); + } + + if (*p == ',') { + p++; + state = comp_state_read_comma; + } + else { + state = comp_state_read_ebrace; + } + } + else { + p++; + } + break; + case comp_state_read_comma: + if (!g_ascii_isspace(*p)) { + if (*p == '/') { + state = comp_state_read_regexp; + opt_start = p; + /* + * Starting slash: must be skipped as in comp_state_read_obrace, + * otherwise it is mistaken for the end of the regexp + */ + p++; + } + else if (*p == ']') { + state = comp_state_read_ebrace; + } + else { + opt_start = p; + state = comp_state_read_option; + } + } + else { + /* Skip spaces after comma */ + p++; + } + break; + case comp_state_read_ebrace: + p++; + state = comp_state_read_end; + break; + case comp_state_read_end: + g_assert_not_reached(); + } + } + + res->data = atom; + + return res; +} + +static auto +process_symbol_removal(rspamd_expression_atom_t *atom, + struct composites_data *cd, + struct rspamd_symbol_result *ms, + const std::string &beg) -> void +{ + struct rspamd_task *task = cd->task; + + if (ms == nullptr) { + return; + } + + /* + * At this point we know that we need to do something about this symbol, + * however, we don't know whether we need to delete it
unfortunately, + * that depends on the later decisions when the complete expression is + * evaluated. + */ + auto rd_it = cd->symbols_to_remove.find(ms->name); + + auto fill_removal_structure = [&](symbol_remove_data &nrd) { + nrd.sym = ms->name; + + /* By default remove symbols */ + switch (cd->composite->policy) { + case rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_ALL: + default: + nrd.action = (RSPAMD_COMPOSITE_REMOVE_SYMBOL | RSPAMD_COMPOSITE_REMOVE_WEIGHT); + break; + case rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_SYMBOL: + nrd.action = RSPAMD_COMPOSITE_REMOVE_SYMBOL; + break; + case rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_WEIGHT: + nrd.action = RSPAMD_COMPOSITE_REMOVE_WEIGHT; + break; + case rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_LEAVE: + nrd.action = 0; + break; + } + + for (auto t: beg) { + if (t == '~') { + nrd.action &= ~RSPAMD_COMPOSITE_REMOVE_SYMBOL; + } + else if (t == '-') { + nrd.action &= ~(RSPAMD_COMPOSITE_REMOVE_WEIGHT | + RSPAMD_COMPOSITE_REMOVE_SYMBOL); + } + else if (t == '^') { + nrd.action |= RSPAMD_COMPOSITE_REMOVE_FORCED; + } + else { + break; + } + } + + nrd.comp = cd->composite; + nrd.parent = atom->parent; + }; + + if (rd_it != cd->symbols_to_remove.end()) { + fill_removal_structure(rd_it->second.emplace_back()); + msg_debug_composites("%s: added symbol %s to removal: %d policy, from composite %s", + cd->metric_res->name, + ms->name, rd_it->second.back().action, + cd->composite->sym.c_str()); + } + else { + std::vector<symbol_remove_data> nrd; + fill_removal_structure(nrd.emplace_back()); + msg_debug_composites("%s: added symbol %s to removal: %d policy, from composite %s", + cd->metric_res->name, + ms->name, nrd.front().action, + cd->composite->sym.c_str()); + cd->symbols_to_remove[ms->name] = std::move(nrd); + } +} + +static auto +process_single_symbol(struct composites_data *cd, + std::string_view sym, + struct rspamd_symbol_result **pms, + struct rspamd_composite_atom *atom) -> 
double +{ + struct rspamd_symbol_result *ms = nullptr; + gdouble rc = 0; + struct rspamd_task *task = cd->task; + + if ((ms = rspamd_task_find_symbol_result(cd->task, sym.data(), cd->metric_res)) == nullptr) { + msg_debug_composites("not found symbol %s in composite %s", sym.data(), + cd->composite->sym.c_str()); + + if (G_UNLIKELY(atom->comp_type == rspamd_composite_atom_type::ATOM_UNKNOWN)) { + const struct rspamd_composite *ncomp; + + if ((ncomp = COMPOSITE_MANAGER_FROM_PTR(task->cfg->composites_manager)->find(sym)) != NULL) { + atom->comp_type = rspamd_composite_atom_type::ATOM_COMPOSITE; + atom->ncomp = ncomp; + } + else { + atom->comp_type = rspamd_composite_atom_type::ATOM_PLAIN; + } + } + + if (atom->comp_type == rspamd_composite_atom_type::ATOM_COMPOSITE) { + msg_debug_composites("symbol %s for composite %s is another composite", + sym.data(), cd->composite->sym.c_str()); + + if (!cd->checked[atom->ncomp->id * 2]) { + msg_debug_composites("composite dependency %s for %s is not checked", + sym.data(), cd->composite->sym.c_str()); + /* Set checked for this symbol to avoid cyclic references */ + cd->checked[cd->composite->id * 2] = true; + auto *saved = cd->composite; /* Save the current composite */ + composites_foreach_callback((gpointer) atom->ncomp->sym.c_str(), + (gpointer) atom->ncomp, (gpointer) cd); + /* Restore state */ + cd->composite = saved; + cd->checked[cd->composite->id * 2] = false; + + ms = rspamd_task_find_symbol_result(cd->task, sym.data(), + cd->metric_res); + } + else { + /* + * XXX: in case of cyclic references this would return 0 + */ + if (cd->checked[atom->ncomp->id * 2 + 1]) { + ms = rspamd_task_find_symbol_result(cd->task, sym.data(), + cd->metric_res); + } + } + } + } + + if (ms) { + msg_debug_composites("found symbol %s in composite %s, weight: %.3f", + sym.data(), cd->composite->sym.c_str(), ms->score); + + /* Now check options */ + for (const auto &cur_opt: atom->opts) { + struct rspamd_symbol_option *opt; + auto found = false; 
+ + DL_FOREACH(ms->opts_head, opt) + { + if (cur_opt.match_opt({opt->option, opt->optlen})) { + found = true; + break; + } + } + + if (!found) { + auto pat = cur_opt.get_pat(); + msg_debug_composites("symbol %s in composite %s misses required option %*s", + sym.data(), + cd->composite->sym.c_str(), + (int) pat.size(), pat.data()); + ms = nullptr; + + break; + } + } + + if (ms) { + if (ms->score == 0) { + rc = epsilon * 16.0; /* Distinguish from 0 */ + } + else { + rc = ms->score; + } + } + } + + *pms = ms; + return rc; +} + +static auto +rspamd_composite_expr_process(void *ud, rspamd_expression_atom_t *atom) -> double +{ + struct composites_data *cd = (struct composites_data *) ud; + struct rspamd_composite_atom *comp_atom = (struct rspamd_composite_atom *) atom->data; + + struct rspamd_symbol_result *ms = NULL; + struct rspamd_task *task = cd->task; + gdouble rc = 0; + + if (cd->checked[cd->composite->id * 2]) { + /* We have already checked this composite, so just return its value */ + if (cd->checked[cd->composite->id * 2 + 1]) { + ms = rspamd_task_find_symbol_result(cd->task, + comp_atom->norm_symbol.data(), + cd->metric_res); + } + + if (ms) { + if (ms->score == 0) { + rc = epsilon; /* Distinguish from 0 */ + } + else { + /* Treat negative and positive scores equally... 
*/ + rc = fabs(ms->score); + } + } + + msg_debug_composites("composite %s is already checked, result: %.2f", + cd->composite->sym.c_str(), rc); + + return rc; + } + + /* Note: sym is zero terminated as it is a view on std::string */ + auto sym = comp_atom->norm_symbol; + auto group_process_functor = [&](auto cond, int sub_start) -> double { + auto max = 0.; + GHashTableIter it; + gpointer k, v; + struct rspamd_symbols_group *gr; + + gr = (struct rspamd_symbols_group *) g_hash_table_lookup(cd->task->cfg->groups, + sym.substr(sub_start).data()); + + if (gr != nullptr) { + g_hash_table_iter_init(&it, gr->symbols); + + while (g_hash_table_iter_next(&it, &k, &v)) { + auto *sdef = (rspamd_symbol *) v; + + if (cond(sdef->score)) { + rc = process_single_symbol(cd, + std::string_view(sdef->name), + &ms, + comp_atom); + + if (fabs(rc) > epsilon) { + process_symbol_removal(atom, + cd, + ms, + comp_atom->symbol); + + if (fabs(rc) > max) { + max = fabs(rc); + } + } + } + } + } + + return max; + }; + + if (sym.size() > 2) { + if (sym.substr(0, 2) == "g:") { + rc = group_process_functor([](auto _) { return true; }, 2); + } + else if (sym.substr(0, 3) == "g+:") { + /* Group, positive symbols only */ + rc = group_process_functor([](auto sc) { return sc > 0.; }, 3); + } + else if (sym.substr(0, 3) == "g-:") { + rc = group_process_functor([](auto sc) { return sc < 0.; }, 3); + } + else { + rc = process_single_symbol(cd, sym, &ms, comp_atom); + + if (fabs(rc) > epsilon) { + process_symbol_removal(atom, + cd, + ms, + comp_atom->symbol); + } + } + } + else { + rc = process_single_symbol(cd, sym, &ms, comp_atom); + + if (fabs(rc) > epsilon) { + process_symbol_removal(atom, + cd, + ms, + comp_atom->symbol); + } + } + + msg_debug_composites("%s: result for atom %s in composite %s is %.4f", + cd->metric_res->name, + comp_atom->norm_symbol.data(), + cd->composite->sym.c_str(), rc); + + return rc; +} + +/* + * We don't have preferences for composites + */ +static gint 
+rspamd_composite_expr_priority(rspamd_expression_atom_t *atom) +{ + return 0; +} + +static void +rspamd_composite_expr_destroy(rspamd_expression_atom_t *atom) +{ + rspamd_composite_atom_dtor(atom->data); +} + +static void +composites_foreach_callback(gpointer key, gpointer value, void *data) +{ + auto *cd = (struct composites_data *) data; + auto *comp = (struct rspamd_composite *) value; + auto *str_key = (const gchar *) key; + struct rspamd_task *task; + gdouble rc; + + cd->composite = comp; + task = cd->task; + + msg_debug_composites("process composite %s", str_key); + + if (!cd->checked[cd->composite->id * 2]) { + if (rspamd_symcache_is_checked(cd->task, cd->task->cfg->cache, + str_key)) { + msg_debug_composites("composite %s is checked in symcache but not " + "in composites bitfield", + cd->composite->sym.c_str()); + cd->checked[comp->id * 2] = true; + cd->checked[comp->id * 2 + 1] = false; + } + else { + if (rspamd_task_find_symbol_result(cd->task, str_key, + cd->metric_res) != nullptr) { + /* Already set, no need to check */ + msg_debug_composites("composite %s is already in metric " + "in composites bitfield", + cd->composite->sym.c_str()); + cd->checked[comp->id * 2] = true; + cd->checked[comp->id * 2 + 1] = true; + + return; + } + + msg_debug_composites("%s: start processing composite %s", + cd->metric_res->name, + cd->composite->sym.c_str()); + + rc = rspamd_process_expression(comp->expr, RSPAMD_EXPRESSION_FLAG_NOOPT, + cd); + + /* Checked bit */ + cd->checked[comp->id * 2] = true; + + msg_debug_composites("%s: final result for composite %s is %.4f", + cd->metric_res->name, + cd->composite->sym.c_str(), rc); + + /* Result bit */ + if (fabs(rc) > epsilon) { + cd->checked[comp->id * 2 + 1] = true; + rspamd_task_insert_result_full(cd->task, str_key, 1.0, NULL, + RSPAMD_SYMBOL_INSERT_SINGLE, cd->metric_res); + } + else { + cd->checked[comp->id * 2 + 1] = false; + } + } + } +} + + +static auto +remove_symbols(const composites_data &cd, const 
std::vector<symbol_remove_data> &rd) -> void +{ + struct rspamd_task *task = cd.task; + gboolean skip = FALSE, + has_valid_op = FALSE, + want_remove_score = TRUE, + want_remove_symbol = TRUE, + want_forced = FALSE; + const gchar *disable_score_reason = "no policy", + *disable_symbol_reason = "no policy"; + + task = cd.task; + + for (const auto &cur: rd) { + if (!cd.checked[cur.comp->id * 2 + 1]) { + continue; + } + /* + * First of all exclude all elements with any parent that is negation: + * !A || B -> here we can have both !A and B matched, but we do *NOT* + * want to remove symbol in that case + */ + auto *par = cur.parent; + skip = FALSE; + + while (par) { + if (rspamd_expression_node_is_op(par, OP_NOT)) { + skip = TRUE; + break; + } + + par = par->parent; + } + + if (skip) { + continue; + } + + has_valid_op = TRUE; + /* + * Now we can try to remove symbols/scores + * + * We apply the following logic here: + * - if no composites would like to save score then we remove score + * - if no composites would like to save symbol then we remove symbol + */ + if (!want_forced) { + if (!(cur.action & RSPAMD_COMPOSITE_REMOVE_SYMBOL)) { + want_remove_symbol = FALSE; + disable_symbol_reason = cur.comp->sym.c_str(); + } + + if (!(cur.action & RSPAMD_COMPOSITE_REMOVE_WEIGHT)) { + want_remove_score = FALSE; + disable_score_reason = cur.comp->sym.c_str(); + } + + if (cur.action & RSPAMD_COMPOSITE_REMOVE_FORCED) { + want_forced = TRUE; + disable_symbol_reason = cur.comp->sym.c_str(); + disable_score_reason = cur.comp->sym.c_str(); + } + } + } + + auto *ms = rspamd_task_find_symbol_result(task, rd.front().sym, cd.metric_res); + + if (has_valid_op && ms && !(ms->flags & RSPAMD_SYMBOL_RESULT_IGNORED)) { + + if (want_remove_score || want_forced) { + msg_debug_composites("%s: %s remove symbol weight for %s (was %.2f), " + "score removal affected by %s, symbol removal affected by %s", + cd.metric_res->name, + (want_forced ? 
"forced" : "normal"), rd.front().sym, ms->score, + disable_score_reason, disable_symbol_reason); + cd.metric_res->score -= ms->score; + ms->score = 0.0; + } + + if (want_remove_symbol || want_forced) { + ms->flags |= RSPAMD_SYMBOL_RESULT_IGNORED; + msg_debug_composites("%s: %s remove symbol %s (score %.2f), " + "score removal affected by %s, symbol removal affected by %s", + cd.metric_res->name, + (want_forced ? "forced" : "normal"), rd.front().sym, ms->score, + disable_score_reason, disable_symbol_reason); + } + } +} + +static void +composites_metric_callback(struct rspamd_task *task) +{ + std::vector<composites_data> comp_data_vec; + struct rspamd_scan_result *mres; + + comp_data_vec.reserve(1); + + DL_FOREACH(task->result, mres) + { + auto &cd = comp_data_vec.emplace_back(task, mres); + + /* Process metric result */ + rspamd_symcache_composites_foreach(task, + task->cfg->cache, + composites_foreach_callback, + &cd); + } + + for (const auto &cd: comp_data_vec) { + /* Remove symbols that are in composites */ + for (const auto &srd_it: cd.symbols_to_remove) { + remove_symbols(cd, srd_it.second); + } + } +} + +}// namespace rspamd::composites + + +void rspamd_composites_process_task(struct rspamd_task *task) +{ + if (task->result && !RSPAMD_TASK_IS_SKIPPED(task)) { + rspamd::composites::composites_metric_callback(task); + } +} diff --git a/src/libserver/composites/composites.h b/src/libserver/composites/composites.h new file mode 100644 index 0000000..5d58029 --- /dev/null +++ b/src/libserver/composites/composites.h @@ -0,0 +1,64 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBSERVER_COMPOSITES_H_ +#define SRC_LIBSERVER_COMPOSITES_H_ + +#include "config.h" +#include "contrib/libucl/ucl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_task; +struct rspamd_config; + +/** + * Processes all scan results of a task and forms composite symbols from the existing symbols as defined in the config + * @param task worker's task that represents a message from a user + */ +void rspamd_composites_process_task(struct rspamd_task *task); + +/** + * Creates a composites manager + * @param cfg configuration the manager is created for + * @return opaque pointer to the new manager (NOTE(review): presumably owned by cfg - confirm against the implementation) + */ +void *rspamd_composites_manager_create(struct rspamd_config *cfg); +/** + * Returns number of elements in a composite manager + * @return number of composites + */ +gsize rspamd_composites_manager_nelts(void *); +/** + * Adds a composite from a UCL config object; the `_silent` variant presumably suppresses the duplicate warning (TODO confirm) + * @return opaque pointer to the composite (NOTE(review): presumably NULL on failure - confirm against the implementation) + */ +void *rspamd_composites_manager_add_from_ucl(void *, const char *, const ucl_object_t *); +void *rspamd_composites_manager_add_from_ucl_silent(void *, const char *, const ucl_object_t *); + +/** + * Adds a composite from strings (NOTE(review): presumably symbol name and expression text - confirm); the `_silent` variant presumably suppresses the duplicate warning + * @return opaque pointer to the composite (NOTE(review): presumably NULL on failure - confirm against the implementation) + */ +void *rspamd_composites_manager_add_from_string(void *, const char *, const char *); +void *rspamd_composites_manager_add_from_string_silent(void *, const char *, const char *); + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBSERVER_COMPOSITES_H_ */ diff --git a/src/libserver/composites/composites_internal.hxx b/src/libserver/composites/composites_internal.hxx new file mode 100644 index 0000000..038e217 --- /dev/null +++ b/src/libserver/composites/composites_internal.hxx @@ -0,0 +1,112 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * +
* Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_COMPOSITES_INTERNAL_HXX +#define RSPAMD_COMPOSITES_INTERNAL_HXX +#pragma once + +#include <string> +#include "libutil/expression.h" +#include "libutil/cxx/hash_util.hxx" +#include "libserver/cfg_file.h" + +namespace rspamd::composites { + +/** + * Subr for composite expressions + */ +extern const struct rspamd_atom_subr composite_expr_subr; + +/** + * What happens to the symbols matched by a composite + */ +enum class rspamd_composite_policy { + RSPAMD_COMPOSITE_POLICY_REMOVE_ALL = 0, + RSPAMD_COMPOSITE_POLICY_REMOVE_SYMBOL, + RSPAMD_COMPOSITE_POLICY_REMOVE_WEIGHT, + RSPAMD_COMPOSITE_POLICY_LEAVE, + RSPAMD_COMPOSITE_POLICY_UNKNOWN +}; + +/** + * Static composites structure + */ +struct rspamd_composite { + std::string str_expr; /* expression text of the composite */ + std::string sym; /* composite symbol name */ + struct rspamd_expression *expr = nullptr; /* parsed expression */ + gint id = 0; /* index of the composite inside the manager */ + rspamd_composite_policy policy = rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_ALL; +}; + +#define COMPOSITE_MANAGER_FROM_PTR(ptr) (reinterpret_cast<rspamd::composites::composites_manager *>(ptr)) + +class composites_manager { +public: + /* + * Registers a destructor on the config pool that deletes this object, + * hence instances must be allocated with new + */ + explicit composites_manager(struct rspamd_config *_cfg) + : cfg(_cfg) + { + rspamd_mempool_add_destructor(_cfg->cfg_pool, composites_manager_dtor, this); + } + + /* Number of all composites ever added, duplicates included */ + auto size(void) const -> std::size_t + { + return all_composites.size(); + } + + /* Returns the composite registered under name or nullptr if there is none */ + auto find(std::string_view name) const -> const rspamd_composite * + { + /* + * Look up by string_view directly, the same heterogeneous lookup that + * add_composite relies on for composites.contains(): no temporary + * std::string has to be allocated per call + */ + auto found = composites.find(name); + + if (found != composites.end()) { + return found->second.get(); + } + + return nullptr; + } + + auto add_composite(std::string_view,
const ucl_object_t *, bool silent_duplicate) -> rspamd_composite *; + auto add_composite(std::string_view name, std::string_view expression, bool silent_duplicate, double score = NAN) -> rspamd_composite *; + +private: + ~composites_manager() = default; + static void composites_manager_dtor(void *ptr) + { + delete COMPOSITE_MANAGER_FROM_PTR(ptr); + } + + auto new_composite(std::string_view composite_name, rspamd_expression *expr, + std::string_view composite_expression) -> auto + { + auto &composite = all_composites.emplace_back(std::make_shared<rspamd_composite>()); + composite->expr = expr; + composite->id = all_composites.size() - 1; + composite->str_expr = composite_expression; + composite->sym = composite_name; + + composites[composite->sym] = composite; + + return composite; + } + + ankerl::unordered_dense::map<std::string, + std::shared_ptr<rspamd_composite>, rspamd::smart_str_hash, rspamd::smart_str_equal> + composites; + /* Store all composites here, even if we have duplicates */ + std::vector<std::shared_ptr<rspamd_composite>> all_composites; + struct rspamd_config *cfg; +}; + +}// namespace rspamd::composites + +#endif//RSPAMD_COMPOSITES_INTERNAL_HXX diff --git a/src/libserver/composites/composites_manager.cxx b/src/libserver/composites/composites_manager.cxx new file mode 100644 index 0000000..1ee5c40 --- /dev/null +++ b/src/libserver/composites/composites_manager.cxx @@ -0,0 +1,330 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <memory> +#include <vector> +#include <cmath> +#include "contrib/ankerl/unordered_dense.h" + +#include "composites.h" +#include "composites_internal.hxx" +#include "libserver/cfg_file.h" +#include "libserver/logger.h" +#include "libserver/maps/map.h" +#include "libutil/cxx/util.hxx" + +namespace rspamd::composites { + +/* Maps a textual policy name to rspamd_composite_policy; names missing from the table yield RSPAMD_COMPOSITE_POLICY_UNKNOWN */ static auto +composite_policy_from_str(const std::string_view &inp) -> enum rspamd_composite_policy { + /* "remove", "remove_all" and "default" are synonyms, and so are "leave" and "remove_none" */ const static ankerl::unordered_dense::map<std::string_view, + enum rspamd_composite_policy> + names{ + {"remove", rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_ALL}, + {"remove_all", rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_ALL}, + {"default", rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_ALL}, + {"remove_symbol", rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_SYMBOL}, + {"remove_weight", rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_WEIGHT}, + {"leave", rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_LEAVE}, + {"remove_none", rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_LEAVE}, + }; + + auto found = names.find(inp); + if (found != names.end()){ + return found->second;} + +return rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_UNKNOWN; +}// end of composite_policy_from_str (this brace closes the function, the namespace stays open) + +/* Adds a composite described by a UCL object. Returns nullptr when "enabled" is false, or when the name is already known and silent_duplicate is set; a non-silent duplicate is only warned about and processing continues. NOTE(review): composite_name.data() is logged with %s, which assumes a NUL-terminated view - confirm at call sites */ auto composites_manager::add_composite(std::string_view composite_name, const ucl_object_t *obj, bool silent_duplicate) -> rspamd_composite * +{ + + const auto *val = ucl_object_lookup(obj, "enabled"); + if (val != nullptr && !ucl_object_toboolean(val)) { + msg_info_config("composite %s is disabled", composite_name.data()); + return nullptr; + } + + if (composites.contains(composite_name)) { + if (silent_duplicate) { + msg_debug_config("composite %s is redefined", composite_name.data()); + return nullptr; + } + else { + msg_warn_config("composite %s is redefined", composite_name.data()); + } + } + +
const char *composite_expression = nullptr; + val = ucl_object_lookup(obj, "expression"); + + if (val == NULL || !ucl_object_tostring_safe(val, &composite_expression)) { + msg_err_config("composite must have an expression defined in %s", + composite_name.data()); + return nullptr; + } + + GError *err = nullptr; + rspamd_expression *expr = nullptr; + + if (!rspamd_parse_expression(composite_expression, 0, &composite_expr_subr, + NULL, cfg->cfg_pool, &err, &expr)) { + msg_err_config("cannot parse composite expression for %s: %e", + composite_name.data(), err); + + if (err) { + g_error_free(err); + } + + return nullptr; + } + + const auto &composite = new_composite(composite_name, expr, composite_expression); + + auto score = std::isnan(cfg->unknown_weight) ? 0.0 : cfg->unknown_weight; + val = ucl_object_lookup(obj, "score"); + + if (val != nullptr) { + ucl_object_todouble_safe(val, &score); + } + + /* Also set score in the metric */ + const auto *group = "composite"; + val = ucl_object_lookup(obj, "group"); + if (val != nullptr) { + group = ucl_object_tostring(val); + } + + const auto *description = composite_expression; + val = ucl_object_lookup(obj, "description"); + if (val != nullptr) { + description = ucl_object_tostring(val); + } + + rspamd_config_add_symbol(cfg, composite_name.data(), score, + description, group, + 0, + ucl_object_get_priority(obj), /* No +1 as it is default... 
*/ + 1); + + const auto *elt = ucl_object_lookup(obj, "groups"); + if (elt && ucl_object_type(elt) == UCL_ARRAY) { + const ucl_object_t *cur_gr; + auto *gr_it = ucl_object_iterate_new(elt); + + while ((cur_gr = ucl_object_iterate_safe(gr_it, true)) != nullptr) { + rspamd_config_add_symbol_group(cfg, composite_name.data(), + ucl_object_tostring(cur_gr)); + } + + ucl_object_iterate_free(gr_it); + } + + val = ucl_object_lookup(obj, "policy"); + if (val) { + composite->policy = composite_policy_from_str(ucl_object_tostring(val)); + + if (composite->policy == rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_UNKNOWN) { + msg_err_config("composite %s has incorrect policy", composite_name.data()); + return nullptr; + } + } + + return composite.get(); +} + +auto composites_manager::add_composite(std::string_view composite_name, + std::string_view composite_expression, + bool silent_duplicate, double score) -> rspamd_composite * +{ + GError *err = nullptr; + rspamd_expression *expr = nullptr; + + if (composites.contains(composite_name)) { + /* Duplicate composite - refuse to add */ + if (silent_duplicate) { + msg_debug_config("composite %s is redefined", composite_name.data()); + return nullptr; + } + else { + msg_warn_config("composite %s is redefined", composite_name.data()); + } + } + + if (!rspamd_parse_expression(composite_expression.data(), + composite_expression.size(), &composite_expr_subr, + nullptr, cfg->cfg_pool, &err, &expr)) { + msg_err_config("cannot parse composite expression for %s: %e", + composite_name.data(), err); + + if (err) { + g_error_free(err); + } + + return nullptr; + } + + auto final_score = std::isnan(score) ? (std::isnan(cfg->unknown_weight) ? 
0.0 : cfg->unknown_weight) : score; + rspamd_config_add_symbol(cfg, composite_name.data(), final_score, + composite_name.data(), "composite", + 0, + 0, + 1); + + return new_composite(composite_name, expr, composite_expression).get(); +} + +struct map_cbdata { + composites_manager *cm; + struct rspamd_config *cfg; + std::string buf; + + explicit map_cbdata(struct rspamd_config *cfg) + : cfg(cfg) + { + cm = COMPOSITE_MANAGER_FROM_PTR(cfg->composites_manager); + } + + static char *map_read(char *chunk, int len, + struct map_cb_data *data, + gboolean _final) + { + + if (data->cur_data == nullptr) { + data->cur_data = data->prev_data; + reinterpret_cast<map_cbdata *>(data->cur_data)->buf.clear(); + } + + auto *cbd = reinterpret_cast<map_cbdata *>(data->cur_data); + + cbd->buf.append(chunk, len); + return nullptr; + } + + static void + map_fin(struct map_cb_data *data, void **target) + { + auto *cbd = reinterpret_cast<map_cbdata *>(data->cur_data); + + if (data->errored) { + if (cbd) { + cbd->buf.clear(); + } + } + else if (cbd != nullptr) { + if (target) { + *target = data->cur_data; + } + + rspamd::string_foreach_line(cbd->buf, [&](std::string_view line) { + auto [name_and_score, expr] = rspamd::string_split_on(line, ' '); + auto [name, score] = rspamd::string_split_on(name_and_score, ':'); + + if (!score.empty()) { + /* I wish it was supported properly */ + //auto conv_res = std::from_chars(value->data(), value->size(), num); + char numbuf[128], *endptr = nullptr; + rspamd_strlcpy(numbuf, score.data(), MIN(score.size(), sizeof(numbuf))); + auto num = g_ascii_strtod(numbuf, &endptr); + + if (fabs(num) >= G_MAXFLOAT || std::isnan(num)) { + msg_err("invalid score for %*s", (int) name_and_score.size(), name_and_score.data()); + return; + } + + auto ret = cbd->cm->add_composite(name, expr, true, num); + + if (ret == nullptr) { + msg_err("cannot add composite %*s", (int) name_and_score.size(), name_and_score.data()); + return; + } + } + else { + msg_err("missing score for 
%*s", (int) name_and_score.size(), name_and_score.data()); + return; + } + }); + } + else { + msg_err("no data read for composites map"); + } + } + + static void + map_dtor(struct map_cb_data *data) + { + auto *cbd = reinterpret_cast<map_cbdata *>(data->cur_data); + delete cbd; + } +}; +} + + +void * +rspamd_composites_manager_create(struct rspamd_config *cfg) +{ + auto *cm = new rspamd::composites::composites_manager(cfg); + + return reinterpret_cast<void *>(cm); +} + + +gsize rspamd_composites_manager_nelts(void *ptr) +{ + return COMPOSITE_MANAGER_FROM_PTR(ptr)->size(); +} + +void * +rspamd_composites_manager_add_from_ucl(void *cm, const char *sym, const ucl_object_t *obj) +{ + return reinterpret_cast<void *>(COMPOSITE_MANAGER_FROM_PTR(cm)->add_composite(sym, obj, false)); +} + +void * +rspamd_composites_manager_add_from_string(void *cm, const char *sym, const char *expr) +{ + return reinterpret_cast<void *>(COMPOSITE_MANAGER_FROM_PTR(cm)->add_composite(sym, expr, false)); +} + +void * +rspamd_composites_manager_add_from_ucl_silent(void *cm, const char *sym, const ucl_object_t *obj) +{ + return reinterpret_cast<void *>(COMPOSITE_MANAGER_FROM_PTR(cm)->add_composite(sym, obj, true)); +} + +void * +rspamd_composites_manager_add_from_string_silent(void *cm, const char *sym, const char *expr) +{ + return reinterpret_cast<void *>(COMPOSITE_MANAGER_FROM_PTR(cm)->add_composite(sym, expr, true)); +} + + +bool rspamd_composites_add_map_handlers(const ucl_object_t *obj, struct rspamd_config *cfg) +{ + auto **pcbdata = rspamd_mempool_alloc_type(cfg->cfg_pool, rspamd::composites::map_cbdata *); + auto *cbdata = new rspamd::composites::map_cbdata{cfg}; + *pcbdata = cbdata; + + if (struct rspamd_map * m; (m = rspamd_map_add_from_ucl(cfg, obj, "composites map", + rspamd::composites::map_cbdata::map_read, rspamd::composites::map_cbdata::map_fin, + rspamd::composites::map_cbdata::map_dtor, (void **) pcbdata, + nullptr, RSPAMD_MAP_DEFAULT)) == nullptr) { + msg_err_config("cannot 
load composites map from %s", ucl_object_key(obj)); + return false; + } + + return true; +}
\ No newline at end of file |