summary | refs | log | tree | commit | diff | stats
path: root/src/plugins/regexp.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- src/plugins/regexp.c 564
1 files changed, 564 insertions, 0 deletions
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c
new file mode 100644
index 0000000..59a84c5
--- /dev/null
+++ b/src/plugins/regexp.c
@@ -0,0 +1,564 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/***MODULE:regexp
+ * rspamd module that implements different regexp rules
+ */
+
+
+#include "config.h"
+#include "libmime/message.h"
+#include "expression.h"
+#include "mime_expressions.h"
+#include "libserver/maps/map.h"
+#include "lua/lua_common.h"
+
+static const guint64 rspamd_regexp_cb_magic = 0xca9d9649fc3e2659ULL; /* tag written into regexp_module_item.magic for every item allocated in this file */
+
+struct regexp_module_item { /* one configured rule; handed to process_regexp_item() as user_data */
+ guint64 magic; /* set to rspamd_regexp_cb_magic when the item is allocated */
+ struct rspamd_expression *expr; /* parsed expression stored by read_regexp_expression(); stays NULL for lua rules */
+ const gchar *symbol; /* symbol name, taken from the UCL key of the rule */
+ struct ucl_lua_funcdata *lua_function; /* lua callback; when non-NULL it is called instead of evaluating expr */
+};
+
+struct regexp_ctx { /* module context, allocated zeroed in regexp_module_init() */
+ struct module_ctx ctx; /* kept as first member: regexp_module_init() returns this struct cast to struct module_ctx * */
+ gsize max_size; /* value of the "max_size" option; also passed to rspamd_re_cache_set_limit() */
+};
+
+static void process_regexp_item(struct rspamd_task *task, /* forward declaration: registered as the symcache callback in regexp_module_config(), defined later in this file */
+ struct rspamd_symcache_dynamic_item *item,
+ void *user_data); /* user_data is the struct regexp_module_item of the rule */
+
+
+/* Module entry points, referenced by the regexp_module descriptor below */
+gint regexp_module_init(struct rspamd_config *cfg, struct module_ctx **ctx);
+gint regexp_module_config(struct rspamd_config *cfg, bool validate);
+gint regexp_module_reconfig(struct rspamd_config *cfg);
+
+module_t regexp_module = { /* module descriptor; positional initializer, so field meanings follow module_t (declared elsewhere) */
+ "regexp", /* presumably the module name — same string is used for the config section lookup */
+ regexp_module_init,
+ regexp_module_config,
+ regexp_module_reconfig,
+ NULL, /* NOTE(review): slot left empty here — check module_t for its meaning */
+ RSPAMD_MODULE_VER,
+ (guint) -1, /* presumably the initial ctx_offset (regexp_get_context() reads regexp_module.ctx_offset) — confirm against module_t */
+};
+
+
+/*
+ * Return this module's context: the entry of cfg->c_modules found at
+ * regexp_module.ctx_offset, viewed as a struct regexp_ctx.
+ */
+static inline struct regexp_ctx *
+regexp_get_context(struct rspamd_config *cfg)
+{
+	gpointer raw_ctx = g_ptr_array_index(cfg->c_modules,
+		regexp_module.ctx_offset);
+
+	return (struct regexp_ctx *) raw_ctx;
+}
+
+/*
+ * Parse `line` as a mime expression and attach the result to a rule item.
+ *
+ * pool   - memory pool handed to rspamd_parse_expression()
+ * chain  - rule item whose `expr` field receives the parsed expression
+ * symbol - symbol name, used only in the warning message
+ * line   - expression source text
+ * ud     - user data passed through to the mime expression callbacks
+ *
+ * Returns TRUE on success. On failure a warning is logged, the error object
+ * (if any) is released, chain->expr is left untouched and FALSE is returned.
+ */
+static gboolean
+read_regexp_expression(rspamd_mempool_t *pool,
+	struct regexp_module_item *chain,
+	const gchar *symbol,
+	const gchar *line,
+	struct rspamd_mime_expr_ud *ud)
+{
+	struct rspamd_expression *e = NULL;
+	GError *err = NULL;
+
+	if (!rspamd_parse_expression(line, 0, &mime_expr_subr, ud, pool, &err,
+			&e)) {
+		msg_warn_pool("%s = \"%s\" is invalid regexp expression: %e", symbol,
+			line,
+			err);
+		/*
+		 * g_clear_error() copes with a parser failure that left `err` unset;
+		 * g_error_free(NULL) would raise a GLib critical warning instead.
+		 */
+		g_clear_error(&err);
+
+		return FALSE;
+	}
+
+	g_assert(e != NULL);
+	chain->expr = e;
+
+	return TRUE;
+}
+
+
+/*
+ * Module init hook: allocates the zeroed module context from the config
+ * pool, returns it through `ctx` and registers documentation entries for
+ * the "regexp" section and its "max_size" option. Always returns 0.
+ */
+gint regexp_module_init(struct rspamd_config *cfg, struct module_ctx **ctx)
+{
+	struct regexp_ctx *mod_ctx =
+		rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(struct regexp_ctx));
+
+	*ctx = (struct module_ctx *) mod_ctx;
+
+	/* Top-level section first: the option below is documented under it */
+	rspamd_rcl_add_doc_by_path(cfg, NULL,
+		"Regular expressions rules plugin",
+		"regexp", UCL_OBJECT,
+		NULL, 0, NULL, 0);
+
+	rspamd_rcl_add_doc_by_path(cfg, "regexp",
+		"Maximum size of data chunk scanned with any regexp (further data is truncated)",
+		"max_size", UCL_INT,
+		NULL, 0, NULL, 0);
+
+	return 0;
+}
+
+/*
+ * Allocate a zeroed rule item from the config pool and tag it with the
+ * module magic and the given symbol name.
+ */
+static struct regexp_module_item *
+regexp_new_item(struct rspamd_config *cfg, const gchar *symbol)
+{
+	struct regexp_module_item *item;
+
+	item = rspamd_mempool_alloc0(cfg->cfg_pool,
+		sizeof(struct regexp_module_item));
+	item->magic = rspamd_regexp_cb_magic;
+	item->symbol = symbol;
+
+	return item;
+}
+
+/*
+ * Look up the optional attribute `name` of a rule object and type-check it.
+ *
+ * `numeric` selects the accepted type: UCL_INT or UCL_FLOAT when TRUE,
+ * UCL_BOOLEAN when FALSE. On return *found holds the attribute when it is
+ * present and well-typed, NULL otherwise.
+ *
+ * Returns FALSE (after logging an error) only when the attribute is present
+ * but has the wrong type; an absent attribute is not an error.
+ */
+static gboolean
+regexp_lookup_typed_attr(struct rspamd_config *cfg,
+	const ucl_object_t *rule,
+	const gchar *name,
+	gboolean numeric,
+	const gchar *symbol,
+	const ucl_object_t **found)
+{
+	const ucl_object_t *elt = ucl_object_lookup(rule, name);
+	gboolean type_ok;
+
+	*found = NULL;
+
+	if (elt == NULL) {
+		return TRUE;
+	}
+
+	if (numeric) {
+		type_ok = (ucl_object_type(elt) == UCL_FLOAT ||
+			ucl_object_type(elt) == UCL_INT);
+	}
+	else {
+		type_ok = (ucl_object_type(elt) == UCL_BOOLEAN);
+	}
+
+	if (!type_ok) {
+		msg_err_config("%s attribute is not %s for symbol: '%s'",
+			name, numeric ? "numeric" : "boolean", symbol);
+
+		return FALSE;
+	}
+
+	*found = elt;
+
+	return TRUE;
+}
+
+/*
+ * Configure one rule given as a UCL object: a table holding either a
+ * `callback` lua function or a `regexp`/`re` expression string, plus
+ * optional attributes (mime_only, condition, description, group, score,
+ * one_shot, any_shot, nshots, one_param, priority, groups).
+ *
+ * *nre or *nlua is incremented for every rule that is accepted.
+ *
+ * Returns FALSE only when `validate` is set and the rule is malformed.
+ * A rule without a usable callback/expression is logged and skipped.
+ */
+static gboolean
+regexp_config_object_rule(struct rspamd_config *cfg,
+	const ucl_object_t *value,
+	bool validate,
+	gint *nre,
+	gint *nlua)
+{
+	struct regexp_module_item *cur_item;
+	const ucl_object_t *elt;
+	const gchar *symbol = ucl_object_key(value);
+	const gchar *description = NULL, *group = NULL;
+	gdouble score = 0.0;
+	guint flags = SYMBOL_TYPE_NORMAL, priority = 0;
+	/* Per-rule default: shot settings of one rule must not leak into the next */
+	gint nshots = cfg->default_max_shots;
+
+	elt = ucl_object_lookup(value, "callback");
+
+	if (elt != NULL && elt->type == UCL_USERDATA) {
+		cur_item = regexp_new_item(cfg, symbol);
+		/* The closure is stored in the `callback` element, not in the table */
+		cur_item->lua_function = ucl_object_toclosure(elt);
+		(*nlua)++;
+	}
+	else {
+		struct rspamd_mime_expr_ud ud;
+
+		/* Try plain regexp expression */
+		elt = ucl_object_lookup_any(value, "regexp", "re", NULL);
+
+		if (elt == NULL || ucl_object_type(elt) != UCL_STRING) {
+			msg_err_config(
+				"no callback/expression defined for regexp symbol: "
+				"%s",
+				symbol);
+
+			/* Nothing to register for this rule */
+			return TRUE;
+		}
+
+		cur_item = regexp_new_item(cfg, symbol);
+		ud.cfg = cfg;
+		ud.conf_obj = value;
+
+		if (!read_regexp_expression(cfg->cfg_pool,
+				cur_item, symbol,
+				ucl_obj_tostring(elt), &ud)) {
+			/* A broken expression is never registered; fatal only on validate */
+			return validate ? FALSE : TRUE;
+		}
+
+		(*nre)++;
+	}
+
+	if (!regexp_lookup_typed_attr(cfg, value, "mime_only", FALSE, symbol,
+			&elt) && validate) {
+		return FALSE;
+	}
+
+	if (elt != NULL && ucl_object_toboolean(elt)) {
+		flags |= SYMBOL_TYPE_MIME_ONLY;
+	}
+
+	rspamd_symcache_add_symbol(cfg->cache,
+		cur_item->symbol,
+		0,
+		process_regexp_item,
+		cur_item,
+		flags, -1);
+
+	/* Reset flags: the value built below goes to rspamd_config_add_symbol() */
+	flags = 0;
+
+	elt = ucl_object_lookup(value, "condition");
+
+	if (elt != NULL && ucl_object_type(elt) == UCL_USERDATA) {
+		struct ucl_lua_funcdata *conddata;
+
+		g_assert(cur_item->symbol != NULL);
+		conddata = ucl_object_toclosure(elt);
+		rspamd_symcache_add_condition_delayed(cfg->cache,
+			cur_item->symbol,
+			conddata->L, conddata->idx);
+	}
+
+	elt = ucl_object_lookup(value, "description");
+
+	if (elt) {
+		description = ucl_object_tostring(elt);
+	}
+
+	elt = ucl_object_lookup(value, "group");
+
+	if (elt) {
+		group = ucl_object_tostring(elt);
+	}
+
+	if (!regexp_lookup_typed_attr(cfg, value, "score", TRUE, symbol,
+			&elt) && validate) {
+		return FALSE;
+	}
+
+	if (elt != NULL) {
+		score = ucl_object_todouble(elt);
+	}
+
+	if (!regexp_lookup_typed_attr(cfg, value, "one_shot", FALSE, symbol,
+			&elt) && validate) {
+		return FALSE;
+	}
+
+	if (elt != NULL && ucl_object_toboolean(elt)) {
+		nshots = 1;
+	}
+
+	if (!regexp_lookup_typed_attr(cfg, value, "any_shot", FALSE, symbol,
+			&elt) && validate) {
+		return FALSE;
+	}
+
+	if (elt != NULL && ucl_object_toboolean(elt)) {
+		nshots = -1;
+	}
+
+	/* An explicit nshots value overrides one_shot/any_shot */
+	if (!regexp_lookup_typed_attr(cfg, value, "nshots", TRUE, symbol,
+			&elt) && validate) {
+		return FALSE;
+	}
+
+	if (elt != NULL) {
+		nshots = ucl_object_toint(elt);
+	}
+
+	if (!regexp_lookup_typed_attr(cfg, value, "one_param", FALSE, symbol,
+			&elt) && validate) {
+		return FALSE;
+	}
+
+	if (elt != NULL && ucl_object_toboolean(elt)) {
+		flags |= RSPAMD_SYMBOL_FLAG_ONEPARAM;
+	}
+
+	if (!regexp_lookup_typed_attr(cfg, value, "priority", TRUE, symbol,
+			&elt) && validate) {
+		return FALSE;
+	}
+
+	if (elt != NULL) {
+		priority = ucl_object_toint(elt);
+	}
+
+	rspamd_config_add_symbol(cfg, cur_item->symbol,
+		score, description, group, flags, priority, nshots);
+
+	elt = ucl_object_lookup(value, "groups");
+
+	if (elt) {
+		ucl_object_iter_t gr_it;
+		const ucl_object_t *cur_gr;
+
+		gr_it = ucl_object_iterate_new(elt);
+
+		while ((cur_gr = ucl_object_iterate_safe(gr_it, true)) != NULL) {
+			rspamd_config_add_symbol_group(cfg, cur_item->symbol,
+				ucl_object_tostring(cur_gr));
+		}
+
+		ucl_object_iterate_free(gr_it);
+	}
+
+	return TRUE;
+}
+
+/*
+ * Module config hook: reads the "regexp" section of the configuration and
+ * registers one symbol per rule.
+ *
+ * Recognised keys:
+ *   max_size    - integer, stored in the module context and passed to
+ *                 rspamd_re_cache_set_limit()
+ *   max_threads - accepted but ignored (a warning is logged)
+ * Every other key is a symbol name whose value is either an expression
+ * string, a lua function, or a table (see regexp_config_object_rule()).
+ *
+ * cfg      - configuration being loaded
+ * validate - when true, the first malformed rule aborts with FALSE;
+ *            when false, malformed rules are logged and skipped
+ *
+ * Returns TRUE on success (also when the module is disabled or has no
+ * section), FALSE on a validation failure.
+ */
+gint regexp_module_config(struct rspamd_config *cfg, bool validate)
+{
+	struct regexp_ctx *regexp_module_ctx = regexp_get_context(cfg);
+	const ucl_object_t *sec, *value;
+	ucl_object_iter_t it = NULL;
+	gint nre = 0, nlua = 0;
+
+	if (!rspamd_config_is_module_enabled(cfg, "regexp")) {
+		return TRUE;
+	}
+
+	sec = ucl_object_lookup(cfg->cfg_ucl_obj, "regexp");
+	if (sec == NULL) {
+		msg_err_config("regexp module enabled, but no rules are defined");
+		return TRUE;
+	}
+
+	regexp_module_ctx->max_size = 0;
+
+	while ((value = ucl_object_iterate(sec, &it, true)) != NULL) {
+		const gchar *key = ucl_object_key(value);
+
+		/*
+		 * Option names are matched in full: a prefix match would swallow
+		 * symbols such as MAX_SIZE_FOO as if they were the option itself.
+		 */
+		if (g_ascii_strcasecmp(key, "max_size") == 0) {
+			regexp_module_ctx->max_size = ucl_obj_toint(value);
+			rspamd_re_cache_set_limit(cfg->re_cache, regexp_module_ctx->max_size);
+		}
+		else if (g_ascii_strcasecmp(key, "max_threads") == 0) {
+			msg_warn_config("regexp module is now single threaded, max_threads is ignored");
+		}
+		else if (value->type == UCL_STRING) {
+			/* symbol = "expression" */
+			struct rspamd_mime_expr_ud ud;
+			struct regexp_module_item *item = regexp_new_item(cfg, key);
+
+			ud.conf_obj = NULL;
+			ud.cfg = cfg;
+
+			if (read_regexp_expression(cfg->cfg_pool,
+					item, key,
+					ucl_obj_tostring(value), &ud)) {
+				rspamd_symcache_add_symbol(cfg->cache,
+					item->symbol,
+					0,
+					process_regexp_item,
+					item,
+					SYMBOL_TYPE_NORMAL, -1);
+				nre++;
+			}
+			else if (validate) {
+				return FALSE;
+			}
+		}
+		else if (value->type == UCL_USERDATA) {
+			/* Just a lua function */
+			struct regexp_module_item *item = regexp_new_item(cfg, key);
+
+			item->lua_function = ucl_object_toclosure(value);
+
+			rspamd_symcache_add_symbol(cfg->cache,
+				item->symbol,
+				0,
+				process_regexp_item,
+				item,
+				SYMBOL_TYPE_NORMAL, -1);
+			nlua++;
+		}
+		else if (value->type == UCL_OBJECT) {
+			/* We have some lua table, extract its arguments */
+			if (!regexp_config_object_rule(cfg, value, validate, &nre, &nlua)) {
+				return FALSE;
+			}
+		}
+		else {
+			msg_warn_config("unknown type of attribute %s for regexp module",
+				key);
+		}
+	}
+
+	msg_info_config("init internal regexp module, %d regexp rules and %d "
+					"lua rules are loaded",
+		nre, nlua);
+
+	return TRUE;
+}
+
+/*
+ * Module reconfig hook: re-runs regexp_module_config() with validation
+ * turned off and returns its result.
+ */
+gint regexp_module_reconfig(struct rspamd_config *cfg)
+{
+	const bool validate = false;
+
+	return regexp_module_config(cfg, validate);
+}
+
+/*
+ * Call a lua rule function as f(task, args...) and read its single result.
+ *
+ * lua_data - lua state plus registry index of the function to call
+ * task     - task pushed as the first argument (as rspamd{task} userdata)
+ * args     - optional array of struct expression_argument; only NORMAL
+ *            (pushed as string) and BOOL arguments are supported
+ * res      - receives the result: a number as is, a boolean as 0/1, and
+ *            FALSE for any other return type (an info message is logged)
+ * symbol   - symbol name, used only in log messages
+ *
+ * Returns TRUE when the function was called successfully, FALSE when an
+ * argument could not be passed or the call raised an error; *res is not
+ * written in the FALSE case. The lua stack is restored to its entry height
+ * on every path.
+ */
+static gboolean
+rspamd_lua_call_expression_func(struct ucl_lua_funcdata *lua_data,
+	struct rspamd_task *task,
+	GArray *args, gdouble *res,
+	const gchar *symbol)
+{
+	lua_State *L = lua_data->L;
+	struct rspamd_task **ptask;
+	struct expression_argument *arg;
+	gint i, nargs = 0;
+	/* Remember the stack height so every exit path can unwind to it */
+	gint saved_top = lua_gettop(L);
+
+	lua_rawgeti(L, LUA_REGISTRYINDEX, lua_data->idx);
+	/* Now we got function in top of stack */
+	ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
+	rspamd_lua_setclass(L, "rspamd{task}", -1);
+	*ptask = task;
+
+	/* Now push all arguments */
+	if (args) {
+		for (i = 0; i < (gint) args->len; i++) {
+			arg = &g_array_index(args, struct expression_argument, i);
+
+			switch (arg->type) {
+			case EXPRESSION_ARGUMENT_NORMAL:
+				lua_pushstring(L, (const gchar *) arg->data);
+				break;
+			case EXPRESSION_ARGUMENT_BOOL:
+				lua_pushboolean(L, (gboolean) GPOINTER_TO_SIZE(arg->data));
+				break;
+			default:
+				msg_err_task("%s: cannot pass custom params to lua function",
+					symbol);
+				/* Drop the function, the task and the arguments pushed so far */
+				lua_settop(L, saved_top);
+
+				return FALSE;
+			}
+		}
+		nargs = args->len;
+	}
+
+	if (lua_pcall(L, nargs + 1, 1, 0) != 0) {
+		msg_info_task("%s: call to lua function failed: %s", symbol,
+			lua_tostring(L, -1));
+		/* Removes the error object left by lua_pcall */
+		lua_settop(L, saved_top);
+
+		return FALSE;
+	}
+
+	if (lua_type(L, -1) == LUA_TNUMBER) {
+		*res = lua_tonumber(L, -1);
+	}
+	else if (lua_type(L, -1) == LUA_TBOOLEAN) {
+		*res = lua_toboolean(L, -1);
+	}
+	else {
+		msg_info_task("%s: lua function must return a boolean", symbol);
+		*res = FALSE;
+	}
+
+	/* Removes the single result value */
+	lua_settop(L, saved_top);
+
+	return TRUE;
+}
+
+
+/*
+ * Symcache callback for a single rule. `user_data` is the rule's
+ * struct regexp_module_item. A lua rule is called through
+ * rspamd_lua_call_expression_func(); otherwise the parsed expression is
+ * evaluated. A non-zero outcome is inserted as the symbol's result, and the
+ * symcache item is finalized in every case.
+ */
+static void
+process_regexp_item(struct rspamd_task *task,
+	struct rspamd_symcache_dynamic_item *symcache_item,
+	void *user_data)
+{
+	struct regexp_module_item *rule = user_data;
+	gdouble weight = FALSE;
+
+	/* Non-threaded version */
+	if (rule->lua_function != NULL) {
+		/* Lua rule: the function's return value becomes the weight */
+		gboolean called = rspamd_lua_call_expression_func(rule->lua_function,
+			task, NULL, &weight, rule->symbol);
+
+		if (!called) {
+			msg_err_task("error occurred when checking symbol %s",
+				rule->symbol);
+		}
+	}
+	else if (rule->expr != NULL) {
+		/* Expression rule */
+		weight = rspamd_process_expression(rule->expr, 0, task);
+	}
+	else {
+		/* Neither callback nor expression is attached to this item */
+		msg_warn_task("FIXME: %s symbol is broken with new expressions",
+			rule->symbol);
+	}
+
+	if (weight != 0) {
+		rspamd_task_insert_result(task, rule->symbol, weight, NULL);
+	}
+
+	rspamd_symcache_finalize_item(task, symcache_item);
+}