diff options
Diffstat (limited to 'src/libserver/maps/map_helpers.c')
-rw-r--r-- | src/libserver/maps/map_helpers.c | 1845 |
1 files changed, 1845 insertions, 0 deletions
diff --git a/src/libserver/maps/map_helpers.c b/src/libserver/maps/map_helpers.c new file mode 100644 index 0000000..65478c5 --- /dev/null +++ b/src/libserver/maps/map_helpers.c @@ -0,0 +1,1845 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "map_helpers.h" +#include "map_private.h" +#include "khash.h" +#include "radix.h" +#include "rspamd.h" +#include "cryptobox.h" +#include "mempool_vars_internal.h" +#include "contrib/fastutf8/fastutf8.h" +#include "contrib/cdb/cdb.h" + +#ifdef WITH_HYPERSCAN +#include "hs.h" +#include "hyperscan_tools.h" +#endif +#ifndef WITH_PCRE2 +#include <pcre.h> +#else +#include <pcre2.h> +#endif + + +static const guint64 map_hash_seed = 0xdeadbabeULL; +static const gchar *const hash_fill = "1"; + +struct rspamd_map_helper_value { + gsize hits; + gconstpointer key; + gchar value[]; /* Null terminated */ +}; + +#define rspamd_map_ftok_hash(t) (rspamd_icase_hash((t).begin, (t).len, rspamd_hash_seed())) +#define rspamd_map_ftok_equal(a, b) ((a).len == (b).len && rspamd_lc_cmp((a).begin, (b).begin, (a).len) == 0) + +KHASH_INIT(rspamd_map_hash, rspamd_ftok_t, + struct rspamd_map_helper_value *, true, + rspamd_map_ftok_hash, rspamd_map_ftok_equal); + +struct rspamd_radix_map_helper { + rspamd_mempool_t *pool; + khash_t(rspamd_map_hash) * htb; + radix_compressed_t *trie; + struct rspamd_map *map; + rspamd_cryptobox_fast_hash_state_t hst; +}; + +struct rspamd_hash_map_helper { + 
rspamd_mempool_t *pool; + khash_t(rspamd_map_hash) * htb; + struct rspamd_map *map; + rspamd_cryptobox_fast_hash_state_t hst; +}; + +struct rspamd_cdb_map_helper { + GQueue cdbs; + struct rspamd_map *map; + rspamd_cryptobox_fast_hash_state_t hst; + gsize total_size; +}; + +struct rspamd_regexp_map_helper { + rspamd_cryptobox_hash_state_t hst; + guchar re_digest[rspamd_cryptobox_HASHBYTES]; + rspamd_mempool_t *pool; + struct rspamd_map *map; + GPtrArray *regexps; + GPtrArray *values; + khash_t(rspamd_map_hash) * htb; + enum rspamd_regexp_map_flags map_flags; +#ifdef WITH_HYPERSCAN + rspamd_hyperscan_t *hs_db; + hs_scratch_t *hs_scratch; + gchar **patterns; + gint *flags; + gint *ids; +#endif +}; + +/** + * FSM for parsing lists + */ + +#define MAP_STORE_KEY \ + do { \ + while (g_ascii_isspace(*c) && p > c) { c++; } \ + key = g_malloc(p - c + 1); \ + rspamd_strlcpy(key, c, p - c + 1); \ + stripped_key = g_strstrip(key); \ + } while (0) + +#define MAP_STORE_VALUE \ + do { \ + while (g_ascii_isspace(*c) && p > c) { c++; } \ + value = g_malloc(p - c + 1); \ + rspamd_strlcpy(value, c, p - c + 1); \ + stripped_value = g_strstrip(value); \ + } while (0) + +gchar * +rspamd_parse_kv_list( + gchar *chunk, + gint len, + struct map_cb_data *data, + rspamd_map_insert_func func, + const gchar *default_value, + gboolean final) +{ + enum { + map_skip_spaces_before_key = 0, + map_read_key, + map_read_key_quoted, + map_read_key_slashed, + map_skip_spaces_after_key, + map_backslash_quoted, + map_backslash_slashed, + map_read_key_after_slash, + map_read_value, + map_read_comment_start, + map_skip_comment, + map_read_eol, + }; + + gchar *c, *p, *key = NULL, *value = NULL, *stripped_key, *stripped_value, *end; + struct rspamd_map *map = data->map; + guint line_number = 0; + + p = chunk; + c = p; + end = p + len; + + while (p < end) { + switch (data->state) { + case map_skip_spaces_before_key: + if (g_ascii_isspace(*p)) { + p++; + } + else { + if (*p == '"') { + p++; + c = p; + 
data->state = map_read_key_quoted; + } + else if (*p == '/') { + /* Note that c is on '/' here as '/' is a part of key */ + c = p; + p++; + data->state = map_read_key_slashed; + } + else { + c = p; + data->state = map_read_key; + } + } + break; + case map_read_key: + /* read key */ + /* Check here comments, eol and end of buffer */ + if (*p == '#' && (p == c || *(p - 1) != '\\')) { + if (p - c > 0) { + /* Store a single key */ + MAP_STORE_KEY; + func(data->cur_data, stripped_key, default_value); + msg_debug_map("insert key only pair: %s -> %s; line: %d", + stripped_key, default_value, line_number); + g_free(key); + } + + key = NULL; + data->state = map_read_comment_start; + } + else if (*p == '\r' || *p == '\n') { + if (p - c > 0) { + /* Store a single key */ + MAP_STORE_KEY; + func(data->cur_data, stripped_key, default_value); + msg_debug_map("insert key only pair: %s -> %s; line: %d", + stripped_key, default_value, line_number); + g_free(key); + } + + data->state = map_read_eol; + key = NULL; + } + else if (g_ascii_isspace(*p)) { + if (p - c > 0) { + MAP_STORE_KEY; + data->state = map_skip_spaces_after_key; + } + else { + msg_err_map("empty or invalid key found on line %d", line_number); + data->state = map_skip_comment; + } + } + else { + p++; + } + break; + case map_read_key_quoted: + if (*p == '\\') { + data->state = map_backslash_quoted; + p++; + } + else if (*p == '"') { + /* Allow empty keys in this case */ + if (p - c >= 0) { + MAP_STORE_KEY; + data->state = map_skip_spaces_after_key; + } + else { + g_assert_not_reached(); + } + p++; + } + else { + p++; + } + break; + case map_read_key_slashed: + if (*p == '\\') { + data->state = map_backslash_slashed; + p++; + } + else if (*p == '/') { + /* Allow empty keys in this case */ + if (p - c >= 0) { + data->state = map_read_key_after_slash; + } + else { + g_assert_not_reached(); + } + } + else { + p++; + } + break; + case map_read_key_after_slash: + /* + * This state is equal to reading of key but '/' is not + * 
treated specially + */ + if (*p == '#') { + if (p - c > 0) { + /* Store a single key */ + MAP_STORE_KEY; + func(data->cur_data, stripped_key, default_value); + msg_debug_map("insert key only pair: %s -> %s; line: %d", + stripped_key, default_value, line_number); + g_free(key); + key = NULL; + } + + data->state = map_read_comment_start; + } + else if (*p == '\r' || *p == '\n') { + if (p - c > 0) { + /* Store a single key */ + MAP_STORE_KEY; + func(data->cur_data, stripped_key, default_value); + + msg_debug_map("insert key only pair: %s -> %s; line: %d", + stripped_key, default_value, line_number); + g_free(key); + key = NULL; + } + + data->state = map_read_eol; + key = NULL; + } + else if (g_ascii_isspace(*p)) { + if (p - c > 0) { + MAP_STORE_KEY; + data->state = map_skip_spaces_after_key; + } + else { + msg_err_map("empty or invalid key found on line %d", line_number); + data->state = map_skip_comment; + } + } + else { + p++; + } + break; + case map_backslash_quoted: + p++; + data->state = map_read_key_quoted; + break; + case map_backslash_slashed: + p++; + data->state = map_read_key_slashed; + break; + case map_skip_spaces_after_key: + if (*p == ' ' || *p == '\t') { + p++; + } + else { + c = p; + data->state = map_read_value; + } + break; + case map_read_value: + if (key == NULL) { + /* Ignore line */ + msg_err_map("empty or invalid key found on line %d", line_number); + data->state = map_skip_comment; + } + else { + if (*p == '#') { + if (p - c > 0) { + /* Store a single key */ + MAP_STORE_VALUE; + func(data->cur_data, stripped_key, stripped_value); + msg_debug_map("insert key value pair: %s -> %s; line: %d", + stripped_key, stripped_value, line_number); + g_free(key); + g_free(value); + key = NULL; + value = NULL; + } + else { + func(data->cur_data, stripped_key, default_value); + msg_debug_map("insert key only pair: %s -> %s; line: %d", + stripped_key, default_value, line_number); + g_free(key); + key = NULL; + } + + data->state = map_read_comment_start; + } + 
else if (*p == '\r' || *p == '\n') { + if (p - c > 0) { + /* Store a single key */ + MAP_STORE_VALUE; + func(data->cur_data, stripped_key, stripped_value); + msg_debug_map("insert key value pair: %s -> %s", + stripped_key, stripped_value); + g_free(key); + g_free(value); + key = NULL; + value = NULL; + } + else { + func(data->cur_data, stripped_key, default_value); + msg_debug_map("insert key only pair: %s -> %s", + stripped_key, default_value); + g_free(key); + key = NULL; + } + + data->state = map_read_eol; + key = NULL; + } + else { + p++; + } + } + break; + case map_read_comment_start: + if (*p == '#') { + data->state = map_skip_comment; + p++; + key = NULL; + value = NULL; + } + else { + g_assert_not_reached(); + } + break; + case map_skip_comment: + if (*p == '\r' || *p == '\n') { + data->state = map_read_eol; + } + else { + p++; + } + break; + case map_read_eol: + /* Skip \r\n and whitespaces */ + if (*p == '\r' || *p == '\n') { + if (*p == '\n') { + /* We don't care about \r only line separators, they are too rare */ + line_number++; + } + p++; + } + else { + data->state = map_skip_spaces_before_key; + } + break; + default: + g_assert_not_reached(); + break; + } + } + + if (final) { + /* Examine the state */ + switch (data->state) { + case map_read_key: + case map_read_key_slashed: + case map_read_key_quoted: + case map_read_key_after_slash: + if (p - c > 0) { + /* Store a single key */ + MAP_STORE_KEY; + func(data->cur_data, stripped_key, default_value); + msg_debug_map("insert key only pair: %s -> %s", + stripped_key, default_value); + g_free(key); + key = NULL; + } + break; + case map_read_value: + if (key == NULL) { + /* Ignore line */ + msg_err_map("empty or invalid key found on line %d", line_number); + data->state = map_skip_comment; + } + else { + if (p - c > 0) { + /* Store a single key */ + MAP_STORE_VALUE; + func(data->cur_data, stripped_key, stripped_value); + msg_debug_map("insert key value pair: %s -> %s", + stripped_key, stripped_value); + 
g_free(key); + g_free(value); + key = NULL; + value = NULL; + } + else { + func(data->cur_data, stripped_key, default_value); + msg_debug_map("insert key only pair: %s -> %s", + stripped_key, default_value); + g_free(key); + key = NULL; + } + } + break; + } + + data->state = map_skip_spaces_before_key; + } + + return c; +} + +/** + * Radix tree helper function + */ +void rspamd_map_helper_insert_radix(gpointer st, gconstpointer key, gconstpointer value) +{ + struct rspamd_radix_map_helper *r = (struct rspamd_radix_map_helper *) st; + struct rspamd_map_helper_value *val; + gsize vlen; + khiter_t k; + gconstpointer nk; + rspamd_ftok_t tok; + gint res; + struct rspamd_map *map; + + map = r->map; + tok.begin = key; + tok.len = strlen(key); + + k = kh_get(rspamd_map_hash, r->htb, tok); + + if (k == kh_end(r->htb)) { + nk = rspamd_mempool_strdup(r->pool, key); + tok.begin = nk; + k = kh_put(rspamd_map_hash, r->htb, tok, &res); + } + else { + val = kh_value(r->htb, k); + + if (strcmp(value, val->value) == 0) { + /* Same element, skip */ + return; + } + else { + msg_warn_map("duplicate radix entry found for map %s: %s (old value: '%s', new: '%s')", + map->name, key, val->value, value); + } + + nk = kh_key(r->htb, k).begin; + val->key = nk; + kh_value(r->htb, k) = val; + + return; /* do not touch radix in case of exact duplicate */ + } + + vlen = strlen(value); + val = rspamd_mempool_alloc0(r->pool, sizeof(*val) + + vlen + 1); + memcpy(val->value, value, vlen); + + nk = kh_key(r->htb, k).begin; + val->key = nk; + kh_value(r->htb, k) = val; + rspamd_radix_add_iplist(key, ",", r->trie, val, FALSE, + r->map->name); + rspamd_cryptobox_fast_hash_update(&r->hst, nk, tok.len); +} + +void rspamd_map_helper_insert_radix_resolve(gpointer st, gconstpointer key, gconstpointer value) +{ + struct rspamd_radix_map_helper *r = (struct rspamd_radix_map_helper *) st; + struct rspamd_map_helper_value *val; + gsize vlen; + khiter_t k; + gconstpointer nk; + rspamd_ftok_t tok; + gint res; + 
struct rspamd_map *map; + + map = r->map; + + if (!key) { + msg_warn_map("cannot insert NULL value in the map: %s", + map->name); + return; + } + + tok.begin = key; + tok.len = strlen(key); + + k = kh_get(rspamd_map_hash, r->htb, tok); + + if (k == kh_end(r->htb)) { + nk = rspamd_mempool_strdup(r->pool, key); + tok.begin = nk; + k = kh_put(rspamd_map_hash, r->htb, tok, &res); + } + else { + val = kh_value(r->htb, k); + + if (strcmp(value, val->value) == 0) { + /* Same element, skip */ + return; + } + else { + msg_warn_map("duplicate radix entry found for map %s: %s (old value: '%s', new: '%s')", + map->name, key, val->value, value); + } + + nk = kh_key(r->htb, k).begin; + val->key = nk; + kh_value(r->htb, k) = val; + + return; /* do not touch radix in case of exact duplicate */ + } + + vlen = strlen(value); + val = rspamd_mempool_alloc0(r->pool, sizeof(*val) + + vlen + 1); + memcpy(val->value, value, vlen); + nk = kh_key(r->htb, k).begin; + val->key = nk; + kh_value(r->htb, k) = val; + rspamd_radix_add_iplist(key, ",", r->trie, val, TRUE, + r->map->name); + rspamd_cryptobox_fast_hash_update(&r->hst, nk, tok.len); +} + +void rspamd_map_helper_insert_hash(gpointer st, gconstpointer key, gconstpointer value) +{ + struct rspamd_hash_map_helper *ht = st; + struct rspamd_map_helper_value *val; + khiter_t k; + gconstpointer nk; + gsize vlen; + gint r; + rspamd_ftok_t tok; + struct rspamd_map *map; + + tok.begin = key; + tok.len = strlen(key); + map = ht->map; + + k = kh_get(rspamd_map_hash, ht->htb, tok); + + if (k == kh_end(ht->htb)) { + nk = rspamd_mempool_strdup(ht->pool, key); + tok.begin = nk; + k = kh_put(rspamd_map_hash, ht->htb, tok, &r); + } + else { + val = kh_value(ht->htb, k); + + if (strcmp(value, val->value) == 0) { + /* Same element, skip */ + return; + } + else { + msg_warn_map("duplicate hash entry found for map %s: %s (old value: '%s', new: '%s')", + map->name, key, val->value, value); + } + } + + /* Null termination due to alloc0 */ + vlen = 
strlen(value); + val = rspamd_mempool_alloc0(ht->pool, sizeof(*val) + vlen + 1); + memcpy(val->value, value, vlen); + + tok = kh_key(ht->htb, k); + nk = tok.begin; + val->key = nk; + kh_value(ht->htb, k) = val; + + rspamd_cryptobox_fast_hash_update(&ht->hst, nk, tok.len); +} + +/** + * Regexp map helper function: compiles `key` as a regular expression (glob + * escaped first when RSPAMD_REGEXP_MAP_FLAG_GLOB is set), stores a copy of + * `value` in the helper pool and appends the pair to the regexps/values arrays. + * A key that is already present only produces a warning. A key that cannot be + * compiled is logged and dropped from the hash table again. + */ +void rspamd_map_helper_insert_re(gpointer st, gconstpointer key, gconstpointer value) +{ + struct rspamd_regexp_map_helper *re_map = st; + struct rspamd_map *map; + rspamd_regexp_t *re; + gchar *escaped; + GError *err = NULL; + gint pcre_flags; + gsize escaped_len; + struct rspamd_map_helper_value *val; + khiter_t k; + rspamd_ftok_t tok; + gconstpointer nk; + gsize vlen; + gint r; + + map = re_map->map; + + tok.begin = key; + tok.len = strlen(key); + + k = kh_get(rspamd_map_hash, re_map->htb, tok); + + if (k == kh_end(re_map->htb)) { + nk = rspamd_mempool_strdup(re_map->pool, key); + tok.begin = nk; + k = kh_put(rspamd_map_hash, re_map->htb, tok, &r); + } + else { + val = kh_value(re_map->htb, k); + + /* Always warn about regexp duplicate as it's likely a bad mistake */ + msg_warn_map("duplicate re entry found for map %s: %s (old value: '%s', new: '%s')", + map->name, key, val->value, value); + + if (strcmp(val->value, value) == 0) { + /* Same value, skip */ + return; + } + + /* Replace value but do not touch regexp */ + nk = kh_key(re_map->htb, k).begin; + val->key = nk; + kh_value(re_map->htb, k) = val; + + return; + } + + /* Check regexp stuff */ + if (re_map->map_flags & RSPAMD_REGEXP_MAP_FLAG_GLOB) { + escaped = rspamd_str_regexp_escape(key, strlen(key), &escaped_len, + RSPAMD_REGEXP_ESCAPE_GLOB | RSPAMD_REGEXP_ESCAPE_UTF); + re = rspamd_regexp_new(escaped, NULL, &err); + g_free(escaped); + } + else { + re = rspamd_regexp_new(key, NULL, &err); + } + + if (re == NULL) { + msg_err_map("cannot parse regexp %s: %e", key, err); + + if (err) { + g_error_free(err); + } + + /* + * The key has already been put into the hash table above, but no value + * was attached to it. Remove the slot again, otherwise traversal and + * duplicate checks would dereference an uninitialised value pointer. + */ + kh_del(rspamd_map_hash, re_map->htb, k); + + return; + } + + vlen = strlen(value); + val = rspamd_mempool_alloc0(re_map->pool, sizeof(*val) + + vlen + 1); +
memcpy(val->value, value, vlen); /* Null terminated due to alloc0 previously */ + nk = kh_key(re_map->htb, k).begin; + val->key = nk; + kh_value(re_map->htb, k) = val; + rspamd_cryptobox_hash_update(&re_map->hst, nk, tok.len); + + pcre_flags = rspamd_regexp_get_pcre_flags(re); + +#ifndef WITH_PCRE2 + if (pcre_flags & PCRE_FLAG(UTF8)) { + re_map->map_flags |= RSPAMD_REGEXP_MAP_FLAG_UTF; + } +#else + if (pcre_flags & PCRE_FLAG(UTF)) { + re_map->map_flags |= RSPAMD_REGEXP_MAP_FLAG_UTF; + } +#endif + + g_ptr_array_add(re_map->regexps, re); + g_ptr_array_add(re_map->values, val); +} + +static void +rspamd_map_helper_traverse_regexp(void *data, + rspamd_map_traverse_cb cb, + gpointer cbdata, + gboolean reset_hits) +{ + rspamd_ftok_t tok; + struct rspamd_map_helper_value *val; + struct rspamd_regexp_map_helper *re_map = data; + + kh_foreach(re_map->htb, tok, val, { + if (!cb(tok.begin, val->value, val->hits, cbdata)) { + break; + } + + if (reset_hits) { + val->hits = 0; + } + }); +} + +struct rspamd_hash_map_helper * +rspamd_map_helper_new_hash(struct rspamd_map *map) +{ + struct rspamd_hash_map_helper *htb; + rspamd_mempool_t *pool; + + if (map) { + pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + map->tag, 0); + } + else { + pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + NULL, 0); + } + + htb = rspamd_mempool_alloc0_type(pool, struct rspamd_hash_map_helper); + htb->htb = kh_init(rspamd_map_hash); + htb->pool = pool; + htb->map = map; + rspamd_cryptobox_fast_hash_init(&htb->hst, map_hash_seed); + + return htb; +} + +void rspamd_map_helper_destroy_hash(struct rspamd_hash_map_helper *r) +{ + if (r == NULL || r->pool == NULL) { + return; + } + + rspamd_mempool_t *pool = r->pool; + kh_destroy(rspamd_map_hash, r->htb); + memset(r, 0, sizeof(*r)); + rspamd_mempool_delete(pool); +} + +static void +rspamd_map_helper_traverse_hash(void *data, + rspamd_map_traverse_cb cb, + gpointer cbdata, + gboolean reset_hits) +{ + rspamd_ftok_t tok; + struct 
rspamd_map_helper_value *val; + struct rspamd_hash_map_helper *ht = data; + + kh_foreach(ht->htb, tok, val, { + if (!cb(tok.begin, val->value, val->hits, cbdata)) { + break; + } + + if (reset_hits) { + val->hits = 0; + } + }); +} + +struct rspamd_radix_map_helper * +rspamd_map_helper_new_radix(struct rspamd_map *map) +{ + struct rspamd_radix_map_helper *r; + rspamd_mempool_t *pool; + const gchar *name = "unnamed"; + + if (map) { + pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + map->tag, 0); + name = map->name; + } + else { + pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + NULL, 0); + } + + r = rspamd_mempool_alloc0_type(pool, struct rspamd_radix_map_helper); + r->trie = radix_create_compressed_with_pool(pool, name); + r->htb = kh_init(rspamd_map_hash); + r->pool = pool; + r->map = map; + rspamd_cryptobox_fast_hash_init(&r->hst, map_hash_seed); + + return r; +} + +void rspamd_map_helper_destroy_radix(struct rspamd_radix_map_helper *r) +{ + if (r == NULL || !r->pool) { + return; + } + + kh_destroy(rspamd_map_hash, r->htb); + rspamd_mempool_t *pool = r->pool; + memset(r, 0, sizeof(*r)); + rspamd_mempool_delete(pool); +} + +static void +rspamd_map_helper_traverse_radix(void *data, + rspamd_map_traverse_cb cb, + gpointer cbdata, + gboolean reset_hits) +{ + rspamd_ftok_t tok; + struct rspamd_map_helper_value *val; + struct rspamd_radix_map_helper *r = data; + + kh_foreach(r->htb, tok, val, { + if (!cb(tok.begin, val->value, val->hits, cbdata)) { + break; + } + + if (reset_hits) { + val->hits = 0; + } + }); +} + +struct rspamd_regexp_map_helper * +rspamd_map_helper_new_regexp(struct rspamd_map *map, + enum rspamd_regexp_map_flags flags) +{ + struct rspamd_regexp_map_helper *re_map; + rspamd_mempool_t *pool; + + pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + map->tag, 0); + + re_map = rspamd_mempool_alloc0_type(pool, struct rspamd_regexp_map_helper); + re_map->pool = pool; + re_map->values = g_ptr_array_new(); + re_map->regexps = 
g_ptr_array_new(); + re_map->map = map; + re_map->map_flags = flags; + re_map->htb = kh_init(rspamd_map_hash); + rspamd_cryptobox_hash_init(&re_map->hst, NULL, 0); + + return re_map; +} + + +void rspamd_map_helper_destroy_regexp(struct rspamd_regexp_map_helper *re_map) +{ + rspamd_regexp_t *re; + guint i; + + if (!re_map || !re_map->regexps) { + return; + } + +#ifdef WITH_HYPERSCAN + if (re_map->hs_scratch) { + hs_free_scratch(re_map->hs_scratch); + } + if (re_map->hs_db) { + rspamd_hyperscan_free(re_map->hs_db, false); + } + if (re_map->patterns) { + for (i = 0; i < re_map->regexps->len; i++) { + g_free(re_map->patterns[i]); + } + + g_free(re_map->patterns); + } + if (re_map->flags) { + g_free(re_map->flags); + } + if (re_map->ids) { + g_free(re_map->ids); + } +#endif + + for (i = 0; i < re_map->regexps->len; i++) { + re = g_ptr_array_index(re_map->regexps, i); + rspamd_regexp_unref(re); + } + + g_ptr_array_free(re_map->regexps, TRUE); + g_ptr_array_free(re_map->values, TRUE); + kh_destroy(rspamd_map_hash, re_map->htb); + + rspamd_mempool_t *pool = re_map->pool; + memset(re_map, 0, sizeof(*re_map)); + rspamd_mempool_delete(pool); +} + +gchar * +rspamd_kv_list_read( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + if (data->cur_data == NULL) { + data->cur_data = rspamd_map_helper_new_hash(data->map); + } + + return rspamd_parse_kv_list( + chunk, + len, + data, + rspamd_map_helper_insert_hash, + "", + final); +} + +void rspamd_kv_list_fin(struct map_cb_data *data, void **target) +{ + struct rspamd_map *map = data->map; + struct rspamd_hash_map_helper *htb; + + if (data->errored) { + /* Clean up the current data and do not touch prev data */ + if (data->cur_data) { + msg_info_map("cleanup unfinished new data as error occurred for %s", + map->name); + htb = (struct rspamd_hash_map_helper *) data->cur_data; + rspamd_map_helper_destroy_hash(htb); + data->cur_data = NULL; + } + } + else { + if (data->cur_data) { + htb = (struct 
rspamd_hash_map_helper *) data->cur_data; + msg_info_map("read hash of %d elements from %s", kh_size(htb->htb), + map->name); + data->map->traverse_function = rspamd_map_helper_traverse_hash; + data->map->nelts = kh_size(htb->htb); + data->map->digest = rspamd_cryptobox_fast_hash_final(&htb->hst); + } + + if (target) { + *target = data->cur_data; + } + + if (data->prev_data) { + htb = (struct rspamd_hash_map_helper *) data->prev_data; + rspamd_map_helper_destroy_hash(htb); + } + } +} + +void rspamd_kv_list_dtor(struct map_cb_data *data) +{ + struct rspamd_hash_map_helper *htb; + + if (data->cur_data) { + htb = (struct rspamd_hash_map_helper *) data->cur_data; + rspamd_map_helper_destroy_hash(htb); + } +} + +gchar * +rspamd_radix_read( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + struct rspamd_radix_map_helper *r; + struct rspamd_map *map = data->map; + + if (data->cur_data == NULL) { + r = rspamd_map_helper_new_radix(map); + data->cur_data = r; + } + + return rspamd_parse_kv_list( + chunk, + len, + data, + rspamd_map_helper_insert_radix, + hash_fill, + final); +} + +void rspamd_radix_fin(struct map_cb_data *data, void **target) +{ + struct rspamd_map *map = data->map; + struct rspamd_radix_map_helper *r; + + if (data->errored) { + /* Clean up the current data and do not touch prev data */ + if (data->cur_data) { + msg_info_map("cleanup unfinished new data as error occurred for %s", + map->name); + r = (struct rspamd_radix_map_helper *) data->cur_data; + rspamd_map_helper_destroy_radix(r); + data->cur_data = NULL; + } + } + else { + if (data->cur_data) { + r = (struct rspamd_radix_map_helper *) data->cur_data; + msg_info_map("read radix trie of %z elements: %s", + radix_get_size(r->trie), radix_get_info(r->trie)); + data->map->traverse_function = rspamd_map_helper_traverse_radix; + data->map->nelts = kh_size(r->htb); + data->map->digest = rspamd_cryptobox_fast_hash_final(&r->hst); + } + + if (target) { + *target = data->cur_data; + 
} + + if (data->prev_data) { + r = (struct rspamd_radix_map_helper *) data->prev_data; + rspamd_map_helper_destroy_radix(r); + } + } +} + +void rspamd_radix_dtor(struct map_cb_data *data) +{ + struct rspamd_radix_map_helper *r; + + if (data->cur_data) { + r = (struct rspamd_radix_map_helper *) data->cur_data; + rspamd_map_helper_destroy_radix(r); + } +} + +#ifdef WITH_HYPERSCAN + +static gboolean +rspamd_try_load_re_map_cache(struct rspamd_regexp_map_helper *re_map) +{ + gchar fp[PATH_MAX]; + struct rspamd_map *map; + + map = re_map->map; + + if (!map->cfg->hs_cache_dir) { + return FALSE; + } + + rspamd_snprintf(fp, sizeof(fp), "%s/%*xs.hsmc", + map->cfg->hs_cache_dir, + (gint) rspamd_cryptobox_HASHBYTES / 2, re_map->re_digest); + + re_map->hs_db = rspamd_hyperscan_maybe_load(fp, 0); + + return re_map->hs_db != NULL; +} + +static gboolean +rspamd_try_save_re_map_cache(struct rspamd_regexp_map_helper *re_map) +{ + gchar fp[PATH_MAX], np[PATH_MAX]; + gsize len; + gint fd; + char *bytes = NULL; + struct rspamd_map *map; + + map = re_map->map; + + if (!map->cfg->hs_cache_dir) { + return FALSE; + } + + rspamd_snprintf(fp, sizeof(fp), "%s/hsmc-XXXXXXXXXXXXX", + re_map->map->cfg->hs_cache_dir); + + if ((fd = g_mkstemp_full(fp, O_WRONLY | O_CREAT | O_EXCL, 00644)) != -1) { + if (hs_serialize_database(rspamd_hyperscan_get_database(re_map->hs_db), &bytes, &len) == HS_SUCCESS) { + if (write(fd, bytes, len) == -1) { + msg_warn_map("cannot write hyperscan cache to %s: %s", + fp, strerror(errno)); + unlink(fp); + free(bytes); + } + else { + free(bytes); + fsync(fd); + + rspamd_snprintf(np, sizeof(np), "%s/%*xs.hsmc", + re_map->map->cfg->hs_cache_dir, + (gint) rspamd_cryptobox_HASHBYTES / 2, re_map->re_digest); + + if (rename(fp, np) == -1) { + msg_warn_map("cannot rename hyperscan cache from %s to %s: %s", + fp, np, strerror(errno)); + unlink(fp); + } + else { + msg_info_map("written cached hyperscan data for %s to %s (%Hz length)", + map->name, np, len); + 
rspamd_hyperscan_notice_known(np); + } + } + } + else { + msg_warn_map("cannot serialize hyperscan cache to %s: %s", + fp, strerror(errno)); + unlink(fp); + } + + + close(fd); + } + + return FALSE; +} + +#endif + +/** + * Prepares the hyperscan side of a regexp map (no-op without WITH_HYPERSCAN): + * for every collected regexp it builds the escaped pattern, the hyperscan + * flags and the id, which are later handed to the hyperscan compiler. + */ +static void +rspamd_re_map_finalize(struct rspamd_regexp_map_helper *re_map) +{ +#ifdef WITH_HYPERSCAN + guint i; + hs_platform_info_t plt; + hs_compile_error_t *err; + struct rspamd_map *map; + rspamd_regexp_t *re; + gint pcre_flags; + + map = re_map->map; + +#if !defined(__aarch64__) && !defined(__powerpc64__) + if (!(map->cfg->libs_ctx->crypto_ctx->cpu_config & CPUID_SSSE3)) { + msg_info_map("disable hyperscan for map %s, ssse3 instructions are not supported by CPU", + map->name); + return; + } +#endif + + if (hs_populate_platform(&plt) != HS_SUCCESS) { + msg_err_map("cannot populate hyperscan platform"); + return; + } + + re_map->patterns = g_new(gchar *, re_map->regexps->len); + re_map->flags = g_new(gint, re_map->regexps->len); + re_map->ids = g_new(gint, re_map->regexps->len); + + for (i = 0; i < re_map->regexps->len; i++) { + const gchar *pat; + gchar *escaped; + gint pat_flags; + + re = g_ptr_array_index(re_map->regexps, i); + pcre_flags = rspamd_regexp_get_pcre_flags(re); + pat = rspamd_regexp_get_pattern(re); + pat_flags = rspamd_regexp_get_flags(re); + + /* + * g_new() does not zero the array: set the base flags first so that + * the `|=` updates below work on a defined value and are not lost. + */ + re_map->flags[i] = HS_FLAG_SINGLEMATCH; + + if (pat_flags & RSPAMD_REGEXP_FLAG_UTF) { + escaped = rspamd_str_regexp_escape(pat, strlen(pat), NULL, + RSPAMD_REGEXP_ESCAPE_RE | RSPAMD_REGEXP_ESCAPE_UTF); + re_map->flags[i] |= HS_FLAG_UTF8; + } + else { + escaped = rspamd_str_regexp_escape(pat, strlen(pat), NULL, + RSPAMD_REGEXP_ESCAPE_RE); + } + + re_map->patterns[i] = escaped; + +#ifndef WITH_PCRE2 + if (pcre_flags & PCRE_FLAG(UTF8)) { + re_map->flags[i] |= HS_FLAG_UTF8; + } +#else + if (pcre_flags & PCRE_FLAG(UTF)) { + re_map->flags[i] |= HS_FLAG_UTF8; + } +#endif + if (pcre_flags & PCRE_FLAG(CASELESS)) { + re_map->flags[i] |= HS_FLAG_CASELESS; + } + if (pcre_flags & PCRE_FLAG(MULTILINE)) { +
re_map->flags[i] |= HS_FLAG_MULTILINE; + } + if (pcre_flags & PCRE_FLAG(DOTALL)) { + re_map->flags[i] |= HS_FLAG_DOTALL; + } + if (rspamd_regexp_get_maxhits(re) == 1) { + re_map->flags[i] |= HS_FLAG_SINGLEMATCH; + } + + re_map->ids[i] = i; + } + + if (re_map->regexps->len > 0 && re_map->patterns) { + + if (!rspamd_try_load_re_map_cache(re_map)) { + gdouble ts1 = rspamd_get_ticks(FALSE); + hs_database_t *hs_db = NULL; + + if (hs_compile_multi((const gchar **) re_map->patterns, + re_map->flags, + re_map->ids, + re_map->regexps->len, + HS_MODE_BLOCK, + &plt, + &hs_db, + &err) != HS_SUCCESS) { + + msg_err_map("cannot create tree of regexp when processing '%s': %s", + err->expression >= 0 ? re_map->patterns[err->expression] : "unknown regexp", err->message); + re_map->hs_db = NULL; + hs_free_compile_error(err); + + return; + } + + if (re_map->map->cfg->hs_cache_dir) { + char fpath[PATH_MAX]; + rspamd_snprintf(fpath, sizeof(fpath), "%s/%*xs.hsmc", + re_map->map->cfg->hs_cache_dir, + (gint) rspamd_cryptobox_HASHBYTES / 2, re_map->re_digest); + re_map->hs_db = rspamd_hyperscan_from_raw_db(hs_db, fpath); + } + else { + re_map->hs_db = rspamd_hyperscan_from_raw_db(hs_db, NULL); + } + + ts1 = (rspamd_get_ticks(FALSE) - ts1) * 1000.0; + msg_info_map("hyperscan compiled %d regular expressions from %s in %.1f ms", + re_map->regexps->len, re_map->map->name, ts1); + rspamd_try_save_re_map_cache(re_map); + } + else { + msg_info_map("hyperscan read %d cached regular expressions from %s", + re_map->regexps->len, re_map->map->name); + } + + if (hs_alloc_scratch(rspamd_hyperscan_get_database(re_map->hs_db), &re_map->hs_scratch) != HS_SUCCESS) { + msg_err_map("cannot allocate scratch space for hyperscan"); + rspamd_hyperscan_free(re_map->hs_db, true); + re_map->hs_db = NULL; + } + } + else { + msg_err_map("regexp map is empty"); + } +#endif +} + +gchar * +rspamd_regexp_list_read_single( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + struct 
rspamd_regexp_map_helper *re_map; + + if (data->cur_data == NULL) { + re_map = rspamd_map_helper_new_regexp(data->map, 0); + data->cur_data = re_map; + } + + return rspamd_parse_kv_list( + chunk, + len, + data, + rspamd_map_helper_insert_re, + hash_fill, + final); +} + +gchar * +rspamd_glob_list_read_single( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + struct rspamd_regexp_map_helper *re_map; + + if (data->cur_data == NULL) { + re_map = rspamd_map_helper_new_regexp(data->map, RSPAMD_REGEXP_MAP_FLAG_GLOB); + data->cur_data = re_map; + } + + return rspamd_parse_kv_list( + chunk, + len, + data, + rspamd_map_helper_insert_re, + hash_fill, + final); +} + +gchar * +rspamd_regexp_list_read_multiple( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + struct rspamd_regexp_map_helper *re_map; + + if (data->cur_data == NULL) { + re_map = rspamd_map_helper_new_regexp(data->map, + RSPAMD_REGEXP_MAP_FLAG_MULTIPLE); + data->cur_data = re_map; + } + + return rspamd_parse_kv_list( + chunk, + len, + data, + rspamd_map_helper_insert_re, + hash_fill, + final); +} + +gchar * +rspamd_glob_list_read_multiple( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + struct rspamd_regexp_map_helper *re_map; + + if (data->cur_data == NULL) { + re_map = rspamd_map_helper_new_regexp(data->map, + RSPAMD_REGEXP_MAP_FLAG_GLOB | RSPAMD_REGEXP_MAP_FLAG_MULTIPLE); + data->cur_data = re_map; + } + + return rspamd_parse_kv_list( + chunk, + len, + data, + rspamd_map_helper_insert_re, + hash_fill, + final); +} + + +void rspamd_regexp_list_fin(struct map_cb_data *data, void **target) +{ + struct rspamd_regexp_map_helper *re_map = NULL, *old_re_map; + struct rspamd_map *map = data->map; + + if (data->errored) { + /* Clean up the current data and do not touch prev data */ + if (data->cur_data) { + msg_info_map("cleanup unfinished new data as error occurred for %s", + map->name); + re_map = (struct rspamd_regexp_map_helper 
*) data->cur_data; + rspamd_map_helper_destroy_regexp(re_map); + data->cur_data = NULL; + } + } + else { + if (data->cur_data) { + re_map = data->cur_data; + rspamd_cryptobox_hash_final(&re_map->hst, re_map->re_digest); + memcpy(&data->map->digest, re_map->re_digest, sizeof(data->map->digest)); + rspamd_re_map_finalize(re_map); + msg_info_map("read regexp list of %ud elements", + re_map->regexps->len); + data->map->traverse_function = rspamd_map_helper_traverse_regexp; + data->map->nelts = kh_size(re_map->htb); + } + + if (target) { + *target = data->cur_data; + } + + if (data->prev_data) { + old_re_map = data->prev_data; + rspamd_map_helper_destroy_regexp(old_re_map); + } + } +} +void rspamd_regexp_list_dtor(struct map_cb_data *data) +{ + if (data->cur_data) { + rspamd_map_helper_destroy_regexp(data->cur_data); + } +} + +#ifdef WITH_HYPERSCAN +static int +rspamd_match_hs_single_handler(unsigned int id, unsigned long long from, + unsigned long long to, + unsigned int flags, void *context) +{ + guint *i = context; + /* Always return non-zero as we need a single match here */ + + *i = id; + + return 1; +} +#endif + +gconstpointer +rspamd_match_regexp_map_single(struct rspamd_regexp_map_helper *map, + const gchar *in, gsize len) +{ + guint i; + rspamd_regexp_t *re; + gint res = 0; + gpointer ret = NULL; + struct rspamd_map_helper_value *val; + gboolean validated = FALSE; + + g_assert(in != NULL); + + if (map == NULL || len == 0 || map->regexps == NULL) { + return NULL; + } + + if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) { + if (rspamd_fast_utf8_validate(in, len) == 0) { + validated = TRUE; + } + } + else { + validated = TRUE; + } + +#ifdef WITH_HYPERSCAN + if (map->hs_db && map->hs_scratch) { + + if (validated) { + + res = hs_scan(rspamd_hyperscan_get_database(map->hs_db), in, len, 0, + map->hs_scratch, + rspamd_match_hs_single_handler, (void *) &i); + + if (res == HS_SCAN_TERMINATED) { + res = 1; + val = g_ptr_array_index(map->values, i); + + ret = val->value; + 
val->hits++; + } + + return ret; + } + } +#endif + + if (!res) { + /* PCRE version */ + for (i = 0; i < map->regexps->len; i++) { + re = g_ptr_array_index(map->regexps, i); + + if (rspamd_regexp_search(re, in, len, NULL, NULL, !validated, NULL)) { + val = g_ptr_array_index(map->values, i); + + ret = val->value; + val->hits++; + break; + } + } + } + + return ret; +} + +#ifdef WITH_HYPERSCAN +struct rspamd_multiple_cbdata { + GPtrArray *ar; + struct rspamd_regexp_map_helper *map; +}; + +static int +rspamd_match_hs_multiple_handler(unsigned int id, unsigned long long from, + unsigned long long to, + unsigned int flags, void *context) +{ + struct rspamd_multiple_cbdata *cbd = context; + struct rspamd_map_helper_value *val; + + + if (id < cbd->map->values->len) { + val = g_ptr_array_index(cbd->map->values, id); + val->hits++; + g_ptr_array_add(cbd->ar, val->value); + } + + /* Always return zero as we need all matches here */ + return 0; +} +#endif + +GPtrArray * +rspamd_match_regexp_map_all(struct rspamd_regexp_map_helper *map, + const gchar *in, gsize len) +{ + guint i; + rspamd_regexp_t *re; + GPtrArray *ret; + gint res = 0; + gboolean validated = FALSE; + struct rspamd_map_helper_value *val; + + if (map == NULL || map->regexps == NULL || len == 0) { + return NULL; + } + + g_assert(in != NULL); + + if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) { + if (rspamd_fast_utf8_validate(in, len) == 0) { + validated = TRUE; + } + } + else { + validated = TRUE; + } + + ret = g_ptr_array_new(); + +#ifdef WITH_HYPERSCAN + if (map->hs_db && map->hs_scratch) { + + if (validated) { + struct rspamd_multiple_cbdata cbd; + + cbd.ar = ret; + cbd.map = map; + + if (hs_scan(rspamd_hyperscan_get_database(map->hs_db), in, len, + 0, map->hs_scratch, + rspamd_match_hs_multiple_handler, &cbd) == HS_SUCCESS) { + res = 1; + } + } + } +#endif + + if (!res) { + /* PCRE version */ + for (i = 0; i < map->regexps->len; i++) { + re = g_ptr_array_index(map->regexps, i); + + if 
(rspamd_regexp_search(re, in, len, NULL, NULL, + !validated, NULL)) { + val = g_ptr_array_index(map->values, i); + val->hits++; + g_ptr_array_add(ret, val->value); + } + } + } + + if (ret->len > 0) { + return ret; + } + + g_ptr_array_free(ret, TRUE); + + return NULL; +} + +gconstpointer +rspamd_match_hash_map(struct rspamd_hash_map_helper *map, const gchar *in, + gsize len) +{ + khiter_t k; + struct rspamd_map_helper_value *val; + rspamd_ftok_t tok; + + if (map == NULL || map->htb == NULL) { + return NULL; + } + + tok.begin = in; + tok.len = len; + + k = kh_get(rspamd_map_hash, map->htb, tok); + + if (k != kh_end(map->htb)) { + val = kh_value(map->htb, k); + val->hits++; + + return val->value; + } + + return NULL; +} + +gconstpointer +rspamd_match_radix_map(struct rspamd_radix_map_helper *map, + const guchar *in, gsize inlen) +{ + struct rspamd_map_helper_value *val; + + if (map == NULL || map->trie == NULL) { + return NULL; + } + + val = (struct rspamd_map_helper_value *) radix_find_compressed(map->trie, + in, inlen); + + if (val != (gconstpointer) RADIX_NO_VALUE) { + val->hits++; + + return val->value; + } + + return NULL; +} + +gconstpointer +rspamd_match_radix_map_addr(struct rspamd_radix_map_helper *map, + const rspamd_inet_addr_t *addr) +{ + struct rspamd_map_helper_value *val; + + if (map == NULL || map->trie == NULL) { + return NULL; + } + + val = (struct rspamd_map_helper_value *) radix_find_compressed_addr(map->trie, addr); + + if (val != (gconstpointer) RADIX_NO_VALUE) { + val->hits++; + + return val->value; + } + + return NULL; +} + + +/* + * CBD stuff + */ + +struct rspamd_cdb_map_helper * +rspamd_map_helper_new_cdb(struct rspamd_map *map) +{ + struct rspamd_cdb_map_helper *n; + + n = g_malloc0(sizeof(*n)); + n->cdbs = (GQueue) G_QUEUE_INIT; + n->map = map; + + rspamd_cryptobox_fast_hash_init(&n->hst, map_hash_seed); + + return n; +} + +void rspamd_map_helper_destroy_cdb(struct rspamd_cdb_map_helper *c) +{ + if (c == NULL) { + return; + } + + GList 
*cur = c->cdbs.head; + + while (cur) { + struct cdb *cdb = (struct cdb *) cur->data; + + cdb_free(cdb); + g_free(cdb->filename); + close(cdb->cdb_fd); + g_free(cdb); + + cur = g_list_next(cur); + } + + g_queue_clear(&c->cdbs); + + g_free(c); +} + +gchar * +rspamd_cdb_list_read(gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + struct rspamd_cdb_map_helper *cdb_data; + struct cdb *found = NULL; + struct rspamd_map *map = data->map; + + g_assert(map->no_file_read); + + if (data->cur_data == NULL) { + cdb_data = rspamd_map_helper_new_cdb(data->map); + data->cur_data = cdb_data; + } + else { + cdb_data = (struct rspamd_cdb_map_helper *) data->cur_data; + } + + GList *cur = cdb_data->cdbs.head; + + while (cur) { + struct cdb *elt = (struct cdb *) cur->data; + + if (strcmp(elt->filename, chunk) == 0) { + found = elt; + break; + } + + cur = g_list_next(cur); + } + + if (found == NULL) { + /* New cdb */ + gint fd; + struct cdb *cdb; + + fd = rspamd_file_xopen(chunk, O_RDONLY, 0, TRUE); + + if (fd == -1) { + msg_err_map("cannot open cdb map from %s: %s", chunk, strerror(errno)); + + return NULL; + } + + cdb = g_malloc0(sizeof(struct cdb)); + + if (cdb_init(cdb, fd) == -1) { + g_free(cdb); + msg_err_map("cannot init cdb map from %s: %s", chunk, strerror(errno)); + + return NULL; + } + + cdb->filename = g_strdup(chunk); + g_queue_push_tail(&cdb_data->cdbs, cdb); + cdb_data->total_size += cdb->cdb_fsize; + rspamd_cryptobox_fast_hash_update(&cdb_data->hst, chunk, len); + } + + return chunk + len; +} + +void rspamd_cdb_list_fin(struct map_cb_data *data, void **target) +{ + struct rspamd_map *map = data->map; + struct rspamd_cdb_map_helper *cdb_data; + + if (data->errored) { + /* Clean up the current data and do not touch prev data */ + if (data->cur_data) { + msg_info_map("cleanup unfinished new data as error occurred for %s", + map->name); + cdb_data = (struct rspamd_cdb_map_helper *) data->cur_data; + rspamd_map_helper_destroy_cdb(cdb_data); + 
data->cur_data = NULL; + } + } + else { + if (data->cur_data) { + cdb_data = (struct rspamd_cdb_map_helper *) data->cur_data; + msg_info_map("read cdb of %Hz size", cdb_data->total_size); + data->map->traverse_function = NULL; + data->map->nelts = 0; + data->map->digest = rspamd_cryptobox_fast_hash_final(&cdb_data->hst); + } + + if (target) { + *target = data->cur_data; + } + + if (data->prev_data) { + cdb_data = (struct rspamd_cdb_map_helper *) data->prev_data; + rspamd_map_helper_destroy_cdb(cdb_data); + } + } +} +void rspamd_cdb_list_dtor(struct map_cb_data *data) +{ + if (data->cur_data) { + rspamd_map_helper_destroy_cdb(data->cur_data); + } +} + +gconstpointer +rspamd_match_cdb_map(struct rspamd_cdb_map_helper *map, + const gchar *in, gsize inlen) +{ + if (map == NULL || map->cdbs.head == NULL) { + return NULL; + } + + GList *cur = map->cdbs.head; + static rspamd_ftok_t found; + + while (cur) { + struct cdb *cdb = (struct cdb *) cur->data; + + if (cdb_find(cdb, in, inlen) > 0) { + /* Extract and push value to lua as string */ + unsigned vlen; + gconstpointer vpos; + + vpos = cdb->cdb_mem + cdb_datapos(cdb); + vlen = cdb_datalen(cdb); + found.len = vlen; + found.begin = vpos; + + return &found; /* Do not reuse! */ + } + + cur = g_list_next(cur); + } + + return NULL; +} |