diff options
Diffstat (limited to '')
-rw-r--r-- | src/libutil/multipattern.c | 821 |
1 files changed, 821 insertions, 0 deletions
diff --git a/src/libutil/multipattern.c b/src/libutil/multipattern.c new file mode 100644 index 0000000..630b1f9 --- /dev/null +++ b/src/libutil/multipattern.c @@ -0,0 +1,821 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "libutil/multipattern.h" +#include "libutil/str_util.h" +#include "libcryptobox/cryptobox.h" + +#ifdef WITH_HYPERSCAN +#include "logger.h" +#include "unix-std.h" +#include "hs.h" +#include "libserver/hyperscan_tools.h" +#endif +#include "acism.h" +#include "libutil/regexp.h" +#include <stdalign.h> + +#define MAX_SCRATCH 4 + +enum rspamd_hs_check_state { + RSPAMD_HS_UNCHECKED = 0, + RSPAMD_HS_SUPPORTED, + RSPAMD_HS_UNSUPPORTED +}; + +static const char *hs_cache_dir = NULL; +static enum rspamd_hs_check_state hs_suitable_cpu = RSPAMD_HS_UNCHECKED; + + +struct RSPAMD_ALIGNED(64) rspamd_multipattern { +#ifdef WITH_HYPERSCAN + rspamd_cryptobox_hash_state_t hash_state; + rspamd_hyperscan_t *hs_db; + hs_scratch_t *scratch[MAX_SCRATCH]; + GArray *hs_pats; + GArray *hs_ids; + GArray *hs_flags; + guint scratch_used; +#endif + ac_trie_t *t; + GArray *pats; + GArray *res; + + gboolean compiled; + guint cnt; + enum rspamd_multipattern_flags flags; +}; + +static GQuark +rspamd_multipattern_quark(void) +{ + return g_quark_from_static_string("multipattern"); +} + +static inline gboolean +rspamd_hs_check(void) +{ +#ifdef WITH_HYPERSCAN + if (G_UNLIKELY(hs_suitable_cpu == RSPAMD_HS_UNCHECKED)) { + if (hs_valid_platform() == HS_SUCCESS) { + hs_suitable_cpu = RSPAMD_HS_SUPPORTED; + } + else { + hs_suitable_cpu = RSPAMD_HS_UNSUPPORTED; + } + } +#endif + + return hs_suitable_cpu == RSPAMD_HS_SUPPORTED; +} + +void rspamd_multipattern_library_init(const gchar *cache_dir) +{ + hs_cache_dir = cache_dir; +#ifdef WITH_HYPERSCAN + rspamd_hs_check(); +#endif +} + +#ifdef WITH_HYPERSCAN +static gchar * +rspamd_multipattern_escape_tld_hyperscan(const gchar *pattern, gsize slen, + gsize *dst_len) +{ + gsize len; + const gchar *p, *prefix, *suffix; + gchar *res; + + /* + * We understand the following cases + * 1) blah -> .blah\b + * 2) *.blah -> ..*\\.blah\b|$ + * 3) ??? + */ + + if (pattern[0] == '*') { + p = strchr(pattern, '.'); + + if (p == NULL) { + /* XXX: bad */ + p = pattern; + } + else { + p++; + } + + prefix = "\\."; + len = slen + strlen(prefix); + } + else { + prefix = "\\."; + p = pattern; + len = slen + strlen(prefix); + } + + suffix = "(:?\\b|$)"; + len += strlen(suffix); + + res = g_malloc(len + 1); + slen = rspamd_strlcpy(res, prefix, len + 1); + slen += rspamd_strlcpy(res + slen, p, len + 1 - slen); + slen += rspamd_strlcpy(res + slen, suffix, len + 1 - slen); + + *dst_len = slen; + + return res; +} + +#endif +static gchar * +rspamd_multipattern_escape_tld_acism(const gchar *pattern, gsize len, + gsize *dst_len) +{ + gsize dlen, slen; + const gchar *p, *prefix; + gchar *res; + + /* + * We understand the following cases + * 1) blah -> \\.blah + * 2) *.blah -> \\..*\\.blah + * 3) ??? + */ + slen = len; + + if (pattern[0] == '*') { + dlen = slen; + p = memchr(pattern, '.', len); + + if (p == NULL) { + /* XXX: bad */ + p = pattern; + } + else { + p++; + } + + dlen -= p - pattern; + prefix = "."; + dlen++; + } + else { + dlen = slen + 1; + prefix = "."; + p = pattern; + } + + res = g_malloc(dlen + 1); + slen = strlen(prefix); + memcpy(res, prefix, slen); + rspamd_strlcpy(res + slen, p, dlen - slen + 1); + + *dst_len = dlen; + + return res; +} + +/* + * Escapes special characters from specific pattern + */ +static gchar * +rspamd_multipattern_pattern_filter(const gchar *pattern, gsize len, + enum rspamd_multipattern_flags flags, + gsize *dst_len) +{ + gchar *ret = NULL; + gint gl_flags = RSPAMD_REGEXP_ESCAPE_ASCII; + + if (flags & RSPAMD_MULTIPATTERN_UTF8) { + gl_flags |= RSPAMD_REGEXP_ESCAPE_UTF; + } + +#ifdef WITH_HYPERSCAN + if (rspamd_hs_check()) { + if (flags & RSPAMD_MULTIPATTERN_TLD) { + gchar *tmp; + gsize tlen; + tmp = rspamd_multipattern_escape_tld_hyperscan(pattern, len, &tlen); + + ret = rspamd_str_regexp_escape(tmp, tlen, dst_len, + gl_flags | RSPAMD_REGEXP_ESCAPE_RE); + g_free(tmp); + } + else if (flags & RSPAMD_MULTIPATTERN_RE) { + ret = rspamd_str_regexp_escape(pattern, len, dst_len, gl_flags | RSPAMD_REGEXP_ESCAPE_RE); + } + else if (flags & RSPAMD_MULTIPATTERN_GLOB) { + ret = rspamd_str_regexp_escape(pattern, len, dst_len, + gl_flags | RSPAMD_REGEXP_ESCAPE_GLOB); + } + else { + ret = rspamd_str_regexp_escape(pattern, len, dst_len, gl_flags); + } + + return ret; + } +#endif + + if (flags & RSPAMD_MULTIPATTERN_TLD) { + ret = rspamd_multipattern_escape_tld_acism(pattern, len, dst_len); + } + else if (flags & RSPAMD_MULTIPATTERN_RE) { + ret = rspamd_str_regexp_escape(pattern, len, dst_len, gl_flags | RSPAMD_REGEXP_ESCAPE_RE); + } + else if (flags & RSPAMD_MULTIPATTERN_GLOB) { + ret = rspamd_str_regexp_escape(pattern, len, dst_len, + gl_flags | RSPAMD_REGEXP_ESCAPE_GLOB); + } + else { + ret = malloc(len + 1); + *dst_len = rspamd_strlcpy(ret, pattern, len + 1); + } + + return ret; +} + +struct rspamd_multipattern * +rspamd_multipattern_create(enum rspamd_multipattern_flags flags) +{ + struct rspamd_multipattern *mp; + + /* Align due to blake2b state */ + (void) !posix_memalign((void **) &mp, RSPAMD_ALIGNOF(struct rspamd_multipattern), + sizeof(*mp)); + g_assert(mp != NULL); + memset(mp, 0, sizeof(*mp)); + mp->flags = flags; + +#ifdef WITH_HYPERSCAN + if (rspamd_hs_check()) { + mp->hs_pats = g_array_new(FALSE, TRUE, sizeof(gchar *)); + mp->hs_flags = g_array_new(FALSE, TRUE, sizeof(gint)); + mp->hs_ids = g_array_new(FALSE, TRUE, sizeof(gint)); + rspamd_cryptobox_hash_init(&mp->hash_state, NULL, 0); + + return mp; + } +#endif + + mp->pats = g_array_new(FALSE, TRUE, sizeof(ac_trie_pat_t)); + + return mp; +} + +struct rspamd_multipattern * +rspamd_multipattern_create_sized(guint npatterns, + enum rspamd_multipattern_flags flags) +{ + struct rspamd_multipattern *mp; + + /* Align due to blake2b state */ + (void) !posix_memalign((void **) &mp, RSPAMD_ALIGNOF(struct rspamd_multipattern), sizeof(*mp)); + g_assert(mp != NULL); + memset(mp, 0, sizeof(*mp)); + mp->flags = flags; + +#ifdef WITH_HYPERSCAN + if (rspamd_hs_check()) { + mp->hs_pats = g_array_sized_new(FALSE, TRUE, sizeof(gchar *), npatterns); + mp->hs_flags = g_array_sized_new(FALSE, TRUE, sizeof(gint), npatterns); + mp->hs_ids = g_array_sized_new(FALSE, TRUE, sizeof(gint), npatterns); + rspamd_cryptobox_hash_init(&mp->hash_state, NULL, 0); + + return mp; + } +#endif + + mp->pats = g_array_sized_new(FALSE, TRUE, sizeof(ac_trie_pat_t), npatterns); + + return mp; +} + +void rspamd_multipattern_add_pattern(struct rspamd_multipattern *mp, + const gchar *pattern, gint flags) +{ + g_assert(pattern != NULL); + + rspamd_multipattern_add_pattern_len(mp, pattern, strlen(pattern), flags); +} + +void rspamd_multipattern_add_pattern_len(struct rspamd_multipattern *mp, + const gchar *pattern, gsize patlen, gint flags) +{ + gsize dlen; + + g_assert(pattern != NULL); + g_assert(mp != NULL); + g_assert(!mp->compiled); + +#ifdef WITH_HYPERSCAN + if (rspamd_hs_check()) { + gchar *np; + gint fl = HS_FLAG_SOM_LEFTMOST; + gint adjusted_flags = mp->flags | flags; + + if (adjusted_flags & RSPAMD_MULTIPATTERN_ICASE) { + fl |= HS_FLAG_CASELESS; + } + if (adjusted_flags & RSPAMD_MULTIPATTERN_UTF8) { + if (adjusted_flags & RSPAMD_MULTIPATTERN_TLD) { + fl |= HS_FLAG_UTF8; + } + else { + fl |= HS_FLAG_UTF8 | HS_FLAG_UCP; + } + } + if (adjusted_flags & RSPAMD_MULTIPATTERN_DOTALL) { + fl |= HS_FLAG_DOTALL; + } + if (adjusted_flags & RSPAMD_MULTIPATTERN_SINGLEMATCH) { + fl |= HS_FLAG_SINGLEMATCH; + fl &= ~HS_FLAG_SOM_LEFTMOST; /* According to hyperscan docs */ + } + if (adjusted_flags & RSPAMD_MULTIPATTERN_NO_START) { + fl &= ~HS_FLAG_SOM_LEFTMOST; + } + + g_array_append_val(mp->hs_flags, fl); + np = rspamd_multipattern_pattern_filter(pattern, patlen, flags, &dlen); + g_array_append_val(mp->hs_pats, np); + fl = mp->cnt; + g_array_append_val(mp->hs_ids, fl); + rspamd_cryptobox_hash_update(&mp->hash_state, np, dlen); + + mp->cnt++; + + return; + } +#endif + ac_trie_pat_t pat; + + pat.ptr = rspamd_multipattern_pattern_filter(pattern, patlen, flags, &dlen); + pat.len = dlen; + + g_array_append_val(mp->pats, pat); + + mp->cnt++; +} + +struct rspamd_multipattern * +rspamd_multipattern_create_full(const gchar **patterns, + guint npatterns, enum rspamd_multipattern_flags flags) +{ + struct rspamd_multipattern *mp; + guint i; + + g_assert(npatterns > 0); + g_assert(patterns != NULL); + + mp = rspamd_multipattern_create_sized(npatterns, flags); + + for (i = 0; i < npatterns; i++) { + rspamd_multipattern_add_pattern(mp, patterns[i], flags); + } + + return mp; +} + +#ifdef WITH_HYPERSCAN +static gboolean +rspamd_multipattern_try_load_hs(struct rspamd_multipattern *mp, + const guchar *hash) +{ + gchar fp[PATH_MAX]; + + if (hs_cache_dir == NULL) { + return FALSE; + } + + rspamd_snprintf(fp, sizeof(fp), "%s/%*xs.hsmp", hs_cache_dir, + (gint) rspamd_cryptobox_HASHBYTES / 2, hash); + mp->hs_db = rspamd_hyperscan_maybe_load(fp, 0); + + return mp->hs_db != NULL; +} + +static void +rspamd_multipattern_try_save_hs(struct rspamd_multipattern *mp, + const guchar *hash) +{ + gchar fp[PATH_MAX], np[PATH_MAX]; + char *bytes = NULL; + gsize len; + gint fd; + + if (hs_cache_dir == NULL) { + return; + } + + rspamd_snprintf(fp, sizeof(fp), "%s%shsmp-XXXXXXXXXXXXX", G_DIR_SEPARATOR_S, + hs_cache_dir); + + if ((fd = g_mkstemp_full(fp, O_CREAT | O_EXCL | O_WRONLY, 00644)) != -1) { + int ret; + if ((ret = hs_serialize_database(rspamd_hyperscan_get_database(mp->hs_db), &bytes, &len)) == HS_SUCCESS) { + if (write(fd, bytes, len) == -1) { + msg_warn("cannot write hyperscan cache to %s: %s", + fp, strerror(errno)); + unlink(fp); + free(bytes); + } + else { + free(bytes); + fsync(fd); + + rspamd_snprintf(np, sizeof(np), "%s/%*xs.hsmp", hs_cache_dir, + (gint) rspamd_cryptobox_HASHBYTES / 2, hash); + + if (rename(fp, np) == -1) { + msg_warn("cannot rename hyperscan cache from %s to %s: %s", + fp, np, strerror(errno)); + unlink(fp); + } + else { + rspamd_hyperscan_notice_known(np); + } + } + } + else { + msg_warn("cannot serialize hyperscan cache to %s: error code %d", + fp, ret); + unlink(fp); + } + + + close(fd); + } + else { + msg_warn("cannot open a temp file %s to write hyperscan cache: %s", fp, strerror(errno)); + } +} +#endif + +gboolean +rspamd_multipattern_compile(struct rspamd_multipattern *mp, GError **err) +{ + g_assert(mp != NULL); + g_assert(!mp->compiled); + +#ifdef WITH_HYPERSCAN + if (rspamd_hs_check()) { + guint i; + hs_platform_info_t plt; + hs_compile_error_t *hs_errors; + guchar hash[rspamd_cryptobox_HASHBYTES]; + + if (mp->cnt > 0) { + g_assert(hs_populate_platform(&plt) == HS_SUCCESS); + rspamd_cryptobox_hash_update(&mp->hash_state, (void *) &plt, sizeof(plt)); + rspamd_cryptobox_hash_final(&mp->hash_state, hash); + + if (!rspamd_multipattern_try_load_hs(mp, hash)) { + hs_database_t *db = NULL; + + if (hs_compile_multi((const char *const *) mp->hs_pats->data, + (const unsigned int *) mp->hs_flags->data, + (const unsigned int *) mp->hs_ids->data, + mp->cnt, + HS_MODE_BLOCK, + &plt, + &db, + &hs_errors) != HS_SUCCESS) { + + g_set_error(err, rspamd_multipattern_quark(), EINVAL, + "cannot create tree of regexp when processing '%s': %s", + g_array_index(mp->hs_pats, char *, hs_errors->expression), + hs_errors->message); + hs_free_compile_error(hs_errors); + + return FALSE; + } + + if (hs_cache_dir != NULL) { + char fpath[PATH_MAX]; + rspamd_snprintf(fpath, sizeof(fpath), "%s/%*xs.hsmp", hs_cache_dir, + (gint) rspamd_cryptobox_HASHBYTES / 2, hash); + mp->hs_db = rspamd_hyperscan_from_raw_db(db, fpath); + } + else { + /* Should not happen in the real life */ + mp->hs_db = rspamd_hyperscan_from_raw_db(db, NULL); + } + + rspamd_multipattern_try_save_hs(mp, hash); + } + + for (i = 0; i < MAX_SCRATCH; i++) { + mp->scratch[i] = NULL; + } + + for (i = 0; i < MAX_SCRATCH; i++) { + int ret; + + if ((ret = hs_alloc_scratch(rspamd_hyperscan_get_database(mp->hs_db), &mp->scratch[i])) != HS_SUCCESS) { + msg_err("cannot allocate scratch space for hyperscan: error code %d", ret); + + /* Clean all scratches that are non-NULL */ + for (int ii = 0; ii < MAX_SCRATCH; ii++) { + if (mp->scratch[ii] != NULL) { + hs_free_scratch(mp->scratch[ii]); + } + } + g_set_error(err, rspamd_multipattern_quark(), EINVAL, + "cannot allocate scratch space for hyperscan: error code %d", ret); + + rspamd_hyperscan_free(mp->hs_db, true); + mp->hs_db = NULL; + + return FALSE; + } + } + } + + mp->compiled = TRUE; + + return TRUE; + } +#endif + + if (mp->cnt > 0) { + + if (mp->flags & (RSPAMD_MULTIPATTERN_GLOB | RSPAMD_MULTIPATTERN_RE)) { + /* Fallback to pcre... */ + rspamd_regexp_t *re; + mp->res = g_array_sized_new(FALSE, TRUE, + sizeof(rspamd_regexp_t *), mp->cnt); + + for (guint i = 0; i < mp->cnt; i++) { + const ac_trie_pat_t *pat; + const gchar *pat_flags = NULL; + + if (mp->flags & RSPAMD_MULTIPATTERN_UTF8) { + pat_flags = "u"; + } + + pat = &g_array_index(mp->pats, ac_trie_pat_t, i); + re = rspamd_regexp_new(pat->ptr, pat_flags, err); + + if (re == NULL) { + return FALSE; + } + + g_array_append_val(mp->res, re); + } + } + else { + mp->t = acism_create((const ac_trie_pat_t *) mp->pats->data, mp->cnt); + } + } + + mp->compiled = TRUE; + + return TRUE; +} + +struct rspamd_multipattern_cbdata { + struct rspamd_multipattern *mp; + const gchar *in; + gsize len; + rspamd_multipattern_cb_t cb; + gpointer ud; + guint nfound; + gint ret; +}; + +#ifdef WITH_HYPERSCAN +static gint +rspamd_multipattern_hs_cb(unsigned int id, + unsigned long long from, + unsigned long long to, + unsigned int flags, + void *ud) +{ + struct rspamd_multipattern_cbdata *cbd = ud; + gint ret = 0; + + if (to > 0) { + + if (from == HS_OFFSET_PAST_HORIZON) { + from = 0; + } + + ret = cbd->cb(cbd->mp, id, from, to, cbd->in, cbd->len, cbd->ud); + + cbd->nfound++; + cbd->ret = ret; + } + + return ret; +} +#endif + +static gint +rspamd_multipattern_acism_cb(int strnum, int textpos, void *context) +{ + struct rspamd_multipattern_cbdata *cbd = context; + gint ret; + ac_trie_pat_t pat; + + pat = g_array_index(cbd->mp->pats, ac_trie_pat_t, strnum); + ret = cbd->cb(cbd->mp, strnum, textpos - pat.len, + textpos, cbd->in, cbd->len, cbd->ud); + + cbd->nfound++; + cbd->ret = ret; + + return ret; +} + +gint rspamd_multipattern_lookup(struct rspamd_multipattern *mp, + const gchar *in, gsize len, rspamd_multipattern_cb_t cb, + gpointer ud, guint *pnfound) +{ + struct rspamd_multipattern_cbdata cbd; + gint ret = 0; + + g_assert(mp != NULL); + + if (mp->cnt == 0 || !mp->compiled || len == 0) { + return 0; + } + + cbd.mp = mp; + cbd.in = in; + cbd.len = len; + cbd.cb = cb; + cbd.ud = ud; + cbd.nfound = 0; + cbd.ret = 0; + +#ifdef WITH_HYPERSCAN + if (rspamd_hs_check()) { + hs_scratch_t *scr = NULL; + guint i; + + for (i = 0; i < MAX_SCRATCH; i++) { + if (!(mp->scratch_used & (1 << i))) { + mp->scratch_used |= (1 << i); + scr = mp->scratch[i]; + break; + } + } + + g_assert(scr != NULL); + + ret = hs_scan(rspamd_hyperscan_get_database(mp->hs_db), in, len, 0, scr, + rspamd_multipattern_hs_cb, &cbd); + + mp->scratch_used &= ~(1 << i); + + if (ret == HS_SUCCESS) { + ret = 0; + } + else if (ret == HS_SCAN_TERMINATED) { + ret = cbd.ret; + } + + if (pnfound) { + *pnfound = cbd.nfound; + } + + return ret; + } +#endif + + gint state = 0; + + if (mp->flags & (RSPAMD_MULTIPATTERN_GLOB | RSPAMD_MULTIPATTERN_RE)) { + /* Terribly inefficient, but who cares - just use hyperscan */ + for (guint i = 0; i < mp->cnt; i++) { + rspamd_regexp_t *re = g_array_index(mp->res, rspamd_regexp_t *, i); + const gchar *start = NULL, *end = NULL; + + while (rspamd_regexp_search(re, + in, + len, + &start, + &end, + TRUE, + NULL)) { + if (rspamd_multipattern_acism_cb(i, end - in, &cbd)) { + goto out; + } + } + } + out: + ret = cbd.ret; + + if (pnfound) { + *pnfound = cbd.nfound; + } + } + else { + /* Plain trie */ + ret = acism_lookup(mp->t, in, len, rspamd_multipattern_acism_cb, &cbd, + &state, mp->flags & RSPAMD_MULTIPATTERN_ICASE); + + if (pnfound) { + *pnfound = cbd.nfound; + } + } + + return ret; +} + + +void rspamd_multipattern_destroy(struct rspamd_multipattern *mp) +{ + guint i; + + if (mp) { +#ifdef WITH_HYPERSCAN + if (rspamd_hs_check()) { + gchar *p; + + if (mp->compiled && mp->cnt > 0) { + for (i = 0; i < MAX_SCRATCH; i++) { + hs_free_scratch(mp->scratch[i]); + } + + if (mp->hs_db) { + rspamd_hyperscan_free(mp->hs_db, false); + } + } + + for (i = 0; i < mp->cnt; i++) { + p = g_array_index(mp->hs_pats, gchar *, i); + g_free(p); + } + + g_array_free(mp->hs_pats, TRUE); + g_array_free(mp->hs_ids, TRUE); + g_array_free(mp->hs_flags, TRUE); + free(mp); /* Due to posix_memalign */ + + return; + } +#endif + ac_trie_pat_t pat; + + if (mp->compiled && mp->cnt > 0) { + acism_destroy(mp->t); + } + + for (i = 0; i < mp->cnt; i++) { + pat = g_array_index(mp->pats, ac_trie_pat_t, i); + g_free((gchar *) pat.ptr); + } + + g_array_free(mp->pats, TRUE); + + g_free(mp); + } +} + +const gchar * +rspamd_multipattern_get_pattern(struct rspamd_multipattern *mp, + guint index) +{ + g_assert(mp != NULL); + g_assert(index < mp->cnt); + +#ifdef WITH_HYPERSCAN + if (rspamd_hs_check()) { + return g_array_index(mp->hs_pats, gchar *, index); + } +#endif + + ac_trie_pat_t pat; + + pat = g_array_index(mp->pats, ac_trie_pat_t, index); + + return pat.ptr; +} + +guint rspamd_multipattern_get_npatterns(struct rspamd_multipattern *mp) +{ + g_assert(mp != NULL); + + return mp->cnt; +} + +gboolean +rspamd_multipattern_has_hyperscan(void) +{ + return rspamd_hs_check(); +} |