diff options
Diffstat (limited to '')
-rw-r--r-- | src/libserver/url.c | 4365 |
1 files changed, 4365 insertions, 0 deletions
diff --git a/src/libserver/url.c b/src/libserver/url.c new file mode 100644 index 0000000..0842a1e --- /dev/null +++ b/src/libserver/url.c @@ -0,0 +1,4365 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "url.h" +#include "util.h" +#include "rspamd.h" +#include "message.h" +#include "multipattern.h" +#include "contrib/uthash/utlist.h" +#include "contrib/http-parser/http_parser.h" +#include <unicode/utf8.h> +#include <unicode/uchar.h> +#include <unicode/usprep.h> +#include <unicode/ucnv.h> + +typedef struct url_match_s { + const gchar *m_begin; + gsize m_len; + const gchar *pattern; + const gchar *prefix; + const gchar *newline_pos; + const gchar *prev_newline_pos; + gboolean add_prefix; + gchar st; +} url_match_t; + +#define URL_MATCHER_FLAG_NOHTML (1u << 0u) +#define URL_MATCHER_FLAG_TLD_MATCH (1u << 1u) +#define URL_MATCHER_FLAG_STAR_MATCH (1u << 2u) +#define URL_MATCHER_FLAG_REGEXP (1u << 3u) + +struct url_callback_data; + +static const struct { + enum rspamd_url_protocol proto; + const gchar *name; + gsize len; +} rspamd_url_protocols[] = { + {.proto = PROTOCOL_FILE, + .name = "file", + .len = 4}, + {.proto = PROTOCOL_FTP, + .name = "ftp", + .len = 3}, + {.proto = PROTOCOL_HTTP, + .name = "http", + .len = 4}, + {.proto = PROTOCOL_HTTPS, + .name = "https", + .len = 5}, + {.proto = PROTOCOL_MAILTO, + .name = "mailto", + .len = 6}, + {.proto = PROTOCOL_TELEPHONE, + .name = 
"tel", + .len = 3}, + {.proto = PROTOCOL_TELEPHONE, + .name = "callto", + .len = 3}, + {.proto = PROTOCOL_UNKNOWN, + .name = NULL, + .len = 0}}; +struct url_matcher { + const gchar *pattern; + const gchar *prefix; + + gboolean (*start)(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + + gboolean (*end)(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + + gint flags; +}; + +static gboolean url_file_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_file_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_web_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_web_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_tld_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_tld_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_email_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_email_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_tel_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_tel_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +struct url_matcher static_matchers[] = { + /* Common prefixes */ + {"file://", "", url_file_start, url_file_end, + 0}, + {"file:\\\\", "", url_file_start, url_file_end, + 0}, + {"ftp://", "", url_web_start, url_web_end, + 0}, + {"ftp:\\\\", "", url_web_start, url_web_end, + 0}, + {"sftp://", "", url_web_start, url_web_end, + 0}, + {"http:", "", url_web_start, url_web_end, + 0}, + {"https:", "", url_web_start, url_web_end, + 0}, + {"news://", "", url_web_start, url_web_end, + 0}, + {"nntp://", "", url_web_start, 
url_web_end, + 0}, + {"telnet://", "", url_web_start, url_web_end, + 0}, + {"tel:", "", url_tel_start, url_tel_end, + 0}, + {"webcal://", "", url_web_start, url_web_end, + 0}, + {"mailto:", "", url_email_start, url_email_end, + 0}, + {"callto:", "", url_tel_start, url_tel_end, + 0}, + {"h323:", "", url_web_start, url_web_end, + 0}, + {"sip:", "", url_web_start, url_web_end, + 0}, + {"www\\.[0-9a-z]", "http://", url_web_start, url_web_end, + URL_MATCHER_FLAG_REGEXP}, + {"ftp.", "ftp://", url_web_start, url_web_end, + 0}, + /* Likely emails */ + { + "@", "mailto://", url_email_start, url_email_end, + 0}}; + +struct rspamd_url_flag_name { + const gchar *name; + gint flag; + gint hash; +} url_flag_names[] = { + {"phished", RSPAMD_URL_FLAG_PHISHED, -1}, + {"numeric", RSPAMD_URL_FLAG_NUMERIC, -1}, + {"obscured", RSPAMD_URL_FLAG_OBSCURED, -1}, + {"redirected", RSPAMD_URL_FLAG_REDIRECTED, -1}, + {"html_displayed", RSPAMD_URL_FLAG_HTML_DISPLAYED, -1}, + {"text", RSPAMD_URL_FLAG_FROM_TEXT, -1}, + {"subject", RSPAMD_URL_FLAG_SUBJECT, -1}, + {"host_encoded", RSPAMD_URL_FLAG_HOSTENCODED, -1}, + {"schema_encoded", RSPAMD_URL_FLAG_SCHEMAENCODED, -1}, + {"path_encoded", RSPAMD_URL_FLAG_PATHENCODED, -1}, + {"query_encoded", RSPAMD_URL_FLAG_QUERYENCODED, -1}, + {"missing_slashes", RSPAMD_URL_FLAG_MISSINGSLASHES, -1}, + {"idn", RSPAMD_URL_FLAG_IDN, -1}, + {"has_port", RSPAMD_URL_FLAG_HAS_PORT, -1}, + {"has_user", RSPAMD_URL_FLAG_HAS_USER, -1}, + {"schemaless", RSPAMD_URL_FLAG_SCHEMALESS, -1}, + {"unnormalised", RSPAMD_URL_FLAG_UNNORMALISED, -1}, + {"zw_spaces", RSPAMD_URL_FLAG_ZW_SPACES, -1}, + {"url_displayed", RSPAMD_URL_FLAG_DISPLAY_URL, -1}, + {"image", RSPAMD_URL_FLAG_IMAGE, -1}, + {"query", RSPAMD_URL_FLAG_QUERY, -1}, + {"content", RSPAMD_URL_FLAG_CONTENT, -1}, + {"no_tld", RSPAMD_URL_FLAG_NO_TLD, -1}, + {"truncated", RSPAMD_URL_FLAG_TRUNCATED, -1}, + {"redirect_target", RSPAMD_URL_FLAG_REDIRECT_TARGET, -1}, + {"invisible", RSPAMD_URL_FLAG_INVISIBLE, -1}, + {"special", 
RSPAMD_URL_FLAG_SPECIAL, -1}, +}; + + +static inline khint_t rspamd_url_hash(struct rspamd_url *u); + +static inline khint_t rspamd_url_host_hash(struct rspamd_url *u); +static inline bool rspamd_urls_cmp(struct rspamd_url *a, struct rspamd_url *b); +static inline bool rspamd_urls_host_cmp(struct rspamd_url *a, struct rspamd_url *b); + +/* Hash table implementation */ +__KHASH_IMPL(rspamd_url_hash, kh_inline, struct rspamd_url *, char, false, + rspamd_url_hash, rspamd_urls_cmp); +__KHASH_IMPL(rspamd_url_host_hash, kh_inline, struct rspamd_url *, char, false, + rspamd_url_host_hash, rspamd_urls_host_cmp); + +struct url_callback_data { + const gchar *begin; + gchar *url_str; + rspamd_mempool_t *pool; + gint len; + enum rspamd_url_find_type how; + gboolean prefix_added; + guint newline_idx; + GArray *matchers; + GPtrArray *newlines; + const gchar *start; + const gchar *fin; + const gchar *end; + const gchar *last_at; + url_insert_function func; + void *funcd; +}; + +struct url_match_scanner { + GArray *matchers_full; + GArray *matchers_strict; + struct rspamd_multipattern *search_trie_full; + struct rspamd_multipattern *search_trie_strict; + bool has_tld_file; +}; + +struct url_match_scanner *url_scanner = NULL; + +enum { + IS_LWSP = (1 << 0), + IS_DOMAIN = (1 << 1), + IS_URLSAFE = (1 << 2), + IS_MAILSAFE = (1 << 3), + IS_DOMAIN_END = (1 << 4) +}; + +static const unsigned int url_scanner_table[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, IS_LWSP, IS_LWSP, IS_LWSP, IS_LWSP, IS_LWSP, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, IS_LWSP /* */, + IS_MAILSAFE /* ! */, IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* " */, + IS_MAILSAFE /* # */, IS_MAILSAFE /* $ */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* % */, 0 /* & */, IS_MAILSAFE /* ' */, + 0 /* ( */, 0 /* ) */, IS_MAILSAFE /* * */, + IS_MAILSAFE /* + */, IS_MAILSAFE /* , */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* - */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* . 
*/, IS_DOMAIN_END | IS_MAILSAFE /* / */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 0 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 1 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 2 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 3 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 4 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 5 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 6 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 7 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 8 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 9 */, IS_DOMAIN_END /* : */, + 0 /* ; */, IS_URLSAFE | IS_DOMAIN_END /* < */, 0 /* = */, + IS_URLSAFE | IS_DOMAIN_END /* > */, IS_DOMAIN_END /* ? */, 0 /* @ */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* A */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* B */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* C */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* D */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* E */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* F */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* G */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* H */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* I */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* J */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* K */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* L */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* M */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* N */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* O */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* P */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* Q */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* R */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* S */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* T */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* U */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* V */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* W */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* X */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* Y */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* Z */, 0 /* [ */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* \ */, 0 /* ] */, + 
IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* ^ */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* _ */, + IS_URLSAFE | IS_DOMAIN_END /* ` */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* a */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* b */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* c */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* d */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* e */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* f */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* g */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* h */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* i */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* j */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* k */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* l */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* m */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* n */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* o */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* p */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* q */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* r */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* s */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* t */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* u */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* v */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* w */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* x */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* y */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* z */, + IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* { */, + IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* | */, + IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* } */, + IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* ~ */, 0, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | 
IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | 
IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN}; + +#define is_lwsp(x) ((url_scanner_table[(guchar) (x)] & IS_LWSP) != 0) +#define is_mailsafe(x) ((url_scanner_table[(guchar) (x)] & (IS_MAILSAFE)) != 0) +#define is_domain(x) ((url_scanner_table[(guchar) (x)] & IS_DOMAIN) != 0) +#define is_urlsafe(x) ((url_scanner_table[(guchar) (x)] & (IS_URLSAFE)) != 0) + +const gchar * +rspamd_url_strerror(int err) +{ + switch (err) { + case URI_ERRNO_OK: + return "Parsing went well"; + case URI_ERRNO_EMPTY: + return "The URI string was empty"; + case URI_ERRNO_INVALID_PROTOCOL: + return "No protocol was found"; + case URI_ERRNO_BAD_FORMAT: + return "Bad URL format"; + case URI_ERRNO_BAD_ENCODING: + return "Invalid symbols encoded"; + case URI_ERRNO_INVALID_PORT: + return "Port number is bad"; + case URI_ERRNO_TLD_MISSING: + return "TLD part is not detected"; + case URI_ERRNO_HOST_MISSING: + return "Host part is missing"; + case URI_ERRNO_TOO_LONG: + return "URL is too long"; + } + + return NULL; +} + +static gboolean +rspamd_url_parse_tld_file(const gchar *fname, + struct url_match_scanner *scanner) +{ + FILE *f; + struct url_matcher m; + gchar *linebuf = NULL, *p; + gsize buflen = 0; + gssize r; + gint flags; + + f = fopen(fname, "r"); + + if (f == NULL) { + msg_err("cannot open TLD file 
%s: %s", fname, strerror(errno)); + return FALSE; + } + + m.end = url_tld_end; + m.start = url_tld_start; + m.prefix = "http://"; + + while ((r = getline(&linebuf, &buflen, f)) > 0) { + if (linebuf[0] == '/' || g_ascii_isspace(linebuf[0])) { + /* Skip comment or empty line */ + continue; + } + + g_strchomp(linebuf); + + /* TODO: add support for ! patterns */ + if (linebuf[0] == '!') { + msg_debug("skip '!' patterns from parsing for now: %s", linebuf); + continue; + } + + flags = URL_MATCHER_FLAG_NOHTML | URL_MATCHER_FLAG_TLD_MATCH; + + if (linebuf[0] == '*') { + flags |= URL_MATCHER_FLAG_STAR_MATCH; + p = strchr(linebuf, '.'); + + if (p == NULL) { + msg_err("got bad star line, skip it: %s", linebuf); + continue; + } + p++; + } + else { + p = linebuf; + } + + m.flags = flags; + rspamd_multipattern_add_pattern(url_scanner->search_trie_full, p, + RSPAMD_MULTIPATTERN_TLD | RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); + m.pattern = rspamd_multipattern_get_pattern(url_scanner->search_trie_full, + rspamd_multipattern_get_npatterns(url_scanner->search_trie_full) - 1); + + g_array_append_val(url_scanner->matchers_full, m); + } + + free(linebuf); + fclose(f); + + return TRUE; +} + +static void +rspamd_url_add_static_matchers(struct url_match_scanner *sc) +{ + gint n = G_N_ELEMENTS(static_matchers), i; + + for (i = 0; i < n; i++) { + if (static_matchers[i].flags & URL_MATCHER_FLAG_REGEXP) { + rspamd_multipattern_add_pattern(url_scanner->search_trie_strict, + static_matchers[i].pattern, + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8 | + RSPAMD_MULTIPATTERN_RE); + } + else { + rspamd_multipattern_add_pattern(url_scanner->search_trie_strict, + static_matchers[i].pattern, + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); + } + } + + g_array_append_vals(sc->matchers_strict, static_matchers, n); + + if (sc->matchers_full) { + for (i = 0; i < n; i++) { + if (static_matchers[i].flags & URL_MATCHER_FLAG_REGEXP) { + 
rspamd_multipattern_add_pattern(url_scanner->search_trie_full, + static_matchers[i].pattern, + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8 | + RSPAMD_MULTIPATTERN_RE); + } + else { + rspamd_multipattern_add_pattern(url_scanner->search_trie_full, + static_matchers[i].pattern, + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); + } + } + g_array_append_vals(sc->matchers_full, static_matchers, n); + } +} + +void rspamd_url_deinit(void) +{ + if (url_scanner != NULL) { + if (url_scanner->search_trie_full) { + rspamd_multipattern_destroy(url_scanner->search_trie_full); + g_array_free(url_scanner->matchers_full, TRUE); + } + + rspamd_multipattern_destroy(url_scanner->search_trie_strict); + g_array_free(url_scanner->matchers_strict, TRUE); + g_free(url_scanner); + + url_scanner = NULL; + } +} + +void rspamd_url_init(const gchar *tld_file) +{ + GError *err = NULL; + gboolean ret = TRUE; + + if (url_scanner != NULL) { + rspamd_url_deinit(); + } + + url_scanner = g_malloc(sizeof(struct url_match_scanner)); + + url_scanner->matchers_strict = g_array_sized_new(FALSE, TRUE, + sizeof(struct url_matcher), G_N_ELEMENTS(static_matchers)); + url_scanner->search_trie_strict = rspamd_multipattern_create_sized( + G_N_ELEMENTS(static_matchers), + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); + + if (tld_file) { + /* Reserve larger multipattern */ + url_scanner->matchers_full = g_array_sized_new(FALSE, TRUE, + sizeof(struct url_matcher), 13000); + url_scanner->search_trie_full = rspamd_multipattern_create_sized(13000, + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); + url_scanner->has_tld_file = true; + } + else { + url_scanner->matchers_full = NULL; + url_scanner->search_trie_full = NULL; + url_scanner->has_tld_file = false; + } + + rspamd_url_add_static_matchers(url_scanner); + + if (tld_file != NULL) { + ret = rspamd_url_parse_tld_file(tld_file, url_scanner); + } + + if (url_scanner->matchers_full && url_scanner->matchers_full->len > 1000) { + 
msg_info("start compiling of %d TLD suffixes; it might take a long time", + url_scanner->matchers_full->len); + } + + if (!rspamd_multipattern_compile(url_scanner->search_trie_strict, &err)) { + msg_err("cannot compile url matcher static patterns, fatal error: %e", err); + abort(); + } + + if (url_scanner->search_trie_full) { + if (!rspamd_multipattern_compile(url_scanner->search_trie_full, &err)) { + msg_err("cannot compile tld patterns, url matching will be " + "incomplete: %e", + err); + g_error_free(err); + ret = FALSE; + } + } + + if (tld_file != NULL) { + if (ret) { + msg_info("initialized %ud url match suffixes from '%s'", + url_scanner->matchers_full->len - url_scanner->matchers_strict->len, + tld_file); + } + else { + msg_err("failed to initialize url tld suffixes from '%s', " + "use %ud internal match suffixes", + tld_file, + url_scanner->matchers_strict->len); + } + } + + /* Generate hashes for flags */ + for (gint i = 0; i < G_N_ELEMENTS(url_flag_names); i++) { + url_flag_names[i].hash = + rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT, + url_flag_names[i].name, + strlen(url_flag_names[i].name), 0); + } + /* Ensure that we have no hashes collisions O(N^2) but this array is small */ + for (gint i = 0; i < G_N_ELEMENTS(url_flag_names) - 1; i++) { + for (gint j = i + 1; j < G_N_ELEMENTS(url_flag_names); j++) { + if (url_flag_names[i].hash == url_flag_names[j].hash) { + msg_err("collision: both %s and %s map to %d", + url_flag_names[i].name, url_flag_names[j].name, + url_flag_names[i].hash); + abort(); + } + } + } +} + +#define SET_U(u, field) \ + do { \ + if ((u) != NULL) { \ + (u)->field_set |= 1 << (field); \ + (u)->field_data[(field)].len = p - c; \ + (u)->field_data[(field)].off = c - str; \ + } \ + } while (0) + +static bool +is_url_start(gchar c) +{ + if (c == '(' || + c == '{' || + c == '[' || + c == '<' || + c == '\'') { + return TRUE; + } + + return FALSE; +} + +static bool +is_url_end(gchar c) +{ + if (c == ')' || + c 
== '}' || + c == ']' || + c == '>' || + c == '\'') { + return TRUE; + } + + return FALSE; +} + +static bool +is_domain_start(int p) +{ + if (g_ascii_isalnum(p) || + p == '[' || + p == '%' || + p == '_' || + (p & 0x80)) { + return TRUE; + } + + return FALSE; +} + +static const guint max_domain_length = 253; +static const guint max_dns_label = 63; +static const guint max_email_user = 64; + +static gint +rspamd_mailto_parse(struct http_parser_url *u, + const gchar *str, gsize len, + gchar const **end, + enum rspamd_url_parse_flags parse_flags, guint *flags) +{ + const gchar *p = str, *c = str, *last = str + len; + gchar t; + gint ret = 1; + enum { + parse_mailto, + parse_slash, + parse_slash_slash, + parse_semicolon, + parse_prefix_question, + parse_destination, + parse_equal, + parse_user, + parse_at, + parse_domain, + parse_suffix_question, + parse_query + } st = parse_mailto; + + if (u != NULL) { + memset(u, 0, sizeof(*u)); + } + + while (p < last) { + t = *p; + + if (p - str > max_email_user + max_domain_length + 1) { + goto out; + } + + switch (st) { + case parse_mailto: + if (t == ':') { + st = parse_semicolon; + SET_U(u, UF_SCHEMA); + } + p++; + break; + case parse_semicolon: + if (t == '/' || t == '\\') { + st = parse_slash; + p++; + } + else { + *flags |= RSPAMD_URL_FLAG_MISSINGSLASHES; + st = parse_slash_slash; + } + break; + case parse_slash: + if (t == '/' || t == '\\') { + st = parse_slash_slash; + } + else { + goto out; + } + p++; + break; + case parse_slash_slash: + if (t == '?') { + st = parse_prefix_question; + p++; + } + else if (t != '/' && t != '\\') { + c = p; + st = parse_user; + } + else { + /* Skip multiple slashes */ + p++; + } + break; + case parse_prefix_question: + if (t == 't') { + /* XXX: accept only to= */ + st = parse_destination; + } + else { + goto out; + } + break; + case parse_destination: + if (t == '=') { + st = parse_equal; + } + p++; + break; + case parse_equal: + c = p; + st = parse_user; + break; + case parse_user: + if (t == 
'@') { + if (p - c == 0) { + goto out; + } + SET_U(u, UF_USERINFO); + st = parse_at; + } + else if (!is_mailsafe(t)) { + goto out; + } + else if (p - c > max_email_user) { + goto out; + } + p++; + break; + case parse_at: + c = p; + st = parse_domain; + break; + case parse_domain: + if (t == '?') { + SET_U(u, UF_HOST); + st = parse_suffix_question; + } + else if (!is_domain(t) && t != '.' && t != '_') { + goto out; + } + else if (p - c > max_domain_length) { + goto out; + } + p++; + break; + case parse_suffix_question: + c = p; + st = parse_query; + break; + case parse_query: + if (t == '#') { + if (p - c != 0) { + SET_U(u, UF_QUERY); + } + c = p + 1; + ret = 0; + + goto out; + } + else if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) { + ret = 0; + goto out; + } + else if (is_lwsp(t)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + if (g_ascii_isspace(t)) { + ret = 0; + } + goto out; + } + else { + goto out; + } + } + p++; + break; + } + } + + if (st == parse_domain) { + if (p - c != 0) { + SET_U(u, UF_HOST); + ret = 0; + } + } + else if (st == parse_query) { + if (p - c > 0) { + SET_U(u, UF_QUERY); + } + + ret = 0; + } + +out: + if (end != NULL) { + *end = p; + } + + if ((parse_flags & RSPAMD_URL_PARSE_CHECK)) { + return 0; + } + + return ret; +} + +static gint +rspamd_telephone_parse(struct http_parser_url *u, + const gchar *str, gsize len, + gchar const **end, + enum rspamd_url_parse_flags parse_flags, + guint *flags) +{ + enum { + parse_protocol, + parse_semicolon, + parse_slash, + parse_slash_slash, + parse_spaces, + parse_plus, + parse_phone_start, + parse_phone, + } st = parse_protocol; + + const gchar *p = str, *c = str, *last = str + len; + gchar t; + gint ret = 1, i; + UChar32 uc; + + if (u != NULL) { + memset(u, 0, sizeof(*u)); + } + + while (p < last) { + t = *p; + + if (p - str > max_email_user) { + goto out; + } + + switch (st) { + case parse_protocol: + if (t == ':') { + st = parse_semicolon; + SET_U(u, UF_SCHEMA); + } + p++; + 
break; + case parse_semicolon: + if (t == '/' || t == '\\') { + st = parse_slash; + p++; + } + else { + st = parse_slash_slash; + } + break; + case parse_slash: + if (t == '/' || t == '\\') { + st = parse_slash_slash; + } + else { + goto out; + } + p++; + break; + case parse_slash_slash: + if (g_ascii_isspace(t)) { + st = parse_spaces; + p++; + } + else if (t == '+') { + c = p; + st = parse_plus; + } + else if (t == '/') { + /* Skip multiple slashes */ + p++; + } + else { + st = parse_phone_start; + c = p; + } + break; + case parse_spaces: + if (t == '+') { + c = p; + st = parse_plus; + } + else if (!g_ascii_isspace(t)) { + st = parse_phone_start; + c = p; + } + else { + p++; + } + break; + case parse_plus: + c = p; + p++; + st = parse_phone_start; + break; + case parse_phone_start: + if (*p == '%' || *p == '(' || g_ascii_isdigit(*p)) { + st = parse_phone; + p++; + } + else { + goto out; + } + break; + case parse_phone: + i = p - str; + U8_NEXT(str, i, len, uc); + p = str + i; + + if (u_isdigit(uc) || uc == '(' || uc == ')' || uc == '[' || uc == ']' || u_isspace(uc) || uc == '%') { + /* p is already incremented by U8_NEXT! 
*/ + } + else if (uc <= 0 || is_url_end(uc)) { + ret = 0; + goto set; + } + break; + } + } + +set: + if (st == parse_phone) { + if (p - c != 0) { + SET_U(u, UF_HOST); + ret = 0; + } + } + +out: + if (end != NULL) { + *end = p; + } + + if ((parse_flags & RSPAMD_URL_PARSE_CHECK)) { + return 0; + } + + return ret; +} + +static gint +rspamd_web_parse(struct http_parser_url *u, const gchar *str, gsize len, + gchar const **end, + enum rspamd_url_parse_flags parse_flags, + guint *flags) +{ + const gchar *p = str, *c = str, *last = str + len, *slash = NULL, + *password_start = NULL, *user_start = NULL; + gchar t = 0; + UChar32 uc; + glong pt; + gint ret = 1; + gboolean user_seen = FALSE; + enum { + parse_protocol, + parse_slash, + parse_slash_slash, + parse_semicolon, + parse_user, + parse_at, + parse_multiple_at, + parse_password_start, + parse_password, + parse_domain_start, + parse_domain, + parse_ipv6, + parse_port_password, + parse_port, + parse_suffix_slash, + parse_path, + parse_query, + parse_part + } st = parse_protocol; + + if (u != NULL) { + memset(u, 0, sizeof(*u)); + } + + while (p < last) { + t = *p; + + switch (st) { + case parse_protocol: + if (t == ':') { + st = parse_semicolon; + SET_U(u, UF_SCHEMA); + } + else if (!g_ascii_isalnum(t) && t != '+' && t != '-') { + if ((parse_flags & RSPAMD_URL_PARSE_CHECK) && p > c) { + /* We might have some domain, but no protocol */ + st = parse_domain_start; + p = c; + slash = c; + break; + } + else { + goto out; + } + } + p++; + break; + case parse_semicolon: + if (t == '/' || t == '\\') { + st = parse_slash; + p++; + } + else { + st = parse_slash_slash; + *(flags) |= RSPAMD_URL_FLAG_MISSINGSLASHES; + } + break; + case parse_slash: + if (t == '/' || t == '\\') { + st = parse_slash_slash; + } + else { + goto out; + } + p++; + break; + case parse_slash_slash: + + if (t != '/' && t != '\\') { + c = p; + slash = p; + st = parse_domain_start; + + /* + * Unfortunately, due to brain damage of the RFC 3986 authors, + * we have 
to distinguish two possibilities here: + * authority = [ userinfo "@" ] host [ ":" port ] + * So if we have @ somewhere before hostname then we must process + * with the username state. Otherwise, we have to process via + * the hostname state. Unfortunately, there is no way to distinguish + * them aside of running NFA or two DFA or performing lookahead. + * Lookahead approach looks easier to implement. + */ + + const char *tp = p; + while (tp < last) { + if (*tp == '@') { + user_seen = TRUE; + st = parse_user; + break; + } + else if (*tp == '/' || *tp == '#' || *tp == '?') { + st = parse_domain_start; + break; + } + + tp++; + } + + if (st == parse_domain_start && *p == '[') { + st = parse_ipv6; + p++; + c = p; + } + } + else { + /* Skip multiple slashes */ + p++; + } + break; + case parse_ipv6: + if (t == ']') { + if (p - c == 0) { + goto out; + } + SET_U(u, UF_HOST); + p++; + + if (*p == ':') { + st = parse_port; + c = p + 1; + } + else if (*p == '/' || *p == '\\') { + st = parse_path; + c = p + 1; + } + else if (*p == '?') { + st = parse_query; + c = p + 1; + } + else if (*p == '#') { + st = parse_part; + c = p + 1; + } + else if (p != last) { + goto out; + } + } + else if (!g_ascii_isxdigit(t) && t != ':' && t != '.') { + goto out; + } + p++; + break; + case parse_user: + if (t == ':') { + if (p - c == 0) { + goto out; + } + user_start = c; + st = parse_password_start; + } + else if (t == '@') { + /* No password */ + if (p - c == 0) { + /* We have multiple at in fact */ + st = parse_multiple_at; + user_seen = TRUE; + *flags |= RSPAMD_URL_FLAG_OBSCURED; + + continue; + } + + SET_U(u, UF_USERINFO); + *flags |= RSPAMD_URL_FLAG_HAS_USER; + st = parse_at; + } + else if (!g_ascii_isgraph(t)) { + goto out; + } + else if (p - c > max_email_user) { + goto out; + } + + p++; + break; + case parse_multiple_at: + if (t != '@') { + if (p - c == 0) { + goto out; + } + + /* For now, we ignore all that stuff as it is bogus */ + /* Off by one */ + p--; + SET_U(u, UF_USERINFO); + 
p++; + *flags |= RSPAMD_URL_FLAG_HAS_USER; + st = parse_at; + } + else { + p++; + } + break; + case parse_password_start: + if (t == '@') { + /* Empty password */ + SET_U(u, UF_USERINFO); + if (u != NULL && u->field_data[UF_USERINFO].len > 0) { + /* Eat semicolon */ + u->field_data[UF_USERINFO].len--; + } + *flags |= RSPAMD_URL_FLAG_HAS_USER; + st = parse_at; + } + else { + c = p; + password_start = p; + st = parse_password; + } + p++; + break; + case parse_password: + if (t == '@') { + /* XXX: password is not stored */ + if (u != NULL) { + if (u->field_data[UF_USERINFO].len == 0 && password_start && user_start && password_start > user_start + 1) { + *flags |= RSPAMD_URL_FLAG_HAS_USER; + u->field_set |= 1u << (UF_USERINFO); + u->field_data[UF_USERINFO].len = + password_start - user_start - 1; + u->field_data[UF_USERINFO].off = + user_start - str; + } + } + st = parse_at; + } + else if (!g_ascii_isgraph(t)) { + goto out; + } + else if (p - c > max_domain_length) { + goto out; + } + p++; + break; + case parse_at: + c = p; + + if (t == '@') { + *flags |= RSPAMD_URL_FLAG_OBSCURED; + p++; + } + else if (t == '[') { + st = parse_ipv6; + p++; + c = p; + } + else { + st = parse_domain_start; + } + break; + case parse_domain_start: + if (is_domain_start(t)) { + st = parse_domain; + } + else { + goto out; + } + break; + case parse_domain: + if (p - c > max_domain_length) { + /* Too large domain */ + goto out; + } + if (t == '/' || t == '\\' || t == ':' || t == '?' 
|| t == '#') { + if (p - c == 0) { + goto out; + } + if (t == '/' || t == '\\') { + SET_U(u, UF_HOST); + st = parse_suffix_slash; + } + else if (t == '?') { + SET_U(u, UF_HOST); + st = parse_query; + c = p + 1; + } + else if (t == '#') { + SET_U(u, UF_HOST); + st = parse_part; + c = p + 1; + } + else if (t == ':' && !user_seen) { + /* + * Here we can have both port and password, hence we need + * to apply some heuristic here + */ + st = parse_port_password; + } + else { + /* + * We can go only for parsing port here + */ + SET_U(u, UF_HOST); + st = parse_port; + c = p + 1; + } + p++; + } + else { + if (is_url_end(t) || is_url_start(t)) { + goto set; + } + else if (*p == '@' && !user_seen) { + /* We need to fallback and test user */ + p = slash; + user_seen = TRUE; + st = parse_user; + } + else if (*p != '.' && *p != '-' && *p != '_' && *p != '%') { + if (*p & 0x80) { + guint i = 0; + + U8_NEXT(((const guchar *) p), i, last - p, uc); + + if (uc < 0) { + /* Bad utf8 */ + goto out; + } + + if (!u_isalnum(uc)) { + /* Bad symbol */ + if (IS_ZERO_WIDTH_SPACE(uc)) { + (*flags) |= RSPAMD_URL_FLAG_ZW_SPACES; + } + else { + if (!u_isgraph(uc)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + goto out; + } + else { + goto set; + } + } + } + } + else { + (*flags) |= RSPAMD_URL_FLAG_IDN; + } + + p = p + i; + } + else if (is_urlsafe(*p)) { + p++; + } + else { + if (parse_flags & RSPAMD_URL_PARSE_HREF) { + /* We have to use all shit we are given here */ + p++; + (*flags) |= RSPAMD_URL_FLAG_OBSCURED; + } + else { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + goto out; + } + else { + goto set; + } + } + } + } + else { + p++; + } + } + break; + case parse_port_password: + if (g_ascii_isdigit(t)) { + const gchar *tmp = p; + + while (tmp < last) { + if (!g_ascii_isdigit(*tmp)) { + if (*tmp == '/' || *tmp == '#' || *tmp == '?' 
|| + is_url_end(*tmp) || g_ascii_isspace(*tmp)) { + /* Port + something */ + st = parse_port; + c = slash; + p--; + SET_U(u, UF_HOST); + p++; + c = p; + break; + } + else { + /* Not a port, bad character at the end */ + break; + } + } + tmp++; + } + + if (tmp == last) { + /* Host + port only */ + st = parse_port; + c = slash; + p--; + SET_U(u, UF_HOST); + p++; + c = p; + } + + if (st != parse_port) { + /* Fallback to user:password */ + p = slash; + c = slash; + user_seen = TRUE; + st = parse_user; + } + } + else { + /* Rewind back */ + p = slash; + c = slash; + user_seen = TRUE; + st = parse_user; + } + break; + case parse_port: + if (t == '/' || t == '\\') { + pt = strtoul(c, NULL, 10); + if (pt == 0 || pt > 65535) { + goto out; + } + if (u != NULL) { + u->port = pt; + *flags |= RSPAMD_URL_FLAG_HAS_PORT; + } + st = parse_suffix_slash; + } + else if (t == '?') { + pt = strtoul(c, NULL, 10); + if (pt == 0 || pt > 65535) { + goto out; + } + if (u != NULL) { + u->port = pt; + *flags |= RSPAMD_URL_FLAG_HAS_PORT; + } + + c = p + 1; + st = parse_query; + } + else if (t == '#') { + pt = strtoul(c, NULL, 10); + if (pt == 0 || pt > 65535) { + goto out; + } + if (u != NULL) { + u->port = pt; + *flags |= RSPAMD_URL_FLAG_HAS_PORT; + } + + c = p + 1; + st = parse_part; + } + else if (is_url_end(t)) { + goto set; + } + else if (!g_ascii_isdigit(t)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK) || + !g_ascii_isspace(t)) { + goto out; + } + else { + goto set; + } + } + p++; + break; + case parse_suffix_slash: + if (t != '/' && t != '\\') { + c = p; + st = parse_path; + } + else { + /* Skip extra slashes */ + p++; + } + break; + case parse_path: + if (t == '?') { + if (p - c != 0) { + SET_U(u, UF_PATH); + } + c = p + 1; + st = parse_query; + } + else if (t == '#') { + /* No query, just fragment */ + if (p - c != 0) { + SET_U(u, UF_PATH); + } + c = p + 1; + st = parse_part; + } + else if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) { + goto set; + } + else if 
(is_lwsp(t)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + if (g_ascii_isspace(t)) { + goto set; + } + goto out; + } + else { + goto set; + } + } + p++; + break; + case parse_query: + if (t == '#') { + if (p - c != 0) { + SET_U(u, UF_QUERY); + } + c = p + 1; + st = parse_part; + } + else if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) { + goto set; + } + else if (is_lwsp(t)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + if (g_ascii_isspace(t)) { + goto set; + } + goto out; + } + else { + goto set; + } + } + p++; + break; + case parse_part: + if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) { + goto set; + } + else if (is_lwsp(t)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + if (g_ascii_isspace(t)) { + goto set; + } + goto out; + } + else { + goto set; + } + } + p++; + break; + } + } + +set: + /* Parse remaining */ + switch (st) { + case parse_domain: + if (p - c == 0 || !is_domain(*(p - 1)) || !is_domain(*c)) { + goto out; + } + SET_U(u, UF_HOST); + ret = 0; + + break; + case parse_port: + pt = strtoul(c, NULL, 10); + if (pt == 0 || pt > 65535) { + goto out; + } + if (u != NULL) { + u->port = pt; + } + + ret = 0; + break; + case parse_suffix_slash: + /* Url ends with '/' */ + ret = 0; + break; + case parse_path: + if (p - c > 0) { + SET_U(u, UF_PATH); + } + ret = 0; + break; + case parse_query: + if (p - c > 0) { + SET_U(u, UF_QUERY); + } + ret = 0; + break; + case parse_part: + if (p - c > 0) { + SET_U(u, UF_FRAGMENT); + } + ret = 0; + break; + case parse_ipv6: + if (t != ']') { + ret = 1; + } + else { + /* e.g. 
http://[::] */ + ret = 0; + } + break; + default: + /* Error state */ + ret = 1; + break; + } +out: + if (end != NULL) { + *end = p; + } + + return ret; +} + +#undef SET_U + +static gint +rspamd_tld_trie_callback(struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) +{ + struct url_matcher *matcher; + const gchar *start, *pos, *p; + struct rspamd_url *url = context; + gint ndots; + + matcher = &g_array_index(url_scanner->matchers_full, struct url_matcher, + strnum); + ndots = 1; + + if (matcher->flags & URL_MATCHER_FLAG_STAR_MATCH) { + /* Skip one more tld component */ + ndots++; + } + + pos = text + match_start; + p = pos - 1; + start = rspamd_url_host_unsafe(url); + + if (*pos != '.' || match_pos != (gint) url->hostlen) { + /* Something weird has been found */ + if (match_pos == (gint) url->hostlen - 1) { + pos = rspamd_url_host_unsafe(url) + match_pos; + if (*pos == '.') { + /* This is dot at the end of domain */ + url->hostlen--; + } + else { + return 0; + } + } + else { + return 0; + } + } + + /* Now we need to find top level domain */ + pos = start; + while (p >= start && ndots > 0) { + if (*p == '.') { + ndots--; + pos = p + 1; + } + else { + pos = p; + } + + p--; + } + + if ((ndots == 0 || p == start - 1) && + url->tldlen < rspamd_url_host_unsafe(url) + url->hostlen - pos) { + url->tldshift = (pos - url->string); + url->tldlen = rspamd_url_host_unsafe(url) + url->hostlen - pos; + } + + return 0; +} + +static void +rspamd_url_regen_from_inet_addr(struct rspamd_url *uri, const void *addr, int af, + rspamd_mempool_t *pool) +{ + gchar *strbuf, *p; + const gchar *start_offset; + gsize slen = uri->urllen - uri->hostlen; + goffset r = 0; + + if (af == AF_INET) { + slen += INET_ADDRSTRLEN; + } + else { + slen += INET6_ADDRSTRLEN; + } + + if (uri->flags & RSPAMD_URL_FLAG_HAS_PORT) { + slen += sizeof("65535") - 1; + } + + /* Allocate new string to build it from IP */ + strbuf = 
rspamd_mempool_alloc(pool, slen + 1); + r += rspamd_snprintf(strbuf + r, slen - r, "%*s", + (gint) (uri->hostshift), + uri->string); + + uri->hostshift = r; + uri->tldshift = r; + start_offset = strbuf + r; + inet_ntop(af, addr, strbuf + r, slen - r + 1); + uri->hostlen = strlen(start_offset); + r += uri->hostlen; + uri->tldlen = uri->hostlen; + uri->flags |= RSPAMD_URL_FLAG_NUMERIC; + + /* Reconstruct URL */ + if (uri->flags & RSPAMD_URL_FLAG_HAS_PORT && uri->ext) { + p = strbuf + r; + start_offset = p + 1; + r += rspamd_snprintf(strbuf + r, slen - r, ":%ud", + (unsigned int) uri->ext->port); + } + if (uri->datalen > 0) { + p = strbuf + r; + start_offset = p + 1; + r += rspamd_snprintf(strbuf + r, slen - r, "/%*s", + (gint) uri->datalen, + rspamd_url_data_unsafe(uri)); + uri->datashift = start_offset - strbuf; + } + else { + /* Add trailing slash if needed */ + if (uri->hostlen + uri->hostshift < uri->urllen && + *(rspamd_url_host_unsafe(uri) + uri->hostlen) == '/') { + r += rspamd_snprintf(strbuf + r, slen - r, "/"); + } + } + + if (uri->querylen > 0) { + p = strbuf + r; + start_offset = p + 1; + r += rspamd_snprintf(strbuf + r, slen - r, "?%*s", + (gint) uri->querylen, + rspamd_url_query_unsafe(uri)); + uri->queryshift = start_offset - strbuf; + } + if (uri->fragmentlen > 0) { + p = strbuf + r; + start_offset = p + 1; + r += rspamd_snprintf(strbuf + r, slen - r, "#%*s", + (gint) uri->fragmentlen, + rspamd_url_fragment_unsafe(uri)); + uri->fragmentshift = start_offset - strbuf; + } + + uri->string = strbuf; + uri->urllen = r; +} + +static gboolean +rspamd_url_maybe_regenerate_from_ip(struct rspamd_url *uri, rspamd_mempool_t *pool) +{ + const gchar *p, *end, *c; + gchar *errstr; + struct in_addr in4; + struct in6_addr in6; + gboolean ret = FALSE, check_num = TRUE; + guint32 n, dots, t = 0, i = 0, shift, nshift; + + p = rspamd_url_host_unsafe(uri); + end = p + uri->hostlen; + + if (*p == '[' && *(end - 1) == ']') { + p++; + end--; + } + + while (*(end - 1) == '.' 
&& end > p) { + end--; + } + + if (end - p == 0 || end - p > INET6_ADDRSTRLEN) { + return FALSE; + } + + if (rspamd_str_has_8bit(p, end - p)) { + return FALSE; + } + + if (rspamd_parse_inet_address_ip4(p, end - p, &in4)) { + rspamd_url_regen_from_inet_addr(uri, &in4, AF_INET, pool); + ret = TRUE; + } + else if (rspamd_parse_inet_address_ip6(p, end - p, &in6)) { + rspamd_url_regen_from_inet_addr(uri, &in6, AF_INET6, pool); + ret = TRUE; + } + else { + /* Heuristics for broken urls */ + gchar buf[INET6_ADDRSTRLEN + 1]; + /* Try also numeric notation */ + c = p; + n = 0; + dots = 0; + shift = 0; + + while (p <= end && check_num) { + if (shift < 32 && + ((*p == '.' && dots < 3) || (p == end && dots <= 3))) { + if (p - c + 1 >= (gint) sizeof(buf)) { + msg_debug_pool("invalid numeric url %*.s...: too long", + INET6_ADDRSTRLEN, c); + return FALSE; + } + + rspamd_strlcpy(buf, c, p - c + 1); + c = p + 1; + + if (p < end && *p == '.') { + dots++; + } + + glong long_n = strtol(buf, &errstr, 0); + + if ((errstr == NULL || *errstr == '\0') && long_n >= 0) { + + t = long_n; /* Truncate as windows does */ + /* + * Even if we have zero, we need to shift by 1 octet + */ + nshift = (t == 0 ? 
shift + 8 : shift); + + /* + * Here we count number of octets encoded in this element + */ + for (i = 0; i < 4; i++) { + if ((t >> (8 * i)) > 0) { + nshift += 8; + } + else { + break; + } + } + /* + * Here we need to find the proper shift of the previous + * components, so we check possible cases: + * 1) 1 octet - just use it applying shift + * 2) 2 octets - convert to big endian 16 bit number + * 3) 3 octets - convert to big endian 24 bit number + * 4) 4 octets - convert to big endian 32 bit number + */ + switch (i) { + case 4: + t = GUINT32_TO_BE(t); + break; + case 3: + t = (GUINT32_TO_BE(t & 0xFFFFFFU)) >> 8; + break; + case 2: + t = GUINT16_TO_BE(t & 0xFFFFU); + break; + default: + t = t & 0xFF; + break; + } + + if (p != end) { + n |= t << shift; + + shift = nshift; + } + } + else { + check_num = FALSE; + } + } + + p++; + } + + /* The last component should be last according to url normalization: + * 192.168.1 -> 192.168.0.1 + * 192 -> 0.0.0.192 + * 192.168 -> 192.0.0.168 + */ + shift = 8 * (4 - i); + + if (shift < 32) { + n |= t << shift; + } + + if (check_num) { + if (dots <= 4) { + memcpy(&in4, &n, sizeof(in4)); + rspamd_url_regen_from_inet_addr(uri, &in4, AF_INET, pool); + uri->flags |= RSPAMD_URL_FLAG_OBSCURED; + ret = TRUE; + } + else if (end - c > (gint) sizeof(buf) - 1) { + rspamd_strlcpy(buf, c, end - c + 1); + + if (inet_pton(AF_INET6, buf, &in6) == 1) { + rspamd_url_regen_from_inet_addr(uri, &in6, AF_INET6, pool); + uri->flags |= RSPAMD_URL_FLAG_OBSCURED; + ret = TRUE; + } + } + } + } + + return ret; +} + +static void +rspamd_url_shift(struct rspamd_url *uri, gsize nlen, + enum http_parser_url_fields field) +{ + guint old_shift, shift = 0; + gint remain; + + /* Shift remaining data */ + switch (field) { + case UF_SCHEMA: + if (nlen >= uri->protocollen) { + return; + } + else { + shift = uri->protocollen - nlen; + } + + old_shift = uri->protocollen; + uri->protocollen -= shift; + remain = uri->urllen - uri->protocollen; + g_assert(remain >= 0); + 
memmove(uri->string + uri->protocollen, uri->string + old_shift, + remain); + uri->urllen -= shift; + uri->flags |= RSPAMD_URL_FLAG_SCHEMAENCODED; + break; + case UF_HOST: + if (nlen >= uri->hostlen) { + return; + } + else { + shift = uri->hostlen - nlen; + } + + old_shift = uri->hostlen; + uri->hostlen -= shift; + remain = (uri->urllen - (uri->hostshift)) - old_shift; + g_assert(remain >= 0); + memmove(rspamd_url_host_unsafe(uri) + uri->hostlen, + rspamd_url_host_unsafe(uri) + old_shift, + remain); + uri->urllen -= shift; + uri->flags |= RSPAMD_URL_FLAG_HOSTENCODED; + break; + case UF_PATH: + if (nlen >= uri->datalen) { + return; + } + else { + shift = uri->datalen - nlen; + } + + old_shift = uri->datalen; + uri->datalen -= shift; + remain = (uri->urllen - (uri->datashift)) - old_shift; + g_assert(remain >= 0); + memmove(rspamd_url_data_unsafe(uri) + uri->datalen, + rspamd_url_data_unsafe(uri) + old_shift, + remain); + uri->urllen -= shift; + uri->flags |= RSPAMD_URL_FLAG_PATHENCODED; + break; + case UF_QUERY: + if (nlen >= uri->querylen) { + return; + } + else { + shift = uri->querylen - nlen; + } + + old_shift = uri->querylen; + uri->querylen -= shift; + remain = (uri->urllen - (uri->queryshift)) - old_shift; + g_assert(remain >= 0); + memmove(rspamd_url_query_unsafe(uri) + uri->querylen, + rspamd_url_query_unsafe(uri) + old_shift, + remain); + uri->urllen -= shift; + uri->flags |= RSPAMD_URL_FLAG_QUERYENCODED; + break; + case UF_FRAGMENT: + if (nlen >= uri->fragmentlen) { + return; + } + else { + shift = uri->fragmentlen - nlen; + } + + uri->fragmentlen -= shift; + uri->urllen -= shift; + break; + default: + break; + } + + /* Now adjust lengths and offsets */ + switch (field) { + case UF_SCHEMA: + if (uri->userlen > 0) { + uri->usershift -= shift; + } + if (uri->hostlen > 0) { + uri->hostshift -= shift; + } + /* Go forward */ + /* FALLTHRU */ + case UF_HOST: + if (uri->datalen > 0) { + uri->datashift -= shift; + } + /* Go forward */ + /* FALLTHRU */ + case 
UF_PATH: + if (uri->querylen > 0) { + uri->queryshift -= shift; + } + /* Go forward */ + /* FALLTHRU */ + case UF_QUERY: + if (uri->fragmentlen > 0) { + uri->fragmentshift -= shift; + } + /* Go forward */ + /* FALLTHRU */ + case UF_FRAGMENT: + default: + break; + } +} + +/* + * Compacts the host part of a telephone url in place: an optional leading '+' + * is kept, then only code points for which u_isdigit() is true are copied + * forward. hostlen and urllen are reduced by the number of bytes dropped. + */ +static void +rspamd_telephone_normalise_inplace(struct rspamd_url *uri) +{ + gchar *t, *h, *end; + gint i = 0, w, orig_len; + UChar32 uc; + + t = rspamd_url_host_unsafe(uri); + h = t; + end = t + uri->hostlen; + orig_len = uri->hostlen; + + if (*h == '+') { + h++; + t++; + } + + while (h < end) { + i = 0; + U8_NEXT(h, i, end - h, uc); + + if (u_isdigit(uc)) { + w = 0; + U8_APPEND_UNSAFE(t, w, uc); + t += w; + } + + h += i; + } + + uri->hostlen = t - rspamd_url_host_unsafe(uri); + uri->urllen -= (orig_len - uri->hostlen); +} + +/* + * Returns true if `ch` is one of the code points handled here as an + * alternative label dot: U+3002, U+FF0E or U+FF61. + * The argument is a full UChar32: callers pass the value decoded by U8_NEXT, + * and a 16-bit parameter would truncate code points above U+FFFF (for example + * U+13002 would be seen as U+3002 and wrongly reported as a dot). + */ +static inline bool +is_idna_label_dot(UChar32 ch) +{ + switch (ch) { + case 0x3002: + case 0xFF0E: + case 0xFF61: + return true; + default: + return false; + } +} + +/* + * All credits for this investigation should go to + * Dr. Hajime Shimada and Mr. Shirakura as they have revealed this case in their + * research. + */ + +/* + * This function replaces unsafe IDNA dots in host labels. Unfortunately, + * IDNA extends dot definition from '.' to multiple other characters that + * should be treated equally. + * This function replaces such dots and returns `true` if these dots are found. + * In this case, it should be treated as obfuscation attempt.
+ */ +static bool +rspamd_url_remove_dots(struct rspamd_url *uri) +{ + const gchar *hstart = rspamd_url_host_unsafe(uri); + gchar *t; + UChar32 uc; + gint i = 0, hlen; + bool ret = false; + + if (uri->hostlen == 0) { + return false; + } + + hlen = uri->hostlen; + t = rspamd_url_host_unsafe(uri); + + while (i < hlen) { + gint prev_i = i; + U8_NEXT(hstart, i, hlen, uc); + + if (is_idna_label_dot(uc)) { + *t++ = '.'; + ret = true; + } + else { + if (ret) { + /* We have to shift the remaining stuff */ + while (prev_i < i) { + *t++ = *(hstart + prev_i); + prev_i++; + } + } + else { + t += (i - prev_i); + } + } + } + + if (ret) { + rspamd_url_shift(uri, t - hstart, UF_HOST); + } + + return ret; +} + +enum uri_errno +rspamd_url_parse(struct rspamd_url *uri, + gchar *uristring, gsize len, + rspamd_mempool_t *pool, + enum rspamd_url_parse_flags parse_flags) +{ + struct http_parser_url u; + gchar *p; + const gchar *end; + guint complen, ret, flags = 0; + gsize unquoted_len = 0; + + memset(uri, 0, sizeof(*uri)); + memset(&u, 0, sizeof(u)); + uri->count = 1; + /* Undefine order */ + uri->order = -1; + uri->part_order = -1; + + if (*uristring == '\0') { + return URI_ERRNO_EMPTY; + } + + if (len >= G_MAXUINT16 / 2) { + flags |= RSPAMD_URL_FLAG_TRUNCATED; + len = G_MAXUINT16 / 2; + } + + p = uristring; + uri->protocol = PROTOCOL_UNKNOWN; + + if (len > sizeof("mailto:") - 1) { + /* For mailto: urls we also need to add slashes to make it a valid URL */ + if (g_ascii_strncasecmp(p, "mailto:", sizeof("mailto:") - 1) == 0) { + ret = rspamd_mailto_parse(&u, uristring, len, &end, parse_flags, + &flags); + } + else if (g_ascii_strncasecmp(p, "tel:", sizeof("tel:") - 1) == 0 || + g_ascii_strncasecmp(p, "callto:", sizeof("callto:") - 1) == 0) { + ret = rspamd_telephone_parse(&u, uristring, len, &end, parse_flags, + &flags); + uri->protocol = PROTOCOL_TELEPHONE; + } + else { + ret = rspamd_web_parse(&u, uristring, len, &end, parse_flags, + &flags); + } + } + else { + ret = 
rspamd_web_parse(&u, uristring, len, &end, parse_flags, &flags); + } + + if (ret != 0) { + return URI_ERRNO_BAD_FORMAT; + } + + if (end > uristring && (guint) (end - uristring) != len) { + len = end - uristring; + } + + uri->raw = p; + uri->rawlen = len; + + if (flags & RSPAMD_URL_FLAG_MISSINGSLASHES) { + len += 2; + uri->string = rspamd_mempool_alloc(pool, len + 1); + memcpy(uri->string, p, u.field_data[UF_SCHEMA].len); + memcpy(uri->string + u.field_data[UF_SCHEMA].len, "://", 3); + rspamd_strlcpy(uri->string + u.field_data[UF_SCHEMA].len + 3, + p + u.field_data[UF_SCHEMA].len + 1, + len - 2 - u.field_data[UF_SCHEMA].len); + /* Compensate slashes added */ + for (int i = UF_SCHEMA + 1; i < UF_MAX; i++) { + if (u.field_set & (1 << i)) { + u.field_data[i].off += 2; + } + } + } + else { + uri->string = rspamd_mempool_alloc(pool, len + 1); + rspamd_strlcpy(uri->string, p, len + 1); + } + + uri->urllen = len; + uri->flags = flags; + + for (guint i = 0; i < UF_MAX; i++) { + if (u.field_set & (1 << i)) { + guint shift = u.field_data[i].off; + complen = u.field_data[i].len; + + if (complen >= G_MAXUINT16) { + /* Too large component length */ + return URI_ERRNO_BAD_FORMAT; + } + + switch (i) { + case UF_SCHEMA: + uri->protocollen = u.field_data[i].len; + break; + case UF_HOST: + uri->hostshift = shift; + uri->hostlen = complen; + break; + case UF_PATH: + uri->datashift = shift; + uri->datalen = complen; + break; + case UF_QUERY: + uri->queryshift = shift; + uri->querylen = complen; + break; + case UF_FRAGMENT: + uri->fragmentshift = shift; + uri->fragmentlen = complen; + break; + case UF_USERINFO: + uri->usershift = shift; + uri->userlen = complen; + break; + default: + break; + } + } + } + + /* Port is 'special' in case of url_parser as it is not a part of UF_* macro logic */ + if (u.port != 0) { + if (!uri->ext) { + uri->ext = rspamd_mempool_alloc0_type(pool, struct rspamd_url_ext); + } + uri->flags |= RSPAMD_URL_FLAG_HAS_PORT; + uri->ext->port = u.port; + } + + if 
(!uri->hostlen) { + return URI_ERRNO_HOST_MISSING; + } + + /* Now decode url symbols */ + unquoted_len = rspamd_url_decode(uri->string, + uri->string, + uri->protocollen); + rspamd_url_shift(uri, unquoted_len, UF_SCHEMA); + unquoted_len = rspamd_url_decode(rspamd_url_host_unsafe(uri), + rspamd_url_host_unsafe(uri), uri->hostlen); + + rspamd_url_normalise_propagate_flags(pool, rspamd_url_host_unsafe(uri), + &unquoted_len, uri->flags); + + rspamd_url_shift(uri, unquoted_len, UF_HOST); + + if (rspamd_url_remove_dots(uri)) { + uri->flags |= RSPAMD_URL_FLAG_OBSCURED; + } + + if (uri->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_MAILTO | PROTOCOL_FTP | PROTOCOL_FILE)) { + /* Ensure that hostname starts with something sane (exclude numeric urls) */ + const gchar *host = rspamd_url_host_unsafe(uri); + + if (!(is_domain_start(host[0]) || host[0] == ':')) { + return URI_ERRNO_BAD_FORMAT; + } + } + + /* Apply nameprep algorithm */ + static UStringPrepProfile *nameprep = NULL; + UErrorCode uc_err = U_ZERO_ERROR; + + if (nameprep == NULL) { + /* Open and cache profile */ + nameprep = usprep_openByType(USPREP_RFC3491_NAMEPREP, &uc_err); + + g_assert(U_SUCCESS(uc_err)); + } + + UChar *utf16_hostname, *norm_utf16; + gint32 utf16_len, norm_utf16_len, norm_utf8_len; + UParseError parse_error; + + utf16_hostname = rspamd_mempool_alloc(pool, uri->hostlen * sizeof(UChar)); + struct UConverter *utf8_conv = rspamd_get_utf8_converter(); + + utf16_len = ucnv_toUChars(utf8_conv, utf16_hostname, uri->hostlen, + rspamd_url_host_unsafe(uri), uri->hostlen, &uc_err); + + if (!U_SUCCESS(uc_err)) { + + return URI_ERRNO_BAD_FORMAT; + } + + norm_utf16 = rspamd_mempool_alloc(pool, utf16_len * sizeof(UChar)); + norm_utf16_len = usprep_prepare(nameprep, utf16_hostname, utf16_len, + norm_utf16, utf16_len, USPREP_DEFAULT, &parse_error, &uc_err); + + if (!U_SUCCESS(uc_err)) { + + return URI_ERRNO_BAD_FORMAT; + } + + /* Convert back to utf8, sigh... 
*/ + norm_utf8_len = ucnv_fromUChars(utf8_conv, + rspamd_url_host_unsafe(uri), uri->hostlen, + norm_utf16, norm_utf16_len, &uc_err); + + if (!U_SUCCESS(uc_err)) { + + return URI_ERRNO_BAD_FORMAT; + } + + /* Final shift of lengths */ + rspamd_url_shift(uri, norm_utf8_len, UF_HOST); + + /* Process data part */ + if (uri->datalen) { + unquoted_len = rspamd_url_decode(rspamd_url_data_unsafe(uri), + rspamd_url_data_unsafe(uri), uri->datalen); + + rspamd_url_normalise_propagate_flags(pool, rspamd_url_data_unsafe(uri), + &unquoted_len, uri->flags); + + rspamd_url_shift(uri, unquoted_len, UF_PATH); + /* We now normalize path */ + rspamd_normalize_path_inplace(rspamd_url_data_unsafe(uri), + uri->datalen, &unquoted_len); + rspamd_url_shift(uri, unquoted_len, UF_PATH); + } + + if (uri->querylen) { + unquoted_len = rspamd_url_decode(rspamd_url_query_unsafe(uri), + rspamd_url_query_unsafe(uri), + uri->querylen); + + rspamd_url_normalise_propagate_flags(pool, rspamd_url_query_unsafe(uri), + &unquoted_len, uri->flags); + rspamd_url_shift(uri, unquoted_len, UF_QUERY); + } + + if (uri->fragmentlen) { + unquoted_len = rspamd_url_decode(rspamd_url_fragment_unsafe(uri), + rspamd_url_fragment_unsafe(uri), + uri->fragmentlen); + + rspamd_url_normalise_propagate_flags(pool, rspamd_url_fragment_unsafe(uri), + &unquoted_len, uri->flags); + rspamd_url_shift(uri, unquoted_len, UF_FRAGMENT); + } + + rspamd_str_lc(uri->string, uri->protocollen); + unquoted_len = rspamd_str_lc_utf8(rspamd_url_host_unsafe(uri), uri->hostlen); + rspamd_url_shift(uri, unquoted_len, UF_HOST); + + if (uri->protocol == PROTOCOL_UNKNOWN) { + for (int i = 0; i < G_N_ELEMENTS(rspamd_url_protocols); i++) { + if (uri->protocollen == rspamd_url_protocols[i].len) { + if (memcmp(uri->string, + rspamd_url_protocols[i].name, uri->protocollen) == 0) { + uri->protocol = rspamd_url_protocols[i].proto; + break; + } + } + } + } + + if (uri->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_MAILTO | PROTOCOL_FTP | 
PROTOCOL_FILE)) { + /* Find TLD part */ + if (url_scanner->search_trie_full) { + rspamd_multipattern_lookup(url_scanner->search_trie_full, + rspamd_url_host_unsafe(uri), uri->hostlen, + rspamd_tld_trie_callback, uri, NULL); + } + + if (uri->tldlen == 0) { + /* + * If we have not detected eSLD, but there are no dots in the hostname, + * then we should treat the whole hostname as eSLD - a rule of thumb + * + * We also check that a hostname ends with a permitted character, and all characters are forming + * DNS label. We also need to check for a numeric IP within this check. + */ + const char *dot_pos = memchr(rspamd_url_host_unsafe(uri), '.', uri->hostlen); + bool is_whole_hostname_tld = false; + + if (uri->hostlen > 0 && (dot_pos == NULL || dot_pos == rspamd_url_host_unsafe(uri) + uri->hostlen - 1)) { + bool all_chars_domain = true; + + for (int i = 0; i < uri->hostlen; i++) { + if (!is_domain(rspamd_url_host_unsafe(uri)[i])) { + all_chars_domain = false; + break; + } + } + + char last_c = rspamd_url_host_unsafe(uri)[uri->hostlen - 1]; + + if (all_chars_domain) { + /* Also check the last character to be either a dot or alphanumeric character */ + if (last_c != '.' && !g_ascii_isalnum(last_c)) { + all_chars_domain = false; + } + } + + if (all_chars_domain) { + /* Additionally check for a numeric IP as we can have some number here... */ + rspamd_url_maybe_regenerate_from_ip(uri, pool); + + if (last_c == '.' 
&& uri->hostlen > 1) { + /* Skip the last dot */ + uri->tldlen = uri->hostlen - 1; + } + else { + uri->tldlen = uri->hostlen; + } + + uri->tldshift = uri->hostshift; + is_whole_hostname_tld = true; + } + } + + if (!is_whole_hostname_tld) { + if (uri->protocol != PROTOCOL_MAILTO) { + if (url_scanner->has_tld_file && !(parse_flags & RSPAMD_URL_PARSE_HREF)) { + /* Ignore URL's without TLD if it is not a numeric URL */ + if (!rspamd_url_maybe_regenerate_from_ip(uri, pool)) { + return URI_ERRNO_TLD_MISSING; + } + } + else { + if (!rspamd_url_maybe_regenerate_from_ip(uri, pool)) { + /* Assume tld equal to host */ + uri->tldshift = uri->hostshift; + uri->tldlen = uri->hostlen; + } + else if (uri->flags & RSPAMD_URL_FLAG_SCHEMALESS) { + /* Ignore urls with both no schema and no tld */ + return URI_ERRNO_TLD_MISSING; + } + + uri->flags |= RSPAMD_URL_FLAG_NO_TLD; + } + } + else { + /* Ignore IP like domains for mailto, as it is really never supported */ + return URI_ERRNO_TLD_MISSING; + } + } + } + + /* Replace stupid '\' with '/' after schema */ + if (uri->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_FTP) && + uri->protocollen > 0 && uri->urllen > uri->protocollen + 2) { + + gchar *pos = &uri->string[uri->protocollen], + *host_start = rspamd_url_host_unsafe(uri); + + while (pos < host_start) { + if (*pos == '\\') { + *pos = '/'; + uri->flags |= RSPAMD_URL_FLAG_OBSCURED; + } + pos++; + } + } + } + else if (uri->protocol & PROTOCOL_TELEPHONE) { + /* We need to normalise phone number: remove all spaces and braces */ + rspamd_telephone_normalise_inplace(uri); + + if (rspamd_url_host_unsafe(uri)[0] == '+') { + uri->tldshift = uri->hostshift + 1; + uri->tldlen = uri->hostlen - 1; + } + else { + uri->tldshift = uri->hostshift; + uri->tldlen = uri->hostlen; + } + } + + if (uri->protocol == PROTOCOL_UNKNOWN) { + if (!(parse_flags & RSPAMD_URL_PARSE_HREF)) { + return URI_ERRNO_INVALID_PROTOCOL; + } + else { + /* Hack, hack, hack */ + uri->protocol = PROTOCOL_UNKNOWN; + } + } 
+ + return URI_ERRNO_OK; +} + +struct tld_trie_cbdata { + const gchar *begin; + gsize len; + rspamd_ftok_t *out; +}; + +static gint +rspamd_tld_trie_find_callback(struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) +{ + struct url_matcher *matcher; + const gchar *start, *pos, *p; + struct tld_trie_cbdata *cbdata = context; + gint ndots = 1; + + matcher = &g_array_index(url_scanner->matchers_full, struct url_matcher, + strnum); + + if (matcher->flags & URL_MATCHER_FLAG_STAR_MATCH) { + /* Skip one more tld component */ + ndots = 2; + } + + pos = text + match_start; + p = pos - 1; + start = text; + + if (*pos != '.' || match_pos != (gint) cbdata->len) { + /* Something weird has been found */ + if (match_pos != (gint) cbdata->len - 1) { + /* Search more */ + return 0; + } + } + + /* Now we need to find top level domain */ + pos = start; + + while (p >= start && ndots > 0) { + if (*p == '.') { + ndots--; + pos = p + 1; + } + else { + pos = p; + } + + p--; + } + + if (ndots == 0 || p == start - 1) { + if (cbdata->begin + cbdata->len - pos > cbdata->out->len) { + cbdata->out->begin = pos; + cbdata->out->len = cbdata->begin + cbdata->len - pos; + } + } + + return 0; +} + +gboolean +rspamd_url_find_tld(const gchar *in, gsize inlen, rspamd_ftok_t *out) +{ + struct tld_trie_cbdata cbdata; + + g_assert(in != NULL); + g_assert(out != NULL); + g_assert(url_scanner != NULL); + + cbdata.begin = in; + cbdata.len = inlen; + cbdata.out = out; + out->len = 0; + + if (url_scanner->search_trie_full) { + rspamd_multipattern_lookup(url_scanner->search_trie_full, in, inlen, + rspamd_tld_trie_find_callback, &cbdata, NULL); + } + + if (out->len > 0) { + return TRUE; + } + + return FALSE; +} + +static const gchar url_braces[] = { + '(', ')', + '{', '}', + '[', ']', + '<', '>', + '|', '|', + '\'', '\''}; + + +static gboolean +url_file_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t 
*match) +{ + match->m_begin = pos; + + if (pos > cb->begin) { + match->st = *(pos - 1); + } + else { + match->st = '\0'; + } + + return TRUE; +} + +/* + * Computes the length of a match that begins with match->pattern at `pos`. + * A single '/' right after the pattern is skipped; if the next character + * opens one of the url_braces pairs, the paired closing character becomes + * the stop character (otherwise the stop character is the one that directly + * follows the pattern). The match is then extended over url-safe characters + * until the stop character or cb->end is reached, and the result is stored + * in match->m_len relative to match->m_begin. + * Returns FALSE only if the scan position ends up at cb->begin. + */ +static gboolean +url_file_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + const gchar *p; + gchar stop; + guint i; + + p = pos + strlen(match->pattern); + stop = *p; + if (*p == '/') { + p++; + } + + /* + * url_braces is a flat array of (open, close) pairs, so the index advances + * by two and must be bounded by the element count, not by the pair count: + * otherwise only the first half of the pairs is ever inspected. + */ + for (i = 0; i + 1 < G_N_ELEMENTS(url_braces); i += 2) { + if (*p == url_braces[i]) { + stop = url_braces[i + 1]; + break; + } + } + + while (p < cb->end && *p != stop && is_urlsafe(*p)) { + p++; + } + + if (p == cb->begin) { + return FALSE; + } + match->m_len = p - match->m_begin; + + return TRUE; +} + +static gboolean +url_tld_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + const gchar *p = pos; + guint processed = 0; + static const guint max_shift = 253 + sizeof("https://"); + + /* Try to find the start of the url by finding any non-urlsafe character or whitespace/punctuation */ + while (p >= cb->begin) { + if (!is_domain(*p) || g_ascii_isspace(*p) || is_url_start(*p) || + p == match->prev_newline_pos) { + if (!is_url_start(*p) && !g_ascii_isspace(*p) && + p != match->prev_newline_pos) { + return FALSE; + } + + if (p != match->prev_newline_pos) { + match->st = *p; + + p++; + } + else { + match->st = '\n'; + } + + if (!g_ascii_isalnum(*p)) { + /* Urls cannot start with strange symbols */ + return FALSE; + } + + match->m_begin = p; + return TRUE; + } + else if (p == cb->begin && p != pos) { + match->st = '\0'; + match->m_begin = p; + + return TRUE; + } + else if (*p == '.') { + if (p == cb->begin) { + /* Urls cannot start with a dot */ + return FALSE; + } + if (!g_ascii_isalnum(p[1])) { + /* Wrong we have an invalid character after dot */ + return FALSE; + } + } + else if (*p == '/') { + /* Urls cannot contain '/' in their body */ + return FALSE; + } + + p--; + processed++; + + if (processed > max_shift) { + /* Too long */ + return FALSE; + } + } + + return
FALSE; +} + +static gboolean +url_tld_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + const gchar *p; + gboolean ret = FALSE; + + p = pos + match->m_len; + + if (p == cb->end) { + match->m_len = p - match->m_begin; + return TRUE; + } + else if (*p == '/' || *p == ':' || is_url_end(*p) || is_lwsp(*p) || + (match->st != '<' && p == match->newline_pos)) { + /* Parse arguments, ports by normal way by url default function */ + p = match->m_begin; + /* Check common prefix */ + if (g_ascii_strncasecmp(p, "http://", sizeof("http://") - 1) == 0) { + ret = url_web_end(cb, + match->m_begin + sizeof("http://") - 1, + match); + } + else { + ret = url_web_end(cb, match->m_begin, match); + } + } + else if (*p == '.') { + p++; + if (p < cb->end) { + if (g_ascii_isspace(*p) || *p == '/' || + *p == '?' || *p == ':') { + ret = url_web_end(cb, match->m_begin, match); + } + } + } + + if (ret) { + /* Check sanity of match found */ + if (match->m_begin + match->m_len <= pos) { + return FALSE; + } + } + + return ret; +} + +static gboolean +url_web_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + /* Check what we have found */ + if (pos > cb->begin) { + if (g_ascii_strncasecmp(pos, "www", 3) == 0) { + + if (!(is_url_start(*(pos - 1)) || + g_ascii_isspace(*(pos - 1)) || + pos - 1 == match->prev_newline_pos || + (*(pos - 1) & 0x80))) { /* Chinese trick */ + return FALSE; + } + } + else { + guchar prev = *(pos - 1); + + if (g_ascii_isalnum(prev)) { + /* Part of another url */ + return FALSE; + } + } + } + + if (*pos == '.') { + /* Urls cannot start with . 
*/ + return FALSE; + } + + if (pos > cb->begin) { + match->st = *(pos - 1); + } + else { + match->st = '\0'; + } + + match->m_begin = pos; + + return TRUE; +} + +static gboolean +url_web_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + const gchar *last = NULL; + gint len = cb->end - pos; + guint flags = 0; + + if (match->newline_pos && match->st != '<') { + /* We should also limit our match end to the newline */ + len = MIN(len, match->newline_pos - pos); + } + + if (rspamd_web_parse(NULL, pos, len, &last, + RSPAMD_URL_PARSE_CHECK, &flags) != 0) { + return FALSE; + } + + if (last < cb->end && (*last == '>' && last != match->newline_pos)) { + /* We need to ensure that url also starts with '>' */ + if (match->st != '<') { + if (last + 1 < cb->end) { + if (g_ascii_isspace(last[1])) { + return FALSE; + } + } + else { + return FALSE; + } + } + } + + match->m_len = (last - pos); + cb->fin = last + 1; + + return TRUE; +} + + +static gboolean +url_email_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + if (!match->prefix || match->prefix[0] == '\0') { + /* We have mailto:// at the beginning */ + match->m_begin = pos; + + if (pos >= cb->begin + 1) { + match->st = *(pos - 1); + } + else { + match->st = '\0'; + } + } + else { + /* Just '@' */ + + /* Check if this match is a part of the previous mailto: email */ + if (cb->last_at != NULL && cb->last_at == pos) { + cb->last_at = NULL; + return FALSE; + } + else if (pos == cb->begin) { + /* Just @ at the start of input */ + return FALSE; + } + + match->st = '\0'; + } + + return TRUE; +} + +static gboolean +url_email_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + const gchar *last = NULL; + struct http_parser_url u; + gint len = cb->end - pos; + guint flags = 0; + + if (match->newline_pos && match->st != '<') { + /* We should also limit our match end to the newline */ + len = MIN(len, match->newline_pos - pos); + } + + if 
(!match->prefix || match->prefix[0] == '\0') { + /* We have mailto:// at the beginning */ + if (rspamd_mailto_parse(&u, pos, len, &last, + RSPAMD_URL_PARSE_CHECK, &flags) != 0) { + return FALSE; + } + + if (!(u.field_set & (1 << UF_USERINFO))) { + return FALSE; + } + + cb->last_at = match->m_begin + u.field_data[UF_USERINFO].off + + u.field_data[UF_USERINFO].len; + + g_assert(*cb->last_at == '@'); + match->m_len = (last - pos); + + return TRUE; + } + else { + const gchar *c, *p; + /* + * Here we have just '@', so we need to find both start and end of the + * pattern + */ + g_assert(*pos == '@'); + + if (pos >= cb->end - 2 || pos < cb->begin + 1) { + /* Boundary violation */ + return FALSE; + } + + /* Check the next character after `@` */ + if (!g_ascii_isalnum(pos[1]) || !g_ascii_isalnum(*(pos - 1))) { + return FALSE; + } + + + c = pos - 1; + while (c > cb->begin) { + if (!is_mailsafe(*c)) { + break; + } + if (c == match->prev_newline_pos) { + break; + } + + c--; + } + /* Rewind to the first alphanumeric character */ + while (c < pos && !g_ascii_isalnum(*c)) { + c++; + } + + /* Find the end of email */ + p = pos + 1; + while (p < cb->end && is_domain(*p)) { + if (p == match->newline_pos) { + break; + } + + p++; + } + + /* Rewind it again to avoid bad emails to be detected */ + while (p > pos && p < cb->end && !g_ascii_isalnum(*p)) { + p--; + } + + if (p < cb->end && g_ascii_isalnum(*p) && + (match->newline_pos == NULL || p < match->newline_pos)) { + p++; + } + + if (p > c) { + match->m_begin = c; + match->m_len = p - c; + return TRUE; + } + } + + return FALSE; +} + +static gboolean +url_tel_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + match->m_begin = pos; + + if (pos >= cb->begin + 1) { + match->st = *(pos - 1); + } + else { + match->st = '\0'; + } + + return TRUE; +} + +static gboolean +url_tel_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + const gchar *last = NULL; + struct http_parser_url 
u; + gint len = cb->end - pos; + guint flags = 0; + + if (match->newline_pos && match->st != '<') { + /* We should also limit our match end to the newline */ + len = MIN(len, match->newline_pos - pos); + } + + if (rspamd_telephone_parse(&u, pos, len, &last, + RSPAMD_URL_PARSE_CHECK, &flags) != 0) { + return FALSE; + } + + if (!(u.field_set & (1 << UF_HOST))) { + return FALSE; + } + + match->m_len = (last - pos); + + return TRUE; +} + + +static gboolean +rspamd_url_trie_is_match(struct url_matcher *matcher, const gchar *pos, + const gchar *end, const gchar *newline_pos) +{ + if (matcher->flags & URL_MATCHER_FLAG_TLD_MATCH) { + /* Immediately check pos for valid chars */ + if (pos < end) { + if (pos != newline_pos && !g_ascii_isspace(*pos) && *pos != '/' && *pos != '?' && + *pos != ':' && !is_url_end(*pos)) { + if (*pos == '.') { + /* We allow . at the end of the domain however */ + pos++; + if (pos < end) { + if (!g_ascii_isspace(*pos) && *pos != '/' && + *pos != '?' && *pos != ':' && !is_url_end(*pos)) { + return FALSE; + } + } + } + else { + return FALSE; + } + } + } + } + + return TRUE; +} + +static gint +rspamd_url_trie_callback(struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) +{ + struct url_matcher *matcher; + url_match_t m; + const gchar *pos, *newline_pos = NULL; + struct url_callback_data *cb = context; + + pos = text + match_pos; + + if (cb->fin > pos) { + /* Already seen */ + return 0; + } + + matcher = &g_array_index(cb->matchers, struct url_matcher, + strnum); + + if ((matcher->flags & URL_MATCHER_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) { + /* Do not try to match non-html like urls in html texts */ + return 0; + } + + memset(&m, 0, sizeof(m)); + m.m_begin = text + match_start; + m.m_len = match_pos - match_start; + + if (cb->newlines && cb->newlines->len > 0) { + newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx); + + while (pos > newline_pos 
&& cb->newline_idx < cb->newlines->len) { + cb->newline_idx++; + newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx); + } + + if (pos > newline_pos) { + newline_pos = NULL; + } + + if (cb->newline_idx > 0) { + m.prev_newline_pos = g_ptr_array_index(cb->newlines, + cb->newline_idx - 1); + } + } + + if (!rspamd_url_trie_is_match(matcher, pos, cb->end, newline_pos)) { + return 0; + } + + m.pattern = matcher->pattern; + m.prefix = matcher->prefix; + m.add_prefix = FALSE; + m.newline_pos = newline_pos; + pos = cb->begin + match_start; + + if (matcher->start(cb, pos, &m) && + matcher->end(cb, pos, &m)) { + if (m.add_prefix || matcher->prefix[0] != '\0') { + cb->len = m.m_len + strlen(matcher->prefix); + cb->url_str = rspamd_mempool_alloc(cb->pool, cb->len + 1); + cb->len = rspamd_snprintf(cb->url_str, + cb->len + 1, + "%s%*s", + m.prefix, + (gint) m.m_len, + m.m_begin); + cb->prefix_added = TRUE; + } + else { + cb->url_str = rspamd_mempool_alloc(cb->pool, m.m_len + 1); + rspamd_strlcpy(cb->url_str, m.m_begin, m.m_len + 1); + } + + cb->start = m.m_begin; + + if (pos > cb->fin) { + cb->fin = pos; + } + + return 1; + } + else { + cb->url_str = NULL; + } + + /* Continue search */ + return 0; +} + +gboolean +rspamd_url_find(rspamd_mempool_t *pool, + const gchar *begin, gsize len, + gchar **url_str, + enum rspamd_url_find_type how, + goffset *url_pos, + gboolean *prefix_added) +{ + struct url_callback_data cb; + gint ret; + + memset(&cb, 0, sizeof(cb)); + cb.begin = begin; + cb.end = begin + len; + cb.how = how; + cb.pool = pool; + + if (how == RSPAMD_URL_FIND_ALL) { + if (url_scanner->search_trie_full) { + cb.matchers = url_scanner->matchers_full; + ret = rspamd_multipattern_lookup(url_scanner->search_trie_full, + begin, len, + rspamd_url_trie_callback, &cb, NULL); + } + else { + cb.matchers = url_scanner->matchers_strict; + ret = rspamd_multipattern_lookup(url_scanner->search_trie_strict, + begin, len, + rspamd_url_trie_callback, &cb, NULL); + } + } + else { + 
cb.matchers = url_scanner->matchers_strict; + ret = rspamd_multipattern_lookup(url_scanner->search_trie_strict, + begin, len, + rspamd_url_trie_callback, &cb, NULL); + } + + if (ret) { + if (url_str) { + *url_str = cb.url_str; + } + + if (url_pos) { + *url_pos = cb.start - begin; + } + + if (prefix_added) { + *prefix_added = cb.prefix_added; + } + + return TRUE; + } + + return FALSE; +} + +static gint +rspamd_url_trie_generic_callback_common(struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context, + gboolean multiple) +{ + struct rspamd_url *url; + struct url_matcher *matcher; + url_match_t m; + const gchar *pos, *newline_pos = NULL; + struct url_callback_data *cb = context; + gint rc; + rspamd_mempool_t *pool; + + pos = text + match_pos; + + if (cb->fin > pos) { + /* Already seen */ + return 0; + } + + matcher = &g_array_index(cb->matchers, struct url_matcher, + strnum); + pool = cb->pool; + + if ((matcher->flags & URL_MATCHER_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) { + /* Do not try to match non-html like urls in html texts, continue matching */ + return 0; + } + + memset(&m, 0, sizeof(m)); + + + /* Find the next newline after our pos */ + if (cb->newlines && cb->newlines->len > 0) { + newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx); + + while (pos > newline_pos && cb->newline_idx < cb->newlines->len - 1) { + cb->newline_idx++; + newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx); + } + + if (pos > newline_pos) { + newline_pos = NULL; + } + if (cb->newline_idx > 0) { + m.prev_newline_pos = g_ptr_array_index(cb->newlines, + cb->newline_idx - 1); + } + } + + if (!rspamd_url_trie_is_match(matcher, pos, text + len, newline_pos)) { + /* Mismatch, continue */ + return 0; + } + + pos = cb->begin + match_start; + m.pattern = matcher->pattern; + m.prefix = matcher->prefix; + m.add_prefix = FALSE; + m.m_begin = text + match_start; + m.m_len = match_pos - 
match_start; + m.newline_pos = newline_pos; + + if (matcher->start(cb, pos, &m) && + matcher->end(cb, pos, &m)) { + if (m.add_prefix || matcher->prefix[0] != '\0') { + cb->len = m.m_len + strlen(matcher->prefix); + cb->url_str = rspamd_mempool_alloc(cb->pool, cb->len + 1); + cb->len = rspamd_snprintf(cb->url_str, + cb->len + 1, + "%s%*s", + m.prefix, + (gint) m.m_len, + m.m_begin); + cb->prefix_added = TRUE; + } + else { + cb->url_str = rspamd_mempool_alloc(cb->pool, m.m_len + 1); + cb->len = rspamd_strlcpy(cb->url_str, m.m_begin, m.m_len + 1); + } + + cb->start = m.m_begin; + + if (pos > cb->fin) { + cb->fin = pos; + } + + url = rspamd_mempool_alloc0(pool, sizeof(struct rspamd_url)); + g_strstrip(cb->url_str); + rc = rspamd_url_parse(url, cb->url_str, + strlen(cb->url_str), pool, + RSPAMD_URL_PARSE_TEXT); + + if (rc == URI_ERRNO_OK && url->hostlen > 0) { + if (cb->prefix_added) { + url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; + cb->prefix_added = FALSE; + } + + if (cb->func) { + if (!cb->func(url, cb->start - text, (m.m_begin + m.m_len) - text, + cb->funcd)) { + /* We need to stop here in any case! 
*/ + return -1; + } + } + } + else if (rc != URI_ERRNO_OK) { + msg_debug_pool_check("extract of url '%s' failed: %s", + cb->url_str, + rspamd_url_strerror(rc)); + } + } + else { + cb->url_str = NULL; + /* Continue search if no pattern has been found */ + return 0; + } + + /* Continue search if required (return 0 means continue) */ + return !multiple; +} + +static gint +rspamd_url_trie_generic_callback_multiple(struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) +{ + return rspamd_url_trie_generic_callback_common(mp, strnum, match_start, + match_pos, text, len, context, TRUE); +} + +static gint +rspamd_url_trie_generic_callback_single(struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) +{ + return rspamd_url_trie_generic_callback_common(mp, strnum, match_start, + match_pos, text, len, context, FALSE); +} + +struct rspamd_url_mimepart_cbdata { + struct rspamd_task *task; + struct rspamd_mime_text_part *part; + gsize url_len; + uint16_t *cur_url_order; /* Global ordering */ + uint16_t cur_part_order; /* Per part ordering */ +}; + +static gboolean +rspamd_url_query_callback(struct rspamd_url *url, gsize start_offset, + gsize end_offset, gpointer ud) +{ + struct rspamd_url_mimepart_cbdata *cbd = + (struct rspamd_url_mimepart_cbdata *) ud; + struct rspamd_task *task; + + task = cbd->task; + + if (url->protocol == PROTOCOL_MAILTO) { + if (url->userlen == 0) { + return FALSE; + } + } + /* Also check max urls */ + if (cbd->task->cfg && cbd->task->cfg->max_urls > 0) { + if (kh_size(MESSAGE_FIELD(task, urls)) > cbd->task->cfg->max_urls) { + msg_err_task("part has too many URLs, we cannot process more: " + "%d urls extracted ", + (guint) kh_size(MESSAGE_FIELD(task, urls))); + + return FALSE; + } + } + + url->flags |= RSPAMD_URL_FLAG_QUERY; + + + if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), 
url, false)) { + if (cbd->part && cbd->part->mime_part->urls) { + g_ptr_array_add(cbd->part->mime_part->urls, url); + } + + url->part_order = cbd->cur_part_order++; + + if (cbd->cur_url_order) { + url->order = (*cbd->cur_url_order)++; + } + } + + return TRUE; +} + +static gboolean +rspamd_url_text_part_callback(struct rspamd_url *url, gsize start_offset, + gsize end_offset, gpointer ud) +{ + struct rspamd_url_mimepart_cbdata *cbd = + (struct rspamd_url_mimepart_cbdata *) ud; + struct rspamd_process_exception *ex; + struct rspamd_task *task; + + task = cbd->task; + ex = rspamd_mempool_alloc0(task->task_pool, sizeof(struct rspamd_process_exception)); + + ex->pos = start_offset; + ex->len = end_offset - start_offset; + ex->type = RSPAMD_EXCEPTION_URL; + ex->ptr = url; + + cbd->url_len += ex->len; + + if (cbd->part->utf_stripped_content && + cbd->url_len > cbd->part->utf_stripped_content->len * 10) { + /* Absurd case, stop here now */ + msg_err_task("part has too many URLs, we cannot process more: %z url len; " + "%d stripped content length", + cbd->url_len, cbd->part->utf_stripped_content->len); + + return FALSE; + } + + if (url->protocol == PROTOCOL_MAILTO) { + if (url->userlen == 0) { + return FALSE; + } + } + /* Also check max urls */ + if (cbd->task->cfg && cbd->task->cfg->max_urls > 0) { + if (kh_size(MESSAGE_FIELD(task, urls)) > cbd->task->cfg->max_urls) { + msg_err_task("part has too many URLs, we cannot process more: " + "%d urls extracted ", + (guint) kh_size(MESSAGE_FIELD(task, urls))); + + return FALSE; + } + } + + url->flags |= RSPAMD_URL_FLAG_FROM_TEXT; + + if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false) && + cbd->part->mime_part->urls) { + url->part_order = cbd->cur_part_order++; + + if (cbd->cur_url_order) { + url->order = (*cbd->cur_url_order)++; + } + g_ptr_array_add(cbd->part->mime_part->urls, url); + } + + cbd->part->exceptions = g_list_prepend( + cbd->part->exceptions, + ex); + + /* We also search the query for additional 
url inside */ + if (url->querylen > 0) { + rspamd_url_find_multiple(task->task_pool, + rspamd_url_query_unsafe(url), url->querylen, + RSPAMD_URL_FIND_ALL, NULL, + rspamd_url_query_callback, cbd); + } + + return TRUE; +} + +void rspamd_url_text_extract(rspamd_mempool_t *pool, + struct rspamd_task *task, + struct rspamd_mime_text_part *part, + uint16_t *cur_url_order, + enum rspamd_url_find_type how) +{ + struct rspamd_url_mimepart_cbdata mcbd; + + if (part->utf_stripped_content == NULL || part->utf_stripped_content->len == 0) { + msg_warn_task("got empty text part"); + return; + } + + mcbd.task = task; + mcbd.part = part; + mcbd.url_len = 0; + mcbd.cur_url_order = cur_url_order; + mcbd.cur_part_order = 0; + + rspamd_url_find_multiple(task->task_pool, part->utf_stripped_content->data, + part->utf_stripped_content->len, how, part->newlines, + rspamd_url_text_part_callback, &mcbd); +} + +void rspamd_url_find_multiple(rspamd_mempool_t *pool, + const gchar *in, + gsize inlen, + enum rspamd_url_find_type how, + GPtrArray *nlines, + url_insert_function func, + gpointer ud) +{ + struct url_callback_data cb; + + g_assert(in != NULL); + + if (inlen == 0) { + inlen = strlen(in); + } + + memset(&cb, 0, sizeof(cb)); + cb.begin = in; + cb.end = in + inlen; + cb.how = how; + cb.pool = pool; + + cb.funcd = ud; + cb.func = func; + cb.newlines = nlines; + + if (how == RSPAMD_URL_FIND_ALL) { + if (url_scanner->search_trie_full) { + cb.matchers = url_scanner->matchers_full; + rspamd_multipattern_lookup(url_scanner->search_trie_full, + in, inlen, + rspamd_url_trie_generic_callback_multiple, &cb, NULL); + } + else { + cb.matchers = url_scanner->matchers_strict; + rspamd_multipattern_lookup(url_scanner->search_trie_strict, + in, inlen, + rspamd_url_trie_generic_callback_multiple, &cb, NULL); + } + } + else { + cb.matchers = url_scanner->matchers_strict; + rspamd_multipattern_lookup(url_scanner->search_trie_strict, + in, inlen, + rspamd_url_trie_generic_callback_multiple, &cb, NULL); + } 
+} + +void rspamd_url_find_single(rspamd_mempool_t *pool, + const gchar *in, + gsize inlen, + enum rspamd_url_find_type how, + url_insert_function func, + gpointer ud) +{ + struct url_callback_data cb; + + g_assert(in != NULL); + + if (inlen == 0) { + inlen = strlen(in); + } + + /* + * We might have a situation when we need to parse URLs on config file + * parsing, but there is no valid url_scanner loaded. Hence, we just load + * some defaults and it should be fine... + */ + if (url_scanner == NULL) { + rspamd_url_init(NULL); + } + + memset(&cb, 0, sizeof(cb)); + cb.begin = in; + cb.end = in + inlen; + cb.how = how; + cb.pool = pool; + + cb.funcd = ud; + cb.func = func; + + if (how == RSPAMD_URL_FIND_ALL) { + if (url_scanner->search_trie_full) { + cb.matchers = url_scanner->matchers_full; + rspamd_multipattern_lookup(url_scanner->search_trie_full, + in, inlen, + rspamd_url_trie_generic_callback_single, &cb, NULL); + } + else { + cb.matchers = url_scanner->matchers_strict; + rspamd_multipattern_lookup(url_scanner->search_trie_strict, + in, inlen, + rspamd_url_trie_generic_callback_single, &cb, NULL); + } + } + else { + cb.matchers = url_scanner->matchers_strict; + rspamd_multipattern_lookup(url_scanner->search_trie_strict, + in, inlen, + rspamd_url_trie_generic_callback_single, &cb, NULL); + } +} + + +gboolean +rspamd_url_task_subject_callback(struct rspamd_url *url, gsize start_offset, + gsize end_offset, gpointer ud) +{ + struct rspamd_task *task = ud; + gchar *url_str = NULL; + struct rspamd_url *query_url; + gint rc; + gboolean prefix_added; + + /* It is just a displayed URL, we should not check it for certain things */ + url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED | RSPAMD_URL_FLAG_SUBJECT; + + if (url->protocol == PROTOCOL_MAILTO) { + if (url->userlen == 0) { + return FALSE; + } + } + + rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false); + + /* We also search the query for additional url inside */ + if (url->querylen > 0) { + if 
(rspamd_url_find(task->task_pool, rspamd_url_query_unsafe(url), url->querylen, + &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) { + + query_url = rspamd_mempool_alloc0(task->task_pool, + sizeof(struct rspamd_url)); + rc = rspamd_url_parse(query_url, + url_str, + strlen(url_str), + task->task_pool, + RSPAMD_URL_PARSE_TEXT); + + if (rc == URI_ERRNO_OK && + url->hostlen > 0) { + msg_debug_task("found url %s in query of url" + " %*s", + url_str, url->querylen, rspamd_url_query_unsafe(url)); + + if (prefix_added) { + query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; + } + + if (query_url->protocol == PROTOCOL_MAILTO) { + if (query_url->userlen == 0) { + return TRUE; + } + } + + rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), + query_url, false); + } + } + } + + return TRUE; +} + +static inline khint_t +rspamd_url_hash(struct rspamd_url *url) +{ + if (url->urllen > 0) { + return (khint_t) rspamd_cryptobox_fast_hash(url->string, url->urllen, + rspamd_hash_seed()); + } + + return 0; +} + +static inline khint_t +rspamd_url_host_hash(struct rspamd_url *url) +{ + if (url->hostlen > 0) { + return (khint_t) rspamd_cryptobox_fast_hash(rspamd_url_host_unsafe(url), + url->hostlen, + rspamd_hash_seed()); + } + + return 0; +} + +/* Compare two emails for building emails tree */ +static inline bool +rspamd_emails_cmp(struct rspamd_url *u1, struct rspamd_url *u2) +{ + gint r; + + if (u1->hostlen != u2->hostlen || u1->hostlen == 0) { + return FALSE; + } + else { + if ((r = rspamd_lc_cmp(rspamd_url_host_unsafe(u1), + rspamd_url_host_unsafe(u2), u1->hostlen)) == 0) { + if (u1->userlen != u2->userlen || u1->userlen == 0) { + return FALSE; + } + else { + return (rspamd_lc_cmp(rspamd_url_user_unsafe(u1), + rspamd_url_user_unsafe(u2), + u1->userlen) == 0); + } + } + else { + return r == 0; + } + } + + return FALSE; +} + +static inline bool +rspamd_urls_cmp(struct rspamd_url *u1, struct rspamd_url *u2) +{ + int r = 0; + + if (u1->protocol != u2->protocol || u1->urllen != 
u2->urllen) { + return false; + } + else { + if (u1->protocol & PROTOCOL_MAILTO) { + return rspamd_emails_cmp(u1, u2); + } + + r = memcmp(u1->string, u2->string, u1->urllen); + } + + return r == 0; +} + +static inline bool +rspamd_urls_host_cmp(struct rspamd_url *u1, struct rspamd_url *u2) +{ + int r = 0; + + if (u1->hostlen != u2->hostlen) { + return false; + } + else { + r = memcmp(rspamd_url_host_unsafe(u1), rspamd_url_host_unsafe(u2), + u1->hostlen); + } + + return r == 0; +} + +gsize rspamd_url_decode(gchar *dst, const gchar *src, gsize size) +{ + gchar *d, ch, c, decoded; + const gchar *s; + enum { + sw_usual = 0, + sw_quoted, + sw_quoted_second + } state; + + d = dst; + s = src; + + state = 0; + decoded = 0; + + while (size--) { + + ch = *s++; + + switch (state) { + case sw_usual: + + if (ch == '%') { + state = sw_quoted; + break; + } + else if (ch == '+') { + *d++ = ' '; + } + else { + *d++ = ch; + } + break; + + case sw_quoted: + + if (ch >= '0' && ch <= '9') { + decoded = (ch - '0'); + state = sw_quoted_second; + break; + } + + c = (ch | 0x20); + if (c >= 'a' && c <= 'f') { + decoded = (c - 'a' + 10); + state = sw_quoted_second; + break; + } + + /* the invalid quoted character */ + + state = sw_usual; + + *d++ = ch; + + break; + + case sw_quoted_second: + + state = sw_usual; + + if (ch >= '0' && ch <= '9') { + ch = ((decoded << 4) + ch - '0'); + *d++ = ch; + + break; + } + + c = (u_char) (ch | 0x20); + if (c >= 'a' && c <= 'f') { + ch = ((decoded << 4) + c - 'a' + 10); + + *d++ = ch; + break; + } + + /* the invalid quoted character */ + break; + } + } + + return (d - dst); +} + +enum rspamd_url_char_class { + RSPAMD_URL_UNRESERVED = (1 << 0), + RSPAMD_URL_SUBDELIM = (1 << 1), + RSPAMD_URL_PATHSAFE = (1 << 2), + RSPAMD_URL_QUERYSAFE = (1 << 3), + RSPAMD_URL_FRAGMENTSAFE = (1 << 4), + RSPAMD_URL_HOSTSAFE = (1 << 5), + RSPAMD_URL_USERSAFE = (1 << 6), +}; + +#define RSPAMD_URL_FLAGS_HOSTSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_HOSTSAFE | RSPAMD_URL_SUBDELIM) 
+#define RSPAMD_URL_FLAGS_USERSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_USERSAFE | RSPAMD_URL_SUBDELIM) +#define RSPAMD_URL_FLAGS_PATHSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_PATHSAFE | RSPAMD_URL_SUBDELIM) +#define RSPAMD_URL_FLAGS_QUERYSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_SUBDELIM) +#define RSPAMD_URL_FLAGS_FRAGMENTSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_FRAGMENTSAFE | RSPAMD_URL_SUBDELIM) + +static const unsigned char rspamd_url_encoding_classes[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0 /* */, RSPAMD_URL_SUBDELIM /* ! */, 0 /* " */, 0 /* # */, + RSPAMD_URL_SUBDELIM /* $ */, 0 /* % */, RSPAMD_URL_SUBDELIM /* & */, + RSPAMD_URL_SUBDELIM /* ' */, RSPAMD_URL_SUBDELIM /* ( */, + RSPAMD_URL_SUBDELIM /* ) */, RSPAMD_URL_SUBDELIM /* * */, + RSPAMD_URL_SUBDELIM /* + */, RSPAMD_URL_SUBDELIM /* , */, + RSPAMD_URL_UNRESERVED /* - */, RSPAMD_URL_UNRESERVED /* . */, + RSPAMD_URL_PATHSAFE | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* / */, + RSPAMD_URL_UNRESERVED /* 0 */, RSPAMD_URL_UNRESERVED /* 1 */, + RSPAMD_URL_UNRESERVED /* 2 */, RSPAMD_URL_UNRESERVED /* 3 */, + RSPAMD_URL_UNRESERVED /* 4 */, RSPAMD_URL_UNRESERVED /* 5 */, + RSPAMD_URL_UNRESERVED /* 6 */, RSPAMD_URL_UNRESERVED /* 7 */, + RSPAMD_URL_UNRESERVED /* 8 */, RSPAMD_URL_UNRESERVED /* 9 */, + RSPAMD_URL_USERSAFE | RSPAMD_URL_HOSTSAFE | RSPAMD_URL_PATHSAFE | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* : */, + RSPAMD_URL_SUBDELIM /* ; */, 0 /* < */, RSPAMD_URL_SUBDELIM /* = */, 0 /* > */, + RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* ? 
*/, + RSPAMD_URL_PATHSAFE | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* @ */, + RSPAMD_URL_UNRESERVED /* A */, RSPAMD_URL_UNRESERVED /* B */, + RSPAMD_URL_UNRESERVED /* C */, RSPAMD_URL_UNRESERVED /* D */, + RSPAMD_URL_UNRESERVED /* E */, RSPAMD_URL_UNRESERVED /* F */, + RSPAMD_URL_UNRESERVED /* G */, RSPAMD_URL_UNRESERVED /* H */, + RSPAMD_URL_UNRESERVED /* I */, RSPAMD_URL_UNRESERVED /* J */, + RSPAMD_URL_UNRESERVED /* K */, RSPAMD_URL_UNRESERVED /* L */, + RSPAMD_URL_UNRESERVED /* M */, RSPAMD_URL_UNRESERVED /* N */, + RSPAMD_URL_UNRESERVED /* O */, RSPAMD_URL_UNRESERVED /* P */, + RSPAMD_URL_UNRESERVED /* Q */, RSPAMD_URL_UNRESERVED /* R */, + RSPAMD_URL_UNRESERVED /* S */, RSPAMD_URL_UNRESERVED /* T */, + RSPAMD_URL_UNRESERVED /* U */, RSPAMD_URL_UNRESERVED /* V */, + RSPAMD_URL_UNRESERVED /* W */, RSPAMD_URL_UNRESERVED /* X */, + RSPAMD_URL_UNRESERVED /* Y */, RSPAMD_URL_UNRESERVED /* Z */, + RSPAMD_URL_HOSTSAFE /* [ */, 0 /* \ */, RSPAMD_URL_HOSTSAFE /* ] */, 0 /* ^ */, + RSPAMD_URL_UNRESERVED /* _ */, 0 /* ` */, RSPAMD_URL_UNRESERVED /* a */, + RSPAMD_URL_UNRESERVED /* b */, RSPAMD_URL_UNRESERVED /* c */, + RSPAMD_URL_UNRESERVED /* d */, RSPAMD_URL_UNRESERVED /* e */, + RSPAMD_URL_UNRESERVED /* f */, RSPAMD_URL_UNRESERVED /* g */, + RSPAMD_URL_UNRESERVED /* h */, RSPAMD_URL_UNRESERVED /* i */, + RSPAMD_URL_UNRESERVED /* j */, RSPAMD_URL_UNRESERVED /* k */, + RSPAMD_URL_UNRESERVED /* l */, RSPAMD_URL_UNRESERVED /* m */, + RSPAMD_URL_UNRESERVED /* n */, RSPAMD_URL_UNRESERVED /* o */, + RSPAMD_URL_UNRESERVED /* p */, RSPAMD_URL_UNRESERVED /* q */, + RSPAMD_URL_UNRESERVED /* r */, RSPAMD_URL_UNRESERVED /* s */, + RSPAMD_URL_UNRESERVED /* t */, RSPAMD_URL_UNRESERVED /* u */, + RSPAMD_URL_UNRESERVED /* v */, RSPAMD_URL_UNRESERVED /* w */, + RSPAMD_URL_UNRESERVED /* x */, RSPAMD_URL_UNRESERVED /* y */, + RSPAMD_URL_UNRESERVED /* z */, 0 /* { */, 0 /* | */, 0 /* } */, + RSPAMD_URL_UNRESERVED /* ~ */, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0}; + +#define CHECK_URL_COMPONENT(beg, len, flags) \ + do { \ + for (i = 0; i < (len); i++) { \ + if ((rspamd_url_encoding_classes[(guchar) (beg)[i]] & (flags)) == 0) { \ + dlen += 2; \ + } \ + } \ + } while (0) + +#define ENCODE_URL_COMPONENT(beg, len, flags) \ + do { \ + for (i = 0; i < (len) && dend > d; i++) { \ + if ((rspamd_url_encoding_classes[(guchar) (beg)[i]] & (flags)) == 0) { \ + *d++ = '%'; \ + *d++ = hexdigests[(guchar) ((beg)[i] >> 4) & 0xf]; \ + *d++ = hexdigests[(guchar) (beg)[i] & 0xf]; \ + } \ + else { \ + *d++ = (beg)[i]; \ + } \ + } \ + } while (0) + +const gchar * +rspamd_url_encode(struct rspamd_url *url, gsize *pdlen, + rspamd_mempool_t *pool) +{ + guchar *dest, *d, *dend; + static const gchar hexdigests[16] = "0123456789ABCDEF"; + guint i; + gsize dlen = 0; + + g_assert(pdlen != NULL && url != NULL && pool != NULL); + + CHECK_URL_COMPONENT(rspamd_url_host_unsafe(url), url->hostlen, + RSPAMD_URL_FLAGS_HOSTSAFE); + CHECK_URL_COMPONENT(rspamd_url_user_unsafe(url), url->userlen, + RSPAMD_URL_FLAGS_USERSAFE); + CHECK_URL_COMPONENT(rspamd_url_data_unsafe(url), url->datalen, + RSPAMD_URL_FLAGS_PATHSAFE); + CHECK_URL_COMPONENT(rspamd_url_query_unsafe(url), url->querylen, + RSPAMD_URL_FLAGS_QUERYSAFE); + CHECK_URL_COMPONENT(rspamd_url_fragment_unsafe(url), url->fragmentlen, + RSPAMD_URL_FLAGS_FRAGMENTSAFE); + + if (dlen == 0) { + *pdlen = url->urllen; + + return url->string; + } + + /* Need to encode */ + dlen += url->urllen + sizeof("telephone://"); /* Protocol hack */ + dest = rspamd_mempool_alloc(pool, dlen + 1); + d = dest; + dend = d + dlen; + + if (url->protocollen > 0) { + if (!(url->protocol & PROTOCOL_UNKNOWN)) 
{ + const gchar *known_proto = rspamd_url_protocol_name(url->protocol); + d += rspamd_snprintf((gchar *) d, dend - d, + "%s://", + known_proto); + } + else { + d += rspamd_snprintf((gchar *) d, dend - d, + "%*s://", + (gint) url->protocollen, url->string); + } + } + else { + d += rspamd_snprintf((gchar *) d, dend - d, "http://"); + } + + if (url->userlen > 0) { + ENCODE_URL_COMPONENT(rspamd_url_user_unsafe(url), url->userlen, + RSPAMD_URL_FLAGS_USERSAFE); + *d++ = '@'; + } + + ENCODE_URL_COMPONENT(rspamd_url_host_unsafe(url), url->hostlen, + RSPAMD_URL_FLAGS_HOSTSAFE); + + if (url->datalen > 0) { + *d++ = '/'; + ENCODE_URL_COMPONENT(rspamd_url_data_unsafe(url), url->datalen, + RSPAMD_URL_FLAGS_PATHSAFE); + } + + if (url->querylen > 0) { + *d++ = '?'; + ENCODE_URL_COMPONENT(rspamd_url_query_unsafe(url), url->querylen, + RSPAMD_URL_FLAGS_QUERYSAFE); + } + + if (url->fragmentlen > 0) { + *d++ = '#'; + ENCODE_URL_COMPONENT(rspamd_url_fragment_unsafe(url), url->fragmentlen, + RSPAMD_URL_FLAGS_FRAGMENTSAFE); + } + + *pdlen = (d - dest); + + return (const gchar *) dest; +} + +gboolean +rspamd_url_is_domain(int c) +{ + return is_domain((guchar) c); +} + +const gchar * +rspamd_url_protocol_name(enum rspamd_url_protocol proto) +{ + const gchar *ret = "unknown"; + + switch (proto) { + case PROTOCOL_HTTP: + ret = "http"; + break; + case PROTOCOL_HTTPS: + ret = "https"; + break; + case PROTOCOL_FTP: + ret = "ftp"; + break; + case PROTOCOL_FILE: + ret = "file"; + break; + case PROTOCOL_MAILTO: + ret = "mailto"; + break; + case PROTOCOL_TELEPHONE: + ret = "telephone"; + break; + default: + break; + } + + return ret; +} + +enum rspamd_url_protocol +rspamd_url_protocol_from_string(const gchar *str) +{ + enum rspamd_url_protocol ret = PROTOCOL_UNKNOWN; + + if (strcmp(str, "http") == 0) { + ret = PROTOCOL_HTTP; + } + else if (strcmp(str, "https") == 0) { + ret = PROTOCOL_HTTPS; + } + else if (strcmp(str, "mailto") == 0) { + ret = PROTOCOL_MAILTO; + } + else if (strcmp(str, "ftp") == 
0) { + ret = PROTOCOL_FTP; + } + else if (strcmp(str, "file") == 0) { + ret = PROTOCOL_FILE; + } + else if (strcmp(str, "telephone") == 0) { + ret = PROTOCOL_TELEPHONE; + } + + return ret; +} + + +bool rspamd_url_set_add_or_increase(khash_t(rspamd_url_hash) * set, + struct rspamd_url *u, + bool enforce_replace) +{ + khiter_t k; + gint r; + + k = kh_get(rspamd_url_hash, set, u); + + if (k != kh_end(set)) { + /* Existing url */ + struct rspamd_url *ex = kh_key(set, k); +#define SUSPICIOUS_URL_FLAGS (RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED | RSPAMD_URL_FLAG_ZW_SPACES) + if (enforce_replace) { + kh_key(set, k) = u; + u->count++; + } + else { + if (u->flags & SUSPICIOUS_URL_FLAGS) { + if (!(ex->flags & SUSPICIOUS_URL_FLAGS)) { + /* Propagate new url to an old one */ + kh_key(set, k) = u; + u->count++; + } + else { + ex->count++; + } + } + else { + ex->count++; + } + } + + return false; + } + else { + k = kh_put(rspamd_url_hash, set, u, &r); + } + + return true; +} + +struct rspamd_url * +rspamd_url_set_add_or_return(khash_t(rspamd_url_hash) * set, + struct rspamd_url *u) +{ + khiter_t k; + gint r; + + if (set) { + k = kh_get(rspamd_url_hash, set, u); + + if (k != kh_end(set)) { + return kh_key(set, k); + } + else { + k = kh_put(rspamd_url_hash, set, u, &r); + + return kh_key(set, k); + } + } + + return NULL; +} + +bool rspamd_url_host_set_add(khash_t(rspamd_url_host_hash) * set, + struct rspamd_url *u) +{ + gint r; + + if (set) { + kh_put(rspamd_url_host_hash, set, u, &r); + + if (r == 0) { + return false; + } + + return true; + } + + return false; +} + +bool rspamd_url_set_has(khash_t(rspamd_url_hash) * set, struct rspamd_url *u) +{ + khiter_t k; + + if (set) { + k = kh_get(rspamd_url_hash, set, u); + + if (k == kh_end(set)) { + return false; + } + + return true; + } + + return false; +} + +bool rspamd_url_host_set_has(khash_t(rspamd_url_host_hash) * set, struct rspamd_url *u) +{ + khiter_t k; + + if (set) { + k = kh_get(rspamd_url_host_hash, set, u); + + if 
(k == kh_end(set)) { + return false; + } + + return true; + } + + return false; +} + +bool rspamd_url_flag_from_string(const gchar *str, gint *flag) +{ + gint h = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT, + str, strlen(str), 0); + + for (int i = 0; i < G_N_ELEMENTS(url_flag_names); i++) { + if (url_flag_names[i].hash == h) { + *flag |= url_flag_names[i].flag; + + return true; + } + } + + return false; +} + + +const gchar * +rspamd_url_flag_to_string(int flag) +{ + for (int i = 0; i < G_N_ELEMENTS(url_flag_names); i++) { + if (url_flag_names[i].flag & flag) { + return url_flag_names[i].name; + } + } + + return NULL; +} + +inline int +rspamd_url_cmp(const struct rspamd_url *u1, const struct rspamd_url *u2) +{ + int min_len = MIN(u1->urllen, u2->urllen); + int r; + + if (u1->protocol != u2->protocol) { + return u1->protocol - u2->protocol; + } + + if (u1->protocol & PROTOCOL_MAILTO) { + /* Emails specialisation (hosts must be compared in a case insensitive matter */ + min_len = MIN(u1->hostlen, u2->hostlen); + + if ((r = rspamd_lc_cmp(rspamd_url_host_unsafe(u1), + rspamd_url_host_unsafe(u2), min_len)) == 0) { + if (u1->hostlen == u2->hostlen) { + if (u1->userlen != u2->userlen || u1->userlen == 0) { + r = (int) u1->userlen - (int) u2->userlen; + } + else { + r = memcmp(rspamd_url_user_unsafe(u1), + rspamd_url_user_unsafe(u2), + u1->userlen); + } + } + else { + r = u1->hostlen - u2->hostlen; + } + } + } + else { + if (u1->urllen != u2->urllen) { + /* Different length, compare common part and then compare length */ + r = memcmp(u1->string, u2->string, min_len); + + if (r == 0) { + r = u1->urllen - u2->urllen; + } + } + else { + /* Equal length */ + r = memcmp(u1->string, u2->string, u1->urllen); + } + } + + return r; +} + +int rspamd_url_cmp_qsort(const void *_u1, const void *_u2) +{ + const struct rspamd_url *u1 = *(struct rspamd_url **) _u1, + *u2 = *(struct rspamd_url **) _u2; + + return rspamd_url_cmp(u1, u2); +} |