1 files changed, 4365 insertions, 0 deletions
diff --git a/src/libserver/url.c b/src/libserver/url.c
new file mode 100644
index 0000000..0842a1e
--- /dev/null
+++ b/src/libserver/url.c
@@ -0,0 +1,4365 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#include "url.h"
+#include "util.h"
+#include "rspamd.h"
+#include "message.h"
+#include "multipattern.h"
+#include "contrib/uthash/utlist.h"
+#include "contrib/http-parser/http_parser.h"
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+#include <unicode/usprep.h>
+#include <unicode/ucnv.h>
+
+/*
+ * Per-match state handed to the `start` and `end` callbacks of a matcher
+ * (both callbacks in struct url_matcher take a `url_match_t *`).
+ * NOTE(review): the code that fills and reads these fields is outside this
+ * part of the file; the field notes below are inferred from names - confirm.
+ */
+typedef struct url_match_s {
+	const gchar *m_begin; /* presumably start of the matched URL text */
+	gsize m_len;          /* presumably length of the matched URL text */
+	const gchar *pattern; /* presumably the pattern that triggered the match */
+	const gchar *prefix;  /* presumably the schema prefix to prepend, cf. url_matcher.prefix */
+	const gchar *newline_pos;
+	const gchar *prev_newline_pos;
+	gboolean add_prefix;
+	gchar st;
+} url_match_t;
+
+/*
+ * Bit flags stored in url_matcher.flags:
+ *  NOHTML     - set on every pattern loaded from the TLD file
+ *  TLD_MATCH  - set on every pattern loaded from the TLD file
+ *  STAR_MATCH - additionally set for TLD file lines that start with '*'
+ *  REGEXP     - the pattern is registered in the multipattern as a regexp
+ *               (RSPAMD_MULTIPATTERN_RE) instead of a plain string
+ */
+#define URL_MATCHER_FLAG_NOHTML (1u << 0u)
+#define URL_MATCHER_FLAG_TLD_MATCH (1u << 1u)
+#define URL_MATCHER_FLAG_STAR_MATCH (1u << 2u)
+#define URL_MATCHER_FLAG_REGEXP (1u << 3u)
+
+struct url_callback_data;
+
+/*
+ * Table of known URL schemas: protocol id, schema name and the length of
+ * that name in bytes (`len` is strlen(name) for every entry).
+ * The table is terminated by a PROTOCOL_UNKNOWN entry with a NULL name.
+ */
+static const struct {
+	enum rspamd_url_protocol proto;
+	const gchar *name;
+	gsize len;
+} rspamd_url_protocols[] = {
+	{.proto = PROTOCOL_FILE,
+	 .name = "file",
+	 .len = 4},
+	{.proto = PROTOCOL_FTP,
+	 .name = "ftp",
+	 .len = 3},
+	{.proto = PROTOCOL_HTTP,
+	 .name = "http",
+	 .len = 4},
+	{.proto = PROTOCOL_HTTPS,
+	 .name = "https",
+	 .len = 5},
+	{.proto = PROTOCOL_MAILTO,
+	 .name = "mailto",
+	 .len = 6},
+	{.proto = PROTOCOL_TELEPHONE,
+	 .name = "tel",
+	 .len = 3},
+	/* "callto" is 6 bytes long; a shorter length would never match the full name */
+	{.proto = PROTOCOL_TELEPHONE,
+	 .name = "callto",
+	 .len = 6},
+	{.proto = PROTOCOL_UNKNOWN,
+	 .name = NULL,
+	 .len = 0}};
+/*
+ * One URL detector: a search pattern plus the callbacks that validate a hit.
+ * Entries are initialised positionally in static_matchers, so the field order
+ * here must not change.
+ */
+struct url_matcher {
+	/* Pattern registered in the multipattern search trie */
+	const gchar *pattern;
+	/* Schema associated with the pattern, e.g. "http://" for the "www." pattern; "" for none */
+	const gchar *prefix;
+
+	/* Callback invoked with the scan state, a position and the match being built */
+	gboolean (*start)(struct url_callback_data *cb,
+					  const gchar *pos,
+					  url_match_t *match);
+
+	/* Counterpart of `start`; same arguments */
+	gboolean (*end)(struct url_callback_data *cb,
+					const gchar *pos,
+					url_match_t *match);
+
+	/* Bitwise OR of URL_MATCHER_FLAG_* */
+	gint flags;
+};
+
+/*
+ * Forward declarations of the start/end callback pairs. They are referenced
+ * by the static_matchers table below and (url_tld_*) by the TLD file loader,
+ * so they must be declared before those uses.
+ */
+static gboolean url_file_start(struct url_callback_data *cb,
+							   const gchar *pos,
+							   url_match_t *match);
+
+static gboolean url_file_end(struct url_callback_data *cb,
+							 const gchar *pos,
+							 url_match_t *match);
+
+static gboolean url_web_start(struct url_callback_data *cb,
+							  const gchar *pos,
+							  url_match_t *match);
+
+static gboolean url_web_end(struct url_callback_data *cb,
+							const gchar *pos,
+							url_match_t *match);
+
+static gboolean url_tld_start(struct url_callback_data *cb,
+							  const gchar *pos,
+							  url_match_t *match);
+
+static gboolean url_tld_end(struct url_callback_data *cb,
+							const gchar *pos,
+							url_match_t *match);
+
+static gboolean url_email_start(struct url_callback_data *cb,
+								const gchar *pos,
+								url_match_t *match);
+
+static gboolean url_email_end(struct url_callback_data *cb,
+							  const gchar *pos,
+							  url_match_t *match);
+
+static gboolean url_tel_start(struct url_callback_data *cb,
+							  const gchar *pos,
+							  url_match_t *match);
+
+static gboolean url_tel_end(struct url_callback_data *cb,
+							const gchar *pos,
+							url_match_t *match);
+
+/*
+ * Built-in matchers. Each entry is
+ *   { pattern, prefix, start callback, end callback, flags }
+ * in the field order of struct url_matcher.
+ * rspamd_url_add_static_matchers() registers all of them in the strict
+ * search trie and, when it exists, in the full (TLD-aware) trie as well.
+ * Only the "www\\.[0-9a-z]" entry is a regular expression; all other
+ * patterns are registered as plain strings.
+ */
+struct url_matcher static_matchers[] = {
+	/* Common prefixes */
+	{"file://", "", url_file_start, url_file_end,
+	 0},
+	{"file:\\\\", "", url_file_start, url_file_end,
+	 0},
+	{"ftp://", "", url_web_start, url_web_end,
+	 0},
+	{"ftp:\\\\", "", url_web_start, url_web_end,
+	 0},
+	{"sftp://", "", url_web_start, url_web_end,
+	 0},
+	{"http:", "", url_web_start, url_web_end,
+	 0},
+	{"https:", "", url_web_start, url_web_end,
+	 0},
+	{"news://", "", url_web_start, url_web_end,
+	 0},
+	{"nntp://", "", url_web_start, url_web_end,
+	 0},
+	{"telnet://", "", url_web_start, url_web_end,
+	 0},
+	{"tel:", "", url_tel_start, url_tel_end,
+	 0},
+	{"webcal://", "", url_web_start, url_web_end,
+	 0},
+	{"mailto:", "", url_email_start, url_email_end,
+	 0},
+	{"callto:", "", url_tel_start, url_tel_end,
+	 0},
+	{"h323:", "", url_web_start, url_web_end,
+	 0},
+	{"sip:", "", url_web_start, url_web_end,
+	 0},
+	/* Schemaless hosts: the prefix column supplies the schema */
+	{"www\\.[0-9a-z]", "http://", url_web_start, url_web_end,
+	 URL_MATCHER_FLAG_REGEXP},
+	{"ftp.", "ftp://", url_web_start, url_web_end,
+	 0},
+	/* Likely emails */
+	{
+		"@", "mailto://", url_email_start, url_email_end,
+		0}};
+
+/*
+ * Textual names of the RSPAMD_URL_FLAG_* bits.
+ * `hash` is -1 in the static initialiser; rspamd_url_init() overwrites it
+ * with a fast hash of `name` and aborts if two names hash to the same value.
+ */
+struct rspamd_url_flag_name {
+	const gchar *name; /* flag name */
+	gint flag;         /* corresponding RSPAMD_URL_FLAG_* value */
+	gint hash;         /* hash of `name`, filled in by rspamd_url_init() */
+} url_flag_names[] = {
+	{"phished", RSPAMD_URL_FLAG_PHISHED, -1},
+	{"numeric", RSPAMD_URL_FLAG_NUMERIC, -1},
+	{"obscured", RSPAMD_URL_FLAG_OBSCURED, -1},
+	{"redirected", RSPAMD_URL_FLAG_REDIRECTED, -1},
+	{"html_displayed", RSPAMD_URL_FLAG_HTML_DISPLAYED, -1},
+	{"text", RSPAMD_URL_FLAG_FROM_TEXT, -1},
+	{"subject", RSPAMD_URL_FLAG_SUBJECT, -1},
+	{"host_encoded", RSPAMD_URL_FLAG_HOSTENCODED, -1},
+	{"schema_encoded", RSPAMD_URL_FLAG_SCHEMAENCODED, -1},
+	{"path_encoded", RSPAMD_URL_FLAG_PATHENCODED, -1},
+	{"query_encoded", RSPAMD_URL_FLAG_QUERYENCODED, -1},
+	{"missing_slashes", RSPAMD_URL_FLAG_MISSINGSLASHES, -1},
+	{"idn", RSPAMD_URL_FLAG_IDN, -1},
+	{"has_port", RSPAMD_URL_FLAG_HAS_PORT, -1},
+	{"has_user", RSPAMD_URL_FLAG_HAS_USER, -1},
+	{"schemaless", RSPAMD_URL_FLAG_SCHEMALESS, -1},
+	{"unnormalised", RSPAMD_URL_FLAG_UNNORMALISED, -1},
+	{"zw_spaces", RSPAMD_URL_FLAG_ZW_SPACES, -1},
+	{"url_displayed", RSPAMD_URL_FLAG_DISPLAY_URL, -1},
+	{"image", RSPAMD_URL_FLAG_IMAGE, -1},
+	{"query", RSPAMD_URL_FLAG_QUERY, -1},
+	{"content", RSPAMD_URL_FLAG_CONTENT, -1},
+	{"no_tld", RSPAMD_URL_FLAG_NO_TLD, -1},
+	{"truncated", RSPAMD_URL_FLAG_TRUNCATED, -1},
+	{"redirect_target", RSPAMD_URL_FLAG_REDIRECT_TARGET, -1},
+	{"invisible", RSPAMD_URL_FLAG_INVISIBLE, -1},
+	{"special", RSPAMD_URL_FLAG_SPECIAL, -1},
+};
+
+
+/*
+ * Hash and equality callbacks for the two URL hash tables instantiated below;
+ * they are only declared here, the definitions are elsewhere in the file.
+ */
+static inline khint_t rspamd_url_hash(struct rspamd_url *u);
+
+static inline khint_t rspamd_url_host_hash(struct rspamd_url *u);
+static inline bool rspamd_urls_cmp(struct rspamd_url *a, struct rspamd_url *b);
+static inline bool rspamd_urls_host_cmp(struct rspamd_url *a, struct rspamd_url *b);
+
+/*
+ * Hash table implementation.
+ * Two khash instantiations keyed by `struct rspamd_url *`:
+ * `rspamd_url_hash` uses rspamd_url_hash/rspamd_urls_cmp and
+ * `rspamd_url_host_hash` uses rspamd_url_host_hash/rspamd_urls_host_cmp.
+ * NOTE(review): the `false` argument is presumably khash's "is map" switch,
+ * i.e. both tables are sets and the `char` value type is unused - confirm
+ * against khash.h.
+ */
+__KHASH_IMPL(rspamd_url_hash, kh_inline, struct rspamd_url *, char, false,
+			 rspamd_url_hash, rspamd_urls_cmp);
+__KHASH_IMPL(rspamd_url_host_hash, kh_inline, struct rspamd_url *, char, false,
+			 rspamd_url_host_hash, rspamd_urls_host_cmp);
+
+/*
+ * Scan state passed as `cb` to every matcher start/end callback.
+ * NOTE(review): none of these fields is read or written in this part of the
+ * file; the notes below are inferred from names and types - confirm against
+ * the scanning code before relying on them.
+ */
+struct url_callback_data {
+	const gchar *begin; /* presumably start of the scanned text */
+	gchar *url_str;
+	rspamd_mempool_t *pool; /* presumably the pool used for allocations made during the scan */
+	gint len;
+	enum rspamd_url_find_type how;
+	gboolean prefix_added;
+	guint newline_idx;
+	GArray *matchers;
+	GPtrArray *newlines;
+	const gchar *start;
+	const gchar *fin;
+	const gchar *end; /* presumably end of the scanned text */
+	const gchar *last_at;
+	url_insert_function func; /* presumably the user callback invoked per URL found */
+	void *funcd;              /* presumably the opaque argument for `func` */
+};
+
+/*
+ * Matcher sets owned by the scanner.
+ * The "strict" pair always holds the built-in static_matchers.
+ * The "full" pair is created by rspamd_url_init() only when a TLD file is
+ * given; it holds the built-in matchers plus the TLD suffixes and is NULL
+ * otherwise.
+ */
+struct url_match_scanner {
+	GArray *matchers_full;   /* array of struct url_matcher, or NULL */
+	GArray *matchers_strict; /* array of struct url_matcher */
+	struct rspamd_multipattern *search_trie_full;   /* patterns of matchers_full, or NULL */
+	struct rspamd_multipattern *search_trie_strict; /* patterns of matchers_strict */
+	bool has_tld_file; /* true when rspamd_url_init() was given a TLD file */
+};
+
+/* Global scanner instance: created by rspamd_url_init(), freed and reset to NULL by rspamd_url_deinit() */
+struct url_match_scanner *url_scanner = NULL;
+
+/*
+ * Character class bits stored in url_scanner_table.
+ * IS_LWSP, IS_MAILSAFE, IS_DOMAIN and IS_URLSAFE are tested by the is_lwsp(),
+ * is_mailsafe(), is_domain() and is_urlsafe() macros; no macro in this part of
+ * the file tests IS_DOMAIN_END.
+ */
+enum {
+	IS_LWSP = (1 << 0),
+	IS_DOMAIN = (1 << 1),
+	IS_URLSAFE = (1 << 2),
+	IS_MAILSAFE = (1 << 3),
+	IS_DOMAIN_END = (1 << 4)
+};
+
+/*
+ * Character classes for every byte value, indexed by the byte cast to an
+ * unsigned char (see the is_*() macros). Entries are listed in byte order:
+ * control characters first (0x09-0x0d and the space are IS_LWSP), then the
+ * printable ASCII range with a comment naming each character, then 0x7f (no
+ * class) and finally all bytes 0x80-0xff, which are IS_URLSAFE | IS_DOMAIN.
+ */
+static const unsigned int url_scanner_table[256] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, IS_LWSP, IS_LWSP, IS_LWSP, IS_LWSP, IS_LWSP, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, IS_LWSP /*   */,
+	IS_MAILSAFE /* ! */, IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* " */,
+	IS_MAILSAFE /* # */, IS_MAILSAFE /* $ */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* % */, 0 /* & */, IS_MAILSAFE /* ' */,
+	0 /* ( */, 0 /* ) */, IS_MAILSAFE /* * */,
+	IS_MAILSAFE /* + */, IS_MAILSAFE /* , */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* - */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* . */, IS_DOMAIN_END | IS_MAILSAFE /* / */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 0 */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 1 */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 2 */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 3 */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 4 */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 5 */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 6 */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 7 */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 8 */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 9 */, IS_DOMAIN_END /* : */,
+	0 /* ; */, IS_URLSAFE | IS_DOMAIN_END /* < */, 0 /* = */,
+	IS_URLSAFE | IS_DOMAIN_END /* > */, IS_DOMAIN_END /* ? */, 0 /* @ */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* A */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* B */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* C */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* D */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* E */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* F */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* G */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* H */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* I */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* J */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* K */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* L */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* M */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* N */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* O */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* P */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* Q */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* R */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* S */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* T */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* U */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* V */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* W */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* X */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* Y */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* Z */, 0 /* [ */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* \ */, 0 /* ] */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* ^ */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* _ */,
+	IS_URLSAFE | IS_DOMAIN_END /* ` */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* a */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* b */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* c */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* d */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* e */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* f */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* g */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* h */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* i */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* j */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* k */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* l */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* m */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* n */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* o */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* p */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* q */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* r */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* s */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* t */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* u */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* v */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* w */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* x */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* y */,
+	IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* z */,
+	IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* { */,
+	IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* | */,
+	IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* } */,
+	IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* ~ */, 0, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN,
+	IS_URLSAFE | IS_DOMAIN};
+
+/*
+ * Character class predicates backed by url_scanner_table.
+ * The argument is cast to guchar before indexing, so a (possibly signed)
+ * gchar with the high bit set still selects an entry inside the table.
+ */
+#define is_lwsp(x) ((url_scanner_table[(guchar) (x)] & IS_LWSP) != 0)
+#define is_mailsafe(x) ((url_scanner_table[(guchar) (x)] & (IS_MAILSAFE)) != 0)
+#define is_domain(x) ((url_scanner_table[(guchar) (x)] & IS_DOMAIN) != 0)
+#define is_urlsafe(x) ((url_scanner_table[(guchar) (x)] & (IS_URLSAFE)) != 0)
+
+/**
+ * Translate a URI_ERRNO_* status code into a static description.
+ * @param err status code to describe
+ * @return constant string owned by the library, or NULL for an unknown code
+ */
+const gchar *
+rspamd_url_strerror(int err)
+{
+	const gchar *descr = NULL;
+
+	switch (err) {
+	case URI_ERRNO_OK:
+		descr = "Parsing went well";
+		break;
+	case URI_ERRNO_EMPTY:
+		descr = "The URI string was empty";
+		break;
+	case URI_ERRNO_INVALID_PROTOCOL:
+		descr = "No protocol was found";
+		break;
+	case URI_ERRNO_BAD_FORMAT:
+		descr = "Bad URL format";
+		break;
+	case URI_ERRNO_BAD_ENCODING:
+		descr = "Invalid symbols encoded";
+		break;
+	case URI_ERRNO_INVALID_PORT:
+		descr = "Port number is bad";
+		break;
+	case URI_ERRNO_TLD_MISSING:
+		descr = "TLD part is not detected";
+		break;
+	case URI_ERRNO_HOST_MISSING:
+		descr = "Host part is missing";
+		break;
+	case URI_ERRNO_TOO_LONG:
+		descr = "URL is too long";
+		break;
+	default:
+		/* Unknown code: keep NULL */
+		break;
+	}
+
+	return descr;
+}
+
+/**
+ * Load TLD suffix patterns from a text file into the scanner's "full" set.
+ *
+ * Lines starting with '/' or whitespace are skipped (comments and empty
+ * lines), as are '!' exception rules, which are not supported yet. A line
+ * starting with '*' is registered by the part that follows its first '.',
+ * with URL_MATCHER_FLAG_STAR_MATCH set. Every accepted line is added to
+ * scanner->search_trie_full and a matching entry using the url_tld_start/
+ * url_tld_end callbacks and the "http://" prefix is appended to
+ * scanner->matchers_full.
+ *
+ * @param fname path of the TLD list
+ * @param scanner scanner to fill; its full trie and array must already exist
+ * @return FALSE when the file cannot be opened, TRUE otherwise
+ */
+static gboolean
+rspamd_url_parse_tld_file(const gchar *fname,
+						  struct url_match_scanner *scanner)
+{
+	FILE *f;
+	struct url_matcher m;
+	gchar *linebuf = NULL, *p;
+	gsize buflen = 0;
+	gssize r;
+	gint flags;
+
+	f = fopen(fname, "r");
+
+	if (f == NULL) {
+		msg_err("cannot open TLD file %s: %s", fname, strerror(errno));
+		return FALSE;
+	}
+
+	/* These fields are the same for every TLD matcher */
+	m.end = url_tld_end;
+	m.start = url_tld_start;
+	m.prefix = "http://";
+
+	while ((r = getline(&linebuf, &buflen, f)) > 0) {
+		if (linebuf[0] == '/' || g_ascii_isspace(linebuf[0])) {
+			/* Skip comment or empty line */
+			continue;
+		}
+
+		g_strchomp(linebuf);
+
+		/* TODO: add support for ! patterns */
+		if (linebuf[0] == '!') {
+			msg_debug("skip '!' patterns from parsing for now: %s", linebuf);
+			continue;
+		}
+
+		flags = URL_MATCHER_FLAG_NOHTML | URL_MATCHER_FLAG_TLD_MATCH;
+
+		if (linebuf[0] == '*') {
+			flags |= URL_MATCHER_FLAG_STAR_MATCH;
+			p = strchr(linebuf, '.');
+
+			/* Reject lines without a dot and lines with nothing after it */
+			if (p == NULL || p[1] == '\0') {
+				msg_err("got bad star line, skip it: %s", linebuf);
+				continue;
+			}
+			p++;
+		}
+		else {
+			p = linebuf;
+		}
+
+		m.flags = flags;
+		/*
+		 * Use the scanner passed by the caller, not the global one; the
+		 * stored pattern pointer is the trie's own copy of the last added
+		 * pattern, as linebuf is reused by the next getline() call.
+		 */
+		rspamd_multipattern_add_pattern(scanner->search_trie_full, p,
+										RSPAMD_MULTIPATTERN_TLD | RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8);
+		m.pattern = rspamd_multipattern_get_pattern(scanner->search_trie_full,
+													rspamd_multipattern_get_npatterns(scanner->search_trie_full) - 1);
+
+		g_array_append_val(scanner->matchers_full, m);
+	}
+
+	free(linebuf);
+	fclose(f);
+
+	return TRUE;
+}
+
+/* Register the pattern of every built-in matcher in `trie` (regexp entries get RSPAMD_MULTIPATTERN_RE) */
+static void
+rspamd_url_trie_add_static_patterns(struct rspamd_multipattern *trie)
+{
+	for (guint i = 0; i < G_N_ELEMENTS(static_matchers); i++) {
+		gint pat_flags = RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8;
+
+		if (static_matchers[i].flags & URL_MATCHER_FLAG_REGEXP) {
+			pat_flags |= RSPAMD_MULTIPATTERN_RE;
+		}
+
+		rspamd_multipattern_add_pattern(trie, static_matchers[i].pattern,
+										pat_flags);
+	}
+}
+
+/**
+ * Add the built-in static_matchers to a scanner.
+ *
+ * The patterns go to the strict trie and the matcher structures to the
+ * strict array; when the scanner has a full matcher array, the same is done
+ * for the full trie and array. Trie pattern order and array order are kept
+ * identical, and the static entries are added before anything else.
+ *
+ * @param sc scanner to fill; the tries and arrays are taken from it
+ */
+static void
+rspamd_url_add_static_matchers(struct url_match_scanner *sc)
+{
+	gint n = G_N_ELEMENTS(static_matchers);
+
+	rspamd_url_trie_add_static_patterns(sc->search_trie_strict);
+	g_array_append_vals(sc->matchers_strict, static_matchers, n);
+
+	if (sc->matchers_full) {
+		rspamd_url_trie_add_static_patterns(sc->search_trie_full);
+		g_array_append_vals(sc->matchers_full, static_matchers, n);
+	}
+}
+
+/**
+ * Destroy the global URL scanner, if any, and reset it to NULL.
+ * The full trie and its matcher array are released only when the full trie
+ * exists; the strict trie and array are always released.
+ */
+void rspamd_url_deinit(void)
+{
+	struct url_match_scanner *sc = url_scanner;
+
+	if (sc == NULL) {
+		/* Nothing was initialised */
+		return;
+	}
+
+	if (sc->search_trie_full) {
+		rspamd_multipattern_destroy(sc->search_trie_full);
+		g_array_free(sc->matchers_full, TRUE);
+	}
+
+	rspamd_multipattern_destroy(sc->search_trie_strict);
+	g_array_free(sc->matchers_strict, TRUE);
+	g_free(sc);
+
+	url_scanner = NULL;
+}
+
+/**
+ * (Re)create the global URL scanner.
+ *
+ * An existing scanner is destroyed first. The strict matcher set always
+ * receives the built-in matchers. When tld_file is given, a second, "full"
+ * set is created from the built-in matchers plus the suffixes read from that
+ * file. A compile failure of the strict set aborts the process; a failure to
+ * load or compile the TLD set is only logged. Finally the name hashes in
+ * url_flag_names are computed and checked for collisions (a collision aborts).
+ *
+ * @param tld_file path of the TLD suffix list, or NULL for built-in matchers only
+ */
+void rspamd_url_init(const gchar *tld_file)
+{
+	const gint trie_flags = RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8;
+	/* Initial capacity reserved for the TLD-backed matcher set */
+	const guint tld_reserve = 13000;
+	const gboolean with_tld = (tld_file != NULL);
+	const gsize n_names = G_N_ELEMENTS(url_flag_names);
+	GError *err = NULL;
+	gboolean ret = TRUE;
+
+	if (url_scanner != NULL) {
+		rspamd_url_deinit();
+	}
+
+	url_scanner = g_malloc(sizeof(struct url_match_scanner));
+
+	url_scanner->matchers_strict = g_array_sized_new(FALSE, TRUE,
+													 sizeof(struct url_matcher), G_N_ELEMENTS(static_matchers));
+	url_scanner->search_trie_strict = rspamd_multipattern_create_sized(
+		G_N_ELEMENTS(static_matchers), trie_flags);
+
+	/* The full set exists only when a TLD file was requested */
+	if (with_tld) {
+		url_scanner->matchers_full = g_array_sized_new(FALSE, TRUE,
+													   sizeof(struct url_matcher), tld_reserve);
+		url_scanner->search_trie_full = rspamd_multipattern_create_sized(tld_reserve,
+																		 trie_flags);
+		url_scanner->has_tld_file = true;
+	}
+	else {
+		url_scanner->matchers_full = NULL;
+		url_scanner->search_trie_full = NULL;
+		url_scanner->has_tld_file = false;
+	}
+
+	/* Built-in matchers go first, TLD suffixes are appended after them */
+	rspamd_url_add_static_matchers(url_scanner);
+
+	if (with_tld) {
+		ret = rspamd_url_parse_tld_file(tld_file, url_scanner);
+	}
+
+	if (url_scanner->matchers_full != NULL &&
+		url_scanner->matchers_full->len > 1000) {
+		msg_info("start compiling of %d TLD suffixes; it might take a long time",
+				 url_scanner->matchers_full->len);
+	}
+
+	if (!rspamd_multipattern_compile(url_scanner->search_trie_strict, &err)) {
+		msg_err("cannot compile url matcher static patterns, fatal error: %e", err);
+		abort();
+	}
+
+	if (url_scanner->search_trie_full != NULL &&
+		!rspamd_multipattern_compile(url_scanner->search_trie_full, &err)) {
+		msg_err("cannot compile tld patterns, url matching will be "
+				"incomplete: %e",
+				err);
+		g_error_free(err);
+		ret = FALSE;
+	}
+
+	if (with_tld && ret) {
+		msg_info("initialized %ud url match suffixes from '%s'",
+				 url_scanner->matchers_full->len - url_scanner->matchers_strict->len,
+				 tld_file);
+	}
+	else if (with_tld) {
+		msg_err("failed to initialize url tld suffixes from '%s', "
+				"use %ud internal match suffixes",
+				tld_file,
+				url_scanner->matchers_strict->len);
+	}
+
+	/* Generate hashes for flags */
+	for (gsize i = 0; i < n_names; i++) {
+		const gchar *flag_name = url_flag_names[i].name;
+
+		url_flag_names[i].hash =
+			rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
+												flag_name,
+												strlen(flag_name), 0);
+	}
+
+	/* Pairwise collision check: quadratic, but the table is small */
+	for (gsize i = 0; i + 1 < n_names; i++) {
+		for (gsize j = i + 1; j < n_names; j++) {
+			if (url_flag_names[i].hash != url_flag_names[j].hash) {
+				continue;
+			}
+
+			msg_err("collision: both %s and %s map to %d",
+					url_flag_names[i].name, url_flag_names[j].name,
+					url_flag_names[i].hash);
+			abort();
+		}
+	}
+}
+
+/*
+ * Record URL component `field` in the http_parser_url `u` (no-op when `u` is
+ * NULL): mark the field as set and store its span.
+ * The macro implicitly uses three locals of the calling parser:
+ *   str - start of the input, c - start of the component, p - current
+ *   position (one past the component's last byte).
+ * The stored offset is c - str and the stored length is p - c.
+ */
+#define SET_U(u, field)                             \
+	do {                                            \
+		if ((u) != NULL) {                          \
+			(u)->field_set |= 1 << (field);         \
+			(u)->field_data[(field)].len = p - c;   \
+			(u)->field_data[(field)].off = c - str; \
+		}                                           \
+	} while (0)
+
+/* True for an opening bracket or a single quote: '(', '{', '[', '<', '\'' */
+static bool
+is_url_start(gchar c)
+{
+	switch (c) {
+	case '(':
+	case '{':
+	case '[':
+	case '<':
+	case '\'':
+		return TRUE;
+	default:
+		return FALSE;
+	}
+}
+
+/* True for a closing bracket or a single quote: ')', '}', ']', '>', '\'' */
+static bool
+is_url_end(gchar c)
+{
+	switch (c) {
+	case ')':
+	case '}':
+	case ']':
+	case '>':
+	case '\'':
+		return TRUE;
+	default:
+		return FALSE;
+	}
+}
+
+/*
+ * True when `p` may begin a host name: an ASCII letter or digit, one of
+ * '[', '%', '_', or any value with bit 0x80 set.
+ */
+static bool
+is_domain_start(int p)
+{
+	const bool punct_ok = (p == '[' || p == '%' || p == '_');
+	const bool high_bit = (p & 0x80) != 0;
+
+	return g_ascii_isalnum(p) || punct_ok || high_bit;
+}
+
+/*
+ * Length limits, in bytes, applied by the parsers below.
+ * max_domain_length bounds the mail domain and the web password;
+ * max_email_user bounds the user part and also caps telephone URLs as a whole.
+ * max_dns_label is not referenced in this part of the file.
+ */
+static const guint max_domain_length = 253;
+static const guint max_dns_label = 63;
+static const guint max_email_user = 64;
+
+/**
+ * State machine that parses a mail URL of the form
+ *   schema ':' [slashes] user '@' domain ['?' query]
+ * or
+ *   schema ':' [slashes] '?' 't'... '=' user '@' domain ['?' query]
+ *
+ * @param u optional output; when not NULL it is zeroed and the schema, user,
+ *          host and query spans are recorded in it through SET_U
+ * @param str input buffer (not required to be NUL terminated; `len` bounds it)
+ * @param len number of bytes available in str
+ * @param end optional output; receives the position where parsing stopped
+ * @param parse_flags RSPAMD_URL_PARSE_CHECK forces the return value to 0;
+ *          RSPAMD_URL_PARSE_HREF disables stopping the query at a closing
+ *          bracket or quote
+ * @param flags URL flags to update; RSPAMD_URL_FLAG_MISSINGSLASHES is set when
+ *          the ':' is not followed by a slash. Dereferenced without a NULL check.
+ * @return 0 when a URL was parsed, 1 otherwise (always 0 with RSPAMD_URL_PARSE_CHECK)
+ */
+static gint
+rspamd_mailto_parse(struct http_parser_url *u,
+					const gchar *str, gsize len,
+					gchar const **end,
+					enum rspamd_url_parse_flags parse_flags, guint *flags)
+{
+	/* p: current position, c: start of the component being collected */
+	const gchar *p = str, *c = str, *last = str + len;
+	gchar t;
+	gint ret = 1;
+	enum {
+		parse_mailto,
+		parse_slash,
+		parse_slash_slash,
+		parse_semicolon,
+		parse_prefix_question,
+		parse_destination,
+		parse_equal,
+		parse_user,
+		parse_at,
+		parse_domain,
+		parse_suffix_question,
+		parse_query
+	} st = parse_mailto;
+
+	if (u != NULL) {
+		memset(u, 0, sizeof(*u));
+	}
+
+	while (p < last) {
+		t = *p;
+
+		/* Give up once more input was consumed than user + '@' + domain may take */
+		if (p - str > max_email_user + max_domain_length + 1) {
+			goto out;
+		}
+
+		switch (st) {
+		case parse_mailto:
+			/* Everything up to the first ':' is the schema */
+			if (t == ':') {
+				st = parse_semicolon;
+				SET_U(u, UF_SCHEMA);
+			}
+			p++;
+			break;
+		case parse_semicolon:
+			/* State right after the ':' (despite its name) */
+			if (t == '/' || t == '\\') {
+				st = parse_slash;
+				p++;
+			}
+			else {
+				/* No slashes at all: flag it and re-examine this character */
+				*flags |= RSPAMD_URL_FLAG_MISSINGSLASHES;
+				st = parse_slash_slash;
+			}
+			break;
+		case parse_slash:
+			/* A single slash must be followed by a second one */
+			if (t == '/' || t == '\\') {
+				st = parse_slash_slash;
+			}
+			else {
+				goto out;
+			}
+			p++;
+			break;
+		case parse_slash_slash:
+			if (t == '?') {
+				st = parse_prefix_question;
+				p++;
+			}
+			else if (t != '/' && t != '\\') {
+				/* First character of the user part; it is not consumed here */
+				c = p;
+				st = parse_user;
+			}
+			else {
+				/* Skip multiple slashes */
+				p++;
+			}
+			break;
+		case parse_prefix_question:
+			if (t == 't') {
+				/* XXX: accept only to= */
+				st = parse_destination;
+			}
+			else {
+				goto out;
+			}
+			break;
+		case parse_destination:
+			/* Skip everything up to and including the '=' */
+			if (t == '=') {
+				st = parse_equal;
+			}
+			p++;
+			break;
+		case parse_equal:
+			/* The user part starts right after the '=' */
+			c = p;
+			st = parse_user;
+			break;
+		case parse_user:
+			if (t == '@') {
+				/* An empty user part is an error */
+				if (p - c == 0) {
+					goto out;
+				}
+				SET_U(u, UF_USERINFO);
+				st = parse_at;
+			}
+			else if (!is_mailsafe(t)) {
+				goto out;
+			}
+			else if (p - c > max_email_user) {
+				goto out;
+			}
+			p++;
+			break;
+		case parse_at:
+			/* The domain starts right after the '@' */
+			c = p;
+			st = parse_domain;
+			break;
+		case parse_domain:
+			if (t == '?') {
+				SET_U(u, UF_HOST);
+				st = parse_suffix_question;
+			}
+			else if (!is_domain(t) && t != '.' && t != '_') {
+				goto out;
+			}
+			else if (p - c > max_domain_length) {
+				goto out;
+			}
+			p++;
+			break;
+		case parse_suffix_question:
+			/* The query starts right after the '?' */
+			c = p;
+			st = parse_query;
+			break;
+		case parse_query:
+			if (t == '#') {
+				/* A fragment ends the query; the query is stored when not empty */
+				if (p - c != 0) {
+					SET_U(u, UF_QUERY);
+				}
+				c = p + 1;
+				ret = 0;
+
+				goto out;
+			}
+			else if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) {
+				/*
+				 * Closing bracket or quote outside of href context ends the URL.
+				 * NOTE(review): UF_QUERY is not recorded on this exit path.
+				 */
+				ret = 0;
+				goto out;
+			}
+			else if (is_lwsp(t)) {
+				/*
+				 * Whitespace ends the URL; both branches leave the loop and
+				 * only the first one may report success.
+				 * NOTE(review): UF_QUERY is not recorded on this exit path.
+				 */
+				if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+					if (g_ascii_isspace(t)) {
+						ret = 0;
+					}
+					goto out;
+				}
+				else {
+					goto out;
+				}
+			}
+			p++;
+			break;
+		}
+	}
+
+	/* Input exhausted: close the component that was being collected */
+	if (st == parse_domain) {
+		if (p - c != 0) {
+			SET_U(u, UF_HOST);
+			ret = 0;
+		}
+	}
+	else if (st == parse_query) {
+		if (p - c > 0) {
+			SET_U(u, UF_QUERY);
+		}
+
+		ret = 0;
+	}
+
+out:
+	if (end != NULL) {
+		*end = p;
+	}
+
+	/* In check mode the caller only wants `end`; never report failure */
+	if ((parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+		return 0;
+	}
+
+	return ret;
+}
+
+/**
+ * State machine that parses a telephone URL of the form
+ *   schema ':' [slashes] [spaces] ['+'] number
+ * The number is stored as the host component (UF_HOST).
+ *
+ * @param u optional output; when not NULL it is zeroed and the schema and
+ *          host spans are recorded in it through SET_U
+ * @param str input buffer; `len` bounds it
+ * @param len number of bytes available in str
+ * @param end optional output; receives the position where parsing stopped
+ * @param parse_flags RSPAMD_URL_PARSE_CHECK forces the return value to 0
+ * @param flags not referenced by this function
+ * @return 0 when a number was parsed, 1 otherwise (always 0 with RSPAMD_URL_PARSE_CHECK)
+ */
+static gint
+rspamd_telephone_parse(struct http_parser_url *u,
+					   const gchar *str, gsize len,
+					   gchar const **end,
+					   enum rspamd_url_parse_flags parse_flags,
+					   guint *flags)
+{
+	enum {
+		parse_protocol,
+		parse_semicolon,
+		parse_slash,
+		parse_slash_slash,
+		parse_spaces,
+		parse_plus,
+		parse_phone_start,
+		parse_phone,
+	} st = parse_protocol;
+
+	/* p: current position, c: start of the component being collected */
+	const gchar *p = str, *c = str, *last = str + len;
+	gchar t;
+	gint ret = 1, i;
+	UChar32 uc;
+
+	if (u != NULL) {
+		memset(u, 0, sizeof(*u));
+	}
+
+	while (p < last) {
+		t = *p;
+
+		/* The whole URL is capped by the max_email_user limit */
+		if (p - str > max_email_user) {
+			goto out;
+		}
+
+		switch (st) {
+		case parse_protocol:
+			/* Everything up to the first ':' is the schema */
+			if (t == ':') {
+				st = parse_semicolon;
+				SET_U(u, UF_SCHEMA);
+			}
+			p++;
+			break;
+		case parse_semicolon:
+			/* State right after the ':' (despite its name) */
+			if (t == '/' || t == '\\') {
+				st = parse_slash;
+				p++;
+			}
+			else {
+				/* No slashes: re-examine this character in the next state */
+				st = parse_slash_slash;
+			}
+			break;
+		case parse_slash:
+			/* A single slash must be followed by a second one */
+			if (t == '/' || t == '\\') {
+				st = parse_slash_slash;
+			}
+			else {
+				goto out;
+			}
+			p++;
+			break;
+		case parse_slash_slash:
+			if (g_ascii_isspace(t)) {
+				st = parse_spaces;
+				p++;
+			}
+			else if (t == '+') {
+				c = p;
+				st = parse_plus;
+			}
+			else if (t == '/') {
+				/* Skip multiple slashes */
+				p++;
+			}
+			else {
+				st = parse_phone_start;
+				c = p;
+			}
+			break;
+		case parse_spaces:
+			/* Skip whitespace before the number */
+			if (t == '+') {
+				c = p;
+				st = parse_plus;
+			}
+			else if (!g_ascii_isspace(t)) {
+				st = parse_phone_start;
+				c = p;
+			}
+			else {
+				p++;
+			}
+			break;
+		case parse_plus:
+			/* The leading '+' is kept as the first byte of the number */
+			c = p;
+			p++;
+			st = parse_phone_start;
+			break;
+		case parse_phone_start:
+			/* The number must begin with '%', '(' or an ASCII digit */
+			if (*p == '%' || *p == '(' || g_ascii_isdigit(*p)) {
+				st = parse_phone;
+				p++;
+			}
+			else {
+				goto out;
+			}
+			break;
+		case parse_phone:
+			/* Decode one UTF-8 code point; U8_NEXT advances i past it */
+			i = p - str;
+			U8_NEXT(str, i, len, uc);
+			p = str + i;
+
+			if (u_isdigit(uc) || uc == '(' || uc == ')' || uc == '[' || uc == ']' || u_isspace(uc) || uc == '%') {
+				/* p is already incremented by U8_NEXT! */
+			}
+			else if (uc <= 0 || is_url_end(uc)) {
+				/*
+				 * Invalid sequence, NUL or a closing bracket/quote ends the number.
+				 * NOTE(review): p already points past this terminating character,
+				 * so the span stored below includes it; uc is also narrowed to
+				 * gchar by is_url_end(). Confirm both are intended.
+				 */
+				ret = 0;
+				goto set;
+			}
+			/* Any other code point is consumed without ending the number */
+			break;
+		}
+	}
+
+set:
+	/* Store the collected number, if any, as the host component */
+	if (st == parse_phone) {
+		if (p - c != 0) {
+			SET_U(u, UF_HOST);
+			ret = 0;
+		}
+	}
+
+out:
+	if (end != NULL) {
+		*end = p;
+	}
+
+	/* In check mode the caller only wants `end`; never report failure */
+	if ((parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+		return 0;
+	}
+
+	return ret;
+}
+
+static gint
+rspamd_web_parse(struct http_parser_url *u, const gchar *str, gsize len,
+				 gchar const **end,
+				 enum rspamd_url_parse_flags parse_flags,
+				 guint *flags)
+{
+	const gchar *p = str, *c = str, *last = str + len, *slash = NULL,
+				*password_start = NULL, *user_start = NULL;
+	gchar t = 0;
+	UChar32 uc;
+	glong pt;
+	gint ret = 1;
+	gboolean user_seen = FALSE;
+	enum {
+		parse_protocol,
+		parse_slash,
+		parse_slash_slash,
+		parse_semicolon,
+		parse_user,
+		parse_at,
+		parse_multiple_at,
+		parse_password_start,
+		parse_password,
+		parse_domain_start,
+		parse_domain,
+		parse_ipv6,
+		parse_port_password,
+		parse_port,
+		parse_suffix_slash,
+		parse_path,
+		parse_query,
+		parse_part
+	} st = parse_protocol;
+
+	if (u != NULL) {
+		memset(u, 0, sizeof(*u));
+	}
+
+	while (p < last) {
+		t = *p;
+
+		switch (st) {
+		case parse_protocol:
+			if (t == ':') {
+				st = parse_semicolon;
+				SET_U(u, UF_SCHEMA);
+			}
+			else if (!g_ascii_isalnum(t) && t != '+' && t != '-') {
+				if ((parse_flags & RSPAMD_URL_PARSE_CHECK) && p > c) {
+					/* We might have some domain, but no protocol */
+					st = parse_domain_start;
+					p = c;
+					slash = c;
+					break;
+				}
+				else {
+					goto out;
+				}
+			}
+			p++;
+			break;
+		case parse_semicolon:
+			if (t == '/' || t == '\\') {
+				st = parse_slash;
+				p++;
+			}
+			else {
+				st = parse_slash_slash;
+				*(flags) |= RSPAMD_URL_FLAG_MISSINGSLASHES;
+			}
+			break;
+		case parse_slash:
+			if (t == '/' || t == '\\') {
+				st = parse_slash_slash;
+			}
+			else {
+				goto out;
+			}
+			p++;
+			break;
+		case parse_slash_slash:
+
+			if (t != '/' && t != '\\') {
+				c = p;
+				slash = p;
+				st = parse_domain_start;
+
+				/*
+				 * Unfortunately, due to brain damage of the RFC 3986 authors,
+				 * we have to distinguish two possibilities here:
+				 * authority = [ userinfo "@" ] host [ ":" port ]
+				 * So if we have @ somewhere before hostname then we must process
+				 * with the username state. Otherwise, we have to process via
+				 * the hostname state. Unfortunately, there is no way to distinguish
+				 * them aside of running NFA or two DFA or performing lookahead.
+				 * Lookahead approach looks easier to implement.
+				 */
+
+				const char *tp = p;
+				while (tp < last) {
+					if (*tp == '@') {
+						user_seen = TRUE;
+						st = parse_user;
+						break;
+					}
+					else if (*tp == '/' || *tp == '#' || *tp == '?') {
+						st = parse_domain_start;
+						break;
+					}
+
+					tp++;
+				}
+
+				if (st == parse_domain_start && *p == '[') {
+					st = parse_ipv6;
+					p++;
+					c = p;
+				}
+			}
+			else {
+				/* Skip multiple slashes */
+				p++;
+			}
+			break;
+		case parse_ipv6:
+			if (t == ']') {
+				if (p - c == 0) {
+					goto out;
+				}
+				SET_U(u, UF_HOST);
+				p++;
+
+				if (*p == ':') {
+					st = parse_port;
+					c = p + 1;
+				}
+				else if (*p == '/' || *p == '\\') {
+					st = parse_path;
+					c = p + 1;
+				}
+				else if (*p == '?') {
+					st = parse_query;
+					c = p + 1;
+				}
+				else if (*p == '#') {
+					st = parse_part;
+					c = p + 1;
+				}
+				else if (p != last) {
+					goto out;
+				}
+			}
+			else if (!g_ascii_isxdigit(t) && t != ':' && t != '.') {
+				goto out;
+			}
+			p++;
+			break;
+		case parse_user:
+			if (t == ':') {
+				if (p - c == 0) {
+					goto out;
+				}
+				user_start = c;
+				st = parse_password_start;
+			}
+			else if (t == '@') {
+				/* No password */
+				if (p - c == 0) {
+					/* We have multiple at in fact */
+					st = parse_multiple_at;
+					user_seen = TRUE;
+					*flags |= RSPAMD_URL_FLAG_OBSCURED;
+
+					continue;
+				}
+
+				SET_U(u, UF_USERINFO);
+				*flags |= RSPAMD_URL_FLAG_HAS_USER;
+				st = parse_at;
+			}
+			else if (!g_ascii_isgraph(t)) {
+				goto out;
+			}
+			else if (p - c > max_email_user) {
+				goto out;
+			}
+
+			p++;
+			break;
+		case parse_multiple_at:
+			if (t != '@') {
+				if (p - c == 0) {
+					goto out;
+				}
+
+				/* For now, we ignore all that stuff as it is bogus */
+				/* Off by one */
+				p--;
+				SET_U(u, UF_USERINFO);
+				p++;
+				*flags |= RSPAMD_URL_FLAG_HAS_USER;
+				st = parse_at;
+			}
+			else {
+				p++;
+			}
+			break;
+		case parse_password_start:
+			if (t == '@') {
+				/* Empty password */
+				SET_U(u, UF_USERINFO);
+				if (u != NULL && u->field_data[UF_USERINFO].len > 0) {
+					/* Eat semicolon */
+					u->field_data[UF_USERINFO].len--;
+				}
+				*flags |= RSPAMD_URL_FLAG_HAS_USER;
+				st = parse_at;
+			}
+			else {
+				c = p;
+				password_start = p;
+				st = parse_password;
+			}
+			p++;
+			break;
+		case parse_password:
+			if (t == '@') {
+				/* XXX: password is not stored */
+				if (u != NULL) {
+					if (u->field_data[UF_USERINFO].len == 0 && password_start && user_start && password_start > user_start + 1) {
+						*flags |= RSPAMD_URL_FLAG_HAS_USER;
+						u->field_set |= 1u << (UF_USERINFO);
+						u->field_data[UF_USERINFO].len =
+							password_start - user_start - 1;
+						u->field_data[UF_USERINFO].off =
+							user_start - str;
+					}
+				}
+				st = parse_at;
+			}
+			else if (!g_ascii_isgraph(t)) {
+				goto out;
+			}
+			else if (p - c > max_domain_length) {
+				goto out;
+			}
+			p++;
+			break;
+		case parse_at:
+			c = p;
+
+			if (t == '@') {
+				*flags |= RSPAMD_URL_FLAG_OBSCURED;
+				p++;
+			}
+			else if (t == '[') {
+				st = parse_ipv6;
+				p++;
+				c = p;
+			}
+			else {
+				st = parse_domain_start;
+			}
+			break;
+		case parse_domain_start:
+			if (is_domain_start(t)) {
+				st = parse_domain;
+			}
+			else {
+				goto out;
+			}
+			break;
+		case parse_domain:
+			if (p - c > max_domain_length) {
+				/* Too large domain */
+				goto out;
+			}
+			if (t == '/' || t == '\\' || t == ':' || t == '?' || t == '#') {
+				if (p - c == 0) {
+					goto out;
+				}
+				if (t == '/' || t == '\\') {
+					SET_U(u, UF_HOST);
+					st = parse_suffix_slash;
+				}
+				else if (t == '?') {
+					SET_U(u, UF_HOST);
+					st = parse_query;
+					c = p + 1;
+				}
+				else if (t == '#') {
+					SET_U(u, UF_HOST);
+					st = parse_part;
+					c = p + 1;
+				}
+				else if (t == ':' && !user_seen) {
+					/*
+					 * Here we can have both port and password, hence we need
+					 * to apply some heuristic here
+					 */
+					st = parse_port_password;
+				}
+				else {
+					/*
+					 * We can go only for parsing port here
+					 */
+					SET_U(u, UF_HOST);
+					st = parse_port;
+					c = p + 1;
+				}
+				p++;
+			}
+			else {
+				if (is_url_end(t) || is_url_start(t)) {
+					goto set;
+				}
+				else if (*p == '@' && !user_seen) {
+					/* We need to fallback and test user */
+					p = slash;
+					user_seen = TRUE;
+					st = parse_user;
+				}
+				else if (*p != '.' && *p != '-' && *p != '_' && *p != '%') {
+					if (*p & 0x80) {
+						guint i = 0;
+
+						U8_NEXT(((const guchar *) p), i, last - p, uc);
+
+						if (uc < 0) {
+							/* Bad utf8 */
+							goto out;
+						}
+
+						if (!u_isalnum(uc)) {
+							/* Bad symbol */
+							if (IS_ZERO_WIDTH_SPACE(uc)) {
+								(*flags) |= RSPAMD_URL_FLAG_ZW_SPACES;
+							}
+							else {
+								if (!u_isgraph(uc)) {
+									if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+										goto out;
+									}
+									else {
+										goto set;
+									}
+								}
+							}
+						}
+						else {
+							(*flags) |= RSPAMD_URL_FLAG_IDN;
+						}
+
+						p = p + i;
+					}
+					else if (is_urlsafe(*p)) {
+						p++;
+					}
+					else {
+						if (parse_flags & RSPAMD_URL_PARSE_HREF) {
+							/* We have to use all shit we are given here */
+							p++;
+							(*flags) |= RSPAMD_URL_FLAG_OBSCURED;
+						}
+						else {
+							if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+								goto out;
+							}
+							else {
+								goto set;
+							}
+						}
+					}
+				}
+				else {
+					p++;
+				}
+			}
+			break;
+		case parse_port_password:
+			if (g_ascii_isdigit(t)) {
+				const gchar *tmp = p;
+
+				while (tmp < last) {
+					if (!g_ascii_isdigit(*tmp)) {
+						if (*tmp == '/' || *tmp == '#' || *tmp == '?' ||
+							is_url_end(*tmp) || g_ascii_isspace(*tmp)) {
+							/* Port + something */
+							st = parse_port;
+							c = slash;
+							p--;
+							SET_U(u, UF_HOST);
+							p++;
+							c = p;
+							break;
+						}
+						else {
+							/* Not a port, bad character at the end */
+							break;
+						}
+					}
+					tmp++;
+				}
+
+				if (tmp == last) {
+					/* Host + port only */
+					st = parse_port;
+					c = slash;
+					p--;
+					SET_U(u, UF_HOST);
+					p++;
+					c = p;
+				}
+
+				if (st != parse_port) {
+					/* Fallback to user:password */
+					p = slash;
+					c = slash;
+					user_seen = TRUE;
+					st = parse_user;
+				}
+			}
+			else {
+				/* Rewind back */
+				p = slash;
+				c = slash;
+				user_seen = TRUE;
+				st = parse_user;
+			}
+			break;
+		case parse_port:
+			if (t == '/' || t == '\\') {
+				pt = strtoul(c, NULL, 10);
+				if (pt == 0 || pt > 65535) {
+					goto out;
+				}
+				if (u != NULL) {
+					u->port = pt;
+					*flags |= RSPAMD_URL_FLAG_HAS_PORT;
+				}
+				st = parse_suffix_slash;
+			}
+			else if (t == '?') {
+				pt = strtoul(c, NULL, 10);
+				if (pt == 0 || pt > 65535) {
+					goto out;
+				}
+				if (u != NULL) {
+					u->port = pt;
+					*flags |= RSPAMD_URL_FLAG_HAS_PORT;
+				}
+
+				c = p + 1;
+				st = parse_query;
+			}
+			else if (t == '#') {
+				pt = strtoul(c, NULL, 10);
+				if (pt == 0 || pt > 65535) {
+					goto out;
+				}
+				if (u != NULL) {
+					u->port = pt;
+					*flags |= RSPAMD_URL_FLAG_HAS_PORT;
+				}
+
+				c = p + 1;
+				st = parse_part;
+			}
+			else if (is_url_end(t)) {
+				goto set;
+			}
+			else if (!g_ascii_isdigit(t)) {
+				if (!(parse_flags & RSPAMD_URL_PARSE_CHECK) ||
+					!g_ascii_isspace(t)) {
+					goto out;
+				}
+				else {
+					goto set;
+				}
+			}
+			p++;
+			break;
+		case parse_suffix_slash:
+			if (t != '/' && t != '\\') {
+				c = p;
+				st = parse_path;
+			}
+			else {
+				/* Skip extra slashes */
+				p++;
+			}
+			break;
+		case parse_path:
+			if (t == '?') {
+				if (p - c != 0) {
+					SET_U(u, UF_PATH);
+				}
+				c = p + 1;
+				st = parse_query;
+			}
+			else if (t == '#') {
+				/* No query, just fragment */
+				if (p - c != 0) {
+					SET_U(u, UF_PATH);
+				}
+				c = p + 1;
+				st = parse_part;
+			}
+			else if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) {
+				goto set;
+			}
+			else if (is_lwsp(t)) {
+				if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+					if (g_ascii_isspace(t)) {
+						goto set;
+					}
+					goto out;
+				}
+				else {
+					goto set;
+				}
+			}
+			p++;
+			break;
+		case parse_query:
+			if (t == '#') {
+				if (p - c != 0) {
+					SET_U(u, UF_QUERY);
+				}
+				c = p + 1;
+				st = parse_part;
+			}
+			else if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) {
+				goto set;
+			}
+			else if (is_lwsp(t)) {
+				if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+					if (g_ascii_isspace(t)) {
+						goto set;
+					}
+					goto out;
+				}
+				else {
+					goto set;
+				}
+			}
+			p++;
+			break;
+		case parse_part:
+			if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) {
+				goto set;
+			}
+			else if (is_lwsp(t)) {
+				if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) {
+					if (g_ascii_isspace(t)) {
+						goto set;
+					}
+					goto out;
+				}
+				else {
+					goto set;
+				}
+			}
+			p++;
+			break;
+		}
+	}
+
+set:
+	/* Parse remaining */
+	switch (st) {
+	case parse_domain:
+		if (p - c == 0 || !is_domain(*(p - 1)) || !is_domain(*c)) {
+			goto out;
+		}
+		SET_U(u, UF_HOST);
+		ret = 0;
+
+		break;
+	case parse_port:
+		pt = strtoul(c, NULL, 10);
+		if (pt == 0 || pt > 65535) {
+			goto out;
+		}
+		if (u != NULL) {
+			u->port = pt;
+		}
+
+		ret = 0;
+		break;
+	case parse_suffix_slash:
+		/* Url ends with '/' */
+		ret = 0;
+		break;
+	case parse_path:
+		if (p - c > 0) {
+			SET_U(u, UF_PATH);
+		}
+		ret = 0;
+		break;
+	case parse_query:
+		if (p - c > 0) {
+			SET_U(u, UF_QUERY);
+		}
+		ret = 0;
+		break;
+	case parse_part:
+		if (p - c > 0) {
+			SET_U(u, UF_FRAGMENT);
+		}
+		ret = 0;
+		break;
+	case parse_ipv6:
+		if (t != ']') {
+			ret = 1;
+		}
+		else {
+			/* e.g. http://[::] */
+			ret = 0;
+		}
+		break;
+	default:
+		/* Error state */
+		ret = 1;
+		break;
+	}
+out:
+	if (end != NULL) {
+		*end = p;
+	}
+
+	return ret;
+}
+
+#undef SET_U
+
+/*
+ * Multipattern callback used while looking up the TLD of a parsed URL.
+ *
+ * `context` is the `struct rspamd_url` being processed. A match is taken into
+ * account only if it starts with a dot and `match_pos` equals the host length,
+ * or if `match_pos` is one less than the host length and the host byte at
+ * `match_pos` is a dot; in the latter case that trailing dot is dropped
+ * from `hostlen`.
+ *
+ * For an accepted match the function walks backwards from the byte preceding
+ * the match, skipping one dot-separated label (two for star matchers), and
+ * stores the resulting suffix in `tldshift`/`tldlen` if it is longer than the
+ * suffix recorded so far.
+ *
+ * Always returns 0.
+ */
+static gint
+rspamd_tld_trie_callback(struct rspamd_multipattern *mp,
+						 guint strnum,
+						 gint match_start,
+						 gint match_pos,
+						 const gchar *text,
+						 gsize len,
+						 void *context)
+{
+	struct rspamd_url *url = context;
+	struct url_matcher *matcher = &g_array_index(url_scanner->matchers_full,
+												 struct url_matcher, strnum);
+	/* Star matchers require one extra label in front of the matched suffix */
+	gint labels_left = (matcher->flags & URL_MATCHER_FLAG_STAR_MATCH) ? 2 : 1;
+	const gchar *match = text + match_start;
+	const gchar *cursor = match - 1;
+	const gchar *host = rspamd_url_host_unsafe(url);
+	const gchar *tld_start;
+
+	if (*match != '.' || match_pos != (gint) url->hostlen) {
+		/* Something weird has been found: tolerate only a single trailing dot */
+		if (match_pos != (gint) url->hostlen - 1) {
+			return 0;
+		}
+
+		if (*(rspamd_url_host_unsafe(url) + match_pos) != '.') {
+			return 0;
+		}
+
+		/* This is a dot at the end of the domain, cut it off */
+		url->hostlen--;
+	}
+
+	/* Walk backwards over the required number of labels */
+	tld_start = host;
+
+	for (; cursor >= host && labels_left > 0; cursor--) {
+		if (*cursor == '.') {
+			labels_left--;
+			tld_start = cursor + 1;
+		}
+		else {
+			tld_start = cursor;
+		}
+	}
+
+	if (labels_left != 0 && cursor != host - 1) {
+		return 0;
+	}
+
+	/* Keep the longest suffix seen among all matches */
+	if (url->tldlen < rspamd_url_host_unsafe(url) + url->hostlen - tld_start) {
+		url->tldshift = (tld_start - url->string);
+		url->tldlen = rspamd_url_host_unsafe(url) + url->hostlen - tld_start;
+	}
+
+	return 0;
+}
+
+/*
+ * Rebuild the textual URL of `uri` around a numeric host address.
+ *
+ * A new buffer is allocated from `pool`; everything that precedes the host in
+ * the current string is copied verbatim, then the address `addr` (family `af`,
+ * AF_INET or anything else treated as AF_INET6) is printed with inet_ntop(),
+ * followed by the port (if flagged and `uri->ext` is set), path, query and
+ * fragment taken from the old string. Host, TLD, path, query and fragment
+ * offsets/lengths are updated to describe the new string and
+ * RSPAMD_URL_FLAG_NUMERIC is set. On return `uri->string`/`uri->urllen`
+ * refer to the new buffer.
+ */
+static void
+rspamd_url_regen_from_inet_addr(struct rspamd_url *uri, const void *addr, int af,
+								rspamd_mempool_t *pool)
+{
+	gchar *strbuf;
+	const gchar *start_offset;
+	/*
+	 * End of the host in the ORIGINAL string. It must be captured here since
+	 * hostlen is overwritten below while uri->string still points to the old
+	 * buffer until the very end of this function.
+	 */
+	const gsize orig_host_end = (gsize) uri->hostshift + uri->hostlen;
+	gsize slen = uri->urllen - uri->hostlen;
+	goffset r = 0;
+
+	/* Reserve room for the longest possible textual address */
+	if (af == AF_INET) {
+		slen += INET_ADDRSTRLEN;
+	}
+	else {
+		slen += INET6_ADDRSTRLEN;
+	}
+
+	if (uri->flags & RSPAMD_URL_FLAG_HAS_PORT) {
+		slen += sizeof("65535") - 1;
+	}
+
+	/* Allocate new string to build it from IP */
+	strbuf = rspamd_mempool_alloc(pool, slen + 1);
+	/* Everything up to the host is kept as is */
+	r += rspamd_snprintf(strbuf + r, slen - r, "%*s",
+						 (gint) (uri->hostshift),
+						 uri->string);
+
+	uri->hostshift = r;
+	uri->tldshift = r;
+	start_offset = strbuf + r;
+	inet_ntop(af, addr, strbuf + r, slen - r + 1);
+	uri->hostlen = strlen(start_offset);
+	r += uri->hostlen;
+	/* For numeric hosts the whole host is treated as TLD */
+	uri->tldlen = uri->hostlen;
+	uri->flags |= RSPAMD_URL_FLAG_NUMERIC;
+
+	/* Reconstruct URL */
+	if (uri->flags & RSPAMD_URL_FLAG_HAS_PORT && uri->ext) {
+		r += rspamd_snprintf(strbuf + r, slen - r, ":%ud",
+							 (unsigned int) uri->ext->port);
+	}
+	if (uri->datalen > 0) {
+		/* New path starts right after the slash written below */
+		start_offset = strbuf + r + 1;
+		r += rspamd_snprintf(strbuf + r, slen - r, "/%*s",
+							 (gint) uri->datalen,
+							 rspamd_url_data_unsafe(uri));
+		uri->datashift = start_offset - strbuf;
+	}
+	else {
+		/*
+		 * Add trailing slash if the original URL had one right after the
+		 * host. The check uses the original host end: the current hostlen
+		 * already describes the new address text, which may have a different
+		 * length than the original host.
+		 */
+		if (orig_host_end < uri->urllen &&
+			uri->string[orig_host_end] == '/') {
+			r += rspamd_snprintf(strbuf + r, slen - r, "/");
+		}
+	}
+
+	if (uri->querylen > 0) {
+		start_offset = strbuf + r + 1;
+		r += rspamd_snprintf(strbuf + r, slen - r, "?%*s",
+							 (gint) uri->querylen,
+							 rspamd_url_query_unsafe(uri));
+		uri->queryshift = start_offset - strbuf;
+	}
+	if (uri->fragmentlen > 0) {
+		start_offset = strbuf + r + 1;
+		r += rspamd_snprintf(strbuf + r, slen - r, "#%*s",
+							 (gint) uri->fragmentlen,
+							 rspamd_url_fragment_unsafe(uri));
+		uri->fragmentshift = start_offset - strbuf;
+	}
+
+	/* Switch to the new buffer only after all old components were copied */
+	uri->string = strbuf;
+	uri->urllen = r;
+}
+
+/*
+ * Check whether the host of `uri` is an IP address and, if so, rebuild the
+ * URL around the parsed address (see rspamd_url_regen_from_inet_addr).
+ *
+ * Surrounding square brackets and trailing dots are ignored. The host is
+ * first tried as a regular IPv4 and then IPv6 address. If both fail, a
+ * heuristic for "broken" numeric hosts is applied: up to four dot-separated
+ * components are parsed with strtol() using base 0 (so decimal, octal and hex
+ * notations are accepted) and combined into an IPv4 address; URLs accepted
+ * that way are additionally flagged as RSPAMD_URL_FLAG_OBSCURED.
+ *
+ * Returns TRUE if the URL has been regenerated, FALSE otherwise.
+ */
+static gboolean
+rspamd_url_maybe_regenerate_from_ip(struct rspamd_url *uri, rspamd_mempool_t *pool)
+{
+	const gchar *p, *end, *c;
+	gchar *errstr;
+	struct in_addr in4;
+	struct in6_addr in6;
+	gboolean ret = FALSE, check_num = TRUE;
+	guint32 n, dots, t = 0, i = 0, shift, nshift;
+
+	p = rspamd_url_host_unsafe(uri);
+	end = p + uri->hostlen;
+
+	if (*p == '[' && *(end - 1) == ']') {
+		p++;
+		end--;
+	}
+
+	/* Check bounds first so that nothing before the host is ever read */
+	while (end > p && *(end - 1) == '.') {
+		end--;
+	}
+
+	if (end - p == 0 || end - p > INET6_ADDRSTRLEN) {
+		return FALSE;
+	}
+
+	if (rspamd_str_has_8bit(p, end - p)) {
+		return FALSE;
+	}
+
+	if (rspamd_parse_inet_address_ip4(p, end - p, &in4)) {
+		rspamd_url_regen_from_inet_addr(uri, &in4, AF_INET, pool);
+		ret = TRUE;
+	}
+	else if (rspamd_parse_inet_address_ip6(p, end - p, &in6)) {
+		rspamd_url_regen_from_inet_addr(uri, &in6, AF_INET6, pool);
+		ret = TRUE;
+	}
+	else {
+		/* Heuristics for broken urls */
+		gchar buf[INET6_ADDRSTRLEN + 1];
+		/* Try also numeric notation */
+		c = p;
+		n = 0;
+		dots = 0;
+		shift = 0;
+
+		/*
+		 * `c` marks the start of the current component, `p` scans up to and
+		 * including `end` so that the last component is processed as well.
+		 */
+		while (p <= end && check_num) {
+			if (shift < 32 &&
+				((*p == '.' && dots < 3) || (p == end && dots <= 3))) {
+				if (p - c + 1 >= (gint) sizeof(buf)) {
+					msg_debug_pool("invalid numeric url %*.s...: too long",
+								   INET6_ADDRSTRLEN, c);
+					return FALSE;
+				}
+
+				rspamd_strlcpy(buf, c, p - c + 1);
+				c = p + 1;
+
+				if (p < end && *p == '.') {
+					dots++;
+				}
+
+				glong long_n = strtol(buf, &errstr, 0);
+
+				if ((errstr == NULL || *errstr == '\0') && long_n >= 0) {
+
+					t = long_n; /* Truncate as windows does */
+					/*
+					 * Even if we have zero, we need to shift by 1 octet
+					 */
+					nshift = (t == 0 ? shift + 8 : shift);
+
+					/*
+					 * Here we count number of octets encoded in this element
+					 */
+					for (i = 0; i < 4; i++) {
+						if ((t >> (8 * i)) > 0) {
+							nshift += 8;
+						}
+						else {
+							break;
+						}
+					}
+					/*
+					 * Here we need to find the proper shift of the previous
+					 * components, so we check possible cases:
+					 * 1) 1 octet - just use it applying shift
+					 * 2) 2 octets - convert to big endian 16 bit number
+					 * 3) 3 octets - convert to big endian 24 bit number
+					 * 4) 4 octets - convert to big endian 32 bit number
+					 */
+					switch (i) {
+					case 4:
+						t = GUINT32_TO_BE(t);
+						break;
+					case 3:
+						t = (GUINT32_TO_BE(t & 0xFFFFFFU)) >> 8;
+						break;
+					case 2:
+						t = GUINT16_TO_BE(t & 0xFFFFU);
+						break;
+					default:
+						t = t & 0xFF;
+						break;
+					}
+
+					/* The last component is placed after the loop */
+					if (p != end) {
+						n |= t << shift;
+
+						shift = nshift;
+					}
+				}
+				else {
+					/* Not a number: the host is not a numeric address */
+					check_num = FALSE;
+				}
+			}
+
+			p++;
+		}
+
+		/* The last component should be last according to url normalization:
+		 * 192.168.1 -> 192.168.0.1
+		 * 192 -> 0.0.0.192
+		 * 192.168 -> 192.0.0.168
+		 */
+		shift = 8 * (4 - i);
+
+		if (shift < 32) {
+			n |= t << shift;
+		}
+
+		if (check_num) {
+			if (dots <= 4) {
+				memcpy(&in4, &n, sizeof(in4));
+				rspamd_url_regen_from_inet_addr(uri, &in4, AF_INET, pool);
+				uri->flags |= RSPAMD_URL_FLAG_OBSCURED;
+				ret = TRUE;
+			}
+			/*
+			 * NOTE: `dots` never exceeds 3 in the loop above, so this branch
+			 * is currently not reachable. The bound still has to guarantee
+			 * that the copy (including the terminating NUL) fits into `buf`.
+			 */
+			else if (end > c && end - c < (gint) sizeof(buf)) {
+				rspamd_strlcpy(buf, c, end - c + 1);
+
+				if (inet_pton(AF_INET6, buf, &in6) == 1) {
+					rspamd_url_regen_from_inet_addr(uri, &in6, AF_INET6, pool);
+					uri->flags |= RSPAMD_URL_FLAG_OBSCURED;
+					ret = TRUE;
+				}
+			}
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Shrink one component of `uri` in place to `nlen` bytes.
+ *
+ * Used after a component has been rewritten in place to a shorter form:
+ * the bytes following the old end of the component are moved left so that
+ * they directly follow its new end, `urllen` is reduced, the matching
+ * "*ENCODED" flag is set (except for the fragment) and the offsets of all
+ * components located after the changed one are decreased accordingly.
+ *
+ * Nothing is done if `nlen` is not smaller than the current component length
+ * or if `field` is not one of schema/host/path/query/fragment.
+ * The terminating NUL byte (if any) is not moved by this function.
+ */
+static void
+rspamd_url_shift(struct rspamd_url *uri, gsize nlen,
+				 enum http_parser_url_fields field)
+{
+	guint old_shift, shift = 0;
+	gint remain;
+
+	/* Shift remaining data */
+	switch (field) {
+	case UF_SCHEMA:
+		if (nlen >= uri->protocollen) {
+			return;
+		}
+		else {
+			shift = uri->protocollen - nlen;
+		}
+
+		old_shift = uri->protocollen;
+		uri->protocollen -= shift;
+		/*
+		 * Number of bytes that follow the OLD end of the schema. Counting
+		 * from the new end instead would make memmove read `shift` bytes
+		 * past the end of the URL.
+		 */
+		remain = uri->urllen - old_shift;
+		g_assert(remain >= 0);
+		memmove(uri->string + uri->protocollen, uri->string + old_shift,
+				remain);
+		uri->urllen -= shift;
+		uri->flags |= RSPAMD_URL_FLAG_SCHEMAENCODED;
+		break;
+	case UF_HOST:
+		if (nlen >= uri->hostlen) {
+			return;
+		}
+		else {
+			shift = uri->hostlen - nlen;
+		}
+
+		old_shift = uri->hostlen;
+		uri->hostlen -= shift;
+		/* Bytes located after the old end of the host */
+		remain = (uri->urllen - (uri->hostshift)) - old_shift;
+		g_assert(remain >= 0);
+		memmove(rspamd_url_host_unsafe(uri) + uri->hostlen,
+				rspamd_url_host_unsafe(uri) + old_shift,
+				remain);
+		uri->urllen -= shift;
+		uri->flags |= RSPAMD_URL_FLAG_HOSTENCODED;
+		break;
+	case UF_PATH:
+		if (nlen >= uri->datalen) {
+			return;
+		}
+		else {
+			shift = uri->datalen - nlen;
+		}
+
+		old_shift = uri->datalen;
+		uri->datalen -= shift;
+		/* Bytes located after the old end of the path */
+		remain = (uri->urllen - (uri->datashift)) - old_shift;
+		g_assert(remain >= 0);
+		memmove(rspamd_url_data_unsafe(uri) + uri->datalen,
+				rspamd_url_data_unsafe(uri) + old_shift,
+				remain);
+		uri->urllen -= shift;
+		uri->flags |= RSPAMD_URL_FLAG_PATHENCODED;
+		break;
+	case UF_QUERY:
+		if (nlen >= uri->querylen) {
+			return;
+		}
+		else {
+			shift = uri->querylen - nlen;
+		}
+
+		old_shift = uri->querylen;
+		uri->querylen -= shift;
+		/* Bytes located after the old end of the query */
+		remain = (uri->urllen - (uri->queryshift)) - old_shift;
+		g_assert(remain >= 0);
+		memmove(rspamd_url_query_unsafe(uri) + uri->querylen,
+				rspamd_url_query_unsafe(uri) + old_shift,
+				remain);
+		uri->urllen -= shift;
+		uri->flags |= RSPAMD_URL_FLAG_QUERYENCODED;
+		break;
+	case UF_FRAGMENT:
+		if (nlen >= uri->fragmentlen) {
+			return;
+		}
+		else {
+			shift = uri->fragmentlen - nlen;
+		}
+
+		/* Only the lengths change here, no data is moved */
+		uri->fragmentlen -= shift;
+		uri->urllen -= shift;
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * Now adjust offsets of the components that follow the changed one.
+	 * The cases deliberately fall through: a change in an earlier component
+	 * affects every later component that is present.
+	 */
+	switch (field) {
+	case UF_SCHEMA:
+		if (uri->userlen > 0) {
+			uri->usershift -= shift;
+		}
+		if (uri->hostlen > 0) {
+			uri->hostshift -= shift;
+		}
+		/* Go forward */
+		/* FALLTHRU */
+	case UF_HOST:
+		if (uri->datalen > 0) {
+			uri->datashift -= shift;
+		}
+		/* Go forward */
+		/* FALLTHRU */
+	case UF_PATH:
+		if (uri->querylen > 0) {
+			uri->queryshift -= shift;
+		}
+		/* Go forward */
+		/* FALLTHRU */
+	case UF_QUERY:
+		if (uri->fragmentlen > 0) {
+			uri->fragmentshift -= shift;
+		}
+		/* Go forward */
+		/* FALLTHRU */
+	case UF_FRAGMENT:
+	default:
+		break;
+	}
+}
+
+/*
+ * Normalise a telephone number stored in the host part of `uri`, in place.
+ *
+ * A single leading '+' is preserved; after it only code points for which
+ * u_isdigit() is true are kept, everything else is dropped. `hostlen` and
+ * `urllen` are reduced by the number of removed bytes.
+ */
+static void
+rspamd_telephone_normalise_inplace(struct rspamd_url *uri)
+{
+	gchar *rd = rspamd_url_host_unsafe(uri);
+	gchar *wr = rd;
+	gchar *const host_end = rd + uri->hostlen;
+	const gint prev_len = uri->hostlen;
+
+	if (*rd == '+') {
+		/* Keep the international prefix where it is */
+		rd++;
+		wr++;
+	}
+
+	while (rd < host_end) {
+		gint consumed = 0;
+		UChar32 cp;
+
+		U8_NEXT(rd, consumed, host_end - rd, cp);
+
+		if (u_isdigit(cp)) {
+			gint produced = 0;
+
+			/* The write position never gets ahead of the read position */
+			U8_APPEND_UNSAFE(wr, produced, cp);
+			wr += produced;
+		}
+
+		rd += consumed;
+	}
+
+	uri->hostlen = wr - rspamd_url_host_unsafe(uri);
+	uri->urllen -= (prev_len - uri->hostlen);
+}
+
+/*
+ * Returns true if `ch` is one of the non-ASCII code points that are handled
+ * as label separators here: U+3002 (ideographic full stop), U+FF0E (fullwidth
+ * full stop) or U+FF61 (halfwidth ideographic full stop).
+ *
+ * The argument is a full 32 bit code point: taking a 16 bit UChar here would
+ * silently truncate supplementary code points (e.g. U+13002 -> 0x3002) and
+ * report them as dots.
+ */
+static inline bool
+is_idna_label_dot(UChar32 ch)
+{
+	switch (ch) {
+	case 0x3002:
+	case 0xFF0E:
+	case 0xFF61:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * All credits for this investigation should go to
+ * Dr. Hajime Shimada and Mr. Shirakura as they have revealed this case in their
+ * research.
+ */
+
+/*
+ * IDNA extends the definition of a label separator from the plain '.' to
+ * several other characters that must be treated equally. This function
+ * rewrites the host of `uri` in place, replacing every such character (see
+ * is_idna_label_dot) with an ASCII '.', and shrinks the host accordingly.
+ *
+ * Returns `true` if at least one replacement has been made; callers should
+ * treat that as an obfuscation attempt. Returns `false` for an empty host or
+ * when nothing has been replaced (the host is left untouched then).
+ */
+static bool
+rspamd_url_remove_dots(struct rspamd_url *uri)
+{
+	const gchar *src = rspamd_url_host_unsafe(uri);
+	gchar *dst;
+	gint rd = 0, total;
+	bool replaced = false;
+
+	if (uri->hostlen == 0) {
+		return false;
+	}
+
+	total = uri->hostlen;
+	dst = rspamd_url_host_unsafe(uri);
+
+	while (rd < total) {
+		UChar32 cp;
+		gint seq_start = rd;
+
+		/* Advances `rd` past the sequence that has just been decoded */
+		U8_NEXT(src, rd, total, cp);
+
+		if (is_idna_label_dot(cp)) {
+			*dst++ = '.';
+			replaced = true;
+			continue;
+		}
+
+		if (!replaced) {
+			/* Nothing has been changed so far: data is already in place */
+			dst += (rd - seq_start);
+			continue;
+		}
+
+		/* Output lags behind input now, so copy the sequence byte by byte */
+		for (; seq_start < rd; seq_start++) {
+			*dst++ = *(src + seq_start);
+		}
+	}
+
+	if (replaced) {
+		rspamd_url_shift(uri, dst - src, UF_HOST);
+	}
+
+	return replaced;
+}
+
+/*
+ * Parse `len` bytes of `uristring` into `uri`.
+ *
+ * `uri` is zeroed first, so any previous content is lost. The input is parsed
+ * by the mailto, telephone or web parser; on success the text is copied into
+ * a buffer allocated from `pool` (`uri->string`), while `uri->raw` keeps
+ * pointing into `uristring`. The copy is then decoded and normalised in place
+ * component by component (schema, host, path, query, fragment), the protocol
+ * is detected from the schema and the TLD part of the host is located.
+ *
+ * Inputs of G_MAXUINT16 / 2 bytes or more are cut to that size and flagged
+ * with RSPAMD_URL_FLAG_TRUNCATED.
+ *
+ * Returns:
+ *  URI_ERRNO_OK               - url has been parsed
+ *  URI_ERRNO_EMPTY            - the first byte of the input is NUL
+ *  URI_ERRNO_BAD_FORMAT       - parser failure, oversized component, host
+ *                               starting with an unexpected character or a
+ *                               failed unicode conversion/nameprep
+ *  URI_ERRNO_HOST_MISSING     - no host part has been found
+ *  URI_ERRNO_TLD_MISSING      - no TLD could be determined (see below)
+ *  URI_ERRNO_INVALID_PROTOCOL - unknown schema and RSPAMD_URL_PARSE_HREF is
+ *                               not set in `parse_flags`
+ * On any error the content of `uri` is partially filled and should not be used.
+ */
+enum uri_errno
+rspamd_url_parse(struct rspamd_url *uri,
+				 gchar *uristring, gsize len,
+				 rspamd_mempool_t *pool,
+				 enum rspamd_url_parse_flags parse_flags)
+{
+	struct http_parser_url u;
+	gchar *p;
+	const gchar *end;
+	guint complen, ret, flags = 0;
+	gsize unquoted_len = 0;
+
+	memset(uri, 0, sizeof(*uri));
+	memset(&u, 0, sizeof(u));
+	uri->count = 1;
+	/* Undefine order */
+	uri->order = -1;
+	uri->part_order = -1;
+
+	if (*uristring == '\0') {
+		return URI_ERRNO_EMPTY;
+	}
+
+	/* Cap the input size and remember that it has been cut */
+	if (len >= G_MAXUINT16 / 2) {
+		flags |= RSPAMD_URL_FLAG_TRUNCATED;
+		len = G_MAXUINT16 / 2;
+	}
+
+	p = uristring;
+	uri->protocol = PROTOCOL_UNKNOWN;
+
+	/*
+	 * The schema specific parsers are considered only for inputs longer than
+	 * "mailto:"; anything shorter always goes to the web parser.
+	 */
+	if (len > sizeof("mailto:") - 1) {
+		/* For mailto: urls we also need to add slashes to make it a valid URL */
+		if (g_ascii_strncasecmp(p, "mailto:", sizeof("mailto:") - 1) == 0) {
+			ret = rspamd_mailto_parse(&u, uristring, len, &end, parse_flags,
+									  &flags);
+		}
+		else if (g_ascii_strncasecmp(p, "tel:", sizeof("tel:") - 1) == 0 ||
+				 g_ascii_strncasecmp(p, "callto:", sizeof("callto:") - 1) == 0) {
+			ret = rspamd_telephone_parse(&u, uristring, len, &end, parse_flags,
+										 &flags);
+			uri->protocol = PROTOCOL_TELEPHONE;
+		}
+		else {
+			ret = rspamd_web_parse(&u, uristring, len, &end, parse_flags,
+								   &flags);
+		}
+	}
+	else {
+		ret = rspamd_web_parse(&u, uristring, len, &end, parse_flags, &flags);
+	}
+
+	if (ret != 0) {
+		return URI_ERRNO_BAD_FORMAT;
+	}
+
+	/* The parser may have stopped before the end of the input */
+	if (end > uristring && (guint) (end - uristring) != len) {
+		len = end - uristring;
+	}
+
+	/* Raw url is not copied: it references the caller's buffer */
+	uri->raw = p;
+	uri->rawlen = len;
+
+	if (flags & RSPAMD_URL_FLAG_MISSINGSLASHES) {
+		/*
+		 * Rebuild the string as "<schema>://<rest>": two extra bytes are
+		 * needed for the slashes, the byte following the schema in the input
+		 * is skipped and replaced by "://".
+		 */
+		len += 2;
+		uri->string = rspamd_mempool_alloc(pool, len + 1);
+		memcpy(uri->string, p, u.field_data[UF_SCHEMA].len);
+		memcpy(uri->string + u.field_data[UF_SCHEMA].len, "://", 3);
+		rspamd_strlcpy(uri->string + u.field_data[UF_SCHEMA].len + 3,
+					   p + u.field_data[UF_SCHEMA].len + 1,
+					   len - 2 - u.field_data[UF_SCHEMA].len);
+		/* Compensate slashes added */
+		for (int i = UF_SCHEMA + 1; i < UF_MAX; i++) {
+			if (u.field_set & (1 << i)) {
+				u.field_data[i].off += 2;
+			}
+		}
+	}
+	else {
+		uri->string = rspamd_mempool_alloc(pool, len + 1);
+		rspamd_strlcpy(uri->string, p, len + 1);
+	}
+
+	uri->urllen = len;
+	uri->flags = flags;
+
+	/* Transfer offsets and lengths of the parsed components to `uri` */
+	for (guint i = 0; i < UF_MAX; i++) {
+		if (u.field_set & (1 << i)) {
+			guint shift = u.field_data[i].off;
+			complen = u.field_data[i].len;
+
+			if (complen >= G_MAXUINT16) {
+				/* Too large component length */
+				return URI_ERRNO_BAD_FORMAT;
+			}
+
+			switch (i) {
+			case UF_SCHEMA:
+				/* Only the length is stored for the schema */
+				uri->protocollen = u.field_data[i].len;
+				break;
+			case UF_HOST:
+				uri->hostshift = shift;
+				uri->hostlen = complen;
+				break;
+			case UF_PATH:
+				uri->datashift = shift;
+				uri->datalen = complen;
+				break;
+			case UF_QUERY:
+				uri->queryshift = shift;
+				uri->querylen = complen;
+				break;
+			case UF_FRAGMENT:
+				uri->fragmentshift = shift;
+				uri->fragmentlen = complen;
+				break;
+			case UF_USERINFO:
+				uri->usershift = shift;
+				uri->userlen = complen;
+				break;
+			default:
+				break;
+			}
+		}
+	}
+
+	/* Port is 'special' in case of url_parser as it is not a part of UF_* macro logic */
+	if (u.port != 0) {
+		/* `uri` has been zeroed above, so `ext` is allocated here on demand */
+		if (!uri->ext) {
+			uri->ext = rspamd_mempool_alloc0_type(pool, struct rspamd_url_ext);
+		}
+		uri->flags |= RSPAMD_URL_FLAG_HAS_PORT;
+		uri->ext->port = u.port;
+	}
+
+	if (!uri->hostlen) {
+		return URI_ERRNO_HOST_MISSING;
+	}
+
+	/*
+	 * Now decode url symbols. Each component is decoded in place and the
+	 * rest of the string is then moved left by rspamd_url_shift.
+	 */
+	unquoted_len = rspamd_url_decode(uri->string,
+									 uri->string,
+									 uri->protocollen);
+	rspamd_url_shift(uri, unquoted_len, UF_SCHEMA);
+	unquoted_len = rspamd_url_decode(rspamd_url_host_unsafe(uri),
+									 rspamd_url_host_unsafe(uri), uri->hostlen);
+
+	/* NOTE(review): helper is defined elsewhere; it receives the length by pointer and may change it */
+	rspamd_url_normalise_propagate_flags(pool, rspamd_url_host_unsafe(uri),
+										 &unquoted_len, uri->flags);
+
+	rspamd_url_shift(uri, unquoted_len, UF_HOST);
+
+	/* Non-ASCII label separators in the host are treated as obfuscation */
+	if (rspamd_url_remove_dots(uri)) {
+		uri->flags |= RSPAMD_URL_FLAG_OBSCURED;
+	}
+
+	/*
+	 * The protocol is still PROTOCOL_UNKNOWN here unless the telephone parser
+	 * has been used: it is detected from the schema further below.
+	 */
+	if (uri->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_MAILTO | PROTOCOL_FTP | PROTOCOL_FILE)) {
+		/* Ensure that hostname starts with something sane (exclude numeric urls) */
+		const gchar *host = rspamd_url_host_unsafe(uri);
+
+		if (!(is_domain_start(host[0]) || host[0] == ':')) {
+			return URI_ERRNO_BAD_FORMAT;
+		}
+	}
+
+	/* Apply nameprep algorithm */
+	static UStringPrepProfile *nameprep = NULL;
+	UErrorCode uc_err = U_ZERO_ERROR;
+
+	if (nameprep == NULL) {
+		/* Open and cache profile */
+		nameprep = usprep_openByType(USPREP_RFC3491_NAMEPREP, &uc_err);
+
+		g_assert(U_SUCCESS(uc_err));
+	}
+
+	UChar *utf16_hostname, *norm_utf16;
+	gint32 utf16_len, norm_utf16_len, norm_utf8_len;
+	UParseError parse_error;
+
+	/* Host: utf8 -> utf16 -> nameprep -> utf8 (written back over the host) */
+	utf16_hostname = rspamd_mempool_alloc(pool, uri->hostlen * sizeof(UChar));
+	struct UConverter *utf8_conv = rspamd_get_utf8_converter();
+
+	utf16_len = ucnv_toUChars(utf8_conv, utf16_hostname, uri->hostlen,
+							  rspamd_url_host_unsafe(uri), uri->hostlen, &uc_err);
+
+	if (!U_SUCCESS(uc_err)) {
+
+		return URI_ERRNO_BAD_FORMAT;
+	}
+
+	/* The output buffer has the input size, so a longer result is reported as an error */
+	norm_utf16 = rspamd_mempool_alloc(pool, utf16_len * sizeof(UChar));
+	norm_utf16_len = usprep_prepare(nameprep, utf16_hostname, utf16_len,
+									norm_utf16, utf16_len, USPREP_DEFAULT, &parse_error, &uc_err);
+
+	if (!U_SUCCESS(uc_err)) {
+
+		return URI_ERRNO_BAD_FORMAT;
+	}
+
+	/* Convert back to utf8, sigh... */
+	norm_utf8_len = ucnv_fromUChars(utf8_conv,
+									rspamd_url_host_unsafe(uri), uri->hostlen,
+									norm_utf16, norm_utf16_len, &uc_err);
+
+	if (!U_SUCCESS(uc_err)) {
+
+		return URI_ERRNO_BAD_FORMAT;
+	}
+
+	/* Final shift of lengths */
+	rspamd_url_shift(uri, norm_utf8_len, UF_HOST);
+
+	/* Process data part */
+	if (uri->datalen) {
+		unquoted_len = rspamd_url_decode(rspamd_url_data_unsafe(uri),
+										 rspamd_url_data_unsafe(uri), uri->datalen);
+
+		rspamd_url_normalise_propagate_flags(pool, rspamd_url_data_unsafe(uri),
+											 &unquoted_len, uri->flags);
+
+		rspamd_url_shift(uri, unquoted_len, UF_PATH);
+		/* We now normalize path */
+		rspamd_normalize_path_inplace(rspamd_url_data_unsafe(uri),
+									  uri->datalen, &unquoted_len);
+		rspamd_url_shift(uri, unquoted_len, UF_PATH);
+	}
+
+	/* Query and fragment are decoded the same way, without path normalisation */
+	if (uri->querylen) {
+		unquoted_len = rspamd_url_decode(rspamd_url_query_unsafe(uri),
+										 rspamd_url_query_unsafe(uri),
+										 uri->querylen);
+
+		rspamd_url_normalise_propagate_flags(pool, rspamd_url_query_unsafe(uri),
+											 &unquoted_len, uri->flags);
+		rspamd_url_shift(uri, unquoted_len, UF_QUERY);
+	}
+
+	if (uri->fragmentlen) {
+		unquoted_len = rspamd_url_decode(rspamd_url_fragment_unsafe(uri),
+										 rspamd_url_fragment_unsafe(uri),
+										 uri->fragmentlen);
+
+		rspamd_url_normalise_propagate_flags(pool, rspamd_url_fragment_unsafe(uri),
+											 &unquoted_len, uri->flags);
+		rspamd_url_shift(uri, unquoted_len, UF_FRAGMENT);
+	}
+
+	/* Lowercase schema and host; the host may shrink while being lowercased */
+	rspamd_str_lc(uri->string, uri->protocollen);
+	unquoted_len = rspamd_str_lc_utf8(rspamd_url_host_unsafe(uri), uri->hostlen);
+	rspamd_url_shift(uri, unquoted_len, UF_HOST);
+
+	/* Detect protocol by comparing the lowercased schema with the known names */
+	if (uri->protocol == PROTOCOL_UNKNOWN) {
+		for (int i = 0; i < G_N_ELEMENTS(rspamd_url_protocols); i++) {
+			if (uri->protocollen == rspamd_url_protocols[i].len) {
+				if (memcmp(uri->string,
+						   rspamd_url_protocols[i].name, uri->protocollen) == 0) {
+					uri->protocol = rspamd_url_protocols[i].proto;
+					break;
+				}
+			}
+		}
+	}
+
+	if (uri->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_MAILTO | PROTOCOL_FTP | PROTOCOL_FILE)) {
+		/* Find TLD part */
+		if (url_scanner->search_trie_full) {
+			rspamd_multipattern_lookup(url_scanner->search_trie_full,
+									   rspamd_url_host_unsafe(uri), uri->hostlen,
+									   rspamd_tld_trie_callback, uri, NULL);
+		}
+
+		if (uri->tldlen == 0) {
+			/*
+			 * If we have not detected eSLD, but there are no dots in the hostname,
+			 * then we should treat the whole hostname as eSLD - a rule of thumb
+			 *
+			 * We also check that a hostname ends with a permitted character, and all characters are forming
+			 * DNS label. We also need to check for a numeric IP within this check.
+			 */
+			const char *dot_pos = memchr(rspamd_url_host_unsafe(uri), '.', uri->hostlen);
+			bool is_whole_hostname_tld = false;
+
+			/* Either no dot at all or the only dot is the last character */
+			if (uri->hostlen > 0 && (dot_pos == NULL || dot_pos == rspamd_url_host_unsafe(uri) + uri->hostlen - 1)) {
+				bool all_chars_domain = true;
+
+				for (int i = 0; i < uri->hostlen; i++) {
+					if (!is_domain(rspamd_url_host_unsafe(uri)[i])) {
+						all_chars_domain = false;
+						break;
+					}
+				}
+
+				char last_c = rspamd_url_host_unsafe(uri)[uri->hostlen - 1];
+
+				if (all_chars_domain) {
+					/* Also check the last character to be either a dot or alphanumeric character */
+					if (last_c != '.' && !g_ascii_isalnum(last_c)) {
+						all_chars_domain = false;
+					}
+				}
+
+				if (all_chars_domain) {
+					/* Additionally check for a numeric IP as we can have some number here... */
+					rspamd_url_maybe_regenerate_from_ip(uri, pool);
+
+					/* NOTE(review): `last_c` was read before the possible regeneration above */
+					if (last_c == '.' && uri->hostlen > 1) {
+						/* Skip the last dot */
+						uri->tldlen = uri->hostlen - 1;
+					}
+					else {
+						uri->tldlen = uri->hostlen;
+					}
+
+					uri->tldshift = uri->hostshift;
+					is_whole_hostname_tld = true;
+				}
+			}
+
+			if (!is_whole_hostname_tld) {
+				if (uri->protocol != PROTOCOL_MAILTO) {
+					if (url_scanner->has_tld_file && !(parse_flags & RSPAMD_URL_PARSE_HREF)) {
+						/* Ignore URL's without TLD if it is not a numeric URL */
+						if (!rspamd_url_maybe_regenerate_from_ip(uri, pool)) {
+							return URI_ERRNO_TLD_MISSING;
+						}
+					}
+					else {
+						if (!rspamd_url_maybe_regenerate_from_ip(uri, pool)) {
+							/* Assume tld equal to host */
+							uri->tldshift = uri->hostshift;
+							uri->tldlen = uri->hostlen;
+						}
+						else if (uri->flags & RSPAMD_URL_FLAG_SCHEMALESS) {
+							/* Ignore urls with both no schema and no tld */
+							return URI_ERRNO_TLD_MISSING;
+						}
+
+						uri->flags |= RSPAMD_URL_FLAG_NO_TLD;
+					}
+				}
+				else {
+					/* Ignore IP like domains for mailto, as it is really never supported */
+					return URI_ERRNO_TLD_MISSING;
+				}
+			}
+		}
+
+		/* Replace stupid '\' with '/' after schema */
+		if (uri->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_FTP) &&
+			uri->protocollen > 0 && uri->urllen > uri->protocollen + 2) {
+
+			gchar *pos = &uri->string[uri->protocollen],
+				  *host_start = rspamd_url_host_unsafe(uri);
+
+			/* Only the bytes between the schema and the host are touched */
+			while (pos < host_start) {
+				if (*pos == '\\') {
+					*pos = '/';
+					uri->flags |= RSPAMD_URL_FLAG_OBSCURED;
+				}
+				pos++;
+			}
+		}
+	}
+	else if (uri->protocol & PROTOCOL_TELEPHONE) {
+		/* We need to normalise phone number: remove all spaces and braces */
+		rspamd_telephone_normalise_inplace(uri);
+
+		/* The leading '+' is not a part of the "tld" for phone numbers */
+		if (rspamd_url_host_unsafe(uri)[0] == '+') {
+			uri->tldshift = uri->hostshift + 1;
+			uri->tldlen = uri->hostlen - 1;
+		}
+		else {
+			uri->tldshift = uri->hostshift;
+			uri->tldlen = uri->hostlen;
+		}
+	}
+
+	/* Unknown schemas are accepted only when RSPAMD_URL_PARSE_HREF is set */
+	if (uri->protocol == PROTOCOL_UNKNOWN) {
+		if (!(parse_flags & RSPAMD_URL_PARSE_HREF)) {
+			return URI_ERRNO_INVALID_PROTOCOL;
+		}
+		else {
+			/* Hack, hack, hack */
+			uri->protocol = PROTOCOL_UNKNOWN;
+		}
+	}
+
+	return URI_ERRNO_OK;
+}
+
+/*
+ * Context handed to rspamd_tld_trie_find_callback() by rspamd_url_find_tld().
+ */
+struct tld_trie_cbdata {
+	/* First byte of the input that is being searched */
+	const gchar *begin;
+	/* Length of the input in bytes */
+	gsize len;
+	/* Best candidate so far; the callback only replaces it with a longer one */
+	rspamd_ftok_t *out;
+};
+
+/*
+ * Multipattern callback used by rspamd_url_find_tld().
+ *
+ * For a pattern hit located at the end of the input it walks backwards over
+ * one label (two for star matchers) and records the span from that label to
+ * the end of the input in cbdata->out, but only when that span is longer than
+ * the one already stored there.
+ *
+ * `context` must point to a struct tld_trie_cbdata. The function always
+ * returns 0, i.e. it never asks the lookup to stop.
+ */
+static gint
+rspamd_tld_trie_find_callback(struct rspamd_multipattern *mp,
+							  guint strnum,
+							  gint match_start,
+							  gint match_pos,
+							  const gchar *text,
+							  gsize len,
+							  void *context)
+{
+	struct url_matcher *matcher;
+	const gchar *start, *pos, *p;
+	struct tld_trie_cbdata *cbdata = context;
+	/* Number of dots we still have to walk back over */
+	gint ndots = 1;
+
+	matcher = &g_array_index(url_scanner->matchers_full, struct url_matcher,
+							 strnum);
+
+	if (matcher->flags & URL_MATCHER_FLAG_STAR_MATCH) {
+		/* Skip one more tld component */
+		ndots = 2;
+	}
+
+	pos = text + match_start;
+	/* p starts on the byte just before the match; it is only read while p >= start */
+	p = pos - 1;
+	start = text;
+
+	/*
+	 * Accept the hit when it starts with a dot and ends exactly at the end of
+	 * the input, or when it ends one byte before the end of the input.
+	 */
+	if (*pos != '.' || match_pos != (gint) cbdata->len) {
+		/* Something weird has been found */
+		if (match_pos != (gint) cbdata->len - 1) {
+			/* Search more */
+			return 0;
+		}
+	}
+
+	/* Now we need to find top level domain */
+	pos = start;
+
+	/* Walk backwards; pos tracks the first byte of the label currently covered */
+	while (p >= start && ndots > 0) {
+		if (*p == '.') {
+			ndots--;
+			pos = p + 1;
+		}
+		else {
+			pos = p;
+		}
+
+		p--;
+	}
+
+	/* Either all required dots were consumed or we ran off the start of the text */
+	if (ndots == 0 || p == start - 1) {
+		/* Keep only the longest candidate seen so far */
+		if (cbdata->begin + cbdata->len - pos > cbdata->out->len) {
+			cbdata->out->begin = pos;
+			cbdata->out->len = cbdata->begin + cbdata->len - pos;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Searches `in` (of `inlen` bytes) with the full multipattern trie and stores
+ * the longest candidate reported by rspamd_tld_trie_find_callback() in `out`.
+ *
+ * Returns TRUE when `out` ends up non-empty, FALSE otherwise (including the
+ * case where no full trie is loaded). `in`, `out` and the global url_scanner
+ * must be non-NULL.
+ */
+gboolean
+rspamd_url_find_tld(const gchar *in, gsize inlen, rspamd_ftok_t *out)
+{
+	struct tld_trie_cbdata search_ctx;
+
+	g_assert(in != NULL);
+	g_assert(out != NULL);
+	g_assert(url_scanner != NULL);
+
+	/* A zero length marks "nothing found yet" for the callback */
+	out->len = 0;
+
+	search_ctx.out = out;
+	search_ctx.len = inlen;
+	search_ctx.begin = in;
+
+	if (url_scanner->search_trie_full != NULL) {
+		rspamd_multipattern_lookup(url_scanner->search_trie_full, in, inlen,
+								   rspamd_tld_trie_find_callback, &search_ctx, NULL);
+	}
+
+	return out->len > 0;
+}
+
+/*
+ * Delimiter pairs laid out flat: an opening character sits at an even index
+ * and its closing counterpart at the following odd index.
+ */
+static const gchar url_braces[] = {
+	'(', ')',
+	'{', '}',
+	'[', ']',
+	'<', '>',
+	'|', '|',
+	'\'', '\''};
+
+
+/*
+ * Start step of the url_file_* pair: the match begins exactly at `pos`.
+ * match->st receives the byte that precedes `pos`, or NUL when `pos` is the
+ * first byte of the input. Always returns TRUE.
+ */
+static gboolean
+url_file_start(struct url_callback_data *cb,
+			   const gchar *pos,
+			   url_match_t *match)
+{
+	const gboolean has_prev = (pos > cb->begin);
+
+	match->st = has_prev ? *(pos - 1) : '\0';
+	match->m_begin = pos;
+
+	return TRUE;
+}
+
+/*
+ * End step of the url_file_* pair: computes match->m_len.
+ *
+ * Scanning starts right after the matched pattern (skipping one '/' if it
+ * follows immediately) and continues while bytes are url-safe and differ from
+ * the stop character. The stop character is the byte that followed the
+ * pattern, unless the scan starts on an opening delimiter from url_braces, in
+ * which case it is the matching closing delimiter.
+ *
+ * Returns FALSE only if the scan position equals the start of the input,
+ * TRUE otherwise.
+ */
+static gboolean
+url_file_end(struct url_callback_data *cb,
+			 const gchar *pos,
+			 url_match_t *match)
+{
+	const gchar *p;
+	gchar stop = '\0';
+	guint i;
+
+	p = pos + strlen(match->pattern);
+
+	/* Never look at bytes beyond the end of the input */
+	if (p < cb->end) {
+		stop = *p;
+
+		if (*p == '/') {
+			p++;
+		}
+	}
+
+	if (p < cb->end) {
+		/*
+		 * url_braces stores (open, close) pairs, so the whole table has to be
+		 * walked in steps of two; halving the bound as well would skip the
+		 * second half of the pairs.
+		 */
+		for (i = 0; i + 1 < G_N_ELEMENTS(url_braces); i += 2) {
+			if (*p == url_braces[i]) {
+				stop = url_braces[i + 1];
+				break;
+			}
+		}
+	}
+
+	while (p < cb->end && *p != stop && is_urlsafe(*p)) {
+		p++;
+	}
+
+	if (p == cb->begin) {
+		return FALSE;
+	}
+	match->m_len = p - match->m_begin;
+
+	return TRUE;
+}
+
+/*
+ * Start step for TLD matches: scans backwards from `pos` to find where the
+ * hostname in front of the matched TLD begins.
+ *
+ * On success match->m_begin points to the first byte of the candidate and
+ * match->st holds the delimiter found in front of it ('\n' when the previous
+ * newline position was reached, NUL when the start of the input was reached).
+ *
+ * Returns FALSE when the scan hits a byte that is neither a domain character
+ * nor an accepted delimiter, when the candidate does not start with an
+ * alphanumeric character, when a dot is first in the input or is not followed
+ * by an alphanumeric character, when a '/' is met, or when more than
+ * max_shift bytes had to be inspected.
+ */
+static gboolean
+url_tld_start(struct url_callback_data *cb,
+			  const gchar *pos,
+			  url_match_t *match)
+{
+	const gchar *p = pos;
+	guint processed = 0;
+	/* NOTE(review): 253 is presumably the maximum textual DNS name length - confirm */
+	static const guint max_shift = 253 + sizeof("https://");
+
+	/* Try to find the start of the url by finding any non-urlsafe character or whitespace/punctuation */
+	while (p >= cb->begin) {
+		if (!is_domain(*p) || g_ascii_isspace(*p) || is_url_start(*p) ||
+			p == match->prev_newline_pos) {
+			/* A boundary was hit; it must be an accepted delimiter */
+			if (!is_url_start(*p) && !g_ascii_isspace(*p) &&
+				p != match->prev_newline_pos) {
+				return FALSE;
+			}
+
+			if (p != match->prev_newline_pos) {
+				match->st = *p;
+
+				/* Step forward over the delimiter onto the first url byte */
+				p++;
+			}
+			else {
+				/* p is left on the previous newline position in this case */
+				match->st = '\n';
+			}
+
+			if (!g_ascii_isalnum(*p)) {
+				/* Urls cannot start with strange symbols */
+				return FALSE;
+			}
+
+			match->m_begin = p;
+			return TRUE;
+		}
+		else if (p == cb->begin && p != pos) {
+			/* Reached the very first byte of the input without meeting a delimiter */
+			match->st = '\0';
+			match->m_begin = p;
+
+			return TRUE;
+		}
+		else if (*p == '.') {
+			if (p == cb->begin) {
+				/* Urls cannot start with a dot */
+				return FALSE;
+			}
+			if (!g_ascii_isalnum(p[1])) {
+				/* Wrong we have an invalid character after dot */
+				return FALSE;
+			}
+		}
+		else if (*p == '/') {
+			/* Urls cannot contain '/' in their body */
+			return FALSE;
+		}
+
+		p--;
+		processed++;
+
+		if (processed > max_shift) {
+			/* Too long */
+			return FALSE;
+		}
+	}
+
+	return FALSE;
+}
+
+/*
+ * End step for TLD matches.
+ *
+ * If the matched TLD is the last thing in the input, the match simply runs
+ * from match->m_begin to the end. Otherwise the byte after the TLD decides
+ * whether the candidate is handed to url_web_end() for full parsing:
+ *  - '/', ':', an url-end character, linear whitespace or (outside of <...>)
+ *    the next newline position: parse, skipping a leading "http://" if any;
+ *  - '.' followed by whitespace, '/', '?' or ':': parse from match->m_begin.
+ * A successful parse is still rejected when the resulting match does not
+ * extend past `pos`.
+ */
+static gboolean
+url_tld_end(struct url_callback_data *cb,
+			const gchar *pos,
+			url_match_t *match)
+{
+	const gchar *after = pos + match->m_len;
+	const gchar *parse_from = NULL;
+
+	if (after == cb->end) {
+		match->m_len = after - match->m_begin;
+		return TRUE;
+	}
+
+	if (*after == '/' || *after == ':' || is_url_end(*after) || is_lwsp(*after) ||
+		(match->st != '<' && after == match->newline_pos)) {
+		/* Parse arguments, ports by normal way by url default function */
+		parse_from = match->m_begin;
+
+		/* Check common prefix */
+		if (g_ascii_strncasecmp(parse_from, "http://", sizeof("http://") - 1) == 0) {
+			parse_from += sizeof("http://") - 1;
+		}
+	}
+	else if (*after == '.') {
+		const gchar *next = after + 1;
+
+		if (next < cb->end &&
+			(g_ascii_isspace(*next) || *next == '/' ||
+			 *next == '?' || *next == ':')) {
+			parse_from = match->m_begin;
+		}
+	}
+
+	if (parse_from == NULL || !url_web_end(cb, parse_from, match)) {
+		return FALSE;
+	}
+
+	/* Check sanity of match found */
+	if (match->m_begin + match->m_len <= pos) {
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Start step for web-like matches: validates the byte in front of `pos`.
+ *
+ * A match beginning with "www" (any case) must be preceded by an url-start
+ * character, whitespace, the previous newline position or a byte with the
+ * high bit set. Any other match must not be preceded by an alphanumeric
+ * character. A match can never begin with a dot.
+ *
+ * On success match->m_begin is `pos` and match->st is the preceding byte
+ * (NUL at the very start of the input).
+ */
+static gboolean
+url_web_start(struct url_callback_data *cb,
+			  const gchar *pos,
+			  url_match_t *match)
+{
+	const gboolean has_prev = (pos > cb->begin);
+
+	/* Check what we have found */
+	if (has_prev) {
+		const gchar before = *(pos - 1);
+
+		if (g_ascii_strncasecmp(pos, "www", 3) != 0) {
+			if (g_ascii_isalnum(before)) {
+				/* Part of another url */
+				return FALSE;
+			}
+		}
+		else if (!is_url_start(before) &&
+				 !g_ascii_isspace(before) &&
+				 pos - 1 != match->prev_newline_pos &&
+				 !(before & 0x80)) { /* Chinese trick */
+			return FALSE;
+		}
+	}
+
+	if (*pos == '.') {
+		/* Urls cannot start with . */
+		return FALSE;
+	}
+
+	match->st = has_prev ? *(pos - 1) : '\0';
+	match->m_begin = pos;
+
+	return TRUE;
+}
+
+/*
+ * End step for web-like matches: runs rspamd_web_parse() in check mode from
+ * `pos` and derives the match length from where parsing stopped.
+ *
+ * The parsed region is capped at the next newline position unless the match
+ * was opened by '<'. A parse that stops on '>' is rejected when the match was
+ * not opened by '<' and the '>' is the last byte of the input or is followed
+ * by whitespace. On success cb->fin is moved just past the parsed end.
+ */
+static gboolean
+url_web_end(struct url_callback_data *cb,
+			const gchar *pos,
+			url_match_t *match)
+{
+	const gchar *parsed_end = NULL;
+	guint parse_flags = 0;
+	gint avail = cb->end - pos;
+
+	if (match->newline_pos && match->st != '<') {
+		/* We should also limit our match end to the newline */
+		avail = MIN(avail, match->newline_pos - pos);
+	}
+
+	if (rspamd_web_parse(NULL, pos, avail, &parsed_end,
+						 RSPAMD_URL_PARSE_CHECK, &parse_flags) != 0) {
+		return FALSE;
+	}
+
+	/* We need to ensure that url also starts with '>' */
+	if (parsed_end < cb->end && *parsed_end == '>' &&
+		parsed_end != match->newline_pos && match->st != '<') {
+		if (parsed_end + 1 >= cb->end || g_ascii_isspace(parsed_end[1])) {
+			return FALSE;
+		}
+	}
+
+	match->m_len = (parsed_end - pos);
+	cb->fin = parsed_end + 1;
+
+	return TRUE;
+}
+
+
+/*
+ * Start step for e-mail matches.
+ *
+ * With no (or an empty) matcher prefix the match starts at `pos` and
+ * match->st is the preceding byte (NUL at the start of the input).
+ *
+ * With a non-empty prefix the hit is a bare '@': it is rejected when it is
+ * the '@' remembered in cb->last_at (which is then cleared) or when it is the
+ * first byte of the input; otherwise match->st is set to NUL and the real
+ * bounds are left to url_email_end().
+ */
+static gboolean
+url_email_start(struct url_callback_data *cb,
+				const gchar *pos,
+				url_match_t *match)
+{
+	const gboolean bare_at = (match->prefix && match->prefix[0] != '\0');
+
+	if (!bare_at) {
+		/* We have mailto:// at the beginning */
+		match->m_begin = pos;
+		match->st = (pos >= cb->begin + 1) ? *(pos - 1) : '\0';
+
+		return TRUE;
+	}
+
+	/* Just '@' */
+
+	/* Check if this match is a part of the previous mailto: email */
+	if (cb->last_at != NULL && cb->last_at == pos) {
+		cb->last_at = NULL;
+		return FALSE;
+	}
+
+	if (pos == cb->begin) {
+		/* Just @ at the start of input */
+		return FALSE;
+	}
+
+	match->st = '\0';
+
+	return TRUE;
+}
+
+/*
+ * End step for e-mail matches.
+ *
+ * With no (or an empty) matcher prefix the text at `pos` is parsed with
+ * rspamd_mailto_parse(); a user part is mandatory, and the position of its
+ * trailing '@' is remembered in cb->last_at so that url_email_start() can
+ * reject a later bare '@' hit on the same address.
+ *
+ * With a non-empty prefix `pos` points at a bare '@' and both ends of the
+ * address are searched around it: backwards over mail-safe bytes, forwards
+ * over domain bytes, neither crossing the surrounding newline positions.
+ *
+ * Returns TRUE with match->m_len (and, for the bare '@' case, match->m_begin)
+ * filled in, FALSE when no acceptable address was found.
+ */
+static gboolean
+url_email_end(struct url_callback_data *cb,
+			  const gchar *pos,
+			  url_match_t *match)
+{
+	const gchar *last = NULL;
+	struct http_parser_url u;
+	gint len = cb->end - pos;
+	guint flags = 0;
+
+	if (match->newline_pos && match->st != '<') {
+		/* We should also limit our match end to the newline */
+		len = MIN(len, match->newline_pos - pos);
+	}
+
+	if (!match->prefix || match->prefix[0] == '\0') {
+		/* We have mailto:// at the beginning */
+		if (rspamd_mailto_parse(&u, pos, len, &last,
+								RSPAMD_URL_PARSE_CHECK, &flags) != 0) {
+			return FALSE;
+		}
+
+		/* An address without a user part is useless */
+		if (!(u.field_set & (1 << UF_USERINFO))) {
+			return FALSE;
+		}
+
+		/* The byte right after the user part is expected to be the '@' */
+		cb->last_at = match->m_begin + u.field_data[UF_USERINFO].off +
+					  u.field_data[UF_USERINFO].len;
+
+		g_assert(*cb->last_at == '@');
+		match->m_len = (last - pos);
+
+		return TRUE;
+	}
+	else {
+		const gchar *c, *p;
+		/*
+		 * Here we have just '@', so we need to find both start and end of the
+		 * pattern
+		 */
+		g_assert(*pos == '@');
+
+		/* Need at least one byte before '@' and two after it */
+		if (pos >= cb->end - 2 || pos < cb->begin + 1) {
+			/* Boundary violation */
+			return FALSE;
+		}
+
+		/* Check the next character after `@` */
+		if (!g_ascii_isalnum(pos[1]) || !g_ascii_isalnum(*(pos - 1))) {
+			return FALSE;
+		}
+
+
+		/* Walk backwards over the local part */
+		c = pos - 1;
+		while (c > cb->begin) {
+			if (!is_mailsafe(*c)) {
+				break;
+			}
+			if (c == match->prev_newline_pos) {
+				break;
+			}
+
+			c--;
+		}
+		/* Rewind to the first alphanumeric character */
+		while (c < pos && !g_ascii_isalnum(*c)) {
+			c++;
+		}
+
+		/* Find the end of email */
+		p = pos + 1;
+		while (p < cb->end && is_domain(*p)) {
+			if (p == match->newline_pos) {
+				break;
+			}
+
+			p++;
+		}
+
+		/* Rewind it again to avoid bad emails to be detected */
+		while (p > pos && p < cb->end && !g_ascii_isalnum(*p)) {
+			p--;
+		}
+
+		/* p sits on the last alphanumeric byte: make it an exclusive end */
+		if (p < cb->end && g_ascii_isalnum(*p) &&
+			(match->newline_pos == NULL || p < match->newline_pos)) {
+			p++;
+		}
+
+		if (p > c) {
+			match->m_begin = c;
+			match->m_len = p - c;
+			return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
+/*
+ * Start step for telephone matches: the match begins at `pos` and match->st
+ * is the preceding byte, or NUL when `pos` is the first byte of the input.
+ * Always returns TRUE.
+ */
+static gboolean
+url_tel_start(struct url_callback_data *cb,
+			  const gchar *pos,
+			  url_match_t *match)
+{
+	const gboolean has_prev = (pos >= cb->begin + 1);
+
+	match->st = has_prev ? *(pos - 1) : '\0';
+	match->m_begin = pos;
+
+	return TRUE;
+}
+
+/*
+ * End step for telephone matches: runs rspamd_telephone_parse() in check mode
+ * from `pos` (capped at the next newline position unless the match was opened
+ * by '<') and requires a host field to be present. On success match->m_len is
+ * the distance from `pos` to where parsing stopped.
+ */
+static gboolean
+url_tel_end(struct url_callback_data *cb,
+			const gchar *pos,
+			url_match_t *match)
+{
+	struct http_parser_url parsed;
+	const gchar *parsed_end = NULL;
+	guint parse_flags = 0;
+	gint avail = cb->end - pos;
+
+	if (match->newline_pos && match->st != '<') {
+		/* We should also limit our match end to the newline */
+		avail = MIN(avail, match->newline_pos - pos);
+	}
+
+	if (rspamd_telephone_parse(&parsed, pos, avail, &parsed_end,
+							   RSPAMD_URL_PARSE_CHECK, &parse_flags) != 0 ||
+		!(parsed.field_set & (1 << UF_HOST))) {
+		return FALSE;
+	}
+
+	match->m_len = (parsed_end - pos);
+
+	return TRUE;
+}
+
+
+/*
+ * Cheap pre-filter applied to a trie hit before the start/end handlers run.
+ *
+ * Only matchers flagged URL_MATCHER_FLAG_TLD_MATCH are filtered: the byte at
+ * `pos` (right after the hit) must be the end of input, the next newline
+ * position, whitespace, '/', '?', ':' or an url-end character. A single '.'
+ * is tolerated if what follows it is the end of input or one of the
+ * terminators above (the newline position is not considered there).
+ */
+static gboolean
+rspamd_url_trie_is_match(struct url_matcher *matcher, const gchar *pos,
+						 const gchar *end, const gchar *newline_pos)
+{
+	if (!(matcher->flags & URL_MATCHER_FLAG_TLD_MATCH) || pos >= end) {
+		return TRUE;
+	}
+
+	/* Immediately check pos for valid chars */
+	if (pos == newline_pos || g_ascii_isspace(*pos) || *pos == '/' ||
+		*pos == '?' || *pos == ':' || is_url_end(*pos)) {
+		return TRUE;
+	}
+
+	if (*pos != '.') {
+		return FALSE;
+	}
+
+	/* We allow . at the end of the domain however */
+	pos++;
+
+	if (pos >= end) {
+		return TRUE;
+	}
+
+	return g_ascii_isspace(*pos) || *pos == '/' ||
+		   *pos == '?' || *pos == ':' || is_url_end(*pos);
+}
+
+/*
+ * Multipattern callback used by rspamd_url_find(): stops at the first hit
+ * accepted by the matcher's start/end handlers.
+ *
+ * `context` is a struct url_callback_data. On acceptance the url text is
+ * copied into a pool-allocated, NUL-terminated cb->url_str (with the matcher
+ * prefix prepended when there is one), cb->start points to the match in the
+ * input and 1 is returned. In every other case 0 is returned so that the
+ * search continues.
+ */
+static gint
+rspamd_url_trie_callback(struct rspamd_multipattern *mp,
+						 guint strnum,
+						 gint match_start,
+						 gint match_pos,
+						 const gchar *text,
+						 gsize len,
+						 void *context)
+{
+	struct url_matcher *matcher;
+	url_match_t m;
+	const gchar *pos, *newline_pos = NULL;
+	struct url_callback_data *cb = context;
+
+	pos = text + match_pos;
+
+	if (cb->fin > pos) {
+		/* Already seen */
+		return 0;
+	}
+
+	matcher = &g_array_index(cb->matchers, struct url_matcher,
+							 strnum);
+
+	if ((matcher->flags & URL_MATCHER_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
+		/* Do not try to match non-html like urls in html texts */
+		return 0;
+	}
+
+	memset(&m, 0, sizeof(m));
+	m.m_begin = text + match_start;
+	m.m_len = match_pos - match_start;
+
+	/* Find the next newline after our pos */
+	if (cb->newlines && cb->newlines->len > 0) {
+		newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx);
+
+		/*
+		 * Stop advancing on the last valid index: the element is read right
+		 * after the increment, so the bound has to be len - 1 to stay inside
+		 * the array.
+		 */
+		while (pos > newline_pos && cb->newline_idx < cb->newlines->len - 1) {
+			cb->newline_idx++;
+			newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx);
+		}
+
+		if (pos > newline_pos) {
+			/* No newline is left after the hit */
+			newline_pos = NULL;
+		}
+
+		if (cb->newline_idx > 0) {
+			m.prev_newline_pos = g_ptr_array_index(cb->newlines,
+												   cb->newline_idx - 1);
+		}
+	}
+
+	if (!rspamd_url_trie_is_match(matcher, pos, cb->end, newline_pos)) {
+		return 0;
+	}
+
+	m.pattern = matcher->pattern;
+	m.prefix = matcher->prefix;
+	m.add_prefix = FALSE;
+	m.newline_pos = newline_pos;
+	pos = cb->begin + match_start;
+
+	if (matcher->start(cb, pos, &m) &&
+		matcher->end(cb, pos, &m)) {
+		if (m.add_prefix || matcher->prefix[0] != '\0') {
+			cb->len = m.m_len + strlen(matcher->prefix);
+			cb->url_str = rspamd_mempool_alloc(cb->pool, cb->len + 1);
+			cb->len = rspamd_snprintf(cb->url_str,
+									  cb->len + 1,
+									  "%s%*s",
+									  m.prefix,
+									  (gint) m.m_len,
+									  m.m_begin);
+			cb->prefix_added = TRUE;
+		}
+		else {
+			cb->url_str = rspamd_mempool_alloc(cb->pool, m.m_len + 1);
+			rspamd_strlcpy(cb->url_str, m.m_begin, m.m_len + 1);
+		}
+
+		cb->start = m.m_begin;
+
+		if (pos > cb->fin) {
+			cb->fin = pos;
+		}
+
+		/* Non-zero return: stop at the first accepted url */
+		return 1;
+	}
+	else {
+		cb->url_str = NULL;
+	}
+
+	/* Continue search */
+	return 0;
+}
+
+/*
+ * Finds the first url-like string in `begin[0..len)`.
+ *
+ * The full trie is used when `how` is RSPAMD_URL_FIND_ALL and a full trie is
+ * loaded; otherwise the strict trie is used. When something is found the
+ * function returns TRUE and fills whichever of the optional outputs are
+ * non-NULL: `url_str` (pool-allocated text), `url_pos` (offset of the match
+ * from `begin`) and `prefix_added`. Returns FALSE otherwise.
+ */
+gboolean
+rspamd_url_find(rspamd_mempool_t *pool,
+				const gchar *begin, gsize len,
+				gchar **url_str,
+				enum rspamd_url_find_type how,
+				goffset *url_pos,
+				gboolean *prefix_added)
+{
+	struct url_callback_data cb;
+	gboolean use_full;
+	gint found;
+
+	memset(&cb, 0, sizeof(cb));
+	cb.pool = pool;
+	cb.how = how;
+	cb.begin = begin;
+	cb.end = begin + len;
+
+	use_full = (how == RSPAMD_URL_FIND_ALL &&
+				url_scanner->search_trie_full != NULL);
+	cb.matchers = use_full ? url_scanner->matchers_full
+						   : url_scanner->matchers_strict;
+
+	found = rspamd_multipattern_lookup(use_full ? url_scanner->search_trie_full
+												: url_scanner->search_trie_strict,
+									   begin, len,
+									   rspamd_url_trie_callback, &cb, NULL);
+
+	if (!found) {
+		return FALSE;
+	}
+
+	if (url_str) {
+		*url_str = cb.url_str;
+	}
+
+	if (url_pos) {
+		*url_pos = cb.start - begin;
+	}
+
+	if (prefix_added) {
+		*prefix_added = cb.prefix_added;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Shared body of the multipattern callbacks behind rspamd_url_find_multiple()
+ * and rspamd_url_find_single().
+ *
+ * For a hit accepted by the matcher's start/end handlers the url text is
+ * copied into pool memory (with the matcher prefix prepended when there is
+ * one), parsed with rspamd_url_parse() and, if parsing succeeded and a host
+ * is present, passed to cb->func together with its start and end offsets in
+ * `text`.
+ *
+ * Return value:
+ *   0        - the hit was skipped or rejected, keep searching;
+ *  -1        - cb->func returned FALSE;
+ *  !multiple - the hit was accepted by the handlers (0 keeps searching).
+ */
+static gint
+rspamd_url_trie_generic_callback_common(struct rspamd_multipattern *mp,
+										guint strnum,
+										gint match_start,
+										gint match_pos,
+										const gchar *text,
+										gsize len,
+										void *context,
+										gboolean multiple)
+{
+	struct rspamd_url *url;
+	struct url_matcher *matcher;
+	url_match_t m;
+	const gchar *pos, *newline_pos = NULL;
+	struct url_callback_data *cb = context;
+	gint rc;
+	rspamd_mempool_t *pool;
+
+	pos = text + match_pos;
+
+	if (cb->fin > pos) {
+		/* Already seen */
+		return 0;
+	}
+
+	matcher = &g_array_index(cb->matchers, struct url_matcher,
+							 strnum);
+	pool = cb->pool;
+
+	if ((matcher->flags & URL_MATCHER_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
+		/* Do not try to match non-html like urls in html texts, continue matching */
+		return 0;
+	}
+
+	memset(&m, 0, sizeof(m));
+
+
+	/* Find the next newline after our pos */
+	if (cb->newlines && cb->newlines->len > 0) {
+		newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx);
+
+		/* The bound is len - 1 because the element is read after the increment */
+		while (pos > newline_pos && cb->newline_idx < cb->newlines->len - 1) {
+			cb->newline_idx++;
+			newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx);
+		}
+
+		if (pos > newline_pos) {
+			/* No newline is left after the hit */
+			newline_pos = NULL;
+		}
+		if (cb->newline_idx > 0) {
+			m.prev_newline_pos = g_ptr_array_index(cb->newlines,
+												   cb->newline_idx - 1);
+		}
+	}
+
+	if (!rspamd_url_trie_is_match(matcher, pos, text + len, newline_pos)) {
+		/* Mismatch, continue */
+		return 0;
+	}
+
+	/* From here on pos is the start of the hit, not its end */
+	pos = cb->begin + match_start;
+	m.pattern = matcher->pattern;
+	m.prefix = matcher->prefix;
+	m.add_prefix = FALSE;
+	m.m_begin = text + match_start;
+	m.m_len = match_pos - match_start;
+	m.newline_pos = newline_pos;
+
+	if (matcher->start(cb, pos, &m) &&
+		matcher->end(cb, pos, &m)) {
+		if (m.add_prefix || matcher->prefix[0] != '\0') {
+			cb->len = m.m_len + strlen(matcher->prefix);
+			cb->url_str = rspamd_mempool_alloc(cb->pool, cb->len + 1);
+			cb->len = rspamd_snprintf(cb->url_str,
+									  cb->len + 1,
+									  "%s%*s",
+									  m.prefix,
+									  (gint) m.m_len,
+									  m.m_begin);
+			cb->prefix_added = TRUE;
+		}
+		else {
+			cb->url_str = rspamd_mempool_alloc(cb->pool, m.m_len + 1);
+			cb->len = rspamd_strlcpy(cb->url_str, m.m_begin, m.m_len + 1);
+		}
+
+		cb->start = m.m_begin;
+
+		if (pos > cb->fin) {
+			cb->fin = pos;
+		}
+
+		url = rspamd_mempool_alloc0(pool, sizeof(struct rspamd_url));
+		/* Strip surrounding whitespace in place before parsing */
+		g_strstrip(cb->url_str);
+		rc = rspamd_url_parse(url, cb->url_str,
+							  strlen(cb->url_str), pool,
+							  RSPAMD_URL_PARSE_TEXT);
+
+		if (rc == URI_ERRNO_OK && url->hostlen > 0) {
+			if (cb->prefix_added) {
+				/* The scheme came from the matcher prefix, not from the text */
+				url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
+				cb->prefix_added = FALSE;
+			}
+
+			if (cb->func) {
+				if (!cb->func(url, cb->start - text, (m.m_begin + m.m_len) - text,
+							  cb->funcd)) {
+					/* We need to stop here in any case! */
+					return -1;
+				}
+			}
+		}
+		else if (rc != URI_ERRNO_OK) {
+			msg_debug_pool_check("extract of url '%s' failed: %s",
+								 cb->url_str,
+								 rspamd_url_strerror(rc));
+		}
+	}
+	else {
+		cb->url_str = NULL;
+		/* Continue search if no pattern has been found */
+		return 0;
+	}
+
+	/* Continue search if required (return 0 means continue) */
+	return !multiple;
+}
+
+/*
+ * Multipattern callback that forwards to
+ * rspamd_url_trie_generic_callback_common() with multiple == TRUE, so the
+ * search goes on after an accepted url.
+ */
+static gint
+rspamd_url_trie_generic_callback_multiple(struct rspamd_multipattern *mp,
+										  guint strnum,
+										  gint match_start,
+										  gint match_pos,
+										  const gchar *text,
+										  gsize len,
+										  void *context)
+{
+	return rspamd_url_trie_generic_callback_common(mp, strnum, match_start,
+												   match_pos, text, len, context, TRUE);
+}
+
+/*
+ * Multipattern callback that forwards to
+ * rspamd_url_trie_generic_callback_common() with multiple == FALSE, so a
+ * non-zero value is returned once a hit has been accepted by the handlers.
+ */
+static gint
+rspamd_url_trie_generic_callback_single(struct rspamd_multipattern *mp,
+										guint strnum,
+										gint match_start,
+										gint match_pos,
+										const gchar *text,
+										gsize len,
+										void *context)
+{
+	return rspamd_url_trie_generic_callback_common(mp, strnum, match_start,
+												   match_pos, text, len, context, FALSE);
+}
+
+/*
+ * State shared by rspamd_url_text_part_callback() and
+ * rspamd_url_query_callback() while urls are extracted from one text part.
+ */
+struct rspamd_url_mimepart_cbdata {
+	/* Task whose url set receives the extracted urls */
+	struct rspamd_task *task;
+	/* Text part that is being scanned */
+	struct rspamd_mime_text_part *part;
+	/* Sum of the lengths of all url spans reported so far */
+	gsize url_len;
+	uint16_t *cur_url_order; /* Global ordering */
+	uint16_t cur_part_order; /* Per part ordering */
+};
+
+/*
+ * Insert callback for urls discovered inside the query string of another url.
+ *
+ * `ud` is a struct rspamd_url_mimepart_cbdata. Returns FALSE (stop) for a
+ * mailto url without a user part or when the task already holds more urls
+ * than cfg->max_urls allows. Otherwise the url is flagged as coming from a
+ * query and added to the task; a newly added url is also appended to the
+ * part's url array (if any) and receives its ordering numbers.
+ */
+static gboolean
+rspamd_url_query_callback(struct rspamd_url *url, gsize start_offset,
+						  gsize end_offset, gpointer ud)
+{
+	struct rspamd_url_mimepart_cbdata *cbd = ud;
+	struct rspamd_task *task = cbd->task;
+
+	if (url->protocol == PROTOCOL_MAILTO && url->userlen == 0) {
+		return FALSE;
+	}
+
+	/* Also check max urls */
+	if (cbd->task->cfg && cbd->task->cfg->max_urls > 0 &&
+		kh_size(MESSAGE_FIELD(task, urls)) > cbd->task->cfg->max_urls) {
+		msg_err_task("part has too many URLs, we cannot process more: "
+					 "%d urls extracted ",
+					 (guint) kh_size(MESSAGE_FIELD(task, urls)));
+
+		return FALSE;
+	}
+
+	url->flags |= RSPAMD_URL_FLAG_QUERY;
+
+	if (!rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false)) {
+		/* Nothing else to record for this url */
+		return TRUE;
+	}
+
+	if (cbd->part && cbd->part->mime_part->urls) {
+		g_ptr_array_add(cbd->part->mime_part->urls, url);
+	}
+
+	url->part_order = cbd->cur_part_order++;
+
+	if (cbd->cur_url_order) {
+		url->order = (*cbd->cur_url_order)++;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Insert callback for urls found in the text of a mime part.
+ *
+ * `ud` is a struct rspamd_url_mimepart_cbdata; start_offset/end_offset
+ * delimit the url inside the scanned text.
+ *
+ * Returns FALSE (stop) when the accumulated url length exceeds ten times the
+ * stripped content length, for a mailto url without a user part, or when the
+ * task already holds more urls than cfg->max_urls allows. Otherwise the url
+ * is added to the task, an url exception covering its span is prepended to
+ * the part's exception list, and the url's query string is scanned for
+ * further urls.
+ */
+static gboolean
+rspamd_url_text_part_callback(struct rspamd_url *url, gsize start_offset,
+							  gsize end_offset, gpointer ud)
+{
+	struct rspamd_url_mimepart_cbdata *cbd =
+		(struct rspamd_url_mimepart_cbdata *) ud;
+	struct rspamd_process_exception *ex;
+	struct rspamd_task *task;
+
+	task = cbd->task;
+	/* Allocated from the task pool before the checks below */
+	ex = rspamd_mempool_alloc0(task->task_pool, sizeof(struct rspamd_process_exception));
+
+	ex->pos = start_offset;
+	ex->len = end_offset - start_offset;
+	ex->type = RSPAMD_EXCEPTION_URL;
+	ex->ptr = url;
+
+	/* The span is accounted for even if the url is rejected below */
+	cbd->url_len += ex->len;
+
+	if (cbd->part->utf_stripped_content &&
+		cbd->url_len > cbd->part->utf_stripped_content->len * 10) {
+		/* Absurd case, stop here now */
+		msg_err_task("part has too many URLs, we cannot process more: %z url len; "
+					 "%d stripped content length",
+					 cbd->url_len, cbd->part->utf_stripped_content->len);
+
+		return FALSE;
+	}
+
+	if (url->protocol == PROTOCOL_MAILTO) {
+		if (url->userlen == 0) {
+			return FALSE;
+		}
+	}
+	/* Also check max urls */
+	if (cbd->task->cfg && cbd->task->cfg->max_urls > 0) {
+		if (kh_size(MESSAGE_FIELD(task, urls)) > cbd->task->cfg->max_urls) {
+			msg_err_task("part has too many URLs, we cannot process more: "
+						 "%d urls extracted ",
+						 (guint) kh_size(MESSAGE_FIELD(task, urls)));
+
+			return FALSE;
+		}
+	}
+
+	url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
+
+	/* Ordering numbers are assigned only when the add call reports success */
+	if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false) &&
+		cbd->part->mime_part->urls) {
+		url->part_order = cbd->cur_part_order++;
+
+		if (cbd->cur_url_order) {
+			url->order = (*cbd->cur_url_order)++;
+		}
+		g_ptr_array_add(cbd->part->mime_part->urls, url);
+	}
+
+	cbd->part->exceptions = g_list_prepend(
+		cbd->part->exceptions,
+		ex);
+
+	/* We also search the query for additional url inside */
+	if (url->querylen > 0) {
+		rspamd_url_find_multiple(task->task_pool,
+								 rspamd_url_query_unsafe(url), url->querylen,
+								 RSPAMD_URL_FIND_ALL, NULL,
+								 rspamd_url_query_callback, cbd);
+	}
+
+	return TRUE;
+}
+
+/*
+ * Extracts all urls from the stripped utf content of `part` and feeds them to
+ * rspamd_url_text_part_callback(). A part without stripped content only
+ * produces a warning. Memory is taken from task->task_pool; `cur_url_order`
+ * is the optional global ordering counter shared between parts.
+ */
+void rspamd_url_text_extract(rspamd_mempool_t *pool,
+							 struct rspamd_task *task,
+							 struct rspamd_mime_text_part *part,
+							 uint16_t *cur_url_order,
+							 enum rspamd_url_find_type how)
+{
+	struct rspamd_url_mimepart_cbdata mcbd = {
+		.task = task,
+		.part = part,
+		.url_len = 0,
+		.cur_url_order = cur_url_order,
+		.cur_part_order = 0,
+	};
+
+	if (part->utf_stripped_content == NULL || part->utf_stripped_content->len == 0) {
+		msg_warn_task("got empty text part");
+		return;
+	}
+
+	rspamd_url_find_multiple(task->task_pool, part->utf_stripped_content->data,
+							 part->utf_stripped_content->len, how, part->newlines,
+							 rspamd_url_text_part_callback, &mcbd);
+}
+
+/*
+ * Scans `in` for every url and calls `func(url, start, end, ud)` for each one
+ * that parses successfully.
+ *
+ * An `inlen` of 0 means "use strlen(in)". `nlines` optionally supplies the
+ * newline positions of the text. The full trie is used when `how` is
+ * RSPAMD_URL_FIND_ALL and a full trie is loaded; otherwise the strict trie
+ * is used.
+ */
+void rspamd_url_find_multiple(rspamd_mempool_t *pool,
+							  const gchar *in,
+							  gsize inlen,
+							  enum rspamd_url_find_type how,
+							  GPtrArray *nlines,
+							  url_insert_function func,
+							  gpointer ud)
+{
+	struct url_callback_data cb;
+	gboolean use_full;
+
+	g_assert(in != NULL);
+
+	if (inlen == 0) {
+		inlen = strlen(in);
+	}
+
+	memset(&cb, 0, sizeof(cb));
+	cb.pool = pool;
+	cb.how = how;
+	cb.begin = in;
+	cb.end = in + inlen;
+	cb.newlines = nlines;
+	cb.func = func;
+	cb.funcd = ud;
+
+	use_full = (how == RSPAMD_URL_FIND_ALL &&
+				url_scanner->search_trie_full != NULL);
+	cb.matchers = use_full ? url_scanner->matchers_full
+						   : url_scanner->matchers_strict;
+
+	rspamd_multipattern_lookup(use_full ? url_scanner->search_trie_full
+										: url_scanner->search_trie_strict,
+							   in, inlen,
+							   rspamd_url_trie_generic_callback_multiple, &cb, NULL);
+}
+
+/*
+ * Scans `in` with the single-shot callback and reports what it finds through
+ * `func(url, start, end, ud)`.
+ *
+ * An `inlen` of 0 means "use strlen(in)". If no url scanner has been set up
+ * yet, a default one is created first. The full trie is used when `how` is
+ * RSPAMD_URL_FIND_ALL and a full trie is loaded; otherwise the strict trie
+ * is used.
+ */
+void rspamd_url_find_single(rspamd_mempool_t *pool,
+							const gchar *in,
+							gsize inlen,
+							enum rspamd_url_find_type how,
+							url_insert_function func,
+							gpointer ud)
+{
+	struct url_callback_data cb;
+	gboolean use_full;
+
+	g_assert(in != NULL);
+
+	if (inlen == 0) {
+		inlen = strlen(in);
+	}
+
+	/*
+	 * Urls may have to be parsed while the configuration is still being read,
+	 * i.e. before any url_scanner exists. Fall back to the defaults then.
+	 */
+	if (url_scanner == NULL) {
+		rspamd_url_init(NULL);
+	}
+
+	memset(&cb, 0, sizeof(cb));
+	cb.pool = pool;
+	cb.how = how;
+	cb.begin = in;
+	cb.end = in + inlen;
+	cb.func = func;
+	cb.funcd = ud;
+
+	use_full = (how == RSPAMD_URL_FIND_ALL &&
+				url_scanner->search_trie_full != NULL);
+	cb.matchers = use_full ? url_scanner->matchers_full
+						   : url_scanner->matchers_strict;
+
+	rspamd_multipattern_lookup(use_full ? url_scanner->search_trie_full
+										: url_scanner->search_trie_strict,
+							   in, inlen,
+							   rspamd_url_trie_generic_callback_single, &cb, NULL);
+}
+
+
+/*
+ * Insert callback for urls found in a message subject.
+ *
+ * `ud` is the struct rspamd_task. The url is flagged as displayed/subject and
+ * added to the task's url set. If it carries a query string, the first url
+ * found inside that query is parsed and added as well, provided that it
+ * parses cleanly, has a host and, for mailto, a user part.
+ *
+ * Returns FALSE only for a mailto url without a user part, TRUE otherwise.
+ */
+gboolean
+rspamd_url_task_subject_callback(struct rspamd_url *url, gsize start_offset,
+								 gsize end_offset, gpointer ud)
+{
+	struct rspamd_task *task = ud;
+	gchar *url_str = NULL;
+	struct rspamd_url *query_url;
+	gint rc;
+	gboolean prefix_added = FALSE;
+
+	/* It is just a displayed URL, we should not check it for certain things */
+	url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED | RSPAMD_URL_FLAG_SUBJECT;
+
+	if (url->protocol == PROTOCOL_MAILTO) {
+		if (url->userlen == 0) {
+			return FALSE;
+		}
+	}
+
+	rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false);
+
+	/* We also search the query for additional url inside */
+	if (url->querylen > 0) {
+		if (rspamd_url_find(task->task_pool, rspamd_url_query_unsafe(url), url->querylen,
+							&url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {
+
+			query_url = rspamd_mempool_alloc0(task->task_pool,
+											  sizeof(struct rspamd_url));
+			rc = rspamd_url_parse(query_url,
+								  url_str,
+								  strlen(url_str),
+								  task->task_pool,
+								  RSPAMD_URL_PARSE_TEXT);
+
+			/*
+			 * The host check must look at the freshly parsed nested url, not
+			 * at the outer one, otherwise a host-less nested url gets added.
+			 */
+			if (rc == URI_ERRNO_OK &&
+				query_url->hostlen > 0) {
+				msg_debug_task("found url %s in query of url"
+							   " %*s",
+							   url_str, url->querylen, rspamd_url_query_unsafe(url));
+
+				if (prefix_added) {
+					query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
+				}
+
+				if (query_url->protocol == PROTOCOL_MAILTO) {
+					if (query_url->userlen == 0) {
+						/* Skip the nested address but keep the outer url */
+						return TRUE;
+					}
+				}
+
+				rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls),
+											   query_url, false);
+			}
+		}
+	}
+
+	return TRUE;
+}
+
+/*
+ * Hash of the whole url string; an url with no text hashes to 0.
+ */
+static inline khint_t
+rspamd_url_hash(struct rspamd_url *url)
+{
+	return (url->urllen > 0)
+			   ? (khint_t) rspamd_cryptobox_fast_hash(url->string, url->urllen,
+													  rspamd_hash_seed())
+			   : 0;
+}
+
+/*
+ * Hash of the host part only; an url with no host hashes to 0.
+ */
+static inline khint_t
+rspamd_url_host_hash(struct rspamd_url *url)
+{
+	return (url->hostlen > 0)
+			   ? (khint_t) rspamd_cryptobox_fast_hash(rspamd_url_host_unsafe(url),
+													  url->hostlen,
+													  rspamd_hash_seed())
+			   : 0;
+}
+
+/*
+ * Equality test for two e-mail urls: true only when both the host and the
+ * user part are non-empty, have the same length and match according to
+ * rspamd_lc_cmp().
+ */
+static inline bool
+rspamd_emails_cmp(struct rspamd_url *u1, struct rspamd_url *u2)
+{
+	/* Hosts first */
+	if (u1->hostlen == 0 || u1->hostlen != u2->hostlen) {
+		return false;
+	}
+
+	if (rspamd_lc_cmp(rspamd_url_host_unsafe(u1),
+					  rspamd_url_host_unsafe(u2), u1->hostlen) != 0) {
+		return false;
+	}
+
+	/* Then the user parts */
+	if (u1->userlen == 0 || u1->userlen != u2->userlen) {
+		return false;
+	}
+
+	return rspamd_lc_cmp(rspamd_url_user_unsafe(u1),
+						 rspamd_url_user_unsafe(u2),
+						 u1->userlen) == 0;
+}
+
+/*
+ * Equality test for two urls. Protocol and total length must agree; mailto
+ * urls are then compared with rspamd_emails_cmp(), everything else byte by
+ * byte over the whole url string.
+ */
+static inline bool
+rspamd_urls_cmp(struct rspamd_url *u1, struct rspamd_url *u2)
+{
+	if (u1->protocol != u2->protocol || u1->urllen != u2->urllen) {
+		return false;
+	}
+
+	if (u1->protocol & PROTOCOL_MAILTO) {
+		return rspamd_emails_cmp(u1, u2);
+	}
+
+	return memcmp(u1->string, u2->string, u1->urllen) == 0;
+}
+
+/*
+ * Equality test on the host part only: same length and identical bytes.
+ */
+static inline bool
+rspamd_urls_host_cmp(struct rspamd_url *u1, struct rspamd_url *u2)
+{
+	if (u1->hostlen != u2->hostlen) {
+		return false;
+	}
+
+	return memcmp(rspamd_url_host_unsafe(u1), rspamd_url_host_unsafe(u2),
+				  u1->hostlen) == 0;
+}
+
+/*
+ * Decodes `size` bytes of percent-encoded text from `src` into `dst` and
+ * returns the number of bytes written.
+ *
+ *  - "%XY" with two hex digits becomes the corresponding byte;
+ *  - '+' becomes a space;
+ *  - '%' followed by a non-hex byte drops the '%' and copies that byte;
+ *  - "%X" followed by a non-hex byte is dropped entirely.
+ *
+ * Every input byte is read before anything is written for it and at most one
+ * byte is written per input byte.
+ */
+gsize rspamd_url_decode(gchar *dst, const gchar *src, gsize size)
+{
+	enum {
+		sw_usual = 0,
+		sw_quoted,
+		sw_quoted_second
+	} state = sw_usual;
+	gchar *out = dst;
+	const gchar *in = src;
+	const gchar *in_end = src + size;
+	gchar hi_nibble = 0;
+
+	for (; in != in_end; in++) {
+		const gchar ch = *in;
+		const gchar lower = (gchar) (ch | 0x20);
+		gint nibble;
+
+		if (state == sw_usual) {
+			if (ch == '%') {
+				state = sw_quoted;
+			}
+			else {
+				*out++ = (ch == '+') ? ' ' : ch;
+			}
+
+			continue;
+		}
+
+		/* Value of the current hex digit, or -1 if it is not one */
+		if (ch >= '0' && ch <= '9') {
+			nibble = ch - '0';
+		}
+		else if (lower >= 'a' && lower <= 'f') {
+			nibble = lower - 'a' + 10;
+		}
+		else {
+			nibble = -1;
+		}
+
+		if (state == sw_quoted) {
+			if (nibble >= 0) {
+				hi_nibble = (gchar) nibble;
+				state = sw_quoted_second;
+			}
+			else {
+				/* the invalid quoted character */
+				state = sw_usual;
+				*out++ = ch;
+			}
+		}
+		else {
+			/* sw_quoted_second */
+			state = sw_usual;
+
+			if (nibble >= 0) {
+				*out++ = (gchar) ((hi_nibble << 4) + nibble);
+			}
+			/* otherwise: the invalid quoted character, nothing is emitted */
+		}
+	}
+
+	return (out - dst);
+}
+
+/*
+ * Bit flags describing in which url components a byte may appear as is.
+ * They are the values stored in rspamd_url_encoding_classes[].
+ */
+enum rspamd_url_char_class {
+	RSPAMD_URL_UNRESERVED = (1 << 0),
+	RSPAMD_URL_SUBDELIM = (1 << 1),
+	RSPAMD_URL_PATHSAFE = (1 << 2),
+	RSPAMD_URL_QUERYSAFE = (1 << 3),
+	RSPAMD_URL_FRAGMENTSAFE = (1 << 4),
+	RSPAMD_URL_HOSTSAFE = (1 << 5),
+	RSPAMD_URL_USERSAFE = (1 << 6),
+};
+
+/*
+ * Per-component masks: each one combines the unreserved and sub-delimiter
+ * classes with the class bit specific to that component.
+ */
+#define RSPAMD_URL_FLAGS_HOSTSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_HOSTSAFE | RSPAMD_URL_SUBDELIM)
+#define RSPAMD_URL_FLAGS_USERSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_USERSAFE | RSPAMD_URL_SUBDELIM)
+#define RSPAMD_URL_FLAGS_PATHSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_PATHSAFE | RSPAMD_URL_SUBDELIM)
+#define RSPAMD_URL_FLAGS_QUERYSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_SUBDELIM)
+#define RSPAMD_URL_FLAGS_FRAGMENTSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_FRAGMENTSAFE | RSPAMD_URL_SUBDELIM)
+
+/*
+ * Character class lookup table for URL encoding, indexed by byte value.
+ * Each entry is a bitmask of rspamd_url_char_class values; an entry of 0 means
+ * the byte belongs to no class and is therefore escaped in every component.
+ * Control characters (0x00-0x1f), space, DEL and all bytes >= 0x80 are 0.
+ * The table is positional: the trailing comments name the ASCII character
+ * each entry corresponds to, so entries must not be added or removed.
+ */
+static const unsigned char rspamd_url_encoding_classes[256] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0 /*   */, RSPAMD_URL_SUBDELIM /* ! */, 0 /* " */, 0 /* # */,
+	RSPAMD_URL_SUBDELIM /* $ */, 0 /* % */, RSPAMD_URL_SUBDELIM /* & */,
+	RSPAMD_URL_SUBDELIM /* ' */, RSPAMD_URL_SUBDELIM /* ( */,
+	RSPAMD_URL_SUBDELIM /* ) */, RSPAMD_URL_SUBDELIM /* * */,
+	RSPAMD_URL_SUBDELIM /* + */, RSPAMD_URL_SUBDELIM /* , */,
+	RSPAMD_URL_UNRESERVED /* - */, RSPAMD_URL_UNRESERVED /* . */,
+	RSPAMD_URL_PATHSAFE | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* / */,
+	RSPAMD_URL_UNRESERVED /* 0 */, RSPAMD_URL_UNRESERVED /* 1 */,
+	RSPAMD_URL_UNRESERVED /* 2 */, RSPAMD_URL_UNRESERVED /* 3 */,
+	RSPAMD_URL_UNRESERVED /* 4 */, RSPAMD_URL_UNRESERVED /* 5 */,
+	RSPAMD_URL_UNRESERVED /* 6 */, RSPAMD_URL_UNRESERVED /* 7 */,
+	RSPAMD_URL_UNRESERVED /* 8 */, RSPAMD_URL_UNRESERVED /* 9 */,
+	RSPAMD_URL_USERSAFE | RSPAMD_URL_HOSTSAFE | RSPAMD_URL_PATHSAFE | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* : */,
+	RSPAMD_URL_SUBDELIM /* ; */, 0 /* < */, RSPAMD_URL_SUBDELIM /* = */, 0 /* > */,
+	RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* ? */,
+	RSPAMD_URL_PATHSAFE | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* @ */,
+	RSPAMD_URL_UNRESERVED /* A */, RSPAMD_URL_UNRESERVED /* B */,
+	RSPAMD_URL_UNRESERVED /* C */, RSPAMD_URL_UNRESERVED /* D */,
+	RSPAMD_URL_UNRESERVED /* E */, RSPAMD_URL_UNRESERVED /* F */,
+	RSPAMD_URL_UNRESERVED /* G */, RSPAMD_URL_UNRESERVED /* H */,
+	RSPAMD_URL_UNRESERVED /* I */, RSPAMD_URL_UNRESERVED /* J */,
+	RSPAMD_URL_UNRESERVED /* K */, RSPAMD_URL_UNRESERVED /* L */,
+	RSPAMD_URL_UNRESERVED /* M */, RSPAMD_URL_UNRESERVED /* N */,
+	RSPAMD_URL_UNRESERVED /* O */, RSPAMD_URL_UNRESERVED /* P */,
+	RSPAMD_URL_UNRESERVED /* Q */, RSPAMD_URL_UNRESERVED /* R */,
+	RSPAMD_URL_UNRESERVED /* S */, RSPAMD_URL_UNRESERVED /* T */,
+	RSPAMD_URL_UNRESERVED /* U */, RSPAMD_URL_UNRESERVED /* V */,
+	RSPAMD_URL_UNRESERVED /* W */, RSPAMD_URL_UNRESERVED /* X */,
+	RSPAMD_URL_UNRESERVED /* Y */, RSPAMD_URL_UNRESERVED /* Z */,
+	RSPAMD_URL_HOSTSAFE /* [ */, 0 /* \ */, RSPAMD_URL_HOSTSAFE /* ] */, 0 /* ^ */,
+	RSPAMD_URL_UNRESERVED /* _ */, 0 /* ` */, RSPAMD_URL_UNRESERVED /* a */,
+	RSPAMD_URL_UNRESERVED /* b */, RSPAMD_URL_UNRESERVED /* c */,
+	RSPAMD_URL_UNRESERVED /* d */, RSPAMD_URL_UNRESERVED /* e */,
+	RSPAMD_URL_UNRESERVED /* f */, RSPAMD_URL_UNRESERVED /* g */,
+	RSPAMD_URL_UNRESERVED /* h */, RSPAMD_URL_UNRESERVED /* i */,
+	RSPAMD_URL_UNRESERVED /* j */, RSPAMD_URL_UNRESERVED /* k */,
+	RSPAMD_URL_UNRESERVED /* l */, RSPAMD_URL_UNRESERVED /* m */,
+	RSPAMD_URL_UNRESERVED /* n */, RSPAMD_URL_UNRESERVED /* o */,
+	RSPAMD_URL_UNRESERVED /* p */, RSPAMD_URL_UNRESERVED /* q */,
+	RSPAMD_URL_UNRESERVED /* r */, RSPAMD_URL_UNRESERVED /* s */,
+	RSPAMD_URL_UNRESERVED /* t */, RSPAMD_URL_UNRESERVED /* u */,
+	RSPAMD_URL_UNRESERVED /* v */, RSPAMD_URL_UNRESERVED /* w */,
+	RSPAMD_URL_UNRESERVED /* x */, RSPAMD_URL_UNRESERVED /* y */,
+	RSPAMD_URL_UNRESERVED /* z */, 0 /* { */, 0 /* | */, 0 /* } */,
+	RSPAMD_URL_UNRESERVED /* ~ */, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+/*
+ * Sizing pass: for each of the `len` bytes at `beg` that has none of the
+ * `flags` class bits set, add 2 to `dlen` (a %XX escape is two bytes longer
+ * than the raw byte). Uses `i` and `dlen` from the enclosing function.
+ */
+#define CHECK_URL_COMPONENT(beg, len, flags)                                   \
+	do {                                                                       \
+		for (i = 0; i < (len); i++) {                                          \
+			const guchar chk_ch_ = (guchar) (beg)[i];                          \
+			dlen += (rspamd_url_encoding_classes[chk_ch_] & (flags)) ? 0 : 2;  \
+		}                                                                      \
+	} while (0)
+
+/*
+ * Encoding pass: copy `len` bytes from `beg` to the output cursor `d`,
+ * replacing every byte that has none of the `flags` class bits set with an
+ * upper-case %XX escape. Uses `i`, `d`, `dend` and `hexdigests` from the
+ * enclosing function.
+ *
+ * Output never goes past `dend`: a plain byte needs one free byte and an
+ * escape needs three, and the copy stops as soon as the next item does not
+ * fit. The byte is converted to unsigned before it is shifted, so the high
+ * nibble does not depend on how the platform shifts negative `char` values.
+ */
+#define ENCODE_URL_COMPONENT(beg, len, flags)                                  \
+	do {                                                                       \
+		for (i = 0; i < (len) && dend > d; i++) {                              \
+			const guchar enc_ch_ = (guchar) (beg)[i];                          \
+			if ((rspamd_url_encoding_classes[enc_ch_] & (flags)) == 0) {       \
+				if (dend - d < 3) {                                            \
+					/* No room for the whole escape sequence */                \
+					break;                                                     \
+				}                                                              \
+				*d++ = '%';                                                    \
+				*d++ = hexdigests[enc_ch_ >> 4];                               \
+				*d++ = hexdigests[enc_ch_ & 0xf];                              \
+			}                                                                  \
+			else {                                                             \
+				*d++ = enc_ch_;                                                \
+			}                                                                  \
+		}                                                                      \
+	} while (0)
+
+/**
+ * Percent-encode a parsed URL.
+ *
+ * The host, user, path (data), query and fragment are each scanned against
+ * their own set of safe characters. If no byte needs escaping, url->string is
+ * returned as is, nothing is allocated and *pdlen is set to url->urllen.
+ * Otherwise a buffer is allocated from `pool` and the URL is re-assembled as
+ * proto://[user@]host[/path][?query][#fragment] with unsafe bytes written as
+ * %XX. Only the components listed above are emitted.
+ *
+ * @param url parsed URL, must not be NULL
+ * @param pdlen receives the length of the returned string (terminator excluded),
+ *              must not be NULL
+ * @param pool memory pool that owns the encoded copy, must not be NULL
+ * @return url->string or a newly allocated, NUL-terminated encoded string
+ */
+const gchar *
+rspamd_url_encode(struct rspamd_url *url, gsize *pdlen,
+				  rspamd_mempool_t *pool)
+{
+	guchar *dest, *d, *dend;
+	static const gchar hexdigests[16] = "0123456789ABCDEF";
+	guint i;
+	gsize dlen = 0;
+
+	g_assert(pdlen != NULL && url != NULL && pool != NULL);
+
+	/* First pass: count the extra bytes required by the escapes */
+	CHECK_URL_COMPONENT(rspamd_url_host_unsafe(url), url->hostlen,
+						RSPAMD_URL_FLAGS_HOSTSAFE);
+	CHECK_URL_COMPONENT(rspamd_url_user_unsafe(url), url->userlen,
+						RSPAMD_URL_FLAGS_USERSAFE);
+	CHECK_URL_COMPONENT(rspamd_url_data_unsafe(url), url->datalen,
+						RSPAMD_URL_FLAGS_PATHSAFE);
+	CHECK_URL_COMPONENT(rspamd_url_query_unsafe(url), url->querylen,
+						RSPAMD_URL_FLAGS_QUERYSAFE);
+	CHECK_URL_COMPONENT(rspamd_url_fragment_unsafe(url), url->fragmentlen,
+						RSPAMD_URL_FLAGS_FRAGMENTSAFE);
+
+	if (dlen == 0) {
+		/* Nothing to escape: hand back the original string */
+		*pdlen = url->urllen;
+
+		return url->string;
+	}
+
+	/*
+	 * Need to encode. The scheme is written again below and may differ from
+	 * the text in url->string, so reserve room for it on top of urllen
+	 * (presumably "telephone://" is the longest prefix that can be produced).
+	 */
+	dlen += url->urllen + sizeof("telephone://"); /* Protocol hack */
+	dest = rspamd_mempool_alloc(pool, dlen + 1);
+	d = dest;
+	dend = d + dlen;
+
+	if (url->protocollen > 0) {
+		if (!(url->protocol & PROTOCOL_UNKNOWN)) {
+			const gchar *known_proto = rspamd_url_protocol_name(url->protocol);
+			d += rspamd_snprintf((gchar *) d, dend - d,
+								 "%s://",
+								 known_proto);
+		}
+		else {
+			/* Unknown scheme: copy its text from the start of the URL */
+			d += rspamd_snprintf((gchar *) d, dend - d,
+								 "%*s://",
+								 (gint) url->protocollen, url->string);
+		}
+	}
+	else {
+		d += rspamd_snprintf((gchar *) d, dend - d, "http://");
+	}
+
+	if (url->userlen > 0) {
+		ENCODE_URL_COMPONENT(rspamd_url_user_unsafe(url), url->userlen,
+							 RSPAMD_URL_FLAGS_USERSAFE);
+		*d++ = '@';
+	}
+
+	ENCODE_URL_COMPONENT(rspamd_url_host_unsafe(url), url->hostlen,
+						 RSPAMD_URL_FLAGS_HOSTSAFE);
+
+	if (url->datalen > 0) {
+		*d++ = '/';
+		ENCODE_URL_COMPONENT(rspamd_url_data_unsafe(url), url->datalen,
+							 RSPAMD_URL_FLAGS_PATHSAFE);
+	}
+
+	if (url->querylen > 0) {
+		*d++ = '?';
+		ENCODE_URL_COMPONENT(rspamd_url_query_unsafe(url), url->querylen,
+							 RSPAMD_URL_FLAGS_QUERYSAFE);
+	}
+
+	if (url->fragmentlen > 0) {
+		*d++ = '#';
+		ENCODE_URL_COMPONENT(rspamd_url_fragment_unsafe(url), url->fragmentlen,
+							 RSPAMD_URL_FLAGS_FRAGMENTSAFE);
+	}
+
+	/*
+	 * Terminate the result: the buffer holds dlen + 1 bytes, so the byte at
+	 * dend is still inside the allocation.
+	 */
+	if (d <= dend) {
+		*d = '\0';
+	}
+
+	*pdlen = (d - dest);
+
+	return (const gchar *) dest;
+}
+
+/**
+ * Public wrapper around the file-local is_domain() check.
+ * Only the low byte of `c` is examined.
+ */
+gboolean
+rspamd_url_is_domain(int c)
+{
+	const guchar byte = (guchar) c;
+
+	return is_domain(byte);
+}
+
+/**
+ * Map a protocol constant to its textual scheme name.
+ *
+ * @param proto protocol value
+ * @return static string with the scheme name, or "unknown" for any value
+ *         that is not one of the protocols listed below
+ */
+const gchar *
+rspamd_url_protocol_name(enum rspamd_url_protocol proto)
+{
+	switch (proto) {
+	case PROTOCOL_HTTP:
+		return "http";
+	case PROTOCOL_HTTPS:
+		return "https";
+	case PROTOCOL_FTP:
+		return "ftp";
+	case PROTOCOL_FILE:
+		return "file";
+	case PROTOCOL_MAILTO:
+		return "mailto";
+	case PROTOCOL_TELEPHONE:
+		return "telephone";
+	default:
+		return "unknown";
+	}
+}
+
+/**
+ * Map a scheme name to its protocol constant.
+ *
+ * The comparison is exact and case sensitive.
+ *
+ * @param str NUL-terminated scheme name; NULL is accepted and treated as an
+ *            unknown scheme
+ * @return matching protocol constant or PROTOCOL_UNKNOWN
+ */
+enum rspamd_url_protocol
+rspamd_url_protocol_from_string(const gchar *str)
+{
+	static const struct {
+		const gchar *name;
+		enum rspamd_url_protocol proto;
+	} known_protocols[] = {
+		{"http", PROTOCOL_HTTP},
+		{"https", PROTOCOL_HTTPS},
+		{"mailto", PROTOCOL_MAILTO},
+		{"ftp", PROTOCOL_FTP},
+		{"file", PROTOCOL_FILE},
+		{"telephone", PROTOCOL_TELEPHONE},
+	};
+
+	if (str == NULL) {
+		/* strcmp() on a NULL pointer is undefined behaviour */
+		return PROTOCOL_UNKNOWN;
+	}
+
+	for (guint i = 0; i < G_N_ELEMENTS(known_protocols); i++) {
+		if (strcmp(str, known_protocols[i].name) == 0) {
+			return known_protocols[i].proto;
+		}
+	}
+
+	return PROTOCOL_UNKNOWN;
+}
+
+
+/**
+ * Insert `u` into the URL set, or bump the usage counter if an equal URL is
+ * already stored.
+ *
+ * When an equal URL exists, the stored key is replaced by `u` if
+ * `enforce_replace` is set, or if `u` carries one of the suspicious flags
+ * (phished / obscured / zero-width spaces) and the stored URL carries none.
+ * The counter of whichever URL ends up in the set is incremented.
+ *
+ * @param set URL set; a NULL set is a no-op, like in the other set helpers
+ * @param u URL to add
+ * @param enforce_replace always replace the stored URL with `u`
+ * @return true if `u` was inserted as a new element, false otherwise
+ */
+bool rspamd_url_set_add_or_increase(khash_t(rspamd_url_hash) * set,
+									struct rspamd_url *u,
+									bool enforce_replace)
+{
+#define SUSPICIOUS_URL_FLAGS (RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED | RSPAMD_URL_FLAG_ZW_SPACES)
+	khiter_t k;
+	gint r;
+
+	if (set == NULL) {
+		return false;
+	}
+
+	k = kh_get(rspamd_url_hash, set, u);
+
+	if (k == kh_end(set)) {
+		/* Not seen before: plain insertion */
+		kh_put(rspamd_url_hash, set, u, &r);
+
+		return true;
+	}
+
+	/* Existing url */
+	struct rspamd_url *ex = kh_key(set, k);
+	bool replace = enforce_replace ||
+				   ((u->flags & SUSPICIOUS_URL_FLAGS) &&
+					!(ex->flags & SUSPICIOUS_URL_FLAGS));
+
+	if (replace) {
+		/*
+		 * Propagate new url to an old one.
+		 * NOTE(review): the count accumulated in `ex` is not carried over
+		 * to `u` here - confirm this is intended.
+		 */
+		kh_key(set, k) = u;
+		u->count++;
+	}
+	else {
+		ex->count++;
+	}
+
+	return false;
+}
+
+/**
+ * Look up `u` in the set, inserting it when no equal URL is stored yet.
+ *
+ * @param set URL set, may be NULL
+ * @param u URL to find or insert
+ * @return the URL stored in the set (either an earlier equal one or `u`
+ *         itself), or NULL when `set` is NULL
+ */
+struct rspamd_url *
+rspamd_url_set_add_or_return(khash_t(rspamd_url_hash) * set,
+							 struct rspamd_url *u)
+{
+	khiter_t it;
+	gint put_res;
+
+	if (!set) {
+		return NULL;
+	}
+
+	it = kh_get(rspamd_url_hash, set, u);
+
+	if (it == kh_end(set)) {
+		/* Not present yet, so store it */
+		it = kh_put(rspamd_url_hash, set, u, &put_res);
+	}
+
+	return kh_key(set, it);
+}
+
+/**
+ * Add `u` to a host-keyed URL set.
+ *
+ * @param set host set, may be NULL
+ * @param u URL to add
+ * @return false when `set` is NULL or kh_put reports status 0, true for any
+ *         other kh_put status
+ */
+bool rspamd_url_host_set_add(khash_t(rspamd_url_host_hash) * set,
+							 struct rspamd_url *u)
+{
+	gint put_res;
+
+	if (!set) {
+		return false;
+	}
+
+	kh_put(rspamd_url_host_hash, set, u, &put_res);
+
+	return put_res != 0;
+}
+
+/**
+ * Check whether a URL equal to `u` is stored in the set.
+ *
+ * @param set URL set, may be NULL
+ * @param u URL to look for
+ * @return true if found, false if absent or `set` is NULL
+ */
+bool rspamd_url_set_has(khash_t(rspamd_url_hash) * set, struct rspamd_url *u)
+{
+	khiter_t it;
+
+	if (!set) {
+		return false;
+	}
+
+	it = kh_get(rspamd_url_hash, set, u);
+
+	return it != kh_end(set);
+}
+
+/**
+ * Check whether the host-keyed set already has an entry matching `u`.
+ *
+ * @param set host set, may be NULL
+ * @param u URL to look for
+ * @return true if found, false if absent or `set` is NULL
+ */
+bool rspamd_url_host_set_has(khash_t(rspamd_url_host_hash) * set, struct rspamd_url *u)
+{
+	khiter_t it;
+
+	if (!set) {
+		return false;
+	}
+
+	it = kh_get(rspamd_url_host_hash, set, u);
+
+	return it != kh_end(set);
+}
+
+/**
+ * Translate a flag name into its bit and OR it into *flag.
+ *
+ * The name is hashed and matched against the `hash` field of each entry in
+ * url_flag_names; the names themselves are not compared.
+ *
+ * @param str NUL-terminated flag name
+ * @param flag in/out flag mask; left untouched when nothing matches
+ * @return true if an entry matched, false otherwise
+ */
+bool rspamd_url_flag_from_string(const gchar *str, gint *flag)
+{
+	const gint wanted = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
+															str, strlen(str), 0);
+	guint idx = 0;
+
+	while (idx < G_N_ELEMENTS(url_flag_names)) {
+		if (url_flag_names[idx].hash == wanted) {
+			*flag |= url_flag_names[idx].flag;
+
+			return true;
+		}
+
+		idx++;
+	}
+
+	return false;
+}
+
+
+/**
+ * Find the name of a URL flag.
+ *
+ * @param flag flag mask
+ * @return name of the first entry in url_flag_names that shares at least one
+ *         bit with `flag`, or NULL if there is none
+ */
+const gchar *
+rspamd_url_flag_to_string(int flag)
+{
+	guint idx = 0;
+
+	while (idx < G_N_ELEMENTS(url_flag_names)) {
+		if ((url_flag_names[idx].flag & flag) != 0) {
+			return url_flag_names[idx].name;
+		}
+
+		idx++;
+	}
+
+	return NULL;
+}
+
+/**
+ * Three-way comparison of two URLs.
+ *
+ * URLs with different protocols are ordered by protocol value. Mailto URLs
+ * are then compared by host (case insensitively), host length, user length
+ * and finally the user bytes. All other URLs are compared as raw strings,
+ * with the shorter one sorting first when one is a prefix of the other.
+ *
+ * @return negative, zero or positive, like memcmp()
+ */
+inline int
+rspamd_url_cmp(const struct rspamd_url *u1, const struct rspamd_url *u2)
+{
+	int common, res;
+
+	if (u1->protocol != u2->protocol) {
+		return u1->protocol - u2->protocol;
+	}
+
+	if (!(u1->protocol & PROTOCOL_MAILTO)) {
+		if (u1->urllen == u2->urllen) {
+			/* Equal length */
+			return memcmp(u1->string, u2->string, u1->urllen);
+		}
+
+		/* Different length, compare common part and then compare length */
+		common = MIN(u1->urllen, u2->urllen);
+		res = memcmp(u1->string, u2->string, common);
+
+		if (res == 0) {
+			res = u1->urllen - u2->urllen;
+		}
+
+		return res;
+	}
+
+	/* Emails specialisation: hosts must be compared in a case insensitive manner */
+	common = MIN(u1->hostlen, u2->hostlen);
+	res = rspamd_lc_cmp(rspamd_url_host_unsafe(u1),
+						rspamd_url_host_unsafe(u2), common);
+
+	if (res != 0) {
+		return res;
+	}
+
+	if (u1->hostlen != u2->hostlen) {
+		return u1->hostlen - u2->hostlen;
+	}
+
+	if (u1->userlen != u2->userlen || u1->userlen == 0) {
+		/* Different (or empty) user parts are ordered by length only */
+		return (int) u1->userlen - (int) u2->userlen;
+	}
+
+	return memcmp(rspamd_url_user_unsafe(u1),
+				  rspamd_url_user_unsafe(u2),
+				  u1->userlen);
+}
+
+/**
+ * qsort() style comparator for arrays of `struct rspamd_url *`:
+ * dereferences both elements and delegates to rspamd_url_cmp().
+ */
+int rspamd_url_cmp_qsort(const void *_u1, const void *_u2)
+{
+	struct rspamd_url *const *lhs = _u1;
+	struct rspamd_url *const *rhs = _u2;
+
+	return rspamd_url_cmp(*lhs, *rhs);
+}