1 files changed, 496 insertions, 0 deletions
diff --git a/src/libserver/html/html_url.cxx b/src/libserver/html/html_url.cxx
new file mode 100644
index 0000000..8f29f2c
--- /dev/null
+++ b/src/libserver/html/html_url.cxx
@@ -0,0 +1,496 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "html_url.hxx"
+#include "libutil/str_util.h"
+#include "libserver/url.h"
+#include "libserver/logger.h"
+#include "rspamd.h"
+
+#include <unicode/idna.h>
+
+namespace rspamd::html {
+
+static auto
+rspamd_url_is_subdomain(std::string_view t1, std::string_view t2) -> bool
+{
+	const auto *p1 = t1.data() + t1.size() - 1;
+	const auto *p2 = t2.data() + t2.size() - 1;
+
+	/* Skip trailing dots */
+	while (p1 > t1.data()) {
+		if (*p1 != '.') {
+			break;
+		}
+
+		p1--;
+	}
+
+	while (p2 > t2.data()) {
+		if (*p2 != '.') {
+			break;
+		}
+
+		p2--;
+	}
+
+	while (p1 > t1.data() && p2 > t2.data()) {
+		if (*p1 != *p2) {
+			break;
+		}
+
+		p1--;
+		p2--;
+	}
+
+	if (p2 == t2.data()) {
+		/* p2 can be subdomain of p1 if *p1 is '.' */
+		if (p1 != t1.data() && *(p1 - 1) == '.') {
+			return true;
+		}
+	}
+	else if (p1 == t1.data()) {
+		if (p2 != t2.data() && *(p2 - 1) == '.') {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+
+static auto
+get_icu_idna_instance(void) -> auto
+{
+	auto uc_err = U_ZERO_ERROR;
+	static auto *udn = icu::IDNA::createUTS46Instance(UIDNA_DEFAULT, uc_err);
+
+	return udn;
+}
+
+static auto
+convert_idna_hostname_maybe(rspamd_mempool_t *pool, struct rspamd_url *url, bool use_tld)
+	-> std::string_view
+{
+	std::string_view ret = use_tld ? std::string_view{rspamd_url_tld_unsafe(url), url->tldlen} : std::string_view{rspamd_url_host_unsafe(url), url->hostlen};
+
+	/* Handle IDN url's */
+	if (ret.size() > 4 &&
+		rspamd_substring_search_caseless(ret.data(), ret.size(), "xn--", 4) != -1) {
+
+		const auto buf_capacity = ret.size() * 2 + 1;
+		auto *idn_hbuf = (char *) rspamd_mempool_alloc(pool, buf_capacity);
+		icu::CheckedArrayByteSink byte_sink{idn_hbuf, (int) buf_capacity};
+
+		/* We need to convert it to the normal value first */
+		icu::IDNAInfo info;
+		auto uc_err = U_ZERO_ERROR;
+		auto *udn = get_icu_idna_instance();
+		udn->nameToUnicodeUTF8(icu::StringPiece(ret.data(), ret.size()),
+							   byte_sink, info, uc_err);
+
+		if (uc_err == U_ZERO_ERROR && !info.hasErrors()) {
+			/* idn_hbuf is allocated in mempool, so it is safe to use */
+			ret = std::string_view{idn_hbuf, (std::size_t) byte_sink.NumberOfBytesWritten()};
+		}
+		else {
+			msg_err_pool("cannot convert to IDN: %s (0x%xd)",
+						 u_errorName(uc_err), info.getErrors());
+		}
+	}
+
+	return ret;
+};
+
+constexpr auto sv_equals(std::string_view s1, std::string_view s2) -> auto
+{
+	return (s1.size() == s2.size()) &&
+		   std::equal(s1.begin(), s1.end(), s2.begin(), s2.end(),
+					  [](const auto c1, const auto c2) {
+						  return g_ascii_tolower(c1) == g_ascii_tolower(c2);
+					  });
+}
+
+constexpr auto
+is_transfer_proto(struct rspamd_url *u) -> bool
+{
+	return (u->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_FTP)) != 0;
+}
+
+auto html_url_is_phished(rspamd_mempool_t *pool,
+						 struct rspamd_url *href_url,
+						 std::string_view text_data) -> std::optional<rspamd_url *>
+{
+	struct rspamd_url *text_url;
+	std::string_view disp_tok, href_tok;
+	goffset url_pos;
+	gchar *url_str = NULL;
+
+	auto sz = text_data.size();
+	const auto *trimmed = rspamd_string_unicode_trim_inplace(text_data.data(), &sz);
+	text_data = std::string_view(trimmed, sz);
+
+	if (text_data.size() > 4 &&
+		rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str,
+						RSPAMD_URL_FIND_ALL,
+						&url_pos, NULL) &&
+		url_str != nullptr) {
+
+		if (url_pos > 0) {
+			/*
+			 * We have some url at some offset, so we need to check what is
+			 * at the start of the text
+			 */
+			return std::nullopt;
+		}
+
+		text_url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
+		auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
+								   RSPAMD_URL_PARSE_TEXT);
+
+		if (rc == URI_ERRNO_OK) {
+			text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
+			href_url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
+
+			/* Check for phishing */
+			if (is_transfer_proto(text_url) == is_transfer_proto(href_url)) {
+				disp_tok = convert_idna_hostname_maybe(pool, text_url, false);
+				href_tok = convert_idna_hostname_maybe(pool, href_url, false);
+
+				if (!sv_equals(disp_tok, href_tok) &&
+					text_url->tldlen > 0 && href_url->tldlen > 0) {
+
+					/* Apply the same logic for TLD */
+					disp_tok = convert_idna_hostname_maybe(pool, text_url, true);
+					href_tok = convert_idna_hostname_maybe(pool, href_url, true);
+
+					if (!sv_equals(disp_tok, href_tok)) {
+						/* Check if one url is a subdomain for another */
+
+						if (!rspamd_url_is_subdomain(disp_tok, href_tok)) {
+							href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
+							text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
+
+							if (href_url->ext == nullptr) {
+								href_url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext);
+							}
+							href_url->ext->linked_url = text_url;
+						}
+					}
+				}
+			}
+
+			return text_url;
+		}
+		else {
+			/*
+			 * We have found something that looks like an url but it was
+			 * not parsed correctly.
+			 * Sometimes it means an obfuscation attempt, so we have to check
+			 * what's inside of the text
+			 */
+			gboolean obfuscation_found = FALSE;
+
+			if (text_data.size() > 4 && g_ascii_strncasecmp(text_data.begin(), "http", 4) == 0 &&
+				rspamd_substring_search(text_data.begin(), text_data.size(), "://", 3) != -1) {
+				/* Clearly an obfuscation attempt */
+				obfuscation_found = TRUE;
+			}
+
+			msg_info_pool("extract of url '%s' failed: %s; obfuscation detected: %s",
+						  url_str,
+						  rspamd_url_strerror(rc),
+						  obfuscation_found ? "yes" : "no");
+
+			if (obfuscation_found) {
+				href_url->flags |= RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED;
+			}
+		}
+	}
+
+	return std::nullopt;
+}
+
+void html_check_displayed_url(rspamd_mempool_t *pool,
+							  GList **exceptions,
+							  void *url_set,
+							  std::string_view visible_part,
+							  goffset href_offset,
+							  struct rspamd_url *url)
+{
+	struct rspamd_url *displayed_url = nullptr;
+	struct rspamd_url *turl;
+	struct rspamd_process_exception *ex;
+	guint saved_flags = 0;
+	gsize dlen;
+
+	if (visible_part.empty()) {
+		/* No displayed url, just some text within <a> tag */
+		return;
+	}
+
+	if (url->ext == nullptr) {
+		url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext);
+	}
+	url->ext->visible_part = rspamd_mempool_alloc_buffer(pool, visible_part.size() + 1);
+	rspamd_strlcpy(url->ext->visible_part,
+				   visible_part.data(),
+				   visible_part.size() + 1);
+	dlen = visible_part.size();
+
+	/* Strip unicode spaces from the start and the end */
+	url->ext->visible_part = const_cast<char *>(
+		rspamd_string_unicode_trim_inplace(url->ext->visible_part,
+										   &dlen));
+	auto maybe_url = html_url_is_phished(pool, url,
+										 {url->ext->visible_part, dlen});
+
+	if (maybe_url) {
+		url->flags |= saved_flags;
+		displayed_url = maybe_url.value();
+	}
+
+	if (exceptions && displayed_url != nullptr) {
+		ex = rspamd_mempool_alloc_type(pool, struct rspamd_process_exception);
+		ex->pos = href_offset;
+		ex->len = dlen;
+		ex->type = RSPAMD_EXCEPTION_URL;
+		ex->ptr = url;
+
+		*exceptions = g_list_prepend(*exceptions, ex);
+	}
+
+	if (displayed_url && url_set) {
+		turl = rspamd_url_set_add_or_return((khash_t(rspamd_url_hash) *) url_set, displayed_url);
+
+		if (turl != nullptr) {
+			/* Here, we assume the following:
+			 * if we have a URL in the text part which
+			 * is the same as displayed URL in the
+			 * HTML part, we assume that it is also
+			 * hint only.
+			 */
+			if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) {
+
+				/*
+				 * We have the same URL for href and displayed url, so we
+				 * know that this url cannot be both target and display (as
+				 * it breaks logic in many places), so we do not
+				 * propagate html flags
+				 */
+				if (!(turl->flags & RSPAMD_URL_FLAG_DISPLAY_URL)) {
+					turl->flags |= displayed_url->flags;
+				}
+				turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
+			}
+
+			turl->count++;
+		}
+		else {
+			/* Already inserted by `rspamd_url_set_add_or_return` */
+		}
+	}
+
+	rspamd_normalise_unicode_inplace(url->ext->visible_part, &dlen);
+}
+
+auto html_process_url(rspamd_mempool_t *pool, std::string_view &input)
+	-> std::optional<struct rspamd_url *>
+{
+	struct rspamd_url *url;
+	guint saved_flags = 0;
+	gint rc;
+	const gchar *s, *prefix = "http://";
+	gchar *d;
+	gsize dlen;
+	gboolean has_bad_chars = FALSE, no_prefix = FALSE;
+	static const gchar hexdigests[] = "0123456789abcdef";
+
+	auto sz = input.length();
+	const auto *trimmed = rspamd_string_unicode_trim_inplace(input.data(), &sz);
+	input = {trimmed, sz};
+
+	const auto *start = input.data();
+	s = start;
+	dlen = 0;
+
+	for (auto i = 0; i < sz; i++) {
+		if (G_UNLIKELY(((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
+			dlen += 3;
+		}
+		else {
+			dlen++;
+		}
+	}
+
+	if (rspamd_substring_search(start, sz, "://", 3) == -1) {
+		if (sz >= sizeof("mailto:") &&
+			(memcmp(start, "mailto:", sizeof("mailto:") - 1) == 0 ||
+			 memcmp(start, "tel:", sizeof("tel:") - 1) == 0 ||
+			 memcmp(start, "callto:", sizeof("callto:") - 1) == 0)) {
+			/* Exclusion, has valid but 'strange' prefix */
+		}
+		else {
+			for (auto i = 0; i < sz; i++) {
+				if (!((s[i] & 0x80) || g_ascii_isalnum(s[i]))) {
+					if (i == 0 && sz > 2 && s[i] == '/' && s[i + 1] == '/') {
+						prefix = "http:";
+						dlen += sizeof("http:") - 1;
+						no_prefix = TRUE;
+					}
+					else if (s[i] == '@') {
+						/* Likely email prefix */
+						prefix = "mailto://";
+						dlen += sizeof("mailto://") - 1;
+						no_prefix = TRUE;
+					}
+					else if (s[i] == ':' && i != 0) {
+						/* Special case */
+						no_prefix = FALSE;
+					}
+					else {
+						if (i == 0) {
+							/* No valid data */
+							return std::nullopt;
+						}
+						else {
+							no_prefix = TRUE;
+							dlen += strlen(prefix);
+						}
+					}
+
+					break;
+				}
+			}
+		}
+	}
+
+	auto *decoded = rspamd_mempool_alloc_buffer(pool, dlen + 1);
+	d = decoded;
+
+	if (no_prefix) {
+		gsize plen = strlen(prefix);
+		memcpy(d, prefix, plen);
+		d += plen;
+	}
+
+	/*
+	 * We also need to remove all internal newlines, spaces
+	 * and encode unsafe characters
+	 * Another obfuscation find in the wild was encoding of the SAFE url characters,
+	 * including essential ones
+	 */
+	for (auto i = 0; i < sz; i++) {
+		if (G_UNLIKELY(g_ascii_isspace(s[i]))) {
+			continue;
+		}
+		else if (G_UNLIKELY(((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
+			/* URL encode */
+			*d++ = '%';
+			*d++ = hexdigests[(s[i] >> 4) & 0xf];
+			*d++ = hexdigests[s[i] & 0xf];
+			has_bad_chars = TRUE;
+		}
+		else if (G_UNLIKELY(s[i] == '%')) {
+			if (i + 2 < sz) {
+				auto c1 = s[i + 1];
+				auto c2 = s[i + 2];
+
+				if (g_ascii_isxdigit(c1) && g_ascii_isxdigit(c2)) {
+					auto codepoint = 0;
+
+					if (c1 >= '0' && c1 <= '9') codepoint = c1 - '0';
+					else if (c1 >= 'A' && c1 <= 'F')
+						codepoint = c1 - 'A' + 10;
+					else if (c1 >= 'a' && c1 <= 'f')
+						codepoint = c1 - 'a' + 10;
+
+					codepoint <<= 4;
+
+					if (c2 >= '0' && c2 <= '9') codepoint += c2 - '0';
+					else if (c2 >= 'A' && c2 <= 'F')
+						codepoint += c2 - 'A' + 10;
+					else if (c2 >= 'a' && c2 <= 'f')
+						codepoint += c2 - 'a' + 10;
+
+					/* Now check for 'interesting' codepoints */
+					if (codepoint == '@' || codepoint == ':' || codepoint == '|' ||
+						codepoint == '?' || codepoint == '\\' || codepoint == '/') {
+						/* Replace it back */
+						*d++ = (char) (codepoint & 0xff);
+						i += 2;
+					}
+					else {
+						*d++ = s[i];
+					}
+				}
+				else {
+					*d++ = s[i];
+				}
+			}
+			else {
+				*d++ = s[i];
+			}
+		}
+		else {
+			*d++ = s[i];
+		}
+	}
+
+	*d = '\0';
+	dlen = d - decoded;
+
+	url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
+	rspamd_url_normalise_propagate_flags(pool, decoded, &dlen, saved_flags);
+	rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
+
+	/* Filter some completely damaged urls */
+	if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
+		!((url->protocol & PROTOCOL_UNKNOWN))) {
+		url->flags |= saved_flags;
+
+		if (has_bad_chars) {
+			url->flags |= RSPAMD_URL_FLAG_OBSCURED;
+		}
+
+		if (no_prefix) {
+			url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
+
+			if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
+				/* Ignore urls with both no schema and no tld */
+				return std::nullopt;
+			}
+		}
+
+		decoded = url->string;
+
+		input = {decoded, url->urllen};
+
+		/* Spaces in href usually mean an attempt to obfuscate URL */
+		/* See https://github.com/vstakhov/rspamd/issues/593 */
+#if 0
+		if (has_spaces) {
+			url->flags |= RSPAMD_URL_FLAG_OBSCURED;
+		}
+#endif
+
+		return url;
+	}
+
+	return std::nullopt;
+}
+
+}// namespace rspamd::html
+\ No newline at end of file