summaryrefslogtreecommitdiffstats
path: root/src/libserver/html/html_url.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'src/libserver/html/html_url.cxx')
-rw-r--r--src/libserver/html/html_url.cxx496
1 files changed, 496 insertions, 0 deletions
diff --git a/src/libserver/html/html_url.cxx b/src/libserver/html/html_url.cxx
new file mode 100644
index 0000000..8f29f2c
--- /dev/null
+++ b/src/libserver/html/html_url.cxx
@@ -0,0 +1,496 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "html_url.hxx"
+#include "libutil/str_util.h"
+#include "libserver/url.h"
+#include "libserver/logger.h"
+#include "rspamd.h"
+
+#include <unicode/idna.h>
+
+namespace rspamd::html {
+
+static auto
+rspamd_url_is_subdomain(std::string_view t1, std::string_view t2) -> bool
+{
+ const auto *p1 = t1.data() + t1.size() - 1;
+ const auto *p2 = t2.data() + t2.size() - 1;
+
+ /* Skip trailing dots */
+ while (p1 > t1.data()) {
+ if (*p1 != '.') {
+ break;
+ }
+
+ p1--;
+ }
+
+ while (p2 > t2.data()) {
+ if (*p2 != '.') {
+ break;
+ }
+
+ p2--;
+ }
+
+ while (p1 > t1.data() && p2 > t2.data()) {
+ if (*p1 != *p2) {
+ break;
+ }
+
+ p1--;
+ p2--;
+ }
+
+ if (p2 == t2.data()) {
+ /* p2 can be subdomain of p1 if *p1 is '.' */
+ if (p1 != t1.data() && *(p1 - 1) == '.') {
+ return true;
+ }
+ }
+ else if (p1 == t1.data()) {
+ if (p2 != t2.data() && *(p2 - 1) == '.') {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+
+static auto
+get_icu_idna_instance(void) -> auto
+{
+ auto uc_err = U_ZERO_ERROR;
+ static auto *udn = icu::IDNA::createUTS46Instance(UIDNA_DEFAULT, uc_err);
+
+ return udn;
+}
+
+static auto
+convert_idna_hostname_maybe(rspamd_mempool_t *pool, struct rspamd_url *url, bool use_tld)
+ -> std::string_view
+{
+ std::string_view ret = use_tld ? std::string_view{rspamd_url_tld_unsafe(url), url->tldlen} : std::string_view{rspamd_url_host_unsafe(url), url->hostlen};
+
+ /* Handle IDN url's */
+ if (ret.size() > 4 &&
+ rspamd_substring_search_caseless(ret.data(), ret.size(), "xn--", 4) != -1) {
+
+ const auto buf_capacity = ret.size() * 2 + 1;
+ auto *idn_hbuf = (char *) rspamd_mempool_alloc(pool, buf_capacity);
+ icu::CheckedArrayByteSink byte_sink{idn_hbuf, (int) buf_capacity};
+
+ /* We need to convert it to the normal value first */
+ icu::IDNAInfo info;
+ auto uc_err = U_ZERO_ERROR;
+ auto *udn = get_icu_idna_instance();
+ udn->nameToUnicodeUTF8(icu::StringPiece(ret.data(), ret.size()),
+ byte_sink, info, uc_err);
+
+ if (uc_err == U_ZERO_ERROR && !info.hasErrors()) {
+ /* idn_hbuf is allocated in mempool, so it is safe to use */
+ ret = std::string_view{idn_hbuf, (std::size_t) byte_sink.NumberOfBytesWritten()};
+ }
+ else {
+ msg_err_pool("cannot convert to IDN: %s (0x%xd)",
+ u_errorName(uc_err), info.getErrors());
+ }
+ }
+
+ return ret;
+};
+
+constexpr auto sv_equals(std::string_view s1, std::string_view s2) -> auto
+{
+ return (s1.size() == s2.size()) &&
+ std::equal(s1.begin(), s1.end(), s2.begin(), s2.end(),
+ [](const auto c1, const auto c2) {
+ return g_ascii_tolower(c1) == g_ascii_tolower(c2);
+ });
+}
+
+constexpr auto
+is_transfer_proto(struct rspamd_url *u) -> bool
+{
+ return (u->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_FTP)) != 0;
+}
+
+auto html_url_is_phished(rspamd_mempool_t *pool,
+ struct rspamd_url *href_url,
+ std::string_view text_data) -> std::optional<rspamd_url *>
+{
+ struct rspamd_url *text_url;
+ std::string_view disp_tok, href_tok;
+ goffset url_pos;
+ gchar *url_str = NULL;
+
+ auto sz = text_data.size();
+ const auto *trimmed = rspamd_string_unicode_trim_inplace(text_data.data(), &sz);
+ text_data = std::string_view(trimmed, sz);
+
+ if (text_data.size() > 4 &&
+ rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str,
+ RSPAMD_URL_FIND_ALL,
+ &url_pos, NULL) &&
+ url_str != nullptr) {
+
+ if (url_pos > 0) {
+ /*
+ * We have some url at some offset, so we need to check what is
+ * at the start of the text
+ */
+ return std::nullopt;
+ }
+
+ text_url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
+ auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
+ RSPAMD_URL_PARSE_TEXT);
+
+ if (rc == URI_ERRNO_OK) {
+ text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
+ href_url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
+
+ /* Check for phishing */
+ if (is_transfer_proto(text_url) == is_transfer_proto(href_url)) {
+ disp_tok = convert_idna_hostname_maybe(pool, text_url, false);
+ href_tok = convert_idna_hostname_maybe(pool, href_url, false);
+
+ if (!sv_equals(disp_tok, href_tok) &&
+ text_url->tldlen > 0 && href_url->tldlen > 0) {
+
+ /* Apply the same logic for TLD */
+ disp_tok = convert_idna_hostname_maybe(pool, text_url, true);
+ href_tok = convert_idna_hostname_maybe(pool, href_url, true);
+
+ if (!sv_equals(disp_tok, href_tok)) {
+ /* Check if one url is a subdomain for another */
+
+ if (!rspamd_url_is_subdomain(disp_tok, href_tok)) {
+ href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
+ text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
+
+ if (href_url->ext == nullptr) {
+ href_url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext);
+ }
+ href_url->ext->linked_url = text_url;
+ }
+ }
+ }
+ }
+
+ return text_url;
+ }
+ else {
+ /*
+ * We have found something that looks like an url but it was
+ * not parsed correctly.
+ * Sometimes it means an obfuscation attempt, so we have to check
+ * what's inside of the text
+ */
+ gboolean obfuscation_found = FALSE;
+
+ if (text_data.size() > 4 && g_ascii_strncasecmp(text_data.begin(), "http", 4) == 0 &&
+ rspamd_substring_search(text_data.begin(), text_data.size(), "://", 3) != -1) {
+ /* Clearly an obfuscation attempt */
+ obfuscation_found = TRUE;
+ }
+
+ msg_info_pool("extract of url '%s' failed: %s; obfuscation detected: %s",
+ url_str,
+ rspamd_url_strerror(rc),
+ obfuscation_found ? "yes" : "no");
+
+ if (obfuscation_found) {
+ href_url->flags |= RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED;
+ }
+ }
+ }
+
+ return std::nullopt;
+}
+
+void html_check_displayed_url(rspamd_mempool_t *pool,
+ GList **exceptions,
+ void *url_set,
+ std::string_view visible_part,
+ goffset href_offset,
+ struct rspamd_url *url)
+{
+ struct rspamd_url *displayed_url = nullptr;
+ struct rspamd_url *turl;
+ struct rspamd_process_exception *ex;
+ guint saved_flags = 0;
+ gsize dlen;
+
+ if (visible_part.empty()) {
+ /* No displayed url, just some text within <a> tag */
+ return;
+ }
+
+ if (url->ext == nullptr) {
+ url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext);
+ }
+ url->ext->visible_part = rspamd_mempool_alloc_buffer(pool, visible_part.size() + 1);
+ rspamd_strlcpy(url->ext->visible_part,
+ visible_part.data(),
+ visible_part.size() + 1);
+ dlen = visible_part.size();
+
+ /* Strip unicode spaces from the start and the end */
+ url->ext->visible_part = const_cast<char *>(
+ rspamd_string_unicode_trim_inplace(url->ext->visible_part,
+ &dlen));
+ auto maybe_url = html_url_is_phished(pool, url,
+ {url->ext->visible_part, dlen});
+
+ if (maybe_url) {
+ url->flags |= saved_flags;
+ displayed_url = maybe_url.value();
+ }
+
+ if (exceptions && displayed_url != nullptr) {
+ ex = rspamd_mempool_alloc_type(pool, struct rspamd_process_exception);
+ ex->pos = href_offset;
+ ex->len = dlen;
+ ex->type = RSPAMD_EXCEPTION_URL;
+ ex->ptr = url;
+
+ *exceptions = g_list_prepend(*exceptions, ex);
+ }
+
+ if (displayed_url && url_set) {
+ turl = rspamd_url_set_add_or_return((khash_t(rspamd_url_hash) *) url_set, displayed_url);
+
+ if (turl != nullptr) {
+ /* Here, we assume the following:
+ * if we have a URL in the text part which
+ * is the same as displayed URL in the
+ * HTML part, we assume that it is also
+ * hint only.
+ */
+ if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) {
+
+ /*
+ * We have the same URL for href and displayed url, so we
+ * know that this url cannot be both target and display (as
+ * it breaks logic in many places), so we do not
+ * propagate html flags
+ */
+ if (!(turl->flags & RSPAMD_URL_FLAG_DISPLAY_URL)) {
+ turl->flags |= displayed_url->flags;
+ }
+ turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
+ }
+
+ turl->count++;
+ }
+ else {
+ /* Already inserted by `rspamd_url_set_add_or_return` */
+ }
+ }
+
+ rspamd_normalise_unicode_inplace(url->ext->visible_part, &dlen);
+}
+
+auto html_process_url(rspamd_mempool_t *pool, std::string_view &input)
+ -> std::optional<struct rspamd_url *>
+{
+ struct rspamd_url *url;
+ guint saved_flags = 0;
+ gint rc;
+ const gchar *s, *prefix = "http://";
+ gchar *d;
+ gsize dlen;
+ gboolean has_bad_chars = FALSE, no_prefix = FALSE;
+ static const gchar hexdigests[] = "0123456789abcdef";
+
+ auto sz = input.length();
+ const auto *trimmed = rspamd_string_unicode_trim_inplace(input.data(), &sz);
+ input = {trimmed, sz};
+
+ const auto *start = input.data();
+ s = start;
+ dlen = 0;
+
+ for (auto i = 0; i < sz; i++) {
+ if (G_UNLIKELY(((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
+ dlen += 3;
+ }
+ else {
+ dlen++;
+ }
+ }
+
+ if (rspamd_substring_search(start, sz, "://", 3) == -1) {
+ if (sz >= sizeof("mailto:") &&
+ (memcmp(start, "mailto:", sizeof("mailto:") - 1) == 0 ||
+ memcmp(start, "tel:", sizeof("tel:") - 1) == 0 ||
+ memcmp(start, "callto:", sizeof("callto:") - 1) == 0)) {
+ /* Exclusion, has valid but 'strange' prefix */
+ }
+ else {
+ for (auto i = 0; i < sz; i++) {
+ if (!((s[i] & 0x80) || g_ascii_isalnum(s[i]))) {
+ if (i == 0 && sz > 2 && s[i] == '/' && s[i + 1] == '/') {
+ prefix = "http:";
+ dlen += sizeof("http:") - 1;
+ no_prefix = TRUE;
+ }
+ else if (s[i] == '@') {
+ /* Likely email prefix */
+ prefix = "mailto://";
+ dlen += sizeof("mailto://") - 1;
+ no_prefix = TRUE;
+ }
+ else if (s[i] == ':' && i != 0) {
+ /* Special case */
+ no_prefix = FALSE;
+ }
+ else {
+ if (i == 0) {
+ /* No valid data */
+ return std::nullopt;
+ }
+ else {
+ no_prefix = TRUE;
+ dlen += strlen(prefix);
+ }
+ }
+
+ break;
+ }
+ }
+ }
+ }
+
+ auto *decoded = rspamd_mempool_alloc_buffer(pool, dlen + 1);
+ d = decoded;
+
+ if (no_prefix) {
+ gsize plen = strlen(prefix);
+ memcpy(d, prefix, plen);
+ d += plen;
+ }
+
+ /*
+ * We also need to remove all internal newlines, spaces
+ * and encode unsafe characters
+ * Another obfuscation find in the wild was encoding of the SAFE url characters,
+ * including essential ones
+ */
+ for (auto i = 0; i < sz; i++) {
+ if (G_UNLIKELY(g_ascii_isspace(s[i]))) {
+ continue;
+ }
+ else if (G_UNLIKELY(((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
+ /* URL encode */
+ *d++ = '%';
+ *d++ = hexdigests[(s[i] >> 4) & 0xf];
+ *d++ = hexdigests[s[i] & 0xf];
+ has_bad_chars = TRUE;
+ }
+ else if (G_UNLIKELY(s[i] == '%')) {
+ if (i + 2 < sz) {
+ auto c1 = s[i + 1];
+ auto c2 = s[i + 2];
+
+ if (g_ascii_isxdigit(c1) && g_ascii_isxdigit(c2)) {
+ auto codepoint = 0;
+
+ if (c1 >= '0' && c1 <= '9') codepoint = c1 - '0';
+ else if (c1 >= 'A' && c1 <= 'F')
+ codepoint = c1 - 'A' + 10;
+ else if (c1 >= 'a' && c1 <= 'f')
+ codepoint = c1 - 'a' + 10;
+
+ codepoint <<= 4;
+
+ if (c2 >= '0' && c2 <= '9') codepoint += c2 - '0';
+ else if (c2 >= 'A' && c2 <= 'F')
+ codepoint += c2 - 'A' + 10;
+ else if (c2 >= 'a' && c2 <= 'f')
+ codepoint += c2 - 'a' + 10;
+
+ /* Now check for 'interesting' codepoints */
+ if (codepoint == '@' || codepoint == ':' || codepoint == '|' ||
+ codepoint == '?' || codepoint == '\\' || codepoint == '/') {
+ /* Replace it back */
+ *d++ = (char) (codepoint & 0xff);
+ i += 2;
+ }
+ else {
+ *d++ = s[i];
+ }
+ }
+ else {
+ *d++ = s[i];
+ }
+ }
+ else {
+ *d++ = s[i];
+ }
+ }
+ else {
+ *d++ = s[i];
+ }
+ }
+
+ *d = '\0';
+ dlen = d - decoded;
+
+ url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
+ rspamd_url_normalise_propagate_flags(pool, decoded, &dlen, saved_flags);
+ rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
+
+ /* Filter some completely damaged urls */
+ if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
+ !((url->protocol & PROTOCOL_UNKNOWN))) {
+ url->flags |= saved_flags;
+
+ if (has_bad_chars) {
+ url->flags |= RSPAMD_URL_FLAG_OBSCURED;
+ }
+
+ if (no_prefix) {
+ url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
+
+ if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
+ /* Ignore urls with both no schema and no tld */
+ return std::nullopt;
+ }
+ }
+
+ decoded = url->string;
+
+ input = {decoded, url->urllen};
+
+ /* Spaces in href usually mean an attempt to obfuscate URL */
+ /* See https://github.com/vstakhov/rspamd/issues/593 */
+#if 0
+ if (has_spaces) {
+ url->flags |= RSPAMD_URL_FLAG_OBSCURED;
+ }
+#endif
+
+ return url;
+ }
+
+ return std::nullopt;
+}
+
+}// namespace rspamd::html \ No newline at end of file