diff options
Diffstat (limited to 'src/libserver/html/html_url.cxx')
-rw-r--r-- | src/libserver/html/html_url.cxx | 496 |
1 files changed, 496 insertions, 0 deletions
diff --git a/src/libserver/html/html_url.cxx b/src/libserver/html/html_url.cxx new file mode 100644 index 0000000..8f29f2c --- /dev/null +++ b/src/libserver/html/html_url.cxx @@ -0,0 +1,496 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "html_url.hxx" +#include "libutil/str_util.h" +#include "libserver/url.h" +#include "libserver/logger.h" +#include "rspamd.h" + +#include <unicode/idna.h> + +namespace rspamd::html { + +static auto +rspamd_url_is_subdomain(std::string_view t1, std::string_view t2) -> bool +{ + const auto *p1 = t1.data() + t1.size() - 1; + const auto *p2 = t2.data() + t2.size() - 1; + + /* Skip trailing dots */ + while (p1 > t1.data()) { + if (*p1 != '.') { + break; + } + + p1--; + } + + while (p2 > t2.data()) { + if (*p2 != '.') { + break; + } + + p2--; + } + + while (p1 > t1.data() && p2 > t2.data()) { + if (*p1 != *p2) { + break; + } + + p1--; + p2--; + } + + if (p2 == t2.data()) { + /* p2 can be subdomain of p1 if *p1 is '.' */ + if (p1 != t1.data() && *(p1 - 1) == '.') { + return true; + } + } + else if (p1 == t1.data()) { + if (p2 != t2.data() && *(p2 - 1) == '.') { + return true; + } + } + + return false; +} + + +static auto +get_icu_idna_instance(void) -> auto +{ + auto uc_err = U_ZERO_ERROR; + static auto *udn = icu::IDNA::createUTS46Instance(UIDNA_DEFAULT, uc_err); + + return udn; +} + +static auto +convert_idna_hostname_maybe(rspamd_mempool_t *pool, struct rspamd_url *url, bool use_tld) + -> std::string_view +{ + std::string_view ret = use_tld ? std::string_view{rspamd_url_tld_unsafe(url), url->tldlen} : std::string_view{rspamd_url_host_unsafe(url), url->hostlen}; + + /* Handle IDN url's */ + if (ret.size() > 4 && + rspamd_substring_search_caseless(ret.data(), ret.size(), "xn--", 4) != -1) { + + const auto buf_capacity = ret.size() * 2 + 1; + auto *idn_hbuf = (char *) rspamd_mempool_alloc(pool, buf_capacity); + icu::CheckedArrayByteSink byte_sink{idn_hbuf, (int) buf_capacity}; + + /* We need to convert it to the normal value first */ + icu::IDNAInfo info; + auto uc_err = U_ZERO_ERROR; + auto *udn = get_icu_idna_instance(); + udn->nameToUnicodeUTF8(icu::StringPiece(ret.data(), ret.size()), + byte_sink, info, uc_err); + + if (uc_err == U_ZERO_ERROR && !info.hasErrors()) { + /* idn_hbuf is allocated in mempool, so it is safe to use */ + ret = std::string_view{idn_hbuf, (std::size_t) byte_sink.NumberOfBytesWritten()}; + } + else { + msg_err_pool("cannot convert to IDN: %s (0x%xd)", + u_errorName(uc_err), info.getErrors()); + } + } + + return ret; +}; + +constexpr auto sv_equals(std::string_view s1, std::string_view s2) -> auto +{ + return (s1.size() == s2.size()) && + std::equal(s1.begin(), s1.end(), s2.begin(), s2.end(), + [](const auto c1, const auto c2) { + return g_ascii_tolower(c1) == g_ascii_tolower(c2); + }); +} + +constexpr auto +is_transfer_proto(struct rspamd_url *u) -> bool +{ + return (u->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_FTP)) != 0; +} + +auto html_url_is_phished(rspamd_mempool_t *pool, + struct rspamd_url *href_url, + std::string_view text_data) -> std::optional<rspamd_url *> +{ + struct rspamd_url *text_url; + std::string_view disp_tok, href_tok; + goffset url_pos; + gchar *url_str = NULL; + + auto sz = text_data.size(); + const auto *trimmed = rspamd_string_unicode_trim_inplace(text_data.data(), &sz); + text_data = std::string_view(trimmed, sz); + + if (text_data.size() > 4 && + rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str, + RSPAMD_URL_FIND_ALL, + &url_pos, NULL) && + url_str != nullptr) { + + if (url_pos > 0) { + /* + * We have some url at some offset, so we need to check what is + * at the start of the text + */ + return std::nullopt; + } + + text_url = rspamd_mempool_alloc0_type(pool, struct rspamd_url); + auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool, + RSPAMD_URL_PARSE_TEXT); + + if (rc == URI_ERRNO_OK) { + text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED; + href_url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL; + + /* Check for phishing */ + if (is_transfer_proto(text_url) == is_transfer_proto(href_url)) { + disp_tok = convert_idna_hostname_maybe(pool, text_url, false); + href_tok = convert_idna_hostname_maybe(pool, href_url, false); + + if (!sv_equals(disp_tok, href_tok) && + text_url->tldlen > 0 && href_url->tldlen > 0) { + + /* Apply the same logic for TLD */ + disp_tok = convert_idna_hostname_maybe(pool, text_url, true); + href_tok = convert_idna_hostname_maybe(pool, href_url, true); + + if (!sv_equals(disp_tok, href_tok)) { + /* Check if one url is a subdomain for another */ + + if (!rspamd_url_is_subdomain(disp_tok, href_tok)) { + href_url->flags |= RSPAMD_URL_FLAG_PHISHED; + text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED; + + if (href_url->ext == nullptr) { + href_url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext); + } + href_url->ext->linked_url = text_url; + } + } + } + } + + return text_url; + } + else { + /* + * We have found something that looks like an url but it was + * not parsed correctly. + * Sometimes it means an obfuscation attempt, so we have to check + * what's inside of the text + */ + gboolean obfuscation_found = FALSE; + + if (text_data.size() > 4 && g_ascii_strncasecmp(text_data.begin(), "http", 4) == 0 && + rspamd_substring_search(text_data.begin(), text_data.size(), "://", 3) != -1) { + /* Clearly an obfuscation attempt */ + obfuscation_found = TRUE; + } + + msg_info_pool("extract of url '%s' failed: %s; obfuscation detected: %s", + url_str, + rspamd_url_strerror(rc), + obfuscation_found ? "yes" : "no"); + + if (obfuscation_found) { + href_url->flags |= RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED; + } + } + } + + return std::nullopt; +} + +void html_check_displayed_url(rspamd_mempool_t *pool, + GList **exceptions, + void *url_set, + std::string_view visible_part, + goffset href_offset, + struct rspamd_url *url) +{ + struct rspamd_url *displayed_url = nullptr; + struct rspamd_url *turl; + struct rspamd_process_exception *ex; + guint saved_flags = 0; + gsize dlen; + + if (visible_part.empty()) { + /* No displayed url, just some text within <a> tag */ + return; + } + + if (url->ext == nullptr) { + url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext); + } + url->ext->visible_part = rspamd_mempool_alloc_buffer(pool, visible_part.size() + 1); + rspamd_strlcpy(url->ext->visible_part, + visible_part.data(), + visible_part.size() + 1); + dlen = visible_part.size(); + + /* Strip unicode spaces from the start and the end */ + url->ext->visible_part = const_cast<char *>( + rspamd_string_unicode_trim_inplace(url->ext->visible_part, + &dlen)); + auto maybe_url = html_url_is_phished(pool, url, + {url->ext->visible_part, dlen}); + + if (maybe_url) { + url->flags |= saved_flags; + displayed_url = maybe_url.value(); + } + + if (exceptions && displayed_url != nullptr) { + ex = rspamd_mempool_alloc_type(pool, struct rspamd_process_exception); + ex->pos = href_offset; + ex->len = dlen; + ex->type = RSPAMD_EXCEPTION_URL; + ex->ptr = url; + + *exceptions = g_list_prepend(*exceptions, ex); + } + + if (displayed_url && url_set) { + turl = rspamd_url_set_add_or_return((khash_t(rspamd_url_hash) *) url_set, displayed_url); + + if (turl != nullptr) { + /* Here, we assume the following: + * if we have a URL in the text part which + * is the same as displayed URL in the + * HTML part, we assume that it is also + * hint only. + */ + if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) { + + /* + * We have the same URL for href and displayed url, so we + * know that this url cannot be both target and display (as + * it breaks logic in many places), so we do not + * propagate html flags + */ + if (!(turl->flags & RSPAMD_URL_FLAG_DISPLAY_URL)) { + turl->flags |= displayed_url->flags; + } + turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT; + } + + turl->count++; + } + else { + /* Already inserted by `rspamd_url_set_add_or_return` */ + } + } + + rspamd_normalise_unicode_inplace(url->ext->visible_part, &dlen); +} + +auto html_process_url(rspamd_mempool_t *pool, std::string_view &input) + -> std::optional<struct rspamd_url *> +{ + struct rspamd_url *url; + guint saved_flags = 0; + gint rc; + const gchar *s, *prefix = "http://"; + gchar *d; + gsize dlen; + gboolean has_bad_chars = FALSE, no_prefix = FALSE; + static const gchar hexdigests[] = "0123456789abcdef"; + + auto sz = input.length(); + const auto *trimmed = rspamd_string_unicode_trim_inplace(input.data(), &sz); + input = {trimmed, sz}; + + const auto *start = input.data(); + s = start; + dlen = 0; + + for (auto i = 0; i < sz; i++) { + if (G_UNLIKELY(((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) { + dlen += 3; + } + else { + dlen++; + } + } + + if (rspamd_substring_search(start, sz, "://", 3) == -1) { + if (sz >= sizeof("mailto:") && + (memcmp(start, "mailto:", sizeof("mailto:") - 1) == 0 || + memcmp(start, "tel:", sizeof("tel:") - 1) == 0 || + memcmp(start, "callto:", sizeof("callto:") - 1) == 0)) { + /* Exclusion, has valid but 'strange' prefix */ + } + else { + for (auto i = 0; i < sz; i++) { + if (!((s[i] & 0x80) || g_ascii_isalnum(s[i]))) { + if (i == 0 && sz > 2 && s[i] == '/' && s[i + 1] == '/') { + prefix = "http:"; + dlen += sizeof("http:") - 1; + no_prefix = TRUE; + } + else if (s[i] == '@') { + /* Likely email prefix */ + prefix = "mailto://"; + dlen += sizeof("mailto://") - 1; + no_prefix = TRUE; + } + else if (s[i] == ':' && i != 0) { + /* Special case */ + no_prefix = FALSE; + } + else { + if (i == 0) { + /* No valid data */ + return std::nullopt; + } + else { + no_prefix = TRUE; + dlen += strlen(prefix); + } + } + + break; + } + } + } + } + + auto *decoded = rspamd_mempool_alloc_buffer(pool, dlen + 1); + d = decoded; + + if (no_prefix) { + gsize plen = strlen(prefix); + memcpy(d, prefix, plen); + d += plen; + } + + /* + * We also need to remove all internal newlines, spaces + * and encode unsafe characters + * Another obfuscation find in the wild was encoding of the SAFE url characters, + * including essential ones + */ + for (auto i = 0; i < sz; i++) { + if (G_UNLIKELY(g_ascii_isspace(s[i]))) { + continue; + } + else if (G_UNLIKELY(((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) { + /* URL encode */ + *d++ = '%'; + *d++ = hexdigests[(s[i] >> 4) & 0xf]; + *d++ = hexdigests[s[i] & 0xf]; + has_bad_chars = TRUE; + } + else if (G_UNLIKELY(s[i] == '%')) { + if (i + 2 < sz) { + auto c1 = s[i + 1]; + auto c2 = s[i + 2]; + + if (g_ascii_isxdigit(c1) && g_ascii_isxdigit(c2)) { + auto codepoint = 0; + + if (c1 >= '0' && c1 <= '9') codepoint = c1 - '0'; + else if (c1 >= 'A' && c1 <= 'F') + codepoint = c1 - 'A' + 10; + else if (c1 >= 'a' && c1 <= 'f') + codepoint = c1 - 'a' + 10; + + codepoint <<= 4; + + if (c2 >= '0' && c2 <= '9') codepoint += c2 - '0'; + else if (c2 >= 'A' && c2 <= 'F') + codepoint += c2 - 'A' + 10; + else if (c2 >= 'a' && c2 <= 'f') + codepoint += c2 - 'a' + 10; + + /* Now check for 'interesting' codepoints */ + if (codepoint == '@' || codepoint == ':' || codepoint == '|' || + codepoint == '?' || codepoint == '\\' || codepoint == '/') { + /* Replace it back */ + *d++ = (char) (codepoint & 0xff); + i += 2; + } + else { + *d++ = s[i]; + } + } + else { + *d++ = s[i]; + } + } + else { + *d++ = s[i]; + } + } + else { + *d++ = s[i]; + } + } + + *d = '\0'; + dlen = d - decoded; + + url = rspamd_mempool_alloc0_type(pool, struct rspamd_url); + rspamd_url_normalise_propagate_flags(pool, decoded, &dlen, saved_flags); + rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF); + + /* Filter some completely damaged urls */ + if (rc == URI_ERRNO_OK && url->hostlen > 0 && + !((url->protocol & PROTOCOL_UNKNOWN))) { + url->flags |= saved_flags; + + if (has_bad_chars) { + url->flags |= RSPAMD_URL_FLAG_OBSCURED; + } + + if (no_prefix) { + url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; + + if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) { + /* Ignore urls with both no schema and no tld */ + return std::nullopt; + } + } + + decoded = url->string; + + input = {decoded, url->urllen}; + + /* Spaces in href usually mean an attempt to obfuscate URL */ + /* See https://github.com/vstakhov/rspamd/issues/593 */ +#if 0 + if (has_spaces) { + url->flags |= RSPAMD_URL_FLAG_OBSCURED; + } +#endif + + return url; + } + + return std::nullopt; +} + +}// namespace rspamd::html
\ No newline at end of file |