1 files changed, 2393 insertions, 0 deletions
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
new file mode 100644
index 0000000..5861d45
--- /dev/null
+++ b/src/libserver/html/html.cxx
@@ -0,0 +1,2393 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "config.h"
+#include "util.h"
+#include "message.h"
+#include "html.h"
+#include "html_tags.h"
+#include "html_block.hxx"
+#include "html.hxx"
+#include "libserver/css/css_value.hxx"
+#include "libserver/css/css.hxx"
+#include "libserver/task.h"
+#include "libserver/cfg_file.h"
+
+#include "url.h"
+#include "contrib/libucl/khash.h"
+#include "libmime/images.h"
+#include "libutil/cxx/utf8_util.h"
+
+#include "html_tag_defs.hxx"
+#include "html_entities.hxx"
+#include "html_tag.hxx"
+#include "html_url.hxx"
+
+#include <frozen/unordered_map.h>
+#include <frozen/string.h>
+#include <fmt/core.h>
+
+#include <unicode/uversion.h>
+
+namespace rspamd::html {
+
+static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
+
+static const html_tags_storage html_tags_defs;
+
+auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_type>(
+	{
+		{"name", html_component_type::RSPAMD_HTML_COMPONENT_NAME},
+		{"href", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
+		{"src", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
+		{"action", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
+		{"color", html_component_type::RSPAMD_HTML_COMPONENT_COLOR},
+		{"bgcolor", html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR},
+		{"style", html_component_type::RSPAMD_HTML_COMPONENT_STYLE},
+		{"class", html_component_type::RSPAMD_HTML_COMPONENT_CLASS},
+		{"width", html_component_type::RSPAMD_HTML_COMPONENT_WIDTH},
+		{"height", html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT},
+		{"size", html_component_type::RSPAMD_HTML_COMPONENT_SIZE},
+		{"rel", html_component_type::RSPAMD_HTML_COMPONENT_REL},
+		{"alt", html_component_type::RSPAMD_HTML_COMPONENT_ALT},
+		{"id", html_component_type::RSPAMD_HTML_COMPONENT_ID},
+		{"hidden", html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN},
+	});
+
+#define msg_debug_html(...) rspamd_conditional_debug_fast(NULL, NULL,                                \
+														  rspamd_html_log_id, "html", pool->tag.uid, \
+														  __FUNCTION__,                              \
+														  __VA_ARGS__)
+
+INIT_LOG_MODULE(html)
+
+/*
+ * This function is expected to be called on a closing tag to fill up all tags
+ * and return the current parent (meaning unclosed) tag
+ */
+static auto
+html_check_balance(struct html_content *hc,
+				   struct html_tag *tag,
+				   goffset tag_start_offset,
+				   goffset tag_end_offset) -> html_tag *
+{
+	/* As agreed, the closing tag has the last opening at the parent ptr */
+	auto *opening_tag = tag->parent;
+
+	auto calculate_content_length = [tag_start_offset, tag_end_offset](html_tag *t) {
+		auto opening_content_offset = t->content_offset;
+
+		if (t->flags & (CM_EMPTY)) {
+			/* Attach closing tag just at the opening tag */
+			t->closing.start = t->tag_start;
+			t->closing.end = t->content_offset;
+		}
+		else {
+
+			if (opening_content_offset <= tag_start_offset) {
+				t->closing.start = tag_start_offset;
+				t->closing.end = tag_end_offset;
+			}
+			else {
+
+				t->closing.start = t->content_offset;
+				t->closing.end = tag_end_offset;
+			}
+		}
+	};
+
+	auto balance_tag = [&]() -> html_tag * {
+		auto it = tag->parent;
+		auto found_pair = false;
+
+		for (; it != nullptr; it = it->parent) {
+			if (it->id == tag->id && !(it->flags & FL_CLOSED)) {
+				found_pair = true;
+				break;
+			}
+		}
+
+		/*
+		 * If we have found a closing pair, then we need to close all tags and
+		 * return the top-most tag
+		 */
+		if (found_pair) {
+			for (it = tag->parent; it != nullptr; it = it->parent) {
+				it->flags |= FL_CLOSED;
+				/* Insert a virtual closing tag for all tags that are not closed */
+				calculate_content_length(it);
+				if (it->id == tag->id && !(it->flags & FL_CLOSED)) {
+					break;
+				}
+			}
+
+			return it;
+		}
+		else {
+			/*
+			 * We have not found a pair, so this closing tag is bogus and should
+			 * be ignored completely.
+			 * Unfortunately, it also means that we need to insert another tag,
+			 * as the current closing tag is unusable for that purposes.
+			 *
+			 * We assume that callee will recognise that and reconstruct the
+			 * tag at the tag_end_closing state, so we return nullptr...
+			 */
+		}
+
+		/* Tag must be ignored and reconstructed */
+		return nullptr;
+	};
+
+	if (opening_tag) {
+
+		if (opening_tag->id == tag->id) {
+			opening_tag->flags |= FL_CLOSED;
+
+			calculate_content_length(opening_tag);
+			/* All good */
+			return opening_tag->parent;
+		}
+		else {
+			return balance_tag();
+		}
+	}
+	else {
+		/*
+		 * We have no opening tag
+		 * There are two possibilities:
+		 *
+		 * 1) We have some block tag in hc->all_tags;
+		 * 2) We have no tags
+		 */
+
+		if (hc->all_tags.empty()) {
+			hc->all_tags.push_back(std::make_unique<html_tag>());
+			auto *vtag = hc->all_tags.back().get();
+			vtag->id = Tag_HTML;
+			vtag->flags = FL_VIRTUAL;
+			vtag->tag_start = 0;
+			vtag->content_offset = 0;
+			calculate_content_length(vtag);
+
+			if (!hc->root_tag) {
+				hc->root_tag = vtag;
+			}
+			else {
+				vtag->parent = hc->root_tag;
+			}
+
+			tag->parent = vtag;
+
+			/* Recursively call with a virtual <html> tag inserted */
+			return html_check_balance(hc, tag, tag_start_offset, tag_end_offset);
+		}
+	}
+
+	return nullptr;
+}
+
+auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>
+{
+	auto known_component_it = html_components_map.find(st);
+
+	if (known_component_it != html_components_map.end()) {
+		return known_component_it->second;
+	}
+	else {
+		return std::nullopt;
+	}
+}
+
+enum tag_parser_state {
+	parse_start = 0,
+	parse_name,
+	parse_attr_name,
+	parse_equal,
+	parse_start_dquote,
+	parse_dqvalue,
+	parse_end_dquote,
+	parse_start_squote,
+	parse_sqvalue,
+	parse_end_squote,
+	parse_value,
+	spaces_before_eq,
+	spaces_after_eq,
+	spaces_after_param,
+	ignore_bad_tag,
+	tag_end,
+	slash_after_value,
+	slash_in_unquoted_value,
+};
+struct tag_content_parser_state {
+	tag_parser_state cur_state = parse_start;
+	std::string buf;
+	std::optional<html_component_type> cur_component;
+
+	void reset()
+	{
+		cur_state = parse_start;
+		buf.clear();
+		cur_component = std::nullopt;
+	}
+};
+
+static inline void
+html_parse_tag_content(rspamd_mempool_t *pool,
+					   struct html_content *hc,
+					   struct html_tag *tag,
+					   const char *in,
+					   struct tag_content_parser_state &parser_env)
+{
+	auto state = parser_env.cur_state;
+
+	/*
+	 * Stores tag component if it doesn't exist, performing copy of the
+	 * value + decoding of the entities
+	 * Parser env is set to clear the current html attribute fields (saved_p and
+	 * cur_component)
+	 */
+	auto store_component_value = [&]() -> void {
+		if (parser_env.cur_component) {
+
+			if (parser_env.buf.empty()) {
+				tag->components.emplace_back(parser_env.cur_component.value(),
+											 std::string_view{});
+			}
+			else {
+				/* We need to copy buf to a persistent storage */
+				auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size());
+
+				if (parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_ID ||
+					parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) {
+					/* Lowercase */
+					rspamd_str_copy_lc(parser_env.buf.data(), s, parser_env.buf.size());
+				}
+				else {
+					memcpy(s, parser_env.buf.data(), parser_env.buf.size());
+				}
+
+				auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size());
+				tag->components.emplace_back(parser_env.cur_component.value(),
+											 std::string_view{s, sz});
+			}
+		}
+
+		parser_env.buf.clear();
+		parser_env.cur_component = std::nullopt;
+	};
+
+	auto store_component_name = [&]() -> bool {
+		decode_html_entitles_inplace(parser_env.buf);
+		auto known_component_it = html_components_map.find(std::string_view{parser_env.buf});
+		parser_env.buf.clear();
+
+		if (known_component_it != html_components_map.end()) {
+			parser_env.cur_component = known_component_it->second;
+
+			return true;
+		}
+		else {
+			parser_env.cur_component = std::nullopt;
+		}
+
+		return false;
+	};
+
+	auto store_value_character = [&](bool lc) -> void {
+		auto c = lc ? g_ascii_tolower(*in) : *in;
+
+		if (c == '\0') {
+			/* Replace with u0FFD */
+			parser_env.buf.append((const char *) u8"\uFFFD");
+		}
+		else {
+			parser_env.buf.push_back(c);
+		}
+	};
+
+	switch (state) {
+	case parse_start:
+		if (!g_ascii_isalpha(*in) && !g_ascii_isspace(*in)) {
+			hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+			state = ignore_bad_tag;
+			tag->id = N_TAGS;
+			tag->flags |= FL_BROKEN;
+		}
+		else if (g_ascii_isalpha(*in)) {
+			state = parse_name;
+			store_value_character(true);
+		}
+		break;
+
+	case parse_name:
+		if ((g_ascii_isspace(*in) || *in == '>' || *in == '/')) {
+			if (*in == '/') {
+				tag->flags |= FL_CLOSED;
+			}
+
+			if (parser_env.buf.empty()) {
+				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+				tag->id = N_TAGS;
+				tag->flags |= FL_BROKEN;
+				state = ignore_bad_tag;
+			}
+			else {
+				decode_html_entitles_inplace(parser_env.buf);
+				const auto *tag_def = rspamd::html::html_tags_defs.by_name(parser_env.buf);
+
+				if (tag_def == nullptr) {
+					hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
+					/* Assign -hash to match closing tag if needed */
+					auto nhash = static_cast<std::int32_t>(std::hash<std::string>{}(parser_env.buf));
+					/* Always negative */
+					tag->id = static_cast<tag_id_t>(nhash | G_MININT32);
+				}
+				else {
+					tag->id = tag_def->id;
+					tag->flags = tag_def->flags;
+				}
+
+				parser_env.buf.clear();
+
+				state = spaces_after_param;
+			}
+		}
+		else {
+			store_value_character(true);
+		}
+		break;
+
+	case parse_attr_name:
+		if (*in == '=') {
+			if (!parser_env.buf.empty()) {
+				store_component_name();
+			}
+			state = parse_equal;
+		}
+		else if (g_ascii_isspace(*in)) {
+			store_component_name();
+			state = spaces_before_eq;
+		}
+		else if (*in == '/') {
+			store_component_name();
+			store_component_value();
+			state = slash_after_value;
+		}
+		else if (*in == '>') {
+			store_component_name();
+			store_component_value();
+			state = tag_end;
+		}
+		else {
+			if (*in == '"' || *in == '\'' || *in == '<') {
+				/* Should never be in attribute names but ignored */
+				tag->flags |= FL_BROKEN;
+			}
+
+			store_value_character(true);
+		}
+
+		break;
+
+	case spaces_before_eq:
+		if (*in == '=') {
+			state = parse_equal;
+		}
+		else if (!g_ascii_isspace(*in)) {
+			/*
+			 * HTML defines that crap could still be restored and
+			 * calculated somehow... So we have to follow this stupid behaviour
+			 */
+			/*
+			 * TODO: estimate what insane things do email clients in each case
+			 */
+			if (*in == '>') {
+				/*
+				 * Attribute name followed by end of tag
+				 * Should be okay (empty attribute). The rest is handled outside
+				 * this automata.
+				 */
+				store_component_value();
+				state = tag_end;
+			}
+			else if (*in == '"' || *in == '\'' || *in == '<') {
+				/* Attribute followed by quote... Missing '=' ? Dunno, need to test */
+				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+				tag->flags |= FL_BROKEN;
+				store_component_value();
+				store_value_character(true);
+				state = spaces_after_param;
+			}
+			else {
+				/* Empty attribute */
+				store_component_value();
+				store_value_character(true);
+				state = spaces_after_param;
+			}
+		}
+		break;
+
+	case spaces_after_eq:
+		if (*in == '"') {
+			state = parse_start_dquote;
+		}
+		else if (*in == '\'') {
+			state = parse_start_squote;
+		}
+		else if (!g_ascii_isspace(*in)) {
+			store_value_character(true);
+			state = parse_value;
+		}
+		break;
+
+	case parse_equal:
+		if (g_ascii_isspace(*in)) {
+			state = spaces_after_eq;
+		}
+		else if (*in == '"') {
+			state = parse_start_dquote;
+		}
+		else if (*in == '\'') {
+			state = parse_start_squote;
+		}
+		else {
+			store_value_character(true);
+			state = parse_value;
+		}
+		break;
+
+	case parse_start_dquote:
+		if (*in == '"') {
+			state = spaces_after_param;
+		}
+		else {
+			store_value_character(false);
+			state = parse_dqvalue;
+		}
+		break;
+
+	case parse_start_squote:
+		if (*in == '\'') {
+			state = spaces_after_param;
+		}
+		else {
+			store_value_character(false);
+			state = parse_sqvalue;
+		}
+		break;
+
+	case parse_dqvalue:
+		if (*in == '"') {
+			store_component_value();
+			state = parse_end_dquote;
+		}
+		else {
+			store_value_character(false);
+		}
+		break;
+
+	case parse_sqvalue:
+		if (*in == '\'') {
+			store_component_value();
+			state = parse_end_squote;
+		}
+		else {
+			store_value_character(false);
+		}
+
+		break;
+
+	case parse_value:
+		if (*in == '/') {
+			state = slash_in_unquoted_value;
+		}
+		else if (g_ascii_isspace(*in) || *in == '>' || *in == '"') {
+			store_component_value();
+			state = spaces_after_param;
+		}
+		else {
+			store_value_character(false);
+		}
+		break;
+
+	case parse_end_dquote:
+	case parse_end_squote:
+		if (g_ascii_isspace(*in)) {
+			state = spaces_after_param;
+		}
+		else if (*in == '/') {
+			store_component_value();
+			store_value_character(true);
+			state = slash_after_value;
+		}
+		else {
+			/* No space, proceed immediately to the attribute name */
+			state = parse_attr_name;
+			store_component_value();
+			store_value_character(true);
+		}
+		break;
+
+	case spaces_after_param:
+		if (!g_ascii_isspace(*in)) {
+			if (*in == '/') {
+				state = slash_after_value;
+			}
+			else if (*in == '=') {
+				/* Attributes cannot start with '=' */
+				tag->flags |= FL_BROKEN;
+				store_value_character(true);
+				state = parse_attr_name;
+			}
+			else {
+				store_value_character(true);
+				state = parse_attr_name;
+			}
+		}
+		break;
+	case slash_after_value:
+		if (*in == '>') {
+			tag->flags |= FL_CLOSED;
+			state = tag_end;
+		}
+		else if (!g_ascii_isspace(*in)) {
+			tag->flags |= FL_BROKEN;
+			state = parse_attr_name;
+		}
+		break;
+	case slash_in_unquoted_value:
+		if (*in == '>') {
+			/* That slash was in fact closing tag slash, woohoo */
+			tag->flags |= FL_CLOSED;
+			state = tag_end;
+			store_component_value();
+		}
+		else {
+			/* Welcome to the world of html, revert state and save missing / */
+			parser_env.buf.push_back('/');
+			store_value_character(false);
+			state = parse_value;
+		}
+		break;
+	case ignore_bad_tag:
+	case tag_end:
+		break;
+	}
+
+	parser_env.cur_state = state;
+}
+
+static inline auto
+html_is_absolute_url(std::string_view st) -> bool
+{
+	auto alnum_pos = std::find_if(std::begin(st), std::end(st),
+								  [](auto c) { return !g_ascii_isalnum(c); });
+
+	if (alnum_pos != std::end(st) && alnum_pos != std::begin(st)) {
+		if (*alnum_pos == ':') {
+			if (st.substr(0, std::distance(std::begin(st), alnum_pos)) == "mailto") {
+				return true;
+			}
+
+			std::advance(alnum_pos, 1);
+			if (alnum_pos != std::end(st)) {
+				/* Include even malformed urls */
+				if (*alnum_pos == '/' || *alnum_pos == '\\') {
+					return true;
+				}
+			}
+		}
+	}
+
+	return false;
+}
+
+static auto
+html_process_url_tag(rspamd_mempool_t *pool,
+					 struct html_tag *tag,
+					 struct html_content *hc) -> std::optional<struct rspamd_url *>
+{
+	auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
+
+	if (found_href_maybe) {
+		/* Check base url */
+		auto &href_value = found_href_maybe.value();
+
+		if (hc && hc->base_url) {
+			/*
+			 * Relative url cannot start from the following:
+			 * schema://
+			 * data:
+			 * slash
+			 */
+
+			if (!html_is_absolute_url(href_value)) {
+
+				if (href_value.size() >= sizeof("data:") &&
+					g_ascii_strncasecmp(href_value.data(), "data:", sizeof("data:") - 1) == 0) {
+					/* Image data url, never insert as url */
+					return std::nullopt;
+				}
+
+				/* Assume relative url */
+				auto need_slash = false;
+
+				auto orig_len = href_value.size();
+				auto len = orig_len + hc->base_url->urllen;
+
+				if (hc->base_url->datalen == 0) {
+					need_slash = true;
+					len++;
+				}
+
+				auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
+				auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1,
+														  "%*s%s%*s",
+														  (int) hc->base_url->urllen, hc->base_url->string,
+														  need_slash ? "/" : "",
+														  (gint) orig_len, href_value.data());
+				href_value = {buf, nlen};
+			}
+			else if (href_value.size() > 2 && href_value[0] == '/' && href_value[1] != '/') {
+				/* Relative to the hostname */
+				auto orig_len = href_value.size();
+				auto len = orig_len + hc->base_url->hostlen + hc->base_url->protocollen +
+						   3 /* for :// */;
+				auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
+				auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s",
+														  (int) hc->base_url->protocollen, hc->base_url->string,
+														  (int) hc->base_url->hostlen, rspamd_url_host_unsafe(hc->base_url),
+														  (gint) orig_len, href_value.data());
+				href_value = {buf, nlen};
+			}
+		}
+
+		auto url = html_process_url(pool, href_value).value_or(nullptr);
+
+		if (url) {
+			if (tag->id != Tag_A) {
+				/* Mark special tags special */
+				url->flags |= RSPAMD_URL_FLAG_SPECIAL;
+			}
+
+			if (std::holds_alternative<std::monostate>(tag->extra)) {
+				tag->extra = url;
+			}
+
+			return url;
+		}
+
+		return std::nullopt;
+	}
+
+	return std::nullopt;
+}
+
+struct rspamd_html_url_query_cbd {
+	rspamd_mempool_t *pool;
+	khash_t(rspamd_url_hash) * url_set;
+	struct rspamd_url *url;
+	GPtrArray *part_urls;
+};
+
+static gboolean
+html_url_query_callback(struct rspamd_url *url, gsize start_offset,
+						gsize end_offset, gpointer ud)
+{
+	struct rspamd_html_url_query_cbd *cbd =
+		(struct rspamd_html_url_query_cbd *) ud;
+	rspamd_mempool_t *pool;
+
+	pool = cbd->pool;
+
+	if (url->protocol == PROTOCOL_MAILTO) {
+		if (url->userlen == 0) {
+			return FALSE;
+		}
+	}
+
+	msg_debug_html("found url %s in query of url"
+				   " %*s",
+				   url->string,
+				   cbd->url->querylen, rspamd_url_query_unsafe(cbd->url));
+
+	url->flags |= RSPAMD_URL_FLAG_QUERY;
+
+	if (rspamd_url_set_add_or_increase(cbd->url_set, url, false) && cbd->part_urls) {
+		g_ptr_array_add(cbd->part_urls, url);
+	}
+
+	return TRUE;
+}
+
+static void
+html_process_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
+					   khash_t(rspamd_url_hash) * url_set,
+					   GPtrArray *part_urls)
+{
+	if (url->querylen > 0) {
+		struct rspamd_html_url_query_cbd qcbd;
+
+		qcbd.pool = pool;
+		qcbd.url_set = url_set;
+		qcbd.url = url;
+		qcbd.part_urls = part_urls;
+
+		rspamd_url_find_multiple(pool,
+								 rspamd_url_query_unsafe(url), url->querylen,
+								 RSPAMD_URL_FIND_ALL, NULL,
+								 html_url_query_callback, &qcbd);
+	}
+
+	if (part_urls) {
+		g_ptr_array_add(part_urls, url);
+	}
+}
+
+static auto
+html_process_data_image(rspamd_mempool_t *pool,
+						struct html_image *img,
+						std::string_view input) -> void
+{
+	/*
+	 * Here, we do very basic processing of the data:
+	 * detect if we have something like: `data:image/xxx;base64,yyyzzz==`
+	 * We only parse base64 encoded data.
+	 * We ignore content type so far
+	 */
+	struct rspamd_image *parsed_image;
+	const gchar *semicolon_pos = input.data(),
+				*end = input.data() + input.size();
+
+	if ((semicolon_pos = (const gchar *) memchr(semicolon_pos, ';', end - semicolon_pos)) != NULL) {
+		if (end - semicolon_pos > sizeof("base64,")) {
+			if (memcmp(semicolon_pos + 1, "base64,", sizeof("base64,") - 1) == 0) {
+				const gchar *data_pos = semicolon_pos + sizeof("base64,");
+				gchar *decoded;
+				gsize encoded_len = end - data_pos, decoded_len;
+				rspamd_ftok_t inp;
+
+				decoded_len = (encoded_len / 4 * 3) + 12;
+				decoded = rspamd_mempool_alloc_buffer(pool, decoded_len);
+				rspamd_cryptobox_base64_decode(data_pos, encoded_len,
+											   reinterpret_cast<guchar *>(decoded), &decoded_len);
+				inp.begin = decoded;
+				inp.len = decoded_len;
+
+				parsed_image = rspamd_maybe_process_image(pool, &inp);
+
+				if (parsed_image) {
+					msg_debug_html("detected %s image of size %ud x %ud in data url",
+								   rspamd_image_type_str(parsed_image->type),
+								   parsed_image->width, parsed_image->height);
+					img->embedded_image = parsed_image;
+				}
+			}
+		}
+		else {
+			/* Nothing useful */
+			return;
+		}
+	}
+}
+
+static void
+html_process_img_tag(rspamd_mempool_t *pool,
+					 struct html_tag *tag,
+					 struct html_content *hc,
+					 khash_t(rspamd_url_hash) * url_set,
+					 GPtrArray *part_urls)
+{
+	struct html_image *img;
+
+	img = rspamd_mempool_alloc0_type(pool, struct html_image);
+	img->tag = tag;
+
+	for (const auto &param: tag->components) {
+
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) {
+			/* Check base url */
+			const auto &href_value = param.value;
+
+			if (href_value.size() > 0) {
+				rspamd_ftok_t fstr;
+				fstr.begin = href_value.data();
+				fstr.len = href_value.size();
+				img->src = rspamd_mempool_ftokdup(pool, &fstr);
+
+				if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(),
+																	 "cid:", sizeof("cid:") - 1) == 0) {
+					/* We have an embedded image */
+					img->src += sizeof("cid:") - 1;
+					img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
+				}
+				else {
+					if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(),
+																		  "data:", sizeof("data:") - 1) == 0) {
+						/* We have an embedded image in HTML tag */
+						img->flags |=
+							(RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
+						html_process_data_image(pool, img, href_value);
+						hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
+					}
+					else {
+						img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
+						if (img->src) {
+
+							std::string_view cpy{href_value};
+							auto maybe_url = html_process_url(pool, cpy);
+
+							if (maybe_url) {
+								img->url = maybe_url.value();
+								struct rspamd_url *existing;
+
+								img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
+								existing = rspamd_url_set_add_or_return(url_set,
+																		img->url);
+
+								if (existing && existing != img->url) {
+									/*
+									 * We have some other URL that could be
+									 * found, e.g. from another part. However,
+									 * we still want to set an image flag on it
+									 */
+									existing->flags |= img->url->flags;
+									existing->count++;
+								}
+								else if (part_urls) {
+									/* New url */
+									g_ptr_array_add(part_urls, img->url);
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+
+
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) {
+			unsigned long val;
+
+			rspamd_strtoul(param.value.data(), param.value.size(), &val);
+			img->height = val;
+		}
+
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) {
+			unsigned long val;
+
+			rspamd_strtoul(param.value.data(), param.value.size(), &val);
+			img->width = val;
+		}
+
+		/* TODO: rework to css at some time */
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
+			if (img->height == 0) {
+				auto style_st = param.value;
+				auto pos = rspamd_substring_search_caseless(style_st.data(),
+															style_st.size(),
+															"height", sizeof("height") - 1);
+				if (pos != -1) {
+					auto substr = style_st.substr(pos + sizeof("height") - 1);
+
+					for (auto i = 0; i < substr.size(); i++) {
+						auto t = substr[i];
+						if (g_ascii_isdigit(t)) {
+							unsigned long val;
+							rspamd_strtoul(substr.data(),
+										   substr.size(), &val);
+							img->height = val;
+							break;
+						}
+						else if (!g_ascii_isspace(t) && t != '=' && t != ':') {
+							/* Fallback */
+							break;
+						}
+					}
+				}
+			}
+			if (img->width == 0) {
+				auto style_st = param.value;
+				auto pos = rspamd_substring_search_caseless(style_st.data(),
+															style_st.size(),
+															"width", sizeof("width") - 1);
+				if (pos != -1) {
+					auto substr = style_st.substr(pos + sizeof("width") - 1);
+
+					for (auto i = 0; i < substr.size(); i++) {
+						auto t = substr[i];
+						if (g_ascii_isdigit(t)) {
+							unsigned long val;
+							rspamd_strtoul(substr.data(),
+										   substr.size(), &val);
+							img->width = val;
+							break;
+						}
+						else if (!g_ascii_isspace(t) && t != '=' && t != ':') {
+							/* Fallback */
+							break;
+						}
+					}
+				}
+			}
+		}
+	}
+
+	if (img->embedded_image) {
+		if (img->height == 0) {
+			img->height = img->embedded_image->height;
+		}
+		if (img->width == 0) {
+			img->width = img->embedded_image->width;
+		}
+	}
+
+	hc->images.push_back(img);
+
+	if (std::holds_alternative<std::monostate>(tag->extra)) {
+		tag->extra = img;
+	}
+}
+
+static auto
+html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
+					  struct html_content *hc,
+					  khash_t(rspamd_url_hash) * url_set,
+					  GPtrArray *part_urls) -> void
+{
+	auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL);
+
+	if (found_rel_maybe) {
+		if (found_rel_maybe.value() == "icon") {
+			html_process_img_tag(pool, tag, hc, url_set, part_urls);
+		}
+	}
+}
+
+static auto
+html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
+					   struct html_content *hc) -> void
+{
+	std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor;
+	bool hidden = false;
+
+	for (const auto &param: tag->components) {
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) {
+			maybe_fgcolor = css::css_value::maybe_color_from_string(param.value);
+		}
+
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) {
+			maybe_bgcolor = css::css_value::maybe_color_from_string(param.value);
+		}
+
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
+			tag->block = rspamd::css::parse_css_declaration(pool, param.value);
+		}
+
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN) {
+			hidden = true;
+		}
+	}
+
+	if (!tag->block) {
+		tag->block = html_block::undefined_html_block_pool(pool);
+	}
+
+	if (hidden) {
+		tag->block->set_display(false);
+	}
+
+	if (maybe_fgcolor) {
+		tag->block->set_fgcolor(maybe_fgcolor->to_color().value());
+	}
+
+	if (maybe_bgcolor) {
+		tag->block->set_bgcolor(maybe_bgcolor->to_color().value());
+	}
+}
+
+static inline auto
+html_append_parsed(struct html_content *hc,
+				   std::string_view data,
+				   bool transparent,
+				   std::size_t input_len,
+				   std::string &dest) -> std::size_t
+{
+	auto cur_offset = dest.size();
+
+	if (dest.size() > input_len) {
+		/* Impossible case, refuse to append */
+		return 0;
+	}
+
+	if (data.size() > 0) {
+		/* Handle multiple spaces at the begin */
+
+		if (cur_offset > 0) {
+			auto last = dest.back();
+			if (!g_ascii_isspace(last) && g_ascii_isspace(data.front())) {
+				dest.append(" ");
+				data = {data.data() + 1, data.size() - 1};
+				cur_offset++;
+			}
+		}
+
+		if (data.find('\0') != std::string_view::npos) {
+			auto replace_zero_func = [](const auto &input, auto &output) {
+				const auto last = input.cend();
+				for (auto it = input.cbegin(); it != last; ++it) {
+					if (*it == '\0') {
+						output.append((const char *) u8"\uFFFD");
+					}
+					else {
+						output.push_back(*it);
+					}
+				}
+			};
+
+			dest.reserve(dest.size() + data.size() + sizeof(u8"\uFFFD"));
+			replace_zero_func(data, dest);
+			hc->flags |= RSPAMD_HTML_FLAG_HAS_ZEROS;
+		}
+		else {
+			dest.append(data);
+		}
+	}
+
+	auto nlen = decode_html_entitles_inplace(dest.data() + cur_offset,
+											 dest.size() - cur_offset, true);
+
+	dest.resize(nlen + cur_offset);
+
+	if (transparent) {
+		/* Replace all visible characters with spaces */
+		auto start = std::next(dest.begin(), cur_offset);
+		std::replace_if(
+			start, std::end(dest), [](const auto c) {
+				return !g_ascii_isspace(c);
+			},
+			' ');
+	}
+
+	return nlen;
+}
+
+static auto
+html_process_displayed_href_tag(rspamd_mempool_t *pool,
+								struct html_content *hc,
+								std::string_view data,
+								const struct html_tag *cur_tag,
+								GList **exceptions,
+								khash_t(rspamd_url_hash) * url_set,
+								goffset dest_offset) -> void
+{
+
+	if (std::holds_alternative<rspamd_url *>(cur_tag->extra)) {
+		auto *url = std::get<rspamd_url *>(cur_tag->extra);
+
+		html_check_displayed_url(pool,
+								 exceptions, url_set,
+								 data,
+								 dest_offset,
+								 url);
+	}
+}
+
+static auto
+html_append_tag_content(rspamd_mempool_t *pool,
+						const gchar *start, gsize len,
+						struct html_content *hc,
+						html_tag *tag,
+						GList **exceptions,
+						khash_t(rspamd_url_hash) * url_set) -> goffset
+{
+	auto is_visible = true, is_block = false, is_spaces = false, is_transparent = false;
+	goffset next_tag_offset = tag->closing.end,
+			initial_parsed_offset = hc->parsed.size(),
+			initial_invisible_offset = hc->invisible.size();
+
+	auto calculate_final_tag_offsets = [&]() -> void {
+		if (is_visible) {
+			tag->content_offset = initial_parsed_offset;
+			tag->closing.start = hc->parsed.size();
+		}
+		else {
+			tag->content_offset = initial_invisible_offset;
+			tag->closing.start = hc->invisible.size();
+		}
+	};
+
+	if (tag->closing.end == -1) {
+		if (tag->closing.start != -1) {
+			next_tag_offset = tag->closing.start;
+			tag->closing.end = tag->closing.start;
+		}
+		else {
+			next_tag_offset = tag->content_offset;
+			tag->closing.end = tag->content_offset;
+		}
+	}
+	if (tag->closing.start == -1) {
+		tag->closing.start = tag->closing.end;
+	}
+
+	auto append_margin = [&](char c) -> void {
+		/* We do care about visible margins only */
+		if (is_visible) {
+			if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') {
+				if (hc->parsed.back() == ' ') {
+					/* We also strip extra spaces at the end, but limiting the start */
+					auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_parsed_offset);
+					auto first = std::find_if(hc->parsed.rbegin(), last,
+											  [](auto ch) -> auto {
+												  return ch != ' ';
+											  });
+					hc->parsed.erase(first.base(), hc->parsed.end());
+					g_assert(hc->parsed.size() >= initial_parsed_offset);
+				}
+				hc->parsed.push_back(c);
+			}
+		}
+	};
+
+	if (tag->id == Tag_BR || tag->id == Tag_HR) {
+
+		if (!(tag->flags & FL_IGNORE)) {
+			hc->parsed.append("\n");
+		}
+
+		auto ret = tag->content_offset;
+		calculate_final_tag_offsets();
+
+		return ret;
+	}
+	else if ((tag->id == Tag_HEAD && (tag->flags & FL_IGNORE)) || (tag->flags & CM_HEAD)) {
+		auto ret = tag->closing.end;
+		calculate_final_tag_offsets();
+
+		return ret;
+	}
+
+	if ((tag->flags & (FL_COMMENT | FL_XML | FL_IGNORE | CM_HEAD))) {
+		is_visible = false;
+	}
+	else {
+		if (!tag->block) {
+			is_visible = true;
+		}
+		else if (!tag->block->is_visible()) {
+			if (!tag->block->is_transparent()) {
+				is_visible = false;
+			}
+			else {
+				if (tag->block->has_display() &&
+					tag->block->display == css::css_display_value::DISPLAY_HIDDEN) {
+					is_visible = false;
+				}
+				else {
+					is_transparent = true;
+				}
+			}
+		}
+		else {
+			if (tag->block->display == css::css_display_value::DISPLAY_BLOCK) {
+				is_block = true;
+			}
+			else if (tag->block->display == css::css_display_value::DISPLAY_TABLE_ROW) {
+				is_spaces = true;
+			}
+		}
+	}
+
+	if (is_block) {
+		append_margin('\n');
+	}
+	else if (is_spaces) {
+		append_margin(' ');
+	}
+
+	goffset cur_offset = tag->content_offset;
+
+	for (auto *cld: tag->children) {
+		auto enclosed_start = cld->tag_start;
+		goffset initial_part_len = enclosed_start - cur_offset;
+
+		if (initial_part_len > 0) {
+			if (is_visible) {
+				html_append_parsed(hc,
+								   {start + cur_offset, std::size_t(initial_part_len)},
+								   is_transparent, len, hc->parsed);
+			}
+			else {
+				html_append_parsed(hc,
+								   {start + cur_offset, std::size_t(initial_part_len)},
+								   is_transparent, len, hc->invisible);
+			}
+		}
+
+		auto next_offset = html_append_tag_content(pool, start, len,
+												   hc, cld, exceptions, url_set);
+
+		/* Do not allow shifting back */
+		if (next_offset > cur_offset) {
+			cur_offset = next_offset;
+		}
+	}
+
+	if (cur_offset < tag->closing.start) {
+		goffset final_part_len = tag->closing.start - cur_offset;
+
+		if (final_part_len > 0) {
+			if (is_visible) {
+				html_append_parsed(hc,
+								   {start + cur_offset, std::size_t(final_part_len)},
+								   is_transparent,
+								   len,
+								   hc->parsed);
+			}
+			else {
+				html_append_parsed(hc,
+								   {start + cur_offset, std::size_t(final_part_len)},
+								   is_transparent,
+								   len,
+								   hc->invisible);
+			}
+		}
+	}
+	if (is_block) {
+		append_margin('\n');
+	}
+	else if (is_spaces) {
+		append_margin(' ');
+	}
+
+	if (is_visible) {
+		if (tag->id == Tag_A) {
+			auto written_len = hc->parsed.size() - initial_parsed_offset;
+			html_process_displayed_href_tag(pool, hc,
+											{hc->parsed.data() + initial_parsed_offset, std::size_t(written_len)},
+											tag, exceptions,
+											url_set, initial_parsed_offset);
+		}
+		else if (tag->id == Tag_IMG) {
+			/* Process ALT if presented */
+			auto maybe_alt = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_ALT);
+
+			if (maybe_alt) {
+				if (!hc->parsed.empty() && !g_ascii_isspace(hc->parsed.back())) {
+					/* Add a space */
+					hc->parsed += ' ';
+				}
+
+				hc->parsed.append(maybe_alt.value());
+
+				if (!hc->parsed.empty() && !g_ascii_isspace(hc->parsed.back())) {
+					/* Add a space */
+					hc->parsed += ' ';
+				}
+			}
+		}
+	}
+	else {
+		/* Invisible stuff */
+		if (std::holds_alternative<rspamd_url *>(tag->extra)) {
+			auto *url_enclosed = std::get<rspamd_url *>(tag->extra);
+
+			/*
+			 * TODO: when hash is fixed to include flags we need to remove and add
+			 * url to the hash set
+			 */
+			if (url_enclosed) {
+				url_enclosed->flags |= RSPAMD_URL_FLAG_INVISIBLE;
+			}
+		}
+	}
+
+	calculate_final_tag_offsets();
+
+	return next_tag_offset;
+}
+
+auto html_process_input(struct rspamd_task *task,
+						GByteArray *in,
+						GList **exceptions,
+						khash_t(rspamd_url_hash) * url_set,
+						GPtrArray *part_urls,
+						bool allow_css,
+						std::uint16_t *cur_url_order) -> html_content *
+{
+	const gchar *p, *c, *end, *start;
+	guchar t;
+	auto closing = false;
+	guint obrace = 0, ebrace = 0;
+	struct rspamd_url *url = nullptr;
+	gint href_offset = -1;
+	auto overflow_input = false;
+	struct html_tag *cur_tag = nullptr, *parent_tag = nullptr, cur_closing_tag;
+	struct tag_content_parser_state content_parser_env;
+	auto process_size = in->len;
+
+
+	enum {
+		parse_start = 0,
+		content_before_start,
+		tag_begin,
+		sgml_tag,
+		xml_tag,
+		compound_tag,
+		comment_tag,
+		comment_content,
+		sgml_content,
+		tag_content,
+		tag_end_opening,
+		tag_end_closing,
+		html_text_content,
+		xml_tag_end,
+		tag_raw_text,
+		tag_raw_text_less_than,
+		tags_limit_overflow,
+	} state = parse_start;
+
+	enum class html_document_state {
+		doctype,
+		head,
+		body
+	} html_document_state = html_document_state::doctype;
+
+	g_assert(in != NULL);
+	g_assert(task != NULL);
+
+	auto *pool = task->task_pool;
+	auto cur_url_part_order = 0u;
+
+	auto *hc = new html_content;
+	rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc);
+
+	if (task->cfg && in->len > task->cfg->max_html_len) {
+		msg_notice_task("html input is too big: %z, limit is %z",
+						in->len,
+						task->cfg->max_html_len);
+		process_size = task->cfg->max_html_len;
+		overflow_input = true;
+	}
+
+	auto new_tag = [&](int flags = 0) -> struct html_tag *
+	{
+
+		if (hc->all_tags.size() > rspamd::html::max_tags) {
+			hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
+
+			return nullptr;
+		}
+
+		hc->all_tags.emplace_back(std::make_unique<html_tag>());
+		auto *ntag = hc->all_tags.back().get();
+		ntag->tag_start = c - start;
+		ntag->flags = flags;
+
+		if (cur_tag && !(cur_tag->flags & (CM_EMPTY | FL_CLOSED)) && cur_tag != &cur_closing_tag) {
+			parent_tag = cur_tag;
+		}
+
+		if (flags & FL_XML) {
+			return ntag;
+		}
+
+		return ntag;
+	};
+
+	auto process_opening_tag = [&]() {
+		if (cur_tag->id > Tag_UNKNOWN) {
+			if (cur_tag->flags & CM_UNIQUE) {
+				if (!hc->tags_seen[cur_tag->id]) {
+					/* Duplicate tag has been found */
+					hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
+				}
+			}
+			hc->tags_seen[cur_tag->id] = true;
+		}
+
+		/* Shift to the first unclosed tag */
+		auto *pt = parent_tag;
+		while (pt && (pt->flags & FL_CLOSED)) {
+			pt = pt->parent;
+		}
+
+		if (pt) {
+			g_assert(cur_tag != pt);
+			cur_tag->parent = pt;
+			g_assert(cur_tag->parent != &cur_closing_tag);
+			parent_tag = pt;
+			parent_tag->children.push_back(cur_tag);
+		}
+		else {
+			if (hc->root_tag) {
+				if (cur_tag != hc->root_tag) {
+					cur_tag->parent = hc->root_tag;
+					g_assert(cur_tag->parent != cur_tag);
+					hc->root_tag->children.push_back(cur_tag);
+					parent_tag = hc->root_tag;
+				}
+			}
+			else {
+				if (cur_tag->id == Tag_HTML) {
+					hc->root_tag = cur_tag;
+				}
+				else {
+					/* Insert a fake html tag */
+					hc->all_tags.emplace_back(std::make_unique<html_tag>());
+					auto *top_tag = hc->all_tags.back().get();
+					top_tag->tag_start = 0;
+					top_tag->flags = FL_VIRTUAL;
+					top_tag->id = Tag_HTML;
+					top_tag->content_offset = 0;
+					top_tag->children.push_back(cur_tag);
+					cur_tag->parent = top_tag;
+					g_assert(cur_tag->parent != cur_tag);
+					hc->root_tag = top_tag;
+					parent_tag = top_tag;
+				}
+			}
+		}
+
+		if (cur_tag->flags & FL_HREF && html_document_state == html_document_state::body) {
+			auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
+
+			if (maybe_url.has_value()) {
+				url = maybe_url.value();
+
+				if (url_set != NULL) {
+					struct rspamd_url *maybe_existing =
+						rspamd_url_set_add_or_return(url_set, maybe_url.value());
+					if (maybe_existing == maybe_url.value()) {
+						if (cur_url_order) {
+							url->order = (*cur_url_order)++;
+						}
+						url->part_order = cur_url_part_order++;
+						html_process_query_url(pool, url, url_set,
+											   part_urls);
+					}
+					else {
+						url = maybe_existing;
+						/* Replace extra as well */
+						cur_tag->extra = maybe_existing;
+						/* Increase count to avoid odd checks failure */
+						url->count++;
+					}
+				}
+				if (part_urls) {
+					g_ptr_array_add(part_urls, url);
+				}
+
+				href_offset = hc->parsed.size();
+			}
+		}
+		else if (cur_tag->id == Tag_BASE) {
+			/*
+			 * Base is allowed only within head tag but HTML is retarded
+			 */
+			auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
+
+			if (maybe_url) {
+				msg_debug_html("got valid base tag");
+				cur_tag->extra = maybe_url.value();
+				cur_tag->flags |= FL_HREF;
+
+				if (hc->base_url == nullptr) {
+					hc->base_url = maybe_url.value();
+				}
+				else {
+					msg_debug_html("ignore redundant base tag");
+				}
+			}
+			else {
+				msg_debug_html("got invalid base tag!");
+			}
+		}
+
+		if (cur_tag->id == Tag_IMG) {
+			html_process_img_tag(pool, cur_tag, hc, url_set,
+								 part_urls);
+		}
+		else if (cur_tag->id == Tag_LINK) {
+			html_process_link_tag(pool, cur_tag, hc, url_set,
+								  part_urls);
+		}
+
+		if (!(cur_tag->flags & CM_EMPTY)) {
+			html_process_block_tag(pool, cur_tag, hc);
+		}
+		else {
+			/* Implicitly close */
+			cur_tag->flags |= FL_CLOSED;
+		}
+
+		if (cur_tag->flags & FL_CLOSED) {
+			cur_tag->closing.end = cur_tag->content_offset;
+			cur_tag->closing.start = cur_tag->tag_start;
+
+			cur_tag = parent_tag;
+		}
+	};
+
+	p = (const char *) in->data;
+	c = p;
+	end = p + process_size;
+	start = c;
+
+	while (p < end) {
+		t = *p;
+
+		switch (state) {
+		case parse_start:
+			if (t == '<') {
+				state = tag_begin;
+			}
+			else {
+				/* We have no starting tag, so assume that it's content */
+				hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
+				cur_tag = new_tag();
+				html_document_state = html_document_state::body;
+
+				if (cur_tag) {
+					cur_tag->id = Tag_HTML;
+					hc->root_tag = cur_tag;
+					state = content_before_start;
+				}
+				else {
+					state = tags_limit_overflow;
+				}
+			}
+			break;
+		case content_before_start:
+			if (t == '<') {
+				state = tag_begin;
+			}
+			else {
+				p++;
+			}
+			break;
+		case tag_begin:
+			switch (t) {
+			case '<':
+				c = p;
+				p++;
+				closing = FALSE;
+				break;
+			case '!':
+				cur_tag = new_tag(FL_XML | FL_CLOSED);
+				if (cur_tag) {
+					state = sgml_tag;
+				}
+				else {
+					state = tags_limit_overflow;
+				}
+				p++;
+				break;
+			case '?':
+				cur_tag = new_tag(FL_XML | FL_CLOSED);
+				if (cur_tag) {
+					state = xml_tag;
+				}
+				else {
+					state = tags_limit_overflow;
+				}
+				hc->flags |= RSPAMD_HTML_FLAG_XML;
+				p++;
+				break;
+			case '/':
+				closing = TRUE;
+				/* We fill fake closing tag to fill it with the content parser */
+				cur_closing_tag.clear();
+				/*
+				 * For closing tags, we need to find some corresponding opening tag.
+				 * However, at this point we have not even parsed a name, so we
+				 * can not assume anything about balancing, etc.
+				 *
+				 * So we need to ensure that:
+				 * 1) We have some opening tag in the chain cur_tag->parent...
+				 * 2) cur_tag is nullptr - okay, html is just brain damaged
+				 * 3) cur_tag must NOT be equal to cur_closing tag. It means that
+				 * we had some poor closing tag but we still need to find an opening
+				 * tag... Somewhere...
+				 */
+
+				if (cur_tag == &cur_closing_tag) {
+					if (parent_tag != &cur_closing_tag) {
+						cur_closing_tag.parent = parent_tag;
+					}
+					else {
+						cur_closing_tag.parent = nullptr;
+					}
+				}
+				else if (cur_tag && cur_tag->flags & FL_CLOSED) {
+					/* Cur tag is already closed, we should find something else */
+					auto *tmp = cur_tag;
+					while (tmp) {
+						tmp = tmp->parent;
+
+						if (tmp == nullptr || !(tmp->flags & FL_CLOSED)) {
+							break;
+						}
+					}
+
+					cur_closing_tag.parent = tmp;
+				}
+				else {
+					cur_closing_tag.parent = cur_tag;
+				}
+
+				cur_tag = &cur_closing_tag;
+				p++;
+				break;
+			case '>':
+				/* Empty tag */
+				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+				state = html_text_content;
+				continue;
+			default:
+				if (g_ascii_isalpha(t)) {
+					state = tag_content;
+					content_parser_env.reset();
+
+					if (!closing) {
+						cur_tag = new_tag();
+					}
+
+					if (cur_tag) {
+						state = tag_content;
+					}
+					else {
+						state = tags_limit_overflow;
+					}
+				}
+				else {
+					/* Wrong bad tag */
+					state = html_text_content;
+				}
+				break;
+			}
+
+			break;
+
+		case sgml_tag:
+			switch (t) {
+			case '[':
+				state = compound_tag;
+				obrace = 1;
+				ebrace = 0;
+				p++;
+				break;
+			case '-':
+				cur_tag->flags |= FL_COMMENT;
+				state = comment_tag;
+				p++;
+				break;
+			default:
+				state = sgml_content;
+				break;
+			}
+
+			break;
+
+		case xml_tag:
+			if (t == '?') {
+				state = xml_tag_end;
+			}
+			else if (t == '>') {
+				/* Misformed xml tag */
+				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+				state = tag_end_opening;
+				continue;
+			}
+			/* We efficiently ignore xml tags */
+			p++;
+			break;
+
+		case xml_tag_end:
+			if (t == '>') {
+				state = tag_end_opening;
+				cur_tag->content_offset = p - start + 1;
+				continue;
+			}
+			else {
+				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+			}
+			p++;
+			break;
+
+		case compound_tag:
+			if (t == '[') {
+				obrace++;
+			}
+			else if (t == ']') {
+				ebrace++;
+			}
+			else if (t == '>' && obrace == ebrace) {
+				state = tag_end_opening;
+				cur_tag->content_offset = p - start + 1;
+				continue;
+			}
+			p++;
+			break;
+
+		case comment_tag:
+			if (t != '-') {
+				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+				state = tag_end_opening;
+			}
+			else {
+				p++;
+				ebrace = 0;
+				/*
+				 * https://www.w3.org/TR/2012/WD-html5-20120329/syntax.html#syntax-comments
+				 *  ... the text must not start with a single
+				 *  U+003E GREATER-THAN SIGN character (>),
+				 *  nor start with a "-" (U+002D) character followed by
+				 *  a U+003E GREATER-THAN SIGN (>) character,
+				 *  nor contain two consecutive U+002D HYPHEN-MINUS
+				 *  characters (--), nor end with a "-" (U+002D) character.
+				 */
+				if (p[0] == '-' && p + 1 < end && p[1] == '>') {
+					hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+					p++;
+					state = tag_end_opening;
+				}
+				else if (*p == '>') {
+					hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+					state = tag_end_opening;
+				}
+				else {
+					state = comment_content;
+				}
+			}
+			break;
+
+		case comment_content:
+			if (t == '-') {
+				ebrace++;
+			}
+			else if (t == '>' && ebrace >= 2) {
+				cur_tag->content_offset = p - start + 1;
+				state = tag_end_opening;
+				continue;
+			}
+			else {
+				ebrace = 0;
+			}
+
+			p++;
+			break;
+
+		case html_text_content:
+			if (t != '<') {
+				p++;
+			}
+			else {
+				state = tag_begin;
+			}
+			break;
+
+		case tag_raw_text:
+			if (t == '<') {
+				c = p;
+				state = tag_raw_text_less_than;
+			}
+			p++;
+			break;
+		case tag_raw_text_less_than:
+			if (t == '/') {
+				/* Here are special things: we look for obrace and then ensure
+				 * that if there is any closing brace nearby
+				 * (we look maximum at 30 characters). We also need to ensure
+				 * that we have no special characters, such as punctuation marks and
+				 * so on.
+				 * Basically, we validate the input to be sane.
+				 * Since closing tags must not have attributes, these assumptions
+				 * seems to be reasonable enough for our toy parser.
+				 */
+				gint cur_lookahead = 1;
+				gint max_lookahead = MIN(end - p, 30);
+				bool valid_closing_tag = true;
+
+				if (p + 1 < end && !g_ascii_isalpha(p[1])) {
+					valid_closing_tag = false;
+				}
+				else {
+					while (cur_lookahead < max_lookahead) {
+						gchar tt = p[cur_lookahead];
+						if (tt == '>') {
+							break;
+						}
+						else if (tt < '\n' || tt == ',') {
+							valid_closing_tag = false;
+							break;
+						}
+						cur_lookahead++;
+					}
+
+					if (cur_lookahead == max_lookahead) {
+						valid_closing_tag = false;
+					}
+				}
+
+				if (valid_closing_tag) {
+					/* Shift back */
+					p = c;
+					state = tag_begin;
+				}
+				else {
+					p++;
+					state = tag_raw_text;
+				}
+			}
+			else {
+				p++;
+				state = tag_raw_text;
+			}
+			break;
+		case sgml_content:
+			/* TODO: parse DOCTYPE here */
+			if (t == '>') {
+				cur_tag->content_offset = p - start + 1;
+				state = tag_end_opening;
+			}
+			else {
+				p++;
+			}
+			break;
+
+		case tag_content:
+			html_parse_tag_content(pool, hc, cur_tag, p, content_parser_env);
+
+			if (t == '>') {
+				if (content_parser_env.cur_state != parse_dqvalue && content_parser_env.cur_state != parse_sqvalue) {
+					/* We have a closing element */
+					if (closing) {
+						cur_tag->closing.start = c - start;
+						cur_tag->closing.end = p - start + 1;
+
+						closing = FALSE;
+						state = tag_end_closing;
+					}
+					else {
+						cur_tag->content_offset = p - start + 1;
+						state = tag_end_opening;
+					}
+				}
+				else {
+					/*
+					 * We are in the parse_quoted value state but got
+					 * an unescaped `>` character.
+					 * HTML is written for monkeys, so there are two possibilities:
+					 * 1) We have missing ending quote
+					 * 2) We have unescaped `>` character
+					 * How to distinguish between those possibilities?
+					 * Well, the idea is to do some lookahead and try to find a
+					 * quote. If we can find a quote, we just pretend as we have
+					 * not seen `>` character. Otherwise, we pretend that it is an
+					 * unquoted stuff. This logic is quite fragile but I really
+					 * don't know any better options...
+					 */
+					auto end_quote = content_parser_env.cur_state == parse_sqvalue ? '\'' : '"';
+					if (memchr(p, end_quote, end - p) != nullptr) {
+						/* Unencoded `>` */
+						p++;
+						continue;
+					}
+					else {
+						if (closing) {
+							cur_tag->closing.start = c - start;
+							cur_tag->closing.end = p - start + 1;
+
+							closing = FALSE;
+							state = tag_end_closing;
+						}
+						else {
+							cur_tag->content_offset = p - start + 1;
+							state = tag_end_opening;
+						}
+					}
+				}
+				continue;
+			}
+			p++;
+			break;
+
+		case tag_end_opening:
+			content_parser_env.reset();
+			state = html_text_content;
+
+			if (cur_tag) {
+				if (cur_tag->id == Tag_STYLE || cur_tag->id == Tag_NOSCRIPT || cur_tag->id == Tag_SCRIPT) {
+					state = tag_raw_text;
+				}
+				if (html_document_state == html_document_state::doctype) {
+					if (cur_tag->id == Tag_HEAD || (cur_tag->flags & CM_HEAD)) {
+						html_document_state = html_document_state::head;
+						cur_tag->flags |= FL_IGNORE;
+					}
+					else if (cur_tag->id != Tag_HTML) {
+						html_document_state = html_document_state::body;
+					}
+				}
+				else if (html_document_state == html_document_state::head) {
+					if (!(cur_tag->flags & (CM_EMPTY | CM_HEAD))) {
+						if (parent_tag && (parent_tag->id == Tag_HEAD || !(parent_tag->flags & CM_HEAD))) {
+							/*
+							 * As by standard, we have to close the HEAD tag
+							 * and switch to the body state
+							 */
+							parent_tag->flags |= FL_CLOSED;
+							parent_tag->closing.start = cur_tag->tag_start;
+							parent_tag->closing.end = cur_tag->content_offset;
+
+							html_document_state = html_document_state::body;
+						}
+						else if (cur_tag->id == Tag_BODY) {
+							html_document_state = html_document_state::body;
+						}
+						else {
+							/*
+							 * For propagation in something like
+							 * <title><p><a>ololo</a></p></title> - should be unprocessed
+							 */
+							cur_tag->flags |= CM_HEAD;
+						}
+					}
+				}
+
+				process_opening_tag();
+			}
+
+			p++;
+			c = p;
+			break;
+		case tag_end_closing: {
+			if (cur_tag) {
+
+				if (cur_tag->flags & CM_EMPTY) {
+					/* Ignore closing empty tags */
+					cur_tag->flags |= FL_IGNORE;
+				}
+				if (html_document_state == html_document_state::doctype) {
+				}
+				else if (html_document_state == html_document_state::head) {
+					if (cur_tag->id == Tag_HEAD) {
+						html_document_state = html_document_state::body;
+					}
+				}
+
+				/* cur_tag here is a closing tag */
+				auto *next_cur_tag = html_check_balance(hc, cur_tag,
+														c - start, p - start + 1);
+
+				if (cur_tag->id == Tag_STYLE && allow_css) {
+					auto *opening_tag = cur_tag->parent;
+
+					if (opening_tag && opening_tag->id == Tag_STYLE &&
+						(int) opening_tag->content_offset < opening_tag->closing.start) {
+						auto ret_maybe = rspamd::css::parse_css(pool,
+																{start + opening_tag->content_offset,
+																 opening_tag->closing.start - opening_tag->content_offset},
+																std::move(hc->css_style));
+
+						if (!ret_maybe.has_value()) {
+							if (ret_maybe.error().is_fatal()) {
+								auto err_str = fmt::format(
+									"cannot parse css (error code: {}): {}",
+									static_cast<int>(ret_maybe.error().type),
+									ret_maybe.error().description.value_or("unknown error"));
+								msg_info_pool("%*s", (int) err_str.size(), err_str.data());
+							}
+						}
+						else {
+							hc->css_style = ret_maybe.value();
+						}
+					}
+				}
+
+				if (next_cur_tag != nullptr) {
+					cur_tag = next_cur_tag;
+				}
+				else {
+					/*
+					 * Here, we handle cases like <p>lala</b>...
+					 * So the tag </b> is bogus and unpaired
+					 * However, we need to exclude it from the output of <p> tag
+					 * To do that, we create a fake opening tag and insert that to
+					 * the current opening tag
+					 */
+					auto *cur_opening_tag = cur_tag->parent;
+
+					while (cur_opening_tag && (cur_opening_tag->flags & FL_CLOSED)) {
+						cur_opening_tag = cur_opening_tag->parent;
+					}
+
+					if (!cur_opening_tag) {
+						cur_opening_tag = hc->root_tag;
+					}
+
+					auto &&vtag = std::make_unique<html_tag>();
+					vtag->id = cur_tag->id;
+					vtag->flags = FL_VIRTUAL | FL_CLOSED | cur_tag->flags;
+					vtag->tag_start = cur_tag->closing.start;
+					vtag->content_offset = p - start + 1;
+					vtag->closing = cur_tag->closing;
+					vtag->parent = cur_opening_tag;
+					g_assert(vtag->parent != &cur_closing_tag);
+					cur_opening_tag->children.push_back(vtag.get());
+					hc->all_tags.emplace_back(std::move(vtag));
+					cur_tag = cur_opening_tag;
+					parent_tag = cur_tag->parent;
+					g_assert(cur_tag->parent != &cur_closing_tag);
+				}
+			} /* if cur_tag != nullptr */
+			state = html_text_content;
+			p++;
+			c = p;
+			break;
+		}
+		case tags_limit_overflow:
+			msg_warn_pool("tags limit of %d tags is reached at the position %d;"
+						  " ignoring the rest of the HTML content",
+						  (int) hc->all_tags.size(), (int) (p - start));
+			c = p;
+			p = end;
+			break;
+		}
+	}
+
+	if (cur_tag && !(cur_tag->flags & FL_CLOSED) && cur_tag != &cur_closing_tag) {
+		cur_closing_tag.parent = cur_tag;
+		cur_closing_tag.id = cur_tag->id;
+		cur_tag = &cur_closing_tag;
+		html_check_balance(hc, cur_tag,
+						   end - start, end - start);
+	}
+
+	/* Propagate styles */
+	hc->traverse_block_tags([&hc, &pool](const html_tag *tag) -> bool {
+		if (hc->css_style && tag->id > Tag_UNKNOWN && tag->id < Tag_MAX) {
+			auto *css_block = hc->css_style->check_tag_block(tag);
+
+			if (css_block) {
+				if (tag->block) {
+					tag->block->set_block(*css_block);
+				}
+				else {
+					tag->block = css_block;
+				}
+			}
+		}
+		if (tag->block) {
+			if (!tag->block->has_display()) {
+				/* If we have no display field, we can check it by tag */
+				if (tag->flags & CM_HEAD) {
+					tag->block->set_display(css::css_display_value::DISPLAY_HIDDEN,
+											html_block::set);
+				}
+				else if (tag->flags & (CM_BLOCK | CM_TABLE)) {
+					tag->block->set_display(css::css_display_value::DISPLAY_BLOCK,
+											html_block::implicit);
+				}
+				else if (tag->flags & CM_ROW) {
+					tag->block->set_display(css::css_display_value::DISPLAY_TABLE_ROW,
+											html_block::implicit);
+				}
+				else {
+					tag->block->set_display(css::css_display_value::DISPLAY_INLINE,
+											html_block::implicit);
+				}
+			}
+
+			tag->block->compute_visibility();
+
+			for (const auto *cld_tag: tag->children) {
+
+				if (cld_tag->block) {
+					cld_tag->block->propagate_block(*tag->block);
+				}
+				else {
+					cld_tag->block = rspamd_mempool_alloc0_type(pool, html_block);
+					*cld_tag->block = *tag->block;
+				}
+			}
+		}
+		return true;
+	},
+							html_content::traverse_type::PRE_ORDER);
+
+	/* Leftover before content */
+	switch (state) {
+	case tag_end_opening:
+		if (cur_tag != nullptr) {
+			process_opening_tag();
+		}
+		break;
+	default:
+		/* Do nothing */
+		break;
+	}
+
+	if (!hc->all_tags.empty() && hc->root_tag) {
+		html_append_tag_content(pool, start, end - start, hc, hc->root_tag,
+								exceptions, url_set);
+	}
+
+	/* Leftover after content */
+	switch (state) {
+	case tags_limit_overflow:
+		html_append_parsed(hc, {c, (std::size_t)(end - c)},
+						   false, end - start, hc->parsed);
+		break;
+	default:
+		/* Do nothing */
+		break;
+	}
+
+	if (overflow_input) {
+		/*
+		 * Append the rest of the input as raw html, this might work as
+		 * further algorithms can skip words when auto *pool = task->task_pool;there are too many.
+		 * It is still unclear about urls though...
+		 */
+		html_append_parsed(hc, {end, in->len - process_size}, false,
+						   end - start, hc->parsed);
+	}
+
+	if (!hc->parsed.empty()) {
+		/* Trim extra spaces at the end if needed */
+		if (g_ascii_isspace(hc->parsed.back())) {
+			auto last_it = std::end(hc->parsed);
+
+			/* Allow last newline */
+			if (hc->parsed.back() == '\n') {
+				--last_it;
+			}
+
+			hc->parsed.erase(std::find_if(hc->parsed.rbegin(), hc->parsed.rend(),
+										  [](auto ch) -> auto {
+											  return !g_ascii_isspace(ch);
+										  })
+								 .base(),
+							 last_it);
+		}
+	}
+
+	return hc;
+}
+
+static auto
+html_find_image_by_cid(const html_content &hc, std::string_view cid)
+	-> std::optional<const html_image *>
+{
+	for (const auto *html_image: hc.images) {
+		/* Filter embedded images */
+		if (html_image->flags & RSPAMD_HTML_FLAG_IMAGE_EMBEDDED &&
+			html_image->src != nullptr) {
+			if (cid == html_image->src) {
+				return html_image;
+			}
+		}
+	}
+
+	return std::nullopt;
+}
+
+auto html_debug_structure(const html_content &hc) -> std::string
+{
+	std::string output;
+
+	if (hc.root_tag) {
+		auto rec_functor = [&](const html_tag *t, int level, auto rec_functor) -> void {
+			std::string pluses(level, '+');
+
+			if (!(t->flags & (FL_VIRTUAL | FL_IGNORE))) {
+				if (t->flags & FL_XML) {
+					output += fmt::format("{}xml;", pluses);
+				}
+				else {
+					output += fmt::format("{}{};", pluses,
+										  html_tags_defs.name_by_id_safe(t->id));
+				}
+				level++;
+			}
+			for (const auto *cld: t->children) {
+				rec_functor(cld, level, rec_functor);
+			}
+		};
+
+		rec_functor(hc.root_tag, 1, rec_functor);
+	}
+
+	return output;
+}
+
+auto html_tag_by_name(const std::string_view &name)
+	-> std::optional<tag_id_t>
+{
+	const auto *td = rspamd::html::html_tags_defs.by_name(name);
+
+	if (td != nullptr) {
+		return td->id;
+	}
+
+	return std::nullopt;
+}
+
+auto html_tag::get_content(const struct html_content *hc) const -> std::string_view
+{
+	const std::string *dest = &hc->parsed;
+
+	if (block && !block->is_visible()) {
+		dest = &hc->invisible;
+	}
+	const auto clen = get_content_length();
+	if (content_offset < dest->size()) {
+		if (dest->size() - content_offset >= clen) {
+			return std::string_view{*dest}.substr(content_offset, clen);
+		}
+		else {
+			return std::string_view{*dest}.substr(content_offset, dest->size() - content_offset);
+		}
+	}
+
+	return std::string_view{};
+}
+
+}// namespace rspamd::html
+
+void *
+rspamd_html_process_part_full(struct rspamd_task *task,
+							  GByteArray *in, GList **exceptions,
+							  khash_t(rspamd_url_hash) * url_set,
+							  GPtrArray *part_urls,
+							  bool allow_css,
+							  uint16_t *cur_url_order)
+{
+	return rspamd::html::html_process_input(task, in, exceptions, url_set,
+											part_urls, allow_css, cur_url_order);
+}
+
+void *
+rspamd_html_process_part(rspamd_mempool_t *pool,
+						 GByteArray *in)
+{
+	struct rspamd_task fake_task;
+	memset(&fake_task, 0, sizeof(fake_task));
+	fake_task.task_pool = pool;
+	uint16_t order = 0;
+
+	return rspamd_html_process_part_full(&fake_task, in, NULL,
+										 NULL, NULL, FALSE, &order);
+}
+
+guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len)
+{
+	return rspamd::html::decode_html_entitles_inplace(s, len);
+}
+
+gint rspamd_html_tag_by_name(const gchar *name)
+{
+	const auto *td = rspamd::html::html_tags_defs.by_name(name);
+
+	if (td != nullptr) {
+		return td->id;
+	}
+
+	return -1;
+}
+
+gboolean
+rspamd_html_tag_seen(void *ptr, const gchar *tagname)
+{
+	gint id;
+	auto *hc = rspamd::html::html_content::from_ptr(ptr);
+
+	g_assert(hc != NULL);
+
+	id = rspamd_html_tag_by_name(tagname);
+
+	if (id != -1) {
+		return hc->tags_seen[id];
+	}
+
+	return FALSE;
+}
+
+const gchar *
+rspamd_html_tag_by_id(gint id)
+{
+	if (id > Tag_UNKNOWN && id < Tag_MAX) {
+		const auto *td = rspamd::html::html_tags_defs.by_id(id);
+
+		if (td != nullptr) {
+			return td->name.c_str();
+		}
+	}
+
+	return nullptr;
+}
+
+const gchar *
+rspamd_html_tag_name(void *p, gsize *len)
+{
+	auto *tag = reinterpret_cast<rspamd::html::html_tag *>(p);
+	auto tname = rspamd::html::html_tags_defs.name_by_id_safe(tag->id);
+
+	if (len) {
+		*len = tname.size();
+	}
+
+	return tname.data();
+}
+
+struct html_image *
+rspamd_html_find_embedded_image(void *html_content,
+								const char *cid, gsize cid_len)
+{
+	auto *hc = rspamd::html::html_content::from_ptr(html_content);
+
+	auto maybe_img = rspamd::html::html_find_image_by_cid(*hc, {cid, cid_len});
+
+	if (maybe_img) {
+		return (html_image *) maybe_img.value();
+	}
+
+	return nullptr;
+}
+
+bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest)
+{
+	auto *hc = rspamd::html::html_content::from_ptr(html_content);
+
+	dest->begin = hc->parsed.data();
+	dest->len = hc->parsed.size();
+
+	return true;
+}
+
+gsize rspamd_html_get_tags_count(void *html_content)
+{
+	auto *hc = rspamd::html::html_content::from_ptr(html_content);
+
+	if (!hc) {
+		return 0;
+	}
+
+	return hc->all_tags.size();
+}
+\ No newline at end of file