summaryrefslogtreecommitdiffstats
path: root/src/libserver/html
diff options
context:
space:
mode:
Diffstat (limited to 'src/libserver/html')
-rw-r--r-- src/libserver/html/html.cxx 2393
-rw-r--r-- src/libserver/html/html.h 137
-rw-r--r-- src/libserver/html/html.hxx 146
-rw-r--r-- src/libserver/html/html_block.hxx 358
-rw-r--r-- src/libserver/html/html_entities.cxx 2644
-rw-r--r-- src/libserver/html/html_entities.hxx 31
-rw-r--r-- src/libserver/html/html_tag.hxx 159
-rw-r--r-- src/libserver/html/html_tag_defs.hxx 194
-rw-r--r-- src/libserver/html/html_tags.h 176
-rw-r--r-- src/libserver/html/html_tests.cxx 304
-rw-r--r-- src/libserver/html/html_url.cxx 496
-rw-r--r-- src/libserver/html/html_url.hxx 68
12 files changed, 7106 insertions, 0 deletions
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
new file mode 100644
index 0000000..5861d45
--- /dev/null
+++ b/src/libserver/html/html.cxx
@@ -0,0 +1,2393 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "config.h"
+#include "util.h"
+#include "message.h"
+#include "html.h"
+#include "html_tags.h"
+#include "html_block.hxx"
+#include "html.hxx"
+#include "libserver/css/css_value.hxx"
+#include "libserver/css/css.hxx"
+#include "libserver/task.h"
+#include "libserver/cfg_file.h"
+
+#include "url.h"
+#include "contrib/libucl/khash.h"
+#include "libmime/images.h"
+#include "libutil/cxx/utf8_util.h"
+
+#include "html_tag_defs.hxx"
+#include "html_entities.hxx"
+#include "html_tag.hxx"
+#include "html_url.hxx"
+
+#include <frozen/unordered_map.h>
+#include <frozen/string.h>
+#include <fmt/core.h>
+
+#include <unicode/uversion.h>
+
+namespace rspamd::html {
+
+static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */
+
+static const html_tags_storage html_tags_defs; /* Known tag definitions; looked up with by_name() once a tag name is parsed */
+
+auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_type>( /* Maps a lowercase attribute name to the component type stored in a tag */
+ {
+ {"name", html_component_type::RSPAMD_HTML_COMPONENT_NAME},
+ {"href", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
+ {"src", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, /* `src` shares the HREF component with `href` */
+ {"action", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, /* `action` shares the HREF component as well */
+ {"color", html_component_type::RSPAMD_HTML_COMPONENT_COLOR},
+ {"bgcolor", html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR},
+ {"style", html_component_type::RSPAMD_HTML_COMPONENT_STYLE},
+ {"class", html_component_type::RSPAMD_HTML_COMPONENT_CLASS},
+ {"width", html_component_type::RSPAMD_HTML_COMPONENT_WIDTH},
+ {"height", html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT},
+ {"size", html_component_type::RSPAMD_HTML_COMPONENT_SIZE},
+ {"rel", html_component_type::RSPAMD_HTML_COMPONENT_REL},
+ {"alt", html_component_type::RSPAMD_HTML_COMPONENT_ALT},
+ {"id", html_component_type::RSPAMD_HTML_COMPONENT_ID},
+ {"hidden", html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN},
+ });
+
+#define msg_debug_html(...) rspamd_conditional_debug_fast(NULL, NULL, \
+ rspamd_html_log_id, "html", pool->tag.uid, \
+ __FUNCTION__, \
+ __VA_ARGS__) /* NOTE: expands to a use of `pool`, so a variable with that exact name must be in scope at every call site */
+
+INIT_LOG_MODULE(html) /* presumably defines rspamd_html_log_id referenced by msg_debug_html - TODO confirm */
+
+/*
+ * Balances a closing tag against the chain of currently opened tags.
+ *
+ * This function is expected to be called on a closing tag whose `parent`
+ * points to the last opening tag. It marks the matching opening tag (and every
+ * tag nested inside of it) as closed, fills their `closing` offsets and
+ * returns the current parent (meaning unclosed) tag.
+ *
+ * @param hc html content; used to insert a virtual <html> tag when the closing
+ * tag has no parent and no tags have been stored yet
+ * @param tag the closing tag being processed
+ * @param tag_start_offset start offset of the closing tag
+ * @param tag_end_offset end offset of the closing tag
+ * @return the tag that becomes the current parent, or nullptr when there is no
+ * such tag or when the closing tag is unpaired (the caller is then expected to
+ * ignore and reconstruct it)
+ */
+static auto
+html_check_balance(struct html_content *hc,
+				   struct html_tag *tag,
+				   goffset tag_start_offset,
+				   goffset tag_end_offset) -> html_tag *
+{
+	/* As agreed, the closing tag has the last opening at the parent ptr */
+	auto *opening_tag = tag->parent;
+
+	/* Fills `t->closing` with the position of the (possibly virtual) closing tag */
+	auto calculate_content_length = [tag_start_offset, tag_end_offset](html_tag *t) {
+		if (t->flags & (CM_EMPTY)) {
+			/* Attach closing tag just at the opening tag */
+			t->closing.start = t->tag_start;
+			t->closing.end = t->content_offset;
+		}
+		else if (t->content_offset <= tag_start_offset) {
+			t->closing.start = tag_start_offset;
+			t->closing.end = tag_end_offset;
+		}
+		else {
+			/* Content starts after the closing tag start: treat content as empty */
+			t->closing.start = t->content_offset;
+			t->closing.end = tag_end_offset;
+		}
+	};
+
+	auto balance_tag = [&]() -> html_tag * {
+		/* Find the nearest unclosed ancestor that has the same id as the closing tag */
+		html_tag *pair = nullptr;
+
+		for (auto *it = tag->parent; it != nullptr; it = it->parent) {
+			if (it->id == tag->id && !(it->flags & FL_CLOSED)) {
+				pair = it;
+				break;
+			}
+		}
+
+		if (pair == nullptr) {
+			/*
+			 * We have not found a pair, so this closing tag is bogus and should
+			 * be ignored completely.
+			 * Unfortunately, it also means that we need to insert another tag,
+			 * as the current closing tag is unusable for that purposes.
+			 *
+			 * We assume that callee will recognise that and reconstruct the
+			 * tag at the tag_end_closing state, so we return nullptr...
+			 */
+			return nullptr;
+		}
+
+		/*
+		 * Close all tags from the innermost one up to (and including) the pair.
+		 * The pair is detected by pointer: testing FL_CLOSED after setting it
+		 * (as the previous code did) can never succeed, which made the loop
+		 * close every ancestor up to the root and always yield nullptr.
+		 */
+		for (auto *it = tag->parent; it != nullptr; it = it->parent) {
+			it->flags |= FL_CLOSED;
+			/* Insert a virtual closing tag for all tags that are not closed */
+			calculate_content_length(it);
+
+			if (it == pair) {
+				break;
+			}
+		}
+
+		/* Same contract as the direct match below: the parent of the closed tag */
+		return pair->parent;
+	};
+
+	if (opening_tag) {
+		if (opening_tag->id == tag->id) {
+			opening_tag->flags |= FL_CLOSED;
+
+			calculate_content_length(opening_tag);
+			/* All good */
+			return opening_tag->parent;
+		}
+
+		return balance_tag();
+	}
+
+	/*
+	 * We have no opening tag
+	 * There are two possibilities:
+	 *
+	 * 1) We have some block tag in hc->all_tags;
+	 * 2) We have no tags
+	 */
+	if (hc->all_tags.empty()) {
+		hc->all_tags.push_back(std::make_unique<html_tag>());
+		auto *vtag = hc->all_tags.back().get();
+		vtag->id = Tag_HTML;
+		vtag->flags = FL_VIRTUAL;
+		vtag->tag_start = 0;
+		vtag->content_offset = 0;
+		calculate_content_length(vtag);
+
+		if (!hc->root_tag) {
+			hc->root_tag = vtag;
+		}
+		else {
+			vtag->parent = hc->root_tag;
+		}
+
+		tag->parent = vtag;
+
+		/* Recursively call with a virtual <html> tag inserted */
+		return html_check_balance(hc, tag, tag_start_offset, tag_end_offset);
+	}
+
+	return nullptr;
+}
+
+/*
+ * Resolves an attribute name to its component type using html_components_map.
+ * Returns std::nullopt for names that are not present in the map.
+ */
+auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>
+{
+	if (const auto found = html_components_map.find(st); found != html_components_map.end()) {
+		return found->second;
+	}
+
+	return std::nullopt;
+}
+
+enum tag_parser_state { /* States of the per-character automaton implemented by html_parse_tag_content */
+ parse_start = 0, /* waiting for the first character of the tag name */
+ parse_name, /* accumulating the (lowercased) tag name */
+ parse_attr_name, /* accumulating the (lowercased) attribute name */
+ parse_equal, /* just after `=` */
+ parse_start_dquote, /* just after an opening `"` */
+ parse_dqvalue, /* inside a double quoted value */
+ parse_end_dquote, /* just after a closing `"` */
+ parse_start_squote, /* just after an opening `'` */
+ parse_sqvalue, /* inside a single quoted value */
+ parse_end_squote, /* just after a closing `'` */
+ parse_value, /* inside an unquoted value */
+ spaces_before_eq, /* whitespace after an attribute name, `=` may follow */
+ spaces_after_eq, /* whitespace after `=` */
+ spaces_after_param, /* between attributes */
+ ignore_bad_tag, /* broken tag: further input is ignored */
+ tag_end, /* end of tag reached: further input is ignored */
+ slash_after_value, /* `/` seen between attributes, `>` makes the tag closed */
+ slash_in_unquoted_value, /* `/` seen inside an unquoted value, meaning depends on the next character */
+};
+/*
+ * Mutable state carried between html_parse_tag_content calls
+ */
+struct tag_content_parser_state {
+	/* Current automaton state */
+	tag_parser_state cur_state = parse_start;
+	/* Characters of the name or value being accumulated */
+	std::string buf;
+	/* Component of the attribute whose value is being read, if it is a known one */
+	std::optional<html_component_type> cur_component;
+
+	/* Returns the parser to its initial state */
+	void reset()
+	{
+		cur_component.reset();
+		buf.clear();
+		cur_state = parse_start;
+	}
+};
+
+/*
+ * Feeds one input character into the automaton that parses the inside of a
+ * tag: the tag name followed by its attributes.
+ *
+ * Known attributes (see html_components_map) are appended to tag->components
+ * with their values copied to the memory pool and entity-decoded; unknown
+ * attributes are parsed but dropped.
+ *
+ * @param pool memory pool used for persistent copies of attribute values
+ * @param hc html content, receives the BAD_ELEMENTS/UNKNOWN_ELEMENTS flags
+ * @param tag tag being filled (id, flags and components)
+ * @param in pointer to the current input character (only *in is read)
+ * @param parser_env state preserved between calls; cur_state is updated on exit
+ */
+static inline void
+html_parse_tag_content(rspamd_mempool_t *pool,
+					   struct html_content *hc,
+					   struct html_tag *tag,
+					   const char *in,
+					   struct tag_content_parser_state &parser_env)
+{
+	auto state = parser_env.cur_state;
+
+	/*
+	 * Stores tag component if it doesn't exist, performing copy of the
+	 * value + decoding of the entities
+	 * Parser env is set to clear the current html attribute fields (buf and
+	 * cur_component)
+	 */
+	auto store_component_value = [&]() -> void {
+		if (parser_env.cur_component) {
+
+			if (parser_env.buf.empty()) {
+				tag->components.emplace_back(parser_env.cur_component.value(),
+											 std::string_view{});
+			}
+			else {
+				/* We need to copy buf to a persistent storage */
+				auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size());
+
+				if (parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_ID ||
+					parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) {
+					/* Lowercase */
+					rspamd_str_copy_lc(parser_env.buf.data(), s, parser_env.buf.size());
+				}
+				else {
+					memcpy(s, parser_env.buf.data(), parser_env.buf.size());
+				}
+
+				auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size());
+				tag->components.emplace_back(parser_env.cur_component.value(),
+											 std::string_view{s, sz});
+			}
+		}
+
+		parser_env.buf.clear();
+		parser_env.cur_component = std::nullopt;
+	};
+
+	/*
+	 * Interprets buf as an attribute name: sets cur_component for known names,
+	 * clears it for unknown ones. Returns true if the name is known.
+	 */
+	auto store_component_name = [&]() -> bool {
+		decode_html_entitles_inplace(parser_env.buf);
+		auto known_component_it = html_components_map.find(std::string_view{parser_env.buf});
+		parser_env.buf.clear();
+
+		if (known_component_it != html_components_map.end()) {
+			parser_env.cur_component = known_component_it->second;
+
+			return true;
+		}
+		else {
+			parser_env.cur_component = std::nullopt;
+		}
+
+		return false;
+	};
+
+	/* Appends the current character to buf, optionally lowercased */
+	auto store_value_character = [&](bool lc) -> void {
+		auto c = lc ? g_ascii_tolower(*in) : *in;
+
+		if (c == '\0') {
+			/* Replace with U+FFFD */
+			parser_env.buf.append((const char *) u8"\uFFFD");
+		}
+		else {
+			parser_env.buf.push_back(c);
+		}
+	};
+
+	switch (state) {
+	case parse_start:
+		if (!g_ascii_isalpha(*in) && !g_ascii_isspace(*in)) {
+			hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+			state = ignore_bad_tag;
+			tag->id = N_TAGS;
+			tag->flags |= FL_BROKEN;
+		}
+		else if (g_ascii_isalpha(*in)) {
+			state = parse_name;
+			store_value_character(true);
+		}
+		break;
+
+	case parse_name:
+		if ((g_ascii_isspace(*in) || *in == '>' || *in == '/')) {
+			if (*in == '/') {
+				tag->flags |= FL_CLOSED;
+			}
+
+			if (parser_env.buf.empty()) {
+				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+				tag->id = N_TAGS;
+				tag->flags |= FL_BROKEN;
+				state = ignore_bad_tag;
+			}
+			else {
+				decode_html_entitles_inplace(parser_env.buf);
+				const auto *tag_def = rspamd::html::html_tags_defs.by_name(parser_env.buf);
+
+				if (tag_def == nullptr) {
+					hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
+					/* Assign -hash to match closing tag if needed */
+					auto nhash = static_cast<std::int32_t>(std::hash<std::string>{}(parser_env.buf));
+					/* Always negative */
+					tag->id = static_cast<tag_id_t>(nhash | G_MININT32);
+				}
+				else {
+					tag->id = tag_def->id;
+					/*
+					 * NOTE(review): this assignment replaces all flags, including
+					 * FL_CLOSED possibly set above for `<name/`; left as is, since
+					 * it is unclear whether callers rely on it - confirm
+					 */
+					tag->flags = tag_def->flags;
+				}
+
+				parser_env.buf.clear();
+
+				state = spaces_after_param;
+			}
+		}
+		else {
+			store_value_character(true);
+		}
+		break;
+
+	case parse_attr_name:
+		if (*in == '=') {
+			if (!parser_env.buf.empty()) {
+				store_component_name();
+			}
+			state = parse_equal;
+		}
+		else if (g_ascii_isspace(*in)) {
+			store_component_name();
+			state = spaces_before_eq;
+		}
+		else if (*in == '/') {
+			store_component_name();
+			store_component_value();
+			state = slash_after_value;
+		}
+		else if (*in == '>') {
+			store_component_name();
+			store_component_value();
+			state = tag_end;
+		}
+		else {
+			if (*in == '"' || *in == '\'' || *in == '<') {
+				/* Should never be in attribute names but ignored */
+				tag->flags |= FL_BROKEN;
+			}
+
+			store_value_character(true);
+		}
+
+		break;
+
+	case spaces_before_eq:
+		if (*in == '=') {
+			state = parse_equal;
+		}
+		else if (!g_ascii_isspace(*in)) {
+			/*
+			 * HTML defines that crap could still be restored and
+			 * calculated somehow... So we have to follow this stupid behaviour
+			 */
+			/*
+			 * TODO: estimate what insane things do email clients in each case
+			 */
+			if (*in == '>') {
+				/*
+				 * Attribute name followed by end of tag
+				 * Should be okay (empty attribute). The rest is handled outside
+				 * this automata.
+				 */
+				store_component_value();
+				state = tag_end;
+			}
+			else if (*in == '"' || *in == '\'' || *in == '<') {
+				/* Attribute followed by quote... Missing '=' ? Dunno, need to test */
+				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+				tag->flags |= FL_BROKEN;
+				store_component_value();
+				store_value_character(true);
+				state = spaces_after_param;
+			}
+			else {
+				/*
+				 * Empty attribute followed by the first character of the next
+				 * attribute name. That character is already stored, so we must
+				 * continue in parse_attr_name (as parse_end_dquote does):
+				 * staying in spaces_after_param would skip a whitespace that
+				 * terminates a one-character name and would treat a following
+				 * '=' as the start of a broken attribute.
+				 */
+				store_component_value();
+				store_value_character(true);
+				state = parse_attr_name;
+			}
+		}
+		break;
+
+	case spaces_after_eq:
+		if (*in == '"') {
+			state = parse_start_dquote;
+		}
+		else if (*in == '\'') {
+			state = parse_start_squote;
+		}
+		else if (!g_ascii_isspace(*in)) {
+			store_value_character(true);
+			state = parse_value;
+		}
+		break;
+
+	case parse_equal:
+		if (g_ascii_isspace(*in)) {
+			state = spaces_after_eq;
+		}
+		else if (*in == '"') {
+			state = parse_start_dquote;
+		}
+		else if (*in == '\'') {
+			state = parse_start_squote;
+		}
+		else {
+			store_value_character(true);
+			state = parse_value;
+		}
+		break;
+
+	case parse_start_dquote:
+		if (*in == '"') {
+			state = spaces_after_param;
+		}
+		else {
+			store_value_character(false);
+			state = parse_dqvalue;
+		}
+		break;
+
+	case parse_start_squote:
+		if (*in == '\'') {
+			state = spaces_after_param;
+		}
+		else {
+			store_value_character(false);
+			state = parse_sqvalue;
+		}
+		break;
+
+	case parse_dqvalue:
+		if (*in == '"') {
+			store_component_value();
+			state = parse_end_dquote;
+		}
+		else {
+			store_value_character(false);
+		}
+		break;
+
+	case parse_sqvalue:
+		if (*in == '\'') {
+			store_component_value();
+			state = parse_end_squote;
+		}
+		else {
+			store_value_character(false);
+		}
+
+		break;
+
+	case parse_value:
+		if (*in == '/') {
+			/* Could be either a part of the value or the `/>` ending */
+			state = slash_in_unquoted_value;
+		}
+		else if (g_ascii_isspace(*in) || *in == '>' || *in == '"') {
+			store_component_value();
+			state = spaces_after_param;
+		}
+		else {
+			store_value_character(false);
+		}
+		break;
+
+	case parse_end_dquote:
+	case parse_end_squote:
+		if (g_ascii_isspace(*in)) {
+			state = spaces_after_param;
+		}
+		else if (*in == '/') {
+			store_component_value();
+			store_value_character(true);
+			state = slash_after_value;
+		}
+		else {
+			/* No space, proceed immediately to the attribute name */
+			state = parse_attr_name;
+			store_component_value();
+			store_value_character(true);
+		}
+		break;
+
+	case spaces_after_param:
+		if (!g_ascii_isspace(*in)) {
+			if (*in == '/') {
+				state = slash_after_value;
+			}
+			else if (*in == '=') {
+				/* Attributes cannot start with '=' */
+				tag->flags |= FL_BROKEN;
+				store_value_character(true);
+				state = parse_attr_name;
+			}
+			else {
+				store_value_character(true);
+				state = parse_attr_name;
+			}
+		}
+		break;
+	case slash_after_value:
+		if (*in == '>') {
+			tag->flags |= FL_CLOSED;
+			state = tag_end;
+		}
+		else if (!g_ascii_isspace(*in)) {
+			tag->flags |= FL_BROKEN;
+			state = parse_attr_name;
+		}
+		break;
+	case slash_in_unquoted_value:
+		if (*in == '>') {
+			/* That slash was in fact closing tag slash, woohoo */
+			tag->flags |= FL_CLOSED;
+			state = tag_end;
+			store_component_value();
+		}
+		else {
+			/* Welcome to the world of html, revert state and save missing / */
+			parser_env.buf.push_back('/');
+			store_value_character(false);
+			state = parse_value;
+		}
+		break;
+	case ignore_bad_tag:
+	case tag_end:
+		break;
+	}
+
+	parser_env.cur_state = state;
+}
+
+/*
+ * Checks whether `st` looks like an absolute url: a non-empty alphanumeric
+ * prefix followed by `:`, where either the prefix is exactly `mailto` or the
+ * colon is followed by `/` or `\` (malformed urls are accepted as well).
+ */
+static inline auto
+html_is_absolute_url(std::string_view st) -> bool
+{
+	const auto not_alnum = [](auto ch) { return !g_ascii_isalnum(ch); };
+	const auto scheme_end = std::find_if(std::begin(st), std::end(st), not_alnum);
+
+	/* Need a non-empty scheme that is terminated by a colon */
+	if (scheme_end == std::end(st) || scheme_end == std::begin(st) || *scheme_end != ':') {
+		return false;
+	}
+
+	const auto scheme = st.substr(0, std::distance(std::begin(st), scheme_end));
+
+	if (scheme == "mailto") {
+		return true;
+	}
+
+	/* Include even malformed urls */
+	const auto after_colon = std::next(scheme_end);
+
+	return after_colon != std::end(st) && (*after_colon == '/' || *after_colon == '\\');
+}
+
+/*
+ * Extracts the url from the HREF component of a tag (href/src/action).
+ *
+ * If the document has a base url and the value is not absolute, the value is
+ * resolved first:
+ *  - `data:` values are never treated as urls;
+ *  - values starting with a single slash are resolved against the scheme and
+ *    the host of the base url;
+ *  - anything else is appended to the base url string.
+ *
+ * On success the url is also stored in tag->extra (if it holds nothing yet),
+ * and urls that come from tags other than <a> get RSPAMD_URL_FLAG_SPECIAL.
+ *
+ * @param pool memory pool used for the resolved url string
+ * @param tag tag to inspect
+ * @param hc html content (may be NULL), provides the base url
+ * @return parsed url or std::nullopt if there is no usable url
+ */
+static auto
+html_process_url_tag(rspamd_mempool_t *pool,
+					 struct html_tag *tag,
+					 struct html_content *hc) -> std::optional<struct rspamd_url *>
+{
+	auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
+
+	if (!found_href_maybe) {
+		return std::nullopt;
+	}
+
+	auto &href_value = found_href_maybe.value();
+
+	/*
+	 * Relative url cannot start from the following:
+	 * schema://
+	 * data:
+	 */
+	if (hc && hc->base_url && !html_is_absolute_url(href_value)) {
+
+		/* `sizeof - 1`: the literal's trailing NUL is not a part of the prefix */
+		if (href_value.size() >= sizeof("data:") - 1 &&
+			g_ascii_strncasecmp(href_value.data(), "data:", sizeof("data:") - 1) == 0) {
+			/* Image data url, never insert as url */
+			return std::nullopt;
+		}
+
+		if (href_value.size() > 2 && href_value[0] == '/' && href_value[1] != '/') {
+			/*
+			 * Relative to the hostname. This check has to live on the
+			 * non-absolute path: an absolute url never starts with a slash,
+			 * so the check was unreachable when attached to the absolute case.
+			 * The value already starts with '/', hence no extra slash in the
+			 * format (and `len` accounts exactly for `://`).
+			 */
+			auto orig_len = href_value.size();
+			auto len = orig_len + hc->base_url->hostlen + hc->base_url->protocollen +
+					   3 /* for :// */;
+			auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
+			auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1, "%*s://%*s%*s",
+													  (int) hc->base_url->protocollen, hc->base_url->string,
+													  (int) hc->base_url->hostlen, rspamd_url_host_unsafe(hc->base_url),
+													  (gint) orig_len, href_value.data());
+			href_value = {buf, nlen};
+		}
+		else {
+			/* Assume relative url */
+			auto need_slash = false;
+
+			auto orig_len = href_value.size();
+			auto len = orig_len + hc->base_url->urllen;
+
+			if (hc->base_url->datalen == 0) {
+				/* Base url has no path part: separate it from the value */
+				need_slash = true;
+				len++;
+			}
+
+			auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
+			auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1,
+													  "%*s%s%*s",
+													  (int) hc->base_url->urllen, hc->base_url->string,
+													  need_slash ? "/" : "",
+													  (gint) orig_len, href_value.data());
+			href_value = {buf, nlen};
+		}
+	}
+
+	auto url = html_process_url(pool, href_value).value_or(nullptr);
+
+	if (url) {
+		if (tag->id != Tag_A) {
+			/* Mark special tags special */
+			url->flags |= RSPAMD_URL_FLAG_SPECIAL;
+		}
+
+		if (std::holds_alternative<std::monostate>(tag->extra)) {
+			tag->extra = url;
+		}
+
+		return url;
+	}
+
+	return std::nullopt;
+}
+
+struct rspamd_html_url_query_cbd { /* User data handed to html_url_query_callback */
+ rspamd_mempool_t *pool; /* pool passed to the url search; also needed by msg_debug_html */
+ khash_t(rspamd_url_hash) * url_set; /* set that receives the urls found in the query */
+ struct rspamd_url *url; /* url whose query string is being scanned */
+ GPtrArray *part_urls; /* optional (may be NULL): array that receives newly added urls */
+};
+
+/*
+ * Callback for urls discovered inside the query string of another url.
+ * Flags the url as RSPAMD_URL_FLAG_QUERY, adds it to the url set and, if the
+ * set reports success and part_urls is present, to part_urls too.
+ * Returns FALSE for mailto urls with an empty user part, TRUE otherwise.
+ */
+static gboolean
+html_url_query_callback(struct rspamd_url *url, gsize start_offset,
+						gsize end_offset, gpointer ud)
+{
+	auto *cbd = (struct rspamd_html_url_query_cbd *) ud;
+	/* msg_debug_html refers to a variable named `pool` */
+	rspamd_mempool_t *pool = cbd->pool;
+
+	if (url->protocol == PROTOCOL_MAILTO && url->userlen == 0) {
+		return FALSE;
+	}
+
+	msg_debug_html("found url %s in query of url"
+				   " %*s",
+				   url->string,
+				   cbd->url->querylen, rspamd_url_query_unsafe(cbd->url));
+
+	url->flags |= RSPAMD_URL_FLAG_QUERY;
+
+	const auto added = rspamd_url_set_add_or_increase(cbd->url_set, url, false);
+
+	if (added && cbd->part_urls) {
+		g_ptr_array_add(cbd->part_urls, url);
+	}
+
+	return TRUE;
+}
+
+/*
+ * Searches the query part of `url` (if any) for nested urls, registering them
+ * via html_url_query_callback, and then appends `url` itself to part_urls
+ * when that array is provided.
+ */
+static void
+html_process_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
+					   khash_t(rspamd_url_hash) * url_set,
+					   GPtrArray *part_urls)
+{
+	if (url->querylen > 0) {
+		/* Field order: pool, url_set, url, part_urls */
+		struct rspamd_html_url_query_cbd query_cbd {
+			pool, url_set, url, part_urls
+		};
+
+		rspamd_url_find_multiple(pool,
+								 rspamd_url_query_unsafe(url), url->querylen,
+								 RSPAMD_URL_FIND_ALL, NULL,
+								 html_url_query_callback, &query_cbd);
+	}
+
+	if (part_urls != nullptr) {
+		g_ptr_array_add(part_urls, url);
+	}
+}
+
+/*
+ * Very basic processing of a data url of the form
+ * `data:image/xxx;base64,yyyzzz==`.
+ * Only base64 payloads are handled and the content type is ignored so far.
+ * If the decoded payload is recognised as an image, it is attached to
+ * img->embedded_image.
+ */
+static auto
+html_process_data_image(rspamd_mempool_t *pool,
+						struct html_image *img,
+						std::string_view input) -> void
+{
+	const gchar *end = input.data() + input.size();
+	const auto *semicolon_pos = (const gchar *) memchr(input.data(), ';', end - input.data());
+
+	if (semicolon_pos == NULL) {
+		return;
+	}
+
+	if (!(end - semicolon_pos > sizeof("base64,"))) {
+		/* Nothing useful */
+		return;
+	}
+
+	if (memcmp(semicolon_pos + 1, "base64,", sizeof("base64,") - 1) != 0) {
+		return;
+	}
+
+	/* sizeof includes the trailing NUL, which compensates for the semicolon itself */
+	const gchar *data_pos = semicolon_pos + sizeof("base64,");
+	gsize encoded_len = end - data_pos;
+	gsize decoded_len = (encoded_len / 4 * 3) + 12;
+	gchar *decoded = rspamd_mempool_alloc_buffer(pool, decoded_len);
+
+	rspamd_cryptobox_base64_decode(data_pos, encoded_len,
+								   reinterpret_cast<guchar *>(decoded), &decoded_len);
+
+	rspamd_ftok_t inp;
+	inp.begin = decoded;
+	inp.len = decoded_len;
+
+	struct rspamd_image *parsed_image = rspamd_maybe_process_image(pool, &inp);
+
+	if (parsed_image) {
+		msg_debug_html("detected %s image of size %ud x %ud in data url",
+					   rspamd_image_type_str(parsed_image->type),
+					   parsed_image->width, parsed_image->height);
+		img->embedded_image = parsed_image;
+	}
+}
+
+/*
+ * Looks for `prop` (e.g. "height") in an inline style string, case
+ * insensitively, and parses the number that follows it; spaces, `=` and `:`
+ * between the property name and the number are skipped.
+ * Returns std::nullopt if the property is absent or is not followed by digits.
+ */
+static auto
+html_img_style_dimension(std::string_view style, std::string_view prop) -> std::optional<unsigned long>
+{
+	auto pos = rspamd_substring_search_caseless(style.data(), style.size(),
+												prop.data(), prop.size());
+
+	if (pos == -1) {
+		return std::nullopt;
+	}
+
+	auto rest = style.substr(pos + prop.size());
+
+	for (std::size_t i = 0; i < rest.size(); i++) {
+		auto t = rest[i];
+
+		if (g_ascii_isdigit(t)) {
+			unsigned long val = 0;
+			/*
+			 * Parse starting from the digit that has been found: parsing from
+			 * the beginning of `rest` would stop at the separator before it.
+			 * The return value is ignored, as for the width/height attributes.
+			 */
+			rspamd_strtoul(rest.data() + i, rest.size() - i, &val);
+
+			return val;
+		}
+		else if (!g_ascii_isspace(t) && t != '=' && t != ':') {
+			/* Fallback */
+			break;
+		}
+	}
+
+	return std::nullopt;
+}
+
+/*
+ * Builds an html_image for a tag from its components and registers it in
+ * hc->images (and in tag->extra if that holds nothing yet).
+ *
+ * The HREF component (src) is classified as `cid:` (embedded), `data:`
+ * (embedded data url, decoded via html_process_data_image) or external; for
+ * external images the url is added to url_set and, when new, to part_urls.
+ * Dimensions come from the width/height attributes, then from the inline
+ * style and finally from the decoded embedded image.
+ *
+ * @param pool memory pool for the image and its strings
+ * @param tag image-like tag being processed
+ * @param hc html content that receives the image and the flags
+ * @param url_set set of urls found so far
+ * @param part_urls optional (may be NULL) array of urls of the current part
+ */
+static void
+html_process_img_tag(rspamd_mempool_t *pool,
+					 struct html_tag *tag,
+					 struct html_content *hc,
+					 khash_t(rspamd_url_hash) * url_set,
+					 GPtrArray *part_urls)
+{
+	struct html_image *img;
+
+	img = rspamd_mempool_alloc0_type(pool, struct html_image);
+	img->tag = tag;
+
+	for (const auto &param: tag->components) {
+
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) {
+			/* Check base url */
+			const auto &href_value = param.value;
+
+			if (href_value.size() > 0) {
+				rspamd_ftok_t fstr;
+				fstr.begin = href_value.data();
+				fstr.len = href_value.size();
+				img->src = rspamd_mempool_ftokdup(pool, &fstr);
+
+				if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(),
+																	 "cid:", sizeof("cid:") - 1) == 0) {
+					/* We have an embedded image */
+					img->src += sizeof("cid:") - 1;
+					img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
+				}
+				else if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(),
+																		   "data:", sizeof("data:") - 1) == 0) {
+					/* We have an embedded image in HTML tag */
+					img->flags |=
+						(RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
+					html_process_data_image(pool, img, href_value);
+					hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
+				}
+				else {
+					img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
+
+					if (img->src) {
+						/* html_process_url gets its own view of the value */
+						std::string_view cpy{href_value};
+						auto maybe_url = html_process_url(pool, cpy);
+
+						if (maybe_url) {
+							img->url = maybe_url.value();
+							struct rspamd_url *existing;
+
+							img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
+							existing = rspamd_url_set_add_or_return(url_set,
+																	img->url);
+
+							if (existing && existing != img->url) {
+								/*
+								 * We have some other URL that could be
+								 * found, e.g. from another part. However,
+								 * we still want to set an image flag on it
+								 */
+								existing->flags |= img->url->flags;
+								existing->count++;
+							}
+							else if (part_urls) {
+								/* New url */
+								g_ptr_array_add(part_urls, img->url);
+							}
+						}
+					}
+				}
+			}
+		}
+
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) {
+			unsigned long val;
+
+			rspamd_strtoul(param.value.data(), param.value.size(), &val);
+			img->height = val;
+		}
+
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) {
+			unsigned long val;
+
+			rspamd_strtoul(param.value.data(), param.value.size(), &val);
+			img->width = val;
+		}
+
+		/* TODO: rework to css at some time */
+		if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
+			/* Style is used only for the dimensions that are still unknown */
+			if (img->height == 0) {
+				if (auto maybe_height = html_img_style_dimension(param.value, "height")) {
+					img->height = maybe_height.value();
+				}
+			}
+			if (img->width == 0) {
+				if (auto maybe_width = html_img_style_dimension(param.value, "width")) {
+					img->width = maybe_width.value();
+				}
+			}
+		}
+	}
+
+	if (img->embedded_image) {
+		/* Fall back to the dimensions of the decoded image */
+		if (img->height == 0) {
+			img->height = img->embedded_image->height;
+		}
+		if (img->width == 0) {
+			img->width = img->embedded_image->width;
+		}
+	}
+
+	hc->images.push_back(img);
+
+	if (std::holds_alternative<std::monostate>(tag->extra)) {
+		tag->extra = img;
+	}
+}
+
+/*
+ * Handles a <link> tag: when its `rel` component equals "icon", the tag is
+ * processed in the same way as an image tag.
+ */
+static auto
+html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
+					  struct html_content *hc,
+					  khash_t(rspamd_url_hash) * url_set,
+					  GPtrArray *part_urls) -> void
+{
+	const auto rel = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL);
+
+	if (rel && rel.value() == "icon") {
+		html_process_img_tag(pool, tag, hc, url_set, part_urls);
+	}
+}
+
+/*
+ * Fills tag->block from the tag components: `style` is parsed as a css
+ * declaration, while `color`, `bgcolor` and `hidden` are applied on top of
+ * the resulting block (an undefined block is created if there is no style).
+ */
+static auto
+html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
+					   struct html_content *hc) -> void
+{
+	std::optional<css::css_value> fg_value, bg_value;
+	auto is_hidden = false;
+
+	for (const auto &comp: tag->components) {
+		switch (comp.type) {
+		case html_component_type::RSPAMD_HTML_COMPONENT_COLOR:
+			fg_value = css::css_value::maybe_color_from_string(comp.value);
+			break;
+		case html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR:
+			bg_value = css::css_value::maybe_color_from_string(comp.value);
+			break;
+		case html_component_type::RSPAMD_HTML_COMPONENT_STYLE:
+			tag->block = rspamd::css::parse_css_declaration(pool, comp.value);
+			break;
+		case html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN:
+			is_hidden = true;
+			break;
+		default:
+			/* Other components do not affect the block */
+			break;
+		}
+	}
+
+	if (!tag->block) {
+		tag->block = html_block::undefined_html_block_pool(pool);
+	}
+
+	if (is_hidden) {
+		tag->block->set_display(false);
+	}
+
+	if (fg_value) {
+		tag->block->set_fgcolor(fg_value->to_color().value());
+	}
+
+	if (bg_value) {
+		tag->block->set_bgcolor(bg_value->to_color().value());
+	}
+}
+
+/*
+ * Appends a piece of raw text to `dest` and decodes html entities in the
+ * appended part in place.
+ *
+ * A leading whitespace of `data` is replaced by a single space if `dest` does
+ * not already end with a whitespace; zero bytes are replaced with U+FFFD (and
+ * RSPAMD_HTML_FLAG_HAS_ZEROS is set); for transparent text all non-space
+ * characters of the decoded part are replaced with spaces.
+ * Nothing is appended if `dest` is already longer than `input_len`.
+ *
+ * @return length of the decoded part (0 if the append has been refused)
+ */
+static inline auto
+html_append_parsed(struct html_content *hc,
+				   std::string_view data,
+				   bool transparent,
+				   std::size_t input_len,
+				   std::string &dest) -> std::size_t
+{
+	if (dest.size() > input_len) {
+		/* Impossible case, refuse to append */
+		return 0;
+	}
+
+	/* Start of the part that is subject to entity decoding */
+	auto decode_from = dest.size();
+
+	if (!data.empty()) {
+		/* Handle multiple spaces at the begin */
+		if (decode_from > 0 && !g_ascii_isspace(dest.back()) && g_ascii_isspace(data.front())) {
+			dest.append(" ");
+			data.remove_prefix(1);
+			decode_from++;
+		}
+
+		if (data.find('\0') == std::string_view::npos) {
+			dest.append(data);
+		}
+		else {
+			dest.reserve(dest.size() + data.size() + sizeof(u8"\uFFFD"));
+
+			for (const auto ch: data) {
+				if (ch == '\0') {
+					dest.append((const char *) u8"\uFFFD");
+				}
+				else {
+					dest.push_back(ch);
+				}
+			}
+
+			hc->flags |= RSPAMD_HTML_FLAG_HAS_ZEROS;
+		}
+	}
+
+	auto decoded_len = decode_html_entitles_inplace(dest.data() + decode_from,
+													dest.size() - decode_from, true);
+
+	dest.resize(decoded_len + decode_from);
+
+	if (transparent) {
+		/* Replace all visible characters with spaces */
+		for (auto it = std::next(dest.begin(), decode_from); it != std::end(dest); ++it) {
+			if (!g_ascii_isspace(*it)) {
+				*it = ' ';
+			}
+		}
+	}
+
+	return decoded_len;
+}
+
+/*
+ * If the tag carries an url in its `extra` field, passes the text displayed
+ * for this tag (`data`, located at `dest_offset`) to html_check_displayed_url
+ * together with that url. Does nothing for tags without an url.
+ */
+static auto
+html_process_displayed_href_tag(rspamd_mempool_t *pool,
+								struct html_content *hc,
+								std::string_view data,
+								const struct html_tag *cur_tag,
+								GList **exceptions,
+								khash_t(rspamd_url_hash) * url_set,
+								goffset dest_offset) -> void
+{
+	const auto *maybe_url = std::get_if<rspamd_url *>(&cur_tag->extra);
+
+	if (maybe_url == nullptr) {
+		return;
+	}
+
+	html_check_displayed_url(pool,
+							 exceptions, url_set,
+							 data,
+							 dest_offset,
+							 *maybe_url);
+}
+
+static auto /*
+ * Recursively appends the text enclosed by `tag` and by its children either
+ * to hc->parsed (visible content) or to hc->invisible.
+ *
+ * On entry tag->content_offset and tag->closing are offsets in the input
+ * buffer (`start`, `len`); on exit content_offset and closing.start are
+ * rewritten to the offsets of the produced text in the output buffer.
+ * Returns the input offset from which the parent should continue.
+ */
+html_append_tag_content(rspamd_mempool_t *pool,
+ const gchar *start, gsize len,
+ struct html_content *hc,
+ html_tag *tag,
+ GList **exceptions,
+ khash_t(rspamd_url_hash) * url_set) -> goffset
+{
+ auto is_visible = true, is_block = false, is_spaces = false, is_transparent = false;
+ goffset next_tag_offset = tag->closing.end,
+ initial_parsed_offset = hc->parsed.size(),
+ initial_invisible_offset = hc->invisible.size();
+
+ auto calculate_final_tag_offsets = [&]() -> void { /* switches the tag offsets from input positions to output buffer positions */
+ if (is_visible) {
+ tag->content_offset = initial_parsed_offset;
+ tag->closing.start = hc->parsed.size();
+ }
+ else {
+ tag->content_offset = initial_invisible_offset;
+ tag->closing.start = hc->invisible.size();
+ }
+ };
+
+ if (tag->closing.end == -1) { /* presumably -1 marks an offset that has never been set (unclosed tag) - TODO confirm */
+ if (tag->closing.start != -1) {
+ next_tag_offset = tag->closing.start;
+ tag->closing.end = tag->closing.start;
+ }
+ else {
+ next_tag_offset = tag->content_offset;
+ tag->closing.end = tag->content_offset;
+ }
+ }
+ if (tag->closing.start == -1) {
+ tag->closing.start = tag->closing.end;
+ }
+
+ auto append_margin = [&](char c) -> void { /* appends `c` to the visible text unless it already ends with `c` or a newline */
+ /* We do care about visible margins only */
+ if (is_visible) {
+ if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') {
+ if (hc->parsed.back() == ' ') {
+ /* We also strip extra spaces at the end, but limiting the start */
+ auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_parsed_offset);
+ auto first = std::find_if(hc->parsed.rbegin(), last,
+ [](auto ch) -> auto {
+ return ch != ' ';
+ });
+ hc->parsed.erase(first.base(), hc->parsed.end());
+ g_assert(hc->parsed.size() >= initial_parsed_offset);
+ }
+ hc->parsed.push_back(c);
+ }
+ }
+ };
+
+ if (tag->id == Tag_BR || tag->id == Tag_HR) { /* line breaks produce a newline and have no content of their own */
+
+ if (!(tag->flags & FL_IGNORE)) {
+ hc->parsed.append("\n");
+ }
+
+ auto ret = tag->content_offset;
+ calculate_final_tag_offsets();
+
+ return ret;
+ }
+ else if ((tag->id == Tag_HEAD && (tag->flags & FL_IGNORE)) || (tag->flags & CM_HEAD)) { /* head content is skipped entirely */
+ auto ret = tag->closing.end;
+ calculate_final_tag_offsets();
+
+ return ret;
+ }
+
+ if ((tag->flags & (FL_COMMENT | FL_XML | FL_IGNORE | CM_HEAD))) {
+ is_visible = false;
+ }
+ else {
+ if (!tag->block) {
+ is_visible = true;
+ }
+ else if (!tag->block->is_visible()) {
+ if (!tag->block->is_transparent()) {
+ is_visible = false;
+ }
+ else {
+ if (tag->block->has_display() &&
+ tag->block->display == css::css_display_value::DISPLAY_HIDDEN) {
+ is_visible = false;
+ }
+ else {
+ is_transparent = true; /* text goes to the visible buffer, but html_append_parsed blanks it with spaces */
+ }
+ }
+ }
+ else {
+ if (tag->block->display == css::css_display_value::DISPLAY_BLOCK) {
+ is_block = true;
+ }
+ else if (tag->block->display == css::css_display_value::DISPLAY_TABLE_ROW) {
+ is_spaces = true;
+ }
+ }
+ }
+
+ if (is_block) { /* leading margin: newline for blocks, space for table rows */
+ append_margin('\n');
+ }
+ else if (is_spaces) {
+ append_margin(' ');
+ }
+
+ goffset cur_offset = tag->content_offset;
+
+ for (auto *cld: tag->children) { /* emit the text before each child, then the child itself */
+ auto enclosed_start = cld->tag_start;
+ goffset initial_part_len = enclosed_start - cur_offset;
+
+ if (initial_part_len > 0) {
+ if (is_visible) {
+ html_append_parsed(hc,
+ {start + cur_offset, std::size_t(initial_part_len)},
+ is_transparent, len, hc->parsed);
+ }
+ else {
+ html_append_parsed(hc,
+ {start + cur_offset, std::size_t(initial_part_len)},
+ is_transparent, len, hc->invisible);
+ }
+ }
+
+ auto next_offset = html_append_tag_content(pool, start, len,
+ hc, cld, exceptions, url_set);
+
+ /* Do not allow shifting back */
+ if (next_offset > cur_offset) {
+ cur_offset = next_offset;
+ }
+ }
+
+ if (cur_offset < tag->closing.start) { /* text between the last child (or the content start) and the closing tag */
+ goffset final_part_len = tag->closing.start - cur_offset;
+
+ if (final_part_len > 0) {
+ if (is_visible) {
+ html_append_parsed(hc,
+ {start + cur_offset, std::size_t(final_part_len)},
+ is_transparent,
+ len,
+ hc->parsed);
+ }
+ else {
+ html_append_parsed(hc,
+ {start + cur_offset, std::size_t(final_part_len)},
+ is_transparent,
+ len,
+ hc->invisible);
+ }
+ }
+ }
+ if (is_block) { /* trailing margin, same rules as the leading one */
+ append_margin('\n');
+ }
+ else if (is_spaces) {
+ append_margin(' ');
+ }
+
+ if (is_visible) {
+ if (tag->id == Tag_A) { /* compare the text displayed for the link with its url */
+ auto written_len = hc->parsed.size() - initial_parsed_offset;
+ html_process_displayed_href_tag(pool, hc,
+ {hc->parsed.data() + initial_parsed_offset, std::size_t(written_len)},
+ tag, exceptions,
+ url_set, initial_parsed_offset);
+ }
+ else if (tag->id == Tag_IMG) {
+ /* Process ALT if presented */
+ auto maybe_alt = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_ALT);
+
+ if (maybe_alt) {
+ if (!hc->parsed.empty() && !g_ascii_isspace(hc->parsed.back())) {
+ /* Add a space */
+ hc->parsed += ' ';
+ }
+
+ hc->parsed.append(maybe_alt.value());
+
+ if (!hc->parsed.empty() && !g_ascii_isspace(hc->parsed.back())) {
+ /* Add a space */
+ hc->parsed += ' ';
+ }
+ }
+ }
+ }
+ else {
+ /* Invisible stuff */
+ if (std::holds_alternative<rspamd_url *>(tag->extra)) {
+ auto *url_enclosed = std::get<rspamd_url *>(tag->extra);
+
+ /*
+ * TODO: when hash is fixed to include flags we need to remove and add
+ * url to the hash set
+ */
+ if (url_enclosed) {
+ url_enclosed->flags |= RSPAMD_URL_FLAG_INVISIBLE;
+ }
+ }
+ }
+
+ calculate_final_tag_offsets();
+
+ return next_tag_offset; /* input offset captured on entry (closing.end, or its fallback when it was unset) */
+}
+
+/**
+ * Parses an HTML part into a tree of tags and extracts its text content.
+ *
+ * The input is scanned by a hand-written state machine that creates tags in
+ * `hc->all_tags` and links them into a tree rooted at `hc->root_tag`.
+ * After the scan, css/block styles are propagated from parents to children and
+ * the tree is rendered: visible text goes to `hc->parsed`, hidden text goes to
+ * `hc->invisible`.
+ *
+ * @param task task that provides the memory pool and (optionally) the config limits
+ * @param in raw HTML; only the first `max_html_len` bytes are parsed when a config is set,
+ *           the remainder is appended to the output as is
+ * @param exceptions forwarded to the content rendering helper
+ * @param url_set optional set used to deduplicate urls found in href-like tags
+ * @param part_urls optional array that receives the urls found in this part
+ * @param allow_css whether the content of `<style>` tags is parsed as css
+ * @param cur_url_order optional counter used to assign an order to newly seen urls
+ * @return parsed content; its deletion is registered as a destructor in the task pool
+ */
+auto html_process_input(struct rspamd_task *task,
+		GByteArray *in,
+		GList **exceptions,
+		khash_t(rspamd_url_hash) * url_set,
+		GPtrArray *part_urls,
+		bool allow_css,
+		std::uint16_t *cur_url_order) -> html_content *
+{
+	const gchar *p, *c, *end, *start;
+	guchar t;
+	auto closing = false;
+	guint obrace = 0, ebrace = 0;
+	struct rspamd_url *url = nullptr;
+	gint href_offset = -1;
+	auto overflow_input = false;
+	struct html_tag *cur_tag = nullptr, *parent_tag = nullptr, cur_closing_tag;
+	struct tag_content_parser_state content_parser_env;
+
+
+	enum {
+		parse_start = 0,
+		content_before_start,
+		tag_begin,
+		sgml_tag,
+		xml_tag,
+		compound_tag,
+		comment_tag,
+		comment_content,
+		sgml_content,
+		tag_content,
+		tag_end_opening,
+		tag_end_closing,
+		html_text_content,
+		xml_tag_end,
+		tag_raw_text,
+		tag_raw_text_less_than,
+		tags_limit_overflow,
+	} state = parse_start;
+
+	enum class html_document_state {
+		doctype,
+		head,
+		body
+	} html_document_state = html_document_state::doctype;
+
+	g_assert(in != NULL);
+	g_assert(task != NULL);
+
+	/* Read the length only after the input has been validated */
+	auto process_size = in->len;
+
+	auto *pool = task->task_pool;
+	auto cur_url_part_order = 0u;
+
+	auto *hc = new html_content;
+	rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc);
+
+	if (task->cfg && in->len > task->cfg->max_html_len) {
+		msg_notice_task("html input is too big: %z, limit is %z",
+				in->len,
+				task->cfg->max_html_len);
+		process_size = task->cfg->max_html_len;
+		overflow_input = true;
+	}
+
+	/*
+	 * Allocates a new tag starting at the current tag start (`c`).
+	 * Returns nullptr (and sets RSPAMD_HTML_FLAG_TOO_MANY_TAGS) when the tags limit is exceeded.
+	 */
+	auto new_tag = [&](int flags = 0) -> struct html_tag *
+	{
+
+		if (hc->all_tags.size() > rspamd::html::max_tags) {
+			hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
+
+			return nullptr;
+		}
+
+		hc->all_tags.emplace_back(std::make_unique<html_tag>());
+		auto *ntag = hc->all_tags.back().get();
+		ntag->tag_start = c - start;
+		ntag->flags = flags;
+
+		if (cur_tag && !(cur_tag->flags & (CM_EMPTY | FL_CLOSED)) && cur_tag != &cur_closing_tag) {
+			parent_tag = cur_tag;
+		}
+
+		return ntag;
+	};
+
+	/*
+	 * Links the freshly parsed opening tag (`cur_tag`) into the tree and
+	 * processes tag specific data (urls, base, images, links, blocks).
+	 */
+	auto process_opening_tag = [&]() {
+		if (cur_tag->id > Tag_UNKNOWN) {
+			if (cur_tag->flags & CM_UNIQUE) {
+				/*
+				 * tags_seen is updated just below, so a set bit here means
+				 * that this unique tag has already occurred earlier
+				 */
+				if (hc->tags_seen[cur_tag->id]) {
+					/* Duplicate tag has been found */
+					hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
+				}
+			}
+			hc->tags_seen[cur_tag->id] = true;
+		}
+
+		/* Shift to the first unclosed tag */
+		auto *pt = parent_tag;
+		while (pt && (pt->flags & FL_CLOSED)) {
+			pt = pt->parent;
+		}
+
+		if (pt) {
+			g_assert(cur_tag != pt);
+			cur_tag->parent = pt;
+			g_assert(cur_tag->parent != &cur_closing_tag);
+			parent_tag = pt;
+			parent_tag->children.push_back(cur_tag);
+		}
+		else {
+			if (hc->root_tag) {
+				if (cur_tag != hc->root_tag) {
+					cur_tag->parent = hc->root_tag;
+					g_assert(cur_tag->parent != cur_tag);
+					hc->root_tag->children.push_back(cur_tag);
+					parent_tag = hc->root_tag;
+				}
+			}
+			else {
+				if (cur_tag->id == Tag_HTML) {
+					hc->root_tag = cur_tag;
+				}
+				else {
+					/* Insert a fake html tag */
+					hc->all_tags.emplace_back(std::make_unique<html_tag>());
+					auto *top_tag = hc->all_tags.back().get();
+					top_tag->tag_start = 0;
+					top_tag->flags = FL_VIRTUAL;
+					top_tag->id = Tag_HTML;
+					top_tag->content_offset = 0;
+					top_tag->children.push_back(cur_tag);
+					cur_tag->parent = top_tag;
+					g_assert(cur_tag->parent != cur_tag);
+					hc->root_tag = top_tag;
+					parent_tag = top_tag;
+				}
+			}
+		}
+
+		if (cur_tag->flags & FL_HREF && html_document_state == html_document_state::body) {
+			auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
+
+			if (maybe_url.has_value()) {
+				url = maybe_url.value();
+
+				if (url_set != NULL) {
+					struct rspamd_url *maybe_existing =
+						rspamd_url_set_add_or_return(url_set, maybe_url.value());
+					if (maybe_existing == maybe_url.value()) {
+						if (cur_url_order) {
+							url->order = (*cur_url_order)++;
+						}
+						url->part_order = cur_url_part_order++;
+						html_process_query_url(pool, url, url_set,
+								part_urls);
+					}
+					else {
+						url = maybe_existing;
+						/* Replace extra as well */
+						cur_tag->extra = maybe_existing;
+						/* Increase count to avoid odd checks failure */
+						url->count++;
+					}
+				}
+				if (part_urls) {
+					g_ptr_array_add(part_urls, url);
+				}
+
+				href_offset = hc->parsed.size();
+			}
+		}
+		else if (cur_tag->id == Tag_BASE) {
+			/*
+			 * Base is allowed only within head tag but HTML is retarded
+			 */
+			auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
+
+			if (maybe_url) {
+				msg_debug_html("got valid base tag");
+				cur_tag->extra = maybe_url.value();
+				cur_tag->flags |= FL_HREF;
+
+				if (hc->base_url == nullptr) {
+					hc->base_url = maybe_url.value();
+				}
+				else {
+					msg_debug_html("ignore redundant base tag");
+				}
+			}
+			else {
+				msg_debug_html("got invalid base tag!");
+			}
+		}
+
+		if (cur_tag->id == Tag_IMG) {
+			html_process_img_tag(pool, cur_tag, hc, url_set,
+					part_urls);
+		}
+		else if (cur_tag->id == Tag_LINK) {
+			html_process_link_tag(pool, cur_tag, hc, url_set,
+					part_urls);
+		}
+
+		if (!(cur_tag->flags & CM_EMPTY)) {
+			html_process_block_tag(pool, cur_tag, hc);
+		}
+		else {
+			/* Implicitly close */
+			cur_tag->flags |= FL_CLOSED;
+		}
+
+		if (cur_tag->flags & FL_CLOSED) {
+			cur_tag->closing.end = cur_tag->content_offset;
+			cur_tag->closing.start = cur_tag->tag_start;
+
+			cur_tag = parent_tag;
+		}
+	};
+
+	p = (const char *) in->data;
+	c = p;
+	end = p + process_size;
+	start = c;
+
+	while (p < end) {
+		t = *p;
+
+		switch (state) {
+		case parse_start:
+			if (t == '<') {
+				state = tag_begin;
+			}
+			else {
+				/* We have no starting tag, so assume that it's content */
+				hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
+				cur_tag = new_tag();
+				html_document_state = html_document_state::body;
+
+				if (cur_tag) {
+					cur_tag->id = Tag_HTML;
+					hc->root_tag = cur_tag;
+					state = content_before_start;
+				}
+				else {
+					state = tags_limit_overflow;
+				}
+			}
+			break;
+		case content_before_start:
+			if (t == '<') {
+				state = tag_begin;
+			}
+			else {
+				p++;
+			}
+			break;
+		case tag_begin:
+			switch (t) {
+			case '<':
+				c = p;
+				p++;
+				closing = FALSE;
+				break;
+			case '!':
+				cur_tag = new_tag(FL_XML | FL_CLOSED);
+				if (cur_tag) {
+					state = sgml_tag;
+				}
+				else {
+					state = tags_limit_overflow;
+				}
+				p++;
+				break;
+			case '?':
+				cur_tag = new_tag(FL_XML | FL_CLOSED);
+				if (cur_tag) {
+					state = xml_tag;
+				}
+				else {
+					state = tags_limit_overflow;
+				}
+				hc->flags |= RSPAMD_HTML_FLAG_XML;
+				p++;
+				break;
+			case '/':
+				closing = TRUE;
+				/* We fill fake closing tag to fill it with the content parser */
+				cur_closing_tag.clear();
+				/*
+				 * For closing tags, we need to find some corresponding opening tag.
+				 * However, at this point we have not even parsed a name, so we
+				 * can not assume anything about balancing, etc.
+				 *
+				 * So we need to ensure that:
+				 * 1) We have some opening tag in the chain cur_tag->parent...
+				 * 2) cur_tag is nullptr - okay, html is just brain damaged
+				 * 3) cur_tag must NOT be equal to cur_closing tag. It means that
+				 * we had some poor closing tag but we still need to find an opening
+				 * tag... Somewhere...
+				 */
+
+				if (cur_tag == &cur_closing_tag) {
+					if (parent_tag != &cur_closing_tag) {
+						cur_closing_tag.parent = parent_tag;
+					}
+					else {
+						cur_closing_tag.parent = nullptr;
+					}
+				}
+				else if (cur_tag && cur_tag->flags & FL_CLOSED) {
+					/* Cur tag is already closed, we should find something else */
+					auto *tmp = cur_tag;
+					while (tmp) {
+						tmp = tmp->parent;
+
+						if (tmp == nullptr || !(tmp->flags & FL_CLOSED)) {
+							break;
+						}
+					}
+
+					cur_closing_tag.parent = tmp;
+				}
+				else {
+					cur_closing_tag.parent = cur_tag;
+				}
+
+				cur_tag = &cur_closing_tag;
+				p++;
+				break;
+			case '>':
+				/* Empty tag */
+				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+				state = html_text_content;
+				continue;
+			default:
+				if (g_ascii_isalpha(t)) {
+					state = tag_content;
+					content_parser_env.reset();
+
+					if (!closing) {
+						cur_tag = new_tag();
+					}
+
+					if (cur_tag) {
+						state = tag_content;
+					}
+					else {
+						state = tags_limit_overflow;
+					}
+				}
+				else {
+					/* Wrong bad tag */
+					state = html_text_content;
+				}
+				break;
+			}
+
+			break;
+
+		case sgml_tag:
+			switch (t) {
+			case '[':
+				state = compound_tag;
+				obrace = 1;
+				ebrace = 0;
+				p++;
+				break;
+			case '-':
+				cur_tag->flags |= FL_COMMENT;
+				state = comment_tag;
+				p++;
+				break;
+			default:
+				state = sgml_content;
+				break;
+			}
+
+			break;
+
+		case xml_tag:
+			if (t == '?') {
+				state = xml_tag_end;
+			}
+			else if (t == '>') {
+				/* Misformed xml tag */
+				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+				state = tag_end_opening;
+				continue;
+			}
+			/* We efficiently ignore xml tags */
+			p++;
+			break;
+
+		case xml_tag_end:
+			if (t == '>') {
+				state = tag_end_opening;
+				cur_tag->content_offset = p - start + 1;
+				continue;
+			}
+			else {
+				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+			}
+			p++;
+			break;
+
+		case compound_tag:
+			if (t == '[') {
+				obrace++;
+			}
+			else if (t == ']') {
+				ebrace++;
+			}
+			else if (t == '>' && obrace == ebrace) {
+				state = tag_end_opening;
+				cur_tag->content_offset = p - start + 1;
+				continue;
+			}
+			p++;
+			break;
+
+		case comment_tag:
+			if (t != '-') {
+				hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+				state = tag_end_opening;
+			}
+			else {
+				p++;
+				ebrace = 0;
+				/*
+				 * https://www.w3.org/TR/2012/WD-html5-20120329/syntax.html#syntax-comments
+				 * ... the text must not start with a single
+				 * U+003E GREATER-THAN SIGN character (>),
+				 * nor start with a "-" (U+002D) character followed by
+				 * a U+003E GREATER-THAN SIGN (>) character,
+				 * nor contain two consecutive U+002D HYPHEN-MINUS
+				 * characters (--), nor end with a "-" (U+002D) character.
+				 */
+				/*
+				 * `p` has just been advanced and might be equal to `end`,
+				 * so check the bounds before looking at the next characters
+				 */
+				if (p + 1 < end && p[0] == '-' && p[1] == '>') {
+					hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+					p++;
+					state = tag_end_opening;
+				}
+				else if (p < end && *p == '>') {
+					hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+					state = tag_end_opening;
+				}
+				else {
+					state = comment_content;
+				}
+			}
+			break;
+
+		case comment_content:
+			if (t == '-') {
+				ebrace++;
+			}
+			else if (t == '>' && ebrace >= 2) {
+				cur_tag->content_offset = p - start + 1;
+				state = tag_end_opening;
+				continue;
+			}
+			else {
+				ebrace = 0;
+			}
+
+			p++;
+			break;
+
+		case html_text_content:
+			if (t != '<') {
+				p++;
+			}
+			else {
+				state = tag_begin;
+			}
+			break;
+
+		case tag_raw_text:
+			if (t == '<') {
+				c = p;
+				state = tag_raw_text_less_than;
+			}
+			p++;
+			break;
+		case tag_raw_text_less_than:
+			if (t == '/') {
+				/* Here are special things: we look for obrace and then ensure
+				 * that if there is any closing brace nearby
+				 * (we look maximum at 30 characters). We also need to ensure
+				 * that we have no special characters, such as punctuation marks and
+				 * so on.
+				 * Basically, we validate the input to be sane.
+				 * Since closing tags must not have attributes, these assumptions
+				 * seems to be reasonable enough for our toy parser.
+				 */
+				gint cur_lookahead = 1;
+				gint max_lookahead = MIN(end - p, 30);
+				bool valid_closing_tag = true;
+
+				if (p + 1 < end && !g_ascii_isalpha(p[1])) {
+					valid_closing_tag = false;
+				}
+				else {
+					while (cur_lookahead < max_lookahead) {
+						gchar tt = p[cur_lookahead];
+						if (tt == '>') {
+							break;
+						}
+						else if (tt < '\n' || tt == ',') {
+							valid_closing_tag = false;
+							break;
+						}
+						cur_lookahead++;
+					}
+
+					if (cur_lookahead == max_lookahead) {
+						valid_closing_tag = false;
+					}
+				}
+
+				if (valid_closing_tag) {
+					/* Shift back */
+					p = c;
+					state = tag_begin;
+				}
+				else {
+					p++;
+					state = tag_raw_text;
+				}
+			}
+			else {
+				p++;
+				state = tag_raw_text;
+			}
+			break;
+		case sgml_content:
+			/* TODO: parse DOCTYPE here */
+			if (t == '>') {
+				cur_tag->content_offset = p - start + 1;
+				state = tag_end_opening;
+			}
+			else {
+				p++;
+			}
+			break;
+
+		case tag_content:
+			html_parse_tag_content(pool, hc, cur_tag, p, content_parser_env);
+
+			if (t == '>') {
+				if (content_parser_env.cur_state != parse_dqvalue && content_parser_env.cur_state != parse_sqvalue) {
+					/* We have a closing element */
+					if (closing) {
+						cur_tag->closing.start = c - start;
+						cur_tag->closing.end = p - start + 1;
+
+						closing = FALSE;
+						state = tag_end_closing;
+					}
+					else {
+						cur_tag->content_offset = p - start + 1;
+						state = tag_end_opening;
+					}
+				}
+				else {
+					/*
+					 * We are in the parse_quoted value state but got
+					 * an unescaped `>` character.
+					 * HTML is written for monkeys, so there are two possibilities:
+					 * 1) We have missing ending quote
+					 * 2) We have unescaped `>` character
+					 * How to distinguish between those possibilities?
+					 * Well, the idea is to do some lookahead and try to find a
+					 * quote. If we can find a quote, we just pretend as we have
+					 * not seen `>` character. Otherwise, we pretend that it is an
+					 * unquoted stuff. This logic is quite fragile but I really
+					 * don't know any better options...
+					 */
+					auto end_quote = content_parser_env.cur_state == parse_sqvalue ? '\'' : '"';
+					if (memchr(p, end_quote, end - p) != nullptr) {
+						/* Unencoded `>` */
+						p++;
+						continue;
+					}
+					else {
+						if (closing) {
+							cur_tag->closing.start = c - start;
+							cur_tag->closing.end = p - start + 1;
+
+							closing = FALSE;
+							state = tag_end_closing;
+						}
+						else {
+							cur_tag->content_offset = p - start + 1;
+							state = tag_end_opening;
+						}
+					}
+				}
+				continue;
+			}
+			p++;
+			break;
+
+		case tag_end_opening:
+			content_parser_env.reset();
+			state = html_text_content;
+
+			if (cur_tag) {
+				if (cur_tag->id == Tag_STYLE || cur_tag->id == Tag_NOSCRIPT || cur_tag->id == Tag_SCRIPT) {
+					state = tag_raw_text;
+				}
+				if (html_document_state == html_document_state::doctype) {
+					if (cur_tag->id == Tag_HEAD || (cur_tag->flags & CM_HEAD)) {
+						html_document_state = html_document_state::head;
+						cur_tag->flags |= FL_IGNORE;
+					}
+					else if (cur_tag->id != Tag_HTML) {
+						html_document_state = html_document_state::body;
+					}
+				}
+				else if (html_document_state == html_document_state::head) {
+					if (!(cur_tag->flags & (CM_EMPTY | CM_HEAD))) {
+						if (parent_tag && (parent_tag->id == Tag_HEAD || !(parent_tag->flags & CM_HEAD))) {
+							/*
+							 * As by standard, we have to close the HEAD tag
+							 * and switch to the body state
+							 */
+							parent_tag->flags |= FL_CLOSED;
+							parent_tag->closing.start = cur_tag->tag_start;
+							parent_tag->closing.end = cur_tag->content_offset;
+
+							html_document_state = html_document_state::body;
+						}
+						else if (cur_tag->id == Tag_BODY) {
+							html_document_state = html_document_state::body;
+						}
+						else {
+							/*
+							 * For propagation in something like
+							 * <title><p><a>ololo</a></p></title> - should be unprocessed
+							 */
+							cur_tag->flags |= CM_HEAD;
+						}
+					}
+				}
+
+				process_opening_tag();
+			}
+
+			p++;
+			c = p;
+			break;
+		case tag_end_closing: {
+			if (cur_tag) {
+
+				if (cur_tag->flags & CM_EMPTY) {
+					/* Ignore closing empty tags */
+					cur_tag->flags |= FL_IGNORE;
+				}
+				if (html_document_state == html_document_state::doctype) {
+				}
+				else if (html_document_state == html_document_state::head) {
+					if (cur_tag->id == Tag_HEAD) {
+						html_document_state = html_document_state::body;
+					}
+				}
+
+				/* cur_tag here is a closing tag */
+				auto *next_cur_tag = html_check_balance(hc, cur_tag,
+						c - start, p - start + 1);
+
+				if (cur_tag->id == Tag_STYLE && allow_css) {
+					auto *opening_tag = cur_tag->parent;
+
+					if (opening_tag && opening_tag->id == Tag_STYLE &&
+						(int) opening_tag->content_offset < opening_tag->closing.start) {
+						auto ret_maybe = rspamd::css::parse_css(pool,
+								{start + opening_tag->content_offset,
+								 opening_tag->closing.start - opening_tag->content_offset},
+								std::move(hc->css_style));
+
+						if (!ret_maybe.has_value()) {
+							if (ret_maybe.error().is_fatal()) {
+								auto err_str = fmt::format(
+									"cannot parse css (error code: {}): {}",
+									static_cast<int>(ret_maybe.error().type),
+									ret_maybe.error().description.value_or("unknown error"));
+								msg_info_pool("%*s", (int) err_str.size(), err_str.data());
+							}
+						}
+						else {
+							hc->css_style = ret_maybe.value();
+						}
+					}
+				}
+
+				if (next_cur_tag != nullptr) {
+					cur_tag = next_cur_tag;
+				}
+				else {
+					/*
+					 * Here, we handle cases like <p>lala</b>...
+					 * So the tag </b> is bogus and unpaired
+					 * However, we need to exclude it from the output of <p> tag
+					 * To do that, we create a fake opening tag and insert that to
+					 * the current opening tag
+					 */
+					auto *cur_opening_tag = cur_tag->parent;
+
+					while (cur_opening_tag && (cur_opening_tag->flags & FL_CLOSED)) {
+						cur_opening_tag = cur_opening_tag->parent;
+					}
+
+					if (!cur_opening_tag) {
+						cur_opening_tag = hc->root_tag;
+					}
+
+					auto &&vtag = std::make_unique<html_tag>();
+					vtag->id = cur_tag->id;
+					vtag->flags = FL_VIRTUAL | FL_CLOSED | cur_tag->flags;
+					vtag->tag_start = cur_tag->closing.start;
+					vtag->content_offset = p - start + 1;
+					vtag->closing = cur_tag->closing;
+					vtag->parent = cur_opening_tag;
+					g_assert(vtag->parent != &cur_closing_tag);
+					cur_opening_tag->children.push_back(vtag.get());
+					hc->all_tags.emplace_back(std::move(vtag));
+					cur_tag = cur_opening_tag;
+					parent_tag = cur_tag->parent;
+					g_assert(cur_tag->parent != &cur_closing_tag);
+				}
+			} /* if cur_tag != nullptr */
+			state = html_text_content;
+			p++;
+			c = p;
+			break;
+		}
+		case tags_limit_overflow:
+			msg_warn_pool("tags limit of %d tags is reached at the position %d;"
+						  " ignoring the rest of the HTML content",
+					(int) hc->all_tags.size(), (int) (p - start));
+			c = p;
+			p = end;
+			break;
+		}
+	}
+
+	/* Close the last tag if the input has ended while it was still open */
+	if (cur_tag && !(cur_tag->flags & FL_CLOSED) && cur_tag != &cur_closing_tag) {
+		cur_closing_tag.parent = cur_tag;
+		cur_closing_tag.id = cur_tag->id;
+		cur_tag = &cur_closing_tag;
+		html_check_balance(hc, cur_tag,
+				end - start, end - start);
+	}
+
+	/* Propagate styles */
+	hc->traverse_block_tags([&hc, &pool](const html_tag *tag) -> bool {
+		if (hc->css_style && tag->id > Tag_UNKNOWN && tag->id < Tag_MAX) {
+			auto *css_block = hc->css_style->check_tag_block(tag);
+
+			if (css_block) {
+				if (tag->block) {
+					tag->block->set_block(*css_block);
+				}
+				else {
+					tag->block = css_block;
+				}
+			}
+		}
+		if (tag->block) {
+			if (!tag->block->has_display()) {
+				/* If we have no display field, we can check it by tag */
+				if (tag->flags & CM_HEAD) {
+					tag->block->set_display(css::css_display_value::DISPLAY_HIDDEN,
+							html_block::set);
+				}
+				else if (tag->flags & (CM_BLOCK | CM_TABLE)) {
+					tag->block->set_display(css::css_display_value::DISPLAY_BLOCK,
+							html_block::implicit);
+				}
+				else if (tag->flags & CM_ROW) {
+					tag->block->set_display(css::css_display_value::DISPLAY_TABLE_ROW,
+							html_block::implicit);
+				}
+				else {
+					tag->block->set_display(css::css_display_value::DISPLAY_INLINE,
+							html_block::implicit);
+				}
+			}
+
+			tag->block->compute_visibility();
+
+			for (const auto *cld_tag: tag->children) {
+
+				if (cld_tag->block) {
+					cld_tag->block->propagate_block(*tag->block);
+				}
+				else {
+					cld_tag->block = rspamd_mempool_alloc0_type(pool, html_block);
+					*cld_tag->block = *tag->block;
+				}
+			}
+		}
+		return true;
+	},
+			html_content::traverse_type::PRE_ORDER);
+
+	/* Leftover before content */
+	switch (state) {
+	case tag_end_opening:
+		if (cur_tag != nullptr) {
+			process_opening_tag();
+		}
+		break;
+	default:
+		/* Do nothing */
+		break;
+	}
+
+	if (!hc->all_tags.empty() && hc->root_tag) {
+		html_append_tag_content(pool, start, end - start, hc, hc->root_tag,
+				exceptions, url_set);
+	}
+
+	/* Leftover after content */
+	switch (state) {
+	case tags_limit_overflow:
+		html_append_parsed(hc, {c, (std::size_t)(end - c)},
+				false, end - start, hc->parsed);
+		break;
+	default:
+		/* Do nothing */
+		break;
+	}
+
+	if (overflow_input) {
+		/*
+		 * Append the rest of the input as raw html, this might work as
+		 * further algorithms can skip words when there are too many.
+		 * It is still unclear about urls though...
+		 */
+		html_append_parsed(hc, {end, in->len - process_size}, false,
+				end - start, hc->parsed);
+	}
+
+	if (!hc->parsed.empty()) {
+		/* Trim extra spaces at the end if needed */
+		if (g_ascii_isspace(hc->parsed.back())) {
+			auto last_it = std::end(hc->parsed);
+
+			/* Allow last newline */
+			if (hc->parsed.back() == '\n') {
+				--last_it;
+			}
+
+			hc->parsed.erase(std::find_if(hc->parsed.rbegin(), hc->parsed.rend(),
+									 [](auto ch) -> auto {
+										 return !g_ascii_isspace(ch);
+									 })
+									 .base(),
+					last_it);
+		}
+	}
+
+	return hc;
+}
+
+/**
+ * Looks up an image of the part by its content id.
+ * Only images flagged as embedded that have a non-null `src` are compared;
+ * std::nullopt is returned when none of them matches `cid`.
+ */
+static auto
+html_find_image_by_cid(const html_content &hc, std::string_view cid)
+	-> std::optional<const html_image *>
+{
+	const auto matches_cid = [&cid](const struct html_image *candidate) -> bool {
+		/* Filter embedded images */
+		if (!(candidate->flags & RSPAMD_HTML_FLAG_IMAGE_EMBEDDED) || candidate->src == nullptr) {
+			return false;
+		}
+
+		return cid == candidate->src;
+	};
+	const auto found = std::find_if(std::begin(hc.images), std::end(hc.images), matches_cid);
+
+	if (found == std::end(hc.images)) {
+		return std::nullopt;
+	}
+
+	return *found;
+}
+
+/**
+ * Renders the tag tree as a flat string for debugging purposes.
+ * Each tag that is neither virtual nor ignored is printed as `<pluses><name>;`
+ * (xml tags are printed as `xml`), where the number of `+` is the nesting level.
+ */
+auto html_debug_structure(const html_content &hc) -> std::string
+{
+	std::string dump;
+
+	if (!hc.root_tag) {
+		return dump;
+	}
+
+	auto describe = [&dump](const html_tag *node, int depth, auto self) -> void {
+		const bool is_printable = !(node->flags & (FL_VIRTUAL | FL_IGNORE));
+
+		if (is_printable) {
+			const std::string prefix(depth, '+');
+
+			if (node->flags & FL_XML) {
+				dump += fmt::format("{}xml;", prefix);
+			}
+			else {
+				dump += fmt::format("{}{};", prefix,
+						html_tags_defs.name_by_id_safe(node->id));
+			}
+			/* Only printed tags add a nesting level for their children */
+			depth++;
+		}
+
+		for (const auto *child: node->children) {
+			self(child, depth, self);
+		}
+	};
+
+	describe(hc.root_tag, 1, describe);
+
+	return dump;
+}
+
+/**
+ * Resolves a tag name to its id using the static tags definitions;
+ * returns std::nullopt when the name is not known.
+ */
+auto html_tag_by_name(const std::string_view &name)
+	-> std::optional<tag_id_t>
+{
+	if (const auto *def = rspamd::html::html_tags_defs.by_name(name); def != nullptr) {
+		return def->id;
+	}
+
+	return std::nullopt;
+}
+
+/**
+ * Returns a view of this tag's content inside the rendered text of `hc`.
+ * The view points into `hc->invisible` when the tag has a block that is not
+ * visible and into `hc->parsed` otherwise; an empty view is returned when
+ * the content offset lies outside of that buffer.
+ */
+auto html_tag::get_content(const struct html_content *hc) const -> std::string_view
+{
+	const bool hidden = block && !block->is_visible();
+	const std::string &storage = hidden ? hc->invisible : hc->parsed;
+
+	if (!(content_offset < storage.size())) {
+		return std::string_view{};
+	}
+
+	/* substr() limits the length to what is left after content_offset */
+	return std::string_view{storage}.substr(content_offset, get_content_length());
+}
+
+}// namespace rspamd::html
+
+/*
+ * C entry point of the HTML parser: forwards all the arguments to
+ * rspamd::html::html_process_input and returns its result as an opaque pointer.
+ */
+void *
+rspamd_html_process_part_full(struct rspamd_task *task,
+		GByteArray *in, GList **exceptions,
+		khash_t(rspamd_url_hash) * url_set,
+		GPtrArray *part_urls,
+		bool allow_css,
+		uint16_t *cur_url_order)
+{
+	using rspamd::html::html_process_input;
+
+	auto *content = html_process_input(task, in, exceptions, url_set,
+			part_urls, allow_css, cur_url_order);
+
+	return content;
+}
+
+/*
+ * Parses an HTML part when only a memory pool is available: a zeroed task
+ * carrying just `pool` is used, no url set, url array or exceptions list is
+ * supplied and css parsing is disabled.
+ */
+void *
+rspamd_html_process_part(rspamd_mempool_t *pool,
+		GByteArray *in)
+{
+	uint16_t url_order = 0;
+	struct rspamd_task stub_task;
+
+	memset(&stub_task, 0, sizeof(stub_task));
+	stub_task.task_pool = pool;
+
+	return rspamd_html_process_part_full(&stub_task, in, NULL,
+			NULL, NULL, FALSE, &url_order);
+}
+
+/* C wrapper around rspamd::html::decode_html_entitles_inplace */
+guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len)
+{
+	const auto decoded_len = rspamd::html::decode_html_entitles_inplace(s, len);
+
+	return decoded_len;
+}
+
+/* Returns the id of the tag called `name` or -1 if there is no such tag */
+gint rspamd_html_tag_by_name(const gchar *name)
+{
+	const auto *def = rspamd::html::html_tags_defs.by_name(name);
+
+	if (def == nullptr) {
+		return -1;
+	}
+
+	return def->id;
+}
+
+/*
+ * Checks whether the tag called `tagname` is marked as seen in the parsed
+ * content `ptr`; unknown tag names yield FALSE.
+ */
+gboolean
+rspamd_html_tag_seen(void *ptr, const gchar *tagname)
+{
+	auto *content = rspamd::html::html_content::from_ptr(ptr);
+
+	g_assert(content != NULL);
+
+	const gint tag_id = rspamd_html_tag_by_name(tagname);
+
+	if (tag_id == -1) {
+		return FALSE;
+	}
+
+	return content->tags_seen[tag_id];
+}
+
+/*
+ * Returns the name of the tag with the given id, or nullptr when the id is
+ * out of the (Tag_UNKNOWN, Tag_MAX) range or has no definition.
+ */
+const gchar *
+rspamd_html_tag_by_id(gint id)
+{
+	const bool in_range = id > Tag_UNKNOWN && id < Tag_MAX;
+
+	if (!in_range) {
+		return nullptr;
+	}
+
+	const auto *def = rspamd::html::html_tags_defs.by_id(id);
+
+	if (def == nullptr) {
+		return nullptr;
+	}
+
+	return def->name.c_str();
+}
+
+/*
+ * Returns the name of the tag `p` (an opaque html_tag pointer) and, when
+ * `len` is not null, stores the length of that name there.
+ */
+const gchar *
+rspamd_html_tag_name(void *p, gsize *len)
+{
+	const auto *html_tag_ptr = reinterpret_cast<rspamd::html::html_tag *>(p);
+	auto name = rspamd::html::html_tags_defs.name_by_id_safe(html_tag_ptr->id);
+
+	if (len != nullptr) {
+		*len = name.size();
+	}
+
+	return name.data();
+}
+
+/*
+ * Finds an embedded image of the parsed content by its content id.
+ *
+ * @param html_content opaque pointer to the parsed content
+ * @param cid content id to look for (not necessarily zero terminated)
+ * @param cid_len length of `cid`
+ * @return the matching image or nullptr if there is no content, no valid cid
+ *         or no embedded image with such a content id
+ */
+struct html_image *
+rspamd_html_find_embedded_image(void *html_content,
+		const char *cid, gsize cid_len)
+{
+	auto *hc = rspamd::html::html_content::from_ptr(html_content);
+
+	/* Guard against a missing content and against a null cid with a non-zero length */
+	if (hc == nullptr || (cid == nullptr && cid_len != 0)) {
+		return nullptr;
+	}
+
+	const auto maybe_img = rspamd::html::html_find_image_by_cid(*hc, {cid, cid_len});
+
+	if (maybe_img) {
+		/* The C API exposes a mutable pointer, so constness is dropped explicitly */
+		return const_cast<struct html_image *>(maybe_img.value());
+	}
+
+	return nullptr;
+}
+
+/*
+ * Makes `dest` point to the rendered (visible) text of the parsed content.
+ * No data is copied; always returns true.
+ */
+bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest)
+{
+	auto *content = rspamd::html::html_content::from_ptr(html_content);
+	auto &text = content->parsed;
+
+	dest->len = text.size();
+	dest->begin = text.data();
+
+	return true;
+}
+
+/* Returns the number of tags stored in the parsed content (0 for a null pointer) */
+gsize rspamd_html_get_tags_count(void *html_content)
+{
+	const auto *content = rspamd::html::html_content::from_ptr(html_content);
+
+	if (content != nullptr) {
+		return content->all_tags.size();
+	}
+
+	return 0;
+} \ No newline at end of file
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h
new file mode 100644
index 0000000..2d34f2a
--- /dev/null
+++ b/src/libserver/html/html.h
@@ -0,0 +1,137 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_HTML_H
+#define RSPAMD_HTML_H
+
+#include "config.h"
+#include "libutil/mem_pool.h"
+#include "libserver/url.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * HTML content flags
+ */
+/* Set by the parser when the input does not start with '<' */
+#define RSPAMD_HTML_FLAG_BAD_START (1 << 0)
+/* Set by the parser on malformed constructs (e.g. an empty `<>` tag or a broken comment/xml tag) */
+#define RSPAMD_HTML_FLAG_BAD_ELEMENTS (1 << 1)
+/* Set by the parser when a `<?` tag is met */
+#define RSPAMD_HTML_FLAG_XML (1 << 2)
+#define RSPAMD_HTML_FLAG_UNBALANCED (1 << 3)
+#define RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS (1 << 4)
+/* Meant to mark a repeated tag that must be unique - NOTE(review): confirm against the parser */
+#define RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS (1 << 5)
+/* Set by the parser when the limit on the number of tags is exceeded */
+#define RSPAMD_HTML_FLAG_TOO_MANY_TAGS (1 << 6)
+#define RSPAMD_HTML_FLAG_HAS_DATA_URLS (1 << 7)
+#define RSPAMD_HTML_FLAG_HAS_ZEROS (1 << 8)
+
+/*
+ * Image flags
+ */
+#define RSPAMD_HTML_FLAG_IMAGE_EMBEDDED (1 << 0)
+#define RSPAMD_HTML_FLAG_IMAGE_EXTERNAL (1 << 1)
+#define RSPAMD_HTML_FLAG_IMAGE_DATA (1 << 2)
+
+
+struct rspamd_image;
+
+/*
+ * Image found in an HTML part
+ */
+struct html_image {
+ guint height;
+ guint width;
+ /* RSPAMD_HTML_FLAG_IMAGE_* bits */
+ guint flags;
+ /* Compared with the content id when embedded images are looked up */
+ gchar *src;
+ struct rspamd_url *url;
+ struct rspamd_image *embedded_image;
+ /* Presumably the html tag this image comes from - TODO confirm */
+ void *tag;
+};
+
+
+/* Forward declaration */
+struct rspamd_task;
+
+/*
+ * Decode HTML entities in text. Text is modified in place.
+ */
+guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len);
+
+/*
+ * Parses an HTML part when only a memory pool is available: a temporary
+ * zeroed task carrying `pool` is used, no url set, url array or exceptions
+ * list is supplied and css parsing is disabled.
+ * Returns an opaque pointer to the parsed html content.
+ */
+void *rspamd_html_process_part(rspamd_mempool_t *pool,
+ GByteArray *in);
+
+/*
+ * Parses an HTML part on behalf of `task`; all the arguments are forwarded
+ * to the C++ parser (rspamd::html::html_process_input).
+ * Returns an opaque pointer to the parsed html content.
+ */
+void *rspamd_html_process_part_full(struct rspamd_task *task,
+ GByteArray *in, GList **exceptions,
+ khash_t(rspamd_url_hash) * url_set,
+ GPtrArray *part_urls,
+ bool allow_css,
+ uint16_t *cur_url_order);
+
+/*
+ * Returns true if a specified tag has been seen in a part
+ */
+gboolean rspamd_html_tag_seen(void *ptr, const gchar *tagname);
+
+/**
+ * Returns name for the specified tag id
+ * @param id
+ * @return
+ */
+const gchar *rspamd_html_tag_by_id(gint id);
+
+/**
+ * Returns HTML tag id by name
+ * @param name
+ * @return
+ */
+gint rspamd_html_tag_by_name(const gchar *name);
+
+/**
+ * Gets a name for a tag
+ * @param tag
+ * @param len
+ * @return
+ */
+const gchar *rspamd_html_tag_name(void *tag, gsize *len);
+
+/**
+ * Find HTML image by content id
+ * @param html_content
+ * @param cid
+ * @param cid_len
+ * @return
+ */
+struct html_image *rspamd_html_find_embedded_image(void *html_content,
+ const char *cid, gsize cid_len);
+
+/**
+ * Stores parsed content in ftok_t structure
+ * @param html_content
+ * @param dest
+ * @return
+ */
+bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest);
+
+/**
+ * Returns number of tags in the html content
+ * @param html_content
+ * @return
+ */
+gsize rspamd_html_get_tags_count(void *html_content);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx
new file mode 100644
index 0000000..3320fd6
--- /dev/null
+++ b/src/libserver/html/html.hxx
@@ -0,0 +1,146 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_HTML_HXX
+#define RSPAMD_HTML_HXX
+#pragma once
+
+#include "config.h"
+#include "libserver/url.h"
+#include "libserver/html/html_tag.hxx"
+#include "libserver/html/html.h"
+#include "libserver/html/html_tags.h"
+
+
+#include <vector>
+#include <memory>
+#include <string>
+#include "function2/function2.hpp"
+
+namespace rspamd::css {
+/* Forward declaration */
+class css_style_sheet;
+}// namespace rspamd::css
+
+namespace rspamd::html {
+
+struct html_block;
+
+/*
+ * Result of parsing a single HTML part: the tags (owned by `all_tags` and
+ * linked into a tree starting at `root_tag`), the rendered text and the
+ * auxiliary data collected while parsing.
+ */
+struct html_content {
+	struct rspamd_url *base_url = nullptr;
+	struct html_tag *root_tag = nullptr;
+	gint flags = 0;
+	std::vector<bool> tags_seen;
+	std::vector<html_image *> images;
+	std::vector<std::unique_ptr<struct html_tag>> all_tags;
+	std::string parsed;
+	std::string invisible;
+	std::shared_ptr<css::css_style_sheet> css_style;
+
+	/* Size the seen-tags map and reserve room for tags and parsed text */
+	html_content()
+	{
+		parsed.reserve(256);
+		all_tags.reserve(128);
+		tags_seen.resize(Tag_MAX, false);
+	}
+
+	/* Destructor callback: deletes the object passed as an opaque pointer */
+	static void html_content_dtor(void *ptr)
+	{
+		auto *self = html_content::from_ptr(ptr);
+		delete self;
+	}
+
+	/* Converts an opaque pointer back to html_content */
+	static auto from_ptr(void *ptr) -> html_content *
+	{
+		return static_cast<html_content *>(ptr);
+	}
+
+	enum class traverse_type {
+		PRE_ORDER,
+		POST_ORDER
+	};
+
+	/*
+	 * Walks the tag tree starting from root_tag and calls `func` for each tag.
+	 * The walk stops as soon as `func` returns false, and false is returned then;
+	 * false is also returned when there is no root tag.
+	 */
+	auto traverse_block_tags(fu2::function<bool(const html_tag *)> &&func,
+			traverse_type how = traverse_type::PRE_ORDER) const -> bool
+	{
+		if (root_tag == nullptr) {
+			return false;
+		}
+
+		/* Parent first, then its children */
+		auto visit_pre_order = [&](const html_tag *node, auto &&self) -> bool {
+			if (!func(node)) {
+				return false;
+			}
+
+			for (const auto *child: node->children) {
+				if (!self(child, self)) {
+					return false;
+				}
+			}
+
+			return true;
+		};
+		/* Children first, then their parent */
+		auto visit_post_order = [&](const html_tag *node, auto &&self) -> bool {
+			for (const auto *child: node->children) {
+				if (!self(child, self)) {
+					return false;
+				}
+			}
+
+			return func(node);
+		};
+
+		if (how == traverse_type::PRE_ORDER) {
+			return visit_pre_order(root_tag, visit_pre_order);
+		}
+		if (how == traverse_type::POST_ORDER) {
+			return visit_post_order(root_tag, visit_post_order);
+		}
+
+		RSPAMD_UNREACHABLE;
+	}
+
+	/*
+	 * Calls `func` for each stored tag that is neither xml nor virtual,
+	 * in the order of all_tags; stops and returns false once `func` returns false.
+	 */
+	auto traverse_all_tags(fu2::function<bool(const html_tag *)> &&func) const -> bool
+	{
+		for (const auto &owned: all_tags) {
+			if (owned->flags & (FL_XML | FL_VIRTUAL)) {
+				continue;
+			}
+
+			if (!func(owned.get())) {
+				return false;
+			}
+		}
+
+		return true;
+	}
+
+private:
+	/* Objects are destroyed only via html_content_dtor */
+	~html_content() = default;
+};
+
+
+/* Returns the id of the tag called `name` or std::nullopt if the name is unknown */
+auto html_tag_by_name(const std::string_view &name) -> std::optional<tag_id_t>;
+/*
+ * Parses the HTML input `in` on behalf of `task` and returns the parsed content;
+ * `url_set`, `part_urls` and `cur_url_order` are used for the urls found in the part,
+ * `allow_css` enables parsing of the `<style>` tags content.
+ */
+auto html_process_input(struct rspamd_task *task,
+ GByteArray *in,
+ GList **exceptions,
+ khash_t(rspamd_url_hash) * url_set,
+ GPtrArray *part_urls,
+ bool allow_css,
+ std::uint16_t *cur_url_order) -> html_content *;
+/* Returns a textual dump of the tags tree (`+` repeated per nesting level, then the tag name and `;`) */
+auto html_debug_structure(const html_content &hc) -> std::string;
+
+}// namespace rspamd::html
+
+#endif//RSPAMD_HTML_HXX
diff --git a/src/libserver/html/html_block.hxx b/src/libserver/html/html_block.hxx
new file mode 100644
index 0000000..f9b5184
--- /dev/null
+++ b/src/libserver/html/html_block.hxx
@@ -0,0 +1,358 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RSPAMD_HTML_BLOCK_HXX
+#define RSPAMD_HTML_BLOCK_HXX
+#pragma once
+
+#include "libserver/css/css_value.hxx"
+#include <cmath>
+#include <limits>
+
+namespace rspamd::html {
+
+/*
+ * Block tag definition
+ *
+ * Stores the presentation properties of a block (colours, sizes, display mode
+ * and font size). Each property has a 2-bit mask that tells where its value
+ * came from (unset / inherited / implicit / set), plus there is a computed
+ * visibility state.
+ *
+ * Sizes use a signed encoding: non-negative values are absolute, negative
+ * values encode percents.
+ */
+struct html_block {
+    rspamd::css::css_color fg_color; /* foreground colour */
+    rspamd::css::css_color bg_color; /* background colour */
+    std::int16_t height;             /* >= 0: absolute value, < 0: percents */
+    std::int16_t width;              /* >= 0: absolute value, < 0: percents */
+    rspamd::css::css_display_value display;
+    std::int8_t font_size;           /* >= 0: absolute value, < 0: percents */
+
+    /* Origin of each property, see the constants below */
+    unsigned fg_color_mask : 2;
+    unsigned bg_color_mask : 2;
+    unsigned height_mask : 2;
+    unsigned width_mask : 2;
+    unsigned font_mask : 2;
+    unsigned display_mask : 2;
+    /* Filled by compute_visibility(): unset (visible), invisible_flag or transparent_flag */
+    unsigned visibility_mask : 2;
+
+    /* Values of the per-property masks; a greater value wins on propagation */
+    constexpr static const auto unset = 0;
+    constexpr static const auto inherited = 1;
+    constexpr static const auto implicit = 1; /* same numeric value as `inherited` */
+    constexpr static const auto set = 3;
+    /* Values of visibility_mask (`unset` means that the block is visible) */
+    constexpr static const auto invisible_flag = 1;
+    constexpr static const auto transparent_flag = 2;
+
+    /* Helpers to set mask when setting the elements */
+
+    /**
+     * Sets the foreground colour and records its origin
+     * @param c colour to store
+     * @param how mask value to record (html_block::set by default)
+     */
+    auto set_fgcolor(const rspamd::css::css_color &c, int how = html_block::set) -> void
+    {
+        fg_color = c;
+        fg_color_mask = how;
+    }
+
+    /**
+     * Sets the background colour and records its origin
+     * @param c colour to store
+     * @param how mask value to record (html_block::set by default)
+     */
+    auto set_bgcolor(const rspamd::css::css_color &c, int how = html_block::set) -> void
+    {
+        bg_color = c;
+        bg_color_mask = how;
+    }
+
+    /**
+     * Sets the height; percents are stored negated
+     * @param h height value
+     * @param is_percent true if `h` is expressed in percents
+     * @param how mask value to record (html_block::set by default)
+     */
+    auto set_height(float h, bool is_percent = false, int how = html_block::set) -> void
+    {
+        h = is_percent ? (-h) : h;
+        if (h < INT16_MIN) {
+            /* Negative numbers encode percents... */
+            /* NOTE(review): set_width saturates to INT16_MIN here, not -100 - confirm which is intended */
+            height = -100;
+        }
+        else if (h > INT16_MAX) {
+            height = INT16_MAX;
+        }
+        else {
+            height = h;
+        }
+        height_mask = how;
+    }
+
+    /**
+     * Sets the width; percents are stored negated, out of range values are
+     * saturated to the limits of int16
+     * @param w width value
+     * @param is_percent true if `w` is expressed in percents
+     * @param how mask value to record (html_block::set by default)
+     */
+    auto set_width(float w, bool is_percent = false, int how = html_block::set) -> void
+    {
+        w = is_percent ? (-w) : w;
+        if (w < INT16_MIN) {
+            width = INT16_MIN;
+        }
+        else if (w > INT16_MAX) {
+            width = INT16_MAX;
+        }
+        else {
+            width = w;
+        }
+        width_mask = how;
+    }
+
+    /**
+     * Sets display from a boolean: true selects DISPLAY_INLINE, false selects
+     * DISPLAY_HIDDEN
+     * @param v desired visibility
+     * @param how mask value to record (html_block::set by default)
+     */
+    auto set_display(bool v, int how = html_block::set) -> void
+    {
+        if (v) {
+            display = rspamd::css::css_display_value::DISPLAY_INLINE;
+        }
+        else {
+            display = rspamd::css::css_display_value::DISPLAY_HIDDEN;
+        }
+        display_mask = how;
+    }
+
+    /**
+     * Sets display to the exact CSS display value
+     * @param v display value to store
+     * @param how mask value to record (html_block::set by default)
+     */
+    auto set_display(rspamd::css::css_display_value v, int how = html_block::set) -> void
+    {
+        display = v;
+        display_mask = how;
+    }
+
+    /**
+     * Sets the font size; percents are stored negated
+     * @param fs font size value
+     * @param is_percent true if `fs` is expressed in percents
+     * @param how mask value to record (html_block::set by default)
+     */
+    auto set_font_size(float fs, bool is_percent = false, int how = html_block::set) -> void
+    {
+        fs = is_percent ? (-fs) : fs;
+        if (fs < INT8_MIN) {
+            font_size = -100;
+        }
+        else if (fs > INT8_MAX) {
+            font_size = INT8_MAX;
+        }
+        else {
+            font_size = fs;
+        }
+        font_mask = how;
+    }
+
+private:
+    /*
+     * Converts a computed size to T, saturating at the limits of T.
+     * A plain conversion of an out of range floating point value to an
+     * integer type is undefined behaviour, and relative sizes can easily
+     * exceed the range (e.g. 1000% of a parent that is 10000 wide).
+     */
+    template<typename T>
+    static constexpr auto clamp_size(double v) -> T
+    {
+        if (v < static_cast<double>(std::numeric_limits<T>::min())) {
+            return std::numeric_limits<T>::min();
+        }
+        if (v > static_cast<double>(std::numeric_limits<T>::max())) {
+            return std::numeric_limits<T>::max();
+        }
+
+        return static_cast<T>(v);
+    }
+
+    /*
+     * Propagation of a plain property: the other value is taken when the other
+     * mask is non-zero and strictly greater than ours, and the property is
+     * then marked as inherited. Returns the resulting mask.
+     */
+    template<typename T, typename MT>
+    static constexpr auto simple_prop(MT mask_val, MT other_mask, T &our_val,
+                                      T other_val) -> MT
+    {
+        if (other_mask && other_mask > mask_val) {
+            our_val = other_val;
+            mask_val = html_block::inherited;
+        }
+
+        return mask_val;
+    }
+
+    /* Sizes propagation logic
+     * We can have multiple cases:
+     * 1) Our size is > 0 and we can use it as is
+     * 2) Parent size is > 0 and our size is undefined, so propagate parent
+     * 3) Parent size is < 0 and our size is undefined - propagate parent
+     * 4) Parent size is > 0 and our size is < 0 - multiply parent by abs(ours)
+     * 5) Parent size is undefined and our size is < 0 - tricky stuff, assume some defaults
+     *
+     * Results of the relative computations are saturated to the range of T.
+     * Returns the resulting mask.
+     */
+    template<typename T, typename MT>
+    static constexpr auto size_prop(MT mask_val, MT other_mask, T &our_val,
+                                    T other_val, T default_val) -> MT
+    {
+        if (mask_val) {
+            /* We have our value */
+            if (our_val < 0) {
+                if (other_mask > 0) {
+                    if (other_val >= 0) {
+                        /* Our percents of the absolute parent size */
+                        our_val = clamp_size<T>(other_val * (-our_val / 100.0));
+                    }
+                    else {
+                        /* Percents of percents: the result is still relative */
+                        our_val = clamp_size<T>(our_val * (-other_val / 100.0));
+                    }
+                }
+                else {
+                    /* Parent value is not defined and our value is relative */
+                    our_val = clamp_size<T>(default_val * (-our_val / 100.0));
+                }
+            }
+            else if (other_mask && other_mask > mask_val) {
+                our_val = other_val;
+                mask_val = html_block::inherited;
+            }
+        }
+        else {
+            /* We propagate parent if defined */
+            if (other_mask && other_mask > mask_val) {
+                our_val = other_val;
+                mask_val = html_block::inherited;
+            }
+            /* Otherwise do nothing */
+        }
+
+        return mask_val;
+    }
+
+public:
+    /**
+     * Propagate values from the block if they are not defined by the current block.
+     * Relative sizes with no defined counterpart in `other` are resolved
+     * against the defaults: 800 for height, 1024 for width and 10 for font size.
+     * @param other block to propagate the values from
+     */
+    auto propagate_block(const html_block &other) -> void
+    {
+        fg_color_mask = html_block::simple_prop(fg_color_mask, other.fg_color_mask,
+                                                fg_color, other.fg_color);
+        bg_color_mask = html_block::simple_prop(bg_color_mask, other.bg_color_mask,
+                                                bg_color, other.bg_color);
+        display_mask = html_block::simple_prop(display_mask, other.display_mask,
+                                               display, other.display);
+
+        height_mask = html_block::size_prop(height_mask, other.height_mask,
+                                            height, other.height, static_cast<std::int16_t>(800));
+        width_mask = html_block::size_prop(width_mask, other.width_mask,
+                                           width, other.width, static_cast<std::int16_t>(1024));
+        font_mask = html_block::size_prop(font_mask, other.font_mask,
+                                          font_size, other.font_size, static_cast<std::int8_t>(10));
+    }
+
+    /*
+     * Set block overriding all inherited values: a property defined in `other`
+     * replaces ours (together with its mask) unless ours is explicitly `set`
+     */
+    auto set_block(const html_block &other) -> void
+    {
+        constexpr auto set_value = [](auto mask_val, auto other_mask, auto &our_val,
+                                      auto other_val) constexpr -> int {
+            if (other_mask && mask_val != html_block::set) {
+                our_val = other_val;
+                mask_val = other_mask;
+            }
+
+            return mask_val;
+        };
+
+        fg_color_mask = set_value(fg_color_mask, other.fg_color_mask, fg_color, other.fg_color);
+        bg_color_mask = set_value(bg_color_mask, other.bg_color_mask, bg_color, other.bg_color);
+        display_mask = set_value(display_mask, other.display_mask, display, other.display);
+        height_mask = set_value(height_mask, other.height_mask, height, other.height);
+        width_mask = set_value(width_mask, other.width_mask, width, other.width);
+        font_mask = set_value(font_mask, other.font_mask, font_size, other.font_size);
+    }
+
+    /**
+     * Computes visibility_mask from the current properties. The checks are
+     * done in the following order:
+     * 1) defined display that is DISPLAY_HIDDEN -> invisible
+     * 2) defined font size that is 0 -> invisible
+     * 3) foreground alpha below 10 or foreground too close to background -> transparent
+     * Otherwise the block is visible.
+     */
+    auto compute_visibility(void) -> void
+    {
+        if (display_mask) {
+            if (display == css::css_display_value::DISPLAY_HIDDEN) {
+                visibility_mask = html_block::invisible_flag;
+
+                return;
+            }
+        }
+
+        if (font_mask) {
+            if (font_size == 0) {
+                visibility_mask = html_block::invisible_flag;
+
+                return;
+            }
+        }
+
+        /* Weighted RGB distance scaled by 256; colours closer than 0.1 are treated as similar */
+        auto is_similar_colors = [](const rspamd::css::css_color &fg, const rspamd::css::css_color &bg) -> bool {
+            constexpr const auto min_visible_diff = 0.1f;
+            auto diff_r = ((float) fg.r - bg.r);
+            auto diff_g = ((float) fg.g - bg.g);
+            auto diff_b = ((float) fg.b - bg.b);
+            auto ravg = ((float) fg.r + bg.r) / 2.0f;
+
+            /* Square diffs */
+            diff_r *= diff_r;
+            diff_g *= diff_g;
+            diff_b *= diff_b;
+
+            auto diff = std::sqrt(2.0f * diff_r + 4.0f * diff_g + 3.0f * diff_b +
+                                  (ravg * (diff_r - diff_b) / 256.0f)) /
+                        256.0f;
+
+            return diff < min_visible_diff;
+        };
+        /* Check if we have both bg/fg colors */
+        if (fg_color_mask && bg_color_mask) {
+            if (fg_color.alpha < 10) {
+                /* Too transparent */
+                visibility_mask = html_block::transparent_flag;
+
+                return;
+            }
+
+            /* Background is compared only if it is opaque enough */
+            if (bg_color.alpha > 10) {
+                if (is_similar_colors(fg_color, bg_color)) {
+                    visibility_mask = html_block::transparent_flag;
+                    return;
+                }
+            }
+        }
+        else if (fg_color_mask) {
+            /* Merely fg color */
+            if (fg_color.alpha < 10) {
+                /* Too transparent */
+                visibility_mask = html_block::transparent_flag;
+
+                return;
+            }
+
+            /* Implicit bg color: compare our fg against white */
+            if (is_similar_colors(fg_color, rspamd::css::css_color::white())) {
+                visibility_mask = html_block::transparent_flag;
+                return;
+            }
+        }
+        else if (bg_color_mask) {
+            /* Merely bg color, implicit fg color: compare black against our bg */
+            if (bg_color.alpha > 10) {
+                if (is_similar_colors(rspamd::css::css_color::black(), bg_color)) {
+                    visibility_mask = html_block::transparent_flag;
+                    return;
+                }
+            }
+        }
+
+        visibility_mask = html_block::unset;
+    }
+
+    /**
+     * @return true if visibility_mask is unset, i.e. no invisible/transparent flag is recorded
+     */
+    constexpr auto is_visible(void) const -> bool
+    {
+        return visibility_mask == html_block::unset;
+    }
+
+    /**
+     * @return true if visibility_mask is exactly transparent_flag
+     */
+    constexpr auto is_transparent(void) const -> bool
+    {
+        return visibility_mask == html_block::transparent_flag;
+    }
+
+    /**
+     * Checks if display is defined with at least the given origin
+     * @param how minimum mask value required (html_block::set by default)
+     * @return true if display_mask >= how
+     */
+    constexpr auto has_display(int how = html_block::set) const -> bool
+    {
+        return display_mask >= how;
+    }
+
+    /**
+     * Returns a default html block for root HTML element: black on white,
+     * inline display (all three marked as inherited), sizes and font size
+     * marked as unset
+     * @return block by value
+     */
+    static auto default_html_block(void) -> html_block
+    {
+        return html_block{.fg_color = rspamd::css::css_color::black(),
+                          .bg_color = rspamd::css::css_color::white(),
+                          .height = 0,
+                          .width = 0,
+                          .display = rspamd::css::css_display_value::DISPLAY_INLINE,
+                          .font_size = 12,
+                          .fg_color_mask = html_block::inherited,
+                          .bg_color_mask = html_block::inherited,
+                          .height_mask = html_block::unset,
+                          .width_mask = html_block::unset,
+                          .font_mask = html_block::unset,
+                          .display_mask = html_block::inherited,
+                          .visibility_mask = html_block::unset};
+    }
+    /**
+     * Produces html block with no defined values allocated from the pool
+     * @param pool memory pool to allocate the block from
+     * @return pointer to the allocated block
+     */
+    static auto undefined_html_block_pool(rspamd_mempool_t *pool) -> html_block *
+    {
+        auto *bl = rspamd_mempool_alloc0_type(pool, html_block);
+
+        return bl;
+    }
+};
+
+}// namespace rspamd::html
+
+#endif//RSPAMD_HTML_BLOCK_HXX
diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx
new file mode 100644
index 0000000..c642536
--- /dev/null
+++ b/src/libserver/html/html_entities.cxx
@@ -0,0 +1,2644 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#include "html_entities.hxx"
+
+#include <string>
+#include <utility>
+#include <vector>
+#include "contrib/ankerl/unordered_dense.h"
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+#include "libutil/cxx/util.hxx"
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+namespace rspamd::html {
+
+struct html_entity_def { /* one row of the HTML entities table */
+ const char *name; /* entity name, e.g. "szlig" */
+ const char *replacement; /* bytes substituted for the entity (UTF-8 encoded in the table rows) */
+ unsigned code; /* numeric code; in the table rows it matches the first code point of `replacement` */
+ bool allow_heuristic; /* true for rows built with ENTITY_DEF_HEUR; NOTE(review): the consumer of this flag is not visible here */
+};
+
+#define ENTITY_DEF(name, code, replacement) \
+ html_entity_def \
+ { \
+ (name), (replacement), (code), false \
+ } /* macro arguments are (name, code, replacement) while the fields go name, replacement, code; allow_heuristic is false */
+#define ENTITY_DEF_HEUR(name, code, replacement) \
+ html_entity_def \
+ { \
+ (name), (replacement), (code), true \
+ } /* same as ENTITY_DEF, but allow_heuristic is true */
+
+static const html_entity_def html_entities_array[] = {
+ ENTITY_DEF_HEUR("szlig", 223, "\xc3\x9f"),
+ ENTITY_DEF("prime", 8242, "\xe2\x80\xb2"),
+ ENTITY_DEF("lnsim", 8934, "\xe2\x8b\xa6"),
+ ENTITY_DEF("nvDash", 8877, "\xe2\x8a\xad"),
+ ENTITY_DEF("isinsv", 8947, "\xe2\x8b\xb3"),
+ ENTITY_DEF("notin", 8713, "\xe2\x88\x89"),
+ ENTITY_DEF("becaus", 8757, "\xe2\x88\xb5"),
+ ENTITY_DEF("Leftrightarrow", 8660, "\xe2\x87\x94"),
+ ENTITY_DEF("EmptySmallSquare", 9723, "\xe2\x97\xbb"),
+ ENTITY_DEF("SquareUnion", 8852, "\xe2\x8a\x94"),
+ ENTITY_DEF("subdot", 10941, "\xe2\xaa\xbd"),
+ ENTITY_DEF("Dstrok", 272, "\xc4\x90"),
+ ENTITY_DEF("rrarr", 8649, "\xe2\x87\x89"),
+ ENTITY_DEF("rArr", 8658, "\xe2\x87\x92"),
+ ENTITY_DEF_HEUR("Aacute", 193, "\xc3\x81"),
+ ENTITY_DEF("kappa", 954, "\xce\xba"),
+ ENTITY_DEF("Iopf", 120128, "\xf0\x9d\x95\x80"),
+ ENTITY_DEF("hyphen", 8208, "\xe2\x80\x90"),
+ ENTITY_DEF("rarrbfs", 10528, "\xe2\xa4\xa0"),
+ ENTITY_DEF("supsetneqq", 10956, "\xe2\xab\x8c"),
+ ENTITY_DEF("gacute", 501, "\xc7\xb5"),
+ ENTITY_DEF("VeryThinSpace", 8202, "\xe2\x80\x8a"),
+ ENTITY_DEF("tint", 8749, "\xe2\x88\xad"),
+ ENTITY_DEF("ffr", 120099, "\xf0\x9d\x94\xa3"),
+ ENTITY_DEF("kgreen", 312, "\xc4\xb8"),
+ ENTITY_DEF("nis", 8956, "\xe2\x8b\xbc"),
+ ENTITY_DEF("NotRightTriangleBar", 10704, "\xe2\xa7\x90\xcc\xb8"),
+ ENTITY_DEF("Eogon", 280, "\xc4\x98"),
+ ENTITY_DEF("lbrke", 10635, "\xe2\xa6\x8b"),
+ ENTITY_DEF("phi", 966, "\xcf\x86"),
+ ENTITY_DEF("notnivc", 8957, "\xe2\x8b\xbd"),
+ ENTITY_DEF("utilde", 361, "\xc5\xa9"),
+ ENTITY_DEF("Fopf", 120125, "\xf0\x9d\x94\xbd"),
+ ENTITY_DEF("Vcy", 1042, "\xd0\x92"),
+ ENTITY_DEF("erDot", 8787, "\xe2\x89\x93"),
+ ENTITY_DEF("nsubE", 10949, "\xe2\xab\x85\xcc\xb8"),
+ ENTITY_DEF_HEUR("egrave", 232, "\xc3\xa8"),
+ ENTITY_DEF("Lcedil", 315, "\xc4\xbb"),
+ ENTITY_DEF("lharul", 10602, "\xe2\xa5\xaa"),
+ ENTITY_DEF_HEUR("middot", 183, "\xc2\xb7"),
+ ENTITY_DEF("ggg", 8921, "\xe2\x8b\x99"),
+ ENTITY_DEF("NestedLessLess", 8810, "\xe2\x89\xaa"),
+ ENTITY_DEF("tau", 964, "\xcf\x84"),
+ ENTITY_DEF("setmn", 8726, "\xe2\x88\x96"),
+ ENTITY_DEF("frac78", 8542, "\xe2\x85\x9e"),
+ ENTITY_DEF_HEUR("para", 182, "\xc2\xb6"),
+ ENTITY_DEF("Rcedil", 342, "\xc5\x96"),
+ ENTITY_DEF("propto", 8733, "\xe2\x88\x9d"),
+ ENTITY_DEF("sqsubset", 8847, "\xe2\x8a\x8f"),
+ ENTITY_DEF("ensp", 8194, "\xe2\x80\x82"),
+ ENTITY_DEF("boxvH", 9578, "\xe2\x95\xaa"),
+ ENTITY_DEF("NotGreaterTilde", 8821, "\xe2\x89\xb5"),
+ ENTITY_DEF("ffllig", 64260, "\xef\xac\x84"),
+ ENTITY_DEF("kcedil", 311, "\xc4\xb7"),
+ ENTITY_DEF("omega", 969, "\xcf\x89"),
+ ENTITY_DEF("sime", 8771, "\xe2\x89\x83"),
+ ENTITY_DEF("LeftTriangleEqual", 8884, "\xe2\x8a\xb4"),
+ ENTITY_DEF("bsemi", 8271, "\xe2\x81\x8f"),
+ ENTITY_DEF("rdquor", 8221, "\xe2\x80\x9d"),
+ ENTITY_DEF("Utilde", 360, "\xc5\xa8"),
+ ENTITY_DEF("bsol", 92, "\x5c"),
+ ENTITY_DEF("risingdotseq", 8787, "\xe2\x89\x93"),
+ ENTITY_DEF("ultri", 9720, "\xe2\x97\xb8"),
+ ENTITY_DEF("rhov", 1009, "\xcf\xb1"),
+ ENTITY_DEF("TildeEqual", 8771, "\xe2\x89\x83"),
+ ENTITY_DEF("jukcy", 1108, "\xd1\x94"),
+ ENTITY_DEF("perp", 8869, "\xe2\x8a\xa5"),
+ ENTITY_DEF("capbrcup", 10825, "\xe2\xa9\x89"),
+ ENTITY_DEF("ltrie", 8884, "\xe2\x8a\xb4"),
+ ENTITY_DEF("LessTilde", 8818, "\xe2\x89\xb2"),
+ ENTITY_DEF("popf", 120161, "\xf0\x9d\x95\xa1"),
+ ENTITY_DEF("dbkarow", 10511, "\xe2\xa4\x8f"),
+ ENTITY_DEF("roang", 10221, "\xe2\x9f\xad"),
+ ENTITY_DEF_HEUR("brvbar", 166, "\xc2\xa6"),
+ ENTITY_DEF("CenterDot", 183, "\xc2\xb7"),
+ ENTITY_DEF("notindot", 8949, "\xe2\x8b\xb5\xcc\xb8"),
+ ENTITY_DEF("supmult", 10946, "\xe2\xab\x82"),
+ ENTITY_DEF("multimap", 8888, "\xe2\x8a\xb8"),
+ ENTITY_DEF_HEUR("frac34", 190, "\xc2\xbe"),
+ ENTITY_DEF("mapsto", 8614, "\xe2\x86\xa6"),
+ ENTITY_DEF("flat", 9837, "\xe2\x99\xad"),
+ ENTITY_DEF("updownarrow", 8597, "\xe2\x86\x95"),
+ ENTITY_DEF("gne", 10888, "\xe2\xaa\x88"),
+ ENTITY_DEF("nrarrc", 10547, "\xe2\xa4\xb3\xcc\xb8"),
+ ENTITY_DEF("suphsol", 10185, "\xe2\x9f\x89"),
+ ENTITY_DEF("nGtv", 8811, "\xe2\x89\xab\xcc\xb8"),
+ ENTITY_DEF("hopf", 120153, "\xf0\x9d\x95\x99"),
+ ENTITY_DEF("pointint", 10773, "\xe2\xa8\x95"),
+ ENTITY_DEF("glj", 10916, "\xe2\xaa\xa4"),
+ ENTITY_DEF("LeftDoubleBracket", 10214, "\xe2\x9f\xa6"),
+ ENTITY_DEF("NotSupersetEqual", 8841, "\xe2\x8a\x89"),
+ ENTITY_DEF("dot", 729, "\xcb\x99"),
+ ENTITY_DEF("tbrk", 9140, "\xe2\x8e\xb4"),
+ ENTITY_DEF("LeftUpDownVector", 10577, "\xe2\xa5\x91"),
+ ENTITY_DEF_HEUR("uml", 168, "\xc2\xa8"),
+ ENTITY_DEF("bbrk", 9141, "\xe2\x8e\xb5"),
+ ENTITY_DEF("nearrow", 8599, "\xe2\x86\x97"),
+ ENTITY_DEF("backsimeq", 8909, "\xe2\x8b\x8d"),
+ ENTITY_DEF("dblac", 733, "\xcb\x9d"),
+ ENTITY_DEF("circleddash", 8861, "\xe2\x8a\x9d"),
+ ENTITY_DEF("ldsh", 8626, "\xe2\x86\xb2"),
+ ENTITY_DEF("sce", 10928, "\xe2\xaa\xb0"),
+ ENTITY_DEF("angst", 197, "\xc3\x85"),
+ ENTITY_DEF_HEUR("yen", 165, "\xc2\xa5"),
+ ENTITY_DEF("nsupE", 10950, "\xe2\xab\x86\xcc\xb8"),
+ ENTITY_DEF("Uscr", 119984, "\xf0\x9d\x92\xb0"),
+ ENTITY_DEF("subplus", 10943, "\xe2\xaa\xbf"),
+ ENTITY_DEF("nleqq", 8806, "\xe2\x89\xa6\xcc\xb8"),
+ ENTITY_DEF("nprcue", 8928, "\xe2\x8b\xa0"),
+ ENTITY_DEF("Ocirc", 212, "\xc3\x94"),
+ ENTITY_DEF("disin", 8946, "\xe2\x8b\xb2"),
+ ENTITY_DEF("EqualTilde", 8770, "\xe2\x89\x82"),
+ ENTITY_DEF("YUcy", 1070, "\xd0\xae"),
+ ENTITY_DEF("Kscr", 119974, "\xf0\x9d\x92\xa6"),
+ ENTITY_DEF("lg", 8822, "\xe2\x89\xb6"),
+ ENTITY_DEF("nLeftrightarrow", 8654, "\xe2\x87\x8e"),
+ ENTITY_DEF("eplus", 10865, "\xe2\xa9\xb1"),
+ ENTITY_DEF("les", 10877, "\xe2\xa9\xbd"),
+ ENTITY_DEF("sfr", 120112, "\xf0\x9d\x94\xb0"),
+ ENTITY_DEF("HumpDownHump", 8782, "\xe2\x89\x8e"),
+ ENTITY_DEF("Fouriertrf", 8497, "\xe2\x84\xb1"),
+ ENTITY_DEF("Updownarrow", 8661, "\xe2\x87\x95"),
+ ENTITY_DEF("nrarr", 8603, "\xe2\x86\x9b"),
+ ENTITY_DEF("radic", 8730, "\xe2\x88\x9a"),
+ ENTITY_DEF("gnap", 10890, "\xe2\xaa\x8a"),
+ ENTITY_DEF("zeta", 950, "\xce\xb6"),
+ ENTITY_DEF("Qscr", 119980, "\xf0\x9d\x92\xac"),
+ ENTITY_DEF("NotRightTriangleEqual", 8941, "\xe2\x8b\xad"),
+ ENTITY_DEF("nshortmid", 8740, "\xe2\x88\xa4"),
+ ENTITY_DEF("SHCHcy", 1065, "\xd0\xa9"),
+ ENTITY_DEF("piv", 982, "\xcf\x96"),
+ ENTITY_DEF("angmsdaa", 10664, "\xe2\xa6\xa8"),
+ ENTITY_DEF("curlywedge", 8911, "\xe2\x8b\x8f"),
+ ENTITY_DEF("sqcaps", 8851, "\xe2\x8a\x93\xef\xb8\x80"),
+ ENTITY_DEF("sum", 8721, "\xe2\x88\x91"),
+ ENTITY_DEF("rarrtl", 8611, "\xe2\x86\xa3"),
+ ENTITY_DEF("gescc", 10921, "\xe2\xaa\xa9"),
+ ENTITY_DEF("sup", 8835, "\xe2\x8a\x83"),
+ ENTITY_DEF("smid", 8739, "\xe2\x88\xa3"),
+ ENTITY_DEF("cularr", 8630, "\xe2\x86\xb6"),
+ ENTITY_DEF("olcross", 10683, "\xe2\xa6\xbb"),
+ ENTITY_DEF_HEUR("GT", 62, "\x3e"),
+ ENTITY_DEF("scap", 10936, "\xe2\xaa\xb8"),
+ ENTITY_DEF("capcup", 10823, "\xe2\xa9\x87"),
+ ENTITY_DEF("NotSquareSubsetEqual", 8930, "\xe2\x8b\xa2"),
+ ENTITY_DEF("uhblk", 9600, "\xe2\x96\x80"),
+ ENTITY_DEF("latail", 10521, "\xe2\xa4\x99"),
+ ENTITY_DEF("smtes", 10924, "\xe2\xaa\xac\xef\xb8\x80"),
+ ENTITY_DEF("RoundImplies", 10608, "\xe2\xa5\xb0"),
+ ENTITY_DEF("wreath", 8768, "\xe2\x89\x80"),
+ ENTITY_DEF("curlyvee", 8910, "\xe2\x8b\x8e"),
+ ENTITY_DEF("uscr", 120010, "\xf0\x9d\x93\x8a"),
+ ENTITY_DEF("nleftrightarrow", 8622, "\xe2\x86\xae"),
+ ENTITY_DEF("ucy", 1091, "\xd1\x83"),
+ ENTITY_DEF("nvge", 8805, "\xe2\x89\xa5\xe2\x83\x92"),
+ ENTITY_DEF("bnot", 8976, "\xe2\x8c\x90"),
+ ENTITY_DEF("alefsym", 8501, "\xe2\x84\xb5"),
+ ENTITY_DEF("star", 9734, "\xe2\x98\x86"),
+ ENTITY_DEF("boxHd", 9572, "\xe2\x95\xa4"),
+ ENTITY_DEF("vsubnE", 10955, "\xe2\xab\x8b\xef\xb8\x80"),
+ ENTITY_DEF("Popf", 8473, "\xe2\x84\x99"),
+ ENTITY_DEF("simgE", 10912, "\xe2\xaa\xa0"),
+ ENTITY_DEF("upsilon", 965, "\xcf\x85"),
+ ENTITY_DEF("NoBreak", 8288, "\xe2\x81\xa0"),
+ ENTITY_DEF("realine", 8475, "\xe2\x84\x9b"),
+ ENTITY_DEF("frac38", 8540, "\xe2\x85\x9c"),
+ ENTITY_DEF("YAcy", 1071, "\xd0\xaf"),
+ ENTITY_DEF("bnequiv", 8801, "\xe2\x89\xa1\xe2\x83\xa5"),
+ ENTITY_DEF("cudarrr", 10549, "\xe2\xa4\xb5"),
+ ENTITY_DEF("lsime", 10893, "\xe2\xaa\x8d"),
+ ENTITY_DEF("lowbar", 95, "\x5f"),
+ ENTITY_DEF("utdot", 8944, "\xe2\x8b\xb0"),
+ ENTITY_DEF("ReverseElement", 8715, "\xe2\x88\x8b"),
+ ENTITY_DEF("nshortparallel", 8742, "\xe2\x88\xa6"),
+ ENTITY_DEF("DJcy", 1026, "\xd0\x82"),
+ ENTITY_DEF("nsube", 8840, "\xe2\x8a\x88"),
+ ENTITY_DEF("VDash", 8875, "\xe2\x8a\xab"),
+ ENTITY_DEF("Ncaron", 327, "\xc5\x87"),
+ ENTITY_DEF("LeftUpVector", 8639, "\xe2\x86\xbf"),
+ ENTITY_DEF("Kcy", 1050, "\xd0\x9a"),
+ ENTITY_DEF("NotLeftTriangleEqual", 8940, "\xe2\x8b\xac"),
+ ENTITY_DEF("nvHarr", 10500, "\xe2\xa4\x84"),
+ ENTITY_DEF("lotimes", 10804, "\xe2\xa8\xb4"),
+ ENTITY_DEF("RightFloor", 8971, "\xe2\x8c\x8b"),
+ ENTITY_DEF("succ", 8827, "\xe2\x89\xbb"),
+ ENTITY_DEF("Ucy", 1059, "\xd0\xa3"),
+ ENTITY_DEF("darr", 8595, "\xe2\x86\x93"),
+ ENTITY_DEF("lbarr", 10508, "\xe2\xa4\x8c"),
+ ENTITY_DEF("xfr", 120117, "\xf0\x9d\x94\xb5"),
+ ENTITY_DEF("zopf", 120171, "\xf0\x9d\x95\xab"),
+ ENTITY_DEF("Phi", 934, "\xce\xa6"),
+ ENTITY_DEF("ord", 10845, "\xe2\xa9\x9d"),
+ ENTITY_DEF("iinfin", 10716, "\xe2\xa7\x9c"),
+ ENTITY_DEF("Xfr", 120091, "\xf0\x9d\x94\x9b"),
+ ENTITY_DEF("qint", 10764, "\xe2\xa8\x8c"),
+ ENTITY_DEF("Upsilon", 933, "\xce\xa5"),
+ ENTITY_DEF("NotSubset", 8834, "\xe2\x8a\x82\xe2\x83\x92"),
+ ENTITY_DEF("gfr", 120100, "\xf0\x9d\x94\xa4"),
+ ENTITY_DEF("notnivb", 8958, "\xe2\x8b\xbe"),
+ ENTITY_DEF("Afr", 120068, "\xf0\x9d\x94\x84"),
+ ENTITY_DEF_HEUR("ge", 8805, "\xe2\x89\xa5"),
+ ENTITY_DEF_HEUR("iexcl", 161, "\xc2\xa1"),
+ ENTITY_DEF("dfr", 120097, "\xf0\x9d\x94\xa1"),
+ ENTITY_DEF("rsaquo", 8250, "\xe2\x80\xba"),
+ ENTITY_DEF("xcap", 8898, "\xe2\x8b\x82"),
+ ENTITY_DEF("Jopf", 120129, "\xf0\x9d\x95\x81"),
+ ENTITY_DEF("Hstrok", 294, "\xc4\xa6"),
+ ENTITY_DEF("ldca", 10550, "\xe2\xa4\xb6"),
+ ENTITY_DEF("lmoust", 9136, "\xe2\x8e\xb0"),
+ ENTITY_DEF("wcirc", 373, "\xc5\xb5"),
+ ENTITY_DEF("DownRightVector", 8641, "\xe2\x87\x81"),
+ ENTITY_DEF("LessFullEqual", 8806, "\xe2\x89\xa6"),
+ ENTITY_DEF("dotsquare", 8865, "\xe2\x8a\xa1"),
+ ENTITY_DEF("zhcy", 1078, "\xd0\xb6"),
+ ENTITY_DEF("mDDot", 8762, "\xe2\x88\xba"),
+ ENTITY_DEF("Prime", 8243, "\xe2\x80\xb3"),
+ ENTITY_DEF("prec", 8826, "\xe2\x89\xba"),
+ ENTITY_DEF("swnwar", 10538, "\xe2\xa4\xaa"),
+ ENTITY_DEF_HEUR("COPY", 169, "\xc2\xa9"),
+ ENTITY_DEF("cong", 8773, "\xe2\x89\x85"),
+ ENTITY_DEF("sacute", 347, "\xc5\x9b"),
+ ENTITY_DEF("Nopf", 8469, "\xe2\x84\x95"),
+ ENTITY_DEF("it", 8290, "\xe2\x81\xa2"),
+ ENTITY_DEF("SOFTcy", 1068, "\xd0\xac"),
+ ENTITY_DEF("uuarr", 8648, "\xe2\x87\x88"),
+ ENTITY_DEF("iota", 953, "\xce\xb9"),
+ ENTITY_DEF("notinE", 8953, "\xe2\x8b\xb9\xcc\xb8"),
+ ENTITY_DEF("jfr", 120103, "\xf0\x9d\x94\xa7"),
+ ENTITY_DEF_HEUR("QUOT", 34, "\x22"),
+ ENTITY_DEF("vsupnE", 10956, "\xe2\xab\x8c\xef\xb8\x80"),
+ ENTITY_DEF_HEUR("igrave", 236, "\xc3\xac"),
+ ENTITY_DEF("bsim", 8765, "\xe2\x88\xbd"),
+ ENTITY_DEF("npreceq", 10927, "\xe2\xaa\xaf\xcc\xb8"),
+ ENTITY_DEF("zcaron", 382, "\xc5\xbe"),
+ ENTITY_DEF("DD", 8517, "\xe2\x85\x85"),
+ ENTITY_DEF("gamma", 947, "\xce\xb3"),
+ ENTITY_DEF("homtht", 8763, "\xe2\x88\xbb"),
+ ENTITY_DEF("NonBreakingSpace", 160, "\xc2\xa0"),
+ ENTITY_DEF("Proportion", 8759, "\xe2\x88\xb7"),
+ ENTITY_DEF("nedot", 8784, "\xe2\x89\x90\xcc\xb8"),
+ ENTITY_DEF("nabla", 8711, "\xe2\x88\x87"),
+ ENTITY_DEF("ac", 8766, "\xe2\x88\xbe"),
+ ENTITY_DEF("nsupe", 8841, "\xe2\x8a\x89"),
+ ENTITY_DEF("ell", 8467, "\xe2\x84\x93"),
+ ENTITY_DEF("boxvR", 9566, "\xe2\x95\x9e"),
+ ENTITY_DEF("LowerRightArrow", 8600, "\xe2\x86\x98"),
+ ENTITY_DEF("boxHu", 9575, "\xe2\x95\xa7"),
+ ENTITY_DEF("lE", 8806, "\xe2\x89\xa6"),
+ ENTITY_DEF("dzigrarr", 10239, "\xe2\x9f\xbf"),
+ ENTITY_DEF("rfloor", 8971, "\xe2\x8c\x8b"),
+ ENTITY_DEF("gneq", 10888, "\xe2\xaa\x88"),
+ ENTITY_DEF("rightleftharpoons", 8652, "\xe2\x87\x8c"),
+ ENTITY_DEF("gtquest", 10876, "\xe2\xa9\xbc"),
+ ENTITY_DEF("searhk", 10533, "\xe2\xa4\xa5"),
+ ENTITY_DEF("gesdoto", 10882, "\xe2\xaa\x82"),
+ ENTITY_DEF("cross", 10007, "\xe2\x9c\x97"),
+ ENTITY_DEF("rdquo", 8221, "\xe2\x80\x9d"),
+ ENTITY_DEF("sqsupset", 8848, "\xe2\x8a\x90"),
+ ENTITY_DEF("divonx", 8903, "\xe2\x8b\x87"),
+ ENTITY_DEF("lat", 10923, "\xe2\xaa\xab"),
+ ENTITY_DEF("rmoustache", 9137, "\xe2\x8e\xb1"),
+ ENTITY_DEF("succapprox", 10936, "\xe2\xaa\xb8"),
+ ENTITY_DEF("nhpar", 10994, "\xe2\xab\xb2"),
+ ENTITY_DEF("sharp", 9839, "\xe2\x99\xaf"),
+ ENTITY_DEF("lrcorner", 8991, "\xe2\x8c\x9f"),
+ ENTITY_DEF("Vscr", 119985, "\xf0\x9d\x92\xb1"),
+ ENTITY_DEF("varsigma", 962, "\xcf\x82"),
+ ENTITY_DEF("bsolb", 10693, "\xe2\xa7\x85"),
+ ENTITY_DEF("cupcap", 10822, "\xe2\xa9\x86"),
+ ENTITY_DEF("leftrightarrow", 8596, "\xe2\x86\x94"),
+ ENTITY_DEF("LeftTee", 8867, "\xe2\x8a\xa3"),
+ ENTITY_DEF("Sqrt", 8730, "\xe2\x88\x9a"),
+ ENTITY_DEF("Odblac", 336, "\xc5\x90"),
+ ENTITY_DEF("ocir", 8858, "\xe2\x8a\x9a"),
+ ENTITY_DEF("eqslantless", 10901, "\xe2\xaa\x95"),
+ ENTITY_DEF("supedot", 10948, "\xe2\xab\x84"),
+ ENTITY_DEF("intercal", 8890, "\xe2\x8a\xba"),
+ ENTITY_DEF("Gbreve", 286, "\xc4\x9e"),
+ ENTITY_DEF("xrArr", 10233, "\xe2\x9f\xb9"),
+ ENTITY_DEF("NotTildeEqual", 8772, "\xe2\x89\x84"),
+ ENTITY_DEF("Bfr", 120069, "\xf0\x9d\x94\x85"),
+ ENTITY_DEF_HEUR("Iuml", 207, "\xc3\x8f"),
+ ENTITY_DEF("leg", 8922, "\xe2\x8b\x9a"),
+ ENTITY_DEF("boxhU", 9576, "\xe2\x95\xa8"),
+ ENTITY_DEF("Gopf", 120126, "\xf0\x9d\x94\xbe"),
+ ENTITY_DEF("af", 8289, "\xe2\x81\xa1"),
+ ENTITY_DEF("xwedge", 8896, "\xe2\x8b\x80"),
+ ENTITY_DEF("precapprox", 10935, "\xe2\xaa\xb7"),
+ ENTITY_DEF("lcedil", 316, "\xc4\xbc"),
+ ENTITY_DEF("between", 8812, "\xe2\x89\xac"),
+ ENTITY_DEF_HEUR("Oslash", 216, "\xc3\x98"),
+ ENTITY_DEF("breve", 728, "\xcb\x98"),
+ ENTITY_DEF("caps", 8745, "\xe2\x88\xa9\xef\xb8\x80"),
+ ENTITY_DEF("vangrt", 10652, "\xe2\xa6\x9c"),
+ ENTITY_DEF("lagran", 8466, "\xe2\x84\x92"),
+ ENTITY_DEF("kopf", 120156, "\xf0\x9d\x95\x9c"),
+ ENTITY_DEF("ReverseUpEquilibrium", 10607, "\xe2\xa5\xaf"),
+ ENTITY_DEF("nlsim", 8820, "\xe2\x89\xb4"),
+ ENTITY_DEF("Cap", 8914, "\xe2\x8b\x92"),
+ ENTITY_DEF("angmsdac", 10666, "\xe2\xa6\xaa"),
+ ENTITY_DEF("iocy", 1105, "\xd1\x91"),
+ ENTITY_DEF("seswar", 10537, "\xe2\xa4\xa9"),
+ ENTITY_DEF("dzcy", 1119, "\xd1\x9f"),
+ ENTITY_DEF("nsubset", 8834, "\xe2\x8a\x82\xe2\x83\x92"),
+ ENTITY_DEF("cup", 8746, "\xe2\x88\xaa"),
+ ENTITY_DEF("npar", 8742, "\xe2\x88\xa6"),
+ ENTITY_DEF("late", 10925, "\xe2\xaa\xad"),
+ ENTITY_DEF("plussim", 10790, "\xe2\xa8\xa6"),
+ ENTITY_DEF("Darr", 8609, "\xe2\x86\xa1"),
+ ENTITY_DEF("nexist", 8708, "\xe2\x88\x84"),
+ ENTITY_DEF_HEUR("cent", 162, "\xc2\xa2"),
+ ENTITY_DEF("khcy", 1093, "\xd1\x85"),
+ ENTITY_DEF("smallsetminus", 8726, "\xe2\x88\x96"),
+ ENTITY_DEF("ycirc", 375, "\xc5\xb7"),
+ ENTITY_DEF("lharu", 8636, "\xe2\x86\xbc"),
+ ENTITY_DEF("upuparrows", 8648, "\xe2\x87\x88"),
+ ENTITY_DEF("sigmaf", 962, "\xcf\x82"),
+ ENTITY_DEF("nltri", 8938, "\xe2\x8b\xaa"),
+ ENTITY_DEF("mstpos", 8766, "\xe2\x88\xbe"),
+ ENTITY_DEF("Zopf", 8484, "\xe2\x84\xa4"),
+ ENTITY_DEF("dwangle", 10662, "\xe2\xa6\xa6"),
+ ENTITY_DEF("bowtie", 8904, "\xe2\x8b\x88"),
+ ENTITY_DEF("Dfr", 120071, "\xf0\x9d\x94\x87"),
+ ENTITY_DEF_HEUR("iacute", 237, "\xc3\xad"),
+ ENTITY_DEF("njcy", 1114, "\xd1\x9a"),
+ ENTITY_DEF("cfr", 120096, "\xf0\x9d\x94\xa0"),
+ ENTITY_DEF("TripleDot", 8411, "\xe2\x83\x9b"),
+ ENTITY_DEF("Or", 10836, "\xe2\xa9\x94"),
+ ENTITY_DEF("blk34", 9619, "\xe2\x96\x93"),
+ ENTITY_DEF("equiv", 8801, "\xe2\x89\xa1"),
+ ENTITY_DEF("fflig", 64256, "\xef\xac\x80"),
+ ENTITY_DEF("Rang", 10219, "\xe2\x9f\xab"),
+ ENTITY_DEF("Wopf", 120142, "\xf0\x9d\x95\x8e"),
+ ENTITY_DEF("boxUl", 9564, "\xe2\x95\x9c"),
+ ENTITY_DEF_HEUR("frac12", 189, "\xc2\xbd"),
+ ENTITY_DEF("clubs", 9827, "\xe2\x99\xa3"),
+ ENTITY_DEF("amalg", 10815, "\xe2\xa8\xbf"),
+ ENTITY_DEF("Lang", 10218, "\xe2\x9f\xaa"),
+ ENTITY_DEF("asymp", 8776, "\xe2\x89\x88"),
+ ENTITY_DEF("models", 8871, "\xe2\x8a\xa7"),
+ ENTITY_DEF("emptyset", 8709, "\xe2\x88\x85"),
+ ENTITY_DEF("Tscr", 119983, "\xf0\x9d\x92\xaf"),
+ ENTITY_DEF("nleftarrow", 8602, "\xe2\x86\x9a"),
+ ENTITY_DEF("Omacr", 332, "\xc5\x8c"),
+ ENTITY_DEF("gtrarr", 10616, "\xe2\xa5\xb8"),
+ ENTITY_DEF("excl", 33, "\x21"),
+ ENTITY_DEF("rarrw", 8605, "\xe2\x86\x9d"),
+ ENTITY_DEF("abreve", 259, "\xc4\x83"),
+ ENTITY_DEF("CircleTimes", 8855, "\xe2\x8a\x97"),
+ ENTITY_DEF("aopf", 120146, "\xf0\x9d\x95\x92"),
+ ENTITY_DEF("eqvparsl", 10725, "\xe2\xa7\xa5"),
+ ENTITY_DEF("boxv", 9474, "\xe2\x94\x82"),
+ ENTITY_DEF("SuchThat", 8715, "\xe2\x88\x8b"),
+ ENTITY_DEF("varphi", 981, "\xcf\x95"),
+ ENTITY_DEF("Ropf", 8477, "\xe2\x84\x9d"),
+ ENTITY_DEF("rscr", 120007, "\xf0\x9d\x93\x87"),
+ ENTITY_DEF("Rrightarrow", 8667, "\xe2\x87\x9b"),
+ ENTITY_DEF("equest", 8799, "\xe2\x89\x9f"),
+ ENTITY_DEF_HEUR("ntilde", 241, "\xc3\xb1"),
+ ENTITY_DEF("Escr", 8496, "\xe2\x84\xb0"),
+ ENTITY_DEF("Lopf", 120131, "\xf0\x9d\x95\x83"),
+ ENTITY_DEF("GreaterGreater", 10914, "\xe2\xaa\xa2"),
+ ENTITY_DEF("pluscir", 10786, "\xe2\xa8\xa2"),
+ ENTITY_DEF("nsupset", 8835, "\xe2\x8a\x83\xe2\x83\x92"),
+ ENTITY_DEF("uArr", 8657, "\xe2\x87\x91"),
+ ENTITY_DEF("nwarhk", 10531, "\xe2\xa4\xa3"),
+ ENTITY_DEF("Ycirc", 374, "\xc5\xb6"),
+ ENTITY_DEF("tdot", 8411, "\xe2\x83\x9b"),
+ ENTITY_DEF("circledS", 9416, "\xe2\x93\x88"),
+ ENTITY_DEF("lhard", 8637, "\xe2\x86\xbd"),
+ ENTITY_DEF("iukcy", 1110, "\xd1\x96"),
+ ENTITY_DEF("PrecedesSlantEqual", 8828, "\xe2\x89\xbc"),
+ ENTITY_DEF("Sfr", 120086, "\xf0\x9d\x94\x96"),
+ ENTITY_DEF("egs", 10902, "\xe2\xaa\x96"),
+ ENTITY_DEF("oelig", 339, "\xc5\x93"),
+ ENTITY_DEF("bigtriangledown", 9661, "\xe2\x96\xbd"),
+ ENTITY_DEF("EmptyVerySmallSquare", 9643, "\xe2\x96\xab"),
+ ENTITY_DEF("Backslash", 8726, "\xe2\x88\x96"),
+ ENTITY_DEF("nscr", 120003, "\xf0\x9d\x93\x83"),
+ ENTITY_DEF("uogon", 371, "\xc5\xb3"),
+ ENTITY_DEF("circeq", 8791, "\xe2\x89\x97"),
+ ENTITY_DEF("check", 10003, "\xe2\x9c\x93"),
+ ENTITY_DEF("Sup", 8913, "\xe2\x8b\x91"),
+ ENTITY_DEF("Rcaron", 344, "\xc5\x98"),
+ ENTITY_DEF("lneqq", 8808, "\xe2\x89\xa8"),
+ ENTITY_DEF("lrhar", 8651, "\xe2\x87\x8b"),
+ ENTITY_DEF("ulcorn", 8988, "\xe2\x8c\x9c"),
+ ENTITY_DEF("timesd", 10800, "\xe2\xa8\xb0"),
+ ENTITY_DEF("Sum", 8721, "\xe2\x88\x91"),
+ ENTITY_DEF("varpropto", 8733, "\xe2\x88\x9d"),
+ ENTITY_DEF("Lcaron", 317, "\xc4\xbd"),
+ ENTITY_DEF("lbrkslu", 10637, "\xe2\xa6\x8d"),
+ ENTITY_DEF_HEUR("AElig", 198, "\xc3\x86"),
+ ENTITY_DEF("varr", 8597, "\xe2\x86\x95"),
+ ENTITY_DEF("nvinfin", 10718, "\xe2\xa7\x9e"),
+ ENTITY_DEF("leq", 8804, "\xe2\x89\xa4"),
+ ENTITY_DEF("biguplus", 10756, "\xe2\xa8\x84"),
+ ENTITY_DEF("rpar", 41, "\x29"),
+ ENTITY_DEF("eng", 331, "\xc5\x8b"),
+ ENTITY_DEF("NegativeThinSpace", 8203, "\xe2\x80\x8b"),
+ ENTITY_DEF("lesssim", 8818, "\xe2\x89\xb2"),
+ ENTITY_DEF("lBarr", 10510, "\xe2\xa4\x8e"),
+ ENTITY_DEF("LeftUpTeeVector", 10592, "\xe2\xa5\xa0"),
+ ENTITY_DEF("gnE", 8809, "\xe2\x89\xa9"),
+ ENTITY_DEF("efr", 120098, "\xf0\x9d\x94\xa2"),
+ ENTITY_DEF("barvee", 8893, "\xe2\x8a\xbd"),
+ ENTITY_DEF("ee", 8519, "\xe2\x85\x87"),
+ ENTITY_DEF("Uogon", 370, "\xc5\xb2"),
+ ENTITY_DEF("gnapprox", 10890, "\xe2\xaa\x8a"),
+ ENTITY_DEF("olcir", 10686, "\xe2\xa6\xbe"),
+ ENTITY_DEF("boxUL", 9565, "\xe2\x95\x9d"),
+ ENTITY_DEF("Gg", 8921, "\xe2\x8b\x99"),
+ ENTITY_DEF("CloseCurlyQuote", 8217, "\xe2\x80\x99"),
+ ENTITY_DEF("leftharpoondown", 8637, "\xe2\x86\xbd"),
+ ENTITY_DEF("vfr", 120115, "\xf0\x9d\x94\xb3"),
+ ENTITY_DEF("gvertneqq", 8809, "\xe2\x89\xa9\xef\xb8\x80"),
+ ENTITY_DEF_HEUR("ouml", 246, "\xc3\xb6"),
+ ENTITY_DEF("raemptyv", 10675, "\xe2\xa6\xb3"),
+ ENTITY_DEF("Zcaron", 381, "\xc5\xbd"),
+ ENTITY_DEF("scE", 10932, "\xe2\xaa\xb4"),
+ ENTITY_DEF("boxvh", 9532, "\xe2\x94\xbc"),
+ ENTITY_DEF("ominus", 8854, "\xe2\x8a\x96"),
+ ENTITY_DEF("oopf", 120160, "\xf0\x9d\x95\xa0"),
+ ENTITY_DEF("nsucceq", 10928, "\xe2\xaa\xb0\xcc\xb8"),
+ ENTITY_DEF("RBarr", 10512, "\xe2\xa4\x90"),
+ ENTITY_DEF("iprod", 10812, "\xe2\xa8\xbc"),
+ ENTITY_DEF("lvnE", 8808, "\xe2\x89\xa8\xef\xb8\x80"),
+ ENTITY_DEF("andand", 10837, "\xe2\xa9\x95"),
+ ENTITY_DEF("upharpoonright", 8638, "\xe2\x86\xbe"),
+ ENTITY_DEF("ncongdot", 10861, "\xe2\xa9\xad\xcc\xb8"),
+ ENTITY_DEF("drcrop", 8972, "\xe2\x8c\x8c"),
+ ENTITY_DEF("nsimeq", 8772, "\xe2\x89\x84"),
+ ENTITY_DEF("subsub", 10965, "\xe2\xab\x95"),
+ ENTITY_DEF("hardcy", 1098, "\xd1\x8a"),
+ ENTITY_DEF("leqslant", 10877, "\xe2\xa9\xbd"),
+ ENTITY_DEF("uharl", 8639, "\xe2\x86\xbf"),
+ ENTITY_DEF("expectation", 8496, "\xe2\x84\xb0"),
+ ENTITY_DEF("mdash", 8212, "\xe2\x80\x94"),
+ ENTITY_DEF("VerticalTilde", 8768, "\xe2\x89\x80"),
+ ENTITY_DEF("rdldhar", 10601, "\xe2\xa5\xa9"),
+ ENTITY_DEF("leftharpoonup", 8636, "\xe2\x86\xbc"),
+ ENTITY_DEF("mu", 956, "\xce\xbc"),
+ ENTITY_DEF("curarrm", 10556, "\xe2\xa4\xbc"),
+ ENTITY_DEF("Cdot", 266, "\xc4\x8a"),
+ ENTITY_DEF("NotTildeTilde", 8777, "\xe2\x89\x89"),
+ ENTITY_DEF("boxul", 9496, "\xe2\x94\x98"),
+ ENTITY_DEF("planckh", 8462, "\xe2\x84\x8e"),
+ ENTITY_DEF("CapitalDifferentialD", 8517, "\xe2\x85\x85"),
+ ENTITY_DEF("boxDL", 9559, "\xe2\x95\x97"),
+ ENTITY_DEF("cupbrcap", 10824, "\xe2\xa9\x88"),
+ ENTITY_DEF("boxdL", 9557, "\xe2\x95\x95"),
+ ENTITY_DEF("supe", 8839, "\xe2\x8a\x87"),
+ ENTITY_DEF("nvlt", 60, "\x3c\xe2\x83\x92"),
+ ENTITY_DEF("par", 8741, "\xe2\x88\xa5"),
+ ENTITY_DEF("InvisibleComma", 8291, "\xe2\x81\xa3"),
+ ENTITY_DEF("ring", 730, "\xcb\x9a"),
+ ENTITY_DEF("nvap", 8781, "\xe2\x89\x8d\xe2\x83\x92"),
+ ENTITY_DEF("veeeq", 8794, "\xe2\x89\x9a"),
+ ENTITY_DEF("Hfr", 8460, "\xe2\x84\x8c"),
+ ENTITY_DEF("dstrok", 273, "\xc4\x91"),
+ ENTITY_DEF("gesles", 10900, "\xe2\xaa\x94"),
+ ENTITY_DEF("dash", 8208, "\xe2\x80\x90"),
+ ENTITY_DEF("SHcy", 1064, "\xd0\xa8"),
+ ENTITY_DEF("congdot", 10861, "\xe2\xa9\xad"),
+ ENTITY_DEF("imagline", 8464, "\xe2\x84\x90"),
+ ENTITY_DEF("ncy", 1085, "\xd0\xbd"),
+ ENTITY_DEF("bigstar", 9733, "\xe2\x98\x85"),
+ ENTITY_DEF_HEUR("REG", 174, "\xc2\xae"),
+ ENTITY_DEF("triangleq", 8796, "\xe2\x89\x9c"),
+ ENTITY_DEF("rsqb", 93, "\x5d"),
+ ENTITY_DEF("ddarr", 8650, "\xe2\x87\x8a"),
+ ENTITY_DEF("csub", 10959, "\xe2\xab\x8f"),
+ ENTITY_DEF("quest", 63, "\x3f"),
+ ENTITY_DEF("Star", 8902, "\xe2\x8b\x86"),
+ ENTITY_DEF_HEUR("LT", 60, "\x3c"),
+ ENTITY_DEF("ncong", 8775, "\xe2\x89\x87"),
+ ENTITY_DEF("prnE", 10933, "\xe2\xaa\xb5"),
+ ENTITY_DEF("bigtriangleup", 9651, "\xe2\x96\xb3"),
+ ENTITY_DEF("Tilde", 8764, "\xe2\x88\xbc"),
+ ENTITY_DEF("ltrif", 9666, "\xe2\x97\x82"),
+ ENTITY_DEF("ldrdhar", 10599, "\xe2\xa5\xa7"),
+ ENTITY_DEF("lcaron", 318, "\xc4\xbe"),
+ ENTITY_DEF("equivDD", 10872, "\xe2\xa9\xb8"),
+ ENTITY_DEF("lHar", 10594, "\xe2\xa5\xa2"),
+ ENTITY_DEF("vBar", 10984, "\xe2\xab\xa8"),
+ ENTITY_DEF("Mopf", 120132, "\xf0\x9d\x95\x84"),
+ ENTITY_DEF("LeftArrow", 8592, "\xe2\x86\x90"),
+ ENTITY_DEF("Rho", 929, "\xce\xa1"),
+ ENTITY_DEF("Ccirc", 264, "\xc4\x88"),
+ ENTITY_DEF("ifr", 120102, "\xf0\x9d\x94\xa6"),
+ ENTITY_DEF("cacute", 263, "\xc4\x87"),
+ ENTITY_DEF("centerdot", 183, "\xc2\xb7"),
+ ENTITY_DEF("dollar", 36, "\x24"),
+ ENTITY_DEF("lang", 10216, "\xe2\x9f\xa8"),
+ ENTITY_DEF("curvearrowright", 8631, "\xe2\x86\xb7"),
+ ENTITY_DEF("half", 189, "\xc2\xbd"),
+ ENTITY_DEF("Ecy", 1069, "\xd0\xad"),
+ ENTITY_DEF("rcub", 125, "\x7d"),
+ ENTITY_DEF("rcy", 1088, "\xd1\x80"),
+ ENTITY_DEF("isins", 8948, "\xe2\x8b\xb4"),
+ ENTITY_DEF("bsolhsub", 10184, "\xe2\x9f\x88"),
+ ENTITY_DEF("boxuL", 9563, "\xe2\x95\x9b"),
+ ENTITY_DEF("shchcy", 1097, "\xd1\x89"),
+ ENTITY_DEF("cwconint", 8754, "\xe2\x88\xb2"),
+ ENTITY_DEF("euro", 8364, "\xe2\x82\xac"),
+ ENTITY_DEF("lesseqqgtr", 10891, "\xe2\xaa\x8b"),
+ ENTITY_DEF("sim", 8764, "\xe2\x88\xbc"),
+ ENTITY_DEF("rarrc", 10547, "\xe2\xa4\xb3"),
+ ENTITY_DEF("boxdl", 9488, "\xe2\x94\x90"),
+ ENTITY_DEF("Epsilon", 917, "\xce\x95"),
+ ENTITY_DEF("iiiint", 10764, "\xe2\xa8\x8c"),
+ ENTITY_DEF("Rightarrow", 8658, "\xe2\x87\x92"),
+ ENTITY_DEF("conint", 8750, "\xe2\x88\xae"),
+ ENTITY_DEF("boxDl", 9558, "\xe2\x95\x96"),
+ ENTITY_DEF("kappav", 1008, "\xcf\xb0"),
+ ENTITY_DEF("profsurf", 8979, "\xe2\x8c\x93"),
+ ENTITY_DEF_HEUR("auml", 228, "\xc3\xa4"),
+ ENTITY_DEF("heartsuit", 9829, "\xe2\x99\xa5"),
+ ENTITY_DEF_HEUR("eacute", 233, "\xc3\xa9"),
+ ENTITY_DEF_HEUR("gt", 62, "\x3e"),
+ ENTITY_DEF("Gcedil", 290, "\xc4\xa2"),
+ ENTITY_DEF("easter", 10862, "\xe2\xa9\xae"),
+ ENTITY_DEF("Tcy", 1058, "\xd0\xa2"),
+ ENTITY_DEF("swarrow", 8601, "\xe2\x86\x99"),
+ ENTITY_DEF("lopf", 120157, "\xf0\x9d\x95\x9d"),
+ ENTITY_DEF("Agrave", 192, "\xc3\x80"),
+ ENTITY_DEF("Aring", 197, "\xc3\x85"),
+ ENTITY_DEF("fpartint", 10765, "\xe2\xa8\x8d"),
+ ENTITY_DEF("xoplus", 10753, "\xe2\xa8\x81"),
+ ENTITY_DEF("LeftDownTeeVector", 10593, "\xe2\xa5\xa1"),
+ ENTITY_DEF("int", 8747, "\xe2\x88\xab"),
+ ENTITY_DEF("Zeta", 918, "\xce\x96"),
+ ENTITY_DEF("loz", 9674, "\xe2\x97\x8a"),
+ ENTITY_DEF("ncup", 10818, "\xe2\xa9\x82"),
+ ENTITY_DEF("napE", 10864, "\xe2\xa9\xb0\xcc\xb8"),
+ ENTITY_DEF("csup", 10960, "\xe2\xab\x90"),
+ ENTITY_DEF("Ncedil", 325, "\xc5\x85"),
+ ENTITY_DEF("cuwed", 8911, "\xe2\x8b\x8f"),
+ ENTITY_DEF("Dot", 168, "\xc2\xa8"),
+ ENTITY_DEF("SquareIntersection", 8851, "\xe2\x8a\x93"),
+ ENTITY_DEF("map", 8614, "\xe2\x86\xa6"),
+ ENTITY_DEF_HEUR("aelig", 230, "\xc3\xa6"),
+ ENTITY_DEF("RightArrow", 8594, "\xe2\x86\x92"),
+ ENTITY_DEF("rightharpoondown", 8641, "\xe2\x87\x81"),
+ ENTITY_DEF("bNot", 10989, "\xe2\xab\xad"),
+ ENTITY_DEF("nsccue", 8929, "\xe2\x8b\xa1"),
+ ENTITY_DEF("zigrarr", 8669, "\xe2\x87\x9d"),
+ ENTITY_DEF("Sacute", 346, "\xc5\x9a"),
+ ENTITY_DEF("orv", 10843, "\xe2\xa9\x9b"),
+ ENTITY_DEF("RightVectorBar", 10579, "\xe2\xa5\x93"),
+ ENTITY_DEF("nrarrw", 8605, "\xe2\x86\x9d\xcc\xb8"),
+ ENTITY_DEF("nbump", 8782, "\xe2\x89\x8e\xcc\xb8"),
+ ENTITY_DEF_HEUR("iquest", 191, "\xc2\xbf"),
+ ENTITY_DEF("wr", 8768, "\xe2\x89\x80"),
+ ENTITY_DEF("UpArrow", 8593, "\xe2\x86\x91"),
+ ENTITY_DEF("notinva", 8713, "\xe2\x88\x89"),
+ ENTITY_DEF("ddagger", 8225, "\xe2\x80\xa1"),
+ ENTITY_DEF("nLeftarrow", 8653, "\xe2\x87\x8d"),
+ ENTITY_DEF("rbbrk", 10099, "\xe2\x9d\xb3"),
+ ENTITY_DEF("RightTriangle", 8883, "\xe2\x8a\xb3"),
+ ENTITY_DEF("leqq", 8806, "\xe2\x89\xa6"),
+ ENTITY_DEF("Vert", 8214, "\xe2\x80\x96"),
+ ENTITY_DEF("gesl", 8923, "\xe2\x8b\x9b\xef\xb8\x80"),
+ ENTITY_DEF("LeftTeeVector", 10586, "\xe2\xa5\x9a"),
+ ENTITY_DEF("Union", 8899, "\xe2\x8b\x83"),
+ ENTITY_DEF("sc", 8827, "\xe2\x89\xbb"),
+ ENTITY_DEF("ofr", 120108, "\xf0\x9d\x94\xac"),
+ ENTITY_DEF("quatint", 10774, "\xe2\xa8\x96"),
+ ENTITY_DEF("apacir", 10863, "\xe2\xa9\xaf"),
+ ENTITY_DEF("profalar", 9006, "\xe2\x8c\xae"),
+ ENTITY_DEF("subsetneq", 8842, "\xe2\x8a\x8a"),
+ ENTITY_DEF("Vvdash", 8874, "\xe2\x8a\xaa"),
+ ENTITY_DEF("ohbar", 10677, "\xe2\xa6\xb5"),
+ ENTITY_DEF("Gt", 8811, "\xe2\x89\xab"),
+ ENTITY_DEF("exist", 8707, "\xe2\x88\x83"),
+ ENTITY_DEF("gtrapprox", 10886, "\xe2\xaa\x86"),
+ ENTITY_DEF_HEUR("euml", 235, "\xc3\xab"),
+ ENTITY_DEF("Equilibrium", 8652, "\xe2\x87\x8c"),
+ ENTITY_DEF_HEUR("aacute", 225, "\xc3\xa1"),
+ ENTITY_DEF("omid", 10678, "\xe2\xa6\xb6"),
+ ENTITY_DEF("loarr", 8701, "\xe2\x87\xbd"),
+ ENTITY_DEF("SucceedsSlantEqual", 8829, "\xe2\x89\xbd"),
+ ENTITY_DEF("angsph", 8738, "\xe2\x88\xa2"),
+ ENTITY_DEF("nsmid", 8740, "\xe2\x88\xa4"),
+ ENTITY_DEF("lsquor", 8218, "\xe2\x80\x9a"),
+ ENTITY_DEF("cemptyv", 10674, "\xe2\xa6\xb2"),
+ ENTITY_DEF("rAarr", 8667, "\xe2\x87\x9b"),
+ ENTITY_DEF("searr", 8600, "\xe2\x86\x98"),
+ ENTITY_DEF("complexes", 8450, "\xe2\x84\x82"),
+ ENTITY_DEF("UnderParenthesis", 9181, "\xe2\x8f\x9d"),
+ ENTITY_DEF("nparsl", 11005, "\xe2\xab\xbd\xe2\x83\xa5"),
+ ENTITY_DEF("Lacute", 313, "\xc4\xb9"),
+ ENTITY_DEF_HEUR("deg", 176, "\xc2\xb0"),
+ ENTITY_DEF("Racute", 340, "\xc5\x94"),
+ ENTITY_DEF("Verbar", 8214, "\xe2\x80\x96"),
+ ENTITY_DEF("sqcups", 8852, "\xe2\x8a\x94\xef\xb8\x80"),
+ ENTITY_DEF("Hopf", 8461, "\xe2\x84\x8d"),
+ ENTITY_DEF("naturals", 8469, "\xe2\x84\x95"),
+ ENTITY_DEF("Cedilla", 184, "\xc2\xb8"),
+ ENTITY_DEF("exponentiale", 8519, "\xe2\x85\x87"),
+ ENTITY_DEF("vnsup", 8835, "\xe2\x8a\x83\xe2\x83\x92"),
+ ENTITY_DEF("leftrightarrows", 8646, "\xe2\x87\x86"),
+ ENTITY_DEF("Laplacetrf", 8466, "\xe2\x84\x92"),
+ ENTITY_DEF("vartriangleright", 8883, "\xe2\x8a\xb3"),
+ ENTITY_DEF("rtri", 9657, "\xe2\x96\xb9"),
+ ENTITY_DEF("gE", 8807, "\xe2\x89\xa7"),
+ ENTITY_DEF("SmallCircle", 8728, "\xe2\x88\x98"),
+ ENTITY_DEF("diamondsuit", 9830, "\xe2\x99\xa6"),
+ ENTITY_DEF_HEUR("Otilde", 213, "\xc3\x95"),
+ ENTITY_DEF("lneq", 10887, "\xe2\xaa\x87"),
+ ENTITY_DEF("lesdoto", 10881, "\xe2\xaa\x81"),
+ ENTITY_DEF("ltquest", 10875, "\xe2\xa9\xbb"),
+ ENTITY_DEF("thinsp", 8201, "\xe2\x80\x89"),
+ ENTITY_DEF("barwed", 8965, "\xe2\x8c\x85"),
+ ENTITY_DEF("elsdot", 10903, "\xe2\xaa\x97"),
+ ENTITY_DEF("circ", 710, "\xcb\x86"),
+ ENTITY_DEF("ni", 8715, "\xe2\x88\x8b"),
+ ENTITY_DEF("mlcp", 10971, "\xe2\xab\x9b"),
+ ENTITY_DEF("Vdash", 8873, "\xe2\x8a\xa9"),
+ ENTITY_DEF("ShortRightArrow", 8594, "\xe2\x86\x92"),
+ ENTITY_DEF("upharpoonleft", 8639, "\xe2\x86\xbf"),
+ ENTITY_DEF("UnderBracket", 9141, "\xe2\x8e\xb5"),
+ ENTITY_DEF("rAtail", 10524, "\xe2\xa4\x9c"),
+ ENTITY_DEF("iopf", 120154, "\xf0\x9d\x95\x9a"),
+ ENTITY_DEF("longleftarrow", 10229, "\xe2\x9f\xb5"),
+ ENTITY_DEF("Zacute", 377, "\xc5\xb9"),
+ ENTITY_DEF("duhar", 10607, "\xe2\xa5\xaf"),
+ ENTITY_DEF("Mfr", 120080, "\xf0\x9d\x94\x90"),
+ ENTITY_DEF("prnap", 10937, "\xe2\xaa\xb9"),
+ ENTITY_DEF("eqcirc", 8790, "\xe2\x89\x96"),
+ ENTITY_DEF("rarrlp", 8620, "\xe2\x86\xac"),
+ ENTITY_DEF("le", 8804, "\xe2\x89\xa4"),
+ ENTITY_DEF("Oscr", 119978, "\xf0\x9d\x92\xaa"),
+ ENTITY_DEF("langd", 10641, "\xe2\xa6\x91"),
+ ENTITY_DEF("Ucirc", 219, "\xc3\x9b"),
+ ENTITY_DEF("precnapprox", 10937, "\xe2\xaa\xb9"),
+ ENTITY_DEF("succcurlyeq", 8829, "\xe2\x89\xbd"),
+ ENTITY_DEF("Tau", 932, "\xce\xa4"),
+ ENTITY_DEF("larr", 8592, "\xe2\x86\x90"),
+ ENTITY_DEF("neArr", 8663, "\xe2\x87\x97"),
+ ENTITY_DEF("subsim", 10951, "\xe2\xab\x87"),
+ ENTITY_DEF("DScy", 1029, "\xd0\x85"),
+ ENTITY_DEF("preccurlyeq", 8828, "\xe2\x89\xbc"),
+ ENTITY_DEF("NotLessLess", 8810, "\xe2\x89\xaa\xcc\xb8"),
+ ENTITY_DEF("succnapprox", 10938, "\xe2\xaa\xba"),
+ ENTITY_DEF("prcue", 8828, "\xe2\x89\xbc"),
+ ENTITY_DEF("Downarrow", 8659, "\xe2\x87\x93"),
+ ENTITY_DEF("angmsdah", 10671, "\xe2\xa6\xaf"),
+ ENTITY_DEF("Emacr", 274, "\xc4\x92"),
+ ENTITY_DEF("lsh", 8624, "\xe2\x86\xb0"),
+ ENTITY_DEF("simne", 8774, "\xe2\x89\x86"),
+ ENTITY_DEF("Bumpeq", 8782, "\xe2\x89\x8e"),
+ ENTITY_DEF("RightUpTeeVector", 10588, "\xe2\xa5\x9c"),
+ ENTITY_DEF("Sigma", 931, "\xce\xa3"),
+ ENTITY_DEF("nvltrie", 8884, "\xe2\x8a\xb4\xe2\x83\x92"),
+ ENTITY_DEF("lfr", 120105, "\xf0\x9d\x94\xa9"),
+ ENTITY_DEF("emsp13", 8196, "\xe2\x80\x84"),
+ ENTITY_DEF("parsl", 11005, "\xe2\xab\xbd"),
+ ENTITY_DEF_HEUR("ucirc", 251, "\xc3\xbb"),
+ ENTITY_DEF("gsiml", 10896, "\xe2\xaa\x90"),
+ ENTITY_DEF("xsqcup", 10758, "\xe2\xa8\x86"),
+ ENTITY_DEF("Omicron", 927, "\xce\x9f"),
+ ENTITY_DEF("gsime", 10894, "\xe2\xaa\x8e"),
+ ENTITY_DEF("circlearrowleft", 8634, "\xe2\x86\xba"),
+ ENTITY_DEF("sqsupe", 8850, "\xe2\x8a\x92"),
+ ENTITY_DEF("supE", 10950, "\xe2\xab\x86"),
+ ENTITY_DEF("dlcrop", 8973, "\xe2\x8c\x8d"),
+ ENTITY_DEF("RightDownTeeVector", 10589, "\xe2\xa5\x9d"),
+ ENTITY_DEF("Colone", 10868, "\xe2\xa9\xb4"),
+ ENTITY_DEF("awconint", 8755, "\xe2\x88\xb3"),
+ ENTITY_DEF("smte", 10924, "\xe2\xaa\xac"),
+ ENTITY_DEF("lEg", 10891, "\xe2\xaa\x8b"),
+ ENTITY_DEF("circledast", 8859, "\xe2\x8a\x9b"),
+ ENTITY_DEF("ecolon", 8789, "\xe2\x89\x95"),
+ ENTITY_DEF("rect", 9645, "\xe2\x96\xad"),
+ ENTITY_DEF("Equal", 10869, "\xe2\xa9\xb5"),
+ ENTITY_DEF("nwnear", 10535, "\xe2\xa4\xa7"),
+ ENTITY_DEF("capdot", 10816, "\xe2\xa9\x80"),
+ ENTITY_DEF("straightphi", 981, "\xcf\x95"),
+ ENTITY_DEF("forkv", 10969, "\xe2\xab\x99"),
+ ENTITY_DEF("ZHcy", 1046, "\xd0\x96"),
+ ENTITY_DEF("Element", 8712, "\xe2\x88\x88"),
+ ENTITY_DEF("rthree", 8908, "\xe2\x8b\x8c"),
+ ENTITY_DEF("vzigzag", 10650, "\xe2\xa6\x9a"),
+ ENTITY_DEF("hybull", 8259, "\xe2\x81\x83"),
+ ENTITY_DEF("intprod", 10812, "\xe2\xa8\xbc"),
+ ENTITY_DEF("HumpEqual", 8783, "\xe2\x89\x8f"),
+ ENTITY_DEF("bigsqcup", 10758, "\xe2\xa8\x86"),
+ ENTITY_DEF("mp", 8723, "\xe2\x88\x93"),
+ ENTITY_DEF("lescc", 10920, "\xe2\xaa\xa8"),
+ ENTITY_DEF("NotPrecedes", 8832, "\xe2\x8a\x80"),
+ ENTITY_DEF("wedge", 8743, "\xe2\x88\xa7"),
+ ENTITY_DEF("Supset", 8913, "\xe2\x8b\x91"),
+ ENTITY_DEF("pm", 177, "\xc2\xb1"),
+ ENTITY_DEF("kfr", 120104, "\xf0\x9d\x94\xa8"),
+ ENTITY_DEF("ufisht", 10622, "\xe2\xa5\xbe"),
+ ENTITY_DEF("ecaron", 283, "\xc4\x9b"),
+ ENTITY_DEF("chcy", 1095, "\xd1\x87"),
+ ENTITY_DEF("Esim", 10867, "\xe2\xa9\xb3"),
+ ENTITY_DEF("fltns", 9649, "\xe2\x96\xb1"),
+ ENTITY_DEF("nsce", 10928, "\xe2\xaa\xb0\xcc\xb8"),
+ ENTITY_DEF("hookrightarrow", 8618, "\xe2\x86\xaa"),
+ ENTITY_DEF("semi", 59, "\x3b"),
+ ENTITY_DEF("ges", 10878, "\xe2\xa9\xbe"),
+ ENTITY_DEF("approxeq", 8778, "\xe2\x89\x8a"),
+ ENTITY_DEF("rarrsim", 10612, "\xe2\xa5\xb4"),
+ ENTITY_DEF("boxhD", 9573, "\xe2\x95\xa5"),
+ ENTITY_DEF("varpi", 982, "\xcf\x96"),
+ ENTITY_DEF("larrb", 8676, "\xe2\x87\xa4"),
+ ENTITY_DEF("copf", 120148, "\xf0\x9d\x95\x94"),
+ ENTITY_DEF("Dopf", 120123, "\xf0\x9d\x94\xbb"),
+ ENTITY_DEF("LeftVector", 8636, "\xe2\x86\xbc"),
+ ENTITY_DEF("iff", 8660, "\xe2\x87\x94"),
+ ENTITY_DEF("lnap", 10889, "\xe2\xaa\x89"),
+ ENTITY_DEF("NotGreaterFullEqual", 8807, "\xe2\x89\xa7\xcc\xb8"),
+ ENTITY_DEF("varrho", 1009, "\xcf\xb1"),
+ ENTITY_DEF("NotSucceeds", 8833, "\xe2\x8a\x81"),
+ ENTITY_DEF("ltrPar", 10646, "\xe2\xa6\x96"),
+ ENTITY_DEF("nlE", 8806, "\xe2\x89\xa6\xcc\xb8"),
+ ENTITY_DEF("Zfr", 8488, "\xe2\x84\xa8"),
+ ENTITY_DEF("LeftArrowBar", 8676, "\xe2\x87\xa4"),
+ ENTITY_DEF("boxplus", 8862, "\xe2\x8a\x9e"),
+ ENTITY_DEF("sqsube", 8849, "\xe2\x8a\x91"),
+ ENTITY_DEF("Re", 8476, "\xe2\x84\x9c"),
+ ENTITY_DEF("Wfr", 120090, "\xf0\x9d\x94\x9a"),
+ ENTITY_DEF("epsi", 949, "\xce\xb5"),
+ ENTITY_DEF("oacute", 243, "\xc3\xb3"),
+ ENTITY_DEF("bdquo", 8222, "\xe2\x80\x9e"),
+ ENTITY_DEF("wscr", 120012, "\xf0\x9d\x93\x8c"),
+ ENTITY_DEF("bullet", 8226, "\xe2\x80\xa2"),
+ ENTITY_DEF("frown", 8994, "\xe2\x8c\xa2"),
+ ENTITY_DEF("siml", 10909, "\xe2\xaa\x9d"),
+ ENTITY_DEF("Rarr", 8608, "\xe2\x86\xa0"),
+ ENTITY_DEF("Scaron", 352, "\xc5\xa0"),
+ ENTITY_DEF("gtreqqless", 10892, "\xe2\xaa\x8c"),
+ ENTITY_DEF("Larr", 8606, "\xe2\x86\x9e"),
+ ENTITY_DEF("notniva", 8716, "\xe2\x88\x8c"),
+ ENTITY_DEF("gg", 8811, "\xe2\x89\xab"),
+ ENTITY_DEF("phmmat", 8499, "\xe2\x84\xb3"),
+ ENTITY_DEF("boxVL", 9571, "\xe2\x95\xa3"),
+ ENTITY_DEF("sigmav", 962, "\xcf\x82"),
+ ENTITY_DEF("order", 8500, "\xe2\x84\xb4"),
+ ENTITY_DEF("subsup", 10963, "\xe2\xab\x93"),
+ ENTITY_DEF("afr", 120094, "\xf0\x9d\x94\x9e"),
+ ENTITY_DEF("lbrace", 123, "\x7b"),
+ ENTITY_DEF("urcorn", 8989, "\xe2\x8c\x9d"),
+ ENTITY_DEF("Im", 8465, "\xe2\x84\x91"),
+ ENTITY_DEF("CounterClockwiseContourIntegral", 8755, "\xe2\x88\xb3"),
+ ENTITY_DEF("lne", 10887, "\xe2\xaa\x87"),
+ ENTITY_DEF("chi", 967, "\xcf\x87"),
+ ENTITY_DEF("cudarrl", 10552, "\xe2\xa4\xb8"),
+ ENTITY_DEF("ang", 8736, "\xe2\x88\xa0"),
+ ENTITY_DEF("isindot", 8949, "\xe2\x8b\xb5"),
+ ENTITY_DEF("Lfr", 120079, "\xf0\x9d\x94\x8f"),
+ ENTITY_DEF("Rsh", 8625, "\xe2\x86\xb1"),
+ ENTITY_DEF("Ocy", 1054, "\xd0\x9e"),
+ ENTITY_DEF("nvrArr", 10499, "\xe2\xa4\x83"),
+ ENTITY_DEF("otimes", 8855, "\xe2\x8a\x97"),
+ ENTITY_DEF("eqslantgtr", 10902, "\xe2\xaa\x96"),
+ ENTITY_DEF("Rfr", 8476, "\xe2\x84\x9c"),
+ ENTITY_DEF("blacktriangleleft", 9666, "\xe2\x97\x82"),
+ ENTITY_DEF("Lsh", 8624, "\xe2\x86\xb0"),
+ ENTITY_DEF("boxvr", 9500, "\xe2\x94\x9c"),
+ ENTITY_DEF("scedil", 351, "\xc5\x9f"),
+ ENTITY_DEF_HEUR("iuml", 239, "\xc3\xaf"),
+ ENTITY_DEF("NJcy", 1034, "\xd0\x8a"),
+ ENTITY_DEF("Dagger", 8225, "\xe2\x80\xa1"),
+ ENTITY_DEF("rarrap", 10613, "\xe2\xa5\xb5"),
+ ENTITY_DEF("udblac", 369, "\xc5\xb1"),
+ ENTITY_DEF("Sopf", 120138, "\xf0\x9d\x95\x8a"),
+ ENTITY_DEF("scnsim", 8937, "\xe2\x8b\xa9"),
+ ENTITY_DEF("hbar", 8463, "\xe2\x84\x8f"),
+ ENTITY_DEF("frac15", 8533, "\xe2\x85\x95"),
+ ENTITY_DEF_HEUR("sup3", 179, "\xc2\xb3"),
+ ENTITY_DEF("NegativeThickSpace", 8203, "\xe2\x80\x8b"),
+ ENTITY_DEF("npr", 8832, "\xe2\x8a\x80"),
+ ENTITY_DEF("doteq", 8784, "\xe2\x89\x90"),
+ ENTITY_DEF("subrarr", 10617, "\xe2\xa5\xb9"),
+ ENTITY_DEF("SquareSubset", 8847, "\xe2\x8a\x8f"),
+ ENTITY_DEF("vprop", 8733, "\xe2\x88\x9d"),
+ ENTITY_DEF("OpenCurlyQuote", 8216, "\xe2\x80\x98"),
+ ENTITY_DEF("supseteq", 8839, "\xe2\x8a\x87"),
+ ENTITY_DEF("nRightarrow", 8655, "\xe2\x87\x8f"),
+ ENTITY_DEF("Longleftarrow", 10232, "\xe2\x9f\xb8"),
+ ENTITY_DEF("lsquo", 8216, "\xe2\x80\x98"),
+ ENTITY_DEF("hstrok", 295, "\xc4\xa7"),
+ ENTITY_DEF("NotTilde", 8769, "\xe2\x89\x81"),
+ ENTITY_DEF("ogt", 10689, "\xe2\xa7\x81"),
+ ENTITY_DEF("block", 9608, "\xe2\x96\x88"),
+ ENTITY_DEF("minusd", 8760, "\xe2\x88\xb8"),
+ ENTITY_DEF("esdot", 8784, "\xe2\x89\x90"),
+ ENTITY_DEF("nsim", 8769, "\xe2\x89\x81"),
+ ENTITY_DEF("scsim", 8831, "\xe2\x89\xbf"),
+ ENTITY_DEF("boxVl", 9570, "\xe2\x95\xa2"),
+ ENTITY_DEF("ltimes", 8905, "\xe2\x8b\x89"),
+ ENTITY_DEF("thkap", 8776, "\xe2\x89\x88"),
+ ENTITY_DEF("vnsub", 8834, "\xe2\x8a\x82\xe2\x83\x92"),
+ ENTITY_DEF("thetasym", 977, "\xcf\x91"),
+ ENTITY_DEF("eopf", 120150, "\xf0\x9d\x95\x96"),
+ ENTITY_DEF("image", 8465, "\xe2\x84\x91"),
+ ENTITY_DEF("doteqdot", 8785, "\xe2\x89\x91"),
+ ENTITY_DEF("Udblac", 368, "\xc5\xb0"),
+ ENTITY_DEF("gnsim", 8935, "\xe2\x8b\xa7"),
+ ENTITY_DEF("yicy", 1111, "\xd1\x97"),
+ ENTITY_DEF("vopf", 120167, "\xf0\x9d\x95\xa7"),
+ ENTITY_DEF("DDotrahd", 10513, "\xe2\xa4\x91"),
+ ENTITY_DEF("Iota", 921, "\xce\x99"),
+ ENTITY_DEF("GJcy", 1027, "\xd0\x83"),
+ ENTITY_DEF("rightthreetimes", 8908, "\xe2\x8b\x8c"),
+ ENTITY_DEF("nrtri", 8939, "\xe2\x8b\xab"),
+ ENTITY_DEF("TildeFullEqual", 8773, "\xe2\x89\x85"),
+ ENTITY_DEF("Dcaron", 270, "\xc4\x8e"),
+ ENTITY_DEF("ccaron", 269, "\xc4\x8d"),
+ ENTITY_DEF("lacute", 314, "\xc4\xba"),
+ ENTITY_DEF("VerticalBar", 8739, "\xe2\x88\xa3"),
+ ENTITY_DEF("Igrave", 204, "\xc3\x8c"),
+ ENTITY_DEF("boxH", 9552, "\xe2\x95\x90"),
+ ENTITY_DEF("Pfr", 120083, "\xf0\x9d\x94\x93"),
+ ENTITY_DEF("equals", 61, "\x3d"),
+ ENTITY_DEF("rbrack", 93, "\x5d"),
+ ENTITY_DEF("OverParenthesis", 9180, "\xe2\x8f\x9c"),
+ ENTITY_DEF("in", 8712, "\xe2\x88\x88"),
+ ENTITY_DEF("llcorner", 8990, "\xe2\x8c\x9e"),
+ ENTITY_DEF("mcomma", 10793, "\xe2\xa8\xa9"),
+ ENTITY_DEF("NotGreater", 8815, "\xe2\x89\xaf"),
+ ENTITY_DEF("midcir", 10992, "\xe2\xab\xb0"),
+ ENTITY_DEF("Edot", 278, "\xc4\x96"),
+ ENTITY_DEF("oplus", 8853, "\xe2\x8a\x95"),
+ ENTITY_DEF("geqq", 8807, "\xe2\x89\xa7"),
+ ENTITY_DEF("curvearrowleft", 8630, "\xe2\x86\xb6"),
+ ENTITY_DEF("Poincareplane", 8460, "\xe2\x84\x8c"),
+ ENTITY_DEF("yscr", 120014, "\xf0\x9d\x93\x8e"),
+ ENTITY_DEF("ccaps", 10829, "\xe2\xa9\x8d"),
+ ENTITY_DEF("rpargt", 10644, "\xe2\xa6\x94"),
+ ENTITY_DEF("topfork", 10970, "\xe2\xab\x9a"),
+ ENTITY_DEF("Gamma", 915, "\xce\x93"),
+ ENTITY_DEF("umacr", 363, "\xc5\xab"),
+ ENTITY_DEF("frac13", 8531, "\xe2\x85\x93"),
+ ENTITY_DEF("cirfnint", 10768, "\xe2\xa8\x90"),
+ ENTITY_DEF("xlArr", 10232, "\xe2\x9f\xb8"),
+ ENTITY_DEF("digamma", 989, "\xcf\x9d"),
+ ENTITY_DEF("Hat", 94, "\x5e"),
+ ENTITY_DEF("lates", 10925, "\xe2\xaa\xad\xef\xb8\x80"),
+ ENTITY_DEF("lgE", 10897, "\xe2\xaa\x91"),
+ ENTITY_DEF("commat", 64, "\x40"),
+ ENTITY_DEF("NotPrecedesSlantEqual", 8928, "\xe2\x8b\xa0"),
+ ENTITY_DEF("phone", 9742, "\xe2\x98\x8e"),
+ ENTITY_DEF("Ecirc", 202, "\xc3\x8a"),
+ ENTITY_DEF_HEUR("lt", 60, "\x3c"),
+ ENTITY_DEF("intcal", 8890, "\xe2\x8a\xba"),
+ ENTITY_DEF("xdtri", 9661, "\xe2\x96\xbd"),
+ ENTITY_DEF("Abreve", 258, "\xc4\x82"),
+ ENTITY_DEF("gopf", 120152, "\xf0\x9d\x95\x98"),
+ ENTITY_DEF("Xopf", 120143, "\xf0\x9d\x95\x8f"),
+ ENTITY_DEF("Iacute", 205, "\xc3\x8d"),
+ ENTITY_DEF("Aopf", 120120, "\xf0\x9d\x94\xb8"),
+ ENTITY_DEF("gbreve", 287, "\xc4\x9f"),
+ ENTITY_DEF("nleq", 8816, "\xe2\x89\xb0"),
+ ENTITY_DEF("xopf", 120169, "\xf0\x9d\x95\xa9"),
+ ENTITY_DEF("SquareSupersetEqual", 8850, "\xe2\x8a\x92"),
+ ENTITY_DEF("NotLessTilde", 8820, "\xe2\x89\xb4"),
+ ENTITY_DEF("SubsetEqual", 8838, "\xe2\x8a\x86"),
+ ENTITY_DEF("Sc", 10940, "\xe2\xaa\xbc"),
+ ENTITY_DEF("sdote", 10854, "\xe2\xa9\xa6"),
+ ENTITY_DEF("loplus", 10797, "\xe2\xa8\xad"),
+ ENTITY_DEF("zfr", 120119, "\xf0\x9d\x94\xb7"),
+ ENTITY_DEF("subseteqq", 10949, "\xe2\xab\x85"),
+ ENTITY_DEF("Vdashl", 10982, "\xe2\xab\xa6"),
+ ENTITY_DEF("integers", 8484, "\xe2\x84\xa4"),
+ ENTITY_DEF("Umacr", 362, "\xc5\xaa"),
+ ENTITY_DEF("dopf", 120149, "\xf0\x9d\x95\x95"),
+ ENTITY_DEF("RightDownVectorBar", 10581, "\xe2\xa5\x95"),
+ ENTITY_DEF("angmsdaf", 10669, "\xe2\xa6\xad"),
+ ENTITY_DEF("Jfr", 120077, "\xf0\x9d\x94\x8d"),
+ ENTITY_DEF("bernou", 8492, "\xe2\x84\xac"),
+ ENTITY_DEF("lceil", 8968, "\xe2\x8c\x88"),
+ ENTITY_DEF("nvsim", 8764, "\xe2\x88\xbc\xe2\x83\x92"),
+ ENTITY_DEF("NotSucceedsSlantEqual", 8929, "\xe2\x8b\xa1"),
+ ENTITY_DEF("hearts", 9829, "\xe2\x99\xa5"),
+ ENTITY_DEF("vee", 8744, "\xe2\x88\xa8"),
+ ENTITY_DEF("LJcy", 1033, "\xd0\x89"),
+ ENTITY_DEF("nlt", 8814, "\xe2\x89\xae"),
+ ENTITY_DEF("because", 8757, "\xe2\x88\xb5"),
+ ENTITY_DEF("hairsp", 8202, "\xe2\x80\x8a"),
+ ENTITY_DEF("comma", 44, "\x2c"),
+ ENTITY_DEF("iecy", 1077, "\xd0\xb5"),
+ ENTITY_DEF("npre", 10927, "\xe2\xaa\xaf\xcc\xb8"),
+ ENTITY_DEF("NotSquareSubset", 8847, "\xe2\x8a\x8f\xcc\xb8"),
+ ENTITY_DEF("mscr", 120002, "\xf0\x9d\x93\x82"),
+ ENTITY_DEF("jopf", 120155, "\xf0\x9d\x95\x9b"),
+ ENTITY_DEF("bumpE", 10926, "\xe2\xaa\xae"),
+ ENTITY_DEF("thicksim", 8764, "\xe2\x88\xbc"),
+ ENTITY_DEF("Nfr", 120081, "\xf0\x9d\x94\x91"),
+ ENTITY_DEF("yucy", 1102, "\xd1\x8e"),
+ ENTITY_DEF("notinvc", 8950, "\xe2\x8b\xb6"),
+ ENTITY_DEF("lstrok", 322, "\xc5\x82"),
+ ENTITY_DEF("robrk", 10215, "\xe2\x9f\xa7"),
+ ENTITY_DEF("LeftTriangleBar", 10703, "\xe2\xa7\x8f"),
+ ENTITY_DEF("hksearow", 10533, "\xe2\xa4\xa5"),
+ ENTITY_DEF("bigcap", 8898, "\xe2\x8b\x82"),
+ ENTITY_DEF("udhar", 10606, "\xe2\xa5\xae"),
+ ENTITY_DEF("Yscr", 119988, "\xf0\x9d\x92\xb4"),
+ ENTITY_DEF("smeparsl", 10724, "\xe2\xa7\xa4"),
+ ENTITY_DEF("NotLess", 8814, "\xe2\x89\xae"),
+ ENTITY_DEF("dcaron", 271, "\xc4\x8f"),
+ ENTITY_DEF("ange", 10660, "\xe2\xa6\xa4"),
+ ENTITY_DEF("dHar", 10597, "\xe2\xa5\xa5"),
+ ENTITY_DEF("UpperRightArrow", 8599, "\xe2\x86\x97"),
+ ENTITY_DEF("trpezium", 9186, "\xe2\x8f\xa2"),
+ ENTITY_DEF("boxminus", 8863, "\xe2\x8a\x9f"),
+ ENTITY_DEF("notni", 8716, "\xe2\x88\x8c"),
+ ENTITY_DEF("dtrif", 9662, "\xe2\x96\xbe"),
+ ENTITY_DEF("nhArr", 8654, "\xe2\x87\x8e"),
+ ENTITY_DEF("larrpl", 10553, "\xe2\xa4\xb9"),
+ ENTITY_DEF("simeq", 8771, "\xe2\x89\x83"),
+ ENTITY_DEF("geqslant", 10878, "\xe2\xa9\xbe"),
+ ENTITY_DEF("RightUpVectorBar", 10580, "\xe2\xa5\x94"),
+ ENTITY_DEF("nsc", 8833, "\xe2\x8a\x81"),
+ ENTITY_DEF("div", 247, "\xc3\xb7"),
+ ENTITY_DEF("orslope", 10839, "\xe2\xa9\x97"),
+ ENTITY_DEF("lparlt", 10643, "\xe2\xa6\x93"),
+ ENTITY_DEF("trie", 8796, "\xe2\x89\x9c"),
+ ENTITY_DEF("cirmid", 10991, "\xe2\xab\xaf"),
+ ENTITY_DEF("wp", 8472, "\xe2\x84\x98"),
+ ENTITY_DEF("dagger", 8224, "\xe2\x80\xa0"),
+ ENTITY_DEF("utri", 9653, "\xe2\x96\xb5"),
+ ENTITY_DEF("supnE", 10956, "\xe2\xab\x8c"),
+ ENTITY_DEF("eg", 10906, "\xe2\xaa\x9a"),
+ ENTITY_DEF("LeftDownVector", 8643, "\xe2\x87\x83"),
+ ENTITY_DEF("NotLessEqual", 8816, "\xe2\x89\xb0"),
+ ENTITY_DEF("Bopf", 120121, "\xf0\x9d\x94\xb9"),
+ ENTITY_DEF("LongLeftRightArrow", 10231, "\xe2\x9f\xb7"),
+ ENTITY_DEF("Gfr", 120074, "\xf0\x9d\x94\x8a"),
+ ENTITY_DEF("sqsubseteq", 8849, "\xe2\x8a\x91"),
+ ENTITY_DEF_HEUR("ograve", 242, "\xc3\xb2"),
+ ENTITY_DEF("larrhk", 8617, "\xe2\x86\xa9"),
+ ENTITY_DEF("sigma", 963, "\xcf\x83"),
+ ENTITY_DEF("NotSquareSupersetEqual", 8931, "\xe2\x8b\xa3"),
+ ENTITY_DEF("gvnE", 8809, "\xe2\x89\xa9\xef\xb8\x80"),
+ ENTITY_DEF("timesbar", 10801, "\xe2\xa8\xb1"),
+ ENTITY_DEF("Iukcy", 1030, "\xd0\x86"),
+ ENTITY_DEF("bscr", 119991, "\xf0\x9d\x92\xb7"),
+ ENTITY_DEF("Exists", 8707, "\xe2\x88\x83"),
+ ENTITY_DEF("tscr", 120009, "\xf0\x9d\x93\x89"),
+ ENTITY_DEF("tcy", 1090, "\xd1\x82"),
+ ENTITY_DEF("nwarr", 8598, "\xe2\x86\x96"),
+ ENTITY_DEF("hoarr", 8703, "\xe2\x87\xbf"),
+ ENTITY_DEF("lnapprox", 10889, "\xe2\xaa\x89"),
+ ENTITY_DEF("nu", 957, "\xce\xbd"),
+ ENTITY_DEF("bcy", 1073, "\xd0\xb1"),
+ ENTITY_DEF("ndash", 8211, "\xe2\x80\x93"),
+ ENTITY_DEF("smt", 10922, "\xe2\xaa\xaa"),
+ ENTITY_DEF("scaron", 353, "\xc5\xa1"),
+ ENTITY_DEF("IOcy", 1025, "\xd0\x81"),
+ ENTITY_DEF("Ifr", 8465, "\xe2\x84\x91"),
+ ENTITY_DEF("cularrp", 10557, "\xe2\xa4\xbd"),
+ ENTITY_DEF("lvertneqq", 8808, "\xe2\x89\xa8\xef\xb8\x80"),
+ ENTITY_DEF("nlarr", 8602, "\xe2\x86\x9a"),
+ ENTITY_DEF("colon", 58, "\x3a"),
+ ENTITY_DEF("ddotseq", 10871, "\xe2\xa9\xb7"),
+ ENTITY_DEF("zacute", 378, "\xc5\xba"),
+ ENTITY_DEF("DoubleVerticalBar", 8741, "\xe2\x88\xa5"),
+ ENTITY_DEF("larrfs", 10525, "\xe2\xa4\x9d"),
+ ENTITY_DEF("NotExists", 8708, "\xe2\x88\x84"),
+ ENTITY_DEF("geq", 8805, "\xe2\x89\xa5"),
+ ENTITY_DEF("Ffr", 120073, "\xf0\x9d\x94\x89"),
+ ENTITY_DEF_HEUR("divide", 247, "\xc3\xb7"),
+ ENTITY_DEF("blank", 9251, "\xe2\x90\xa3"),
+ ENTITY_DEF("IEcy", 1045, "\xd0\x95"),
+ ENTITY_DEF_HEUR("ordm", 186, "\xc2\xba"),
+ ENTITY_DEF("fopf", 120151, "\xf0\x9d\x95\x97"),
+ ENTITY_DEF("ecir", 8790, "\xe2\x89\x96"),
+ ENTITY_DEF("complement", 8705, "\xe2\x88\x81"),
+ ENTITY_DEF("top", 8868, "\xe2\x8a\xa4"),
+ ENTITY_DEF("DoubleContourIntegral", 8751, "\xe2\x88\xaf"),
+ ENTITY_DEF("nisd", 8954, "\xe2\x8b\xba"),
+ ENTITY_DEF("bcong", 8780, "\xe2\x89\x8c"),
+ ENTITY_DEF("plusdu", 10789, "\xe2\xa8\xa5"),
+ ENTITY_DEF("TildeTilde", 8776, "\xe2\x89\x88"),
+ ENTITY_DEF("lnE", 8808, "\xe2\x89\xa8"),
+ ENTITY_DEF("DoubleLongRightArrow", 10233, "\xe2\x9f\xb9"),
+ ENTITY_DEF("nsubseteqq", 10949, "\xe2\xab\x85\xcc\xb8"),
+ ENTITY_DEF("DownTeeArrow", 8615, "\xe2\x86\xa7"),
+ ENTITY_DEF("Cscr", 119966, "\xf0\x9d\x92\x9e"),
+ ENTITY_DEF("NegativeVeryThinSpace", 8203, "\xe2\x80\x8b"),
+ ENTITY_DEF("emsp", 8195, "\xe2\x80\x83"),
+ ENTITY_DEF("vartriangleleft", 8882, "\xe2\x8a\xb2"),
+ ENTITY_DEF("ropar", 10630, "\xe2\xa6\x86"),
+ ENTITY_DEF("checkmark", 10003, "\xe2\x9c\x93"),
+ ENTITY_DEF("Ycy", 1067, "\xd0\xab"),
+ ENTITY_DEF("supset", 8835, "\xe2\x8a\x83"),
+ ENTITY_DEF("gneqq", 8809, "\xe2\x89\xa9"),
+ ENTITY_DEF("Lstrok", 321, "\xc5\x81"),
+ ENTITY_DEF_HEUR("AMP", 38, "\x26"),
+ ENTITY_DEF("acE", 8766, "\xe2\x88\xbe\xcc\xb3"),
+ ENTITY_DEF("sqsupseteq", 8850, "\xe2\x8a\x92"),
+ ENTITY_DEF("nle", 8816, "\xe2\x89\xb0"),
+ ENTITY_DEF("nesear", 10536, "\xe2\xa4\xa8"),
+ ENTITY_DEF("LeftDownVectorBar", 10585, "\xe2\xa5\x99"),
+ ENTITY_DEF("Integral", 8747, "\xe2\x88\xab"),
+ ENTITY_DEF("Beta", 914, "\xce\x92"),
+ ENTITY_DEF("nvdash", 8876, "\xe2\x8a\xac"),
+ ENTITY_DEF("nges", 10878, "\xe2\xa9\xbe\xcc\xb8"),
+ ENTITY_DEF("demptyv", 10673, "\xe2\xa6\xb1"),
+ ENTITY_DEF("eta", 951, "\xce\xb7"),
+ ENTITY_DEF("GreaterSlantEqual", 10878, "\xe2\xa9\xbe"),
+ ENTITY_DEF_HEUR("ccedil", 231, "\xc3\xa7"),
+ ENTITY_DEF("pfr", 120109, "\xf0\x9d\x94\xad"),
+ ENTITY_DEF("bbrktbrk", 9142, "\xe2\x8e\xb6"),
+ ENTITY_DEF("mcy", 1084, "\xd0\xbc"),
+ ENTITY_DEF("Not", 10988, "\xe2\xab\xac"),
+ ENTITY_DEF("qscr", 120006, "\xf0\x9d\x93\x86"),
+ ENTITY_DEF("zwj", 8205, "\xe2\x80\x8d"),
+ ENTITY_DEF("ntrianglerighteq", 8941, "\xe2\x8b\xad"),
+ ENTITY_DEF("permil", 8240, "\xe2\x80\xb0"),
+ ENTITY_DEF("squarf", 9642, "\xe2\x96\xaa"),
+ ENTITY_DEF("apos", 39, "\x27"),
+ ENTITY_DEF("lrm", 8206, "\xe2\x80\x8e"),
+ ENTITY_DEF("male", 9794, "\xe2\x99\x82"),
+ ENTITY_DEF_HEUR("agrave", 224, "\xc3\xa0"),
+ ENTITY_DEF("Lt", 8810, "\xe2\x89\xaa"),
+ ENTITY_DEF("capand", 10820, "\xe2\xa9\x84"),
+ ENTITY_DEF_HEUR("aring", 229, "\xc3\xa5"),
+ ENTITY_DEF("Jukcy", 1028, "\xd0\x84"),
+ ENTITY_DEF("bumpe", 8783, "\xe2\x89\x8f"),
+ ENTITY_DEF("dd", 8518, "\xe2\x85\x86"),
+ ENTITY_DEF("tscy", 1094, "\xd1\x86"),
+ ENTITY_DEF("oS", 9416, "\xe2\x93\x88"),
+ ENTITY_DEF("succeq", 10928, "\xe2\xaa\xb0"),
+ ENTITY_DEF("xharr", 10231, "\xe2\x9f\xb7"),
+ ENTITY_DEF("pluse", 10866, "\xe2\xa9\xb2"),
+ ENTITY_DEF("rfisht", 10621, "\xe2\xa5\xbd"),
+ ENTITY_DEF("HorizontalLine", 9472, "\xe2\x94\x80"),
+ ENTITY_DEF("DiacriticalAcute", 180, "\xc2\xb4"),
+ ENTITY_DEF("hfr", 120101, "\xf0\x9d\x94\xa5"),
+ ENTITY_DEF("preceq", 10927, "\xe2\xaa\xaf"),
+ ENTITY_DEF("rationals", 8474, "\xe2\x84\x9a"),
+ ENTITY_DEF_HEUR("Auml", 196, "\xc3\x84"),
+ ENTITY_DEF("LeftRightArrow", 8596, "\xe2\x86\x94"),
+ ENTITY_DEF("blacktriangleright", 9656, "\xe2\x96\xb8"),
+ ENTITY_DEF("dharr", 8642, "\xe2\x87\x82"),
+ ENTITY_DEF("isin", 8712, "\xe2\x88\x88"),
+ ENTITY_DEF("ldrushar", 10571, "\xe2\xa5\x8b"),
+ ENTITY_DEF("squ", 9633, "\xe2\x96\xa1"),
+ ENTITY_DEF("rbrksld", 10638, "\xe2\xa6\x8e"),
+ ENTITY_DEF("bigwedge", 8896, "\xe2\x8b\x80"),
+ ENTITY_DEF("swArr", 8665, "\xe2\x87\x99"),
+ ENTITY_DEF("IJlig", 306, "\xc4\xb2"),
+ ENTITY_DEF("harr", 8596, "\xe2\x86\x94"),
+ ENTITY_DEF("range", 10661, "\xe2\xa6\xa5"),
+ ENTITY_DEF("urtri", 9721, "\xe2\x97\xb9"),
+ ENTITY_DEF("NotVerticalBar", 8740, "\xe2\x88\xa4"),
+ ENTITY_DEF("ic", 8291, "\xe2\x81\xa3"),
+ ENTITY_DEF("solbar", 9023, "\xe2\x8c\xbf"),
+ ENTITY_DEF("approx", 8776, "\xe2\x89\x88"),
+ ENTITY_DEF("SquareSuperset", 8848, "\xe2\x8a\x90"),
+ ENTITY_DEF("numsp", 8199, "\xe2\x80\x87"),
+ ENTITY_DEF("nLt", 8810, "\xe2\x89\xaa\xe2\x83\x92"),
+ ENTITY_DEF("tilde", 732, "\xcb\x9c"),
+ ENTITY_DEF("rlarr", 8644, "\xe2\x87\x84"),
+ ENTITY_DEF("langle", 10216, "\xe2\x9f\xa8"),
+ ENTITY_DEF("nleqslant", 10877, "\xe2\xa9\xbd\xcc\xb8"),
+ ENTITY_DEF("Nacute", 323, "\xc5\x83"),
+ ENTITY_DEF("NotLeftTriangle", 8938, "\xe2\x8b\xaa"),
+ ENTITY_DEF("sopf", 120164, "\xf0\x9d\x95\xa4"),
+ ENTITY_DEF("xmap", 10236, "\xe2\x9f\xbc"),
+ ENTITY_DEF("supne", 8843, "\xe2\x8a\x8b"),
+ ENTITY_DEF("Int", 8748, "\xe2\x88\xac"),
+ ENTITY_DEF("nsupseteqq", 10950, "\xe2\xab\x86\xcc\xb8"),
+ ENTITY_DEF("circlearrowright", 8635, "\xe2\x86\xbb"),
+ ENTITY_DEF("NotCongruent", 8802, "\xe2\x89\xa2"),
+ ENTITY_DEF("Scedil", 350, "\xc5\x9e"),
+ ENTITY_DEF_HEUR("raquo", 187, "\xc2\xbb"),
+ ENTITY_DEF("ycy", 1099, "\xd1\x8b"),
+ ENTITY_DEF("notinvb", 8951, "\xe2\x8b\xb7"),
+ ENTITY_DEF("andv", 10842, "\xe2\xa9\x9a"),
+ ENTITY_DEF("nap", 8777, "\xe2\x89\x89"),
+ ENTITY_DEF("shcy", 1096, "\xd1\x88"),
+ ENTITY_DEF("ssetmn", 8726, "\xe2\x88\x96"),
+ ENTITY_DEF("downarrow", 8595, "\xe2\x86\x93"),
+ ENTITY_DEF("gesdotol", 10884, "\xe2\xaa\x84"),
+ ENTITY_DEF("Congruent", 8801, "\xe2\x89\xa1"),
+ ENTITY_DEF_HEUR("pound", 163, "\xc2\xa3"),
+ ENTITY_DEF("ZeroWidthSpace", 8203, "\xe2\x80\x8b"),
+ ENTITY_DEF("rdca", 10551, "\xe2\xa4\xb7"),
+ ENTITY_DEF("rmoust", 9137, "\xe2\x8e\xb1"),
+ ENTITY_DEF("zcy", 1079, "\xd0\xb7"),
+ ENTITY_DEF("Square", 9633, "\xe2\x96\xa1"),
+ ENTITY_DEF("subE", 10949, "\xe2\xab\x85"),
+ ENTITY_DEF("infintie", 10717, "\xe2\xa7\x9d"),
+ ENTITY_DEF("Cayleys", 8493, "\xe2\x84\xad"),
+ ENTITY_DEF("lsaquo", 8249, "\xe2\x80\xb9"),
+ ENTITY_DEF("realpart", 8476, "\xe2\x84\x9c"),
+ ENTITY_DEF("nprec", 8832, "\xe2\x8a\x80"),
+ ENTITY_DEF("RightTriangleBar", 10704, "\xe2\xa7\x90"),
+ ENTITY_DEF("Kopf", 120130, "\xf0\x9d\x95\x82"),
+ ENTITY_DEF("Ubreve", 364, "\xc5\xac"),
+ ENTITY_DEF("Uopf", 120140, "\xf0\x9d\x95\x8c"),
+ ENTITY_DEF("trianglelefteq", 8884, "\xe2\x8a\xb4"),
+ ENTITY_DEF("rotimes", 10805, "\xe2\xa8\xb5"),
+ ENTITY_DEF("qfr", 120110, "\xf0\x9d\x94\xae"),
+ ENTITY_DEF("gtcc", 10919, "\xe2\xaa\xa7"),
+ ENTITY_DEF("fnof", 402, "\xc6\x92"),
+ ENTITY_DEF("tritime", 10811, "\xe2\xa8\xbb"),
+ ENTITY_DEF("andslope", 10840, "\xe2\xa9\x98"),
+ ENTITY_DEF("harrw", 8621, "\xe2\x86\xad"),
+ ENTITY_DEF("NotSquareSuperset", 8848, "\xe2\x8a\x90\xcc\xb8"),
+ ENTITY_DEF("Amacr", 256, "\xc4\x80"),
+ ENTITY_DEF("OpenCurlyDoubleQuote", 8220, "\xe2\x80\x9c"),
+ ENTITY_DEF_HEUR("thorn", 254, "\xc3\xbe"),
+ ENTITY_DEF_HEUR("ordf", 170, "\xc2\xaa"),
+ ENTITY_DEF("natur", 9838, "\xe2\x99\xae"),
+ ENTITY_DEF("xi", 958, "\xce\xbe"),
+ ENTITY_DEF("infin", 8734, "\xe2\x88\x9e"),
+ ENTITY_DEF("nspar", 8742, "\xe2\x88\xa6"),
+ ENTITY_DEF("Jcy", 1049, "\xd0\x99"),
+ ENTITY_DEF("DownLeftTeeVector", 10590, "\xe2\xa5\x9e"),
+ ENTITY_DEF("rbarr", 10509, "\xe2\xa4\x8d"),
+ ENTITY_DEF("Xi", 926, "\xce\x9e"),
+ ENTITY_DEF("bull", 8226, "\xe2\x80\xa2"),
+ ENTITY_DEF("cuesc", 8927, "\xe2\x8b\x9f"),
+ ENTITY_DEF("backcong", 8780, "\xe2\x89\x8c"),
+ ENTITY_DEF("frac35", 8535, "\xe2\x85\x97"),
+ ENTITY_DEF("hscr", 119997, "\xf0\x9d\x92\xbd"),
+ ENTITY_DEF("LessEqualGreater", 8922, "\xe2\x8b\x9a"),
+ ENTITY_DEF("Implies", 8658, "\xe2\x87\x92"),
+ ENTITY_DEF("ETH", 208, "\xc3\x90"),
+ ENTITY_DEF_HEUR("Yacute", 221, "\xc3\x9d"),
+ ENTITY_DEF_HEUR("shy", 173, "\xc2\xad"),
+ ENTITY_DEF("Rarrtl", 10518, "\xe2\xa4\x96"),
+ ENTITY_DEF_HEUR("sup1", 185, "\xc2\xb9"),
+ ENTITY_DEF("reals", 8477, "\xe2\x84\x9d"),
+ ENTITY_DEF("blacklozenge", 10731, "\xe2\xa7\xab"),
+ ENTITY_DEF("ncedil", 326, "\xc5\x86"),
+ ENTITY_DEF("Lambda", 923, "\xce\x9b"),
+ ENTITY_DEF("uopf", 120166, "\xf0\x9d\x95\xa6"),
+ ENTITY_DEF("bigodot", 10752, "\xe2\xa8\x80"),
+ ENTITY_DEF("ubreve", 365, "\xc5\xad"),
+ ENTITY_DEF("drbkarow", 10512, "\xe2\xa4\x90"),
+ ENTITY_DEF("els", 10901, "\xe2\xaa\x95"),
+ ENTITY_DEF("shortparallel", 8741, "\xe2\x88\xa5"),
+ ENTITY_DEF("Pcy", 1055, "\xd0\x9f"),
+ ENTITY_DEF("dsol", 10742, "\xe2\xa7\xb6"),
+ ENTITY_DEF("supsim", 10952, "\xe2\xab\x88"),
+ ENTITY_DEF("Longrightarrow", 10233, "\xe2\x9f\xb9"),
+ ENTITY_DEF("ThickSpace", 8287, "\xe2\x81\x9f\xe2\x80\x8a"),
+ ENTITY_DEF("Itilde", 296, "\xc4\xa8"),
+ ENTITY_DEF("nparallel", 8742, "\xe2\x88\xa6"),
+ ENTITY_DEF("And", 10835, "\xe2\xa9\x93"),
+ ENTITY_DEF("boxhd", 9516, "\xe2\x94\xac"),
+ ENTITY_DEF("Dashv", 10980, "\xe2\xab\xa4"),
+ ENTITY_DEF("NotSuperset", 8835, "\xe2\x8a\x83\xe2\x83\x92"),
+ ENTITY_DEF("Eta", 919, "\xce\x97"),
+ ENTITY_DEF("Qopf", 8474, "\xe2\x84\x9a"),
+ ENTITY_DEF("period", 46, "\x2e"),
+ ENTITY_DEF("angmsd", 8737, "\xe2\x88\xa1"),
+ ENTITY_DEF("fllig", 64258, "\xef\xac\x82"),
+ ENTITY_DEF("cuvee", 8910, "\xe2\x8b\x8e"),
+ ENTITY_DEF("wedbar", 10847, "\xe2\xa9\x9f"),
+ ENTITY_DEF("Fscr", 8497, "\xe2\x84\xb1"),
+ ENTITY_DEF("veebar", 8891, "\xe2\x8a\xbb"),
+ ENTITY_DEF("Longleftrightarrow", 10234, "\xe2\x9f\xba"),
+ ENTITY_DEF_HEUR("reg", 174, "\xc2\xae"),
+ ENTITY_DEF("NegativeMediumSpace", 8203, "\xe2\x80\x8b"),
+ ENTITY_DEF("Upsi", 978, "\xcf\x92"),
+ ENTITY_DEF("Mellintrf", 8499, "\xe2\x84\xb3"),
+ ENTITY_DEF("boxHU", 9577, "\xe2\x95\xa9"),
+ ENTITY_DEF("frac56", 8538, "\xe2\x85\x9a"),
+ ENTITY_DEF("utrif", 9652, "\xe2\x96\xb4"),
+ ENTITY_DEF("LeftTriangle", 8882, "\xe2\x8a\xb2"),
+ ENTITY_DEF("nsime", 8772, "\xe2\x89\x84"),
+ ENTITY_DEF("rcedil", 343, "\xc5\x97"),
+ ENTITY_DEF("aogon", 261, "\xc4\x85"),
+ ENTITY_DEF("uHar", 10595, "\xe2\xa5\xa3"),
+ ENTITY_DEF("ForAll", 8704, "\xe2\x88\x80"),
+ ENTITY_DEF("prE", 10931, "\xe2\xaa\xb3"),
+ ENTITY_DEF("boxV", 9553, "\xe2\x95\x91"),
+ ENTITY_DEF("softcy", 1100, "\xd1\x8c"),
+ ENTITY_DEF("hercon", 8889, "\xe2\x8a\xb9"),
+ ENTITY_DEF("lmoustache", 9136, "\xe2\x8e\xb0"),
+ ENTITY_DEF("Product", 8719, "\xe2\x88\x8f"),
+ ENTITY_DEF("lsimg", 10895, "\xe2\xaa\x8f"),
+ ENTITY_DEF("verbar", 124, "\x7c"),
+ ENTITY_DEF("ofcir", 10687, "\xe2\xa6\xbf"),
+ ENTITY_DEF("curlyeqprec", 8926, "\xe2\x8b\x9e"),
+ ENTITY_DEF("ldquo", 8220, "\xe2\x80\x9c"),
+ ENTITY_DEF("bot", 8869, "\xe2\x8a\xa5"),
+ ENTITY_DEF("Psi", 936, "\xce\xa8"),
+ ENTITY_DEF("OElig", 338, "\xc5\x92"),
+ ENTITY_DEF("DownRightVectorBar", 10583, "\xe2\xa5\x97"),
+ ENTITY_DEF("minusb", 8863, "\xe2\x8a\x9f"),
+ ENTITY_DEF("Iscr", 8464, "\xe2\x84\x90"),
+ ENTITY_DEF("Tcedil", 354, "\xc5\xa2"),
+ ENTITY_DEF("ffilig", 64259, "\xef\xac\x83"),
+ ENTITY_DEF("Gcy", 1043, "\xd0\x93"),
+ ENTITY_DEF("oline", 8254, "\xe2\x80\xbe"),
+ ENTITY_DEF("bottom", 8869, "\xe2\x8a\xa5"),
+ ENTITY_DEF("nVDash", 8879, "\xe2\x8a\xaf"),
+ ENTITY_DEF("lessdot", 8918, "\xe2\x8b\x96"),
+ ENTITY_DEF("cups", 8746, "\xe2\x88\xaa\xef\xb8\x80"),
+ ENTITY_DEF("gla", 10917, "\xe2\xaa\xa5"),
+ ENTITY_DEF("hellip", 8230, "\xe2\x80\xa6"),
+ ENTITY_DEF("hookleftarrow", 8617, "\xe2\x86\xa9"),
+ ENTITY_DEF("Cup", 8915, "\xe2\x8b\x93"),
+ ENTITY_DEF("upsi", 965, "\xcf\x85"),
+ ENTITY_DEF("DownArrowBar", 10515, "\xe2\xa4\x93"),
+ ENTITY_DEF("lowast", 8727, "\xe2\x88\x97"),
+ ENTITY_DEF("profline", 8978, "\xe2\x8c\x92"),
+ ENTITY_DEF("ngsim", 8821, "\xe2\x89\xb5"),
+ ENTITY_DEF("boxhu", 9524, "\xe2\x94\xb4"),
+ ENTITY_DEF("operp", 10681, "\xe2\xa6\xb9"),
+ ENTITY_DEF("cap", 8745, "\xe2\x88\xa9"),
+ ENTITY_DEF("Hcirc", 292, "\xc4\xa4"),
+ ENTITY_DEF("Ncy", 1053, "\xd0\x9d"),
+ ENTITY_DEF("zeetrf", 8488, "\xe2\x84\xa8"),
+ ENTITY_DEF("cuepr", 8926, "\xe2\x8b\x9e"),
+ ENTITY_DEF("supsetneq", 8843, "\xe2\x8a\x8b"),
+ ENTITY_DEF("lfloor", 8970, "\xe2\x8c\x8a"),
+ ENTITY_DEF("ngtr", 8815, "\xe2\x89\xaf"),
+ ENTITY_DEF("ccups", 10828, "\xe2\xa9\x8c"),
+ ENTITY_DEF("pscr", 120005, "\xf0\x9d\x93\x85"),
+ ENTITY_DEF("Cfr", 8493, "\xe2\x84\xad"),
+ ENTITY_DEF("dtri", 9663, "\xe2\x96\xbf"),
+ ENTITY_DEF("icirc", 238, "\xc3\xae"),
+ ENTITY_DEF("leftarrow", 8592, "\xe2\x86\x90"),
+ ENTITY_DEF("vdash", 8866, "\xe2\x8a\xa2"),
+ ENTITY_DEF("leftrightharpoons", 8651, "\xe2\x87\x8b"),
+ ENTITY_DEF("rightrightarrows", 8649, "\xe2\x87\x89"),
+ ENTITY_DEF("strns", 175, "\xc2\xaf"),
+ ENTITY_DEF("intlarhk", 10775, "\xe2\xa8\x97"),
+ ENTITY_DEF("downharpoonright", 8642, "\xe2\x87\x82"),
+ ENTITY_DEF_HEUR("yacute", 253, "\xc3\xbd"),
+ ENTITY_DEF("boxUr", 9561, "\xe2\x95\x99"),
+ ENTITY_DEF("triangleleft", 9667, "\xe2\x97\x83"),
+ ENTITY_DEF("DiacriticalDot", 729, "\xcb\x99"),
+ ENTITY_DEF("thetav", 977, "\xcf\x91"),
+ ENTITY_DEF("OverBracket", 9140, "\xe2\x8e\xb4"),
+ ENTITY_DEF("PrecedesTilde", 8830, "\xe2\x89\xbe"),
+ ENTITY_DEF("rtrie", 8885, "\xe2\x8a\xb5"),
+ ENTITY_DEF("Scirc", 348, "\xc5\x9c"),
+ ENTITY_DEF("vsupne", 8843, "\xe2\x8a\x8b\xef\xb8\x80"),
+ ENTITY_DEF("OverBrace", 9182, "\xe2\x8f\x9e"),
+ ENTITY_DEF("Yfr", 120092, "\xf0\x9d\x94\x9c"),
+ ENTITY_DEF("scnE", 10934, "\xe2\xaa\xb6"),
+ ENTITY_DEF("simlE", 10911, "\xe2\xaa\x9f"),
+ ENTITY_DEF("Proportional", 8733, "\xe2\x88\x9d"),
+ ENTITY_DEF("edot", 279, "\xc4\x97"),
+ ENTITY_DEF("loang", 10220, "\xe2\x9f\xac"),
+ ENTITY_DEF("gesdot", 10880, "\xe2\xaa\x80"),
+ ENTITY_DEF("DownBreve", 785, "\xcc\x91"),
+ ENTITY_DEF("pcy", 1087, "\xd0\xbf"),
+ ENTITY_DEF("Succeeds", 8827, "\xe2\x89\xbb"),
+ ENTITY_DEF("mfr", 120106, "\xf0\x9d\x94\xaa"),
+ ENTITY_DEF("Leftarrow", 8656, "\xe2\x87\x90"),
+ ENTITY_DEF("boxDr", 9555, "\xe2\x95\x93"),
+ ENTITY_DEF("Nscr", 119977, "\xf0\x9d\x92\xa9"),
+ ENTITY_DEF("diam", 8900, "\xe2\x8b\x84"),
+ ENTITY_DEF("CHcy", 1063, "\xd0\xa7"),
+ ENTITY_DEF("boxdr", 9484, "\xe2\x94\x8c"),
+ ENTITY_DEF("rlm", 8207, "\xe2\x80\x8f"),
+ ENTITY_DEF("Coproduct", 8720, "\xe2\x88\x90"),
+ ENTITY_DEF("RightTeeArrow", 8614, "\xe2\x86\xa6"),
+ ENTITY_DEF("tridot", 9708, "\xe2\x97\xac"),
+ ENTITY_DEF("ldquor", 8222, "\xe2\x80\x9e"),
+ ENTITY_DEF("sol", 47, "\x2f"),
+ ENTITY_DEF_HEUR("ecirc", 234, "\xc3\xaa"),
+ ENTITY_DEF("DoubleLeftArrow", 8656, "\xe2\x87\x90"),
+ ENTITY_DEF("Gscr", 119970, "\xf0\x9d\x92\xa2"),
+ ENTITY_DEF("ap", 8776, "\xe2\x89\x88"),
+ ENTITY_DEF("rbrke", 10636, "\xe2\xa6\x8c"),
+ ENTITY_DEF("LeftFloor", 8970, "\xe2\x8c\x8a"),
+ ENTITY_DEF("blk12", 9618, "\xe2\x96\x92"),
+ ENTITY_DEF("Conint", 8751, "\xe2\x88\xaf"),
+ ENTITY_DEF("triangledown", 9663, "\xe2\x96\xbf"),
+ ENTITY_DEF("Icy", 1048, "\xd0\x98"),
+ ENTITY_DEF("backprime", 8245, "\xe2\x80\xb5"),
+ ENTITY_DEF("longleftrightarrow", 10231, "\xe2\x9f\xb7"),
+ ENTITY_DEF("ntriangleleft", 8938, "\xe2\x8b\xaa"),
+ ENTITY_DEF_HEUR("copy", 169, "\xc2\xa9"),
+ ENTITY_DEF("mapstodown", 8615, "\xe2\x86\xa7"),
+ ENTITY_DEF("seArr", 8664, "\xe2\x87\x98"),
+ ENTITY_DEF("ENG", 330, "\xc5\x8a"),
+ ENTITY_DEF("DoubleRightArrow", 8658, "\xe2\x87\x92"),
+ ENTITY_DEF("tfr", 120113, "\xf0\x9d\x94\xb1"),
+ ENTITY_DEF("rharul", 10604, "\xe2\xa5\xac"),
+ ENTITY_DEF("bfr", 120095, "\xf0\x9d\x94\x9f"),
+ ENTITY_DEF("origof", 8886, "\xe2\x8a\xb6"),
+ ENTITY_DEF("Therefore", 8756, "\xe2\x88\xb4"),
+ ENTITY_DEF("glE", 10898, "\xe2\xaa\x92"),
+ ENTITY_DEF("leftarrowtail", 8610, "\xe2\x86\xa2"),
+ ENTITY_DEF("NotEqual", 8800, "\xe2\x89\xa0"),
+ ENTITY_DEF("LeftCeiling", 8968, "\xe2\x8c\x88"),
+ ENTITY_DEF("lArr", 8656, "\xe2\x87\x90"),
+ ENTITY_DEF("subseteq", 8838, "\xe2\x8a\x86"),
+ ENTITY_DEF("larrbfs", 10527, "\xe2\xa4\x9f"),
+ ENTITY_DEF("Gammad", 988, "\xcf\x9c"),
+ ENTITY_DEF("rtriltri", 10702, "\xe2\xa7\x8e"),
+ ENTITY_DEF("Fcy", 1060, "\xd0\xa4"),
+ ENTITY_DEF("Vopf", 120141, "\xf0\x9d\x95\x8d"),
+ ENTITY_DEF("lrarr", 8646, "\xe2\x87\x86"),
+ ENTITY_DEF("delta", 948, "\xce\xb4"),
+ ENTITY_DEF("xodot", 10752, "\xe2\xa8\x80"),
+ ENTITY_DEF("larrtl", 8610, "\xe2\x86\xa2"),
+ ENTITY_DEF("gsim", 8819, "\xe2\x89\xb3"),
+ ENTITY_DEF("ratail", 10522, "\xe2\xa4\x9a"),
+ ENTITY_DEF("vsubne", 8842, "\xe2\x8a\x8a\xef\xb8\x80"),
+ ENTITY_DEF("boxur", 9492, "\xe2\x94\x94"),
+ ENTITY_DEF("succsim", 8831, "\xe2\x89\xbf"),
+ ENTITY_DEF("triplus", 10809, "\xe2\xa8\xb9"),
+ ENTITY_DEF("nless", 8814, "\xe2\x89\xae"),
+ ENTITY_DEF("uharr", 8638, "\xe2\x86\xbe"),
+ ENTITY_DEF("lambda", 955, "\xce\xbb"),
+ ENTITY_DEF_HEUR("uuml", 252, "\xc3\xbc"),
+ ENTITY_DEF("horbar", 8213, "\xe2\x80\x95"),
+ ENTITY_DEF("ccirc", 265, "\xc4\x89"),
+ ENTITY_DEF("sqcup", 8852, "\xe2\x8a\x94"),
+ ENTITY_DEF("Pscr", 119979, "\xf0\x9d\x92\xab"),
+ ENTITY_DEF("supsup", 10966, "\xe2\xab\x96"),
+ ENTITY_DEF("Cacute", 262, "\xc4\x86"),
+ ENTITY_DEF("upsih", 978, "\xcf\x92"),
+ ENTITY_DEF("precsim", 8830, "\xe2\x89\xbe"),
+ ENTITY_DEF("longrightarrow", 10230, "\xe2\x9f\xb6"),
+ ENTITY_DEF("circledR", 174, "\xc2\xae"),
+ ENTITY_DEF("UpTeeArrow", 8613, "\xe2\x86\xa5"),
+ ENTITY_DEF("bepsi", 1014, "\xcf\xb6"),
+ ENTITY_DEF("oast", 8859, "\xe2\x8a\x9b"),
+ ENTITY_DEF("yfr", 120118, "\xf0\x9d\x94\xb6"),
+ ENTITY_DEF("rdsh", 8627, "\xe2\x86\xb3"),
+ ENTITY_DEF("Ograve", 210, "\xc3\x92"),
+ ENTITY_DEF("LeftVectorBar", 10578, "\xe2\xa5\x92"),
+ ENTITY_DEF("NotNestedLessLess", 10913, "\xe2\xaa\xa1\xcc\xb8"),
+ ENTITY_DEF("Jscr", 119973, "\xf0\x9d\x92\xa5"),
+ ENTITY_DEF("psi", 968, "\xcf\x88"),
+ ENTITY_DEF("orarr", 8635, "\xe2\x86\xbb"),
+ ENTITY_DEF("Subset", 8912, "\xe2\x8b\x90"),
+ ENTITY_DEF("curarr", 8631, "\xe2\x86\xb7"),
+ ENTITY_DEF("CirclePlus", 8853, "\xe2\x8a\x95"),
+ ENTITY_DEF("gtrless", 8823, "\xe2\x89\xb7"),
+ ENTITY_DEF("nvle", 8804, "\xe2\x89\xa4\xe2\x83\x92"),
+ ENTITY_DEF("prop", 8733, "\xe2\x88\x9d"),
+ ENTITY_DEF("gEl", 10892, "\xe2\xaa\x8c"),
+ ENTITY_DEF("gtlPar", 10645, "\xe2\xa6\x95"),
+ ENTITY_DEF("frasl", 8260, "\xe2\x81\x84"),
+ ENTITY_DEF("nearr", 8599, "\xe2\x86\x97"),
+ ENTITY_DEF("NotSubsetEqual", 8840, "\xe2\x8a\x88"),
+ ENTITY_DEF("planck", 8463, "\xe2\x84\x8f"),
+ ENTITY_DEF_HEUR("Uuml", 220, "\xc3\x9c"),
+ ENTITY_DEF("spadesuit", 9824, "\xe2\x99\xa0"),
+ ENTITY_DEF_HEUR("sect", 167, "\xc2\xa7"),
+ ENTITY_DEF("cdot", 267, "\xc4\x8b"),
+ ENTITY_DEF("boxVh", 9579, "\xe2\x95\xab"),
+ ENTITY_DEF("zscr", 120015, "\xf0\x9d\x93\x8f"),
+ ENTITY_DEF("nsqsube", 8930, "\xe2\x8b\xa2"),
+ ENTITY_DEF("grave", 96, "\x60"),
+ ENTITY_DEF("angrtvb", 8894, "\xe2\x8a\xbe"),
+ ENTITY_DEF("MediumSpace", 8287, "\xe2\x81\x9f"),
+ ENTITY_DEF("Ntilde", 209, "\xc3\x91"),
+ ENTITY_DEF("solb", 10692, "\xe2\xa7\x84"),
+ ENTITY_DEF("angzarr", 9084, "\xe2\x8d\xbc"),
+ ENTITY_DEF("nopf", 120159, "\xf0\x9d\x95\x9f"),
+ ENTITY_DEF("rtrif", 9656, "\xe2\x96\xb8"),
+ ENTITY_DEF("nrightarrow", 8603, "\xe2\x86\x9b"),
+ ENTITY_DEF("Kappa", 922, "\xce\x9a"),
+ ENTITY_DEF("simrarr", 10610, "\xe2\xa5\xb2"),
+ ENTITY_DEF("imacr", 299, "\xc4\xab"),
+ ENTITY_DEF("vrtri", 8883, "\xe2\x8a\xb3"),
+ ENTITY_DEF("part", 8706, "\xe2\x88\x82"),
+ ENTITY_DEF("esim", 8770, "\xe2\x89\x82"),
+ ENTITY_DEF_HEUR("atilde", 227, "\xc3\xa3"),
+ ENTITY_DEF("DownRightTeeVector", 10591, "\xe2\xa5\x9f"),
+ ENTITY_DEF("jcirc", 309, "\xc4\xb5"),
+ ENTITY_DEF("Ecaron", 282, "\xc4\x9a"),
+ ENTITY_DEF("VerticalSeparator", 10072, "\xe2\x9d\x98"),
+ ENTITY_DEF("rHar", 10596, "\xe2\xa5\xa4"),
+ ENTITY_DEF("rcaron", 345, "\xc5\x99"),
+ ENTITY_DEF("subnE", 10955, "\xe2\xab\x8b"),
+ ENTITY_DEF("ii", 8520, "\xe2\x85\x88"),
+ ENTITY_DEF("Cconint", 8752, "\xe2\x88\xb0"),
+ ENTITY_DEF("Mcy", 1052, "\xd0\x9c"),
+ ENTITY_DEF("eqcolon", 8789, "\xe2\x89\x95"),
+ ENTITY_DEF("cupor", 10821, "\xe2\xa9\x85"),
+ ENTITY_DEF("DoubleUpArrow", 8657, "\xe2\x87\x91"),
+ ENTITY_DEF("boxbox", 10697, "\xe2\xa7\x89"),
+ ENTITY_DEF("setminus", 8726, "\xe2\x88\x96"),
+ ENTITY_DEF("Lleftarrow", 8666, "\xe2\x87\x9a"),
+ ENTITY_DEF("nang", 8736, "\xe2\x88\xa0\xe2\x83\x92"),
+ ENTITY_DEF("TRADE", 8482, "\xe2\x84\xa2"),
+ ENTITY_DEF("urcorner", 8989, "\xe2\x8c\x9d"),
+ ENTITY_DEF("lsqb", 91, "\x5b"),
+ ENTITY_DEF("cupcup", 10826, "\xe2\xa9\x8a"),
+ ENTITY_DEF("kjcy", 1116, "\xd1\x9c"),
+ ENTITY_DEF("llhard", 10603, "\xe2\xa5\xab"),
+ ENTITY_DEF("mumap", 8888, "\xe2\x8a\xb8"),
+ ENTITY_DEF("iiint", 8749, "\xe2\x88\xad"),
+ ENTITY_DEF("RightTee", 8866, "\xe2\x8a\xa2"),
+ ENTITY_DEF("Tcaron", 356, "\xc5\xa4"),
+ ENTITY_DEF("bigcirc", 9711, "\xe2\x97\xaf"),
+ ENTITY_DEF("trianglerighteq", 8885, "\xe2\x8a\xb5"),
+ ENTITY_DEF("NotLessGreater", 8824, "\xe2\x89\xb8"),
+ ENTITY_DEF("hArr", 8660, "\xe2\x87\x94"),
+ ENTITY_DEF("ocy", 1086, "\xd0\xbe"),
+ ENTITY_DEF("tosa", 10537, "\xe2\xa4\xa9"),
+ ENTITY_DEF("twixt", 8812, "\xe2\x89\xac"),
+ ENTITY_DEF("square", 9633, "\xe2\x96\xa1"),
+ ENTITY_DEF("Otimes", 10807, "\xe2\xa8\xb7"),
+ ENTITY_DEF("Kcedil", 310, "\xc4\xb6"),
+ ENTITY_DEF("beth", 8502, "\xe2\x84\xb6"),
+ ENTITY_DEF("triminus", 10810, "\xe2\xa8\xba"),
+ ENTITY_DEF("nlArr", 8653, "\xe2\x87\x8d"),
+ ENTITY_DEF("Oacute", 211, "\xc3\x93"),
+ ENTITY_DEF("zwnj", 8204, "\xe2\x80\x8c"),
+ ENTITY_DEF("ll", 8810, "\xe2\x89\xaa"),
+ ENTITY_DEF("smashp", 10803, "\xe2\xa8\xb3"),
+ ENTITY_DEF("ngeqq", 8807, "\xe2\x89\xa7\xcc\xb8"),
+ ENTITY_DEF("rnmid", 10990, "\xe2\xab\xae"),
+ ENTITY_DEF("nwArr", 8662, "\xe2\x87\x96"),
+ ENTITY_DEF("RightUpDownVector", 10575, "\xe2\xa5\x8f"),
+ ENTITY_DEF("lbbrk", 10098, "\xe2\x9d\xb2"),
+ ENTITY_DEF("compfn", 8728, "\xe2\x88\x98"),
+ ENTITY_DEF("eDDot", 10871, "\xe2\xa9\xb7"),
+ ENTITY_DEF("Jsercy", 1032, "\xd0\x88"),
+ ENTITY_DEF("HARDcy", 1066, "\xd0\xaa"),
+ ENTITY_DEF("nexists", 8708, "\xe2\x88\x84"),
+ ENTITY_DEF("theta", 952, "\xce\xb8"),
+ ENTITY_DEF("plankv", 8463, "\xe2\x84\x8f"),
+ ENTITY_DEF_HEUR("sup2", 178, "\xc2\xb2"),
+ ENTITY_DEF("lessapprox", 10885, "\xe2\xaa\x85"),
+ ENTITY_DEF("gdot", 289, "\xc4\xa1"),
+ ENTITY_DEF("angmsdae", 10668, "\xe2\xa6\xac"),
+ ENTITY_DEF("Superset", 8835, "\xe2\x8a\x83"),
+ ENTITY_DEF("prap", 10935, "\xe2\xaa\xb7"),
+ ENTITY_DEF("Zscr", 119989, "\xf0\x9d\x92\xb5"),
+ ENTITY_DEF("nsucc", 8833, "\xe2\x8a\x81"),
+ ENTITY_DEF("supseteqq", 10950, "\xe2\xab\x86"),
+ ENTITY_DEF("UpTee", 8869, "\xe2\x8a\xa5"),
+ ENTITY_DEF("LowerLeftArrow", 8601, "\xe2\x86\x99"),
+ ENTITY_DEF("ssmile", 8995, "\xe2\x8c\xa3"),
+ ENTITY_DEF("niv", 8715, "\xe2\x88\x8b"),
+ ENTITY_DEF("bigvee", 8897, "\xe2\x8b\x81"),
+ ENTITY_DEF("kscr", 120000, "\xf0\x9d\x93\x80"),
+ ENTITY_DEF("xutri", 9651, "\xe2\x96\xb3"),
+ ENTITY_DEF("caret", 8257, "\xe2\x81\x81"),
+ ENTITY_DEF("caron", 711, "\xcb\x87"),
+ ENTITY_DEF("Wedge", 8896, "\xe2\x8b\x80"),
+ ENTITY_DEF("sdotb", 8865, "\xe2\x8a\xa1"),
+ ENTITY_DEF("bigoplus", 10753, "\xe2\xa8\x81"),
+ ENTITY_DEF("Breve", 728, "\xcb\x98"),
+ ENTITY_DEF("ImaginaryI", 8520, "\xe2\x85\x88"),
+ ENTITY_DEF("longmapsto", 10236, "\xe2\x9f\xbc"),
+ ENTITY_DEF("boxVH", 9580, "\xe2\x95\xac"),
+ ENTITY_DEF("lozenge", 9674, "\xe2\x97\x8a"),
+ ENTITY_DEF("toea", 10536, "\xe2\xa4\xa8"),
+ ENTITY_DEF("nbumpe", 8783, "\xe2\x89\x8f\xcc\xb8"),
+ ENTITY_DEF("gcirc", 285, "\xc4\x9d"),
+ ENTITY_DEF("NotHumpEqual", 8783, "\xe2\x89\x8f\xcc\xb8"),
+ ENTITY_DEF("pre", 10927, "\xe2\xaa\xaf"),
+ ENTITY_DEF("ascr", 119990, "\xf0\x9d\x92\xb6"),
+ ENTITY_DEF("Acirc", 194, "\xc3\x82"),
+ ENTITY_DEF("questeq", 8799, "\xe2\x89\x9f"),
+ ENTITY_DEF("ncaron", 328, "\xc5\x88"),
+ ENTITY_DEF("LeftTeeArrow", 8612, "\xe2\x86\xa4"),
+ ENTITY_DEF("xcirc", 9711, "\xe2\x97\xaf"),
+ ENTITY_DEF("swarr", 8601, "\xe2\x86\x99"),
+ ENTITY_DEF("MinusPlus", 8723, "\xe2\x88\x93"),
+ ENTITY_DEF("plus", 43, "\x2b"),
+ ENTITY_DEF("NotDoubleVerticalBar", 8742, "\xe2\x88\xa6"),
+ ENTITY_DEF("rppolint", 10770, "\xe2\xa8\x92"),
+ ENTITY_DEF("NotTildeFullEqual", 8775, "\xe2\x89\x87"),
+ ENTITY_DEF("ltdot", 8918, "\xe2\x8b\x96"),
+ ENTITY_DEF("NotNestedGreaterGreater", 10914, "\xe2\xaa\xa2\xcc\xb8"),
+ ENTITY_DEF("Lscr", 8466, "\xe2\x84\x92"),
+ ENTITY_DEF("pitchfork", 8916, "\xe2\x8b\x94"),
+ ENTITY_DEF("Eopf", 120124, "\xf0\x9d\x94\xbc"),
+ ENTITY_DEF("ropf", 120163, "\xf0\x9d\x95\xa3"),
+ ENTITY_DEF("Delta", 916, "\xce\x94"),
+ ENTITY_DEF("lozf", 10731, "\xe2\xa7\xab"),
+ ENTITY_DEF("RightTeeVector", 10587, "\xe2\xa5\x9b"),
+ ENTITY_DEF("UpDownArrow", 8597, "\xe2\x86\x95"),
+ ENTITY_DEF("bump", 8782, "\xe2\x89\x8e"),
+ ENTITY_DEF("Rscr", 8475, "\xe2\x84\x9b"),
+ ENTITY_DEF("slarr", 8592, "\xe2\x86\x90"),
+ ENTITY_DEF("lcy", 1083, "\xd0\xbb"),
+ ENTITY_DEF("Vee", 8897, "\xe2\x8b\x81"),
+ ENTITY_DEF("Iogon", 302, "\xc4\xae"),
+ ENTITY_DEF("minus", 8722, "\xe2\x88\x92"),
+ ENTITY_DEF("GreaterFullEqual", 8807, "\xe2\x89\xa7"),
+ ENTITY_DEF("xhArr", 10234, "\xe2\x9f\xba"),
+ ENTITY_DEF("shortmid", 8739, "\xe2\x88\xa3"),
+ ENTITY_DEF("DoubleDownArrow", 8659, "\xe2\x87\x93"),
+ ENTITY_DEF("Wscr", 119986, "\xf0\x9d\x92\xb2"),
+ ENTITY_DEF("rang", 10217, "\xe2\x9f\xa9"),
+ ENTITY_DEF("lcub", 123, "\x7b"),
+ ENTITY_DEF("mnplus", 8723, "\xe2\x88\x93"),
+ ENTITY_DEF("ulcrop", 8975, "\xe2\x8c\x8f"),
+ ENTITY_DEF("wfr", 120116, "\xf0\x9d\x94\xb4"),
+ ENTITY_DEF("DifferentialD", 8518, "\xe2\x85\x86"),
+ ENTITY_DEF("ThinSpace", 8201, "\xe2\x80\x89"),
+ ENTITY_DEF("NotGreaterGreater", 8811, "\xe2\x89\xab\xcc\xb8"),
+ ENTITY_DEF("Topf", 120139, "\xf0\x9d\x95\x8b"),
+ ENTITY_DEF("sbquo", 8218, "\xe2\x80\x9a"),
+ ENTITY_DEF("sdot", 8901, "\xe2\x8b\x85"),
+ ENTITY_DEF("DoubleLeftTee", 10980, "\xe2\xab\xa4"),
+ ENTITY_DEF("vBarv", 10985, "\xe2\xab\xa9"),
+ ENTITY_DEF("subne", 8842, "\xe2\x8a\x8a"),
+ ENTITY_DEF("gtrdot", 8919, "\xe2\x8b\x97"),
+ ENTITY_DEF("opar", 10679, "\xe2\xa6\xb7"),
+ ENTITY_DEF("apid", 8779, "\xe2\x89\x8b"),
+ ENTITY_DEF("Cross", 10799, "\xe2\xa8\xaf"),
+ ENTITY_DEF("lhblk", 9604, "\xe2\x96\x84"),
+ ENTITY_DEF("capcap", 10827, "\xe2\xa9\x8b"),
+ ENTITY_DEF("midast", 42, "\x2a"),
+ ENTITY_DEF("lscr", 120001, "\xf0\x9d\x93\x81"),
+ ENTITY_DEF("nGt", 8811, "\xe2\x89\xab\xe2\x83\x92"),
+ ENTITY_DEF_HEUR("Euml", 203, "\xc3\x8b"),
+ ENTITY_DEF("blacktriangledown", 9662, "\xe2\x96\xbe"),
+ ENTITY_DEF("Rcy", 1056, "\xd0\xa0"),
+ ENTITY_DEF("dfisht", 10623, "\xe2\xa5\xbf"),
+ ENTITY_DEF("dashv", 8867, "\xe2\x8a\xa3"),
+ ENTITY_DEF("ast", 42, "\x2a"),
+ ENTITY_DEF("ContourIntegral", 8750, "\xe2\x88\xae"),
+ ENTITY_DEF("Ofr", 120082, "\xf0\x9d\x94\x92"),
+ ENTITY_DEF("Lcy", 1051, "\xd0\x9b"),
+ ENTITY_DEF("nltrie", 8940, "\xe2\x8b\xac"),
+ ENTITY_DEF("ShortUpArrow", 8593, "\xe2\x86\x91"),
+ ENTITY_DEF("acy", 1072, "\xd0\xb0"),
+ ENTITY_DEF("rightarrow", 8594, "\xe2\x86\x92"),
+ ENTITY_DEF("UnderBar", 95, "\x5f"),
+ ENTITY_DEF("LongLeftArrow", 10229, "\xe2\x9f\xb5"),
+ ENTITY_DEF("andd", 10844, "\xe2\xa9\x9c"),
+ ENTITY_DEF("xlarr", 10229, "\xe2\x9f\xb5"),
+ ENTITY_DEF("percnt", 37, "\x25"),
+ ENTITY_DEF("rharu", 8640, "\xe2\x87\x80"),
+ ENTITY_DEF("plusdo", 8724, "\xe2\x88\x94"),
+ ENTITY_DEF("TScy", 1062, "\xd0\xa6"),
+ ENTITY_DEF("kcy", 1082, "\xd0\xba"),
+ ENTITY_DEF("boxVR", 9568, "\xe2\x95\xa0"),
+ ENTITY_DEF("looparrowleft", 8619, "\xe2\x86\xab"),
+ ENTITY_DEF("scirc", 349, "\xc5\x9d"),
+ ENTITY_DEF("drcorn", 8991, "\xe2\x8c\x9f"),
+ ENTITY_DEF("iiota", 8489, "\xe2\x84\xa9"),
+ ENTITY_DEF("Zcy", 1047, "\xd0\x97"),
+ ENTITY_DEF("frac58", 8541, "\xe2\x85\x9d"),
+ ENTITY_DEF("alpha", 945, "\xce\xb1"),
+ ENTITY_DEF("daleth", 8504, "\xe2\x84\xb8"),
+ ENTITY_DEF("gtreqless", 8923, "\xe2\x8b\x9b"),
+ ENTITY_DEF("tstrok", 359, "\xc5\xa7"),
+ ENTITY_DEF("plusb", 8862, "\xe2\x8a\x9e"),
+ ENTITY_DEF("odsold", 10684, "\xe2\xa6\xbc"),
+ ENTITY_DEF("varsupsetneqq", 10956, "\xe2\xab\x8c\xef\xb8\x80"),
+ ENTITY_DEF_HEUR("otilde", 245, "\xc3\xb5"),
+ ENTITY_DEF("gtcir", 10874, "\xe2\xa9\xba"),
+ ENTITY_DEF("lltri", 9722, "\xe2\x97\xba"),
+ ENTITY_DEF("rx", 8478, "\xe2\x84\x9e"),
+ ENTITY_DEF("ljcy", 1113, "\xd1\x99"),
+ ENTITY_DEF("parsim", 10995, "\xe2\xab\xb3"),
+ ENTITY_DEF("NotElement", 8713, "\xe2\x88\x89"),
+ ENTITY_DEF_HEUR("plusmn", 177, "\xc2\xb1"),
+ ENTITY_DEF("varsubsetneq", 8842, "\xe2\x8a\x8a\xef\xb8\x80"),
+ ENTITY_DEF("subset", 8834, "\xe2\x8a\x82"),
+ ENTITY_DEF("awint", 10769, "\xe2\xa8\x91"),
+ ENTITY_DEF("laemptyv", 10676, "\xe2\xa6\xb4"),
+ ENTITY_DEF("phiv", 981, "\xcf\x95"),
+ ENTITY_DEF("sfrown", 8994, "\xe2\x8c\xa2"),
+ ENTITY_DEF("DoubleUpDownArrow", 8661, "\xe2\x87\x95"),
+ ENTITY_DEF("lpar", 40, "\x28"),
+ ENTITY_DEF("frac45", 8536, "\xe2\x85\x98"),
+ ENTITY_DEF("rBarr", 10511, "\xe2\xa4\x8f"),
+ ENTITY_DEF("npolint", 10772, "\xe2\xa8\x94"),
+ ENTITY_DEF("emacr", 275, "\xc4\x93"),
+ ENTITY_DEF("maltese", 10016, "\xe2\x9c\xa0"),
+ ENTITY_DEF("PlusMinus", 177, "\xc2\xb1"),
+ ENTITY_DEF("ReverseEquilibrium", 8651, "\xe2\x87\x8b"),
+ ENTITY_DEF("oscr", 8500, "\xe2\x84\xb4"),
+ ENTITY_DEF("blacksquare", 9642, "\xe2\x96\xaa"),
+ ENTITY_DEF("TSHcy", 1035, "\xd0\x8b"),
+ ENTITY_DEF("gap", 10886, "\xe2\xaa\x86"),
+ ENTITY_DEF("xnis", 8955, "\xe2\x8b\xbb"),
+ ENTITY_DEF("Ll", 8920, "\xe2\x8b\x98"),
+ ENTITY_DEF("PrecedesEqual", 10927, "\xe2\xaa\xaf"),
+ ENTITY_DEF("incare", 8453, "\xe2\x84\x85"),
+ ENTITY_DEF("nharr", 8622, "\xe2\x86\xae"),
+ ENTITY_DEF("varnothing", 8709, "\xe2\x88\x85"),
+ ENTITY_DEF("ShortDownArrow", 8595, "\xe2\x86\x93"),
+ ENTITY_DEF_HEUR("nbsp", 160, " "),
+ ENTITY_DEF("asympeq", 8781, "\xe2\x89\x8d"),
+ ENTITY_DEF("rbrkslu", 10640, "\xe2\xa6\x90"),
+ ENTITY_DEF("rho", 961, "\xcf\x81"),
+ ENTITY_DEF("Mscr", 8499, "\xe2\x84\xb3"),
+ ENTITY_DEF_HEUR("eth", 240, "\xc3\xb0"),
+ ENTITY_DEF("suplarr", 10619, "\xe2\xa5\xbb"),
+ ENTITY_DEF("Tab", 9, "\x09"),
+ ENTITY_DEF("omicron", 959, "\xce\xbf"),
+ ENTITY_DEF("blacktriangle", 9652, "\xe2\x96\xb4"),
+ ENTITY_DEF("nldr", 8229, "\xe2\x80\xa5"),
+ ENTITY_DEF("downharpoonleft", 8643, "\xe2\x87\x83"),
+ ENTITY_DEF("circledcirc", 8858, "\xe2\x8a\x9a"),
+ ENTITY_DEF("leftleftarrows", 8647, "\xe2\x87\x87"),
+ ENTITY_DEF("NotHumpDownHump", 8782, "\xe2\x89\x8e\xcc\xb8"),
+ ENTITY_DEF("nvgt", 62, "\x3e\xe2\x83\x92"),
+ ENTITY_DEF("rhard", 8641, "\xe2\x87\x81"),
+ ENTITY_DEF("nGg", 8921, "\xe2\x8b\x99\xcc\xb8"),
+ ENTITY_DEF("lurdshar", 10570, "\xe2\xa5\x8a"),
+ ENTITY_DEF("cirE", 10691, "\xe2\xa7\x83"),
+ ENTITY_DEF("isinE", 8953, "\xe2\x8b\xb9"),
+ ENTITY_DEF("eparsl", 10723, "\xe2\xa7\xa3"),
+ ENTITY_DEF("RightAngleBracket", 10217, "\xe2\x9f\xa9"),
+ ENTITY_DEF("hcirc", 293, "\xc4\xa5"),
+ ENTITY_DEF("bumpeq", 8783, "\xe2\x89\x8f"),
+ ENTITY_DEF("cire", 8791, "\xe2\x89\x97"),
+ ENTITY_DEF("dotplus", 8724, "\xe2\x88\x94"),
+ ENTITY_DEF("itilde", 297, "\xc4\xa9"),
+ ENTITY_DEF("uwangle", 10663, "\xe2\xa6\xa7"),
+ ENTITY_DEF("rlhar", 8652, "\xe2\x87\x8c"),
+ ENTITY_DEF("rbrace", 125, "\x7d"),
+ ENTITY_DEF("mid", 8739, "\xe2\x88\xa3"),
+ ENTITY_DEF("el", 10905, "\xe2\xaa\x99"),
+ ENTITY_DEF("KJcy", 1036, "\xd0\x8c"),
+ ENTITY_DEF("odiv", 10808, "\xe2\xa8\xb8"),
+ ENTITY_DEF("amacr", 257, "\xc4\x81"),
+ ENTITY_DEF("qprime", 8279, "\xe2\x81\x97"),
+ ENTITY_DEF("tcedil", 355, "\xc5\xa3"),
+ ENTITY_DEF("UpArrowDownArrow", 8645, "\xe2\x87\x85"),
+ ENTITY_DEF("spades", 9824, "\xe2\x99\xa0"),
+ ENTITY_DEF("napos", 329, "\xc5\x89"),
+ ENTITY_DEF("straightepsilon", 1013, "\xcf\xb5"),
+ ENTITY_DEF("CupCap", 8781, "\xe2\x89\x8d"),
+ ENTITY_DEF("Oopf", 120134, "\xf0\x9d\x95\x86"),
+ ENTITY_DEF("sub", 8834, "\xe2\x8a\x82"),
+ ENTITY_DEF("ohm", 937, "\xce\xa9"),
+ ENTITY_DEF("UnderBrace", 9183, "\xe2\x8f\x9f"),
+ ENTITY_DEF("looparrowright", 8620, "\xe2\x86\xac"),
+ ENTITY_DEF("xotime", 10754, "\xe2\xa8\x82"),
+ ENTITY_DEF("ntgl", 8825, "\xe2\x89\xb9"),
+ ENTITY_DEF("minusdu", 10794, "\xe2\xa8\xaa"),
+ ENTITY_DEF("rarrb", 8677, "\xe2\x87\xa5"),
+ ENTITY_DEF("nvlArr", 10498, "\xe2\xa4\x82"),
+ ENTITY_DEF("triangle", 9653, "\xe2\x96\xb5"),
+ ENTITY_DEF("nacute", 324, "\xc5\x84"),
+ ENTITY_DEF("boxHD", 9574, "\xe2\x95\xa6"),
+ ENTITY_DEF("ratio", 8758, "\xe2\x88\xb6"),
+ ENTITY_DEF("larrsim", 10611, "\xe2\xa5\xb3"),
+ ENTITY_DEF("LessLess", 10913, "\xe2\xaa\xa1"),
+ ENTITY_DEF("yacy", 1103, "\xd1\x8f"),
+ ENTITY_DEF("ctdot", 8943, "\xe2\x8b\xaf"),
+ ENTITY_DEF("and", 8743, "\xe2\x88\xa7"),
+ ENTITY_DEF("lrtri", 8895, "\xe2\x8a\xbf"),
+ ENTITY_DEF("eDot", 8785, "\xe2\x89\x91"),
+ ENTITY_DEF("sqsub", 8847, "\xe2\x8a\x8f"),
+ ENTITY_DEF("real", 8476, "\xe2\x84\x9c"),
+ ENTITY_DEF("Dcy", 1044, "\xd0\x94"),
+ ENTITY_DEF("vartheta", 977, "\xcf\x91"),
+ ENTITY_DEF("nsub", 8836, "\xe2\x8a\x84"),
+ ENTITY_DEF("DownTee", 8868, "\xe2\x8a\xa4"),
+ ENTITY_DEF_HEUR("acute", 180, "\xc2\xb4"),
+ ENTITY_DEF("GreaterLess", 8823, "\xe2\x89\xb7"),
+ ENTITY_DEF("supplus", 10944, "\xe2\xab\x80"),
+ ENTITY_DEF("Vbar", 10987, "\xe2\xab\xab"),
+ ENTITY_DEF("divideontimes", 8903, "\xe2\x8b\x87"),
+ ENTITY_DEF("lsim", 8818, "\xe2\x89\xb2"),
+ ENTITY_DEF("nearhk", 10532, "\xe2\xa4\xa4"),
+ ENTITY_DEF("nLtv", 8810, "\xe2\x89\xaa\xcc\xb8"),
+ ENTITY_DEF("RuleDelayed", 10740, "\xe2\xa7\xb4"),
+ ENTITY_DEF("smile", 8995, "\xe2\x8c\xa3"),
+ ENTITY_DEF("coprod", 8720, "\xe2\x88\x90"),
+ ENTITY_DEF("imof", 8887, "\xe2\x8a\xb7"),
+ ENTITY_DEF("ecy", 1101, "\xd1\x8d"),
+ ENTITY_DEF("RightCeiling", 8969, "\xe2\x8c\x89"),
+ ENTITY_DEF("dlcorn", 8990, "\xe2\x8c\x9e"),
+ ENTITY_DEF("Nu", 925, "\xce\x9d"),
+ ENTITY_DEF("frac18", 8539, "\xe2\x85\x9b"),
+ ENTITY_DEF("diamond", 8900, "\xe2\x8b\x84"),
+ ENTITY_DEF("Icirc", 206, "\xc3\x8e"),
+ ENTITY_DEF("ngeq", 8817, "\xe2\x89\xb1"),
+ ENTITY_DEF("epsilon", 949, "\xce\xb5"),
+ ENTITY_DEF("fork", 8916, "\xe2\x8b\x94"),
+ ENTITY_DEF("xrarr", 10230, "\xe2\x9f\xb6"),
+ ENTITY_DEF("racute", 341, "\xc5\x95"),
+ ENTITY_DEF("ntlg", 8824, "\xe2\x89\xb8"),
+ ENTITY_DEF("xvee", 8897, "\xe2\x8b\x81"),
+ ENTITY_DEF("LeftArrowRightArrow", 8646, "\xe2\x87\x86"),
+ ENTITY_DEF("DownLeftRightVector", 10576, "\xe2\xa5\x90"),
+ ENTITY_DEF("Eacute", 201, "\xc3\x89"),
+ ENTITY_DEF("gimel", 8503, "\xe2\x84\xb7"),
+ ENTITY_DEF("rtimes", 8906, "\xe2\x8b\x8a"),
+ ENTITY_DEF("forall", 8704, "\xe2\x88\x80"),
+ ENTITY_DEF("DiacriticalDoubleAcute", 733, "\xcb\x9d"),
+ ENTITY_DEF("dArr", 8659, "\xe2\x87\x93"),
+ ENTITY_DEF("fallingdotseq", 8786, "\xe2\x89\x92"),
+ ENTITY_DEF("Aogon", 260, "\xc4\x84"),
+ ENTITY_DEF("PartialD", 8706, "\xe2\x88\x82"),
+ ENTITY_DEF("mapstoup", 8613, "\xe2\x86\xa5"),
+ ENTITY_DEF("die", 168, "\xc2\xa8"),
+ ENTITY_DEF("ngt", 8815, "\xe2\x89\xaf"),
+ ENTITY_DEF("vcy", 1074, "\xd0\xb2"),
+ ENTITY_DEF("fjlig", (unsigned) -1, "\x66\x6a"),
+ ENTITY_DEF("submult", 10945, "\xe2\xab\x81"),
+ ENTITY_DEF("ubrcy", 1118, "\xd1\x9e"),
+ ENTITY_DEF("ovbar", 9021, "\xe2\x8c\xbd"),
+ ENTITY_DEF("bsime", 8909, "\xe2\x8b\x8d"),
+ ENTITY_DEF("precnsim", 8936, "\xe2\x8b\xa8"),
+ ENTITY_DEF("DiacriticalTilde", 732, "\xcb\x9c"),
+ ENTITY_DEF("cwint", 8753, "\xe2\x88\xb1"),
+ ENTITY_DEF("Scy", 1057, "\xd0\xa1"),
+ ENTITY_DEF("NotGreaterEqual", 8817, "\xe2\x89\xb1"),
+ ENTITY_DEF("boxUR", 9562, "\xe2\x95\x9a"),
+ ENTITY_DEF("LessSlantEqual", 10877, "\xe2\xa9\xbd"),
+ ENTITY_DEF("Barwed", 8966, "\xe2\x8c\x86"),
+ ENTITY_DEF("supdot", 10942, "\xe2\xaa\xbe"),
+ ENTITY_DEF("gel", 8923, "\xe2\x8b\x9b"),
+ ENTITY_DEF("iscr", 119998, "\xf0\x9d\x92\xbe"),
+ ENTITY_DEF("doublebarwedge", 8966, "\xe2\x8c\x86"),
+ ENTITY_DEF("Idot", 304, "\xc4\xb0"),
+ ENTITY_DEF("DoubleDot", 168, "\xc2\xa8"),
+ ENTITY_DEF("rsquo", 8217, "\xe2\x80\x99"),
+ ENTITY_DEF("subsetneqq", 10955, "\xe2\xab\x8b"),
+ ENTITY_DEF("UpEquilibrium", 10606, "\xe2\xa5\xae"),
+ ENTITY_DEF("copysr", 8471, "\xe2\x84\x97"),
+ ENTITY_DEF("RightDoubleBracket", 10215, "\xe2\x9f\xa7"),
+ ENTITY_DEF("LeftRightVector", 10574, "\xe2\xa5\x8e"),
+ ENTITY_DEF("DownLeftVectorBar", 10582, "\xe2\xa5\x96"),
+ ENTITY_DEF("suphsub", 10967, "\xe2\xab\x97"),
+ ENTITY_DEF_HEUR("cedil", 184, "\xc2\xb8"),
+ ENTITY_DEF("prurel", 8880, "\xe2\x8a\xb0"),
+ ENTITY_DEF("imagpart", 8465, "\xe2\x84\x91"),
+ ENTITY_DEF("Hscr", 8459, "\xe2\x84\x8b"),
+ ENTITY_DEF("jmath", 567, "\xc8\xb7"),
+ ENTITY_DEF("nrtrie", 8941, "\xe2\x8b\xad"),
+ ENTITY_DEF("nsup", 8837, "\xe2\x8a\x85"),
+ ENTITY_DEF("Ubrcy", 1038, "\xd0\x8e"),
+ ENTITY_DEF("succnsim", 8937, "\xe2\x8b\xa9"),
+ ENTITY_DEF("nesim", 8770, "\xe2\x89\x82\xcc\xb8"),
+ ENTITY_DEF("varepsilon", 1013, "\xcf\xb5"),
+ ENTITY_DEF("DoubleRightTee", 8872, "\xe2\x8a\xa8"),
+ ENTITY_DEF_HEUR("not", 172, "\xc2\xac"),
+ ENTITY_DEF("lesdot", 10879, "\xe2\xa9\xbf"),
+ ENTITY_DEF("backepsilon", 1014, "\xcf\xb6"),
+ ENTITY_DEF("srarr", 8594, "\xe2\x86\x92"),
+ ENTITY_DEF("varsubsetneqq", 10955, "\xe2\xab\x8b\xef\xb8\x80"),
+ ENTITY_DEF("sqcap", 8851, "\xe2\x8a\x93"),
+ ENTITY_DEF("rightleftarrows", 8644, "\xe2\x87\x84"),
+ ENTITY_DEF("diams", 9830, "\xe2\x99\xa6"),
+ ENTITY_DEF("boxdR", 9554, "\xe2\x95\x92"),
+ ENTITY_DEF("ngeqslant", 10878, "\xe2\xa9\xbe\xcc\xb8"),
+ ENTITY_DEF("boxDR", 9556, "\xe2\x95\x94"),
+ ENTITY_DEF("sext", 10038, "\xe2\x9c\xb6"),
+ ENTITY_DEF("backsim", 8765, "\xe2\x88\xbd"),
+ ENTITY_DEF("nfr", 120107, "\xf0\x9d\x94\xab"),
+ ENTITY_DEF("CloseCurlyDoubleQuote", 8221, "\xe2\x80\x9d"),
+ ENTITY_DEF("npart", 8706, "\xe2\x88\x82\xcc\xb8"),
+ ENTITY_DEF("dharl", 8643, "\xe2\x87\x83"),
+ ENTITY_DEF("NewLine", 10, "\x0a"),
+ ENTITY_DEF("bigotimes", 10754, "\xe2\xa8\x82"),
+ ENTITY_DEF("lAtail", 10523, "\xe2\xa4\x9b"),
+ ENTITY_DEF_HEUR("frac14", 188, "\xc2\xbc"),
+ ENTITY_DEF("or", 8744, "\xe2\x88\xa8"),
+ ENTITY_DEF("subedot", 10947, "\xe2\xab\x83"),
+ ENTITY_DEF("nmid", 8740, "\xe2\x88\xa4"),
+ ENTITY_DEF("DownArrowUpArrow", 8693, "\xe2\x87\xb5"),
+ ENTITY_DEF("icy", 1080, "\xd0\xb8"),
+ ENTITY_DEF("num", 35, "\x23"),
+ ENTITY_DEF("Gdot", 288, "\xc4\xa0"),
+ ENTITY_DEF("urcrop", 8974, "\xe2\x8c\x8e"),
+ ENTITY_DEF("epsiv", 1013, "\xcf\xb5"),
+ ENTITY_DEF("topcir", 10993, "\xe2\xab\xb1"),
+ ENTITY_DEF("ne", 8800, "\xe2\x89\xa0"),
+ ENTITY_DEF("osol", 8856, "\xe2\x8a\x98"),
+ ENTITY_DEF_HEUR("amp", 38, "\x26"),
+ ENTITY_DEF("ncap", 10819, "\xe2\xa9\x83"),
+ ENTITY_DEF("Sscr", 119982, "\xf0\x9d\x92\xae"),
+ ENTITY_DEF("sung", 9834, "\xe2\x99\xaa"),
+ ENTITY_DEF("ltri", 9667, "\xe2\x97\x83"),
+ ENTITY_DEF("frac25", 8534, "\xe2\x85\x96"),
+ ENTITY_DEF("DZcy", 1039, "\xd0\x8f"),
+ ENTITY_DEF("RightUpVector", 8638, "\xe2\x86\xbe"),
+ ENTITY_DEF("rsquor", 8217, "\xe2\x80\x99"),
+ ENTITY_DEF("uplus", 8846, "\xe2\x8a\x8e"),
+ ENTITY_DEF("triangleright", 9657, "\xe2\x96\xb9"),
+ ENTITY_DEF("lAarr", 8666, "\xe2\x87\x9a"),
+ ENTITY_DEF("HilbertSpace", 8459, "\xe2\x84\x8b"),
+ ENTITY_DEF("there4", 8756, "\xe2\x88\xb4"),
+ ENTITY_DEF("vscr", 120011, "\xf0\x9d\x93\x8b"),
+ ENTITY_DEF("cirscir", 10690, "\xe2\xa7\x82"),
+ ENTITY_DEF("roarr", 8702, "\xe2\x87\xbe"),
+ ENTITY_DEF("hslash", 8463, "\xe2\x84\x8f"),
+ ENTITY_DEF("supdsub", 10968, "\xe2\xab\x98"),
+ ENTITY_DEF("simg", 10910, "\xe2\xaa\x9e"),
+ ENTITY_DEF("trade", 8482, "\xe2\x84\xa2"),
+ ENTITY_DEF("searrow", 8600, "\xe2\x86\x98"),
+ ENTITY_DEF("DownLeftVector", 8637, "\xe2\x86\xbd"),
+ ENTITY_DEF("FilledSmallSquare", 9724, "\xe2\x97\xbc"),
+ ENTITY_DEF("prod", 8719, "\xe2\x88\x8f"),
+ ENTITY_DEF("oror", 10838, "\xe2\xa9\x96"),
+ ENTITY_DEF("udarr", 8645, "\xe2\x87\x85"),
+ ENTITY_DEF("jsercy", 1112, "\xd1\x98"),
+ ENTITY_DEF("tprime", 8244, "\xe2\x80\xb4"),
+ ENTITY_DEF("bprime", 8245, "\xe2\x80\xb5"),
+ ENTITY_DEF("malt", 10016, "\xe2\x9c\xa0"),
+ ENTITY_DEF("bigcup", 8899, "\xe2\x8b\x83"),
+ ENTITY_DEF("oint", 8750, "\xe2\x88\xae"),
+ ENTITY_DEF("female", 9792, "\xe2\x99\x80"),
+ ENTITY_DEF("omacr", 333, "\xc5\x8d"),
+ ENTITY_DEF("SquareSubsetEqual", 8849, "\xe2\x8a\x91"),
+ ENTITY_DEF("SucceedsEqual", 10928, "\xe2\xaa\xb0"),
+ ENTITY_DEF("plusacir", 10787, "\xe2\xa8\xa3"),
+ ENTITY_DEF("Gcirc", 284, "\xc4\x9c"),
+ ENTITY_DEF("lesdotor", 10883, "\xe2\xaa\x83"),
+ ENTITY_DEF("escr", 8495, "\xe2\x84\xaf"),
+ ENTITY_DEF_HEUR("THORN", 222, "\xc3\x9e"),
+ ENTITY_DEF("UpArrowBar", 10514, "\xe2\xa4\x92"),
+ ENTITY_DEF("nvrtrie", 8885, "\xe2\x8a\xb5\xe2\x83\x92"),
+ ENTITY_DEF("varkappa", 1008, "\xcf\xb0"),
+ ENTITY_DEF("NotReverseElement", 8716, "\xe2\x88\x8c"),
+ ENTITY_DEF("zdot", 380, "\xc5\xbc"),
+ ENTITY_DEF("ExponentialE", 8519, "\xe2\x85\x87"),
+ ENTITY_DEF("lesseqgtr", 8922, "\xe2\x8b\x9a"),
+ ENTITY_DEF("cscr", 119992, "\xf0\x9d\x92\xb8"),
+ ENTITY_DEF("Dscr", 119967, "\xf0\x9d\x92\x9f"),
+ ENTITY_DEF("lthree", 8907, "\xe2\x8b\x8b"),
+ ENTITY_DEF("Ccedil", 199, "\xc3\x87"),
+ ENTITY_DEF("nge", 8817, "\xe2\x89\xb1"),
+ ENTITY_DEF("UpperLeftArrow", 8598, "\xe2\x86\x96"),
+ ENTITY_DEF("vDash", 8872, "\xe2\x8a\xa8"),
+ ENTITY_DEF("efDot", 8786, "\xe2\x89\x92"),
+ ENTITY_DEF("telrec", 8981, "\xe2\x8c\x95"),
+ ENTITY_DEF("vellip", 8942, "\xe2\x8b\xae"),
+ ENTITY_DEF("nrArr", 8655, "\xe2\x87\x8f"),
+ ENTITY_DEF_HEUR("ugrave", 249, "\xc3\xb9"),
+ ENTITY_DEF("uring", 367, "\xc5\xaf"),
+ ENTITY_DEF("Bernoullis", 8492, "\xe2\x84\xac"),
+ ENTITY_DEF("nles", 10877, "\xe2\xa9\xbd\xcc\xb8"),
+ ENTITY_DEF_HEUR("macr", 175, "\xc2\xaf"),
+ ENTITY_DEF("boxuR", 9560, "\xe2\x95\x98"),
+ ENTITY_DEF("clubsuit", 9827, "\xe2\x99\xa3"),
+ ENTITY_DEF("rightarrowtail", 8611, "\xe2\x86\xa3"),
+ ENTITY_DEF("epar", 8917, "\xe2\x8b\x95"),
+ ENTITY_DEF("ltcc", 10918, "\xe2\xaa\xa6"),
+ ENTITY_DEF("twoheadleftarrow", 8606, "\xe2\x86\x9e"),
+ ENTITY_DEF("aleph", 8501, "\xe2\x84\xb5"),
+ ENTITY_DEF("Colon", 8759, "\xe2\x88\xb7"),
+ ENTITY_DEF("vltri", 8882, "\xe2\x8a\xb2"),
+ ENTITY_DEF("quaternions", 8461, "\xe2\x84\x8d"),
+ ENTITY_DEF("rfr", 120111, "\xf0\x9d\x94\xaf"),
+ ENTITY_DEF_HEUR("Ouml", 214, "\xc3\x96"),
+ ENTITY_DEF("rsh", 8625, "\xe2\x86\xb1"),
+ ENTITY_DEF("emptyv", 8709, "\xe2\x88\x85"),
+ ENTITY_DEF("sqsup", 8848, "\xe2\x8a\x90"),
+ ENTITY_DEF("marker", 9646, "\xe2\x96\xae"),
+ ENTITY_DEF("Efr", 120072, "\xf0\x9d\x94\x88"),
+ ENTITY_DEF("DotEqual", 8784, "\xe2\x89\x90"),
+ ENTITY_DEF("eqsim", 8770, "\xe2\x89\x82"),
+ ENTITY_DEF("NotSucceedsEqual", 10928, "\xe2\xaa\xb0\xcc\xb8"),
+ ENTITY_DEF("primes", 8473, "\xe2\x84\x99"),
+ ENTITY_DEF_HEUR("times", 215, "\xc3\x97"),
+ ENTITY_DEF("rangd", 10642, "\xe2\xa6\x92"),
+ ENTITY_DEF("rightharpoonup", 8640, "\xe2\x87\x80"),
+ ENTITY_DEF("lrhard", 10605, "\xe2\xa5\xad"),
+ ENTITY_DEF("ape", 8778, "\xe2\x89\x8a"),
+ ENTITY_DEF("varsupsetneq", 8843, "\xe2\x8a\x8b\xef\xb8\x80"),
+ ENTITY_DEF("larrlp", 8619, "\xe2\x86\xab"),
+ ENTITY_DEF("NotPrecedesEqual", 10927, "\xe2\xaa\xaf\xcc\xb8"),
+ ENTITY_DEF("ulcorner", 8988, "\xe2\x8c\x9c"),
+ ENTITY_DEF("acd", 8767, "\xe2\x88\xbf"),
+ ENTITY_DEF("Hacek", 711, "\xcb\x87"),
+ ENTITY_DEF("xuplus", 10756, "\xe2\xa8\x84"),
+ ENTITY_DEF("therefore", 8756, "\xe2\x88\xb4"),
+ ENTITY_DEF("YIcy", 1031, "\xd0\x87"),
+ ENTITY_DEF("Tfr", 120087, "\xf0\x9d\x94\x97"),
+ ENTITY_DEF("Jcirc", 308, "\xc4\xb4"),
+ ENTITY_DEF("LessGreater", 8822, "\xe2\x89\xb6"),
+ ENTITY_DEF("Uring", 366, "\xc5\xae"),
+ ENTITY_DEF("Ugrave", 217, "\xc3\x99"),
+ ENTITY_DEF("rarr", 8594, "\xe2\x86\x92"),
+ ENTITY_DEF("wopf", 120168, "\xf0\x9d\x95\xa8"),
+ ENTITY_DEF("imath", 305, "\xc4\xb1"),
+ ENTITY_DEF("Yopf", 120144, "\xf0\x9d\x95\x90"),
+ ENTITY_DEF("colone", 8788, "\xe2\x89\x94"),
+ ENTITY_DEF("csube", 10961, "\xe2\xab\x91"),
+ ENTITY_DEF("odash", 8861, "\xe2\x8a\x9d"),
+ ENTITY_DEF("olarr", 8634, "\xe2\x86\xba"),
+ ENTITY_DEF("angrt", 8735, "\xe2\x88\x9f"),
+ ENTITY_DEF("NotLeftTriangleBar", 10703, "\xe2\xa7\x8f\xcc\xb8"),
+ ENTITY_DEF("GreaterEqual", 8805, "\xe2\x89\xa5"),
+ ENTITY_DEF("scnap", 10938, "\xe2\xaa\xba"),
+ ENTITY_DEF("pi", 960, "\xcf\x80"),
+ ENTITY_DEF("lesg", 8922, "\xe2\x8b\x9a\xef\xb8\x80"),
+ ENTITY_DEF("orderof", 8500, "\xe2\x84\xb4"),
+ ENTITY_DEF_HEUR("uacute", 250, "\xc3\xba"),
+ ENTITY_DEF("Barv", 10983, "\xe2\xab\xa7"),
+ ENTITY_DEF("Theta", 920, "\xce\x98"),
+ ENTITY_DEF("leftrightsquigarrow", 8621, "\xe2\x86\xad"),
+ ENTITY_DEF("Atilde", 195, "\xc3\x83"),
+ ENTITY_DEF("cupdot", 8845, "\xe2\x8a\x8d"),
+ ENTITY_DEF("ntriangleright", 8939, "\xe2\x8b\xab"),
+ ENTITY_DEF("measuredangle", 8737, "\xe2\x88\xa1"),
+ ENTITY_DEF("jscr", 119999, "\xf0\x9d\x92\xbf"),
+ ENTITY_DEF("inodot", 305, "\xc4\xb1"),
+ ENTITY_DEF("mopf", 120158, "\xf0\x9d\x95\x9e"),
+ ENTITY_DEF("hkswarow", 10534, "\xe2\xa4\xa6"),
+ ENTITY_DEF("lopar", 10629, "\xe2\xa6\x85"),
+ ENTITY_DEF("thksim", 8764, "\xe2\x88\xbc"),
+ ENTITY_DEF("bkarow", 10509, "\xe2\xa4\x8d"),
+ ENTITY_DEF("rarrfs", 10526, "\xe2\xa4\x9e"),
+ ENTITY_DEF("ntrianglelefteq", 8940, "\xe2\x8b\xac"),
+ ENTITY_DEF("Bscr", 8492, "\xe2\x84\xac"),
+ ENTITY_DEF("topf", 120165, "\xf0\x9d\x95\xa5"),
+ ENTITY_DEF("Uacute", 218, "\xc3\x9a"),
+ ENTITY_DEF("lap", 10885, "\xe2\xaa\x85"),
+ ENTITY_DEF("djcy", 1106, "\xd1\x92"),
+ ENTITY_DEF("bopf", 120147, "\xf0\x9d\x95\x93"),
+ ENTITY_DEF("empty", 8709, "\xe2\x88\x85"),
+ ENTITY_DEF("LeftAngleBracket", 10216, "\xe2\x9f\xa8"),
+ ENTITY_DEF("Imacr", 298, "\xc4\xaa"),
+ ENTITY_DEF("ltcir", 10873, "\xe2\xa9\xb9"),
+ ENTITY_DEF("trisb", 10701, "\xe2\xa7\x8d"),
+ ENTITY_DEF("gjcy", 1107, "\xd1\x93"),
+ ENTITY_DEF("pr", 8826, "\xe2\x89\xba"),
+ ENTITY_DEF("Mu", 924, "\xce\x9c"),
+ ENTITY_DEF("ogon", 731, "\xcb\x9b"),
+ ENTITY_DEF("pertenk", 8241, "\xe2\x80\xb1"),
+ ENTITY_DEF("plustwo", 10791, "\xe2\xa8\xa7"),
+ ENTITY_DEF("Vfr", 120089, "\xf0\x9d\x94\x99"),
+ ENTITY_DEF("ApplyFunction", 8289, "\xe2\x81\xa1"),
+ ENTITY_DEF("Sub", 8912, "\xe2\x8b\x90"),
+ ENTITY_DEF("DoubleLeftRightArrow", 8660, "\xe2\x87\x94"),
+ ENTITY_DEF("Lmidot", 319, "\xc4\xbf"),
+ ENTITY_DEF("nwarrow", 8598, "\xe2\x86\x96"),
+ ENTITY_DEF("angrtvbd", 10653, "\xe2\xa6\x9d"),
+ ENTITY_DEF("fcy", 1092, "\xd1\x84"),
+ ENTITY_DEF("ltlarr", 10614, "\xe2\xa5\xb6"),
+ ENTITY_DEF("CircleMinus", 8854, "\xe2\x8a\x96"),
+ ENTITY_DEF("angmsdab", 10665, "\xe2\xa6\xa9"),
+ ENTITY_DEF("wedgeq", 8793, "\xe2\x89\x99"),
+ ENTITY_DEF("iogon", 303, "\xc4\xaf"),
+ ENTITY_DEF_HEUR("laquo", 171, "\xc2\xab"),
+ ENTITY_DEF("NestedGreaterGreater", 8811, "\xe2\x89\xab"),
+ ENTITY_DEF("UnionPlus", 8846, "\xe2\x8a\x8e"),
+ ENTITY_DEF("CircleDot", 8857, "\xe2\x8a\x99"),
+ ENTITY_DEF("coloneq", 8788, "\xe2\x89\x94"),
+ ENTITY_DEF("csupe", 10962, "\xe2\xab\x92"),
+ ENTITY_DEF("tcaron", 357, "\xc5\xa5"),
+ ENTITY_DEF("GreaterTilde", 8819, "\xe2\x89\xb3"),
+ ENTITY_DEF("Map", 10501, "\xe2\xa4\x85"),
+ ENTITY_DEF("DoubleLongLeftArrow", 10232, "\xe2\x9f\xb8"),
+ ENTITY_DEF("Uparrow", 8657, "\xe2\x87\x91"),
+ ENTITY_DEF("scy", 1089, "\xd1\x81"),
+ ENTITY_DEF("llarr", 8647, "\xe2\x87\x87"),
+ ENTITY_DEF("rangle", 10217, "\xe2\x9f\xa9"),
+ ENTITY_DEF("sstarf", 8902, "\xe2\x8b\x86"),
+ ENTITY_DEF("InvisibleTimes", 8290, "\xe2\x81\xa2"),
+ ENTITY_DEF("egsdot", 10904, "\xe2\xaa\x98"),
+ ENTITY_DEF("target", 8982, "\xe2\x8c\x96"),
+ ENTITY_DEF("lesges", 10899, "\xe2\xaa\x93"),
+ ENTITY_DEF_HEUR("curren", 164, "\xc2\xa4"),
+ ENTITY_DEF("yopf", 120170, "\xf0\x9d\x95\xaa"),
+ ENTITY_DEF("frac23", 8532, "\xe2\x85\x94"),
+ ENTITY_DEF("NotSucceedsTilde", 8831, "\xe2\x89\xbf\xcc\xb8"),
+ ENTITY_DEF("napprox", 8777, "\xe2\x89\x89"),
+ ENTITY_DEF("odblac", 337, "\xc5\x91"),
+ ENTITY_DEF("gammad", 989, "\xcf\x9d"),
+ ENTITY_DEF("dscr", 119993, "\xf0\x9d\x92\xb9"),
+ ENTITY_DEF("SupersetEqual", 8839, "\xe2\x8a\x87"),
+ ENTITY_DEF("squf", 9642, "\xe2\x96\xaa"),
+ ENTITY_DEF("Because", 8757, "\xe2\x88\xb5"),
+ ENTITY_DEF("sccue", 8829, "\xe2\x89\xbd"),
+ ENTITY_DEF("KHcy", 1061, "\xd0\xa5"),
+ ENTITY_DEF("Wcirc", 372, "\xc5\xb4"),
+ ENTITY_DEF("uparrow", 8593, "\xe2\x86\x91"),
+ ENTITY_DEF("lessgtr", 8822, "\xe2\x89\xb6"),
+ ENTITY_DEF("thickapprox", 8776, "\xe2\x89\x88"),
+ ENTITY_DEF("lbrksld", 10639, "\xe2\xa6\x8f"),
+ ENTITY_DEF_HEUR("oslash", 248, "\xc3\xb8"),
+ ENTITY_DEF("NotCupCap", 8813, "\xe2\x89\xad"),
+ ENTITY_DEF("elinters", 9191, "\xe2\x8f\xa7"),
+ ENTITY_DEF("Assign", 8788, "\xe2\x89\x94"),
+ ENTITY_DEF("ClockwiseContourIntegral", 8754, "\xe2\x88\xb2"),
+ ENTITY_DEF("lfisht", 10620, "\xe2\xa5\xbc"),
+ ENTITY_DEF("DownArrow", 8595, "\xe2\x86\x93"),
+ ENTITY_DEF("Zdot", 379, "\xc5\xbb"),
+ ENTITY_DEF("xscr", 120013, "\xf0\x9d\x93\x8d"),
+ ENTITY_DEF("DiacriticalGrave", 96, "\x60"),
+ ENTITY_DEF("DoubleLongLeftRightArrow", 10234, "\xe2\x9f\xba"),
+ ENTITY_DEF("angle", 8736, "\xe2\x88\xa0"),
+ ENTITY_DEF("race", 8765, "\xe2\x88\xbd\xcc\xb1"),
+ ENTITY_DEF("Ascr", 119964, "\xf0\x9d\x92\x9c"),
+ ENTITY_DEF("Xscr", 119987, "\xf0\x9d\x92\xb3"),
+ ENTITY_DEF_HEUR("acirc", 226, "\xc3\xa2"),
+ ENTITY_DEF("otimesas", 10806, "\xe2\xa8\xb6"),
+ ENTITY_DEF("gscr", 8458, "\xe2\x84\x8a"),
+ ENTITY_DEF("gcy", 1075, "\xd0\xb3"),
+ ENTITY_DEF("angmsdag", 10670, "\xe2\xa6\xae"),
+ ENTITY_DEF("tshcy", 1115, "\xd1\x9b"),
+ ENTITY_DEF("Acy", 1040, "\xd0\x90"),
+ ENTITY_DEF("NotGreaterLess", 8825, "\xe2\x89\xb9"),
+ ENTITY_DEF("dtdot", 8945, "\xe2\x8b\xb1"),
+ ENTITY_DEF_HEUR("quot", 34, "\x22"),
+ ENTITY_DEF_HEUR("micro", 181, "\xc2\xb5"),
+ ENTITY_DEF("simplus", 10788, "\xe2\xa8\xa4"),
+ ENTITY_DEF("nsupseteq", 8841, "\xe2\x8a\x89"),
+ ENTITY_DEF("Ufr", 120088, "\xf0\x9d\x94\x98"),
+ ENTITY_DEF("Pr", 10939, "\xe2\xaa\xbb"),
+ ENTITY_DEF("napid", 8779, "\xe2\x89\x8b\xcc\xb8"),
+ ENTITY_DEF("rceil", 8969, "\xe2\x8c\x89"),
+ ENTITY_DEF("boxtimes", 8864, "\xe2\x8a\xa0"),
+ ENTITY_DEF("erarr", 10609, "\xe2\xa5\xb1"),
+ ENTITY_DEF("downdownarrows", 8650, "\xe2\x87\x8a"),
+ ENTITY_DEF("Kfr", 120078, "\xf0\x9d\x94\x8e"),
+ ENTITY_DEF("mho", 8487, "\xe2\x84\xa7"),
+ ENTITY_DEF("scpolint", 10771, "\xe2\xa8\x93"),
+ ENTITY_DEF("vArr", 8661, "\xe2\x87\x95"),
+ ENTITY_DEF("Ccaron", 268, "\xc4\x8c"),
+ ENTITY_DEF("NotRightTriangle", 8939, "\xe2\x8b\xab"),
+ ENTITY_DEF("topbot", 9014, "\xe2\x8c\xb6"),
+ ENTITY_DEF("qopf", 120162, "\xf0\x9d\x95\xa2"),
+ ENTITY_DEF("eogon", 281, "\xc4\x99"),
+ ENTITY_DEF("luruhar", 10598, "\xe2\xa5\xa6"),
+ ENTITY_DEF("gtdot", 8919, "\xe2\x8b\x97"),
+ ENTITY_DEF("Egrave", 200, "\xc3\x88"),
+ ENTITY_DEF("roplus", 10798, "\xe2\xa8\xae"),
+ ENTITY_DEF("Intersection", 8898, "\xe2\x8b\x82"),
+ ENTITY_DEF("Uarr", 8607, "\xe2\x86\x9f"),
+ ENTITY_DEF("dcy", 1076, "\xd0\xb4"),
+ ENTITY_DEF("boxvl", 9508, "\xe2\x94\xa4"),
+ ENTITY_DEF("RightArrowBar", 8677, "\xe2\x87\xa5"),
+ ENTITY_DEF_HEUR("yuml", 255, "\xc3\xbf"),
+ ENTITY_DEF("parallel", 8741, "\xe2\x88\xa5"),
+ ENTITY_DEF("succneqq", 10934, "\xe2\xaa\xb6"),
+ ENTITY_DEF("bemptyv", 10672, "\xe2\xa6\xb0"),
+ ENTITY_DEF("starf", 9733, "\xe2\x98\x85"),
+ ENTITY_DEF("OverBar", 8254, "\xe2\x80\xbe"),
+ ENTITY_DEF("Alpha", 913, "\xce\x91"),
+ ENTITY_DEF("LeftUpVectorBar", 10584, "\xe2\xa5\x98"),
+ ENTITY_DEF("ufr", 120114, "\xf0\x9d\x94\xb2"),
+ ENTITY_DEF("swarhk", 10534, "\xe2\xa4\xa6"),
+ ENTITY_DEF("GreaterEqualLess", 8923, "\xe2\x8b\x9b"),
+ ENTITY_DEF("sscr", 120008, "\xf0\x9d\x93\x88"),
+ ENTITY_DEF("Pi", 928, "\xce\xa0"),
+ ENTITY_DEF("boxh", 9472, "\xe2\x94\x80"),
+ ENTITY_DEF("frac16", 8537, "\xe2\x85\x99"),
+ ENTITY_DEF("lbrack", 91, "\x5b"),
+ ENTITY_DEF("vert", 124, "\x7c"),
+ ENTITY_DEF("precneqq", 10933, "\xe2\xaa\xb5"),
+ ENTITY_DEF("NotGreaterSlantEqual", 10878, "\xe2\xa9\xbe\xcc\xb8"),
+ ENTITY_DEF("Omega", 937, "\xce\xa9"),
+ ENTITY_DEF("uarr", 8593, "\xe2\x86\x91"),
+ ENTITY_DEF("boxVr", 9567, "\xe2\x95\x9f"),
+ ENTITY_DEF("ruluhar", 10600, "\xe2\xa5\xa8"),
+ ENTITY_DEF("ShortLeftArrow", 8592, "\xe2\x86\x90"),
+ ENTITY_DEF("Qfr", 120084, "\xf0\x9d\x94\x94"),
+ ENTITY_DEF("olt", 10688, "\xe2\xa7\x80"),
+ ENTITY_DEF("nequiv", 8802, "\xe2\x89\xa2"),
+ ENTITY_DEF("fscr", 119995, "\xf0\x9d\x92\xbb"),
+ ENTITY_DEF("rarrhk", 8618, "\xe2\x86\xaa"),
+ ENTITY_DEF("nsqsupe", 8931, "\xe2\x8b\xa3"),
+ ENTITY_DEF("nsubseteq", 8840, "\xe2\x8a\x88"),
+ ENTITY_DEF("numero", 8470, "\xe2\x84\x96"),
+ ENTITY_DEF("emsp14", 8197, "\xe2\x80\x85"),
+ ENTITY_DEF("gl", 8823, "\xe2\x89\xb7"),
+ ENTITY_DEF("ocirc", 244, "\xc3\xb4"),
+ ENTITY_DEF("weierp", 8472, "\xe2\x84\x98"),
+ ENTITY_DEF("boxvL", 9569, "\xe2\x95\xa1"),
+ ENTITY_DEF("RightArrowLeftArrow", 8644, "\xe2\x87\x84"),
+ ENTITY_DEF("Precedes", 8826, "\xe2\x89\xba"),
+ ENTITY_DEF("RightVector", 8640, "\xe2\x87\x80"),
+ ENTITY_DEF("xcup", 8899, "\xe2\x8b\x83"),
+ ENTITY_DEF("angmsdad", 10667, "\xe2\xa6\xab"),
+ ENTITY_DEF("gtrsim", 8819, "\xe2\x89\xb3"),
+ ENTITY_DEF("natural", 9838, "\xe2\x99\xae"),
+ ENTITY_DEF("nVdash", 8878, "\xe2\x8a\xae"),
+ ENTITY_DEF("RightTriangleEqual", 8885, "\xe2\x8a\xb5"),
+ ENTITY_DEF("dscy", 1109, "\xd1\x95"),
+ ENTITY_DEF("leftthreetimes", 8907, "\xe2\x8b\x8b"),
+ ENTITY_DEF("prsim", 8830, "\xe2\x89\xbe"),
+ ENTITY_DEF("Bcy", 1041, "\xd0\x91"),
+ ENTITY_DEF("Chi", 935, "\xce\xa7"),
+ ENTITY_DEF("timesb", 8864, "\xe2\x8a\xa0"),
+ ENTITY_DEF("Del", 8711, "\xe2\x88\x87"),
+ ENTITY_DEF("lmidot", 320, "\xc5\x80"),
+ ENTITY_DEF("RightDownVector", 8642, "\xe2\x87\x82"),
+ ENTITY_DEF("simdot", 10858, "\xe2\xa9\xaa"),
+ ENTITY_DEF("FilledVerySmallSquare", 9642, "\xe2\x96\xaa"),
+ ENTITY_DEF("NotLessSlantEqual", 10877, "\xe2\xa9\xbd\xcc\xb8"),
+ ENTITY_DEF("SucceedsTilde", 8831, "\xe2\x89\xbf"),
+ ENTITY_DEF("duarr", 8693, "\xe2\x87\xb5"),
+ ENTITY_DEF("apE", 10864, "\xe2\xa9\xb0"),
+ ENTITY_DEF("odot", 8857, "\xe2\x8a\x99"),
+ ENTITY_DEF("mldr", 8230, "\xe2\x80\xa6"),
+ ENTITY_DEF("Uarrocir", 10569, "\xe2\xa5\x89"),
+ ENTITY_DEF("nLl", 8920, "\xe2\x8b\x98\xcc\xb8"),
+ ENTITY_DEF("rarrpl", 10565, "\xe2\xa5\x85"),
+ ENTITY_DEF("cir", 9675, "\xe2\x97\x8b"),
+ ENTITY_DEF("blk14", 9617, "\xe2\x96\x91"),
+ ENTITY_DEF("VerticalLine", 124, "\x7c"),
+ ENTITY_DEF("jcy", 1081, "\xd0\xb9"),
+ ENTITY_DEF("filig", 64257, "\xef\xac\x81"),
+ ENTITY_DEF("LongRightArrow", 10230, "\xe2\x9f\xb6"),
+ ENTITY_DEF("beta", 946, "\xce\xb2"),
+ ENTITY_DEF("ccupssm", 10832, "\xe2\xa9\x90"),
+ ENTITY_DEF("supsub", 10964, "\xe2\xab\x94"),
+ ENTITY_DEF("spar", 8741, "\xe2\x88\xa5"),
+ ENTITY_DEF("Tstrok", 358, "\xc5\xa6"),
+ ENTITY_DEF("isinv", 8712, "\xe2\x88\x88"),
+ ENTITY_DEF("rightsquigarrow", 8605, "\xe2\x86\x9d"),
+ ENTITY_DEF("Diamond", 8900, "\xe2\x8b\x84"),
+ ENTITY_DEF("curlyeqsucc", 8927, "\xe2\x8b\x9f"),
+ ENTITY_DEF("ijlig", 307, "\xc4\xb3"),
+ ENTITY_DEF("puncsp", 8200, "\xe2\x80\x88"),
+ ENTITY_DEF("hamilt", 8459, "\xe2\x84\x8b"),
+ ENTITY_DEF("mapstoleft", 8612, "\xe2\x86\xa4"),
+ ENTITY_DEF("Copf", 8450, "\xe2\x84\x82"),
+ ENTITY_DEF("prnsim", 8936, "\xe2\x8b\xa8"),
+ ENTITY_DEF("DotDot", 8412, "\xe2\x83\x9c"),
+ ENTITY_DEF("lobrk", 10214, "\xe2\x9f\xa6"),
+ ENTITY_DEF("twoheadrightarrow", 8608, "\xe2\x86\xa0"),
+ ENTITY_DEF("ngE", 8807, "\xe2\x89\xa7\xcc\xb8"),
+ ENTITY_DEF("cylcty", 9005, "\xe2\x8c\xad"),
+ ENTITY_DEF("sube", 8838, "\xe2\x8a\x86"),
+ ENTITY_DEF("NotEqualTilde", 8770, "\xe2\x89\x82\xcc\xb8"),
+ ENTITY_DEF_HEUR("Yuml", 376, "\xc5\xb8"),
+ ENTITY_DEF("comp", 8705, "\xe2\x88\x81"),
+ ENTITY_DEF("dotminus", 8760, "\xe2\x88\xb8"),
+ ENTITY_DEF("crarr", 8629, "\xe2\x86\xb5"),
+ ENTITY_DEF("imped", 437, "\xc6\xb5"),
+ ENTITY_DEF("barwedge", 8965, "\xe2\x8c\x85"),
+ ENTITY_DEF("harrcir", 10568, "\xe2\xa5\x88")};
+
+/*
+ * Lookup tables over html_entities_array: by entity name, by entity name for
+ * the entries that allow heuristic (prefix) matching, and by numeric code.
+ */
+class html_entities_storage {
+	ankerl::unordered_dense::map<std::string_view, html_entity_def> entity_by_name;
+	ankerl::unordered_dense::map<std::string_view, html_entity_def> entity_by_name_heur;
+	ankerl::unordered_dense::map<unsigned, html_entity_def> entity_by_id;
+
+public:
+	html_entities_storage()
+	{
+		auto nelts = G_N_ELEMENTS(html_entities_array);
+		entity_by_name.reserve(nelts);
+		entity_by_name_heur.reserve(nelts);
+		entity_by_id.reserve(nelts);
+
+		for (const auto &e: html_entities_array) {
+			entity_by_name[e.name] = e;
+
+			/*
+			 * Several entities share the same code: plain aliases, but also
+			 * multi code point entities that are registered under their first
+			 * code point (e.g. `SucceedsEqual` and `NotSucceedsEqual`).
+			 * A numeric reference must map to the bare code point, so keep
+			 * the entry with the shortest replacement instead of the last one.
+			 */
+			auto existing = entity_by_id.find(e.code);
+
+			if (existing == entity_by_id.end()) {
+				entity_by_id[e.code] = e;
+			}
+			else if (strlen(e.replacement) < strlen(existing->second.replacement)) {
+				existing->second = e;
+			}
+
+			if (e.allow_heuristic) {
+				entity_by_name_heur[e.name] = e;
+			}
+		}
+	}
+
+	/**
+	 * Finds an entity by its exact name (without `&` and `;`)
+	 * @param name entity name
+	 * @param use_heuristic if true, search only the entities that are allowed
+	 * to be matched heuristically
+	 * @return pointer to the definition owned by this storage or nullptr
+	 */
+	auto by_name(std::string_view name, bool use_heuristic = false) const -> const html_entity_def *
+	{
+		const decltype(entity_by_name) *htb;
+
+		if (use_heuristic) {
+			htb = &entity_by_name_heur;
+		}
+		else {
+			htb = &entity_by_name;
+		}
+		auto it = htb->find(name);
+
+		if (it != htb->end()) {
+			return &(it->second);
+		}
+
+		return nullptr;
+	}
+
+	/**
+	 * Finds an entity by its numeric code
+	 * @param id numeric code, negative values never match
+	 * @return pointer to the definition owned by this storage or nullptr
+	 */
+	auto by_id(int id) const -> const html_entity_def *
+	{
+		if (id < 0) {
+			/* Would otherwise wrap around when converted to the unsigned key */
+			return nullptr;
+		}
+
+		auto it = entity_by_id.find(id);
+		if (it != entity_by_id.end()) {
+			return &(it->second);
+		}
+
+		return nullptr;
+	}
+};
+
+/* Entity lookup tables, filled by the html_entities_storage constructor */
+static const html_entities_storage html_entities_defs;
+
+/**
+ * Decodes HTML entities (named, decimal `&#N`, hex `&#xN` and octal `&#oN`)
+ * directly inside the buffer `s`.
+ * @param s buffer to decode, it is rewritten in place and is not zero terminated
+ * by this function
+ * @param len number of bytes in `s`
+ * @param norm_spaces if true, leading whitespace is skipped and every other run
+ * of whitespace is replaced by a single space
+ * @return number of bytes of decoded content now stored at the start of `s`
+ */
+std::size_t
+decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
+{
+	/*
+	 * t - tortoise (destination ptr)
+	 * h - hare (source ptr)
+	 * e - begin of entity
+	 */
+	char *t = s, *h = s, *e = s;
+	const gchar *end;
+	bool seen_hash = false, seen_hex = false;
+	enum {
+		do_undefined,
+		do_digits_only,
+		do_mixed,
+	} seen_digit_only = do_undefined;
+	enum class parser_state {
+		normal_content,
+		ampersand,
+		skip_multi_spaces,
+		skip_start_spaces,
+	} state = parser_state::normal_content;
+
+	if (s == nullptr || len == 0) {
+		/* Nothing to decode, and the first byte must not be inspected */
+		return 0;
+	}
+
+	end = s + len;
+
+	/*
+	 * Decodes the named entity that starts at `entity` and ends before `h`.
+	 * Returns true if a replacement has been written; otherwise the raw text
+	 * is copied (when there is room for it) and false is returned.
+	 */
+	auto replace_named_entity = [&](const char *entity) -> bool {
+		/*
+		 * Writes the replacement at `t` only if it fits into [t, limit), where
+		 * `limit` is the end of the input consumed so far. Some replacements
+		 * are longer than the entity text, so without this check the copy
+		 * could overwrite unread input or run past the end of the buffer.
+		 */
+		auto emit_replacement = [&](const html_entity_def *def, const char *limit) -> bool {
+			auto l = strlen(def->replacement);
+
+			if (limit < t || static_cast<std::size_t>(limit - t) < l) {
+				return false;
+			}
+
+			memcpy(t, def->replacement, l);
+			t += l;
+
+			return true;
+		};
+
+		const auto *entity_def = html_entities_defs.by_name({entity,
+															 (std::size_t) (h - entity)},
+															false);
+
+		if (entity_def) {
+			/* A terminating `;` is consumed together with the entity itself */
+			const char *limit = (h < end && *h == ';') ? h + 1 : h;
+
+			if (emit_replacement(entity_def, limit)) {
+				return true;
+			}
+		}
+		else {
+			/* Try heuristic: match a known entity as a prefix, longest first */
+			for (std::size_t lookup_len = 5; lookup_len >= 2; lookup_len--) {
+				if (static_cast<std::size_t>(h - e) <= lookup_len) {
+					continue;
+				}
+
+				const auto *heur_def = html_entities_defs.by_name({entity, lookup_len}, true);
+
+				if (heur_def && emit_replacement(heur_def, entity + lookup_len)) {
+					/* Adjust h back: it now points to the last matched character */
+					h = e + lookup_len;
+
+					return true;
+				}
+			}
+		}
+
+		/* Leave undecoded */
+		if (end - t > h - e) {
+			memmove(t, e, h - e);
+			t += h - e;
+		}
+
+		return false;
+	};
+
+	/*
+	 * Appends one digit to the accumulator. The result is saturated just above
+	 * the last Unicode code point, so arbitrary long digit strings can never
+	 * overflow an int and are still reported as an invalid code point.
+	 */
+	auto append_digit = [](int acc, int base, int digit) -> int {
+		constexpr int max_code_point = 0x10FFFF;
+
+		acc = acc * base + digit;
+
+		return acc > max_code_point ? max_code_point + 1 : acc;
+	};
+
+	/* Strtoul works merely for 0 terminated strings, so leave it alone... */
+	auto dec_to_int = [&append_digit](const char *str, std::size_t slen) -> std::optional<int> {
+		int n = 0;
+
+		while (slen > 0 && g_ascii_isdigit(*str)) {
+			n = append_digit(n, 10, *str++ - '0');
+			slen--;
+		}
+
+		if (slen == 0) {
+			return n;
+		}
+
+		/* Some non-digit character is in the way */
+		return std::nullopt;
+	};
+	auto hex_to_int = [&append_digit](const char *str, std::size_t slen) -> std::optional<int> {
+		int n = 0;
+
+		while (slen > 0 && g_ascii_isxdigit(*str)) {
+			if (*str <= 0x39) {
+				n = append_digit(n, 16, *str++ - '0');
+			}
+			else {
+				/* `| ' '` folds an upper case letter to the lower case one */
+				n = append_digit(n, 16, ((*str++) | ' ') - 'a' + 10);
+			}
+			slen--;
+		}
+
+		if (slen == 0) {
+			return n;
+		}
+
+		return std::nullopt;
+	};
+	auto oct_to_int = [&append_digit](const char *str, std::size_t slen) -> std::optional<int> {
+		int n = 0;
+
+		while (slen > 0 && g_ascii_isdigit(*str)) {
+			if (*str > '7') {
+				break;
+			}
+
+			n = append_digit(n, 8, *str++ - '0');
+			slen--;
+		}
+
+		if (slen == 0) {
+			return n;
+		}
+
+		return std::nullopt;
+	};
+
+	/*
+	 * Decodes the numeric entity whose digits (or `x`/`o` prefix) start at
+	 * `entity` and end before `h`. Returns false if the number cannot be parsed;
+	 * in that case the raw text is copied when there is room for it.
+	 */
+	auto replace_numeric_entity = [&](const char *entity) -> bool {
+		UChar32 uc;
+		std::optional<int> maybe_num;
+
+		if (*entity == 'x' || *entity == 'X') {
+			maybe_num = hex_to_int(entity + 1, h - (entity + 1));
+		}
+		else if (*entity == 'o' || *entity == 'O') {
+			maybe_num = oct_to_int(entity + 1, h - (entity + 1));
+		}
+		else {
+			maybe_num = dec_to_int(entity, h - entity);
+		}
+
+		if (!maybe_num) {
+			/* Skip undecoded */
+			if (end - t >= h - e) {
+				memmove(t, e, h - e);
+				t += h - e;
+			}
+
+			return false;
+		}
+
+		uc = maybe_num.value();
+		/* Search for a replacement */
+		const auto *entity_def = html_entities_defs.by_id(uc);
+
+		if (entity_def) {
+			auto rep_len = strlen(entity_def->replacement);
+
+			if (static_cast<std::size_t>(end - t) >= rep_len) {
+				memcpy(t, entity_def->replacement,
+					   rep_len);
+				t += rep_len;
+			}
+
+			return true;
+		}
+
+		/* Unicode point */
+		goffset off = t - s;
+		UBool is_error = 0;
+
+		if (uc > 0) {
+			U8_APPEND((std::uint8_t *) s, off, len, uc, is_error);
+
+			if (!is_error) {
+				t = s + off;
+			}
+			else if (end - t > 3) {
+				/* Not printable code point replace with 0xFFFD */
+				*t++ = '\357';
+				*t++ = '\277';
+				*t++ = '\275';
+			}
+		}
+		else if (end - t > 3) {
+			/* Not printable code point replace with 0xFFFD */
+			*t++ = '\357';
+			*t++ = '\277';
+			*t++ = '\275';
+		}
+
+		return true;
+	};
+
+	/* Decodes the entity candidate in [e, h): `e` points to its `&` */
+	auto replace_entity = [&]() -> bool {
+		if (e + 1 < end) {
+			const auto *entity_start = e + 1;
+
+			if (*entity_start != '#') {
+				return replace_named_entity(entity_start);
+			}
+			else if (entity_start + 1 < h) {
+				return replace_numeric_entity(entity_start + 1);
+			}
+		}
+
+		return false;
+	};
+
+	if (norm_spaces && g_ascii_isspace(*h)) {
+		state = parser_state::skip_start_spaces;
+	}
+
+	while (h - s < len && t <= h) {
+		switch (state) {
+		case parser_state::normal_content:
+			if (*h == '&') {
+				state = parser_state::ampersand;
+				seen_hash = false;
+				seen_hex = false;
+				seen_digit_only = do_undefined;
+				e = h;
+				h++;
+				continue;
+			}
+			else {
+				if (norm_spaces && g_ascii_isspace(*h)) {
+					*t++ = ' ';
+					state = parser_state::skip_multi_spaces;
+					h++;
+				}
+				else {
+					*t++ = *h++;
+				}
+			}
+			break;
+		case parser_state::ampersand:
+			if ((*h == ';' || g_ascii_isspace(*h)) && h > e) {
+				replace_entity();
+				state = parser_state::normal_content;
+
+				if (g_ascii_isspace(*h)) {
+					/* Avoid increase of h */
+					continue;
+				}
+			}
+			else if (*h == '&') {
+				/* Previous `&` was bogus */
+				state = parser_state::ampersand;
+
+				if (end - t > h - e) {
+					memmove(t, e, h - e);
+					t += h - e;
+				}
+
+				e = h;
+				/* A new entity starts here, so forget what the bogus one has seen */
+				seen_hash = false;
+				seen_hex = false;
+				seen_digit_only = do_undefined;
+			}
+			else if (*h == '#') {
+				seen_hash = true;
+
+				if (h + 1 < end && h[1] == 'x') {
+					seen_hex = true;
+					/* Skip one more character */
+					h++;
+				}
+			}
+			else if (seen_digit_only != do_mixed &&
+					 (g_ascii_isdigit(*h) || (seen_hex && g_ascii_isxdigit(*h)))) {
+				seen_digit_only = do_digits_only;
+			}
+			else {
+				if (seen_digit_only == do_digits_only && seen_hash && h > e) {
+					/*
+					 * We have seen some digits, so we can try to decode even
+					 * without the terminating `;`
+					 */
+					replace_entity();
+					state = parser_state::normal_content;
+					continue;
+				}
+
+				seen_digit_only = do_mixed;
+			}
+
+			h++;
+
+			break;
+		case parser_state::skip_multi_spaces:
+			if (g_ascii_isspace(*h)) {
+				h++;
+			}
+			else {
+				state = parser_state::normal_content;
+			}
+			break;
+		case parser_state::skip_start_spaces:
+			if (g_ascii_isspace(*h)) {
+				h++;
+			}
+			else {
+				state = parser_state::normal_content;
+			}
+			break;
+		}
+	}
+
+	/* Leftover */
+	if (state == parser_state::ampersand && h > e) {
+		/* Unfinished entity, copy as is */
+		if (replace_entity()) {
+			/* To follow FSM semantics */
+			h++;
+		}
+		else {
+			h = e; /* Include the last & */
+		}
+
+		/* Leftover after replacement */
+		if (h < end && t + (end - h) <= end) {
+			memmove(t, h, end - h);
+			t += end - h;
+		}
+	}
+
+	if (norm_spaces) {
+		bool seen_spaces = false;
+
+		while (t > s && g_ascii_isspace(*(t - 1))) {
+			seen_spaces = true;
+			t--;
+		}
+
+		if (seen_spaces) {
+			/* Trailing whitespace is squeezed to exactly one space */
+			*t++ = ' ';
+		}
+	}
+
+	return (t - s);
+}
+
+/* Convenience overload: decodes `st` inside its own storage and shrinks it */
+auto decode_html_entitles_inplace(std::string &st) -> void
+{
+	/* The decoder returns the decoded length, everything past it is stale */
+	st.resize(decode_html_entitles_inplace(st.data(), st.size()));
+}
+
+TEST_SUITE("html entities")
+{
+
+	TEST_CASE("html entities decode")
+	{
+		/* {input, expected output}; decoding is done with norm_spaces enabled */
+		std::vector<std::pair<std::string, std::string>> cases{
+			{"", ""},
+			{"abc", "abc"},
+			{"abc def", "abc def"},
+			/* A run of spaces must collapse to a single one */
+			{"abc   def", "abc def"},
+			{"abc\ndef", "abc def"},
+			{"abc\n \tdef", "abc def"},
+			{" abc def ", "abc def "},
+			{"FOO&gt;BAR", "FOO>BAR"},
+			{"FOO&gtBAR", "FOO>BAR"},
+			{"FOO&gt BAR", "FOO> BAR"},
+			{"FOO&gt;;;BAR", "FOO>;;BAR"},
+			{"I'm &notit;", "I'm ¬it;"},
+			{"I'm &notin;", "I'm ∉"},
+			{"FOO& BAR", "FOO& BAR"},
+			{"FOO&&&&gt;BAR", "FOO&&&>BAR"},
+			{"FOO&#41;BAR", "FOO)BAR"},
+			{"FOO&#x41;BAR", "FOOABAR"},
+			{"FOO&#X41;BAR", "FOOABAR"},
+			{"FOO&#BAR", "FOO&#BAR"},
+			{"FOO&#ZOO", "FOO&#ZOO"},
+			{"FOO&#xBAR", "FOOºR"},
+			{"FOO&#x41BAR", "FOO䆺R"},
+			{"FOO&#x0000;ZOO", "FOO\uFFFDZOO"},
+			{"FOO&#x0081;ZOO", "FOO\u0081ZOO"},
+			{"FOO&#xD800;ZOO", "FOO\uFFFDZOO"},
+			{"FOO&#xFFFFFF;ZOO", "FOO\uFFFDZOO"},
+			{"ZZ&pound_id=23", "ZZ£_id=23"},
+			{"ZZ&prod_id=23", "ZZ&prod_id=23"},
+			{"ZZ&gt", "ZZ>"},
+			{"ZZ&", "ZZ&"},
+			{"ZZ&AElig=", "ZZÆ="},
+		};
+
+		for (const auto &c: cases) {
+			SUBCASE(("decode entities: " + c.first).c_str())
+			{
+				/*
+				 * The decoder rewrites its input, so work on a private copy.
+				 * The copy is an exact-size heap buffer without a terminator,
+				 * so that memory checkers can flag out-of-bounds accesses.
+				 */
+				auto *cpy = new char[c.first.size()];
+				memcpy(cpy, c.first.data(), c.first.size());
+				auto nlen = decode_html_entitles_inplace(cpy, c.first.size(), true);
+				CHECK(std::string{cpy, nlen} == c.second);
+				delete[] cpy;
+			}
+		}
+	}
+}
+
+}// namespace rspamd::html \ No newline at end of file
diff --git a/src/libserver/html/html_entities.hxx b/src/libserver/html/html_entities.hxx
new file mode 100644
index 0000000..fc1f7cc
--- /dev/null
+++ b/src/libserver/html/html_entities.hxx
@@ -0,0 +1,31 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_HTML_ENTITIES_H
+#define RSPAMD_HTML_ENTITIES_H
+#pragma once
+
+#include <utility>
+#include <string>
+
+namespace rspamd::html {
+
+/**
+ * Decodes HTML entities inside the buffer `s`, rewriting it in place
+ * @param s buffer to decode
+ * @param len number of bytes in `s`
+ * @param norm_spaces if true, leading whitespace is skipped and runs of
+ * whitespace are replaced by a single space
+ * @return length of the decoded content stored at the start of `s`
+ */
+auto decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces = false) -> std::size_t;
+/**
+ * Decodes HTML entities inside `st` and resizes it to the decoded length
+ * @param st string to decode in place
+ */
+auto decode_html_entitles_inplace(std::string &st) -> void;
+
+}// namespace rspamd::html
+
+#endif
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
new file mode 100644
index 0000000..309d761
--- /dev/null
+++ b/src/libserver/html/html_tag.hxx
@@ -0,0 +1,159 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_HTML_TAG_HXX
+#define RSPAMD_HTML_TAG_HXX
+#pragma once
+
+#include <utility>
+#include <string_view>
+#include <variant>
+#include <vector>
+#include <optional>
+#include <cstdint>
+
+#include "html_tags.h"
+
+struct rspamd_url;
+struct html_image;
+
+namespace rspamd::html {
+
+struct html_content; /* Forward declaration */
+
+/*
+ * Kinds of components that can be attached to a tag: used as the `type` of
+ * an html_tag_component and as the key for html_tag::find_component
+ */
+enum class html_component_type : std::uint8_t {
+ RSPAMD_HTML_COMPONENT_NAME = 0,
+ RSPAMD_HTML_COMPONENT_HREF,
+ RSPAMD_HTML_COMPONENT_COLOR,
+ RSPAMD_HTML_COMPONENT_BGCOLOR,
+ RSPAMD_HTML_COMPONENT_STYLE,
+ RSPAMD_HTML_COMPONENT_CLASS,
+ RSPAMD_HTML_COMPONENT_WIDTH,
+ RSPAMD_HTML_COMPONENT_HEIGHT,
+ RSPAMD_HTML_COMPONENT_SIZE,
+ RSPAMD_HTML_COMPONENT_REL,
+ RSPAMD_HTML_COMPONENT_ALT,
+ RSPAMD_HTML_COMPONENT_ID,
+ RSPAMD_HTML_COMPONENT_HIDDEN,
+};
+
+/* Public tags flags */
+/*
+ * The flags occupy the bits starting from CM_USER_SHIFT and are stored in
+ * unsigned fields, so every constant is unsigned: shifting a signed 1 into
+ * the top bit would overflow.
+ */
+/* XML tag */
+#define FL_XML (1u << CM_USER_SHIFT)
+/* Fully closed tag (e.g. <a attrs />) */
+#define FL_CLOSED (1u << (CM_USER_SHIFT + 1))
+#define FL_BROKEN (1u << (CM_USER_SHIFT + 2))
+#define FL_IGNORE (1u << (CM_USER_SHIFT + 3))
+#define FL_BLOCK (1u << (CM_USER_SHIFT + 4))
+#define FL_HREF (1u << (CM_USER_SHIFT + 5))
+#define FL_COMMENT (1u << (CM_USER_SHIFT + 6))
+#define FL_VIRTUAL (1u << (CM_USER_SHIFT + 7))
+
+/**
+ * Returns component type from a string
+ * @param st component name to look up
+ * @return the matching component type; presumably std::nullopt when the name
+ * is not recognised (the definition is not in this header - TODO confirm)
+ */
+auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>;
+
+/* Optional payload of a tag: nothing, an url or an image */
+using html_tag_extra_t = std::variant<std::monostate, struct rspamd_url *, struct html_image *>;
+
+/* A single component of a tag: its kind and a non-owning view of its value */
+struct html_tag_component {
+	html_component_type type;
+	std::string_view value;
+
+	html_tag_component(html_component_type type, std::string_view value)
+		: type{type},
+		  value{value}
+	{
+	}
+};
+
+/* Position of the closing counterpart of a tag; -1 in both fields means "not set" */
+struct html_closing_tag {
+	int start = -1;
+	int end = -1;
+
+	/* Returns both positions to the "not set" state */
+	auto clear() -> void
+	{
+		*this = html_closing_tag{};
+	}
+};
+
+/*
+ * A single tag: its position, flags, id, components, optional payload and
+ * links to the parent and children tags.
+ */
+struct html_tag {
+	unsigned int tag_start = 0;
+	unsigned int content_offset = 0;
+	std::uint32_t flags = 0;
+	std::int32_t id = Tag_UNKNOWN;
+	html_closing_tag closing;
+
+	std::vector<html_tag_component> components;
+
+	html_tag_extra_t extra;
+	mutable struct html_block *block = nullptr;
+	std::vector<struct html_tag *> children;
+	/* Initialised like every other member, so a fresh tag never holds a garbage pointer */
+	struct html_tag *parent = nullptr;
+
+	/**
+	 * Finds the first component of the given type
+	 * @param what component type to search for
+	 * @return value of the component or std::nullopt if there is no such component
+	 */
+	auto find_component(html_component_type what) const -> std::optional<std::string_view>
+	{
+		for (const auto &comp: components) {
+			if (comp.type == what) {
+				return comp.value;
+			}
+		}
+
+		return std::nullopt;
+	}
+
+	/**
+	 * Same as above for an optional type: an empty `what` yields std::nullopt
+	 */
+	auto find_component(std::optional<html_component_type> what) const -> std::optional<std::string_view>
+	{
+		if (what) {
+			return find_component(what.value());
+		}
+
+		return std::nullopt;
+	}
+
+	/**
+	 * Resets the tag to its initial state; `parent` is deliberately left as is
+	 */
+	auto clear(void) -> void
+	{
+		id = Tag_UNKNOWN;
+		tag_start = content_offset = 0;
+		extra = std::monostate{};
+		components.clear();
+		flags = 0;
+		block = nullptr;
+		children.clear();
+		closing.clear();
+	}
+
+	/**
+	 * Length of the tag content: the distance from `content_offset` to the
+	 * start of the closing tag, or 0 for the tags flagged FL_IGNORE or CM_HEAD
+	 *
+	 * NOTE(review): `closing.start` is int while `content_offset` is unsigned,
+	 * so the comparison below is done in unsigned arithmetic and an unset
+	 * `closing.start` (-1) passes it, giving a huge length. Confirm whether
+	 * callers rely on that before changing it.
+	 */
+	constexpr auto get_content_length() const -> std::size_t
+	{
+		if (flags & (FL_IGNORE | CM_HEAD)) {
+			return 0;
+		}
+		if (closing.start > content_offset) {
+			return closing.start - content_offset;
+		}
+
+		return 0;
+	}
+
+	/* Defined out of line */
+	auto get_content(const struct html_content *hc) const -> std::string_view;
+};
+
+/* The highest FL_* flag uses bit CM_USER_SHIFT + 7, it must fit into html_tag::flags */
+static_assert(CM_USER_SHIFT + 7 < sizeof(html_tag::flags) * NBBY);
+
+}// namespace rspamd::html
+
+#endif//RSPAMD_HTML_TAG_HXX
diff --git a/src/libserver/html/html_tag_defs.hxx b/src/libserver/html/html_tag_defs.hxx
new file mode 100644
index 0000000..647f7c3
--- /dev/null
+++ b/src/libserver/html/html_tag_defs.hxx
@@ -0,0 +1,194 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RSPAMD_HTML_TAG_DEFS_HXX
+#define RSPAMD_HTML_TAG_DEFS_HXX
+
+#include "config.h"
+#include "html_tags.h"
+#include "libutil/cxx/util.hxx"
+
+#include <string>
+#include "contrib/ankerl/unordered_dense.h"
+
+namespace rspamd::html {
+
+struct html_tag_def { /* One row of the static tag table built with TAG_DEF */
+ std::string name; /* tag name, e.g. "a" */
+ tag_id_t id; /* Tag_* enumerator of the tag */
+ guint flags; /* OR-ed CM_ and FL_ bits */
+};
+
+#define TAG_DEF(id, name, flags) \
+ html_tag_def \
+ { \
+ (name), (id), (flags) \
+ } /* arguments come as (id, name, flags) while html_tag_def stores the name first */
+
+static const auto html_tag_defs_array = rspamd::array_of(
+ /* W3C defined elements, one TAG_DEF(id, name, flags) row per tag. NOTE(review): Tag_KEYGEN from html_tags.h has no row here - confirm that is intended */
+ TAG_DEF(Tag_A, "a", FL_HREF),
+ TAG_DEF(Tag_ABBR, "abbr", (CM_INLINE)),
+ TAG_DEF(Tag_ACRONYM, "acronym", (CM_INLINE)),
+ TAG_DEF(Tag_ADDRESS, "address", (CM_BLOCK)),
+ TAG_DEF(Tag_APPLET, "applet", (CM_IMG | CM_INLINE | CM_PARAM)),
+ TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY | FL_HREF)),
+ TAG_DEF(Tag_B, "b", (CM_INLINE | FL_BLOCK)),
+ TAG_DEF(Tag_BASE, "base", (CM_HEAD | CM_EMPTY)),
+ TAG_DEF(Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)),
+ TAG_DEF(Tag_BDO, "bdo", (CM_INLINE)),
+ TAG_DEF(Tag_BIG, "big", (CM_INLINE)),
+ TAG_DEF(Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)),
+ TAG_DEF(Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE | FL_BLOCK)),
+ TAG_DEF(Tag_BR, "br", (CM_INLINE | CM_EMPTY)),
+ TAG_DEF(Tag_BUTTON, "button", (CM_INLINE | FL_BLOCK)),
+ TAG_DEF(Tag_CAPTION, "caption", (CM_TABLE)),
+ TAG_DEF(Tag_CENTER, "center", (CM_BLOCK)),
+ TAG_DEF(Tag_CITE, "cite", (CM_INLINE)),
+ TAG_DEF(Tag_CODE, "code", (CM_INLINE)),
+ TAG_DEF(Tag_COL, "col", (CM_TABLE | CM_EMPTY)),
+ TAG_DEF(Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)),
+ TAG_DEF(Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
+ TAG_DEF(Tag_DEL, "del", (CM_INLINE | CM_BLOCK)),
+ TAG_DEF(Tag_DFN, "dfn", (CM_INLINE)),
+ TAG_DEF(Tag_DIR, "dir", (CM_BLOCK)),
+ TAG_DEF(Tag_DIV, "div", (CM_BLOCK | FL_BLOCK)),
+ TAG_DEF(Tag_DL, "dl", (CM_BLOCK | FL_BLOCK)),
+ TAG_DEF(Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
+ TAG_DEF(Tag_EM, "em", (CM_INLINE)),
+ TAG_DEF(Tag_FIELDSET, "fieldset", (CM_BLOCK)),
+ TAG_DEF(Tag_FONT, "font", (FL_BLOCK)),
+ TAG_DEF(Tag_FORM, "form", (CM_BLOCK | FL_HREF)),
+ TAG_DEF(Tag_FRAME, "frame", (CM_EMPTY | FL_HREF)),
+ TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML)),
+ TAG_DEF(Tag_H1, "h1", (CM_BLOCK)),
+ TAG_DEF(Tag_H2, "h2", (CM_BLOCK)),
+ TAG_DEF(Tag_H3, "h3", (CM_BLOCK)),
+ TAG_DEF(Tag_H4, "h4", (CM_BLOCK)),
+ TAG_DEF(Tag_H5, "h5", (CM_BLOCK)),
+ TAG_DEF(Tag_H6, "h6", (CM_BLOCK)),
+ TAG_DEF(Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
+ TAG_DEF(Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)),
+ TAG_DEF(Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
+ TAG_DEF(Tag_I, "i", (CM_INLINE)),
+ TAG_DEF(Tag_IFRAME, "iframe", (FL_HREF)),
+ TAG_DEF(Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)),
+ TAG_DEF(Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)),
+ TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK)),
+ TAG_DEF(Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)),
+ TAG_DEF(Tag_KBD, "kbd", (CM_INLINE)),
+ TAG_DEF(Tag_LABEL, "label", (CM_INLINE)),
+ TAG_DEF(Tag_LEGEND, "legend", (CM_INLINE)),
+ TAG_DEF(Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
+ TAG_DEF(Tag_LINK, "link", (CM_EMPTY | FL_HREF)),
+ TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK)),
+ TAG_DEF(Tag_MAP, "map", (CM_INLINE | FL_HREF)),
+ TAG_DEF(Tag_MENU, "menu", (CM_BLOCK)),
+ TAG_DEF(Tag_META, "meta", (CM_HEAD | CM_INLINE | CM_EMPTY)),
+ TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK)),
+ TAG_DEF(Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_RAW)),
+ TAG_DEF(Tag_OBJECT, "object", (CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)),
+ TAG_DEF(Tag_OL, "ol", (CM_BLOCK | FL_BLOCK)),
+ TAG_DEF(Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)),
+ TAG_DEF(Tag_OPTION, "option", (CM_FIELD | CM_OPT)),
+ TAG_DEF(Tag_P, "p", (CM_BLOCK | CM_OPT | FL_BLOCK)),
+ TAG_DEF(Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)),
+ TAG_DEF(Tag_PLAINTEXT, "plaintext", (CM_BLOCK)),
+ TAG_DEF(Tag_PRE, "pre", (CM_BLOCK)),
+ TAG_DEF(Tag_Q, "q", (CM_INLINE)),
+ TAG_DEF(Tag_RB, "rb", (CM_INLINE)),
+ TAG_DEF(Tag_RBC, "rbc", (CM_INLINE)),
+ TAG_DEF(Tag_RP, "rp", (CM_INLINE)),
+ TAG_DEF(Tag_RT, "rt", (CM_INLINE)),
+ TAG_DEF(Tag_RTC, "rtc", (CM_INLINE)),
+ TAG_DEF(Tag_RUBY, "ruby", (CM_INLINE)),
+ TAG_DEF(Tag_S, "s", (CM_INLINE)),
+ TAG_DEF(Tag_SAMP, "samp", (CM_INLINE)),
+ TAG_DEF(Tag_SCRIPT, "script", (CM_HEAD | CM_RAW)),
+ TAG_DEF(Tag_SELECT, "select", (CM_INLINE | CM_FIELD)),
+ TAG_DEF(Tag_SMALL, "small", (CM_INLINE)),
+ TAG_DEF(Tag_SPAN, "span", (CM_NO_INDENT | FL_BLOCK)),
+ TAG_DEF(Tag_STRIKE, "strike", (CM_INLINE)),
+ TAG_DEF(Tag_STRONG, "strong", (CM_INLINE)),
+ TAG_DEF(Tag_STYLE, "style", (CM_HEAD | CM_RAW)),
+ TAG_DEF(Tag_SUB, "sub", (CM_INLINE)),
+ TAG_DEF(Tag_SUP, "sup", (CM_INLINE)),
+ TAG_DEF(Tag_TABLE, "table", (CM_BLOCK | FL_BLOCK)),
+ TAG_DEF(Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT | FL_BLOCK)),
+ TAG_DEF(Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
+ TAG_DEF(Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)),
+ TAG_DEF(Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)),
+ TAG_DEF(Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
+ TAG_DEF(Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)),
+ TAG_DEF(Tag_TITLE, "title", (CM_HEAD | CM_UNIQUE)),
+ TAG_DEF(Tag_TR, "tr", (CM_TABLE | CM_OPT | FL_BLOCK)),
+ TAG_DEF(Tag_TT, "tt", (CM_INLINE)),
+ TAG_DEF(Tag_U, "u", (CM_INLINE)),
+ TAG_DEF(Tag_UL, "ul", (CM_BLOCK | FL_BLOCK)),
+ TAG_DEF(Tag_VAR, "var", (CM_INLINE)),
+ TAG_DEF(Tag_XMP, "xmp", (CM_BLOCK)),
+ TAG_DEF(Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY)));
+
+/* Lookup tables over html_tag_defs_array, keyed by tag name and by tag id */
+class html_tags_storage {
+    ankerl::unordered_dense::map<std::string_view, html_tag_def> tag_by_name;
+    ankerl::unordered_dense::map<tag_id_t, html_tag_def> tag_by_id;
+
+public:
+    /* Copies every row of the static table into both maps */
+    html_tags_storage()
+    {
+        const auto n_defs = html_tag_defs_array.size();
+        tag_by_name.reserve(n_defs);
+        tag_by_id.reserve(n_defs);
+
+        /* Name keys are views of the array rows' strings (not of the copies),
+         * so they stay valid only while html_tag_defs_array is alive */
+        for (const auto &def: html_tag_defs_array) {
+            tag_by_id[def.id] = def;
+            tag_by_name[def.name] = def;
+        }
+    }
+
+    /* Definition registered under `name`, or nullptr if there is none */
+    auto by_name(std::string_view name) const -> const html_tag_def *
+    {
+        const auto found = tag_by_name.find(name);
+
+        return found != tag_by_name.end() ? &(found->second) : nullptr;
+    }
+
+    /* Definition registered under `id` (cast to tag_id_t), or nullptr */
+    auto by_id(int id) const -> const html_tag_def *
+    {
+        const auto found = tag_by_id.find(static_cast<tag_id_t>(id));
+
+        return found != tag_by_id.end() ? &(found->second) : nullptr;
+    }
+
+    /* Name registered under `id`, or the literal "unknown" for ids without a definition */
+    auto name_by_id_safe(int id) const -> std::string_view
+    {
+        if (const auto *def = by_id(id)) {
+            return def->name;
+        }
+
+        return "unknown";
+    }
+};
+
+}// namespace rspamd::html
+
+#endif//RSPAMD_HTML_TAG_DEFS_HXX
diff --git a/src/libserver/html/html_tags.h b/src/libserver/html/html_tags.h
new file mode 100644
index 0000000..c186314
--- /dev/null
+++ b/src/libserver/html/html_tags.h
@@ -0,0 +1,176 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_LIBSERVER_HTML_TAGS_H_
+#define SRC_LIBSERVER_HTML_TAGS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Known HTML tags */
+typedef enum {
+ Tag_UNKNOWN = 0, /**< Unknown tag! */
+ Tag_A, /**< A */
+ Tag_ABBR, /**< ABBR */
+ Tag_ACRONYM, /**< ACRONYM */
+ Tag_ADDRESS, /**< ADDRESS */
+ Tag_APPLET, /**< APPLET */
+ Tag_AREA, /**< AREA */
+ Tag_B, /**< B */
+ Tag_BASE, /**< BASE */
+ Tag_BASEFONT, /**< BASEFONT */
+ Tag_BDO, /**< BDO */
+ Tag_BIG, /**< BIG */
+ Tag_BLOCKQUOTE, /**< BLOCKQUOTE */
+ Tag_BODY, /**< BODY */
+ Tag_BR, /**< BR */
+ Tag_BUTTON, /**< BUTTON */
+ Tag_CAPTION, /**< CAPTION */
+ Tag_CENTER, /**< CENTER */
+ Tag_CITE, /**< CITE */
+ Tag_CODE, /**< CODE */
+ Tag_COL, /**< COL */
+ Tag_COLGROUP, /**< COLGROUP */
+ Tag_DD, /**< DD */
+ Tag_DEL, /**< DEL */
+ Tag_DFN, /**< DFN */
+ Tag_DIR, /**< DIR */
+ Tag_DIV, /**< DIV */
+ Tag_DL, /**< DL */
+ Tag_DT, /**< DT */
+ Tag_EM, /**< EM */
+ Tag_FIELDSET, /**< FIELDSET */
+ Tag_FONT, /**< FONT */
+ Tag_FORM, /**< FORM */
+ Tag_FRAME, /**< FRAME */
+ Tag_FRAMESET, /**< FRAMESET */
+ Tag_H1, /**< H1 */
+ Tag_H2, /**< H2 */
+ Tag_H3, /**< H3 */
+ Tag_H4, /**< H4 */
+ Tag_H5, /**< H5 */
+ Tag_H6, /**< H6 */
+ Tag_HEAD, /**< HEAD */
+ Tag_HR, /**< HR */
+ Tag_HTML, /**< HTML */
+ Tag_I, /**< I */
+ Tag_IFRAME, /**< IFRAME */
+ Tag_IMG, /**< IMG */
+ Tag_INPUT, /**< INPUT */
+ Tag_INS, /**< INS */
+ Tag_ISINDEX, /**< ISINDEX */
+ Tag_KBD, /**< KBD */
+ Tag_KEYGEN, /**< KEYGEN */
+ Tag_LABEL, /**< LABEL */
+ Tag_LEGEND, /**< LEGEND */
+ Tag_LI, /**< LI */
+ Tag_LINK, /**< LINK */
+ Tag_LISTING, /**< LISTING */
+ Tag_MAP, /**< MAP */
+ Tag_MENU, /**< MENU */
+ Tag_META, /**< META */
+ Tag_NOFRAMES, /**< NOFRAMES */
+ Tag_NOSCRIPT, /**< NOSCRIPT */
+ Tag_OBJECT, /**< OBJECT */
+ Tag_OL, /**< OL */
+ Tag_OPTGROUP, /**< OPTGROUP */
+ Tag_OPTION, /**< OPTION */
+ Tag_P, /**< P */
+ Tag_PARAM, /**< PARAM */
+ Tag_PLAINTEXT, /**< PLAINTEXT */
+ Tag_PRE, /**< PRE */
+ Tag_Q, /**< Q */
+ Tag_RB, /**< RB */
+ Tag_RBC, /**< RBC */
+ Tag_RP, /**< RP */
+ Tag_RT, /**< RT */
+ Tag_RTC, /**< RTC */
+ Tag_RUBY, /**< RUBY */
+ Tag_S, /**< S */
+ Tag_SAMP, /**< SAMP */
+ Tag_SCRIPT, /**< SCRIPT */
+ Tag_SELECT, /**< SELECT */
+ Tag_SMALL, /**< SMALL */
+ Tag_SPAN, /**< SPAN */
+ Tag_STRIKE, /**< STRIKE */
+ Tag_STRONG, /**< STRONG */
+ Tag_STYLE, /**< STYLE */
+ Tag_SUB, /**< SUB */
+ Tag_SUP, /**< SUP */
+ Tag_TABLE, /**< TABLE */
+ Tag_TBODY, /**< TBODY */
+ Tag_TD, /**< TD */
+ Tag_TEXTAREA, /**< TEXTAREA */
+ Tag_TFOOT, /**< TFOOT */
+ Tag_TH, /**< TH */
+ Tag_THEAD, /**< THEAD */
+ Tag_TITLE, /**< TITLE */
+ Tag_TR, /**< TR */
+ Tag_TT, /**< TT */
+ Tag_U, /**< U */
+ Tag_UL, /**< UL */
+ Tag_VAR, /**< VAR */
+ Tag_XMP, /**< XMP */
+ Tag_NEXTID, /**< NEXTID */
+ Tag_MAX, /**< One past the last known tag id */
+
+ N_TAGS = -1 /**< Must be -1 */
+} tag_id_t;
+
+#define CM_UNKNOWN 0
+/* Elements with no content. Map to HTML specification. */
+#define CM_EMPTY (1 << 0)
+/* Elements that appear outside of "BODY". */
+#define CM_HTML (1 << 1)
+/* Elements that can appear within HEAD. */
+#define CM_HEAD (1 << 2)
+/* HTML "block" elements. */
+#define CM_BLOCK (1 << 3)
+/* HTML "inline" elements. */
+#define CM_INLINE (1 << 4)
+/* Elements that mark list item ("LI"). */
+#define CM_LIST (1 << 5)
+/* Elements that mark definition list item ("DD", "DT"). */
+#define CM_DEFLIST (1 << 6)
+/* Elements that can appear inside TABLE. */
+#define CM_TABLE (1 << 7)
+/* Used for "THEAD", "TFOOT" or "TBODY". */
+#define CM_ROWGRP (1 << 8)
+/* Used for "TD", "TH" */
+#define CM_ROW (1 << 9)
+/* Elements whose content must be protected against white space movement.
+ Includes some elements that can be found in forms. */
+#define CM_FIELD (1 << 10)
+#define CM_RAW (1 << 11)
+/* Elements that allow "PARAM". */
+#define CM_PARAM (1 << 12)
+/* Elements with an optional end tag. */
+#define CM_OPT (1 << 13)
+/* Elements that use "align" attribute for vertical position. */
+#define CM_IMG (1 << 14)
+#define CM_NO_INDENT (1 << 15)
+/* Elements that cannot be omitted. */
+#define CM_OMITST (1 << 16)
+/* Unique elements */
+#define CM_UNIQUE (1 << 17)
+
+#define CM_USER_SHIFT (18)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SRC_LIBSERVER_HTML_TAGS_H_ */
diff --git a/src/libserver/html/html_tests.cxx b/src/libserver/html/html_tests.cxx
new file mode 100644
index 0000000..2fe6702
--- /dev/null
+++ b/src/libserver/html/html_tests.cxx
@@ -0,0 +1,304 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#include "html.hxx"
+#include "libserver/task.h"
+
+#include <vector>
+#include <fmt/core.h>
+
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+namespace rspamd::html {
+
+/*
+ * Tests part
+ */
+
+TEST_SUITE("html")
+{
+    TEST_CASE("html parsing")
+    {
+        /* Each case: input markup -> expected html_debug_structure() dump */
+        const std::vector<std::pair<std::string, std::string>> cases{
+            {"<html><!DOCTYPE html><body>", "+html;++xml;++body;"},
+            {"<html><div><div></div></div></html>", "+html;++div;+++div;"},
+            {"<html><div><div></div></html>", "+html;++div;+++div;"},
+            {"<html><div><div></div></html></div>", "+html;++div;+++div;"},
+            {"<p><p><a></p></a></a>", "+p;++p;+++a;"},
+            {"<div><a href=\"http://example.com\"></div></a>", "+div;++a;"},
+            /* Broken, as I don't know how the hell this should be really parsed */
+            //{"<html><!DOCTYPE html><body><head><body></body></html></body></html>",
+            // "+html;++xml;++body;+++head;+++body;"}
+        };
+
+        rspamd_url_init(NULL);
+        auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "html", 0);
+        struct rspamd_task fake_task;
+        memset(&fake_task, 0, sizeof(fake_task));
+        fake_task.task_pool = pool;
+
+        for (const auto &c: cases) {
+            SUBCASE((std::string("extract tags from: ") + c.first).c_str())
+            {
+                GByteArray *tmp = g_byte_array_sized_new(c.first.size());
+                g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size());
+                auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr);
+                CHECK(hc != nullptr);
+                if (hc != nullptr) { /* CHECK does not stop the test, so guard the dereference */
+                    CHECK(c.second == html_debug_structure(*hc));
+                }
+                g_byte_array_free(tmp, TRUE);
+            }
+        }
+
+        rspamd_mempool_delete(pool);
+    }
+
+ TEST_CASE("html text extraction")
+ {
+ using namespace std::string_literals;
+ const std::vector<std::pair<std::string, std::string>> cases{
+ {"test", "test"},
+ {"test\0"s, "test\uFFFD"s},
+ {"test\0test"s, "test\uFFFDtest"s},
+ {"test\0\0test"s, "test\uFFFD\uFFFDtest"s},
+ {"test ", "test"},
+ {"test foo, bar", "test foo, bar"},
+ {"<p>text</p>", "text\n"},
+ {"olo<p>text</p>lolo", "olo\ntext\nlolo"},
+ {"<div>foo</div><div>bar</div>", "foo\nbar\n"},
+ {"<b>foo<i>bar</b>baz</i>", "foobarbaz"},
+ {"<b>foo<i>bar</i>baz</b>", "foobarbaz"},
+ {"foo<br>baz", "foo\nbaz"},
+ {"<a href=https://example.com>test</a>", "test"},
+ {"<img alt=test>", "test"},
+ {" <body>\n"
+ " <!-- escape content -->\n"
+ " a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;\n"
+ " </body>",
+ R"|(a b a > b a < b a & b 'a "a")|"},
+ /* XML tags */
+ {"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
+ " <!DOCTYPE html\n"
+ " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n"
+ " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n"
+ "<body>test</body>",
+ "test"},
+ {"<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"></head>"
+ " <body>\n"
+ " <p><br>\n"
+ " </p>\n"
+ " <div class=\"moz-forward-container\"><br>\n"
+ " <br>\n"
+ " test</div>"
+ "</body>",
+ "\n\n\ntest\n"},
+ {"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>"
+ "sh<span style=\"FONT-SIZE: 0px\">aring </span></div>",
+ "fish\n"},
+ /* FIXME: broken until rework of css parser */
+ //{"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>"
+ // "sh<span style=\"FONT-SIZE: 0px\">aring </div>foo</span>", "fish\nfoo"},
+ /* Complex html with bad tags */
+ {"<!DOCTYPE html>\n"
+ "<html lang=\"en\">\n"
+ " <head>\n"
+ " <meta charset=\"utf-8\">\n"
+ " <title>title</title>\n"
+ " <link rel=\"stylesheet\" href=\"style.css\">\n"
+ " <script src=\"script.js\"></script>\n"
+ " </head>\n"
+ " <body>\n"
+ " <!-- page content -->\n"
+ " Hello, world! <b>test</b>\n"
+ " <p>data<>\n"
+ " </P>\n"
+ " <b>stuff</p>?\n"
+ " </body>\n"
+ "</html>",
+ "Hello, world! test \ndata<>\nstuff\n?"},
+ {"<p><!--comment-->test</br></hr><br>", "test\n"},
+ /* Tables */
+ {"<table>\n"
+ " <tr>\n"
+ " <th>heada</th>\n"
+ " <th>headb</th>\n"
+ " </tr>\n"
+ " <tr>\n"
+ " <td>data1</td>\n"
+ " <td>data2</td>\n"
+ " </tr>\n"
+ " </table>",
+ "heada headb\ndata1 data2\n"},
+ /* Invalid closing br and hr + comment */
+ {" <body>\n"
+ " <!-- page content -->\n"
+ " Hello, world!<br>test</br><br>content</hr>more content<br>\n"
+ " <div>\n"
+ " content inside div\n"
+ " </div>\n"
+ " </body>",
+ "Hello, world!\ntest\ncontentmore content\ncontent inside div\n"},
+ /* First closing tag */
+ {"</head>\n"
+ "<body>\n"
+ "<p> Hello. I have some bad news.\n"
+ "<br /> <br /> <br /> <strong> <br /> <br /> <br /> <br /> <br /> <br /> <br /> <br /> </strong><span> <br /> </span>test</p>\n"
+ "</body>\n"
+ "</html>",
+ "Hello. I have some bad news. \n\n\n\n\n\n\n\n\n\n\n\ntest\n"},
+ /* Invalid tags */
+ {"lol <sht> omg </sht> oh my!\n"
+ "<name>words words</name> goodbye",
+ "lol omg oh my! words words goodbye"},
+ /* Invisible stuff */
+ {"<div style=\"color:#555555;font-family:Arial, 'Helvetica Neue', Helvetica, sans-serif;line-height:1.2;padding-top:10px;padding-right:10px;padding-bottom:10px;padding-left:10px;font-style: italic;\">\n"
+ "<p style=\"font-size: 11px; line-height: 1.2; color: #555555; font-family: Arial, 'Helvetica Neue', Helvetica, sans-serif; mso-line-height-alt: 14px; margin: 0;\">\n"
+ "<span style=\"color:#FFFFFF; \">F</span>Sincerely,</p>\n"
+ "<p style=\"font-size: 11px; line-height: 1.2; color: #555555; font-family: Arial, 'Helvetica Neue', Helvetica, sans-serif; mso-line-height-alt: 14px; margin: 0;\">\n"
+ "<span style=\"color:#FFFFFF; \">8</span>Sky<span style=\"opacity:1;\"></span>pe<span style=\"color:#FFFFFF; \">F</span>Web<span style=\"color:#FFFFFF; \">F</span></p>\n"
+ "<span style=\"color:#FFFFFF; \">kreyes</span>\n"
+ "<p style=\"font-size: 11px; line-height: 1.2; color: #555555; font-family: Arial, 'Helvetica Neue', Helvetica, sans-serif; mso-line-height-alt: 14px; margin: 0;\">\n"
+ "&nbsp;</p>",
+ " Sincerely,\n Skype Web\n"},
+ {"lala<p hidden>fafa</p>", "lala"},
+ {"<table style=\"FONT-SIZE: 0px;\"><tbody><tr><td>\n"
+ "DONKEY\n"
+ "</td></tr></tbody></table>",
+ ""},
+ /* bgcolor propagation */
+ {"<a style=\"display: inline-block; color: #ffffff; background-color: #00aff0;\">\n"
+ "<span style=\"color: #00aff0;\">F</span>Rev<span style=\"opacity: 1;\"></span></span>ie<span style=\"opacity: 1;\"></span>"
+ "</span>w<span style=\"color: #00aff0;\">F<span style=\"opacity: 1;\">ΜΉ</span></span>",
+ " Review"},
+ {"<td style=\"color:#ffffff\" bgcolor=\"#005595\">\n"
+ "hello world\n"
+ "</td>",
+ "hello world"},
+ /* Colors */
+ {"goodbye <span style=\"COLOR: rgb(64,64,64)\">cruel</span>"
+ "<span>world</span>",
+ "goodbye cruelworld"},
+ /* Font-size propagation */
+ {"<p style=\"font-size: 11pt;line-height:22px\">goodbye <span style=\"font-size:0px\">cruel</span>world</p>",
+ "goodbye world\n"},
+ /* Newline before tag -> must be space */
+ {"goodbye <span style=\"COLOR: rgb(64,64,64)\">cruel</span>\n"
+ "<span>world</span>",
+ "goodbye cruel world"},
+ /* Head tag with some stuff */
+ {"<html><head><p>oh my god</head><body></body></html>", "oh my god\n"},
+ {"<html><head><title>oh my god</head><body></body></html>", ""},
+ {"<html><body><html><head>displayed</body></html></body></html>", "displayed"},
+
+ };
+
+ rspamd_url_init(NULL);
+ auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(),
+ "html", 0);
+ struct rspamd_task fake_task;
+ memset(&fake_task, 0, sizeof(fake_task));
+ fake_task.task_pool = pool;
+
+        auto replace_newlines = [](std::string &str) {
+            std::string::size_type pos = 0; /* size_type, not int: find() returns size_type and is compared with npos */
+            while ((pos = str.find('\n', pos)) != std::string::npos) {
+                str.replace(pos, 1, "\\n"); /* newline -> backslash followed by 'n' */
+                pos += 2;                   /* continue after the two characters just written */
+            }
+        };
+
+        for (std::size_t case_idx = 0; case_idx < cases.size(); case_idx++) {
+            const auto &c = cases[case_idx];
+            SUBCASE((fmt::format("html extraction case {}", case_idx + 1)).c_str())
+            {
+                GByteArray *tmp = g_byte_array_sized_new(c.first.size());
+                g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size());
+                auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr);
+                CHECK(hc != nullptr);
+                auto actual = hc != nullptr ? hc->parsed : std::string{}; /* CHECK does not abort: never dereference a null result */
+                replace_newlines(actual);
+                auto expected = c.second;
+                replace_newlines(expected);
+                CHECK(actual == expected);
+                g_byte_array_free(tmp, TRUE);
+            }
+        }
+
+ rspamd_mempool_delete(pool);
+ }
+
+    TEST_CASE("html urls extraction")
+    {
+        /* Each case: input markup, URLs expected in order, optional expected visible text */
+        const std::vector<std::tuple<std::string, std::vector<std::string>, std::optional<std::string>>> cases{
+            {"<style></style><a href=\"https://www.example.com\">yolo</a>",
+             {"https://www.example.com"},
+             "yolo"},
+            {"<a href=\"https://example.com\">test</a>", {"https://example.com"}, "test"},
+            {"<a <poo href=\"http://example.com\">hello</a>", {"http://example.com"}, "hello"},
+            {"<html>\n"
+             "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=utf-8\">\n"
+             "<body>\n"
+             "<a href=\"https://www.example.com\">hello</a>\n"
+             "</body>\n"
+             "</html>",
+             {"https://www.example.com"},
+             "hello"},
+        };
+
+        rspamd_url_init(NULL);
+        auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "html", 0);
+        struct rspamd_task fake_task;
+        memset(&fake_task, 0, sizeof(fake_task));
+        fake_task.task_pool = pool;
+
+        auto i = 1;
+        for (const auto &c: cases) {
+            SUBCASE((fmt::format("html url extraction case {}", i)).c_str())
+            {
+                GPtrArray *purls = g_ptr_array_new();
+                const auto &input = std::get<0>(c);
+                GByteArray *tmp = g_byte_array_sized_new(input.size());
+                g_byte_array_append(tmp, (const guint8 *) input.data(), input.size());
+                auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, purls, true, nullptr);
+                CHECK(hc != nullptr);
+                auto &expected_text = std::get<2>(c);
+                if (hc != nullptr && expected_text.has_value()) {
+                    CHECK(hc->parsed == expected_text.value());
+                }
+                const auto &expected_urls = std::get<1>(c);
+                CHECK(expected_urls.size() == purls->len);
+                const std::size_t n_urls = purls->len < expected_urls.size() ? purls->len : expected_urls.size(); /* the CHECK above is non-fatal: stay inside both arrays */
+                for (std::size_t j = 0; j < n_urls; ++j) {
+                    auto *url = (rspamd_url *) g_ptr_array_index(purls, j);
+                    CHECK(expected_urls[j] == std::string{url->string, url->urllen});
+                }
+                g_byte_array_free(tmp, TRUE);
+                g_ptr_array_free(purls, TRUE);
+            }
+            ++i;
+        }
+
+        rspamd_mempool_delete(pool);
+    }
+}
+
+} /* namespace rspamd::html */
diff --git a/src/libserver/html/html_url.cxx b/src/libserver/html/html_url.cxx
new file mode 100644
index 0000000..8f29f2c
--- /dev/null
+++ b/src/libserver/html/html_url.cxx
@@ -0,0 +1,496 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "html_url.hxx"
+#include "libutil/str_util.h"
+#include "libserver/url.h"
+#include "libserver/logger.h"
+#include "rspamd.h"
+
+#include <unicode/idna.h>
+
+namespace rspamd::html {
+
+/*
+ * Tells whether one host name is a proper dot-separated parent of the other,
+ * in either argument order: ("mail.example.com", "example.com") -> true.
+ * Trailing dots are ignored and the comparison is case-sensitive; equal or
+ * empty names are never reported as sub-domains.
+ * The whole shorter name has to match, first character included, so
+ * "a.xxample.com" is not a sub-domain of "example.com".
+ */
+static auto
+rspamd_url_is_subdomain(std::string_view t1, std::string_view t2) -> bool
+{
+    /* Drop trailing dots, but always keep at least one character */
+    const auto strip_dots = [](std::string_view s) {
+        while (s.size() > 1 && s.back() == '.') {
+            s.remove_suffix(1);
+        }
+
+        return s;
+    };
+
+    if (t1.empty() || t2.empty()) {
+        /* Nothing to compare, and data() + size() - 1 would point before the buffer */
+        return false;
+    }
+
+    auto longer = strip_dots(t1);
+    auto shorter = strip_dots(t2);
+
+    if (longer.size() < shorter.size()) {
+        longer.swap(shorter);
+    }
+
+    if (longer.size() == shorter.size()) {
+        /* Same length: identical or unrelated, never parent and child */
+        return false;
+    }
+
+    /* `shorter` has to be the tail of `longer`, preceded by a label separator */
+    const auto tail_pos = longer.size() - shorter.size();
+
+    if (longer[tail_pos - 1] != '.') {
+        return false;
+    }
+
+    return longer.substr(tail_pos) == shorter;
+}
+
+
+static auto
+get_icu_idna_instance(void) -> auto
+{
+    UErrorCode create_status = U_ZERO_ERROR;
+    static icu::IDNA *const shared_idna = icu::IDNA::createUTS46Instance(UIDNA_DEFAULT, create_status);
+    /* Function-local static: created on the first call only; create_status is not inspected */
+    return shared_idna;
+}
+
+static auto
+convert_idna_hostname_maybe(rspamd_mempool_t *pool, struct rspamd_url *url, bool use_tld)
+    -> std::string_view
+{
+    std::string_view ret = use_tld ? std::string_view{rspamd_url_tld_unsafe(url), url->tldlen} : std::string_view{rspamd_url_host_unsafe(url), url->hostlen};
+
+    /* Handle IDN url's: only names holding an "xn--" label are decoded, anything else is returned as is */
+    if (ret.size() > 4 &&
+        rspamd_substring_search_caseless(ret.data(), ret.size(), "xn--", 4) != -1) {
+        auto *udn = get_icu_idna_instance();
+        if (udn == nullptr) {
+            return ret; /* ICU could not create the UTS #46 instance: keep the encoded name */
+        }
+        const auto buf_capacity = ret.size() * 2 + 1;
+        auto *idn_hbuf = (char *) rspamd_mempool_alloc(pool, buf_capacity);
+        icu::CheckedArrayByteSink byte_sink{idn_hbuf, (int) buf_capacity};
+        icu::IDNAInfo info;
+        auto uc_err = U_ZERO_ERROR;
+        udn->nameToUnicodeUTF8(icu::StringPiece(ret.data(), ret.size()),
+                               byte_sink, info, uc_err);
+
+        if (uc_err == U_ZERO_ERROR && !info.hasErrors() && !byte_sink.Overflowed()) {
+            /* idn_hbuf is allocated in mempool, so it is safe to use; a truncated (overflowed) result is rejected */
+            ret = std::string_view{idn_hbuf, (std::size_t) byte_sink.NumberOfBytesWritten()};
+        }
+        else {
+            msg_err_pool("cannot convert to IDN: %s (0x%xd)",
+                         u_errorName(uc_err), info.getErrors());
+        }
+    }
+
+    return ret;
+}
+
+constexpr auto sv_equals(std::string_view s1, std::string_view s2) -> auto
+{
+    auto same = s1.size() == s2.size(); /* ASCII case-insensitive equality; lengths must match first */
+    for (std::size_t i = 0; same && i < s1.size(); i++) {
+        same = g_ascii_tolower(s1[i]) == g_ascii_tolower(s2[i]);
+    }
+    return same;
+}
+
+constexpr auto
+is_transfer_proto(struct rspamd_url *u) -> bool
+{
+    return static_cast<bool>(u->protocol & (PROTOCOL_FTP | PROTOCOL_HTTPS | PROTOCOL_HTTP)); /* any of http, https, ftp */
+}
+
+/*
+ * Checks the text displayed for a link against its target `href_url`.
+ * If a URL starts the text and parses, it is returned; href_url is flagged as
+ * phished when both agree on being http/https/ftp but their hosts and TLD
+ * parts differ and are not sub-domains of each other. An unparsable URL in a
+ * text that starts with "http" and has "://" marks href_url phished and obscured.
+ */
+auto html_url_is_phished(rspamd_mempool_t *pool,
+                         struct rspamd_url *href_url,
+                         std::string_view text_data) -> std::optional<rspamd_url *>
+{
+    goffset url_pos;
+    gchar *url_str = NULL;
+
+    auto sz = text_data.size();
+    const auto *trimmed = rspamd_string_unicode_trim_inplace(text_data.data(), &sz);
+    text_data = std::string_view(trimmed, sz);
+
+    if (text_data.size() > 4 &&
+        rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str,
+                        RSPAMD_URL_FIND_ALL, &url_pos, NULL) &&
+        url_str != nullptr) {
+        if (url_pos > 0) {
+            /*
+             * We have some url at some offset, so we need to check what is
+             * at the start of the text
+             */
+            return std::nullopt;
+        }
+
+        struct rspamd_url *text_url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
+        auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
+                                   RSPAMD_URL_PARSE_TEXT);
+
+        if (rc == URI_ERRNO_OK) {
+            text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
+            href_url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
+
+            /* Check for phishing */
+            if (is_transfer_proto(text_url) == is_transfer_proto(href_url)) {
+                auto disp_tok = convert_idna_hostname_maybe(pool, text_url, false);
+                auto href_tok = convert_idna_hostname_maybe(pool, href_url, false);
+
+                if (!sv_equals(disp_tok, href_tok) &&
+                    text_url->tldlen > 0 && href_url->tldlen > 0) {
+                    /* Apply the same logic for TLD */
+                    disp_tok = convert_idna_hostname_maybe(pool, text_url, true);
+                    href_tok = convert_idna_hostname_maybe(pool, href_url, true);
+
+                    if (!sv_equals(disp_tok, href_tok)) {
+                        /* Check if one url is a subdomain for another */
+                        if (!rspamd_url_is_subdomain(disp_tok, href_tok)) {
+                            /* text_url already carries RSPAMD_URL_FLAG_HTML_DISPLAYED (set above) */
+                            href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
+                            if (href_url->ext == nullptr) {
+                                href_url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext);
+                            }
+                            href_url->ext->linked_url = text_url;
+                        }
+                    }
+                }
+            }
+
+            return text_url;
+        }
+        else {
+            /*
+             * We have found something that looks like an url but it was
+             * not parsed correctly.
+             * Sometimes it means an obfuscation attempt, so we have to check
+             * what's inside of the text
+             */
+            gboolean obfuscation_found = FALSE;
+
+            if (text_data.size() > 4 && g_ascii_strncasecmp(text_data.data(), "http", 4) == 0 &&
+                rspamd_substring_search(text_data.data(), text_data.size(), "://", 3) != -1) {
+                /* Clearly an obfuscation attempt */
+                obfuscation_found = TRUE;
+            }
+
+            msg_info_pool("extract of url '%s' failed: %s; obfuscation detected: %s",
+                          url_str,
+                          rspamd_url_strerror(rc),
+                          obfuscation_found ? "yes" : "no");
+
+            if (obfuscation_found) {
+                href_url->flags |= RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED;
+            }
+        }
+    }
+
+    return std::nullopt;
+}
+
+/*
+ * Stores a trimmed copy of the text shown for `url` in url->ext->visible_part
+ * and runs html_url_is_phished() on it. A URL returned by that check is added
+ * to `exceptions` (when given) and handed to rspamd_url_set_add_or_return()
+ * on `url_set` (when given). Does nothing for an empty visible part.
+ */
+void html_check_displayed_url(rspamd_mempool_t *pool,
+                              GList **exceptions,
+                              void *url_set,
+                              std::string_view visible_part,
+                              goffset href_offset,
+                              struct rspamd_url *url)
+{
+    gsize dlen;
+
+    if (visible_part.empty()) {
+        /* No displayed url, just some text within <a> tag */
+        return;
+    }
+
+    if (url->ext == nullptr) {
+        url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext);
+    }
+    url->ext->visible_part = rspamd_mempool_alloc_buffer(pool, visible_part.size() + 1);
+    rspamd_strlcpy(url->ext->visible_part,
+                   visible_part.data(),
+                   visible_part.size() + 1);
+    dlen = visible_part.size();
+
+    /* Strip unicode spaces from the start and the end */
+    url->ext->visible_part = const_cast<char *>(
+        rspamd_string_unicode_trim_inplace(url->ext->visible_part,
+                                           &dlen));
+    auto maybe_url = html_url_is_phished(pool, url,
+                                         {url->ext->visible_part, dlen});
+
+    /* nullptr when the check returned no URL for the visible text */
+    struct rspamd_url *displayed_url = maybe_url.value_or(nullptr);
+
+    if (exceptions && displayed_url != nullptr) {
+        struct rspamd_process_exception *ex = rspamd_mempool_alloc_type(pool, struct rspamd_process_exception);
+        ex->pos = href_offset;
+        ex->len = dlen;
+        ex->type = RSPAMD_EXCEPTION_URL;
+        ex->ptr = url;
+
+        *exceptions = g_list_prepend(*exceptions, ex);
+    }
+
+    if (displayed_url && url_set) {
+        struct rspamd_url *turl = rspamd_url_set_add_or_return((khash_t(rspamd_url_hash) *) url_set, displayed_url);
+
+        if (turl != nullptr) {
+            /* Here, we assume the following:
+             * if we have a URL in the text part which
+             * is the same as displayed URL in the
+             * HTML part, we assume that it is also
+             * hint only.
+             */
+            if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) {
+
+                /*
+                 * We have the same URL for href and displayed url, so we
+                 * know that this url cannot be both target and display (as
+                 * it breaks logic in many places), so we do not
+                 * propagate html flags
+                 */
+                if (!(turl->flags & RSPAMD_URL_FLAG_DISPLAY_URL)) {
+                    turl->flags |= displayed_url->flags;
+                }
+                turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
+            }
+
+            turl->count++;
+        }
+        else {
+            /* Already inserted by `rspamd_url_set_add_or_return` */
+        }
+    }
+
+    rspamd_normalise_unicode_inplace(url->ext->visible_part, &dlen);
+}
+
+/**
+ * Process HTML url (e.g. for href component): trim it, prepend a scheme
+ * when none is present, drop whitespace, percent-encode unsafe ASCII bytes,
+ * undo percent-encoding of structural url characters and parse the result.
+ * @param pool memory pool used for the rewritten string and the url object
+ * @param input raw value; it is always replaced by its trimmed view and,
+ *              on success, by a view over the string of the parsed url
+ * @return parsed url, or std::nullopt when the input has no usable data,
+ *         fails to parse, has no host, has an unknown protocol, or is
+ *         schemaless and has no tld
+ */
+auto html_process_url(rspamd_mempool_t *pool, std::string_view &input)
+	-> std::optional<struct rspamd_url *>
+{
+	struct rspamd_url *url;
+	guint saved_flags = 0;
+	gint rc;
+	const gchar *prefix = "http://";
+	gchar *d;
+	gsize dlen = 0;
+	gboolean has_bad_chars = FALSE, no_prefix = FALSE;
+	static const gchar hexdigits[] = "0123456789abcdef";
+
+	/* Strip unicode spaces from both ends and expose the trimmed view */
+	auto sz = input.length();
+	const auto *trimmed = rspamd_string_unicode_trim_inplace(input.data(), &sz);
+	input = {trimmed, sz};
+
+	const gchar *s = input.data();
+
+	/*
+	 * Upper bound for the output size: every non-graphical ASCII byte may
+	 * expand to `%XX`. The index is gsize to match `sz` (no signed/unsigned
+	 * comparison and no int overflow on very long inputs).
+	 */
+	for (gsize i = 0; i < sz; i++) {
+		if (G_UNLIKELY(((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
+			dlen += 3;
+		}
+		else {
+			dlen++;
+		}
+	}
+
+	if (rspamd_substring_search(s, sz, "://", 3) == -1) {
+		/*
+		 * sizeof() counts the terminating NUL, so this branch requires at
+		 * least 8 bytes of input; shorter inputs go through the scan below
+		 */
+		if (sz >= sizeof("mailto:") &&
+			(memcmp(s, "mailto:", sizeof("mailto:") - 1) == 0 ||
+			 memcmp(s, "tel:", sizeof("tel:") - 1) == 0 ||
+			 memcmp(s, "callto:", sizeof("callto:") - 1) == 0)) {
+			/* Exclusion, has valid but 'strange' prefix */
+		}
+		else {
+			/* Decide on the prefix by the first non-alphanumeric ASCII byte */
+			for (gsize i = 0; i < sz; i++) {
+				if (!((s[i] & 0x80) || g_ascii_isalnum(s[i]))) {
+					if (i == 0 && sz > 2 && s[i] == '/' && s[i + 1] == '/') {
+						/* Input starts with `//`: only the scheme name is missing */
+						prefix = "http:";
+						dlen += sizeof("http:") - 1;
+						no_prefix = TRUE;
+					}
+					else if (s[i] == '@') {
+						/* Likely email prefix */
+						prefix = "mailto://";
+						dlen += sizeof("mailto://") - 1;
+						no_prefix = TRUE;
+					}
+					else if (s[i] == ':' && i != 0) {
+						/* Special case */
+						no_prefix = FALSE;
+					}
+					else {
+						if (i == 0) {
+							/* No valid data */
+							return std::nullopt;
+						}
+						else {
+							no_prefix = TRUE;
+							dlen += strlen(prefix);
+						}
+					}
+
+					break;
+				}
+			}
+		}
+	}
+
+	auto *decoded = rspamd_mempool_alloc_buffer(pool, dlen + 1);
+	d = decoded;
+
+	if (no_prefix) {
+		const gsize plen = strlen(prefix);
+		memcpy(d, prefix, plen);
+		d += plen;
+	}
+
+	/*
+	 * We also need to remove all internal newlines, spaces
+	 * and encode unsafe characters
+	 * Another obfuscation found in the wild was encoding of the SAFE url
+	 * characters, including essential ones
+	 */
+	for (gsize i = 0; i < sz; i++) {
+		const gchar c = s[i];
+
+		if (G_UNLIKELY(g_ascii_isspace(c))) {
+			continue;
+		}
+
+		if (G_UNLIKELY(((guint) c) < 0x80 && !g_ascii_isgraph(c))) {
+			/* URL encode */
+			*d++ = '%';
+			*d++ = hexdigits[(c >> 4) & 0xf];
+			*d++ = hexdigits[c & 0xf];
+			has_bad_chars = TRUE;
+			continue;
+		}
+
+		if (G_UNLIKELY(c == '%') && i + 2 < sz &&
+			g_ascii_isxdigit(s[i + 1]) && g_ascii_isxdigit(s[i + 2])) {
+			/* Both characters are hex digits, so each value is in 0..15 */
+			const auto codepoint = (g_ascii_xdigit_value(s[i + 1]) << 4) |
+								   g_ascii_xdigit_value(s[i + 2]);
+
+			/* Now check for 'interesting' codepoints */
+			if (codepoint == '@' || codepoint == ':' || codepoint == '|' ||
+				codepoint == '?' || codepoint == '\\' || codepoint == '/') {
+				/* Replace it back and skip the two consumed hex digits */
+				*d++ = static_cast<gchar>(codepoint & 0xff);
+				i += 2;
+				continue;
+			}
+		}
+
+		/* Everything else (including an undecoded `%`) is copied verbatim */
+		*d++ = c;
+	}
+
+	*d = '\0';
+	dlen = d - decoded;
+
+	url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
+	rspamd_url_normalise_propagate_flags(pool, decoded, &dlen, saved_flags);
+	rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
+
+	/* Filter some completely damaged urls */
+	if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
+		!((url->protocol & PROTOCOL_UNKNOWN))) {
+		url->flags |= saved_flags;
+
+		if (has_bad_chars) {
+			url->flags |= RSPAMD_URL_FLAG_OBSCURED;
+		}
+
+		if (no_prefix) {
+			url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
+
+			if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
+				/* Ignore urls with both no schema and no tld */
+				return std::nullopt;
+			}
+		}
+
+		/* Hand the parsed representation back to the caller */
+		input = {url->string, url->urllen};
+
+		/*
+		 * Spaces in href usually mean an attempt to obfuscate URL,
+		 * see https://github.com/vstakhov/rspamd/issues/593
+		 * Internal spaces are silently removed above and are not flagged
+		 * as RSPAMD_URL_FLAG_OBSCURED here.
+		 */
+
+		return url;
+	}
+
+	return std::nullopt;
+}
+
+}// namespace rspamd::html \ No newline at end of file
diff --git a/src/libserver/html/html_url.hxx b/src/libserver/html/html_url.hxx
new file mode 100644
index 0000000..46dde6d
--- /dev/null
+++ b/src/libserver/html/html_url.hxx
@@ -0,0 +1,68 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_HTML_URL_HXX
+#define RSPAMD_HTML_URL_HXX
+#pragma once
+
+#include "libutil/mem_pool.h"
+
+#include <string_view>
+#include <optional>
+
+struct rspamd_url; /* Forward declaration */
+
+namespace rspamd::html {
+
+
+/**
+ * Checks if an html url is likely phished by some displayed url
+ * @param pool memory pool handle
+ *             (NOTE(review): presumably used for allocations made while
+ *             examining the text - confirm against the implementation)
+ * @param href_url url of the link target that the displayed text is
+ *                 compared against
+ * @param text_data displayed text of the link
+ * @return optional url pointer; html_check_displayed_url treats a held
+ *         value as the displayed url
+ *         (NOTE(review): the exact conditions for an empty result are
+ *         defined by the implementation - confirm there)
+ */
+auto html_url_is_phished(rspamd_mempool_t *pool,
+ struct rspamd_url *href_url,
+ std::string_view text_data) -> std::optional<rspamd_url *>;
+
+/**
+ * Check displayed part of the url at specified offset
+ * @param pool memory pool, used at least for the exception record and
+ *             passed on to html_url_is_phished
+ * @param exceptions if not null and a displayed url is found, a
+ *                   RSPAMD_EXCEPTION_URL record is prepended to this list
+ * @param url_set if not null, it is cast to khash_t(rspamd_url_hash) * and
+ *                the displayed url is added to it (or the count of the
+ *                already stored equal url is incremented)
+ * @param visible_part displayed text of the link; it is copied into
+ *                     url->ext->visible_part and trimmed of unicode spaces
+ * @param href_offset stored as the position of the exception record
+ * @param url url of the link target whose visible part is being checked
+ */
+auto html_check_displayed_url(rspamd_mempool_t *pool,
+ GList **exceptions,
+ void *url_set,
+ std::string_view visible_part,
+ goffset href_offset,
+ struct rspamd_url *url) -> void;
+
+/**
+ * Process HTML url (e.g. for href component): the value is trimmed, gets a
+ * scheme prefix when it has none, is cleaned of whitespace and unsafe
+ * ASCII bytes (which are percent-encoded) and is then parsed as an url
+ * @param pool memory pool used for the rewritten string and the url object
+ * @param input may be modified during the process: it is replaced by its
+ *              trimmed view and, on success, by a view over the string of
+ *              the parsed url
+ * @return parsed url, or an empty optional when the input has no usable
+ *         data, cannot be parsed, has no host, has an unknown protocol, or
+ *         has neither a schema nor a tld
+ */
+auto html_process_url(rspamd_mempool_t *pool, std::string_view &input)
+ -> std::optional<struct rspamd_url *>;
+}// namespace rspamd::html
+
+#endif//RSPAMD_HTML_URL_HXX \ No newline at end of file