diff options
Diffstat (limited to 'src/libserver/html')
-rw-r--r-- | src/libserver/html/html.cxx | 2393 | ||||
-rw-r--r-- | src/libserver/html/html.h | 137 | ||||
-rw-r--r-- | src/libserver/html/html.hxx | 146 | ||||
-rw-r--r-- | src/libserver/html/html_block.hxx | 358 | ||||
-rw-r--r-- | src/libserver/html/html_entities.cxx | 2644 | ||||
-rw-r--r-- | src/libserver/html/html_entities.hxx | 31 | ||||
-rw-r--r-- | src/libserver/html/html_tag.hxx | 159 | ||||
-rw-r--r-- | src/libserver/html/html_tag_defs.hxx | 194 | ||||
-rw-r--r-- | src/libserver/html/html_tags.h | 176 | ||||
-rw-r--r-- | src/libserver/html/html_tests.cxx | 304 | ||||
-rw-r--r-- | src/libserver/html/html_url.cxx | 496 | ||||
-rw-r--r-- | src/libserver/html/html_url.hxx | 68 |
12 files changed, 7106 insertions, 0 deletions
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx new file mode 100644 index 0000000..5861d45 --- /dev/null +++ b/src/libserver/html/html.cxx @@ -0,0 +1,2393 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "util.h" +#include "message.h" +#include "html.h" +#include "html_tags.h" +#include "html_block.hxx" +#include "html.hxx" +#include "libserver/css/css_value.hxx" +#include "libserver/css/css.hxx" +#include "libserver/task.h" +#include "libserver/cfg_file.h" + +#include "url.h" +#include "contrib/libucl/khash.h" +#include "libmime/images.h" +#include "libutil/cxx/utf8_util.h" + +#include "html_tag_defs.hxx" +#include "html_entities.hxx" +#include "html_tag.hxx" +#include "html_url.hxx" + +#include <frozen/unordered_map.h> +#include <frozen/string.h> +#include <fmt/core.h> + +#include <unicode/uversion.h> + +namespace rspamd::html { + +static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */ + +static const html_tags_storage html_tags_defs; + +auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_type>( + { + {"name", html_component_type::RSPAMD_HTML_COMPONENT_NAME}, + {"href", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, + {"src", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, + {"action", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, + {"color", html_component_type::RSPAMD_HTML_COMPONENT_COLOR}, + {"bgcolor", html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR}, + {"style", html_component_type::RSPAMD_HTML_COMPONENT_STYLE}, + {"class", html_component_type::RSPAMD_HTML_COMPONENT_CLASS}, + {"width", html_component_type::RSPAMD_HTML_COMPONENT_WIDTH}, + {"height", html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT}, + {"size", html_component_type::RSPAMD_HTML_COMPONENT_SIZE}, + {"rel", html_component_type::RSPAMD_HTML_COMPONENT_REL}, + {"alt", html_component_type::RSPAMD_HTML_COMPONENT_ALT}, + {"id", html_component_type::RSPAMD_HTML_COMPONENT_ID}, + {"hidden", html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN}, + }); + +#define msg_debug_html(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_html_log_id, "html", pool->tag.uid, \ + __FUNCTION__, \ + __VA_ARGS__) + +INIT_LOG_MODULE(html) + +/* + * This function is expected to be called on a closing tag to fill up all tags + * and return the current parent (meaning unclosed) tag + */ +static auto +html_check_balance(struct html_content *hc, + struct html_tag *tag, + goffset tag_start_offset, + goffset tag_end_offset) -> html_tag * +{ + /* As agreed, the closing tag has the last opening at the parent ptr */ + auto *opening_tag = tag->parent; + + auto calculate_content_length = [tag_start_offset, tag_end_offset](html_tag *t) { + auto opening_content_offset = t->content_offset; + + if (t->flags & (CM_EMPTY)) { + /* Attach closing tag just at the opening tag */ + t->closing.start = t->tag_start; + t->closing.end = t->content_offset; + } + else { + + if (opening_content_offset <= tag_start_offset) { + t->closing.start = tag_start_offset; + t->closing.end = tag_end_offset; + } + else { + + t->closing.start = t->content_offset; + t->closing.end = tag_end_offset; + } + } + }; + + auto balance_tag = [&]() -> html_tag * { + auto it = tag->parent; + auto found_pair = false; + + for (; it != nullptr; it = it->parent) { + if (it->id == tag->id && !(it->flags & FL_CLOSED)) { + found_pair = true; + break; + } + } + + /* + * If we have found a closing pair, then we need to close all tags and + * return the top-most tag + */ + if (found_pair) { + for (it = tag->parent; it != nullptr; it = it->parent) { + it->flags |= FL_CLOSED; + /* Insert a virtual closing tag for all tags that are not closed */ + calculate_content_length(it); + if (it->id == tag->id && !(it->flags & FL_CLOSED)) { + break; + } + } + + return it; + } + else { + /* + * We have not found a pair, so this closing tag is bogus and should + * be ignored completely. + * Unfortunately, it also means that we need to insert another tag, + * as the current closing tag is unusable for that purposes. + * + * We assume that callee will recognise that and reconstruct the + * tag at the tag_end_closing state, so we return nullptr... + */ + } + + /* Tag must be ignored and reconstructed */ + return nullptr; + }; + + if (opening_tag) { + + if (opening_tag->id == tag->id) { + opening_tag->flags |= FL_CLOSED; + + calculate_content_length(opening_tag); + /* All good */ + return opening_tag->parent; + } + else { + return balance_tag(); + } + } + else { + /* + * We have no opening tag + * There are two possibilities: + * + * 1) We have some block tag in hc->all_tags; + * 2) We have no tags + */ + + if (hc->all_tags.empty()) { + hc->all_tags.push_back(std::make_unique<html_tag>()); + auto *vtag = hc->all_tags.back().get(); + vtag->id = Tag_HTML; + vtag->flags = FL_VIRTUAL; + vtag->tag_start = 0; + vtag->content_offset = 0; + calculate_content_length(vtag); + + if (!hc->root_tag) { + hc->root_tag = vtag; + } + else { + vtag->parent = hc->root_tag; + } + + tag->parent = vtag; + + /* Recursively call with a virtual <html> tag inserted */ + return html_check_balance(hc, tag, tag_start_offset, tag_end_offset); + } + } + + return nullptr; +} + +auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type> +{ + auto known_component_it = html_components_map.find(st); + + if (known_component_it != html_components_map.end()) { + return known_component_it->second; + } + else { + return std::nullopt; + } +} + +enum tag_parser_state { + parse_start = 0, + parse_name, + parse_attr_name, + parse_equal, + parse_start_dquote, + parse_dqvalue, + parse_end_dquote, + parse_start_squote, + parse_sqvalue, + parse_end_squote, + parse_value, + spaces_before_eq, + spaces_after_eq, + spaces_after_param, + ignore_bad_tag, + tag_end, + slash_after_value, + slash_in_unquoted_value, +}; +struct tag_content_parser_state { + tag_parser_state cur_state = parse_start; + std::string buf; + std::optional<html_component_type> cur_component; + + void reset() + { + cur_state = parse_start; + buf.clear(); + cur_component = std::nullopt; + } +}; + +static inline void +html_parse_tag_content(rspamd_mempool_t *pool, + struct html_content *hc, + struct html_tag *tag, + const char *in, + struct tag_content_parser_state &parser_env) +{ + auto state = parser_env.cur_state; + + /* + * Stores tag component if it doesn't exist, performing copy of the + * value + decoding of the entities + * Parser env is set to clear the current html attribute fields (saved_p and + * cur_component) + */ + auto store_component_value = [&]() -> void { + if (parser_env.cur_component) { + + if (parser_env.buf.empty()) { + tag->components.emplace_back(parser_env.cur_component.value(), + std::string_view{}); + } + else { + /* We need to copy buf to a persistent storage */ + auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size()); + + if (parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_ID || + parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) { + /* Lowercase */ + rspamd_str_copy_lc(parser_env.buf.data(), s, parser_env.buf.size()); + } + else { + memcpy(s, parser_env.buf.data(), parser_env.buf.size()); + } + + auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size()); + tag->components.emplace_back(parser_env.cur_component.value(), + std::string_view{s, sz}); + } + } + + parser_env.buf.clear(); + parser_env.cur_component = std::nullopt; + }; + + auto store_component_name = [&]() -> bool { + decode_html_entitles_inplace(parser_env.buf); + auto known_component_it = html_components_map.find(std::string_view{parser_env.buf}); + parser_env.buf.clear(); + + if (known_component_it != html_components_map.end()) { + parser_env.cur_component = known_component_it->second; + + return true; + } + else { + parser_env.cur_component = std::nullopt; + } + + return false; + }; + + auto store_value_character = [&](bool lc) -> void { + auto c = lc ? g_ascii_tolower(*in) : *in; + + if (c == '\0') { + /* Replace with u0FFD */ + parser_env.buf.append((const char *) u8"\uFFFD"); + } + else { + parser_env.buf.push_back(c); + } + }; + + switch (state) { + case parse_start: + if (!g_ascii_isalpha(*in) && !g_ascii_isspace(*in)) { + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + state = ignore_bad_tag; + tag->id = N_TAGS; + tag->flags |= FL_BROKEN; + } + else if (g_ascii_isalpha(*in)) { + state = parse_name; + store_value_character(true); + } + break; + + case parse_name: + if ((g_ascii_isspace(*in) || *in == '>' || *in == '/')) { + if (*in == '/') { + tag->flags |= FL_CLOSED; + } + + if (parser_env.buf.empty()) { + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + tag->id = N_TAGS; + tag->flags |= FL_BROKEN; + state = ignore_bad_tag; + } + else { + decode_html_entitles_inplace(parser_env.buf); + const auto *tag_def = rspamd::html::html_tags_defs.by_name(parser_env.buf); + + if (tag_def == nullptr) { + hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS; + /* Assign -hash to match closing tag if needed */ + auto nhash = static_cast<std::int32_t>(std::hash<std::string>{}(parser_env.buf)); + /* Always negative */ + tag->id = static_cast<tag_id_t>(nhash | G_MININT32); + } + else { + tag->id = tag_def->id; + tag->flags = tag_def->flags; + } + + parser_env.buf.clear(); + + state = spaces_after_param; + } + } + else { + store_value_character(true); + } + break; + + case parse_attr_name: + if (*in == '=') { + if (!parser_env.buf.empty()) { + store_component_name(); + } + state = parse_equal; + } + else if (g_ascii_isspace(*in)) { + store_component_name(); + state = spaces_before_eq; + } + else if (*in == '/') { + store_component_name(); + store_component_value(); + state = slash_after_value; + } + else if (*in == '>') { + store_component_name(); + store_component_value(); + state = tag_end; + } + else { + if (*in == '"' || *in == '\'' || *in == '<') { + /* Should never be in attribute names but ignored */ + tag->flags |= FL_BROKEN; + } + + store_value_character(true); + } + + break; + + case spaces_before_eq: + if (*in == '=') { + state = parse_equal; + } + else if (!g_ascii_isspace(*in)) { + /* + * HTML defines that crap could still be restored and + * calculated somehow... So we have to follow this stupid behaviour + */ + /* + * TODO: estimate what insane things do email clients in each case + */ + if (*in == '>') { + /* + * Attribute name followed by end of tag + * Should be okay (empty attribute). The rest is handled outside + * this automata. + */ + store_component_value(); + state = tag_end; + } + else if (*in == '"' || *in == '\'' || *in == '<') { + /* Attribute followed by quote... Missing '=' ? Dunno, need to test */ + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + tag->flags |= FL_BROKEN; + store_component_value(); + store_value_character(true); + state = spaces_after_param; + } + else { + /* Empty attribute */ + store_component_value(); + store_value_character(true); + state = spaces_after_param; + } + } + break; + + case spaces_after_eq: + if (*in == '"') { + state = parse_start_dquote; + } + else if (*in == '\'') { + state = parse_start_squote; + } + else if (!g_ascii_isspace(*in)) { + store_value_character(true); + state = parse_value; + } + break; + + case parse_equal: + if (g_ascii_isspace(*in)) { + state = spaces_after_eq; + } + else if (*in == '"') { + state = parse_start_dquote; + } + else if (*in == '\'') { + state = parse_start_squote; + } + else { + store_value_character(true); + state = parse_value; + } + break; + + case parse_start_dquote: + if (*in == '"') { + state = spaces_after_param; + } + else { + store_value_character(false); + state = parse_dqvalue; + } + break; + + case parse_start_squote: + if (*in == '\'') { + state = spaces_after_param; + } + else { + store_value_character(false); + state = parse_sqvalue; + } + break; + + case parse_dqvalue: + if (*in == '"') { + store_component_value(); + state = parse_end_dquote; + } + else { + store_value_character(false); + } + break; + + case parse_sqvalue: + if (*in == '\'') { + store_component_value(); + state = parse_end_squote; + } + else { + store_value_character(false); + } + + break; + + case parse_value: + if (*in == '/') { + state = slash_in_unquoted_value; + } + else if (g_ascii_isspace(*in) || *in == '>' || *in == '"') { + store_component_value(); + state = spaces_after_param; + } + else { + store_value_character(false); + } + break; + + case parse_end_dquote: + case parse_end_squote: + if (g_ascii_isspace(*in)) { + state = spaces_after_param; + } + else if (*in == '/') { + store_component_value(); + store_value_character(true); + state = slash_after_value; + } + else { + /* No space, proceed immediately to the attribute name */ + state = parse_attr_name; + store_component_value(); + store_value_character(true); + } + break; + + case spaces_after_param: + if (!g_ascii_isspace(*in)) { + if (*in == '/') { + state = slash_after_value; + } + else if (*in == '=') { + /* Attributes cannot start with '=' */ + tag->flags |= FL_BROKEN; + store_value_character(true); + state = parse_attr_name; + } + else { + store_value_character(true); + state = parse_attr_name; + } + } + break; + case slash_after_value: + if (*in == '>') { + tag->flags |= FL_CLOSED; + state = tag_end; + } + else if (!g_ascii_isspace(*in)) { + tag->flags |= FL_BROKEN; + state = parse_attr_name; + } + break; + case slash_in_unquoted_value: + if (*in == '>') { + /* That slash was in fact closing tag slash, woohoo */ + tag->flags |= FL_CLOSED; + state = tag_end; + store_component_value(); + } + else { + /* Welcome to the world of html, revert state and save missing / */ + parser_env.buf.push_back('/'); + store_value_character(false); + state = parse_value; + } + break; + case ignore_bad_tag: + case tag_end: + break; + } + + parser_env.cur_state = state; +} + +static inline auto +html_is_absolute_url(std::string_view st) -> bool +{ + auto alnum_pos = std::find_if(std::begin(st), std::end(st), + [](auto c) { return !g_ascii_isalnum(c); }); + + if (alnum_pos != std::end(st) && alnum_pos != std::begin(st)) { + if (*alnum_pos == ':') { + if (st.substr(0, std::distance(std::begin(st), alnum_pos)) == "mailto") { + return true; + } + + std::advance(alnum_pos, 1); + if (alnum_pos != std::end(st)) { + /* Include even malformed urls */ + if (*alnum_pos == '/' || *alnum_pos == '\\') { + return true; + } + } + } + } + + return false; +} + +static auto +html_process_url_tag(rspamd_mempool_t *pool, + struct html_tag *tag, + struct html_content *hc) -> std::optional<struct rspamd_url *> +{ + auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF); + + if (found_href_maybe) { + /* Check base url */ + auto &href_value = found_href_maybe.value(); + + if (hc && hc->base_url) { + /* + * Relative url cannot start from the following: + * schema:// + * data: + * slash + */ + + if (!html_is_absolute_url(href_value)) { + + if (href_value.size() >= sizeof("data:") && + g_ascii_strncasecmp(href_value.data(), "data:", sizeof("data:") - 1) == 0) { + /* Image data url, never insert as url */ + return std::nullopt; + } + + /* Assume relative url */ + auto need_slash = false; + + auto orig_len = href_value.size(); + auto len = orig_len + hc->base_url->urllen; + + if (hc->base_url->datalen == 0) { + need_slash = true; + len++; + } + + auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1); + auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1, + "%*s%s%*s", + (int) hc->base_url->urllen, hc->base_url->string, + need_slash ? "/" : "", + (gint) orig_len, href_value.data()); + href_value = {buf, nlen}; + } + else if (href_value.size() > 2 && href_value[0] == '/' && href_value[1] != '/') { + /* Relative to the hostname */ + auto orig_len = href_value.size(); + auto len = orig_len + hc->base_url->hostlen + hc->base_url->protocollen + + 3 /* for :// */; + auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1); + auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s", + (int) hc->base_url->protocollen, hc->base_url->string, + (int) hc->base_url->hostlen, rspamd_url_host_unsafe(hc->base_url), + (gint) orig_len, href_value.data()); + href_value = {buf, nlen}; + } + } + + auto url = html_process_url(pool, href_value).value_or(nullptr); + + if (url) { + if (tag->id != Tag_A) { + /* Mark special tags special */ + url->flags |= RSPAMD_URL_FLAG_SPECIAL; + } + + if (std::holds_alternative<std::monostate>(tag->extra)) { + tag->extra = url; + } + + return url; + } + + return std::nullopt; + } + + return std::nullopt; +} + +struct rspamd_html_url_query_cbd { + rspamd_mempool_t *pool; + khash_t(rspamd_url_hash) * url_set; + struct rspamd_url *url; + GPtrArray *part_urls; +}; + +static gboolean +html_url_query_callback(struct rspamd_url *url, gsize start_offset, + gsize end_offset, gpointer ud) +{ + struct rspamd_html_url_query_cbd *cbd = + (struct rspamd_html_url_query_cbd *) ud; + rspamd_mempool_t *pool; + + pool = cbd->pool; + + if (url->protocol == PROTOCOL_MAILTO) { + if (url->userlen == 0) { + return FALSE; + } + } + + msg_debug_html("found url %s in query of url" + " %*s", + url->string, + cbd->url->querylen, rspamd_url_query_unsafe(cbd->url)); + + url->flags |= RSPAMD_URL_FLAG_QUERY; + + if (rspamd_url_set_add_or_increase(cbd->url_set, url, false) && cbd->part_urls) { + g_ptr_array_add(cbd->part_urls, url); + } + + return TRUE; +} + +static void +html_process_query_url(rspamd_mempool_t *pool, struct rspamd_url *url, + khash_t(rspamd_url_hash) * url_set, + GPtrArray *part_urls) +{ + if (url->querylen > 0) { + struct rspamd_html_url_query_cbd qcbd; + + qcbd.pool = pool; + qcbd.url_set = url_set; + qcbd.url = url; + qcbd.part_urls = part_urls; + + rspamd_url_find_multiple(pool, + rspamd_url_query_unsafe(url), url->querylen, + RSPAMD_URL_FIND_ALL, NULL, + html_url_query_callback, &qcbd); + } + + if (part_urls) { + g_ptr_array_add(part_urls, url); + } +} + +static auto +html_process_data_image(rspamd_mempool_t *pool, + struct html_image *img, + std::string_view input) -> void +{ + /* + * Here, we do very basic processing of the data: + * detect if we have something like: `data:image/xxx;base64,yyyzzz==` + * We only parse base64 encoded data. + * We ignore content type so far + */ + struct rspamd_image *parsed_image; + const gchar *semicolon_pos = input.data(), + *end = input.data() + input.size(); + + if ((semicolon_pos = (const gchar *) memchr(semicolon_pos, ';', end - semicolon_pos)) != NULL) { + if (end - semicolon_pos > sizeof("base64,")) { + if (memcmp(semicolon_pos + 1, "base64,", sizeof("base64,") - 1) == 0) { + const gchar *data_pos = semicolon_pos + sizeof("base64,"); + gchar *decoded; + gsize encoded_len = end - data_pos, decoded_len; + rspamd_ftok_t inp; + + decoded_len = (encoded_len / 4 * 3) + 12; + decoded = rspamd_mempool_alloc_buffer(pool, decoded_len); + rspamd_cryptobox_base64_decode(data_pos, encoded_len, + reinterpret_cast<guchar *>(decoded), &decoded_len); + inp.begin = decoded; + inp.len = decoded_len; + + parsed_image = rspamd_maybe_process_image(pool, &inp); + + if (parsed_image) { + msg_debug_html("detected %s image of size %ud x %ud in data url", + rspamd_image_type_str(parsed_image->type), + parsed_image->width, parsed_image->height); + img->embedded_image = parsed_image; + } + } + } + else { + /* Nothing useful */ + return; + } + } +} + +static void +html_process_img_tag(rspamd_mempool_t *pool, + struct html_tag *tag, + struct html_content *hc, + khash_t(rspamd_url_hash) * url_set, + GPtrArray *part_urls) +{ + struct html_image *img; + + img = rspamd_mempool_alloc0_type(pool, struct html_image); + img->tag = tag; + + for (const auto ¶m: tag->components) { + + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) { + /* Check base url */ + const auto &href_value = param.value; + + if (href_value.size() > 0) { + rspamd_ftok_t fstr; + fstr.begin = href_value.data(); + fstr.len = href_value.size(); + img->src = rspamd_mempool_ftokdup(pool, &fstr); + + if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(), + "cid:", sizeof("cid:") - 1) == 0) { + /* We have an embedded image */ + img->src += sizeof("cid:") - 1; + img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED; + } + else { + if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(), + "data:", sizeof("data:") - 1) == 0) { + /* We have an embedded image in HTML tag */ + img->flags |= + (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA); + html_process_data_image(pool, img, href_value); + hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS; + } + else { + img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL; + if (img->src) { + + std::string_view cpy{href_value}; + auto maybe_url = html_process_url(pool, cpy); + + if (maybe_url) { + img->url = maybe_url.value(); + struct rspamd_url *existing; + + img->url->flags |= RSPAMD_URL_FLAG_IMAGE; + existing = rspamd_url_set_add_or_return(url_set, + img->url); + + if (existing && existing != img->url) { + /* + * We have some other URL that could be + * found, e.g. from another part. However, + * we still want to set an image flag on it + */ + existing->flags |= img->url->flags; + existing->count++; + } + else if (part_urls) { + /* New url */ + g_ptr_array_add(part_urls, img->url); + } + } + } + } + } + } + } + + + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) { + unsigned long val; + + rspamd_strtoul(param.value.data(), param.value.size(), &val); + img->height = val; + } + + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) { + unsigned long val; + + rspamd_strtoul(param.value.data(), param.value.size(), &val); + img->width = val; + } + + /* TODO: rework to css at some time */ + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) { + if (img->height == 0) { + auto style_st = param.value; + auto pos = rspamd_substring_search_caseless(style_st.data(), + style_st.size(), + "height", sizeof("height") - 1); + if (pos != -1) { + auto substr = style_st.substr(pos + sizeof("height") - 1); + + for (auto i = 0; i < substr.size(); i++) { + auto t = substr[i]; + if (g_ascii_isdigit(t)) { + unsigned long val; + rspamd_strtoul(substr.data(), + substr.size(), &val); + img->height = val; + break; + } + else if (!g_ascii_isspace(t) && t != '=' && t != ':') { + /* Fallback */ + break; + } + } + } + } + if (img->width == 0) { + auto style_st = param.value; + auto pos = rspamd_substring_search_caseless(style_st.data(), + style_st.size(), + "width", sizeof("width") - 1); + if (pos != -1) { + auto substr = style_st.substr(pos + sizeof("width") - 1); + + for (auto i = 0; i < substr.size(); i++) { + auto t = substr[i]; + if (g_ascii_isdigit(t)) { + unsigned long val; + rspamd_strtoul(substr.data(), + substr.size(), &val); + img->width = val; + break; + } + else if (!g_ascii_isspace(t) && t != '=' && t != ':') { + /* Fallback */ + break; + } + } + } + } + } + } + + if (img->embedded_image) { + if (img->height == 0) { + img->height = img->embedded_image->height; + } + if (img->width == 0) { + img->width = img->embedded_image->width; + } + } + + hc->images.push_back(img); + + if (std::holds_alternative<std::monostate>(tag->extra)) { + tag->extra = img; + } +} + +static auto +html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag, + struct html_content *hc, + khash_t(rspamd_url_hash) * url_set, + GPtrArray *part_urls) -> void +{ + auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL); + + if (found_rel_maybe) { + if (found_rel_maybe.value() == "icon") { + html_process_img_tag(pool, tag, hc, url_set, part_urls); + } + } +} + +static auto +html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag, + struct html_content *hc) -> void +{ + std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor; + bool hidden = false; + + for (const auto ¶m: tag->components) { + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) { + maybe_fgcolor = css::css_value::maybe_color_from_string(param.value); + } + + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) { + maybe_bgcolor = css::css_value::maybe_color_from_string(param.value); + } + + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) { + tag->block = rspamd::css::parse_css_declaration(pool, param.value); + } + + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN) { + hidden = true; + } + } + + if (!tag->block) { + tag->block = html_block::undefined_html_block_pool(pool); + } + + if (hidden) { + tag->block->set_display(false); + } + + if (maybe_fgcolor) { + tag->block->set_fgcolor(maybe_fgcolor->to_color().value()); + } + + if (maybe_bgcolor) { + tag->block->set_bgcolor(maybe_bgcolor->to_color().value()); + } +} + +static inline auto +html_append_parsed(struct html_content *hc, + std::string_view data, + bool transparent, + std::size_t input_len, + std::string &dest) -> std::size_t +{ + auto cur_offset = dest.size(); + + if (dest.size() > input_len) { + /* Impossible case, refuse to append */ + return 0; + } + + if (data.size() > 0) { + /* Handle multiple spaces at the begin */ + + if (cur_offset > 0) { + auto last = dest.back(); + if (!g_ascii_isspace(last) && g_ascii_isspace(data.front())) { + dest.append(" "); + data = {data.data() + 1, data.size() - 1}; + cur_offset++; + } + } + + if (data.find('\0') != std::string_view::npos) { + auto replace_zero_func = [](const auto &input, auto &output) { + const auto last = input.cend(); + for (auto it = input.cbegin(); it != last; ++it) { + if (*it == '\0') { + output.append((const char *) u8"\uFFFD"); + } + else { + output.push_back(*it); + } + } + }; + + dest.reserve(dest.size() + data.size() + sizeof(u8"\uFFFD")); + replace_zero_func(data, dest); + hc->flags |= RSPAMD_HTML_FLAG_HAS_ZEROS; + } + else { + dest.append(data); + } + } + + auto nlen = decode_html_entitles_inplace(dest.data() + cur_offset, + dest.size() - cur_offset, true); + + dest.resize(nlen + cur_offset); + + if (transparent) { + /* Replace all visible characters with spaces */ + auto start = std::next(dest.begin(), cur_offset); + std::replace_if( + start, std::end(dest), [](const auto c) { + return !g_ascii_isspace(c); + }, + ' '); + } + + return nlen; +} + +static auto +html_process_displayed_href_tag(rspamd_mempool_t *pool, + struct html_content *hc, + std::string_view data, + const struct html_tag *cur_tag, + GList **exceptions, + khash_t(rspamd_url_hash) * url_set, + goffset dest_offset) -> void +{ + + if (std::holds_alternative<rspamd_url *>(cur_tag->extra)) { + auto *url = std::get<rspamd_url *>(cur_tag->extra); + + html_check_displayed_url(pool, + exceptions, url_set, + data, + dest_offset, + url); + } +} + +static auto +html_append_tag_content(rspamd_mempool_t *pool, + const gchar *start, gsize len, + struct html_content *hc, + html_tag *tag, + GList **exceptions, + khash_t(rspamd_url_hash) * url_set) -> goffset +{ + auto is_visible = true, is_block = false, is_spaces = false, is_transparent = false; + goffset next_tag_offset = tag->closing.end, + initial_parsed_offset = hc->parsed.size(), + initial_invisible_offset = hc->invisible.size(); + + auto calculate_final_tag_offsets = [&]() -> void { + if (is_visible) { + tag->content_offset = initial_parsed_offset; + tag->closing.start = hc->parsed.size(); + } + else { + tag->content_offset = initial_invisible_offset; + tag->closing.start = hc->invisible.size(); + } + }; + + if (tag->closing.end == -1) { + if (tag->closing.start != -1) { + next_tag_offset = tag->closing.start; + tag->closing.end = tag->closing.start; + } + else { + next_tag_offset = tag->content_offset; + tag->closing.end = tag->content_offset; + } + } + if (tag->closing.start == -1) { + tag->closing.start = tag->closing.end; + } + + auto append_margin = [&](char c) -> void { + /* We do care about visible margins only */ + if (is_visible) { + if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') { + if (hc->parsed.back() == ' ') { + /* We also strip extra spaces at the end, but limiting the start */ + auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_parsed_offset); + auto first = std::find_if(hc->parsed.rbegin(), last, + [](auto ch) -> auto { + return ch != ' '; + }); + hc->parsed.erase(first.base(), hc->parsed.end()); + g_assert(hc->parsed.size() >= initial_parsed_offset); + } + hc->parsed.push_back(c); + } + } + }; + + if (tag->id == Tag_BR || tag->id == Tag_HR) { + + if (!(tag->flags & FL_IGNORE)) { + hc->parsed.append("\n"); + } + + auto ret = tag->content_offset; + calculate_final_tag_offsets(); + + return ret; + } + else if ((tag->id == Tag_HEAD && (tag->flags & FL_IGNORE)) || (tag->flags & CM_HEAD)) { + auto ret = tag->closing.end; + calculate_final_tag_offsets(); + + return ret; + } + + if ((tag->flags & (FL_COMMENT | FL_XML | FL_IGNORE | CM_HEAD))) { + is_visible = false; + } + else { + if (!tag->block) { + is_visible = true; + } + else if (!tag->block->is_visible()) { + if (!tag->block->is_transparent()) { + is_visible = false; + } + else { + if (tag->block->has_display() && + tag->block->display == css::css_display_value::DISPLAY_HIDDEN) { + is_visible = false; + } + else { + is_transparent = true; + } + } + } + else { + if (tag->block->display == css::css_display_value::DISPLAY_BLOCK) { + is_block = true; + } + else if (tag->block->display == css::css_display_value::DISPLAY_TABLE_ROW) { + is_spaces = true; + } + } + } + + if (is_block) { + append_margin('\n'); + } + else if (is_spaces) { + append_margin(' '); + } + + goffset cur_offset = tag->content_offset; + + for (auto *cld: tag->children) { + auto enclosed_start = cld->tag_start; + goffset initial_part_len = enclosed_start - cur_offset; + + if (initial_part_len > 0) { + if (is_visible) { + html_append_parsed(hc, + {start + cur_offset, std::size_t(initial_part_len)}, + is_transparent, len, hc->parsed); + } + else { + html_append_parsed(hc, + {start + cur_offset, std::size_t(initial_part_len)}, + is_transparent, len, hc->invisible); + } + } + + auto next_offset = html_append_tag_content(pool, start, len, + hc, cld, exceptions, url_set); + + /* Do not allow shifting back */ + if (next_offset > cur_offset) { + cur_offset = next_offset; + } + } + + if (cur_offset < tag->closing.start) { + goffset final_part_len = tag->closing.start - cur_offset; + + if (final_part_len > 0) { + if (is_visible) { + html_append_parsed(hc, + {start + cur_offset, std::size_t(final_part_len)}, + is_transparent, + len, + hc->parsed); + } + else { + html_append_parsed(hc, + {start + cur_offset, std::size_t(final_part_len)}, + is_transparent, + len, + hc->invisible); + } + } + } + if (is_block) { + append_margin('\n'); + } + else if (is_spaces) { + append_margin(' '); + } + + if (is_visible) { + if (tag->id == Tag_A) { + auto written_len = hc->parsed.size() - initial_parsed_offset; + html_process_displayed_href_tag(pool, hc, + {hc->parsed.data() + initial_parsed_offset, std::size_t(written_len)}, + tag, exceptions, + url_set, initial_parsed_offset); + } + else if (tag->id == Tag_IMG) { + /* Process ALT if presented */ + auto maybe_alt = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_ALT); + + if (maybe_alt) { + if (!hc->parsed.empty() && !g_ascii_isspace(hc->parsed.back())) { + /* Add a space */ + hc->parsed += ' '; + } + + hc->parsed.append(maybe_alt.value()); + + if (!hc->parsed.empty() && !g_ascii_isspace(hc->parsed.back())) { + /* Add a space */ + hc->parsed += ' '; + } + } + } + } + else { + /* Invisible stuff */ + if (std::holds_alternative<rspamd_url *>(tag->extra)) { + auto *url_enclosed = std::get<rspamd_url *>(tag->extra); + + /* + * TODO: when hash is fixed to include flags we need to remove and add + * url to the hash set + */ + if (url_enclosed) { + url_enclosed->flags |= RSPAMD_URL_FLAG_INVISIBLE; + } + } + } + + calculate_final_tag_offsets(); + + return next_tag_offset; +} + +auto html_process_input(struct rspamd_task *task, + GByteArray *in, + GList **exceptions, + khash_t(rspamd_url_hash) * url_set, + GPtrArray *part_urls, + bool allow_css, + std::uint16_t *cur_url_order) -> html_content * +{ + const gchar *p, *c, *end, *start; + guchar t; + auto closing = false; + guint obrace = 0, ebrace = 0; + struct rspamd_url *url = nullptr; + gint href_offset = -1; + auto overflow_input = false; + struct html_tag *cur_tag = nullptr, *parent_tag = nullptr, cur_closing_tag; + struct tag_content_parser_state content_parser_env; + auto process_size = in->len; + + + enum { + parse_start = 0, + content_before_start, + tag_begin, + sgml_tag, + xml_tag, + compound_tag, + comment_tag, + comment_content, + sgml_content, + tag_content, + tag_end_opening, + tag_end_closing, + html_text_content, + xml_tag_end, + tag_raw_text, + tag_raw_text_less_than, + tags_limit_overflow, + } state = parse_start; + + enum class html_document_state { + doctype, + head, + body + } html_document_state = html_document_state::doctype; + + g_assert(in != NULL); + g_assert(task != NULL); + + auto *pool = task->task_pool; + auto cur_url_part_order = 0u; + + auto *hc = new html_content; + rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc); + + if (task->cfg && in->len > task->cfg->max_html_len) { + msg_notice_task("html input is too big: %z, limit is %z", + in->len, + task->cfg->max_html_len); + process_size = task->cfg->max_html_len; + overflow_input = true; + } + + auto new_tag = [&](int flags = 0) -> struct html_tag * + { + + if (hc->all_tags.size() > rspamd::html::max_tags) { + hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS; + + return nullptr; + } + + hc->all_tags.emplace_back(std::make_unique<html_tag>()); + auto *ntag = hc->all_tags.back().get(); + ntag->tag_start = c - start; + ntag->flags = flags; + + if (cur_tag && !(cur_tag->flags & (CM_EMPTY | FL_CLOSED)) && cur_tag != &cur_closing_tag) { + parent_tag = cur_tag; + } + + if (flags & FL_XML) { + return ntag; + } + + return ntag; + }; + + auto process_opening_tag = [&]() { + if (cur_tag->id > Tag_UNKNOWN) { + if (cur_tag->flags & CM_UNIQUE) { + if (!hc->tags_seen[cur_tag->id]) { + /* Duplicate tag has been found */ + hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS; + } + } + hc->tags_seen[cur_tag->id] = true; + } + + /* Shift to the first unclosed tag */ + auto *pt = parent_tag; + while (pt && (pt->flags & FL_CLOSED)) { + pt = pt->parent; + } + + if (pt) { + g_assert(cur_tag != pt); + cur_tag->parent = pt; + g_assert(cur_tag->parent != &cur_closing_tag); + parent_tag = pt; + parent_tag->children.push_back(cur_tag); + } + else { + if (hc->root_tag) { + if (cur_tag != hc->root_tag) { + cur_tag->parent = hc->root_tag; + g_assert(cur_tag->parent != cur_tag); + hc->root_tag->children.push_back(cur_tag); + parent_tag = hc->root_tag; + } + } + else { + if (cur_tag->id == Tag_HTML) { + hc->root_tag = cur_tag; + } + else { + /* Insert a fake html tag */ + hc->all_tags.emplace_back(std::make_unique<html_tag>()); + auto *top_tag = hc->all_tags.back().get(); + top_tag->tag_start = 0; + top_tag->flags = FL_VIRTUAL; + top_tag->id = Tag_HTML; + top_tag->content_offset = 0; + top_tag->children.push_back(cur_tag); + cur_tag->parent = top_tag; + g_assert(cur_tag->parent != cur_tag); + hc->root_tag = top_tag; + parent_tag = top_tag; + } + } + } + + if (cur_tag->flags & FL_HREF && html_document_state == html_document_state::body) { + auto maybe_url = html_process_url_tag(pool, cur_tag, hc); + + if (maybe_url.has_value()) { + url = maybe_url.value(); + + if (url_set != NULL) { + struct rspamd_url *maybe_existing = + rspamd_url_set_add_or_return(url_set, maybe_url.value()); + if (maybe_existing == maybe_url.value()) { + if (cur_url_order) { + url->order = (*cur_url_order)++; + } + url->part_order = cur_url_part_order++; + html_process_query_url(pool, url, url_set, + part_urls); + } + else { + url = maybe_existing; + /* Replace extra as well */ + cur_tag->extra = maybe_existing; + /* Increase count to avoid odd checks failure */ + url->count++; + } + } + if (part_urls) { + g_ptr_array_add(part_urls, url); + } + + href_offset = hc->parsed.size(); + } + } + else if (cur_tag->id == Tag_BASE) { + /* + * Base is allowed only within head tag but HTML is retarded + */ + auto maybe_url = html_process_url_tag(pool, cur_tag, hc); + + if (maybe_url) { + msg_debug_html("got valid base tag"); + cur_tag->extra = maybe_url.value(); + cur_tag->flags |= FL_HREF; + + if (hc->base_url == nullptr) { + hc->base_url = maybe_url.value(); + } + else { + msg_debug_html("ignore redundant base tag"); + } + } + else { + msg_debug_html("got invalid base tag!"); + } + } + + if (cur_tag->id == Tag_IMG) { + html_process_img_tag(pool, cur_tag, hc, url_set, + part_urls); + } + else if (cur_tag->id == Tag_LINK) { + html_process_link_tag(pool, cur_tag, hc, url_set, + part_urls); + } + + if (!(cur_tag->flags & CM_EMPTY)) { + html_process_block_tag(pool, cur_tag, hc); + } + else { + /* Implicitly close */ + cur_tag->flags |= FL_CLOSED; + } + + if (cur_tag->flags & FL_CLOSED) { + cur_tag->closing.end = cur_tag->content_offset; + cur_tag->closing.start = cur_tag->tag_start; + + cur_tag = parent_tag; + } + }; + + p = (const char *) in->data; + c = p; + end = p + process_size; + start = c; + + while (p < end) { + t = *p; + + switch (state) { + case parse_start: + if (t == '<') { + state = tag_begin; + } + else { + /* We have no starting tag, so assume that it's content */ + hc->flags |= RSPAMD_HTML_FLAG_BAD_START; + cur_tag = new_tag(); + html_document_state = html_document_state::body; + + if (cur_tag) { + cur_tag->id = Tag_HTML; + hc->root_tag = cur_tag; + state = content_before_start; + } + else { + state = tags_limit_overflow; + } + } + break; + case content_before_start: + if (t == '<') { + state = tag_begin; + } + else { + p++; + } + break; + case tag_begin: + switch (t) { + case '<': + c = p; + p++; + closing = FALSE; + break; + case '!': + cur_tag = new_tag(FL_XML | FL_CLOSED); + if (cur_tag) { + state = sgml_tag; + } + else { + state = tags_limit_overflow; + } + p++; + break; + case '?': + cur_tag = new_tag(FL_XML | FL_CLOSED); + if (cur_tag) { + state = xml_tag; + } + else { + state = tags_limit_overflow; + } + hc->flags |= RSPAMD_HTML_FLAG_XML; + p++; + break; + case '/': + closing = TRUE; + /* We fill fake closing tag to fill it with the content parser */ + cur_closing_tag.clear(); + /* + * For closing tags, we need to find some corresponding opening tag. + * However, at this point we have not even parsed a name, so we + * can not assume anything about balancing, etc. + * + * So we need to ensure that: + * 1) We have some opening tag in the chain cur_tag->parent... + * 2) cur_tag is nullptr - okay, html is just brain damaged + * 3) cur_tag must NOT be equal to cur_closing tag. It means that + * we had some poor closing tag but we still need to find an opening + * tag... Somewhere... + */ + + if (cur_tag == &cur_closing_tag) { + if (parent_tag != &cur_closing_tag) { + cur_closing_tag.parent = parent_tag; + } + else { + cur_closing_tag.parent = nullptr; + } + } + else if (cur_tag && cur_tag->flags & FL_CLOSED) { + /* Cur tag is already closed, we should find something else */ + auto *tmp = cur_tag; + while (tmp) { + tmp = tmp->parent; + + if (tmp == nullptr || !(tmp->flags & FL_CLOSED)) { + break; + } + } + + cur_closing_tag.parent = tmp; + } + else { + cur_closing_tag.parent = cur_tag; + } + + cur_tag = &cur_closing_tag; + p++; + break; + case '>': + /* Empty tag */ + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + state = html_text_content; + continue; + default: + if (g_ascii_isalpha(t)) { + state = tag_content; + content_parser_env.reset(); + + if (!closing) { + cur_tag = new_tag(); + } + + if (cur_tag) { + state = tag_content; + } + else { + state = tags_limit_overflow; + } + } + else { + /* Wrong bad tag */ + state = html_text_content; + } + break; + } + + break; + + case sgml_tag: + switch (t) { + case '[': + state = compound_tag; + obrace = 1; + ebrace = 0; + p++; + break; + case '-': + cur_tag->flags |= FL_COMMENT; + state = comment_tag; + p++; + break; + default: + state = sgml_content; + break; + } + + break; + + case xml_tag: + if (t == '?') { + state = xml_tag_end; + } + else if (t == '>') { + /* Misformed xml tag */ + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + state = tag_end_opening; + continue; + } + /* We efficiently ignore xml tags */ + p++; + break; + + case xml_tag_end: + if (t == '>') { + state = tag_end_opening; + cur_tag->content_offset = p - start + 1; + continue; + } + else { + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + } + p++; + break; + + case compound_tag: + if (t == '[') { + obrace++; + } + else if (t == ']') { + ebrace++; + } + else if (t == '>' && obrace == ebrace) { + state = tag_end_opening; + cur_tag->content_offset = p - start + 1; + continue; + } + p++; + break; + + case comment_tag: + if (t != '-') { + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + state = tag_end_opening; + } + else { + p++; + ebrace = 0; + /* + * https://www.w3.org/TR/2012/WD-html5-20120329/syntax.html#syntax-comments + * ... the text must not start with a single + * U+003E GREATER-THAN SIGN character (>), + * nor start with a "-" (U+002D) character followed by + * a U+003E GREATER-THAN SIGN (>) character, + * nor contain two consecutive U+002D HYPHEN-MINUS + * characters (--), nor end with a "-" (U+002D) character. + */ + if (p[0] == '-' && p + 1 < end && p[1] == '>') { + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + p++; + state = tag_end_opening; + } + else if (*p == '>') { + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + state = tag_end_opening; + } + else { + state = comment_content; + } + } + break; + + case comment_content: + if (t == '-') { + ebrace++; + } + else if (t == '>' && ebrace >= 2) { + cur_tag->content_offset = p - start + 1; + state = tag_end_opening; + continue; + } + else { + ebrace = 0; + } + + p++; + break; + + case html_text_content: + if (t != '<') { + p++; + } + else { + state = tag_begin; + } + break; + + case tag_raw_text: + if (t == '<') { + c = p; + state = tag_raw_text_less_than; + } + p++; + break; + case tag_raw_text_less_than: + if (t == '/') { + /* Here are special things: we look for obrace and then ensure + * that if there is any closing brace nearby + * (we look maximum at 30 characters). We also need to ensure + * that we have no special characters, such as punctuation marks and + * so on. + * Basically, we validate the input to be sane. + * Since closing tags must not have attributes, these assumptions + * seems to be reasonable enough for our toy parser. + */ + gint cur_lookahead = 1; + gint max_lookahead = MIN(end - p, 30); + bool valid_closing_tag = true; + + if (p + 1 < end && !g_ascii_isalpha(p[1])) { + valid_closing_tag = false; + } + else { + while (cur_lookahead < max_lookahead) { + gchar tt = p[cur_lookahead]; + if (tt == '>') { + break; + } + else if (tt < '\n' || tt == ',') { + valid_closing_tag = false; + break; + } + cur_lookahead++; + } + + if (cur_lookahead == max_lookahead) { + valid_closing_tag = false; + } + } + + if (valid_closing_tag) { + /* Shift back */ + p = c; + state = tag_begin; + } + else { + p++; + state = tag_raw_text; + } + } + else { + p++; + state = tag_raw_text; + } + break; + case sgml_content: + /* TODO: parse DOCTYPE here */ + if (t == '>') { + cur_tag->content_offset = p - start + 1; + state = tag_end_opening; + } + else { + p++; + } + break; + + case tag_content: + html_parse_tag_content(pool, hc, cur_tag, p, content_parser_env); + + if (t == '>') { + if (content_parser_env.cur_state != parse_dqvalue && content_parser_env.cur_state != parse_sqvalue) { + /* We have a closing element */ + if (closing) { + cur_tag->closing.start = c - start; + cur_tag->closing.end = p - start + 1; + + closing = FALSE; + state = tag_end_closing; + } + else { + cur_tag->content_offset = p - start + 1; + state = tag_end_opening; + } + } + else { + /* + * We are in the parse_quoted value state but got + * an unescaped `>` character. + * HTML is written for monkeys, so there are two possibilities: + * 1) We have missing ending quote + * 2) We have unescaped `>` character + * How to distinguish between those possibilities? + * Well, the idea is to do some lookahead and try to find a + * quote. If we can find a quote, we just pretend as we have + * not seen `>` character. Otherwise, we pretend that it is an + * unquoted stuff. This logic is quite fragile but I really + * don't know any better options... + */ + auto end_quote = content_parser_env.cur_state == parse_sqvalue ? '\'' : '"'; + if (memchr(p, end_quote, end - p) != nullptr) { + /* Unencoded `>` */ + p++; + continue; + } + else { + if (closing) { + cur_tag->closing.start = c - start; + cur_tag->closing.end = p - start + 1; + + closing = FALSE; + state = tag_end_closing; + } + else { + cur_tag->content_offset = p - start + 1; + state = tag_end_opening; + } + } + } + continue; + } + p++; + break; + + case tag_end_opening: + content_parser_env.reset(); + state = html_text_content; + + if (cur_tag) { + if (cur_tag->id == Tag_STYLE || cur_tag->id == Tag_NOSCRIPT || cur_tag->id == Tag_SCRIPT) { + state = tag_raw_text; + } + if (html_document_state == html_document_state::doctype) { + if (cur_tag->id == Tag_HEAD || (cur_tag->flags & CM_HEAD)) { + html_document_state = html_document_state::head; + cur_tag->flags |= FL_IGNORE; + } + else if (cur_tag->id != Tag_HTML) { + html_document_state = html_document_state::body; + } + } + else if (html_document_state == html_document_state::head) { + if (!(cur_tag->flags & (CM_EMPTY | CM_HEAD))) { + if (parent_tag && (parent_tag->id == Tag_HEAD || !(parent_tag->flags & CM_HEAD))) { + /* + * As by standard, we have to close the HEAD tag + * and switch to the body state + */ + parent_tag->flags |= FL_CLOSED; + parent_tag->closing.start = cur_tag->tag_start; + parent_tag->closing.end = cur_tag->content_offset; + + html_document_state = html_document_state::body; + } + else if (cur_tag->id == Tag_BODY) { + html_document_state = html_document_state::body; + } + else { + /* + * For propagation in something like + * <title><p><a>ololo</a></p></title> - should be unprocessed + */ + cur_tag->flags |= CM_HEAD; + } + } + } + + process_opening_tag(); + } + + p++; + c = p; + break; + case tag_end_closing: { + if (cur_tag) { + + if (cur_tag->flags & CM_EMPTY) { + /* Ignore closing empty tags */ + cur_tag->flags |= FL_IGNORE; + } + if (html_document_state == html_document_state::doctype) { + } + else if (html_document_state == html_document_state::head) { + if (cur_tag->id == Tag_HEAD) { + html_document_state = html_document_state::body; + } + } + + /* cur_tag here is a closing tag */ + auto *next_cur_tag = html_check_balance(hc, cur_tag, + c - start, p - start + 1); + + if (cur_tag->id == Tag_STYLE && allow_css) { + auto *opening_tag = cur_tag->parent; + + if (opening_tag && opening_tag->id == Tag_STYLE && + (int) opening_tag->content_offset < opening_tag->closing.start) { + auto ret_maybe = rspamd::css::parse_css(pool, + {start + opening_tag->content_offset, + opening_tag->closing.start - opening_tag->content_offset}, + std::move(hc->css_style)); + + if (!ret_maybe.has_value()) { + if (ret_maybe.error().is_fatal()) { + auto err_str = fmt::format( + "cannot parse css (error code: {}): {}", + static_cast<int>(ret_maybe.error().type), + ret_maybe.error().description.value_or("unknown error")); + msg_info_pool("%*s", (int) err_str.size(), err_str.data()); + } + } + else { + hc->css_style = ret_maybe.value(); + } + } + } + + if (next_cur_tag != nullptr) { + cur_tag = next_cur_tag; + } + else { + /* + * Here, we handle cases like <p>lala</b>... + * So the tag </b> is bogus and unpaired + * However, we need to exclude it from the output of <p> tag + * To do that, we create a fake opening tag and insert that to + * the current opening tag + */ + auto *cur_opening_tag = cur_tag->parent; + + while (cur_opening_tag && (cur_opening_tag->flags & FL_CLOSED)) { + cur_opening_tag = cur_opening_tag->parent; + } + + if (!cur_opening_tag) { + cur_opening_tag = hc->root_tag; + } + + auto &&vtag = std::make_unique<html_tag>(); + vtag->id = cur_tag->id; + vtag->flags = FL_VIRTUAL | FL_CLOSED | cur_tag->flags; + vtag->tag_start = cur_tag->closing.start; + vtag->content_offset = p - start + 1; + vtag->closing = cur_tag->closing; + vtag->parent = cur_opening_tag; + g_assert(vtag->parent != &cur_closing_tag); + cur_opening_tag->children.push_back(vtag.get()); + hc->all_tags.emplace_back(std::move(vtag)); + cur_tag = cur_opening_tag; + parent_tag = cur_tag->parent; + g_assert(cur_tag->parent != &cur_closing_tag); + } + } /* if cur_tag != nullptr */ + state = html_text_content; + p++; + c = p; + break; + } + case tags_limit_overflow: + msg_warn_pool("tags limit of %d tags is reached at the position %d;" + " ignoring the rest of the HTML content", + (int) hc->all_tags.size(), (int) (p - start)); + c = p; + p = end; + break; + } + } + + if (cur_tag && !(cur_tag->flags & FL_CLOSED) && cur_tag != &cur_closing_tag) { + cur_closing_tag.parent = cur_tag; + cur_closing_tag.id = cur_tag->id; + cur_tag = &cur_closing_tag; + html_check_balance(hc, cur_tag, + end - start, end - start); + } + + /* Propagate styles */ + hc->traverse_block_tags([&hc, &pool](const html_tag *tag) -> bool { + if (hc->css_style && tag->id > Tag_UNKNOWN && tag->id < Tag_MAX) { + auto *css_block = hc->css_style->check_tag_block(tag); + + if (css_block) { + if (tag->block) { + tag->block->set_block(*css_block); + } + else { + tag->block = css_block; + } + } + } + if (tag->block) { + if (!tag->block->has_display()) { + /* If we have no display field, we can check it by tag */ + if (tag->flags & CM_HEAD) { + tag->block->set_display(css::css_display_value::DISPLAY_HIDDEN, + html_block::set); + } + else if (tag->flags & (CM_BLOCK | CM_TABLE)) { + tag->block->set_display(css::css_display_value::DISPLAY_BLOCK, + html_block::implicit); + } + else if (tag->flags & CM_ROW) { + tag->block->set_display(css::css_display_value::DISPLAY_TABLE_ROW, + html_block::implicit); + } + else { + tag->block->set_display(css::css_display_value::DISPLAY_INLINE, + html_block::implicit); + } + } + + tag->block->compute_visibility(); + + for (const auto *cld_tag: tag->children) { + + if (cld_tag->block) { + cld_tag->block->propagate_block(*tag->block); + } + else { + cld_tag->block = rspamd_mempool_alloc0_type(pool, html_block); + *cld_tag->block = *tag->block; + } + } + } + return true; + }, + html_content::traverse_type::PRE_ORDER); + + /* Leftover before content */ + switch (state) { + case tag_end_opening: + if (cur_tag != nullptr) { + process_opening_tag(); + } + break; + default: + /* Do nothing */ + break; + } + + if (!hc->all_tags.empty() && hc->root_tag) { + html_append_tag_content(pool, start, end - start, hc, hc->root_tag, + exceptions, url_set); + } + + /* Leftover after content */ + switch (state) { + case tags_limit_overflow: + html_append_parsed(hc, {c, (std::size_t)(end - c)}, + false, end - start, hc->parsed); + break; + default: + /* Do nothing */ + break; + } + + if (overflow_input) { + /* + * Append the rest of the input as raw html, this might work as + * further algorithms can skip words when auto *pool = task->task_pool;there are too many. + * It is still unclear about urls though... + */ + html_append_parsed(hc, {end, in->len - process_size}, false, + end - start, hc->parsed); + } + + if (!hc->parsed.empty()) { + /* Trim extra spaces at the end if needed */ + if (g_ascii_isspace(hc->parsed.back())) { + auto last_it = std::end(hc->parsed); + + /* Allow last newline */ + if (hc->parsed.back() == '\n') { + --last_it; + } + + hc->parsed.erase(std::find_if(hc->parsed.rbegin(), hc->parsed.rend(), + [](auto ch) -> auto { + return !g_ascii_isspace(ch); + }) + .base(), + last_it); + } + } + + return hc; +} + +static auto +html_find_image_by_cid(const html_content &hc, std::string_view cid) + -> std::optional<const html_image *> +{ + for (const auto *html_image: hc.images) { + /* Filter embedded images */ + if (html_image->flags & RSPAMD_HTML_FLAG_IMAGE_EMBEDDED && + html_image->src != nullptr) { + if (cid == html_image->src) { + return html_image; + } + } + } + + return std::nullopt; +} + +auto html_debug_structure(const html_content &hc) -> std::string +{ + std::string output; + + if (hc.root_tag) { + auto rec_functor = [&](const html_tag *t, int level, auto rec_functor) -> void { + std::string pluses(level, '+'); + + if (!(t->flags & (FL_VIRTUAL | FL_IGNORE))) { + if (t->flags & FL_XML) { + output += fmt::format("{}xml;", pluses); + } + else { + output += fmt::format("{}{};", pluses, + html_tags_defs.name_by_id_safe(t->id)); + } + level++; + } + for (const auto *cld: t->children) { + rec_functor(cld, level, rec_functor); + } + }; + + rec_functor(hc.root_tag, 1, rec_functor); + } + + return output; +} + +auto html_tag_by_name(const std::string_view &name) + -> std::optional<tag_id_t> +{ + const auto *td = rspamd::html::html_tags_defs.by_name(name); + + if (td != nullptr) { + return td->id; + } + + return std::nullopt; +} + +auto html_tag::get_content(const struct html_content *hc) const -> std::string_view +{ + const std::string *dest = &hc->parsed; + + if (block && !block->is_visible()) { + dest = &hc->invisible; + } + const auto clen = get_content_length(); + if (content_offset < dest->size()) { + if (dest->size() - content_offset >= clen) { + return std::string_view{*dest}.substr(content_offset, clen); + } + else { + return std::string_view{*dest}.substr(content_offset, dest->size() - content_offset); + } + } + + return std::string_view{}; +} + +}// namespace rspamd::html + +void * +rspamd_html_process_part_full(struct rspamd_task *task, + GByteArray *in, GList **exceptions, + khash_t(rspamd_url_hash) * url_set, + GPtrArray *part_urls, + bool allow_css, + uint16_t *cur_url_order) +{ + return rspamd::html::html_process_input(task, in, exceptions, url_set, + part_urls, allow_css, cur_url_order); +} + +void * +rspamd_html_process_part(rspamd_mempool_t *pool, + GByteArray *in) +{ + struct rspamd_task fake_task; + memset(&fake_task, 0, sizeof(fake_task)); + fake_task.task_pool = pool; + uint16_t order = 0; + + return rspamd_html_process_part_full(&fake_task, in, NULL, + NULL, NULL, FALSE, &order); +} + +guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len) +{ + return rspamd::html::decode_html_entitles_inplace(s, len); +} + +gint rspamd_html_tag_by_name(const gchar *name) +{ + const auto *td = rspamd::html::html_tags_defs.by_name(name); + + if (td != nullptr) { + return td->id; + } + + return -1; +} + +gboolean +rspamd_html_tag_seen(void *ptr, const gchar *tagname) +{ + gint id; + auto *hc = rspamd::html::html_content::from_ptr(ptr); + + g_assert(hc != NULL); + + id = rspamd_html_tag_by_name(tagname); + + if (id != -1) { + return hc->tags_seen[id]; + } + + return FALSE; +} + +const gchar * +rspamd_html_tag_by_id(gint id) +{ + if (id > Tag_UNKNOWN && id < Tag_MAX) { + const auto *td = rspamd::html::html_tags_defs.by_id(id); + + if (td != nullptr) { + return td->name.c_str(); + } + } + + return nullptr; +} + +const gchar * +rspamd_html_tag_name(void *p, gsize *len) +{ + auto *tag = reinterpret_cast<rspamd::html::html_tag *>(p); + auto tname = rspamd::html::html_tags_defs.name_by_id_safe(tag->id); + + if (len) { + *len = tname.size(); + } + + return tname.data(); +} + +struct html_image * +rspamd_html_find_embedded_image(void *html_content, + const char *cid, gsize cid_len) +{ + auto *hc = rspamd::html::html_content::from_ptr(html_content); + + auto maybe_img = rspamd::html::html_find_image_by_cid(*hc, {cid, cid_len}); + + if (maybe_img) { + return (html_image *) maybe_img.value(); + } + + return nullptr; +} + +bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest) +{ + auto *hc = rspamd::html::html_content::from_ptr(html_content); + + dest->begin = hc->parsed.data(); + dest->len = hc->parsed.size(); + + return true; +} + +gsize rspamd_html_get_tags_count(void *html_content) +{ + auto *hc = rspamd::html::html_content::from_ptr(html_content); + + if (!hc) { + return 0; + } + + return hc->all_tags.size(); +}
\ No newline at end of file diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h new file mode 100644 index 0000000..2d34f2a --- /dev/null +++ b/src/libserver/html/html.h @@ -0,0 +1,137 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_HTML_H +#define RSPAMD_HTML_H + +#include "config.h" +#include "libutil/mem_pool.h" +#include "libserver/url.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * HTML content flags + */ +#define RSPAMD_HTML_FLAG_BAD_START (1 << 0) +#define RSPAMD_HTML_FLAG_BAD_ELEMENTS (1 << 1) +#define RSPAMD_HTML_FLAG_XML (1 << 2) +#define RSPAMD_HTML_FLAG_UNBALANCED (1 << 3) +#define RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS (1 << 4) +#define RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS (1 << 5) +#define RSPAMD_HTML_FLAG_TOO_MANY_TAGS (1 << 6) +#define RSPAMD_HTML_FLAG_HAS_DATA_URLS (1 << 7) +#define RSPAMD_HTML_FLAG_HAS_ZEROS (1 << 8) + +/* + * Image flags + */ +#define RSPAMD_HTML_FLAG_IMAGE_EMBEDDED (1 << 0) +#define RSPAMD_HTML_FLAG_IMAGE_EXTERNAL (1 << 1) +#define RSPAMD_HTML_FLAG_IMAGE_DATA (1 << 2) + + +struct rspamd_image; + +struct html_image { + guint height; + guint width; + guint flags; + gchar *src; + struct rspamd_url *url; + struct rspamd_image *embedded_image; + void *tag; +}; + + +/* Forwarded declaration */ +struct rspamd_task; + +/* + * Decode HTML entitles in text. Text is modified in place. + */ +guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len); + +void *rspamd_html_process_part(rspamd_mempool_t *pool, + GByteArray *in); + +void *rspamd_html_process_part_full(struct rspamd_task *task, + GByteArray *in, GList **exceptions, + khash_t(rspamd_url_hash) * url_set, + GPtrArray *part_urls, + bool allow_css, + uint16_t *cur_url_order); + +/* + * Returns true if a specified tag has been seen in a part + */ +gboolean rspamd_html_tag_seen(void *ptr, const gchar *tagname); + +/** + * Returns name for the specified tag id + * @param id + * @return + */ +const gchar *rspamd_html_tag_by_id(gint id); + +/** + * Returns HTML tag id by name + * @param name + * @return + */ +gint rspamd_html_tag_by_name(const gchar *name); + +/** + * Gets a name for a tag + * @param tag + * @param len + * @return + */ +const gchar *rspamd_html_tag_name(void *tag, gsize *len); + +/** + * Find HTML image by content id + * @param html_content + * @param cid + * @param cid_len + * @return + */ +struct html_image *rspamd_html_find_embedded_image(void *html_content, + const char *cid, gsize cid_len); + +/** + * Stores parsed content in ftok_t structure + * @param html_content + * @param dest + * @return + */ +bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest); + +/** + * Returns number of tags in the html content + * @param html_content + * @return + */ +gsize rspamd_html_get_tags_count(void *html_content); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx new file mode 100644 index 0000000..3320fd6 --- /dev/null +++ b/src/libserver/html/html.hxx @@ -0,0 +1,146 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_HTML_HXX +#define RSPAMD_HTML_HXX +#pragma once + +#include "config.h" +#include "libserver/url.h" +#include "libserver/html/html_tag.hxx" +#include "libserver/html/html.h" +#include "libserver/html/html_tags.h" + + +#include <vector> +#include <memory> +#include <string> +#include "function2/function2.hpp" + +namespace rspamd::css { +/* Forward declaration */ +class css_style_sheet; +}// namespace rspamd::css + +namespace rspamd::html { + +struct html_block; + +struct html_content { + struct rspamd_url *base_url = nullptr; + struct html_tag *root_tag = nullptr; + gint flags = 0; + std::vector<bool> tags_seen; + std::vector<html_image *> images; + std::vector<std::unique_ptr<struct html_tag>> all_tags; + std::string parsed; + std::string invisible; + std::shared_ptr<css::css_style_sheet> css_style; + + /* Preallocate and reserve all internal structures */ + html_content() + { + tags_seen.resize(Tag_MAX, false); + all_tags.reserve(128); + parsed.reserve(256); + } + + static void html_content_dtor(void *ptr) + { + delete html_content::from_ptr(ptr); + } + + static auto from_ptr(void *ptr) -> html_content * + { + return static_cast<html_content *>(ptr); + } + + enum class traverse_type { + PRE_ORDER, + POST_ORDER + }; + auto traverse_block_tags(fu2::function<bool(const html_tag *)> &&func, + traverse_type how = traverse_type::PRE_ORDER) const -> bool + { + + if (root_tag == nullptr) { + return false; + } + + auto rec_functor_pre_order = [&](const html_tag *root, auto &&rec) -> bool { + if (func(root)) { + + for (const auto *c: root->children) { + if (!rec(c, rec)) { + return false; + } + } + + return true; + } + return false; + }; + auto rec_functor_post_order = [&](const html_tag *root, auto &&rec) -> bool { + for (const auto *c: root->children) { + if (!rec(c, rec)) { + return false; + } + } + + return func(root); + }; + + switch (how) { + case traverse_type::PRE_ORDER: + return rec_functor_pre_order(root_tag, rec_functor_pre_order); + case traverse_type::POST_ORDER: + return rec_functor_post_order(root_tag, rec_functor_post_order); + default: + RSPAMD_UNREACHABLE; + } + } + + auto traverse_all_tags(fu2::function<bool(const html_tag *)> &&func) const -> bool + { + for (const auto &tag: all_tags) { + if (!(tag->flags & (FL_XML | FL_VIRTUAL))) { + if (!func(tag.get())) { + return false; + } + } + } + + return true; + } + +private: + ~html_content() = default; +}; + + +auto html_tag_by_name(const std::string_view &name) -> std::optional<tag_id_t>; +auto html_process_input(struct rspamd_task *task, + GByteArray *in, + GList **exceptions, + khash_t(rspamd_url_hash) * url_set, + GPtrArray *part_urls, + bool allow_css, + std::uint16_t *cur_url_order) -> html_content *; +auto html_debug_structure(const html_content &hc) -> std::string; + +}// namespace rspamd::html + +#endif//RSPAMD_HTML_HXX diff --git a/src/libserver/html/html_block.hxx b/src/libserver/html/html_block.hxx new file mode 100644 index 0000000..f9b5184 --- /dev/null +++ b/src/libserver/html/html_block.hxx @@ -0,0 +1,358 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef RSPAMD_HTML_BLOCK_HXX +#define RSPAMD_HTML_BLOCK_HXX +#pragma once + +#include "libserver/css/css_value.hxx" +#include <cmath> + +namespace rspamd::html { + +/* + * Block tag definition + */ +struct html_block { + rspamd::css::css_color fg_color; + rspamd::css::css_color bg_color; + std::int16_t height; + std::int16_t width; + rspamd::css::css_display_value display; + std::int8_t font_size; + + unsigned fg_color_mask : 2; + unsigned bg_color_mask : 2; + unsigned height_mask : 2; + unsigned width_mask : 2; + unsigned font_mask : 2; + unsigned display_mask : 2; + unsigned visibility_mask : 2; + + constexpr static const auto unset = 0; + constexpr static const auto inherited = 1; + constexpr static const auto implicit = 1; + constexpr static const auto set = 3; + constexpr static const auto invisible_flag = 1; + constexpr static const auto transparent_flag = 2; + + /* Helpers to set mask when setting the elements */ + auto set_fgcolor(const rspamd::css::css_color &c, int how = html_block::set) -> void + { + fg_color = c; + fg_color_mask = how; + } + auto set_bgcolor(const rspamd::css::css_color &c, int how = html_block::set) -> void + { + bg_color = c; + bg_color_mask = how; + } + auto set_height(float h, bool is_percent = false, int how = html_block::set) -> void + { + h = is_percent ? (-h) : h; + if (h < INT16_MIN) { + /* Negative numbers encode percents... */ + height = -100; + } + else if (h > INT16_MAX) { + height = INT16_MAX; + } + else { + height = h; + } + height_mask = how; + } + + auto set_width(float w, bool is_percent = false, int how = html_block::set) -> void + { + w = is_percent ? (-w) : w; + if (w < INT16_MIN) { + width = INT16_MIN; + } + else if (w > INT16_MAX) { + width = INT16_MAX; + } + else { + width = w; + } + width_mask = how; + } + + auto set_display(bool v, int how = html_block::set) -> void + { + if (v) { + display = rspamd::css::css_display_value::DISPLAY_INLINE; + } + else { + display = rspamd::css::css_display_value::DISPLAY_HIDDEN; + } + display_mask = how; + } + + auto set_display(rspamd::css::css_display_value v, int how = html_block::set) -> void + { + display = v; + display_mask = how; + } + + auto set_font_size(float fs, bool is_percent = false, int how = html_block::set) -> void + { + fs = is_percent ? (-fs) : fs; + if (fs < INT8_MIN) { + font_size = -100; + } + else if (fs > INT8_MAX) { + font_size = INT8_MAX; + } + else { + font_size = fs; + } + font_mask = how; + } + +private: + template<typename T, typename MT> + static constexpr auto simple_prop(MT mask_val, MT other_mask, T &our_val, + T other_val) -> MT + { + if (other_mask && other_mask > mask_val) { + our_val = other_val; + mask_val = html_block::inherited; + } + + return mask_val; + } + + /* Sizes propagation logic + * We can have multiple cases: + * 1) Our size is > 0 and we can use it as is + * 2) Parent size is > 0 and our size is undefined, so propagate parent + * 3) Parent size is < 0 and our size is undefined - propagate parent + * 4) Parent size is > 0 and our size is < 0 - multiply parent by abs(ours) + * 5) Parent size is undefined and our size is < 0 - tricky stuff, assume some defaults + */ + template<typename T, typename MT> + static constexpr auto size_prop(MT mask_val, MT other_mask, T &our_val, + T other_val, T default_val) -> MT + { + if (mask_val) { + /* We have our value */ + if (our_val < 0) { + if (other_mask > 0) { + if (other_val >= 0) { + our_val = other_val * (-our_val / 100.0); + } + else { + our_val *= (-other_val / 100.0); + } + } + else { + /* Parent value is not defined and our value is relative */ + our_val = default_val * (-our_val / 100.0); + } + } + else if (other_mask && other_mask > mask_val) { + our_val = other_val; + mask_val = html_block::inherited; + } + } + else { + /* We propagate parent if defined */ + if (other_mask && other_mask > mask_val) { + our_val = other_val; + mask_val = html_block::inherited; + } + /* Otherwise do nothing */ + } + + return mask_val; + } + +public: + /** + * Propagate values from the block if they are not defined by the current block + * @param other + * @return + */ + auto propagate_block(const html_block &other) -> void + { + fg_color_mask = html_block::simple_prop(fg_color_mask, other.fg_color_mask, + fg_color, other.fg_color); + bg_color_mask = html_block::simple_prop(bg_color_mask, other.bg_color_mask, + bg_color, other.bg_color); + display_mask = html_block::simple_prop(display_mask, other.display_mask, + display, other.display); + + height_mask = html_block::size_prop(height_mask, other.height_mask, + height, other.height, static_cast<std::int16_t>(800)); + width_mask = html_block::size_prop(width_mask, other.width_mask, + width, other.width, static_cast<std::int16_t>(1024)); + font_mask = html_block::size_prop(font_mask, other.font_mask, + font_size, other.font_size, static_cast<std::int8_t>(10)); + } + + /* + * Set block overriding all inherited values + */ + auto set_block(const html_block &other) -> void + { + constexpr auto set_value = [](auto mask_val, auto other_mask, auto &our_val, + auto other_val) constexpr -> int { + if (other_mask && mask_val != html_block::set) { + our_val = other_val; + mask_val = other_mask; + } + + return mask_val; + }; + + fg_color_mask = set_value(fg_color_mask, other.fg_color_mask, fg_color, other.fg_color); + bg_color_mask = set_value(bg_color_mask, other.bg_color_mask, bg_color, other.bg_color); + display_mask = set_value(display_mask, other.display_mask, display, other.display); + height_mask = set_value(height_mask, other.height_mask, height, other.height); + width_mask = set_value(width_mask, other.width_mask, width, other.width); + font_mask = set_value(font_mask, other.font_mask, font_size, other.font_size); + } + + auto compute_visibility(void) -> void + { + if (display_mask) { + if (display == css::css_display_value::DISPLAY_HIDDEN) { + visibility_mask = html_block::invisible_flag; + + return; + } + } + + if (font_mask) { + if (font_size == 0) { + visibility_mask = html_block::invisible_flag; + + return; + } + } + + auto is_similar_colors = [](const rspamd::css::css_color &fg, const rspamd::css::css_color &bg) -> bool { + constexpr const auto min_visible_diff = 0.1f; + auto diff_r = ((float) fg.r - bg.r); + auto diff_g = ((float) fg.g - bg.g); + auto diff_b = ((float) fg.b - bg.b); + auto ravg = ((float) fg.r + bg.r) / 2.0f; + + /* Square diffs */ + diff_r *= diff_r; + diff_g *= diff_g; + diff_b *= diff_b; + + auto diff = std::sqrt(2.0f * diff_r + 4.0f * diff_g + 3.0f * diff_b + + (ravg * (diff_r - diff_b) / 256.0f)) / + 256.0f; + + return diff < min_visible_diff; + }; + /* Check if we have both bg/fg colors */ + if (fg_color_mask && bg_color_mask) { + if (fg_color.alpha < 10) { + /* Too transparent */ + visibility_mask = html_block::transparent_flag; + + return; + } + + if (bg_color.alpha > 10) { + if (is_similar_colors(fg_color, bg_color)) { + visibility_mask = html_block::transparent_flag; + return; + } + } + } + else if (fg_color_mask) { + /* Merely fg color */ + if (fg_color.alpha < 10) { + /* Too transparent */ + visibility_mask = html_block::transparent_flag; + + return; + } + + /* Implicit fg color */ + if (is_similar_colors(fg_color, rspamd::css::css_color::white())) { + visibility_mask = html_block::transparent_flag; + return; + } + } + else if (bg_color_mask) { + if (bg_color.alpha > 10) { + if (is_similar_colors(rspamd::css::css_color::black(), bg_color)) { + visibility_mask = html_block::transparent_flag; + return; + } + } + } + + visibility_mask = html_block::unset; + } + + constexpr auto is_visible(void) const -> bool + { + return visibility_mask == html_block::unset; + } + + constexpr auto is_transparent(void) const -> bool + { + return visibility_mask == html_block::transparent_flag; + } + + constexpr auto has_display(int how = html_block::set) const -> bool + { + return display_mask >= how; + } + + /** + * Returns a default html block for root HTML element + * @return + */ + static auto default_html_block(void) -> html_block + { + return html_block{.fg_color = rspamd::css::css_color::black(), + .bg_color = rspamd::css::css_color::white(), + .height = 0, + .width = 0, + .display = rspamd::css::css_display_value::DISPLAY_INLINE, + .font_size = 12, + .fg_color_mask = html_block::inherited, + .bg_color_mask = html_block::inherited, + .height_mask = html_block::unset, + .width_mask = html_block::unset, + .font_mask = html_block::unset, + .display_mask = html_block::inherited, + .visibility_mask = html_block::unset}; + } + /** + * Produces html block with no defined values allocated from the pool + * @param pool + * @return + */ + static auto undefined_html_block_pool(rspamd_mempool_t *pool) -> html_block * + { + auto *bl = rspamd_mempool_alloc0_type(pool, html_block); + + return bl; + } +}; + +}// namespace rspamd::html + +#endif//RSPAMD_HTML_BLOCK_HXX diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx new file mode 100644 index 0000000..c642536 --- /dev/null +++ b/src/libserver/html/html_entities.cxx @@ -0,0 +1,2644 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "html_entities.hxx" + +#include <string> +#include <utility> +#include <vector> +#include "contrib/ankerl/unordered_dense.h" +#include <unicode/utf8.h> +#include <unicode/uchar.h> +#include "libutil/cxx/util.hxx" + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" + +namespace rspamd::html { + +struct html_entity_def { + const char *name; + const char *replacement; + unsigned code; + bool allow_heuristic; +}; + +#define ENTITY_DEF(name, code, replacement) \ + html_entity_def \ + { \ + (name), (replacement), (code), false \ + } +#define ENTITY_DEF_HEUR(name, code, replacement) \ + html_entity_def \ + { \ + (name), (replacement), (code), true \ + } + +static const html_entity_def html_entities_array[] = { + ENTITY_DEF_HEUR("szlig", 223, "\xc3\x9f"), + ENTITY_DEF("prime", 8242, "\xe2\x80\xb2"), + ENTITY_DEF("lnsim", 8934, "\xe2\x8b\xa6"), + ENTITY_DEF("nvDash", 8877, "\xe2\x8a\xad"), + ENTITY_DEF("isinsv", 8947, "\xe2\x8b\xb3"), + ENTITY_DEF("notin", 8713, "\xe2\x88\x89"), + ENTITY_DEF("becaus", 8757, "\xe2\x88\xb5"), + ENTITY_DEF("Leftrightarrow", 8660, "\xe2\x87\x94"), + ENTITY_DEF("EmptySmallSquare", 9723, "\xe2\x97\xbb"), + ENTITY_DEF("SquareUnion", 8852, "\xe2\x8a\x94"), + ENTITY_DEF("subdot", 10941, "\xe2\xaa\xbd"), + ENTITY_DEF("Dstrok", 272, "\xc4\x90"), + ENTITY_DEF("rrarr", 8649, "\xe2\x87\x89"), + ENTITY_DEF("rArr", 8658, "\xe2\x87\x92"), + ENTITY_DEF_HEUR("Aacute", 193, "\xc3\x81"), + ENTITY_DEF("kappa", 954, "\xce\xba"), + ENTITY_DEF("Iopf", 120128, "\xf0\x9d\x95\x80"), + ENTITY_DEF("hyphen", 8208, "\xe2\x80\x90"), + ENTITY_DEF("rarrbfs", 10528, "\xe2\xa4\xa0"), + ENTITY_DEF("supsetneqq", 10956, "\xe2\xab\x8c"), + ENTITY_DEF("gacute", 501, "\xc7\xb5"), + ENTITY_DEF("VeryThinSpace", 8202, "\xe2\x80\x8a"), + ENTITY_DEF("tint", 8749, "\xe2\x88\xad"), + ENTITY_DEF("ffr", 120099, "\xf0\x9d\x94\xa3"), + ENTITY_DEF("kgreen", 312, "\xc4\xb8"), + ENTITY_DEF("nis", 8956, "\xe2\x8b\xbc"), + ENTITY_DEF("NotRightTriangleBar", 10704, "\xe2\xa7\x90\xcc\xb8"), + ENTITY_DEF("Eogon", 280, "\xc4\x98"), + ENTITY_DEF("lbrke", 10635, "\xe2\xa6\x8b"), + ENTITY_DEF("phi", 966, "\xcf\x86"), + ENTITY_DEF("notnivc", 8957, "\xe2\x8b\xbd"), + ENTITY_DEF("utilde", 361, "\xc5\xa9"), + ENTITY_DEF("Fopf", 120125, "\xf0\x9d\x94\xbd"), + ENTITY_DEF("Vcy", 1042, "\xd0\x92"), + ENTITY_DEF("erDot", 8787, "\xe2\x89\x93"), + ENTITY_DEF("nsubE", 10949, "\xe2\xab\x85\xcc\xb8"), + ENTITY_DEF_HEUR("egrave", 232, "\xc3\xa8"), + ENTITY_DEF("Lcedil", 315, "\xc4\xbb"), + ENTITY_DEF("lharul", 10602, "\xe2\xa5\xaa"), + ENTITY_DEF_HEUR("middot", 183, "\xc2\xb7"), + ENTITY_DEF("ggg", 8921, "\xe2\x8b\x99"), + ENTITY_DEF("NestedLessLess", 8810, "\xe2\x89\xaa"), + ENTITY_DEF("tau", 964, "\xcf\x84"), + ENTITY_DEF("setmn", 8726, "\xe2\x88\x96"), + ENTITY_DEF("frac78", 8542, "\xe2\x85\x9e"), + ENTITY_DEF_HEUR("para", 182, "\xc2\xb6"), + ENTITY_DEF("Rcedil", 342, "\xc5\x96"), + ENTITY_DEF("propto", 8733, "\xe2\x88\x9d"), + ENTITY_DEF("sqsubset", 8847, "\xe2\x8a\x8f"), + ENTITY_DEF("ensp", 8194, "\xe2\x80\x82"), + ENTITY_DEF("boxvH", 9578, "\xe2\x95\xaa"), + ENTITY_DEF("NotGreaterTilde", 8821, "\xe2\x89\xb5"), + ENTITY_DEF("ffllig", 64260, "\xef\xac\x84"), + ENTITY_DEF("kcedil", 311, "\xc4\xb7"), + ENTITY_DEF("omega", 969, "\xcf\x89"), + ENTITY_DEF("sime", 8771, "\xe2\x89\x83"), + ENTITY_DEF("LeftTriangleEqual", 8884, "\xe2\x8a\xb4"), + ENTITY_DEF("bsemi", 8271, "\xe2\x81\x8f"), + ENTITY_DEF("rdquor", 8221, "\xe2\x80\x9d"), + ENTITY_DEF("Utilde", 360, "\xc5\xa8"), + ENTITY_DEF("bsol", 92, "\x5c"), + ENTITY_DEF("risingdotseq", 8787, "\xe2\x89\x93"), + ENTITY_DEF("ultri", 9720, "\xe2\x97\xb8"), + ENTITY_DEF("rhov", 1009, "\xcf\xb1"), + ENTITY_DEF("TildeEqual", 8771, "\xe2\x89\x83"), + ENTITY_DEF("jukcy", 1108, "\xd1\x94"), + ENTITY_DEF("perp", 8869, "\xe2\x8a\xa5"), + ENTITY_DEF("capbrcup", 10825, "\xe2\xa9\x89"), + ENTITY_DEF("ltrie", 8884, "\xe2\x8a\xb4"), + ENTITY_DEF("LessTilde", 8818, "\xe2\x89\xb2"), + ENTITY_DEF("popf", 120161, "\xf0\x9d\x95\xa1"), + ENTITY_DEF("dbkarow", 10511, "\xe2\xa4\x8f"), + ENTITY_DEF("roang", 10221, "\xe2\x9f\xad"), + ENTITY_DEF_HEUR("brvbar", 166, "\xc2\xa6"), + ENTITY_DEF("CenterDot", 183, "\xc2\xb7"), + ENTITY_DEF("notindot", 8949, "\xe2\x8b\xb5\xcc\xb8"), + ENTITY_DEF("supmult", 10946, "\xe2\xab\x82"), + ENTITY_DEF("multimap", 8888, "\xe2\x8a\xb8"), + ENTITY_DEF_HEUR("frac34", 190, "\xc2\xbe"), + ENTITY_DEF("mapsto", 8614, "\xe2\x86\xa6"), + ENTITY_DEF("flat", 9837, "\xe2\x99\xad"), + ENTITY_DEF("updownarrow", 8597, "\xe2\x86\x95"), + ENTITY_DEF("gne", 10888, "\xe2\xaa\x88"), + ENTITY_DEF("nrarrc", 10547, "\xe2\xa4\xb3\xcc\xb8"), + ENTITY_DEF("suphsol", 10185, "\xe2\x9f\x89"), + ENTITY_DEF("nGtv", 8811, "\xe2\x89\xab\xcc\xb8"), + ENTITY_DEF("hopf", 120153, "\xf0\x9d\x95\x99"), + ENTITY_DEF("pointint", 10773, "\xe2\xa8\x95"), + ENTITY_DEF("glj", 10916, "\xe2\xaa\xa4"), + ENTITY_DEF("LeftDoubleBracket", 10214, "\xe2\x9f\xa6"), + ENTITY_DEF("NotSupersetEqual", 8841, "\xe2\x8a\x89"), + ENTITY_DEF("dot", 729, "\xcb\x99"), + ENTITY_DEF("tbrk", 9140, "\xe2\x8e\xb4"), + ENTITY_DEF("LeftUpDownVector", 10577, "\xe2\xa5\x91"), + ENTITY_DEF_HEUR("uml", 168, "\xc2\xa8"), + ENTITY_DEF("bbrk", 9141, "\xe2\x8e\xb5"), + ENTITY_DEF("nearrow", 8599, "\xe2\x86\x97"), + ENTITY_DEF("backsimeq", 8909, "\xe2\x8b\x8d"), + ENTITY_DEF("dblac", 733, "\xcb\x9d"), + ENTITY_DEF("circleddash", 8861, "\xe2\x8a\x9d"), + ENTITY_DEF("ldsh", 8626, "\xe2\x86\xb2"), + ENTITY_DEF("sce", 10928, "\xe2\xaa\xb0"), + ENTITY_DEF("angst", 197, "\xc3\x85"), + ENTITY_DEF_HEUR("yen", 165, "\xc2\xa5"), + ENTITY_DEF("nsupE", 10950, "\xe2\xab\x86\xcc\xb8"), + ENTITY_DEF("Uscr", 119984, "\xf0\x9d\x92\xb0"), + ENTITY_DEF("subplus", 10943, "\xe2\xaa\xbf"), + ENTITY_DEF("nleqq", 8806, "\xe2\x89\xa6\xcc\xb8"), + ENTITY_DEF("nprcue", 8928, "\xe2\x8b\xa0"), + ENTITY_DEF("Ocirc", 212, "\xc3\x94"), + ENTITY_DEF("disin", 8946, "\xe2\x8b\xb2"), + ENTITY_DEF("EqualTilde", 8770, "\xe2\x89\x82"), + ENTITY_DEF("YUcy", 1070, "\xd0\xae"), + ENTITY_DEF("Kscr", 119974, "\xf0\x9d\x92\xa6"), + ENTITY_DEF("lg", 8822, "\xe2\x89\xb6"), + ENTITY_DEF("nLeftrightarrow", 8654, "\xe2\x87\x8e"), + ENTITY_DEF("eplus", 10865, "\xe2\xa9\xb1"), + ENTITY_DEF("les", 10877, "\xe2\xa9\xbd"), + ENTITY_DEF("sfr", 120112, "\xf0\x9d\x94\xb0"), + ENTITY_DEF("HumpDownHump", 8782, "\xe2\x89\x8e"), + ENTITY_DEF("Fouriertrf", 8497, "\xe2\x84\xb1"), + ENTITY_DEF("Updownarrow", 8661, "\xe2\x87\x95"), + ENTITY_DEF("nrarr", 8603, "\xe2\x86\x9b"), + ENTITY_DEF("radic", 8730, "\xe2\x88\x9a"), + ENTITY_DEF("gnap", 10890, "\xe2\xaa\x8a"), + ENTITY_DEF("zeta", 950, "\xce\xb6"), + ENTITY_DEF("Qscr", 119980, "\xf0\x9d\x92\xac"), + ENTITY_DEF("NotRightTriangleEqual", 8941, "\xe2\x8b\xad"), + ENTITY_DEF("nshortmid", 8740, "\xe2\x88\xa4"), + ENTITY_DEF("SHCHcy", 1065, "\xd0\xa9"), + ENTITY_DEF("piv", 982, "\xcf\x96"), + ENTITY_DEF("angmsdaa", 10664, "\xe2\xa6\xa8"), + ENTITY_DEF("curlywedge", 8911, "\xe2\x8b\x8f"), + ENTITY_DEF("sqcaps", 8851, "\xe2\x8a\x93\xef\xb8\x80"), + ENTITY_DEF("sum", 8721, "\xe2\x88\x91"), + ENTITY_DEF("rarrtl", 8611, "\xe2\x86\xa3"), + ENTITY_DEF("gescc", 10921, "\xe2\xaa\xa9"), + ENTITY_DEF("sup", 8835, "\xe2\x8a\x83"), + ENTITY_DEF("smid", 8739, "\xe2\x88\xa3"), + ENTITY_DEF("cularr", 8630, "\xe2\x86\xb6"), + ENTITY_DEF("olcross", 10683, "\xe2\xa6\xbb"), + ENTITY_DEF_HEUR("GT", 62, "\x3e"), + ENTITY_DEF("scap", 10936, "\xe2\xaa\xb8"), + ENTITY_DEF("capcup", 10823, "\xe2\xa9\x87"), + ENTITY_DEF("NotSquareSubsetEqual", 8930, "\xe2\x8b\xa2"), + ENTITY_DEF("uhblk", 9600, "\xe2\x96\x80"), + ENTITY_DEF("latail", 10521, "\xe2\xa4\x99"), + ENTITY_DEF("smtes", 10924, "\xe2\xaa\xac\xef\xb8\x80"), + ENTITY_DEF("RoundImplies", 10608, "\xe2\xa5\xb0"), + ENTITY_DEF("wreath", 8768, "\xe2\x89\x80"), + ENTITY_DEF("curlyvee", 8910, "\xe2\x8b\x8e"), + ENTITY_DEF("uscr", 120010, "\xf0\x9d\x93\x8a"), + ENTITY_DEF("nleftrightarrow", 8622, "\xe2\x86\xae"), + ENTITY_DEF("ucy", 1091, "\xd1\x83"), + ENTITY_DEF("nvge", 8805, "\xe2\x89\xa5\xe2\x83\x92"), + ENTITY_DEF("bnot", 8976, "\xe2\x8c\x90"), + ENTITY_DEF("alefsym", 8501, "\xe2\x84\xb5"), + ENTITY_DEF("star", 9734, "\xe2\x98\x86"), + ENTITY_DEF("boxHd", 9572, "\xe2\x95\xa4"), + ENTITY_DEF("vsubnE", 10955, "\xe2\xab\x8b\xef\xb8\x80"), + ENTITY_DEF("Popf", 8473, "\xe2\x84\x99"), + ENTITY_DEF("simgE", 10912, "\xe2\xaa\xa0"), + ENTITY_DEF("upsilon", 965, "\xcf\x85"), + ENTITY_DEF("NoBreak", 8288, "\xe2\x81\xa0"), + ENTITY_DEF("realine", 8475, "\xe2\x84\x9b"), + ENTITY_DEF("frac38", 8540, "\xe2\x85\x9c"), + ENTITY_DEF("YAcy", 1071, "\xd0\xaf"), + ENTITY_DEF("bnequiv", 8801, "\xe2\x89\xa1\xe2\x83\xa5"), + ENTITY_DEF("cudarrr", 10549, "\xe2\xa4\xb5"), + ENTITY_DEF("lsime", 10893, "\xe2\xaa\x8d"), + ENTITY_DEF("lowbar", 95, "\x5f"), + ENTITY_DEF("utdot", 8944, "\xe2\x8b\xb0"), + ENTITY_DEF("ReverseElement", 8715, "\xe2\x88\x8b"), + ENTITY_DEF("nshortparallel", 8742, "\xe2\x88\xa6"), + ENTITY_DEF("DJcy", 1026, "\xd0\x82"), + ENTITY_DEF("nsube", 8840, "\xe2\x8a\x88"), + ENTITY_DEF("VDash", 8875, "\xe2\x8a\xab"), + ENTITY_DEF("Ncaron", 327, "\xc5\x87"), + ENTITY_DEF("LeftUpVector", 8639, "\xe2\x86\xbf"), + ENTITY_DEF("Kcy", 1050, "\xd0\x9a"), + ENTITY_DEF("NotLeftTriangleEqual", 8940, "\xe2\x8b\xac"), + ENTITY_DEF("nvHarr", 10500, "\xe2\xa4\x84"), + ENTITY_DEF("lotimes", 10804, "\xe2\xa8\xb4"), + ENTITY_DEF("RightFloor", 8971, "\xe2\x8c\x8b"), + ENTITY_DEF("succ", 8827, "\xe2\x89\xbb"), + ENTITY_DEF("Ucy", 1059, "\xd0\xa3"), + ENTITY_DEF("darr", 8595, "\xe2\x86\x93"), + ENTITY_DEF("lbarr", 10508, "\xe2\xa4\x8c"), + ENTITY_DEF("xfr", 120117, "\xf0\x9d\x94\xb5"), + ENTITY_DEF("zopf", 120171, "\xf0\x9d\x95\xab"), + ENTITY_DEF("Phi", 934, "\xce\xa6"), + ENTITY_DEF("ord", 10845, "\xe2\xa9\x9d"), + ENTITY_DEF("iinfin", 10716, "\xe2\xa7\x9c"), + ENTITY_DEF("Xfr", 120091, "\xf0\x9d\x94\x9b"), + ENTITY_DEF("qint", 10764, "\xe2\xa8\x8c"), + ENTITY_DEF("Upsilon", 933, "\xce\xa5"), + ENTITY_DEF("NotSubset", 8834, "\xe2\x8a\x82\xe2\x83\x92"), + ENTITY_DEF("gfr", 120100, "\xf0\x9d\x94\xa4"), + ENTITY_DEF("notnivb", 8958, "\xe2\x8b\xbe"), + ENTITY_DEF("Afr", 120068, "\xf0\x9d\x94\x84"), + ENTITY_DEF_HEUR("ge", 8805, "\xe2\x89\xa5"), + ENTITY_DEF_HEUR("iexcl", 161, "\xc2\xa1"), + ENTITY_DEF("dfr", 120097, "\xf0\x9d\x94\xa1"), + ENTITY_DEF("rsaquo", 8250, "\xe2\x80\xba"), + ENTITY_DEF("xcap", 8898, "\xe2\x8b\x82"), + ENTITY_DEF("Jopf", 120129, "\xf0\x9d\x95\x81"), + ENTITY_DEF("Hstrok", 294, "\xc4\xa6"), + ENTITY_DEF("ldca", 10550, "\xe2\xa4\xb6"), + ENTITY_DEF("lmoust", 9136, "\xe2\x8e\xb0"), + ENTITY_DEF("wcirc", 373, "\xc5\xb5"), + ENTITY_DEF("DownRightVector", 8641, "\xe2\x87\x81"), + ENTITY_DEF("LessFullEqual", 8806, "\xe2\x89\xa6"), + ENTITY_DEF("dotsquare", 8865, "\xe2\x8a\xa1"), + ENTITY_DEF("zhcy", 1078, "\xd0\xb6"), + ENTITY_DEF("mDDot", 8762, "\xe2\x88\xba"), + ENTITY_DEF("Prime", 8243, "\xe2\x80\xb3"), + ENTITY_DEF("prec", 8826, "\xe2\x89\xba"), + ENTITY_DEF("swnwar", 10538, "\xe2\xa4\xaa"), + ENTITY_DEF_HEUR("COPY", 169, "\xc2\xa9"), + ENTITY_DEF("cong", 8773, "\xe2\x89\x85"), + ENTITY_DEF("sacute", 347, "\xc5\x9b"), + ENTITY_DEF("Nopf", 8469, "\xe2\x84\x95"), + ENTITY_DEF("it", 8290, "\xe2\x81\xa2"), + ENTITY_DEF("SOFTcy", 1068, "\xd0\xac"), + ENTITY_DEF("uuarr", 8648, "\xe2\x87\x88"), + ENTITY_DEF("iota", 953, "\xce\xb9"), + ENTITY_DEF("notinE", 8953, "\xe2\x8b\xb9\xcc\xb8"), + ENTITY_DEF("jfr", 120103, "\xf0\x9d\x94\xa7"), + ENTITY_DEF_HEUR("QUOT", 34, "\x22"), + ENTITY_DEF("vsupnE", 10956, "\xe2\xab\x8c\xef\xb8\x80"), + ENTITY_DEF_HEUR("igrave", 236, "\xc3\xac"), + ENTITY_DEF("bsim", 8765, "\xe2\x88\xbd"), + ENTITY_DEF("npreceq", 10927, "\xe2\xaa\xaf\xcc\xb8"), + ENTITY_DEF("zcaron", 382, "\xc5\xbe"), + ENTITY_DEF("DD", 8517, "\xe2\x85\x85"), + ENTITY_DEF("gamma", 947, "\xce\xb3"), + ENTITY_DEF("homtht", 8763, "\xe2\x88\xbb"), + ENTITY_DEF("NonBreakingSpace", 160, "\xc2\xa0"), + ENTITY_DEF("Proportion", 8759, "\xe2\x88\xb7"), + ENTITY_DEF("nedot", 8784, "\xe2\x89\x90\xcc\xb8"), + ENTITY_DEF("nabla", 8711, "\xe2\x88\x87"), + ENTITY_DEF("ac", 8766, "\xe2\x88\xbe"), + ENTITY_DEF("nsupe", 8841, "\xe2\x8a\x89"), + ENTITY_DEF("ell", 8467, "\xe2\x84\x93"), + ENTITY_DEF("boxvR", 9566, "\xe2\x95\x9e"), + ENTITY_DEF("LowerRightArrow", 8600, "\xe2\x86\x98"), + ENTITY_DEF("boxHu", 9575, "\xe2\x95\xa7"), + ENTITY_DEF("lE", 8806, "\xe2\x89\xa6"), + ENTITY_DEF("dzigrarr", 10239, "\xe2\x9f\xbf"), + ENTITY_DEF("rfloor", 8971, "\xe2\x8c\x8b"), + ENTITY_DEF("gneq", 10888, "\xe2\xaa\x88"), + ENTITY_DEF("rightleftharpoons", 8652, "\xe2\x87\x8c"), + ENTITY_DEF("gtquest", 10876, "\xe2\xa9\xbc"), + ENTITY_DEF("searhk", 10533, "\xe2\xa4\xa5"), + ENTITY_DEF("gesdoto", 10882, "\xe2\xaa\x82"), + ENTITY_DEF("cross", 10007, "\xe2\x9c\x97"), + ENTITY_DEF("rdquo", 8221, "\xe2\x80\x9d"), + ENTITY_DEF("sqsupset", 8848, "\xe2\x8a\x90"), + ENTITY_DEF("divonx", 8903, "\xe2\x8b\x87"), + ENTITY_DEF("lat", 10923, "\xe2\xaa\xab"), + ENTITY_DEF("rmoustache", 9137, "\xe2\x8e\xb1"), + ENTITY_DEF("succapprox", 10936, "\xe2\xaa\xb8"), + ENTITY_DEF("nhpar", 10994, "\xe2\xab\xb2"), + ENTITY_DEF("sharp", 9839, "\xe2\x99\xaf"), + ENTITY_DEF("lrcorner", 8991, "\xe2\x8c\x9f"), + ENTITY_DEF("Vscr", 119985, "\xf0\x9d\x92\xb1"), + ENTITY_DEF("varsigma", 962, "\xcf\x82"), + ENTITY_DEF("bsolb", 10693, "\xe2\xa7\x85"), + ENTITY_DEF("cupcap", 10822, "\xe2\xa9\x86"), + ENTITY_DEF("leftrightarrow", 8596, "\xe2\x86\x94"), + ENTITY_DEF("LeftTee", 8867, "\xe2\x8a\xa3"), + ENTITY_DEF("Sqrt", 8730, "\xe2\x88\x9a"), + ENTITY_DEF("Odblac", 336, "\xc5\x90"), + ENTITY_DEF("ocir", 8858, "\xe2\x8a\x9a"), + ENTITY_DEF("eqslantless", 10901, "\xe2\xaa\x95"), + ENTITY_DEF("supedot", 10948, "\xe2\xab\x84"), + ENTITY_DEF("intercal", 8890, "\xe2\x8a\xba"), + ENTITY_DEF("Gbreve", 286, "\xc4\x9e"), + ENTITY_DEF("xrArr", 10233, "\xe2\x9f\xb9"), + ENTITY_DEF("NotTildeEqual", 8772, "\xe2\x89\x84"), + ENTITY_DEF("Bfr", 120069, "\xf0\x9d\x94\x85"), + ENTITY_DEF_HEUR("Iuml", 207, "\xc3\x8f"), + ENTITY_DEF("leg", 8922, "\xe2\x8b\x9a"), + ENTITY_DEF("boxhU", 9576, "\xe2\x95\xa8"), + ENTITY_DEF("Gopf", 120126, "\xf0\x9d\x94\xbe"), + ENTITY_DEF("af", 8289, "\xe2\x81\xa1"), + ENTITY_DEF("xwedge", 8896, "\xe2\x8b\x80"), + ENTITY_DEF("precapprox", 10935, "\xe2\xaa\xb7"), + ENTITY_DEF("lcedil", 316, "\xc4\xbc"), + ENTITY_DEF("between", 8812, "\xe2\x89\xac"), + ENTITY_DEF_HEUR("Oslash", 216, "\xc3\x98"), + ENTITY_DEF("breve", 728, "\xcb\x98"), + ENTITY_DEF("caps", 8745, "\xe2\x88\xa9\xef\xb8\x80"), + ENTITY_DEF("vangrt", 10652, "\xe2\xa6\x9c"), + ENTITY_DEF("lagran", 8466, "\xe2\x84\x92"), + ENTITY_DEF("kopf", 120156, "\xf0\x9d\x95\x9c"), + ENTITY_DEF("ReverseUpEquilibrium", 10607, "\xe2\xa5\xaf"), + ENTITY_DEF("nlsim", 8820, "\xe2\x89\xb4"), + ENTITY_DEF("Cap", 8914, "\xe2\x8b\x92"), + ENTITY_DEF("angmsdac", 10666, "\xe2\xa6\xaa"), + ENTITY_DEF("iocy", 1105, "\xd1\x91"), + ENTITY_DEF("seswar", 10537, "\xe2\xa4\xa9"), + ENTITY_DEF("dzcy", 1119, "\xd1\x9f"), + ENTITY_DEF("nsubset", 8834, "\xe2\x8a\x82\xe2\x83\x92"), + ENTITY_DEF("cup", 8746, "\xe2\x88\xaa"), + ENTITY_DEF("npar", 8742, "\xe2\x88\xa6"), + ENTITY_DEF("late", 10925, "\xe2\xaa\xad"), + ENTITY_DEF("plussim", 10790, "\xe2\xa8\xa6"), + ENTITY_DEF("Darr", 8609, "\xe2\x86\xa1"), + ENTITY_DEF("nexist", 8708, "\xe2\x88\x84"), + ENTITY_DEF_HEUR("cent", 162, "\xc2\xa2"), + ENTITY_DEF("khcy", 1093, "\xd1\x85"), + ENTITY_DEF("smallsetminus", 8726, "\xe2\x88\x96"), + ENTITY_DEF("ycirc", 375, "\xc5\xb7"), + ENTITY_DEF("lharu", 8636, "\xe2\x86\xbc"), + ENTITY_DEF("upuparrows", 8648, "\xe2\x87\x88"), + ENTITY_DEF("sigmaf", 962, "\xcf\x82"), + ENTITY_DEF("nltri", 8938, "\xe2\x8b\xaa"), + ENTITY_DEF("mstpos", 8766, "\xe2\x88\xbe"), + ENTITY_DEF("Zopf", 8484, "\xe2\x84\xa4"), + ENTITY_DEF("dwangle", 10662, "\xe2\xa6\xa6"), + ENTITY_DEF("bowtie", 8904, "\xe2\x8b\x88"), + ENTITY_DEF("Dfr", 120071, "\xf0\x9d\x94\x87"), + ENTITY_DEF_HEUR("iacute", 237, "\xc3\xad"), + ENTITY_DEF("njcy", 1114, "\xd1\x9a"), + ENTITY_DEF("cfr", 120096, "\xf0\x9d\x94\xa0"), + ENTITY_DEF("TripleDot", 8411, "\xe2\x83\x9b"), + ENTITY_DEF("Or", 10836, "\xe2\xa9\x94"), + ENTITY_DEF("blk34", 9619, "\xe2\x96\x93"), + ENTITY_DEF("equiv", 8801, "\xe2\x89\xa1"), + ENTITY_DEF("fflig", 64256, "\xef\xac\x80"), + ENTITY_DEF("Rang", 10219, "\xe2\x9f\xab"), + ENTITY_DEF("Wopf", 120142, "\xf0\x9d\x95\x8e"), + ENTITY_DEF("boxUl", 9564, "\xe2\x95\x9c"), + ENTITY_DEF_HEUR("frac12", 189, "\xc2\xbd"), + ENTITY_DEF("clubs", 9827, "\xe2\x99\xa3"), + ENTITY_DEF("amalg", 10815, "\xe2\xa8\xbf"), + ENTITY_DEF("Lang", 10218, "\xe2\x9f\xaa"), + ENTITY_DEF("asymp", 8776, "\xe2\x89\x88"), + ENTITY_DEF("models", 8871, "\xe2\x8a\xa7"), + ENTITY_DEF("emptyset", 8709, "\xe2\x88\x85"), + ENTITY_DEF("Tscr", 119983, "\xf0\x9d\x92\xaf"), + ENTITY_DEF("nleftarrow", 8602, "\xe2\x86\x9a"), + ENTITY_DEF("Omacr", 332, "\xc5\x8c"), + ENTITY_DEF("gtrarr", 10616, "\xe2\xa5\xb8"), + ENTITY_DEF("excl", 33, "\x21"), + ENTITY_DEF("rarrw", 8605, "\xe2\x86\x9d"), + ENTITY_DEF("abreve", 259, "\xc4\x83"), + ENTITY_DEF("CircleTimes", 8855, "\xe2\x8a\x97"), + ENTITY_DEF("aopf", 120146, "\xf0\x9d\x95\x92"), + ENTITY_DEF("eqvparsl", 10725, "\xe2\xa7\xa5"), + ENTITY_DEF("boxv", 9474, "\xe2\x94\x82"), + ENTITY_DEF("SuchThat", 8715, "\xe2\x88\x8b"), + ENTITY_DEF("varphi", 981, "\xcf\x95"), + ENTITY_DEF("Ropf", 8477, "\xe2\x84\x9d"), + ENTITY_DEF("rscr", 120007, "\xf0\x9d\x93\x87"), + ENTITY_DEF("Rrightarrow", 8667, "\xe2\x87\x9b"), + ENTITY_DEF("equest", 8799, "\xe2\x89\x9f"), + ENTITY_DEF_HEUR("ntilde", 241, "\xc3\xb1"), + ENTITY_DEF("Escr", 8496, "\xe2\x84\xb0"), + ENTITY_DEF("Lopf", 120131, "\xf0\x9d\x95\x83"), + ENTITY_DEF("GreaterGreater", 10914, "\xe2\xaa\xa2"), + ENTITY_DEF("pluscir", 10786, "\xe2\xa8\xa2"), + ENTITY_DEF("nsupset", 8835, "\xe2\x8a\x83\xe2\x83\x92"), + ENTITY_DEF("uArr", 8657, "\xe2\x87\x91"), + ENTITY_DEF("nwarhk", 10531, "\xe2\xa4\xa3"), + ENTITY_DEF("Ycirc", 374, "\xc5\xb6"), + ENTITY_DEF("tdot", 8411, "\xe2\x83\x9b"), + ENTITY_DEF("circledS", 9416, "\xe2\x93\x88"), + ENTITY_DEF("lhard", 8637, "\xe2\x86\xbd"), + ENTITY_DEF("iukcy", 1110, "\xd1\x96"), + ENTITY_DEF("PrecedesSlantEqual", 8828, "\xe2\x89\xbc"), + ENTITY_DEF("Sfr", 120086, "\xf0\x9d\x94\x96"), + ENTITY_DEF("egs", 10902, "\xe2\xaa\x96"), + ENTITY_DEF("oelig", 339, "\xc5\x93"), + ENTITY_DEF("bigtriangledown", 9661, "\xe2\x96\xbd"), + ENTITY_DEF("EmptyVerySmallSquare", 9643, "\xe2\x96\xab"), + ENTITY_DEF("Backslash", 8726, "\xe2\x88\x96"), + ENTITY_DEF("nscr", 120003, "\xf0\x9d\x93\x83"), + ENTITY_DEF("uogon", 371, "\xc5\xb3"), + ENTITY_DEF("circeq", 8791, "\xe2\x89\x97"), + ENTITY_DEF("check", 10003, "\xe2\x9c\x93"), + ENTITY_DEF("Sup", 8913, "\xe2\x8b\x91"), + ENTITY_DEF("Rcaron", 344, "\xc5\x98"), + ENTITY_DEF("lneqq", 8808, "\xe2\x89\xa8"), + ENTITY_DEF("lrhar", 8651, "\xe2\x87\x8b"), + ENTITY_DEF("ulcorn", 8988, "\xe2\x8c\x9c"), + ENTITY_DEF("timesd", 10800, "\xe2\xa8\xb0"), + ENTITY_DEF("Sum", 8721, "\xe2\x88\x91"), + ENTITY_DEF("varpropto", 8733, "\xe2\x88\x9d"), + ENTITY_DEF("Lcaron", 317, "\xc4\xbd"), + ENTITY_DEF("lbrkslu", 10637, "\xe2\xa6\x8d"), + ENTITY_DEF_HEUR("AElig", 198, "\xc3\x86"), + ENTITY_DEF("varr", 8597, "\xe2\x86\x95"), + ENTITY_DEF("nvinfin", 10718, "\xe2\xa7\x9e"), + ENTITY_DEF("leq", 8804, "\xe2\x89\xa4"), + ENTITY_DEF("biguplus", 10756, "\xe2\xa8\x84"), + ENTITY_DEF("rpar", 41, "\x29"), + ENTITY_DEF("eng", 331, "\xc5\x8b"), + ENTITY_DEF("NegativeThinSpace", 8203, "\xe2\x80\x8b"), + ENTITY_DEF("lesssim", 8818, "\xe2\x89\xb2"), + ENTITY_DEF("lBarr", 10510, "\xe2\xa4\x8e"), + ENTITY_DEF("LeftUpTeeVector", 10592, "\xe2\xa5\xa0"), + ENTITY_DEF("gnE", 8809, "\xe2\x89\xa9"), + ENTITY_DEF("efr", 120098, "\xf0\x9d\x94\xa2"), + ENTITY_DEF("barvee", 8893, "\xe2\x8a\xbd"), + ENTITY_DEF("ee", 8519, "\xe2\x85\x87"), + ENTITY_DEF("Uogon", 370, "\xc5\xb2"), + ENTITY_DEF("gnapprox", 10890, "\xe2\xaa\x8a"), + ENTITY_DEF("olcir", 10686, "\xe2\xa6\xbe"), + ENTITY_DEF("boxUL", 9565, "\xe2\x95\x9d"), + ENTITY_DEF("Gg", 8921, "\xe2\x8b\x99"), + ENTITY_DEF("CloseCurlyQuote", 8217, "\xe2\x80\x99"), + ENTITY_DEF("leftharpoondown", 8637, "\xe2\x86\xbd"), + ENTITY_DEF("vfr", 120115, "\xf0\x9d\x94\xb3"), + ENTITY_DEF("gvertneqq", 8809, "\xe2\x89\xa9\xef\xb8\x80"), + ENTITY_DEF_HEUR("ouml", 246, "\xc3\xb6"), + ENTITY_DEF("raemptyv", 10675, "\xe2\xa6\xb3"), + ENTITY_DEF("Zcaron", 381, "\xc5\xbd"), + ENTITY_DEF("scE", 10932, "\xe2\xaa\xb4"), + ENTITY_DEF("boxvh", 9532, "\xe2\x94\xbc"), + ENTITY_DEF("ominus", 8854, "\xe2\x8a\x96"), + ENTITY_DEF("oopf", 120160, "\xf0\x9d\x95\xa0"), + ENTITY_DEF("nsucceq", 10928, "\xe2\xaa\xb0\xcc\xb8"), + ENTITY_DEF("RBarr", 10512, "\xe2\xa4\x90"), + ENTITY_DEF("iprod", 10812, "\xe2\xa8\xbc"), + ENTITY_DEF("lvnE", 8808, "\xe2\x89\xa8\xef\xb8\x80"), + ENTITY_DEF("andand", 10837, "\xe2\xa9\x95"), + ENTITY_DEF("upharpoonright", 8638, "\xe2\x86\xbe"), + ENTITY_DEF("ncongdot", 10861, "\xe2\xa9\xad\xcc\xb8"), + ENTITY_DEF("drcrop", 8972, "\xe2\x8c\x8c"), + ENTITY_DEF("nsimeq", 8772, "\xe2\x89\x84"), + ENTITY_DEF("subsub", 10965, "\xe2\xab\x95"), + ENTITY_DEF("hardcy", 1098, "\xd1\x8a"), + ENTITY_DEF("leqslant", 10877, "\xe2\xa9\xbd"), + ENTITY_DEF("uharl", 8639, "\xe2\x86\xbf"), + ENTITY_DEF("expectation", 8496, "\xe2\x84\xb0"), + ENTITY_DEF("mdash", 8212, "\xe2\x80\x94"), + ENTITY_DEF("VerticalTilde", 8768, "\xe2\x89\x80"), + ENTITY_DEF("rdldhar", 10601, "\xe2\xa5\xa9"), + ENTITY_DEF("leftharpoonup", 8636, "\xe2\x86\xbc"), + ENTITY_DEF("mu", 956, "\xce\xbc"), + ENTITY_DEF("curarrm", 10556, "\xe2\xa4\xbc"), + ENTITY_DEF("Cdot", 266, "\xc4\x8a"), + ENTITY_DEF("NotTildeTilde", 8777, "\xe2\x89\x89"), + ENTITY_DEF("boxul", 9496, "\xe2\x94\x98"), + ENTITY_DEF("planckh", 8462, "\xe2\x84\x8e"), + ENTITY_DEF("CapitalDifferentialD", 8517, "\xe2\x85\x85"), + ENTITY_DEF("boxDL", 9559, "\xe2\x95\x97"), + ENTITY_DEF("cupbrcap", 10824, "\xe2\xa9\x88"), + ENTITY_DEF("boxdL", 9557, "\xe2\x95\x95"), + ENTITY_DEF("supe", 8839, "\xe2\x8a\x87"), + ENTITY_DEF("nvlt", 60, "\x3c\xe2\x83\x92"), + ENTITY_DEF("par", 8741, "\xe2\x88\xa5"), + ENTITY_DEF("InvisibleComma", 8291, "\xe2\x81\xa3"), + ENTITY_DEF("ring", 730, "\xcb\x9a"), + ENTITY_DEF("nvap", 8781, "\xe2\x89\x8d\xe2\x83\x92"), + ENTITY_DEF("veeeq", 8794, "\xe2\x89\x9a"), + ENTITY_DEF("Hfr", 8460, "\xe2\x84\x8c"), + ENTITY_DEF("dstrok", 273, "\xc4\x91"), + ENTITY_DEF("gesles", 10900, "\xe2\xaa\x94"), + ENTITY_DEF("dash", 8208, "\xe2\x80\x90"), + ENTITY_DEF("SHcy", 1064, "\xd0\xa8"), + ENTITY_DEF("congdot", 10861, "\xe2\xa9\xad"), + ENTITY_DEF("imagline", 8464, "\xe2\x84\x90"), + ENTITY_DEF("ncy", 1085, "\xd0\xbd"), + ENTITY_DEF("bigstar", 9733, "\xe2\x98\x85"), + ENTITY_DEF_HEUR("REG", 174, "\xc2\xae"), + ENTITY_DEF("triangleq", 8796, "\xe2\x89\x9c"), + ENTITY_DEF("rsqb", 93, "\x5d"), + ENTITY_DEF("ddarr", 8650, "\xe2\x87\x8a"), + ENTITY_DEF("csub", 10959, "\xe2\xab\x8f"), + ENTITY_DEF("quest", 63, "\x3f"), + ENTITY_DEF("Star", 8902, "\xe2\x8b\x86"), + ENTITY_DEF_HEUR("LT", 60, "\x3c"), + ENTITY_DEF("ncong", 8775, "\xe2\x89\x87"), + ENTITY_DEF("prnE", 10933, "\xe2\xaa\xb5"), + ENTITY_DEF("bigtriangleup", 9651, "\xe2\x96\xb3"), + ENTITY_DEF("Tilde", 8764, "\xe2\x88\xbc"), + ENTITY_DEF("ltrif", 9666, "\xe2\x97\x82"), + ENTITY_DEF("ldrdhar", 10599, "\xe2\xa5\xa7"), + ENTITY_DEF("lcaron", 318, "\xc4\xbe"), + ENTITY_DEF("equivDD", 10872, "\xe2\xa9\xb8"), + ENTITY_DEF("lHar", 10594, "\xe2\xa5\xa2"), + ENTITY_DEF("vBar", 10984, "\xe2\xab\xa8"), + ENTITY_DEF("Mopf", 120132, "\xf0\x9d\x95\x84"), + ENTITY_DEF("LeftArrow", 8592, "\xe2\x86\x90"), + ENTITY_DEF("Rho", 929, "\xce\xa1"), + ENTITY_DEF("Ccirc", 264, "\xc4\x88"), + ENTITY_DEF("ifr", 120102, "\xf0\x9d\x94\xa6"), + ENTITY_DEF("cacute", 263, "\xc4\x87"), + ENTITY_DEF("centerdot", 183, "\xc2\xb7"), + ENTITY_DEF("dollar", 36, "\x24"), + ENTITY_DEF("lang", 10216, "\xe2\x9f\xa8"), + ENTITY_DEF("curvearrowright", 8631, "\xe2\x86\xb7"), + ENTITY_DEF("half", 189, "\xc2\xbd"), + ENTITY_DEF("Ecy", 1069, "\xd0\xad"), + ENTITY_DEF("rcub", 125, "\x7d"), + ENTITY_DEF("rcy", 1088, "\xd1\x80"), + ENTITY_DEF("isins", 8948, "\xe2\x8b\xb4"), + ENTITY_DEF("bsolhsub", 10184, "\xe2\x9f\x88"), + ENTITY_DEF("boxuL", 9563, "\xe2\x95\x9b"), + ENTITY_DEF("shchcy", 1097, "\xd1\x89"), + ENTITY_DEF("cwconint", 8754, "\xe2\x88\xb2"), + ENTITY_DEF("euro", 8364, "\xe2\x82\xac"), + ENTITY_DEF("lesseqqgtr", 10891, "\xe2\xaa\x8b"), + ENTITY_DEF("sim", 8764, "\xe2\x88\xbc"), + ENTITY_DEF("rarrc", 10547, "\xe2\xa4\xb3"), + ENTITY_DEF("boxdl", 9488, "\xe2\x94\x90"), + ENTITY_DEF("Epsilon", 917, "\xce\x95"), + ENTITY_DEF("iiiint", 10764, "\xe2\xa8\x8c"), + ENTITY_DEF("Rightarrow", 8658, "\xe2\x87\x92"), + ENTITY_DEF("conint", 8750, "\xe2\x88\xae"), + ENTITY_DEF("boxDl", 9558, "\xe2\x95\x96"), + ENTITY_DEF("kappav", 1008, "\xcf\xb0"), + ENTITY_DEF("profsurf", 8979, "\xe2\x8c\x93"), + ENTITY_DEF_HEUR("auml", 228, "\xc3\xa4"), + ENTITY_DEF("heartsuit", 9829, "\xe2\x99\xa5"), + ENTITY_DEF_HEUR("eacute", 233, "\xc3\xa9"), + ENTITY_DEF_HEUR("gt", 62, "\x3e"), + ENTITY_DEF("Gcedil", 290, "\xc4\xa2"), + ENTITY_DEF("easter", 10862, "\xe2\xa9\xae"), + ENTITY_DEF("Tcy", 1058, "\xd0\xa2"), + ENTITY_DEF("swarrow", 8601, "\xe2\x86\x99"), + ENTITY_DEF("lopf", 120157, "\xf0\x9d\x95\x9d"), + ENTITY_DEF("Agrave", 192, "\xc3\x80"), + ENTITY_DEF("Aring", 197, "\xc3\x85"), + ENTITY_DEF("fpartint", 10765, "\xe2\xa8\x8d"), + ENTITY_DEF("xoplus", 10753, "\xe2\xa8\x81"), + ENTITY_DEF("LeftDownTeeVector", 10593, "\xe2\xa5\xa1"), + ENTITY_DEF("int", 8747, "\xe2\x88\xab"), + ENTITY_DEF("Zeta", 918, "\xce\x96"), + ENTITY_DEF("loz", 9674, "\xe2\x97\x8a"), + ENTITY_DEF("ncup", 10818, "\xe2\xa9\x82"), + ENTITY_DEF("napE", 10864, "\xe2\xa9\xb0\xcc\xb8"), + ENTITY_DEF("csup", 10960, "\xe2\xab\x90"), + ENTITY_DEF("Ncedil", 325, "\xc5\x85"), + ENTITY_DEF("cuwed", 8911, "\xe2\x8b\x8f"), + ENTITY_DEF("Dot", 168, "\xc2\xa8"), + ENTITY_DEF("SquareIntersection", 8851, "\xe2\x8a\x93"), + ENTITY_DEF("map", 8614, "\xe2\x86\xa6"), + ENTITY_DEF_HEUR("aelig", 230, "\xc3\xa6"), + ENTITY_DEF("RightArrow", 8594, "\xe2\x86\x92"), + ENTITY_DEF("rightharpoondown", 8641, "\xe2\x87\x81"), + ENTITY_DEF("bNot", 10989, "\xe2\xab\xad"), + ENTITY_DEF("nsccue", 8929, "\xe2\x8b\xa1"), + ENTITY_DEF("zigrarr", 8669, "\xe2\x87\x9d"), + ENTITY_DEF("Sacute", 346, "\xc5\x9a"), + ENTITY_DEF("orv", 10843, "\xe2\xa9\x9b"), + ENTITY_DEF("RightVectorBar", 10579, "\xe2\xa5\x93"), + ENTITY_DEF("nrarrw", 8605, "\xe2\x86\x9d\xcc\xb8"), + ENTITY_DEF("nbump", 8782, "\xe2\x89\x8e\xcc\xb8"), + ENTITY_DEF_HEUR("iquest", 191, "\xc2\xbf"), + ENTITY_DEF("wr", 8768, "\xe2\x89\x80"), + ENTITY_DEF("UpArrow", 8593, "\xe2\x86\x91"), + ENTITY_DEF("notinva", 8713, "\xe2\x88\x89"), + ENTITY_DEF("ddagger", 8225, "\xe2\x80\xa1"), + ENTITY_DEF("nLeftarrow", 8653, "\xe2\x87\x8d"), + ENTITY_DEF("rbbrk", 10099, "\xe2\x9d\xb3"), + ENTITY_DEF("RightTriangle", 8883, "\xe2\x8a\xb3"), + ENTITY_DEF("leqq", 8806, "\xe2\x89\xa6"), + ENTITY_DEF("Vert", 8214, "\xe2\x80\x96"), + ENTITY_DEF("gesl", 8923, "\xe2\x8b\x9b\xef\xb8\x80"), + ENTITY_DEF("LeftTeeVector", 10586, "\xe2\xa5\x9a"), + ENTITY_DEF("Union", 8899, "\xe2\x8b\x83"), + ENTITY_DEF("sc", 8827, "\xe2\x89\xbb"), + ENTITY_DEF("ofr", 120108, "\xf0\x9d\x94\xac"), + ENTITY_DEF("quatint", 10774, "\xe2\xa8\x96"), + ENTITY_DEF("apacir", 10863, "\xe2\xa9\xaf"), + ENTITY_DEF("profalar", 9006, "\xe2\x8c\xae"), + ENTITY_DEF("subsetneq", 8842, "\xe2\x8a\x8a"), + ENTITY_DEF("Vvdash", 8874, "\xe2\x8a\xaa"), + ENTITY_DEF("ohbar", 10677, "\xe2\xa6\xb5"), + ENTITY_DEF("Gt", 8811, "\xe2\x89\xab"), + ENTITY_DEF("exist", 8707, "\xe2\x88\x83"), + ENTITY_DEF("gtrapprox", 10886, "\xe2\xaa\x86"), + ENTITY_DEF_HEUR("euml", 235, "\xc3\xab"), + ENTITY_DEF("Equilibrium", 8652, "\xe2\x87\x8c"), + ENTITY_DEF_HEUR("aacute", 225, "\xc3\xa1"), + ENTITY_DEF("omid", 10678, "\xe2\xa6\xb6"), + ENTITY_DEF("loarr", 8701, "\xe2\x87\xbd"), + ENTITY_DEF("SucceedsSlantEqual", 8829, "\xe2\x89\xbd"), + ENTITY_DEF("angsph", 8738, "\xe2\x88\xa2"), + ENTITY_DEF("nsmid", 8740, "\xe2\x88\xa4"), + ENTITY_DEF("lsquor", 8218, "\xe2\x80\x9a"), + ENTITY_DEF("cemptyv", 10674, "\xe2\xa6\xb2"), + ENTITY_DEF("rAarr", 8667, "\xe2\x87\x9b"), + ENTITY_DEF("searr", 8600, "\xe2\x86\x98"), + ENTITY_DEF("complexes", 8450, "\xe2\x84\x82"), + ENTITY_DEF("UnderParenthesis", 9181, "\xe2\x8f\x9d"), + ENTITY_DEF("nparsl", 11005, "\xe2\xab\xbd\xe2\x83\xa5"), + ENTITY_DEF("Lacute", 313, "\xc4\xb9"), + ENTITY_DEF_HEUR("deg", 176, "\xc2\xb0"), + ENTITY_DEF("Racute", 340, "\xc5\x94"), + ENTITY_DEF("Verbar", 8214, "\xe2\x80\x96"), + ENTITY_DEF("sqcups", 8852, "\xe2\x8a\x94\xef\xb8\x80"), + ENTITY_DEF("Hopf", 8461, "\xe2\x84\x8d"), + ENTITY_DEF("naturals", 8469, "\xe2\x84\x95"), + ENTITY_DEF("Cedilla", 184, "\xc2\xb8"), + ENTITY_DEF("exponentiale", 8519, "\xe2\x85\x87"), + ENTITY_DEF("vnsup", 8835, "\xe2\x8a\x83\xe2\x83\x92"), + ENTITY_DEF("leftrightarrows", 8646, "\xe2\x87\x86"), + ENTITY_DEF("Laplacetrf", 8466, "\xe2\x84\x92"), + ENTITY_DEF("vartriangleright", 8883, "\xe2\x8a\xb3"), + ENTITY_DEF("rtri", 9657, "\xe2\x96\xb9"), + ENTITY_DEF("gE", 8807, "\xe2\x89\xa7"), + ENTITY_DEF("SmallCircle", 8728, "\xe2\x88\x98"), + ENTITY_DEF("diamondsuit", 9830, "\xe2\x99\xa6"), + ENTITY_DEF_HEUR("Otilde", 213, "\xc3\x95"), + ENTITY_DEF("lneq", 10887, "\xe2\xaa\x87"), + ENTITY_DEF("lesdoto", 10881, "\xe2\xaa\x81"), + ENTITY_DEF("ltquest", 10875, "\xe2\xa9\xbb"), + ENTITY_DEF("thinsp", 8201, "\xe2\x80\x89"), + ENTITY_DEF("barwed", 8965, "\xe2\x8c\x85"), + ENTITY_DEF("elsdot", 10903, "\xe2\xaa\x97"), + ENTITY_DEF("circ", 710, "\xcb\x86"), + ENTITY_DEF("ni", 8715, "\xe2\x88\x8b"), + ENTITY_DEF("mlcp", 10971, "\xe2\xab\x9b"), + ENTITY_DEF("Vdash", 8873, "\xe2\x8a\xa9"), + ENTITY_DEF("ShortRightArrow", 8594, "\xe2\x86\x92"), + ENTITY_DEF("upharpoonleft", 8639, "\xe2\x86\xbf"), + ENTITY_DEF("UnderBracket", 9141, "\xe2\x8e\xb5"), + ENTITY_DEF("rAtail", 10524, "\xe2\xa4\x9c"), + ENTITY_DEF("iopf", 120154, "\xf0\x9d\x95\x9a"), + ENTITY_DEF("longleftarrow", 10229, "\xe2\x9f\xb5"), + ENTITY_DEF("Zacute", 377, "\xc5\xb9"), + ENTITY_DEF("duhar", 10607, "\xe2\xa5\xaf"), + ENTITY_DEF("Mfr", 120080, "\xf0\x9d\x94\x90"), + ENTITY_DEF("prnap", 10937, "\xe2\xaa\xb9"), + ENTITY_DEF("eqcirc", 8790, "\xe2\x89\x96"), + ENTITY_DEF("rarrlp", 8620, "\xe2\x86\xac"), + ENTITY_DEF("le", 8804, "\xe2\x89\xa4"), + ENTITY_DEF("Oscr", 119978, "\xf0\x9d\x92\xaa"), + ENTITY_DEF("langd", 10641, "\xe2\xa6\x91"), + ENTITY_DEF("Ucirc", 219, "\xc3\x9b"), + ENTITY_DEF("precnapprox", 10937, "\xe2\xaa\xb9"), + ENTITY_DEF("succcurlyeq", 8829, "\xe2\x89\xbd"), + ENTITY_DEF("Tau", 932, "\xce\xa4"), + ENTITY_DEF("larr", 8592, "\xe2\x86\x90"), + ENTITY_DEF("neArr", 8663, "\xe2\x87\x97"), + ENTITY_DEF("subsim", 10951, "\xe2\xab\x87"), + ENTITY_DEF("DScy", 1029, "\xd0\x85"), + ENTITY_DEF("preccurlyeq", 8828, "\xe2\x89\xbc"), + ENTITY_DEF("NotLessLess", 8810, "\xe2\x89\xaa\xcc\xb8"), + ENTITY_DEF("succnapprox", 10938, "\xe2\xaa\xba"), + ENTITY_DEF("prcue", 8828, "\xe2\x89\xbc"), + ENTITY_DEF("Downarrow", 8659, "\xe2\x87\x93"), + ENTITY_DEF("angmsdah", 10671, "\xe2\xa6\xaf"), + ENTITY_DEF("Emacr", 274, "\xc4\x92"), + ENTITY_DEF("lsh", 8624, "\xe2\x86\xb0"), + ENTITY_DEF("simne", 8774, "\xe2\x89\x86"), + ENTITY_DEF("Bumpeq", 8782, "\xe2\x89\x8e"), + ENTITY_DEF("RightUpTeeVector", 10588, "\xe2\xa5\x9c"), + ENTITY_DEF("Sigma", 931, "\xce\xa3"), + ENTITY_DEF("nvltrie", 8884, "\xe2\x8a\xb4\xe2\x83\x92"), + ENTITY_DEF("lfr", 120105, "\xf0\x9d\x94\xa9"), + ENTITY_DEF("emsp13", 8196, "\xe2\x80\x84"), + ENTITY_DEF("parsl", 11005, "\xe2\xab\xbd"), + ENTITY_DEF_HEUR("ucirc", 251, "\xc3\xbb"), + ENTITY_DEF("gsiml", 10896, "\xe2\xaa\x90"), + ENTITY_DEF("xsqcup", 10758, "\xe2\xa8\x86"), + ENTITY_DEF("Omicron", 927, "\xce\x9f"), + ENTITY_DEF("gsime", 10894, "\xe2\xaa\x8e"), + ENTITY_DEF("circlearrowleft", 8634, "\xe2\x86\xba"), + ENTITY_DEF("sqsupe", 8850, "\xe2\x8a\x92"), + ENTITY_DEF("supE", 10950, "\xe2\xab\x86"), + ENTITY_DEF("dlcrop", 8973, "\xe2\x8c\x8d"), + ENTITY_DEF("RightDownTeeVector", 10589, "\xe2\xa5\x9d"), + ENTITY_DEF("Colone", 10868, "\xe2\xa9\xb4"), + ENTITY_DEF("awconint", 8755, "\xe2\x88\xb3"), + ENTITY_DEF("smte", 10924, "\xe2\xaa\xac"), + ENTITY_DEF("lEg", 10891, "\xe2\xaa\x8b"), + ENTITY_DEF("circledast", 8859, "\xe2\x8a\x9b"), + ENTITY_DEF("ecolon", 8789, "\xe2\x89\x95"), + ENTITY_DEF("rect", 9645, "\xe2\x96\xad"), + ENTITY_DEF("Equal", 10869, "\xe2\xa9\xb5"), + ENTITY_DEF("nwnear", 10535, "\xe2\xa4\xa7"), + ENTITY_DEF("capdot", 10816, "\xe2\xa9\x80"), + ENTITY_DEF("straightphi", 981, "\xcf\x95"), + ENTITY_DEF("forkv", 10969, "\xe2\xab\x99"), + ENTITY_DEF("ZHcy", 1046, "\xd0\x96"), + ENTITY_DEF("Element", 8712, "\xe2\x88\x88"), + ENTITY_DEF("rthree", 8908, "\xe2\x8b\x8c"), + ENTITY_DEF("vzigzag", 10650, "\xe2\xa6\x9a"), + ENTITY_DEF("hybull", 8259, "\xe2\x81\x83"), + ENTITY_DEF("intprod", 10812, "\xe2\xa8\xbc"), + ENTITY_DEF("HumpEqual", 8783, "\xe2\x89\x8f"), + ENTITY_DEF("bigsqcup", 10758, "\xe2\xa8\x86"), + ENTITY_DEF("mp", 8723, "\xe2\x88\x93"), + ENTITY_DEF("lescc", 10920, "\xe2\xaa\xa8"), + ENTITY_DEF("NotPrecedes", 8832, "\xe2\x8a\x80"), + ENTITY_DEF("wedge", 8743, "\xe2\x88\xa7"), + ENTITY_DEF("Supset", 8913, "\xe2\x8b\x91"), + ENTITY_DEF("pm", 177, "\xc2\xb1"), + ENTITY_DEF("kfr", 120104, "\xf0\x9d\x94\xa8"), + ENTITY_DEF("ufisht", 10622, "\xe2\xa5\xbe"), + ENTITY_DEF("ecaron", 283, "\xc4\x9b"), + ENTITY_DEF("chcy", 1095, "\xd1\x87"), + ENTITY_DEF("Esim", 10867, "\xe2\xa9\xb3"), + ENTITY_DEF("fltns", 9649, "\xe2\x96\xb1"), + ENTITY_DEF("nsce", 10928, "\xe2\xaa\xb0\xcc\xb8"), + ENTITY_DEF("hookrightarrow", 8618, "\xe2\x86\xaa"), + ENTITY_DEF("semi", 59, "\x3b"), + ENTITY_DEF("ges", 10878, "\xe2\xa9\xbe"), + ENTITY_DEF("approxeq", 8778, "\xe2\x89\x8a"), + ENTITY_DEF("rarrsim", 10612, "\xe2\xa5\xb4"), + ENTITY_DEF("boxhD", 9573, "\xe2\x95\xa5"), + ENTITY_DEF("varpi", 982, "\xcf\x96"), + ENTITY_DEF("larrb", 8676, "\xe2\x87\xa4"), + ENTITY_DEF("copf", 120148, "\xf0\x9d\x95\x94"), + ENTITY_DEF("Dopf", 120123, "\xf0\x9d\x94\xbb"), + ENTITY_DEF("LeftVector", 8636, "\xe2\x86\xbc"), + ENTITY_DEF("iff", 8660, "\xe2\x87\x94"), + ENTITY_DEF("lnap", 10889, "\xe2\xaa\x89"), + ENTITY_DEF("NotGreaterFullEqual", 8807, "\xe2\x89\xa7\xcc\xb8"), + ENTITY_DEF("varrho", 1009, "\xcf\xb1"), + ENTITY_DEF("NotSucceeds", 8833, "\xe2\x8a\x81"), + ENTITY_DEF("ltrPar", 10646, "\xe2\xa6\x96"), + ENTITY_DEF("nlE", 8806, "\xe2\x89\xa6\xcc\xb8"), + ENTITY_DEF("Zfr", 8488, "\xe2\x84\xa8"), + ENTITY_DEF("LeftArrowBar", 8676, "\xe2\x87\xa4"), + ENTITY_DEF("boxplus", 8862, "\xe2\x8a\x9e"), + ENTITY_DEF("sqsube", 8849, "\xe2\x8a\x91"), + ENTITY_DEF("Re", 8476, "\xe2\x84\x9c"), + ENTITY_DEF("Wfr", 120090, "\xf0\x9d\x94\x9a"), + ENTITY_DEF("epsi", 949, "\xce\xb5"), + ENTITY_DEF("oacute", 243, "\xc3\xb3"), + ENTITY_DEF("bdquo", 8222, "\xe2\x80\x9e"), + ENTITY_DEF("wscr", 120012, "\xf0\x9d\x93\x8c"), + ENTITY_DEF("bullet", 8226, "\xe2\x80\xa2"), + ENTITY_DEF("frown", 8994, "\xe2\x8c\xa2"), + ENTITY_DEF("siml", 10909, "\xe2\xaa\x9d"), + ENTITY_DEF("Rarr", 8608, "\xe2\x86\xa0"), + ENTITY_DEF("Scaron", 352, "\xc5\xa0"), + ENTITY_DEF("gtreqqless", 10892, "\xe2\xaa\x8c"), + ENTITY_DEF("Larr", 8606, "\xe2\x86\x9e"), + ENTITY_DEF("notniva", 8716, "\xe2\x88\x8c"), + ENTITY_DEF("gg", 8811, "\xe2\x89\xab"), + ENTITY_DEF("phmmat", 8499, "\xe2\x84\xb3"), + ENTITY_DEF("boxVL", 9571, "\xe2\x95\xa3"), + ENTITY_DEF("sigmav", 962, "\xcf\x82"), + ENTITY_DEF("order", 8500, "\xe2\x84\xb4"), + ENTITY_DEF("subsup", 10963, "\xe2\xab\x93"), + ENTITY_DEF("afr", 120094, "\xf0\x9d\x94\x9e"), + ENTITY_DEF("lbrace", 123, "\x7b"), + ENTITY_DEF("urcorn", 8989, "\xe2\x8c\x9d"), + ENTITY_DEF("Im", 8465, "\xe2\x84\x91"), + ENTITY_DEF("CounterClockwiseContourIntegral", 8755, "\xe2\x88\xb3"), + ENTITY_DEF("lne", 10887, "\xe2\xaa\x87"), + ENTITY_DEF("chi", 967, "\xcf\x87"), + ENTITY_DEF("cudarrl", 10552, "\xe2\xa4\xb8"), + ENTITY_DEF("ang", 8736, "\xe2\x88\xa0"), + ENTITY_DEF("isindot", 8949, "\xe2\x8b\xb5"), + ENTITY_DEF("Lfr", 120079, "\xf0\x9d\x94\x8f"), + ENTITY_DEF("Rsh", 8625, "\xe2\x86\xb1"), + ENTITY_DEF("Ocy", 1054, "\xd0\x9e"), + ENTITY_DEF("nvrArr", 10499, "\xe2\xa4\x83"), + ENTITY_DEF("otimes", 8855, "\xe2\x8a\x97"), + ENTITY_DEF("eqslantgtr", 10902, "\xe2\xaa\x96"), + ENTITY_DEF("Rfr", 8476, "\xe2\x84\x9c"), + ENTITY_DEF("blacktriangleleft", 9666, "\xe2\x97\x82"), + ENTITY_DEF("Lsh", 8624, "\xe2\x86\xb0"), + ENTITY_DEF("boxvr", 9500, "\xe2\x94\x9c"), + ENTITY_DEF("scedil", 351, "\xc5\x9f"), + ENTITY_DEF_HEUR("iuml", 239, "\xc3\xaf"), + ENTITY_DEF("NJcy", 1034, "\xd0\x8a"), + ENTITY_DEF("Dagger", 8225, "\xe2\x80\xa1"), + ENTITY_DEF("rarrap", 10613, "\xe2\xa5\xb5"), + ENTITY_DEF("udblac", 369, "\xc5\xb1"), + ENTITY_DEF("Sopf", 120138, "\xf0\x9d\x95\x8a"), + ENTITY_DEF("scnsim", 8937, "\xe2\x8b\xa9"), + ENTITY_DEF("hbar", 8463, "\xe2\x84\x8f"), + ENTITY_DEF("frac15", 8533, "\xe2\x85\x95"), + ENTITY_DEF_HEUR("sup3", 179, "\xc2\xb3"), + ENTITY_DEF("NegativeThickSpace", 8203, "\xe2\x80\x8b"), + ENTITY_DEF("npr", 8832, "\xe2\x8a\x80"), + ENTITY_DEF("doteq", 8784, "\xe2\x89\x90"), + ENTITY_DEF("subrarr", 10617, "\xe2\xa5\xb9"), + ENTITY_DEF("SquareSubset", 8847, "\xe2\x8a\x8f"), + ENTITY_DEF("vprop", 8733, "\xe2\x88\x9d"), + ENTITY_DEF("OpenCurlyQuote", 8216, "\xe2\x80\x98"), + ENTITY_DEF("supseteq", 8839, "\xe2\x8a\x87"), + ENTITY_DEF("nRightarrow", 8655, "\xe2\x87\x8f"), + ENTITY_DEF("Longleftarrow", 10232, "\xe2\x9f\xb8"), + ENTITY_DEF("lsquo", 8216, "\xe2\x80\x98"), + ENTITY_DEF("hstrok", 295, "\xc4\xa7"), + ENTITY_DEF("NotTilde", 8769, "\xe2\x89\x81"), + ENTITY_DEF("ogt", 10689, "\xe2\xa7\x81"), + ENTITY_DEF("block", 9608, "\xe2\x96\x88"), + ENTITY_DEF("minusd", 8760, "\xe2\x88\xb8"), + ENTITY_DEF("esdot", 8784, "\xe2\x89\x90"), + ENTITY_DEF("nsim", 8769, "\xe2\x89\x81"), + ENTITY_DEF("scsim", 8831, "\xe2\x89\xbf"), + ENTITY_DEF("boxVl", 9570, "\xe2\x95\xa2"), + ENTITY_DEF("ltimes", 8905, "\xe2\x8b\x89"), + ENTITY_DEF("thkap", 8776, "\xe2\x89\x88"), + ENTITY_DEF("vnsub", 8834, "\xe2\x8a\x82\xe2\x83\x92"), + ENTITY_DEF("thetasym", 977, "\xcf\x91"), + ENTITY_DEF("eopf", 120150, "\xf0\x9d\x95\x96"), + ENTITY_DEF("image", 8465, "\xe2\x84\x91"), + ENTITY_DEF("doteqdot", 8785, "\xe2\x89\x91"), + ENTITY_DEF("Udblac", 368, "\xc5\xb0"), + ENTITY_DEF("gnsim", 8935, "\xe2\x8b\xa7"), + ENTITY_DEF("yicy", 1111, "\xd1\x97"), + ENTITY_DEF("vopf", 120167, "\xf0\x9d\x95\xa7"), + ENTITY_DEF("DDotrahd", 10513, "\xe2\xa4\x91"), + ENTITY_DEF("Iota", 921, "\xce\x99"), + ENTITY_DEF("GJcy", 1027, "\xd0\x83"), + ENTITY_DEF("rightthreetimes", 8908, "\xe2\x8b\x8c"), + ENTITY_DEF("nrtri", 8939, "\xe2\x8b\xab"), + ENTITY_DEF("TildeFullEqual", 8773, "\xe2\x89\x85"), + ENTITY_DEF("Dcaron", 270, "\xc4\x8e"), + ENTITY_DEF("ccaron", 269, "\xc4\x8d"), + ENTITY_DEF("lacute", 314, "\xc4\xba"), + ENTITY_DEF("VerticalBar", 8739, "\xe2\x88\xa3"), + ENTITY_DEF("Igrave", 204, "\xc3\x8c"), + ENTITY_DEF("boxH", 9552, "\xe2\x95\x90"), + ENTITY_DEF("Pfr", 120083, "\xf0\x9d\x94\x93"), + ENTITY_DEF("equals", 61, "\x3d"), + ENTITY_DEF("rbrack", 93, "\x5d"), + ENTITY_DEF("OverParenthesis", 9180, "\xe2\x8f\x9c"), + ENTITY_DEF("in", 8712, "\xe2\x88\x88"), + ENTITY_DEF("llcorner", 8990, "\xe2\x8c\x9e"), + ENTITY_DEF("mcomma", 10793, "\xe2\xa8\xa9"), + ENTITY_DEF("NotGreater", 8815, "\xe2\x89\xaf"), + ENTITY_DEF("midcir", 10992, "\xe2\xab\xb0"), + ENTITY_DEF("Edot", 278, "\xc4\x96"), + ENTITY_DEF("oplus", 8853, "\xe2\x8a\x95"), + ENTITY_DEF("geqq", 8807, "\xe2\x89\xa7"), + ENTITY_DEF("curvearrowleft", 8630, "\xe2\x86\xb6"), + ENTITY_DEF("Poincareplane", 8460, "\xe2\x84\x8c"), + ENTITY_DEF("yscr", 120014, "\xf0\x9d\x93\x8e"), + ENTITY_DEF("ccaps", 10829, "\xe2\xa9\x8d"), + ENTITY_DEF("rpargt", 10644, "\xe2\xa6\x94"), + ENTITY_DEF("topfork", 10970, "\xe2\xab\x9a"), + ENTITY_DEF("Gamma", 915, "\xce\x93"), + ENTITY_DEF("umacr", 363, "\xc5\xab"), + ENTITY_DEF("frac13", 8531, "\xe2\x85\x93"), + ENTITY_DEF("cirfnint", 10768, "\xe2\xa8\x90"), + ENTITY_DEF("xlArr", 10232, "\xe2\x9f\xb8"), + ENTITY_DEF("digamma", 989, "\xcf\x9d"), + ENTITY_DEF("Hat", 94, "\x5e"), + ENTITY_DEF("lates", 10925, "\xe2\xaa\xad\xef\xb8\x80"), + ENTITY_DEF("lgE", 10897, "\xe2\xaa\x91"), + ENTITY_DEF("commat", 64, "\x40"), + ENTITY_DEF("NotPrecedesSlantEqual", 8928, "\xe2\x8b\xa0"), + ENTITY_DEF("phone", 9742, "\xe2\x98\x8e"), + ENTITY_DEF("Ecirc", 202, "\xc3\x8a"), + ENTITY_DEF_HEUR("lt", 60, "\x3c"), + ENTITY_DEF("intcal", 8890, "\xe2\x8a\xba"), + ENTITY_DEF("xdtri", 9661, "\xe2\x96\xbd"), + ENTITY_DEF("Abreve", 258, "\xc4\x82"), + ENTITY_DEF("gopf", 120152, "\xf0\x9d\x95\x98"), + ENTITY_DEF("Xopf", 120143, "\xf0\x9d\x95\x8f"), + ENTITY_DEF("Iacute", 205, "\xc3\x8d"), + ENTITY_DEF("Aopf", 120120, "\xf0\x9d\x94\xb8"), + ENTITY_DEF("gbreve", 287, "\xc4\x9f"), + ENTITY_DEF("nleq", 8816, "\xe2\x89\xb0"), + ENTITY_DEF("xopf", 120169, "\xf0\x9d\x95\xa9"), + ENTITY_DEF("SquareSupersetEqual", 8850, "\xe2\x8a\x92"), + ENTITY_DEF("NotLessTilde", 8820, "\xe2\x89\xb4"), + ENTITY_DEF("SubsetEqual", 8838, "\xe2\x8a\x86"), + ENTITY_DEF("Sc", 10940, "\xe2\xaa\xbc"), + ENTITY_DEF("sdote", 10854, "\xe2\xa9\xa6"), + ENTITY_DEF("loplus", 10797, "\xe2\xa8\xad"), + ENTITY_DEF("zfr", 120119, "\xf0\x9d\x94\xb7"), + ENTITY_DEF("subseteqq", 10949, "\xe2\xab\x85"), + ENTITY_DEF("Vdashl", 10982, "\xe2\xab\xa6"), + ENTITY_DEF("integers", 8484, "\xe2\x84\xa4"), + ENTITY_DEF("Umacr", 362, "\xc5\xaa"), + ENTITY_DEF("dopf", 120149, "\xf0\x9d\x95\x95"), + ENTITY_DEF("RightDownVectorBar", 10581, "\xe2\xa5\x95"), + ENTITY_DEF("angmsdaf", 10669, "\xe2\xa6\xad"), + ENTITY_DEF("Jfr", 120077, "\xf0\x9d\x94\x8d"), + ENTITY_DEF("bernou", 8492, "\xe2\x84\xac"), + ENTITY_DEF("lceil", 8968, "\xe2\x8c\x88"), + ENTITY_DEF("nvsim", 8764, "\xe2\x88\xbc\xe2\x83\x92"), + ENTITY_DEF("NotSucceedsSlantEqual", 8929, "\xe2\x8b\xa1"), + ENTITY_DEF("hearts", 9829, "\xe2\x99\xa5"), + ENTITY_DEF("vee", 8744, "\xe2\x88\xa8"), + ENTITY_DEF("LJcy", 1033, "\xd0\x89"), + ENTITY_DEF("nlt", 8814, "\xe2\x89\xae"), + ENTITY_DEF("because", 8757, "\xe2\x88\xb5"), + ENTITY_DEF("hairsp", 8202, "\xe2\x80\x8a"), + ENTITY_DEF("comma", 44, "\x2c"), + ENTITY_DEF("iecy", 1077, "\xd0\xb5"), + ENTITY_DEF("npre", 10927, "\xe2\xaa\xaf\xcc\xb8"), + ENTITY_DEF("NotSquareSubset", 8847, "\xe2\x8a\x8f\xcc\xb8"), + ENTITY_DEF("mscr", 120002, "\xf0\x9d\x93\x82"), + ENTITY_DEF("jopf", 120155, "\xf0\x9d\x95\x9b"), + ENTITY_DEF("bumpE", 10926, "\xe2\xaa\xae"), + ENTITY_DEF("thicksim", 8764, "\xe2\x88\xbc"), + ENTITY_DEF("Nfr", 120081, "\xf0\x9d\x94\x91"), + ENTITY_DEF("yucy", 1102, "\xd1\x8e"), + ENTITY_DEF("notinvc", 8950, "\xe2\x8b\xb6"), + ENTITY_DEF("lstrok", 322, "\xc5\x82"), + ENTITY_DEF("robrk", 10215, "\xe2\x9f\xa7"), + ENTITY_DEF("LeftTriangleBar", 10703, "\xe2\xa7\x8f"), + ENTITY_DEF("hksearow", 10533, "\xe2\xa4\xa5"), + ENTITY_DEF("bigcap", 8898, "\xe2\x8b\x82"), + ENTITY_DEF("udhar", 10606, "\xe2\xa5\xae"), + ENTITY_DEF("Yscr", 119988, "\xf0\x9d\x92\xb4"), + ENTITY_DEF("smeparsl", 10724, "\xe2\xa7\xa4"), + ENTITY_DEF("NotLess", 8814, "\xe2\x89\xae"), + ENTITY_DEF("dcaron", 271, "\xc4\x8f"), + ENTITY_DEF("ange", 10660, "\xe2\xa6\xa4"), + ENTITY_DEF("dHar", 10597, "\xe2\xa5\xa5"), + ENTITY_DEF("UpperRightArrow", 8599, "\xe2\x86\x97"), + ENTITY_DEF("trpezium", 9186, "\xe2\x8f\xa2"), + ENTITY_DEF("boxminus", 8863, "\xe2\x8a\x9f"), + ENTITY_DEF("notni", 8716, "\xe2\x88\x8c"), + ENTITY_DEF("dtrif", 9662, "\xe2\x96\xbe"), + ENTITY_DEF("nhArr", 8654, "\xe2\x87\x8e"), + ENTITY_DEF("larrpl", 10553, "\xe2\xa4\xb9"), + ENTITY_DEF("simeq", 8771, "\xe2\x89\x83"), + ENTITY_DEF("geqslant", 10878, "\xe2\xa9\xbe"), + ENTITY_DEF("RightUpVectorBar", 10580, "\xe2\xa5\x94"), + ENTITY_DEF("nsc", 8833, "\xe2\x8a\x81"), + ENTITY_DEF("div", 247, "\xc3\xb7"), + ENTITY_DEF("orslope", 10839, "\xe2\xa9\x97"), + ENTITY_DEF("lparlt", 10643, "\xe2\xa6\x93"), + ENTITY_DEF("trie", 8796, "\xe2\x89\x9c"), + ENTITY_DEF("cirmid", 10991, "\xe2\xab\xaf"), + ENTITY_DEF("wp", 8472, "\xe2\x84\x98"), + ENTITY_DEF("dagger", 8224, "\xe2\x80\xa0"), + ENTITY_DEF("utri", 9653, "\xe2\x96\xb5"), + ENTITY_DEF("supnE", 10956, "\xe2\xab\x8c"), + ENTITY_DEF("eg", 10906, "\xe2\xaa\x9a"), + ENTITY_DEF("LeftDownVector", 8643, "\xe2\x87\x83"), + ENTITY_DEF("NotLessEqual", 8816, "\xe2\x89\xb0"), + ENTITY_DEF("Bopf", 120121, "\xf0\x9d\x94\xb9"), + ENTITY_DEF("LongLeftRightArrow", 10231, "\xe2\x9f\xb7"), + ENTITY_DEF("Gfr", 120074, "\xf0\x9d\x94\x8a"), + ENTITY_DEF("sqsubseteq", 8849, "\xe2\x8a\x91"), + ENTITY_DEF_HEUR("ograve", 242, "\xc3\xb2"), + ENTITY_DEF("larrhk", 8617, "\xe2\x86\xa9"), + ENTITY_DEF("sigma", 963, "\xcf\x83"), + ENTITY_DEF("NotSquareSupersetEqual", 8931, "\xe2\x8b\xa3"), + ENTITY_DEF("gvnE", 8809, "\xe2\x89\xa9\xef\xb8\x80"), + ENTITY_DEF("timesbar", 10801, "\xe2\xa8\xb1"), + ENTITY_DEF("Iukcy", 1030, "\xd0\x86"), + ENTITY_DEF("bscr", 119991, "\xf0\x9d\x92\xb7"), + ENTITY_DEF("Exists", 8707, "\xe2\x88\x83"), + ENTITY_DEF("tscr", 120009, "\xf0\x9d\x93\x89"), + ENTITY_DEF("tcy", 1090, "\xd1\x82"), + ENTITY_DEF("nwarr", 8598, "\xe2\x86\x96"), + ENTITY_DEF("hoarr", 8703, "\xe2\x87\xbf"), + ENTITY_DEF("lnapprox", 10889, "\xe2\xaa\x89"), + ENTITY_DEF("nu", 957, "\xce\xbd"), + ENTITY_DEF("bcy", 1073, "\xd0\xb1"), + ENTITY_DEF("ndash", 8211, "\xe2\x80\x93"), + ENTITY_DEF("smt", 10922, "\xe2\xaa\xaa"), + ENTITY_DEF("scaron", 353, "\xc5\xa1"), + ENTITY_DEF("IOcy", 1025, "\xd0\x81"), + ENTITY_DEF("Ifr", 8465, "\xe2\x84\x91"), + ENTITY_DEF("cularrp", 10557, "\xe2\xa4\xbd"), + ENTITY_DEF("lvertneqq", 8808, "\xe2\x89\xa8\xef\xb8\x80"), + ENTITY_DEF("nlarr", 8602, "\xe2\x86\x9a"), + ENTITY_DEF("colon", 58, "\x3a"), + ENTITY_DEF("ddotseq", 10871, "\xe2\xa9\xb7"), + ENTITY_DEF("zacute", 378, "\xc5\xba"), + ENTITY_DEF("DoubleVerticalBar", 8741, "\xe2\x88\xa5"), + ENTITY_DEF("larrfs", 10525, "\xe2\xa4\x9d"), + ENTITY_DEF("NotExists", 8708, "\xe2\x88\x84"), + ENTITY_DEF("geq", 8805, "\xe2\x89\xa5"), + ENTITY_DEF("Ffr", 120073, "\xf0\x9d\x94\x89"), + ENTITY_DEF_HEUR("divide", 247, "\xc3\xb7"), + ENTITY_DEF("blank", 9251, "\xe2\x90\xa3"), + ENTITY_DEF("IEcy", 1045, "\xd0\x95"), + ENTITY_DEF_HEUR("ordm", 186, "\xc2\xba"), + ENTITY_DEF("fopf", 120151, "\xf0\x9d\x95\x97"), + ENTITY_DEF("ecir", 8790, "\xe2\x89\x96"), + ENTITY_DEF("complement", 8705, "\xe2\x88\x81"), + ENTITY_DEF("top", 8868, "\xe2\x8a\xa4"), + ENTITY_DEF("DoubleContourIntegral", 8751, "\xe2\x88\xaf"), + ENTITY_DEF("nisd", 8954, "\xe2\x8b\xba"), + ENTITY_DEF("bcong", 8780, "\xe2\x89\x8c"), + ENTITY_DEF("plusdu", 10789, "\xe2\xa8\xa5"), + ENTITY_DEF("TildeTilde", 8776, "\xe2\x89\x88"), + ENTITY_DEF("lnE", 8808, "\xe2\x89\xa8"), + ENTITY_DEF("DoubleLongRightArrow", 10233, "\xe2\x9f\xb9"), + ENTITY_DEF("nsubseteqq", 10949, "\xe2\xab\x85\xcc\xb8"), + ENTITY_DEF("DownTeeArrow", 8615, "\xe2\x86\xa7"), + ENTITY_DEF("Cscr", 119966, "\xf0\x9d\x92\x9e"), + ENTITY_DEF("NegativeVeryThinSpace", 8203, "\xe2\x80\x8b"), + ENTITY_DEF("emsp", 8195, "\xe2\x80\x83"), + ENTITY_DEF("vartriangleleft", 8882, "\xe2\x8a\xb2"), + ENTITY_DEF("ropar", 10630, "\xe2\xa6\x86"), + ENTITY_DEF("checkmark", 10003, "\xe2\x9c\x93"), + ENTITY_DEF("Ycy", 1067, "\xd0\xab"), + ENTITY_DEF("supset", 8835, "\xe2\x8a\x83"), + ENTITY_DEF("gneqq", 8809, "\xe2\x89\xa9"), + ENTITY_DEF("Lstrok", 321, "\xc5\x81"), + ENTITY_DEF_HEUR("AMP", 38, "\x26"), + ENTITY_DEF("acE", 8766, "\xe2\x88\xbe\xcc\xb3"), + ENTITY_DEF("sqsupseteq", 8850, "\xe2\x8a\x92"), + ENTITY_DEF("nle", 8816, "\xe2\x89\xb0"), + ENTITY_DEF("nesear", 10536, "\xe2\xa4\xa8"), + ENTITY_DEF("LeftDownVectorBar", 10585, "\xe2\xa5\x99"), + ENTITY_DEF("Integral", 8747, "\xe2\x88\xab"), + ENTITY_DEF("Beta", 914, "\xce\x92"), + ENTITY_DEF("nvdash", 8876, "\xe2\x8a\xac"), + ENTITY_DEF("nges", 10878, "\xe2\xa9\xbe\xcc\xb8"), + ENTITY_DEF("demptyv", 10673, "\xe2\xa6\xb1"), + ENTITY_DEF("eta", 951, "\xce\xb7"), + ENTITY_DEF("GreaterSlantEqual", 10878, "\xe2\xa9\xbe"), + ENTITY_DEF_HEUR("ccedil", 231, "\xc3\xa7"), + ENTITY_DEF("pfr", 120109, "\xf0\x9d\x94\xad"), + ENTITY_DEF("bbrktbrk", 9142, "\xe2\x8e\xb6"), + ENTITY_DEF("mcy", 1084, "\xd0\xbc"), + ENTITY_DEF("Not", 10988, "\xe2\xab\xac"), + ENTITY_DEF("qscr", 120006, "\xf0\x9d\x93\x86"), + ENTITY_DEF("zwj", 8205, "\xe2\x80\x8d"), + ENTITY_DEF("ntrianglerighteq", 8941, "\xe2\x8b\xad"), + ENTITY_DEF("permil", 8240, "\xe2\x80\xb0"), + ENTITY_DEF("squarf", 9642, "\xe2\x96\xaa"), + ENTITY_DEF("apos", 39, "\x27"), + ENTITY_DEF("lrm", 8206, "\xe2\x80\x8e"), + ENTITY_DEF("male", 9794, "\xe2\x99\x82"), + ENTITY_DEF_HEUR("agrave", 224, "\xc3\xa0"), + ENTITY_DEF("Lt", 8810, "\xe2\x89\xaa"), + ENTITY_DEF("capand", 10820, "\xe2\xa9\x84"), + ENTITY_DEF_HEUR("aring", 229, "\xc3\xa5"), + ENTITY_DEF("Jukcy", 1028, "\xd0\x84"), + ENTITY_DEF("bumpe", 8783, "\xe2\x89\x8f"), + ENTITY_DEF("dd", 8518, "\xe2\x85\x86"), + ENTITY_DEF("tscy", 1094, "\xd1\x86"), + ENTITY_DEF("oS", 9416, "\xe2\x93\x88"), + ENTITY_DEF("succeq", 10928, "\xe2\xaa\xb0"), + ENTITY_DEF("xharr", 10231, "\xe2\x9f\xb7"), + ENTITY_DEF("pluse", 10866, "\xe2\xa9\xb2"), + ENTITY_DEF("rfisht", 10621, "\xe2\xa5\xbd"), + ENTITY_DEF("HorizontalLine", 9472, "\xe2\x94\x80"), + ENTITY_DEF("DiacriticalAcute", 180, "\xc2\xb4"), + ENTITY_DEF("hfr", 120101, "\xf0\x9d\x94\xa5"), + ENTITY_DEF("preceq", 10927, "\xe2\xaa\xaf"), + ENTITY_DEF("rationals", 8474, "\xe2\x84\x9a"), + ENTITY_DEF_HEUR("Auml", 196, "\xc3\x84"), + ENTITY_DEF("LeftRightArrow", 8596, "\xe2\x86\x94"), + ENTITY_DEF("blacktriangleright", 9656, "\xe2\x96\xb8"), + ENTITY_DEF("dharr", 8642, "\xe2\x87\x82"), + ENTITY_DEF("isin", 8712, "\xe2\x88\x88"), + ENTITY_DEF("ldrushar", 10571, "\xe2\xa5\x8b"), + ENTITY_DEF("squ", 9633, "\xe2\x96\xa1"), + ENTITY_DEF("rbrksld", 10638, "\xe2\xa6\x8e"), + ENTITY_DEF("bigwedge", 8896, "\xe2\x8b\x80"), + ENTITY_DEF("swArr", 8665, "\xe2\x87\x99"), + ENTITY_DEF("IJlig", 306, "\xc4\xb2"), + ENTITY_DEF("harr", 8596, "\xe2\x86\x94"), + ENTITY_DEF("range", 10661, "\xe2\xa6\xa5"), + ENTITY_DEF("urtri", 9721, "\xe2\x97\xb9"), + ENTITY_DEF("NotVerticalBar", 8740, "\xe2\x88\xa4"), + ENTITY_DEF("ic", 8291, "\xe2\x81\xa3"), + ENTITY_DEF("solbar", 9023, "\xe2\x8c\xbf"), + ENTITY_DEF("approx", 8776, "\xe2\x89\x88"), + ENTITY_DEF("SquareSuperset", 8848, "\xe2\x8a\x90"), + ENTITY_DEF("numsp", 8199, "\xe2\x80\x87"), + ENTITY_DEF("nLt", 8810, "\xe2\x89\xaa\xe2\x83\x92"), + ENTITY_DEF("tilde", 732, "\xcb\x9c"), + ENTITY_DEF("rlarr", 8644, "\xe2\x87\x84"), + ENTITY_DEF("langle", 10216, "\xe2\x9f\xa8"), + ENTITY_DEF("nleqslant", 10877, "\xe2\xa9\xbd\xcc\xb8"), + ENTITY_DEF("Nacute", 323, "\xc5\x83"), + ENTITY_DEF("NotLeftTriangle", 8938, "\xe2\x8b\xaa"), + ENTITY_DEF("sopf", 120164, "\xf0\x9d\x95\xa4"), + ENTITY_DEF("xmap", 10236, "\xe2\x9f\xbc"), + ENTITY_DEF("supne", 8843, "\xe2\x8a\x8b"), + ENTITY_DEF("Int", 8748, "\xe2\x88\xac"), + ENTITY_DEF("nsupseteqq", 10950, "\xe2\xab\x86\xcc\xb8"), + ENTITY_DEF("circlearrowright", 8635, "\xe2\x86\xbb"), + ENTITY_DEF("NotCongruent", 8802, "\xe2\x89\xa2"), + ENTITY_DEF("Scedil", 350, "\xc5\x9e"), + ENTITY_DEF_HEUR("raquo", 187, "\xc2\xbb"), + ENTITY_DEF("ycy", 1099, "\xd1\x8b"), + ENTITY_DEF("notinvb", 8951, "\xe2\x8b\xb7"), + ENTITY_DEF("andv", 10842, "\xe2\xa9\x9a"), + ENTITY_DEF("nap", 8777, "\xe2\x89\x89"), + ENTITY_DEF("shcy", 1096, "\xd1\x88"), + ENTITY_DEF("ssetmn", 8726, "\xe2\x88\x96"), + ENTITY_DEF("downarrow", 8595, "\xe2\x86\x93"), + ENTITY_DEF("gesdotol", 10884, "\xe2\xaa\x84"), + ENTITY_DEF("Congruent", 8801, "\xe2\x89\xa1"), + ENTITY_DEF_HEUR("pound", 163, "\xc2\xa3"), + ENTITY_DEF("ZeroWidthSpace", 8203, "\xe2\x80\x8b"), + ENTITY_DEF("rdca", 10551, "\xe2\xa4\xb7"), + ENTITY_DEF("rmoust", 9137, "\xe2\x8e\xb1"), + ENTITY_DEF("zcy", 1079, "\xd0\xb7"), + ENTITY_DEF("Square", 9633, "\xe2\x96\xa1"), + ENTITY_DEF("subE", 10949, "\xe2\xab\x85"), + ENTITY_DEF("infintie", 10717, "\xe2\xa7\x9d"), + ENTITY_DEF("Cayleys", 8493, "\xe2\x84\xad"), + ENTITY_DEF("lsaquo", 8249, "\xe2\x80\xb9"), + ENTITY_DEF("realpart", 8476, "\xe2\x84\x9c"), + ENTITY_DEF("nprec", 8832, "\xe2\x8a\x80"), + ENTITY_DEF("RightTriangleBar", 10704, "\xe2\xa7\x90"), + ENTITY_DEF("Kopf", 120130, "\xf0\x9d\x95\x82"), + ENTITY_DEF("Ubreve", 364, "\xc5\xac"), + ENTITY_DEF("Uopf", 120140, "\xf0\x9d\x95\x8c"), + ENTITY_DEF("trianglelefteq", 8884, "\xe2\x8a\xb4"), + ENTITY_DEF("rotimes", 10805, "\xe2\xa8\xb5"), + ENTITY_DEF("qfr", 120110, "\xf0\x9d\x94\xae"), + ENTITY_DEF("gtcc", 10919, "\xe2\xaa\xa7"), + ENTITY_DEF("fnof", 402, "\xc6\x92"), + ENTITY_DEF("tritime", 10811, "\xe2\xa8\xbb"), + ENTITY_DEF("andslope", 10840, "\xe2\xa9\x98"), + ENTITY_DEF("harrw", 8621, "\xe2\x86\xad"), + ENTITY_DEF("NotSquareSuperset", 8848, "\xe2\x8a\x90\xcc\xb8"), + ENTITY_DEF("Amacr", 256, "\xc4\x80"), + ENTITY_DEF("OpenCurlyDoubleQuote", 8220, "\xe2\x80\x9c"), + ENTITY_DEF_HEUR("thorn", 254, "\xc3\xbe"), + ENTITY_DEF_HEUR("ordf", 170, "\xc2\xaa"), + ENTITY_DEF("natur", 9838, "\xe2\x99\xae"), + ENTITY_DEF("xi", 958, "\xce\xbe"), + ENTITY_DEF("infin", 8734, "\xe2\x88\x9e"), + ENTITY_DEF("nspar", 8742, "\xe2\x88\xa6"), + ENTITY_DEF("Jcy", 1049, "\xd0\x99"), + ENTITY_DEF("DownLeftTeeVector", 10590, "\xe2\xa5\x9e"), + ENTITY_DEF("rbarr", 10509, "\xe2\xa4\x8d"), + ENTITY_DEF("Xi", 926, "\xce\x9e"), + ENTITY_DEF("bull", 8226, "\xe2\x80\xa2"), + ENTITY_DEF("cuesc", 8927, "\xe2\x8b\x9f"), + ENTITY_DEF("backcong", 8780, "\xe2\x89\x8c"), + ENTITY_DEF("frac35", 8535, "\xe2\x85\x97"), + ENTITY_DEF("hscr", 119997, "\xf0\x9d\x92\xbd"), + ENTITY_DEF("LessEqualGreater", 8922, "\xe2\x8b\x9a"), + ENTITY_DEF("Implies", 8658, "\xe2\x87\x92"), + ENTITY_DEF("ETH", 208, "\xc3\x90"), + ENTITY_DEF_HEUR("Yacute", 221, "\xc3\x9d"), + ENTITY_DEF_HEUR("shy", 173, "\xc2\xad"), + ENTITY_DEF("Rarrtl", 10518, "\xe2\xa4\x96"), + ENTITY_DEF_HEUR("sup1", 185, "\xc2\xb9"), + ENTITY_DEF("reals", 8477, "\xe2\x84\x9d"), + ENTITY_DEF("blacklozenge", 10731, "\xe2\xa7\xab"), + ENTITY_DEF("ncedil", 326, "\xc5\x86"), + ENTITY_DEF("Lambda", 923, "\xce\x9b"), + ENTITY_DEF("uopf", 120166, "\xf0\x9d\x95\xa6"), + ENTITY_DEF("bigodot", 10752, "\xe2\xa8\x80"), + ENTITY_DEF("ubreve", 365, "\xc5\xad"), + ENTITY_DEF("drbkarow", 10512, "\xe2\xa4\x90"), + ENTITY_DEF("els", 10901, "\xe2\xaa\x95"), + ENTITY_DEF("shortparallel", 8741, "\xe2\x88\xa5"), + ENTITY_DEF("Pcy", 1055, "\xd0\x9f"), + ENTITY_DEF("dsol", 10742, "\xe2\xa7\xb6"), + ENTITY_DEF("supsim", 10952, "\xe2\xab\x88"), + ENTITY_DEF("Longrightarrow", 10233, "\xe2\x9f\xb9"), + ENTITY_DEF("ThickSpace", 8287, "\xe2\x81\x9f\xe2\x80\x8a"), + ENTITY_DEF("Itilde", 296, "\xc4\xa8"), + ENTITY_DEF("nparallel", 8742, "\xe2\x88\xa6"), + ENTITY_DEF("And", 10835, "\xe2\xa9\x93"), + ENTITY_DEF("boxhd", 9516, "\xe2\x94\xac"), + ENTITY_DEF("Dashv", 10980, "\xe2\xab\xa4"), + ENTITY_DEF("NotSuperset", 8835, "\xe2\x8a\x83\xe2\x83\x92"), + ENTITY_DEF("Eta", 919, "\xce\x97"), + ENTITY_DEF("Qopf", 8474, "\xe2\x84\x9a"), + ENTITY_DEF("period", 46, "\x2e"), + ENTITY_DEF("angmsd", 8737, "\xe2\x88\xa1"), + ENTITY_DEF("fllig", 64258, "\xef\xac\x82"), + ENTITY_DEF("cuvee", 8910, "\xe2\x8b\x8e"), + ENTITY_DEF("wedbar", 10847, "\xe2\xa9\x9f"), + ENTITY_DEF("Fscr", 8497, "\xe2\x84\xb1"), + ENTITY_DEF("veebar", 8891, "\xe2\x8a\xbb"), + ENTITY_DEF("Longleftrightarrow", 10234, "\xe2\x9f\xba"), + ENTITY_DEF_HEUR("reg", 174, "\xc2\xae"), + ENTITY_DEF("NegativeMediumSpace", 8203, "\xe2\x80\x8b"), + ENTITY_DEF("Upsi", 978, "\xcf\x92"), + ENTITY_DEF("Mellintrf", 8499, "\xe2\x84\xb3"), + ENTITY_DEF("boxHU", 9577, "\xe2\x95\xa9"), + ENTITY_DEF("frac56", 8538, "\xe2\x85\x9a"), + ENTITY_DEF("utrif", 9652, "\xe2\x96\xb4"), + ENTITY_DEF("LeftTriangle", 8882, "\xe2\x8a\xb2"), + ENTITY_DEF("nsime", 8772, "\xe2\x89\x84"), + ENTITY_DEF("rcedil", 343, "\xc5\x97"), + ENTITY_DEF("aogon", 261, "\xc4\x85"), + ENTITY_DEF("uHar", 10595, "\xe2\xa5\xa3"), + ENTITY_DEF("ForAll", 8704, "\xe2\x88\x80"), + ENTITY_DEF("prE", 10931, "\xe2\xaa\xb3"), + ENTITY_DEF("boxV", 9553, "\xe2\x95\x91"), + ENTITY_DEF("softcy", 1100, "\xd1\x8c"), + ENTITY_DEF("hercon", 8889, "\xe2\x8a\xb9"), + ENTITY_DEF("lmoustache", 9136, "\xe2\x8e\xb0"), + ENTITY_DEF("Product", 8719, "\xe2\x88\x8f"), + ENTITY_DEF("lsimg", 10895, "\xe2\xaa\x8f"), + ENTITY_DEF("verbar", 124, "\x7c"), + ENTITY_DEF("ofcir", 10687, "\xe2\xa6\xbf"), + ENTITY_DEF("curlyeqprec", 8926, "\xe2\x8b\x9e"), + ENTITY_DEF("ldquo", 8220, "\xe2\x80\x9c"), + ENTITY_DEF("bot", 8869, "\xe2\x8a\xa5"), + ENTITY_DEF("Psi", 936, "\xce\xa8"), + ENTITY_DEF("OElig", 338, "\xc5\x92"), + ENTITY_DEF("DownRightVectorBar", 10583, "\xe2\xa5\x97"), + ENTITY_DEF("minusb", 8863, "\xe2\x8a\x9f"), + ENTITY_DEF("Iscr", 8464, "\xe2\x84\x90"), + ENTITY_DEF("Tcedil", 354, "\xc5\xa2"), + ENTITY_DEF("ffilig", 64259, "\xef\xac\x83"), + ENTITY_DEF("Gcy", 1043, "\xd0\x93"), + ENTITY_DEF("oline", 8254, "\xe2\x80\xbe"), + ENTITY_DEF("bottom", 8869, "\xe2\x8a\xa5"), + ENTITY_DEF("nVDash", 8879, "\xe2\x8a\xaf"), + ENTITY_DEF("lessdot", 8918, "\xe2\x8b\x96"), + ENTITY_DEF("cups", 8746, "\xe2\x88\xaa\xef\xb8\x80"), + ENTITY_DEF("gla", 10917, "\xe2\xaa\xa5"), + ENTITY_DEF("hellip", 8230, "\xe2\x80\xa6"), + ENTITY_DEF("hookleftarrow", 8617, "\xe2\x86\xa9"), + ENTITY_DEF("Cup", 8915, "\xe2\x8b\x93"), + ENTITY_DEF("upsi", 965, "\xcf\x85"), + ENTITY_DEF("DownArrowBar", 10515, "\xe2\xa4\x93"), + ENTITY_DEF("lowast", 8727, "\xe2\x88\x97"), + ENTITY_DEF("profline", 8978, "\xe2\x8c\x92"), + ENTITY_DEF("ngsim", 8821, "\xe2\x89\xb5"), + ENTITY_DEF("boxhu", 9524, "\xe2\x94\xb4"), + ENTITY_DEF("operp", 10681, "\xe2\xa6\xb9"), + ENTITY_DEF("cap", 8745, "\xe2\x88\xa9"), + ENTITY_DEF("Hcirc", 292, "\xc4\xa4"), + ENTITY_DEF("Ncy", 1053, "\xd0\x9d"), + ENTITY_DEF("zeetrf", 8488, "\xe2\x84\xa8"), + ENTITY_DEF("cuepr", 8926, "\xe2\x8b\x9e"), + ENTITY_DEF("supsetneq", 8843, "\xe2\x8a\x8b"), + ENTITY_DEF("lfloor", 8970, "\xe2\x8c\x8a"), + ENTITY_DEF("ngtr", 8815, "\xe2\x89\xaf"), + ENTITY_DEF("ccups", 10828, "\xe2\xa9\x8c"), + ENTITY_DEF("pscr", 120005, "\xf0\x9d\x93\x85"), + ENTITY_DEF("Cfr", 8493, "\xe2\x84\xad"), + ENTITY_DEF("dtri", 9663, "\xe2\x96\xbf"), + ENTITY_DEF("icirc", 238, "\xc3\xae"), + ENTITY_DEF("leftarrow", 8592, "\xe2\x86\x90"), + ENTITY_DEF("vdash", 8866, "\xe2\x8a\xa2"), + ENTITY_DEF("leftrightharpoons", 8651, "\xe2\x87\x8b"), + ENTITY_DEF("rightrightarrows", 8649, "\xe2\x87\x89"), + ENTITY_DEF("strns", 175, "\xc2\xaf"), + ENTITY_DEF("intlarhk", 10775, "\xe2\xa8\x97"), + ENTITY_DEF("downharpoonright", 8642, "\xe2\x87\x82"), + ENTITY_DEF_HEUR("yacute", 253, "\xc3\xbd"), + ENTITY_DEF("boxUr", 9561, "\xe2\x95\x99"), + ENTITY_DEF("triangleleft", 9667, "\xe2\x97\x83"), + ENTITY_DEF("DiacriticalDot", 729, "\xcb\x99"), + ENTITY_DEF("thetav", 977, "\xcf\x91"), + ENTITY_DEF("OverBracket", 9140, "\xe2\x8e\xb4"), + ENTITY_DEF("PrecedesTilde", 8830, "\xe2\x89\xbe"), + ENTITY_DEF("rtrie", 8885, "\xe2\x8a\xb5"), + ENTITY_DEF("Scirc", 348, "\xc5\x9c"), + ENTITY_DEF("vsupne", 8843, "\xe2\x8a\x8b\xef\xb8\x80"), + ENTITY_DEF("OverBrace", 9182, "\xe2\x8f\x9e"), + ENTITY_DEF("Yfr", 120092, "\xf0\x9d\x94\x9c"), + ENTITY_DEF("scnE", 10934, "\xe2\xaa\xb6"), + ENTITY_DEF("simlE", 10911, "\xe2\xaa\x9f"), + ENTITY_DEF("Proportional", 8733, "\xe2\x88\x9d"), + ENTITY_DEF("edot", 279, "\xc4\x97"), + ENTITY_DEF("loang", 10220, "\xe2\x9f\xac"), + ENTITY_DEF("gesdot", 10880, "\xe2\xaa\x80"), + ENTITY_DEF("DownBreve", 785, "\xcc\x91"), + ENTITY_DEF("pcy", 1087, "\xd0\xbf"), + ENTITY_DEF("Succeeds", 8827, "\xe2\x89\xbb"), + ENTITY_DEF("mfr", 120106, "\xf0\x9d\x94\xaa"), + ENTITY_DEF("Leftarrow", 8656, "\xe2\x87\x90"), + ENTITY_DEF("boxDr", 9555, "\xe2\x95\x93"), + ENTITY_DEF("Nscr", 119977, "\xf0\x9d\x92\xa9"), + ENTITY_DEF("diam", 8900, "\xe2\x8b\x84"), + ENTITY_DEF("CHcy", 1063, "\xd0\xa7"), + ENTITY_DEF("boxdr", 9484, "\xe2\x94\x8c"), + ENTITY_DEF("rlm", 8207, "\xe2\x80\x8f"), + ENTITY_DEF("Coproduct", 8720, "\xe2\x88\x90"), + ENTITY_DEF("RightTeeArrow", 8614, "\xe2\x86\xa6"), + ENTITY_DEF("tridot", 9708, "\xe2\x97\xac"), + ENTITY_DEF("ldquor", 8222, "\xe2\x80\x9e"), + ENTITY_DEF("sol", 47, "\x2f"), + ENTITY_DEF_HEUR("ecirc", 234, "\xc3\xaa"), + ENTITY_DEF("DoubleLeftArrow", 8656, "\xe2\x87\x90"), + ENTITY_DEF("Gscr", 119970, "\xf0\x9d\x92\xa2"), + ENTITY_DEF("ap", 8776, "\xe2\x89\x88"), + ENTITY_DEF("rbrke", 10636, "\xe2\xa6\x8c"), + ENTITY_DEF("LeftFloor", 8970, "\xe2\x8c\x8a"), + ENTITY_DEF("blk12", 9618, "\xe2\x96\x92"), + ENTITY_DEF("Conint", 8751, "\xe2\x88\xaf"), + ENTITY_DEF("triangledown", 9663, "\xe2\x96\xbf"), + ENTITY_DEF("Icy", 1048, "\xd0\x98"), + ENTITY_DEF("backprime", 8245, "\xe2\x80\xb5"), + ENTITY_DEF("longleftrightarrow", 10231, "\xe2\x9f\xb7"), + ENTITY_DEF("ntriangleleft", 8938, "\xe2\x8b\xaa"), + ENTITY_DEF_HEUR("copy", 169, "\xc2\xa9"), + ENTITY_DEF("mapstodown", 8615, "\xe2\x86\xa7"), + ENTITY_DEF("seArr", 8664, "\xe2\x87\x98"), + ENTITY_DEF("ENG", 330, "\xc5\x8a"), + ENTITY_DEF("DoubleRightArrow", 8658, "\xe2\x87\x92"), + ENTITY_DEF("tfr", 120113, "\xf0\x9d\x94\xb1"), + ENTITY_DEF("rharul", 10604, "\xe2\xa5\xac"), + ENTITY_DEF("bfr", 120095, "\xf0\x9d\x94\x9f"), + ENTITY_DEF("origof", 8886, "\xe2\x8a\xb6"), + ENTITY_DEF("Therefore", 8756, "\xe2\x88\xb4"), + ENTITY_DEF("glE", 10898, "\xe2\xaa\x92"), + ENTITY_DEF("leftarrowtail", 8610, "\xe2\x86\xa2"), + ENTITY_DEF("NotEqual", 8800, "\xe2\x89\xa0"), + ENTITY_DEF("LeftCeiling", 8968, "\xe2\x8c\x88"), + ENTITY_DEF("lArr", 8656, "\xe2\x87\x90"), + ENTITY_DEF("subseteq", 8838, "\xe2\x8a\x86"), + ENTITY_DEF("larrbfs", 10527, "\xe2\xa4\x9f"), + ENTITY_DEF("Gammad", 988, "\xcf\x9c"), + ENTITY_DEF("rtriltri", 10702, "\xe2\xa7\x8e"), + ENTITY_DEF("Fcy", 1060, "\xd0\xa4"), + ENTITY_DEF("Vopf", 120141, "\xf0\x9d\x95\x8d"), + ENTITY_DEF("lrarr", 8646, "\xe2\x87\x86"), + ENTITY_DEF("delta", 948, "\xce\xb4"), + ENTITY_DEF("xodot", 10752, "\xe2\xa8\x80"), + ENTITY_DEF("larrtl", 8610, "\xe2\x86\xa2"), + ENTITY_DEF("gsim", 8819, "\xe2\x89\xb3"), + ENTITY_DEF("ratail", 10522, "\xe2\xa4\x9a"), + ENTITY_DEF("vsubne", 8842, "\xe2\x8a\x8a\xef\xb8\x80"), + ENTITY_DEF("boxur", 9492, "\xe2\x94\x94"), + ENTITY_DEF("succsim", 8831, "\xe2\x89\xbf"), + ENTITY_DEF("triplus", 10809, "\xe2\xa8\xb9"), + ENTITY_DEF("nless", 8814, "\xe2\x89\xae"), + ENTITY_DEF("uharr", 8638, "\xe2\x86\xbe"), + ENTITY_DEF("lambda", 955, "\xce\xbb"), + ENTITY_DEF_HEUR("uuml", 252, "\xc3\xbc"), + ENTITY_DEF("horbar", 8213, "\xe2\x80\x95"), + ENTITY_DEF("ccirc", 265, "\xc4\x89"), + ENTITY_DEF("sqcup", 8852, "\xe2\x8a\x94"), + ENTITY_DEF("Pscr", 119979, "\xf0\x9d\x92\xab"), + ENTITY_DEF("supsup", 10966, "\xe2\xab\x96"), + ENTITY_DEF("Cacute", 262, "\xc4\x86"), + ENTITY_DEF("upsih", 978, "\xcf\x92"), + ENTITY_DEF("precsim", 8830, "\xe2\x89\xbe"), + ENTITY_DEF("longrightarrow", 10230, "\xe2\x9f\xb6"), + ENTITY_DEF("circledR", 174, "\xc2\xae"), + ENTITY_DEF("UpTeeArrow", 8613, "\xe2\x86\xa5"), + ENTITY_DEF("bepsi", 1014, "\xcf\xb6"), + ENTITY_DEF("oast", 8859, "\xe2\x8a\x9b"), + ENTITY_DEF("yfr", 120118, "\xf0\x9d\x94\xb6"), + ENTITY_DEF("rdsh", 8627, "\xe2\x86\xb3"), + ENTITY_DEF("Ograve", 210, "\xc3\x92"), + ENTITY_DEF("LeftVectorBar", 10578, "\xe2\xa5\x92"), + ENTITY_DEF("NotNestedLessLess", 10913, "\xe2\xaa\xa1\xcc\xb8"), + ENTITY_DEF("Jscr", 119973, "\xf0\x9d\x92\xa5"), + ENTITY_DEF("psi", 968, "\xcf\x88"), + ENTITY_DEF("orarr", 8635, "\xe2\x86\xbb"), + ENTITY_DEF("Subset", 8912, "\xe2\x8b\x90"), + ENTITY_DEF("curarr", 8631, "\xe2\x86\xb7"), + ENTITY_DEF("CirclePlus", 8853, "\xe2\x8a\x95"), + ENTITY_DEF("gtrless", 8823, "\xe2\x89\xb7"), + ENTITY_DEF("nvle", 8804, "\xe2\x89\xa4\xe2\x83\x92"), + ENTITY_DEF("prop", 8733, "\xe2\x88\x9d"), + ENTITY_DEF("gEl", 10892, "\xe2\xaa\x8c"), + ENTITY_DEF("gtlPar", 10645, "\xe2\xa6\x95"), + ENTITY_DEF("frasl", 8260, "\xe2\x81\x84"), + ENTITY_DEF("nearr", 8599, "\xe2\x86\x97"), + ENTITY_DEF("NotSubsetEqual", 8840, "\xe2\x8a\x88"), + ENTITY_DEF("planck", 8463, "\xe2\x84\x8f"), + ENTITY_DEF_HEUR("Uuml", 220, "\xc3\x9c"), + ENTITY_DEF("spadesuit", 9824, "\xe2\x99\xa0"), + ENTITY_DEF_HEUR("sect", 167, "\xc2\xa7"), + ENTITY_DEF("cdot", 267, "\xc4\x8b"), + ENTITY_DEF("boxVh", 9579, "\xe2\x95\xab"), + ENTITY_DEF("zscr", 120015, "\xf0\x9d\x93\x8f"), + ENTITY_DEF("nsqsube", 8930, "\xe2\x8b\xa2"), + ENTITY_DEF("grave", 96, "\x60"), + ENTITY_DEF("angrtvb", 8894, "\xe2\x8a\xbe"), + ENTITY_DEF("MediumSpace", 8287, "\xe2\x81\x9f"), + ENTITY_DEF("Ntilde", 209, "\xc3\x91"), + ENTITY_DEF("solb", 10692, "\xe2\xa7\x84"), + ENTITY_DEF("angzarr", 9084, "\xe2\x8d\xbc"), + ENTITY_DEF("nopf", 120159, "\xf0\x9d\x95\x9f"), + ENTITY_DEF("rtrif", 9656, "\xe2\x96\xb8"), + ENTITY_DEF("nrightarrow", 8603, "\xe2\x86\x9b"), + ENTITY_DEF("Kappa", 922, "\xce\x9a"), + ENTITY_DEF("simrarr", 10610, "\xe2\xa5\xb2"), + ENTITY_DEF("imacr", 299, "\xc4\xab"), + ENTITY_DEF("vrtri", 8883, "\xe2\x8a\xb3"), + ENTITY_DEF("part", 8706, "\xe2\x88\x82"), + ENTITY_DEF("esim", 8770, "\xe2\x89\x82"), + ENTITY_DEF_HEUR("atilde", 227, "\xc3\xa3"), + ENTITY_DEF("DownRightTeeVector", 10591, "\xe2\xa5\x9f"), + ENTITY_DEF("jcirc", 309, "\xc4\xb5"), + ENTITY_DEF("Ecaron", 282, "\xc4\x9a"), + ENTITY_DEF("VerticalSeparator", 10072, "\xe2\x9d\x98"), + ENTITY_DEF("rHar", 10596, "\xe2\xa5\xa4"), + ENTITY_DEF("rcaron", 345, "\xc5\x99"), + ENTITY_DEF("subnE", 10955, "\xe2\xab\x8b"), + ENTITY_DEF("ii", 8520, "\xe2\x85\x88"), + ENTITY_DEF("Cconint", 8752, "\xe2\x88\xb0"), + ENTITY_DEF("Mcy", 1052, "\xd0\x9c"), + ENTITY_DEF("eqcolon", 8789, "\xe2\x89\x95"), + ENTITY_DEF("cupor", 10821, "\xe2\xa9\x85"), + ENTITY_DEF("DoubleUpArrow", 8657, "\xe2\x87\x91"), + ENTITY_DEF("boxbox", 10697, "\xe2\xa7\x89"), + ENTITY_DEF("setminus", 8726, "\xe2\x88\x96"), + ENTITY_DEF("Lleftarrow", 8666, "\xe2\x87\x9a"), + ENTITY_DEF("nang", 8736, "\xe2\x88\xa0\xe2\x83\x92"), + ENTITY_DEF("TRADE", 8482, "\xe2\x84\xa2"), + ENTITY_DEF("urcorner", 8989, "\xe2\x8c\x9d"), + ENTITY_DEF("lsqb", 91, "\x5b"), + ENTITY_DEF("cupcup", 10826, "\xe2\xa9\x8a"), + ENTITY_DEF("kjcy", 1116, "\xd1\x9c"), + ENTITY_DEF("llhard", 10603, "\xe2\xa5\xab"), + ENTITY_DEF("mumap", 8888, "\xe2\x8a\xb8"), + ENTITY_DEF("iiint", 8749, "\xe2\x88\xad"), + ENTITY_DEF("RightTee", 8866, "\xe2\x8a\xa2"), + ENTITY_DEF("Tcaron", 356, "\xc5\xa4"), + ENTITY_DEF("bigcirc", 9711, "\xe2\x97\xaf"), + ENTITY_DEF("trianglerighteq", 8885, "\xe2\x8a\xb5"), + ENTITY_DEF("NotLessGreater", 8824, "\xe2\x89\xb8"), + ENTITY_DEF("hArr", 8660, "\xe2\x87\x94"), + ENTITY_DEF("ocy", 1086, "\xd0\xbe"), + ENTITY_DEF("tosa", 10537, "\xe2\xa4\xa9"), + ENTITY_DEF("twixt", 8812, "\xe2\x89\xac"), + ENTITY_DEF("square", 9633, "\xe2\x96\xa1"), + ENTITY_DEF("Otimes", 10807, "\xe2\xa8\xb7"), + ENTITY_DEF("Kcedil", 310, "\xc4\xb6"), + ENTITY_DEF("beth", 8502, "\xe2\x84\xb6"), + ENTITY_DEF("triminus", 10810, "\xe2\xa8\xba"), + ENTITY_DEF("nlArr", 8653, "\xe2\x87\x8d"), + ENTITY_DEF("Oacute", 211, "\xc3\x93"), + ENTITY_DEF("zwnj", 8204, "\xe2\x80\x8c"), + ENTITY_DEF("ll", 8810, "\xe2\x89\xaa"), + ENTITY_DEF("smashp", 10803, "\xe2\xa8\xb3"), + ENTITY_DEF("ngeqq", 8807, "\xe2\x89\xa7\xcc\xb8"), + ENTITY_DEF("rnmid", 10990, "\xe2\xab\xae"), + ENTITY_DEF("nwArr", 8662, "\xe2\x87\x96"), + ENTITY_DEF("RightUpDownVector", 10575, "\xe2\xa5\x8f"), + ENTITY_DEF("lbbrk", 10098, "\xe2\x9d\xb2"), + ENTITY_DEF("compfn", 8728, "\xe2\x88\x98"), + ENTITY_DEF("eDDot", 10871, "\xe2\xa9\xb7"), + ENTITY_DEF("Jsercy", 1032, "\xd0\x88"), + ENTITY_DEF("HARDcy", 1066, "\xd0\xaa"), + ENTITY_DEF("nexists", 8708, "\xe2\x88\x84"), + ENTITY_DEF("theta", 952, "\xce\xb8"), + ENTITY_DEF("plankv", 8463, "\xe2\x84\x8f"), + ENTITY_DEF_HEUR("sup2", 178, "\xc2\xb2"), + ENTITY_DEF("lessapprox", 10885, "\xe2\xaa\x85"), + ENTITY_DEF("gdot", 289, "\xc4\xa1"), + ENTITY_DEF("angmsdae", 10668, "\xe2\xa6\xac"), + ENTITY_DEF("Superset", 8835, "\xe2\x8a\x83"), + ENTITY_DEF("prap", 10935, "\xe2\xaa\xb7"), + ENTITY_DEF("Zscr", 119989, "\xf0\x9d\x92\xb5"), + ENTITY_DEF("nsucc", 8833, "\xe2\x8a\x81"), + ENTITY_DEF("supseteqq", 10950, "\xe2\xab\x86"), + ENTITY_DEF("UpTee", 8869, "\xe2\x8a\xa5"), + ENTITY_DEF("LowerLeftArrow", 8601, "\xe2\x86\x99"), + ENTITY_DEF("ssmile", 8995, "\xe2\x8c\xa3"), + ENTITY_DEF("niv", 8715, "\xe2\x88\x8b"), + ENTITY_DEF("bigvee", 8897, "\xe2\x8b\x81"), + ENTITY_DEF("kscr", 120000, "\xf0\x9d\x93\x80"), + ENTITY_DEF("xutri", 9651, "\xe2\x96\xb3"), + ENTITY_DEF("caret", 8257, "\xe2\x81\x81"), + ENTITY_DEF("caron", 711, "\xcb\x87"), + ENTITY_DEF("Wedge", 8896, "\xe2\x8b\x80"), + ENTITY_DEF("sdotb", 8865, "\xe2\x8a\xa1"), + ENTITY_DEF("bigoplus", 10753, "\xe2\xa8\x81"), + ENTITY_DEF("Breve", 728, "\xcb\x98"), + ENTITY_DEF("ImaginaryI", 8520, "\xe2\x85\x88"), + ENTITY_DEF("longmapsto", 10236, "\xe2\x9f\xbc"), + ENTITY_DEF("boxVH", 9580, "\xe2\x95\xac"), + ENTITY_DEF("lozenge", 9674, "\xe2\x97\x8a"), + ENTITY_DEF("toea", 10536, "\xe2\xa4\xa8"), + ENTITY_DEF("nbumpe", 8783, "\xe2\x89\x8f\xcc\xb8"), + ENTITY_DEF("gcirc", 285, "\xc4\x9d"), + ENTITY_DEF("NotHumpEqual", 8783, "\xe2\x89\x8f\xcc\xb8"), + ENTITY_DEF("pre", 10927, "\xe2\xaa\xaf"), + ENTITY_DEF("ascr", 119990, "\xf0\x9d\x92\xb6"), + ENTITY_DEF("Acirc", 194, "\xc3\x82"), + ENTITY_DEF("questeq", 8799, "\xe2\x89\x9f"), + ENTITY_DEF("ncaron", 328, "\xc5\x88"), + ENTITY_DEF("LeftTeeArrow", 8612, "\xe2\x86\xa4"), + ENTITY_DEF("xcirc", 9711, "\xe2\x97\xaf"), + ENTITY_DEF("swarr", 8601, "\xe2\x86\x99"), + ENTITY_DEF("MinusPlus", 8723, "\xe2\x88\x93"), + ENTITY_DEF("plus", 43, "\x2b"), + ENTITY_DEF("NotDoubleVerticalBar", 8742, "\xe2\x88\xa6"), + ENTITY_DEF("rppolint", 10770, "\xe2\xa8\x92"), + ENTITY_DEF("NotTildeFullEqual", 8775, "\xe2\x89\x87"), + ENTITY_DEF("ltdot", 8918, "\xe2\x8b\x96"), + ENTITY_DEF("NotNestedGreaterGreater", 10914, "\xe2\xaa\xa2\xcc\xb8"), + ENTITY_DEF("Lscr", 8466, "\xe2\x84\x92"), + ENTITY_DEF("pitchfork", 8916, "\xe2\x8b\x94"), + ENTITY_DEF("Eopf", 120124, "\xf0\x9d\x94\xbc"), + ENTITY_DEF("ropf", 120163, "\xf0\x9d\x95\xa3"), + ENTITY_DEF("Delta", 916, "\xce\x94"), + ENTITY_DEF("lozf", 10731, "\xe2\xa7\xab"), + ENTITY_DEF("RightTeeVector", 10587, "\xe2\xa5\x9b"), + ENTITY_DEF("UpDownArrow", 8597, "\xe2\x86\x95"), + ENTITY_DEF("bump", 8782, "\xe2\x89\x8e"), + ENTITY_DEF("Rscr", 8475, "\xe2\x84\x9b"), + ENTITY_DEF("slarr", 8592, "\xe2\x86\x90"), + ENTITY_DEF("lcy", 1083, "\xd0\xbb"), + ENTITY_DEF("Vee", 8897, "\xe2\x8b\x81"), + ENTITY_DEF("Iogon", 302, "\xc4\xae"), + ENTITY_DEF("minus", 8722, "\xe2\x88\x92"), + ENTITY_DEF("GreaterFullEqual", 8807, "\xe2\x89\xa7"), + ENTITY_DEF("xhArr", 10234, "\xe2\x9f\xba"), + ENTITY_DEF("shortmid", 8739, "\xe2\x88\xa3"), + ENTITY_DEF("DoubleDownArrow", 8659, "\xe2\x87\x93"), + ENTITY_DEF("Wscr", 119986, "\xf0\x9d\x92\xb2"), + ENTITY_DEF("rang", 10217, "\xe2\x9f\xa9"), + ENTITY_DEF("lcub", 123, "\x7b"), + ENTITY_DEF("mnplus", 8723, "\xe2\x88\x93"), + ENTITY_DEF("ulcrop", 8975, "\xe2\x8c\x8f"), + ENTITY_DEF("wfr", 120116, "\xf0\x9d\x94\xb4"), + ENTITY_DEF("DifferentialD", 8518, "\xe2\x85\x86"), + ENTITY_DEF("ThinSpace", 8201, "\xe2\x80\x89"), + ENTITY_DEF("NotGreaterGreater", 8811, "\xe2\x89\xab\xcc\xb8"), + ENTITY_DEF("Topf", 120139, "\xf0\x9d\x95\x8b"), + ENTITY_DEF("sbquo", 8218, "\xe2\x80\x9a"), + ENTITY_DEF("sdot", 8901, "\xe2\x8b\x85"), + ENTITY_DEF("DoubleLeftTee", 10980, "\xe2\xab\xa4"), + ENTITY_DEF("vBarv", 10985, "\xe2\xab\xa9"), + ENTITY_DEF("subne", 8842, "\xe2\x8a\x8a"), + ENTITY_DEF("gtrdot", 8919, "\xe2\x8b\x97"), + ENTITY_DEF("opar", 10679, "\xe2\xa6\xb7"), + ENTITY_DEF("apid", 8779, "\xe2\x89\x8b"), + ENTITY_DEF("Cross", 10799, "\xe2\xa8\xaf"), + ENTITY_DEF("lhblk", 9604, "\xe2\x96\x84"), + ENTITY_DEF("capcap", 10827, "\xe2\xa9\x8b"), + ENTITY_DEF("midast", 42, "\x2a"), + ENTITY_DEF("lscr", 120001, "\xf0\x9d\x93\x81"), + ENTITY_DEF("nGt", 8811, "\xe2\x89\xab\xe2\x83\x92"), + ENTITY_DEF_HEUR("Euml", 203, "\xc3\x8b"), + ENTITY_DEF("blacktriangledown", 9662, "\xe2\x96\xbe"), + ENTITY_DEF("Rcy", 1056, "\xd0\xa0"), + ENTITY_DEF("dfisht", 10623, "\xe2\xa5\xbf"), + ENTITY_DEF("dashv", 8867, "\xe2\x8a\xa3"), + ENTITY_DEF("ast", 42, "\x2a"), + ENTITY_DEF("ContourIntegral", 8750, "\xe2\x88\xae"), + ENTITY_DEF("Ofr", 120082, "\xf0\x9d\x94\x92"), + ENTITY_DEF("Lcy", 1051, "\xd0\x9b"), + ENTITY_DEF("nltrie", 8940, "\xe2\x8b\xac"), + ENTITY_DEF("ShortUpArrow", 8593, "\xe2\x86\x91"), + ENTITY_DEF("acy", 1072, "\xd0\xb0"), + ENTITY_DEF("rightarrow", 8594, "\xe2\x86\x92"), + ENTITY_DEF("UnderBar", 95, "\x5f"), + ENTITY_DEF("LongLeftArrow", 10229, "\xe2\x9f\xb5"), + ENTITY_DEF("andd", 10844, "\xe2\xa9\x9c"), + ENTITY_DEF("xlarr", 10229, "\xe2\x9f\xb5"), + ENTITY_DEF("percnt", 37, "\x25"), + ENTITY_DEF("rharu", 8640, "\xe2\x87\x80"), + ENTITY_DEF("plusdo", 8724, "\xe2\x88\x94"), + ENTITY_DEF("TScy", 1062, "\xd0\xa6"), + ENTITY_DEF("kcy", 1082, "\xd0\xba"), + ENTITY_DEF("boxVR", 9568, "\xe2\x95\xa0"), + ENTITY_DEF("looparrowleft", 8619, "\xe2\x86\xab"), + ENTITY_DEF("scirc", 349, "\xc5\x9d"), + ENTITY_DEF("drcorn", 8991, "\xe2\x8c\x9f"), + ENTITY_DEF("iiota", 8489, "\xe2\x84\xa9"), + ENTITY_DEF("Zcy", 1047, "\xd0\x97"), + ENTITY_DEF("frac58", 8541, "\xe2\x85\x9d"), + ENTITY_DEF("alpha", 945, "\xce\xb1"), + ENTITY_DEF("daleth", 8504, "\xe2\x84\xb8"), + ENTITY_DEF("gtreqless", 8923, "\xe2\x8b\x9b"), + ENTITY_DEF("tstrok", 359, "\xc5\xa7"), + ENTITY_DEF("plusb", 8862, "\xe2\x8a\x9e"), + ENTITY_DEF("odsold", 10684, "\xe2\xa6\xbc"), + ENTITY_DEF("varsupsetneqq", 10956, "\xe2\xab\x8c\xef\xb8\x80"), + ENTITY_DEF_HEUR("otilde", 245, "\xc3\xb5"), + ENTITY_DEF("gtcir", 10874, "\xe2\xa9\xba"), + ENTITY_DEF("lltri", 9722, "\xe2\x97\xba"), + ENTITY_DEF("rx", 8478, "\xe2\x84\x9e"), + ENTITY_DEF("ljcy", 1113, "\xd1\x99"), + ENTITY_DEF("parsim", 10995, "\xe2\xab\xb3"), + ENTITY_DEF("NotElement", 8713, "\xe2\x88\x89"), + ENTITY_DEF_HEUR("plusmn", 177, "\xc2\xb1"), + ENTITY_DEF("varsubsetneq", 8842, "\xe2\x8a\x8a\xef\xb8\x80"), + ENTITY_DEF("subset", 8834, "\xe2\x8a\x82"), + ENTITY_DEF("awint", 10769, "\xe2\xa8\x91"), + ENTITY_DEF("laemptyv", 10676, "\xe2\xa6\xb4"), + ENTITY_DEF("phiv", 981, "\xcf\x95"), + ENTITY_DEF("sfrown", 8994, "\xe2\x8c\xa2"), + ENTITY_DEF("DoubleUpDownArrow", 8661, "\xe2\x87\x95"), + ENTITY_DEF("lpar", 40, "\x28"), + ENTITY_DEF("frac45", 8536, "\xe2\x85\x98"), + ENTITY_DEF("rBarr", 10511, "\xe2\xa4\x8f"), + ENTITY_DEF("npolint", 10772, "\xe2\xa8\x94"), + ENTITY_DEF("emacr", 275, "\xc4\x93"), + ENTITY_DEF("maltese", 10016, "\xe2\x9c\xa0"), + ENTITY_DEF("PlusMinus", 177, "\xc2\xb1"), + ENTITY_DEF("ReverseEquilibrium", 8651, "\xe2\x87\x8b"), + ENTITY_DEF("oscr", 8500, "\xe2\x84\xb4"), + ENTITY_DEF("blacksquare", 9642, "\xe2\x96\xaa"), + ENTITY_DEF("TSHcy", 1035, "\xd0\x8b"), + ENTITY_DEF("gap", 10886, "\xe2\xaa\x86"), + ENTITY_DEF("xnis", 8955, "\xe2\x8b\xbb"), + ENTITY_DEF("Ll", 8920, "\xe2\x8b\x98"), + ENTITY_DEF("PrecedesEqual", 10927, "\xe2\xaa\xaf"), + ENTITY_DEF("incare", 8453, "\xe2\x84\x85"), + ENTITY_DEF("nharr", 8622, "\xe2\x86\xae"), + ENTITY_DEF("varnothing", 8709, "\xe2\x88\x85"), + ENTITY_DEF("ShortDownArrow", 8595, "\xe2\x86\x93"), + ENTITY_DEF_HEUR("nbsp", 160, " "), + ENTITY_DEF("asympeq", 8781, "\xe2\x89\x8d"), + ENTITY_DEF("rbrkslu", 10640, "\xe2\xa6\x90"), + ENTITY_DEF("rho", 961, "\xcf\x81"), + ENTITY_DEF("Mscr", 8499, "\xe2\x84\xb3"), + ENTITY_DEF_HEUR("eth", 240, "\xc3\xb0"), + ENTITY_DEF("suplarr", 10619, "\xe2\xa5\xbb"), + ENTITY_DEF("Tab", 9, "\x09"), + ENTITY_DEF("omicron", 959, "\xce\xbf"), + ENTITY_DEF("blacktriangle", 9652, "\xe2\x96\xb4"), + ENTITY_DEF("nldr", 8229, "\xe2\x80\xa5"), + ENTITY_DEF("downharpoonleft", 8643, "\xe2\x87\x83"), + ENTITY_DEF("circledcirc", 8858, "\xe2\x8a\x9a"), + ENTITY_DEF("leftleftarrows", 8647, "\xe2\x87\x87"), + ENTITY_DEF("NotHumpDownHump", 8782, "\xe2\x89\x8e\xcc\xb8"), + ENTITY_DEF("nvgt", 62, "\x3e\xe2\x83\x92"), + ENTITY_DEF("rhard", 8641, "\xe2\x87\x81"), + ENTITY_DEF("nGg", 8921, "\xe2\x8b\x99\xcc\xb8"), + ENTITY_DEF("lurdshar", 10570, "\xe2\xa5\x8a"), + ENTITY_DEF("cirE", 10691, "\xe2\xa7\x83"), + ENTITY_DEF("isinE", 8953, "\xe2\x8b\xb9"), + ENTITY_DEF("eparsl", 10723, "\xe2\xa7\xa3"), + ENTITY_DEF("RightAngleBracket", 10217, "\xe2\x9f\xa9"), + ENTITY_DEF("hcirc", 293, "\xc4\xa5"), + ENTITY_DEF("bumpeq", 8783, "\xe2\x89\x8f"), + ENTITY_DEF("cire", 8791, "\xe2\x89\x97"), + ENTITY_DEF("dotplus", 8724, "\xe2\x88\x94"), + ENTITY_DEF("itilde", 297, "\xc4\xa9"), + ENTITY_DEF("uwangle", 10663, "\xe2\xa6\xa7"), + ENTITY_DEF("rlhar", 8652, "\xe2\x87\x8c"), + ENTITY_DEF("rbrace", 125, "\x7d"), + ENTITY_DEF("mid", 8739, "\xe2\x88\xa3"), + ENTITY_DEF("el", 10905, "\xe2\xaa\x99"), + ENTITY_DEF("KJcy", 1036, "\xd0\x8c"), + ENTITY_DEF("odiv", 10808, "\xe2\xa8\xb8"), + ENTITY_DEF("amacr", 257, "\xc4\x81"), + ENTITY_DEF("qprime", 8279, "\xe2\x81\x97"), + ENTITY_DEF("tcedil", 355, "\xc5\xa3"), + ENTITY_DEF("UpArrowDownArrow", 8645, "\xe2\x87\x85"), + ENTITY_DEF("spades", 9824, "\xe2\x99\xa0"), + ENTITY_DEF("napos", 329, "\xc5\x89"), + ENTITY_DEF("straightepsilon", 1013, "\xcf\xb5"), + ENTITY_DEF("CupCap", 8781, "\xe2\x89\x8d"), + ENTITY_DEF("Oopf", 120134, "\xf0\x9d\x95\x86"), + ENTITY_DEF("sub", 8834, "\xe2\x8a\x82"), + ENTITY_DEF("ohm", 937, "\xce\xa9"), + ENTITY_DEF("UnderBrace", 9183, "\xe2\x8f\x9f"), + ENTITY_DEF("looparrowright", 8620, "\xe2\x86\xac"), + ENTITY_DEF("xotime", 10754, "\xe2\xa8\x82"), + ENTITY_DEF("ntgl", 8825, "\xe2\x89\xb9"), + ENTITY_DEF("minusdu", 10794, "\xe2\xa8\xaa"), + ENTITY_DEF("rarrb", 8677, "\xe2\x87\xa5"), + ENTITY_DEF("nvlArr", 10498, "\xe2\xa4\x82"), + ENTITY_DEF("triangle", 9653, "\xe2\x96\xb5"), + ENTITY_DEF("nacute", 324, "\xc5\x84"), + ENTITY_DEF("boxHD", 9574, "\xe2\x95\xa6"), + ENTITY_DEF("ratio", 8758, "\xe2\x88\xb6"), + ENTITY_DEF("larrsim", 10611, "\xe2\xa5\xb3"), + ENTITY_DEF("LessLess", 10913, "\xe2\xaa\xa1"), + ENTITY_DEF("yacy", 1103, "\xd1\x8f"), + ENTITY_DEF("ctdot", 8943, "\xe2\x8b\xaf"), + ENTITY_DEF("and", 8743, "\xe2\x88\xa7"), + ENTITY_DEF("lrtri", 8895, "\xe2\x8a\xbf"), + ENTITY_DEF("eDot", 8785, "\xe2\x89\x91"), + ENTITY_DEF("sqsub", 8847, "\xe2\x8a\x8f"), + ENTITY_DEF("real", 8476, "\xe2\x84\x9c"), + ENTITY_DEF("Dcy", 1044, "\xd0\x94"), + ENTITY_DEF("vartheta", 977, "\xcf\x91"), + ENTITY_DEF("nsub", 8836, "\xe2\x8a\x84"), + ENTITY_DEF("DownTee", 8868, "\xe2\x8a\xa4"), + ENTITY_DEF_HEUR("acute", 180, "\xc2\xb4"), + ENTITY_DEF("GreaterLess", 8823, "\xe2\x89\xb7"), + ENTITY_DEF("supplus", 10944, "\xe2\xab\x80"), + ENTITY_DEF("Vbar", 10987, "\xe2\xab\xab"), + ENTITY_DEF("divideontimes", 8903, "\xe2\x8b\x87"), + ENTITY_DEF("lsim", 8818, "\xe2\x89\xb2"), + ENTITY_DEF("nearhk", 10532, "\xe2\xa4\xa4"), + ENTITY_DEF("nLtv", 8810, "\xe2\x89\xaa\xcc\xb8"), + ENTITY_DEF("RuleDelayed", 10740, "\xe2\xa7\xb4"), + ENTITY_DEF("smile", 8995, "\xe2\x8c\xa3"), + ENTITY_DEF("coprod", 8720, "\xe2\x88\x90"), + ENTITY_DEF("imof", 8887, "\xe2\x8a\xb7"), + ENTITY_DEF("ecy", 1101, "\xd1\x8d"), + ENTITY_DEF("RightCeiling", 8969, "\xe2\x8c\x89"), + ENTITY_DEF("dlcorn", 8990, "\xe2\x8c\x9e"), + ENTITY_DEF("Nu", 925, "\xce\x9d"), + ENTITY_DEF("frac18", 8539, "\xe2\x85\x9b"), + ENTITY_DEF("diamond", 8900, "\xe2\x8b\x84"), + ENTITY_DEF("Icirc", 206, "\xc3\x8e"), + ENTITY_DEF("ngeq", 8817, "\xe2\x89\xb1"), + ENTITY_DEF("epsilon", 949, "\xce\xb5"), + ENTITY_DEF("fork", 8916, "\xe2\x8b\x94"), + ENTITY_DEF("xrarr", 10230, "\xe2\x9f\xb6"), + ENTITY_DEF("racute", 341, "\xc5\x95"), + ENTITY_DEF("ntlg", 8824, "\xe2\x89\xb8"), + ENTITY_DEF("xvee", 8897, "\xe2\x8b\x81"), + ENTITY_DEF("LeftArrowRightArrow", 8646, "\xe2\x87\x86"), + ENTITY_DEF("DownLeftRightVector", 10576, "\xe2\xa5\x90"), + ENTITY_DEF("Eacute", 201, "\xc3\x89"), + ENTITY_DEF("gimel", 8503, "\xe2\x84\xb7"), + ENTITY_DEF("rtimes", 8906, "\xe2\x8b\x8a"), + ENTITY_DEF("forall", 8704, "\xe2\x88\x80"), + ENTITY_DEF("DiacriticalDoubleAcute", 733, "\xcb\x9d"), + ENTITY_DEF("dArr", 8659, "\xe2\x87\x93"), + ENTITY_DEF("fallingdotseq", 8786, "\xe2\x89\x92"), + ENTITY_DEF("Aogon", 260, "\xc4\x84"), + ENTITY_DEF("PartialD", 8706, "\xe2\x88\x82"), + ENTITY_DEF("mapstoup", 8613, "\xe2\x86\xa5"), + ENTITY_DEF("die", 168, "\xc2\xa8"), + ENTITY_DEF("ngt", 8815, "\xe2\x89\xaf"), + ENTITY_DEF("vcy", 1074, "\xd0\xb2"), + ENTITY_DEF("fjlig", (unsigned) -1, "\x66\x6a"), + ENTITY_DEF("submult", 10945, "\xe2\xab\x81"), + ENTITY_DEF("ubrcy", 1118, "\xd1\x9e"), + ENTITY_DEF("ovbar", 9021, "\xe2\x8c\xbd"), + ENTITY_DEF("bsime", 8909, "\xe2\x8b\x8d"), + ENTITY_DEF("precnsim", 8936, "\xe2\x8b\xa8"), + ENTITY_DEF("DiacriticalTilde", 732, "\xcb\x9c"), + ENTITY_DEF("cwint", 8753, "\xe2\x88\xb1"), + ENTITY_DEF("Scy", 1057, "\xd0\xa1"), + ENTITY_DEF("NotGreaterEqual", 8817, "\xe2\x89\xb1"), + ENTITY_DEF("boxUR", 9562, "\xe2\x95\x9a"), + ENTITY_DEF("LessSlantEqual", 10877, "\xe2\xa9\xbd"), + ENTITY_DEF("Barwed", 8966, "\xe2\x8c\x86"), + ENTITY_DEF("supdot", 10942, "\xe2\xaa\xbe"), + ENTITY_DEF("gel", 8923, "\xe2\x8b\x9b"), + ENTITY_DEF("iscr", 119998, "\xf0\x9d\x92\xbe"), + ENTITY_DEF("doublebarwedge", 8966, "\xe2\x8c\x86"), + ENTITY_DEF("Idot", 304, "\xc4\xb0"), + ENTITY_DEF("DoubleDot", 168, "\xc2\xa8"), + ENTITY_DEF("rsquo", 8217, "\xe2\x80\x99"), + ENTITY_DEF("subsetneqq", 10955, "\xe2\xab\x8b"), + ENTITY_DEF("UpEquilibrium", 10606, "\xe2\xa5\xae"), + ENTITY_DEF("copysr", 8471, "\xe2\x84\x97"), + ENTITY_DEF("RightDoubleBracket", 10215, "\xe2\x9f\xa7"), + ENTITY_DEF("LeftRightVector", 10574, "\xe2\xa5\x8e"), + ENTITY_DEF("DownLeftVectorBar", 10582, "\xe2\xa5\x96"), + ENTITY_DEF("suphsub", 10967, "\xe2\xab\x97"), + ENTITY_DEF_HEUR("cedil", 184, "\xc2\xb8"), + ENTITY_DEF("prurel", 8880, "\xe2\x8a\xb0"), + ENTITY_DEF("imagpart", 8465, "\xe2\x84\x91"), + ENTITY_DEF("Hscr", 8459, "\xe2\x84\x8b"), + ENTITY_DEF("jmath", 567, "\xc8\xb7"), + ENTITY_DEF("nrtrie", 8941, "\xe2\x8b\xad"), + ENTITY_DEF("nsup", 8837, "\xe2\x8a\x85"), + ENTITY_DEF("Ubrcy", 1038, "\xd0\x8e"), + ENTITY_DEF("succnsim", 8937, "\xe2\x8b\xa9"), + ENTITY_DEF("nesim", 8770, "\xe2\x89\x82\xcc\xb8"), + ENTITY_DEF("varepsilon", 1013, "\xcf\xb5"), + ENTITY_DEF("DoubleRightTee", 8872, "\xe2\x8a\xa8"), + ENTITY_DEF_HEUR("not", 172, "\xc2\xac"), + ENTITY_DEF("lesdot", 10879, "\xe2\xa9\xbf"), + ENTITY_DEF("backepsilon", 1014, "\xcf\xb6"), + ENTITY_DEF("srarr", 8594, "\xe2\x86\x92"), + ENTITY_DEF("varsubsetneqq", 10955, "\xe2\xab\x8b\xef\xb8\x80"), + ENTITY_DEF("sqcap", 8851, "\xe2\x8a\x93"), + ENTITY_DEF("rightleftarrows", 8644, "\xe2\x87\x84"), + ENTITY_DEF("diams", 9830, "\xe2\x99\xa6"), + ENTITY_DEF("boxdR", 9554, "\xe2\x95\x92"), + ENTITY_DEF("ngeqslant", 10878, "\xe2\xa9\xbe\xcc\xb8"), + ENTITY_DEF("boxDR", 9556, "\xe2\x95\x94"), + ENTITY_DEF("sext", 10038, "\xe2\x9c\xb6"), + ENTITY_DEF("backsim", 8765, "\xe2\x88\xbd"), + ENTITY_DEF("nfr", 120107, "\xf0\x9d\x94\xab"), + ENTITY_DEF("CloseCurlyDoubleQuote", 8221, "\xe2\x80\x9d"), + ENTITY_DEF("npart", 8706, "\xe2\x88\x82\xcc\xb8"), + ENTITY_DEF("dharl", 8643, "\xe2\x87\x83"), + ENTITY_DEF("NewLine", 10, "\x0a"), + ENTITY_DEF("bigotimes", 10754, "\xe2\xa8\x82"), + ENTITY_DEF("lAtail", 10523, "\xe2\xa4\x9b"), + ENTITY_DEF_HEUR("frac14", 188, "\xc2\xbc"), + ENTITY_DEF("or", 8744, "\xe2\x88\xa8"), + ENTITY_DEF("subedot", 10947, "\xe2\xab\x83"), + ENTITY_DEF("nmid", 8740, "\xe2\x88\xa4"), + ENTITY_DEF("DownArrowUpArrow", 8693, "\xe2\x87\xb5"), + ENTITY_DEF("icy", 1080, "\xd0\xb8"), + ENTITY_DEF("num", 35, "\x23"), + ENTITY_DEF("Gdot", 288, "\xc4\xa0"), + ENTITY_DEF("urcrop", 8974, "\xe2\x8c\x8e"), + ENTITY_DEF("epsiv", 1013, "\xcf\xb5"), + ENTITY_DEF("topcir", 10993, "\xe2\xab\xb1"), + ENTITY_DEF("ne", 8800, "\xe2\x89\xa0"), + ENTITY_DEF("osol", 8856, "\xe2\x8a\x98"), + ENTITY_DEF_HEUR("amp", 38, "\x26"), + ENTITY_DEF("ncap", 10819, "\xe2\xa9\x83"), + ENTITY_DEF("Sscr", 119982, "\xf0\x9d\x92\xae"), + ENTITY_DEF("sung", 9834, "\xe2\x99\xaa"), + ENTITY_DEF("ltri", 9667, "\xe2\x97\x83"), + ENTITY_DEF("frac25", 8534, "\xe2\x85\x96"), + ENTITY_DEF("DZcy", 1039, "\xd0\x8f"), + ENTITY_DEF("RightUpVector", 8638, "\xe2\x86\xbe"), + ENTITY_DEF("rsquor", 8217, "\xe2\x80\x99"), + ENTITY_DEF("uplus", 8846, "\xe2\x8a\x8e"), + ENTITY_DEF("triangleright", 9657, "\xe2\x96\xb9"), + ENTITY_DEF("lAarr", 8666, "\xe2\x87\x9a"), + ENTITY_DEF("HilbertSpace", 8459, "\xe2\x84\x8b"), + ENTITY_DEF("there4", 8756, "\xe2\x88\xb4"), + ENTITY_DEF("vscr", 120011, "\xf0\x9d\x93\x8b"), + ENTITY_DEF("cirscir", 10690, "\xe2\xa7\x82"), + ENTITY_DEF("roarr", 8702, "\xe2\x87\xbe"), + ENTITY_DEF("hslash", 8463, "\xe2\x84\x8f"), + ENTITY_DEF("supdsub", 10968, "\xe2\xab\x98"), + ENTITY_DEF("simg", 10910, "\xe2\xaa\x9e"), + ENTITY_DEF("trade", 8482, "\xe2\x84\xa2"), + ENTITY_DEF("searrow", 8600, "\xe2\x86\x98"), + ENTITY_DEF("DownLeftVector", 8637, "\xe2\x86\xbd"), + ENTITY_DEF("FilledSmallSquare", 9724, "\xe2\x97\xbc"), + ENTITY_DEF("prod", 8719, "\xe2\x88\x8f"), + ENTITY_DEF("oror", 10838, "\xe2\xa9\x96"), + ENTITY_DEF("udarr", 8645, "\xe2\x87\x85"), + ENTITY_DEF("jsercy", 1112, "\xd1\x98"), + ENTITY_DEF("tprime", 8244, "\xe2\x80\xb4"), + ENTITY_DEF("bprime", 8245, "\xe2\x80\xb5"), + ENTITY_DEF("malt", 10016, "\xe2\x9c\xa0"), + ENTITY_DEF("bigcup", 8899, "\xe2\x8b\x83"), + ENTITY_DEF("oint", 8750, "\xe2\x88\xae"), + ENTITY_DEF("female", 9792, "\xe2\x99\x80"), + ENTITY_DEF("omacr", 333, "\xc5\x8d"), + ENTITY_DEF("SquareSubsetEqual", 8849, "\xe2\x8a\x91"), + ENTITY_DEF("SucceedsEqual", 10928, "\xe2\xaa\xb0"), + ENTITY_DEF("plusacir", 10787, "\xe2\xa8\xa3"), + ENTITY_DEF("Gcirc", 284, "\xc4\x9c"), + ENTITY_DEF("lesdotor", 10883, "\xe2\xaa\x83"), + ENTITY_DEF("escr", 8495, "\xe2\x84\xaf"), + ENTITY_DEF_HEUR("THORN", 222, "\xc3\x9e"), + ENTITY_DEF("UpArrowBar", 10514, "\xe2\xa4\x92"), + ENTITY_DEF("nvrtrie", 8885, "\xe2\x8a\xb5\xe2\x83\x92"), + ENTITY_DEF("varkappa", 1008, "\xcf\xb0"), + ENTITY_DEF("NotReverseElement", 8716, "\xe2\x88\x8c"), + ENTITY_DEF("zdot", 380, "\xc5\xbc"), + ENTITY_DEF("ExponentialE", 8519, "\xe2\x85\x87"), + ENTITY_DEF("lesseqgtr", 8922, "\xe2\x8b\x9a"), + ENTITY_DEF("cscr", 119992, "\xf0\x9d\x92\xb8"), + ENTITY_DEF("Dscr", 119967, "\xf0\x9d\x92\x9f"), + ENTITY_DEF("lthree", 8907, "\xe2\x8b\x8b"), + ENTITY_DEF("Ccedil", 199, "\xc3\x87"), + ENTITY_DEF("nge", 8817, "\xe2\x89\xb1"), + ENTITY_DEF("UpperLeftArrow", 8598, "\xe2\x86\x96"), + ENTITY_DEF("vDash", 8872, "\xe2\x8a\xa8"), + ENTITY_DEF("efDot", 8786, "\xe2\x89\x92"), + ENTITY_DEF("telrec", 8981, "\xe2\x8c\x95"), + ENTITY_DEF("vellip", 8942, "\xe2\x8b\xae"), + ENTITY_DEF("nrArr", 8655, "\xe2\x87\x8f"), + ENTITY_DEF_HEUR("ugrave", 249, "\xc3\xb9"), + ENTITY_DEF("uring", 367, "\xc5\xaf"), + ENTITY_DEF("Bernoullis", 8492, "\xe2\x84\xac"), + ENTITY_DEF("nles", 10877, "\xe2\xa9\xbd\xcc\xb8"), + ENTITY_DEF_HEUR("macr", 175, "\xc2\xaf"), + ENTITY_DEF("boxuR", 9560, "\xe2\x95\x98"), + ENTITY_DEF("clubsuit", 9827, "\xe2\x99\xa3"), + ENTITY_DEF("rightarrowtail", 8611, "\xe2\x86\xa3"), + ENTITY_DEF("epar", 8917, "\xe2\x8b\x95"), + ENTITY_DEF("ltcc", 10918, "\xe2\xaa\xa6"), + ENTITY_DEF("twoheadleftarrow", 8606, "\xe2\x86\x9e"), + ENTITY_DEF("aleph", 8501, "\xe2\x84\xb5"), + ENTITY_DEF("Colon", 8759, "\xe2\x88\xb7"), + ENTITY_DEF("vltri", 8882, "\xe2\x8a\xb2"), + ENTITY_DEF("quaternions", 8461, "\xe2\x84\x8d"), + ENTITY_DEF("rfr", 120111, "\xf0\x9d\x94\xaf"), + ENTITY_DEF_HEUR("Ouml", 214, "\xc3\x96"), + ENTITY_DEF("rsh", 8625, "\xe2\x86\xb1"), + ENTITY_DEF("emptyv", 8709, "\xe2\x88\x85"), + ENTITY_DEF("sqsup", 8848, "\xe2\x8a\x90"), + ENTITY_DEF("marker", 9646, "\xe2\x96\xae"), + ENTITY_DEF("Efr", 120072, "\xf0\x9d\x94\x88"), + ENTITY_DEF("DotEqual", 8784, "\xe2\x89\x90"), + ENTITY_DEF("eqsim", 8770, "\xe2\x89\x82"), + ENTITY_DEF("NotSucceedsEqual", 10928, "\xe2\xaa\xb0\xcc\xb8"), + ENTITY_DEF("primes", 8473, "\xe2\x84\x99"), + ENTITY_DEF_HEUR("times", 215, "\xc3\x97"), + ENTITY_DEF("rangd", 10642, "\xe2\xa6\x92"), + ENTITY_DEF("rightharpoonup", 8640, "\xe2\x87\x80"), + ENTITY_DEF("lrhard", 10605, "\xe2\xa5\xad"), + ENTITY_DEF("ape", 8778, "\xe2\x89\x8a"), + ENTITY_DEF("varsupsetneq", 8843, "\xe2\x8a\x8b\xef\xb8\x80"), + ENTITY_DEF("larrlp", 8619, "\xe2\x86\xab"), + ENTITY_DEF("NotPrecedesEqual", 10927, "\xe2\xaa\xaf\xcc\xb8"), + ENTITY_DEF("ulcorner", 8988, "\xe2\x8c\x9c"), + ENTITY_DEF("acd", 8767, "\xe2\x88\xbf"), + ENTITY_DEF("Hacek", 711, "\xcb\x87"), + ENTITY_DEF("xuplus", 10756, "\xe2\xa8\x84"), + ENTITY_DEF("therefore", 8756, "\xe2\x88\xb4"), + ENTITY_DEF("YIcy", 1031, "\xd0\x87"), + ENTITY_DEF("Tfr", 120087, "\xf0\x9d\x94\x97"), + ENTITY_DEF("Jcirc", 308, "\xc4\xb4"), + ENTITY_DEF("LessGreater", 8822, "\xe2\x89\xb6"), + ENTITY_DEF("Uring", 366, "\xc5\xae"), + ENTITY_DEF("Ugrave", 217, "\xc3\x99"), + ENTITY_DEF("rarr", 8594, "\xe2\x86\x92"), + ENTITY_DEF("wopf", 120168, "\xf0\x9d\x95\xa8"), + ENTITY_DEF("imath", 305, "\xc4\xb1"), + ENTITY_DEF("Yopf", 120144, "\xf0\x9d\x95\x90"), + ENTITY_DEF("colone", 8788, "\xe2\x89\x94"), + ENTITY_DEF("csube", 10961, "\xe2\xab\x91"), + ENTITY_DEF("odash", 8861, "\xe2\x8a\x9d"), + ENTITY_DEF("olarr", 8634, "\xe2\x86\xba"), + ENTITY_DEF("angrt", 8735, "\xe2\x88\x9f"), + ENTITY_DEF("NotLeftTriangleBar", 10703, "\xe2\xa7\x8f\xcc\xb8"), + ENTITY_DEF("GreaterEqual", 8805, "\xe2\x89\xa5"), + ENTITY_DEF("scnap", 10938, "\xe2\xaa\xba"), + ENTITY_DEF("pi", 960, "\xcf\x80"), + ENTITY_DEF("lesg", 8922, "\xe2\x8b\x9a\xef\xb8\x80"), + ENTITY_DEF("orderof", 8500, "\xe2\x84\xb4"), + ENTITY_DEF_HEUR("uacute", 250, "\xc3\xba"), + ENTITY_DEF("Barv", 10983, "\xe2\xab\xa7"), + ENTITY_DEF("Theta", 920, "\xce\x98"), + ENTITY_DEF("leftrightsquigarrow", 8621, "\xe2\x86\xad"), + ENTITY_DEF("Atilde", 195, "\xc3\x83"), + ENTITY_DEF("cupdot", 8845, "\xe2\x8a\x8d"), + ENTITY_DEF("ntriangleright", 8939, "\xe2\x8b\xab"), + ENTITY_DEF("measuredangle", 8737, "\xe2\x88\xa1"), + ENTITY_DEF("jscr", 119999, "\xf0\x9d\x92\xbf"), + ENTITY_DEF("inodot", 305, "\xc4\xb1"), + ENTITY_DEF("mopf", 120158, "\xf0\x9d\x95\x9e"), + ENTITY_DEF("hkswarow", 10534, "\xe2\xa4\xa6"), + ENTITY_DEF("lopar", 10629, "\xe2\xa6\x85"), + ENTITY_DEF("thksim", 8764, "\xe2\x88\xbc"), + ENTITY_DEF("bkarow", 10509, "\xe2\xa4\x8d"), + ENTITY_DEF("rarrfs", 10526, "\xe2\xa4\x9e"), + ENTITY_DEF("ntrianglelefteq", 8940, "\xe2\x8b\xac"), + ENTITY_DEF("Bscr", 8492, "\xe2\x84\xac"), + ENTITY_DEF("topf", 120165, "\xf0\x9d\x95\xa5"), + ENTITY_DEF("Uacute", 218, "\xc3\x9a"), + ENTITY_DEF("lap", 10885, "\xe2\xaa\x85"), + ENTITY_DEF("djcy", 1106, "\xd1\x92"), + ENTITY_DEF("bopf", 120147, "\xf0\x9d\x95\x93"), + ENTITY_DEF("empty", 8709, "\xe2\x88\x85"), + ENTITY_DEF("LeftAngleBracket", 10216, "\xe2\x9f\xa8"), + ENTITY_DEF("Imacr", 298, "\xc4\xaa"), + ENTITY_DEF("ltcir", 10873, "\xe2\xa9\xb9"), + ENTITY_DEF("trisb", 10701, "\xe2\xa7\x8d"), + ENTITY_DEF("gjcy", 1107, "\xd1\x93"), + ENTITY_DEF("pr", 8826, "\xe2\x89\xba"), + ENTITY_DEF("Mu", 924, "\xce\x9c"), + ENTITY_DEF("ogon", 731, "\xcb\x9b"), + ENTITY_DEF("pertenk", 8241, "\xe2\x80\xb1"), + ENTITY_DEF("plustwo", 10791, "\xe2\xa8\xa7"), + ENTITY_DEF("Vfr", 120089, "\xf0\x9d\x94\x99"), + ENTITY_DEF("ApplyFunction", 8289, "\xe2\x81\xa1"), + ENTITY_DEF("Sub", 8912, "\xe2\x8b\x90"), + ENTITY_DEF("DoubleLeftRightArrow", 8660, "\xe2\x87\x94"), + ENTITY_DEF("Lmidot", 319, "\xc4\xbf"), + ENTITY_DEF("nwarrow", 8598, "\xe2\x86\x96"), + ENTITY_DEF("angrtvbd", 10653, "\xe2\xa6\x9d"), + ENTITY_DEF("fcy", 1092, "\xd1\x84"), + ENTITY_DEF("ltlarr", 10614, "\xe2\xa5\xb6"), + ENTITY_DEF("CircleMinus", 8854, "\xe2\x8a\x96"), + ENTITY_DEF("angmsdab", 10665, "\xe2\xa6\xa9"), + ENTITY_DEF("wedgeq", 8793, "\xe2\x89\x99"), + ENTITY_DEF("iogon", 303, "\xc4\xaf"), + ENTITY_DEF_HEUR("laquo", 171, "\xc2\xab"), + ENTITY_DEF("NestedGreaterGreater", 8811, "\xe2\x89\xab"), + ENTITY_DEF("UnionPlus", 8846, "\xe2\x8a\x8e"), + ENTITY_DEF("CircleDot", 8857, "\xe2\x8a\x99"), + ENTITY_DEF("coloneq", 8788, "\xe2\x89\x94"), + ENTITY_DEF("csupe", 10962, "\xe2\xab\x92"), + ENTITY_DEF("tcaron", 357, "\xc5\xa5"), + ENTITY_DEF("GreaterTilde", 8819, "\xe2\x89\xb3"), + ENTITY_DEF("Map", 10501, "\xe2\xa4\x85"), + ENTITY_DEF("DoubleLongLeftArrow", 10232, "\xe2\x9f\xb8"), + ENTITY_DEF("Uparrow", 8657, "\xe2\x87\x91"), + ENTITY_DEF("scy", 1089, "\xd1\x81"), + ENTITY_DEF("llarr", 8647, "\xe2\x87\x87"), + ENTITY_DEF("rangle", 10217, "\xe2\x9f\xa9"), + ENTITY_DEF("sstarf", 8902, "\xe2\x8b\x86"), + ENTITY_DEF("InvisibleTimes", 8290, "\xe2\x81\xa2"), + ENTITY_DEF("egsdot", 10904, "\xe2\xaa\x98"), + ENTITY_DEF("target", 8982, "\xe2\x8c\x96"), + ENTITY_DEF("lesges", 10899, "\xe2\xaa\x93"), + ENTITY_DEF_HEUR("curren", 164, "\xc2\xa4"), + ENTITY_DEF("yopf", 120170, "\xf0\x9d\x95\xaa"), + ENTITY_DEF("frac23", 8532, "\xe2\x85\x94"), + ENTITY_DEF("NotSucceedsTilde", 8831, "\xe2\x89\xbf\xcc\xb8"), + ENTITY_DEF("napprox", 8777, "\xe2\x89\x89"), + ENTITY_DEF("odblac", 337, "\xc5\x91"), + ENTITY_DEF("gammad", 989, "\xcf\x9d"), + ENTITY_DEF("dscr", 119993, "\xf0\x9d\x92\xb9"), + ENTITY_DEF("SupersetEqual", 8839, "\xe2\x8a\x87"), + ENTITY_DEF("squf", 9642, "\xe2\x96\xaa"), + ENTITY_DEF("Because", 8757, "\xe2\x88\xb5"), + ENTITY_DEF("sccue", 8829, "\xe2\x89\xbd"), + ENTITY_DEF("KHcy", 1061, "\xd0\xa5"), + ENTITY_DEF("Wcirc", 372, "\xc5\xb4"), + ENTITY_DEF("uparrow", 8593, "\xe2\x86\x91"), + ENTITY_DEF("lessgtr", 8822, "\xe2\x89\xb6"), + ENTITY_DEF("thickapprox", 8776, "\xe2\x89\x88"), + ENTITY_DEF("lbrksld", 10639, "\xe2\xa6\x8f"), + ENTITY_DEF_HEUR("oslash", 248, "\xc3\xb8"), + ENTITY_DEF("NotCupCap", 8813, "\xe2\x89\xad"), + ENTITY_DEF("elinters", 9191, "\xe2\x8f\xa7"), + ENTITY_DEF("Assign", 8788, "\xe2\x89\x94"), + ENTITY_DEF("ClockwiseContourIntegral", 8754, "\xe2\x88\xb2"), + ENTITY_DEF("lfisht", 10620, "\xe2\xa5\xbc"), + ENTITY_DEF("DownArrow", 8595, "\xe2\x86\x93"), + ENTITY_DEF("Zdot", 379, "\xc5\xbb"), + ENTITY_DEF("xscr", 120013, "\xf0\x9d\x93\x8d"), + ENTITY_DEF("DiacriticalGrave", 96, "\x60"), + ENTITY_DEF("DoubleLongLeftRightArrow", 10234, "\xe2\x9f\xba"), + ENTITY_DEF("angle", 8736, "\xe2\x88\xa0"), + ENTITY_DEF("race", 8765, "\xe2\x88\xbd\xcc\xb1"), + ENTITY_DEF("Ascr", 119964, "\xf0\x9d\x92\x9c"), + ENTITY_DEF("Xscr", 119987, "\xf0\x9d\x92\xb3"), + ENTITY_DEF_HEUR("acirc", 226, "\xc3\xa2"), + ENTITY_DEF("otimesas", 10806, "\xe2\xa8\xb6"), + ENTITY_DEF("gscr", 8458, "\xe2\x84\x8a"), + ENTITY_DEF("gcy", 1075, "\xd0\xb3"), + ENTITY_DEF("angmsdag", 10670, "\xe2\xa6\xae"), + ENTITY_DEF("tshcy", 1115, "\xd1\x9b"), + ENTITY_DEF("Acy", 1040, "\xd0\x90"), + ENTITY_DEF("NotGreaterLess", 8825, "\xe2\x89\xb9"), + ENTITY_DEF("dtdot", 8945, "\xe2\x8b\xb1"), + ENTITY_DEF_HEUR("quot", 34, "\x22"), + ENTITY_DEF_HEUR("micro", 181, "\xc2\xb5"), + ENTITY_DEF("simplus", 10788, "\xe2\xa8\xa4"), + ENTITY_DEF("nsupseteq", 8841, "\xe2\x8a\x89"), + ENTITY_DEF("Ufr", 120088, "\xf0\x9d\x94\x98"), + ENTITY_DEF("Pr", 10939, "\xe2\xaa\xbb"), + ENTITY_DEF("napid", 8779, "\xe2\x89\x8b\xcc\xb8"), + ENTITY_DEF("rceil", 8969, "\xe2\x8c\x89"), + ENTITY_DEF("boxtimes", 8864, "\xe2\x8a\xa0"), + ENTITY_DEF("erarr", 10609, "\xe2\xa5\xb1"), + ENTITY_DEF("downdownarrows", 8650, "\xe2\x87\x8a"), + ENTITY_DEF("Kfr", 120078, "\xf0\x9d\x94\x8e"), + ENTITY_DEF("mho", 8487, "\xe2\x84\xa7"), + ENTITY_DEF("scpolint", 10771, "\xe2\xa8\x93"), + ENTITY_DEF("vArr", 8661, "\xe2\x87\x95"), + ENTITY_DEF("Ccaron", 268, "\xc4\x8c"), + ENTITY_DEF("NotRightTriangle", 8939, "\xe2\x8b\xab"), + ENTITY_DEF("topbot", 9014, "\xe2\x8c\xb6"), + ENTITY_DEF("qopf", 120162, "\xf0\x9d\x95\xa2"), + ENTITY_DEF("eogon", 281, "\xc4\x99"), + ENTITY_DEF("luruhar", 10598, "\xe2\xa5\xa6"), + ENTITY_DEF("gtdot", 8919, "\xe2\x8b\x97"), + ENTITY_DEF("Egrave", 200, "\xc3\x88"), + ENTITY_DEF("roplus", 10798, "\xe2\xa8\xae"), + ENTITY_DEF("Intersection", 8898, "\xe2\x8b\x82"), + ENTITY_DEF("Uarr", 8607, "\xe2\x86\x9f"), + ENTITY_DEF("dcy", 1076, "\xd0\xb4"), + ENTITY_DEF("boxvl", 9508, "\xe2\x94\xa4"), + ENTITY_DEF("RightArrowBar", 8677, "\xe2\x87\xa5"), + ENTITY_DEF_HEUR("yuml", 255, "\xc3\xbf"), + ENTITY_DEF("parallel", 8741, "\xe2\x88\xa5"), + ENTITY_DEF("succneqq", 10934, "\xe2\xaa\xb6"), + ENTITY_DEF("bemptyv", 10672, "\xe2\xa6\xb0"), + ENTITY_DEF("starf", 9733, "\xe2\x98\x85"), + ENTITY_DEF("OverBar", 8254, "\xe2\x80\xbe"), + ENTITY_DEF("Alpha", 913, "\xce\x91"), + ENTITY_DEF("LeftUpVectorBar", 10584, "\xe2\xa5\x98"), + ENTITY_DEF("ufr", 120114, "\xf0\x9d\x94\xb2"), + ENTITY_DEF("swarhk", 10534, "\xe2\xa4\xa6"), + ENTITY_DEF("GreaterEqualLess", 8923, "\xe2\x8b\x9b"), + ENTITY_DEF("sscr", 120008, "\xf0\x9d\x93\x88"), + ENTITY_DEF("Pi", 928, "\xce\xa0"), + ENTITY_DEF("boxh", 9472, "\xe2\x94\x80"), + ENTITY_DEF("frac16", 8537, "\xe2\x85\x99"), + ENTITY_DEF("lbrack", 91, "\x5b"), + ENTITY_DEF("vert", 124, "\x7c"), + ENTITY_DEF("precneqq", 10933, "\xe2\xaa\xb5"), + ENTITY_DEF("NotGreaterSlantEqual", 10878, "\xe2\xa9\xbe\xcc\xb8"), + ENTITY_DEF("Omega", 937, "\xce\xa9"), + ENTITY_DEF("uarr", 8593, "\xe2\x86\x91"), + ENTITY_DEF("boxVr", 9567, "\xe2\x95\x9f"), + ENTITY_DEF("ruluhar", 10600, "\xe2\xa5\xa8"), + ENTITY_DEF("ShortLeftArrow", 8592, "\xe2\x86\x90"), + ENTITY_DEF("Qfr", 120084, "\xf0\x9d\x94\x94"), + ENTITY_DEF("olt", 10688, "\xe2\xa7\x80"), + ENTITY_DEF("nequiv", 8802, "\xe2\x89\xa2"), + ENTITY_DEF("fscr", 119995, "\xf0\x9d\x92\xbb"), + ENTITY_DEF("rarrhk", 8618, "\xe2\x86\xaa"), + ENTITY_DEF("nsqsupe", 8931, "\xe2\x8b\xa3"), + ENTITY_DEF("nsubseteq", 8840, "\xe2\x8a\x88"), + ENTITY_DEF("numero", 8470, "\xe2\x84\x96"), + ENTITY_DEF("emsp14", 8197, "\xe2\x80\x85"), + ENTITY_DEF("gl", 8823, "\xe2\x89\xb7"), + ENTITY_DEF("ocirc", 244, "\xc3\xb4"), + ENTITY_DEF("weierp", 8472, "\xe2\x84\x98"), + ENTITY_DEF("boxvL", 9569, "\xe2\x95\xa1"), + ENTITY_DEF("RightArrowLeftArrow", 8644, "\xe2\x87\x84"), + ENTITY_DEF("Precedes", 8826, "\xe2\x89\xba"), + ENTITY_DEF("RightVector", 8640, "\xe2\x87\x80"), + ENTITY_DEF("xcup", 8899, "\xe2\x8b\x83"), + ENTITY_DEF("angmsdad", 10667, "\xe2\xa6\xab"), + ENTITY_DEF("gtrsim", 8819, "\xe2\x89\xb3"), + ENTITY_DEF("natural", 9838, "\xe2\x99\xae"), + ENTITY_DEF("nVdash", 8878, "\xe2\x8a\xae"), + ENTITY_DEF("RightTriangleEqual", 8885, "\xe2\x8a\xb5"), + ENTITY_DEF("dscy", 1109, "\xd1\x95"), + ENTITY_DEF("leftthreetimes", 8907, "\xe2\x8b\x8b"), + ENTITY_DEF("prsim", 8830, "\xe2\x89\xbe"), + ENTITY_DEF("Bcy", 1041, "\xd0\x91"), + ENTITY_DEF("Chi", 935, "\xce\xa7"), + ENTITY_DEF("timesb", 8864, "\xe2\x8a\xa0"), + ENTITY_DEF("Del", 8711, "\xe2\x88\x87"), + ENTITY_DEF("lmidot", 320, "\xc5\x80"), + ENTITY_DEF("RightDownVector", 8642, "\xe2\x87\x82"), + ENTITY_DEF("simdot", 10858, "\xe2\xa9\xaa"), + ENTITY_DEF("FilledVerySmallSquare", 9642, "\xe2\x96\xaa"), + ENTITY_DEF("NotLessSlantEqual", 10877, "\xe2\xa9\xbd\xcc\xb8"), + ENTITY_DEF("SucceedsTilde", 8831, "\xe2\x89\xbf"), + ENTITY_DEF("duarr", 8693, "\xe2\x87\xb5"), + ENTITY_DEF("apE", 10864, "\xe2\xa9\xb0"), + ENTITY_DEF("odot", 8857, "\xe2\x8a\x99"), + ENTITY_DEF("mldr", 8230, "\xe2\x80\xa6"), + ENTITY_DEF("Uarrocir", 10569, "\xe2\xa5\x89"), + ENTITY_DEF("nLl", 8920, "\xe2\x8b\x98\xcc\xb8"), + ENTITY_DEF("rarrpl", 10565, "\xe2\xa5\x85"), + ENTITY_DEF("cir", 9675, "\xe2\x97\x8b"), + ENTITY_DEF("blk14", 9617, "\xe2\x96\x91"), + ENTITY_DEF("VerticalLine", 124, "\x7c"), + ENTITY_DEF("jcy", 1081, "\xd0\xb9"), + ENTITY_DEF("filig", 64257, "\xef\xac\x81"), + ENTITY_DEF("LongRightArrow", 10230, "\xe2\x9f\xb6"), + ENTITY_DEF("beta", 946, "\xce\xb2"), + ENTITY_DEF("ccupssm", 10832, "\xe2\xa9\x90"), + ENTITY_DEF("supsub", 10964, "\xe2\xab\x94"), + ENTITY_DEF("spar", 8741, "\xe2\x88\xa5"), + ENTITY_DEF("Tstrok", 358, "\xc5\xa6"), + ENTITY_DEF("isinv", 8712, "\xe2\x88\x88"), + ENTITY_DEF("rightsquigarrow", 8605, "\xe2\x86\x9d"), + ENTITY_DEF("Diamond", 8900, "\xe2\x8b\x84"), + ENTITY_DEF("curlyeqsucc", 8927, "\xe2\x8b\x9f"), + ENTITY_DEF("ijlig", 307, "\xc4\xb3"), + ENTITY_DEF("puncsp", 8200, "\xe2\x80\x88"), + ENTITY_DEF("hamilt", 8459, "\xe2\x84\x8b"), + ENTITY_DEF("mapstoleft", 8612, "\xe2\x86\xa4"), + ENTITY_DEF("Copf", 8450, "\xe2\x84\x82"), + ENTITY_DEF("prnsim", 8936, "\xe2\x8b\xa8"), + ENTITY_DEF("DotDot", 8412, "\xe2\x83\x9c"), + ENTITY_DEF("lobrk", 10214, "\xe2\x9f\xa6"), + ENTITY_DEF("twoheadrightarrow", 8608, "\xe2\x86\xa0"), + ENTITY_DEF("ngE", 8807, "\xe2\x89\xa7\xcc\xb8"), + ENTITY_DEF("cylcty", 9005, "\xe2\x8c\xad"), + ENTITY_DEF("sube", 8838, "\xe2\x8a\x86"), + ENTITY_DEF("NotEqualTilde", 8770, "\xe2\x89\x82\xcc\xb8"), + ENTITY_DEF_HEUR("Yuml", 376, "\xc5\xb8"), + ENTITY_DEF("comp", 8705, "\xe2\x88\x81"), + ENTITY_DEF("dotminus", 8760, "\xe2\x88\xb8"), + ENTITY_DEF("crarr", 8629, "\xe2\x86\xb5"), + ENTITY_DEF("imped", 437, "\xc6\xb5"), + ENTITY_DEF("barwedge", 8965, "\xe2\x8c\x85"), + ENTITY_DEF("harrcir", 10568, "\xe2\xa5\x88")}; + +class html_entities_storage { + ankerl::unordered_dense::map<std::string_view, html_entity_def> entity_by_name; + ankerl::unordered_dense::map<std::string_view, html_entity_def> entity_by_name_heur; + ankerl::unordered_dense::map<unsigned, html_entity_def> entity_by_id; + +public: + html_entities_storage() + { + auto nelts = G_N_ELEMENTS(html_entities_array); + entity_by_name.reserve(nelts); + entity_by_id.reserve(nelts); + + for (const auto &e: html_entities_array) { + entity_by_name[e.name] = e; + entity_by_id[e.code] = e; + + if (e.allow_heuristic) { + entity_by_name_heur[e.name] = e; + } + } + } + + auto by_name(std::string_view name, bool use_heuristic = false) const -> const html_entity_def * + { + const decltype(entity_by_name) *htb; + + if (use_heuristic) { + htb = &entity_by_name_heur; + } + else { + htb = &entity_by_name; + } + auto it = htb->find(name); + + if (it != htb->end()) { + return &(it->second); + } + + return nullptr; + } + + auto by_id(int id) const -> const html_entity_def * + { + auto it = entity_by_id.find(id); + if (it != entity_by_id.end()) { + return &(it->second); + } + + return nullptr; + } +}; + +static const html_entities_storage html_entities_defs; + +std::size_t +decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces) +{ + /* + * t - tortoise (destination ptr) + * h - hare (source ptr) + * e - begin of entity + */ + char *t = s, *h = s, *e = s; + const gchar *end; + bool seen_hash = false, seen_hex = false; + enum { + do_undefined, + do_digits_only, + do_mixed, + } seen_digit_only; + enum class parser_state { + normal_content, + ampersand, + skip_multi_spaces, + skip_start_spaces, + } state = parser_state::normal_content; + + end = s + len; + + auto replace_named_entity = [&](const char *entity, std::size_t len) -> bool { + const auto *entity_def = html_entities_defs.by_name({entity, + (std::size_t)(h - entity)}, + false); + + auto replace_entity = [&]() -> void { + auto l = strlen(entity_def->replacement); + memcpy(t, entity_def->replacement, l); + t += l; + }; + + if (entity_def) { + replace_entity(); + return true; + } + else { + /* Try heuristic */ + auto heuristic_lookup_func = [&](std::size_t lookup_len) -> bool { + if (!entity_def && h - e > lookup_len) { + entity_def = html_entities_defs.by_name({entity, lookup_len}, true); + + if (entity_def) { + replace_entity(); + /* Adjust h back */ + h = e + lookup_len; + + return true; + } + + entity_def = nullptr; + } + + return false; + }; + + heuristic_lookup_func(5); + heuristic_lookup_func(4); + heuristic_lookup_func(3); + heuristic_lookup_func(2); + + /* Leave undecoded */ + if (!entity_def && (end - t > h - e)) { + memmove(t, e, h - e); + t += h - e; + } + else if (entity_def) { + return true; + } + } + + return false; + }; + + /* Strtoul works merely for 0 terminated strings, so leave it alone... */ + auto dec_to_int = [](const char *str, std::size_t len) -> std::optional<int> { + int n = 0; + + /* Avoid INT_MIN overflow by moving to negative numbers */ + while (len > 0 && g_ascii_isdigit(*str)) { + n = 10 * n - (*str++ - '0'); + len--; + } + + if (len == 0) { + return -(n); + } + else { + return std::nullopt; + } + }; + auto hex_to_int = [](const char *str, std::size_t len) -> std::optional<int> { + int n = 0; + + /* Avoid INT_MIN overflow by moving to negative numbers */ + while (len > 0 && g_ascii_isxdigit(*str)) { + if (*str <= 0x39) { + n = 16 * n - (*str++ - '0'); + } + else { + n = 16 * n - (((*str++) | ' ') - 'a' + 10); + } + len--; + } + + if (len == 0) { + return -(n); + } + else { + return std::nullopt; + } + }; + auto oct_to_int = [](const char *str, std::size_t len) -> std::optional<int> { + int n = 0; + + /* Avoid INT_MIN overflow by moving to negative numbers */ + while (len > 0 && g_ascii_isdigit(*str)) { + if (*str > '7') { + break; + } + else { + n = 8 * n - (*str++ - '0'); + } + len--; + } + + if (len == 0) { + return -(n); + } + else { + return std::nullopt; + } + }; + + auto replace_numeric_entity = [&](const char *entity) -> bool { + UChar32 uc; + std::optional<int> maybe_num; + + if (*entity == 'x' || *entity == 'X') { + maybe_num = hex_to_int(entity + 1, h - (entity + 1)); + } + else if (*entity == 'o' || *entity == 'O') { + maybe_num = oct_to_int(entity + 1, h - (entity + 1)); + } + else { + maybe_num = dec_to_int(entity, h - entity); + } + + if (!maybe_num) { + /* Skip undecoded */ + if (end - t >= h - e) { + memmove(t, e, h - e); + t += h - e; + } + + return false; + } + else { + uc = maybe_num.value(); + /* Search for a replacement */ + const auto *entity_def = html_entities_defs.by_id(uc); + + if (entity_def) { + auto rep_len = strlen(entity_def->replacement); + + if (end - t >= rep_len) { + memcpy(t, entity_def->replacement, + rep_len); + t += rep_len; + } + + return true; + } + else { + /* Unicode point */ + goffset off = t - s; + UBool is_error = 0; + + if (uc > 0) { + U8_APPEND((std::uint8_t *) s, off, len, uc, is_error); + + if (!is_error) { + t = s + off; + } + else if (end - t > 3) { + /* Not printable code point replace with 0xFFFD */ + *t++ = '\357'; + *t++ = '\277'; + *t++ = '\275'; + + return true; + } + } + else if (end - t > 3) { + /* Not printable code point replace with 0xFFFD */ + *t++ = '\357'; + *t++ = '\277'; + *t++ = '\275'; + } + } + + return true; + } + + return false; + }; + + auto replace_entity = [&]() -> bool { + if (e + 1 < end) { + const auto *entity_start = e + 1; + + if (*entity_start != '#') { + return replace_named_entity(entity_start, (h - entity_start)); + } + else if (entity_start + 1 < h) { + return replace_numeric_entity(entity_start + 1); + } + } + + return false; + }; + + if (norm_spaces && g_ascii_isspace(*h)) { + state = parser_state::skip_start_spaces; + } + + while (h - s < len && t <= h) { + switch (state) { + case parser_state::normal_content: + if (*h == '&') { + state = parser_state::ampersand; + seen_hash = false; + seen_hex = false; + seen_digit_only = do_undefined; + e = h; + h++; + continue; + } + else { + if (norm_spaces && g_ascii_isspace(*h)) { + *t++ = ' '; + state = parser_state::skip_multi_spaces; + h++; + } + else { + *t++ = *h++; + } + } + break; + case parser_state::ampersand: + if ((*h == ';' || g_ascii_isspace(*h)) && h > e) { + replace_entity(); + state = parser_state::normal_content; + + if (g_ascii_isspace(*h)) { + /* Avoid increase of h */ + continue; + } + } + else if (*h == '&') { + /* Previous `&` was bogus */ + state = parser_state::ampersand; + + if (end - t > h - e) { + memmove(t, e, h - e); + t += h - e; + } + + e = h; + } + else if (*h == '#') { + seen_hash = true; + + if (h + 1 < end && h[1] == 'x') { + seen_hex = true; + /* Skip one more character */ + h++; + } + } + else if (seen_digit_only != do_mixed && + (g_ascii_isdigit(*h) || (seen_hex && g_ascii_isxdigit(*h)))) { + seen_digit_only = do_digits_only; + } + else { + if (seen_digit_only == do_digits_only && seen_hash && h > e) { + /* We have seen some digits, so we can try to decode, eh */ + /* Fuck retarded email clients... */ + replace_entity(); + state = parser_state::normal_content; + continue; + } + + seen_digit_only = do_mixed; + } + + h++; + + break; + case parser_state::skip_multi_spaces: + if (g_ascii_isspace(*h)) { + h++; + } + else { + state = parser_state::normal_content; + } + break; + case parser_state::skip_start_spaces: + if (g_ascii_isspace(*h)) { + h++; + } + else { + state = parser_state::normal_content; + } + break; + } + } + + /* Leftover */ + if (state == parser_state::ampersand && h > e) { + /* Unfinished entity, copy as is */ + if (replace_entity()) { + /* To follow FSM semantics */ + h++; + } + else { + h = e; /* Include the last & */ + } + + /* Leftover after replacement */ + if (h < end && t + (end - h) <= end) { + memmove(t, h, end - h); + t += end - h; + } + } + + if (norm_spaces) { + bool seen_spaces = false; + + while (t > s && g_ascii_isspace(*(t - 1))) { + seen_spaces = true; + t--; + } + + if (seen_spaces) { + *t++ = ' '; + } + } + + return (t - s); +} + +auto decode_html_entitles_inplace(std::string &st) -> void +{ + auto nlen = decode_html_entitles_inplace(st.data(), st.size()); + st.resize(nlen); +} + +TEST_SUITE("html entities") +{ + + TEST_CASE("html entities decode") + { + std::vector<std::pair<std::string, std::string>> cases{ + {"", ""}, + {"abc", "abc"}, + {"abc def", "abc def"}, + {"abc def", "abc def"}, + {"abc\ndef", "abc def"}, + {"abc\n \tdef", "abc def"}, + {" abc def ", "abc def "}, + {"FOO>BAR", "FOO>BAR"}, + {"FOO>BAR", "FOO>BAR"}, + {"FOO> BAR", "FOO> BAR"}, + {"FOO>;;BAR", "FOO>;;BAR"}, + {"I'm ¬it;", "I'm Β¬it;"}, + {"I'm ∉", "I'm β"}, + {"FOO& BAR", "FOO& BAR"}, + {"FOO&&&>BAR", "FOO&&&>BAR"}, + {"FOO)BAR", "FOO)BAR"}, + {"FOOABAR", "FOOABAR"}, + {"FOOABAR", "FOOABAR"}, + {"FOO&#BAR", "FOO&#BAR"}, + {"FOO&#ZOO", "FOO&#ZOO"}, + {"FOOºR", "FOOΒΊR"}, + {"FOO䆺R", "FOOδΊR"}, + {"FOO�ZOO", "FOO\uFFFDZOO"}, + {"FOOZOO", "FOO\u0081ZOO"}, + {"FOO�ZOO", "FOO\uFFFDZOO"}, + {"FOO�ZOO", "FOO\uFFFDZOO"}, + {"ZZ£_id=23", "ZZΒ£_id=23"}, + {"ZZ&prod_id=23", "ZZ&prod_id=23"}, + {"ZZ>", "ZZ>"}, + {"ZZ&", "ZZ&"}, + {"ZZÆ=", "ZZΓ="}, + }; + + for (const auto &c: cases) { + SUBCASE(("decode entities: " + c.first).c_str()) + { + auto *cpy = new char[c.first.size()]; + memcpy(cpy, c.first.data(), c.first.size()); + auto nlen = decode_html_entitles_inplace(cpy, c.first.size(), true); + CHECK(std::string{cpy, nlen} == c.second); + delete[] cpy; + } + } + } +} + +}// namespace rspamd::html
\ No newline at end of file diff --git a/src/libserver/html/html_entities.hxx b/src/libserver/html/html_entities.hxx new file mode 100644 index 0000000..fc1f7cc --- /dev/null +++ b/src/libserver/html/html_entities.hxx @@ -0,0 +1,31 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_HTML_ENTITIES_H +#define RSPAMD_HTML_ENTITIES_H +#pragma once + +#include <utility> +#include <string> + +namespace rspamd::html { + +auto decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces = false) -> std::size_t; +auto decode_html_entitles_inplace(std::string &st) -> void; + +}// namespace rspamd::html + +#endif diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx new file mode 100644 index 0000000..309d761 --- /dev/null +++ b/src/libserver/html/html_tag.hxx @@ -0,0 +1,159 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_HTML_TAG_HXX +#define RSPAMD_HTML_TAG_HXX +#pragma once + +#include <utility> +#include <string_view> +#include <variant> +#include <vector> +#include <optional> +#include <cstdint> + +#include "html_tags.h" + +struct rspamd_url; +struct html_image; + +namespace rspamd::html { + +struct html_content; /* Forward declaration */ + +enum class html_component_type : std::uint8_t { + RSPAMD_HTML_COMPONENT_NAME = 0, + RSPAMD_HTML_COMPONENT_HREF, + RSPAMD_HTML_COMPONENT_COLOR, + RSPAMD_HTML_COMPONENT_BGCOLOR, + RSPAMD_HTML_COMPONENT_STYLE, + RSPAMD_HTML_COMPONENT_CLASS, + RSPAMD_HTML_COMPONENT_WIDTH, + RSPAMD_HTML_COMPONENT_HEIGHT, + RSPAMD_HTML_COMPONENT_SIZE, + RSPAMD_HTML_COMPONENT_REL, + RSPAMD_HTML_COMPONENT_ALT, + RSPAMD_HTML_COMPONENT_ID, + RSPAMD_HTML_COMPONENT_HIDDEN, +}; + +/* Public tags flags */ +/* XML tag */ +#define FL_XML (1u << CM_USER_SHIFT) +/* Fully closed tag (e.g. <a attrs />) */ +#define FL_CLOSED (1 << (CM_USER_SHIFT + 1)) +#define FL_BROKEN (1 << (CM_USER_SHIFT + 2)) +#define FL_IGNORE (1 << (CM_USER_SHIFT + 3)) +#define FL_BLOCK (1 << (CM_USER_SHIFT + 4)) +#define FL_HREF (1 << (CM_USER_SHIFT + 5)) +#define FL_COMMENT (1 << (CM_USER_SHIFT + 6)) +#define FL_VIRTUAL (1 << (CM_USER_SHIFT + 7)) + +/** + * Returns component type from a string + * @param st + * @return + */ +auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>; + +using html_tag_extra_t = std::variant<std::monostate, struct rspamd_url *, struct html_image *>; +struct html_tag_component { + html_component_type type; + std::string_view value; + + html_tag_component(html_component_type type, std::string_view value) + : type(type), value(value) + { + } +}; + +/* Pairing closing tag representation */ +struct html_closing_tag { + int start = -1; + int end = -1; + + auto clear() -> void + { + start = end = -1; + } +}; + +struct html_tag { + unsigned int tag_start = 0; + unsigned int content_offset = 0; + std::uint32_t flags = 0; + std::int32_t id = Tag_UNKNOWN; + html_closing_tag closing; + + std::vector<html_tag_component> components; + + html_tag_extra_t extra; + mutable struct html_block *block = nullptr; + std::vector<struct html_tag *> children; + struct html_tag *parent; + + auto find_component(html_component_type what) const -> std::optional<std::string_view> + { + for (const auto &comp: components) { + if (comp.type == what) { + return comp.value; + } + } + + return std::nullopt; + } + + auto find_component(std::optional<html_component_type> what) const -> std::optional<std::string_view> + { + if (what) { + return find_component(what.value()); + } + + return std::nullopt; + } + + auto clear(void) -> void + { + id = Tag_UNKNOWN; + tag_start = content_offset = 0; + extra = std::monostate{}; + components.clear(); + flags = 0; + block = nullptr; + children.clear(); + closing.clear(); + } + + constexpr auto get_content_length() const -> std::size_t + { + if (flags & (FL_IGNORE | CM_HEAD)) { + return 0; + } + if (closing.start > content_offset) { + return closing.start - content_offset; + } + + return 0; + } + + auto get_content(const struct html_content *hc) const -> std::string_view; +}; + +static_assert(CM_USER_SHIFT + 7 < sizeof(html_tag::flags) * NBBY); + +}// namespace rspamd::html + +#endif//RSPAMD_HTML_TAG_HXX diff --git a/src/libserver/html/html_tag_defs.hxx b/src/libserver/html/html_tag_defs.hxx new file mode 100644 index 0000000..647f7c3 --- /dev/null +++ b/src/libserver/html/html_tag_defs.hxx @@ -0,0 +1,194 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef RSPAMD_HTML_TAG_DEFS_HXX +#define RSPAMD_HTML_TAG_DEFS_HXX + +#include "config.h" +#include "html_tags.h" +#include "libutil/cxx/util.hxx" + +#include <string> +#include "contrib/ankerl/unordered_dense.h" + +namespace rspamd::html { + +struct html_tag_def { + std::string name; + tag_id_t id; + guint flags; +}; + +#define TAG_DEF(id, name, flags) \ + html_tag_def \ + { \ + (name), (id), (flags) \ + } + +static const auto html_tag_defs_array = rspamd::array_of( + /* W3C defined elements */ + TAG_DEF(Tag_A, "a", FL_HREF), + TAG_DEF(Tag_ABBR, "abbr", (CM_INLINE)), + TAG_DEF(Tag_ACRONYM, "acronym", (CM_INLINE)), + TAG_DEF(Tag_ADDRESS, "address", (CM_BLOCK)), + TAG_DEF(Tag_APPLET, "applet", (CM_IMG | CM_INLINE | CM_PARAM)), + TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY | FL_HREF)), + TAG_DEF(Tag_B, "b", (CM_INLINE | FL_BLOCK)), + TAG_DEF(Tag_BASE, "base", (CM_HEAD | CM_EMPTY)), + TAG_DEF(Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)), + TAG_DEF(Tag_BDO, "bdo", (CM_INLINE)), + TAG_DEF(Tag_BIG, "big", (CM_INLINE)), + TAG_DEF(Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)), + TAG_DEF(Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE | FL_BLOCK)), + TAG_DEF(Tag_BR, "br", (CM_INLINE | CM_EMPTY)), + TAG_DEF(Tag_BUTTON, "button", (CM_INLINE | FL_BLOCK)), + TAG_DEF(Tag_CAPTION, "caption", (CM_TABLE)), + TAG_DEF(Tag_CENTER, "center", (CM_BLOCK)), + TAG_DEF(Tag_CITE, "cite", (CM_INLINE)), + TAG_DEF(Tag_CODE, "code", (CM_INLINE)), + TAG_DEF(Tag_COL, "col", (CM_TABLE | CM_EMPTY)), + TAG_DEF(Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)), + TAG_DEF(Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)), + TAG_DEF(Tag_DEL, "del", (CM_INLINE | CM_BLOCK)), + TAG_DEF(Tag_DFN, "dfn", (CM_INLINE)), + TAG_DEF(Tag_DIR, "dir", (CM_BLOCK)), + TAG_DEF(Tag_DIV, "div", (CM_BLOCK | FL_BLOCK)), + TAG_DEF(Tag_DL, "dl", (CM_BLOCK | FL_BLOCK)), + TAG_DEF(Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)), + TAG_DEF(Tag_EM, "em", (CM_INLINE)), + TAG_DEF(Tag_FIELDSET, "fieldset", (CM_BLOCK)), + TAG_DEF(Tag_FONT, "font", (FL_BLOCK)), + TAG_DEF(Tag_FORM, "form", (CM_BLOCK | FL_HREF)), + TAG_DEF(Tag_FRAME, "frame", (CM_EMPTY | FL_HREF)), + TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML)), + TAG_DEF(Tag_H1, "h1", (CM_BLOCK)), + TAG_DEF(Tag_H2, "h2", (CM_BLOCK)), + TAG_DEF(Tag_H3, "h3", (CM_BLOCK)), + TAG_DEF(Tag_H4, "h4", (CM_BLOCK)), + TAG_DEF(Tag_H5, "h5", (CM_BLOCK)), + TAG_DEF(Tag_H6, "h6", (CM_BLOCK)), + TAG_DEF(Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)), + TAG_DEF(Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)), + TAG_DEF(Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)), + TAG_DEF(Tag_I, "i", (CM_INLINE)), + TAG_DEF(Tag_IFRAME, "iframe", (FL_HREF)), + TAG_DEF(Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)), + TAG_DEF(Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)), + TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK)), + TAG_DEF(Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)), + TAG_DEF(Tag_KBD, "kbd", (CM_INLINE)), + TAG_DEF(Tag_LABEL, "label", (CM_INLINE)), + TAG_DEF(Tag_LEGEND, "legend", (CM_INLINE)), + TAG_DEF(Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT | FL_BLOCK)), + TAG_DEF(Tag_LINK, "link", (CM_EMPTY | FL_HREF)), + TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK)), + TAG_DEF(Tag_MAP, "map", (CM_INLINE | FL_HREF)), + TAG_DEF(Tag_MENU, "menu", (CM_BLOCK)), + TAG_DEF(Tag_META, "meta", (CM_HEAD | CM_INLINE | CM_EMPTY)), + TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK)), + TAG_DEF(Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_RAW)), + TAG_DEF(Tag_OBJECT, "object", (CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)), + TAG_DEF(Tag_OL, "ol", (CM_BLOCK | FL_BLOCK)), + TAG_DEF(Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)), + TAG_DEF(Tag_OPTION, "option", (CM_FIELD | CM_OPT)), + TAG_DEF(Tag_P, "p", (CM_BLOCK | CM_OPT | FL_BLOCK)), + TAG_DEF(Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)), + TAG_DEF(Tag_PLAINTEXT, "plaintext", (CM_BLOCK)), + TAG_DEF(Tag_PRE, "pre", (CM_BLOCK)), + TAG_DEF(Tag_Q, "q", (CM_INLINE)), + TAG_DEF(Tag_RB, "rb", (CM_INLINE)), + TAG_DEF(Tag_RBC, "rbc", (CM_INLINE)), + TAG_DEF(Tag_RP, "rp", (CM_INLINE)), + TAG_DEF(Tag_RT, "rt", (CM_INLINE)), + TAG_DEF(Tag_RTC, "rtc", (CM_INLINE)), + TAG_DEF(Tag_RUBY, "ruby", (CM_INLINE)), + TAG_DEF(Tag_S, "s", (CM_INLINE)), + TAG_DEF(Tag_SAMP, "samp", (CM_INLINE)), + TAG_DEF(Tag_SCRIPT, "script", (CM_HEAD | CM_RAW)), + TAG_DEF(Tag_SELECT, "select", (CM_INLINE | CM_FIELD)), + TAG_DEF(Tag_SMALL, "small", (CM_INLINE)), + TAG_DEF(Tag_SPAN, "span", (CM_NO_INDENT | FL_BLOCK)), + TAG_DEF(Tag_STRIKE, "strike", (CM_INLINE)), + TAG_DEF(Tag_STRONG, "strong", (CM_INLINE)), + TAG_DEF(Tag_STYLE, "style", (CM_HEAD | CM_RAW)), + TAG_DEF(Tag_SUB, "sub", (CM_INLINE)), + TAG_DEF(Tag_SUP, "sup", (CM_INLINE)), + TAG_DEF(Tag_TABLE, "table", (CM_BLOCK | FL_BLOCK)), + TAG_DEF(Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT | FL_BLOCK)), + TAG_DEF(Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)), + TAG_DEF(Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)), + TAG_DEF(Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)), + TAG_DEF(Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)), + TAG_DEF(Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)), + TAG_DEF(Tag_TITLE, "title", (CM_HEAD | CM_UNIQUE)), + TAG_DEF(Tag_TR, "tr", (CM_TABLE | CM_OPT | FL_BLOCK)), + TAG_DEF(Tag_TT, "tt", (CM_INLINE)), + TAG_DEF(Tag_U, "u", (CM_INLINE)), + TAG_DEF(Tag_UL, "ul", (CM_BLOCK | FL_BLOCK)), + TAG_DEF(Tag_VAR, "var", (CM_INLINE)), + TAG_DEF(Tag_XMP, "xmp", (CM_BLOCK)), + TAG_DEF(Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY))); + +class html_tags_storage { + ankerl::unordered_dense::map<std::string_view, html_tag_def> tag_by_name; + ankerl::unordered_dense::map<tag_id_t, html_tag_def> tag_by_id; + +public: + html_tags_storage() + { + tag_by_name.reserve(html_tag_defs_array.size()); + tag_by_id.reserve(html_tag_defs_array.size()); + + for (const auto &t: html_tag_defs_array) { + tag_by_name[t.name] = t; + tag_by_id[t.id] = t; + } + } + + auto by_name(std::string_view name) const -> const html_tag_def * + { + auto it = tag_by_name.find(name); + + if (it != tag_by_name.end()) { + return &(it->second); + } + + return nullptr; + } + + auto by_id(int id) const -> const html_tag_def * + { + auto it = tag_by_id.find(static_cast<tag_id_t>(id)); + if (it != tag_by_id.end()) { + return &(it->second); + } + + return nullptr; + } + + auto name_by_id_safe(int id) const -> std::string_view + { + auto it = tag_by_id.find(static_cast<tag_id_t>(id)); + if (it != tag_by_id.end()) { + return it->second.name; + } + + return "unknown"; + } +}; + +}// namespace rspamd::html + +#endif//RSPAMD_HTML_TAG_DEFS_HXX diff --git a/src/libserver/html/html_tags.h b/src/libserver/html/html_tags.h new file mode 100644 index 0000000..c186314 --- /dev/null +++ b/src/libserver/html/html_tags.h @@ -0,0 +1,176 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBSERVER_HTML_TAGS_H_ +#define SRC_LIBSERVER_HTML_TAGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Known HTML tags */ +typedef enum { + Tag_UNKNOWN = 0, /**< Unknown tag! */ + Tag_A, /**< A */ + Tag_ABBR, /**< ABBR */ + Tag_ACRONYM, /**< ACRONYM */ + Tag_ADDRESS, /**< ADDRESS */ + Tag_APPLET, /**< APPLET */ + Tag_AREA, /**< AREA */ + Tag_B, /**< B */ + Tag_BASE, /**< BASE */ + Tag_BASEFONT, /**< BASEFONT */ + Tag_BDO, /**< BDO */ + Tag_BIG, /**< BIG */ + Tag_BLOCKQUOTE, /**< BLOCKQUOTE */ + Tag_BODY, /**< BODY */ + Tag_BR, /**< BR */ + Tag_BUTTON, /**< BUTTON */ + Tag_CAPTION, /**< CAPTION */ + Tag_CENTER, /**< CENTER */ + Tag_CITE, /**< CITE */ + Tag_CODE, /**< CODE */ + Tag_COL, /**< COL */ + Tag_COLGROUP, /**< COLGROUP */ + Tag_DD, /**< DD */ + Tag_DEL, /**< DEL */ + Tag_DFN, /**< DFN */ + Tag_DIR, /**< DIR */ + Tag_DIV, /**< DIF */ + Tag_DL, /**< DL */ + Tag_DT, /**< DT */ + Tag_EM, /**< EM */ + Tag_FIELDSET, /**< FIELDSET */ + Tag_FONT, /**< FONT */ + Tag_FORM, /**< FORM */ + Tag_FRAME, /**< FRAME */ + Tag_FRAMESET, /**< FRAMESET */ + Tag_H1, /**< H1 */ + Tag_H2, /**< H2 */ + Tag_H3, /**< H3 */ + Tag_H4, /**< H4 */ + Tag_H5, /**< H5 */ + Tag_H6, /**< H6 */ + Tag_HEAD, /**< HEAD */ + Tag_HR, /**< HR */ + Tag_HTML, /**< HTML */ + Tag_I, /**< I */ + Tag_IFRAME, /**< IFRAME */ + Tag_IMG, /**< IMG */ + Tag_INPUT, /**< INPUT */ + Tag_INS, /**< INS */ + Tag_ISINDEX, /**< ISINDEX */ + Tag_KBD, /**< KBD */ + Tag_KEYGEN, /**< KEYGEN */ + Tag_LABEL, /**< LABEL */ + Tag_LEGEND, /**< LEGEND */ + Tag_LI, /**< LI */ + Tag_LINK, /**< LINK */ + Tag_LISTING, /**< LISTING */ + Tag_MAP, /**< MAP */ + Tag_MENU, /**< MENU */ + Tag_META, /**< META */ + Tag_NOFRAMES, /**< NOFRAMES */ + Tag_NOSCRIPT, /**< NOSCRIPT */ + Tag_OBJECT, /**< OBJECT */ + Tag_OL, /**< OL */ + Tag_OPTGROUP, /**< OPTGROUP */ + Tag_OPTION, /**< OPTION */ + Tag_P, /**< P */ + Tag_PARAM, /**< PARAM */ + Tag_PLAINTEXT, /**< PLAINTEXT */ + Tag_PRE, /**< PRE */ + Tag_Q, /**< Q */ + Tag_RB, /**< RB */ + Tag_RBC, /**< RBC */ + Tag_RP, /**< RP */ + Tag_RT, /**< RT */ + Tag_RTC, /**< RTC */ + Tag_RUBY, /**< RUBY */ + Tag_S, /**< S */ + Tag_SAMP, /**< SAMP */ + Tag_SCRIPT, /**< SCRIPT */ + Tag_SELECT, /**< SELECT */ + Tag_SMALL, /**< SMALL */ + Tag_SPAN, /**< SPAN */ + Tag_STRIKE, /**< STRIKE */ + Tag_STRONG, /**< STRONG */ + Tag_STYLE, /**< STYLE */ + Tag_SUB, /**< SUB */ + Tag_SUP, /**< SUP */ + Tag_TABLE, /**< TABLE */ + Tag_TBODY, /**< TBODY */ + Tag_TD, /**< TD */ + Tag_TEXTAREA, /**< TEXTAREA */ + Tag_TFOOT, /**< TFOOT */ + Tag_TH, /**< TH */ + Tag_THEAD, /**< THEAD */ + Tag_TITLE, /**< TITLE */ + Tag_TR, /**< TR */ + Tag_TT, /**< TT */ + Tag_U, /**< U */ + Tag_UL, /**< UL */ + Tag_VAR, /**< VAR */ + Tag_XMP, /**< XMP */ + Tag_NEXTID, /**< NEXTID */ + Tag_MAX, + + N_TAGS = -1 /**< Must be -1 */ +} tag_id_t; + +#define CM_UNKNOWN 0 +/* Elements with no content. Map to HTML specification. */ +#define CM_EMPTY (1 << 0) +/* Elements that appear outside of "BODY". */ +#define CM_HTML (1 << 1) +/* Elements that can appear within HEAD. */ +#define CM_HEAD (1 << 2) +/* HTML "block" elements. */ +#define CM_BLOCK (1 << 3) +/* HTML "inline" elements. */ +#define CM_INLINE (1 << 4) +/* Elements that mark list item ("LI"). */ +#define CM_LIST (1 << 5) +/* Elements that mark definition list item ("DL", "DT"). */ +#define CM_DEFLIST (1 << 6) +/* Elements that can appear inside TABLE. */ +#define CM_TABLE (1 << 7) +/* Used for "THEAD", "TFOOT" or "TBODY". */ +#define CM_ROWGRP (1 << 8) +/* Used for "TD", "TH" */ +#define CM_ROW (1 << 9) +/* Elements whose content must be protected against white space movement. + Includes some elements that can found in forms. */ +#define CM_FIELD (1 << 10) +#define CM_RAW (1 << 11) +/* Elements that allows "PARAM". */ +#define CM_PARAM (1 << 12) +/* Elements with an optional end tag. */ +#define CM_OPT (1 << 13) +/* Elements that use "align" attribute for vertical position. */ +#define CM_IMG (1 << 14) +#define CM_NO_INDENT (1 << 15) +/* Elements that cannot be omitted. */ +#define CM_OMITST (1 << 16) +/* Unique elements */ +#define CM_UNIQUE (1 << 17) + +#define CM_USER_SHIFT (18) + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBSERVER_HTML_TAGS_H_ */ diff --git a/src/libserver/html/html_tests.cxx b/src/libserver/html/html_tests.cxx new file mode 100644 index 0000000..2fe6702 --- /dev/null +++ b/src/libserver/html/html_tests.cxx @@ -0,0 +1,304 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "html.hxx" +#include "libserver/task.h" + +#include <vector> +#include <fmt/core.h> + + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" + +namespace rspamd::html { + +/* + * Tests part + */ + +TEST_SUITE("html") +{ + TEST_CASE("html parsing") + { + + const std::vector<std::pair<std::string, std::string>> cases{ + {"<html><!DOCTYPE html><body>", "+html;++xml;++body;"}, + {"<html><div><div></div></div></html>", "+html;++div;+++div;"}, + {"<html><div><div></div></html>", "+html;++div;+++div;"}, + {"<html><div><div></div></html></div>", "+html;++div;+++div;"}, + {"<p><p><a></p></a></a>", "+p;++p;+++a;"}, + {"<div><a href=\"http://example.com\"></div></a>", "+div;++a;"}, + /* Broken, as I don't know how the hell this should be really parsed */ + //{"<html><!DOCTYPE html><body><head><body></body></html></body></html>", + // "+html;++xml;++body;+++head;+++body;"} + }; + + rspamd_url_init(NULL); + auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + "html", 0); + struct rspamd_task fake_task; + memset(&fake_task, 0, sizeof(fake_task)); + fake_task.task_pool = pool; + + for (const auto &c: cases) { + SUBCASE((std::string("extract tags from: ") + c.first).c_str()) + { + GByteArray *tmp = g_byte_array_sized_new(c.first.size()); + g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size()); + auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr); + CHECK(hc != nullptr); + auto dump = html_debug_structure(*hc); + CHECK(c.second == dump); + g_byte_array_free(tmp, TRUE); + } + } + + rspamd_mempool_delete(pool); + } + + TEST_CASE("html text extraction") + { + using namespace std::string_literals; + const std::vector<std::pair<std::string, std::string>> cases{ + {"test", "test"}, + {"test\0"s, "test\uFFFD"s}, + {"test\0test"s, "test\uFFFDtest"s}, + {"test\0\0test"s, "test\uFFFD\uFFFDtest"s}, + {"test ", "test"}, + {"test foo, bar", "test foo, bar"}, + {"<p>text</p>", "text\n"}, + {"olo<p>text</p>lolo", "olo\ntext\nlolo"}, + {"<div>foo</div><div>bar</div>", "foo\nbar\n"}, + {"<b>foo<i>bar</b>baz</i>", "foobarbaz"}, + {"<b>foo<i>bar</i>baz</b>", "foobarbaz"}, + {"foo<br>baz", "foo\nbaz"}, + {"<a href=https://example.com>test</a>", "test"}, + {"<img alt=test>", "test"}, + {" <body>\n" + " <!-- escape content -->\n" + " a b a > b a < b a & b 'a "a"\n" + " </body>", + R"|(a b a > b a < b a & b 'a "a")|"}, + /* XML tags */ + {"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n" + " <!DOCTYPE html\n" + " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n" + " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n" + "<body>test</body>", + "test"}, + {"<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"></head>" + " <body>\n" + " <p><br>\n" + " </p>\n" + " <div class=\"moz-forward-container\"><br>\n" + " <br>\n" + " test</div>" + "</body>", + "\n\n\ntest\n"}, + {"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>" + "sh<span style=\"FONT-SIZE: 0px\">aring </span></div>", + "fish\n"}, + /* FIXME: broken until rework of css parser */ + //{"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>" + // "sh<span style=\"FONT-SIZE: 0px\">aring </div>foo</span>", "fish\nfoo"}, + /* Complex html with bad tags */ + {"<!DOCTYPE html>\n" + "<html lang=\"en\">\n" + " <head>\n" + " <meta charset=\"utf-8\">\n" + " <title>title</title>\n" + " <link rel=\"stylesheet\" href=\"style.css\">\n" + " <script src=\"script.js\"></script>\n" + " </head>\n" + " <body>\n" + " <!-- page content -->\n" + " Hello, world! <b>test</b>\n" + " <p>data<>\n" + " </P>\n" + " <b>stuff</p>?\n" + " </body>\n" + "</html>", + "Hello, world! test \ndata<>\nstuff\n?"}, + {"<p><!--comment-->test</br></hr><br>", "test\n"}, + /* Tables */ + {"<table>\n" + " <tr>\n" + " <th>heada</th>\n" + " <th>headb</th>\n" + " </tr>\n" + " <tr>\n" + " <td>data1</td>\n" + " <td>data2</td>\n" + " </tr>\n" + " </table>", + "heada headb\ndata1 data2\n"}, + /* Invalid closing br and hr + comment */ + {" <body>\n" + " <!-- page content -->\n" + " Hello, world!<br>test</br><br>content</hr>more content<br>\n" + " <div>\n" + " content inside div\n" + " </div>\n" + " </body>", + "Hello, world!\ntest\ncontentmore content\ncontent inside div\n"}, + /* First closing tag */ + {"</head>\n" + "<body>\n" + "<p> Hello. I have some bad news.\n" + "<br /> <br /> <br /> <strong> <br /> <br /> <br /> <br /> <br /> <br /> <br /> <br /> </strong><span> <br /> </span>test</p>\n" + "</body>\n" + "</html>", + "Hello. I have some bad news. \n\n\n\n\n\n\n\n\n\n\n\ntest\n"}, + /* Invalid tags */ + {"lol <sht> omg </sht> oh my!\n" + "<name>words words</name> goodbye", + "lol omg oh my! words words goodbye"}, + /* Invisible stuff */ + {"<div style=\"color:#555555;font-family:Arial, 'Helvetica Neue', Helvetica, sans-serif;line-height:1.2;padding-top:10px;padding-right:10px;padding-bottom:10px;padding-left:10px;font-style: italic;\">\n" + "<p style=\"font-size: 11px; line-height: 1.2; color: #555555; font-family: Arial, 'Helvetica Neue', Helvetica, sans-serif; mso-line-height-alt: 14px; margin: 0;\">\n" + "<span style=\"color:#FFFFFF; \">F</span>Sincerely,</p>\n" + "<p style=\"font-size: 11px; line-height: 1.2; color: #555555; font-family: Arial, 'Helvetica Neue', Helvetica, sans-serif; mso-line-height-alt: 14px; margin: 0;\">\n" + "<span style=\"color:#FFFFFF; \">8</span>Sky<span style=\"opacity:1;\"></span>pe<span style=\"color:#FFFFFF; \">F</span>Web<span style=\"color:#FFFFFF; \">F</span></p>\n" + "<span style=\"color:#FFFFFF; \">kreyes</span>\n" + "<p style=\"font-size: 11px; line-height: 1.2; color: #555555; font-family: Arial, 'Helvetica Neue', Helvetica, sans-serif; mso-line-height-alt: 14px; margin: 0;\">\n" + " </p>", + " Sincerely,\n Skype Web\n"}, + {"lala<p hidden>fafa</p>", "lala"}, + {"<table style=\"FONT-SIZE: 0px;\"><tbody><tr><td>\n" + "DONKEY\n" + "</td></tr></tbody></table>", + ""}, + /* bgcolor propagation */ + {"<a style=\"display: inline-block; color: #ffffff; background-color: #00aff0;\">\n" + "<span style=\"color: #00aff0;\">F</span>Rev<span style=\"opacity: 1;\"></span></span>ie<span style=\"opacity: 1;\"></span>" + "</span>w<span style=\"color: #00aff0;\">F<span style=\"opacity: 1;\">ΜΉ</span></span>", + " Review"}, + {"<td style=\"color:#ffffff\" bgcolor=\"#005595\">\n" + "hello world\n" + "</td>", + "hello world"}, + /* Colors */ + {"goodbye <span style=\"COLOR: rgb(64,64,64)\">cruel</span>" + "<span>world</span>", + "goodbye cruelworld"}, + /* Font-size propagation */ + {"<p style=\"font-size: 11pt;line-height:22px\">goodbye <span style=\"font-size:0px\">cruel</span>world</p>", + "goodbye world\n"}, + /* Newline before tag -> must be space */ + {"goodbye <span style=\"COLOR: rgb(64,64,64)\">cruel</span>\n" + "<span>world</span>", + "goodbye cruel world"}, + /* Head tag with some stuff */ + {"<html><head><p>oh my god</head><body></body></html>", "oh my god\n"}, + {"<html><head><title>oh my god</head><body></body></html>", ""}, + {"<html><body><html><head>displayed</body></html></body></html>", "displayed"}, + + }; + + rspamd_url_init(NULL); + auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + "html", 0); + struct rspamd_task fake_task; + memset(&fake_task, 0, sizeof(fake_task)); + fake_task.task_pool = pool; + + auto replace_newlines = [](std::string &str) { + auto start_pos = 0; + while ((start_pos = str.find("\n", start_pos, 1)) != std::string::npos) { + str.replace(start_pos, 1, "\\n", 2); + start_pos += 2; + } + }; + + auto i = 1; + for (const auto &c: cases) { + SUBCASE((fmt::format("html extraction case {}", i)).c_str()) + { + GByteArray *tmp = g_byte_array_sized_new(c.first.size()); + g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size()); + auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr); + CHECK(hc != nullptr); + replace_newlines(hc->parsed); + auto expected = c.second; + replace_newlines(expected); + CHECK(hc->parsed == expected); + g_byte_array_free(tmp, TRUE); + } + i++; + } + + rspamd_mempool_delete(pool); + } + + TEST_CASE("html urls extraction") + { + using namespace std::string_literals; + const std::vector<std::tuple<std::string, std::vector<std::string>, std::optional<std::string>>> cases{ + {"<style></style><a href=\"https://www.example.com\">yolo</a>", + {"https://www.example.com"}, + "yolo"}, + {"<a href=\"https://example.com\">test</a>", {"https://example.com"}, "test"}, + {"<a <poo href=\"http://example.com\">hello</a>", {"http://example.com"}, "hello"}, + {"<html>\n" + "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=utf-8\">\n" + "<body>\n" + "<a href=\"https://www.example.com\">hello</a>\n" + "</body>\n" + "</html>", + {"https://www.example.com"}, + "hello"}, + }; + + rspamd_url_init(NULL); + auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + "html", 0); + struct rspamd_task fake_task; + memset(&fake_task, 0, sizeof(fake_task)); + fake_task.task_pool = pool; + + auto i = 1; + for (const auto &c: cases) { + SUBCASE((fmt::format("html url extraction case {}", i)).c_str()) + { + GPtrArray *purls = g_ptr_array_new(); + auto input = std::get<0>(c); + GByteArray *tmp = g_byte_array_sized_new(input.size()); + g_byte_array_append(tmp, (const guint8 *) input.data(), input.size()); + auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, purls, true, nullptr); + CHECK(hc != nullptr); + auto &expected_text = std::get<2>(c); + if (expected_text.has_value()) { + CHECK(hc->parsed == expected_text.value()); + } + const auto &expected_urls = std::get<1>(c); + CHECK(expected_urls.size() == purls->len); + for (auto j = 0; j < expected_urls.size(); ++j) { + auto *url = (rspamd_url *) g_ptr_array_index(purls, j); + CHECK(expected_urls[j] == std::string{url->string, url->urllen}); + } + g_byte_array_free(tmp, TRUE); + g_ptr_array_free(purls, TRUE); + } + ++i; + } + + rspamd_mempool_delete(pool); + } +} + +} /* namespace rspamd::html */ diff --git a/src/libserver/html/html_url.cxx b/src/libserver/html/html_url.cxx new file mode 100644 index 0000000..8f29f2c --- /dev/null +++ b/src/libserver/html/html_url.cxx @@ -0,0 +1,496 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "html_url.hxx" +#include "libutil/str_util.h" +#include "libserver/url.h" +#include "libserver/logger.h" +#include "rspamd.h" + +#include <unicode/idna.h> + +namespace rspamd::html { + +static auto +rspamd_url_is_subdomain(std::string_view t1, std::string_view t2) -> bool +{ + const auto *p1 = t1.data() + t1.size() - 1; + const auto *p2 = t2.data() + t2.size() - 1; + + /* Skip trailing dots */ + while (p1 > t1.data()) { + if (*p1 != '.') { + break; + } + + p1--; + } + + while (p2 > t2.data()) { + if (*p2 != '.') { + break; + } + + p2--; + } + + while (p1 > t1.data() && p2 > t2.data()) { + if (*p1 != *p2) { + break; + } + + p1--; + p2--; + } + + if (p2 == t2.data()) { + /* p2 can be subdomain of p1 if *p1 is '.' */ + if (p1 != t1.data() && *(p1 - 1) == '.') { + return true; + } + } + else if (p1 == t1.data()) { + if (p2 != t2.data() && *(p2 - 1) == '.') { + return true; + } + } + + return false; +} + + +static auto +get_icu_idna_instance(void) -> auto +{ + auto uc_err = U_ZERO_ERROR; + static auto *udn = icu::IDNA::createUTS46Instance(UIDNA_DEFAULT, uc_err); + + return udn; +} + +static auto +convert_idna_hostname_maybe(rspamd_mempool_t *pool, struct rspamd_url *url, bool use_tld) + -> std::string_view +{ + std::string_view ret = use_tld ? std::string_view{rspamd_url_tld_unsafe(url), url->tldlen} : std::string_view{rspamd_url_host_unsafe(url), url->hostlen}; + + /* Handle IDN url's */ + if (ret.size() > 4 && + rspamd_substring_search_caseless(ret.data(), ret.size(), "xn--", 4) != -1) { + + const auto buf_capacity = ret.size() * 2 + 1; + auto *idn_hbuf = (char *) rspamd_mempool_alloc(pool, buf_capacity); + icu::CheckedArrayByteSink byte_sink{idn_hbuf, (int) buf_capacity}; + + /* We need to convert it to the normal value first */ + icu::IDNAInfo info; + auto uc_err = U_ZERO_ERROR; + auto *udn = get_icu_idna_instance(); + udn->nameToUnicodeUTF8(icu::StringPiece(ret.data(), ret.size()), + byte_sink, info, uc_err); + + if (uc_err == U_ZERO_ERROR && !info.hasErrors()) { + /* idn_hbuf is allocated in mempool, so it is safe to use */ + ret = std::string_view{idn_hbuf, (std::size_t) byte_sink.NumberOfBytesWritten()}; + } + else { + msg_err_pool("cannot convert to IDN: %s (0x%xd)", + u_errorName(uc_err), info.getErrors()); + } + } + + return ret; +}; + +constexpr auto sv_equals(std::string_view s1, std::string_view s2) -> auto +{ + return (s1.size() == s2.size()) && + std::equal(s1.begin(), s1.end(), s2.begin(), s2.end(), + [](const auto c1, const auto c2) { + return g_ascii_tolower(c1) == g_ascii_tolower(c2); + }); +} + +constexpr auto +is_transfer_proto(struct rspamd_url *u) -> bool +{ + return (u->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_FTP)) != 0; +} + +auto html_url_is_phished(rspamd_mempool_t *pool, + struct rspamd_url *href_url, + std::string_view text_data) -> std::optional<rspamd_url *> +{ + struct rspamd_url *text_url; + std::string_view disp_tok, href_tok; + goffset url_pos; + gchar *url_str = NULL; + + auto sz = text_data.size(); + const auto *trimmed = rspamd_string_unicode_trim_inplace(text_data.data(), &sz); + text_data = std::string_view(trimmed, sz); + + if (text_data.size() > 4 && + rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str, + RSPAMD_URL_FIND_ALL, + &url_pos, NULL) && + url_str != nullptr) { + + if (url_pos > 0) { + /* + * We have some url at some offset, so we need to check what is + * at the start of the text + */ + return std::nullopt; + } + + text_url = rspamd_mempool_alloc0_type(pool, struct rspamd_url); + auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool, + RSPAMD_URL_PARSE_TEXT); + + if (rc == URI_ERRNO_OK) { + text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED; + href_url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL; + + /* Check for phishing */ + if (is_transfer_proto(text_url) == is_transfer_proto(href_url)) { + disp_tok = convert_idna_hostname_maybe(pool, text_url, false); + href_tok = convert_idna_hostname_maybe(pool, href_url, false); + + if (!sv_equals(disp_tok, href_tok) && + text_url->tldlen > 0 && href_url->tldlen > 0) { + + /* Apply the same logic for TLD */ + disp_tok = convert_idna_hostname_maybe(pool, text_url, true); + href_tok = convert_idna_hostname_maybe(pool, href_url, true); + + if (!sv_equals(disp_tok, href_tok)) { + /* Check if one url is a subdomain for another */ + + if (!rspamd_url_is_subdomain(disp_tok, href_tok)) { + href_url->flags |= RSPAMD_URL_FLAG_PHISHED; + text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED; + + if (href_url->ext == nullptr) { + href_url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext); + } + href_url->ext->linked_url = text_url; + } + } + } + } + + return text_url; + } + else { + /* + * We have found something that looks like an url but it was + * not parsed correctly. + * Sometimes it means an obfuscation attempt, so we have to check + * what's inside of the text + */ + gboolean obfuscation_found = FALSE; + + if (text_data.size() > 4 && g_ascii_strncasecmp(text_data.begin(), "http", 4) == 0 && + rspamd_substring_search(text_data.begin(), text_data.size(), "://", 3) != -1) { + /* Clearly an obfuscation attempt */ + obfuscation_found = TRUE; + } + + msg_info_pool("extract of url '%s' failed: %s; obfuscation detected: %s", + url_str, + rspamd_url_strerror(rc), + obfuscation_found ? "yes" : "no"); + + if (obfuscation_found) { + href_url->flags |= RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED; + } + } + } + + return std::nullopt; +} + +void html_check_displayed_url(rspamd_mempool_t *pool, + GList **exceptions, + void *url_set, + std::string_view visible_part, + goffset href_offset, + struct rspamd_url *url) +{ + struct rspamd_url *displayed_url = nullptr; + struct rspamd_url *turl; + struct rspamd_process_exception *ex; + guint saved_flags = 0; + gsize dlen; + + if (visible_part.empty()) { + /* No displayed url, just some text within <a> tag */ + return; + } + + if (url->ext == nullptr) { + url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext); + } + url->ext->visible_part = rspamd_mempool_alloc_buffer(pool, visible_part.size() + 1); + rspamd_strlcpy(url->ext->visible_part, + visible_part.data(), + visible_part.size() + 1); + dlen = visible_part.size(); + + /* Strip unicode spaces from the start and the end */ + url->ext->visible_part = const_cast<char *>( + rspamd_string_unicode_trim_inplace(url->ext->visible_part, + &dlen)); + auto maybe_url = html_url_is_phished(pool, url, + {url->ext->visible_part, dlen}); + + if (maybe_url) { + url->flags |= saved_flags; + displayed_url = maybe_url.value(); + } + + if (exceptions && displayed_url != nullptr) { + ex = rspamd_mempool_alloc_type(pool, struct rspamd_process_exception); + ex->pos = href_offset; + ex->len = dlen; + ex->type = RSPAMD_EXCEPTION_URL; + ex->ptr = url; + + *exceptions = g_list_prepend(*exceptions, ex); + } + + if (displayed_url && url_set) { + turl = rspamd_url_set_add_or_return((khash_t(rspamd_url_hash) *) url_set, displayed_url); + + if (turl != nullptr) { + /* Here, we assume the following: + * if we have a URL in the text part which + * is the same as displayed URL in the + * HTML part, we assume that it is also + * hint only. + */ + if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) { + + /* + * We have the same URL for href and displayed url, so we + * know that this url cannot be both target and display (as + * it breaks logic in many places), so we do not + * propagate html flags + */ + if (!(turl->flags & RSPAMD_URL_FLAG_DISPLAY_URL)) { + turl->flags |= displayed_url->flags; + } + turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT; + } + + turl->count++; + } + else { + /* Already inserted by `rspamd_url_set_add_or_return` */ + } + } + + rspamd_normalise_unicode_inplace(url->ext->visible_part, &dlen); +} + +auto html_process_url(rspamd_mempool_t *pool, std::string_view &input) + -> std::optional<struct rspamd_url *> +{ + struct rspamd_url *url; + guint saved_flags = 0; + gint rc; + const gchar *s, *prefix = "http://"; + gchar *d; + gsize dlen; + gboolean has_bad_chars = FALSE, no_prefix = FALSE; + static const gchar hexdigests[] = "0123456789abcdef"; + + auto sz = input.length(); + const auto *trimmed = rspamd_string_unicode_trim_inplace(input.data(), &sz); + input = {trimmed, sz}; + + const auto *start = input.data(); + s = start; + dlen = 0; + + for (auto i = 0; i < sz; i++) { + if (G_UNLIKELY(((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) { + dlen += 3; + } + else { + dlen++; + } + } + + if (rspamd_substring_search(start, sz, "://", 3) == -1) { + if (sz >= sizeof("mailto:") && + (memcmp(start, "mailto:", sizeof("mailto:") - 1) == 0 || + memcmp(start, "tel:", sizeof("tel:") - 1) == 0 || + memcmp(start, "callto:", sizeof("callto:") - 1) == 0)) { + /* Exclusion, has valid but 'strange' prefix */ + } + else { + for (auto i = 0; i < sz; i++) { + if (!((s[i] & 0x80) || g_ascii_isalnum(s[i]))) { + if (i == 0 && sz > 2 && s[i] == '/' && s[i + 1] == '/') { + prefix = "http:"; + dlen += sizeof("http:") - 1; + no_prefix = TRUE; + } + else if (s[i] == '@') { + /* Likely email prefix */ + prefix = "mailto://"; + dlen += sizeof("mailto://") - 1; + no_prefix = TRUE; + } + else if (s[i] == ':' && i != 0) { + /* Special case */ + no_prefix = FALSE; + } + else { + if (i == 0) { + /* No valid data */ + return std::nullopt; + } + else { + no_prefix = TRUE; + dlen += strlen(prefix); + } + } + + break; + } + } + } + } + + auto *decoded = rspamd_mempool_alloc_buffer(pool, dlen + 1); + d = decoded; + + if (no_prefix) { + gsize plen = strlen(prefix); + memcpy(d, prefix, plen); + d += plen; + } + + /* + * We also need to remove all internal newlines, spaces + * and encode unsafe characters + * Another obfuscation find in the wild was encoding of the SAFE url characters, + * including essential ones + */ + for (auto i = 0; i < sz; i++) { + if (G_UNLIKELY(g_ascii_isspace(s[i]))) { + continue; + } + else if (G_UNLIKELY(((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) { + /* URL encode */ + *d++ = '%'; + *d++ = hexdigests[(s[i] >> 4) & 0xf]; + *d++ = hexdigests[s[i] & 0xf]; + has_bad_chars = TRUE; + } + else if (G_UNLIKELY(s[i] == '%')) { + if (i + 2 < sz) { + auto c1 = s[i + 1]; + auto c2 = s[i + 2]; + + if (g_ascii_isxdigit(c1) && g_ascii_isxdigit(c2)) { + auto codepoint = 0; + + if (c1 >= '0' && c1 <= '9') codepoint = c1 - '0'; + else if (c1 >= 'A' && c1 <= 'F') + codepoint = c1 - 'A' + 10; + else if (c1 >= 'a' && c1 <= 'f') + codepoint = c1 - 'a' + 10; + + codepoint <<= 4; + + if (c2 >= '0' && c2 <= '9') codepoint += c2 - '0'; + else if (c2 >= 'A' && c2 <= 'F') + codepoint += c2 - 'A' + 10; + else if (c2 >= 'a' && c2 <= 'f') + codepoint += c2 - 'a' + 10; + + /* Now check for 'interesting' codepoints */ + if (codepoint == '@' || codepoint == ':' || codepoint == '|' || + codepoint == '?' || codepoint == '\\' || codepoint == '/') { + /* Replace it back */ + *d++ = (char) (codepoint & 0xff); + i += 2; + } + else { + *d++ = s[i]; + } + } + else { + *d++ = s[i]; + } + } + else { + *d++ = s[i]; + } + } + else { + *d++ = s[i]; + } + } + + *d = '\0'; + dlen = d - decoded; + + url = rspamd_mempool_alloc0_type(pool, struct rspamd_url); + rspamd_url_normalise_propagate_flags(pool, decoded, &dlen, saved_flags); + rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF); + + /* Filter some completely damaged urls */ + if (rc == URI_ERRNO_OK && url->hostlen > 0 && + !((url->protocol & PROTOCOL_UNKNOWN))) { + url->flags |= saved_flags; + + if (has_bad_chars) { + url->flags |= RSPAMD_URL_FLAG_OBSCURED; + } + + if (no_prefix) { + url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; + + if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) { + /* Ignore urls with both no schema and no tld */ + return std::nullopt; + } + } + + decoded = url->string; + + input = {decoded, url->urllen}; + + /* Spaces in href usually mean an attempt to obfuscate URL */ + /* See https://github.com/vstakhov/rspamd/issues/593 */ +#if 0 + if (has_spaces) { + url->flags |= RSPAMD_URL_FLAG_OBSCURED; + } +#endif + + return url; + } + + return std::nullopt; +} + +}// namespace rspamd::html
\ No newline at end of file diff --git a/src/libserver/html/html_url.hxx b/src/libserver/html/html_url.hxx new file mode 100644 index 0000000..46dde6d --- /dev/null +++ b/src/libserver/html/html_url.hxx @@ -0,0 +1,68 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_HTML_URL_HXX +#define RSPAMD_HTML_URL_HXX +#pragma once + +#include "libutil/mem_pool.h" + +#include <string_view> +#include <optional> + +struct rspamd_url; /* Forward declaration */ + +namespace rspamd::html { + + +/** + * Checks if an html url is likely phished by some displayed url + * @param pool + * @param href_url + * @param text_data + * @return + */ +auto html_url_is_phished(rspamd_mempool_t *pool, + struct rspamd_url *href_url, + std::string_view text_data) -> std::optional<rspamd_url *>; + +/** + * Check displayed part of the url at specified offset + * @param pool + * @param exceptions + * @param url_set + * @param visible_part + * @param href_offset + * @param url + */ +auto html_check_displayed_url(rspamd_mempool_t *pool, + GList **exceptions, + void *url_set, + std::string_view visible_part, + goffset href_offset, + struct rspamd_url *url) -> void; + +/** + * Process HTML url (e.g. for href component) + * @param pool + * @param input may be modified during the process + * @return + */ +auto html_process_url(rspamd_mempool_t *pool, std::string_view &input) + -> std::optional<struct rspamd_url *>; +}// namespace rspamd::html + +#endif//RSPAMD_HTML_URL_HXX
\ No newline at end of file |