/*-
 * Copyright 2016 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "lua_common.h"
#include "message.h"
#include "libserver/html/html.h"
#include "libserver/html/html.hxx"
#include "libserver/html/html_tag.hxx"
#include "libserver/html/html_block.hxx"
#include "images.h"

#include "contrib/ankerl/unordered_dense.h"
#include <frozen/string.h>
#include <frozen/unordered_map.h>

/***
 * @module rspamd_html
 * This module provides different methods to access HTML tags. To get HTML context
 * from an HTML part you could use method `part:get_html()`
 * @example
rspamd_config.R_EMPTY_IMAGE = function(task)
	local tp = task:get_text_parts() -- get text parts in a message

	for _,p in ipairs(tp) do -- iterate over text parts array using `ipairs`
		if p:is_html() then -- if the current part is html part
			local hc = p:get_html() -- we get HTML context
			local len = p:get_length() -- and part's length

			if len < 50 then -- if we have a part that has less than 50 bytes of text
				local images = hc:get_images() -- then we check for HTML images

				if images then -- if there are images
					for _,i in ipairs(images) do -- then iterate over images in the part
						if i['height'] + i['width'] >= 400 then -- if we have a large image
							return true -- add symbol
						end
					end
				end
			end
		end
	end
end
 */

/***
 * @method html:has_tag(name)
 * Checks if a specified tag `name` is presented in a part
 * @param {string} name name of tag to check
 * @return {boolean} `true` if the tag exists in HTML tree
 */
LUA_FUNCTION_DEF(html, has_tag);

/***
 * @method html:has_property(name)
 * Checks if the HTML has a specific property. Here is the list of available properties:
 *
 * - `no_html` (alias `bad_start`) - no html tag presented
 * - `bad_element` (alias `bad_elements`) - part has some broken elements
 * - `xml` - part is xhtml
 * - `unknown_element` (alias `unknown_elements`) - part has some unknown elements
 * - `duplicate_element` (alias `duplicate_elements`) - part has some duplicate elements that should be unique (namely, `title` tag)
 * - `unbalanced` - part has unbalanced tags
 * - `data_urls` - part has the "has data urls" flag set
 *
 * Unknown property names yield `false`.
 * @param {string} name name of property
 * @return {boolean} true if the part has the specified property
 */
LUA_FUNCTION_DEF(html, has_property);

/***
 * @method html:get_images()
 * Returns a table of images found in html. Each image is, in turn, a table with the following fields:
 *
 * - `src` - link to the source (an `rspamd_text` instead of a string when the image is flagged as data); absent if the image has no source
 * - `url` - url object of the image, if any
 * - `tag` - html tag object of the image, if any
 * - `height` - height in pixels
 * - `width` - width in pixels
 * - `embedded` - `true` if an image is embedded in a message
 * - `data` - `true` if an image is flagged as a data image
 * @return {table} table of images in html part
 */
LUA_FUNCTION_DEF(html, get_images);

/***
 * @method html:foreach_tag(tagname, callback)
 * Processes HTML tree calling the specified callback for each tag of the specified
 * type. `tagname` is either a single tag name or a table of tag names; the special
 * name `any` matches every tag. An unknown tag name raises an error.
 *
 * Callback is called with the following attributes:
 *
 * - `tag`: html tag structure
 * - `content_length`: length of content within a tag
 * - `is_leaf`: `true` if the tag has no children
 *
 * Callback function should return `true` to **stop** processing and `false` to continue
 * @param {string|table} tagname tag name, or table of tag names, to visit
 * @param {function} callback function called for each matching tag
 * @return nothing
 */
LUA_FUNCTION_DEF(html, foreach_tag);

/***
 * @method html:get_invisible()
 * Returns invisible content of the HTML data
 * @return {rspamd_text} invisible content of the HTML data
 */
LUA_FUNCTION_DEF(html, get_invisible);

static const struct luaL_reg htmllib_m[] = {
	LUA_INTERFACE_DEF(html, has_tag),
	LUA_INTERFACE_DEF(html, has_property),
	LUA_INTERFACE_DEF(html, get_images),
	LUA_INTERFACE_DEF(html, foreach_tag),
	LUA_INTERFACE_DEF(html, get_invisible),
	{"__tostring", rspamd_lua_class_tostring},
	{NULL, NULL}};

/***
 * @method html_tag:get_type()
 * Returns string representation of HTML type for a tag
 * @return {string} type of tag
 */
LUA_FUNCTION_DEF(html_tag, get_type);

/***
 * @method html_tag:get_extra()
 * Returns extra data associated with the tag
 * @return {url|image|nil} extra data associated with the tag
 */
LUA_FUNCTION_DEF(html_tag, get_extra);

/***
 * @method html_tag:get_parent()
 * Returns parent node for a specified tag
 * @return {html_tag|nil} parent object for a specified tag, `nil` if there is no parent
 */
LUA_FUNCTION_DEF(html_tag, get_parent);

/***
 * @method html_tag:get_flags()
 * Returns flags of a specified tag as an array of strings; only the flags that
 * are set are listed:
 *
 * - `href`: tag has the href flag set
 * - `closed`: tag is properly closed
 * - `broken`: tag is somehow broken
 * - `xml`: tag is xml tag
 * - `unbalanced`: tag is unbalanced
 * @return {table} table of flags
 */
LUA_FUNCTION_DEF(html_tag, get_flags);

/***
 * @method html_tag:get_content()
 * Returns content of tag (approximate for some cases)
 * @return {rspamd_text|nil} rspamd text with tag's content, `nil` if the content is empty or unavailable
 */
LUA_FUNCTION_DEF(html_tag, get_content);

/***
 * @method html_tag:get_content_length()
 * Returns length of a tag's content
 * @return {number} size of content enclosed within a tag
 */
LUA_FUNCTION_DEF(html_tag, get_content_length);

/***
 * @method html_tag:get_style()
 * Returns style calculated for the element. The table has the following fields:
 *
 * - `color` - foreground colour as `{r, g, b, alpha}` (only if set)
 * - `bgcolor` - background colour as `{r, g, b, alpha}` (only if set)
 * - `font_size` - font size (only if set)
 * - `visible` - `true` if the block is visible
 * - `transparent` - `true` if the block is transparent
 * @return {table|nil} table associated with the style, `nil` if the tag has no style block
 */
LUA_FUNCTION_DEF(html_tag, get_style);

/***
 * @method html_tag:get_attribute(name)
 * Returns value of attribute for the element
 * Refer to `html_components_map` in `src/libserver/html/html.cxx` for recognised names
 * @param {string} name name of the attribute
 * @return {string|nil} value of the attribute
 */
LUA_FUNCTION_DEF(html_tag, get_attribute);

static const struct luaL_reg taglib_m[] = {
	LUA_INTERFACE_DEF(html_tag, get_type),
	LUA_INTERFACE_DEF(html_tag, get_extra),
	LUA_INTERFACE_DEF(html_tag, get_parent),
	LUA_INTERFACE_DEF(html_tag, get_flags),
	LUA_INTERFACE_DEF(html_tag, get_content),
	LUA_INTERFACE_DEF(html_tag, get_content_length),
	LUA_INTERFACE_DEF(html_tag, get_style),
	LUA_INTERFACE_DEF(html_tag, get_attribute),
	{"__tostring", rspamd_lua_class_tostring},
	{NULL, NULL}};

/*
 * Fetches the `rspamd{html}` userdata at stack position `pos` and returns the
 * html_content pointer stored in it; raises a Lua argument error if the
 * userdata check fails.
 */
static struct rspamd::html::html_content *
lua_check_html(lua_State *L, gint pos)
{
	void *ud = rspamd_lua_check_udata(L, pos, "rspamd{html}");
	luaL_argcheck(L, ud != NULL, pos, "'html' expected");
	return ud ? *((struct rspamd::html::html_content **) ud) : NULL;
}

/*
 * Payload of an `rspamd{html_tag}` userdata: the tag itself plus the html
 * content it was taken from (`html` may be NULL, see lua_html_push_image).
 */
struct lua_html_tag {
	rspamd::html::html_content *html;
	const rspamd::html::html_tag *tag;
};

/*
 * Fetches the `rspamd{html_tag}` userdata at stack position `pos`; raises a
 * Lua argument error if the userdata check fails.
 */
static struct lua_html_tag *
lua_check_html_tag(lua_State *L, gint pos)
{
	void *ud = rspamd_lua_check_udata(L, pos, "rspamd{html_tag}");
	luaL_argcheck(L, ud != NULL, pos, "'html_tag' expected");
	return ud ? ((struct lua_html_tag *) ud) : NULL;
}

static gint
lua_html_has_tag(lua_State *L)
{
	LUA_TRACE_POINT;
	auto *hc = lua_check_html(L, 1);
	const gchar *tagname = luaL_checkstring(L, 2);
	gboolean ret = FALSE;

	if (hc && tagname && rspamd_html_tag_seen(hc, tagname)) {
		ret = TRUE;
	}

	lua_pushboolean(L, ret);

	return 1;
}

/* Maps property names accepted by html:has_property() to html content flags */
constexpr const auto prop_map = frozen::make_unordered_map<frozen::string, int>({
	{"no_html", RSPAMD_HTML_FLAG_BAD_START},
	{"bad_start", RSPAMD_HTML_FLAG_BAD_START},
	{"bad_element", RSPAMD_HTML_FLAG_BAD_ELEMENTS},
	{"bad_elements", RSPAMD_HTML_FLAG_BAD_ELEMENTS},
	{"xml", RSPAMD_HTML_FLAG_XML},
	{"unknown_element", RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS},
	{"unknown_elements", RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS},
	{"duplicate_element", RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS},
	{"duplicate_elements", RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS},
	{"unbalanced", RSPAMD_HTML_FLAG_UNBALANCED},
	{"data_urls", RSPAMD_HTML_FLAG_HAS_DATA_URLS},
});

static gint
lua_html_has_property(lua_State *L)
{
	LUA_TRACE_POINT;
	auto *hc = lua_check_html(L, 1);
	const gchar *propname = luaL_checkstring(L, 2);
	gboolean ret = FALSE;

	if (hc && propname) {
		auto found_prop = prop_map.find(frozen::string(propname));

		if (found_prop != prop_map.end()) {
			ret = hc->flags & found_prop->second;
		}
	}

	lua_pushboolean(L, ret);

	return 1;
}

/*
 * Pushes a table describing `img` and leaves it on top of the Lua stack.
 * Optional fields (`src`, `url`, `tag`) are set only when the corresponding
 * pointer in `img` is non-NULL.
 */
static void
lua_html_push_image(lua_State *L, const struct html_image *img)
{
	LUA_TRACE_POINT;
	struct lua_html_tag *ltag;
	struct rspamd_url **purl;

	lua_createtable(L, 0, 7);

	if (img->src) {
		lua_pushstring(L, "src");

		if (img->flags & RSPAMD_HTML_FLAG_IMAGE_DATA) {
			/* Data images are exposed as rspamd{text} referencing img->src directly */
			struct rspamd_lua_text *t;

			t = static_cast<rspamd_lua_text *>(lua_newuserdata(L, sizeof(*t)));
			t->start = img->src;
			t->len = strlen(img->src);
			t->flags = 0;

			rspamd_lua_setclass(L, "rspamd{text}", -1);
		}
		else {
			lua_pushstring(L, img->src);
		}

		lua_settable(L, -3);
	}

	if (img->url) {
		lua_pushstring(L, "url");
		purl = static_cast<rspamd_url **>(lua_newuserdata(L, sizeof(gpointer)));
		*purl = img->url;
		rspamd_lua_setclass(L, "rspamd{url}", -1);
		lua_settable(L, -3);
	}

	if (img->tag) {
		lua_pushstring(L, "tag");
		ltag = static_cast<lua_html_tag *>(lua_newuserdata(L, sizeof(struct lua_html_tag)));
		ltag->tag = static_cast<rspamd::html::html_tag *>(img->tag);
		/*
		 * No html content is available here, so html_tag:get_content() on
		 * this tag returns nil and get_content_length() uses the tag's own
		 * length.
		 */
		ltag->html = NULL;
		rspamd_lua_setclass(L, "rspamd{html_tag}", -1);
		lua_settable(L, -3);
	}

	lua_pushstring(L, "height");
	lua_pushinteger(L, img->height);
	lua_settable(L, -3);
	lua_pushstring(L, "width");
	lua_pushinteger(L, img->width);
	lua_settable(L, -3);
	lua_pushstring(L, "embedded");
	lua_pushboolean(L, img->flags & RSPAMD_HTML_FLAG_IMAGE_EMBEDDED);
	lua_settable(L, -3);
	lua_pushstring(L, "data");
	lua_pushboolean(L, img->flags & RSPAMD_HTML_FLAG_IMAGE_DATA);
	lua_settable(L, -3);
}

static gint
lua_html_get_images(lua_State *L)
{
	LUA_TRACE_POINT;
	auto *hc = lua_check_html(L, 1);
	guint i = 1;

	if (hc != NULL) {
		lua_createtable(L, hc->images.size(), 0);

		for (const auto *img: hc->images) {
			lua_html_push_image(L, img);
			lua_rawseti(L, -2, i++);
		}
	}
	else {
		lua_newtable(L);
	}

	return 1;
}

/*
 * Pushes a table describing the style block `bl` and leaves it on top of the
 * Lua stack. Colours are arrays of {r, g, b, alpha}; `color`, `bgcolor` and
 * `font_size` are present only when the matching mask in `bl` is set.
 */
static void
lua_html_push_block(lua_State *L, const struct rspamd::html::html_block *bl)
{
	LUA_TRACE_POINT;

	lua_createtable(L, 0, 6);

	if (bl->fg_color_mask) {
		lua_pushstring(L, "color");
		lua_createtable(L, 4, 0);
		lua_pushinteger(L, bl->fg_color.r);
		lua_rawseti(L, -2, 1);
		lua_pushinteger(L, bl->fg_color.g);
		lua_rawseti(L, -2, 2);
		lua_pushinteger(L, bl->fg_color.b);
		lua_rawseti(L, -2, 3);
		lua_pushinteger(L, bl->fg_color.alpha);
		lua_rawseti(L, -2, 4);
		lua_settable(L, -3);
	}

	if (bl->bg_color_mask) {
		lua_pushstring(L, "bgcolor");
		lua_createtable(L, 4, 0);
		lua_pushinteger(L, bl->bg_color.r);
		lua_rawseti(L, -2, 1);
		lua_pushinteger(L, bl->bg_color.g);
		lua_rawseti(L, -2, 2);
		lua_pushinteger(L, bl->bg_color.b);
		lua_rawseti(L, -2, 3);
		lua_pushinteger(L, bl->bg_color.alpha);
		lua_rawseti(L, -2, 4);
		lua_settable(L, -3);
	}

	if (bl->font_mask) {
		lua_pushstring(L, "font_size");
		lua_pushinteger(L, bl->font_size);
		lua_settable(L, -3);
	}

	lua_pushstring(L, "visible");
	lua_pushboolean(L, bl->is_visible());
	lua_settable(L, -3);

	lua_pushstring(L, "transparent");
	lua_pushboolean(L, bl->is_transparent());
	lua_settable(L, -3);
}

static gint
lua_html_foreach_tag(lua_State *L)
{
	LUA_TRACE_POINT;
	auto *hc = lua_check_html(L, 1);
	const gchar *tagname;
	auto any = false;
	/*
	 * NOTE(review): luaL_error() below is called while `tags` is alive; if the
	 * Lua runtime unwinds with longjmp the set's destructor may be skipped —
	 * confirm against the Lua build in use.
	 */
	ankerl::unordered_dense::set<int> tags;

	/* Registers one requested tag name; returns false if the name is unknown */
	auto add_tagname = [&](const gchar *name) -> bool {
		if (strcmp(name, "any") == 0) {
			any = true;
			return true;
		}

		const gint id = rspamd_html_tag_by_name(name);

		if (id == -1) {
			return false;
		}

		tags.insert(id);
		return true;
	};

	if (lua_type(L, 2) == LUA_TSTRING) {
		tagname = luaL_checkstring(L, 2);

		if (!add_tagname(tagname)) {
			return luaL_error(L, "invalid tagname: %s", tagname);
		}
	}
	else if (lua_type(L, 2) == LUA_TTABLE) {
		/* Iterate over a copy of the table reference pushed on top of the stack */
		lua_pushvalue(L, 2);

		for (lua_pushnil(L); lua_next(L, -2); lua_pop(L, 1)) {
			tagname = luaL_checkstring(L, -1);

			if (!add_tagname(tagname)) {
				return luaL_error(L, "invalid tagname: %s", tagname);
			}
		}

		lua_pop(L, 1);
	}

	if (hc && (any || !tags.empty()) && lua_isfunction(L, 3)) {
		hc->traverse_all_tags([&](const rspamd::html::html_tag *tag) -> bool {
			if (tag && (any || tags.contains(tag->id))) {
				lua_pushcfunction(L, &rspamd_lua_traceback);
				auto err_idx = lua_gettop(L);
				lua_pushvalue(L, 3);

				/* Argument 1: the tag object */
				auto *ltag = static_cast<lua_html_tag *>(lua_newuserdata(L, sizeof(lua_html_tag)));
				ltag->tag = tag;
				ltag->html = hc;
				auto ct = ltag->tag->get_content(hc);
				rspamd_lua_setclass(L, "rspamd{html_tag}", -1);
				/* Argument 2: content length */
				lua_pushinteger(L, ct.size());
				/* Argument 3: leaf flag */
				lua_pushboolean(L, tag->children.empty());

				if (lua_pcall(L, 3, 1, err_idx) != 0) {
					msg_err("error in foreach_tag callback: %s", lua_tostring(L, -1));
					lua_settop(L, err_idx - 1);

					return false;
				}

				/* A true result from the callback stops the traversal */
				const bool stop = lua_toboolean(L, -1);
				lua_settop(L, err_idx - 1);

				if (stop) {
					return false;
				}
			}

			return true;
		});
	}
	else {
		return luaL_error(L, "invalid arguments");
	}

	return 0;
}

static gint
lua_html_get_invisible(lua_State *L)
{
	LUA_TRACE_POINT;
	auto *hc = lua_check_html(L, 1);

	if (hc != NULL) {
		lua_new_text(L, hc->invisible.c_str(), hc->invisible.size(), false);
	}
	else {
		lua_newtable(L);
	}

	return 1;
}

static gint
lua_html_tag_get_type(lua_State *L)
{
	LUA_TRACE_POINT;
	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);
	const gchar *tagname;

	if (ltag != NULL) {
		tagname = rspamd_html_tag_by_id(ltag->tag->id);

		if (tagname) {
			lua_pushstring(L, tagname);
		}
		else {
			lua_pushnil(L);
		}
	}
	else {
		return luaL_error(L, "invalid arguments");
	}

	return 1;
}

static gint
lua_html_tag_get_parent(lua_State *L)
{
	LUA_TRACE_POINT;
	struct lua_html_tag *ltag = lua_check_html_tag(L, 1), *ptag;

	if (ltag != NULL) {
		auto *parent = ltag->tag->parent;

		if (parent) {
			/* The parent shares the html content of the child */
			ptag = static_cast<lua_html_tag *>(lua_newuserdata(L, sizeof(*ptag)));
			ptag->tag = static_cast<rspamd::html::html_tag *>(parent);
			ptag->html = ltag->html;
			rspamd_lua_setclass(L, "rspamd{html_tag}", -1);
		}
		else {
			lua_pushnil(L);
		}
	}
	else {
		return luaL_error(L, "invalid arguments");
	}

	return 1;
}

static gint
lua_html_tag_get_flags(lua_State *L)
{
	LUA_TRACE_POINT;
	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);
	gint i = 1;

	if (ltag && ltag->tag) {
		/* Push flags */
		lua_createtable(L, 4, 0);

		if (ltag->tag->flags & FL_HREF) {
			lua_pushstring(L, "href");
			lua_rawseti(L, -2, i++);
		}

		if (ltag->tag->flags & FL_CLOSED) {
			lua_pushstring(L, "closed");
			lua_rawseti(L, -2, i++);
		}

		if (ltag->tag->flags & FL_BROKEN) {
			lua_pushstring(L, "broken");
			lua_rawseti(L, -2, i++);
		}

		if (ltag->tag->flags & FL_XML) {
			lua_pushstring(L, "xml");
			lua_rawseti(L, -2, i++);
		}

		/*
		 * NOTE(review): unlike the FL_* checks above, this tests the tag's
		 * flags against RSPAMD_HTML_FLAG_UNBALANCED, which prop_map uses as an
		 * html content flag — confirm this is the intended bit.
		 */
		if (ltag->tag->flags & RSPAMD_HTML_FLAG_UNBALANCED) {
			lua_pushstring(L, "unbalanced");
			lua_rawseti(L, -2, i++);
		}
	}
	else {
		return luaL_error(L, "invalid arguments");
	}

	return 1;
}

static gint
lua_html_tag_get_content(lua_State *L)
{
	LUA_TRACE_POINT;
	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);
	struct rspamd_lua_text *t;

	if (ltag) {
		if (ltag->html) {
			auto ct = ltag->tag->get_content(ltag->html);

			if (ct.size() > 0) {
				t = static_cast<rspamd_lua_text *>(lua_newuserdata(L, sizeof(*t)));
				rspamd_lua_setclass(L, "rspamd{text}", -1);
				t->start = ct.data();
				t->len = ct.size();
				t->flags = 0;
			}
			else {
				lua_pushnil(L);
			}
		}
		else {
			/* Content cannot be resolved without the owning html content */
			lua_pushnil(L);
		}
	}
	else {
		return luaL_error(L, "invalid arguments");
	}

	return 1;
}

static gint
lua_html_tag_get_content_length(lua_State *L)
{
	LUA_TRACE_POINT;
	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);

	if (ltag) {
		if (ltag->html) {
			auto ct = ltag->tag->get_content(ltag->html);
			lua_pushinteger(L, ct.size());
		}
		else {
			/* Without html content fall back to the length reported by the tag */
			lua_pushinteger(L, ltag->tag->get_content_length());
		}
	}
	else {
		return luaL_error(L, "invalid arguments");
	}

	return 1;
}

static gint
lua_html_tag_get_extra(lua_State *L)
{
	LUA_TRACE_POINT;
	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);
	struct html_image *img;

	if (ltag) {
		if (!std::holds_alternative<std::monostate>(ltag->tag->extra)) {
			if (std::holds_alternative<struct html_image *>(ltag->tag->extra)) {
				img = std::get<struct html_image *>(ltag->tag->extra);
				lua_html_push_image(L, img);
			}
			else if (std::holds_alternative<struct rspamd_url *>(ltag->tag->extra)) {
				/* For A that's URL */
				auto *lua_url = static_cast<rspamd_lua_url *>(lua_newuserdata(L, sizeof(rspamd_lua_url)));
				lua_url->url = std::get<struct rspamd_url *>(ltag->tag->extra);
				rspamd_lua_setclass(L, "rspamd{url}", -1);
			}
			else {
				/* Unknown extra ? */
				lua_pushnil(L);
			}
		}
		else {
			lua_pushnil(L);
		}
	}
	else {
		return luaL_error(L, "invalid arguments");
	}

	return 1;
}

static gint
lua_html_tag_get_style(lua_State *L)
{
	LUA_TRACE_POINT;
	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);

	if (ltag) {
		if (ltag->tag->block) {
			lua_html_push_block(L, ltag->tag->block);
		}
		else {
			/*
			 * One value is always returned, so something must be pushed even
			 * when the tag has no style block; otherwise the caller would get
			 * whatever happens to be on top of the stack.
			 */
			lua_pushnil(L);
		}
	}
	else {
		return luaL_error(L, "invalid arguments");
	}

	return 1;
}

static gint
lua_html_tag_get_attribute(lua_State *L)
{
	LUA_TRACE_POINT;
	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);
	gsize slen;
	const gchar *attr_name = luaL_checklstring(L, 2, &slen);

	if (ltag && attr_name) {
		auto maybe_attr = ltag->tag->find_component(
			rspamd::html::html_component_from_string({attr_name, slen}));

		if (maybe_attr) {
			lua_pushlstring(L, maybe_attr->data(), maybe_attr->size());
		}
		else {
			lua_pushnil(L);
		}
	}
	else {
		return luaL_error(L, "invalid arguments");
	}

	return 1;
}

/* Registers the `rspamd{html}` and `rspamd{html_tag}` classes in the Lua state */
void luaopen_html(lua_State *L)
{
	rspamd_lua_new_class(L, "rspamd{html}", htmllib_m);
	lua_pop(L, 1);
	rspamd_lua_new_class(L, "rspamd{html_tag}", taglib_m);
	lua_pop(L, 1);
}