summaryrefslogtreecommitdiffstats
path: root/rules/html.lua
diff options
context:
space:
mode:
Diffstat (limited to 'rules/html.lua')
-rw-r--r--rules/html.lua462
1 files changed, 462 insertions, 0 deletions
diff --git a/rules/html.lua b/rules/html.lua
new file mode 100644
index 0000000..7c352c2
--- /dev/null
+++ b/rules/html.lua
@@ -0,0 +1,462 @@
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to you under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at:
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+
+local reconf = config['regexp']
+
+local rspamd_regexp = require "rspamd_regexp"
+
+-- Messages that have only HTML part
+reconf['MIME_HTML_ONLY'] = {
+ re = 'has_only_html_part()',
+ score = 0.2,
+ description = 'Message has only an HTML part',
+ group = 'headers'
+}
+
+local function has_anchor_parent(tag)
+ local parent = tag
+ repeat
+ parent = parent:get_parent()
+ if parent then
+ if parent:get_type() == 'a' then
+ return true
+ end
+ end
+ until not parent
+
+ return false
+end
+
+local function check_html_image(task, min, max)
+ local tp = task:get_text_parts()
+
+ for _, p in ipairs(tp) do
+ if p:is_html() then
+ local hc = p:get_html()
+ local len = p:get_length()
+
+ if hc and len >= min and len < max then
+ local images = hc:get_images()
+ if images then
+ for _, i in ipairs(images) do
+ local tag = i['tag']
+ if tag then
+ if has_anchor_parent(tag) then
+ -- do not trigger on small and unknown size images
+ if i['height'] + i['width'] >= 210 and i['embedded'] then
+ return true
+ end
+ end
+ end
+ end
+ end
+ end
+ end
+ end
+end
+
+rspamd_config.HTML_SHORT_LINK_IMG_1 = {
+ callback = function(task)
+ return check_html_image(task, 0, 1024)
+ end,
+ score = 2.0,
+ group = 'html',
+ description = 'Short HTML part (0..1K) with a link to an image'
+}
+
+rspamd_config.HTML_SHORT_LINK_IMG_2 = {
+ callback = function(task)
+ return check_html_image(task, 1024, 1536)
+ end,
+ score = 1.0,
+ group = 'html',
+ description = 'Short HTML part (1K..1.5K) with a link to an image'
+}
+
+rspamd_config.HTML_SHORT_LINK_IMG_3 = {
+ callback = function(task)
+ return check_html_image(task, 1536, 2048)
+ end,
+ score = 0.5,
+ group = 'html',
+ description = 'Short HTML part (1.5K..2K) with a link to an image'
+}
+
+rspamd_config.R_EMPTY_IMAGE = {
+ callback = function(task)
+ local tp = task:get_text_parts() -- get text parts in a message
+
+ for _, p in ipairs(tp) do
+ -- iterate over text parts array using `ipairs`
+ if p:is_html() then
+ -- if the current part is html part
+ local hc = p:get_html() -- we get HTML context
+ local len = p:get_length() -- and part's length
+ if hc and len < 50 then
+ -- if we have a part that has less than 50 bytes of text
+ local images = hc:get_images() -- then we check for HTML images
+
+ if images then
+ -- if there are images
+ for _, i in ipairs(images) do
+ -- then iterate over images in the part
+ if i['height'] + i['width'] >= 400 then
+ -- if we have a large image
+ local tag = i['tag']
+ if tag then
+ if not has_anchor_parent(tag) then
+ return true
+ end
+ end
+ end
+ end
+ end
+ end
+ end
+ end
+ end,
+
+ score = 2.0,
+ group = 'html',
+ description = 'Message contains empty parts and image'
+}
+
+rspamd_config.R_SUSPICIOUS_IMAGES = {
+ callback = function(task)
+ local tp = task:get_text_parts() -- get text parts in a message
+
+ for _, p in ipairs(tp) do
+ local h = p:get_html()
+
+ if h then
+ local l = p:get_words_count()
+ local img = h:get_images()
+ local pic_words = 0
+
+ if img then
+ for _, i in ipairs(img) do
+ local dim = i['width'] + i['height']
+ local tag = i['tag']
+
+ if tag then
+ if has_anchor_parent(tag) then
+ if dim > 100 and dim < 3000 then
+ -- We assume that a single picture 100x200 contains approx 3 words of text
+ pic_words = pic_words + dim / 100
+ end
+ end
+ end
+ end
+ end
+
+ if l + pic_words > 0 then
+ local rel = pic_words / (l + pic_words)
+
+ if rel > 0.5 then
+ return true, (rel - 0.5) * 2
+ end
+ end
+ end
+ end
+
+ return false
+ end,
+
+ score = 5.0,
+ group = 'html',
+ description = 'Message contains many suspicious messages'
+}
+
+local vis_check_id = rspamd_config:register_symbol {
+ name = 'HTML_VISIBLE_CHECKS',
+ type = 'callback',
+ group = 'html',
+ callback = function(task)
+ --local logger = require "rspamd_logger"
+ local tp = task:get_text_parts() -- get text parts in a message
+ local ret = false
+ local transp_rate = 0
+ local invisible_blocks = 0
+ local zero_size_blocks = 0
+ local arg
+
+ local normal_len = 0
+ local transp_len = 0
+
+ for _, p in ipairs(tp) do
+ -- iterate over text parts array using `ipairs`
+ normal_len = normal_len + p:get_length()
+ if p:is_html() and p:get_html() then
+ -- if the current part is html part
+ local hc = p:get_html() -- we get HTML context
+
+ hc:foreach_tag({ 'font', 'span', 'div', 'p', 'td' }, function(tag, clen, is_leaf)
+ local bl = tag:get_style()
+ if bl then
+ if not bl.visible and clen > 0 and is_leaf then
+ invisible_blocks = invisible_blocks + 1
+ end
+
+ if (bl.font_size or 12) == 0 and clen > 0 and is_leaf then
+ zero_size_blocks = zero_size_blocks + 1
+ end
+
+ if bl.transparent and is_leaf then
+ ret = true
+ invisible_blocks = invisible_blocks + 1 -- This block is invisible
+ transp_len = transp_len + clen
+ normal_len = normal_len - clen
+ local tr = transp_len / (normal_len + transp_len)
+ if tr > transp_rate then
+ transp_rate = tr
+ if not bl.color then
+ bl.color = { 0, 0, 0 }
+ end
+ if not bl.bgcolor then
+ bl.bgcolor = { 0, 0, 0 }
+ end
+ arg = string.format('%s color #%x%x%x bgcolor #%x%x%x',
+ tag:get_type(),
+ bl.color[1], bl.color[2], bl.color[3],
+ bl.bgcolor[1], bl.bgcolor[2], bl.bgcolor[3])
+ end
+ end
+ end
+
+ return false -- Continue search
+ end)
+
+ end
+ end
+
+ if ret then
+ transp_rate = transp_len / (normal_len + transp_len)
+
+ if transp_rate > 0.1 then
+ if transp_rate > 0.5 or transp_rate ~= transp_rate then
+ transp_rate = 0.5
+ end
+
+ task:insert_result('R_WHITE_ON_WHITE', (transp_rate * 2.0), arg)
+ end
+ end
+
+ if invisible_blocks > 0 then
+ if invisible_blocks > 10 then
+ invisible_blocks = 10
+ end
+ local rates = { -- From 1 to 10
+ 0.05,
+ 0.1,
+ 0.2,
+ 0.3,
+ 0.4,
+ 0.5,
+ 0.6,
+ 0.7,
+ 0.8,
+ 1.0,
+ }
+ task:insert_result('MANY_INVISIBLE_PARTS', rates[invisible_blocks],
+ tostring(invisible_blocks))
+ end
+
+ if zero_size_blocks > 0 then
+ if zero_size_blocks > 5 then
+ if zero_size_blocks > 10 then
+ -- Full score
+ task:insert_result('ZERO_FONT', 1.0,
+ tostring(zero_size_blocks))
+ else
+ zero_size_blocks = 5
+ end
+ end
+
+ if zero_size_blocks <= 5 then
+ local rates = { -- From 1 to 5
+ 0.1,
+ 0.2,
+ 0.2,
+ 0.3,
+ 0.5,
+ }
+ task:insert_result('ZERO_FONT', rates[zero_size_blocks],
+ tostring(zero_size_blocks))
+ end
+ end
+ end,
+}
+
+rspamd_config:register_symbol {
+ type = 'virtual',
+ parent = vis_check_id,
+ name = 'R_WHITE_ON_WHITE',
+ description = 'Message contains low contrast text',
+ score = 4.0,
+ group = 'html',
+ one_shot = true,
+}
+
+rspamd_config:register_symbol {
+ type = 'virtual',
+ parent = vis_check_id,
+ name = 'ZERO_FONT',
+ description = 'Zero sized font used',
+ score = 1.0, -- Reached if more than 5 elements have zero size
+ one_shot = true,
+ group = 'html'
+}
+
+rspamd_config:register_symbol {
+ type = 'virtual',
+ parent = vis_check_id,
+ name = 'MANY_INVISIBLE_PARTS',
+ description = 'Many parts are visually hidden',
+ score = 1.0, -- Reached if more than 10 elements are hidden
+ one_shot = true,
+ group = 'html'
+}
+
+rspamd_config.EXT_CSS = {
+ callback = function(task)
+ local regexp_lib = require "rspamd_regexp"
+ local re = regexp_lib.create_cached('/^.*\\.css(?:[?#].*)?$/i')
+ local tp = task:get_text_parts() -- get text parts in a message
+ local ret = false
+ for _, p in ipairs(tp) do
+ -- iterate over text parts array using `ipairs`
+ if p:is_html() and p:get_html() then
+ -- if the current part is html part
+ local hc = p:get_html() -- we get HTML context
+ hc:foreach_tag({ 'link' }, function(tag)
+ local bl = tag:get_extra()
+ if bl then
+ local s = tostring(bl)
+ if s and re:match(s) then
+ ret = true
+ end
+ end
+
+ return ret -- Continue search
+ end)
+
+ end
+ end
+
+ return ret
+ end,
+
+ score = 1.0,
+ group = 'html',
+ description = 'Message contains external CSS reference'
+}
+
+local https_re = rspamd_regexp.create_cached('/^https:/i')
+
+rspamd_config.HTTP_TO_HTTPS = {
+ callback = function(task)
+ local found_opts
+ local tp = task:get_text_parts() or {}
+
+ for _, p in ipairs(tp) do
+ if p:is_html() then
+ local hc = p:get_html()
+ if (not hc) then
+ return false
+ end
+
+ local found = false
+
+ hc:foreach_tag('a', function(tag, _)
+ -- Skip this loop if we already have a match
+ if (found) then
+ return true
+ end
+
+ local c = tag:get_content()
+ if (c) then
+ if (not https_re:match(c)) then
+ return false
+ end
+
+ local u = tag:get_extra()
+ if (not u) then
+ return false
+ end
+ local url_proto = u:get_protocol()
+
+ if url_proto ~= 'http' then
+ return false
+ end
+ -- Capture matches for http in href to https in visible part only
+ found = true
+ found_opts = u:get_host()
+ return true
+ end
+
+ return false
+ end)
+
+ if (found) then
+ return true, 1.0, found_opts
+ end
+
+ return false
+ end
+ end
+ return false
+ end,
+ description = 'The anchor text contains a distinct scheme compared to the target URL',
+ score = 0.5,
+ group = 'html'
+}
+
+rspamd_config.HTTP_TO_IP = {
+ callback = function(task)
+ local tp = task:get_text_parts()
+ if (not tp) then
+ return false
+ end
+ for _, p in ipairs(tp) do
+ if p:is_html() then
+ local hc = p:get_html()
+ if (not hc) then
+ return false
+ end
+ local found = false
+ hc:foreach_tag('a', function(tag, length)
+ if (found) then
+ return true
+ end
+ local u = tag:get_extra()
+ if (u) then
+ u = tostring(u):lower()
+ if (u:match('^https?://%d+%.%d+%.%d+%.%d+')) then
+ found = true
+ end
+ end
+ return false
+ end)
+ if found then
+ return true
+ end
+ return false
+ end
+ end
+ end,
+ description = 'HTML anchor points to an IP address',
+ score = 1.0,
+ group = 'html'
+}