diff options
Diffstat (limited to '')
-rw-r--r-- | lualib/lua_magic/heuristics.lua | 605 | ||||
-rw-r--r-- | lualib/lua_magic/init.lua | 388 | ||||
-rw-r--r-- | lualib/lua_magic/patterns.lua | 471 | ||||
-rw-r--r-- | lualib/lua_magic/types.lua | 327 |
4 files changed, 1791 insertions, 0 deletions
diff --git a/lualib/lua_magic/heuristics.lua b/lualib/lua_magic/heuristics.lua new file mode 100644 index 0000000..b8a1b41 --- /dev/null +++ b/lualib/lua_magic/heuristics.lua @@ -0,0 +1,605 @@ +--[[ +Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com> + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +]]-- + +--[[[ +-- @module lua_magic/heuristics +-- This module contains heuristics for some specific cases +--]] + +local rspamd_trie = require "rspamd_trie" +local rspamd_util = require "rspamd_util" +local lua_util = require "lua_util" +local bit = require "bit" +local fun = require "fun" + +local N = "lua_magic" +local msoffice_trie +local msoffice_patterns = { + doc = { [[WordDocument]] }, + xls = { [[Workbook]], [[Book]] }, + ppt = { [[PowerPoint Document]], [[Current User]] }, + vsd = { [[VisioDocument]] }, +} +local msoffice_trie_clsid +local msoffice_clsids = { + doc = { [[0609020000000000c000000000000046]] }, + xls = { [[1008020000000000c000000000000046]], [[2008020000000000c000000000000046]] }, + ppt = { [[108d81649b4fcf1186ea00aa00b929e8]] }, + msg = { [[46f0060000000000c000000000000046]], [[0b0d020000000000c000000000000046]] }, + msi = { [[84100c0000000000c000000000000046]] }, +} +local zip_trie +local zip_patterns = { + -- https://lists.oasis-open.org/archives/office/200505/msg00006.html + odt = { + [[mimetypeapplication/vnd\.oasis\.opendocument\.text]], + [[mimetypeapplication/vnd\.oasis\.opendocument\.image]], + [[mimetypeapplication/vnd\.oasis\.opendocument\.graphic]] + 
}, + ods = { + [[mimetypeapplication/vnd\.oasis\.opendocument\.spreadsheet]], + [[mimetypeapplication/vnd\.oasis\.opendocument\.formula]], + [[mimetypeapplication/vnd\.oasis\.opendocument\.chart]] + }, + odp = { [[mimetypeapplication/vnd\.oasis\.opendocument\.presentation]] }, + epub = { [[epub\+zip]] }, + asice = { [[mimetypeapplication/vnd\.etsi\.asic-e\+zipPK]] }, + asics = { [[mimetypeapplication/vnd\.etsi\.asic-s\+zipPK]] }, +} + +local txt_trie +local txt_patterns = { + html = { + { [=[(?i)<html[\s>]]=], 32 }, + { [[(?i)<script\b]], 20 }, -- Commonly used by spammers + { [[<script\s+type="text\/javascript">]], 31 }, -- Another spammy pattern + { [[(?i)<\!DOCTYPE HTML\b]], 33 }, + { [[(?i)<body\b]], 20 }, + { [[(?i)<table\b]], 20 }, + { [[(?i)<a\s]], 10 }, + { [[(?i)<p\b]], 10 }, + { [[(?i)<div\b]], 10 }, + { [[(?i)<span\b]], 10 }, + }, + csv = { + { [[(?:[-a-zA-Z0-9_]+\s*,){2,}(?:[-a-zA-Z0-9_]+,?[ ]*[\r\n])]], 20 } + }, + ics = { + { [[^BEGIN:VCALENDAR\r?\n]], 40 }, + }, + vcf = { + { [[^BEGIN:VCARD\r?\n]], 40 }, + }, + xml = { + { [[<\?xml\b.+\?>]], 31 }, + } +} + +-- Used to match pattern index and extension +local msoffice_clsid_indexes = {} +local msoffice_patterns_indexes = {} +local zip_patterns_indexes = {} +local txt_patterns_indexes = {} + +local exports = {} + +local function compile_tries() + local default_compile_flags = bit.bor(rspamd_trie.flags.re, + rspamd_trie.flags.dot_all, + rspamd_trie.flags.single_match, + rspamd_trie.flags.no_start) + local function compile_pats(patterns, indexes, transform_func, compile_flags) + local strs = {} + for ext, pats in pairs(patterns) do + for _, pat in ipairs(pats) do + -- These are utf16 strings in fact... + strs[#strs + 1] = transform_func(pat) + indexes[#indexes + 1] = { ext, pat } + end + end + + return rspamd_trie.create(strs, compile_flags or default_compile_flags) + end + + if not msoffice_trie then + -- Directory names + local function msoffice_pattern_transform(pat) + return '^' .. 
+ table.concat( + fun.totable( + fun.map(function(c) + return c .. [[\x{00}]] + end, + fun.iter(pat)))) + end + local function msoffice_clsid_transform(pat) + local hex_table = {} + for i = 1, #pat, 2 do + local subc = pat:sub(i, i + 1) + hex_table[#hex_table + 1] = string.format('\\x{%s}', subc) + end + + return '^' .. table.concat(hex_table) .. '$' + end + -- Directory entries + msoffice_trie = compile_pats(msoffice_patterns, msoffice_patterns_indexes, + msoffice_pattern_transform) + -- Clsids + msoffice_trie_clsid = compile_pats(msoffice_clsids, msoffice_clsid_indexes, + msoffice_clsid_transform) + -- Misc zip patterns at the initial fragment + zip_trie = compile_pats(zip_patterns, zip_patterns_indexes, + function(pat) + return pat + end) + -- Text patterns at the initial fragment + txt_trie = compile_pats(txt_patterns, txt_patterns_indexes, + function(pat_tbl) + return pat_tbl[1] + end, + bit.bor(rspamd_trie.flags.re, + rspamd_trie.flags.dot_all, + rspamd_trie.flags.no_start)) + end +end + +-- Call immediately on require +compile_tries() + +local function detect_ole_format(input, log_obj, _, part) + local inplen = #input + if inplen < 0x31 + 4 then + lua_util.debugm(N, log_obj, "short length: %s", inplen) + return nil + end + + local bom, sec_size = rspamd_util.unpack('<I2<I2', input:span(29, 4)) + if bom == 0xFFFE then + bom = '<' + else + lua_util.debugm(N, log_obj, "bom file!: %s", bom) + bom = '>'; + sec_size = bit.rshift(bit.bswap(sec_size), 16) -- sec_size was read as a 16 bit value but bit.bswap swaps all 32 bits, so move the swapped bytes back down + end + + if sec_size < 7 or sec_size > 31 then + lua_util.debugm(N, log_obj, "bad sec_size: %s", sec_size) + return nil + end + + sec_size = 2 ^ sec_size + + -- SecID of first sector of the directory stream + local directory_offset = (rspamd_util.unpack(bom ..
'I4', input:span(0x31, 4))) + * sec_size + 512 + 1 + lua_util.debugm(N, log_obj, "directory: %s", directory_offset) + + if inplen < directory_offset then + lua_util.debugm(N, log_obj, "short length: %s", inplen) + return nil + end + + local function process_dir_entry(offset) + local dtype = input:byte(offset + 66) + lua_util.debugm(N, log_obj, "dtype: %s, offset: %s", dtype, offset) + + if dtype then + if dtype == 5 then + -- Extract clsid + local matches = msoffice_trie_clsid:match(input:span(offset + 80, 16)) + if matches then + for n, _ in pairs(matches) do + if msoffice_clsid_indexes[n] then + lua_util.debugm(N, log_obj, "found valid clsid for %s", + msoffice_clsid_indexes[n][1]) + return true, msoffice_clsid_indexes[n][1] + end + end + end + return true, nil + elseif dtype == 2 then + local matches = msoffice_trie:match(input:span(offset, 64)) + if matches then + for n, _ in pairs(matches) do + if msoffice_patterns_indexes[n] then + return true, msoffice_patterns_indexes[n][1] + end + end + end + return true, nil + elseif dtype >= 0 and dtype < 5 then + -- Bad type + return true, nil + end + end + + return false, nil + end + + repeat + local res, ext = process_dir_entry(directory_offset) + + if res and ext then + return ext, 60 + end + + if not res then + break + end + + directory_offset = directory_offset + 128 + until directory_offset >= inplen +end + +exports.ole_format_heuristic = detect_ole_format + +local function process_top_detected(res) + local extensions = lua_util.keys(res) + + if #extensions > 0 then + table.sort(extensions, function(ex1, ex2) + return res[ex1] > res[ex2] + end) + + return extensions[1], res[extensions[1]] + end + + return nil +end + +local function detect_archive_flaw(part, arch, log_obj, _) + local arch_type = arch:get_type() + local res = { + docx = 0, + xlsx = 0, + pptx = 0, + jar = 0, + odt = 0, + odp = 0, + ods = 0, + apk = 0, + } -- ext + confidence pairs + + -- General msoffice patterns + local function 
add_msoffice_confidence(incr) + res.docx = res.docx + incr + res.xlsx = res.xlsx + incr + res.pptx = res.pptx + incr + end + + if arch_type == 'zip' then + -- Find specific files/folders in zip file + local files = arch:get_files(100) or {} + for _, file in ipairs(files) do + if file == '[Content_Types].xml' then + add_msoffice_confidence(10) + elseif file:sub(1, 3) == 'xl/' then + res.xlsx = res.xlsx + 30 + elseif file:sub(1, 5) == 'word/' then + res.docx = res.docx + 30 + elseif file:sub(1, 4) == 'ppt/' then + res.pptx = res.pptx + 30 + elseif file == 'META-INF/MANIFEST.MF' then + res.jar = res.jar + 40 + elseif file == 'AndroidManifest.xml' then + res.apk = res.apk + 60 + end + end + + local ext, weight = process_top_detected(res) + + if weight >= 40 then + return ext, weight + end + + -- Apply misc Zip detection logic + local content = part:get_content() + + if #content > 128 then + local start_span = content:span(1, 128) + + local matches = zip_trie:match(start_span) + if matches then + for n, _ in pairs(matches) do + if zip_patterns_indexes[n] then + lua_util.debugm(N, log_obj, "found zip pattern for %s", + zip_patterns_indexes[n][1]) + return zip_patterns_indexes[n][1], 40 + end + end + end + end + end + + return arch_type:lower(), 40 +end + +local csv_grammar +-- Returns a grammar that will count commas +local function get_csv_grammar() + if not csv_grammar then + local lpeg = require 'lpeg' + + local field = '"' * lpeg.Cs(((lpeg.P(1) - '"') + lpeg.P '""' / '"') ^ 0) * '"' + + lpeg.C((1 - lpeg.S ',\n"') ^ 0) + + csv_grammar = lpeg.Cf(lpeg.Cc(0) * field * lpeg.P((lpeg.P(',') + + lpeg.P('\t')) * field) ^ 1 * (lpeg.S '\r\n' + -1), + function(acc) + return acc + 1 + end) + end + + return csv_grammar +end +local function validate_csv(part, content, log_obj) + local max_chunk = 32768 + local chunk = content:sub(1, max_chunk) + + local expected_commas + local matched_lines = 0 + local max_matched_lines = 10 + + lua_util.debugm(N, log_obj, "check for csv pattern") 
+ + for s in chunk:lines() do + local ncommas = get_csv_grammar():match(s) + + if not ncommas then + lua_util.debugm(N, log_obj, "not a csv line at line number %s", + matched_lines) + return false + end + + if expected_commas and ncommas ~= expected_commas then + -- Mismatched commas + lua_util.debugm(N, log_obj, "missmatched commas on line %s: %s != %s", + matched_lines, ncommas, expected_commas) + return false + elseif not expected_commas then + if ncommas == 0 then + lua_util.debugm(N, log_obj, "no commas in the first line") + return false + end + expected_commas = ncommas + end + + matched_lines = matched_lines + 1 + + if matched_lines > max_matched_lines then + break + end + end + + lua_util.debugm(N, log_obj, "csv content is sane: %s fields; %s lines checked", + expected_commas, matched_lines) + + return true +end + +exports.mime_part_heuristic = function(part, log_obj, _) + if part:is_archive() then + local arch = part:get_archive() + return detect_archive_flaw(part, arch, log_obj) + end + + return nil +end + +exports.text_part_heuristic = function(part, log_obj, _) + -- We get some span of data and check it + local function is_span_text(span) + -- We examine 8 bit content, and we assume it might be localized text + -- if it has more than 3 subsequent 8 bit characters + local function rough_8bit_check(bytes, idx, remain, len) + local b = bytes[idx] + local n8bit = 0 + + while b >= 127 and idx < len do + -- utf8 part + if bit.band(b, 0xe0) == 0xc0 and remain > 1 and + bit.band(bytes[idx + 1], 0xc0) == 0x80 then + return true, 1 + elseif bit.band(b, 0xf0) == 0xe0 and remain > 2 and + bit.band(bytes[idx + 1], 0xc0) == 0x80 and + bit.band(bytes[idx + 2], 0xc0) == 0x80 then + return true, 2 + elseif bit.band(b, 0xf8) == 0xf0 and remain > 3 and + bit.band(bytes[idx + 1], 0xc0) == 0x80 and + bit.band(bytes[idx + 2], 0xc0) == 0x80 and + bit.band(bytes[idx + 3], 0xc0) == 0x80 then + return true, 3 + end + + n8bit = n8bit + 1 + idx = idx + 1 + b = bytes[idx] + remain 
= remain - 1 + end + + if n8bit >= 3 then + return true, n8bit + end + + return false, 0 + end + + -- Convert to string as LuaJIT can optimise string.sub (and fun.iter) but not C calls + local tlen = #span + local non_printable = 0 + local bytes = span:bytes() + local i = 1 + repeat + local b = bytes[i] + + if (b < 0x20) and not (b == 0x0d or b == 0x0a or b == 0x09) then + non_printable = non_printable + 1 + elseif b >= 127 then + local c, nskip = rough_8bit_check(bytes, i, tlen - i, tlen) + + if not c then + non_printable = non_printable + 1 + else + i = i + nskip + end + end + i = i + 1 + until i > tlen + + lua_util.debugm(N, log_obj, "text part check: %s printable, %s non-printable, %s total", + tlen - non_printable, non_printable, tlen) + if non_printable / tlen > 0.0078125 then + return false + end + + return true + end + + local parent = part:get_parent() + + if parent then + local parent_type, parent_subtype = parent:get_type() + + if parent_type == 'multipart' and parent_subtype == 'encrypted' then + -- Skip text heuristics for encrypted parts + lua_util.debugm(N, log_obj, "text part check: parent is encrypted, not a text part") + + return false + end + end + + local content = part:get_content() + local mtype, msubtype = part:get_type() + local clen = #content + local is_text + + if clen > 0 then + if clen > 80 * 3 then + -- Use chunks + is_text = is_span_text(content:span(1, 160)) and is_span_text(content:span(clen - 80, 80)) + else + is_text = is_span_text(content) + end + + if is_text and mtype ~= 'message' then + -- Try patterns + local span_len = math.min(4096, clen) + local start_span = content:span(1, span_len) + local matches = txt_trie:match(start_span) + local res = {} + local fname = part:get_filename() + + if matches then + -- Require at least 2 occurrences of those patterns + for n, positions in pairs(matches) do + local ext, weight = txt_patterns_indexes[n][1], txt_patterns_indexes[n][2][2] + if ext then + res[ext] = (res[ext] or 0) + weight * 
#positions + lua_util.debugm(N, log_obj, "found txt pattern for %s: %s, total: %s; %s/%s announced", + ext, weight * #positions, res[ext], mtype, msubtype) + end + end + + if res.html and res.html >= 40 then + -- HTML has priority over something like js... + return 'html', res.html + end + + local ext, weight = process_top_detected(res) + + if weight then + if weight >= 40 then + -- Extra validation for csv extension + if ext ~= 'csv' or validate_csv(part, content, log_obj) then + return ext, weight + end + elseif fname and weight >= 20 then + return ext, weight + end + end + end + + -- Content type stuff + if (mtype == 'text' or mtype == 'application') and + (msubtype == 'html' or msubtype == 'xhtml+xml') then + return 'html', 21 + end + + if msubtype:lower() == 'csv' then + if validate_csv(part, content, log_obj) then + return 'csv', 40 + end + end + + -- Extension stuff + local function has_extension(file, ext) + local ext_len = ext:len() + return file:len() > ext_len + 1 + and file:sub(-ext_len):lower() == ext + and file:sub(-ext_len - 1, -ext_len - 1) == '.' 
+ end + + if fname and (has_extension(fname, 'htm') or has_extension(fname, 'html')) then + return 'html', 21 + end + + if mtype ~= 'text' then + -- Do not treat non text patterns as text + return nil + end + + return 'txt', 40 + end + end +end + +exports.pdf_format_heuristic = function(input, log_obj, pos, part) + local weight = 10 + local ext = string.match(part:get_filename() or '', '%.([^.]+)$') + -- If we found a pattern at the beginning + if pos <= 10 then + weight = weight + 30 + end + -- If the announced extension is `pdf` + if ext and ext:lower() == 'pdf' then + weight = weight + 30 + end + + return 'pdf', weight +end + +exports.pe_part_heuristic = function(input, log_obj, pos, part) + if not input then + return + end + + -- The PE header offset (e_lfanew) is a 4 byte field at 0-based offset 0x3c of the msdos header, i.e. bytes 61..64 here (sub is 1-based and inclusive) + local pe_ptr_bin = input:sub(61, 64) + if #pe_ptr_bin ~= 4 then + return + end + + -- it is an LE 32 bit integer + local pe_ptr = rspamd_util.unpack("<I4", pe_ptr_bin) + -- pos is the end offset of the matched PE magic (4 bytes long), so the magic starts at pos - 4; if that is where the msdos header points, it is definitely a PE file + if pe_ptr + 4 ~= pos then + return + end + + return 'exe', 30 +end + +return exports diff --git a/lualib/lua_magic/init.lua b/lualib/lua_magic/init.lua new file mode 100644 index 0000000..38bfddb --- /dev/null +++ b/lualib/lua_magic/init.lua @@ -0,0 +1,388 @@ +--[[ +Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com> + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+]]-- + +--[[[ +-- @module lua_magic +-- This module contains file types detection logic +--]] + +local patterns = require "lua_magic/patterns" +local types = require "lua_magic/types" +local heuristics = require "lua_magic/heuristics" +local fun = require "fun" +local lua_util = require "lua_util" + +local rspamd_text = require "rspamd_text" +local rspamd_trie = require "rspamd_trie" + +local N = "lua_magic" +local exports = {} +-- trie objects +local compiled_patterns +local compiled_short_patterns +local compiled_tail_patterns +-- {<str>, <match_object>, <pattern_object>} indexed by pattern number +local processed_patterns = {} +local short_patterns = {} +local tail_patterns = {} + +local short_match_limit = 128 +local max_short_offset = -1 +local min_tail_offset = math.huge + +local function process_patterns(log_obj) + -- Add pattern to either short patterns or to normal patterns + local function add_processed(str, match, pattern) + if match.position and type(match.position) == 'number' then + if match.tail then + -- Tail pattern + tail_patterns[#tail_patterns + 1] = { + str, match, pattern + } + if min_tail_offset > match.tail then + min_tail_offset = match.tail + end + + lua_util.debugm(N, log_obj, 'add tail pattern %s for ext %s', + str, pattern.ext) + elseif match.position < short_match_limit then + short_patterns[#short_patterns + 1] = { + str, match, pattern + } + if str:sub(1, 1) == '^' then + lua_util.debugm(N, log_obj, 'add head pattern %s for ext %s', + str, pattern.ext) + else + lua_util.debugm(N, log_obj, 'add short pattern %s for ext %s', + str, pattern.ext) + end + + if max_short_offset < match.position then + max_short_offset = match.position + end + else + processed_patterns[#processed_patterns + 1] = { + str, match, pattern + } + + lua_util.debugm(N, log_obj, 'add long pattern %s for ext %s', + str, pattern.ext) + end + else + processed_patterns[#processed_patterns + 1] = { + str, match, pattern + } + + lua_util.debugm(N, log_obj, 'add long 
pattern %s for ext %s', + str, pattern.ext) + end + end + + if not compiled_patterns then + for ext, pattern in pairs(patterns) do + assert(types[ext], 'not found type: ' .. ext) + pattern.ext = ext + for _, match in ipairs(pattern.matches) do + if match.string then + if match.relative_position and not match.position then + match.position = match.relative_position + #match.string + + if match.relative_position == 0 then + if match.string:sub(1, 1) ~= '^' then + match.string = '^' .. match.string + end + end + end + add_processed(match.string, match, pattern) + elseif match.hex then + local hex_table = {} + + for i = 1, #match.hex, 2 do + local subc = match.hex:sub(i, i + 1) + hex_table[#hex_table + 1] = string.format('\\x{%s}', subc) + end + + if match.relative_position and not match.position then + match.position = match.relative_position + #match.hex / 2 + end + if match.relative_position == 0 then + table.insert(hex_table, 1, '^') + end + add_processed(table.concat(hex_table), match, pattern) + end + end + end + local bit = require "bit" + local compile_flags = bit.bor(rspamd_trie.flags.re, rspamd_trie.flags.dot_all) + compile_flags = bit.bor(compile_flags, rspamd_trie.flags.single_match) + compile_flags = bit.bor(compile_flags, rspamd_trie.flags.no_start) + compiled_patterns = rspamd_trie.create(fun.totable( + fun.map(function(t) + return t[1] + end, processed_patterns)), + compile_flags + ) + compiled_short_patterns = rspamd_trie.create(fun.totable( + fun.map(function(t) + return t[1] + end, short_patterns)), + compile_flags + ) + compiled_tail_patterns = rspamd_trie.create(fun.totable( + fun.map(function(t) + return t[1] + end, tail_patterns)), + compile_flags + ) + + lua_util.debugm(N, log_obj, + 'compiled %s (%s short; %s long; %s tail) patterns', + #processed_patterns + #short_patterns + #tail_patterns, + #short_patterns, #processed_patterns, #tail_patterns) + end +end + +process_patterns(rspamd_config) + +local function match_chunk(chunk, input, tlen, 
offset, trie, processed_tbl, log_obj, res, part) + local matches = trie:match(chunk) + + local last = tlen + + local function add_result(weight, ext) + if not res[ext] then + res[ext] = 0 + end + if weight then + res[ext] = res[ext] + weight + else + res[ext] = res[ext] + 1 + end + + lua_util.debugm(N, log_obj, 'add pattern for %s, weight %s, total weight %s', + ext, weight, res[ext]) + end + + local function match_position(pos, expected) + local cmp = function(a, b) + return a == b + end + if type(expected) == 'table' then + -- Something like {'>', 0} + if expected[1] == '>' then + cmp = function(a, b) + return a > b + end + elseif expected[1] == '>=' then + cmp = function(a, b) + return a >= b + end + elseif expected[1] == '<' then + cmp = function(a, b) + return a < b + end + elseif expected[1] == '<=' then + cmp = function(a, b) + return a <= b + end + elseif expected[1] == '!=' then + cmp = function(a, b) + return a ~= b + end + end + expected = expected[2] + end + + -- Tail match + if expected < 0 then + expected = last + expected + 1 + end + return cmp(pos, expected) + end + + for npat, matched_positions in pairs(matches) do + local pat_data = processed_tbl[npat] + local pattern = pat_data[3] + local match = pat_data[2] + + -- Single position + if match.position then + local position = match.position + + for _, pos in ipairs(matched_positions) do + lua_util.debugm(N, log_obj, 'found match %s at offset %s(from %s)', + pattern.ext, pos, offset) + if match_position(pos + offset, position) then + if match.heuristic then + local ext, weight = match.heuristic(input, log_obj, pos + offset, part) + + if ext then + add_result(weight, ext) + break + end + else + add_result(match.weight, pattern.ext) + break + end + end + end + elseif match.positions then + -- Match all positions + local all_right = true + local matched_pos = 0 + for _, position in ipairs(match.positions) do + local matched = false + for _, pos in ipairs(matched_positions) do + lua_util.debugm(N, 
log_obj, 'found match %s at offset %s(from %s)', + pattern.ext, pos, offset) + if match_position(pos + offset, position) then + matched = true + matched_pos = pos + break + end + end + if not matched then + all_right = false + break + end + end + + if all_right then + if match.heuristic then + local ext, weight = match.heuristic(input, log_obj, matched_pos + offset, part) + + if ext then + add_result(weight, ext) + -- no break: we are not in a per-position loop here, it would stop the scan of all other patterns + end + else + add_result(match.weight, pattern.ext) + -- no break: same reason as in the heuristic case + end + end + end + end + +end + +local function process_detected(res) + local extensions = lua_util.keys(res) + + if #extensions > 0 then + table.sort(extensions, function(ex1, ex2) + return res[ex1] > res[ex2] + end) + + return extensions, res[extensions[1]] + end + + return nil +end + +exports.detect = function(part, log_obj) + if not log_obj then + log_obj = rspamd_config + end + local input = part:get_content() + + local res = {} + + if type(input) == 'string' then + -- Convert to rspamd_text + input = rspamd_text.fromstring(input) + end + + if type(input) == 'userdata' then + local inplen = #input + + -- Check tail matches + if inplen > min_tail_offset then + local tail = input:span(inplen - min_tail_offset, min_tail_offset) + match_chunk(tail, input, inplen, inplen - min_tail_offset, + compiled_tail_patterns, tail_patterns, log_obj, res, part) + end + + -- Try short match + local head = input:span(1, math.min(max_short_offset, inplen)) + match_chunk(head, input, inplen, 0, + compiled_short_patterns, short_patterns, log_obj, res, part) + + -- Check if we have enough data or go to long patterns + local extensions, confidence = process_detected(res) + + if extensions and #extensions > 0 and confidence > 30 then + -- We are done on short patterns + return extensions[1], types[extensions[1]] + end + + -- No way, let's check data in chunks or just the whole input if it is small enough + if #input > exports.chunk_size * 3 then + -- Chunked version as input is too long + local chunk1,
chunk2 = input:span(1, exports.chunk_size * 2), + input:span(inplen - exports.chunk_size, exports.chunk_size) + local offset1, offset2 = 0, inplen - exports.chunk_size + + match_chunk(chunk1, input, inplen, + offset1, compiled_patterns, processed_patterns, log_obj, res, part) + match_chunk(chunk2, input, inplen, + offset2, compiled_patterns, processed_patterns, log_obj, res, part) + else + -- Input is short enough to match it at all + match_chunk(input, input, inplen, 0, + compiled_patterns, processed_patterns, log_obj, res, part) + end + else + -- Table input is NYI: fail loudly for tables (0 is truthy in Lua, so asserting on 0 can never fail); other unsupported types fall through and yield nil + assert(type(input) ~= 'table', 'table input for match') + end + + local extensions = process_detected(res) + + if extensions and #extensions > 0 then + return extensions[1], types[extensions[1]] + end + + -- Nothing found + return nil +end + +exports.detect_mime_part = function(part, log_obj) + local ext, weight = heuristics.mime_part_heuristic(part, log_obj) + + if ext and weight and weight > 20 then + return ext, types[ext] + end + + ext = exports.detect(part, log_obj) + + if ext then + return ext, types[ext] + end + + -- Text/html and other parts + ext, weight = heuristics.text_part_heuristic(part, log_obj) + if ext and weight and weight > 20 then + return ext, types[ext] + end +end + +-- This parameter specifies how many bytes are checked in the input +-- Rspamd checks 2 chunks at start and 1 chunk at the end +exports.chunk_size = 32768 + +exports.types = types + +return exports
\ No newline at end of file diff --git a/lualib/lua_magic/patterns.lua b/lualib/lua_magic/patterns.lua new file mode 100644 index 0000000..971ddd9 --- /dev/null +++ b/lualib/lua_magic/patterns.lua @@ -0,0 +1,471 @@ +--[[ +Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com> + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +]]-- + +--[[[ +-- @module lua_magic/patterns +-- This module contains most common patterns +--]] + +local heuristics = require "lua_magic/heuristics" + +local patterns = { + pdf = { + -- These are alternatives + matches = { + { + string = [[%PDF-[12]\.\d]], + position = { '<=', 1024 }, + weight = 60, + heuristic = heuristics.pdf_format_heuristic + }, + { + string = [[%FDF-[12]\.\d]], + position = { '<=', 1024 }, + weight = 60, + heuristic = heuristics.pdf_format_heuristic + }, + }, + }, + ps = { + matches = { + { + string = [[%!PS-Adobe]], + relative_position = 0, + weight = 60, + }, + }, + }, + -- RTF document + rtf = { + matches = { + { + string = [[^{\\rt]], + position = 4, + weight = 60, + } + } + }, + chm = { + matches = { + { + string = [[ITSF]], + relative_position = 0, + weight = 60, + } + } + }, + djvu = { + matches = { + { + string = [[AT&TFORM]], + relative_position = 0, + weight = 60, + }, + { + string = [[DJVM]], + relative_position = 0x0c, + weight = 60, + } + } + }, + -- MS Office format, needs heuristic + ole = { + matches = { + { + hex = [[d0cf11e0a1b11ae1]], + relative_position = 0, + weight = 60, + heuristic = heuristics.ole_format_heuristic + } 
+ } + }, + -- MS Exe file + exe = { + matches = { + { + string = [[MZ]], + relative_position = 0, + weight = 15, + }, + -- PE part + { + string = [[PE\x{00}\x{00}]], + position = { '>=', 0x3c + 4 }, + weight = 15, + heuristic = heuristics.pe_part_heuristic, + } + } + }, + elf = { + matches = { + { + hex = [[7f454c46]], + relative_position = 0, + weight = 60, + }, + } + }, + lnk = { + matches = { + { + hex = [[4C0000000114020000000000C000000000000046]], + relative_position = 0, + weight = 60, + }, + } + }, + bat = { + matches = { + { + string = [[(?i)@\s*ECHO\s+OFF]], + position = { '>=', 0 }, + weight = 60, + }, + } + }, + class = { + -- Technically, this also matches MachO files, but I don't care about + -- Apple and their mental health problems here: just consider Java files, + -- Mach object files and all other cafe babes as bad and block them! + matches = { + { + hex = [[cafebabe]], + relative_position = 0, + weight = 60, + }, + } + }, + ics = { + matches = { + { + string = [[BEGIN:VCALENDAR]], + weight = 60, + relative_position = 0, + } + } + }, + vcf = { + matches = { + { + string = [[BEGIN:VCARD]], + weight = 60, + relative_position = 0, + } + } + }, + -- Archives + arj = { + matches = { + { + hex = '60EA', + relative_position = 0, + weight = 60, + }, + } + }, + ace = { + matches = { + { + string = [[\*\*ACE\*\*]], + position = 14, + weight = 60, + }, + } + }, + cab = { + matches = { + { + hex = [[4d53434600000000]], -- Can be anywhere for SFX :( + position = { '>=', 8 }, + weight = 60, + }, + } + }, + tar = { + matches = { + { + string = [[ustar]], + relative_position = 257, + weight = 60, + }, + } + }, + bz2 = { + matches = { + { + string = "^BZ[h0]", + position = 3, + weight = 60, + }, + } + }, + lz4 = { + matches = { + { + hex = "04224d18", + relative_position = 0, + weight = 60, + }, + { + hex = "03214c18", + relative_position = 0, + weight = 60, + }, + { + hex = "02214c18", + relative_position = 0, + weight = 60, + }, + { + -- MozLZ4 + hex = 
'6d6f7a4c7a343000', + relative_position = 0, + weight = 60, + } + } + }, + zst = { + matches = { + { + string = [[^[\x{22}-\x{40}]\x{B5}\x{2F}\x{FD}]], + position = 4, + weight = 60, + }, + } + }, + zoo = { + matches = { + { + hex = [[dca7c4fd]], + relative_position = 20, + weight = 60, + }, + } + }, + xar = { + matches = { + { + string = [[xar!]], + relative_position = 0, + weight = 60, + }, + } + }, + iso = { + matches = { + { + string = [[\x{01}CD001\x{01}]], + position = { '>=', 0x8000 + 7 }, -- first 32k is unused + weight = 60, + }, + } + }, + egg = { + -- ALZip egg + matches = { + { + string = [[EGGA]], + weight = 60, + relative_position = 0, + }, + } + }, + alz = { + -- ALZip alz + matches = { + { + string = [[ALZ\x{01}]], + weight = 60, + relative_position = 0, + }, + } + }, + -- Apple is a 'special' child: this needs to be matched at the data tail... + dmg = { + matches = { + { + string = [[koly\x{00}\x{00}\x{00}\x{04}]], + position = -512 + 8, + weight = 61, + tail = 512, + }, + } + }, + szdd = { + matches = { + { + hex = [[535a4444]], + relative_position = 0, + weight = 60, + }, + } + }, + xz = { + matches = { + { + hex = [[FD377A585A00]], + relative_position = 0, + weight = 60, + }, + } + }, + -- Images + psd = { + matches = { + { + string = [[8BPS]], + relative_position = 0, + weight = 60, + }, + } + }, + ico = { + matches = { + { + hex = [[00000100]], + relative_position = 0, + weight = 60, + }, + } + }, + pcx = { + matches = { + { + hex = [[0A050108]], + relative_position = 0, + weight = 60, + }, + } + }, + pic = { + matches = { + { + hex = [[FF80C9C71A00]], + relative_position = 0, + weight = 60, + }, + } + }, + swf = { + matches = { + { + hex = [[5a5753]], -- LZMA + relative_position = 0, + weight = 60, + }, + { + hex = [[435753]], -- Zlib + relative_position = 0, + weight = 60, + }, + { + hex = [[465753]], -- Uncompressed + relative_position = 0, + weight = 60, + }, + } + }, + tiff = { + matches = { + { + hex = [[49492a00]], -- LE encoded + 
relative_position = 0, + weight = 60, + }, + { + hex = [[4d4d]], -- BE tiff + relative_position = 0, + weight = 60, + }, + } + }, + -- Other + pgp = { + matches = { + { + hex = [[A803504750]], + relative_position = 0, + weight = 60, + }, + { + hex = [[2D424547494E20504750204D4553534147452D]], + relative_position = 0, + weight = 60, + }, + } + }, + uue = { + matches = { + { + hex = [[626567696e20]], + relative_position = 0, + weight = 60, + }, + } + }, + dwg = { + matches = { + { + string = '^AC10[12][2-9]', + position = 6, + weight = 60, + } + } + }, + jpg = { + matches = { + { -- JPEG2000 + hex = [[0000000c6a5020200d0a870a]], + relative_position = 0, + weight = 60, + }, + { + string = [[^\x{ff}\x{d8}\x{ff}]], + weight = 60, + position = 3, + }, + }, + }, + png = { + matches = { + { + string = [[^\x{89}PNG\x{0d}\x{0a}\x{1a}\x{0a}]], + position = 8, + weight = 60, + }, + } + }, + gif = { + matches = { + { + string = [[^GIF8\d]], + position = 5, + weight = 60, + }, + } + }, + bmp = { + matches = { + { + string = [[^BM...\x{00}\x{00}\x{00}\x{00}]], + position = 9, + weight = 60, + }, + } + }, +} + +return patterns diff --git a/lualib/lua_magic/types.lua b/lualib/lua_magic/types.lua new file mode 100644 index 0000000..3dce2e1 --- /dev/null +++ b/lualib/lua_magic/types.lua @@ -0,0 +1,327 @@ +--[[ +Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com> + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
]]--

--[[[
-- @module lua_magic/types
-- This module contains the definitions of the file types known to lua_magic
--]]

-- Keys are the conventional (msdos-style) extensions. Every entry carries a
-- `ct` (content type string) and a `type` (coarse class of the file); some
-- entries additionally set the `av_check` and/or `no_text` flags.

local types = {
  -- Executables and closely related packages
  exe = { ct = 'application/x-ms-application', type = 'executable' },
  elf = { ct = 'application/x-elf-executable', type = 'executable' },
  lnk = { ct = 'application/x-ms-application', type = 'executable' },
  class = { ct = 'application/x-java-applet', type = 'executable' },
  jar = { ct = 'application/java-archive', type = 'archive' },
  apk = { ct = 'application/vnd.android.package-archive', type = 'archive' },
  bat = { ct = 'application/x-bat', type = 'executable' },

  -- Document formats (all classified as 'binary')
  rtf = { ct = 'application/rtf', type = 'binary' },
  pdf = { ct = 'application/pdf', type = 'binary' },
  ps = { ct = 'application/postscript', type = 'binary' },
  chm = { ct = 'application/x-chm', type = 'binary' },
  djvu = { ct = 'application/x-djvu', type = 'binary' },

  -- Archives
  arj = { ct = 'application/x-arj', type = 'archive' },
  cab = { ct = 'application/x-cab', type = 'archive' },
  ace = { ct = 'application/x-ace', type = 'archive' },
  tar = { ct = 'application/x-tar', type = 'archive' },
  bz2 = { ct = 'application/x-bzip', type = 'archive' },
  xz = { ct = 'application/x-xz', type = 'archive' },
  lz4 = { ct = 'application/x-lz4', type = 'archive' },
  zst = { ct = 'application/x-zstandard', type = 'archive' },
  dmg = { ct = 'application/x-dmg', type = 'archive' },
  iso = { ct = 'application/x-iso', type = 'archive' },
  zoo = { ct = 'application/x-zoo', type = 'archive' },
  egg = { ct = 'application/x-egg', type = 'archive' },
  alz = { ct = 'application/x-alz', type = 'archive' },
  xar = { ct = 'application/x-xar', type = 'archive' },
  epub = { ct = 'application/x-epub', type = 'archive' },
  -- The real MSDOS extension of these files looks like FOO.TX_ or FOO.TX$
  szdd = { ct = 'application/x-compressed', type = 'archive' },

  -- Images
  psd = { ct = 'image/psd', type = 'image', av_check = false },
  pcx = { ct = 'image/pcx', type = 'image', av_check = false },
  pic = { ct = 'image/pic', type = 'image', av_check = false },
  tiff = { ct = 'image/tiff', type = 'image', av_check = false },
  ico = { ct = 'image/ico', type = 'image', av_check = false },
  -- Note: no av_check flag is set for swf, unlike the entries above
  swf = { ct = 'application/x-shockwave-flash', type = 'image' },

  -- OLE files
  ole = { ct = 'application/octet-stream', type = 'office' },
  doc = { ct = 'application/msword', type = 'office' },
  xls = { ct = 'application/vnd.ms-excel', type = 'office' },
  ppt = { ct = 'application/vnd.ms-powerpoint', type = 'office' },
  vsd = { ct = 'application/vnd.visio', type = 'office' },
  msi = { ct = 'application/x-msi', type = 'executable' },
  msg = { ct = 'application/vnd.ms-outlook', type = 'office' },

  -- Newer office formats (2007+)
  docx = {
    ct = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    type = 'office',
  },
  xlsx = {
    ct = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    type = 'office',
  },
  pptx = {
    ct = 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    type = 'office',
  },

  -- OpenOffice formats
  odt = { ct = 'application/vnd.oasis.opendocument.text', type = 'office' },
  ods = { ct = 'application/vnd.oasis.opendocument.spreadsheet', type = 'office' },
  odp = { ct = 'application/vnd.oasis.opendocument.presentation', type = 'office' },

  -- Associated Signature Containers, see
  -- https://en.wikipedia.org/wiki/Associated_Signature_Containers
  asice = { ct = 'application/vnd.etsi.asic-e+zip', type = 'office' },
  asics = { ct = 'application/vnd.etsi.asic-s+zip', type = 'office' },

  -- Other
  pgp = { ct = 'application/encrypted', type = 'encrypted' },
  uue = { ct = 'application/x-uuencoded', type = 'binary' },

  -- Types that are detected by Rspamd itself
  -- Archives
  zip = { ct = 'application/zip', type = 'archive' },
  rar = { ct = 'application/x-rar', type = 'archive' },
  ['7z'] = { ct = 'application/x-7z-compressed', type = 'archive' },
  gz = { ct = 'application/gzip', type = 'archive' },

  -- Images
  png = { ct = 'image/png', type = 'image', av_check = false },
  gif = { ct = 'image/gif', type = 'image', av_check = false },
  jpg = { ct = 'image/jpeg', type = 'image', av_check = false },
  bmp = { ct = 'image/bmp', type = 'image', av_check = false },
  dwg = { ct = 'image/vnd.dwg', type = 'image' },

  -- Text
  xml = { ct = 'application/xml', type = 'text', no_text = true },
  txt = { ct = 'text/plain', type = 'text', av_check = false },
  html = { ct = 'text/html', type = 'text', av_check = false },
  csv = { ct = 'text/csv', type = 'text', av_check = false, no_text = true },
  ics = { ct = 'text/calendar', type = 'text', av_check = false, no_text = true },
  vcf = { ct = 'text/vcard', type = 'text', av_check = false, no_text = true },
  eml = { ct = 'message/rfc822', type = 'message', av_check = false },
}

return types
\ No newline at end of file |