diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
commit | 133a45c109da5310add55824db21af5239951f93 (patch) | |
tree | ba6ac4c0a950a0dda56451944315d66409923918 /lualib/lua_content | |
parent | Initial commit. (diff) | |
download | rspamd-133a45c109da5310add55824db21af5239951f93.tar.xz rspamd-133a45c109da5310add55824db21af5239951f93.zip |
Adding upstream version 3.8.1.upstream/3.8.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | lualib/lua_content/ical.lua | 105 | ||||
-rw-r--r-- | lualib/lua_content/init.lua | 109 | ||||
-rw-r--r-- | lualib/lua_content/pdf.lua | 1424 | ||||
-rw-r--r-- | lualib/lua_content/vcard.lua | 84 |
4 files changed, 1722 insertions, 0 deletions
diff --git a/lualib/lua_content/ical.lua b/lualib/lua_content/ical.lua new file mode 100644 index 0000000..d018a85 --- /dev/null +++ b/lualib/lua_content/ical.lua @@ -0,0 +1,105 @@ +--[[ +Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com> + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +]]-- + +local l = require 'lpeg' +local lua_util = require "lua_util" +local N = "lua_content" + +local ical_grammar + +local function gen_grammar() + if not ical_grammar then + local wsp = l.S(" \t\v\f") + local crlf = (l.P "\r" ^ -1 * l.P "\n") + l.P "\r" + local eol = (crlf * #crlf) + (crlf - (crlf ^ -1 * wsp)) + local name = l.C((l.P(1) - (l.P ":")) ^ 1) / function(v) + return (v:gsub("[\n\r]+%s", "")) + end + local value = l.C((l.P(1) - eol) ^ 0) / function(v) + return (v:gsub("[\n\r]+%s", "")) + end + ical_grammar = name * ":" * wsp ^ 0 * value * eol ^ -1 + end + + return ical_grammar +end + +local exports = {} + +local function extract_text_data(specific) + local fun = require "fun" + + local tbl = fun.totable(fun.map(function(e) + return e[2]:lower() + end, specific.elts)) + return table.concat(tbl, '\n') +end + + +-- Keys that can have visible urls +local url_keys = lua_util.list_to_hash { + 'description', + 'location', + 'summary', + 'organizer', + 'organiser', + 'attendee', + 'url' +} + +local function process_ical(input, mpart, task) + local control = { n = '\n', r = '' } + local rspamd_url = require "rspamd_url" + local escaper = l.Ct((gen_grammar() / function(key, value) + value = value:gsub("\\(.)", control) + key = key:lower():match('^([^;]+)') + + if key and url_keys[key] then + local local_urls = rspamd_url.all(task:get_mempool(), value) + + if local_urls and #local_urls > 0 then + for _, u in ipairs(local_urls) do + lua_util.debugm(N, task, 'ical: found URL in ical key "%s": %s', + key, tostring(u)) + task:inject_url(u, mpart) + end + end + end + lua_util.debugm(N, task, 'ical: ical key %s = "%s"', + key, value) + return { key, value } + end) ^ 1) + + local elts = escaper:match(input) + + if not elts then + return nil + end + + return { + tag = 'ical', + extract_text = extract_text_data, + elts = elts + } +end + +--[[[ +-- @function lua_ical.process(input) +-- Returns all values from ical as a plain text. Names are completely ignored. +--]] +exports.process = process_ical + +return exports
\ No newline at end of file diff --git a/lualib/lua_content/init.lua b/lualib/lua_content/init.lua new file mode 100644 index 0000000..701d223 --- /dev/null +++ b/lualib/lua_content/init.lua @@ -0,0 +1,109 @@ +--[[ +Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com> + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +]]-- + +--[[[ +-- @module lua_content +-- This module contains content processing logic +--]] + + +local exports = {} +local N = "lua_content" +local lua_util = require "lua_util" + +local content_modules = { + ical = { + mime_type = { "text/calendar", "application/calendar" }, + module = require "lua_content/ical", + extensions = { 'ics' }, + output = "text" + }, + vcf = { + mime_type = { "text/vcard", "application/vcard" }, + module = require "lua_content/vcard", + extensions = { 'vcf' }, + output = "text" + }, + pdf = { + mime_type = "application/pdf", + module = require "lua_content/pdf", + extensions = { 'pdf' }, + output = "table" + }, +} + +local modules_by_mime_type +local modules_by_extension + +local function init() + modules_by_mime_type = {} + modules_by_extension = {} + for k, v in pairs(content_modules) do + if v.mime_type then + if type(v.mime_type) == 'table' then + for _, mt in ipairs(v.mime_type) do + modules_by_mime_type[mt] = { k, v } + end + else + modules_by_mime_type[v.mime_type] = { k, v } + end + + end + if v.extensions then + for _, ext in ipairs(v.extensions) do + modules_by_extension[ext] = { k, v } + end + end + end +end + +exports.maybe_process_mime_part = function(part, task) + if not modules_by_mime_type then + init() + end + + local ctype, csubtype = part:get_type() + local mt = string.format("%s/%s", ctype or 'application', + csubtype or 'octet-stream') + local pair = modules_by_mime_type[mt] + + if not pair then + local ext = part:get_detected_ext() + + if ext then + pair = modules_by_extension[ext] + end + end + + if pair then + lua_util.debugm(N, task, "found known content of type %s: %s", + mt, pair[1]) + + local data = pair[2].module.process(part:get_content(), part, task) + + if data then + lua_util.debugm(N, task, "extracted content from %s: %s type", + pair[1], type(data)) + part:set_specific(data) + else + lua_util.debugm(N, task, "failed to extract anything from %s", + pair[1]) + end + end + +end + +return exports
\ No newline at end of file diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua new file mode 100644 index 0000000..f6d5c0b --- /dev/null +++ b/lualib/lua_content/pdf.lua @@ -0,0 +1,1424 @@ +--[[ +Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com> + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +]]-- + +--[[[ +-- @module lua_content/pdf +-- This module contains some heuristics for PDF files +--]] + +local rspamd_trie = require "rspamd_trie" +local rspamd_util = require "rspamd_util" +local rspamd_text = require "rspamd_text" +local rspamd_url = require "rspamd_url" +local bit = require "bit" +local N = "lua_content" +local lua_util = require "lua_util" +local rspamd_regexp = require "rspamd_regexp" +local lpeg = require "lpeg" +local pdf_patterns = { + trailer = { + patterns = { + [[\ntrailer\r?\n]] + } + }, + suspicious = { + patterns = { + [[netsh\s]], + [[echo\s]], + [=[\/[A-Za-z]*#\d\d[#A-Za-z<>/\s]]=], -- Hex encode obfuscation + } + }, + start_object = { + patterns = { + [=[[\r\n\0]\s*\d+\s+\d+\s+obj[\s<]]=] + } + }, + end_object = { + patterns = { + [=[endobj[\r\n]]=] + } + }, + start_stream = { + patterns = { + [=[>\s*stream[\r\n]]=], + } + }, + end_stream = { + patterns = { + [=[endstream[\r\n]]=] + } + } +} + +local pdf_text_patterns = { + start = { + patterns = { + [[\sBT\s]] + } + }, + stop = { + patterns = { + [[\sET\b]] + } + } +} + +local pdf_cmap_patterns = { + start = { + patterns = { + [[\d\s+beginbfchar\s]], + [[\d\s+beginbfrange\s]] + } + }, + stop = { + patterns = { + [[\sendbfrange\b]], + [[\sendbchar\b]] + } + } +} + +-- index[n] -> +-- t[1] - pattern, +-- t[2] - key in patterns table, +-- t[3] - value in patterns table +-- t[4] - local pattern index +local pdf_indexes = {} +local pdf_text_indexes = {} +local pdf_cmap_indexes = {} + +local pdf_trie +local pdf_text_trie +local pdf_cmap_trie + +local exports = {} + +local config = { + max_extraction_size = 512 * 1024, + max_processing_size = 32 * 1024, + text_extraction = false, -- NYI feature + url_extraction = true, + enabled = true, + js_fuzzy = true, -- Generate fuzzy hashes from PDF javascripts + min_js_fuzzy = 256, -- Minimum size of js to be considered as a fuzzy + openaction_fuzzy_only = false, -- Generate fuzzy from all scripts + max_pdf_objects = 10000, -- Maximum number of objects to be considered + max_pdf_trailer = 10 * 1024 * 1024, -- Maximum trailer size (to avoid abuse) + max_pdf_trailer_lines = 100, -- Maximum number of lines in pdf trailer + pdf_process_timeout = 1.0, -- Timeout in seconds for processing +} + +-- Used to process patterns found in PDF +-- positions for functional processors should be a iter/table from trie matcher in form +---- [{n1, pat_idx1}, ... {nn, pat_idxn}] where +---- pat_idxn is pattern index and n1 ... nn are match positions +local processors = {} +-- PDF objects outer grammar in LPEG style (performing table captures) +local pdf_outer_grammar +local pdf_text_grammar + +-- Used to match objects +local object_re = rspamd_regexp.create_cached([=[/(\d+)\s+(\d+)\s+obj\s*/]=]) + +local function config_module() + local opts = rspamd_config:get_all_opt('lua_content') + + if opts and opts.pdf then + config = lua_util.override_defaults(config, opts.pdf) + end +end + +local function compile_tries() + local default_compile_flags = bit.bor(rspamd_trie.flags.re, + rspamd_trie.flags.dot_all, + rspamd_trie.flags.no_start) + local function compile_pats(patterns, indexes, compile_flags) + local strs = {} + for what, data in pairs(patterns) do + for i, pat in ipairs(data.patterns) do + strs[#strs + 1] = pat + indexes[#indexes + 1] = { what, data, pat, i } + end + end + + return rspamd_trie.create(strs, compile_flags or default_compile_flags) + end + + if not pdf_trie then + pdf_trie = compile_pats(pdf_patterns, pdf_indexes) + end + if not pdf_text_trie then + pdf_text_trie = compile_pats(pdf_text_patterns, pdf_text_indexes) + end + if not pdf_cmap_trie then + pdf_cmap_trie = compile_pats(pdf_cmap_patterns, pdf_cmap_indexes) + end +end + +-- Returns a table with generic grammar elements for PDF +local function generic_grammar_elts() + local P = lpeg.P + local R = lpeg.R + local S = lpeg.S + local V = lpeg.V + local C = lpeg.C + local D = R '09' -- Digits + + local grammar_elts = {} + + -- Helper functions + local function pdf_hexstring_unescape(s) + if #s % 2 == 0 then + -- Sane hex string + return lua_util.unhex(s) + end + + -- WTF hex string + -- Append '0' to it and unescape... + return lua_util.unhex(s:sub(1, #s - 1)) .. lua_util.unhex((s:sub(#s) .. '0')) + end + + local function pdf_string_unescape(s) + local function ue_single(cc) + if cc == '\\r' then + return '\r' + elseif cc == '\\n' then + return '\n' + else + return cc:gsub(2, 2) + end + end + -- simple unescape \char + s = s:gsub('\\[^%d]', ue_single) + -- unescape octal + local function ue_octal(cc) + -- Replace unknown stuff with '?' + return string.char(tonumber(cc:sub(2), 8) or 63) + end + s = s:gsub('\\%d%d?%d?', ue_octal) + + return s + end + + local function pdf_id_unescape(s) + return (s:gsub('#%d%d', function(cc) + return string.char(tonumber(cc:sub(2), 16)) + end)) + end + + local delim = S '()<>[]{}/%' + grammar_elts.ws = S '\0 \r\n\t\f' + local hex = R 'af' + R 'AF' + D + -- Comments. + local eol = P '\r\n' + '\n' + local line = (1 - S '\r\n\f') ^ 0 * eol ^ -1 + grammar_elts.comment = P '%' * line + + -- Numbers. + local sign = S '+-' ^ -1 + local decimal = D ^ 1 + local float = D ^ 1 * P '.' * D ^ 0 + P '.' * D ^ 1 + grammar_elts.number = C(sign * (float + decimal)) / tonumber + + -- String + grammar_elts.str = P { "(" * C(((1 - S "()\\") + (P '\\' * 1) + V(1)) ^ 0) / pdf_string_unescape * ")" } + grammar_elts.hexstr = P { "<" * C(hex ^ 0) / pdf_hexstring_unescape * ">" } + + -- Identifier + grammar_elts.id = P { '/' * C((1 - (delim + grammar_elts.ws)) ^ 1) / pdf_id_unescape } + + -- Booleans (who care about them?) + grammar_elts.boolean = C(P("true") + P("false")) + + -- Stupid references + grammar_elts.ref = lpeg.Ct { lpeg.Cc("%REF%") * C(D ^ 1) * " " * C(D ^ 1) * " " * "R" } + + return grammar_elts +end + + +-- Generates a grammar to parse outer elements (external objects in PDF notation) +local function gen_outer_grammar() + local V = lpeg.V + local gen = generic_grammar_elts() + + return lpeg.P { + "EXPR"; + EXPR = gen.ws ^ 0 * V("ELT") ^ 0 * gen.ws ^ 0, + ELT = V("ARRAY") + V("DICT") + V("ATOM"), + ATOM = gen.ws ^ 0 * (gen.comment + gen.boolean + gen.ref + + gen.number + V("STRING") + gen.id) * gen.ws ^ 0, + DICT = "<<" * gen.ws ^ 0 * lpeg.Cf(lpeg.Ct("") * V("KV_PAIR") ^ 0, rawset) * gen.ws ^ 0 * ">>", + KV_PAIR = lpeg.Cg(gen.id * gen.ws ^ 0 * V("ELT") * gen.ws ^ 0), + ARRAY = "[" * gen.ws ^ 0 * lpeg.Ct(V("ELT") ^ 0) * gen.ws ^ 0 * "]", + STRING = lpeg.P { gen.str + gen.hexstr }, + } +end + +-- Graphic state in PDF +local function gen_graphics_unary() + local P = lpeg.P + local S = lpeg.S + + return P("q") + P("Q") + P("h") + + S("WSsFfBb") * P("*") ^ 0 + P("n") + +end +local function gen_graphics_binary() + local P = lpeg.P + local S = lpeg.S + + return S("gGwJjMi") + + P("M") + P("ri") + P("gs") + + P("CS") + P("cs") + P("sh") +end +local function gen_graphics_ternary() + local P = lpeg.P + local S = lpeg.S + + return P("d") + P("m") + S("lm") +end +local function gen_graphics_nary() + local P = lpeg.P + local S = lpeg.S + + return P("SC") + P("sc") + P("SCN") + P("scn") + P("k") + P("K") + P("re") + S("cvy") + + P("RG") + P("rg") +end + +-- Generates a grammar to parse text blocks (between BT and ET) +local function gen_text_grammar() + local V = lpeg.V + local P = lpeg.P + local C = lpeg.C + local gen = generic_grammar_elts() + + local empty = "" + local unary_ops = C("T*") / "\n" + + C(gen_graphics_unary()) / empty + local binary_ops = P("Tc") + P("Tw") + P("Tz") + P("TL") + P("Tr") + P("Ts") + + gen_graphics_binary() + local ternary_ops = P("TD") + P("Td") + gen_graphics_ternary() + local nary_op = P("Tm") + gen_graphics_nary() + local text_binary_op = P("Tj") + P("TJ") + P("'") + local text_quote_op = P('"') + local font_op = P("Tf") + + return lpeg.P { + "EXPR"; + EXPR = gen.ws ^ 0 * lpeg.Ct(V("COMMAND") ^ 0), + COMMAND = (V("UNARY") + V("BINARY") + V("TERNARY") + V("NARY") + V("TEXT") + + V("FONT") + gen.comment) * gen.ws ^ 0, + UNARY = unary_ops, + BINARY = V("ARG") / empty * gen.ws ^ 1 * binary_ops, + TERNARY = V("ARG") / empty * gen.ws ^ 1 * V("ARG") / empty * gen.ws ^ 1 * ternary_ops, + NARY = (gen.number / 0 * gen.ws ^ 1) ^ 1 * (gen.id / empty * gen.ws ^ 0) ^ -1 * nary_op, + ARG = V("ARRAY") + V("DICT") + V("ATOM"), + ATOM = (gen.comment + gen.boolean + gen.ref + + gen.number + V("STRING") + gen.id), + DICT = "<<" * gen.ws ^ 0 * lpeg.Cf(lpeg.Ct("") * V("KV_PAIR") ^ 0, rawset) * gen.ws ^ 0 * ">>", + KV_PAIR = lpeg.Cg(gen.id * gen.ws ^ 0 * V("ARG") * gen.ws ^ 0), + ARRAY = "[" * gen.ws ^ 0 * lpeg.Ct(V("ARG") ^ 0) * gen.ws ^ 0 * "]", + STRING = lpeg.P { gen.str + gen.hexstr }, + TEXT = (V("TEXT_ARG") * gen.ws ^ 1 * text_binary_op) + + (V("ARG") / 0 * gen.ws ^ 1 * V("ARG") / 0 * gen.ws ^ 1 * V("TEXT_ARG") * gen.ws ^ 1 * text_quote_op), + FONT = (V("FONT_ARG") * gen.ws ^ 1 * (gen.number / 0) * gen.ws ^ 1 * font_op), + FONT_ARG = lpeg.Ct(lpeg.Cc("%font%") * gen.id), + TEXT_ARG = lpeg.Ct(V("STRING")) + V("TEXT_ARRAY"), + TEXT_ARRAY = "[" * + lpeg.Ct(((gen.ws ^ 0 * (gen.ws ^ 0 * (gen.number / 0) ^ 0 * gen.ws ^ 0 * (gen.str + gen.hexstr))) ^ 1)) * gen.ws ^ 0 * "]", + } +end + + +-- Call immediately on require +compile_tries() +config_module() +pdf_outer_grammar = gen_outer_grammar() +pdf_text_grammar = gen_text_grammar() + +local function extract_text_data(specific) + return nil -- NYI +end + +-- Generates index for major/minor pair +local function obj_ref(major, minor) + return major * 10.0 + 1.0 / (minor + 1.0) +end + +-- Return indirect object reference (if needed) +local function maybe_dereference_object(elt, pdf, task) + if type(elt) == 'table' and elt[1] == '%REF%' then + local ref = obj_ref(elt[2], elt[3]) + + if pdf.ref[ref] then + -- No recursion! + return pdf.ref[ref] + else + lua_util.debugm(N, task, 'cannot dereference %s:%s -> %s, no object', + elt[2], elt[3], obj_ref(elt[2], elt[3])) + return nil + end + end + + return elt +end + +-- Apply PDF stream filter +local function apply_pdf_filter(input, filt) + if filt == 'FlateDecode' then + return rspamd_util.inflate(input, config.max_extraction_size) + end + + return nil +end + +-- Conditionally apply a pipeline of stream filters and return uncompressed data +local function maybe_apply_filter(dict, data, pdf, task) + local uncompressed = data + + if dict.Filter then + local filt = dict.Filter + if type(filt) == 'string' then + filt = { filt } + end + + if dict.DecodeParms then + local decode_params = maybe_dereference_object(dict.DecodeParms, pdf, task) + + if type(decode_params) == 'table' then + if decode_params.Predictor then + return nil, 'predictor exists' + end + end + end + + for _, f in ipairs(filt) do + uncompressed = apply_pdf_filter(uncompressed, f) + + if not uncompressed then + break + end + end + end + + return uncompressed, nil +end + +-- Conditionally extract stream data from object and attach it as obj.uncompressed +local function maybe_extract_object_stream(obj, pdf, task) + if pdf.encrypted then + -- TODO add decryption some day + return nil + end + local dict = obj.dict + if dict.Length and type(obj.stream) == 'table' then + local len = math.min(obj.stream.len, + tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0) + if len > 0 then + local real_stream = obj.stream.data:span(1, len) + + local uncompressed, filter_err = maybe_apply_filter(dict, real_stream, pdf, task) + + if uncompressed then + obj.uncompressed = uncompressed + lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)', + obj.major, obj.minor, len, uncompressed:len()) + return obj.uncompressed + else + lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s: %s', + obj.major, obj.minor, len, dict.Filter, filter_err) + end + else + lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s', + obj.major, obj.minor, len) + end + end +end + +local function parse_object_grammar(obj, task, pdf) + -- Parse grammar + local obj_dict_span + if obj.stream then + obj_dict_span = obj.data:span(1, obj.stream.start - obj.start) + else + obj_dict_span = obj.data + end + + if obj_dict_span:len() < config.max_processing_size then + local ret, obj_or_err = pcall(pdf_outer_grammar.match, pdf_outer_grammar, obj_dict_span) + + if ret then + if obj.stream then + if type(obj_or_err) == 'table' then + obj.dict = obj_or_err + else + obj.dict = {} + end + + lua_util.debugm(N, task, 'stream object %s:%s is parsed to: %s', + obj.major, obj.minor, obj_or_err) + else + -- Direct object + if type(obj_or_err) == 'table' then + obj.dict = obj_or_err + obj.uncompressed = obj_or_err + lua_util.debugm(N, task, 'direct object %s:%s is parsed to: %s', + obj.major, obj.minor, obj_or_err) + pdf.ref[obj_ref(obj.major, obj.minor)] = obj + else + lua_util.debugm(N, task, 'direct object %s:%s is parsed to raw data: %s', + obj.major, obj.minor, obj_or_err) + pdf.ref[obj_ref(obj.major, obj.minor)] = obj_or_err + obj.dict = {} + obj.uncompressed = obj_or_err + end + end + else + lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s', + obj.major, obj.minor, obj_or_err) + end + else + lua_util.debugm(N, task, 'object %s:%s cannot be parsed: too large %s', + obj.major, obj.minor, obj_dict_span:len()) + end +end + +-- Extracts font data and process /ToUnicode mappings +-- NYI in fact as cmap is ridiculously stupid and complicated +--[[ +local function process_font(task, pdf, font, fname) + local dict = font + if font.dict then + dict = font.dict + end + + if type(dict) == 'table' and dict.ToUnicode then + local cmap = maybe_dereference_object(dict.ToUnicode, pdf, task) + + if cmap and cmap.dict then + maybe_extract_object_stream(cmap, pdf, task) + lua_util.debugm(N, task, 'found cmap for font %s: %s', + fname, cmap.uncompressed) + end + end +end +--]] + +-- Forward declaration +local process_dict + +-- This function processes javascript string and returns JS hash and JS rspamd_text +local function process_javascript(task, pdf, js, obj) + local rspamd_cryptobox_hash = require "rspamd_cryptobox_hash" + if type(js) == 'string' then + js = rspamd_text.fromstring(js):oneline() + elseif type(js) == 'userdata' then + js = js:oneline() + else + return nil + end + + local hash = rspamd_cryptobox_hash.create(js) + local bin_hash = hash:bin() + + if not pdf.scripts then + pdf.scripts = {} + end + + if pdf.scripts[bin_hash] then + -- Duplicate + return pdf.scripts[bin_hash] + end + + local njs = { + data = js, + hash = hash:hex(), + bin_hash = bin_hash, + object = obj, + } + pdf.scripts[bin_hash] = njs + return njs +end + +-- Extract interesting stuff from /Action, e.g. javascript +local function process_action(task, pdf, obj) + if not (obj.js or obj.launch) and (obj.dict and obj.dict.JS) then + local js = maybe_dereference_object(obj.dict.JS, pdf, task) + + if js then + if type(js) == 'table' then + local extracted_js = maybe_extract_object_stream(js, pdf, task) + + if not extracted_js then + lua_util.debugm(N, task, 'invalid type for JavaScript from %s:%s: %s', + obj.major, obj.minor, js) + else + js = extracted_js + end + end + + js = process_javascript(task, pdf, js, obj) + if js then + obj.js = js + lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s', + obj.major, obj.minor, obj.js.data) + else + lua_util.debugm(N, task, 'invalid type for JavaScript from %s:%s: %s', + obj.major, obj.minor, js) + end + elseif obj.dict.F then + local launch = maybe_dereference_object(obj.dict.F, pdf, task) + + if launch then + if type(launch) == 'string' then + obj.launch = rspamd_text.fromstring(launch):exclude_chars('%n%c') + lua_util.debugm(N, task, 'extracted launch from %s:%s: %s', + obj.major, obj.minor, obj.launch) + elseif type(launch) == 'userdata' then + obj.launch = launch:exclude_chars('%n%c') + lua_util.debugm(N, task, 'extracted launch from %s:%s: %s', + obj.major, obj.minor, obj.launch) + else + lua_util.debugm(N, task, 'invalid type for launch from %s:%s: %s', + obj.major, obj.minor, launch) + end + end + else + + lua_util.debugm(N, task, 'no JS attribute in action %s:%s', + obj.major, obj.minor) + end + end +end + +-- Extract interesting stuff from /Catalog, e.g. javascript in /OpenAction +local function process_catalog(task, pdf, obj) + if obj.dict then + if obj.dict.OpenAction then + local action = maybe_dereference_object(obj.dict.OpenAction, pdf, task) + + if action and type(action) == 'table' then + -- This also processes action js (if not already processed) + process_dict(task, pdf, action, action.dict) + if action.js then + lua_util.debugm(N, task, 'found openaction JS in %s:%s: %s', + obj.major, obj.minor, action.js) + pdf.openaction = action.js + action.js.object = obj + elseif action.launch then + lua_util.debugm(N, task, 'found openaction launch in %s:%s: %s', + obj.major, obj.minor, action.launch) + pdf.launch = action.launch + else + lua_util.debugm(N, task, 'no JS in openaction %s:%s: %s', + obj.major, obj.minor, action) + end + else + lua_util.debugm(N, task, 'cannot find openaction %s:%s: %s -> %s', + obj.major, obj.minor, obj.dict.OpenAction, action) + end + else + lua_util.debugm(N, task, 'no openaction in catalog %s:%s', + obj.major, obj.minor) + end + end +end + +local function process_xref(task, pdf, obj) + if obj.dict then + if obj.dict.Encrypt then + local encrypt = maybe_dereference_object(obj.dict.Encrypt, pdf, task) + lua_util.debugm(N, task, 'found encrypt: %s in xref object %s:%s', + encrypt, obj.major, obj.minor) + pdf.encrypted = true + end + end +end + +process_dict = function(task, pdf, obj, dict) + if not obj.type and type(dict) == 'table' then + if dict.Type and type(dict.Type) == 'string' then + -- Common stuff + obj.type = dict.Type + end + + if not obj.type then + + if obj.dict.S and obj.dict.JS then + obj.type = 'Javascript' + lua_util.debugm(N, task, 'implicit type for JavaScript object %s:%s', + obj.major, obj.minor) + else + lua_util.debugm(N, task, 'no type for %s:%s', + obj.major, obj.minor) + return + end + end + + lua_util.debugm(N, task, 'processed stream dictionary for object %s:%s -> %s', + obj.major, obj.minor, obj.type) + local contents = dict.Contents + if contents and type(contents) == 'table' then + if contents[1] == '%REF%' then + -- Single reference + contents = { contents } + end + obj.contents = {} + + for _, c in ipairs(contents) do + local cobj = maybe_dereference_object(c, pdf, task) + if cobj and type(cobj) == 'table' then + obj.contents[#obj.contents + 1] = cobj + cobj.parent = obj + cobj.type = 'content' + end + end + + lua_util.debugm(N, task, 'found content objects for %s:%s -> %s', + obj.major, obj.minor, #obj.contents) + end + + local resources = dict.Resources + if resources and type(resources) == 'table' then + local res_ref = maybe_dereference_object(resources, pdf, task) + + if type(res_ref) ~= 'table' then + lua_util.debugm(N, task, 'cannot parse resources from pdf: %s', + resources) + obj.resources = {} + elseif res_ref.dict then + obj.resources = res_ref.dict + else + obj.resources = {} + end + else + -- Fucking pdf: we need to inherit from parent + resources = {} + if dict.Parent then + local parent = maybe_dereference_object(dict.Parent, pdf, task) + + if parent and type(parent) == 'table' and parent.dict then + if parent.resources then + lua_util.debugm(N, task, 'propagated resources from %s:%s to %s:%s', + parent.major, parent.minor, obj.major, obj.minor) + resources = parent.resources + end + end + end + + obj.resources = resources + end + + + + --[[Disabled fonts extraction + local fonts = obj.resources.Font + if fonts and type(fonts) == 'table' then + obj.fonts = {} + for k,v in pairs(fonts) do + obj.fonts[k] = maybe_dereference_object(v, pdf, task) + + if obj.fonts[k] then + local font = obj.fonts[k] + + if config.text_extraction then + process_font(task, pdf, font, k) + lua_util.debugm(N, task, 'found font "%s" for object %s:%s -> %s', + k, obj.major, obj.minor, font) + end + end + end + end + ]] + + lua_util.debugm(N, task, 'found resources for object %s:%s (%s): %s', + obj.major, obj.minor, obj.type, obj.resources) + + if obj.type == 'Action' then + process_action(task, pdf, obj) + elseif obj.type == 'Catalog' then + process_catalog(task, pdf, obj) + elseif obj.type == 'XRef' then + -- XRef stream instead of trailer from PDF 1.5 (thanks Adobe) + process_xref(task, pdf, obj) + elseif obj.type == 'Javascript' then + local js = maybe_dereference_object(obj.dict.JS, pdf, task) + + if js then + if type(js) == 'table' then + local extracted_js = maybe_extract_object_stream(js, pdf, task) + + if not extracted_js then + lua_util.debugm(N, task, 'invalid type for JavaScript from %s:%s: %s', + obj.major, obj.minor, js) + else + js = extracted_js + end + end + + js = process_javascript(task, pdf, js, obj) + if js then + obj.js = js + lua_util.debugm(N, task, 'extracted javascript from %s:%s: %s', + obj.major, obj.minor, obj.js.data) + else + lua_util.debugm(N, task, 'invalid type for JavaScript from %s:%s: %s', + obj.major, obj.minor, js) + end + end + end + end -- Already processed dict (obj.type is not empty) +end + +-- This function is intended to unpack objects from ObjStm crappy structure +local compound_obj_grammar +local function compound_obj_grammar_gen() + if not compound_obj_grammar then + local gen = generic_grammar_elts() + compound_obj_grammar = gen.ws ^ 0 * (gen.comment * gen.ws ^ 1) ^ 0 * + lpeg.Ct(lpeg.Ct(gen.number * gen.ws ^ 1 * gen.number * gen.ws ^ 0) ^ 1) + end + + return compound_obj_grammar +end +local function pdf_compound_object_unpack(_, uncompressed, pdf, task, first) + -- First, we need to parse data line by line likely to find a line + -- that consists of pairs of numbers + compound_obj_grammar_gen() + local elts = compound_obj_grammar:match(uncompressed) + if elts and #elts > 0 then + lua_util.debugm(N, task, 'compound elts (chunk length %s): %s', + #uncompressed, elts) + + for i, pair in ipairs(elts) do + local obj_number, offset = pair[1], pair[2] + + offset = offset + first + if offset < #uncompressed then + local span_len + if i == #elts then + span_len = #uncompressed - offset + else + span_len = (elts[i + 1][2] + first) - offset + end + + if span_len > 0 and offset + span_len <= #uncompressed then + local obj = { + major = obj_number, + minor = 0, -- Implicit + data = uncompressed:span(offset + 1, span_len), + ref = obj_ref(obj_number, 0) + } + parse_object_grammar(obj, task, pdf) + + if obj.dict then + pdf.objects[#pdf.objects + 1] = obj + end + else + lua_util.debugm(N, task, 'invalid span_len for compound object %s:%s; offset = %s, len = %s', + pair[1], pair[2], offset + span_len, #uncompressed) + end + end + end + end +end + +-- PDF 1.5 ObjStmt +local function extract_pdf_compound_objects(task, pdf) + for i, obj in ipairs(pdf.objects or {}) do + if i > 0 and i % 100 == 0 then + local now = rspamd_util.get_ticks() + + if now >= pdf.end_timestamp then + pdf.timeout_processing = now - pdf.start_timestamp + + lua_util.debugm(N, task, 'pdf: timeout processing compound objects after spending %s seconds, ' .. + '%s elements processed', + pdf.timeout_processing, i) + break + end + end + if obj.stream and obj.dict and type(obj.dict) == 'table' then + local t = obj.dict.Type + if t and t == 'ObjStm' then + -- We are in troubles sir... + local nobjs = tonumber(maybe_dereference_object(obj.dict.N, pdf, task)) + local first = tonumber(maybe_dereference_object(obj.dict.First, pdf, task)) + + if nobjs and first then + --local extend = maybe_dereference_object(obj.dict.Extends, pdf, task) + lua_util.debugm(N, task, 'extract ObjStm with %s objects (%s first) %s extend', + nobjs, first, obj.dict.Extends) + + local uncompressed = maybe_extract_object_stream(obj, pdf, task) + + if uncompressed then + pdf_compound_object_unpack(obj, uncompressed, pdf, task, first) + end + else + lua_util.debugm(N, task, 'ObjStm object %s:%s has bad dict: %s', + obj.major, obj.minor, obj.dict) + end + end + end + end +end + +-- This function arranges starts and ends of all objects and process them into initial +-- set of objects +local function extract_outer_objects(task, input, pdf) + local start_pos, end_pos = 1, 1 + local max_start_pos, max_end_pos + local obj_count = 0 + + max_start_pos = math.min(config.max_pdf_objects, #pdf.start_objects) + max_end_pos = math.min(config.max_pdf_objects, #pdf.end_objects) + lua_util.debugm(N, task, "pdf: extract objects from %s start positions and %s end positions", + max_start_pos, max_end_pos) + + while start_pos <= max_start_pos and end_pos <= max_end_pos do + local first = pdf.start_objects[start_pos] + local last = pdf.end_objects[end_pos] + + -- 7 is length of `endobj\n` + if first + 6 < last then + local len = last - first - 6 + + -- Also get the starting span and try to match it versus obj re to get numbers + local obj_line_potential = first - 32 + if obj_line_potential < 1 then + obj_line_potential = 1 + end + local prev_obj_end = pdf.end_objects[end_pos - 1] + if end_pos > 1 and prev_obj_end >= obj_line_potential and prev_obj_end < first then + obj_line_potential = prev_obj_end + 1 + end + + local obj_line_span = input:span(obj_line_potential, first - obj_line_potential + 1) + local matches = object_re:search(obj_line_span, true, true) + + if matches and matches[1] then + local nobj = { + start = first, + len = len, + data = input:span(first, len), + major = tonumber(matches[1][2]), + minor = tonumber(matches[1][3]), + } + pdf.objects[obj_count + 1] = nobj + if nobj.major and nobj.minor then + -- Add reference + local ref = obj_ref(nobj.major, nobj.minor) + nobj.ref = ref -- Our internal reference + pdf.ref[ref] = nobj + end + end + + obj_count = obj_count + 1 + start_pos = start_pos + 1 + end_pos = end_pos + 1 + elseif first > last then + end_pos = end_pos + 1 + else + start_pos = start_pos + 1 + end_pos = end_pos + 1 + end + end +end + +-- This function attaches streams to objects and processes outer pdf grammar +local function attach_pdf_streams(task, input, pdf) + if pdf.start_streams and pdf.end_streams then + local start_pos, end_pos = 1, 1 + local max_start_pos, max_end_pos + + max_start_pos = math.min(config.max_pdf_objects, #pdf.start_streams) + max_end_pos = math.min(config.max_pdf_objects, #pdf.end_streams) + + for _, obj in ipairs(pdf.objects) do + while start_pos <= max_start_pos and end_pos <= max_end_pos do + local first = pdf.start_streams[start_pos] + local last = pdf.end_streams[end_pos] + last = last - 10 -- Exclude endstream\n pattern + lua_util.debugm(N, task, "start: %s, end: %s; obj: %s-%s", + first, last, obj.start, obj.start + obj.len) + if first > obj.start and last < obj.start + obj.len and last > first then + -- In case if we have fake endstream :( + while pdf.end_streams[end_pos + 1] and pdf.end_streams[end_pos + 1] < obj.start + obj.len do + end_pos = end_pos + 1 + last = pdf.end_streams[end_pos] + end + -- Strip the first \n + while first < last do + local chr = input:byte(first) + if chr ~= 13 and chr ~= 10 then + break + end + first = first + 1 + end + local len = last - first + obj.stream = { + start = first, + len = len, + data = input:span(first, len) + } + start_pos = start_pos + 1 + end_pos = end_pos + 1 + break + elseif first < obj.start then + start_pos = start_pos + 1 + elseif last > obj.start + obj.len then + -- Not this object + break + else + start_pos = start_pos + 1 + end_pos = end_pos + 1 + end + end + if obj.stream then + lua_util.debugm(N, task, 'found object %s:%s %s start %s len, %s stream start, %s stream length', + obj.major, obj.minor, obj.start, obj.len, obj.stream.start, obj.stream.len) + else + lua_util.debugm(N, task, 'found object %s:%s %s start %s len, no stream', + obj.major, obj.minor, obj.start, obj.len) + end + end + end +end + +-- Processes PDF objects: extracts streams, object numbers, process outer grammar, +-- augment object types +local function postprocess_pdf_objects(task, input, pdf) + pdf.objects = {} -- objects table + pdf.ref = {} -- references table + extract_outer_objects(task, input, pdf) + + -- Now we have objects and we need to attach streams that are in bounds + attach_pdf_streams(task, input, pdf) + -- Parse grammar for outer objects + for i, obj in ipairs(pdf.objects) do + if i > 0 and i % 100 == 0 then + local now = rspamd_util.get_ticks() + + if now >= pdf.end_timestamp then + pdf.timeout_processing = now - pdf.start_timestamp + + lua_util.debugm(N, task, 'pdf: timeout processing grammars after spending %s seconds, ' .. + '%s elements processed', + pdf.timeout_processing, i) + break + end + end + if obj.ref then + parse_object_grammar(obj, task, pdf) + + -- Special early handling + if obj.dict and obj.dict.Type and obj.dict.Type == 'XRef' then + process_xref(task, pdf, obj) + end + end + end + + if not pdf.timeout_processing then + extract_pdf_compound_objects(task, pdf) + else + -- ENOTIME + return + end + + -- Now we might probably have all objects being processed + for i, obj in ipairs(pdf.objects) do + if obj.dict then + -- Types processing + if i > 0 and i % 100 == 0 then + local now = rspamd_util.get_ticks() + + if now >= pdf.end_timestamp then + pdf.timeout_processing = now - pdf.start_timestamp + + lua_util.debugm(N, task, 'pdf: timeout processing dicts after spending %s seconds, ' .. + '%s elements processed', + pdf.timeout_processing, i) + break + end + end + process_dict(task, pdf, obj, obj.dict) + end + end +end + +local function offsets_to_blocks(starts, ends, out) + local start_pos, end_pos = 1, 1 + + while start_pos <= #starts and end_pos <= #ends do + local first = starts[start_pos] + local last = ends[end_pos] + + if first < last then + local len = last - first + out[#out + 1] = { + start = first, + len = len, + } + start_pos = start_pos + 1 + end_pos = end_pos + 1 + elseif first > last then + end_pos = end_pos + 1 + else + -- Not ordered properly! + break + end + end +end + +local function search_text(task, pdf) + for _, obj in ipairs(pdf.objects) do + if obj.type == 'Page' and obj.contents then + local text = {} + for _, tobj in ipairs(obj.contents) do + maybe_extract_object_stream(tobj, pdf, task) + local matches = pdf_text_trie:match(tobj.uncompressed or '') + if matches then + local text_blocks = {} + local starts = {} + local ends = {} + + for npat, matched_positions in pairs(matches) do + if npat == 1 then + for _, pos in ipairs(matched_positions) do + starts[#starts + 1] = pos + end + else + for _, pos in ipairs(matched_positions) do + ends[#ends + 1] = pos + end + end + end + + offsets_to_blocks(starts, ends, text_blocks) + for _, bl in ipairs(text_blocks) do + if bl.len > 2 then + -- To remove \s+ET\b pattern (it can leave trailing space or not but it doesn't matter) + bl.len = bl.len - 2 + end + + bl.data = tobj.uncompressed:span(bl.start, bl.len) + --lua_util.debugm(N, task, 'extracted text from object %s:%s: %s', + -- tobj.major, tobj.minor, bl.data) + + if bl.len < config.max_processing_size then + local ret, obj_or_err = pcall(pdf_text_grammar.match, pdf_text_grammar, + bl.data) + + if ret then + text[#text + 1] = obj_or_err + lua_util.debugm(N, task, 'attached %s from content object %s:%s to %s:%s', + obj_or_err, tobj.major, tobj.minor, obj.major, obj.minor) + else + lua_util.debugm(N, task, 'object %s:%s cannot be parsed: %s', + obj.major, obj.minor, obj_or_err) + end + + end + end + end + end + + -- Join all text data together + if #text > 0 then + obj.text = rspamd_text.fromtable(text) + lua_util.debugm(N, task, 'object %s:%s is parsed to: %s', + obj.major, obj.minor, obj.text) + end + end + end +end + +-- This function searches objects for `/URI` key and parses it's content +local function search_urls(task, pdf, mpart) + local function recursive_object_traverse(obj, dict, rec) + if rec > 10 then + lua_util.debugm(N, task, 'object %s:%s recurses too much', + obj.major, obj.minor) + return + end + + for k, v in pairs(dict) do + if type(v) == 'table' then + recursive_object_traverse(obj, v, rec + 1) + elseif k == 'URI' then + v = maybe_dereference_object(v, pdf, task) + if type(v) == 'string' then + local url = rspamd_url.create(task:get_mempool(), v, { 'content' }) + + if url then + lua_util.debugm(N, task, 'found url %s in object %s:%s', + v, obj.major, obj.minor) + task:inject_url(url, mpart) + end + end + end + end + end + + for _, obj in ipairs(pdf.objects) do + if obj.dict and type(obj.dict) == 'table' then + recursive_object_traverse(obj, obj.dict, 0) + end + end +end + +local function process_pdf(input, mpart, task) + + if not config.enabled then + -- Skip processing + return {} + end + + local matches = pdf_trie:match(input) + + if matches then + local start_ts = rspamd_util.get_ticks() + -- Temp object used to share data between pdf extraction methods + local pdf_object = { + tag = 'pdf', + extract_text = extract_text_data, + start_timestamp = start_ts, + end_timestamp = start_ts + config.pdf_process_timeout, + } + -- Output object that excludes all internal stuff + local pdf_output = lua_util.shallowcopy(pdf_object) + local grouped_processors = {} + for npat, matched_positions in pairs(matches) do + local index = pdf_indexes[npat] + + local proc_key, loc_npat = index[1], index[4] + + if not grouped_processors[proc_key] then + grouped_processors[proc_key] = { + processor_func = processors[proc_key], + offsets = {}, + } + end + local proc = grouped_processors[proc_key] + -- Fill offsets + for _, pos in ipairs(matched_positions) do + proc.offsets[#proc.offsets + 1] = { pos, loc_npat } + end + end + + for name, processor in pairs(grouped_processors) do + -- Sort by offset + lua_util.debugm(N, task, "pdf: process group %s with %s matches", + name, #processor.offsets) + table.sort(processor.offsets, function(e1, e2) + return e1[1] < e2[1] + end) + processor.processor_func(input, task, processor.offsets, pdf_object, pdf_output) + end + + pdf_output.flags = {} + + if pdf_object.start_objects and pdf_object.end_objects then + if #pdf_object.start_objects > config.max_pdf_objects then + pdf_output.many_objects = #pdf_object.start_objects + -- Trim + end + + -- Postprocess objects + postprocess_pdf_objects(task, input, pdf_object) + if config.text_extraction then + search_text(task, pdf_object, pdf_output) + end + if config.url_extraction then + search_urls(task, pdf_object, mpart, pdf_output) + end + + if config.js_fuzzy and pdf_object.scripts then + pdf_output.fuzzy_hashes = {} + if config.openaction_fuzzy_only then + -- OpenAction only + if pdf_object.openaction and pdf_object.openaction.bin_hash then + if config.min_js_fuzzy and #pdf_object.openaction.data >= config.min_js_fuzzy then + lua_util.debugm(N, task, "pdf: add fuzzy hash from openaction: %s; size = %s; object: %s:%s", + pdf_object.openaction.hash, + #pdf_object.openaction.data, + pdf_object.openaction.object.major, pdf_object.openaction.object.minor) + table.insert(pdf_output.fuzzy_hashes, pdf_object.openaction.bin_hash) + else + lua_util.debugm(N, task, "pdf: skip fuzzy hash from JavaScript: %s, too short: %s", + pdf_object.openaction.hash, #pdf_object.openaction.data) + end + end + else + -- All hashes + for h, sc in pairs(pdf_object.scripts) do + if config.min_js_fuzzy and #sc.data >= config.min_js_fuzzy then + lua_util.debugm(N, task, "pdf: add fuzzy hash from JavaScript: %s; size = %s; object: %s:%s", + sc.hash, + #sc.data, + sc.object.major, sc.object.minor) + table.insert(pdf_output.fuzzy_hashes, h) + else + lua_util.debugm(N, task, "pdf: skip fuzzy hash from JavaScript: %s, too short: %s", + sc.hash, #sc.data) + end + end + + end + end + else + pdf_output.flags.no_objects = true + end + + -- Propagate from object to output + if pdf_object.encrypted then + pdf_output.encrypted = true + end + if pdf_object.scripts then + pdf_output.scripts = true + end + + return pdf_output + end +end + +-- Processes the PDF trailer +processors.trailer = function(input, task, positions, pdf_object, pdf_output) + local last_pos = positions[#positions] + + lua_util.debugm(N, task, 'pdf: process trailer at position %s (%s total length)', + last_pos, #input) + + if last_pos[1] > config.max_pdf_trailer then + pdf_output.long_trailer = #input - last_pos[1] + return + end + + local last_span = input:span(last_pos[1]) + local lines_checked = 0 + for line in last_span:lines(true) do + if line:find('/Encrypt ') then + lua_util.debugm(N, task, "pdf: found encrypted line in trailer: %s", + line) + pdf_output.encrypted = true + pdf_object.encrypted = true + break + end + lines_checked = lines_checked + 1 + + if lines_checked > config.max_pdf_trailer_lines then + lua_util.debugm(N, task, "pdf: trailer has too many lines, stop checking") + pdf_output.long_trailer = #input - last_pos[1] + break + end + end +end + +processors.suspicious = function(input, task, positions, pdf_object, pdf_output) + local suspicious_factor = 0.0 + local nexec = 0 + local nencoded = 0 + local close_encoded = 0 + local last_encoded + for _, match in ipairs(positions) do + if match[2] == 1 then + -- netsh + suspicious_factor = suspicious_factor + 0.5 + elseif match[2] == 2 then + nexec = nexec + 1 + elseif match[2] == 3 then + local enc_data = input:sub(match[1] - 2, match[1] - 1) + local legal_escape = false + + if enc_data then + enc_data = enc_data:strtoul() + + if enc_data then + -- Legit encode cases are non printable characters (e.g. spaces) + if enc_data < 0x21 or enc_data >= 0x7f then + legal_escape = true + end + end + end + + if not legal_escape then + nencoded = nencoded + 1 + + if last_encoded then + if match[1] - last_encoded < 8 then + -- likely consecutive encoded chars, increase factor + close_encoded = close_encoded + 1 + end + end + last_encoded = match[1] + + end + end + end + + if nencoded > 10 then + suspicious_factor = suspicious_factor + nencoded / 10 + end + if nexec > 1 then + suspicious_factor = suspicious_factor + nexec / 2.0 + end + if close_encoded > 4 and nencoded - close_encoded < 5 then + -- Too many close encoded comparing to the total number of encoded characters + suspicious_factor = suspicious_factor + 0.5 + end + + lua_util.debugm(N, task, 'pdf: found a suspicious patterns: %s exec, %s encoded (%s close), ' .. + '%s final factor', + nexec, nencoded, close_encoded, suspicious_factor) + + if suspicious_factor > 1.0 then + suspicious_factor = 1.0 + end + + pdf_output.suspicious = suspicious_factor +end + +local function generic_table_inserter(positions, pdf_object, output_key) + if not pdf_object[output_key] then + pdf_object[output_key] = {} + end + local shift = #pdf_object[output_key] + for i, pos in ipairs(positions) do + pdf_object[output_key][i + shift] = pos[1] + end +end + +processors.start_object = function(_, task, positions, pdf_object) + generic_table_inserter(positions, pdf_object, 'start_objects') +end + +processors.end_object = function(_, task, positions, pdf_object) + generic_table_inserter(positions, pdf_object, 'end_objects') +end + +processors.start_stream = function(_, task, positions, pdf_object) + generic_table_inserter(positions, pdf_object, 'start_streams') +end + +processors.end_stream = function(_, task, positions, pdf_object) + generic_table_inserter(positions, pdf_object, 'end_streams') +end + +exports.process = process_pdf + +return exports
\ No newline at end of file diff --git a/lualib/lua_content/vcard.lua b/lualib/lua_content/vcard.lua new file mode 100644 index 0000000..ed14412 --- /dev/null +++ b/lualib/lua_content/vcard.lua @@ -0,0 +1,84 @@ +--[[ +Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com> + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +]]-- + +local l = require 'lpeg' +local lua_util = require "lua_util" +local N = "lua_content" + +local vcard_grammar + +-- XXX: Currently it is a copy of ical grammar +local function gen_grammar() + if not vcard_grammar then + local wsp = l.S(" \t\v\f") + local crlf = (l.P "\r" ^ -1 * l.P "\n") + l.P "\r" + local eol = (crlf * #crlf) + (crlf - (crlf ^ -1 * wsp)) + local name = l.C((l.P(1) - (l.P ":")) ^ 1) / function(v) + return (v:gsub("[\n\r]+%s", "")) + end + local value = l.C((l.P(1) - eol) ^ 0) / function(v) + return (v:gsub("[\n\r]+%s", "")) + end + vcard_grammar = name * ":" * wsp ^ 0 * value * eol ^ -1 + end + + return vcard_grammar +end + +local exports = {} + +local function process_vcard(input, mpart, task) + local control = { n = '\n', r = '' } + local rspamd_url = require "rspamd_url" + local escaper = l.Ct((gen_grammar() / function(key, value) + value = value:gsub("\\(.)", control) + key = key:lower() + local local_urls = rspamd_url.all(task:get_mempool(), value) + + if local_urls and #local_urls > 0 then + for _, u in ipairs(local_urls) do + lua_util.debugm(N, task, 'vcard: found URL in vcard %s', + tostring(u)) + task:inject_url(u, mpart) + end + end + lua_util.debugm(N, task, 'vcard: vcard key %s = "%s"', + key, value) + return { key, value } + end) ^ 1) + + local elts = escaper:match(input) + + if not elts then + return nil + end + + return { + tag = 'vcard', + extract_text = function() + return nil + end, -- NYI + elts = elts + } +end + +--[[[ +-- @function vcard.process(input) +-- Returns all values from vcard as a plain text. Names are completely ignored. +--]] +exports.process = process_vcard + +return exports
\ No newline at end of file |