diff options
Diffstat (limited to 'lualib/lua_bayes_learn.lua')
-rw-r--r-- | lualib/lua_bayes_learn.lua | 151 |
1 files changed, 151 insertions, 0 deletions
--[[
Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]--

-- This file contains functions to simplify bayes classifier auto-learning

local lua_util = require "lua_util"
local lua_verdict = require "lua_verdict"
local N = "lua_bayes"

local exports = {}

--- Decide whether learning a message as the requested class is worthwhile.
-- Unless the request carries a `Learn-Type: bulk` header, the `bayes_prob`
-- mempool variable (when set) is compared against fixed cut-offs: learning
-- as spam is refused when prob >= 0.95, learning as ham when prob <= 0.05.
-- @param task task object; must provide get_request_header() and get_mempool()
-- @param is_spam truthy to learn as spam, falsy to learn as ham
-- @param is_unlearn accepted for interface compatibility; not used here
-- @return true when learning may proceed; otherwise false plus a reason string
exports.can_learn = function(task, is_spam, is_unlearn)
  local learn_type = task:get_request_header('Learn-Type')

  -- Bulk learning bypasses the "already in class" check entirely
  if learn_type and tostring(learn_type) == 'bulk' then
    return true
  end

  local prob = task:get_mempool():get_variable('bayes_prob', 'double')
  if not prob then
    return true
  end

  local cl, in_class
  if is_spam then
    cl = 'spam'
    in_class = prob >= 0.95
  else
    cl = 'ham'
    in_class = prob <= 0.05
  end

  if in_class then
    -- Distance of prob from 0.5, rescaled to a 0..100 percentage
    return false, string.format(
        'already in class %s; probability %.2f%%',
        cl, math.abs((prob - 0.5) * 200.0))
  end

  return true
end

-- Builds a comma separated list of MIME recipient addresses for logging;
-- returns 'undef' when the task yields no usable address.
local function mime_rcpts_for_log(task)
  local mr = task:get_recipients('mime')
  if not mr then
    return 'undef'
  end

  local addrs = {}
  for _, r in ipairs(mr) do
    -- Entries without an address are skipped instead of raising on concat
    if r.addr then
      addrs[#addrs + 1] = tostring(r.addr)
    end
  end

  if #addrs == 0 then
    return 'undef'
  end

  return table.concat(addrs, ',')
end

--- Work out whether (and as what) a task should be auto-learned.
-- The verdict and score come from lua_verdict.get_specific_verdict("bayes", task).
-- If both conf.spam_threshold and conf.ham_threshold are set, the score is
-- compared with the threshold matching the verdict (spam/junk: score >=
-- threshold, ham: score <= threshold; junk needs conf.junk_threshold).
-- Otherwise, if conf.learn_verdict is set, the verdict alone decides.
-- With conf.check_balance set, a class is not learned while its learn count
-- exceeds the other class' count (+1) by more than 1 / min_balance.
-- @param task task object
-- @param conf autolearn settings table; fields read here: spam_threshold,
--   junk_threshold, ham_threshold, learn_verdict, check_balance,
--   min_balance (0.9 when unset)
-- @return 'spam', 'ham', or nothing when no learning should happen
exports.autolearn = function(task, conf)
  local function log_can_autolearn(verdict, score, threshold)
    local from = task:get_from('smtp')

    lua_util.debugm(N, task, 'id: %s, from: <%s>: can autolearn %s: score %s %s %s, mime_rcpts: <%s>',
        task:get_header('Message-Id') or '<undef>',
        -- Guard from[1] too, so an empty sender list cannot raise here
        from and from[1] and from[1].addr or 'undef',
        verdict,
        string.format("%.2f", score),
        verdict == 'ham' and '<=' or verdict == 'spam' and '>=' or '/',
        threshold,
        mime_rcpts_for_log(task))
  end

  -- We have autolearn config so let's figure out what is requested
  local verdict, score = lua_verdict.get_specific_verdict("bayes", task)
  local learn_spam, learn_ham = false, false

  if verdict == 'passthrough' then
    -- No need to autolearn
    lua_util.debugm(N, task, 'no need to autolearn - verdict: %s',
        verdict)
    return
  end

  if conf.spam_threshold and conf.ham_threshold then
    -- Threshold mode: both spam and ham thresholds are known to be set here
    if verdict == 'spam' then
      if score >= conf.spam_threshold then
        log_can_autolearn(verdict, score, conf.spam_threshold)
        learn_spam = true
      end
    elseif verdict == 'junk' then
      -- junk_threshold is optional, hence the extra check
      if conf.junk_threshold and score >= conf.junk_threshold then
        log_can_autolearn(verdict, score, conf.junk_threshold)
        learn_spam = true
      end
    elseif verdict == 'ham' then
      if score <= conf.ham_threshold then
        log_can_autolearn(verdict, score, conf.ham_threshold)
        learn_ham = true
      end
    end
  elseif conf.learn_verdict then
    -- Verdict mode: trust the verdict without looking at the score
    if verdict == 'spam' or verdict == 'junk' then
      learn_spam = true
    elseif verdict == 'ham' then
      learn_ham = true
    end
  end

  if conf.check_balance then
    -- Check balance of learns
    local pool = task:get_mempool()
    local spam_learns = pool:get_variable('spam_learns', 'int64') or 0
    local ham_learns = pool:get_variable('ham_learns', 'int64') or 0
    local min_balance = conf.min_balance or 0.9

    if spam_learns > 0 or ham_learns > 0 then
      local max_ratio = 1.0 / min_balance

      -- The +1 keeps the ratio finite when the other class has no learns yet
      local spam_learns_ratio = spam_learns / (ham_learns + 1)
      if spam_learns_ratio > max_ratio and learn_spam then
        -- Log the comparison that was actually made (ratio vs max_ratio)
        lua_util.debugm(N, task,
            'skip learning spam, balance is not satisfied: %s > %s; %s spam learns; %s ham learns',
            spam_learns_ratio, max_ratio, spam_learns, ham_learns)
        learn_spam = false
      end

      local ham_learns_ratio = ham_learns / (spam_learns + 1)
      if ham_learns_ratio > max_ratio and learn_ham then
        lua_util.debugm(N, task,
            'skip learning ham, balance is not satisfied: %s > %s; %s spam learns; %s ham learns',
            ham_learns_ratio, max_ratio, spam_learns, ham_learns)
        learn_ham = false
      end
    end
  end

  if learn_spam then
    return 'spam'
  elseif learn_ham then
    return 'ham'
  end
end

return exports
\ No newline at end of file |