summaryrefslogtreecommitdiffstats
path: root/lualib/lua_bayes_learn.lua
diff options
context:
space:
mode:
Diffstat (limited to 'lualib/lua_bayes_learn.lua')
-rw-r--r--lualib/lua_bayes_learn.lua151
1 files changed, 151 insertions, 0 deletions
diff --git a/lualib/lua_bayes_learn.lua b/lualib/lua_bayes_learn.lua
new file mode 100644
index 0000000..ea97db6
--- /dev/null
+++ b/lualib/lua_bayes_learn.lua
@@ -0,0 +1,151 @@
+--[[
+Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]--
+
+-- This file contains functions to simplify bayes classifier auto-learning
+
+local lua_util = require "lua_util"
+local lua_verdict = require "lua_verdict"
+local N = "lua_bayes"
+
+local exports = {}
+
+exports.can_learn = function(task, is_spam, is_unlearn)
+ local learn_type = task:get_request_header('Learn-Type')
+
+ if not (learn_type and tostring(learn_type) == 'bulk') then
+ local prob = task:get_mempool():get_variable('bayes_prob', 'double')
+
+ if prob then
+ local in_class = false
+ local cl
+ if is_spam then
+ cl = 'spam'
+ in_class = prob >= 0.95
+ else
+ cl = 'ham'
+ in_class = prob <= 0.05
+ end
+
+ if in_class then
+ return false, string.format(
+ 'already in class %s; probability %.2f%%',
+ cl, math.abs((prob - 0.5) * 200.0))
+ end
+ end
+ end
+
+ return true
+end
+
+exports.autolearn = function(task, conf)
+ local function log_can_autolearn(verdict, score, threshold)
+ local from = task:get_from('smtp')
+ local mime_rcpts = 'undef'
+ local mr = task:get_recipients('mime')
+ if mr then
+ for _, r in ipairs(mr) do
+ if mime_rcpts == 'undef' then
+ mime_rcpts = r.addr
+ else
+ mime_rcpts = mime_rcpts .. ',' .. r.addr
+ end
+ end
+ end
+
+ lua_util.debugm(N, task, 'id: %s, from: <%s>: can autolearn %s: score %s %s %s, mime_rcpts: <%s>',
+ task:get_header('Message-Id') or '<undef>',
+ from and from[1].addr or 'undef',
+ verdict,
+ string.format("%.2f", score),
+ verdict == 'ham' and '<=' or verdict == 'spam' and '>=' or '/',
+ threshold,
+ mime_rcpts)
+ end
+
+ -- We have autolearn config so let's figure out what is requested
+ local verdict, score = lua_verdict.get_specific_verdict("bayes", task)
+ local learn_spam, learn_ham = false, false
+
+ if verdict == 'passthrough' then
+ -- No need to autolearn
+ lua_util.debugm(N, task, 'no need to autolearn - verdict: %s',
+ verdict)
+ return
+ end
+
+ if conf.spam_threshold and conf.ham_threshold then
+ if verdict == 'spam' then
+ if conf.spam_threshold and score >= conf.spam_threshold then
+ log_can_autolearn(verdict, score, conf.spam_threshold)
+ learn_spam = true
+ end
+ elseif verdict == 'junk' then
+ if conf.junk_threshold and score >= conf.junk_threshold then
+ log_can_autolearn(verdict, score, conf.junk_threshold)
+ learn_spam = true
+ end
+ elseif verdict == 'ham' then
+ if conf.ham_threshold and score <= conf.ham_threshold then
+ log_can_autolearn(verdict, score, conf.ham_threshold)
+ learn_ham = true
+ end
+ end
+ elseif conf.learn_verdict then
+ if verdict == 'spam' or verdict == 'junk' then
+ learn_spam = true
+ elseif verdict == 'ham' then
+ learn_ham = true
+ end
+ end
+
+ if conf.check_balance then
+ -- Check balance of learns
+ local spam_learns = task:get_mempool():get_variable('spam_learns', 'int64') or 0
+ local ham_learns = task:get_mempool():get_variable('ham_learns', 'int64') or 0
+
+ local min_balance = 0.9
+ if conf.min_balance then
+ min_balance = conf.min_balance
+ end
+
+ if spam_learns > 0 or ham_learns > 0 then
+ local max_ratio = 1.0 / min_balance
+ local spam_learns_ratio = spam_learns / (ham_learns + 1)
+ if spam_learns_ratio > max_ratio and learn_spam then
+ lua_util.debugm(N, task,
+ 'skip learning spam, balance is not satisfied: %s < %s; %s spam learns; %s ham learns',
+ spam_learns_ratio, min_balance, spam_learns, ham_learns)
+ learn_spam = false
+ end
+
+ local ham_learns_ratio = ham_learns / (spam_learns + 1)
+ if ham_learns_ratio > max_ratio and learn_ham then
+ lua_util.debugm(N, task,
+ 'skip learning ham, balance is not satisfied: %s < %s; %s spam learns; %s ham learns',
+ ham_learns_ratio, min_balance, spam_learns, ham_learns)
+ learn_ham = false
+ end
+ end
+ end
+
+ if learn_spam then
+ return 'spam'
+ elseif learn_ham then
+ return 'ham'
+ end
+end
+
+return exports \ No newline at end of file