summaryrefslogtreecommitdiffstats
path: root/lualib/redis_scripts/bayes_classify.lua
diff options
context:
space:
mode:
Diffstat (limited to 'lualib/redis_scripts/bayes_classify.lua')
-rw-r--r--lualib/redis_scripts/bayes_classify.lua37
1 files changed, 37 insertions, 0 deletions
diff --git a/lualib/redis_scripts/bayes_classify.lua b/lualib/redis_scripts/bayes_classify.lua
new file mode 100644
index 0000000..e94f645
--- /dev/null
+++ b/lualib/redis_scripts/bayes_classify.lua
@@ -0,0 +1,37 @@
+-- Lua script to perform bayes classification
+-- This script accepts the following parameters:
+-- key1 - prefix for bayes tokens (e.g. for per-user classification)
+-- key2 - set of tokens encoded in messagepack array of strings
+
+local prefix = KEYS[1]
+local output_spam = {}
+local output_ham = {}
+
+local learned_ham = tonumber(redis.call('HGET', prefix, 'learns_ham')) or 0
+local learned_spam = tonumber(redis.call('HGET', prefix, 'learns_spam')) or 0
+
+-- Output is a set of pairs (token_index, token_count), tokens that are not
+-- found are not filled.
+-- This optimisation will save a lot of space for sparse tokens, and in Bayes that assumption is normally held
+
+if learned_ham > 0 and learned_spam > 0 then
+ local input_tokens = cmsgpack.unpack(KEYS[2])
+ for i, token in ipairs(input_tokens) do
+ local token_data = redis.call('HMGET', token, 'H', 'S')
+
+ if token_data then
+ local ham_count = token_data[1]
+ local spam_count = token_data[2]
+
+ if ham_count then
+ table.insert(output_ham, { i, tonumber(ham_count) })
+ end
+
+ if spam_count then
+ table.insert(output_spam, { i, tonumber(spam_count) })
+ end
+ end
+ end
+end
+
+return { learned_ham, learned_spam, output_ham, output_spam } \ No newline at end of file