summaryrefslogtreecommitdiffstats
path: root/test/lua/unit/tokenizer.lua
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 21:30:40 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 21:30:40 +0000
commit133a45c109da5310add55824db21af5239951f93 (patch)
treeba6ac4c0a950a0dda56451944315d66409923918 /test/lua/unit/tokenizer.lua
parentInitial commit. (diff)
downloadrspamd-133a45c109da5310add55824db21af5239951f93.tar.xz
rspamd-133a45c109da5310add55824db21af5239951f93.zip
Adding upstream version 3.8.1.upstream/3.8.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'test/lua/unit/tokenizer.lua')
-rw-r--r--test/lua/unit/tokenizer.lua81
1 files changed, 81 insertions, 0 deletions
diff --git a/test/lua/unit/tokenizer.lua b/test/lua/unit/tokenizer.lua
new file mode 100644
index 0000000..fbf7ee3
--- /dev/null
+++ b/test/lua/unit/tokenizer.lua
@@ -0,0 +1,81 @@
context("Text tokenization test", function()
  local util = require "rspamd_util"

  -- Simple tokenization cases: { input_text, expected_tokens }.
  -- Punctuation and repeated separators must be skipped; non-ASCII
  -- (UTF-8 Armenian) words must survive tokenization intact.
  local cases = {
    {"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer mattis, nibh",
      {"Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit",
       "Integer", "mattis", "nibh"
      }
    },
    {"Հետաքրքրվողների համար ոտորև ներկայացված",
      {"Հետաքրքրվողների", "համար", "ոտորև", "ներկայացված"}
    },
    {"", {}},
    {",,,,,", {}},
    {"word,,,,,word ", {"word", "word"}},
    {"word", {"word"}},
    {",,,,word,,,", {"word"}}
  }

  for i, c in ipairs(cases) do
    test("Tokenize simple " .. i, function()
      local w = util.tokenize_text(c[1])
      if #c[2] == 0 then
        assert_equal(#w, 0, "must not have tokens " .. c[1])
      else
        assert_not_nil(w, "must tokenize " .. c[1])
        -- Assert the count too: without this, a tokenizer that drops
        -- trailing tokens would still pass the element-wise loop below.
        assert_equal(#w, #c[2], "token count mismatch for " .. c[1])
        -- 'j' (not 'i') so the outer case index is not shadowed
        for j, wrd in ipairs(w) do
          assert_equal(wrd, c[2][j])
        end
      end
    end)
  end

  -- Exception cases: { input_text, exceptions, expected_tokens }.
  -- Each exception is {byte_offset, length}; the tokenizer replaces the
  -- excluded span (here, URLs) with the "!!EX!!" placeholder token.
  cases = {
    {"word https://example.com/path word",
      {{5, 24}},
      {"word", "!!EX!!", "word"}
    },
    {"համար https://example.com/path համար",
      {{11, 24}},
      {"համար", "!!EX!!", "համար"}
    },
    {"word https://example.com/path https://example.com/path word",
      {{5, 24}, {30, 24}},
      {"word", "!!EX!!", "!!EX!!", "word"}
    },
    {"word https://example.com/path https://example.com/path",
      {{5, 24}, {30, 24}},
      {"word", "!!EX!!", "!!EX!!"}
    },
    {"https://example.com/path https://example.com/path word",
      {{0, 24}, {25, 24}},
      {"!!EX!!", "!!EX!!", "word"}
    },
    {"https://example.com/path https://example.com/path",
      {{0, 24}, {25, 24}},
      {"!!EX!!", "!!EX!!"}
    },
    {",,,,https://example.com/path https://example.com/path ",
      {{4, 24}, {29, 24}},
      {"!!EX!!", "!!EX!!"}
    },
  }

  for i, c in ipairs(cases) do
    test("Tokenize with exceptions " .. i, function()
      local w = util.tokenize_text(c[1], c[2])
      if #c[3] == 0 then
        assert_equal(#w, 0, "must not have tokens " .. c[1])
      else
        assert_not_nil(w, "must tokenize " .. c[1])
        -- Same count check as above: catch missing/extra tokens, not
        -- just mismatched ones.
        assert_equal(#w, #c[3], "token count mismatch for " .. c[1])
        for j, wrd in ipairs(w) do
          assert_equal(wrd, c[3][j])
        end
      end
    end)
  end

end)