summaryrefslogtreecommitdiffstats
path: root/test/lua/unit/utf.lua
diff options
context:
space:
mode:
Diffstat (limited to 'test/lua/unit/utf.lua')
-rw-r--r--test/lua/unit/utf.lua207
1 files changed, 207 insertions, 0 deletions
diff --git a/test/lua/unit/utf.lua b/test/lua/unit/utf.lua
new file mode 100644
index 0000000..dbdab7f
--- /dev/null
+++ b/test/lua/unit/utf.lua
@@ -0,0 +1,207 @@
+-- Test utf routines
+
+context("UTF8 check functions", function()
+ local ffi = require("ffi")
+ ffi.cdef[[
+ unsigned int rspamd_str_lc_utf8 (char *str, unsigned int size);
+ unsigned int rspamd_str_lc (char *str, unsigned int size);
+ void rspamd_fast_utf8_library_init (unsigned flags);
+ void ottery_rand_bytes(void *buf, size_t n);
+ double rspamd_get_ticks(int allow);
+ size_t rspamd_fast_utf8_validate (const unsigned char *data, size_t len);
+ size_t rspamd_fast_utf8_validate_ref (const unsigned char *data, size_t len);
+ size_t rspamd_fast_utf8_validate_sse41 (const unsigned char *data, size_t len);
+ size_t rspamd_fast_utf8_validate_avx2 (const unsigned char *data, size_t len);
+ char * rspamd_str_make_utf_valid (const char *src, size_t slen, size_t *dstlen, void *);
+ ]]
+
+  -- Each entry is {mixed-case UTF-8 input, expected result of rspamd_str_lc_utf8}.
+  local cases = {
+    {"АбЫрвАлг", "абырвалг"},
+    {"АAБBвc", "аaбbвc"},
+    --{"STRASSE", "straße"}, XXX: NYI
+    {"KEÇİ", "keçi"},
+  }
+
+  for idx = 1, #cases do
+    -- Capture the pair in locals: `cases` is reassigned further down, so the
+    -- test closure must not look the entry up again when it finally runs.
+    local input, expected = cases[idx][1], cases[idx][2]
+    test("UTF lowercase " .. tostring(idx), function()
+      -- Mutable copy with room for the trailing NUL written by ffi.copy.
+      local scratch = ffi.new("char[?]", #input + 1)
+      ffi.copy(scratch, input)
+      -- The call returns the length to read back from the same buffer.
+      local out_len = ffi.C.rspamd_str_lc_utf8(scratch, #input)
+      assert_equal(ffi.string(scratch, out_len), expected)
+    end)
+  end
+
+  -- Each entry is {ASCII input, expected result of rspamd_str_lc}.
+  cases = {
+    {"AbCdEf", "abcdef"},
+    {"A", "a"},
+    {"AaAa", "aaaa"},
+    {"AaAaAaAa", "aaaaaaaa"}
+  }
+
+  for idx = 1, #cases do
+    -- Bind the pair now; the shared `cases` variable is overwritten later.
+    local input, expected = cases[idx][1], cases[idx][2]
+    test("ASCII lowercase " .. tostring(idx), function()
+      -- One extra byte so the copy stays NUL-terminated for ffi.string below.
+      local scratch = ffi.new("char[?]", #input + 1)
+      ffi.copy(scratch, input)
+      -- The return value is ignored here; the buffer is read back as a
+      -- NUL-terminated string instead.
+      ffi.C.rspamd_str_lc(scratch, #input)
+      assert_equal(ffi.string(scratch), expected)
+    end)
+  end
+
+  -- Each entry is {raw bytes, expected output of rspamd_str_make_utf_valid}.
+  cases = {
+    {'тест', 'тест'},
+    {'\200\213\202', '���'},
+    {'тест\200\213\202test', 'тест���test'},
+    {'\200\213\202test', '���test'},
+    {'\200\213\202test\200\213\202', '���test���'},
+    {'тест\200\213\202test\200\213\202', 'тест���test���'},
+    {'тест\200\213\202test\200\213\202тест', 'тест���test���тест'},
+  }
+
+  local NULL = ffi.new 'void*'
+
+  -- Renders every byte of a string as two uppercase hex digits.
+  local function to_hex(bytes)
+    return (bytes:gsub('.', function (ch)
+      return string.format('%02X', string.byte(ch))
+    end))
+  end
+
+  for idx, pair in ipairs(cases) do
+    test("Unicode make valid " .. tostring(idx), function()
+      local raw, expected = pair[1], pair[2]
+      local scratch = ffi.new("char[?]", #raw + 1)
+      ffi.copy(scratch, raw)
+
+      -- NOTE(review): the returned char* is read as a NUL-terminated string and
+      -- never released here; ownership is not visible from this file - confirm.
+      local fixed = ffi.string(ffi.C.rspamd_str_make_utf_valid(scratch, #raw, NULL, NULL))
+      -- Dump both sides as hex so a mismatch is readable in the test output.
+      print(to_hex(fixed))
+      print(to_hex(expected))
+      assert_equal(fixed, expected)
+    end)
+  end
+
+  -- Enable sse and avx2
+  ffi.C.rspamd_fast_utf8_library_init(3)
+
+  -- Samples for which rspamd_fast_utf8_validate must return 0.
+  -- Also reused as filler material by the optional speed tests.
+  local valid_cases = {
+    "a",
+    "\xc3\xb1",
+    "\xe2\x82\xa1",
+    "\xf0\x90\x8c\xbc",
+    "안녕하세요, 세상"
+  }
+  for idx = 1, #valid_cases do
+    local sample = valid_cases[idx]
+    test("Unicode validate success: " .. tostring(idx), function()
+      local nbytes = #sample
+      -- One spare byte for the NUL terminator that ffi.copy appends.
+      local scratch = ffi.new("char[?]", nbytes + 1)
+      ffi.copy(scratch, sample)
+
+      assert_equal(ffi.C.rspamd_fast_utf8_validate(scratch, nbytes), 0)
+    end)
+  end
+  -- Samples for which rspamd_fast_utf8_validate must return a non-zero value.
+  -- Also appended to the filler material of the optional speed tests.
+  local invalid_cases = {
+    "\xc3\x28",
+    "\xa0\xa1",
+    "\xe2\x28\xa1",
+    "\xe2\x82\x28",
+    "\xf0\x28\x8c\xbc",
+    "\xf0\x90\x28\xbc",
+    "\xf0\x28\x8c\x28",
+    "\xc0\x9f",
+    "\xf5\xff\xff\xff",
+    "\xed\xa0\x81",
+    "\xf8\x90\x80\x80\x80",
+    "123456789012345\xed",
+    "123456789012345\xf1",
+    "123456789012345\xc2",
+    "\xC2\x7F"
+  }
+  for idx = 1, #invalid_cases do
+    local sample = invalid_cases[idx]
+    test("Unicode validate fail: " .. tostring(idx), function()
+      local nbytes = #sample
+      -- One spare byte for the NUL terminator that ffi.copy appends.
+      local scratch = ffi.new("char[?]", nbytes + 1)
+      ffi.copy(scratch, sample)
+
+      assert_not_equal(ffi.C.rspamd_fast_utf8_validate(scratch, nbytes), 0)
+    end)
+  end
+
+ if os.getenv("RSPAMD_LUA_EXPENSIVE_TESTS") then
+ local speed_iters = 10000
+    --- Times one UTF-8 validator implementation over a synthetic buffer.
+    -- The buffer is filled from the concatenated `valid_cases` (plus
+    -- `invalid_cases` when `is_valid` is false) and validated `speed_iters`
+    -- times; the accumulated tick count is reported through rspamd_logger.
+    -- @param buflen size in bytes of the buffer passed to the validator
+    -- @param is_valid true to use only `valid_cases` as filler material
+    -- @param impl 'ref' or 'sse'; any other value selects the avx2 routine
+    -- @return always 0
+    local function test_size(buflen, is_valid, impl)
+      local logger = require "rspamd_logger"
+      local C = ffi.C
+      local test_str
+      if is_valid then
+        test_str = table.concat(valid_cases)
+      else
+        test_str = table.concat(valid_cases) .. table.concat(invalid_cases)
+      end
+
+      local buf = ffi.new("char[?]", buflen)
+      if #test_str < buflen then
+        -- Repeat the sample. `len` starts at one sample length, so the loop
+        -- stops one copy early and the repeated string is always shorter
+        -- than `buflen`; the tail of `buf` keeps its initial contents.
+        local t = {}
+        local len = #test_str
+        while len < buflen do
+          t[#t + 1] = test_str
+          len = len + #test_str
+        end
+        test_str = table.concat(t)
+      end
+
+      -- ffi.copy(dst, str) without a length also writes the terminating NUL
+      -- (#str + 1 bytes). When the sample is at least `buflen` bytes long that
+      -- lands one byte past the end of `buf`, so pass the byte count explicitly.
+      local payload = test_str:sub(1, buflen)
+      ffi.copy(buf, payload, #payload)
+
+      local tm = 0
+
+      for _ = 1, speed_iters do
+        local t1 = C.rspamd_get_ticks(1)
+        -- Calls stay on the ffi.C namespace (not cached in a local) so that
+        -- they remain direct C calls.
+        if impl == 'ref' then
+          C.rspamd_fast_utf8_validate_ref(buf, buflen)
+        elseif impl == 'sse' then
+          C.rspamd_fast_utf8_validate_sse41(buf, buflen)
+        else
+          C.rspamd_fast_utf8_validate_avx2(buf, buflen)
+        end
+        local t2 = C.rspamd_get_ticks(1)
+        tm = tm + (t2 - t1)
+      end
+
+      logger.messagex("%s utf8 %s check (valid = %s): %s ticks per iter, %s ticks per byte",
+          impl, buflen, is_valid,
+          tm / speed_iters, tm / speed_iters / buflen)
+
+      return 0
+    end
+
+    -- Registers a single benchmark test for one implementation, buffer size
+    -- and input flavour; the test only checks that test_size reports 0.
+    local function register_speed_test(impl, sz, is_valid)
+      local flavour = 'invalid'
+      if is_valid then
+        flavour = 'valid'
+      end
+      test(string.format("Utf8 test %s %d buffer, %s", impl, sz, flavour), function()
+        local res = test_size(sz, is_valid, impl)
+        assert_equal(res, 0)
+      end)
+    end
+
+    for _, sz in ipairs({78, 512, 65535}) do
+      -- The reference routine is always exercised; the sse and avx2 variants
+      -- are only registered when LuaJIT reports an x64 build.
+      local impls = {'ref'}
+      if jit.arch == 'x64' then
+        impls[#impls + 1] = 'sse'
+        impls[#impls + 1] = 'avx2'
+      end
+
+      for _, impl in ipairs(impls) do
+        register_speed_test(impl, sz, true)
+        register_speed_test(impl, sz, false)
+      end
+    end
+ end
+
+end) \ No newline at end of file