diff options
Diffstat (limited to 'test/lua/unit/utf.lua')
-rw-r--r-- | test/lua/unit/utf.lua | 207 |
1 files changed, 207 insertions, 0 deletions
diff --git a/test/lua/unit/utf.lua b/test/lua/unit/utf.lua new file mode 100644 index 0000000..dbdab7f --- /dev/null +++ b/test/lua/unit/utf.lua @@ -0,0 +1,207 @@ +-- Test utf routines + +context("UTF8 check functions", function() + local ffi = require("ffi") + ffi.cdef[[ + unsigned int rspamd_str_lc_utf8 (char *str, unsigned int size); + unsigned int rspamd_str_lc (char *str, unsigned int size); + void rspamd_fast_utf8_library_init (unsigned flags); + void ottery_rand_bytes(void *buf, size_t n); + double rspamd_get_ticks(int allow); + size_t rspamd_fast_utf8_validate (const unsigned char *data, size_t len); + size_t rspamd_fast_utf8_validate_ref (const unsigned char *data, size_t len); + size_t rspamd_fast_utf8_validate_sse41 (const unsigned char *data, size_t len); + size_t rspamd_fast_utf8_validate_avx2 (const unsigned char *data, size_t len); + char * rspamd_str_make_utf_valid (const char *src, size_t slen, size_t *dstlen, void *); + ]] + + local cases = { + {"АбЫрвАлг", "абырвалг"}, + {"АAБBвc", "аaбbвc"}, + --{"STRASSE", "straße"}, XXX: NYI + {"KEÇİ", "keçi"}, + } + + for i,c in ipairs(cases) do + test("UTF lowercase " .. tostring(i), function() + local buf = ffi.new("char[?]", #c[1] + 1) + ffi.copy(buf, c[1]) + local nlen = ffi.C.rspamd_str_lc_utf8(buf, #c[1]) + local s = ffi.string(buf, nlen) + assert_equal(s, c[2]) + end) + end + + cases = { + {"AbCdEf", "abcdef"}, + {"A", "a"}, + {"AaAa", "aaaa"}, + {"AaAaAaAa", "aaaaaaaa"} + } + + for i,c in ipairs(cases) do + test("ASCII lowercase " .. tostring(i), function() + local buf = ffi.new("char[?]", #c[1] + 1) + ffi.copy(buf, c[1]) + ffi.C.rspamd_str_lc(buf, #c[1]) + local s = ffi.string(buf) + assert_equal(s, c[2]) + end) + end + + cases = { + {'тест', 'тест'}, + {'\200\213\202', '���'}, + {'тест\200\213\202test', 'тест���test'}, + {'\200\213\202test', '���test'}, + {'\200\213\202test\200\213\202', '���test���'}, + {'тест\200\213\202test\200\213\202', 'тест���test���'}, + {'тест\200\213\202test\200\213\202тест', 'тест���test���тест'}, + } + + local NULL = ffi.new 'void*' + for i,c in ipairs(cases) do + test("Unicode make valid " .. tostring(i), function() + local buf = ffi.new("char[?]", #c[1] + 1) + ffi.copy(buf, c[1]) + + local s = ffi.string(ffi.C.rspamd_str_make_utf_valid(buf, #c[1], NULL, NULL)) + local function to_hex(s) + return (s:gsub('.', function (c) + return string.format('%02X', string.byte(c)) + end)) + end + print(to_hex(s)) + print(to_hex(c[2])) + assert_equal(s, c[2]) + end) + end + + -- Enable sse and avx2 + ffi.C.rspamd_fast_utf8_library_init(3) + local valid_cases = { + "a", + "\xc3\xb1", + "\xe2\x82\xa1", + "\xf0\x90\x8c\xbc", + "안녕하세요, 세상" + } + for i,c in ipairs(valid_cases) do + test("Unicode validate success: " .. tostring(i), function() + local buf = ffi.new("char[?]", #c + 1) + ffi.copy(buf, c) + + local ret = ffi.C.rspamd_fast_utf8_validate(buf, #c) + assert_equal(ret, 0) + end) + end + local invalid_cases = { + "\xc3\x28", + "\xa0\xa1", + "\xe2\x28\xa1", + "\xe2\x82\x28", + "\xf0\x28\x8c\xbc", + "\xf0\x90\x28\xbc", + "\xf0\x28\x8c\x28", + "\xc0\x9f", + "\xf5\xff\xff\xff", + "\xed\xa0\x81", + "\xf8\x90\x80\x80\x80", + "123456789012345\xed", + "123456789012345\xf1", + "123456789012345\xc2", + "\xC2\x7F" + } + for i,c in ipairs(invalid_cases) do + test("Unicode validate fail: " .. tostring(i), function() + local buf = ffi.new("char[?]", #c + 1) + ffi.copy(buf, c) + + local ret = ffi.C.rspamd_fast_utf8_validate(buf, #c) + assert_not_equal(ret, 0) + end) + end + + if os.getenv("RSPAMD_LUA_EXPENSIVE_TESTS") then + local speed_iters = 10000 + local function test_size(buflen, is_valid, impl) + local logger = require "rspamd_logger" + local test_str + if is_valid then + test_str = table.concat(valid_cases) + else + test_str = table.concat(valid_cases) .. table.concat(invalid_cases) + end + + local buf = ffi.new("char[?]", buflen) + if #test_str < buflen then + local t = {} + local len = #test_str + while len < buflen do + t[#t + 1] = test_str + len = len + #test_str + end + test_str = table.concat(t) + end + ffi.copy(buf, test_str:sub(1, buflen)) + + local tm = 0 + + for _=1,speed_iters do + if impl == 'ref' then + local t1 = ffi.C.rspamd_get_ticks(1) + ffi.C.rspamd_fast_utf8_validate_ref(buf, buflen) + local t2 = ffi.C.rspamd_get_ticks(1) + tm = tm + (t2 - t1) + elseif impl == 'sse' then + local t1 = ffi.C.rspamd_get_ticks(1) + ffi.C.rspamd_fast_utf8_validate_sse41(buf, buflen) + local t2 = ffi.C.rspamd_get_ticks(1) + tm = tm + (t2 - t1) + else + local t1 = ffi.C.rspamd_get_ticks(1) + ffi.C.rspamd_fast_utf8_validate_avx2(buf, buflen) + local t2 = ffi.C.rspamd_get_ticks(1) + tm = tm + (t2 - t1) + end + end + + logger.messagex("%s utf8 %s check (valid = %s): %s ticks per iter, %s ticks per byte", + impl, buflen, is_valid, + tm / speed_iters, tm / speed_iters / buflen) + + return 0 + end + + for _,sz in ipairs({78, 512, 65535}) do + test(string.format("Utf8 test %s %d buffer, %s", 'ref', sz, 'valid'), function() + local res = test_size(sz, true, 'ref') + assert_equal(res, 0) + end) + test(string.format("Utf8 test %s %d buffer, %s", 'ref', sz, 'invalid'), function() + local res = test_size(sz, false, 'ref') + assert_equal(res, 0) + end) + + if jit.arch == 'x64' then + test(string.format("Utf8 test %s %d buffer, %s", 'sse', sz, 'valid'), function() + local res = test_size(sz, true, 'sse') + assert_equal(res, 0) + end) + test(string.format("Utf8 test %s %d buffer, %s", 'sse', sz, 'invalid'), function() + local res = test_size(sz, false, 'sse') + assert_equal(res, 0) + end) + test(string.format("Utf8 test %s %d buffer, %s", 'avx2', sz, 'valid'), function() + local res = test_size(sz, true, 'avx2') + assert_equal(res, 0) + end) + test(string.format("Utf8 test %s %d buffer, %s", 'avx2', sz, 'invalid'), function() + local res = test_size(sz, false, 'avx2') + assert_equal(res, 0) + end) + end + end + end + +end)
\ No newline at end of file |