diff options
Diffstat (limited to '')
-rw-r--r-- | src/plugins/chartable.cxx | 2122 |
1 files changed, 2122 insertions, 0 deletions
diff --git a/src/plugins/chartable.cxx b/src/plugins/chartable.cxx new file mode 100644 index 0000000..704f12a --- /dev/null +++ b/src/plugins/chartable.cxx @@ -0,0 +1,2122 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/***MODULE:chartable + * rspamd module that marks messages whose words mix characters of different scripts + * + * Allowed options: + * - symbol (string): symbol to insert (default: 'R_MIXED_CHARSET') + * - url_symbol (string): symbol registered for the URL check (default: 'R_MIXED_CHARSET_URL') + * - threshold (double): the symbol is inserted when the per-word badness, averaged over the words of a text part, + * is strictly greater than this value, default: 0.1 + * - max_word_len (integer): words longer than this value get zero badness, default: 10 + */ + + +#include "config.h" +#include "libmime/message.h" +#include "rspamd.h" +#include "libstat/stat_api.h" +#include "libmime/lang_detection.h" + +#include "unicode/utf8.h" +#include "unicode/uchar.h" +#include "contrib/ankerl/unordered_dense.h" + +#define DEFAULT_SYMBOL "R_MIXED_CHARSET" +#define DEFAULT_URL_SYMBOL "R_MIXED_CHARSET_URL" +#define DEFAULT_THRESHOLD 0.1 + +#define msg_debug_chartable(...) 
rspamd_conditional_debug_fast(nullptr, task->from_addr, \ + rspamd_chartable_log_id, "chartable", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(chartable) + +/* Initialization */ +gint chartable_module_init(struct rspamd_config *cfg, struct module_ctx **ctx); + +gint chartable_module_config(struct rspamd_config *cfg, bool validate); + +gint chartable_module_reconfig(struct rspamd_config *cfg); + +module_t chartable_module = { + "chartable", + chartable_module_init, + chartable_module_config, + chartable_module_reconfig, + nullptr, + RSPAMD_MODULE_VER, + (guint) -1, +}; + +struct chartable_ctx { + struct module_ctx ctx; + const gchar *symbol; + const gchar *url_symbol; + double threshold; + guint max_word_len; +}; + +static inline struct chartable_ctx * +chartable_get_context(struct rspamd_config *cfg) +{ + return (struct chartable_ctx *) g_ptr_array_index(cfg->c_modules, + chartable_module.ctx_offset); +} + +static void chartable_symbol_callback(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item, + void *unused); + +static void chartable_url_symbol_callback(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item, + void *unused); + +gint chartable_module_init(struct rspamd_config *cfg, struct module_ctx **ctx) +{ + struct chartable_ctx *chartable_module_ctx; + + chartable_module_ctx = rspamd_mempool_alloc0_type(cfg->cfg_pool, + struct chartable_ctx); + chartable_module_ctx->max_word_len = 10; + + *ctx = (struct module_ctx *) chartable_module_ctx; + + return 0; +} + + +gint chartable_module_config(struct rspamd_config *cfg, bool _) +{ + const ucl_object_t *value; + gint res = TRUE; + struct chartable_ctx *chartable_module_ctx = chartable_get_context(cfg); + + if (!rspamd_config_is_module_enabled(cfg, "chartable")) { + return TRUE; + } + + if ((value = + rspamd_config_get_module_opt(cfg, "chartable", "symbol")) != nullptr) { + chartable_module_ctx->symbol = ucl_obj_tostring(value); + } + else { + 
chartable_module_ctx->symbol = DEFAULT_SYMBOL; + } + if ((value = + rspamd_config_get_module_opt(cfg, "chartable", "url_symbol")) != nullptr) { + chartable_module_ctx->url_symbol = ucl_obj_tostring(value); + } + else { + chartable_module_ctx->url_symbol = DEFAULT_URL_SYMBOL; + } + if ((value = + rspamd_config_get_module_opt(cfg, "chartable", "threshold")) != nullptr) { + if (!ucl_obj_todouble_safe(value, &chartable_module_ctx->threshold)) { + msg_warn_config("invalid numeric value"); + chartable_module_ctx->threshold = DEFAULT_THRESHOLD; + } + } + else { + chartable_module_ctx->threshold = DEFAULT_THRESHOLD; + } + if ((value = + rspamd_config_get_module_opt(cfg, "chartable", "max_word_len")) != nullptr) { + chartable_module_ctx->max_word_len = ucl_object_toint(value); + } + else { + chartable_module_ctx->threshold = DEFAULT_THRESHOLD; + } + + rspamd_symcache_add_symbol(cfg->cache, + chartable_module_ctx->symbol, + 0, + chartable_symbol_callback, + nullptr, + SYMBOL_TYPE_NORMAL, + -1); + rspamd_symcache_add_symbol(cfg->cache, + chartable_module_ctx->url_symbol, + 0, + chartable_url_symbol_callback, + nullptr, + SYMBOL_TYPE_NORMAL, + -1); + + msg_info_config("init internal chartable module"); + + return res; +} + +gint chartable_module_reconfig(struct rspamd_config *cfg) +{ + return chartable_module_config(cfg, false); +} + +static const auto latin_confusable = ankerl::unordered_dense::set<int>{ + 0x02028, + 0x02029, + 0x01680, + 0x02000, + 0x02001, + 0x02002, + 0x02003, + 0x02004, + 0x02005, + 0x02006, + 0x02008, + 0x02009, + 0x0200a, + 0x0205f, + 0x000a0, + 0x02007, + 0x0202f, + 0x007fa, + 0x0fe4d, + 0x0fe4e, + 0x0fe4f, + 0x02010, + 0x02011, + 0x02012, + 0x02013, + 0x0fe58, + 0x006d4, + 0x02043, + 0x002d7, + 0x02212, + 0x02796, + 0x02cba, + 0x0060d, + 0x0066b, + 0x0201a, + 0x000b8, + 0x0a4f9, + 0x0037e, + 0x00903, + 0x00a83, + 0x0ff1a, + 0x00589, + 0x00703, + 0x00704, + 0x016ec, + 0x0fe30, + 0x01803, + 0x01809, + 0x0205a, + 0x005c3, + 0x002f8, + 0x0a789, + 0x02236, 
+ 0x002d0, + 0x0a4fd, + 0x0ff01, + 0x001c3, + 0x02d51, + 0x00294, + 0x00241, + 0x0097d, + 0x013ae, + 0x0a6eb, + 0x1d16d, + 0x02024, + 0x00701, + 0x00702, + 0x0a60e, + 0x10a50, + 0x00660, + 0x006f0, + 0x0a4f8, + 0x0055d, + 0x0ff07, + 0x02018, + 0x02019, + 0x0201b, + 0x02032, + 0x02035, + 0x0055a, + 0x005f3, + 0x00060, + 0x01fef, + 0x0ff40, + 0x000b4, + 0x00384, + 0x01ffd, + 0x01fbd, + 0x01fbf, + 0x01ffe, + 0x002b9, + 0x00374, + 0x002c8, + 0x002ca, + 0x002cb, + 0x002f4, + 0x002bb, + 0x002bd, + 0x002bc, + 0x002be, + 0x0a78c, + 0x005d9, + 0x007f4, + 0x007f5, + 0x0144a, + 0x016cc, + 0x16f51, + 0x16f52, + 0x0ff3b, + 0x02768, + 0x02772, + 0x03014, + 0x0fd3e, + 0x0ff3d, + 0x02769, + 0x02773, + 0x03015, + 0x0fd3f, + 0x02774, + 0x1d114, + 0x02775, + 0x0204e, + 0x0066d, + 0x02217, + 0x1031f, + 0x01735, + 0x02041, + 0x02215, + 0x02044, + 0x02571, + 0x027cb, + 0x029f8, + 0x1d23a, + 0x031d3, + 0x03033, + 0x02cc6, + 0x030ce, + 0x04e3f, + 0x02f03, + 0x0ff3c, + 0x0fe68, + 0x02216, + 0x027cd, + 0x029f5, + 0x029f9, + 0x1d20f, + 0x1d23b, + 0x031d4, + 0x04e36, + 0x02f02, + 0x0a778, + 0x002c4, + 0x002c6, + 0x016ed, + 0x02795, + 0x1029b, + 0x02039, + 0x0276e, + 0x002c2, + 0x1d236, + 0x01438, + 0x016b2, + 0x01400, + 0x02e40, + 0x030a0, + 0x0a4ff, + 0x0203a, + 0x0276f, + 0x002c3, + 0x1d237, + 0x01433, + 0x16f3f, + 0x02053, + 0x002dc, + 0x01fc0, + 0x0223c, + 0x1d7d0, + 0x1d7da, + 0x1d7e4, + 0x1d7ee, + 0x1d7f8, + 0x0a75a, + 0x001a7, + 0x003e8, + 0x0a644, + 0x014bf, + 0x0a6ef, + 0x1d206, + 0x1d7d1, + 0x1d7db, + 0x1d7e5, + 0x1d7ef, + 0x1d7f9, + 0x0a7ab, + 0x0021c, + 0x001b7, + 0x0a76a, + 0x02ccc, + 0x00417, + 0x004e0, + 0x16f3b, + 0x118ca, + 0x1d7d2, + 0x1d7dc, + 0x1d7e6, + 0x1d7f0, + 0x1d7fa, + 0x013ce, + 0x118af, + 0x1d7d3, + 0x1d7dd, + 0x1d7e7, + 0x1d7f1, + 0x1d7fb, + 0x001bc, + 0x118bb, + 0x1d7d4, + 0x1d7de, + 0x1d7e8, + 0x1d7f2, + 0x1d7fc, + 0x02cd2, + 0x00431, + 0x013ee, + 0x118d5, + 0x1d212, + 0x1d7d5, + 0x1d7df, + 0x1d7e9, + 0x1d7f3, + 0x1d7fd, + 0x104d2, + 0x118c6, + 0x00b03, + 
0x009ea, + 0x00a6a, + 0x1e8cb, + 0x1d7d6, + 0x1d7e0, + 0x1d7ea, + 0x1d7f4, + 0x1d7fe, + 0x00223, + 0x00222, + 0x1031a, + 0x00a67, + 0x00b68, + 0x009ed, + 0x00d6d, + 0x1d7d7, + 0x1d7e1, + 0x1d7eb, + 0x1d7f5, + 0x1d7ff, + 0x0a76e, + 0x02cca, + 0x118cc, + 0x118ac, + 0x118d6, + 0x0237a, + 0x0ff41, + 0x1d41a, + 0x1d44e, + 0x1d482, + 0x1d4b6, + 0x1d4ea, + 0x1d51e, + 0x1d552, + 0x1d586, + 0x1d5ba, + 0x1d5ee, + 0x1d622, + 0x1d656, + 0x1d68a, + 0x00251, + 0x003b1, + 0x1d6c2, + 0x1d6fc, + 0x1d736, + 0x1d770, + 0x1d7aa, + 0x00430, + 0x0ff21, + 0x1d400, + 0x1d434, + 0x1d468, + 0x1d49c, + 0x1d4d0, + 0x1d504, + 0x1d538, + 0x1d56c, + 0x1d5a0, + 0x1d5d4, + 0x1d608, + 0x1d63c, + 0x1d670, + 0x00391, + 0x1d6a8, + 0x1d6e2, + 0x1d71c, + 0x1d756, + 0x1d790, + 0x00410, + 0x013aa, + 0x015c5, + 0x0a4ee, + 0x16f40, + 0x102a0, + 0x1d41b, + 0x1d44f, + 0x1d483, + 0x1d4b7, + 0x1d4eb, + 0x1d51f, + 0x1d553, + 0x1d587, + 0x1d5bb, + 0x1d5ef, + 0x1d623, + 0x1d657, + 0x1d68b, + 0x00184, + 0x0042c, + 0x013cf, + 0x015af, + 0x0ff22, + 0x0212c, + 0x1d401, + 0x1d435, + 0x1d469, + 0x1d4d1, + 0x1d505, + 0x1d539, + 0x1d56d, + 0x1d5a1, + 0x1d5d5, + 0x1d609, + 0x1d63d, + 0x1d671, + 0x0a7b4, + 0x00392, + 0x1d6a9, + 0x1d6e3, + 0x1d71d, + 0x1d757, + 0x1d791, + 0x00412, + 0x013f4, + 0x015f7, + 0x0a4d0, + 0x10282, + 0x102a1, + 0x10301, + 0x0ff43, + 0x0217d, + 0x1d41c, + 0x1d450, + 0x1d484, + 0x1d4b8, + 0x1d4ec, + 0x1d520, + 0x1d554, + 0x1d588, + 0x1d5bc, + 0x1d5f0, + 0x1d624, + 0x1d658, + 0x1d68c, + 0x01d04, + 0x003f2, + 0x02ca5, + 0x00441, + 0x0abaf, + 0x1043d, + 0x1f74c, + 0x118f2, + 0x118e9, + 0x0ff23, + 0x0216d, + 0x02102, + 0x0212d, + 0x1d402, + 0x1d436, + 0x1d46a, + 0x1d49e, + 0x1d4d2, + 0x1d56e, + 0x1d5a2, + 0x1d5d6, + 0x1d60a, + 0x1d63e, + 0x1d672, + 0x003f9, + 0x02ca4, + 0x00421, + 0x013df, + 0x0a4da, + 0x102a2, + 0x10302, + 0x10415, + 0x1051c, + 0x0217e, + 0x02146, + 0x1d41d, + 0x1d451, + 0x1d485, + 0x1d4b9, + 0x1d4ed, + 0x1d521, + 0x1d555, + 0x1d589, + 0x1d5bd, + 0x1d5f1, + 0x1d625, + 0x1d659, + 0x1d68d, 
+ 0x00501, + 0x013e7, + 0x0146f, + 0x0a4d2, + 0x0216e, + 0x02145, + 0x1d403, + 0x1d437, + 0x1d46b, + 0x1d49f, + 0x1d4d3, + 0x1d507, + 0x1d53b, + 0x1d56f, + 0x1d5a3, + 0x1d5d7, + 0x1d60b, + 0x1d63f, + 0x1d673, + 0x013a0, + 0x015de, + 0x015ea, + 0x0a4d3, + 0x0212e, + 0x0ff45, + 0x0212f, + 0x02147, + 0x1d41e, + 0x1d452, + 0x1d486, + 0x1d4ee, + 0x1d522, + 0x1d556, + 0x1d58a, + 0x1d5be, + 0x1d5f2, + 0x1d626, + 0x1d65a, + 0x1d68e, + 0x0ab32, + 0x00435, + 0x004bd, + 0x022ff, + 0x0ff25, + 0x02130, + 0x1d404, + 0x1d438, + 0x1d46c, + 0x1d4d4, + 0x1d508, + 0x1d53c, + 0x1d570, + 0x1d5a4, + 0x1d5d8, + 0x1d60c, + 0x1d640, + 0x1d674, + 0x00395, + 0x1d6ac, + 0x1d6e6, + 0x1d720, + 0x1d75a, + 0x1d794, + 0x00415, + 0x02d39, + 0x013ac, + 0x0a4f0, + 0x118a6, + 0x118ae, + 0x10286, + 0x1d41f, + 0x1d453, + 0x1d487, + 0x1d4bb, + 0x1d4ef, + 0x1d523, + 0x1d557, + 0x1d58b, + 0x1d5bf, + 0x1d5f3, + 0x1d627, + 0x1d65b, + 0x1d68f, + 0x0ab35, + 0x0a799, + 0x0017f, + 0x01e9d, + 0x00584, + 0x1d213, + 0x02131, + 0x1d405, + 0x1d439, + 0x1d46d, + 0x1d4d5, + 0x1d509, + 0x1d53d, + 0x1d571, + 0x1d5a5, + 0x1d5d9, + 0x1d60d, + 0x1d641, + 0x1d675, + 0x0a798, + 0x003dc, + 0x1d7ca, + 0x015b4, + 0x0a4dd, + 0x118c2, + 0x118a2, + 0x10287, + 0x102a5, + 0x10525, + 0x0ff47, + 0x0210a, + 0x1d420, + 0x1d454, + 0x1d488, + 0x1d4f0, + 0x1d524, + 0x1d558, + 0x1d58c, + 0x1d5c0, + 0x1d5f4, + 0x1d628, + 0x1d65c, + 0x1d690, + 0x00261, + 0x01d83, + 0x0018d, + 0x00581, + 0x1d406, + 0x1d43a, + 0x1d46e, + 0x1d4a2, + 0x1d4d6, + 0x1d50a, + 0x1d53e, + 0x1d572, + 0x1d5a6, + 0x1d5da, + 0x1d60e, + 0x1d642, + 0x1d676, + 0x0050c, + 0x013c0, + 0x013f3, + 0x0a4d6, + 0x0ff48, + 0x0210e, + 0x1d421, + 0x1d489, + 0x1d4bd, + 0x1d4f1, + 0x1d525, + 0x1d559, + 0x1d58d, + 0x1d5c1, + 0x1d5f5, + 0x1d629, + 0x1d65d, + 0x1d691, + 0x004bb, + 0x00570, + 0x013c2, + 0x0ff28, + 0x0210b, + 0x0210c, + 0x0210d, + 0x1d407, + 0x1d43b, + 0x1d46f, + 0x1d4d7, + 0x1d573, + 0x1d5a7, + 0x1d5db, + 0x1d60f, + 0x1d643, + 0x1d677, + 0x00397, + 0x1d6ae, + 0x1d6e8, + 
0x1d722, + 0x1d75c, + 0x1d796, + 0x02c8e, + 0x0041d, + 0x013bb, + 0x0157c, + 0x0a4e7, + 0x102cf, + 0x002db, + 0x02373, + 0x0ff49, + 0x02170, + 0x02139, + 0x02148, + 0x1d422, + 0x1d456, + 0x1d48a, + 0x1d4be, + 0x1d4f2, + 0x1d526, + 0x1d55a, + 0x1d58e, + 0x1d5c2, + 0x1d5f6, + 0x1d62a, + 0x1d65e, + 0x1d692, + 0x00131, + 0x1d6a4, + 0x0026a, + 0x00269, + 0x003b9, + 0x01fbe, + 0x0037a, + 0x1d6ca, + 0x1d704, + 0x1d73e, + 0x1d778, + 0x1d7b2, + 0x00456, + 0x0a647, + 0x004cf, + 0x0ab75, + 0x013a5, + 0x118c3, + 0x0ff4a, + 0x02149, + 0x1d423, + 0x1d457, + 0x1d48b, + 0x1d4bf, + 0x1d4f3, + 0x1d527, + 0x1d55b, + 0x1d58f, + 0x1d5c3, + 0x1d5f7, + 0x1d62b, + 0x1d65f, + 0x1d693, + 0x003f3, + 0x00458, + 0x0ff2a, + 0x1d409, + 0x1d43d, + 0x1d471, + 0x1d4a5, + 0x1d4d9, + 0x1d50d, + 0x1d541, + 0x1d575, + 0x1d5a9, + 0x1d5dd, + 0x1d611, + 0x1d645, + 0x1d679, + 0x0a7b2, + 0x0037f, + 0x00408, + 0x013ab, + 0x0148d, + 0x0a4d9, + 0x1d424, + 0x1d458, + 0x1d48c, + 0x1d4c0, + 0x1d4f4, + 0x1d528, + 0x1d55c, + 0x1d590, + 0x1d5c4, + 0x1d5f8, + 0x1d62c, + 0x1d660, + 0x1d694, + 0x0212a, + 0x0ff2b, + 0x1d40a, + 0x1d43e, + 0x1d472, + 0x1d4a6, + 0x1d4da, + 0x1d50e, + 0x1d542, + 0x1d576, + 0x1d5aa, + 0x1d5de, + 0x1d612, + 0x1d646, + 0x1d67a, + 0x0039a, + 0x1d6b1, + 0x1d6eb, + 0x1d725, + 0x1d75f, + 0x1d799, + 0x02c94, + 0x0041a, + 0x013e6, + 0x016d5, + 0x0a4d7, + 0x10518, + 0x005c0, + 0x0007c, + 0x02223, + 0x023fd, + 0x0ffe8, + 0x00031, + 0x00661, + 0x006f1, + 0x10320, + 0x1e8c7, + 0x1d7cf, + 0x1d7d9, + 0x1d7e3, + 0x1d7ed, + 0x1d7f7, + 0x00049, + 0x0ff29, + 0x02160, + 0x02110, + 0x02111, + 0x1d408, + 0x1d43c, + 0x1d470, + 0x1d4d8, + 0x1d540, + 0x1d574, + 0x1d5a8, + 0x1d5dc, + 0x1d610, + 0x1d644, + 0x1d678, + 0x00196, + 0x0ff4c, + 0x0217c, + 0x02113, + 0x1d425, + 0x1d459, + 0x1d48d, + 0x1d4c1, + 0x1d4f5, + 0x1d529, + 0x1d55d, + 0x1d591, + 0x1d5c5, + 0x1d5f9, + 0x1d62d, + 0x1d661, + 0x1d695, + 0x001c0, + 0x00399, + 0x1d6b0, + 0x1d6ea, + 0x1d724, + 0x1d75e, + 0x1d798, + 0x02c92, + 0x00406, + 0x004c0, + 0x005d5, 
+ 0x005df, + 0x00627, + 0x1ee00, + 0x1ee80, + 0x0fe8e, + 0x0fe8d, + 0x007ca, + 0x02d4f, + 0x016c1, + 0x0a4f2, + 0x16f28, + 0x1028a, + 0x10309, + 0x1d22a, + 0x0216c, + 0x02112, + 0x1d40b, + 0x1d43f, + 0x1d473, + 0x1d4db, + 0x1d50f, + 0x1d543, + 0x1d577, + 0x1d5ab, + 0x1d5df, + 0x1d613, + 0x1d647, + 0x1d67b, + 0x02cd0, + 0x013de, + 0x014aa, + 0x0a4e1, + 0x16f16, + 0x118a3, + 0x118b2, + 0x1041b, + 0x10526, + 0x0ff2d, + 0x0216f, + 0x02133, + 0x1d40c, + 0x1d440, + 0x1d474, + 0x1d4dc, + 0x1d510, + 0x1d544, + 0x1d578, + 0x1d5ac, + 0x1d5e0, + 0x1d614, + 0x1d648, + 0x1d67c, + 0x0039c, + 0x1d6b3, + 0x1d6ed, + 0x1d727, + 0x1d761, + 0x1d79b, + 0x003fa, + 0x02c98, + 0x0041c, + 0x013b7, + 0x015f0, + 0x016d6, + 0x0a4df, + 0x102b0, + 0x10311, + 0x1d427, + 0x1d45b, + 0x1d48f, + 0x1d4c3, + 0x1d4f7, + 0x1d52b, + 0x1d55f, + 0x1d593, + 0x1d5c7, + 0x1d5fb, + 0x1d62f, + 0x1d663, + 0x1d697, + 0x00578, + 0x0057c, + 0x0ff2e, + 0x02115, + 0x1d40d, + 0x1d441, + 0x1d475, + 0x1d4a9, + 0x1d4dd, + 0x1d511, + 0x1d579, + 0x1d5ad, + 0x1d5e1, + 0x1d615, + 0x1d649, + 0x1d67d, + 0x0039d, + 0x1d6b4, + 0x1d6ee, + 0x1d728, + 0x1d762, + 0x1d79c, + 0x02c9a, + 0x0a4e0, + 0x10513, + 0x00c02, + 0x00c82, + 0x00d02, + 0x00d82, + 0x00966, + 0x00a66, + 0x00ae6, + 0x00be6, + 0x00c66, + 0x00ce6, + 0x00d66, + 0x00e50, + 0x00ed0, + 0x01040, + 0x00665, + 0x006f5, + 0x0ff4f, + 0x02134, + 0x1d428, + 0x1d45c, + 0x1d490, + 0x1d4f8, + 0x1d52c, + 0x1d560, + 0x1d594, + 0x1d5c8, + 0x1d5fc, + 0x1d630, + 0x1d664, + 0x1d698, + 0x01d0f, + 0x01d11, + 0x0ab3d, + 0x003bf, + 0x1d6d0, + 0x1d70a, + 0x1d744, + 0x1d77e, + 0x1d7b8, + 0x003c3, + 0x1d6d4, + 0x1d70e, + 0x1d748, + 0x1d782, + 0x1d7bc, + 0x02c9f, + 0x0043e, + 0x010ff, + 0x00585, + 0x005e1, + 0x00647, + 0x1ee24, + 0x1ee64, + 0x1ee84, + 0x0feeb, + 0x0feec, + 0x0feea, + 0x0fee9, + 0x006be, + 0x0fbac, + 0x0fbad, + 0x0fbab, + 0x0fbaa, + 0x006c1, + 0x0fba8, + 0x0fba9, + 0x0fba7, + 0x0fba6, + 0x006d5, + 0x00d20, + 0x0101d, + 0x104ea, + 0x118c8, + 0x118d7, + 0x1042c, + 0x00030, + 
0x007c0, + 0x009e6, + 0x00b66, + 0x03007, + 0x114d0, + 0x118e0, + 0x1d7ce, + 0x1d7d8, + 0x1d7e2, + 0x1d7ec, + 0x1d7f6, + 0x0ff2f, + 0x1d40e, + 0x1d442, + 0x1d476, + 0x1d4aa, + 0x1d4de, + 0x1d512, + 0x1d546, + 0x1d57a, + 0x1d5ae, + 0x1d5e2, + 0x1d616, + 0x1d64a, + 0x1d67e, + 0x0039f, + 0x1d6b6, + 0x1d6f0, + 0x1d72a, + 0x1d764, + 0x1d79e, + 0x02c9e, + 0x0041e, + 0x00555, + 0x02d54, + 0x012d0, + 0x00b20, + 0x104c2, + 0x0a4f3, + 0x118b5, + 0x10292, + 0x102ab, + 0x10404, + 0x10516, + 0x02374, + 0x0ff50, + 0x1d429, + 0x1d45d, + 0x1d491, + 0x1d4c5, + 0x1d4f9, + 0x1d52d, + 0x1d561, + 0x1d595, + 0x1d5c9, + 0x1d5fd, + 0x1d631, + 0x1d665, + 0x1d699, + 0x003c1, + 0x003f1, + 0x1d6d2, + 0x1d6e0, + 0x1d70c, + 0x1d71a, + 0x1d746, + 0x1d754, + 0x1d780, + 0x1d78e, + 0x1d7ba, + 0x1d7c8, + 0x02ca3, + 0x00440, + 0x0ff30, + 0x02119, + 0x1d40f, + 0x1d443, + 0x1d477, + 0x1d4ab, + 0x1d4df, + 0x1d513, + 0x1d57b, + 0x1d5af, + 0x1d5e3, + 0x1d617, + 0x1d64b, + 0x1d67f, + 0x003a1, + 0x1d6b8, + 0x1d6f2, + 0x1d72c, + 0x1d766, + 0x1d7a0, + 0x02ca2, + 0x00420, + 0x013e2, + 0x0146d, + 0x0a4d1, + 0x10295, + 0x1d42a, + 0x1d45e, + 0x1d492, + 0x1d4c6, + 0x1d4fa, + 0x1d52e, + 0x1d562, + 0x1d596, + 0x1d5ca, + 0x1d5fe, + 0x1d632, + 0x1d666, + 0x1d69a, + 0x0051b, + 0x00563, + 0x00566, + 0x0211a, + 0x1d410, + 0x1d444, + 0x1d478, + 0x1d4ac, + 0x1d4e0, + 0x1d514, + 0x1d57c, + 0x1d5b0, + 0x1d5e4, + 0x1d618, + 0x1d64c, + 0x1d680, + 0x02d55, + 0x1d42b, + 0x1d45f, + 0x1d493, + 0x1d4c7, + 0x1d4fb, + 0x1d52f, + 0x1d563, + 0x1d597, + 0x1d5cb, + 0x1d5ff, + 0x1d633, + 0x1d667, + 0x1d69b, + 0x0ab47, + 0x0ab48, + 0x01d26, + 0x02c85, + 0x00433, + 0x0ab81, + 0x1d216, + 0x0211b, + 0x0211c, + 0x0211d, + 0x1d411, + 0x1d445, + 0x1d479, + 0x1d4e1, + 0x1d57d, + 0x1d5b1, + 0x1d5e5, + 0x1d619, + 0x1d64d, + 0x1d681, + 0x001a6, + 0x013a1, + 0x013d2, + 0x104b4, + 0x01587, + 0x0a4e3, + 0x16f35, + 0x0ff53, + 0x1d42c, + 0x1d460, + 0x1d494, + 0x1d4c8, + 0x1d4fc, + 0x1d530, + 0x1d564, + 0x1d598, + 0x1d5cc, + 0x1d600, + 0x1d634, + 0x1d668, 
+ 0x1d69c, + 0x0a731, + 0x001bd, + 0x00455, + 0x0abaa, + 0x118c1, + 0x10448, + 0x0ff33, + 0x1d412, + 0x1d446, + 0x1d47a, + 0x1d4ae, + 0x1d4e2, + 0x1d516, + 0x1d54a, + 0x1d57e, + 0x1d5b2, + 0x1d5e6, + 0x1d61a, + 0x1d64e, + 0x1d682, + 0x00405, + 0x0054f, + 0x013d5, + 0x013da, + 0x0a4e2, + 0x16f3a, + 0x10296, + 0x10420, + 0x1d42d, + 0x1d461, + 0x1d495, + 0x1d4c9, + 0x1d4fd, + 0x1d531, + 0x1d565, + 0x1d599, + 0x1d5cd, + 0x1d601, + 0x1d635, + 0x1d669, + 0x1d69d, + 0x022a4, + 0x027d9, + 0x1f768, + 0x0ff34, + 0x1d413, + 0x1d447, + 0x1d47b, + 0x1d4af, + 0x1d4e3, + 0x1d517, + 0x1d54b, + 0x1d57f, + 0x1d5b3, + 0x1d5e7, + 0x1d61b, + 0x1d64f, + 0x1d683, + 0x003a4, + 0x1d6bb, + 0x1d6f5, + 0x1d72f, + 0x1d769, + 0x1d7a3, + 0x02ca6, + 0x00422, + 0x013a2, + 0x0a4d4, + 0x16f0a, + 0x118bc, + 0x10297, + 0x102b1, + 0x10315, + 0x1d42e, + 0x1d462, + 0x1d496, + 0x1d4ca, + 0x1d4fe, + 0x1d532, + 0x1d566, + 0x1d59a, + 0x1d5ce, + 0x1d602, + 0x1d636, + 0x1d66a, + 0x1d69e, + 0x0a79f, + 0x01d1c, + 0x0ab4e, + 0x0ab52, + 0x0028b, + 0x003c5, + 0x1d6d6, + 0x1d710, + 0x1d74a, + 0x1d784, + 0x1d7be, + 0x0057d, + 0x104f6, + 0x118d8, + 0x0222a, + 0x022c3, + 0x1d414, + 0x1d448, + 0x1d47c, + 0x1d4b0, + 0x1d4e4, + 0x1d518, + 0x1d54c, + 0x1d580, + 0x1d5b4, + 0x1d5e8, + 0x1d61c, + 0x1d650, + 0x1d684, + 0x0054d, + 0x01200, + 0x104ce, + 0x0144c, + 0x0a4f4, + 0x16f42, + 0x118b8, + 0x02228, + 0x022c1, + 0x0ff56, + 0x02174, + 0x1d42f, + 0x1d463, + 0x1d497, + 0x1d4cb, + 0x1d4ff, + 0x1d533, + 0x1d567, + 0x1d59b, + 0x1d5cf, + 0x1d603, + 0x1d637, + 0x1d66b, + 0x1d69f, + 0x01d20, + 0x003bd, + 0x1d6ce, + 0x1d708, + 0x1d742, + 0x1d77c, + 0x1d7b6, + 0x00475, + 0x005d8, + 0x11706, + 0x0aba9, + 0x118c0, + 0x1d20d, + 0x00667, + 0x006f7, + 0x02164, + 0x1d415, + 0x1d449, + 0x1d47d, + 0x1d4b1, + 0x1d4e5, + 0x1d519, + 0x1d54d, + 0x1d581, + 0x1d5b5, + 0x1d5e9, + 0x1d61d, + 0x1d651, + 0x1d685, + 0x00474, + 0x02d38, + 0x013d9, + 0x0142f, + 0x0a6df, + 0x0a4e6, + 0x16f08, + 0x118a0, + 0x1051d, + 0x0026f, + 0x1d430, + 0x1d464, + 
0x1d498, + 0x1d4cc, + 0x1d500, + 0x1d534, + 0x1d568, + 0x1d59c, + 0x1d5d0, + 0x1d604, + 0x1d638, + 0x1d66c, + 0x1d6a0, + 0x01d21, + 0x00461, + 0x0051d, + 0x00561, + 0x1170a, + 0x1170e, + 0x1170f, + 0x0ab83, + 0x118ef, + 0x118e6, + 0x1d416, + 0x1d44a, + 0x1d47e, + 0x1d4b2, + 0x1d4e6, + 0x1d51a, + 0x1d54e, + 0x1d582, + 0x1d5b6, + 0x1d5ea, + 0x1d61e, + 0x1d652, + 0x1d686, + 0x0051c, + 0x013b3, + 0x013d4, + 0x0a4ea, + 0x0166e, + 0x000d7, + 0x0292b, + 0x0292c, + 0x02a2f, + 0x0ff58, + 0x02179, + 0x1d431, + 0x1d465, + 0x1d499, + 0x1d4cd, + 0x1d501, + 0x1d535, + 0x1d569, + 0x1d59d, + 0x1d5d1, + 0x1d605, + 0x1d639, + 0x1d66d, + 0x1d6a1, + 0x00445, + 0x01541, + 0x0157d, + 0x0166d, + 0x02573, + 0x10322, + 0x118ec, + 0x0ff38, + 0x02169, + 0x1d417, + 0x1d44b, + 0x1d47f, + 0x1d4b3, + 0x1d4e7, + 0x1d51b, + 0x1d54f, + 0x1d583, + 0x1d5b7, + 0x1d5eb, + 0x1d61f, + 0x1d653, + 0x1d687, + 0x0a7b3, + 0x003a7, + 0x1d6be, + 0x1d6f8, + 0x1d732, + 0x1d76c, + 0x1d7a6, + 0x02cac, + 0x00425, + 0x02d5d, + 0x016b7, + 0x0a4eb, + 0x10290, + 0x102b4, + 0x10317, + 0x10527, + 0x00263, + 0x01d8c, + 0x0ff59, + 0x1d432, + 0x1d466, + 0x1d49a, + 0x1d4ce, + 0x1d502, + 0x1d536, + 0x1d56a, + 0x1d59e, + 0x1d5d2, + 0x1d606, + 0x1d63a, + 0x1d66e, + 0x1d6a2, + 0x0028f, + 0x01eff, + 0x0ab5a, + 0x003b3, + 0x0213d, + 0x1d6c4, + 0x1d6fe, + 0x1d738, + 0x1d772, + 0x1d7ac, + 0x00443, + 0x004af, + 0x010e7, + 0x118dc, + 0x0ff39, + 0x1d418, + 0x1d44c, + 0x1d480, + 0x1d4b4, + 0x1d4e8, + 0x1d51c, + 0x1d550, + 0x1d584, + 0x1d5b8, + 0x1d5ec, + 0x1d620, + 0x1d654, + 0x1d688, + 0x003a5, + 0x003d2, + 0x1d6bc, + 0x1d6f6, + 0x1d730, + 0x1d76a, + 0x1d7a4, + 0x02ca8, + 0x00423, + 0x004ae, + 0x013a9, + 0x013bd, + 0x0a4ec, + 0x16f43, + 0x118a4, + 0x102b2, + 0x1d433, + 0x1d467, + 0x1d49b, + 0x1d4cf, + 0x1d503, + 0x1d537, + 0x1d56b, + 0x1d59f, + 0x1d5d3, + 0x1d607, + 0x1d63b, + 0x1d66f, + 0x1d6a3, + 0x01d22, + 0x0ab93, + 0x118c4, + 0x102f5, + 0x118e5, + 0x0ff3a, + 0x02124, + 0x02128, + 0x1d419, + 0x1d44d, + 0x1d481, + 0x1d4b5, + 0x1d4e9, 
+ 0x1d585, + 0x1d5b9, + 0x1d5ed, + 0x1d621, + 0x1d655, + 0x1d689, + 0x00396, + 0x1d6ad, + 0x1d6e7, + 0x1d721, + 0x1d75b, + 0x1d795, + 0x013c3, + 0x0a4dc, + 0x118a9, +}; + +static gboolean +rspamd_can_alias_latin(gint ch) +{ + return latin_confusable.contains(ch); +} + +static gdouble +rspamd_chartable_process_word_utf(struct rspamd_task *task, + rspamd_stat_token_t *w, + gboolean is_url, + guint *ncap, + struct chartable_ctx *chartable_module_ctx, + gboolean ignore_diacritics) +{ + const UChar32 *p, *end; + gdouble badness = 0.0; + UChar32 uc; + UBlockCode sc; + guint cat; + gint last_is_latin = -1; + guint same_script_count = 0, nsym = 0, nspecial = 0; + enum { + start_process = 0, + got_alpha, + got_digit, + got_unknown, + } state = start_process, + prev_state = start_process; + + p = w->unicode.begin; + end = p + w->unicode.len; + + /* We assume that w is normalized */ + + while (p < end) { + uc = *p++; + + if (((gint32) uc) < 0) { + break; + } + + sc = ublock_getCode(uc); + cat = u_charType(uc); + + if (!ignore_diacritics) { + if (cat == U_NON_SPACING_MARK || + (sc == UBLOCK_LATIN_1_SUPPLEMENT) || + (sc == UBLOCK_LATIN_EXTENDED_A) || + (sc == UBLOCK_LATIN_EXTENDED_ADDITIONAL) || + (sc == UBLOCK_LATIN_EXTENDED_B) || + (sc == UBLOCK_COMBINING_DIACRITICAL_MARKS)) { + nspecial++; + } + } + + if (u_isalpha(uc)) { + + if (sc <= UBLOCK_COMBINING_DIACRITICAL_MARKS || + sc == UBLOCK_LATIN_EXTENDED_ADDITIONAL) { + /* + * Assume all latin, IPA, diacritic and space modifiers + * characters as basic latin + */ + sc = UBLOCK_BASIC_LATIN; + } + + if (sc != UBLOCK_BASIC_LATIN && u_isupper(uc)) { + if (ncap) { + (*ncap)++; + } + } + + if (state == got_digit) { + /* Penalize digit -> alpha translations */ + if (!is_url && sc != UBLOCK_BASIC_LATIN && + prev_state != start_process) { + badness += 0.25; + } + } + else if (state == got_alpha) { + /* Check script */ + if (same_script_count > 0) { + if (sc != UBLOCK_BASIC_LATIN && last_is_latin) { + + if (rspamd_can_alias_latin(uc)) { 
+ badness += 1.0 / (gdouble) same_script_count; + } + + last_is_latin = 0; + same_script_count = 1; + } + else { + same_script_count++; + } + } + else { + last_is_latin = sc == UBLOCK_BASIC_LATIN; + same_script_count = 1; + } + } + + prev_state = state; + state = got_alpha; + } + else if (u_isdigit(uc)) { + if (state != got_digit) { + prev_state = state; + } + + state = got_digit; + same_script_count = 0; + } + else { + /* We don't care about unknown characters here */ + if (state != got_unknown) { + prev_state = state; + } + + state = got_unknown; + same_script_count = 0; + } + + nsym++; + } + + if (nspecial > 0) { + if (!ignore_diacritics) { + /* Count diacritics */ + badness += nspecial; + } + else if (nspecial > 1) { + badness += (nspecial - 1.0) / 2.0; + } + } + + /* Try to avoid FP for long words */ + if (nsym > chartable_module_ctx->max_word_len) { + badness = 0; + } + else { + if (badness > 4.0) { + badness = 4.0; + } + } + + msg_debug_chartable("word %*s, badness: %.2f", + (gint) w->normalized.len, w->normalized.begin, + badness); + + return badness; +} + +static gdouble +rspamd_chartable_process_word_ascii(struct rspamd_task *task, + rspamd_stat_token_t *w, + gboolean is_url, + struct chartable_ctx *chartable_module_ctx) +{ + gdouble badness = 0.0; + enum { + ascii = 1, + non_ascii + } sc, + last_sc; + gint same_script_count = 0, seen_alpha = FALSE; + enum { + start_process = 0, + got_alpha, + got_digit, + got_unknown, + } state = start_process; + + const auto *p = (const unsigned char *) w->normalized.begin; + const auto *end = p + w->normalized.len; + last_sc = non_ascii; + + if (w->normalized.len > chartable_module_ctx->max_word_len) { + return 0.0; + } + + /* We assume that w is normalized */ + while (p < end) { + if (g_ascii_isalpha(*p) || *p > 0x7f) { + + if (state == got_digit) { + /* Penalize digit -> alpha translations */ + if (seen_alpha && !is_url && !g_ascii_isxdigit(*p)) { + badness += 0.25; + } + } + else if (state == got_alpha) { + /* Check 
script */ + sc = (*p > 0x7f) ? ascii : non_ascii; + + if (same_script_count > 0) { + if (sc != last_sc) { + badness += 1.0 / (gdouble) same_script_count; + last_sc = sc; + same_script_count = 1; + } + else { + same_script_count++; + } + } + else { + last_sc = sc; + same_script_count = 1; + } + } + + seen_alpha = TRUE; + state = got_alpha; + } + else if (g_ascii_isdigit(*p)) { + state = got_digit; + same_script_count = 0; + } + else { + /* We don't care about unknown characters here */ + state = got_unknown; + same_script_count = 0; + } + + p++; + } + + if (badness > 4.0) { + badness = 4.0; + } + + msg_debug_chartable("word %*s, badness: %.2f", + (gint) w->normalized.len, w->normalized.begin, + badness); + + return badness; +} + +static gboolean +rspamd_chartable_process_part(struct rspamd_task *task, + struct rspamd_mime_text_part *part, + struct chartable_ctx *chartable_module_ctx, + gboolean ignore_diacritics) +{ + rspamd_stat_token_t *w; + guint i, ncap = 0; + gdouble cur_score = 0.0; + + if (part == nullptr || part->utf_words == nullptr || + part->utf_words->len == 0 || part->nwords == 0) { + return FALSE; + } + + for (i = 0; i < part->utf_words->len; i++) { + w = &g_array_index(part->utf_words, rspamd_stat_token_t, i); + + if ((w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { + + if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { + cur_score += rspamd_chartable_process_word_utf(task, w, FALSE, + &ncap, chartable_module_ctx, ignore_diacritics); + } + else { + cur_score += rspamd_chartable_process_word_ascii(task, w, + FALSE, chartable_module_ctx); + } + } + } + + /* + * TODO: perhaps, we should do this analysis somewhere else and get + * something like: <SYM_SC><SYM_SC><SYM_SC> representing classes for all + * symbols in the text + */ + part->capital_letters += ncap; + + cur_score /= (gdouble) part->nwords; + + if (cur_score > 1.0) { + cur_score = 1.0; + } + + if (cur_score > chartable_module_ctx->threshold) { + rspamd_task_insert_result(task, chartable_module_ctx->symbol, 
+ cur_score, nullptr); + return TRUE; + } + + return FALSE; +} + +static void +chartable_symbol_callback(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item, + void *_) +{ + guint i; + struct rspamd_mime_text_part *part; + struct chartable_ctx *chartable_module_ctx = chartable_get_context(task->cfg); + gboolean ignore_diacritics = TRUE, seen_violated_part = FALSE; + + /* Check if we have parts with diacritic symbols language */ + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part) + { + if (part->languages && part->languages->len > 0) { + auto *lang = (struct rspamd_lang_detector_res *) g_ptr_array_index(part->languages, 0); + gint flags; + + flags = rspamd_language_detector_elt_flags(lang->elt); + + if ((flags & RS_LANGUAGE_DIACRITICS)) { + ignore_diacritics = TRUE; + } + else if (lang->prob > 0.75) { + ignore_diacritics = FALSE; + } + } + + if (rspamd_chartable_process_part(task, part, chartable_module_ctx, ignore_diacritics)) { + seen_violated_part = TRUE; + } + } + + if (MESSAGE_FIELD(task, text_parts)->len == 0) { + /* No text parts, assume that we should ignore diacritics checks for metatokens */ + ignore_diacritics = TRUE; + } + + if (task->meta_words != nullptr && task->meta_words->len > 0) { + rspamd_stat_token_t *w; + gdouble cur_score = 0; + gsize arlen = task->meta_words->len; + + for (i = 0; i < arlen; i++) { + w = &g_array_index(task->meta_words, rspamd_stat_token_t, i); + cur_score += rspamd_chartable_process_word_utf(task, w, FALSE, + nullptr, chartable_module_ctx, ignore_diacritics); + } + + cur_score /= (gdouble) (arlen + 1); + + if (cur_score > 1.0) { + cur_score = 1.0; + } + + if (cur_score > chartable_module_ctx->threshold) { + if (!seen_violated_part) { + /* Further penalise */ + if (cur_score > 0.25) { + cur_score = 0.25; + } + } + + rspamd_task_insert_result(task, chartable_module_ctx->symbol, + cur_score, "subject"); + } + } + + rspamd_symcache_finalize_item(task, item); +} + +static void 
+chartable_url_symbol_callback(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item, + void *unused) +{ + /* XXX: TODO: unbreak module once URLs unicode project is over */ +#if 0 + struct rspamd_url *u; + GHashTableIter it; + gpointer k, v; + rspamd_stat_token_t w; + gdouble cur_score = 0.0; + struct chartable_ctx *chartable_module_ctx = chartable_get_context (task->cfg); + + g_hash_table_iter_init (&it, task->urls); + + while (g_hash_table_iter_next (&it, &k, &v)) { + u = v; + + if (cur_score > 2.0) { + cur_score = 2.0; + break; + } + + if (u->hostlen > 0) { + w.stemmed.begin = u->host; + w.stemmed.len = u->hostlen; + + if (g_utf8_validate (w.stemmed.begin, w.stemmed.len, nullptr)) { + cur_score += rspamd_chartable_process_word_utf (task, &w, + TRUE, nullptr, chartable_module_ctx); + } + else { + cur_score += rspamd_chartable_process_word_ascii (task, &w, + TRUE, chartable_module_ctx); + } + } + } + + g_hash_table_iter_init (&it, task->emails); + + while (g_hash_table_iter_next (&it, &k, &v)) { + u = v; + + if (cur_score > 2.0) { + cur_score = 2.0; + break; + } + + if (u->hostlen > 0) { + w.stemmed.begin = u->host; + w.stemmed.len = u->hostlen; + + if (g_utf8_validate (w.stemmed.begin, w.stemmed.len, nullptr)) { + cur_score += rspamd_chartable_process_word_utf (task, &w, + TRUE, nullptr, chartable_module_ctx); + } + else { + cur_score += rspamd_chartable_process_word_ascii (task, &w, + TRUE, chartable_module_ctx); + } + } + } + + if (cur_score > chartable_module_ctx->threshold) { + rspamd_task_insert_result (task, chartable_module_ctx->symbol, + cur_score, nullptr); + + } +#endif + rspamd_symcache_finalize_item(task, item); +} |