diff options
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r-- | src/libmime/lang_detection.c | 2103 |
1 files changed, 2103 insertions, 0 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c new file mode 100644 index 0000000..bdd0aad --- /dev/null +++ b/src/libmime/lang_detection.c @@ -0,0 +1,2103 @@ +/* + * Copyright 2024 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "lang_detection.h" +#include "lang_detection_fasttext.h" +#include "libserver/logger.h" +#include "libcryptobox/cryptobox.h" +#include "libutil/multipattern.h" +#include "ucl.h" +#include "khash.h" +#include "libstemmer.h" + +#include <glob.h> +#include <unicode/utf8.h> +#include <unicode/utf16.h> +#include <unicode/ucnv.h> +#include <unicode/uchar.h> +#include <unicode/ustring.h> +#include <math.h> + +static const gsize default_short_text_limit = 10; +static const gsize default_words = 80; +static const gdouble update_prob = 0.6; +static const gchar *default_languages_path = RSPAMD_SHAREDIR "/languages"; + +#undef EXTRA_LANGDET_DEBUG + +struct rspamd_language_unicode_match { + const gchar *lang; + gint unicode_code; +}; + +/* + * List of languages detected by unicode scripts + */ +static const struct rspamd_language_unicode_match unicode_langs[] = { + {"el", RSPAMD_UNICODE_GREEK}, + {"ml", RSPAMD_UNICODE_MALAYALAM}, + {"te", RSPAMD_UNICODE_TELUGU}, + {"ta", RSPAMD_UNICODE_TAMIL}, + {"gu", RSPAMD_UNICODE_GUJARATI}, + {"th", RSPAMD_UNICODE_THAI}, + {"ka", RSPAMD_UNICODE_GEORGIAN}, + {"si", RSPAMD_UNICODE_SINHALA}, + {"hy", RSPAMD_UNICODE_ARMENIAN}, + {"ja", RSPAMD_UNICODE_JP}, + {"ko", RSPAMD_UNICODE_HANGUL}, +}; + +/* + * Top languages + */ +static const gchar *tier0_langs[] = { + "en", +}; +static const gchar *tier1_langs[] = { + "fr", "it", "de", "es", "nl", + "pt", "ru", "pl", "tk", "th", "ar"}; + +enum rspamd_language_category { + RSPAMD_LANGUAGE_LATIN = 0, + RSPAMD_LANGUAGE_CYRILLIC, + RSPAMD_LANGUAGE_DEVANAGARI, + RSPAMD_LANGUAGE_ARAB, + RSPAMD_LANGUAGE_MAX, +}; + +struct rspamd_language_elt { + const gchar *name; /* e.g. "en" or "ru" */ + gint flags; /* enum rspamd_language_elt_flags */ + enum rspamd_language_category category; + guint trigrams_words; + guint stop_words; + gdouble mean; + gdouble std; + guint occurrences; /* total number of parts with this language */ +}; + +struct rspamd_ngramm_elt { + struct rspamd_language_elt *elt; + gdouble prob; +}; + +struct rspamd_ngramm_chain { + GPtrArray *languages; + gdouble mean; + gdouble std; + gchar *utf; +}; + +struct rspamd_stop_word_range { + guint start; + guint stop; + struct rspamd_language_elt *elt; +}; + +struct rspamd_stop_word_elt { + struct rspamd_multipattern *mp; + GArray *ranges; /* of rspamd_stop_word_range */ +}; + +#define msg_debug_lang_det(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_debug_lang_det_cfg(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_langdet_log_id, "langdet", cfg->cfg_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE_PUBLIC(langdet) + +static const struct rspamd_language_unicode_match * +rspamd_language_search_unicode_match(const gchar *key, + const struct rspamd_language_unicode_match *elts, size_t nelts) +{ + size_t i; + + for (i = 0; i < nelts; i++) { + if (strcmp(elts[i].lang, key) == 0) { + return &elts[i]; + } + } + + return NULL; +} + +static gboolean +rspamd_language_search_str(const gchar *key, const gchar *elts[], size_t nelts) +{ + size_t i; + + for (i = 0; i < nelts; i++) { + if (strcmp(elts[i], key) == 0) { + return TRUE; + } + } + return FALSE; +} + +static guint +rspamd_trigram_hash_func(gconstpointer key) +{ + return rspamd_cryptobox_fast_hash(key, 3 * sizeof(UChar32), + rspamd_hash_seed()); +} + +static gboolean +rspamd_trigram_equal_func(gconstpointer v, gconstpointer v2) +{ + return memcmp(v, v2, 3 * sizeof(UChar32)) == 0; +} + +KHASH_INIT(rspamd_trigram_hash, const UChar32 *, struct rspamd_ngramm_chain, true, + rspamd_trigram_hash_func, rspamd_trigram_equal_func); +KHASH_INIT(rspamd_candidates_hash, const gchar *, + struct rspamd_lang_detector_res *, true, + rspamd_str_hash, rspamd_str_equal); +KHASH_INIT(rspamd_stopwords_hash, rspamd_ftok_t *, + char, false, + rspamd_ftok_hash, rspamd_ftok_equal); + +KHASH_INIT(rspamd_languages_hash, const gchar *, struct rspamd_language_elt *, true, + rspamd_str_hash, rspamd_str_equal); +struct rspamd_lang_detector { + khash_t(rspamd_languages_hash) * languages; + khash_t(rspamd_trigram_hash) * trigrams[RSPAMD_LANGUAGE_MAX]; /* trigrams frequencies */ + struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX]; + khash_t(rspamd_stopwords_hash) * stop_words_norm; + UConverter *uchar_converter; + gsize short_text_limit; + bool prefer_fasttext; + gsize total_occurrences; /* number of all languages found */ + gpointer fasttext_detector; + ref_entry_t ref; +}; + +static void +rspamd_language_detector_ucs_lowercase(UChar32 *s, gsize len) +{ + gsize i; + + for (i = 0; i < len; i++) { + s[i] = u_tolower(s[i]); + } +} + +static gboolean +rspamd_language_detector_ucs_is_latin(const UChar32 *s, gsize len) +{ + gsize i; + gboolean ret = TRUE; + + for (i = 0; i < len; i++) { + if (s[i] >= 128 || !(g_ascii_isalnum(s[i]) || s[i] == ' ')) { + ret = FALSE; + break; + } + } + + return ret; +} + +struct rspamd_language_ucs_elt { + guint freq; + const gchar *utf; + UChar32 s[0]; +}; + +static void +rspamd_language_detector_init_ngramm(struct rspamd_config *cfg, + struct rspamd_lang_detector *d, + struct rspamd_language_elt *lelt, + struct rspamd_language_ucs_elt *ucs, + guint len, + guint freq, + guint total, + khash_t(rspamd_trigram_hash) * htb) +{ + struct rspamd_ngramm_chain *chain = NULL, st_chain; + struct rspamd_ngramm_elt *elt; + khiter_t k; + guint i; + gboolean found; + + switch (len) { + case 1: + case 2: + g_assert_not_reached(); + break; + case 3: + k = kh_get(rspamd_trigram_hash, htb, ucs->s); + if (k != kh_end(htb)) { + chain = &kh_value(htb, k); + } + break; + default: + g_assert_not_reached(); + break; + } + + if (chain == NULL) { + /* New element */ + chain = &st_chain; + memset(chain, 0, sizeof(st_chain)); + chain->languages = g_ptr_array_sized_new(32); + rspamd_mempool_add_destructor(cfg->cfg_pool, rspamd_ptr_array_free_hard, + chain->languages); + chain->utf = rspamd_mempool_strdup(cfg->cfg_pool, ucs->utf); + elt = rspamd_mempool_alloc(cfg->cfg_pool, sizeof(*elt)); + elt->elt = lelt; + elt->prob = ((gdouble) freq) / ((gdouble) total); + g_ptr_array_add(chain->languages, elt); + + k = kh_put(rspamd_trigram_hash, htb, ucs->s, &i); + kh_value(htb, k) = *chain; + } + else { + /* Check sanity */ + found = FALSE; + + PTR_ARRAY_FOREACH(chain->languages, i, elt) + { + if (strcmp(elt->elt->name, lelt->name) == 0) { + found = TRUE; + elt->prob += ((gdouble) freq) / ((gdouble) total); + break; + } + } + + if (!found) { + elt = rspamd_mempool_alloc(cfg->cfg_pool, sizeof(*elt)); + elt->elt = lelt; + elt->prob = ((gdouble) freq) / ((gdouble) total); + g_ptr_array_add(chain->languages, elt); + } + } +} + +static inline enum rspamd_language_category +rspamd_language_detector_get_category(guint uflags) +{ + enum rspamd_language_category cat = RSPAMD_LANGUAGE_LATIN; + + if (uflags & RSPAMD_UNICODE_CYRILLIC) { + cat = RSPAMD_LANGUAGE_CYRILLIC; + } + else if (uflags & RSPAMD_UNICODE_DEVANAGARI) { + cat = RSPAMD_LANGUAGE_DEVANAGARI; + } + else if (uflags & RSPAMD_UNICODE_ARABIC) { + cat = RSPAMD_LANGUAGE_ARAB; + } + + return cat; +} + +static const gchar * +rspamd_language_detector_print_flags(struct rspamd_language_elt *elt) +{ + static gchar flags_buf[256]; + goffset r = 0; + + if (elt->flags & RS_LANGUAGE_TIER1) { + r += rspamd_snprintf(flags_buf + r, sizeof(flags_buf) - r, "tier1,"); + } + if (elt->flags & RS_LANGUAGE_TIER0) { + r += rspamd_snprintf(flags_buf + r, sizeof(flags_buf) - r, "tier0,"); + } + if (elt->flags & RS_LANGUAGE_LATIN) { + r += rspamd_snprintf(flags_buf + r, sizeof(flags_buf) - r, "latin,"); + } + + if (r > 0) { + flags_buf[r - 1] = '\0'; + } + else { + flags_buf[r] = '\0'; + } + + return flags_buf; +} + +static gint +rspamd_language_detector_cmp_ngramm(gconstpointer a, gconstpointer b) +{ + struct rspamd_language_ucs_elt *e1 = *(struct rspamd_language_ucs_elt **) a; + struct rspamd_language_ucs_elt *e2 = *(struct rspamd_language_ucs_elt **) b; + + return (gint) e2->freq - (gint) e1->freq; +} + +static void +rspamd_language_detector_read_file(struct rspamd_config *cfg, + struct rspamd_lang_detector *d, + const gchar *path, + const ucl_object_t *stop_words) +{ + struct ucl_parser *parser; + ucl_object_t *top; + const ucl_object_t *freqs, *n_words, *cur, *type, *flags; + ucl_object_iter_t it = NULL; + UErrorCode uc_err = U_ZERO_ERROR; + struct rspamd_language_elt *nelt; + struct rspamd_language_ucs_elt *ucs_elt; + khash_t(rspamd_trigram_hash) *htb = NULL; + gchar *pos; + guint total = 0, total_latin = 0, total_ngramms = 0, i, skipped, + loaded; + gdouble mean = 0, std = 0, delta = 0, delta2 = 0, m2 = 0; + enum rspamd_language_category cat = RSPAMD_LANGUAGE_MAX; + + parser = ucl_parser_new(UCL_PARSER_NO_FILEVARS); + if (!ucl_parser_add_file(parser, path)) { + msg_warn_config("cannot parse file %s: %s", path, + ucl_parser_get_error(parser)); + ucl_parser_free(parser); + + return; + } + + top = ucl_parser_get_object(parser); + ucl_parser_free(parser); + + freqs = ucl_object_lookup(top, "freq"); + + if (freqs == NULL) { + msg_warn_config("file %s has no 'freq' key", path); + ucl_object_unref(top); + + return; + } + + pos = strrchr(path, '/'); + g_assert(pos != NULL); + nelt = rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(*nelt)); + nelt->name = rspamd_mempool_strdup(cfg->cfg_pool, pos + 1); + /* Remove extension */ + pos = strchr(nelt->name, '.'); + g_assert(pos != NULL); + *pos = '\0'; + + n_words = ucl_object_lookup(top, "n_words"); + + if (n_words == NULL || ucl_object_type(n_words) != UCL_ARRAY || + n_words->len != 3) { + msg_warn_config("cannot find n_words in language %s", nelt->name); + ucl_object_unref(top); + + return; + } + else { + nelt->trigrams_words = ucl_object_toint(ucl_array_find_index(n_words, + 2)); + } + + type = ucl_object_lookup(top, "type"); + + if (type == NULL || ucl_object_type(type) != UCL_STRING) { + msg_debug_config("cannot find type in language %s", nelt->name); + ucl_object_unref(top); + + return; + } + else { + const gchar *stype = ucl_object_tostring(type); + + if (strcmp(stype, "latin") == 0) { + cat = RSPAMD_LANGUAGE_LATIN; + } + else if (strcmp(stype, "cyrillic") == 0) { + cat = RSPAMD_LANGUAGE_CYRILLIC; + } + else if (strcmp(stype, "arab") == 0) { + cat = RSPAMD_LANGUAGE_ARAB; + } + else if (strcmp(stype, "devanagari") == 0) { + cat = RSPAMD_LANGUAGE_DEVANAGARI; + } + else { + msg_debug_config("unknown type %s of language %s", stype, nelt->name); + ucl_object_unref(top); + + return; + } + } + + flags = ucl_object_lookup(top, "flags"); + + if (flags != NULL && ucl_object_type(flags) == UCL_ARRAY) { + ucl_object_iter_t it = NULL; + const ucl_object_t *cur; + + while ((cur = ucl_object_iterate(flags, &it, true)) != NULL) { + const gchar *fl = ucl_object_tostring(cur); + + if (cur) { + if (strcmp(fl, "diacritics") == 0) { + nelt->flags |= RS_LANGUAGE_DIACRITICS; + } + else if (strcmp(fl, "ascii") == 0) { + nelt->flags |= RS_LANGUAGE_ASCII; + } + else { + msg_debug_config("unknown flag %s of language %s", fl, nelt->name); + } + } + else { + msg_debug_config("unknown flags type of language %s", nelt->name); + } + } + } + + if (stop_words) { + const ucl_object_t *specific_stop_words; + + specific_stop_words = ucl_object_lookup(stop_words, nelt->name); + + if (specific_stop_words) { + struct sb_stemmer *stem = NULL; + it = NULL; + const ucl_object_t *w; + guint start, stop; + + stem = sb_stemmer_new(nelt->name, "UTF_8"); + start = rspamd_multipattern_get_npatterns(d->stop_words[cat].mp); + + while ((w = ucl_object_iterate(specific_stop_words, &it, true)) != NULL) { + gsize wlen; + const char *word = ucl_object_tolstring(w, &wlen); + const char *saved; + guint mp_flags = RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8; + + if (rspamd_multipattern_has_hyperscan()) { + mp_flags |= RSPAMD_MULTIPATTERN_RE; + } + + rspamd_multipattern_add_pattern_len(d->stop_words[cat].mp, + word, wlen, + mp_flags); + nelt->stop_words++; + + /* Also lemmatise and store normalised */ + if (stem) { + const char *nw = sb_stemmer_stem(stem, word, wlen); + + + if (nw) { + saved = nw; + wlen = strlen(nw); + } + else { + saved = word; + } + } + else { + saved = word; + } + + if (saved) { + gint rc; + rspamd_ftok_t *tok; + gchar *dst; + + tok = rspamd_mempool_alloc(cfg->cfg_pool, + sizeof(*tok) + wlen + 1); + dst = ((gchar *) tok) + sizeof(*tok); + rspamd_strlcpy(dst, saved, wlen + 1); + tok->begin = dst; + tok->len = wlen; + + kh_put(rspamd_stopwords_hash, d->stop_words_norm, + tok, &rc); + } + } + + if (stem) { + sb_stemmer_delete(stem); + } + + stop = rspamd_multipattern_get_npatterns(d->stop_words[cat].mp); + + struct rspamd_stop_word_range r; + + r.start = start; + r.stop = stop; + r.elt = nelt; + + g_array_append_val(d->stop_words[cat].ranges, r); + it = NULL; + } + } + + nelt->category = cat; + htb = d->trigrams[cat]; + + GPtrArray *ngramms; + guint nsym; + + if (rspamd_language_search_str(nelt->name, tier1_langs, + G_N_ELEMENTS(tier1_langs))) { + nelt->flags |= RS_LANGUAGE_TIER1; + } + + if (rspamd_language_search_str(nelt->name, tier0_langs, + G_N_ELEMENTS(tier0_langs))) { + nelt->flags |= RS_LANGUAGE_TIER0; + } + + it = NULL; + ngramms = g_ptr_array_sized_new(freqs->len); + i = 0; + skipped = 0; + loaded = 0; + + while ((cur = ucl_object_iterate(freqs, &it, true)) != NULL) { + const gchar *key; + gsize keylen; + guint freq; + + key = ucl_object_keyl(cur, &keylen); + freq = ucl_object_toint(cur); + + i++; + delta = freq - mean; + mean += delta / i; + delta2 = freq - mean; + m2 += delta * delta2; + + if (key != NULL) { + UChar32 *cur_ucs; + const char *end = key + keylen, *cur_utf = key; + + ucs_elt = rspamd_mempool_alloc(cfg->cfg_pool, + sizeof(*ucs_elt) + (keylen + 1) * sizeof(UChar32)); + + cur_ucs = ucs_elt->s; + nsym = 0; + uc_err = U_ZERO_ERROR; + + while (cur_utf < end) { + *cur_ucs++ = ucnv_getNextUChar(d->uchar_converter, &cur_utf, + end, &uc_err); + if (!U_SUCCESS(uc_err)) { + break; + } + + nsym++; + } + + if (!U_SUCCESS(uc_err)) { + msg_warn_config("cannot convert key %*s to unicode: %s", + (gint) keylen, key, u_errorName(uc_err)); + + continue; + } + + ucs_elt->utf = key; + rspamd_language_detector_ucs_lowercase(ucs_elt->s, nsym); + + if (nsym == 3) { + g_ptr_array_add(ngramms, ucs_elt); + } + else { + continue; + } + + if (rspamd_language_detector_ucs_is_latin(ucs_elt->s, nsym)) { + total_latin++; + } + + ucs_elt->freq = freq; + + total_ngramms++; + } + } + + std = sqrt(m2 / (i - 1)); + + if (total_latin >= total_ngramms / 3) { + nelt->flags |= RS_LANGUAGE_LATIN; + } + + nsym = 3; + + total = 0; + PTR_ARRAY_FOREACH(ngramms, i, ucs_elt) + { + + if (!(nelt->flags & RS_LANGUAGE_LATIN) && + rspamd_language_detector_ucs_is_latin(ucs_elt->s, nsym)) { + ucs_elt->freq = 0; + /* Skip latin ngramm for non-latin language to avoid garbage */ + skipped++; + continue; + } + + /* Now, discriminate low frequency ngramms */ + + total += ucs_elt->freq; + loaded++; + } + + g_ptr_array_sort(ngramms, rspamd_language_detector_cmp_ngramm); + + PTR_ARRAY_FOREACH(ngramms, i, ucs_elt) + { + if (ucs_elt->freq > 0) { + rspamd_language_detector_init_ngramm(cfg, d, + nelt, ucs_elt, nsym, + ucs_elt->freq, total, htb); + } + } + +#ifdef EXTRA_LANGDET_DEBUG + /* Useful for debug */ + for (i = 0; i < 10; i++) { + ucs_elt = g_ptr_array_index(ngramms, i); + + msg_debug_lang_det_cfg("%s -> %s: %d", nelt->name, + ucs_elt->utf, ucs_elt->freq); + } +#endif + + g_ptr_array_free(ngramms, TRUE); + nelt->mean = mean; + nelt->std = std; + + msg_debug_lang_det_cfg("loaded %s language, %d trigrams, " + "%d ngramms loaded; " + "std=%.2f, mean=%.2f, skipped=%d, loaded=%d, stop_words=%d; " + "(%s)", + nelt->name, + (gint) nelt->trigrams_words, + total, + std, mean, + skipped, loaded, nelt->stop_words, + rspamd_language_detector_print_flags(nelt)); + + int ret; + khiter_t k = kh_put(rspamd_languages_hash, d->languages, nelt->name, &ret); + g_assert(ret > 0); /* must be unique */ + kh_value(d->languages, k) = nelt; + ucl_object_unref(top); +} + +static gboolean +rspamd_ucl_array_find_str(const gchar *str, const ucl_object_t *ar) +{ + ucl_object_iter_t it = NULL; + const ucl_object_t *cur; + + if (ar == NULL || ar->len == 0) { + return FALSE; + } + + while ((cur = ucl_object_iterate(ar, &it, true)) != NULL) { + if (ucl_object_type(cur) == UCL_STRING && rspamd_strcase_equal( + ucl_object_tostring(cur), str)) { + return TRUE; + } + } + + return FALSE; +} + +static void +rspamd_language_detector_process_chain(struct rspamd_config *cfg, + struct rspamd_ngramm_chain *chain) +{ + struct rspamd_ngramm_elt *elt; + guint i; + gdouble delta, mean = 0, delta2, m2 = 0, std; + + if (chain->languages->len > 3) { + PTR_ARRAY_FOREACH(chain->languages, i, elt) + { + delta = elt->prob - mean; + mean += delta / (i + 1); + delta2 = elt->prob - mean; + m2 += delta * delta2; + } + + std = sqrt(m2 / (i - 1)); + chain->mean = mean; + chain->std = std; + + /* Now, filter elements that are lower than mean */ + PTR_ARRAY_FOREACH(chain->languages, i, elt) + { + if (elt->prob < mean) { + g_ptr_array_remove_index_fast(chain->languages, i); +#ifdef EXTRA_LANGDET_DEBUG + msg_debug_lang_det_cfg("remove %s from %s; prob: %.4f; mean: %.4f, std: %.4f", + elt->elt->name, chain->utf, elt->prob, mean, std); +#endif + } + } + } + else { + /* We have a unique ngramm, increase its weight */ + PTR_ARRAY_FOREACH(chain->languages, i, elt) + { + elt->prob *= 4.0; +#ifdef EXTRA_LANGDET_DEBUG + msg_debug_lang_det_cfg("increase weight of %s in %s; prob: %.4f", + elt->elt->name, chain->utf, elt->prob); +#endif + } + } +} + +static void +rspamd_language_detector_dtor(struct rspamd_lang_detector *d) +{ + if (d) { + for (guint i = 0; i < RSPAMD_LANGUAGE_MAX; i++) { + kh_destroy(rspamd_trigram_hash, d->trigrams[i]); + rspamd_multipattern_destroy(d->stop_words[i].mp); + g_array_free(d->stop_words[i].ranges, TRUE); + } + + if (d->languages) { + kh_destroy(rspamd_languages_hash, d->languages); + } + + kh_destroy(rspamd_stopwords_hash, d->stop_words_norm); + rspamd_lang_detection_fasttext_destroy(d->fasttext_detector); + } +} + +struct rspamd_lang_detector * +rspamd_language_detector_init(struct rspamd_config *cfg) +{ + const ucl_object_t *section, *elt, *languages_enable = NULL, + *languages_disable = NULL; + const gchar *languages_path = default_languages_path; + glob_t gl; + size_t i, short_text_limit = default_short_text_limit, total = 0; + UErrorCode uc_err = U_ZERO_ERROR; + GString *languages_pattern; + struct rspamd_ngramm_chain *chain, schain; + gchar *fname; + struct rspamd_lang_detector *ret = NULL; + struct ucl_parser *parser; + ucl_object_t *stop_words; + bool prefer_fasttext = true; + + section = ucl_object_lookup(cfg->cfg_ucl_obj, "lang_detection"); + + if (section != NULL) { + elt = ucl_object_lookup(section, "languages"); + + if (elt) { + languages_path = ucl_object_tostring(elt); + } + + elt = ucl_object_lookup(section, "short_text_limit"); + + if (elt) { + short_text_limit = ucl_object_toint(elt); + } + + languages_enable = ucl_object_lookup(section, "languages_enable"); + languages_disable = ucl_object_lookup(section, "languages_disable"); + + elt = ucl_object_lookup(section, "prefer_fasttext"); + if (elt) { + prefer_fasttext = ucl_object_toboolean(elt); + } + } + + languages_pattern = g_string_sized_new(PATH_MAX); + rspamd_printf_gstring(languages_pattern, "%s/stop_words", languages_path); + parser = ucl_parser_new(UCL_PARSER_DEFAULT); + + if (ucl_parser_add_file(parser, languages_pattern->str)) { + stop_words = ucl_parser_get_object(parser); + } + else { + msg_err_config("cannot read stop words from %s: %s", + languages_pattern->str, + ucl_parser_get_error(parser)); + stop_words = NULL; + } + + ucl_parser_free(parser); + languages_pattern->len = 0; + + rspamd_printf_gstring(languages_pattern, "%s/*.json", languages_path); + memset(&gl, 0, sizeof(gl)); + + if (glob(languages_pattern->str, 0, NULL, &gl) != 0) { + msg_err_config("cannot read any files matching %v", languages_pattern); + goto end; + } + + ret = rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(*ret)); + ret->languages = kh_init(rspamd_languages_hash); + kh_resize(rspamd_languages_hash, ret->languages, gl.gl_pathc); + ret->uchar_converter = rspamd_get_utf8_converter(); + ret->short_text_limit = short_text_limit; + ret->stop_words_norm = kh_init(rspamd_stopwords_hash); + ret->prefer_fasttext = prefer_fasttext; + + /* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */ + for (i = 0; i < RSPAMD_LANGUAGE_MAX; i++) { + ret->trigrams[i] = kh_init(rspamd_trigram_hash); +#ifdef WITH_HYPERSCAN + ret->stop_words[i].mp = rspamd_multipattern_create( + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8 | + RSPAMD_MULTIPATTERN_RE); +#else + ret->stop_words[i].mp = rspamd_multipattern_create( + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); +#endif + + ret->stop_words[i].ranges = g_array_new(FALSE, FALSE, + sizeof(struct rspamd_stop_word_range)); + } + + g_assert(uc_err == U_ZERO_ERROR); + + for (i = 0; i < gl.gl_pathc; i++) { + fname = g_path_get_basename(gl.gl_pathv[i]); + + if (!rspamd_ucl_array_find_str(fname, languages_disable) || + (languages_enable == NULL || + rspamd_ucl_array_find_str(fname, languages_enable))) { + rspamd_language_detector_read_file(cfg, ret, gl.gl_pathv[i], + stop_words); + } + else { + msg_info_config("skip language file %s: disabled", fname); + } + + g_free(fname); + } + + for (i = 0; i < RSPAMD_LANGUAGE_MAX; i++) { + GError *err = NULL; + + kh_foreach_value(ret->trigrams[i], schain, { + chain = &schain; + rspamd_language_detector_process_chain(cfg, chain); + }); + + if (!rspamd_multipattern_compile(ret->stop_words[i].mp, &err)) { + msg_err_config("cannot compile stop words for %z language group: %e", + i, err); + g_error_free(err); + } + + total += kh_size(ret->trigrams[i]); + } + + ret->fasttext_detector = rspamd_lang_detection_fasttext_init(cfg); + char *fasttext_status = rspamd_lang_detection_fasttext_show_info(ret->fasttext_detector); + + msg_info_config("loaded %d languages, " + "%d trigrams; %s", + (gint) kh_size(ret->languages), + (gint) total, fasttext_status); + g_free(fasttext_status); + + if (stop_words) { + ucl_object_unref(stop_words); + } + + REF_INIT_RETAIN(ret, rspamd_language_detector_dtor); + rspamd_mempool_add_destructor(cfg->cfg_pool, + (rspamd_mempool_destruct_t) rspamd_language_detector_unref, + ret); + +end: + if (gl.gl_pathc > 0) { + globfree(&gl); + } + + g_string_free(languages_pattern, TRUE); + + return ret; +} + +static void +rspamd_language_detector_random_select(GArray *ucs_tokens, guint nwords, + goffset *offsets_out, + guint64 *seed) +{ + guint step_len, remainder, i, out_idx; + guint64 coin, sel; + rspamd_stat_token_t *tok; + + g_assert(nwords != 0); + g_assert(offsets_out != NULL); + g_assert(ucs_tokens->len >= nwords); + /* + * We split input array into `nwords` parts. For each part we randomly select + * an element from this particular split. Here is an example: + * + * nwords=2, input_len=5 + * + * w1 w2 w3 w4 w5 + * ^ ^ + * part1 part2 + * vv vv + * w2 w5 + * + * So we have 2 output words from 5 input words selected randomly within + * their splits. It is not uniform distribution but it seems to be better + * to include words from different text parts + */ + step_len = ucs_tokens->len / nwords; + remainder = ucs_tokens->len % nwords; + + out_idx = 0; + coin = rspamd_random_uint64_fast_seed(seed); + sel = coin % (step_len + remainder); + offsets_out[out_idx] = sel; + + for (i = step_len + remainder; i < ucs_tokens->len; + i += step_len, out_idx++) { + guint ntries = 0; + coin = rspamd_random_uint64_fast_seed(seed); + sel = (coin % step_len) + i; + + for (;;) { + tok = &g_array_index(ucs_tokens, rspamd_stat_token_t, sel); + /* Filter bad tokens */ + + if (tok->unicode.len >= 2 && + !(tok->flags & RSPAMD_STAT_TOKEN_FLAG_EXCEPTION) && + u_isalpha(tok->unicode.begin[0]) && + u_isalpha(tok->unicode.begin[tok->unicode.len - 1])) { + offsets_out[out_idx] = sel; + break; + } + else { + ntries++; + coin = rspamd_random_uint64_fast_seed(seed); + + if (ntries < step_len) { + sel = (coin % step_len) + i; + } + else if (ntries < ucs_tokens->len) { + sel = coin % ucs_tokens->len; + } + else { + offsets_out[out_idx] = sel; + break; + } + } + } + } + + /* + * Fisher-Yates algorithm: + * for i from 0 to nā2 do + * j ā random integer such that i ā¤ j < n + * exchange a[i] and a[j] + */ +#if 0 + if (out_idx > 2) { + for (i = 0; i < out_idx - 2; i++) { + coin = rspamd_random_uint64_fast (); + sel = (coin % (out_idx - i)) + i; + /* swap */ + tmp = offsets_out[i]; + offsets_out[i] = offsets_out[sel]; + offsets_out[sel] = tmp; + } + } +#endif +} + +static goffset +rspamd_language_detector_next_ngramm(rspamd_stat_token_t *tok, UChar32 *window, + guint wlen, goffset cur_off) +{ + guint i; + + if (wlen > 1) { + /* Deal with spaces at the beginning and ending */ + + if (cur_off == 0) { + window[0] = (UChar32) ' '; + + for (i = 0; i < wlen - 1; i++) { + window[i + 1] = tok->unicode.begin[i]; + } + } + else if (cur_off + wlen == tok->unicode.len + 1) { + /* Add trailing space */ + for (i = 0; i < wlen - 1; i++) { + window[i] = tok->unicode.begin[cur_off + i]; + } + window[wlen - 1] = (UChar32) ' '; + } + else if (cur_off + wlen > tok->unicode.len + 1) { + /* No more fun */ + return -1; + } + else { + /* Normal case */ + for (i = 0; i < wlen; i++) { + window[i] = tok->unicode.begin[cur_off + i]; + } + } + } + else { + if (tok->normalized.len <= cur_off) { + return -1; + } + + window[0] = tok->unicode.begin[cur_off]; + } + + return cur_off + 1; +} + +/* + * Do full guess for a specific ngramm, checking all languages defined + */ +static void +rspamd_language_detector_process_ngramm_full(struct rspamd_task *task, + struct rspamd_lang_detector *d, + UChar32 *window, + khash_t(rspamd_candidates_hash) * candidates, + khash_t(rspamd_trigram_hash) * trigrams) +{ + guint i; + gint ret; + struct rspamd_ngramm_chain *chain = NULL; + struct rspamd_ngramm_elt *elt; + struct rspamd_lang_detector_res *cand; + khiter_t k; + gdouble prob; + + k = kh_get(rspamd_trigram_hash, trigrams, window); + if (k != kh_end(trigrams)) { + chain = &kh_value(trigrams, k); + } + + if (chain) { + PTR_ARRAY_FOREACH(chain->languages, i, elt) + { + prob = elt->prob; + + if (prob < chain->mean) { + continue; + } + + k = kh_get(rspamd_candidates_hash, candidates, elt->elt->name); + if (k != kh_end(candidates)) { + cand = kh_value(candidates, k); + } + else { + cand = NULL; + } + +#ifdef NGRAMMS_DEBUG + msg_err("gramm: %s, lang: %s, prob: %.3f", chain->utf, + elt->elt->name, log2(elt->prob)); +#endif + if (cand == NULL) { + cand = rspamd_mempool_alloc(task->task_pool, sizeof(*cand)); + cand->elt = elt->elt; + cand->lang = elt->elt->name; + cand->prob = prob; + + k = kh_put(rspamd_candidates_hash, candidates, elt->elt->name, + &ret); + kh_value(candidates, k) = cand; + } + else { + /* Update guess */ + cand->prob += prob; + } + } + } +} + +static void +rspamd_language_detector_detect_word(struct rspamd_task *task, + struct rspamd_lang_detector *d, + rspamd_stat_token_t *tok, + khash_t(rspamd_candidates_hash) * candidates, + khash_t(rspamd_trigram_hash) * trigrams) +{ + const guint wlen = 3; + UChar32 window[3]; + goffset cur = 0; + + /* Split words */ + while ((cur = rspamd_language_detector_next_ngramm(tok, window, wlen, cur)) != -1) { + rspamd_language_detector_process_ngramm_full(task, + d, window, candidates, trigrams); + } +} + +static const gdouble cutoff_limit = -8.0; +/* + * Converts frequencies to log probabilities, filter those candidates who + * has the lowest probabilities + */ + +static inline void +rspamd_language_detector_filter_step1(struct rspamd_task *task, + struct rspamd_lang_detector_res *cand, + gdouble *max_prob, guint *filtered) +{ + if (!isnan(cand->prob)) { + if (cand->prob == 0) { + cand->prob = NAN; + msg_debug_lang_det( + "exclude language %s", + cand->lang); + (*filtered)++; + } + else { + cand->prob = log2(cand->prob); + if (cand->prob < cutoff_limit) { + msg_debug_lang_det( + "exclude language %s: %.3f, cutoff limit: %.3f", + cand->lang, cand->prob, cutoff_limit); + cand->prob = NAN; + (*filtered)++; + } + else if (cand->prob > *max_prob) { + *max_prob = cand->prob; + } + } + } +} + +static inline void +rspamd_language_detector_filter_step2(struct rspamd_task *task, + struct rspamd_lang_detector_res *cand, + gdouble max_prob, guint *filtered) +{ + /* + * Probabilities are logarithmic, so if prob1 - prob2 > 4, it means that + * prob2 is 2^4 less than prob1 + */ + if (!isnan(cand->prob) && max_prob - cand->prob > 1) { + msg_debug_lang_det("exclude language %s: %.3f (%.3f max)", + cand->lang, cand->prob, max_prob); + cand->prob = NAN; + (*filtered)++; + } +} + +static void +rspamd_language_detector_filter_negligible(struct rspamd_task *task, + khash_t(rspamd_candidates_hash) * candidates) +{ + struct rspamd_lang_detector_res *cand; + guint filtered = 0; + gdouble max_prob = -(G_MAXDOUBLE); + + kh_foreach_value(candidates, cand, + rspamd_language_detector_filter_step1(task, cand, &max_prob, &filtered)); + kh_foreach_value(candidates, cand, + rspamd_language_detector_filter_step2(task, cand, max_prob, &filtered)); + + msg_debug_lang_det("removed %d languages", filtered); +} + +static void +rspamd_language_detector_detect_type(struct rspamd_task *task, + guint nwords, + struct rspamd_lang_detector *d, + GArray *words, + enum rspamd_language_category cat, + khash_t(rspamd_candidates_hash) * candidates, + struct rspamd_mime_text_part *part) +{ + guint nparts = MIN(words->len, nwords); + goffset *selected_words; + rspamd_stat_token_t *tok; + guint i; + guint64 seed; + + /* Seed PRNG with part digest to provide some sort of determinism */ + memcpy(&seed, part->mime_part->digest, sizeof(seed)); + selected_words = g_new0(goffset, nparts); + rspamd_language_detector_random_select(words, nparts, selected_words, &seed); + msg_debug_lang_det("randomly selected %d words", nparts); + + for (i = 0; i < nparts; i++) { + tok = &g_array_index(words, rspamd_stat_token_t, + selected_words[i]); + + if (tok->unicode.len >= 3) { + rspamd_language_detector_detect_word(task, d, tok, candidates, + d->trigrams[cat]); + } + } + + /* Filter negligible candidates */ + rspamd_language_detector_filter_negligible(task, candidates); + g_free(selected_words); +} + +static gint +rspamd_language_detector_cmp(gconstpointer a, gconstpointer b) +{ + const struct rspamd_lang_detector_res + *canda = *(const struct rspamd_lang_detector_res **) a, + *candb = *(const struct rspamd_lang_detector_res **) b; + + if (canda->prob > candb->prob) { + return -1; + } + else if (candb->prob > canda->prob) { + return 1; + } + + return 0; +} + +enum rspamd_language_detected_type { + rs_detect_none = 0, + rs_detect_single, + rs_detect_multiple, +}; + +static enum rspamd_language_detected_type +rspamd_language_detector_try_ngramm(struct rspamd_task *task, + guint nwords, + struct rspamd_lang_detector *d, + GArray *ucs_tokens, + enum rspamd_language_category cat, + khash_t(rspamd_candidates_hash) * candidates, + struct rspamd_mime_text_part *part) +{ + guint cand_len = 0; + struct rspamd_lang_detector_res *cand; + + rspamd_language_detector_detect_type(task, + nwords, + d, + ucs_tokens, + cat, + candidates, + part); + + kh_foreach_value(candidates, cand, { + if (!isnan(cand->prob)) { + cand_len++; + } + }); + + if (cand_len == 0) { + return rs_detect_none; + } + else if (cand_len == 1) { + return rs_detect_single; + } + + return rs_detect_multiple; +} + +enum rspamd_language_sort_flags { + RSPAMD_LANG_FLAG_DEFAULT = 0, + RSPAMD_LANG_FLAG_SHORT = 1 << 0, +}; + +struct rspamd_frequency_sort_cbdata { + struct rspamd_lang_detector *d; + enum rspamd_language_sort_flags flags; + gdouble std; + gdouble mean; +}; + +static const gdouble tier0_adjustment = 1.2; +static const gdouble tier1_adjustment = 0.8; +static const gdouble frequency_adjustment = 0.8; + +static gint +rspamd_language_detector_cmp_heuristic(gconstpointer a, gconstpointer b, + gpointer ud) +{ + struct rspamd_frequency_sort_cbdata *cbd = ud; + struct rspamd_lang_detector_res + *canda = *(struct rspamd_lang_detector_res **) a, + *candb = *(struct rspamd_lang_detector_res **) b; + gdouble adj; + gdouble proba_adjusted, probb_adjusted, freqa, freqb; + + if (cbd->d->total_occurrences == 0) { + /* Not enough data, compare directly */ + return rspamd_language_detector_cmp(a, b); + } + + freqa = ((gdouble) canda->elt->occurrences) / + (gdouble) cbd->d->total_occurrences; + freqb = ((gdouble) candb->elt->occurrences) / + (gdouble) cbd->d->total_occurrences; + + proba_adjusted = canda->prob; + probb_adjusted = candb->prob; + + if (isnormal(freqa) && isnormal(freqb)) { + proba_adjusted += cbd->std * (frequency_adjustment * freqa); + probb_adjusted += cbd->std * (frequency_adjustment * freqb); + } + + if (cbd->flags & RSPAMD_LANG_FLAG_SHORT) { + adj = tier1_adjustment * 2.0; + } + else { + adj = tier1_adjustment; + } + if (canda->elt->flags & RS_LANGUAGE_TIER1) { + proba_adjusted += cbd->std * adj; + } + + if (candb->elt->flags & RS_LANGUAGE_TIER1) { + probb_adjusted += cbd->std * adj; + } + + if (cbd->flags & RSPAMD_LANG_FLAG_SHORT) { + adj = tier0_adjustment * 16.0; + } + else { + adj = tier0_adjustment; + } + + if (canda->elt->flags & RS_LANGUAGE_TIER0) { + proba_adjusted += cbd->std * adj; + } + + if (candb->elt->flags & RS_LANGUAGE_TIER0) { + probb_adjusted += cbd->std * adj; + } + + /* Hack: adjust probability directly */ + canda->prob = proba_adjusted; + candb->prob = probb_adjusted; + + if (proba_adjusted > probb_adjusted) { + return -1; + } + else if (probb_adjusted > proba_adjusted) { + return 1; + } + + return 0; +} + +static void +rspamd_language_detector_unicode_scripts(struct rspamd_task *task, + struct rspamd_mime_text_part *part, + guint *pchinese, + guint *pspecial) +{ + const gchar *p = part->utf_stripped_content->data, *end; + guint i = 0, cnt = 0; + end = p + part->utf_stripped_content->len; + gint32 uc, sc; + guint nlatin = 0, nchinese = 0, nspecial = 0; + const guint cutoff_limit = 32; + + while (p + i < end) { + U8_NEXT(p, i, part->utf_stripped_content->len, uc); + + if (((gint32) uc) < 0) { + break; + } + + if (u_isalpha(uc)) { + sc = ublock_getCode(uc); + cnt++; + + switch (sc) { + case UBLOCK_BASIC_LATIN: + case UBLOCK_LATIN_1_SUPPLEMENT: + part->unicode_scripts |= RSPAMD_UNICODE_LATIN; + nlatin++; + break; + case UBLOCK_HEBREW: + part->unicode_scripts |= RSPAMD_UNICODE_HEBREW; + nspecial++; + break; + case UBLOCK_GREEK: + part->unicode_scripts |= RSPAMD_UNICODE_GREEK; + nspecial++; + break; + case UBLOCK_CYRILLIC: + part->unicode_scripts |= RSPAMD_UNICODE_CYRILLIC; + nspecial++; + break; + case UBLOCK_CJK_UNIFIED_IDEOGRAPHS: + case UBLOCK_CJK_COMPATIBILITY: + case UBLOCK_CJK_RADICALS_SUPPLEMENT: + case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A: + case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B: + part->unicode_scripts |= RSPAMD_UNICODE_CJK; + nchinese++; + break; + case UBLOCK_HIRAGANA: + case UBLOCK_KATAKANA: + part->unicode_scripts |= RSPAMD_UNICODE_JP; + nspecial++; + break; + case UBLOCK_HANGUL_JAMO: + case UBLOCK_HANGUL_COMPATIBILITY_JAMO: + part->unicode_scripts |= RSPAMD_UNICODE_HANGUL; + nspecial++; + break; + case UBLOCK_ARABIC: + part->unicode_scripts |= RSPAMD_UNICODE_ARABIC; + nspecial++; + break; + case UBLOCK_DEVANAGARI: + part->unicode_scripts |= RSPAMD_UNICODE_DEVANAGARI; + nspecial++; + break; + case UBLOCK_ARMENIAN: + part->unicode_scripts |= RSPAMD_UNICODE_ARMENIAN; + nspecial++; + break; + case UBLOCK_GEORGIAN: + part->unicode_scripts |= RSPAMD_UNICODE_GEORGIAN; + nspecial++; + break; + case UBLOCK_GUJARATI: + part->unicode_scripts |= RSPAMD_UNICODE_GUJARATI; + nspecial++; + break; + case UBLOCK_TELUGU: + part->unicode_scripts |= RSPAMD_UNICODE_TELUGU; + nspecial++; + break; + case UBLOCK_TAMIL: + part->unicode_scripts |= RSPAMD_UNICODE_TAMIL; + nspecial++; + break; + case UBLOCK_THAI: + part->unicode_scripts |= RSPAMD_UNICODE_THAI; + nspecial++; + break; + case RSPAMD_UNICODE_MALAYALAM: + part->unicode_scripts |= RSPAMD_UNICODE_MALAYALAM; + nspecial++; + break; + case RSPAMD_UNICODE_SINHALA: + part->unicode_scripts |= RSPAMD_UNICODE_SINHALA; + nspecial++; + break; + } + } + + if (nspecial > cutoff_limit && nspecial > nlatin) { + break; + } + else if (nchinese > cutoff_limit && nchinese > nlatin) { + if (nspecial > 0) { + /* Likely japanese */ + break; + } + } + } + + msg_debug_lang_det("stop after checking %d characters, " + "%d latin, %d special, %d chinese", + cnt, nlatin, nspecial, nchinese); + + *pchinese = nchinese; + *pspecial = nspecial; +} + +static inline void +rspamd_language_detector_set_language(struct rspamd_task *task, + struct rspamd_mime_text_part *part, + const gchar *code, + struct rspamd_language_elt *elt) +{ + struct rspamd_lang_detector_res *r; + + r = rspamd_mempool_alloc0(task->task_pool, sizeof(*r)); + r->prob = 1.0; + r->lang = code; + r->elt = elt; + + if (part->languages == NULL) { + part->languages = g_ptr_array_sized_new(1); + } + + g_ptr_array_add(part->languages, r); + part->language = code; +} + +static gboolean +rspamd_language_detector_try_uniscript(struct rspamd_task *task, + struct rspamd_mime_text_part *part, + guint nchinese, + guint nspecial) +{ + guint i; + + for (i = 0; i < G_N_ELEMENTS(unicode_langs); i++) { + if (unicode_langs[i].unicode_code & part->unicode_scripts) { + + if (unicode_langs[i].unicode_code != RSPAMD_UNICODE_JP) { + msg_debug_lang_det("set language based on unicode script %s", + unicode_langs[i].lang); + rspamd_language_detector_set_language(task, part, + unicode_langs[i].lang, NULL); + + return TRUE; + } + else { + /* Japanese <-> Chinese guess */ + + /* + * Typically there might be around 0-70% of kanji glyphs + * and the rest are Haragana/Katakana + * + * If we discover that Kanji is more than 80% then we consider + * it Chinese + */ + if (nchinese <= 5 || nchinese < nspecial * 5) { + msg_debug_lang_det("set language based on unicode script %s", + unicode_langs[i].lang); + rspamd_language_detector_set_language(task, part, + unicode_langs[i].lang, NULL); + + return TRUE; + } + } + } + } + + if (part->unicode_scripts & RSPAMD_UNICODE_CJK) { + msg_debug_lang_det("guess chinese based on CJK characters: %d chinese, %d special", + nchinese, nspecial); + rspamd_language_detector_set_language(task, part, + "zh-CN", NULL); + + return TRUE; + } + + return FALSE; +} + +static guint +rspamd_langelt_hash_func(gconstpointer key) +{ + const struct rspamd_language_elt *elt = (const struct rspamd_language_elt *) key; + return rspamd_cryptobox_fast_hash(elt->name, strlen(elt->name), + rspamd_hash_seed()); +} + +static gboolean +rspamd_langelt_equal_func(gconstpointer v, gconstpointer v2) +{ + const struct rspamd_language_elt *elt1 = (const struct rspamd_language_elt *) v, + *elt2 = (const struct rspamd_language_elt *) v2; + return strcmp(elt1->name, elt2->name) == 0; +} + +/* This hash set stores a word index in the language to avoid duplicate stop words */ +KHASH_INIT(rspamd_sw_res_set, int, char, 0, kh_int_hash_func, kh_int_hash_equal); + +KHASH_INIT(rspamd_sw_hash, struct rspamd_language_elt *, khash_t(rspamd_sw_res_set) *, 1, + rspamd_langelt_hash_func, rspamd_langelt_equal_func); + +struct rspamd_sw_cbdata { + struct rspamd_task *task; + khash_t(rspamd_sw_hash) * res; + GArray *ranges; +}; + +static gint +rspamd_ranges_cmp(const void *k, const void *memb) +{ + gint pos = GPOINTER_TO_INT(k); + const struct rspamd_stop_word_range *r = (struct rspamd_stop_word_range *) memb; + + if (pos >= r->start && pos < r->stop) { + return 0; + } + else if (pos < r->start) { + return -1; + } + + return 1; +} + +static gint +rspamd_language_detector_sw_cb(struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) +{ + /* Check if boundary */ + const gchar *prev = text, *next = text + len; + struct rspamd_stop_word_range *r; + struct rspamd_sw_cbdata *cbdata = (struct rspamd_sw_cbdata *) context; + khiter_t k; + static const gsize max_stop_words = 80; + struct rspamd_task *task; + + if (match_start > 0) { + prev = text + match_start - 1; + + if (!(g_ascii_isspace(*prev) || g_ascii_ispunct(*prev))) { + return 0; + } + } + + if (match_pos < len) { + next = text + match_pos; + + if (!(g_ascii_isspace(*next) || g_ascii_ispunct(*next))) { + return 0; + } + } + + /* We have a word on the boundary, check range */ + task = cbdata->task; + r = bsearch(GINT_TO_POINTER(strnum), cbdata->ranges->data, + cbdata->ranges->len, sizeof(*r), rspamd_ranges_cmp); + + g_assert(r != NULL); + + k = kh_get(rspamd_sw_hash, cbdata->res, r->elt); + gint nwords = 1; + + if (k != kh_end(cbdata->res)) { + khiter_t set_k; + int tt; + + set_k = kh_get(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum); + nwords = kh_size(kh_value(cbdata->res, k)); + + if (set_k == kh_end(kh_value(cbdata->res, k))) { + /* New word */ + set_k = kh_put(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum, &tt); + msg_debug_lang_det("found new word %*s from %s language (%d stop words found so far)", + (int) (next - prev - 1), prev + 1, r->elt->name, nwords); + } + + if (nwords > max_stop_words) { + return 1; + } + } + else { + gint tt; + + k = kh_put(rspamd_sw_hash, cbdata->res, r->elt, &tt); + kh_value(cbdata->res, k) = kh_init(rspamd_sw_res_set); + kh_put(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum, &tt); + + msg_debug_lang_det("found new word %*s from %s language (%d stop words found so far)", + (int) (next - prev - 1), prev + 1, r->elt->name, nwords); + } + + return 0; +} + +static gboolean +rspamd_language_detector_try_stop_words(struct rspamd_task *task, + struct rspamd_lang_detector *d, + struct rspamd_mime_text_part *part, + enum rspamd_language_category cat) +{ + struct rspamd_stop_word_elt *elt; + struct rspamd_sw_cbdata cbdata; + gboolean ret = FALSE; + static const int stop_words_threshold = 4, /* minimum stop words count */ + strong_confidence_threshold = 10 /* we are sure that this is enough */; + + elt = &d->stop_words[cat]; + cbdata.res = kh_init(rspamd_sw_hash); + cbdata.ranges = elt->ranges; + cbdata.task = task; + + rspamd_multipattern_lookup(elt->mp, part->utf_stripped_content->data, + part->utf_stripped_content->len, rspamd_language_detector_sw_cb, + &cbdata, NULL); + + if (kh_size(cbdata.res) > 0) { + khash_t(rspamd_sw_res_set) * cur_res; + double max_rate = G_MINDOUBLE; + struct rspamd_language_elt *cur_lang, *sel = NULL; + gboolean ignore_ascii = FALSE, ignore_latin = FALSE; + + again: + kh_foreach(cbdata.res, cur_lang, cur_res, { + int cur_matches = kh_size(cur_res); + + if (!ignore_ascii && (cur_lang->flags & RS_LANGUAGE_DIACRITICS)) { + /* Restart matches */ + ignore_ascii = TRUE; + sel = NULL; + max_rate = G_MINDOUBLE; + msg_debug_lang_det("ignore ascii after finding %d stop words from %s", + cur_matches, cur_lang->name); + goto again; + } + + if (!ignore_latin && cur_lang->category != RSPAMD_LANGUAGE_LATIN) { + /* Restart matches */ + ignore_latin = TRUE; + sel = NULL; + max_rate = G_MINDOUBLE; + msg_debug_lang_det("ignore latin after finding stop %d words from %s", + cur_matches, cur_lang->name); + goto again; + } + + if (cur_matches < stop_words_threshold) { + continue; + } + + if (cur_matches < strong_confidence_threshold) { + /* Ignore mixed languages when not enough confidence */ + if (ignore_ascii && (cur_lang->flags & RS_LANGUAGE_ASCII)) { + continue; + } + + if (ignore_latin && cur_lang->category == RSPAMD_LANGUAGE_LATIN) { + continue; + } + } + + double rate = (double) cur_matches / (double) cur_lang->stop_words; + + if (rate > max_rate) { + max_rate = rate; + sel = cur_lang; + } + + msg_debug_lang_det("found %d stop words from %s: %3f rate", + cur_matches, cur_lang->name, rate); + }); + + /* Cleanup */ + kh_foreach(cbdata.res, cur_lang, cur_res, { + kh_destroy(rspamd_sw_res_set, cur_res); + }); + + if (max_rate > 0 && sel) { + msg_debug_lang_det("set language based on stop words script %s, %.3f found", + sel->name, max_rate); + rspamd_language_detector_set_language(task, part, + sel->name, sel); + + ret = TRUE; + } + } + else { + msg_debug_lang_det("found no stop words in a text"); + } + + kh_destroy(rspamd_sw_hash, cbdata.res); + + return ret; +} + +gboolean +rspamd_language_detector_detect(struct rspamd_task *task, + struct rspamd_lang_detector *d, + struct rspamd_mime_text_part *part) +{ + khash_t(rspamd_candidates_hash) * candidates; + GPtrArray *result; + gdouble mean, std, start_ticks, end_ticks; + guint cand_len; + enum rspamd_language_category cat; + struct rspamd_lang_detector_res *cand; + enum rspamd_language_detected_type r; + struct rspamd_frequency_sort_cbdata cbd; + /* Check if we have sorted candidates based on frequency */ + gboolean frequency_heuristic_applied = FALSE, ret = FALSE; + + if (!part->utf_stripped_content) { + return FALSE; + } + + start_ticks = rspamd_get_ticks(TRUE); + + guint nchinese = 0, nspecial = 0; + rspamd_language_detector_unicode_scripts(task, part, &nchinese, &nspecial); + + /* Disable internal language detection heuristics if we have fasttext */ + if (!rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector) || !d->prefer_fasttext) { + /* Apply unicode scripts heuristic */ + if (rspamd_language_detector_try_uniscript(task, part, nchinese, nspecial)) { + ret = TRUE; + } + + cat = rspamd_language_detector_get_category(part->unicode_scripts); + + if (!ret && rspamd_language_detector_try_stop_words(task, d, part, cat)) { + ret = TRUE; + } + } + + if (!ret) { + unsigned ndetected = 0; + if (rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) { + rspamd_fasttext_predict_result_t fasttext_predict_result = + rspamd_lang_detection_fasttext_detect(d->fasttext_detector, task, + part->utf_words, 4); + + ndetected = rspamd_lang_detection_fasttext_get_nlangs(fasttext_predict_result); + + if (ndetected > 0) { + candidates = kh_init(rspamd_candidates_hash); + kh_resize(rspamd_candidates_hash, candidates, ndetected); + + /* Now fill all results where probability is above threshold */ + float max_prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, 0); + + for (unsigned int i = 0; i < ndetected; i++) { + float prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, i); + if (prob > max_prob * 0.75) { + char *lang = rspamd_mempool_strdup(task->task_pool, + rspamd_lang_detection_fasttext_get_lang(fasttext_predict_result, i)); + int tmp; + khiter_t k = kh_put(rspamd_candidates_hash, candidates, lang, &tmp); + + kh_value(candidates, k) = rspamd_mempool_alloc0(task->task_pool, sizeof(*cand)); + cand = kh_value(candidates, k); + cand->lang = lang; + cand->prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, i); + + /* Find the corresponding language elt */ + k = kh_get(rspamd_languages_hash, d->languages, lang); + if (k != kh_end(d->languages)) { + cand->elt = kh_value(d->languages, k); + } + } + } + + if (kh_size(candidates) == 1) { + r = rs_detect_single; + } + else if (kh_size(candidates) > 1) { + r = rs_detect_multiple; + } + else { + r = rs_detect_none; + } + } + + rspamd_fasttext_predict_result_destroy(fasttext_predict_result); + } + if (ndetected == 0) { + if (part->utf_words->len < default_short_text_limit) { + r = rs_detect_none; + msg_debug_lang_det("text is too short for trigrams detection: " + "%d words; at least %d words required", + (int) part->utf_words->len, + (int) default_short_text_limit); + switch (cat) { + case RSPAMD_LANGUAGE_CYRILLIC: + rspamd_language_detector_set_language(task, part, "ru", NULL); + break; + case RSPAMD_LANGUAGE_DEVANAGARI: + rspamd_language_detector_set_language(task, part, "hi", NULL); + break; + case RSPAMD_LANGUAGE_ARAB: + rspamd_language_detector_set_language(task, part, "ar", NULL); + break; + default: + case RSPAMD_LANGUAGE_LATIN: + rspamd_language_detector_set_language(task, part, "en", NULL); + break; + } + msg_debug_lang_det("set %s language based on symbols category", + part->language); + + candidates = kh_init(rspamd_candidates_hash); + } + else { + candidates = kh_init(rspamd_candidates_hash); + kh_resize(rspamd_candidates_hash, candidates, 32); + + r = rspamd_language_detector_try_ngramm(task, + default_words, + d, + part->utf_words, + cat, + candidates, + part); + + if (r == rs_detect_none) { + msg_debug_lang_det("no trigrams found, fallback to english"); + rspamd_language_detector_set_language(task, part, "en", NULL); + } + else if (r == rs_detect_multiple) { + /* Check our guess */ + + mean = 0.0; + std = 0.0; + cand_len = 0; + + /* Check distribution */ + kh_foreach_value(candidates, cand, { + if (!isnan(cand->prob)) { + mean += cand->prob; + cand_len++; + } + }); + + if (cand_len > 0) { + mean /= cand_len; + + kh_foreach_value(candidates, cand, { + gdouble err; + if (!isnan(cand->prob)) { + err = cand->prob - mean; + std += fabs(err); + } + }); + + std /= cand_len; + } + + msg_debug_lang_det("trigrams checked, %d candidates, %.3f mean, %.4f stddev", + cand_len, mean, std); + + if (cand_len > 0 && std / fabs(mean) < 0.25) { + msg_debug_lang_det("apply frequency heuristic sorting"); + frequency_heuristic_applied = TRUE; + cbd.d = d; + cbd.mean = mean; + cbd.std = std; + cbd.flags = RSPAMD_LANG_FLAG_DEFAULT; + + if (part->nwords < default_words / 2) { + cbd.flags |= RSPAMD_LANG_FLAG_SHORT; + } + } + } + } + } + + /* Now, convert hash to array and sort it */ + if (r != rs_detect_none && kh_size(candidates) > 0) { + result = g_ptr_array_sized_new(kh_size(candidates)); + + kh_foreach_value(candidates, cand, { + if (!isnan(cand->prob)) { + msg_debug_lang_det("pre-sorting probability %s -> %.2f", cand->lang, + cand->prob); + g_ptr_array_add(result, cand); + } + }); + + if (frequency_heuristic_applied) { + g_ptr_array_sort_with_data(result, + rspamd_language_detector_cmp_heuristic, + (gpointer) &cbd); + } + else { + g_ptr_array_sort(result, rspamd_language_detector_cmp); + } + + int i; + PTR_ARRAY_FOREACH(result, i, cand) + { + msg_debug_lang_det("final probability %s -> %.2f", cand->lang, + cand->prob); + } + + if (part->languages != NULL) { + g_ptr_array_unref(part->languages); + } + + part->languages = result; + part->language = ((struct rspamd_lang_detector_res *) g_ptr_array_index(result, 0))->lang; + ret = TRUE; + } + else if (part->languages == NULL) { + rspamd_language_detector_set_language(task, part, "en", NULL); + } + + kh_destroy(rspamd_candidates_hash, candidates); + } + + /* Update internal stat */ + if (part->languages != NULL && part->languages->len > 0 && !frequency_heuristic_applied) { + cand = g_ptr_array_index(part->languages, 0); + if (cand->elt) { + cand->elt->occurrences++; + d->total_occurrences++; + + msg_debug_lang_det("updated stat for %s: %d occurrences, %z total detected", + cand->elt->name, cand->elt->occurrences, + d->total_occurrences); + } + } + + end_ticks = rspamd_get_ticks(TRUE); + msg_debug_lang_det("detected languages in %.0f ticks", + (end_ticks - start_ticks)); + + return ret; +} + + +struct rspamd_lang_detector * +rspamd_language_detector_ref(struct rspamd_lang_detector *d) +{ + REF_RETAIN(d); + + return d; +} + +void rspamd_language_detector_unref(struct rspamd_lang_detector *d) +{ + REF_RELEASE(d); +} + +gboolean +rspamd_language_detector_is_stop_word(struct rspamd_lang_detector *d, + const gchar *word, gsize wlen) +{ + khiter_t k; + rspamd_ftok_t search; + + search.begin = word; + search.len = wlen; + + k = kh_get(rspamd_stopwords_hash, d->stop_words_norm, &search); + + if (k != kh_end(d->stop_words_norm)) { + return TRUE; + } + + return FALSE; +} + +gint rspamd_language_detector_elt_flags(const struct rspamd_language_elt *elt) +{ + if (elt) { + return elt->flags; + } + + return 0; +}
\ No newline at end of file |