diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
commit | 133a45c109da5310add55824db21af5239951f93 (patch) | |
tree | ba6ac4c0a950a0dda56451944315d66409923918 /src/libmime/message.c | |
parent | Initial commit. (diff) | |
download | rspamd-upstream.tar.xz rspamd-upstream.zip |
Adding upstream version 3.8.1. (refs: upstream/3.8.1, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | src/libmime/message.c | 1732 |
1 files changed, 1732 insertions, 0 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c new file mode 100644 index 0000000..3acc935 --- /dev/null +++ b/src/libmime/message.c @@ -0,0 +1,1732 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "util.h" +#include "rspamd.h" +#include "message.h" +#include "libserver/html/html.h" +#include "images.h" +#include "archives.h" +#include "tokenizers/tokenizers.h" +#include "smtp_parsers.h" +#include "mime_parser.h" +#include "mime_encoding.h" +#include "lang_detection.h" +#include "libutil/multipattern.h" +#include "libserver/mempool_vars_internal.h" + +#ifdef WITH_SNOWBALL +#include "libstemmer.h" +#endif + +#include <math.h> +#include <unicode/uchar.h> +#include "sodium.h" +#include "libserver/cfg_file_private.h" +#include "lua/lua_common.h" +#include "contrib/uthash/utlist.h" +#include "contrib/t1ha/t1ha.h" +#include "received.h" + +#define GTUBE_SYMBOL "GTUBE" + +#define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF) +#define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF) + +static const gchar gtube_pattern_reject[] = "XJS*C4JDBQADN1.NSBN3*2IDNEN*" + "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X"; +static const gchar gtube_pattern_add_header[] = "YJS*C4JDBQADN1.NSBN3*2IDNEN*" + "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X"; +static const gchar gtube_pattern_rewrite_subject[] = "ZJS*C4JDBQADN1.NSBN3*2IDNEN*" + 
"GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X"; +static const gchar gtube_pattern_no_action[] = "AJS*C4JDBQADN1.NSBN3*2IDNEN*" + "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X"; +struct rspamd_multipattern *gtube_matcher = NULL; +static const guint64 words_hash_seed = 0xdeadbabe; + +static void +free_byte_array_callback(void *pointer) +{ + GByteArray *arr = (GByteArray *) pointer; + g_byte_array_free(arr, TRUE); +} + +static void +rspamd_mime_part_extract_words(struct rspamd_task *task, + struct rspamd_mime_text_part *part) +{ + rspamd_stat_token_t *w; + guint i, total_len = 0, short_len = 0; + + if (part->utf_words) { + rspamd_stem_words(part->utf_words, task->task_pool, part->language, + task->lang_det); + + for (i = 0; i < part->utf_words->len; i++) { + guint64 h; + + w = &g_array_index(part->utf_words, rspamd_stat_token_t, i); + + if (w->stemmed.len > 0) { + /* + * We use static hash seed if we would want to use that in shingles + * computation in future + */ + h = rspamd_cryptobox_fast_hash_specific( + RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT, + w->stemmed.begin, w->stemmed.len, words_hash_seed); + g_array_append_val(part->normalized_hashes, h); + total_len += w->stemmed.len; + + if (w->stemmed.len <= 3) { + short_len++; + } + + if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT && + !(w->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) { + part->nwords++; + } + } + + if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE | + RSPAMD_STAT_TOKEN_FLAG_NORMALISED | + RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES)) { + task->flags |= RSPAMD_TASK_FLAG_BAD_UNICODE; + } + } + + if (part->utf_words->len) { + gdouble *avg_len_p, *short_len_p; + + avg_len_p = rspamd_mempool_get_variable(task->task_pool, + RSPAMD_MEMPOOL_AVG_WORDS_LEN); + + if (avg_len_p == NULL) { + avg_len_p = rspamd_mempool_alloc(task->task_pool, + sizeof(double)); + *avg_len_p = total_len; + rspamd_mempool_set_variable(task->task_pool, + RSPAMD_MEMPOOL_AVG_WORDS_LEN, avg_len_p, NULL); + } + else { + *avg_len_p += total_len; + } + + 
short_len_p = rspamd_mempool_get_variable(task->task_pool, + RSPAMD_MEMPOOL_SHORT_WORDS_CNT); + + if (short_len_p == NULL) { + short_len_p = rspamd_mempool_alloc(task->task_pool, + sizeof(double)); + *short_len_p = short_len; + rspamd_mempool_set_variable(task->task_pool, + RSPAMD_MEMPOOL_SHORT_WORDS_CNT, avg_len_p, NULL); + } + else { + *short_len_p += short_len; + } + } + } +} + +static void +rspamd_mime_part_create_words(struct rspamd_task *task, + struct rspamd_mime_text_part *part) +{ + enum rspamd_tokenize_type tok_type; + + if (IS_TEXT_PART_UTF(part)) { + +#if U_ICU_VERSION_MAJOR_NUM < 50 + /* Hack to prevent hang with Thai in old libicu */ + const gchar *p = part->utf_stripped_content->data, *end; + guint i = 0; + end = p + part->utf_stripped_content->len; + gint32 uc, sc; + + tok_type = RSPAMD_TOKENIZE_UTF; + + while (p + i < end) { + U8_NEXT(p, i, part->utf_stripped_content->len, uc); + + if (((gint32) uc) < 0) { + tok_type = RSPAMD_TOKENIZE_RAW; + break; + } + + if (u_isalpha(uc)) { + sc = ublock_getCode(uc); + + if (sc == UBLOCK_THAI) { + msg_info_task("enable workaround for Thai characters for old libicu"); + tok_type = RSPAMD_TOKENIZE_RAW; + break; + } + } + } +#else + tok_type = RSPAMD_TOKENIZE_UTF; +#endif + } + else { + tok_type = RSPAMD_TOKENIZE_RAW; + } + + part->utf_words = rspamd_tokenize_text( + part->utf_stripped_content->data, + part->utf_stripped_content->len, + &part->utf_stripped_text, + tok_type, task->cfg, + part->exceptions, + NULL, + NULL, + task->task_pool); + + + if (part->utf_words) { + part->normalized_hashes = g_array_sized_new(FALSE, FALSE, + sizeof(guint64), part->utf_words->len); + rspamd_normalize_words(part->utf_words, task->task_pool); + } +} + +static void +rspamd_mime_part_detect_language(struct rspamd_task *task, + struct rspamd_mime_text_part *part) +{ + struct rspamd_lang_detector_res *lang; + + if (!IS_TEXT_PART_EMPTY(part) && part->utf_words && part->utf_words->len > 0 && + task->lang_det) { + if 
(rspamd_language_detector_detect(task, task->lang_det, part)) { + lang = g_ptr_array_index(part->languages, 0); + part->language = lang->lang; + + msg_info_task("detected part language: %s", part->language); + } + else { + part->language = "en"; /* Safe fallback */ + } + } +} + +static void +rspamd_strip_newlines_parse(struct rspamd_task *task, + const gchar *begin, const gchar *pe, + struct rspamd_mime_text_part *part) +{ + const gchar *p = begin, *c = begin; + gboolean crlf_added = FALSE, is_utf = IS_TEXT_PART_UTF(part); + gboolean url_open_bracket = FALSE; + UChar32 uc; + + enum { + normal_char, + seen_cr, + seen_lf, + } state = normal_char; + + while (p < pe) { + if (U8_IS_LEAD(*p) && is_utf) { + gint32 off = p - begin; + U8_NEXT(begin, off, pe - begin, uc); + + if (uc != -1) { + while (p < pe && off < (pe - begin)) { + if (IS_ZERO_WIDTH_SPACE(uc)) { + /* Invisible space ! */ + task->flags |= RSPAMD_TASK_FLAG_BAD_UNICODE; + part->spaces++; + + if (p > c) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) c, p - c); + c = begin + off; + p = c; + } + + U8_NEXT(begin, off, pe - begin, uc); + + if (!IS_ZERO_WIDTH_SPACE(uc)) { + break; + } + + part->double_spaces++; + p = begin + off; + c = p; + } + else { + break; + } + } + } + } + + if (G_UNLIKELY(p >= pe)) { + /* + * This is reached when there is a utf8 part and we + * have zero width spaces at the end of the text + * So we just check overflow and refuse to access *p if it is + * after our real content. 
+ */ + break; + } + else if (*p == '\r') { + switch (state) { + case normal_char: + state = seen_cr; + if (p > c) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) c, p - c); + } + + crlf_added = FALSE; + c = p + 1; + break; + case seen_cr: + /* Double \r\r */ + if (!crlf_added) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) " ", 1); + crlf_added = TRUE; + g_ptr_array_add(part->newlines, + (((gpointer) (goffset) (part->utf_stripped_content->len)))); + } + + part->nlines++; + part->empty_lines++; + c = p + 1; + break; + case seen_lf: + /* Likely \r\n\r...*/ + state = seen_cr; + c = p + 1; + break; + } + + url_open_bracket = FALSE; + + p++; + } + else if (*p == '\n') { + switch (state) { + case normal_char: + state = seen_lf; + + if (p > c) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) c, p - c); + } + + c = p + 1; + + if (IS_TEXT_PART_HTML(part) || !url_open_bracket) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) " ", 1); + g_ptr_array_add(part->newlines, + (((gpointer) (goffset) (part->utf_stripped_content->len)))); + crlf_added = TRUE; + } + else { + crlf_added = FALSE; + } + + break; + case seen_cr: + /* \r\n */ + if (!crlf_added) { + if (IS_TEXT_PART_HTML(part) || !url_open_bracket) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) " ", 1); + crlf_added = TRUE; + } + + g_ptr_array_add(part->newlines, + (((gpointer) (goffset) (part->utf_stripped_content->len)))); + } + + c = p + 1; + state = seen_lf; + + break; + case seen_lf: + /* Double \n\n */ + if (!crlf_added) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) " ", 1); + crlf_added = TRUE; + g_ptr_array_add(part->newlines, + (((gpointer) (goffset) (part->utf_stripped_content->len)))); + } + + part->nlines++; + part->empty_lines++; + + c = p + 1; + break; + } + url_open_bracket = FALSE; + + p++; + } + else { + if ((*p) == '<') { + url_open_bracket = TRUE; + } + else if ((*p) 
== '>') { + url_open_bracket = FALSE; + } + + switch (state) { + case normal_char: + if (*p == ' ') { + part->spaces++; + + if (p > begin && *(p - 1) == ' ') { + part->double_spaces++; + } + } + else { + part->non_spaces++; + + if ((*p) & 0x80) { + part->non_ascii_chars++; + } + else { + if (g_ascii_isupper(*p)) { + part->capital_letters++; + } + else if (g_ascii_isdigit(*p)) { + part->numeric_characters++; + } + + part->ascii_chars++; + } + } + break; + case seen_cr: + case seen_lf: + part->nlines++; + + if (!crlf_added) { + g_ptr_array_add(part->newlines, + (((gpointer) (goffset) (part->utf_stripped_content->len)))); + } + + /* Skip initial spaces */ + if (*p == ' ') { + if (!crlf_added) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) " ", 1); + } + + while (p < pe && *p == ' ') { + p++; + c++; + part->spaces++; + } + + if (p < pe && (*p == '\r' || *p == '\n')) { + part->empty_lines++; + } + } + + state = normal_char; + continue; + } + + p++; + } + } + + /* Leftover */ + if (p > c) { + if (p > pe) { + p = pe; + } + + switch (state) { + case normal_char: + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) c, p - c); + + while (c < p) { + if (*c == ' ') { + part->spaces++; + + if (c > begin && *(c - 1) == ' ') { + part->double_spaces++; + } + } + else { + part->non_spaces++; + + if ((*c) & 0x80) { + part->non_ascii_chars++; + } + else { + part->ascii_chars++; + } + } + + c++; + } + break; + default: + + if (!crlf_added) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) " ", 1); + g_ptr_array_add(part->newlines, + (((gpointer) (goffset) (part->utf_stripped_content->len)))); + } + + part->nlines++; + break; + } + } +} + +static void +rspamd_u_text_dtor(void *p) +{ + utext_close((UText *) p); +} + +static void +rspamd_normalize_text_part(struct rspamd_task *task, + struct rspamd_mime_text_part *part) +{ + const gchar *p, *end; + guint i; + goffset off; + struct rspamd_process_exception *ex; + UErrorCode 
uc_err = U_ZERO_ERROR; + + part->newlines = g_ptr_array_sized_new(128); + + if (IS_TEXT_PART_EMPTY(part)) { + part->utf_stripped_content = g_byte_array_new(); + } + else { + part->utf_stripped_content = g_byte_array_sized_new(part->utf_content.len); + + p = (const gchar *) part->utf_content.begin; + end = p + part->utf_content.len; + + rspamd_strip_newlines_parse(task, p, end, part); + + for (i = 0; i < part->newlines->len; i++) { + ex = rspamd_mempool_alloc(task->task_pool, sizeof(*ex)); + off = (goffset) g_ptr_array_index(part->newlines, i); + g_ptr_array_index(part->newlines, i) = (gpointer) (goffset) (part->utf_stripped_content->data + off); + ex->pos = off; + ex->len = 0; + ex->type = RSPAMD_EXCEPTION_NEWLINE; + part->exceptions = g_list_prepend(part->exceptions, ex); + } + } + + if (IS_TEXT_PART_UTF(part)) { + utext_openUTF8(&part->utf_stripped_text, + part->utf_stripped_content->data, + part->utf_stripped_content->len, + &uc_err); + + if (!U_SUCCESS(uc_err)) { + msg_warn_task("cannot open text from utf content"); + /* Probably, should be an assertion */ + } + else { + rspamd_mempool_add_destructor(task->task_pool, + rspamd_u_text_dtor, + &part->utf_stripped_text); + } + } + + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) free_byte_array_callback, + part->utf_stripped_content); + rspamd_mempool_notify_alloc(task->task_pool, + part->utf_stripped_content->len); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard, + part->newlines); +} + +#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? 
(b) : (c))) + +static guint +rspamd_words_levenshtein_distance(struct rspamd_task *task, + GArray *w1, GArray *w2) +{ + guint s1len, s2len, x, y, lastdiag, olddiag; + guint *column, ret; + guint64 h1, h2; + gint eq; + static const guint max_words = 8192; + + s1len = w1->len; + s2len = w2->len; + + if (s1len + s2len > max_words) { + msg_info_task("cannot direct compare multipart/alternative parts with more than %ud words in total: " + "(%ud words in one part and %ud in another)", + max_words, s1len, s2len); + + /* Use approximate comparison of number of words */ + if (s1len > s2len) { + return s1len - s2len; + } + else { + return s2len - s1len; + } + } + + column = g_malloc0((s1len + 1) * sizeof(guint)); + + for (y = 1; y <= s1len; y++) { + column[y] = y; + } + + for (x = 1; x <= s2len; x++) { + column[0] = x; + + for (y = 1, lastdiag = x - 1; y <= s1len; y++) { + olddiag = column[y]; + h1 = g_array_index(w1, guint64, y - 1); + h2 = g_array_index(w2, guint64, x - 1); + eq = (h1 == h2) ? 1 : 0; + /* + * Cost of replacement is twice higher than cost of add/delete + * to calculate percentage properly + */ + column[y] = MIN3(column[y] + 1, column[y - 1] + 1, + lastdiag + (eq * 2)); + lastdiag = olddiag; + } + } + + ret = column[s1len]; + g_free(column); + + return ret; +} + +static gint +rspamd_multipattern_gtube_cb(struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) +{ + struct rspamd_task *task = (struct rspamd_task *) context; + + if (strnum > 0) { + if (task->cfg->gtube_patterns_policy == RSPAMD_GTUBE_ALL) { + return strnum + 1; + } + + return 0; + } + + return strnum + 1; /* To distinguish from zero */ +} + +static enum rspamd_action_type +rspamd_check_gtube(struct rspamd_task *task, struct rspamd_mime_text_part *part) +{ + static const gsize max_check_size = 8 * 1024; + gint ret; + enum rspamd_action_type act = METRIC_ACTION_NOACTION; + enum rspamd_gtube_patterns_policy policy = 
task->cfg ? task->cfg->gtube_patterns_policy : RSPAMD_GTUBE_REJECT;
+	g_assert(part != NULL);
+
+	if (gtube_matcher == NULL && policy != RSPAMD_GTUBE_DISABLED) {
+		gtube_matcher = rspamd_multipattern_create(RSPAMD_MULTIPATTERN_DEFAULT);
+
+		rspamd_multipattern_add_pattern(gtube_matcher,
+										gtube_pattern_reject,
+										RSPAMD_MULTIPATTERN_DEFAULT);
+		rspamd_multipattern_add_pattern(gtube_matcher,
+										gtube_pattern_add_header,
+										RSPAMD_MULTIPATTERN_DEFAULT);
+		rspamd_multipattern_add_pattern(gtube_matcher,
+										gtube_pattern_rewrite_subject,
+										RSPAMD_MULTIPATTERN_DEFAULT);
+		rspamd_multipattern_add_pattern(gtube_matcher,
+										gtube_pattern_no_action,
+										RSPAMD_MULTIPATTERN_DEFAULT);
+
+		GError *err = NULL;
+		rspamd_multipattern_compile(gtube_matcher, &err);
+
+		if (err != NULL) {
+			/* It will be expensive, but I don't care, still better than to abort */
+			msg_err("cannot compile gtube matcher: %s", err->message);
+			g_error_free(err);
+		}
+	}
+
+	if (part->utf_content.len >= sizeof(gtube_pattern_reject) &&
+		part->utf_content.len <= max_check_size &&
+		policy != RSPAMD_GTUBE_DISABLED) {
+		if ((ret = rspamd_multipattern_lookup(gtube_matcher, part->utf_content.begin,
+											  part->utf_content.len,
+											  rspamd_multipattern_gtube_cb, task, NULL)) > 0) {
+
+			switch (ret) {
+			case 1:
+				act = METRIC_ACTION_REJECT;
+				break;
+			case 2:
+				act = METRIC_ACTION_ADD_HEADER;
+				break;
+			case 3:
+				act = METRIC_ACTION_REWRITE_SUBJECT;
+				break;
+			case 4:
+				act = METRIC_ACTION_NOACTION;
+				break;
+			}
+
+			if (ret != 0) {
+				task->flags |= RSPAMD_TASK_FLAG_SKIP;
+				task->flags |= RSPAMD_TASK_FLAG_GTUBE;
+				msg_info_task(
+					"gtube %s pattern has been found in part of length %uz",
+					rspamd_action_to_str(act),
+					part->utf_content.len);
+			}
+		}
+	}
+
+	return act;
+}
+
+/*
+ * Ordering callback for process exceptions: sorts by ascending position.
+ * Returns a negative, zero or positive value as required by g_list_sort().
+ * The positions are compared instead of subtracted because elsewhere in this
+ * file pos is assigned from a goffset, so narrowing the difference to gint
+ * could truncate or flip its sign for distant offsets.
+ */
+static gint
+exceptions_compare_func(gconstpointer a, gconstpointer b)
+{
+	const struct rspamd_process_exception *ea = a, *eb = b;
+
+	return (ea->pos > eb->pos) - (ea->pos < eb->pos);
+}
+
+static gboolean
+rspamd_message_process_plain_text_part(struct rspamd_task *task, +
struct rspamd_mime_text_part *text_part) +{ + if (text_part->parsed.len == 0) { + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; + + return TRUE; + } + + rspamd_mime_text_part_maybe_convert(task, text_part); + + if (text_part->utf_raw_content != NULL) { + /* Just have the same content */ + text_part->utf_content.begin = (const gchar *) text_part->utf_raw_content->data; + text_part->utf_content.len = text_part->utf_raw_content->len; + } + else { + /* + * We ignore unconverted parts from now as it is dangerous + * to treat them as text parts + */ + text_part->utf_content.begin = NULL; + text_part->utf_content.len = 0; + + return FALSE; + } + + return TRUE; +} + +static gboolean +rspamd_message_process_html_text_part(struct rspamd_task *task, + struct rspamd_mime_text_part *text_part, + uint16_t *cur_url_order) +{ + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML; + + if (text_part->parsed.len == 0) { + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; + + return TRUE; + } + + rspamd_mime_text_part_maybe_convert(task, text_part); + + if (text_part->utf_raw_content == NULL) { + return FALSE; + } + + + text_part->html = rspamd_html_process_part_full( + task, + text_part->utf_raw_content, + &text_part->exceptions, + MESSAGE_FIELD(task, urls), + text_part->mime_part->urls, + task->cfg ? 
task->cfg->enable_css_parser : true,
+		cur_url_order);
+	rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
+
+	if (text_part->utf_content.len == 0) {
+		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
+	}
+
+	return TRUE;
+}
+
+enum rspamd_message_part_is_text_result {
+	RSPAMD_MESSAGE_PART_IS_TEXT_PLAIN = 0,
+	RSPAMD_MESSAGE_PART_IS_TEXT_HTML,
+	RSPAMD_MESSAGE_PART_IS_NOT_TEXT
+};
+
+/*
+ * Decides whether a mime part should be processed as text.
+ *
+ * A part is text when its declared content type carries
+ * RSPAMD_CONTENT_TYPE_TEXT or when its detected type is "text". Such a part is
+ * HTML when the declared subtype is "html"/"xhtml" (case-insensitive) or the
+ * detected extension is "html"; otherwise it is plain text.
+ * Text attachments are reported as RSPAMD_MESSAGE_PART_IS_NOT_TEXT unless the
+ * configuration enables check_text_attachements.
+ *
+ * @param task      current task; task->cfg may be NULL
+ * @param mime_part part to classify; ct, cd, detected_type and detected_ext
+ *                  may each be NULL
+ * @return classification of the part
+ */
+static enum rspamd_message_part_is_text_result
+rspamd_message_part_can_be_parsed_as_text(struct rspamd_task *task,
+										  struct rspamd_mime_part *mime_part)
+{
+	enum rspamd_message_part_is_text_result res = RSPAMD_MESSAGE_PART_IS_NOT_TEXT;
+
+	if ((mime_part->ct && (mime_part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) ||
+		(mime_part->detected_type && strcmp(mime_part->detected_type, "text") == 0)) {
+
+		res = RSPAMD_MESSAGE_PART_IS_TEXT_PLAIN;
+		rspamd_ftok_t html_tok, xhtml_tok;
+
+		html_tok.begin = "html";
+		html_tok.len = 4;
+		xhtml_tok.begin = "xhtml";
+		xhtml_tok.len = 5;
+
+		/*
+		 * This branch can be entered through detected_type alone, so the
+		 * declared content type must be checked before its subtype is read.
+		 */
+		if ((mime_part->ct &&
+			 (rspamd_ftok_casecmp(&mime_part->ct->subtype, &html_tok) == 0 ||
+			  rspamd_ftok_casecmp(&mime_part->ct->subtype, &xhtml_tok) == 0)) ||
+			(mime_part->detected_ext &&
+			 strcmp(mime_part->detected_ext, "html") == 0)) {
+			res = RSPAMD_MESSAGE_PART_IS_TEXT_HTML;
+		}
+	}
+
+	/* Skip attachments */
+	if (res != RSPAMD_MESSAGE_PART_IS_NOT_TEXT &&
+		(mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT)) {
+		/* Without a configuration the option cannot be enabled: skip too */
+		if (task->cfg == NULL || !task->cfg->check_text_attachements) {
+			debug_task("skip attachments for checking as text parts");
+			return RSPAMD_MESSAGE_PART_IS_NOT_TEXT;
+		}
+	}
+
+	return res;
+}
+
+static gboolean
+rspamd_message_process_text_part_maybe(struct rspamd_task *task,
+									   struct rspamd_mime_part *mime_part,
+									   enum rspamd_message_part_is_text_result is_text,
+									   uint16_t *cur_url_order)
+{
+	struct rspamd_mime_text_part *text_part;
+	guint flags = 0;
+	enum rspamd_action_type act;
+
+	/* Skip attachments */
+	if ((mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT)) {
+		flags |=
RSPAMD_MIME_TEXT_PART_ATTACHMENT; + } + + text_part = rspamd_mempool_alloc0(task->task_pool, + sizeof(struct rspamd_mime_text_part)); + text_part->mime_part = mime_part; + text_part->raw.begin = mime_part->raw_data.begin; + text_part->raw.len = mime_part->raw_data.len; + text_part->parsed.begin = mime_part->parsed_data.begin; + text_part->parsed.len = mime_part->parsed_data.len; + text_part->utf_stripped_text = (UText) UTEXT_INITIALIZER; + text_part->flags |= flags; + + if (is_text == RSPAMD_MESSAGE_PART_IS_TEXT_HTML) { + if (!rspamd_message_process_html_text_part(task, text_part, cur_url_order)) { + return FALSE; + } + } + else { + if (!rspamd_message_process_plain_text_part(task, text_part)) { + return FALSE; + } + } + + g_ptr_array_add(MESSAGE_FIELD(task, text_parts), text_part); + mime_part->part_type = RSPAMD_MIME_PART_TEXT; + mime_part->specific.txt = text_part; + + act = rspamd_check_gtube(task, text_part); + if (act != METRIC_ACTION_NOACTION) { + struct rspamd_action *action; + gdouble score = NAN; + + action = rspamd_config_get_action_by_type(task->cfg, act); + + if (action) { + score = action->threshold; + + rspamd_add_passthrough_result(task, action, + RSPAMD_PASSTHROUGH_CRITICAL, + score, "Gtube pattern", + "GTUBE", 0, NULL); + } + + rspamd_task_insert_result(task, GTUBE_SYMBOL, 0, NULL); + + return TRUE; + } + + /* Post process part */ + rspamd_normalize_text_part(task, text_part); + + if (!IS_TEXT_PART_HTML(text_part)) { + if (mime_part->parent_part) { + struct rspamd_mime_part *parent = mime_part->parent_part; + + if (IS_PART_MULTIPART(parent) && parent->specific.mp->children->len == 2) { + /* + * Use strict extraction mode: we will extract missing urls from + * an html part if needed + */ + rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order, + RSPAMD_URL_FIND_STRICT); + } + else { + /* + * Fall back to full text extraction using TLD patterns + */ + rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order, + 
RSPAMD_URL_FIND_ALL); + } + } + else { + /* + * Fall back to full text extraction using TLD patterns + */ + rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order, + RSPAMD_URL_FIND_ALL); + } + } + else { + rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order, + RSPAMD_URL_FIND_STRICT); + } + + if (text_part->exceptions) { + text_part->exceptions = g_list_sort(text_part->exceptions, + exceptions_compare_func); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) g_list_free, + text_part->exceptions); + } + + rspamd_mime_part_create_words(task, text_part); + + return TRUE; +} + +/* Creates message from various data using libmagic to detect type */ +static void +rspamd_message_from_data(struct rspamd_task *task, const guchar *start, + gsize len) +{ + struct rspamd_content_type *ct = NULL; + struct rspamd_mime_part *part; + const char *mb = "application/octet-stream"; + gchar *mid; + rspamd_ftok_t srch, *tok; + gchar cdbuf[1024]; + + g_assert(start != NULL); + + part = rspamd_mempool_alloc0(task->task_pool, sizeof(*part)); + + part->raw_data.begin = start; + part->raw_data.len = len; + part->parsed_data.begin = start; + part->parsed_data.len = len; + part->part_number = MESSAGE_FIELD(task, parts)->len; + part->urls = g_ptr_array_new(); + part->raw_headers = rspamd_message_headers_new(); + part->headers_order = NULL; + + tok = rspamd_task_get_request_header(task, "Content-Type"); + + if (tok) { + /* We have Content-Type defined */ + ct = rspamd_content_type_parse(tok->begin, tok->len, + task->task_pool); + part->ct = ct; + } + else if (task->cfg && task->cfg->libs_ctx) { + lua_State *L = task->cfg->lua_state; + + if (rspamd_lua_require_function(L, + "lua_magic", "detect_mime_part")) { + + struct rspamd_mime_part **pmime; + struct rspamd_task **ptask; + + pmime = lua_newuserdata(L, sizeof(struct rspamd_mime_part *)); + rspamd_lua_setclass(L, "rspamd{mimepart}", -1); + *pmime = part; + ptask = 
lua_newuserdata(L, sizeof(struct rspamd_task *)); + rspamd_lua_setclass(L, "rspamd{task}", -1); + *ptask = task; + + if (lua_pcall(L, 2, 2, 0) != 0) { + msg_err_task("cannot detect type: %s", lua_tostring(L, -1)); + } + else { + if (lua_istable(L, -1)) { + lua_pushstring(L, "ct"); + lua_gettable(L, -2); + + if (lua_isstring(L, -1)) { + mb = rspamd_mempool_strdup(task->task_pool, + lua_tostring(L, -1)); + } + } + } + + lua_settop(L, 0); + } + else { + msg_err_task("cannot require lua_magic.detect_mime_part"); + } + + if (mb) { + srch.begin = mb; + srch.len = strlen(mb); + ct = rspamd_content_type_parse(srch.begin, srch.len, + task->task_pool); + + if (!part->ct) { + msg_info_task("construct fake mime of type: %s", mb); + part->ct = ct; + } + else { + /* Check sanity */ + if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) { + RSPAMD_FTOK_FROM_STR(&srch, "application"); + + if (rspamd_ftok_cmp(&ct->type, &srch) == 0) { + msg_info_task("construct fake mime of type: %s", mb); + part->ct = ct; + } + } + else { + msg_info_task("construct fake mime of type: %T/%T, detected %s", + &part->ct->type, &part->ct->subtype, mb); + } + } + + part->detected_ct = ct; + } + } + + + tok = rspamd_task_get_request_header(task, "Filename"); + + if (tok) { + rspamd_snprintf(cdbuf, sizeof(cdbuf), "inline; filename=\"%T\"", tok); + } + else { + rspamd_snprintf(cdbuf, sizeof(cdbuf), "inline"); + } + + part->cd = rspamd_content_disposition_parse(cdbuf, strlen(cdbuf), + task->task_pool); + + g_ptr_array_add(MESSAGE_FIELD(task, parts), part); + rspamd_mime_parser_calc_digest(part); + + /* Generate message ID */ + mid = rspamd_mime_message_id_generate("localhost.localdomain"); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) g_free, mid); + MESSAGE_FIELD(task, message_id) = mid; + task->queue_id = mid; +} + +static void +rspamd_message_dtor(struct rspamd_message *msg) +{ + guint i; + struct rspamd_mime_part *p; + struct rspamd_mime_text_part *tp; + + + 
PTR_ARRAY_FOREACH(msg->parts, i, p) + { + if (p->raw_headers) { + rspamd_message_headers_unref(p->raw_headers); + } + + if (IS_PART_MULTIPART(p)) { + if (p->specific.mp->children) { + g_ptr_array_free(p->specific.mp->children, TRUE); + } + } + + if (p->part_type == RSPAMD_MIME_PART_CUSTOM_LUA && + p->specific.lua_specific.cbref != -1) { + luaL_unref(msg->task->cfg->lua_state, + LUA_REGISTRYINDEX, + p->specific.lua_specific.cbref); + } + + if (p->urls) { + g_ptr_array_unref(p->urls); + } + } + + PTR_ARRAY_FOREACH(msg->text_parts, i, tp) + { + if (tp->utf_words) { + g_array_free(tp->utf_words, TRUE); + } + if (tp->normalized_hashes) { + g_array_free(tp->normalized_hashes, TRUE); + } + if (tp->languages) { + g_ptr_array_unref(tp->languages); + } + } + + rspamd_message_headers_unref(msg->raw_headers); + + g_ptr_array_unref(msg->text_parts); + g_ptr_array_unref(msg->parts); + + kh_destroy(rspamd_url_hash, msg->urls); +} + +struct rspamd_message * +rspamd_message_new(struct rspamd_task *task) +{ + struct rspamd_message *msg; + + msg = rspamd_mempool_alloc0(task->task_pool, sizeof(*msg)); + + msg->raw_headers = rspamd_message_headers_new(); + msg->urls = kh_init(rspamd_url_hash); + msg->parts = g_ptr_array_sized_new(4); + msg->text_parts = g_ptr_array_sized_new(2); + msg->task = task; + + REF_INIT_RETAIN(msg, rspamd_message_dtor); + + return msg; +} + +gboolean +rspamd_message_parse(struct rspamd_task *task) +{ + const gchar *p; + gsize len; + guint i; + GError *err = NULL; + guint64 n[2], seed; + + if (RSPAMD_TASK_IS_EMPTY(task)) { + /* Don't do anything with empty task */ + task->flags |= RSPAMD_TASK_FLAG_SKIP_PROCESS; + return TRUE; + } + + p = task->msg.begin; + len = task->msg.len; + + /* Skip any space characters to avoid some bad messages to be unparsed */ + while (len > 0 && g_ascii_isspace(*p)) { + p++; + len--; + } + + /* + * Exim somehow uses mailbox format for messages being scanned: + * From xxx@xxx.com Fri May 13 19:08:48 2016 + * + * So we check if a task 
has this line to avoid possible issues + */ + if (len > sizeof("From ") - 1) { + if (memcmp(p, "From ", sizeof("From ") - 1) == 0) { + /* Skip to CRLF */ + msg_info_task("mailbox input detected, enable workaround"); + p += sizeof("From ") - 1; + len -= sizeof("From ") - 1; + + while (len > 0 && *p != '\n') { + p++; + len--; + } + while (len > 0 && g_ascii_isspace(*p)) { + p++; + len--; + } + } + } + + task->msg.begin = p; + task->msg.len = len; + + /* Cleanup old message */ + if (task->message) { + rspamd_message_unref(task->message); + } + + task->message = rspamd_message_new(task); + + if (task->flags & RSPAMD_TASK_FLAG_MIME) { + enum rspamd_mime_parse_error ret; + + debug_task("construct mime parser from string length %d", + (gint) task->msg.len); + ret = rspamd_mime_parse_task(task, &err); + + switch (ret) { + case RSPAMD_MIME_PARSE_FATAL: + msg_err_task("cannot construct mime from stream: %e", err); + + if (task->cfg && (!task->cfg->allow_raw_input)) { + msg_err_task("cannot construct mime from stream"); + if (err) { + task->err = err; + } + + return FALSE; + } + else { + task->flags &= ~RSPAMD_TASK_FLAG_MIME; + rspamd_message_from_data(task, p, len); + } + break; + case RSPAMD_MIME_PARSE_NESTING: + msg_warn_task("cannot construct full mime from stream: %e", err); + task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS; + break; + case RSPAMD_MIME_PARSE_OK: + default: + break; + } + + if (err) { + g_error_free(err); + } + } + else { + rspamd_message_from_data(task, p, len); + } + + + if (MESSAGE_FIELD(task, message_id) == NULL) { + MESSAGE_FIELD(task, message_id) = "undef"; + } + + debug_task("found %ud parts in message", MESSAGE_FIELD(task, parts)->len); + if (task->queue_id == NULL) { + task->queue_id = "undef"; + } + + rspamd_received_maybe_fix_task(task); + + struct rspamd_mime_part *part; + + /* Blake2b applied to string 'rspamd' */ + static const guchar RSPAMD_ALIGNED(32) hash_key[] = { + 0xef, + 0x43, + 0xae, + 0x80, + 0xcc, + 0x8d, + 0xc3, + 0x4c, + 0x6f, + 
0x1b, + 0xd6, + 0x18, + 0x1b, + 0xae, + 0x87, + 0x74, + 0x0c, + 0xca, + 0xf7, + 0x8e, + 0x5f, + 0x2e, + 0x54, + 0x32, + 0xf6, + 0x79, + 0xb9, + 0x27, + 0x26, + 0x96, + 0x20, + 0x92, + 0x70, + 0x07, + 0x85, + 0xeb, + 0x83, + 0xf7, + 0x89, + 0xe0, + 0xd7, + 0x32, + 0x2a, + 0xd2, + 0x1a, + 0x64, + 0x41, + 0xef, + 0x49, + 0xff, + 0xc3, + 0x8c, + 0x54, + 0xf9, + 0x67, + 0x74, + 0x30, + 0x1e, + 0x70, + 0x2e, + 0xb7, + 0x12, + 0x09, + 0xfe, + }; + + memcpy(&seed, hash_key, sizeof(seed)); + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part) + { + n[0] = t1ha2_atonce128(&n[1], + part->digest, sizeof(part->digest), + seed); + + seed = n[0] ^ n[1]; + } + + memcpy(MESSAGE_FIELD(task, digest), n, sizeof(n)); + + if (MESSAGE_FIELD(task, subject)) { + p = MESSAGE_FIELD(task, subject); + len = strlen(p); + n[0] = t1ha2_atonce128(&n[1], + p, len, + seed); + memcpy(MESSAGE_FIELD(task, digest), n, sizeof(n)); + } + + if (task->queue_id) { + msg_info_task("loaded message; id: <%s>; queue-id: <%s>; size: %z; " + "checksum: <%*xs>", + MESSAGE_FIELD(task, message_id), task->queue_id, task->msg.len, + (gint) sizeof(MESSAGE_FIELD(task, digest)), MESSAGE_FIELD(task, digest)); + } + else { + msg_info_task("loaded message; id: <%s>; size: %z; " + "checksum: <%*xs>", + MESSAGE_FIELD(task, message_id), task->msg.len, + (gint) sizeof(MESSAGE_FIELD(task, digest)), MESSAGE_FIELD(task, digest)); + } + + return TRUE; +} + + +/* + * A helper structure to store text parts positions, if it was C++, I could just use std::pair, + * but here I have to make it all manually, sigh... 
+ */ +struct rspamd_mime_part_text_position { + unsigned pos; + enum rspamd_message_part_is_text_result res; +}; + +/* Place html parts first during analysis */ +static int +rspamd_mime_text_part_position_compare_func(const void *v1, const void *v2) +{ + const struct rspamd_mime_part_text_position *p1 = (const struct rspamd_mime_part_text_position *) v1; + const struct rspamd_mime_part_text_position *p2 = (const struct rspamd_mime_part_text_position *) v2; + + if (p1->res == p2->res) { + return (int) p2->pos - (int) p1->pos; + } + else { + if (p1->res == RSPAMD_MESSAGE_PART_IS_TEXT_HTML) { + return -1; + } + else { + return 1; + } + } +} + +void rspamd_message_process(struct rspamd_task *task) +{ + guint i; + struct rspamd_mime_text_part *p1, *p2; + gdouble diff, *pdiff; + guint tw, *ptw, dw; + struct rspamd_mime_part *part; + lua_State *L = NULL; + gint magic_func_pos = -1, content_func_pos = -1, old_top = -1, funcs_top = -1; + + if (task->cfg) { + L = task->cfg->lua_state; + } + + rspamd_archives_process(task); + + if (L) { + old_top = lua_gettop(L); + } + + if (L && rspamd_lua_require_function(L, + "lua_magic", "detect_mime_part")) { + magic_func_pos = lua_gettop(L); + } + else { + msg_err_task("cannot require lua_magic.detect_mime_part"); + } + + if (L && rspamd_lua_require_function(L, + "lua_content", "maybe_process_mime_part")) { + content_func_pos = lua_gettop(L); + } + else { + msg_err_task("cannot require lua_content.maybe_process_mime_part"); + } + + if (L) { + funcs_top = lua_gettop(L); + } + + GArray *detected_text_parts = g_array_sized_new(FALSE, FALSE, sizeof(struct rspamd_mime_part_text_position), 2); + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part) + { + if (magic_func_pos != -1 && part->parsed_data.len > 0) { + struct rspamd_mime_part **pmime; + struct rspamd_task **ptask; + + lua_pushcfunction(L, &rspamd_lua_traceback); + gint err_idx = lua_gettop(L); + lua_pushvalue(L, magic_func_pos); + pmime = lua_newuserdata(L, sizeof(struct 
rspamd_mime_part *)); + rspamd_lua_setclass(L, "rspamd{mimepart}", -1); + *pmime = part; + ptask = lua_newuserdata(L, sizeof(struct rspamd_task *)); + rspamd_lua_setclass(L, "rspamd{task}", -1); + *ptask = task; + + if (lua_pcall(L, 2, 2, err_idx) != 0) { + msg_err_task("cannot detect type: %s", lua_tostring(L, -1)); + } + else { + if (lua_istable(L, -1)) { + const gchar *mb; + + /* First returned value */ + part->detected_ext = rspamd_mempool_strdup(task->task_pool, + lua_tostring(L, -2)); + + lua_pushstring(L, "ct"); + lua_gettable(L, -2); + + if (lua_isstring(L, -1)) { + mb = lua_tostring(L, -1); + + if (mb) { + rspamd_ftok_t srch; + + srch.begin = mb; + srch.len = strlen(mb); + part->detected_ct = rspamd_content_type_parse(srch.begin, + srch.len, + task->task_pool); + } + } + + lua_pop(L, 1); + + lua_pushstring(L, "type"); + lua_gettable(L, -2); + + if (lua_isstring(L, -1)) { + part->detected_type = rspamd_mempool_strdup(task->task_pool, + lua_tostring(L, -1)); + } + + lua_pop(L, 1); + + lua_pushstring(L, "no_text"); + lua_gettable(L, -2); + + if (lua_isboolean(L, -1)) { + if (!!lua_toboolean(L, -1)) { + part->flags |= RSPAMD_MIME_PART_NO_TEXT_EXTRACTION; + } + } + + lua_pop(L, 1); + } + } + + lua_settop(L, funcs_top); + } + + /* Now detect content */ + if (content_func_pos != -1 && part->parsed_data.len > 0 && + part->part_type == RSPAMD_MIME_PART_UNDEFINED) { + struct rspamd_mime_part **pmime; + struct rspamd_task **ptask; + + lua_pushcfunction(L, &rspamd_lua_traceback); + gint err_idx = lua_gettop(L); + lua_pushvalue(L, content_func_pos); + pmime = lua_newuserdata(L, sizeof(struct rspamd_mime_part *)); + rspamd_lua_setclass(L, "rspamd{mimepart}", -1); + *pmime = part; + ptask = lua_newuserdata(L, sizeof(struct rspamd_task *)); + rspamd_lua_setclass(L, "rspamd{task}", -1); + *ptask = task; + + if (lua_pcall(L, 2, 0, err_idx) != 0) { + msg_err_task("cannot detect content: %s", lua_tostring(L, -1)); + } + + lua_settop(L, funcs_top); + } + + /* Try to detect 
image before checking for text */ + rspamd_images_process_mime_part_maybe(task, part); + + if (part->part_type == RSPAMD_MIME_PART_UNDEFINED && + !(part->flags & RSPAMD_MIME_PART_NO_TEXT_EXTRACTION)) { + enum rspamd_message_part_is_text_result res = rspamd_message_part_can_be_parsed_as_text(task, part); + + if (res != RSPAMD_MESSAGE_PART_IS_NOT_TEXT) { + struct rspamd_mime_part_text_position p = { + .pos = i, + .res = res}; + g_array_append_val(detected_text_parts, p); + } + } + } + + uint16_t cur_url_order = 0; + g_array_sort(detected_text_parts, rspamd_mime_text_part_position_compare_func); + /* One more iteration to process text parts in a more specific order */ + for (i = 0; i < detected_text_parts->len; i++) { + part = g_ptr_array_index(MESSAGE_FIELD(task, parts), + g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).pos); + rspamd_message_process_text_part_maybe(task, part, + g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).res, &cur_url_order); + } + + g_array_free(detected_text_parts, TRUE); + + if (old_top != -1) { + lua_settop(L, old_top); + } + + /* Parse urls inside Subject header */ + if (MESSAGE_FIELD(task, subject)) { + rspamd_url_find_multiple(task->task_pool, MESSAGE_FIELD(task, subject), + strlen(MESSAGE_FIELD(task, subject)), + RSPAMD_URL_FIND_STRICT, NULL, + rspamd_url_task_subject_callback, + task); + } + + /* Calculate average words length and number of short words */ + struct rspamd_mime_text_part *text_part; + gdouble *var; + guint total_words = 0; + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) + { + if (!text_part->language) { + rspamd_mime_part_detect_language(task, text_part); + } + + rspamd_mime_part_extract_words(task, text_part); + + if (text_part->utf_words) { + total_words += text_part->nwords; + } + } + + /* Calculate distance for 2-parts messages */ + if (i == 2) { + p1 = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 0); + p2 = 
g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 1); + + /* First of all check parent object */ + if (p1->mime_part->parent_part) { + rspamd_ftok_t srch; + + srch.begin = "alternative"; + srch.len = 11; + + if (rspamd_ftok_cmp(&p1->mime_part->parent_part->ct->subtype, &srch) == 0) { + if (!IS_TEXT_PART_EMPTY(p1) && !IS_TEXT_PART_EMPTY(p2) && + p1->normalized_hashes && p2->normalized_hashes) { + /* + * We also detect language on one part and propagate it to + * another one + */ + struct rspamd_mime_text_part *sel; + + /* Prefer HTML as text part is not displayed normally */ + if (IS_TEXT_PART_HTML(p1)) { + sel = p1; + } + else if (IS_TEXT_PART_HTML(p2)) { + sel = p2; + } + else { + if (p1->utf_content.len > p2->utf_content.len) { + sel = p1; + } + else { + sel = p2; + } + } + + if (sel->language && sel->language[0]) { + /* Propagate language */ + if (sel == p1) { + if (p2->languages) { + g_ptr_array_unref(p2->languages); + } + + p2->language = sel->language; + p2->languages = g_ptr_array_ref(sel->languages); + } + else { + if (p1->languages) { + g_ptr_array_unref(p1->languages); + } + + p1->language = sel->language; + p1->languages = g_ptr_array_ref(sel->languages); + } + } + + tw = p1->normalized_hashes->len + p2->normalized_hashes->len; + + if (tw > 0) { + dw = rspamd_words_levenshtein_distance(task, + p1->normalized_hashes, + p2->normalized_hashes); + diff = dw / (gdouble) tw; + + msg_debug_task( + "different words: %d, total words: %d, " + "got diff between parts of %.2f", + dw, tw, + diff); + + pdiff = rspamd_mempool_alloc(task->task_pool, + sizeof(gdouble)); + *pdiff = diff; + rspamd_mempool_set_variable(task->task_pool, + "parts_distance", + pdiff, + NULL); + ptw = rspamd_mempool_alloc(task->task_pool, + sizeof(gint)); + *ptw = tw; + rspamd_mempool_set_variable(task->task_pool, + "total_words", + ptw, + NULL); + } + } + } + } + else { + debug_task( + "message contains two parts but they are in different multi-parts"); + } + } + + if (total_words > 0) { + 
var = rspamd_mempool_get_variable(task->task_pool, + RSPAMD_MEMPOOL_AVG_WORDS_LEN); + + if (var) { + *var /= (double) total_words; + } + + var = rspamd_mempool_get_variable(task->task_pool, + RSPAMD_MEMPOOL_SHORT_WORDS_CNT); + + if (var) { + *var /= (double) total_words; + } + } + + rspamd_images_link(task); + rspamd_tokenize_meta_words(task); +} + + +struct rspamd_message * +rspamd_message_ref(struct rspamd_message *msg) +{ + REF_RETAIN(msg); + + return msg; +} + +void rspamd_message_unref(struct rspamd_message *msg) +{ + if (msg) { + REF_RELEASE(msg); + } +} + +void rspamd_message_update_digest(struct rspamd_message *msg, + const void *input, gsize len) +{ + guint64 n[2]; + /* Sanity */ + G_STATIC_ASSERT(sizeof(n) == sizeof(msg->digest)); + + memcpy(n, msg->digest, sizeof(msg->digest)); + n[0] = t1ha2_atonce128(&n[1], input, len, n[0]); + memcpy(msg->digest, n, sizeof(msg->digest)); +} |