diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
commit | 133a45c109da5310add55824db21af5239951f93 (patch) | |
tree | ba6ac4c0a950a0dda56451944315d66409923918 /src/libmime | |
parent | Initial commit. (diff) | |
download | rspamd-133a45c109da5310add55824db21af5239951f93.tar.xz rspamd-133a45c109da5310add55824db21af5239951f93.zip |
Adding upstream version 3.8.1.upstream/3.8.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/libmime')
33 files changed, 21349 insertions, 0 deletions
diff --git a/src/libmime/CMakeLists.txt b/src/libmime/CMakeLists.txt new file mode 100644 index 0000000..09e5dbf --- /dev/null +++ b/src/libmime/CMakeLists.txt @@ -0,0 +1,19 @@ +# Librspamd mime +SET(LIBRSPAMDMIMESRC + ${CMAKE_CURRENT_SOURCE_DIR}/received.cxx + ${CMAKE_CURRENT_SOURCE_DIR}/email_addr.c + ${CMAKE_CURRENT_SOURCE_DIR}/mime_expressions.c + ${CMAKE_CURRENT_SOURCE_DIR}/scan_result.c + ${CMAKE_CURRENT_SOURCE_DIR}/images.c + ${CMAKE_CURRENT_SOURCE_DIR}/message.c + ${CMAKE_CURRENT_SOURCE_DIR}/archives.c + ${CMAKE_CURRENT_SOURCE_DIR}/content_type.c + ${CMAKE_CURRENT_SOURCE_DIR}/mime_headers.c + ${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c + ${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c + ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection.c + ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection_fasttext.cxx + ${CMAKE_CURRENT_SOURCE_DIR}/mime_string.cxx + ) + +SET(RSPAMD_MIME ${LIBRSPAMDMIMESRC} PARENT_SCOPE)
\ No newline at end of file diff --git a/src/libmime/archives.c b/src/libmime/archives.c new file mode 100644 index 0000000..ea0ea55 --- /dev/null +++ b/src/libmime/archives.c @@ -0,0 +1,2057 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "message.h" +#include "task.h" +#include "archives.h" +#include "libmime/mime_encoding.h" +#include <unicode/uchar.h> +#include <unicode/utf8.h> +#include <unicode/utf16.h> +#include <unicode/ucnv.h> + +#define msg_debug_archive(...) 
rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_archive_log_id, "archive", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(archive) + +static void +rspamd_archive_dtor(gpointer p) +{ + struct rspamd_archive *arch = p; + struct rspamd_archive_file *f; + guint i; + + for (i = 0; i < arch->files->len; i++) { + f = g_ptr_array_index(arch->files, i); + + if (f->fname) { + g_string_free(f->fname, TRUE); + } + + g_free(f); + } + + g_ptr_array_free(arch->files, TRUE); +} + +static bool +rspamd_archive_file_try_utf(struct rspamd_task *task, + struct rspamd_archive *arch, + struct rspamd_archive_file *fentry, + const gchar *in, gsize inlen) +{ + const gchar *charset = NULL, *p, *end; + GString *res; + + charset = rspamd_mime_charset_find_by_content(in, inlen, TRUE); + + if (charset) { + UChar *tmp; + UErrorCode uc_err = U_ZERO_ERROR; + gint32 r, clen, dlen; + struct rspamd_charset_converter *conv; + UConverter *utf8_converter; + + conv = rspamd_mime_get_converter_cached(charset, task->task_pool, + TRUE, &uc_err); + utf8_converter = rspamd_get_utf8_converter(); + + if (conv == NULL) { + msg_info_task("cannot open converter for %s: %s", + charset, u_errorName(uc_err)); + fentry->flags |= RSPAMD_ARCHIVE_FILE_OBFUSCATED; + fentry->fname = g_string_new_len(in, inlen); + + return false; + } + + tmp = g_malloc(sizeof(*tmp) * (inlen + 1)); + r = rspamd_converter_to_uchars(conv, tmp, inlen + 1, + in, inlen, &uc_err); + if (!U_SUCCESS(uc_err)) { + msg_info_task("cannot convert data to unicode from %s: %s", + charset, u_errorName(uc_err)); + g_free(tmp); + + fentry->flags |= RSPAMD_ARCHIVE_FILE_OBFUSCATED; + fentry->fname = g_string_new_len(in, inlen); + + return NULL; + } + + int i = 0; + + while (i < r) { + UChar32 uc; + + U16_NEXT(tmp, i, r, uc); + + if (IS_ZERO_WIDTH_SPACE(uc) || u_iscntrl(uc)) { + msg_info_task("control character in archive file name found: 0x%02xd " + "(filename=%T)", + uc, arch->archive_name); + fentry->flags |= 
RSPAMD_ARCHIVE_FILE_OBFUSCATED; + break; + } + } + + clen = ucnv_getMaxCharSize(utf8_converter); + dlen = UCNV_GET_MAX_BYTES_FOR_STRING(r, clen); + res = g_string_sized_new(dlen); + r = ucnv_fromUChars(utf8_converter, res->str, dlen, tmp, r, &uc_err); + + if (!U_SUCCESS(uc_err)) { + msg_info_task("cannot convert data from unicode from %s: %s", + charset, u_errorName(uc_err)); + g_free(tmp); + g_string_free(res, TRUE); + fentry->flags |= RSPAMD_ARCHIVE_FILE_OBFUSCATED; + fentry->fname = g_string_new_len(in, inlen); + + return NULL; + } + + g_free(tmp); + res->len = r; + + msg_debug_archive("converted from %s to UTF-8 inlen: %z, outlen: %d", + charset, inlen, r); + fentry->fname = res; + } + else { + /* Convert unsafe characters to '?' */ + res = g_string_sized_new(inlen); + p = in; + end = in + inlen; + + while (p < end) { + if (g_ascii_isgraph(*p)) { + g_string_append_c(res, *p); + } + else { + g_string_append_c(res, '?'); + + if (*p < 0x7f && (g_ascii_iscntrl(*p) || *p == '\0')) { + if (!(fentry->flags & RSPAMD_ARCHIVE_FILE_OBFUSCATED)) { + msg_info_task("suspicious character in archive file name found: 0x%02xd " + "(filename=%T)", + (int) *p, arch->archive_name); + fentry->flags |= RSPAMD_ARCHIVE_FILE_OBFUSCATED; + } + } + } + + p++; + } + fentry->fname = res; + } + + return true; +} + +static void +rspamd_archive_process_zip(struct rspamd_task *task, + struct rspamd_mime_part *part) +{ + const guchar *p, *start, *end, *eocd = NULL, *cd; + const guint32 eocd_magic = 0x06054b50, cd_basic_len = 46; + const guchar cd_magic[] = {0x50, 0x4b, 0x01, 0x02}; + const guint max_processed = 1024; + guint32 cd_offset, cd_size, comp_size, uncomp_size, processed = 0; + guint16 extra_len, fname_len, comment_len; + struct rspamd_archive *arch; + struct rspamd_archive_file *f = NULL; + + /* Zip files have interesting data at the end of archive */ + p = part->parsed_data.begin + part->parsed_data.len - 1; + start = part->parsed_data.begin; + end = p; + + /* Search for EOCD: + * 22 
bytes is a typical size of eocd without a comment and + * end points one byte after the last character + */ + p -= 21; + + while (p > start + sizeof(guint32)) { + guint32 t; + + if (processed > max_processed) { + break; + } + + /* XXX: not an efficient approach */ + memcpy(&t, p, sizeof(t)); + + if (GUINT32_FROM_LE(t) == eocd_magic) { + eocd = p; + break; + } + + p--; + processed++; + } + + + if (eocd == NULL) { + /* Not a zip file */ + msg_info_task("zip archive is invalid (no EOCD)"); + + return; + } + + if (end - eocd < 21) { + msg_info_task("zip archive is invalid (short EOCD)"); + + return; + } + + + memcpy(&cd_size, eocd + 12, sizeof(cd_size)); + cd_size = GUINT32_FROM_LE(cd_size); + memcpy(&cd_offset, eocd + 16, sizeof(cd_offset)); + cd_offset = GUINT32_FROM_LE(cd_offset); + + /* We need to check sanity as well */ + if (cd_offset + cd_size > (guint) (eocd - start)) { + msg_info_task("zip archive is invalid (bad size/offset for CD)"); + + return; + } + + cd = start + cd_offset; + + arch = rspamd_mempool_alloc0(task->task_pool, sizeof(*arch)); + arch->files = g_ptr_array_new(); + arch->type = RSPAMD_ARCHIVE_ZIP; + if (part->cd) { + arch->archive_name = &part->cd->filename; + } + rspamd_mempool_add_destructor(task->task_pool, rspamd_archive_dtor, + arch); + + while (cd < start + cd_offset + cd_size) { + guint16 flags; + + /* Read central directory record */ + if (eocd - cd < cd_basic_len || + memcmp(cd, cd_magic, sizeof(cd_magic)) != 0) { + msg_info_task("zip archive is invalid (bad cd record)"); + + return; + } + + memcpy(&flags, cd + 8, sizeof(guint16)); + flags = GUINT16_FROM_LE(flags); + memcpy(&comp_size, cd + 20, sizeof(guint32)); + comp_size = GUINT32_FROM_LE(comp_size); + memcpy(&uncomp_size, cd + 24, sizeof(guint32)); + uncomp_size = GUINT32_FROM_LE(uncomp_size); + memcpy(&fname_len, cd + 28, sizeof(fname_len)); + fname_len = GUINT16_FROM_LE(fname_len); + memcpy(&extra_len, cd + 30, sizeof(extra_len)); + extra_len = GUINT16_FROM_LE(extra_len); + 
memcpy(&comment_len, cd + 32, sizeof(comment_len)); + comment_len = GUINT16_FROM_LE(comment_len); + + if (cd + fname_len + comment_len + extra_len + cd_basic_len > eocd) { + msg_info_task("zip archive is invalid (too large cd record)"); + + return; + } + + f = g_malloc0(sizeof(*f)); + rspamd_archive_file_try_utf(task, arch, f, cd + cd_basic_len, fname_len); + + f->compressed_size = comp_size; + f->uncompressed_size = uncomp_size; + + if (flags & 0x41u) { + f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED; + } + + if (f->fname) { + if (f->flags & RSPAMD_ARCHIVE_FILE_OBFUSCATED) { + arch->flags |= RSPAMD_ARCHIVE_HAS_OBFUSCATED_FILES; + } + + g_ptr_array_add(arch->files, f); + msg_debug_archive("found file in zip archive: %v", f->fname); + } + else { + g_free(f); + + return; + } + + /* Process extra fields */ + const guchar *extra = cd + fname_len + cd_basic_len; + p = extra; + + while (p + sizeof(guint16) * 2 < extra + extra_len) { + guint16 hid, hlen; + + memcpy(&hid, p, sizeof(guint16)); + hid = GUINT16_FROM_LE(hid); + memcpy(&hlen, p + sizeof(guint16), sizeof(guint16)); + hlen = GUINT16_FROM_LE(hlen); + + if (hid == 0x0017) { + f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED; + } + + p += hlen + sizeof(guint16) * 2; + } + + cd += fname_len + comment_len + extra_len + cd_basic_len; + } + + part->part_type = RSPAMD_MIME_PART_ARCHIVE; + part->specific.arch = arch; + + arch->size = part->parsed_data.len; +} +

/**
 * Decode a RAR variable length integer.
 *
 * @param start  first byte of the encoded integer
 * @param remain number of readable bytes starting at `start`
 * @param res    receives the decoded value; written only on success
 * @return number of bytes consumed (at least 1), or -1 when the input ends
 *         before the terminating byte or the sequence is longer than the
 *         10 bytes needed for a 64 bit value
 */
static inline gint
rspamd_archive_rar_read_vint(const guchar *start, gsize remain, guint64 *res)
{
	/*
	 * From http://www.rarlab.com/technote.htm:
	 * Variable length integer. Can include one or more bytes, where
	 * lower 7 bits of every byte contain integer data and highest bit
	 * in every byte is the continuation flag.
	 * If highest bit is 0, this is the last byte in sequence.
	 * So first byte contains 7 least significant bits of integer and
	 * continuation flag. Second byte, if present, contains next 7 bits and so on.
	 */
	guint64 t = 0;
	guint shift = 0;
	const guchar *p = start;

	/* 10 groups of 7 bits (shift 0..63) cover a 64 bit integer */
	while (remain > 0 && shift <= 63) {
		guchar c = *p++;

		remain--;
		t |= ((guint64) (c & 0x7f)) << shift;

		if (!(c & 0x80)) {
			/*
			 * Last byte of the sequence. The value has been assembled
			 * arithmetically, so it is already in host byte order and
			 * must not be byte swapped.
			 */
			*res = t;

			return p - start;
		}

		shift += 7;
	}

	/* Either ran out of input or the sequence never terminated */
	return -1;
}

/*
 * Helpers for the RAR parsers below. They expect `p`, `end` (and `r`, `vint`
 * for the vint readers) to be declared in the calling function, together with
 * `task` for logging, and they return from the calling (void) function on
 * malformed input.
 */

/* Advance p by n bytes; n must be positive and must not pass end */
#define RAR_SKIP_BYTES(n)                                                 \
	do {                                                                  \
		if ((n) <= 0) {                                                   \
			msg_debug_archive("rar archive is invalid (bad skip value)"); \
			return;                                                       \
		}                                                                 \
		if ((gsize) (end - p) < (n)) {                                    \
			msg_debug_archive("rar archive is invalid (truncated)");      \
			return;                                                       \
		}                                                                 \
		p += (n);                                                         \
	} while (0)

/* Read a vint at p into `vint` without advancing p */
#define RAR_READ_VINT()                                                    \
	do {                                                                   \
		r = rspamd_archive_rar_read_vint(p, end - p, &vint);               \
		if (r == -1) {                                                     \
			msg_debug_archive("rar archive is invalid (bad vint)");        \
			return;                                                        \
		}                                                                  \
		else if (r == 0) {                                                 \
			msg_debug_archive("rar archive is invalid (BAD vint offset)"); \
			return;                                                        \
		}                                                                  \
	} while (0)

/* Read a vint at p into `vint` and advance p past it */
#define RAR_READ_VINT_SKIP()                                        \
	do {                                                            \
		r = rspamd_archive_rar_read_vint(p, end - p, &vint);        \
		if (r == -1) {                                              \
			msg_debug_archive("rar archive is invalid (bad vint)"); \
			return;                                                 \
		}                                                           \
		p += r;                                                     \
	} while (0)

/* Read a little-endian 16 bit integer at p into n and advance p */
#define RAR_READ_UINT16(n)                                           \
	do {                                                             \
		if (end - p < (glong) sizeof(guint16)) {                     \
			msg_debug_archive("rar archive is invalid (bad int16)"); \
			return;                                                  \
		}                                                            \
		n = p[0] + (p[1] << 8);                                      \
		p += sizeof(guint16);                                        \
	} while (0)

/* Read a little-endian 32 bit integer at p into n and advance p */
#define RAR_READ_UINT32(n)                                                                     \
	do {                                                                                       \
		if (end - p < (glong) sizeof(guint32)) {                                               \
			msg_debug_archive("rar archive is invalid (bad int32)");                           \
			return;                                                                            \
		}                                                                                      \
		n = (guint) p[0] + ((guint) p[1] << 8) + ((guint) p[2] << 16) + ((guint) p[3] << 24); \
		p += sizeof(guint32);                                                                  \
	} while (0)

+static void +rspamd_archive_process_rar_v4(struct rspamd_task *task, const guchar *start, + const guchar *end, struct rspamd_mime_part *part) +{ + const guchar *p = start, *start_section; + guint8 type; + guint flags; + guint64 sz, comp_sz = 0, uncomp_sz 
= 0; + struct rspamd_archive *arch; + struct rspamd_archive_file *f; + + arch = rspamd_mempool_alloc0(task->task_pool, sizeof(*arch)); + arch->files = g_ptr_array_new(); + arch->type = RSPAMD_ARCHIVE_RAR; + if (part->cd) { + arch->archive_name = &part->cd->filename; + } + rspamd_mempool_add_destructor(task->task_pool, rspamd_archive_dtor, + arch); + + while (p < end) { + /* Crc16 */ + start_section = p; + RAR_SKIP_BYTES(sizeof(guint16)); + type = *p; + p++; + RAR_READ_UINT16(flags); + + if (type == 0x73) { + /* Main header, check for encryption */ + if (flags & 0x80) { + arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED; + goto end; + } + } + + RAR_READ_UINT16(sz); + + if (flags & 0x8000) { + /* We also need to read ADD_SIZE element */ + guint32 tmp; + + RAR_READ_UINT32(tmp); + sz += tmp; + /* This is also used as PACK_SIZE */ + comp_sz = tmp; + } + + if (sz == 0) { + /* Zero sized block - error */ + msg_debug_archive("rar archive is invalid (zero size block)"); + + return; + } + + if (type == 0x74) { + guint fname_len; + + /* File header */ + /* Uncompressed size */ + RAR_READ_UINT32(uncomp_sz); + /* Skip to NAME_SIZE element */ + RAR_SKIP_BYTES(11); + RAR_READ_UINT16(fname_len); + + if (fname_len == 0 || fname_len > (gsize) (end - p)) { + msg_debug_archive("rar archive is invalid (bad filename size: %d)", + fname_len); + + return; + } + + /* Attrs */ + RAR_SKIP_BYTES(4); + + if (flags & 0x100) { + /* We also need to read HIGH_PACK_SIZE */ + guint32 tmp; + + RAR_READ_UINT32(tmp); + sz += tmp; + comp_sz += tmp; + /* HIGH_UNP_SIZE */ + RAR_READ_UINT32(tmp); + uncomp_sz += tmp; + } + + f = g_malloc0(sizeof(*f)); + + if (flags & 0x200) { + /* We have unicode + normal version */ + guchar *tmp; + + tmp = memchr(p, '\0', fname_len); + + if (tmp != NULL) { + /* Just use ASCII version */ + rspamd_archive_file_try_utf(task, arch, f, p, tmp - p); + msg_debug_archive("found ascii filename in rarv4 archive: %v", + f->fname); + } + else { + /* We have UTF8 filename, use it as is */ + 
rspamd_archive_file_try_utf(task, arch, f, p, fname_len); + msg_debug_archive("found utf filename in rarv4 archive: %v", + f->fname); + } + } + else { + rspamd_archive_file_try_utf(task, arch, f, p, fname_len); + msg_debug_archive("found ascii (old) filename in rarv4 archive: %v", + f->fname); + } + + f->compressed_size = comp_sz; + f->uncompressed_size = uncomp_sz; + + if (flags & 0x4) { + f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED; + } + + if (f->fname) { + if (f->flags & RSPAMD_ARCHIVE_FILE_OBFUSCATED) { + arch->flags |= RSPAMD_ARCHIVE_HAS_OBFUSCATED_FILES; + } + g_ptr_array_add(arch->files, f); + } + else { + g_free(f); + } + } + + p = start_section; + RAR_SKIP_BYTES(sz); + } + +end: + part->part_type = RSPAMD_MIME_PART_ARCHIVE; + part->specific.arch = arch; + arch->size = part->parsed_data.len; +} + +static void +rspamd_archive_process_rar(struct rspamd_task *task, + struct rspamd_mime_part *part) +{ + const guchar *p, *end, *section_start; + const guchar rar_v5_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x01, 0x00}, + rar_v4_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x00}; + const guint rar_encrypted_header = 4, rar_main_header = 1, + rar_file_header = 2; + guint64 vint, sz, comp_sz = 0, uncomp_sz = 0, flags = 0, type = 0, + extra_sz = 0; + struct rspamd_archive *arch; + struct rspamd_archive_file *f; + gint r; + + p = part->parsed_data.begin; + end = p + part->parsed_data.len; + + if ((gsize) (end - p) <= sizeof(rar_v5_magic)) { + msg_debug_archive("rar archive is invalid (too small)"); + + return; + } + + if (memcmp(p, rar_v5_magic, sizeof(rar_v5_magic)) == 0) { + p += sizeof(rar_v5_magic); + } + else if (memcmp(p, rar_v4_magic, sizeof(rar_v4_magic)) == 0) { + p += sizeof(rar_v4_magic); + + rspamd_archive_process_rar_v4(task, p, end, part); + return; + } + else { + msg_debug_archive("rar archive is invalid (no rar magic)"); + + return; + } + + /* Rar v5 format */ + arch = rspamd_mempool_alloc0(task->task_pool, sizeof(*arch)); + arch->files = 
g_ptr_array_new(); + arch->type = RSPAMD_ARCHIVE_RAR; + if (part->cd) { + arch->archive_name = &part->cd->filename; + } + rspamd_mempool_add_destructor(task->task_pool, rspamd_archive_dtor, + arch); + + /* Now we can have either encryption header or archive header */ + /* Crc 32 */ + RAR_SKIP_BYTES(sizeof(guint32)); + /* Size */ + RAR_READ_VINT_SKIP(); + sz = vint; + /* Type */ + section_start = p; + RAR_READ_VINT_SKIP(); + type = vint; + /* Header flags */ + RAR_READ_VINT_SKIP(); + flags = vint; + + if (flags & 0x1) { + /* Have extra zone */ + RAR_READ_VINT_SKIP(); + } + if (flags & 0x2) { + /* Data zone is presented */ + RAR_READ_VINT_SKIP(); + sz += vint; + } + + if (type == rar_encrypted_header) { + /* We can't read any further information as archive is encrypted */ + arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED; + goto end; + } + else if (type != rar_main_header) { + msg_debug_archive("rar archive is invalid (bad main header)"); + + return; + } + + /* Nothing useful in main header */ + p = section_start; + RAR_SKIP_BYTES(sz); + + while (p < end) { + gboolean has_extra = FALSE; + /* Read the next header */ + /* Crc 32 */ + RAR_SKIP_BYTES(sizeof(guint32)); + /* Size */ + RAR_READ_VINT_SKIP(); + + sz = vint; + if (sz == 0) { + /* Zero sized block - error */ + msg_debug_archive("rar archive is invalid (zero size block)"); + + return; + } + + section_start = p; + /* Type */ + RAR_READ_VINT_SKIP(); + type = vint; + /* Header flags */ + RAR_READ_VINT_SKIP(); + flags = vint; + + if (flags & 0x1) { + /* Have extra zone */ + RAR_READ_VINT_SKIP(); + extra_sz = vint; + has_extra = TRUE; + } + + if (flags & 0x2) { + /* Data zone is presented */ + RAR_READ_VINT_SKIP(); + sz += vint; + comp_sz = vint; + } + + if (type != rar_file_header) { + p = section_start; + RAR_SKIP_BYTES(sz); + } + else { + /* We have a file header, go forward */ + guint64 fname_len; + bool is_directory = false; + + /* File header specific flags */ + RAR_READ_VINT_SKIP(); + flags = vint; + + /* Unpacked 
size */ + RAR_READ_VINT_SKIP(); + uncomp_sz = vint; + /* Attributes */ + RAR_READ_VINT_SKIP(); + + if (flags & 0x2) { + /* Unix mtime */ + RAR_SKIP_BYTES(sizeof(guint32)); + } + if (flags & 0x4) { + /* Crc32 */ + RAR_SKIP_BYTES(sizeof(guint32)); + } + if (flags & 0x1) { + /* Ignore directories for sanity purposes */ + is_directory = true; + msg_debug_archive("skip directory record in a rar archive"); + } + + if (!is_directory) { + /* Compression */ + RAR_READ_VINT_SKIP(); + /* Host OS */ + RAR_READ_VINT_SKIP(); + /* Filename length (finally!) */ + RAR_READ_VINT_SKIP(); + fname_len = vint; + + if (fname_len == 0 || fname_len > (gsize) (end - p)) { + msg_debug_archive("rar archive is invalid (bad filename size)"); + + return; + } + + f = g_malloc0(sizeof(*f)); + f->uncompressed_size = uncomp_sz; + f->compressed_size = comp_sz; + rspamd_archive_file_try_utf(task, arch, f, p, fname_len); + + if (f->fname) { + msg_debug_archive("added rarv5 file: %v", f->fname); + g_ptr_array_add(arch->files, f); + if (f->flags & RSPAMD_ARCHIVE_FILE_OBFUSCATED) { + arch->flags |= RSPAMD_ARCHIVE_HAS_OBFUSCATED_FILES; + } + } + else { + g_free(f); + f = NULL; + } + + if (f && has_extra && extra_sz > 0 && + p + fname_len + extra_sz < end) { + /* Try to find encryption record in extra field */ + const guchar *ex = p + fname_len; + + while (ex < p + extra_sz) { + const guchar *t; + gint64 cur_sz = 0, sec_type = 0; + + r = rspamd_archive_rar_read_vint(ex, extra_sz, &cur_sz); + if (r == -1) { + msg_debug_archive("rar archive is invalid (bad vint)"); + return; + } + + t = ex + r; + + r = rspamd_archive_rar_read_vint(t, extra_sz - r, &sec_type); + if (r == -1) { + msg_debug_archive("rar archive is invalid (bad vint)"); + return; + } + + if (sec_type == 0x01) { + f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED; + arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED; + break; + } + + ex += cur_sz; + } + } + } + + /* Restore p to the beginning of the header */ + p = section_start; + RAR_SKIP_BYTES(sz); + } + } + 
+end: + part->part_type = RSPAMD_MIME_PART_ARCHIVE; + part->specific.arch = arch; + arch->size = part->parsed_data.len; +} + +static inline gint +rspamd_archive_7zip_read_vint(const guchar *start, gsize remain, guint64 *res) +{ + /* + * REAL_UINT64 means real UINT64. + * UINT64 means real UINT64 encoded with the following scheme: + * + * Size of encoding sequence depends from first byte: + * First_Byte Extra_Bytes Value + * (binary) + * 0xxxxxxx : ( xxxxxxx ) + * 10xxxxxx BYTE y[1] : ( xxxxxx << (8 * 1)) + y + * 110xxxxx BYTE y[2] : ( xxxxx << (8 * 2)) + y + * ... + * 1111110x BYTE y[6] : ( x << (8 * 6)) + y + * 11111110 BYTE y[7] : y + * 11111111 BYTE y[8] : y + */ + guchar t; + + if (remain == 0) { + return -1; + } + + t = *start; + + if (!isset(&t, 7)) { + /* Trivial case */ + *res = t; + return 1; + } + else if (t == 0xFF) { + if (remain >= sizeof(guint64) + 1) { + memcpy(res, start + 1, sizeof(guint64)); + *res = GUINT64_FROM_LE(*res); + + return sizeof(guint64) + 1; + } + } + else { + gint cur_bit = 6, intlen = 1; + const guchar bmask = 0xFF; + guint64 tgt; + + while (cur_bit > 0) { + if (!isset(&t, cur_bit)) { + if (remain >= intlen + 1) { + memcpy(&tgt, start + 1, intlen); + tgt = GUINT64_FROM_LE(tgt); + /* Shift back */ + tgt >>= sizeof(tgt) - NBBY * intlen; + /* Add masked value */ + tgt += (guint64) (t & (bmask >> (NBBY - cur_bit))) + << (NBBY * intlen); + *res = tgt; + + return intlen + 1; + } + } + cur_bit--; + intlen++; + } + } + + return -1; +} + +#define SZ_READ_VINT_SKIP() \ + do { \ + r = rspamd_archive_7zip_read_vint(p, end - p, &vint); \ + if (r == -1) { \ + msg_debug_archive("7z archive is invalid (bad vint)"); \ + return; \ + } \ + p += r; \ + } while (0) +#define SZ_READ_VINT(var) \ + do { \ + int r; \ + r = rspamd_archive_7zip_read_vint(p, end - p, &(var)); \ + if (r == -1) { \ + msg_debug_archive("7z archive is invalid (bad vint): %s", G_STRLOC); \ + return NULL; \ + } \ + p += r; \ + } while (0) + +#define SZ_READ_UINT64(n) \ + do { \ + if 
(end - p < (goffset) sizeof(guint64)) { \ + msg_debug_archive("7zip archive is invalid (bad uint64): %s", G_STRLOC); \ + return; \ + } \ + memcpy(&(n), p, sizeof(guint64)); \ + n = GUINT64_FROM_LE(n); \ + p += sizeof(guint64); \ + } while (0) +#define SZ_SKIP_BYTES(n) \ + do { \ + if (end - p >= (n)) { \ + p += (n); \ + } \ + else { \ + msg_debug_archive("7zip archive is invalid (truncated); wanted to read %d bytes, %d avail: %s", (gint) (n), (gint) (end - p), G_STRLOC); \ + return NULL; \ + } \ + } while (0) + +enum rspamd_7zip_header_mark { + kEnd = 0x00, + kHeader = 0x01, + kArchiveProperties = 0x02, + kAdditionalStreamsInfo = 0x03, + kMainStreamsInfo = 0x04, + kFilesInfo = 0x05, + kPackInfo = 0x06, + kUnPackInfo = 0x07, + kSubStreamsInfo = 0x08, + kSize = 0x09, + kCRC = 0x0A, + kFolder = 0x0B, + kCodersUnPackSize = 0x0C, + kNumUnPackStream = 0x0D, + kEmptyStream = 0x0E, + kEmptyFile = 0x0F, + kAnti = 0x10, + kName = 0x11, + kCTime = 0x12, + kATime = 0x13, + kMTime = 0x14, + kWinAttributes = 0x15, + kComment = 0x16, + kEncodedHeader = 0x17, + kStartPos = 0x18, + kDummy = 0x19, +}; + + +#define _7Z_CRYPTO_MAIN_ZIP 0x06F10101 /* Main Zip crypto algo */ +#define _7Z_CRYPTO_RAR_29 0x06F10303 /* Rar29 AES-128 + (modified SHA-1) */ +#define _7Z_CRYPTO_AES_256_SHA_256 0x06F10701 /* AES-256 + SHA-256 */ + +#define IS_SZ_ENCRYPTED(codec_id) (((codec_id) == _7Z_CRYPTO_MAIN_ZIP) || \ + ((codec_id) == _7Z_CRYPTO_RAR_29) || \ + ((codec_id) == _7Z_CRYPTO_AES_256_SHA_256)) + +static const guchar * +rspamd_7zip_read_bits(struct rspamd_task *task, + const guchar *p, const guchar *end, + struct rspamd_archive *arch, guint nbits, + guint *pbits_set) +{ + unsigned mask = 0, avail = 0, i; + gboolean bit_set = 0; + + for (i = 0; i < nbits; i++) { + if (mask == 0) { + avail = *p; + SZ_SKIP_BYTES(1); + mask = 0x80; + } + + bit_set = (avail & mask) ? 
1 : 0; + + if (bit_set && pbits_set) { + (*pbits_set)++; + } + + mask >>= 1; + } + + return p; +} + +static const guchar * +rspamd_7zip_read_digest(struct rspamd_task *task, + const guchar *p, const guchar *end, + struct rspamd_archive *arch, + guint64 num_streams, + guint *pdigest_read) +{ + guchar all_defined = *p; + guint64 i; + guint num_defined = 0; + /* + * BYTE AllAreDefined + * if (AllAreDefined == 0) + * { + * for(NumStreams) + * BIT Defined + * } + * UINT32 CRCs[NumDefined] + */ + SZ_SKIP_BYTES(1); + + if (all_defined) { + num_defined = num_streams; + } + else { + if (num_streams > 8192) { + /* Gah */ + return NULL; + } + + p = rspamd_7zip_read_bits(task, p, end, arch, num_streams, &num_defined); + + if (p == NULL) { + return NULL; + } + } + + for (i = 0; i < num_defined; i++) { + SZ_SKIP_BYTES(sizeof(guint32)); + } + + if (pdigest_read) { + *pdigest_read = num_defined; + } + + return p; +} + +static const guchar * +rspamd_7zip_read_pack_info(struct rspamd_task *task, + const guchar *p, const guchar *end, + struct rspamd_archive *arch) +{ + guint64 pack_pos = 0, pack_streams = 0, i, cur_sz; + guint num_digests = 0; + guchar t; + /* + * UINT64 PackPos + * UINT64 NumPackStreams + * + * [] + * BYTE NID::kSize (0x09) + * UINT64 PackSizes[NumPackStreams] + * [] + * + * [] + * BYTE NID::kCRC (0x0A) + * PackStreamDigests[NumPackStreams] + * [] + * BYTE NID::kEnd + */ + + SZ_READ_VINT(pack_pos); + SZ_READ_VINT(pack_streams); + + while (p != NULL && p < end) { + t = *p; + SZ_SKIP_BYTES(1); + msg_debug_archive("7zip: read pack info %xc", t); + + switch (t) { + case kSize: + /* We need to skip pack_streams VINTS */ + for (i = 0; i < pack_streams; i++) { + SZ_READ_VINT(cur_sz); + } + break; + case kCRC: + /* CRCs are more complicated */ + p = rspamd_7zip_read_digest(task, p, end, arch, pack_streams, + &num_digests); + break; + case kEnd: + goto end; + break; + default: + p = NULL; + msg_debug_archive("bad 7zip type: %xc; %s", t, G_STRLOC); + goto end; + break; + } + 
} + +end: + + return p; +} + +static const guchar * +rspamd_7zip_read_folder(struct rspamd_task *task, + const guchar *p, const guchar *end, + struct rspamd_archive *arch, guint *pnstreams, guint *ndigests) +{ + guint64 ncoders = 0, i, j, noutstreams = 0, ninstreams = 0; + + SZ_READ_VINT(ncoders); + + for (i = 0; i < ncoders && p != NULL && p < end; i++) { + guint64 sz, tmp; + guchar t; + /* + * BYTE + * { + * 0:3 CodecIdSize + * 4: Is Complex Coder + * 5: There Are Attributes + * 6: Reserved + * 7: There are more alternative methods. (Not used anymore, must be 0). + * } + * BYTE CodecId[CodecIdSize] + * if (Is Complex Coder) + * { + * UINT64 NumInStreams; + * UINT64 NumOutStreams; + * } + * if (There Are Attributes) + * { + * UINT64 PropertiesSize + * BYTE Properties[PropertiesSize] + * } + */ + t = *p; + SZ_SKIP_BYTES(1); + sz = t & 0xF; + /* Codec ID */ + tmp = 0; + for (j = 0; j < sz; j++) { + tmp <<= 8; + tmp += p[j]; + } + + msg_debug_archive("7zip: read codec id: %L", tmp); + + if (IS_SZ_ENCRYPTED(tmp)) { + arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED; + } + + SZ_SKIP_BYTES(sz); + + if (t & (1u << 4)) { + /* Complex */ + SZ_READ_VINT(tmp); /* InStreams */ + ninstreams += tmp; + SZ_READ_VINT(tmp); /* OutStreams */ + noutstreams += tmp; + } + else { + /* XXX: is it correct ? */ + noutstreams++; + ninstreams++; + } + if (t & (1u << 5)) { + /* Attributes ... */ + SZ_READ_VINT(tmp); /* Size of attrs */ + SZ_SKIP_BYTES(tmp); + } + } + + if (noutstreams > 1) { + /* BindPairs, WTF, huh */ + for (i = 0; i < noutstreams - 1; i++) { + guint64 tmp; + + SZ_READ_VINT(tmp); + SZ_READ_VINT(tmp); + } + } + + gint64 npacked = (gint64) ninstreams - (gint64) noutstreams + 1; + msg_debug_archive("7zip: instreams=%L, outstreams=%L, packed=%L", + ninstreams, noutstreams, npacked); + + if (npacked > 1) { + /* Gah... 
*/ + for (i = 0; i < npacked; i++) { + guint64 tmp; + + SZ_READ_VINT(tmp); + } + } + + *pnstreams = noutstreams; + (*ndigests) += npacked; + + return p; +} + +static const guchar * +rspamd_7zip_read_coders_info(struct rspamd_task *task, + const guchar *p, const guchar *end, + struct rspamd_archive *arch, + guint *pnum_folders, guint *pnum_nodigest) +{ + guint64 num_folders = 0, i, tmp; + guchar t; + guint *folder_nstreams = NULL, num_digests = 0, digests_read = 0; + + while (p != NULL && p < end) { + /* + * BYTE NID::kFolder (0x0B) + * UINT64 NumFolders + * BYTE External + * switch(External) + * { + * case 0: + * Folders[NumFolders] + * case 1: + * UINT64 DataStreamIndex + * } + * BYTE ID::kCodersUnPackSize (0x0C) + * for(Folders) + * for(Folder.NumOutStreams) + * UINT64 UnPackSize; + * [] + * BYTE NID::kCRC (0x0A) + * UnPackDigests[NumFolders] + * [] + * BYTE NID::kEnd + */ + + t = *p; + SZ_SKIP_BYTES(1); + msg_debug_archive("7zip: read coders info %xc", t); + + switch (t) { + case kFolder: + SZ_READ_VINT(num_folders); + msg_debug_archive("7zip: nfolders=%L", num_folders); + + if (*p != 0) { + /* External folders */ + SZ_SKIP_BYTES(1); + SZ_READ_VINT(tmp); + } + else { + SZ_SKIP_BYTES(1); + + if (num_folders > 8192) { + /* Gah */ + return NULL; + } + + if (folder_nstreams) { + g_free(folder_nstreams); + } + + folder_nstreams = g_malloc(sizeof(int) * num_folders); + + for (i = 0; i < num_folders && p != NULL && p < end; i++) { + p = rspamd_7zip_read_folder(task, p, end, arch, + &folder_nstreams[i], &num_digests); + } + } + break; + case kCodersUnPackSize: + for (i = 0; i < num_folders && p != NULL && p < end; i++) { + if (folder_nstreams) { + for (guint j = 0; j < folder_nstreams[i]; j++) { + SZ_READ_VINT(tmp); /* Unpacked size */ + msg_debug_archive("7zip: unpacked size " + "(folder=%d, stream=%d) = %L", + (gint) i, j, tmp); + } + } + else { + msg_err_task("internal 7zip error"); + } + } + break; + case kCRC: + /* + * Here are dragons. 
Spec tells that here there could be up + * to nfolders digests. However, according to the actual source + * code, in case of multiple out streams there should be digests + * for all out streams. + * + * In the real life (tm) it is even more idiotic: all these digests + * are in another section! But that section needs number of digests + * that are absent here. It is the most stupid thing I've ever seen + * in any file format. + * + * I hope there *WAS* some reason to do such shit... + */ + p = rspamd_7zip_read_digest(task, p, end, arch, num_digests, + &digests_read); + break; + case kEnd: + goto end; + break; + default: + p = NULL; + msg_debug_archive("bad 7zip type: %xc; %s", t, G_STRLOC); + goto end; + break; + } + } + +end: + + if (pnum_nodigest) { + *pnum_nodigest = num_digests - digests_read; + } + if (pnum_folders) { + *pnum_folders = num_folders; + } + + if (folder_nstreams) { + g_free(folder_nstreams); + } + + return p; +} + +static const guchar * +rspamd_7zip_read_substreams_info(struct rspamd_task *task, + const guchar *p, const guchar *end, + struct rspamd_archive *arch, + guint num_folders, guint num_nodigest) +{ + guchar t; + guint i; + guint64 *folder_nstreams; + + if (num_folders > 8192) { + /* Gah */ + return NULL; + } + + folder_nstreams = g_alloca(sizeof(guint64) * num_folders); + memset(folder_nstreams, 0, sizeof(guint64) * num_folders); + + while (p != NULL && p < end) { + /* + * [] + * BYTE NID::kNumUnPackStream; (0x0D) + * UINT64 NumUnPackStreamsInFolders[NumFolders]; + * [] + * + * [] + * BYTE NID::kSize (0x09) + * UINT64 UnPackSizes[??] 
+ * [] + * + * + * [] + * BYTE NID::kCRC (0x0A) + * Digests[Number of streams with unknown CRC] + * [] + + */ + t = *p; + SZ_SKIP_BYTES(1); + + msg_debug_archive("7zip: read substream info %xc", t); + + switch (t) { + case kNumUnPackStream: + for (i = 0; i < num_folders; i++) { + guint64 tmp; + + SZ_READ_VINT(tmp); + folder_nstreams[i] = tmp; + } + break; + case kCRC: + /* + * Read the comment in the rspamd_7zip_read_coders_info + */ + p = rspamd_7zip_read_digest(task, p, end, arch, num_nodigest, + NULL); + break; + case kSize: + /* + * Another brain damaged logic, but we have to support it + * as there are no ways to proceed without it. + * In fact, it is just absent in the real life... + */ + for (i = 0; i < num_folders; i++) { + for (guint j = 0; j < folder_nstreams[i]; j++) { + guint64 tmp; + + SZ_READ_VINT(tmp); /* Who cares indeed */ + } + } + break; + case kEnd: + goto end; + break; + default: + p = NULL; + msg_debug_archive("bad 7zip type: %xc; %s", t, G_STRLOC); + goto end; + break; + } + } + +end: + return p; +} + +static const guchar * +rspamd_7zip_read_main_streams_info(struct rspamd_task *task, + const guchar *p, const guchar *end, + struct rspamd_archive *arch) +{ + guchar t; + guint num_folders = 0, unknown_digests = 0; + + while (p != NULL && p < end) { + t = *p; + SZ_SKIP_BYTES(1); + msg_debug_archive("7zip: read main streams info %xc", t); + + /* + * + * [] + * PackInfo + * [] + + * [] + * CodersInfo + * [] + * + * [] + * SubStreamsInfo + * [] + * + * BYTE NID::kEnd + */ + switch (t) { + case kPackInfo: + p = rspamd_7zip_read_pack_info(task, p, end, arch); + break; + case kUnPackInfo: + p = rspamd_7zip_read_coders_info(task, p, end, arch, &num_folders, + &unknown_digests); + break; + case kSubStreamsInfo: + p = rspamd_7zip_read_substreams_info(task, p, end, arch, num_folders, + unknown_digests); + break; + break; + case kEnd: + goto end; + break; + default: + p = NULL; + msg_debug_archive("bad 7zip type: %xc; %s", t, G_STRLOC); + goto end; + 
break;
+		}
+	}
+
+end:
+	return p;
+}
+
+/*
+ * Skip the ArchiveProperties section: a list of (type, size, data) records
+ * terminated by a zero type byte. Nothing is extracted from the properties.
+ *
+ * SZ_SKIP_BYTES and SZ_READ_VINT are defined earlier in this file, outside
+ * this excerpt. NOTE(review): they appear to leave the function when the
+ * input is truncated - confirm against their definitions.
+ *
+ * Returns the position right after the terminating zero byte, or NULL when
+ * `p` is NULL or the section does not fit before `end`.
+ */
+static const guchar *
+rspamd_7zip_read_archive_props(struct rspamd_task *task,
+							   const guchar *p, const guchar *end,
+							   struct rspamd_archive *arch)
+{
+	guchar proptype;
+	guint64 proplen;
+
+	/*
+	 * for (;;)
+	 * {
+	 *   BYTE PropertyType;
+	 *   if (aType == 0)
+	 *     break;
+	 *   UINT64 PropertySize;
+	 *   BYTE PropertyData[PropertySize];
+	 * }
+	 */
+
+	if (p == NULL || p >= end) {
+		/* No byte left for the first property type: do not read past `end` */
+		return NULL;
+	}
+
+	proptype = *p;
+	SZ_SKIP_BYTES(1);
+
+	while (proptype != 0) {
+		SZ_READ_VINT(proplen);
+
+		/*
+		 * Compare lengths rather than computing `p + proplen`: proplen is
+		 * taken from the archive and the pointer sum could overflow. The
+		 * strict comparison keeps one byte for the next property type.
+		 */
+		if (proplen >= (guint64) (end - p)) {
+			return NULL;
+		}
+
+		p += proplen;
+		proptype = *p;
+		SZ_SKIP_BYTES(1);
+	}
+
+	return p;
+}
+
+/*
+ * Convert the 16-bit code units in [p, end) to a newly allocated UTF-8
+ * GString owned by the caller. Code points equal to zero are dropped.
+ * Returns NULL (after freeing the string) if U8_APPEND reports an error.
+ *
+ * NOTE(review): `p` is reinterpreted as guint16 *, so this presumably relies
+ * on suitably aligned input and on the host byte order matching the
+ * archive - confirm.
+ */
+static GString *
+rspamd_7zip_ucs2_to_utf8(struct rspamd_task *task, const guchar *p,
+						 const guchar *end)
+{
+	GString *res;
+	goffset dest_pos = 0, src_pos = 0;
+	const gsize len = (end - p) / sizeof(guint16);
+	guint16 *up;
+	UChar32 wc;
+	UBool is_error = 0;
+
+	/* Reserve 1.5x the input size plus room for one code point and NUL */
+	res = g_string_sized_new((end - p) * 3 / 2 + sizeof(wc) + 1);
+	up = (guint16 *) p;
+
+	while (src_pos < len) {
+		U16_NEXT(up, src_pos, len, wc);
+
+		if (wc > 0) {
+			U8_APPEND(res->str, dest_pos,
+					  res->allocated_len - 1,
+					  wc, is_error);
+		}
+
+		if (is_error) {
+			g_string_free(res, TRUE);
+
+			return NULL;
+		}
+	}
+
+	g_assert(dest_pos < res->allocated_len);
+
+	res->len = dest_pos;
+	res->str[dest_pos] = '\0';
+
+	return res;
+}
+
+/*
+ * Parse the FilesInfo section: a file count followed by properties, each
+ * introduced by a type byte and (except kEnd) a size. Only kName is used:
+ * every name terminated by a 16-bit zero is converted to UTF-8 and appended
+ * to arch->files. All other known properties are skipped by their size.
+ *
+ * Returns the position after the kEnd byte, NULL on an unknown property type
+ * or truncated input, or the current position when a malformed name stops
+ * the parsing early.
+ */
+static const guchar *
+rspamd_7zip_read_files_info(struct rspamd_task *task,
+							const guchar *p, const guchar *end,
+							struct rspamd_archive *arch)
+{
+	guint64 nfiles = 0, sz, i;
+	guchar t, b;
+	struct rspamd_archive_file *fentry;
+
+	SZ_READ_VINT(nfiles);
+
+	for (; p != NULL && p < end;) {
+		t = *p;
+		SZ_SKIP_BYTES(1);
+
+		msg_debug_archive("7zip: read file data type %xc", t);
+
+		if (t == kEnd) {
+			goto end;
+		}
+
+		/* Every property except kEnd carries its size right after the type */
+		SZ_READ_VINT(sz);
+
+		switch (t) {
+		case kEmptyStream:
+		case kEmptyFile:
+		case kAnti: /* Anti-file marker */
+			/* These bit vectors are not used here */
+		case kCTime:
+		case kATime:
+		case kMTime:
+			/* Timestamps are not used either, but they must be skipped */
+			if (sz > 0) {
+				SZ_SKIP_BYTES(sz);
+			}
+			break;
+		case kName:
+			if (p >= end) {
+				/* Truncated input: no byte left for the external flag */
+				return NULL;
+			}
+
+			b = *p; /* External flag */
+			SZ_SKIP_BYTES(1);
+
+			if (b) {
+				/* TODO: names stored in an external stream are not read */
+				guint64 tmp;
+
+				SZ_READ_VINT(tmp);
+			}
+			else {
+				for (i = 0; i < nfiles; i++) {
+					/* Names are sequences of 16-bit units ending with zero */
+					/* First, find terminator */
+					const guchar *fend = NULL, *tp = p;
+					GString *res;
+
+					while (tp < end - 1) {
+						if (*tp == 0 && *(tp + 1) == 0) {
+							fend = tp;
+							break;
+						}
+
+						tp += 2;
+					}
+
+					if (fend == NULL || fend - p == 0) {
+						/* Missing terminator or empty name */
+						msg_debug_archive("bad 7zip name; %s", G_STRLOC);
+						goto end;
+					}
+
+					res = rspamd_7zip_ucs2_to_utf8(task, p, fend);
+
+					if (res != NULL) {
+						fentry = g_malloc0(sizeof(*fentry));
+						fentry->fname = res;
+						g_ptr_array_add(arch->files, fentry);
+						msg_debug_archive("7zip: found file %v", res);
+					}
+					else {
+						msg_debug_archive("bad 7zip name; %s", G_STRLOC);
+					}
+					/* Skip zero terminating character */
+					p = fend + 2;
+				}
+			}
+			break;
+		case kDummy:
+		case kWinAttributes:
+			if (sz > 0) {
+				SZ_SKIP_BYTES(sz);
+			}
+			break;
+		default:
+			p = NULL;
+			msg_debug_archive("bad 7zip type: %xc; %s", t, G_STRLOC);
+			goto end;
+			break;
+		}
+	}
+
+end:
+	return p;
+}
+
+/*
+ * Read one section identifier at `p` and dispatch to the matching section
+ * parser. The caller invokes this repeatedly until NULL is returned.
+ *
+ * Returns the position after the parsed section, or NULL when parsing must
+ * stop: kEnd, an encoded header (arch->flags gets RSPAMD_ARCHIVE_CANNOT_READ),
+ * an unknown identifier, exhausted input, or a failure in a section parser.
+ */
+static const guchar *
+rspamd_7zip_read_next_section(struct rspamd_task *task,
+							  const guchar *p, const guchar *end,
+							  struct rspamd_archive *arch)
+{
+	guchar t;
+
+	if (p == NULL || p >= end) {
+		/*
+		 * The previous section may end exactly at `end`; stop here instead
+		 * of reading the identifier byte past the buffer.
+		 */
+		return NULL;
+	}
+
+	t = *p;
+	SZ_SKIP_BYTES(1);
+
+	msg_debug_archive("7zip: read section %xc", t);
+
+	switch (t) {
+	case kHeader:
+		/* We just skip byte and go further */
+		break;
+	case kEncodedHeader:
+		/*
+		 * In fact, headers are just packed, but we assume it as
+		 * encrypted to distinguish from the normal archives
+		 */
+		msg_debug_archive("7zip: encoded header, needs to be uncompressed");
+		arch->flags |= RSPAMD_ARCHIVE_CANNOT_READ;
+		p = NULL; /* Cannot get anything useful */
+		break;
+	case kArchiveProperties:
+		p = rspamd_7zip_read_archive_props(task, p, end, arch);
+		break;
+	case kMainStreamsInfo:
+		p = rspamd_7zip_read_main_streams_info(task, p, end, arch);
+		break;
+	case kAdditionalStreamsInfo:
+		p = rspamd_7zip_read_main_streams_info(task, p, end, arch);
+		break;
+	case kFilesInfo:
+		p = rspamd_7zip_read_files_info(task, p, end, arch);
+		break;
+	case kEnd:
+		p = NULL;
+		msg_debug_archive("7zip: read final section");
+		break;
+	default:
+		p = NULL;
+		msg_debug_archive("bad 7zip type: %xc; %s", t, G_STRLOC);
+		break;
+	}
+
+	return p;
+}
+
+static void
+rspamd_archive_process_7zip(struct rspamd_task *task,
+							struct rspamd_mime_part *part)
+{
+	struct rspamd_archive *arch;
+	const guchar *start, *p, *end;
+	const guchar sz_magic[] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C};
+	guint64 section_offset = 0, section_length = 0;
+
+	start = part->parsed_data.begin;
+	p = start;
+	end = p + part->parsed_data.len;
+
+	if (end - p <= sizeof(guint64) + sizeof(guint32) ||
+		memcmp(p, sz_magic, sizeof(sz_magic)) != 0) {
+		msg_debug_archive("7z archive is invalid (no 7z magic)");
+
+		return;
+	}
+
+	arch = rspamd_mempool_alloc0(task->task_pool, sizeof(*arch));
+	arch->files = g_ptr_array_new();
+	arch->type = RSPAMD_ARCHIVE_7ZIP;
+	rspamd_mempool_add_destructor(task->task_pool, rspamd_archive_dtor,
+								  arch);
+
+	/* Magic (6 bytes) + version (2 bytes) + crc32 (4 bytes) */
+	p += sizeof(guint64) + sizeof(guint32);
+
+	SZ_READ_UINT64(section_offset);
+	SZ_READ_UINT64(section_length);
+
+	if (end - p > sizeof(guint32)) {
+		p += sizeof(guint32);
+	}
+	else {
+		msg_debug_archive("7z archive is invalid (truncated crc)");
+
+		return;
+	}
+
+	if (end - p > section_offset) {
+		p += section_offset;
+	}
+	else {
+		msg_debug_archive("7z archive is invalid (incorrect section offset)");
+
+		return;
+	}
+
+	while ((p = rspamd_7zip_read_next_section(task, p, end, arch)) != NULL)
+		;
+
+	part->part_type = RSPAMD_MIME_PART_ARCHIVE;
+
part->specific.arch = arch; + if (part->cd != NULL) { + arch->archive_name = &part->cd->filename; + } + arch->size = part->parsed_data.len; +} + +static void +rspamd_archive_process_gzip(struct rspamd_task *task, + struct rspamd_mime_part *part) +{ + struct rspamd_archive *arch; + const guchar *start, *p, *end; + const guchar gz_magic[] = {0x1F, 0x8B}; + guchar flags; + + start = part->parsed_data.begin; + p = start; + end = p + part->parsed_data.len; + + if (end - p <= 10 || memcmp(p, gz_magic, sizeof(gz_magic)) != 0) { + msg_debug_archive("gzip archive is invalid (no gzip magic)"); + + return; + } + + arch = rspamd_mempool_alloc0(task->task_pool, sizeof(*arch)); + arch->files = g_ptr_array_sized_new(1); + arch->type = RSPAMD_ARCHIVE_GZIP; + if (part->cd) { + arch->archive_name = &part->cd->filename; + } + rspamd_mempool_add_destructor(task->task_pool, rspamd_archive_dtor, + arch); + + flags = p[3]; + + if (flags & (1u << 5)) { + arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED; + } + + if (flags & (1u << 3)) { + /* We have file name presented in archive, try to use it */ + if (flags & (1u << 1)) { + /* Multipart */ + p += 12; + } + else { + p += 10; + } + + if (flags & (1u << 2)) { + /* Optional section */ + guint16 optlen = 0; + + RAR_READ_UINT16(optlen); + + if (end <= p + optlen) { + msg_debug_archive("gzip archive is invalid, bad extra length: %d", + (int) optlen); + + return; + } + + p += optlen; + } + + /* Read file name */ + const guchar *fname_start = p; + + while (p < end) { + if (*p == '\0') { + if (p > fname_start) { + struct rspamd_archive_file *f; + + f = g_malloc0(sizeof(*f)); + + rspamd_archive_file_try_utf(task, arch, f, + fname_start, p - fname_start); + + if (f->fname) { + g_ptr_array_add(arch->files, f); + + if (f->flags & RSPAMD_ARCHIVE_FILE_OBFUSCATED) { + arch->flags |= RSPAMD_ARCHIVE_HAS_OBFUSCATED_FILES; + } + } + else { + /* Invalid filename, skip */ + g_free(f); + } + + goto set; + } + } + + p++; + } + + /* Wrong filename, not zero terminated */ 
+ msg_debug_archive("gzip archive is invalid, bad filename at pos %d", + (int) (p - start)); + + return; + } + + /* Fallback, we need to extract file name from archive name if possible */ + if (part->cd && part->cd->filename.len > 0) { + const gchar *dot_pos, *slash_pos; + + dot_pos = rspamd_memrchr(part->cd->filename.begin, '.', + part->cd->filename.len); + + if (dot_pos) { + struct rspamd_archive_file *f; + + slash_pos = rspamd_memrchr(part->cd->filename.begin, '/', + part->cd->filename.len); + + if (slash_pos && slash_pos < dot_pos) { + f = g_malloc0(sizeof(*f)); + f->fname = g_string_sized_new(dot_pos - slash_pos); + g_string_append_len(f->fname, slash_pos + 1, + dot_pos - slash_pos - 1); + + msg_debug_archive("fallback to gzip filename based on cd: %v", + f->fname); + + g_ptr_array_add(arch->files, f); + + goto set; + } + else { + const gchar *fname_start = part->cd->filename.begin; + + f = g_malloc0(sizeof(*f)); + + if (memchr(fname_start, '.', part->cd->filename.len) != dot_pos) { + /* Double dots, something like foo.exe.gz */ + f->fname = g_string_sized_new(dot_pos - fname_start); + g_string_append_len(f->fname, fname_start, + dot_pos - fname_start); + } + else { + /* Single dot, something like foo.gzz */ + f->fname = g_string_sized_new(part->cd->filename.len); + g_string_append_len(f->fname, fname_start, + part->cd->filename.len); + } + + msg_debug_archive("fallback to gzip filename based on cd: %v", + f->fname); + + g_ptr_array_add(arch->files, f); + + goto set; + } + } + } + + return; + +set: + /* Set archive data */ + part->part_type = RSPAMD_MIME_PART_ARCHIVE; + part->specific.arch = arch; + arch->size = part->parsed_data.len; +} + +static gboolean +rspamd_archive_cheat_detect(struct rspamd_mime_part *part, const gchar *str, + const guchar *magic_start, gsize magic_len) +{ + struct rspamd_content_type *ct; + const gchar *p; + rspamd_ftok_t srch, *fname; + + ct = part->ct; + RSPAMD_FTOK_ASSIGN(&srch, "application"); + + if (ct && ct->type.len && 
ct->subtype.len > 0 && rspamd_ftok_cmp(&ct->type, &srch) == 0) { + if (rspamd_substring_search_caseless(ct->subtype.begin, ct->subtype.len, + str, strlen(str)) != -1) { + /* We still need to check magic, see #1848 */ + if (magic_start != NULL) { + if (part->parsed_data.len > magic_len && + memcmp(part->parsed_data.begin, + magic_start, magic_len) == 0) { + return TRUE; + } + /* No magic, refuse this type of archive */ + return FALSE; + } + else { + return TRUE; + } + } + } + + if (part->cd) { + fname = &part->cd->filename; + + if (fname && fname->len > strlen(str)) { + p = fname->begin + fname->len - strlen(str); + + if (rspamd_lc_cmp(p, str, strlen(str)) == 0) { + if (*(p - 1) == '.') { + if (magic_start != NULL) { + if (part->parsed_data.len > magic_len && + memcmp(part->parsed_data.begin, + magic_start, magic_len) == 0) { + return TRUE; + } + /* No magic, refuse this type of archive */ + return FALSE; + } + + return TRUE; + } + } + } + + if (magic_start != NULL) { + if (part->parsed_data.len > magic_len && + memcmp(part->parsed_data.begin, magic_start, magic_len) == 0) { + return TRUE; + } + } + } + else { + if (magic_start != NULL) { + if (part->parsed_data.len > magic_len && + memcmp(part->parsed_data.begin, magic_start, magic_len) == 0) { + return TRUE; + } + } + } + + return FALSE; +} + +void rspamd_archives_process(struct rspamd_task *task) +{ + guint i; + struct rspamd_mime_part *part; + const guchar rar_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07}; + const guchar zip_magic[] = {0x50, 0x4b, 0x03, 0x04}; + const guchar sz_magic[] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C}; + const guchar gz_magic[] = {0x1F, 0x8B, 0x08}; + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part) + { + if (part->part_type == RSPAMD_MIME_PART_UNDEFINED) { + if (part->parsed_data.len > 0) { + if (rspamd_archive_cheat_detect(part, "zip", + zip_magic, sizeof(zip_magic))) { + rspamd_archive_process_zip(task, part); + } + else if (rspamd_archive_cheat_detect(part, "rar", + rar_magic, 
sizeof(rar_magic))) { + rspamd_archive_process_rar(task, part); + } + else if (rspamd_archive_cheat_detect(part, "7z", + sz_magic, sizeof(sz_magic))) { + rspamd_archive_process_7zip(task, part); + } + else if (rspamd_archive_cheat_detect(part, "gz", + gz_magic, sizeof(gz_magic))) { + rspamd_archive_process_gzip(task, part); + } + + if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT) && + part->part_type == RSPAMD_MIME_PART_ARCHIVE && + part->specific.arch) { + struct rspamd_archive *arch = part->specific.arch; + + msg_info_task("found %s archive with incorrect content-type: %T/%T", + rspamd_archive_type_str(arch->type), + &part->ct->type, &part->ct->subtype); + + if (!(part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) { + part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN; + } + } + } + } + } +} + + +const gchar * +rspamd_archive_type_str(enum rspamd_archive_type type) +{ + const gchar *ret = "unknown"; + + switch (type) { + case RSPAMD_ARCHIVE_ZIP: + ret = "zip"; + break; + case RSPAMD_ARCHIVE_RAR: + ret = "rar"; + break; + case RSPAMD_ARCHIVE_7ZIP: + ret = "7z"; + break; + case RSPAMD_ARCHIVE_GZIP: + ret = "gz"; + break; + } + + return ret; +} diff --git a/src/libmime/archives.h b/src/libmime/archives.h new file mode 100644 index 0000000..56beb62 --- /dev/null +++ b/src/libmime/archives.h @@ -0,0 +1,72 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef SRC_LIBMIME_ARCHIVES_H_
+#define SRC_LIBMIME_ARCHIVES_H_
+
+#include "config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum rspamd_archive_type { /* Container formats handled by archives.c */
+	RSPAMD_ARCHIVE_ZIP, /* rspamd_archive_type_str() gives "zip" */
+	RSPAMD_ARCHIVE_RAR, /* rspamd_archive_type_str() gives "rar" */
+	RSPAMD_ARCHIVE_7ZIP, /* rspamd_archive_type_str() gives "7z" */
+	RSPAMD_ARCHIVE_GZIP, /* rspamd_archive_type_str() gives "gz" */
+};
+
+enum rspamd_archive_flags { /* Bit flags stored in rspamd_archive.flags */
+	RSPAMD_ARCHIVE_ENCRYPTED = (1u << 0u), /* e.g. set by the gzip handler when header flag bit 5 is set */
+	RSPAMD_ARCHIVE_CANNOT_READ = (1u << 1u), /* e.g. set by the 7zip parser for an encoded (packed) header */
+	RSPAMD_ARCHIVE_HAS_OBFUSCATED_FILES = (1u << 2u), /* some file carries RSPAMD_ARCHIVE_FILE_OBFUSCATED */
+};
+
+enum rspamd_archive_file_flags { /* Bit flags stored in rspamd_archive_file.flags */
+	RSPAMD_ARCHIVE_FILE_ENCRYPTED = (1u << 0u), /* presumably a per-file encryption marker - TODO confirm in the zip/rar handlers */
+	RSPAMD_ARCHIVE_FILE_OBFUSCATED = (1u << 1u), /* the gzip handler checks this after rspamd_archive_file_try_utf() */
+};
+
+struct rspamd_archive_file { /* One file entry found inside an archive */
+	GString *fname; /* File name; the 7zip parser stores it converted to UTF-8 */
+	gsize compressed_size; /* NOTE(review): not filled by the 7zip/gzip handlers - presumably set for zip/rar, confirm */
+	gsize uncompressed_size; /* NOTE(review): same as compressed_size - confirm which handlers fill it */
+	enum rspamd_archive_file_flags flags; /* Combination of rspamd_archive_file_flags bits */
+};
+
+struct rspamd_archive { /* Archive detected in a MIME part (part->specific.arch) */
+	enum rspamd_archive_type type; /* Format of the container */
+	const rspamd_ftok_t *archive_name; /* Points to the part's Content-Disposition filename when part->cd is set */
+	gsize size; /* Length of the part's parsed data (part->parsed_data.len) */
+	enum rspamd_archive_flags flags; /* Combination of rspamd_archive_flags bits */
+	GPtrArray *files; /* Array of struct rspamd_archive_file */
+};
+
+/**
+ * Process archives from a worker task: every message part that is still of undefined type and has data is probed for zip, rar, 7z and gzip content
+ */
+void rspamd_archives_process(struct rspamd_task *task);
+
+/**
+ * Get textual representation of an archive's type ("zip", "rar", "7z", "gz", or "unknown" for any other value)
+ */
+const gchar *rspamd_archive_type_str(enum rspamd_archive_type type);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SRC_LIBMIME_ARCHIVES_H_ */
diff --git a/src/libmime/content_type.c b/src/libmime/content_type.c
new file mode 100644
index 0000000..765cb87
--- /dev/null
+++ b/src/libmime/content_type.c
@@ -0,0 +1,884 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "libmime/content_type.h" +#include "smtp_parsers.h" +#include "utlist.h" +#include "libserver/url.h" +#include "libmime/mime_encoding.h" + +static gboolean +rspamd_rfc2231_decode(rspamd_mempool_t *pool, + struct rspamd_content_type_param *param, + gchar *value_start, gchar *value_end) +{ + gchar *quote_pos; + + quote_pos = memchr(value_start, '\'', value_end - value_start); + + if (quote_pos == NULL) { + /* Plain percent encoding */ + gsize r = rspamd_url_decode(value_start, value_start, + value_end - value_start); + param->value.begin = value_start; + param->value.len = r; + } + else { + /* + * We can have encoding'language'data, or + * encoding'data (in theory). + * Try to handle both... + */ + const gchar *charset = NULL; + rspamd_ftok_t ctok; + + ctok.begin = value_start; + ctok.len = quote_pos - value_start; + + if (ctok.len > 0) { + charset = rspamd_mime_detect_charset(&ctok, pool); + } + + /* Now, we can check for either next quote sign or, eh, ignore that */ + value_start = quote_pos + 1; + + quote_pos = memchr(value_start, '\'', value_end - value_start); + + if (quote_pos) { + /* Ignore language */ + value_start = quote_pos + 1; + } + + /* Perform percent decoding */ + gsize r = rspamd_url_decode(value_start, value_start, + value_end - value_start); + GError *err = NULL; + + if (charset == NULL) { + /* Try heuristic */ + charset = rspamd_mime_charset_find_by_content(value_start, r, TRUE); + } + + if (charset == NULL) { + msg_warn_pool("cannot convert parameter from charset %T", &ctok); + + return FALSE; + } + + param->value.begin = rspamd_mime_text_to_utf8(pool, + value_start, r, + charset, ¶m->value.len, &err); + + if (param->value.begin == NULL) { + msg_warn_pool("cannot convert parameter from charset %s: %e", + charset, err); + + if (err) { + g_error_free(err); + } + + return FALSE; + } + } + + param->flags |= 
RSPAMD_CONTENT_PARAM_RFC2231; + + return TRUE; +} + +static gboolean +rspamd_param_maybe_rfc2231_process(rspamd_mempool_t *pool, + struct rspamd_content_type_param *param, + gchar *name_start, gchar *name_end, + gchar *value_start, gchar *value_end) +{ + const gchar *star_pos; + + star_pos = memchr(name_start, '*', name_end - name_start); + + if (star_pos == NULL) { + return FALSE; + } + + /* We have three possibilities here: + * 1. name* (just name + 2231 encoding) + * 2. name*(\d+) (piecewise stuff but no rfc2231 encoding) + * 3. name*(\d+)* (piecewise stuff and rfc2231 encoding) + */ + + if (star_pos == name_end - 1) { + /* First */ + if (rspamd_rfc2231_decode(pool, param, value_start, value_end)) { + param->name.begin = name_start; + param->name.len = name_end - name_start - 1; + } + } + else if (*(name_end - 1) == '*') { + /* Third */ + /* Check number */ + gulong tmp; + + if (!rspamd_strtoul(star_pos + 1, name_end - star_pos - 2, &tmp)) { + return FALSE; + } + + param->flags |= RSPAMD_CONTENT_PARAM_PIECEWISE | RSPAMD_CONTENT_PARAM_RFC2231; + param->rfc2231_id = tmp; + param->name.begin = name_start; + param->name.len = star_pos - name_start; + param->value.begin = value_start; + param->value.len = value_end - value_start; + + /* Deal with that later... 
*/ + } + else { + /* Second case */ + gulong tmp; + + if (!rspamd_strtoul(star_pos + 1, name_end - star_pos - 1, &tmp)) { + return FALSE; + } + + param->flags |= RSPAMD_CONTENT_PARAM_PIECEWISE; + param->rfc2231_id = tmp; + param->name.begin = name_start; + param->name.len = star_pos - name_start; + param->value.begin = value_start; + param->value.len = value_end - value_start; + } + + return TRUE; +} + +static gint32 +rspamd_cmp_pieces(struct rspamd_content_type_param *p1, struct rspamd_content_type_param *p2) +{ + return p1->rfc2231_id - p2->rfc2231_id; +} + +static void +rspamd_postprocess_ct_attributes(rspamd_mempool_t *pool, + GHashTable *htb, + void (*proc)(rspamd_mempool_t *, struct rspamd_content_type_param *, gpointer ud), + gpointer procd) +{ + GHashTableIter it; + gpointer k, v; + struct rspamd_content_type_param *param, *sorted, *cur; + + if (htb == NULL) { + return; + } + + g_hash_table_iter_init(&it, htb); + + while (g_hash_table_iter_next(&it, &k, &v)) { + param = (struct rspamd_content_type_param *) v; + + if (param->flags & RSPAMD_CONTENT_PARAM_PIECEWISE) { + /* Reconstruct param */ + gsize tlen = 0; + gchar *ndata, *pos; + + sorted = param; + DL_SORT(sorted, rspamd_cmp_pieces); + + DL_FOREACH(sorted, cur) + { + tlen += cur->value.len; + } + + ndata = rspamd_mempool_alloc(pool, tlen); + pos = ndata; + + DL_FOREACH(sorted, cur) + { + memcpy(pos, cur->value.begin, cur->value.len); + pos += cur->value.len; + } + + if (param->flags & RSPAMD_CONTENT_PARAM_RFC2231) { + if (!rspamd_rfc2231_decode(pool, param, + ndata, pos)) { + param->flags |= RSPAMD_CONTENT_PARAM_BROKEN; + param->value.begin = ndata; + param->value.len = tlen; + } + } + else { + param->value.begin = ndata; + param->value.len = tlen; + } + + /* Detach from list */ + param->next = NULL; + param->prev = param; + } + + gboolean invalid_utf = FALSE; + + if (param->value.begin != NULL && param->value.len > 0) { + param->value.begin = rspamd_mime_header_decode(pool, param->value.begin, + 
param->value.len, &invalid_utf); + param->value.len = strlen(param->value.begin); + } + + if (invalid_utf) { + param->flags |= RSPAMD_CONTENT_PARAM_BROKEN; + } + + proc(pool, param, procd); + } +} + +static void +rspamd_content_type_postprocess(rspamd_mempool_t *pool, + struct rspamd_content_type_param *param, + gpointer ud) +{ + rspamd_ftok_t srch; + struct rspamd_content_type_param *found = NULL; + + struct rspamd_content_type *ct = (struct rspamd_content_type *) ud; + + RSPAMD_FTOK_ASSIGN(&srch, "charset"); + + if (rspamd_ftok_icase_equal(¶m->name, &srch)) { + /* Adjust charset */ + found = param; + ct->charset.begin = param->value.begin; + ct->charset.len = param->value.len; + } + + RSPAMD_FTOK_ASSIGN(&srch, "boundary"); + + if (rspamd_ftok_icase_equal(¶m->name, &srch)) { + found = param; + gchar *lc_boundary; + /* Adjust boundary */ + lc_boundary = rspamd_mempool_alloc(pool, param->value.len); + memcpy(lc_boundary, param->value.begin, param->value.len); + rspamd_str_lc(lc_boundary, param->value.len); + ct->boundary.begin = lc_boundary; + ct->boundary.len = param->value.len; + /* Preserve original (case sensitive) boundary */ + ct->orig_boundary.begin = param->value.begin; + ct->orig_boundary.len = param->value.len; + } + + if (!found) { + RSPAMD_FTOK_ASSIGN(&srch, "name"); + if (!rspamd_ftok_icase_equal(¶m->name, &srch)) { + /* Just lowercase */ + rspamd_str_lc_utf8((gchar *) param->value.begin, param->value.len); + } + } +} + +static void +rspamd_content_disposition_postprocess(rspamd_mempool_t *pool, + struct rspamd_content_type_param *param, + gpointer ud) +{ + rspamd_ftok_t srch; + struct rspamd_content_disposition *cd = (struct rspamd_content_disposition *) ud; + + srch.begin = "filename"; + srch.len = 8; + + if (rspamd_ftok_icase_equal(¶m->name, &srch)) { + /* Adjust filename */ + cd->filename.begin = param->value.begin; + cd->filename.len = param->value.len; + } +} + +void rspamd_content_type_add_param(rspamd_mempool_t *pool, + struct 
rspamd_content_type *ct, + gchar *name_start, gchar *name_end, + gchar *value_start, gchar *value_end) +{ + struct rspamd_content_type_param *nparam; + rspamd_ftok_t srch; + struct rspamd_content_type_param *found = NULL; + + g_assert(ct != NULL); + + nparam = rspamd_mempool_alloc0(pool, sizeof(*nparam)); + rspamd_str_lc(name_start, name_end - name_start); + + if (!rspamd_param_maybe_rfc2231_process(pool, nparam, name_start, + name_end, value_start, value_end)) { + nparam->name.begin = name_start; + nparam->name.len = name_end - name_start; + nparam->value.begin = value_start; + nparam->value.len = value_end - value_start; + } + + srch.begin = nparam->name.begin; + srch.len = nparam->name.len; + + if (ct->attrs) { + found = g_hash_table_lookup(ct->attrs, &srch); + } + else { + ct->attrs = g_hash_table_new(rspamd_ftok_icase_hash, + rspamd_ftok_icase_equal); + } + + if (!found) { + DL_APPEND(found, nparam); + g_hash_table_insert(ct->attrs, &nparam->name, nparam); + } + else { + DL_APPEND(found, nparam); + } +} + +static struct rspamd_content_type * +rspamd_content_type_parser(gchar *in, gsize len, rspamd_mempool_t *pool) +{ + guint obraces = 0, ebraces = 0, qlen = 0; + gchar *p, *c, *end, *pname_start = NULL, *pname_end = NULL; + struct rspamd_content_type *res = NULL, val; + gboolean eqsign_seen = FALSE; + enum { + parse_type, + parse_subtype, + parse_after_subtype, + parse_param_name, + parse_param_after_name, + parse_param_value, + parse_param_value_after_quote, + parse_space, + parse_quoted, + parse_comment, + } state = parse_space, + next_state = parse_type; + + p = in; + c = p; + end = p + len; + memset(&val, 0, sizeof(val)); + val.cpy = in; + + while (p < end) { + switch (state) { + case parse_type: + if (g_ascii_isspace(*p) || *p == ';') { + /* We have type without subtype */ + val.type.begin = c; + val.type.len = p - c; + state = parse_after_subtype; + } + else if (*p == '/') { + val.type.begin = c; + val.type.len = p - c; + state = parse_space; + next_state 
= parse_subtype; + p++; + } + else { + p++; + } + break; + case parse_subtype: + if (g_ascii_isspace(*p) || *p == ';') { + val.subtype.begin = c; + val.subtype.len = p - c; + state = parse_after_subtype; + } + else { + p++; + } + break; + case parse_after_subtype: + if (*p == ';' || g_ascii_isspace(*p)) { + p++; + } + else if (*p == '(') { + c = p; + state = parse_comment; + next_state = parse_param_name; + obraces = 1; + ebraces = 0; + pname_start = NULL; + pname_end = NULL; + eqsign_seen = FALSE; + p++; + } + else { + c = p; + state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + eqsign_seen = FALSE; + } + break; + case parse_param_name: + if (*p == '=') { + pname_start = c; + pname_end = p; + state = parse_param_after_name; + eqsign_seen = TRUE; + p++; + } + else if (g_ascii_isspace(*p)) { + pname_start = c; + pname_end = p; + state = parse_param_after_name; + } + else { + p++; + } + break; + case parse_param_after_name: + if (g_ascii_isspace(*p)) { + p++; + } + else if (*p == '=') { + if (eqsign_seen) { + /* Treat as value start */ + c = p; + eqsign_seen = FALSE; + state = parse_param_value; + p++; + } + else { + eqsign_seen = TRUE; + p++; + } + } + else { + if (eqsign_seen) { + state = parse_param_value; + c = p; + } + else { + /* Invalid parameter without value */ + c = p; + state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + } + } + break; + case parse_param_value: + if (*p == '"') { + p++; + c = p; + state = parse_quoted; + next_state = parse_param_value_after_quote; + } + else if (g_ascii_isspace(*p)) { + if (pname_start && pname_end && pname_end > pname_start) { + rspamd_content_type_add_param(pool, &val, pname_start, + pname_end, c, p); + } + + state = parse_space; + next_state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + } + else if (*p == '(') { + if (pname_start && pname_end && pname_end > pname_start) { + rspamd_content_type_add_param(pool, &val, pname_start, + pname_end, c, p); + } + + obraces = 
1; + ebraces = 0; + p++; + state = parse_comment; + next_state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + } + else if (*p == ';') { + if (pname_start && pname_end && pname_end > pname_start) { + rspamd_content_type_add_param(pool, &val, pname_start, + pname_end, c, p); + } + + p++; + state = parse_space; + next_state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + } + else { + p++; + } + break; + case parse_param_value_after_quote: + if (pname_start && pname_end && pname_end > pname_start) { + rspamd_content_type_add_param(pool, &val, pname_start, + pname_end, c, c + qlen); + } + + if (*p == '"') { + p++; + + if (p == end) { + /* Last quote: done... */ + state = parse_space; + break; + } + + if (*p == ';') { + p++; + state = parse_space; + next_state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + continue; + } + } + + /* We should not normally be here in fact */ + if (g_ascii_isspace(*p)) { + state = parse_space; + next_state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + } + else if (*p == '(') { + obraces = 1; + ebraces = 0; + p++; + state = parse_comment; + next_state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + } + else { + state = parse_param_name; + pname_start = NULL; + pname_end = NULL; + c = p; + } + break; + case parse_quoted: + if (*p == '\\') { + /* Quoted pair */ + if (p + 1 < end) { + p += 2; + } + else { + p++; + } + } + else if (*p == '"') { + qlen = p - c; + state = next_state; + } + else { + p++; + } + break; + case parse_comment: + if (*p == '(') { + obraces++; + p++; + } + else if (*p == ')') { + ebraces++; + p++; + + if (ebraces == obraces && p < end) { + if (g_ascii_isspace(*p)) { + state = parse_space; + } + else { + c = p; + state = next_state; + } + } + } + else { + p++; + } + break; + case parse_space: + if (g_ascii_isspace(*p)) { + p++; + } + else if (*p == '(') { + obraces = 1; + ebraces = 0; + p++; + state = parse_comment; + } + else { + c = p; + 
state = next_state;
+			}
+			break;
+		}
+	}
+
+	/* Process leftover */
+	switch (state) {
+	case parse_type:
+		val.type.begin = c;
+		val.type.len = p - c;
+		break;
+	case parse_subtype:
+		val.subtype.begin = c;
+		val.subtype.len = p - c;
+		break;
+	case parse_param_value:
+		if (pname_start && pname_end && pname_end > pname_start) {
+			if (p > c && *(p - 1) == ';') {
+				p--;
+			}
+
+			rspamd_content_type_add_param(pool, &val, pname_start,
+										  pname_end, c, p);
+		}
+		break;
+	case parse_param_value_after_quote:
+		if (pname_start && pname_end && pname_end > pname_start) {
+			rspamd_content_type_add_param(pool, &val, pname_start,
+										  pname_end, c, c + qlen);
+		}
+		break;
+	default:
+		break;
+	}
+
+	if (val.type.len > 0) {
+		gchar *tmp;
+
+		res = rspamd_mempool_alloc(pool, sizeof(val));
+		memcpy(res, &val, sizeof(val));
+
+		/*
+		 * Lowercase type and subtype as they are specified as case insensitive
+		 * in rfc2045 section 5.1
+		 */
+		tmp = rspamd_mempool_alloc(pool, val.type.len);
+		memcpy(tmp, val.type.begin, val.type.len);
+		rspamd_str_lc(tmp, val.type.len);
+		res->type.begin = tmp;
+
+		if (val.subtype.len > 0) {
+			tmp = rspamd_mempool_alloc(pool, val.subtype.len);
+			memcpy(tmp, val.subtype.begin, val.subtype.len);
+			rspamd_str_lc(tmp, val.subtype.len);
+			res->subtype.begin = tmp;
+		}
+	}
+
+	return res;
+}
+
+/**
+ * Parse a Content-Type header value.
+ *
+ * The input is copied into `pool` (NUL terminated) before parsing, so `in`
+ * is not modified. Parsed attributes are post-processed and the attribute
+ * table is released together with the pool. Known malformed types are
+ * repaired and res->flags is filled from the resulting type/subtype.
+ *
+ * @param in header value, `len` bytes
+ * @param len length of `in`
+ * @param pool memory pool owning the result
+ * @return pool allocated content type, or NULL when no type could be parsed
+ */
+struct rspamd_content_type *
+rspamd_content_type_parse(const gchar *in,
+						  gsize len, rspamd_mempool_t *pool)
+{
+	struct rspamd_content_type *res = NULL;
+	rspamd_ftok_t srch;
+	gchar *cpy;
+
+	cpy = rspamd_mempool_alloc(pool, len + 1);
+	rspamd_strlcpy(cpy, in, len + 1);
+
+	if ((res = rspamd_content_type_parser(cpy, len, pool)) != NULL) {
+		if (res->attrs) {
+			rspamd_mempool_add_destructor(pool,
+										  (rspamd_mempool_destruct_t) g_hash_table_unref, res->attrs);
+
+			rspamd_postprocess_ct_attributes(pool, res->attrs,
+											 rspamd_content_type_postprocess, res);
+		}
+
+		/* Now do some hacks to work with broken content types */
+		if (res->subtype.len == 0) {
+			res->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
+			RSPAMD_FTOK_ASSIGN(&srch, "text");
+
+			if (rspamd_ftok_casecmp(&res->type, &srch) == 0) {
+				/*
+				 * Workaround for Content-Type: text, assume text/plain.
+				 * The subtype itself must be assigned: writing "plain"
+				 * to the scratch token left the subtype empty.
+				 */
+				RSPAMD_FTOK_ASSIGN(&res->subtype, "plain");
+			}
+			else {
+				RSPAMD_FTOK_ASSIGN(&srch, "html");
+
+				if (rspamd_ftok_casecmp(&res->type, &srch) == 0) {
+					/* Workaround for Content-Type: html */
+					RSPAMD_FTOK_ASSIGN(&res->type, "text");
+					RSPAMD_FTOK_ASSIGN(&res->subtype, "html");
+				}
+				else {
+					RSPAMD_FTOK_ASSIGN(&srch, "application");
+
+					if (rspamd_ftok_casecmp(&res->type, &srch) == 0) {
+						/* Bare "application" becomes application/octet-stream */
+						RSPAMD_FTOK_ASSIGN(&res->subtype, "octet-stream");
+					}
+				}
+			}
+		}
+		else {
+			/* Common mistake: "alternate" written instead of "alternative" */
+			RSPAMD_FTOK_ASSIGN(&srch, "alternate");
+
+			if (rspamd_ftok_casecmp(&res->subtype, &srch) == 0) {
+				res->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
+				RSPAMD_FTOK_ASSIGN(&res->subtype, "alternative");
+			}
+
+			/* PKCS7 smime */
+			RSPAMD_FTOK_ASSIGN(&srch, "pkcs7-mime");
+			if (rspamd_substring_search(res->subtype.begin, res->subtype.len,
+										srch.begin, srch.len) != -1) {
+				res->flags |= RSPAMD_CONTENT_TYPE_SMIME;
+			}
+		}
+
+		/* Derive the type flags from the (possibly repaired) type/subtype */
+		RSPAMD_FTOK_ASSIGN(&srch, "multipart");
+
+		if (rspamd_ftok_casecmp(&res->type, &srch) == 0) {
+			res->flags |= RSPAMD_CONTENT_TYPE_MULTIPART;
+
+			RSPAMD_FTOK_ASSIGN(&srch, "encrypted");
+			if (rspamd_ftok_casecmp(&res->subtype, &srch) == 0) {
+				res->flags |= RSPAMD_CONTENT_TYPE_ENCRYPTED;
+			}
+		}
+		else {
+			RSPAMD_FTOK_ASSIGN(&srch, "text");
+
+			if (rspamd_ftok_casecmp(&res->type, &srch) == 0) {
+				res->flags |= RSPAMD_CONTENT_TYPE_TEXT;
+			}
+			else {
+				RSPAMD_FTOK_ASSIGN(&srch, "message");
+
+				if (rspamd_ftok_casecmp(&res->type, &srch) == 0) {
+					RSPAMD_FTOK_ASSIGN(&srch, "delivery-status");
+
+					if (rspamd_ftok_casecmp(&res->subtype, &srch) == 0) {
+						res->flags |= RSPAMD_CONTENT_TYPE_TEXT | RSPAMD_CONTENT_TYPE_DSN;
+					}
+					else {
+						/* Any message subtype containing "notification" */
+						RSPAMD_FTOK_ASSIGN(&srch, "notification");
+
+						if (rspamd_substring_search_caseless(res->subtype.begin,
+															 res->subtype.len, srch.begin, srch.len) != -1) {
+							res->flags |= RSPAMD_CONTENT_TYPE_TEXT |
+										  RSPAMD_CONTENT_TYPE_DSN;
+						}
+						else {
+							res->flags |= RSPAMD_CONTENT_TYPE_MESSAGE;
+						}
+					}
+				}
+			}
+		}
+	}
+	else {
+		msg_warn_pool("cannot parse content type: %*s", (gint) len, cpy);
+	}
+
+	return res;
+}
+
+/**
+ * Add one parameter to a Content-Disposition structure.
+ *
+ * The name and the value are copied into `pool`, so the input ranges stay
+ * untouched; the copied name is lowercased. RFC 2231 style names are handled
+ * by rspamd_param_maybe_rfc2231_process(); otherwise the copies are used
+ * as is. Parameters sharing a name are chained in a list, and only the first
+ * one is inserted into cd->attrs (created on first use).
+ */
+void rspamd_content_disposition_add_param(rspamd_mempool_t *pool,
+										  struct rspamd_content_disposition *cd,
+										  const gchar *name_start, const gchar *name_end,
+										  const gchar *value_start, const gchar *value_end)
+{
+	rspamd_ftok_t srch;
+	gchar *name_cpy, *value_cpy, *name_cpy_end, *value_cpy_end;
+	struct rspamd_content_type_param *found = NULL, *nparam;
+
+	g_assert(cd != NULL);
+
+	name_cpy = rspamd_mempool_alloc(pool, name_end - name_start);
+	memcpy(name_cpy, name_start, name_end - name_start);
+	name_cpy_end = name_cpy + (name_end - name_start);
+
+	value_cpy = rspamd_mempool_alloc(pool, value_end - value_start);
+	memcpy(value_cpy, value_start, value_end - value_start);
+	value_cpy_end = value_cpy + (value_end - value_start);
+
+	nparam = rspamd_mempool_alloc0(pool, sizeof(*nparam));
+	rspamd_str_lc(name_cpy, name_cpy_end - name_cpy);
+
+	if (!rspamd_param_maybe_rfc2231_process(pool, nparam, name_cpy,
+											name_cpy_end, value_cpy, value_cpy_end)) {
+		/* Plain parameter: no '*' in the name */
+		nparam->name.begin = name_cpy;
+		nparam->name.len = name_cpy_end - name_cpy;
+		nparam->value.begin = value_cpy;
+		nparam->value.len = value_cpy_end - value_cpy;
+	}
+
+	srch.begin = nparam->name.begin;
+	srch.len = nparam->name.len;
+
+	if (cd->attrs) {
+		found = g_hash_table_lookup(cd->attrs, &srch);
+	}
+	else {
+		cd->attrs = g_hash_table_new(rspamd_ftok_icase_hash,
+									 rspamd_ftok_icase_equal);
+	}
+
+	if (!found) {
+		/* First parameter with this name: it becomes the list head */
+		DL_APPEND(found, nparam);
+		g_hash_table_insert(cd->attrs, &nparam->name, nparam);
+	}
+	else {
+		DL_APPEND(found, nparam);
+	}
+}
+
+struct rspamd_content_disposition *
+rspamd_content_disposition_parse(const gchar *in,
+								 gsize len, rspamd_mempool_t *pool)
+{
+	struct rspamd_content_disposition *res = NULL, val;
+
+	if
(rspamd_content_disposition_parser(in, len, &val, pool)) { + + if (val.type == RSPAMD_CT_UNKNOWN) { + /* 'Fix' type to attachment as MUA does */ + val.type = RSPAMD_CT_ATTACHMENT; + } + + res = rspamd_mempool_alloc(pool, sizeof(val)); + memcpy(res, &val, sizeof(val)); + res->lc_data = rspamd_mempool_alloc(pool, len + 1); + rspamd_strlcpy(res->lc_data, in, len + 1); + rspamd_str_lc(res->lc_data, len); + + if (res->attrs) { + rspamd_postprocess_ct_attributes(pool, res->attrs, + rspamd_content_disposition_postprocess, res); + rspamd_mempool_add_destructor(pool, + (rspamd_mempool_destruct_t) g_hash_table_unref, res->attrs); + } + } + else { + msg_warn_pool("cannot parse content disposition: %*s", + (gint) len, in); + } + + return res; +} diff --git a/src/libmime/content_type.h b/src/libmime/content_type.h new file mode 100644 index 0000000..ac49bdc --- /dev/null +++ b/src/libmime/content_type.h @@ -0,0 +1,130 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef SRC_LIBMIME_CONTENT_TYPE_H_ +#define SRC_LIBMIME_CONTENT_TYPE_H_ + +#include "config.h" +#include "libutil/fstring.h" +#include "libutil/mem_pool.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum rspamd_content_type_flags { + RSPAMD_CONTENT_TYPE_VALID = 0, + RSPAMD_CONTENT_TYPE_BROKEN = 1 << 0, + RSPAMD_CONTENT_TYPE_MULTIPART = 1 << 1, + RSPAMD_CONTENT_TYPE_TEXT = 1 << 2, + RSPAMD_CONTENT_TYPE_MESSAGE = 1 << 3, + RSPAMD_CONTENT_TYPE_DSN = 1 << 4, + RSPAMD_CONTENT_TYPE_MISSING = 1 << 5, + RSPAMD_CONTENT_TYPE_ENCRYPTED = 1 << 6, + RSPAMD_CONTENT_TYPE_SMIME = 1 << 7, +}; + +enum rspamd_content_param_flags { + RSPAMD_CONTENT_PARAM_NORMAL = 0, + RSPAMD_CONTENT_PARAM_RFC2231 = (1 << 0), + RSPAMD_CONTENT_PARAM_PIECEWISE = (1 << 1), + RSPAMD_CONTENT_PARAM_BROKEN = (1 << 2), +}; + +struct rspamd_content_type_param { + rspamd_ftok_t name; + rspamd_ftok_t value; + guint rfc2231_id; + enum rspamd_content_param_flags flags; + struct rspamd_content_type_param *prev, *next; +}; + +struct rspamd_content_type { + gchar *cpy; + rspamd_ftok_t type; + rspamd_ftok_t subtype; + rspamd_ftok_t charset; + rspamd_ftok_t boundary; + rspamd_ftok_t orig_boundary; + enum rspamd_content_type_flags flags; + GHashTable *attrs; /* Can be empty */ +}; + +enum rspamd_content_disposition_type { + RSPAMD_CT_UNKNOWN = 0, + RSPAMD_CT_INLINE = 1, + RSPAMD_CT_ATTACHMENT = 2, +}; + +struct rspamd_content_disposition { + gchar *lc_data; + enum rspamd_content_disposition_type type; + rspamd_ftok_t filename; + GHashTable *attrs; /* Can be empty */ +}; + +/** + * Adds new parameter to content type structure + * @param ct + * @param name_start (can be modified) + * @param name_end + * @param value_start (can be modified) + * @param value_end + */ +void rspamd_content_type_add_param(rspamd_mempool_t *pool, + struct rspamd_content_type *ct, + gchar *name_start, gchar *name_end, + gchar *value_start, gchar *value_end); + +/** + * Parse content type from the header (performs copy + lowercase) + * 
@param in + * @param len + * @param pool + * @return + */ +struct rspamd_content_type *rspamd_content_type_parse(const gchar *in, + gsize len, rspamd_mempool_t *pool); + +/** + * Adds new param for content disposition header + * @param pool + * @param cd + * @param name_start + * @param name_end + * @param value_start + * @param value_end + */ +void rspamd_content_disposition_add_param(rspamd_mempool_t *pool, + struct rspamd_content_disposition *cd, + const gchar *name_start, const gchar *name_end, + const gchar *value_start, const gchar *value_end); + +/** + * Parse content-disposition header + * @param in + * @param len + * @param pool + * @return + */ +struct rspamd_content_disposition *rspamd_content_disposition_parse(const gchar *in, + gsize len, + rspamd_mempool_t *pool); + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBMIME_CONTENT_TYPE_H_ */ diff --git a/src/libmime/email_addr.c b/src/libmime/email_addr.c new file mode 100644 index 0000000..0af7388 --- /dev/null +++ b/src/libmime/email_addr.c @@ -0,0 +1,563 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "config.h" +#include "email_addr.h" +#include "message.h" +#include "printf.h" +#include "smtp_parsers.h" + +static void +rspamd_email_address_unescape(struct rspamd_email_address *addr) +{ + const char *h, *end; + char *t, *d; + + if (addr->user_len == 0) { + return; + } + + d = g_malloc(addr->user_len); + t = d; + h = addr->user; + end = h + addr->user_len; + + while (h < end) { + if (*h != '\\') { + *t++ = *h; + } + h++; + } + + addr->user = d; + addr->user_len = t - d; + addr->flags |= RSPAMD_EMAIL_ADDR_USER_ALLOCATED; +} + +struct rspamd_email_address * +rspamd_email_address_from_smtp(const gchar *str, guint len) +{ + struct rspamd_email_address addr, *ret; + gsize nlen; + + if (str == NULL || len == 0) { + return NULL; + } + + rspamd_smtp_addr_parse(str, len, &addr); + + if (addr.flags & RSPAMD_EMAIL_ADDR_VALID) { + ret = g_malloc(sizeof(*ret)); + memcpy(ret, &addr, sizeof(addr)); + + if ((ret->flags & RSPAMD_EMAIL_ADDR_QUOTED) && ret->addr[0] == '"') { + if (ret->flags & RSPAMD_EMAIL_ADDR_HAS_BACKSLASH) { + /* We also need to unquote user */ + rspamd_email_address_unescape(ret); + } + + /* We need to unquote addr */ + nlen = ret->domain_len + ret->user_len + 2; + ret->addr = g_malloc(nlen + 1); + ret->addr_len = rspamd_snprintf((char *) ret->addr, nlen, "%*s@%*s", + (gint) ret->user_len, ret->user, + (gint) ret->domain_len, ret->domain); + ret->flags |= RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED; + } + + return ret; + } + + return NULL; +} + +void rspamd_email_address_free(struct rspamd_email_address *addr) +{ + if (addr) { + if (addr->flags & RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED) { + g_free((void *) addr->addr); + } + + if (addr->flags & RSPAMD_EMAIL_ADDR_USER_ALLOCATED) { + g_free((void *) addr->user); + } + + g_free(addr); + } +} + +static inline void +rspamd_email_address_add(rspamd_mempool_t *pool, + GPtrArray *ar, + struct rspamd_email_address *addr, + GString *name) +{ + struct rspamd_email_address *elt; + guint nlen; + + elt = 
g_malloc0(sizeof(*elt)); + rspamd_mempool_notify_alloc(pool, sizeof(*elt)); + + if (addr != NULL) { + memcpy(elt, addr, sizeof(*addr)); + } + else { + elt->addr = ""; + elt->domain = ""; + elt->raw = "<>"; + elt->raw_len = 2; + elt->user = ""; + elt->flags |= RSPAMD_EMAIL_ADDR_EMPTY; + } + + if ((elt->flags & RSPAMD_EMAIL_ADDR_QUOTED) && elt->addr[0] == '"') { + if (elt->flags & RSPAMD_EMAIL_ADDR_HAS_BACKSLASH) { + /* We also need to unquote user */ + rspamd_email_address_unescape(elt); + } + + /* We need to unquote addr */ + nlen = elt->domain_len + elt->user_len + 2; + elt->addr = g_malloc(nlen + 1); + rspamd_mempool_notify_alloc(pool, nlen + 1); + elt->addr_len = rspamd_snprintf((char *) elt->addr, nlen, "%*s@%*s", + (gint) elt->user_len, elt->user, + (gint) elt->domain_len, elt->domain); + elt->flags |= RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED; + } + + if (name->len > 0) { + rspamd_gstring_strip(name, " \t\v"); + elt->name = rspamd_mime_header_decode(pool, name->str, name->len, NULL); + } + + rspamd_mempool_notify_alloc(pool, name->len); + g_ptr_array_add(ar, elt); +} + +/* + * Tries to parse an email address that doesn't conform RFC + */ +static gboolean +rspamd_email_address_parse_heuristic(const char *data, size_t len, + struct rspamd_email_address *addr) +{ + const gchar *p = data, *at = NULL, *end = data + len; + gboolean ret = FALSE; + + memset(addr, 0, sizeof(*addr)); + + if (*p == '<' && len > 1) { + /* Angled address */ + addr->addr_len = rspamd_memcspn(p + 1, ">", len - 1); + addr->addr = p + 1; + addr->raw = p; + addr->raw_len = len; + ret = TRUE; + + p = p + 1; + len = addr->addr_len; + end = p + len; + } + else if (len > 0) { + addr->addr = p; + addr->addr_len = len; + addr->raw = p; + addr->raw_len = len; + ret = TRUE; + } + + if (ret) { + at = rspamd_memrchr(p, '@', len); + + if (at != NULL && at + 1 < end) { + addr->domain = at + 1; + addr->domain_len = end - (at + 1); + addr->user = p; + addr->user_len = at - p; + } + + if (rspamd_str_has_8bit(p, len)) 
{ + addr->flags |= RSPAMD_EMAIL_ADDR_HAS_8BIT; + } + } + + return ret; +} + +static inline int +rspamd_email_address_check_and_add(const gchar *start, gsize len, + GPtrArray *res, + rspamd_mempool_t *pool, + GString *ns, + gint max_elements) +{ + struct rspamd_email_address addr; + + g_assert(res != NULL); + + if (max_elements > 0 && res->len >= max_elements) { + msg_info_pool_check("reached maximum number of elements %d when adding %v", + max_elements, + ns); + + return -1; + } + + /* The whole email is likely address */ + memset(&addr, 0, sizeof(addr)); + rspamd_smtp_addr_parse(start, len, &addr); + + if (addr.flags & RSPAMD_EMAIL_ADDR_VALID) { + rspamd_email_address_add(pool, res, &addr, ns); + } + else { + /* Try heuristic */ + if (rspamd_email_address_parse_heuristic(start, + len, &addr)) { + rspamd_email_address_add(pool, res, &addr, ns); + + return 1; + } + else { + return 0; + } + } + + return 1; +} + +GPtrArray * +rspamd_email_address_from_mime(rspamd_mempool_t *pool, const gchar *hdr, + guint len, + GPtrArray *src, + gint max_elements) +{ + GPtrArray *res = src; + gboolean seen_at = FALSE, seen_obrace = FALSE; + + const gchar *p = hdr, *end = hdr + len, *c = hdr, *t; + GString *ns, *cpy; + gint obraces, ebraces; + enum { + parse_name = 0, + parse_quoted, + parse_addr, + skip_spaces + } state = parse_name, + next_state = parse_name; + + if (res == NULL) { + res = g_ptr_array_sized_new(2); + rspamd_mempool_add_destructor(pool, rspamd_email_address_list_destroy, + res); + } + else if (max_elements > 0 && res->len >= max_elements) { + msg_info_pool_check("reached maximum number of elements %d", max_elements); + + return res; + } + + ns = g_string_sized_new(len); + cpy = g_string_sized_new(len); + + rspamd_mempool_add_destructor(pool, rspamd_gstring_free_hard, cpy); + + /* First, we need to remove all comments as they are terrible */ + obraces = 0; + ebraces = 0; + + while (p < end) { + if (state == parse_name) { + if (*p == '\\') { + if (obraces == 0) { + 
g_string_append_c(cpy, *p); + } + + p++; + } + else { + if (*p == '"') { + state = parse_quoted; + } + else if (*p == '(') { + obraces++; /* To avoid ) itself being copied */ + } + else if (*p == ')') { + ebraces++; + p++; + } + + if (obraces == ebraces) { + obraces = 0; + ebraces = 0; + } + } + + if (p < end && obraces == 0) { + g_string_append_c(cpy, *p); + } + } + else { + /* Quoted elt */ + if (*p == '\\') { + g_string_append_c(cpy, *p); + p++; + } + else { + if (*p == '"') { + state = parse_name; + } + } + + if (p < end) { + g_string_append_c(cpy, *p); + } + } + + p++; + } + + state = parse_name; + + p = cpy->str; + c = p; + end = p + cpy->len; + + while (p < end) { + switch (state) { + case parse_name: + if (*p == '"') { + /* We need to strip last spaces and update `ns` */ + if (p > c) { + guint nspaces = 0; + + t = p - 1; + + while (t > c && g_ascii_isspace(*t)) { + t--; + nspaces++; + } + + g_string_append_len(ns, c, t - c + 1); + + if (nspaces > 0) { + g_string_append_c(ns, ' '); + } + } + + state = parse_quoted; + c = p + 1; + } + else if (*p == '<') { + if (p > c) { + t = p - 1; + + while (t > c && g_ascii_isspace(*t)) { + t--; + } + + g_string_append_len(ns, c, t - c + 1); + } + + c = p; + state = parse_addr; + } + else if (*p == ',') { + if (p > c && seen_at) { + /* + * Last token must be the address: + * e.g. 
Some name name@domain.com + */ + t = p - 1; + + while (t > c && g_ascii_isspace(*t)) { + t--; + } + + int check = rspamd_email_address_check_and_add(c, t - c + 1, + res, pool, ns, max_elements); + + if (check == 0 && res->len == 0) { + /* Insert fake address */ + rspamd_email_address_add(pool, res, NULL, ns); + } + else if (check != 1) { + goto end; + } + + /* Cleanup for the next use */ + g_string_set_size(ns, 0); + seen_at = FALSE; + } + + state = skip_spaces; + next_state = parse_name; + } + else if (*p == '@') { + seen_at = TRUE; + } + + p++; + break; + case parse_quoted: + if (*p == '\\') { + if (p > c) { + g_string_append_len(ns, c, p - c); + } + + p++; + c = p; + } + else if (*p == '"') { + if (p > c) { + g_string_append_len(ns, c, p - c); + } + + if (p + 1 < end && g_ascii_isspace(p[1])) { + g_string_append_c(ns, ' '); + } + + state = skip_spaces; + next_state = parse_name; + } + else if (*p == '@' && seen_obrace) { + seen_at = TRUE; + } + else if (*p == '<') { + seen_obrace = TRUE; + } + p++; + break; + case parse_addr: + if (*p == '>') { + int check = rspamd_email_address_check_and_add(c, p - c + 1, + res, pool, ns, max_elements); + if (check == 0 && res->len == 0) { + /* Insert a fake address */ + rspamd_email_address_add(pool, res, NULL, ns); + } + else if (check != 1) { + goto end; + } + + /* Cleanup for the next use */ + g_string_set_size(ns, 0); + seen_at = FALSE; + state = skip_spaces; + next_state = parse_name; + } + else if (*p == '@') { + seen_at = TRUE; + } + p++; + break; + case skip_spaces: + if (!g_ascii_isspace(*p)) { + c = p; + state = next_state; + } + else { + p++; + } + break; + } + } + + /* Handle leftover */ + switch (state) { + case parse_name: + /* Assume the whole header as name (bad thing) */ + if (p > c) { + while (p > c && g_ascii_isspace(*p)) { + p--; + } + + if (p > c) { + if (seen_at) { + /* The whole email is likely address */ + int check = rspamd_email_address_check_and_add(c, p - c, + res, pool, ns, max_elements); + if 
(check == 0 && res->len == 0) { + /* Insert a fake address */ + rspamd_email_address_add(pool, res, NULL, ns); + } + else if (check != 1) { + goto end; + } + } + else { + /* No @ seen */ + g_string_append_len(ns, c, p - c); + + if (res->len == 0) { + rspamd_email_address_add(pool, res, NULL, ns); + } + } + } + else if (res->len == 0) { + rspamd_email_address_add(pool, res, NULL, ns); + } + } + break; + case parse_addr: + if (p > c) { + if (rspamd_email_address_check_and_add(c, p - c, + res, pool, ns, max_elements) == 0) { + if (res->len == 0) { + rspamd_email_address_add(pool, res, NULL, ns); + } + } + } + break; + case parse_quoted: + /* Unfinished quoted string or a comment */ + /* If we have seen obrace + at, then we still can try to resolve address */ + if (seen_at && seen_obrace) { + p = rspamd_memrchr(cpy->str, '<', cpy->len); + g_assert(p != NULL); + if (rspamd_email_address_check_and_add(p, end - p, + res, pool, ns, max_elements) == 0) { + if (res->len == 0) { + rspamd_email_address_add(pool, res, NULL, ns); + } + } + } + break; + default: + /* Do nothing */ + break; + } +end: + rspamd_mempool_notify_alloc(pool, cpy->len); + g_string_free(ns, TRUE); + + return res; +} + +void rspamd_email_address_list_destroy(gpointer ptr) +{ + GPtrArray *ar = ptr; + guint i; + struct rspamd_email_address *addr; + + PTR_ARRAY_FOREACH(ar, i, addr) + { + rspamd_email_address_free(addr); + } + + g_ptr_array_free(ar, TRUE); +}
\ No newline at end of file diff --git a/src/libmime/email_addr.h b/src/libmime/email_addr.h new file mode 100644 index 0000000..ed00722 --- /dev/null +++ b/src/libmime/email_addr.h @@ -0,0 +1,97 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBMIME_EMAIL_ADDR_H_ +#define SRC_LIBMIME_EMAIL_ADDR_H_ + +#include "config.h" +#include "libutil/mem_pool.h" +#include "libutil/ref.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_mime_header; + +enum rspamd_email_address_flags { + RSPAMD_EMAIL_ADDR_VALID = (1 << 0), + RSPAMD_EMAIL_ADDR_IP = (1 << 1), + RSPAMD_EMAIL_ADDR_BRACED = (1 << 2), + RSPAMD_EMAIL_ADDR_QUOTED = (1 << 3), + RSPAMD_EMAIL_ADDR_EMPTY = (1 << 4), + RSPAMD_EMAIL_ADDR_HAS_BACKSLASH = (1 << 5), + RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED = (1 << 6), + RSPAMD_EMAIL_ADDR_USER_ALLOCATED = (1 << 7), + RSPAMD_EMAIL_ADDR_HAS_8BIT = (1 << 8), + RSPAMD_EMAIL_ADDR_ALIASED = (1 << 9), + RSPAMD_EMAIL_ADDR_ORIGINAL = (1 << 10), +}; + +/* + * Structure that represents email address in a convenient way + */ +struct rspamd_email_address { + const gchar *raw; + const gchar *addr; + const gchar *user; + const gchar *domain; + const gchar *name; + + guint raw_len; + guint addr_len; + guint domain_len; + guint user_len; + guint flags; +}; + +struct rspamd_task; + +/** + * Create email address from a single rfc822 address (e.g. 
from mail from:) + * @param str string to use + * @param len length of string + * @return + */ +struct rspamd_email_address *rspamd_email_address_from_smtp(const gchar *str, guint len); + +/** + * Parses email address from the mime header, decodes names and return the array + * of `rspamd_email_address`. If `src` is NULL, then this function creates a new + * array and adds a destructor to remove elements when `pool` is destroyed. + * Otherwise, addresses are appended to `src`. + * @param hdr + * @param len + * @return + */ +GPtrArray * +rspamd_email_address_from_mime(rspamd_mempool_t *pool, const gchar *hdr, guint len, + GPtrArray *src, gint max_elements); + +/** + * Destroys list of email addresses + * @param ptr + */ +void rspamd_email_address_list_destroy(gpointer ptr); + +void rspamd_email_address_free(struct rspamd_email_address *addr); + + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBMIME_EMAIL_ADDR_H_ */ diff --git a/src/libmime/images.c b/src/libmime/images.c new file mode 100644 index 0000000..1344d91 --- /dev/null +++ b/src/libmime/images.c @@ -0,0 +1,718 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "images.h" +#include "task.h" +#include "message.h" +#include "libserver/html/html.h" + +#define msg_debug_images(...) 
rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_images_log_id, "images", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(images) + +#ifdef USABLE_GD +#include "gd.h" +#include "hash.h" +#include <math.h> + +#define RSPAMD_NORMALIZED_DIM 64 + +static rspamd_lru_hash_t *images_hash = NULL; +#endif + +static const guint8 png_signature[] = {137, 80, 78, 71, 13, 10, 26, 10}; +static const guint8 jpg_sig1[] = {0xff, 0xd8}; +static const guint8 jpg_sig_jfif[] = {0xff, 0xe0}; +static const guint8 jpg_sig_exif[] = {0xff, 0xe1}; +static const guint8 gif_signature[] = {'G', 'I', 'F', '8'}; +static const guint8 bmp_signature[] = {'B', 'M'}; + +static bool process_image(struct rspamd_task *task, struct rspamd_mime_part *part); + + +bool rspamd_images_process_mime_part_maybe(struct rspamd_task *task, + struct rspamd_mime_part *part) +{ + if (part->part_type == RSPAMD_MIME_PART_UNDEFINED) { + if (part->detected_type && + strcmp(part->detected_type, "image") == 0 && + part->parsed_data.len > 0) { + + return process_image(task, part); + } + } + + return false; +} + +void rspamd_images_process(struct rspamd_task *task) +{ + guint i; + struct rspamd_mime_part *part; + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part) + { + rspamd_images_process_mime_part_maybe(task, part); + } +} + +static enum rspamd_image_type +detect_image_type(rspamd_ftok_t *data) +{ + if (data->len > sizeof(png_signature) / sizeof(png_signature[0])) { + if (memcmp(data->begin, png_signature, sizeof(png_signature)) == 0) { + return IMAGE_TYPE_PNG; + } + } + if (data->len > 10) { + if (memcmp(data->begin, jpg_sig1, sizeof(jpg_sig1)) == 0) { + if (memcmp(data->begin + 2, jpg_sig_jfif, sizeof(jpg_sig_jfif)) == 0 || + memcmp(data->begin + 2, jpg_sig_exif, sizeof(jpg_sig_exif)) == 0) { + return IMAGE_TYPE_JPG; + } + } + } + if (data->len > sizeof(gif_signature) / sizeof(gif_signature[0])) { + if (memcmp(data->begin, gif_signature, sizeof(gif_signature)) == 0) { + return 
IMAGE_TYPE_GIF; + } + } + if (data->len > sizeof(bmp_signature) / sizeof(bmp_signature[0])) { + if (memcmp(data->begin, bmp_signature, sizeof(bmp_signature)) == 0) { + return IMAGE_TYPE_BMP; + } + } + + return IMAGE_TYPE_UNKNOWN; +} + + +static struct rspamd_image * +process_png_image(rspamd_mempool_t *pool, rspamd_ftok_t *data) +{ + struct rspamd_image *img; + guint32 t; + const guint8 *p; + + if (data->len < 24) { + msg_info_pool("bad png detected (maybe striped)"); + return NULL; + } + + /* In png we should find iHDR section and get data from it */ + /* Skip signature and read header section */ + p = data->begin + 12; + if (memcmp(p, "IHDR", 4) != 0) { + msg_info_pool("png doesn't begins with IHDR section"); + return NULL; + } + + img = rspamd_mempool_alloc0(pool, sizeof(struct rspamd_image)); + img->type = IMAGE_TYPE_PNG; + img->data = data; + + p += 4; + memcpy(&t, p, sizeof(guint32)); + img->width = ntohl(t); + p += 4; + memcpy(&t, p, sizeof(guint32)); + img->height = ntohl(t); + + return img; +} + +static struct rspamd_image * +process_jpg_image(rspamd_mempool_t *pool, rspamd_ftok_t *data) +{ + const guint8 *p, *end; + guint16 h, w; + struct rspamd_image *img; + + img = rspamd_mempool_alloc0(pool, sizeof(struct rspamd_image)); + img->type = IMAGE_TYPE_JPG; + img->data = data; + + p = data->begin; + end = p + data->len - 8; + p += 2; + + while (p < end) { + if (p[0] == 0xFF && p[1] != 0xFF) { + guint len = p[2] * 256 + p[3]; + + p++; + + if (*p == 0xc0 || *p == 0xc1 || *p == 0xc2 || *p == 0xc3 || + *p == 0xc9 || *p == 0xca || *p == 0xcb) { + memcpy(&h, p + 4, sizeof(guint16)); + h = p[4] * 0xff + p[5]; + img->height = h; + w = p[6] * 0xff + p[7]; + img->width = w; + + return img; + } + + + p += len; + } + else { + p++; + } + } + + return NULL; +} + +static struct rspamd_image * +process_gif_image(rspamd_mempool_t *pool, rspamd_ftok_t *data) +{ + struct rspamd_image *img; + const guint8 *p; + guint16 t; + + if (data->len < 10) { + msg_info_pool("bad gif 
detected (maybe striped)"); + return NULL; + } + + img = rspamd_mempool_alloc0(pool, sizeof(struct rspamd_image)); + img->type = IMAGE_TYPE_GIF; + img->data = data; + + p = data->begin + 6; + memcpy(&t, p, sizeof(guint16)); + img->width = GUINT16_FROM_LE(t); + memcpy(&t, p + 2, sizeof(guint16)); + img->height = GUINT16_FROM_LE(t); + + return img; +} + +static struct rspamd_image * +process_bmp_image(rspamd_mempool_t *pool, rspamd_ftok_t *data) +{ + struct rspamd_image *img; + gint32 t; + const guint8 *p; + + if (data->len < 28) { + msg_info_pool("bad bmp detected (maybe striped)"); + return NULL; + } + + img = rspamd_mempool_alloc0(pool, sizeof(struct rspamd_image)); + img->type = IMAGE_TYPE_BMP; + img->data = data; + p = data->begin + 18; + memcpy(&t, p, sizeof(guint32)); + img->width = GUINT32_FROM_LE(t); + memcpy(&t, p + 4, sizeof(gint32)); + img->height = GUINT32_FROM_LE(t); + + return img; +} + +#ifdef USABLE_GD +/* + * DCT from Emil Mikulic. + * http://unix4lyfe.org/dct/ + */ +static void +rspamd_image_dct_block(gint pixels[8][8], gdouble *out) +{ + gint i; + gint rows[8][8]; + + static const gint c1 = 1004 /* cos(pi/16) << 10 */, + s1 = 200 /* sin(pi/16) */, + c3 = 851 /* cos(3pi/16) << 10 */, + s3 = 569 /* sin(3pi/16) << 10 */, + r2c6 = 554 /* sqrt(2)*cos(6pi/16) << 10 */, + r2s6 = 1337 /* sqrt(2)*sin(6pi/16) << 10 */, + r2 = 181; /* sqrt(2) << 7*/ + + gint x0, x1, x2, x3, x4, x5, x6, x7, x8; + + /* transform rows */ + for (i = 0; i < 8; i++) { + x0 = pixels[0][i]; + x1 = pixels[1][i]; + x2 = pixels[2][i]; + x3 = pixels[3][i]; + x4 = pixels[4][i]; + x5 = pixels[5][i]; + x6 = pixels[6][i]; + x7 = pixels[7][i]; + + /* Stage 1 */ + x8 = x7 + x0; + x0 -= x7; + x7 = x1 + x6; + x1 -= x6; + x6 = x2 + x5; + x2 -= x5; + x5 = x3 + x4; + x3 -= x4; + + /* Stage 2 */ + x4 = x8 + x5; + x8 -= x5; + x5 = x7 + x6; + x7 -= x6; + x6 = c1 * (x1 + x2); + x2 = (-s1 - c1) * x2 + x6; + x1 = (s1 - c1) * x1 + x6; + x6 = c3 * (x0 + x3); + x3 = (-s3 - c3) * x3 + x6; + x0 = (s3 - c3) * 
x0 + x6; + + /* Stage 3 */ + x6 = x4 + x5; + x4 -= x5; + x5 = r2c6 * (x7 + x8); + x7 = (-r2s6 - r2c6) * x7 + x5; + x8 = (r2s6 - r2c6) * x8 + x5; + x5 = x0 + x2; + x0 -= x2; + x2 = x3 + x1; + x3 -= x1; + + /* Stage 4 and output */ + rows[i][0] = x6; + rows[i][4] = x4; + rows[i][2] = x8 >> 10; + rows[i][6] = x7 >> 10; + rows[i][7] = (x2 - x5) >> 10; + rows[i][1] = (x2 + x5) >> 10; + rows[i][3] = (x3 * r2) >> 17; + rows[i][5] = (x0 * r2) >> 17; + } + + /* transform columns */ + for (i = 0; i < 8; i++) { + x0 = rows[0][i]; + x1 = rows[1][i]; + x2 = rows[2][i]; + x3 = rows[3][i]; + x4 = rows[4][i]; + x5 = rows[5][i]; + x6 = rows[6][i]; + x7 = rows[7][i]; + + /* Stage 1 */ + x8 = x7 + x0; + x0 -= x7; + x7 = x1 + x6; + x1 -= x6; + x6 = x2 + x5; + x2 -= x5; + x5 = x3 + x4; + x3 -= x4; + + /* Stage 2 */ + x4 = x8 + x5; + x8 -= x5; + x5 = x7 + x6; + x7 -= x6; + x6 = c1 * (x1 + x2); + x2 = (-s1 - c1) * x2 + x6; + x1 = (s1 - c1) * x1 + x6; + x6 = c3 * (x0 + x3); + x3 = (-s3 - c3) * x3 + x6; + x0 = (s3 - c3) * x0 + x6; + + /* Stage 3 */ + x6 = x4 + x5; + x4 -= x5; + x5 = r2c6 * (x7 + x8); + x7 = (-r2s6 - r2c6) * x7 + x5; + x8 = (r2s6 - r2c6) * x8 + x5; + x5 = x0 + x2; + x0 -= x2; + x2 = x3 + x1; + x3 -= x1; + + /* Stage 4 and output */ + out[i * 8] = (double) ((x6 + 16) >> 3); + out[i * 8 + 1] = (double) ((x4 + 16) >> 3); + out[i * 8 + 2] = (double) ((x8 + 16384) >> 13); + out[i * 8 + 3] = (double) ((x7 + 16384) >> 13); + out[i * 8 + 4] = (double) ((x2 - x5 + 16384) >> 13); + out[i * 8 + 5] = (double) ((x2 + x5 + 16384) >> 13); + out[i * 8 + 6] = (double) (((x3 >> 8) * r2 + 8192) >> 12); + out[i * 8 + 7] = (double) (((x0 >> 8) * r2 + 8192) >> 12); + } +} + +struct rspamd_image_cache_entry { + guchar digest[64]; + guchar dct[RSPAMD_DCT_LEN / NBBY]; +}; + +static void +rspamd_image_cache_entry_dtor(gpointer p) +{ + struct rspamd_image_cache_entry *entry = p; + g_free(entry); +} + +static guint32 +rspamd_image_dct_hash(gconstpointer p) +{ + return rspamd_cryptobox_fast_hash(p, 
/**
 * Computes the DCT-based bit signature of an image (`img->dct`).
 *
 * Does nothing unless built with USABLE_GD. The image is skipped when its
 * data is empty, larger than G_MAXINT32 or than `task->cfg->max_pic_size`,
 * when either dimension is not larger than RSPAMD_NORMALIZED_DIM, or when
 * its type is not JPG/PNG/GIF/BMP. A signature found in the cache by
 * rspamd_image_check_hash() is reused; otherwise the image is decoded,
 * scaled to RSPAMD_NORMALIZED_DIM x RSPAMD_NORMALIZED_DIM, converted to
 * grayscale, transformed in 8x8 tiles and the result is stored in the cache.
 *
 * On success `img->is_normalized` is set and `img->dct` is owned by the
 * task pool (freed by a pool destructor).
 *
 * @param task current task (config, pool, logging)
 * @param img image to normalize
 */
void rspamd_image_normalize(struct rspamd_task *task, struct rspamd_image *img)
{
#ifdef USABLE_GD
	gdImagePtr src = NULL, dst = NULL;
	guint i, j, k, l;
	gdouble *dct;

	if (img->data->len == 0 || img->data->len > G_MAXINT32) {
		return;
	}

	if (img->height <= RSPAMD_NORMALIZED_DIM ||
		img->width <= RSPAMD_NORMALIZED_DIM) {
		return;
	}

	if (img->data->len > task->cfg->max_pic_size) {
		return;
	}

	if (rspamd_image_check_hash(task, img)) {
		/* Signature was restored from the cache */
		return;
	}

	switch (img->type) {
	case IMAGE_TYPE_JPG:
		src = gdImageCreateFromJpegPtr(img->data->len, (void *) img->data->begin);
		break;
	case IMAGE_TYPE_PNG:
		src = gdImageCreateFromPngPtr(img->data->len, (void *) img->data->begin);
		break;
	case IMAGE_TYPE_GIF:
		src = gdImageCreateFromGifPtr(img->data->len, (void *) img->data->begin);
		break;
	case IMAGE_TYPE_BMP:
		src = gdImageCreateFromBmpPtr(img->data->len, (void *) img->data->begin);
		break;
	default:
		return;
	}

	if (src == NULL) {
		msg_info_task("cannot load image of type %s from %T",
					  rspamd_image_type_str(img->type), img->filename);
		return;
	}

	gdImageSetInterpolationMethod(src, GD_BILINEAR_FIXED);
	dst = gdImageScale(src, RSPAMD_NORMALIZED_DIM, RSPAMD_NORMALIZED_DIM);
	gdImageDestroy(src);

	if (dst == NULL) {
		/* Scaling can fail; do not touch a NULL image or mark img as done */
		msg_info_task("cannot scale image of type %s from %T",
					  rspamd_image_type_str(img->type), img->filename);
		return;
	}

	gdImageGrayScale(dst);

	img->is_normalized = TRUE;
	dct = g_malloc0(sizeof(gdouble) * RSPAMD_DCT_LEN);
	img->dct = g_malloc0(RSPAMD_DCT_LEN / NBBY);
	rspamd_mempool_add_destructor(task->task_pool, g_free,
								  img->dct);

	/*
	 * Walk the scaled image in 8x8 pixel tiles. For every tile:
	 *  1. run the DCT, writing 64 coefficients at
	 *     dct[i * RSPAMD_NORMALIZED_DIM + j ...];
	 *  2. compute the running mean of those 64 coefficients;
	 *  3. set the signature bit of every coefficient that is >= the mean.
	 *
	 * NOTE(review): with this offset the 64-value output ranges of
	 * neighbouring tiles overlap in `dct`. It is kept as is, since changing
	 * the layout would change every produced signature.
	 */
	for (i = 0; i < RSPAMD_NORMALIZED_DIM; i += 8) {
		for (j = 0; j < RSPAMD_NORMALIZED_DIM; j += 8) {
			gint p[8][8];
			gdouble *tile = dct + i * RSPAMD_NORMALIZED_DIM + j;
			gdouble avg = 0.0;

			for (k = 0; k < 8; k++) {
				for (l = 0; l < 8; l++) {
					p[k][l] = gdImageGetPixel(dst, i + k, j + l);
				}
			}

			rspamd_image_dct_block(p, tile);

			for (k = 0; k < 8; k++) {
				for (l = 0; l < 8; l++) {
					gdouble x = tile[k * 8 + l];

					/* Incremental mean over the values seen so far */
					avg += (x - avg) / (gdouble) (k * 8 + l + 1);
				}
			}

			for (k = 0; k < 8; k++) {
				for (l = 0; l < 8; l++) {
					guint idx = i * RSPAMD_NORMALIZED_DIM + j + k * 8 + l;

					if (dct[idx] >= avg) {
						setbit(img->dct, idx);
					}
				}
			}
		}
	}

	gdImageDestroy(dst);
	g_free(dct);
	rspamd_image_save_hash(task, img);
#endif
}
+ msg_debug_images("detected %s image of size %ud x %ud", + rspamd_image_type_str(img->type), + img->width, img->height); + + if (part->cd) { + img->filename = &part->cd->filename; + } + + img->parent = part; + + part->part_type = RSPAMD_MIME_PART_IMAGE; + part->specific.img = img; + + return true; + } + + return false; +} + +const gchar * +rspamd_image_type_str(enum rspamd_image_type type) +{ + switch (type) { + case IMAGE_TYPE_PNG: + return "PNG"; + break; + case IMAGE_TYPE_JPG: + return "JPEG"; + break; + case IMAGE_TYPE_GIF: + return "GIF"; + break; + case IMAGE_TYPE_BMP: + return "BMP"; + break; + default: + break; + } + + return "unknown"; +} + +static void +rspamd_image_process_part(struct rspamd_task *task, struct rspamd_mime_part *part) +{ + struct rspamd_mime_header *rh; + struct rspamd_mime_text_part *tp; + struct html_image *himg; + const gchar *cid; + guint cid_len, i; + struct rspamd_image *img; + + img = (struct rspamd_image *) part->specific.img; + + if (img) { + /* Check Content-Id */ + rh = rspamd_message_get_header_from_hash(part->raw_headers, + "Content-Id", FALSE); + + if (rh) { + cid = rh->decoded; + + if (*cid == '<') { + cid++; + } + + cid_len = strlen(cid); + + if (cid_len > 0) { + if (cid[cid_len - 1] == '>') { + cid_len--; + } + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, tp) + { + if (IS_TEXT_PART_HTML(tp) && tp->html != NULL) { + himg = rspamd_html_find_embedded_image(tp->html, cid, cid_len); + + if (himg != NULL) { + img->html_image = himg; + himg->embedded_image = img; + + msg_debug_images("found linked image by cid: <%s>", + cid); + + if (himg->height == 0) { + himg->height = img->height; + } + + if (himg->width == 0) { + himg->width = img->width; + } + } + } + } + } + } + } +} + +void rspamd_images_link(struct rspamd_task *task) +{ + struct rspamd_mime_part *part; + guint i; + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part) + { + if (part->part_type == RSPAMD_MIME_PART_IMAGE) { + rspamd_image_process_part(task, 
part); + } + } +}
\ No newline at end of file diff --git a/src/libmime/images.h b/src/libmime/images.h new file mode 100644 index 0000000..bf8b3be --- /dev/null +++ b/src/libmime/images.h @@ -0,0 +1,76 @@ +#ifndef IMAGES_H_ +#define IMAGES_H_ + +#include "config.h" +#include "fstring.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct html_image; +struct rspamd_task; +struct rspamd_mime_part; + +#define RSPAMD_DCT_LEN (64 * 64) + +enum rspamd_image_type { + IMAGE_TYPE_PNG = 0, + IMAGE_TYPE_JPG, + IMAGE_TYPE_GIF, + IMAGE_TYPE_BMP, + IMAGE_TYPE_UNKNOWN +}; + +struct rspamd_image { + struct rspamd_mime_part *parent; + rspamd_ftok_t *data; + rspamd_ftok_t *filename; + struct html_image *html_image; + enum rspamd_image_type type; + guint32 width; + guint32 height; + gboolean is_normalized; + guchar *dct; +}; + +/* + * Process images from a worker task + */ +void rspamd_images_process(struct rspamd_task *task); + +/** + * Process image if possible in a single mime part + * @param task + * @param part + * @return + */ +bool rspamd_images_process_mime_part_maybe(struct rspamd_task *task, + struct rspamd_mime_part *part); + +/* + * Link embedded images to the HTML parts + */ +void rspamd_images_link(struct rspamd_task *task); + +/** + * Processes image in raw data + * @param task + * @param data + * @return + */ +struct rspamd_image *rspamd_maybe_process_image(rspamd_mempool_t *pool, + rspamd_ftok_t *data); + +/* + * Get textual representation of an image's type + */ +const gchar *rspamd_image_type_str(enum rspamd_image_type type); + +void rspamd_image_normalize(struct rspamd_task *task, struct rspamd_image *img); + +#ifdef __cplusplus +} +#endif + +#endif /* IMAGES_H_ */ diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c new file mode 100644 index 0000000..bdd0aad --- /dev/null +++ b/src/libmime/lang_detection.c @@ -0,0 +1,2103 @@ +/* + * Copyright 2024 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use 
this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "lang_detection.h" +#include "lang_detection_fasttext.h" +#include "libserver/logger.h" +#include "libcryptobox/cryptobox.h" +#include "libutil/multipattern.h" +#include "ucl.h" +#include "khash.h" +#include "libstemmer.h" + +#include <glob.h> +#include <unicode/utf8.h> +#include <unicode/utf16.h> +#include <unicode/ucnv.h> +#include <unicode/uchar.h> +#include <unicode/ustring.h> +#include <math.h> + +static const gsize default_short_text_limit = 10; +static const gsize default_words = 80; +static const gdouble update_prob = 0.6; +static const gchar *default_languages_path = RSPAMD_SHAREDIR "/languages"; + +#undef EXTRA_LANGDET_DEBUG + +struct rspamd_language_unicode_match { + const gchar *lang; + gint unicode_code; +}; + +/* + * List of languages detected by unicode scripts + */ +static const struct rspamd_language_unicode_match unicode_langs[] = { + {"el", RSPAMD_UNICODE_GREEK}, + {"ml", RSPAMD_UNICODE_MALAYALAM}, + {"te", RSPAMD_UNICODE_TELUGU}, + {"ta", RSPAMD_UNICODE_TAMIL}, + {"gu", RSPAMD_UNICODE_GUJARATI}, + {"th", RSPAMD_UNICODE_THAI}, + {"ka", RSPAMD_UNICODE_GEORGIAN}, + {"si", RSPAMD_UNICODE_SINHALA}, + {"hy", RSPAMD_UNICODE_ARMENIAN}, + {"ja", RSPAMD_UNICODE_JP}, + {"ko", RSPAMD_UNICODE_HANGUL}, +}; + +/* + * Top languages + */ +static const gchar *tier0_langs[] = { + "en", +}; +static const gchar *tier1_langs[] = { + "fr", "it", "de", "es", "nl", + "pt", "ru", "pl", "tk", "th", "ar"}; + +enum rspamd_language_category { + RSPAMD_LANGUAGE_LATIN = 0, + 
RSPAMD_LANGUAGE_CYRILLIC, + RSPAMD_LANGUAGE_DEVANAGARI, + RSPAMD_LANGUAGE_ARAB, + RSPAMD_LANGUAGE_MAX, +}; + +struct rspamd_language_elt { + const gchar *name; /* e.g. "en" or "ru" */ + gint flags; /* enum rspamd_language_elt_flags */ + enum rspamd_language_category category; + guint trigrams_words; + guint stop_words; + gdouble mean; + gdouble std; + guint occurrences; /* total number of parts with this language */ +}; + +struct rspamd_ngramm_elt { + struct rspamd_language_elt *elt; + gdouble prob; +}; + +struct rspamd_ngramm_chain { + GPtrArray *languages; + gdouble mean; + gdouble std; + gchar *utf; +}; + +struct rspamd_stop_word_range { + guint start; + guint stop; + struct rspamd_language_elt *elt; +}; + +struct rspamd_stop_word_elt { + struct rspamd_multipattern *mp; + GArray *ranges; /* of rspamd_stop_word_range */ +}; + +#define msg_debug_lang_det(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_debug_lang_det_cfg(...) 
rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_langdet_log_id, "langdet", cfg->cfg_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE_PUBLIC(langdet) + +static const struct rspamd_language_unicode_match * +rspamd_language_search_unicode_match(const gchar *key, + const struct rspamd_language_unicode_match *elts, size_t nelts) +{ + size_t i; + + for (i = 0; i < nelts; i++) { + if (strcmp(elts[i].lang, key) == 0) { + return &elts[i]; + } + } + + return NULL; +} + +static gboolean +rspamd_language_search_str(const gchar *key, const gchar *elts[], size_t nelts) +{ + size_t i; + + for (i = 0; i < nelts; i++) { + if (strcmp(elts[i], key) == 0) { + return TRUE; + } + } + return FALSE; +} + +static guint +rspamd_trigram_hash_func(gconstpointer key) +{ + return rspamd_cryptobox_fast_hash(key, 3 * sizeof(UChar32), + rspamd_hash_seed()); +} + +static gboolean +rspamd_trigram_equal_func(gconstpointer v, gconstpointer v2) +{ + return memcmp(v, v2, 3 * sizeof(UChar32)) == 0; +} + +KHASH_INIT(rspamd_trigram_hash, const UChar32 *, struct rspamd_ngramm_chain, true, + rspamd_trigram_hash_func, rspamd_trigram_equal_func); +KHASH_INIT(rspamd_candidates_hash, const gchar *, + struct rspamd_lang_detector_res *, true, + rspamd_str_hash, rspamd_str_equal); +KHASH_INIT(rspamd_stopwords_hash, rspamd_ftok_t *, + char, false, + rspamd_ftok_hash, rspamd_ftok_equal); + +KHASH_INIT(rspamd_languages_hash, const gchar *, struct rspamd_language_elt *, true, + rspamd_str_hash, rspamd_str_equal); +struct rspamd_lang_detector { + khash_t(rspamd_languages_hash) * languages; + khash_t(rspamd_trigram_hash) * trigrams[RSPAMD_LANGUAGE_MAX]; /* trigrams frequencies */ + struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX]; + khash_t(rspamd_stopwords_hash) * stop_words_norm; + UConverter *uchar_converter; + gsize short_text_limit; + bool prefer_fasttext; + gsize total_occurrences; /* number of all languages found */ + gpointer fasttext_detector; + ref_entry_t ref; +}; + +static 
void +rspamd_language_detector_ucs_lowercase(UChar32 *s, gsize len) +{ + gsize i; + + for (i = 0; i < len; i++) { + s[i] = u_tolower(s[i]); + } +} + +static gboolean +rspamd_language_detector_ucs_is_latin(const UChar32 *s, gsize len) +{ + gsize i; + gboolean ret = TRUE; + + for (i = 0; i < len; i++) { + if (s[i] >= 128 || !(g_ascii_isalnum(s[i]) || s[i] == ' ')) { + ret = FALSE; + break; + } + } + + return ret; +} + +struct rspamd_language_ucs_elt { + guint freq; + const gchar *utf; + UChar32 s[0]; +}; + +static void +rspamd_language_detector_init_ngramm(struct rspamd_config *cfg, + struct rspamd_lang_detector *d, + struct rspamd_language_elt *lelt, + struct rspamd_language_ucs_elt *ucs, + guint len, + guint freq, + guint total, + khash_t(rspamd_trigram_hash) * htb) +{ + struct rspamd_ngramm_chain *chain = NULL, st_chain; + struct rspamd_ngramm_elt *elt; + khiter_t k; + guint i; + gboolean found; + + switch (len) { + case 1: + case 2: + g_assert_not_reached(); + break; + case 3: + k = kh_get(rspamd_trigram_hash, htb, ucs->s); + if (k != kh_end(htb)) { + chain = &kh_value(htb, k); + } + break; + default: + g_assert_not_reached(); + break; + } + + if (chain == NULL) { + /* New element */ + chain = &st_chain; + memset(chain, 0, sizeof(st_chain)); + chain->languages = g_ptr_array_sized_new(32); + rspamd_mempool_add_destructor(cfg->cfg_pool, rspamd_ptr_array_free_hard, + chain->languages); + chain->utf = rspamd_mempool_strdup(cfg->cfg_pool, ucs->utf); + elt = rspamd_mempool_alloc(cfg->cfg_pool, sizeof(*elt)); + elt->elt = lelt; + elt->prob = ((gdouble) freq) / ((gdouble) total); + g_ptr_array_add(chain->languages, elt); + + k = kh_put(rspamd_trigram_hash, htb, ucs->s, &i); + kh_value(htb, k) = *chain; + } + else { + /* Check sanity */ + found = FALSE; + + PTR_ARRAY_FOREACH(chain->languages, i, elt) + { + if (strcmp(elt->elt->name, lelt->name) == 0) { + found = TRUE; + elt->prob += ((gdouble) freq) / ((gdouble) total); + break; + } + } + + if (!found) { + elt = 
rspamd_mempool_alloc(cfg->cfg_pool, sizeof(*elt)); + elt->elt = lelt; + elt->prob = ((gdouble) freq) / ((gdouble) total); + g_ptr_array_add(chain->languages, elt); + } + } +} + +static inline enum rspamd_language_category +rspamd_language_detector_get_category(guint uflags) +{ + enum rspamd_language_category cat = RSPAMD_LANGUAGE_LATIN; + + if (uflags & RSPAMD_UNICODE_CYRILLIC) { + cat = RSPAMD_LANGUAGE_CYRILLIC; + } + else if (uflags & RSPAMD_UNICODE_DEVANAGARI) { + cat = RSPAMD_LANGUAGE_DEVANAGARI; + } + else if (uflags & RSPAMD_UNICODE_ARABIC) { + cat = RSPAMD_LANGUAGE_ARAB; + } + + return cat; +} + +static const gchar * +rspamd_language_detector_print_flags(struct rspamd_language_elt *elt) +{ + static gchar flags_buf[256]; + goffset r = 0; + + if (elt->flags & RS_LANGUAGE_TIER1) { + r += rspamd_snprintf(flags_buf + r, sizeof(flags_buf) - r, "tier1,"); + } + if (elt->flags & RS_LANGUAGE_TIER0) { + r += rspamd_snprintf(flags_buf + r, sizeof(flags_buf) - r, "tier0,"); + } + if (elt->flags & RS_LANGUAGE_LATIN) { + r += rspamd_snprintf(flags_buf + r, sizeof(flags_buf) - r, "latin,"); + } + + if (r > 0) { + flags_buf[r - 1] = '\0'; + } + else { + flags_buf[r] = '\0'; + } + + return flags_buf; +} + +static gint +rspamd_language_detector_cmp_ngramm(gconstpointer a, gconstpointer b) +{ + struct rspamd_language_ucs_elt *e1 = *(struct rspamd_language_ucs_elt **) a; + struct rspamd_language_ucs_elt *e2 = *(struct rspamd_language_ucs_elt **) b; + + return (gint) e2->freq - (gint) e1->freq; +} + +static void +rspamd_language_detector_read_file(struct rspamd_config *cfg, + struct rspamd_lang_detector *d, + const gchar *path, + const ucl_object_t *stop_words) +{ + struct ucl_parser *parser; + ucl_object_t *top; + const ucl_object_t *freqs, *n_words, *cur, *type, *flags; + ucl_object_iter_t it = NULL; + UErrorCode uc_err = U_ZERO_ERROR; + struct rspamd_language_elt *nelt; + struct rspamd_language_ucs_elt *ucs_elt; + khash_t(rspamd_trigram_hash) *htb = NULL; + gchar *pos; + 
guint total = 0, total_latin = 0, total_ngramms = 0, i, skipped, + loaded; + gdouble mean = 0, std = 0, delta = 0, delta2 = 0, m2 = 0; + enum rspamd_language_category cat = RSPAMD_LANGUAGE_MAX; + + parser = ucl_parser_new(UCL_PARSER_NO_FILEVARS); + if (!ucl_parser_add_file(parser, path)) { + msg_warn_config("cannot parse file %s: %s", path, + ucl_parser_get_error(parser)); + ucl_parser_free(parser); + + return; + } + + top = ucl_parser_get_object(parser); + ucl_parser_free(parser); + + freqs = ucl_object_lookup(top, "freq"); + + if (freqs == NULL) { + msg_warn_config("file %s has no 'freq' key", path); + ucl_object_unref(top); + + return; + } + + pos = strrchr(path, '/'); + g_assert(pos != NULL); + nelt = rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(*nelt)); + nelt->name = rspamd_mempool_strdup(cfg->cfg_pool, pos + 1); + /* Remove extension */ + pos = strchr(nelt->name, '.'); + g_assert(pos != NULL); + *pos = '\0'; + + n_words = ucl_object_lookup(top, "n_words"); + + if (n_words == NULL || ucl_object_type(n_words) != UCL_ARRAY || + n_words->len != 3) { + msg_warn_config("cannot find n_words in language %s", nelt->name); + ucl_object_unref(top); + + return; + } + else { + nelt->trigrams_words = ucl_object_toint(ucl_array_find_index(n_words, + 2)); + } + + type = ucl_object_lookup(top, "type"); + + if (type == NULL || ucl_object_type(type) != UCL_STRING) { + msg_debug_config("cannot find type in language %s", nelt->name); + ucl_object_unref(top); + + return; + } + else { + const gchar *stype = ucl_object_tostring(type); + + if (strcmp(stype, "latin") == 0) { + cat = RSPAMD_LANGUAGE_LATIN; + } + else if (strcmp(stype, "cyrillic") == 0) { + cat = RSPAMD_LANGUAGE_CYRILLIC; + } + else if (strcmp(stype, "arab") == 0) { + cat = RSPAMD_LANGUAGE_ARAB; + } + else if (strcmp(stype, "devanagari") == 0) { + cat = RSPAMD_LANGUAGE_DEVANAGARI; + } + else { + msg_debug_config("unknown type %s of language %s", stype, nelt->name); + ucl_object_unref(top); + + return; + } + } + + flags 
= ucl_object_lookup(top, "flags"); + + if (flags != NULL && ucl_object_type(flags) == UCL_ARRAY) { + ucl_object_iter_t it = NULL; + const ucl_object_t *cur; + + while ((cur = ucl_object_iterate(flags, &it, true)) != NULL) { + const gchar *fl = ucl_object_tostring(cur); + + if (cur) { + if (strcmp(fl, "diacritics") == 0) { + nelt->flags |= RS_LANGUAGE_DIACRITICS; + } + else if (strcmp(fl, "ascii") == 0) { + nelt->flags |= RS_LANGUAGE_ASCII; + } + else { + msg_debug_config("unknown flag %s of language %s", fl, nelt->name); + } + } + else { + msg_debug_config("unknown flags type of language %s", nelt->name); + } + } + } + + if (stop_words) { + const ucl_object_t *specific_stop_words; + + specific_stop_words = ucl_object_lookup(stop_words, nelt->name); + + if (specific_stop_words) { + struct sb_stemmer *stem = NULL; + it = NULL; + const ucl_object_t *w; + guint start, stop; + + stem = sb_stemmer_new(nelt->name, "UTF_8"); + start = rspamd_multipattern_get_npatterns(d->stop_words[cat].mp); + + while ((w = ucl_object_iterate(specific_stop_words, &it, true)) != NULL) { + gsize wlen; + const char *word = ucl_object_tolstring(w, &wlen); + const char *saved; + guint mp_flags = RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8; + + if (rspamd_multipattern_has_hyperscan()) { + mp_flags |= RSPAMD_MULTIPATTERN_RE; + } + + rspamd_multipattern_add_pattern_len(d->stop_words[cat].mp, + word, wlen, + mp_flags); + nelt->stop_words++; + + /* Also lemmatise and store normalised */ + if (stem) { + const char *nw = sb_stemmer_stem(stem, word, wlen); + + + if (nw) { + saved = nw; + wlen = strlen(nw); + } + else { + saved = word; + } + } + else { + saved = word; + } + + if (saved) { + gint rc; + rspamd_ftok_t *tok; + gchar *dst; + + tok = rspamd_mempool_alloc(cfg->cfg_pool, + sizeof(*tok) + wlen + 1); + dst = ((gchar *) tok) + sizeof(*tok); + rspamd_strlcpy(dst, saved, wlen + 1); + tok->begin = dst; + tok->len = wlen; + + kh_put(rspamd_stopwords_hash, d->stop_words_norm, + tok, &rc); + } 
+ } + + if (stem) { + sb_stemmer_delete(stem); + } + + stop = rspamd_multipattern_get_npatterns(d->stop_words[cat].mp); + + struct rspamd_stop_word_range r; + + r.start = start; + r.stop = stop; + r.elt = nelt; + + g_array_append_val(d->stop_words[cat].ranges, r); + it = NULL; + } + } + + nelt->category = cat; + htb = d->trigrams[cat]; + + GPtrArray *ngramms; + guint nsym; + + if (rspamd_language_search_str(nelt->name, tier1_langs, + G_N_ELEMENTS(tier1_langs))) { + nelt->flags |= RS_LANGUAGE_TIER1; + } + + if (rspamd_language_search_str(nelt->name, tier0_langs, + G_N_ELEMENTS(tier0_langs))) { + nelt->flags |= RS_LANGUAGE_TIER0; + } + + it = NULL; + ngramms = g_ptr_array_sized_new(freqs->len); + i = 0; + skipped = 0; + loaded = 0; + + while ((cur = ucl_object_iterate(freqs, &it, true)) != NULL) { + const gchar *key; + gsize keylen; + guint freq; + + key = ucl_object_keyl(cur, &keylen); + freq = ucl_object_toint(cur); + + i++; + delta = freq - mean; + mean += delta / i; + delta2 = freq - mean; + m2 += delta * delta2; + + if (key != NULL) { + UChar32 *cur_ucs; + const char *end = key + keylen, *cur_utf = key; + + ucs_elt = rspamd_mempool_alloc(cfg->cfg_pool, + sizeof(*ucs_elt) + (keylen + 1) * sizeof(UChar32)); + + cur_ucs = ucs_elt->s; + nsym = 0; + uc_err = U_ZERO_ERROR; + + while (cur_utf < end) { + *cur_ucs++ = ucnv_getNextUChar(d->uchar_converter, &cur_utf, + end, &uc_err); + if (!U_SUCCESS(uc_err)) { + break; + } + + nsym++; + } + + if (!U_SUCCESS(uc_err)) { + msg_warn_config("cannot convert key %*s to unicode: %s", + (gint) keylen, key, u_errorName(uc_err)); + + continue; + } + + ucs_elt->utf = key; + rspamd_language_detector_ucs_lowercase(ucs_elt->s, nsym); + + if (nsym == 3) { + g_ptr_array_add(ngramms, ucs_elt); + } + else { + continue; + } + + if (rspamd_language_detector_ucs_is_latin(ucs_elt->s, nsym)) { + total_latin++; + } + + ucs_elt->freq = freq; + + total_ngramms++; + } + } + + std = sqrt(m2 / (i - 1)); + + if (total_latin >= total_ngramms / 3) { + 
nelt->flags |= RS_LANGUAGE_LATIN; + } + + nsym = 3; + + total = 0; + PTR_ARRAY_FOREACH(ngramms, i, ucs_elt) + { + + if (!(nelt->flags & RS_LANGUAGE_LATIN) && + rspamd_language_detector_ucs_is_latin(ucs_elt->s, nsym)) { + ucs_elt->freq = 0; + /* Skip latin ngramm for non-latin language to avoid garbage */ + skipped++; + continue; + } + + /* Now, discriminate low frequency ngramms */ + + total += ucs_elt->freq; + loaded++; + } + + g_ptr_array_sort(ngramms, rspamd_language_detector_cmp_ngramm); + + PTR_ARRAY_FOREACH(ngramms, i, ucs_elt) + { + if (ucs_elt->freq > 0) { + rspamd_language_detector_init_ngramm(cfg, d, + nelt, ucs_elt, nsym, + ucs_elt->freq, total, htb); + } + } + +#ifdef EXTRA_LANGDET_DEBUG + /* Useful for debug */ + for (i = 0; i < 10; i++) { + ucs_elt = g_ptr_array_index(ngramms, i); + + msg_debug_lang_det_cfg("%s -> %s: %d", nelt->name, + ucs_elt->utf, ucs_elt->freq); + } +#endif + + g_ptr_array_free(ngramms, TRUE); + nelt->mean = mean; + nelt->std = std; + + msg_debug_lang_det_cfg("loaded %s language, %d trigrams, " + "%d ngramms loaded; " + "std=%.2f, mean=%.2f, skipped=%d, loaded=%d, stop_words=%d; " + "(%s)", + nelt->name, + (gint) nelt->trigrams_words, + total, + std, mean, + skipped, loaded, nelt->stop_words, + rspamd_language_detector_print_flags(nelt)); + + int ret; + khiter_t k = kh_put(rspamd_languages_hash, d->languages, nelt->name, &ret); + g_assert(ret > 0); /* must be unique */ + kh_value(d->languages, k) = nelt; + ucl_object_unref(top); +} + +static gboolean +rspamd_ucl_array_find_str(const gchar *str, const ucl_object_t *ar) +{ + ucl_object_iter_t it = NULL; + const ucl_object_t *cur; + + if (ar == NULL || ar->len == 0) { + return FALSE; + } + + while ((cur = ucl_object_iterate(ar, &it, true)) != NULL) { + if (ucl_object_type(cur) == UCL_STRING && rspamd_strcase_equal( + ucl_object_tostring(cur), str)) { + return TRUE; + } + } + + return FALSE; +} + +static void +rspamd_language_detector_process_chain(struct rspamd_config *cfg, + struct 
rspamd_ngramm_chain *chain) +{ + struct rspamd_ngramm_elt *elt; + guint i; + gdouble delta, mean = 0, delta2, m2 = 0, std; + + if (chain->languages->len > 3) { + PTR_ARRAY_FOREACH(chain->languages, i, elt) + { + delta = elt->prob - mean; + mean += delta / (i + 1); + delta2 = elt->prob - mean; + m2 += delta * delta2; + } + + std = sqrt(m2 / (i - 1)); + chain->mean = mean; + chain->std = std; + + /* Now, filter elements that are lower than mean */ + PTR_ARRAY_FOREACH(chain->languages, i, elt) + { + if (elt->prob < mean) { + g_ptr_array_remove_index_fast(chain->languages, i); +#ifdef EXTRA_LANGDET_DEBUG + msg_debug_lang_det_cfg("remove %s from %s; prob: %.4f; mean: %.4f, std: %.4f", + elt->elt->name, chain->utf, elt->prob, mean, std); +#endif + } + } + } + else { + /* We have a unique ngramm, increase its weight */ + PTR_ARRAY_FOREACH(chain->languages, i, elt) + { + elt->prob *= 4.0; +#ifdef EXTRA_LANGDET_DEBUG + msg_debug_lang_det_cfg("increase weight of %s in %s; prob: %.4f", + elt->elt->name, chain->utf, elt->prob); +#endif + } + } +} + +static void +rspamd_language_detector_dtor(struct rspamd_lang_detector *d) +{ + if (d) { + for (guint i = 0; i < RSPAMD_LANGUAGE_MAX; i++) { + kh_destroy(rspamd_trigram_hash, d->trigrams[i]); + rspamd_multipattern_destroy(d->stop_words[i].mp); + g_array_free(d->stop_words[i].ranges, TRUE); + } + + if (d->languages) { + kh_destroy(rspamd_languages_hash, d->languages); + } + + kh_destroy(rspamd_stopwords_hash, d->stop_words_norm); + rspamd_lang_detection_fasttext_destroy(d->fasttext_detector); + } +} + +struct rspamd_lang_detector * +rspamd_language_detector_init(struct rspamd_config *cfg) +{ + const ucl_object_t *section, *elt, *languages_enable = NULL, + *languages_disable = NULL; + const gchar *languages_path = default_languages_path; + glob_t gl; + size_t i, short_text_limit = default_short_text_limit, total = 0; + UErrorCode uc_err = U_ZERO_ERROR; + GString *languages_pattern; + struct rspamd_ngramm_chain *chain, schain; + gchar 
*fname; + struct rspamd_lang_detector *ret = NULL; + struct ucl_parser *parser; + ucl_object_t *stop_words; + bool prefer_fasttext = true; + + section = ucl_object_lookup(cfg->cfg_ucl_obj, "lang_detection"); + + if (section != NULL) { + elt = ucl_object_lookup(section, "languages"); + + if (elt) { + languages_path = ucl_object_tostring(elt); + } + + elt = ucl_object_lookup(section, "short_text_limit"); + + if (elt) { + short_text_limit = ucl_object_toint(elt); + } + + languages_enable = ucl_object_lookup(section, "languages_enable"); + languages_disable = ucl_object_lookup(section, "languages_disable"); + + elt = ucl_object_lookup(section, "prefer_fasttext"); + if (elt) { + prefer_fasttext = ucl_object_toboolean(elt); + } + } + + languages_pattern = g_string_sized_new(PATH_MAX); + rspamd_printf_gstring(languages_pattern, "%s/stop_words", languages_path); + parser = ucl_parser_new(UCL_PARSER_DEFAULT); + + if (ucl_parser_add_file(parser, languages_pattern->str)) { + stop_words = ucl_parser_get_object(parser); + } + else { + msg_err_config("cannot read stop words from %s: %s", + languages_pattern->str, + ucl_parser_get_error(parser)); + stop_words = NULL; + } + + ucl_parser_free(parser); + languages_pattern->len = 0; + + rspamd_printf_gstring(languages_pattern, "%s/*.json", languages_path); + memset(&gl, 0, sizeof(gl)); + + if (glob(languages_pattern->str, 0, NULL, &gl) != 0) { + msg_err_config("cannot read any files matching %v", languages_pattern); + goto end; + } + + ret = rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(*ret)); + ret->languages = kh_init(rspamd_languages_hash); + kh_resize(rspamd_languages_hash, ret->languages, gl.gl_pathc); + ret->uchar_converter = rspamd_get_utf8_converter(); + ret->short_text_limit = short_text_limit; + ret->stop_words_norm = kh_init(rspamd_stopwords_hash); + ret->prefer_fasttext = prefer_fasttext; + + /* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */ + for (i = 0; i < RSPAMD_LANGUAGE_MAX; i++) { + ret->trigrams[i] 
= kh_init(rspamd_trigram_hash); +#ifdef WITH_HYPERSCAN + ret->stop_words[i].mp = rspamd_multipattern_create( + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8 | + RSPAMD_MULTIPATTERN_RE); +#else + ret->stop_words[i].mp = rspamd_multipattern_create( + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); +#endif + + ret->stop_words[i].ranges = g_array_new(FALSE, FALSE, + sizeof(struct rspamd_stop_word_range)); + } + + g_assert(uc_err == U_ZERO_ERROR); + + for (i = 0; i < gl.gl_pathc; i++) { + fname = g_path_get_basename(gl.gl_pathv[i]); + + if (!rspamd_ucl_array_find_str(fname, languages_disable) || + (languages_enable == NULL || + rspamd_ucl_array_find_str(fname, languages_enable))) { + rspamd_language_detector_read_file(cfg, ret, gl.gl_pathv[i], + stop_words); + } + else { + msg_info_config("skip language file %s: disabled", fname); + } + + g_free(fname); + } + + for (i = 0; i < RSPAMD_LANGUAGE_MAX; i++) { + GError *err = NULL; + + kh_foreach_value(ret->trigrams[i], schain, { + chain = &schain; + rspamd_language_detector_process_chain(cfg, chain); + }); + + if (!rspamd_multipattern_compile(ret->stop_words[i].mp, &err)) { + msg_err_config("cannot compile stop words for %z language group: %e", + i, err); + g_error_free(err); + } + + total += kh_size(ret->trigrams[i]); + } + + ret->fasttext_detector = rspamd_lang_detection_fasttext_init(cfg); + char *fasttext_status = rspamd_lang_detection_fasttext_show_info(ret->fasttext_detector); + + msg_info_config("loaded %d languages, " + "%d trigrams; %s", + (gint) kh_size(ret->languages), + (gint) total, fasttext_status); + g_free(fasttext_status); + + if (stop_words) { + ucl_object_unref(stop_words); + } + + REF_INIT_RETAIN(ret, rspamd_language_detector_dtor); + rspamd_mempool_add_destructor(cfg->cfg_pool, + (rspamd_mempool_destruct_t) rspamd_language_detector_unref, + ret); + +end: + if (gl.gl_pathc > 0) { + globfree(&gl); + } + + g_string_free(languages_pattern, TRUE); + + return ret; +} + +static void 
+rspamd_language_detector_random_select(GArray *ucs_tokens, guint nwords, + goffset *offsets_out, + guint64 *seed) +{ + guint step_len, remainder, i, out_idx; + guint64 coin, sel; + rspamd_stat_token_t *tok; + + g_assert(nwords != 0); + g_assert(offsets_out != NULL); + g_assert(ucs_tokens->len >= nwords); + /* + * We split input array into `nwords` parts. For each part we randomly select + * an element from this particular split. Here is an example: + * + * nwords=2, input_len=5 + * + * w1 w2 w3 w4 w5 + * ^ ^ + * part1 part2 + * vv vv + * w2 w5 + * + * So we have 2 output words from 5 input words selected randomly within + * their splits. It is not uniform distribution but it seems to be better + * to include words from different text parts + */ + step_len = ucs_tokens->len / nwords; + remainder = ucs_tokens->len % nwords; + + out_idx = 0; + coin = rspamd_random_uint64_fast_seed(seed); + sel = coin % (step_len + remainder); + offsets_out[out_idx] = sel; + + for (i = step_len + remainder; i < ucs_tokens->len; + i += step_len, out_idx++) { + guint ntries = 0; + coin = rspamd_random_uint64_fast_seed(seed); + sel = (coin % step_len) + i; + + for (;;) { + tok = &g_array_index(ucs_tokens, rspamd_stat_token_t, sel); + /* Filter bad tokens */ + + if (tok->unicode.len >= 2 && + !(tok->flags & RSPAMD_STAT_TOKEN_FLAG_EXCEPTION) && + u_isalpha(tok->unicode.begin[0]) && + u_isalpha(tok->unicode.begin[tok->unicode.len - 1])) { + offsets_out[out_idx] = sel; + break; + } + else { + ntries++; + coin = rspamd_random_uint64_fast_seed(seed); + + if (ntries < step_len) { + sel = (coin % step_len) + i; + } + else if (ntries < ucs_tokens->len) { + sel = coin % ucs_tokens->len; + } + else { + offsets_out[out_idx] = sel; + break; + } + } + } + } + + /* + * Fisher-Yates algorithm: + * for i from 0 to n−2 do + * j ← random integer such that i ≤ j < n + * exchange a[i] and a[j] + */ +#if 0 + if (out_idx > 2) { + for (i = 0; i < out_idx - 2; i++) { + coin = rspamd_random_uint64_fast (); + sel 
= (coin % (out_idx - i)) + i; + /* swap */ + tmp = offsets_out[i]; + offsets_out[i] = offsets_out[sel]; + offsets_out[sel] = tmp; + } + } +#endif +} + +static goffset +rspamd_language_detector_next_ngramm(rspamd_stat_token_t *tok, UChar32 *window, + guint wlen, goffset cur_off) +{ + guint i; + + if (wlen > 1) { + /* Deal with spaces at the beginning and ending */ + + if (cur_off == 0) { + window[0] = (UChar32) ' '; + + for (i = 0; i < wlen - 1; i++) { + window[i + 1] = tok->unicode.begin[i]; + } + } + else if (cur_off + wlen == tok->unicode.len + 1) { + /* Add trailing space */ + for (i = 0; i < wlen - 1; i++) { + window[i] = tok->unicode.begin[cur_off + i]; + } + window[wlen - 1] = (UChar32) ' '; + } + else if (cur_off + wlen > tok->unicode.len + 1) { + /* No more fun */ + return -1; + } + else { + /* Normal case */ + for (i = 0; i < wlen; i++) { + window[i] = tok->unicode.begin[cur_off + i]; + } + } + } + else { + if (tok->normalized.len <= cur_off) { + return -1; + } + + window[0] = tok->unicode.begin[cur_off]; + } + + return cur_off + 1; +} + +/* + * Do full guess for a specific ngramm, checking all languages defined + */ +static void +rspamd_language_detector_process_ngramm_full(struct rspamd_task *task, + struct rspamd_lang_detector *d, + UChar32 *window, + khash_t(rspamd_candidates_hash) * candidates, + khash_t(rspamd_trigram_hash) * trigrams) +{ + guint i; + gint ret; + struct rspamd_ngramm_chain *chain = NULL; + struct rspamd_ngramm_elt *elt; + struct rspamd_lang_detector_res *cand; + khiter_t k; + gdouble prob; + + k = kh_get(rspamd_trigram_hash, trigrams, window); + if (k != kh_end(trigrams)) { + chain = &kh_value(trigrams, k); + } + + if (chain) { + PTR_ARRAY_FOREACH(chain->languages, i, elt) + { + prob = elt->prob; + + if (prob < chain->mean) { + continue; + } + + k = kh_get(rspamd_candidates_hash, candidates, elt->elt->name); + if (k != kh_end(candidates)) { + cand = kh_value(candidates, k); + } + else { + cand = NULL; + } + +#ifdef NGRAMMS_DEBUG + 
/*
 * Second filtering pass: excludes a candidate whose log-probability is too far
 * below the best one.
 *
 * @param task task, used by the debug logging macro
 * @param cand candidate to check; excluded candidates get `prob` set to NAN
 * @param max_prob the highest log-probability among the candidates
 * @param filtered counter incremented for each excluded candidate
 */
static inline void
rspamd_language_detector_filter_step2(struct rspamd_task *task,
									  struct rspamd_lang_detector_res *cand,
									  gdouble max_prob, guint *filtered)
{
	/*
	 * Probabilities are base-2 logarithms here (step 1 applies log2), so
	 * max_prob - prob > 1 means that the candidate is more than 2^1 = 2 times
	 * less probable than the best one. Candidates already excluded (NAN) are
	 * left untouched.
	 */
	if (!isnan(cand->prob) && max_prob - cand->prob > 1) {
		msg_debug_lang_det("exclude language %s: %.3f (%.3f max)",
						   cand->lang, cand->prob, max_prob);
		cand->prob = NAN;
		(*filtered)++;
	}
}
**) a, + *candb = *(const struct rspamd_lang_detector_res **) b; + + if (canda->prob > candb->prob) { + return -1; + } + else if (candb->prob > canda->prob) { + return 1; + } + + return 0; +} + +enum rspamd_language_detected_type { + rs_detect_none = 0, + rs_detect_single, + rs_detect_multiple, +}; + +static enum rspamd_language_detected_type +rspamd_language_detector_try_ngramm(struct rspamd_task *task, + guint nwords, + struct rspamd_lang_detector *d, + GArray *ucs_tokens, + enum rspamd_language_category cat, + khash_t(rspamd_candidates_hash) * candidates, + struct rspamd_mime_text_part *part) +{ + guint cand_len = 0; + struct rspamd_lang_detector_res *cand; + + rspamd_language_detector_detect_type(task, + nwords, + d, + ucs_tokens, + cat, + candidates, + part); + + kh_foreach_value(candidates, cand, { + if (!isnan(cand->prob)) { + cand_len++; + } + }); + + if (cand_len == 0) { + return rs_detect_none; + } + else if (cand_len == 1) { + return rs_detect_single; + } + + return rs_detect_multiple; +} + +enum rspamd_language_sort_flags { + RSPAMD_LANG_FLAG_DEFAULT = 0, + RSPAMD_LANG_FLAG_SHORT = 1 << 0, +}; + +struct rspamd_frequency_sort_cbdata { + struct rspamd_lang_detector *d; + enum rspamd_language_sort_flags flags; + gdouble std; + gdouble mean; +}; + +static const gdouble tier0_adjustment = 1.2; +static const gdouble tier1_adjustment = 0.8; +static const gdouble frequency_adjustment = 0.8; + +static gint +rspamd_language_detector_cmp_heuristic(gconstpointer a, gconstpointer b, + gpointer ud) +{ + struct rspamd_frequency_sort_cbdata *cbd = ud; + struct rspamd_lang_detector_res + *canda = *(struct rspamd_lang_detector_res **) a, + *candb = *(struct rspamd_lang_detector_res **) b; + gdouble adj; + gdouble proba_adjusted, probb_adjusted, freqa, freqb; + + if (cbd->d->total_occurrences == 0) { + /* Not enough data, compare directly */ + return rspamd_language_detector_cmp(a, b); + } + + freqa = ((gdouble) canda->elt->occurrences) / + (gdouble) 
cbd->d->total_occurrences; + freqb = ((gdouble) candb->elt->occurrences) / + (gdouble) cbd->d->total_occurrences; + + proba_adjusted = canda->prob; + probb_adjusted = candb->prob; + + if (isnormal(freqa) && isnormal(freqb)) { + proba_adjusted += cbd->std * (frequency_adjustment * freqa); + probb_adjusted += cbd->std * (frequency_adjustment * freqb); + } + + if (cbd->flags & RSPAMD_LANG_FLAG_SHORT) { + adj = tier1_adjustment * 2.0; + } + else { + adj = tier1_adjustment; + } + if (canda->elt->flags & RS_LANGUAGE_TIER1) { + proba_adjusted += cbd->std * adj; + } + + if (candb->elt->flags & RS_LANGUAGE_TIER1) { + probb_adjusted += cbd->std * adj; + } + + if (cbd->flags & RSPAMD_LANG_FLAG_SHORT) { + adj = tier0_adjustment * 16.0; + } + else { + adj = tier0_adjustment; + } + + if (canda->elt->flags & RS_LANGUAGE_TIER0) { + proba_adjusted += cbd->std * adj; + } + + if (candb->elt->flags & RS_LANGUAGE_TIER0) { + probb_adjusted += cbd->std * adj; + } + + /* Hack: adjust probability directly */ + canda->prob = proba_adjusted; + candb->prob = probb_adjusted; + + if (proba_adjusted > probb_adjusted) { + return -1; + } + else if (probb_adjusted > proba_adjusted) { + return 1; + } + + return 0; +} + +static void +rspamd_language_detector_unicode_scripts(struct rspamd_task *task, + struct rspamd_mime_text_part *part, + guint *pchinese, + guint *pspecial) +{ + const gchar *p = part->utf_stripped_content->data, *end; + guint i = 0, cnt = 0; + end = p + part->utf_stripped_content->len; + gint32 uc, sc; + guint nlatin = 0, nchinese = 0, nspecial = 0; + const guint cutoff_limit = 32; + + while (p + i < end) { + U8_NEXT(p, i, part->utf_stripped_content->len, uc); + + if (((gint32) uc) < 0) { + break; + } + + if (u_isalpha(uc)) { + sc = ublock_getCode(uc); + cnt++; + + switch (sc) { + case UBLOCK_BASIC_LATIN: + case UBLOCK_LATIN_1_SUPPLEMENT: + part->unicode_scripts |= RSPAMD_UNICODE_LATIN; + nlatin++; + break; + case UBLOCK_HEBREW: + part->unicode_scripts |= RSPAMD_UNICODE_HEBREW; + 
nspecial++; + break; + case UBLOCK_GREEK: + part->unicode_scripts |= RSPAMD_UNICODE_GREEK; + nspecial++; + break; + case UBLOCK_CYRILLIC: + part->unicode_scripts |= RSPAMD_UNICODE_CYRILLIC; + nspecial++; + break; + case UBLOCK_CJK_UNIFIED_IDEOGRAPHS: + case UBLOCK_CJK_COMPATIBILITY: + case UBLOCK_CJK_RADICALS_SUPPLEMENT: + case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A: + case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B: + part->unicode_scripts |= RSPAMD_UNICODE_CJK; + nchinese++; + break; + case UBLOCK_HIRAGANA: + case UBLOCK_KATAKANA: + part->unicode_scripts |= RSPAMD_UNICODE_JP; + nspecial++; + break; + case UBLOCK_HANGUL_JAMO: + case UBLOCK_HANGUL_COMPATIBILITY_JAMO: + part->unicode_scripts |= RSPAMD_UNICODE_HANGUL; + nspecial++; + break; + case UBLOCK_ARABIC: + part->unicode_scripts |= RSPAMD_UNICODE_ARABIC; + nspecial++; + break; + case UBLOCK_DEVANAGARI: + part->unicode_scripts |= RSPAMD_UNICODE_DEVANAGARI; + nspecial++; + break; + case UBLOCK_ARMENIAN: + part->unicode_scripts |= RSPAMD_UNICODE_ARMENIAN; + nspecial++; + break; + case UBLOCK_GEORGIAN: + part->unicode_scripts |= RSPAMD_UNICODE_GEORGIAN; + nspecial++; + break; + case UBLOCK_GUJARATI: + part->unicode_scripts |= RSPAMD_UNICODE_GUJARATI; + nspecial++; + break; + case UBLOCK_TELUGU: + part->unicode_scripts |= RSPAMD_UNICODE_TELUGU; + nspecial++; + break; + case UBLOCK_TAMIL: + part->unicode_scripts |= RSPAMD_UNICODE_TAMIL; + nspecial++; + break; + case UBLOCK_THAI: + part->unicode_scripts |= RSPAMD_UNICODE_THAI; + nspecial++; + break; + case RSPAMD_UNICODE_MALAYALAM: + part->unicode_scripts |= RSPAMD_UNICODE_MALAYALAM; + nspecial++; + break; + case RSPAMD_UNICODE_SINHALA: + part->unicode_scripts |= RSPAMD_UNICODE_SINHALA; + nspecial++; + break; + } + } + + if (nspecial > cutoff_limit && nspecial > nlatin) { + break; + } + else if (nchinese > cutoff_limit && nchinese > nlatin) { + if (nspecial > 0) { + /* Likely japanese */ + break; + } + } + } + + msg_debug_lang_det("stop after checking %d 
characters, " + "%d latin, %d special, %d chinese", + cnt, nlatin, nspecial, nchinese); + + *pchinese = nchinese; + *pspecial = nspecial; +} + +static inline void +rspamd_language_detector_set_language(struct rspamd_task *task, + struct rspamd_mime_text_part *part, + const gchar *code, + struct rspamd_language_elt *elt) +{ + struct rspamd_lang_detector_res *r; + + r = rspamd_mempool_alloc0(task->task_pool, sizeof(*r)); + r->prob = 1.0; + r->lang = code; + r->elt = elt; + + if (part->languages == NULL) { + part->languages = g_ptr_array_sized_new(1); + } + + g_ptr_array_add(part->languages, r); + part->language = code; +} + +static gboolean +rspamd_language_detector_try_uniscript(struct rspamd_task *task, + struct rspamd_mime_text_part *part, + guint nchinese, + guint nspecial) +{ + guint i; + + for (i = 0; i < G_N_ELEMENTS(unicode_langs); i++) { + if (unicode_langs[i].unicode_code & part->unicode_scripts) { + + if (unicode_langs[i].unicode_code != RSPAMD_UNICODE_JP) { + msg_debug_lang_det("set language based on unicode script %s", + unicode_langs[i].lang); + rspamd_language_detector_set_language(task, part, + unicode_langs[i].lang, NULL); + + return TRUE; + } + else { + /* Japanese <-> Chinese guess */ + + /* + * Typically there might be around 0-70% of kanji glyphs + * and the rest are Haragana/Katakana + * + * If we discover that Kanji is more than 80% then we consider + * it Chinese + */ + if (nchinese <= 5 || nchinese < nspecial * 5) { + msg_debug_lang_det("set language based on unicode script %s", + unicode_langs[i].lang); + rspamd_language_detector_set_language(task, part, + unicode_langs[i].lang, NULL); + + return TRUE; + } + } + } + } + + if (part->unicode_scripts & RSPAMD_UNICODE_CJK) { + msg_debug_lang_det("guess chinese based on CJK characters: %d chinese, %d special", + nchinese, nspecial); + rspamd_language_detector_set_language(task, part, + "zh-CN", NULL); + + return TRUE; + } + + return FALSE; +} + +static guint 
+rspamd_langelt_hash_func(gconstpointer key) +{ + const struct rspamd_language_elt *elt = (const struct rspamd_language_elt *) key; + return rspamd_cryptobox_fast_hash(elt->name, strlen(elt->name), + rspamd_hash_seed()); +} + +static gboolean +rspamd_langelt_equal_func(gconstpointer v, gconstpointer v2) +{ + const struct rspamd_language_elt *elt1 = (const struct rspamd_language_elt *) v, + *elt2 = (const struct rspamd_language_elt *) v2; + return strcmp(elt1->name, elt2->name) == 0; +} + +/* This hash set stores a word index in the language to avoid duplicate stop words */ +KHASH_INIT(rspamd_sw_res_set, int, char, 0, kh_int_hash_func, kh_int_hash_equal); + +KHASH_INIT(rspamd_sw_hash, struct rspamd_language_elt *, khash_t(rspamd_sw_res_set) *, 1, + rspamd_langelt_hash_func, rspamd_langelt_equal_func); + +struct rspamd_sw_cbdata { + struct rspamd_task *task; + khash_t(rspamd_sw_hash) * res; + GArray *ranges; +}; + +static gint +rspamd_ranges_cmp(const void *k, const void *memb) +{ + gint pos = GPOINTER_TO_INT(k); + const struct rspamd_stop_word_range *r = (struct rspamd_stop_word_range *) memb; + + if (pos >= r->start && pos < r->stop) { + return 0; + } + else if (pos < r->start) { + return -1; + } + + return 1; +} + +static gint +rspamd_language_detector_sw_cb(struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) +{ + /* Check if boundary */ + const gchar *prev = text, *next = text + len; + struct rspamd_stop_word_range *r; + struct rspamd_sw_cbdata *cbdata = (struct rspamd_sw_cbdata *) context; + khiter_t k; + static const gsize max_stop_words = 80; + struct rspamd_task *task; + + if (match_start > 0) { + prev = text + match_start - 1; + + if (!(g_ascii_isspace(*prev) || g_ascii_ispunct(*prev))) { + return 0; + } + } + + if (match_pos < len) { + next = text + match_pos; + + if (!(g_ascii_isspace(*next) || g_ascii_ispunct(*next))) { + return 0; + } + } + + /* We have a word on the 
boundary, check range */ + task = cbdata->task; + r = bsearch(GINT_TO_POINTER(strnum), cbdata->ranges->data, + cbdata->ranges->len, sizeof(*r), rspamd_ranges_cmp); + + g_assert(r != NULL); + + k = kh_get(rspamd_sw_hash, cbdata->res, r->elt); + gint nwords = 1; + + if (k != kh_end(cbdata->res)) { + khiter_t set_k; + int tt; + + set_k = kh_get(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum); + nwords = kh_size(kh_value(cbdata->res, k)); + + if (set_k == kh_end(kh_value(cbdata->res, k))) { + /* New word */ + set_k = kh_put(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum, &tt); + msg_debug_lang_det("found new word %*s from %s language (%d stop words found so far)", + (int) (next - prev - 1), prev + 1, r->elt->name, nwords); + } + + if (nwords > max_stop_words) { + return 1; + } + } + else { + gint tt; + + k = kh_put(rspamd_sw_hash, cbdata->res, r->elt, &tt); + kh_value(cbdata->res, k) = kh_init(rspamd_sw_res_set); + kh_put(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum, &tt); + + msg_debug_lang_det("found new word %*s from %s language (%d stop words found so far)", + (int) (next - prev - 1), prev + 1, r->elt->name, nwords); + } + + return 0; +} + +static gboolean +rspamd_language_detector_try_stop_words(struct rspamd_task *task, + struct rspamd_lang_detector *d, + struct rspamd_mime_text_part *part, + enum rspamd_language_category cat) +{ + struct rspamd_stop_word_elt *elt; + struct rspamd_sw_cbdata cbdata; + gboolean ret = FALSE; + static const int stop_words_threshold = 4, /* minimum stop words count */ + strong_confidence_threshold = 10 /* we are sure that this is enough */; + + elt = &d->stop_words[cat]; + cbdata.res = kh_init(rspamd_sw_hash); + cbdata.ranges = elt->ranges; + cbdata.task = task; + + rspamd_multipattern_lookup(elt->mp, part->utf_stripped_content->data, + part->utf_stripped_content->len, rspamd_language_detector_sw_cb, + &cbdata, NULL); + + if (kh_size(cbdata.res) > 0) { + khash_t(rspamd_sw_res_set) * cur_res; + double max_rate = 
G_MINDOUBLE; + struct rspamd_language_elt *cur_lang, *sel = NULL; + gboolean ignore_ascii = FALSE, ignore_latin = FALSE; + + again: + kh_foreach(cbdata.res, cur_lang, cur_res, { + int cur_matches = kh_size(cur_res); + + if (!ignore_ascii && (cur_lang->flags & RS_LANGUAGE_DIACRITICS)) { + /* Restart matches */ + ignore_ascii = TRUE; + sel = NULL; + max_rate = G_MINDOUBLE; + msg_debug_lang_det("ignore ascii after finding %d stop words from %s", + cur_matches, cur_lang->name); + goto again; + } + + if (!ignore_latin && cur_lang->category != RSPAMD_LANGUAGE_LATIN) { + /* Restart matches */ + ignore_latin = TRUE; + sel = NULL; + max_rate = G_MINDOUBLE; + msg_debug_lang_det("ignore latin after finding stop %d words from %s", + cur_matches, cur_lang->name); + goto again; + } + + if (cur_matches < stop_words_threshold) { + continue; + } + + if (cur_matches < strong_confidence_threshold) { + /* Ignore mixed languages when not enough confidence */ + if (ignore_ascii && (cur_lang->flags & RS_LANGUAGE_ASCII)) { + continue; + } + + if (ignore_latin && cur_lang->category == RSPAMD_LANGUAGE_LATIN) { + continue; + } + } + + double rate = (double) cur_matches / (double) cur_lang->stop_words; + + if (rate > max_rate) { + max_rate = rate; + sel = cur_lang; + } + + msg_debug_lang_det("found %d stop words from %s: %3f rate", + cur_matches, cur_lang->name, rate); + }); + + /* Cleanup */ + kh_foreach(cbdata.res, cur_lang, cur_res, { + kh_destroy(rspamd_sw_res_set, cur_res); + }); + + if (max_rate > 0 && sel) { + msg_debug_lang_det("set language based on stop words script %s, %.3f found", + sel->name, max_rate); + rspamd_language_detector_set_language(task, part, + sel->name, sel); + + ret = TRUE; + } + } + else { + msg_debug_lang_det("found no stop words in a text"); + } + + kh_destroy(rspamd_sw_hash, cbdata.res); + + return ret; +} + +gboolean +rspamd_language_detector_detect(struct rspamd_task *task, + struct rspamd_lang_detector *d, + struct rspamd_mime_text_part *part) +{ + 
khash_t(rspamd_candidates_hash) * candidates; + GPtrArray *result; + gdouble mean, std, start_ticks, end_ticks; + guint cand_len; + enum rspamd_language_category cat; + struct rspamd_lang_detector_res *cand; + enum rspamd_language_detected_type r; + struct rspamd_frequency_sort_cbdata cbd; + /* Check if we have sorted candidates based on frequency */ + gboolean frequency_heuristic_applied = FALSE, ret = FALSE; + + if (!part->utf_stripped_content) { + return FALSE; + } + + start_ticks = rspamd_get_ticks(TRUE); + + guint nchinese = 0, nspecial = 0; + rspamd_language_detector_unicode_scripts(task, part, &nchinese, &nspecial); + + /* Disable internal language detection heuristics if we have fasttext */ + if (!rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector) || !d->prefer_fasttext) { + /* Apply unicode scripts heuristic */ + if (rspamd_language_detector_try_uniscript(task, part, nchinese, nspecial)) { + ret = TRUE; + } + + cat = rspamd_language_detector_get_category(part->unicode_scripts); + + if (!ret && rspamd_language_detector_try_stop_words(task, d, part, cat)) { + ret = TRUE; + } + } + + if (!ret) { + unsigned ndetected = 0; + if (rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) { + rspamd_fasttext_predict_result_t fasttext_predict_result = + rspamd_lang_detection_fasttext_detect(d->fasttext_detector, task, + part->utf_words, 4); + + ndetected = rspamd_lang_detection_fasttext_get_nlangs(fasttext_predict_result); + + if (ndetected > 0) { + candidates = kh_init(rspamd_candidates_hash); + kh_resize(rspamd_candidates_hash, candidates, ndetected); + + /* Now fill all results where probability is above threshold */ + float max_prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, 0); + + for (unsigned int i = 0; i < ndetected; i++) { + float prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, i); + if (prob > max_prob * 0.75) { + char *lang = rspamd_mempool_strdup(task->task_pool, + 
rspamd_lang_detection_fasttext_get_lang(fasttext_predict_result, i)); + int tmp; + khiter_t k = kh_put(rspamd_candidates_hash, candidates, lang, &tmp); + + kh_value(candidates, k) = rspamd_mempool_alloc0(task->task_pool, sizeof(*cand)); + cand = kh_value(candidates, k); + cand->lang = lang; + cand->prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, i); + + /* Find the corresponding language elt */ + k = kh_get(rspamd_languages_hash, d->languages, lang); + if (k != kh_end(d->languages)) { + cand->elt = kh_value(d->languages, k); + } + } + } + + if (kh_size(candidates) == 1) { + r = rs_detect_single; + } + else if (kh_size(candidates) > 1) { + r = rs_detect_multiple; + } + else { + r = rs_detect_none; + } + } + + rspamd_fasttext_predict_result_destroy(fasttext_predict_result); + } + if (ndetected == 0) { + if (part->utf_words->len < default_short_text_limit) { + r = rs_detect_none; + msg_debug_lang_det("text is too short for trigrams detection: " + "%d words; at least %d words required", + (int) part->utf_words->len, + (int) default_short_text_limit); + switch (cat) { + case RSPAMD_LANGUAGE_CYRILLIC: + rspamd_language_detector_set_language(task, part, "ru", NULL); + break; + case RSPAMD_LANGUAGE_DEVANAGARI: + rspamd_language_detector_set_language(task, part, "hi", NULL); + break; + case RSPAMD_LANGUAGE_ARAB: + rspamd_language_detector_set_language(task, part, "ar", NULL); + break; + default: + case RSPAMD_LANGUAGE_LATIN: + rspamd_language_detector_set_language(task, part, "en", NULL); + break; + } + msg_debug_lang_det("set %s language based on symbols category", + part->language); + + candidates = kh_init(rspamd_candidates_hash); + } + else { + candidates = kh_init(rspamd_candidates_hash); + kh_resize(rspamd_candidates_hash, candidates, 32); + + r = rspamd_language_detector_try_ngramm(task, + default_words, + d, + part->utf_words, + cat, + candidates, + part); + + if (r == rs_detect_none) { + msg_debug_lang_det("no trigrams found, fallback to 
english"); + rspamd_language_detector_set_language(task, part, "en", NULL); + } + else if (r == rs_detect_multiple) { + /* Check our guess */ + + mean = 0.0; + std = 0.0; + cand_len = 0; + + /* Check distribution */ + kh_foreach_value(candidates, cand, { + if (!isnan(cand->prob)) { + mean += cand->prob; + cand_len++; + } + }); + + if (cand_len > 0) { + mean /= cand_len; + + kh_foreach_value(candidates, cand, { + gdouble err; + if (!isnan(cand->prob)) { + err = cand->prob - mean; + std += fabs(err); + } + }); + + std /= cand_len; + } + + msg_debug_lang_det("trigrams checked, %d candidates, %.3f mean, %.4f stddev", + cand_len, mean, std); + + if (cand_len > 0 && std / fabs(mean) < 0.25) { + msg_debug_lang_det("apply frequency heuristic sorting"); + frequency_heuristic_applied = TRUE; + cbd.d = d; + cbd.mean = mean; + cbd.std = std; + cbd.flags = RSPAMD_LANG_FLAG_DEFAULT; + + if (part->nwords < default_words / 2) { + cbd.flags |= RSPAMD_LANG_FLAG_SHORT; + } + } + } + } + } + + /* Now, convert hash to array and sort it */ + if (r != rs_detect_none && kh_size(candidates) > 0) { + result = g_ptr_array_sized_new(kh_size(candidates)); + + kh_foreach_value(candidates, cand, { + if (!isnan(cand->prob)) { + msg_debug_lang_det("pre-sorting probability %s -> %.2f", cand->lang, + cand->prob); + g_ptr_array_add(result, cand); + } + }); + + if (frequency_heuristic_applied) { + g_ptr_array_sort_with_data(result, + rspamd_language_detector_cmp_heuristic, + (gpointer) &cbd); + } + else { + g_ptr_array_sort(result, rspamd_language_detector_cmp); + } + + int i; + PTR_ARRAY_FOREACH(result, i, cand) + { + msg_debug_lang_det("final probability %s -> %.2f", cand->lang, + cand->prob); + } + + if (part->languages != NULL) { + g_ptr_array_unref(part->languages); + } + + part->languages = result; + part->language = ((struct rspamd_lang_detector_res *) g_ptr_array_index(result, 0))->lang; + ret = TRUE; + } + else if (part->languages == NULL) { + rspamd_language_detector_set_language(task, part, 
"en", NULL); + } + + kh_destroy(rspamd_candidates_hash, candidates); + } + + /* Update internal stat */ + if (part->languages != NULL && part->languages->len > 0 && !frequency_heuristic_applied) { + cand = g_ptr_array_index(part->languages, 0); + if (cand->elt) { + cand->elt->occurrences++; + d->total_occurrences++; + + msg_debug_lang_det("updated stat for %s: %d occurrences, %z total detected", + cand->elt->name, cand->elt->occurrences, + d->total_occurrences); + } + } + + end_ticks = rspamd_get_ticks(TRUE); + msg_debug_lang_det("detected languages in %.0f ticks", + (end_ticks - start_ticks)); + + return ret; +} + + +struct rspamd_lang_detector * +rspamd_language_detector_ref(struct rspamd_lang_detector *d) +{ + REF_RETAIN(d); + + return d; +} + +void rspamd_language_detector_unref(struct rspamd_lang_detector *d) +{ + REF_RELEASE(d); +} + +gboolean +rspamd_language_detector_is_stop_word(struct rspamd_lang_detector *d, + const gchar *word, gsize wlen) +{ + khiter_t k; + rspamd_ftok_t search; + + search.begin = word; + search.len = wlen; + + k = kh_get(rspamd_stopwords_hash, d->stop_words_norm, &search); + + if (k != kh_end(d->stop_words_norm)) { + return TRUE; + } + + return FALSE; +} + +gint rspamd_language_detector_elt_flags(const struct rspamd_language_elt *elt) +{ + if (elt) { + return elt->flags; + } + + return 0; +}
\ No newline at end of file diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h new file mode 100644 index 0000000..5423c13 --- /dev/null +++ b/src/libmime/lang_detection.h @@ -0,0 +1,110 @@ +/*- + * Copyright 2017 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_LANG_DETECTION_H +#define RSPAMD_LANG_DETECTION_H + +#include "config.h" +#include "libserver/cfg_file.h" +#include "libstat/stat_api.h" +#include "libmime/message.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_lang_detector; +struct rspamd_language_elt; +struct rspamd_task; + +enum rspamd_unicode_scripts { + RSPAMD_UNICODE_LATIN = (1 << 0), + RSPAMD_UNICODE_GREEK = (1 << 1), + RSPAMD_UNICODE_CYRILLIC = (1 << 2), + RSPAMD_UNICODE_HEBREW = (1 << 3), + RSPAMD_UNICODE_CJK = (1 << 4), + RSPAMD_UNICODE_JP = (1 << 5), + RSPAMD_UNICODE_ARABIC = (1 << 6), + RSPAMD_UNICODE_DEVANAGARI = (1 << 7), + RSPAMD_UNICODE_THAI = (1 << 8), + RSPAMD_UNICODE_ARMENIAN = (1 << 9), + RSPAMD_UNICODE_GEORGIAN = (1 << 10), + RSPAMD_UNICODE_GUJARATI = (1 << 11), + RSPAMD_UNICODE_TAMIL = (1 << 12), + RSPAMD_UNICODE_TELUGU = (1 << 13), + RSPAMD_UNICODE_MALAYALAM = (1 << 14), + RSPAMD_UNICODE_SINHALA = (1 << 15), + RSPAMD_UNICODE_HANGUL = (1 << 16), +}; + +enum rspamd_language_elt_flags { + RS_LANGUAGE_DEFAULT = 0, + RS_LANGUAGE_LATIN = (1 << 0), + RS_LANGUAGE_TIER1 = (1 << 3), + RS_LANGUAGE_TIER0 = (1 << 4), + RS_LANGUAGE_DIACRITICS = (1 << 5), + 
/**
 * Try to detect the language of a text part.
 *
 * On success the detected languages are stored in `part->languages`
 * (elements are `struct rspamd_lang_detector_res *`, best candidate first)
 * and `part->language` is set to the code of the best candidate.
 * @param task task being processed
 * @param d language detector
 * @param part text part to analyse and to store results in
 * @return TRUE if a language has been detected, FALSE if the part has no
 * stripped UTF-8 content or only a fallback language could be assigned
 */
gboolean rspamd_language_detector_detect(struct rspamd_task *task,
										 struct rspamd_lang_detector *d,
										 struct rspamd_mime_text_part *part);
/**
 * Thin wrapper around a fasttext model used for language detection.
 *
 * The model path is read from the `lang_detection.fasttext_model` key of the
 * configuration; when the key is missing or the model cannot be loaded the
 * object stays in the "not loaded" state and all operations become no-ops.
 */
class fasttext_langdet {
private:
	fasttext::FastText ft;
	std::string model_fname; /* path of the loaded model, empty if not loaded */
	bool loaded = false;     /* true only after a successful loadModel() */

public:
	/**
	 * Tries to load the model referenced by the configuration.
	 * Loading errors are logged and leave the detector disabled.
	 */
	explicit fasttext_langdet(struct rspamd_config *cfg)
	{
		const auto *ucl_obj = cfg->cfg_ucl_obj;
		const auto *opts_section = ucl_object_find_key(ucl_obj, "lang_detection");

		if (opts_section) {
			const auto *model = ucl_object_find_key(opts_section, "fasttext_model");

			if (model) {
				try {
					ft.loadModel(ucl_object_tostring(model));
					loaded = true;
					model_fname = std::string{ucl_object_tostring(model)};
				} catch (std::exception &e) {
					auto err_message = fmt::format("cannot load fasttext model: {}", e.what());
					msg_err_config("%s", err_message.c_str());
					loaded = false;
				}
			}
		}
	}

	/* Disallow multiple initialisation */
	fasttext_langdet() = delete;
	fasttext_langdet(const fasttext_langdet &) = delete;
	fasttext_langdet(fasttext_langdet &&) = delete;

	~fasttext_langdet() = default;

	/** Returns true if a model has been loaded successfully */
	auto is_enabled() const -> bool
	{
		return loaded;
	}
	/**
	 * Appends the dictionary ids for a single word to `word_ngramms`.
	 * Does nothing if the model is not loaded or if the dictionary does not
	 * classify the token as a word.
	 * @param in word bytes (not required to be NUL terminated)
	 * @param len length of `in`
	 * @param word_ngramms output vector, new ids are appended to it
	 */
	auto word2vec(const char *in, std::size_t len, std::vector<std::int32_t> &word_ngramms) const
	{
		if (!loaded) {
			return;
		}

		std::string tok{in, len};
		const auto &dic = ft.getDictionary();
		auto h = dic->hash(tok);
		auto wid = dic->getId(tok, h);
		/* A negative id means the dictionary has no entry for this token */
		auto type = wid < 0 ? dic->getType(tok) : dic->getType(wid);

		if (type == fasttext::entry_type::word) {
			if (wid < 0) {
				/* No dictionary entry: compute subwords of the word wrapped in BOW/EOW markers */
				auto pipelined_word = fmt::format("{}{}{}", fasttext::Dictionary::BOW, tok, fasttext::Dictionary::EOW);
				dic->computeSubwords(pipelined_word, word_ngramms);
			}
			else {
				if (ft.getArgs().maxn <= 0) {
					/* maxn <= 0: presumably the model has no subword n-grams, use the word id itself */
					word_ngramms.push_back(wid);
				}
				else {
					const auto ngrams = dic->getSubwords(wid);
					word_ngramms.insert(word_ngramms.end(), ngrams.cbegin(), ngrams.cend());
				}
			}
		}
	}
	/**
	 * Predicts up to `k` labels for the given ids.
	 * @param words ids produced by word2vec()
	 * @param k maximum number of predictions requested
	 * @return heap allocated vector of (score, label) pairs that the caller
	 * must delete, or nullptr if the model is not loaded
	 */
	auto detect_language(std::vector<std::int32_t> &words, int k)
		-> std::vector<std::pair<fasttext::real, std::string>> *
	{
		if (!loaded) {
			return nullptr;
		}

		auto predictions = new std::vector<std::pair<fasttext::real, std::string>>;
		predictions->reserve(k);
		fasttext::Predictions line_predictions;
		line_predictions.reserve(k);
		ft.predict(k, words, line_predictions, 0.0f);
		const auto *dict = ft.getDictionary().get();

		for (const auto &pred: line_predictions) {
			/* exp() is applied to the raw score - presumably a log-probability; the id is mapped to its label */
			predictions->push_back(std::make_pair(std::exp(pred.first), dict->getLabel(pred.second)));
		}
		return predictions;
	}

	/** Returns a human readable description of the model state */
	auto model_info(void) const -> const std::string
	{
		if (!loaded) {
			static const auto not_loaded = std::string{"fasttext model is not loaded"};
			return not_loaded;
		}
		else {
			return fmt::format("fasttext model {}: {} languages, {} tokens", model_fname,
							   ft.getDictionary()->nlabels(), ft.getDictionary()->ntokens());
		}
	}
};
*>(res) + +void *rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg) +{ +#ifndef WITH_FASTTEXT + return nullptr; +#else + return (void *) new rspamd::langdet::fasttext_langdet(cfg); +#endif +} + +char *rspamd_lang_detection_fasttext_show_info(void *ud) +{ +#ifndef WITH_FASTTEXT + return g_strdup("fasttext is not compiled in"); +#else + auto model_info = FASTTEXT_MODEL_TO_C_API(ud)->model_info(); + + return g_strdup(model_info.c_str()); +#endif +} + +bool rspamd_lang_detection_fasttext_is_enabled(void *ud) +{ +#ifdef WITH_FASTTEXT + auto *real_model = FASTTEXT_MODEL_TO_C_API(ud); + + if (real_model) { + return real_model->is_enabled(); + } +#endif + + return false; +} + +rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud, + struct rspamd_task *task, + GArray *utf_words, + int k) +{ +#ifndef WITH_FASTTEXT + return nullptr; +#else + /* Avoid too long inputs */ + static const guint max_fasttext_input_len = 1024 * 1024; + auto *real_model = FASTTEXT_MODEL_TO_C_API(ud); + std::vector<std::int32_t> words_vec; + words_vec.reserve(utf_words->len); + + for (auto i = 0; i < std::min(utf_words->len, max_fasttext_input_len); i++) { + const auto *w = &g_array_index(utf_words, rspamd_stat_token_t, i); + if (w->original.len > 0) { + real_model->word2vec(w->original.begin, w->original.len, words_vec); + } + } + + msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), utf_words->len); + + auto *res = real_model->detect_language(words_vec, k); + + return (rspamd_fasttext_predict_result_t) res; +#endif +} + +void rspamd_lang_detection_fasttext_destroy(void *ud) +{ +#ifdef WITH_FASTTEXT + delete FASTTEXT_MODEL_TO_C_API(ud); +#endif +} + + +guint rspamd_lang_detection_fasttext_get_nlangs(rspamd_fasttext_predict_result_t res) +{ +#ifdef WITH_FASTTEXT + auto *real_res = FASTTEXT_RESULT_TO_C_API(res); + + if (real_res) { + return real_res->size(); + } +#endif + return 0; +} + +const char * 
+rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res, unsigned int idx) +{ +#ifdef WITH_FASTTEXT + auto *real_res = FASTTEXT_RESULT_TO_C_API(res); + + if (real_res && real_res->size() > idx) { + /* Fasttext returns result in form __label__<lang>, so we need to remove __label__ prefix */ + auto lang = std::string_view{real_res->at(idx).second}; + if (lang.size() > sizeof("__label__") && lang.substr(0, sizeof("__label__") - 1) == "__label__") { + lang.remove_prefix(sizeof("__label__") - 1); + } + return lang.data(); + } +#endif + return nullptr; +} + +float rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res, unsigned int idx) +{ +#ifdef WITH_FASTTEXT + auto *real_res = FASTTEXT_RESULT_TO_C_API(res); + + if (real_res && real_res->size() > idx) { + return real_res->at(idx).first; + } +#endif + return 0.0f; +} + +void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res) +{ +#ifdef WITH_FASTTEXT + auto *real_res = FASTTEXT_RESULT_TO_C_API(res); + + delete real_res; +#endif +} + +G_END_DECLS
\ No newline at end of file diff --git a/src/libmime/lang_detection_fasttext.h b/src/libmime/lang_detection_fasttext.h new file mode 100644 index 0000000..c8710d3 --- /dev/null +++ b/src/libmime/lang_detection_fasttext.h @@ -0,0 +1,91 @@ +/*- + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef RSPAMD_LANG_DETECTION_FASTTEXT_H +#define RSPAMD_LANG_DETECTION_FASTTEXT_H + +#include "config.h" + +G_BEGIN_DECLS +struct rspamd_config; +struct rspamd_task; /* for logging */ +/** + * Initialize fasttext language detector + * @param cfg configuration; the model path is read from its `lang_detection.fasttext_model` option + * @return opaque detector pointer (NULL if fasttext support is not compiled in) + */ +void *rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg); + +/** + * Check if fasttext language detector is enabled + * @param ud opaque detector pointer returned by rspamd_lang_detection_fasttext_init (may be NULL) + * @return true if a model has been loaded successfully, false otherwise + */ +bool rspamd_lang_detection_fasttext_is_enabled(void *ud); + +/** + * Show info about fasttext language detector + * @param ud opaque detector pointer + * @return newly allocated (g_strdup) description string owned by the caller + */ +char *rspamd_lang_detection_fasttext_show_info(void *ud); + + +typedef void *rspamd_fasttext_predict_result_t; +/** + * Detect language using fasttext + * @param ud opaque detector pointer + * @param task task being processed (used for logging) + * @param utf_words array of word tokens (rspamd_stat_token_t) to classify + * @param k number of results to return + * @return opaque prediction result (NULL if fasttext is not compiled in or no model is loaded); + * release it with rspamd_fasttext_predict_result_destroy + */ +rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud, + struct rspamd_task *task, GArray *utf_words, int k); + +/** + * Get number of languages detected + * @param ud prediction result returned by rspamd_lang_detection_fasttext_detect (may be NULL) + * @return number of predictions stored in the result, 0 for a NULL result + */ +guint 
rspamd_lang_detection_fasttext_get_nlangs(rspamd_fasttext_predict_result_t ud); +/** + * Get language from fasttext result + * @param res prediction result (may be NULL) + * @param idx zero-based index of the prediction + * @return language label (normally without the fasttext `__label__` prefix) or NULL if idx is out of range; + * the string is owned by the result + */ +const char *rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res, unsigned int idx); + +/** + * Get probability from fasttext result + * @param res prediction result (may be NULL) + * @param idx zero-based index of the prediction + * @return probability of the prediction or 0 if idx is out of range + */ +float rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res, unsigned int idx); + +/** + * Destroy fasttext result + * @param res prediction result returned by rspamd_lang_detection_fasttext_detect (may be NULL) + */ +void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res); + +/** + * Destroy fasttext language detector + * @param ud opaque detector pointer returned by rspamd_lang_detection_fasttext_init (may be NULL) + */ +void rspamd_lang_detection_fasttext_destroy(void *ud); + + +G_END_DECLS +#endif /* RSPAMD_LANG_DETECTION_FASTTEXT_H */ diff --git a/src/libmime/message.c b/src/libmime/message.c new file mode 100644 index 0000000..3acc935 --- /dev/null +++ b/src/libmime/message.c @@ -0,0 +1,1732 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "config.h" +#include "util.h" +#include "rspamd.h" +#include "message.h" +#include "libserver/html/html.h" +#include "images.h" +#include "archives.h" +#include "tokenizers/tokenizers.h" +#include "smtp_parsers.h" +#include "mime_parser.h" +#include "mime_encoding.h" +#include "lang_detection.h" +#include "libutil/multipattern.h" +#include "libserver/mempool_vars_internal.h" + +#ifdef WITH_SNOWBALL +#include "libstemmer.h" +#endif + +#include <math.h> +#include <unicode/uchar.h> +#include "sodium.h" +#include "libserver/cfg_file_private.h" +#include "lua/lua_common.h" +#include "contrib/uthash/utlist.h" +#include "contrib/t1ha/t1ha.h" +#include "received.h" + +#define GTUBE_SYMBOL "GTUBE" + +#define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF) +#define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF) + +static const gchar gtube_pattern_reject[] = "XJS*C4JDBQADN1.NSBN3*2IDNEN*" + "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X"; +static const gchar gtube_pattern_add_header[] = "YJS*C4JDBQADN1.NSBN3*2IDNEN*" + "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X"; +static const gchar gtube_pattern_rewrite_subject[] = "ZJS*C4JDBQADN1.NSBN3*2IDNEN*" + "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X"; +static const gchar gtube_pattern_no_action[] = "AJS*C4JDBQADN1.NSBN3*2IDNEN*" + "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X"; +struct rspamd_multipattern *gtube_matcher = NULL; +static const guint64 words_hash_seed = 0xdeadbabe; + +static void +free_byte_array_callback(void *pointer) +{ + GByteArray *arr = (GByteArray *) pointer; + g_byte_array_free(arr, TRUE); +} + +static void +rspamd_mime_part_extract_words(struct rspamd_task *task, + struct rspamd_mime_text_part *part) +{ + rspamd_stat_token_t *w; + guint i, total_len = 0, short_len = 0; + + if (part->utf_words) { + rspamd_stem_words(part->utf_words, task->task_pool, part->language, + task->lang_det); + + for (i = 0; i < part->utf_words->len; i++) { + guint64 h; + + w = 
&g_array_index(part->utf_words, rspamd_stat_token_t, i); + + if (w->stemmed.len > 0) { + /* + * We use static hash seed if we would want to use that in shingles + * computation in future + */ + h = rspamd_cryptobox_fast_hash_specific( + RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT, + w->stemmed.begin, w->stemmed.len, words_hash_seed); + g_array_append_val(part->normalized_hashes, h); + total_len += w->stemmed.len; + + if (w->stemmed.len <= 3) { + short_len++; + } + + if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT && + !(w->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) { + part->nwords++; + } + } + + if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE | + RSPAMD_STAT_TOKEN_FLAG_NORMALISED | + RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES)) { + task->flags |= RSPAMD_TASK_FLAG_BAD_UNICODE; + } + } + + if (part->utf_words->len) { + gdouble *avg_len_p, *short_len_p; + + avg_len_p = rspamd_mempool_get_variable(task->task_pool, + RSPAMD_MEMPOOL_AVG_WORDS_LEN); + + if (avg_len_p == NULL) { + avg_len_p = rspamd_mempool_alloc(task->task_pool, + sizeof(double)); + *avg_len_p = total_len; + rspamd_mempool_set_variable(task->task_pool, + RSPAMD_MEMPOOL_AVG_WORDS_LEN, avg_len_p, NULL); + } + else { + *avg_len_p += total_len; + } + + short_len_p = rspamd_mempool_get_variable(task->task_pool, + RSPAMD_MEMPOOL_SHORT_WORDS_CNT); + + if (short_len_p == NULL) { + short_len_p = rspamd_mempool_alloc(task->task_pool, + sizeof(double)); + *short_len_p = short_len; + rspamd_mempool_set_variable(task->task_pool, + RSPAMD_MEMPOOL_SHORT_WORDS_CNT, avg_len_p, NULL); + } + else { + *short_len_p += short_len; + } + } + } +} + +static void +rspamd_mime_part_create_words(struct rspamd_task *task, + struct rspamd_mime_text_part *part) +{ + enum rspamd_tokenize_type tok_type; + + if (IS_TEXT_PART_UTF(part)) { + +#if U_ICU_VERSION_MAJOR_NUM < 50 + /* Hack to prevent hang with Thai in old libicu */ + const gchar *p = part->utf_stripped_content->data, *end; + guint i = 0; + end = p + part->utf_stripped_content->len; + 
gint32 uc, sc; + + tok_type = RSPAMD_TOKENIZE_UTF; + + while (p + i < end) { + U8_NEXT(p, i, part->utf_stripped_content->len, uc); + + if (((gint32) uc) < 0) { + tok_type = RSPAMD_TOKENIZE_RAW; + break; + } + + if (u_isalpha(uc)) { + sc = ublock_getCode(uc); + + if (sc == UBLOCK_THAI) { + msg_info_task("enable workaround for Thai characters for old libicu"); + tok_type = RSPAMD_TOKENIZE_RAW; + break; + } + } + } +#else + tok_type = RSPAMD_TOKENIZE_UTF; +#endif + } + else { + tok_type = RSPAMD_TOKENIZE_RAW; + } + + part->utf_words = rspamd_tokenize_text( + part->utf_stripped_content->data, + part->utf_stripped_content->len, + &part->utf_stripped_text, + tok_type, task->cfg, + part->exceptions, + NULL, + NULL, + task->task_pool); + + + if (part->utf_words) { + part->normalized_hashes = g_array_sized_new(FALSE, FALSE, + sizeof(guint64), part->utf_words->len); + rspamd_normalize_words(part->utf_words, task->task_pool); + } +} + +static void +rspamd_mime_part_detect_language(struct rspamd_task *task, + struct rspamd_mime_text_part *part) +{ + struct rspamd_lang_detector_res *lang; + + if (!IS_TEXT_PART_EMPTY(part) && part->utf_words && part->utf_words->len > 0 && + task->lang_det) { + if (rspamd_language_detector_detect(task, task->lang_det, part)) { + lang = g_ptr_array_index(part->languages, 0); + part->language = lang->lang; + + msg_info_task("detected part language: %s", part->language); + } + else { + part->language = "en"; /* Safe fallback */ + } + } +} + +static void +rspamd_strip_newlines_parse(struct rspamd_task *task, + const gchar *begin, const gchar *pe, + struct rspamd_mime_text_part *part) +{ + const gchar *p = begin, *c = begin; + gboolean crlf_added = FALSE, is_utf = IS_TEXT_PART_UTF(part); + gboolean url_open_bracket = FALSE; + UChar32 uc; + + enum { + normal_char, + seen_cr, + seen_lf, + } state = normal_char; + + while (p < pe) { + if (U8_IS_LEAD(*p) && is_utf) { + gint32 off = p - begin; + U8_NEXT(begin, off, pe - begin, uc); + + if (uc != -1) { + 
while (p < pe && off < (pe - begin)) { + if (IS_ZERO_WIDTH_SPACE(uc)) { + /* Invisible space ! */ + task->flags |= RSPAMD_TASK_FLAG_BAD_UNICODE; + part->spaces++; + + if (p > c) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) c, p - c); + c = begin + off; + p = c; + } + + U8_NEXT(begin, off, pe - begin, uc); + + if (!IS_ZERO_WIDTH_SPACE(uc)) { + break; + } + + part->double_spaces++; + p = begin + off; + c = p; + } + else { + break; + } + } + } + } + + if (G_UNLIKELY(p >= pe)) { + /* + * This is reached when there is a utf8 part and we + * have zero width spaces at the end of the text + * So we just check overflow and refuse to access *p if it is + * after our real content. + */ + break; + } + else if (*p == '\r') { + switch (state) { + case normal_char: + state = seen_cr; + if (p > c) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) c, p - c); + } + + crlf_added = FALSE; + c = p + 1; + break; + case seen_cr: + /* Double \r\r */ + if (!crlf_added) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) " ", 1); + crlf_added = TRUE; + g_ptr_array_add(part->newlines, + (((gpointer) (goffset) (part->utf_stripped_content->len)))); + } + + part->nlines++; + part->empty_lines++; + c = p + 1; + break; + case seen_lf: + /* Likely \r\n\r...*/ + state = seen_cr; + c = p + 1; + break; + } + + url_open_bracket = FALSE; + + p++; + } + else if (*p == '\n') { + switch (state) { + case normal_char: + state = seen_lf; + + if (p > c) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) c, p - c); + } + + c = p + 1; + + if (IS_TEXT_PART_HTML(part) || !url_open_bracket) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) " ", 1); + g_ptr_array_add(part->newlines, + (((gpointer) (goffset) (part->utf_stripped_content->len)))); + crlf_added = TRUE; + } + else { + crlf_added = FALSE; + } + + break; + case seen_cr: + /* \r\n */ + if (!crlf_added) { + if (IS_TEXT_PART_HTML(part) || 
!url_open_bracket) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) " ", 1); + crlf_added = TRUE; + } + + g_ptr_array_add(part->newlines, + (((gpointer) (goffset) (part->utf_stripped_content->len)))); + } + + c = p + 1; + state = seen_lf; + + break; + case seen_lf: + /* Double \n\n */ + if (!crlf_added) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) " ", 1); + crlf_added = TRUE; + g_ptr_array_add(part->newlines, + (((gpointer) (goffset) (part->utf_stripped_content->len)))); + } + + part->nlines++; + part->empty_lines++; + + c = p + 1; + break; + } + url_open_bracket = FALSE; + + p++; + } + else { + if ((*p) == '<') { + url_open_bracket = TRUE; + } + else if ((*p) == '>') { + url_open_bracket = FALSE; + } + + switch (state) { + case normal_char: + if (*p == ' ') { + part->spaces++; + + if (p > begin && *(p - 1) == ' ') { + part->double_spaces++; + } + } + else { + part->non_spaces++; + + if ((*p) & 0x80) { + part->non_ascii_chars++; + } + else { + if (g_ascii_isupper(*p)) { + part->capital_letters++; + } + else if (g_ascii_isdigit(*p)) { + part->numeric_characters++; + } + + part->ascii_chars++; + } + } + break; + case seen_cr: + case seen_lf: + part->nlines++; + + if (!crlf_added) { + g_ptr_array_add(part->newlines, + (((gpointer) (goffset) (part->utf_stripped_content->len)))); + } + + /* Skip initial spaces */ + if (*p == ' ') { + if (!crlf_added) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) " ", 1); + } + + while (p < pe && *p == ' ') { + p++; + c++; + part->spaces++; + } + + if (p < pe && (*p == '\r' || *p == '\n')) { + part->empty_lines++; + } + } + + state = normal_char; + continue; + } + + p++; + } + } + + /* Leftover */ + if (p > c) { + if (p > pe) { + p = pe; + } + + switch (state) { + case normal_char: + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) c, p - c); + + while (c < p) { + if (*c == ' ') { + part->spaces++; + + if (c > begin && *(c - 1) == ' ') { + 
part->double_spaces++; + } + } + else { + part->non_spaces++; + + if ((*c) & 0x80) { + part->non_ascii_chars++; + } + else { + part->ascii_chars++; + } + } + + c++; + } + break; + default: + + if (!crlf_added) { + g_byte_array_append(part->utf_stripped_content, + (const guint8 *) " ", 1); + g_ptr_array_add(part->newlines, + (((gpointer) (goffset) (part->utf_stripped_content->len)))); + } + + part->nlines++; + break; + } + } +} + +static void +rspamd_u_text_dtor(void *p) +{ + utext_close((UText *) p); +} + +static void +rspamd_normalize_text_part(struct rspamd_task *task, + struct rspamd_mime_text_part *part) +{ + const gchar *p, *end; + guint i; + goffset off; + struct rspamd_process_exception *ex; + UErrorCode uc_err = U_ZERO_ERROR; + + part->newlines = g_ptr_array_sized_new(128); + + if (IS_TEXT_PART_EMPTY(part)) { + part->utf_stripped_content = g_byte_array_new(); + } + else { + part->utf_stripped_content = g_byte_array_sized_new(part->utf_content.len); + + p = (const gchar *) part->utf_content.begin; + end = p + part->utf_content.len; + + rspamd_strip_newlines_parse(task, p, end, part); + + for (i = 0; i < part->newlines->len; i++) { + ex = rspamd_mempool_alloc(task->task_pool, sizeof(*ex)); + off = (goffset) g_ptr_array_index(part->newlines, i); + g_ptr_array_index(part->newlines, i) = (gpointer) (goffset) (part->utf_stripped_content->data + off); + ex->pos = off; + ex->len = 0; + ex->type = RSPAMD_EXCEPTION_NEWLINE; + part->exceptions = g_list_prepend(part->exceptions, ex); + } + } + + if (IS_TEXT_PART_UTF(part)) { + utext_openUTF8(&part->utf_stripped_text, + part->utf_stripped_content->data, + part->utf_stripped_content->len, + &uc_err); + + if (!U_SUCCESS(uc_err)) { + msg_warn_task("cannot open text from utf content"); + /* Probably, should be an assertion */ + } + else { + rspamd_mempool_add_destructor(task->task_pool, + rspamd_u_text_dtor, + &part->utf_stripped_text); + } + } + + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) 
free_byte_array_callback, + part->utf_stripped_content); + rspamd_mempool_notify_alloc(task->task_pool, + part->utf_stripped_content->len); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard, + part->newlines); +} + +#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c))) + +static guint +rspamd_words_levenshtein_distance(struct rspamd_task *task, + GArray *w1, GArray *w2) +{ + guint s1len, s2len, x, y, lastdiag, olddiag; + guint *column, ret; + guint64 h1, h2; + gint eq; + static const guint max_words = 8192; + + s1len = w1->len; + s2len = w2->len; + + if (s1len + s2len > max_words) { + msg_info_task("cannot direct compare multipart/alternative parts with more than %ud words in total: " + "(%ud words in one part and %ud in another)", + max_words, s1len, s2len); + + /* Use approximate comparison of number of words */ + if (s1len > s2len) { + return s1len - s2len; + } + else { + return s2len - s1len; + } + } + + column = g_malloc0((s1len + 1) * sizeof(guint)); + + for (y = 1; y <= s1len; y++) { + column[y] = y; + } + + for (x = 1; x <= s2len; x++) { + column[0] = x; + + for (y = 1, lastdiag = x - 1; y <= s1len; y++) { + olddiag = column[y]; + h1 = g_array_index(w1, guint64, y - 1); + h2 = g_array_index(w2, guint64, x - 1); + eq = (h1 == h2) ? 
1 : 0; + /* + * Cost of replacement is twice higher than cost of add/delete + * to calculate percentage properly + */ + column[y] = MIN3(column[y] + 1, column[y - 1] + 1, + lastdiag + (eq * 2)); + lastdiag = olddiag; + } + } + + ret = column[s1len]; + g_free(column); + + return ret; +} + +static gint +rspamd_multipattern_gtube_cb(struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) +{ + struct rspamd_task *task = (struct rspamd_task *) context; + + if (strnum > 0) { + if (task->cfg->gtube_patterns_policy == RSPAMD_GTUBE_ALL) { + return strnum + 1; + } + + return 0; + } + + return strnum + 1; /* To distinguish from zero */ +} + +static enum rspamd_action_type +rspamd_check_gtube(struct rspamd_task *task, struct rspamd_mime_text_part *part) +{ + static const gsize max_check_size = 8 * 1024; + gint ret; + enum rspamd_action_type act = METRIC_ACTION_NOACTION; + enum rspamd_gtube_patterns_policy policy = task->cfg ? 
task->cfg->gtube_patterns_policy : RSPAMD_GTUBE_REJECT; + g_assert(part != NULL); + + if (gtube_matcher == NULL && policy != RSPAMD_GTUBE_DISABLED) { + gtube_matcher = rspamd_multipattern_create(RSPAMD_MULTIPATTERN_DEFAULT); + + rspamd_multipattern_add_pattern(gtube_matcher, + gtube_pattern_reject, + RSPAMD_MULTIPATTERN_DEFAULT); + rspamd_multipattern_add_pattern(gtube_matcher, + gtube_pattern_add_header, + RSPAMD_MULTIPATTERN_DEFAULT); + rspamd_multipattern_add_pattern(gtube_matcher, + gtube_pattern_rewrite_subject, + RSPAMD_MULTIPATTERN_DEFAULT); + rspamd_multipattern_add_pattern(gtube_matcher, + gtube_pattern_no_action, + RSPAMD_MULTIPATTERN_DEFAULT); + + GError *err = NULL; + rspamd_multipattern_compile(gtube_matcher, &err); + + if (err != NULL) { + /* It will be expensive, but I don't care, still better than to abort */ + msg_err("cannot compile gtube matcher: %s", err->message); + g_error_free(err); + } + } + + if (part->utf_content.len >= sizeof(gtube_pattern_reject) && + part->utf_content.len <= max_check_size && + policy != RSPAMD_GTUBE_DISABLED) { + if ((ret = rspamd_multipattern_lookup(gtube_matcher, part->utf_content.begin, + part->utf_content.len, + rspamd_multipattern_gtube_cb, task, NULL)) > 0) { + + switch (ret) { + case 1: + act = METRIC_ACTION_REJECT; + break; + case 2: + act = METRIC_ACTION_ADD_HEADER; + break; + case 3: + act = METRIC_ACTION_REWRITE_SUBJECT; + break; + case 4: + act = METRIC_ACTION_NOACTION; + break; + } + + if (ret != 0) { + task->flags |= RSPAMD_TASK_FLAG_SKIP; + task->flags |= RSPAMD_TASK_FLAG_GTUBE; + msg_info_task( + "gtube %s pattern has been found in part of length %uz", + rspamd_action_to_str(act), + part->utf_content.len); + } + } + } + + return act; +} + +static gint +exceptions_compare_func(gconstpointer a, gconstpointer b) +{ + const struct rspamd_process_exception *ea = a, *eb = b; + + return ea->pos - eb->pos; +} + +static gboolean +rspamd_message_process_plain_text_part(struct rspamd_task *task, + struct 
rspamd_mime_text_part *text_part) +{ + if (text_part->parsed.len == 0) { + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; + + return TRUE; + } + + rspamd_mime_text_part_maybe_convert(task, text_part); + + if (text_part->utf_raw_content != NULL) { + /* Just have the same content */ + text_part->utf_content.begin = (const gchar *) text_part->utf_raw_content->data; + text_part->utf_content.len = text_part->utf_raw_content->len; + } + else { + /* + * We ignore unconverted parts from now as it is dangerous + * to treat them as text parts + */ + text_part->utf_content.begin = NULL; + text_part->utf_content.len = 0; + + return FALSE; + } + + return TRUE; +} + +static gboolean +rspamd_message_process_html_text_part(struct rspamd_task *task, + struct rspamd_mime_text_part *text_part, + uint16_t *cur_url_order) +{ + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML; + + if (text_part->parsed.len == 0) { + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; + + return TRUE; + } + + rspamd_mime_text_part_maybe_convert(task, text_part); + + if (text_part->utf_raw_content == NULL) { + return FALSE; + } + + + text_part->html = rspamd_html_process_part_full( + task, + text_part->utf_raw_content, + &text_part->exceptions, + MESSAGE_FIELD(task, urls), + text_part->mime_part->urls, + task->cfg ? 
task->cfg->enable_css_parser : true, + cur_url_order); + rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content); + + if (text_part->utf_content.len == 0) { + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; + } + + return TRUE; +} + +enum rspamd_message_part_is_text_result { + RSPAMD_MESSAGE_PART_IS_TEXT_PLAIN = 0, + RSPAMD_MESSAGE_PART_IS_TEXT_HTML, + RSPAMD_MESSAGE_PART_IS_NOT_TEXT +}; + +static enum rspamd_message_part_is_text_result +rspamd_message_part_can_be_parsed_as_text(struct rspamd_task *task, + struct rspamd_mime_part *mime_part) +{ + enum rspamd_message_part_is_text_result res = RSPAMD_MESSAGE_PART_IS_NOT_TEXT; + + if ((mime_part->ct && (mime_part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) || + (mime_part->detected_type && strcmp(mime_part->detected_type, "text") == 0)) { + + res = RSPAMD_MESSAGE_PART_IS_TEXT_PLAIN; + rspamd_ftok_t html_tok, xhtml_tok; + + html_tok.begin = "html"; + html_tok.len = 4; + xhtml_tok.begin = "xhtml"; + xhtml_tok.len = 5; + + if (rspamd_ftok_casecmp(&mime_part->ct->subtype, &html_tok) == 0 || + rspamd_ftok_casecmp(&mime_part->ct->subtype, &xhtml_tok) == 0 || + (mime_part->detected_ext && + strcmp(mime_part->detected_ext, "html") == 0)) { + res = RSPAMD_MESSAGE_PART_IS_TEXT_HTML; + } + } + + /* Skip attachments */ + if (res != RSPAMD_MESSAGE_PART_IS_NOT_TEXT && + (mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT)) { + if (!task->cfg->check_text_attachements) { + debug_task("skip attachments for checking as text parts"); + return RSPAMD_MESSAGE_PART_IS_NOT_TEXT; + } + } + + return res; +} + +static gboolean +rspamd_message_process_text_part_maybe(struct rspamd_task *task, + struct rspamd_mime_part *mime_part, + enum rspamd_message_part_is_text_result is_text, + uint16_t *cur_url_order) +{ + struct rspamd_mime_text_part *text_part; + guint flags = 0; + enum rspamd_action_type act; + + /* Skip attachments */ + if ((mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT)) { + flags |= 
RSPAMD_MIME_TEXT_PART_ATTACHMENT; + } + + text_part = rspamd_mempool_alloc0(task->task_pool, + sizeof(struct rspamd_mime_text_part)); + text_part->mime_part = mime_part; + text_part->raw.begin = mime_part->raw_data.begin; + text_part->raw.len = mime_part->raw_data.len; + text_part->parsed.begin = mime_part->parsed_data.begin; + text_part->parsed.len = mime_part->parsed_data.len; + text_part->utf_stripped_text = (UText) UTEXT_INITIALIZER; + text_part->flags |= flags; + + if (is_text == RSPAMD_MESSAGE_PART_IS_TEXT_HTML) { + if (!rspamd_message_process_html_text_part(task, text_part, cur_url_order)) { + return FALSE; + } + } + else { + if (!rspamd_message_process_plain_text_part(task, text_part)) { + return FALSE; + } + } + + g_ptr_array_add(MESSAGE_FIELD(task, text_parts), text_part); + mime_part->part_type = RSPAMD_MIME_PART_TEXT; + mime_part->specific.txt = text_part; + + act = rspamd_check_gtube(task, text_part); + if (act != METRIC_ACTION_NOACTION) { + struct rspamd_action *action; + gdouble score = NAN; + + action = rspamd_config_get_action_by_type(task->cfg, act); + + if (action) { + score = action->threshold; + + rspamd_add_passthrough_result(task, action, + RSPAMD_PASSTHROUGH_CRITICAL, + score, "Gtube pattern", + "GTUBE", 0, NULL); + } + + rspamd_task_insert_result(task, GTUBE_SYMBOL, 0, NULL); + + return TRUE; + } + + /* Post process part */ + rspamd_normalize_text_part(task, text_part); + + if (!IS_TEXT_PART_HTML(text_part)) { + if (mime_part->parent_part) { + struct rspamd_mime_part *parent = mime_part->parent_part; + + if (IS_PART_MULTIPART(parent) && parent->specific.mp->children->len == 2) { + /* + * Use strict extraction mode: we will extract missing urls from + * an html part if needed + */ + rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order, + RSPAMD_URL_FIND_STRICT); + } + else { + /* + * Fall back to full text extraction using TLD patterns + */ + rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order, + 
RSPAMD_URL_FIND_ALL); + } + } + else { + /* + * Fall back to full text extraction using TLD patterns + */ + rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order, + RSPAMD_URL_FIND_ALL); + } + } + else { + rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order, + RSPAMD_URL_FIND_STRICT); + } + + if (text_part->exceptions) { + text_part->exceptions = g_list_sort(text_part->exceptions, + exceptions_compare_func); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) g_list_free, + text_part->exceptions); + } + + rspamd_mime_part_create_words(task, text_part); + + return TRUE; +} + +/* Creates message from various data using libmagic to detect type */ +static void +rspamd_message_from_data(struct rspamd_task *task, const guchar *start, + gsize len) +{ + struct rspamd_content_type *ct = NULL; + struct rspamd_mime_part *part; + const char *mb = "application/octet-stream"; + gchar *mid; + rspamd_ftok_t srch, *tok; + gchar cdbuf[1024]; + + g_assert(start != NULL); + + part = rspamd_mempool_alloc0(task->task_pool, sizeof(*part)); + + part->raw_data.begin = start; + part->raw_data.len = len; + part->parsed_data.begin = start; + part->parsed_data.len = len; + part->part_number = MESSAGE_FIELD(task, parts)->len; + part->urls = g_ptr_array_new(); + part->raw_headers = rspamd_message_headers_new(); + part->headers_order = NULL; + + tok = rspamd_task_get_request_header(task, "Content-Type"); + + if (tok) { + /* We have Content-Type defined */ + ct = rspamd_content_type_parse(tok->begin, tok->len, + task->task_pool); + part->ct = ct; + } + else if (task->cfg && task->cfg->libs_ctx) { + lua_State *L = task->cfg->lua_state; + + if (rspamd_lua_require_function(L, + "lua_magic", "detect_mime_part")) { + + struct rspamd_mime_part **pmime; + struct rspamd_task **ptask; + + pmime = lua_newuserdata(L, sizeof(struct rspamd_mime_part *)); + rspamd_lua_setclass(L, "rspamd{mimepart}", -1); + *pmime = part; + ptask = 
lua_newuserdata(L, sizeof(struct rspamd_task *)); + rspamd_lua_setclass(L, "rspamd{task}", -1); + *ptask = task; + + if (lua_pcall(L, 2, 2, 0) != 0) { + msg_err_task("cannot detect type: %s", lua_tostring(L, -1)); + } + else { + if (lua_istable(L, -1)) { + lua_pushstring(L, "ct"); + lua_gettable(L, -2); + + if (lua_isstring(L, -1)) { + mb = rspamd_mempool_strdup(task->task_pool, + lua_tostring(L, -1)); + } + } + } + + lua_settop(L, 0); + } + else { + msg_err_task("cannot require lua_magic.detect_mime_part"); + } + + if (mb) { + srch.begin = mb; + srch.len = strlen(mb); + ct = rspamd_content_type_parse(srch.begin, srch.len, + task->task_pool); + + if (!part->ct) { + msg_info_task("construct fake mime of type: %s", mb); + part->ct = ct; + } + else { + /* Check sanity */ + if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) { + RSPAMD_FTOK_FROM_STR(&srch, "application"); + + if (rspamd_ftok_cmp(&ct->type, &srch) == 0) { + msg_info_task("construct fake mime of type: %s", mb); + part->ct = ct; + } + } + else { + msg_info_task("construct fake mime of type: %T/%T, detected %s", + &part->ct->type, &part->ct->subtype, mb); + } + } + + part->detected_ct = ct; + } + } + + + tok = rspamd_task_get_request_header(task, "Filename"); + + if (tok) { + rspamd_snprintf(cdbuf, sizeof(cdbuf), "inline; filename=\"%T\"", tok); + } + else { + rspamd_snprintf(cdbuf, sizeof(cdbuf), "inline"); + } + + part->cd = rspamd_content_disposition_parse(cdbuf, strlen(cdbuf), + task->task_pool); + + g_ptr_array_add(MESSAGE_FIELD(task, parts), part); + rspamd_mime_parser_calc_digest(part); + + /* Generate message ID */ + mid = rspamd_mime_message_id_generate("localhost.localdomain"); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) g_free, mid); + MESSAGE_FIELD(task, message_id) = mid; + task->queue_id = mid; +} + +static void +rspamd_message_dtor(struct rspamd_message *msg) +{ + guint i; + struct rspamd_mime_part *p; + struct rspamd_mime_text_part *tp; + + + 
PTR_ARRAY_FOREACH(msg->parts, i, p) + { + if (p->raw_headers) { + rspamd_message_headers_unref(p->raw_headers); + } + + if (IS_PART_MULTIPART(p)) { + if (p->specific.mp->children) { + g_ptr_array_free(p->specific.mp->children, TRUE); + } + } + + if (p->part_type == RSPAMD_MIME_PART_CUSTOM_LUA && + p->specific.lua_specific.cbref != -1) { + luaL_unref(msg->task->cfg->lua_state, + LUA_REGISTRYINDEX, + p->specific.lua_specific.cbref); + } + + if (p->urls) { + g_ptr_array_unref(p->urls); + } + } + + PTR_ARRAY_FOREACH(msg->text_parts, i, tp) + { + if (tp->utf_words) { + g_array_free(tp->utf_words, TRUE); + } + if (tp->normalized_hashes) { + g_array_free(tp->normalized_hashes, TRUE); + } + if (tp->languages) { + g_ptr_array_unref(tp->languages); + } + } + + rspamd_message_headers_unref(msg->raw_headers); + + g_ptr_array_unref(msg->text_parts); + g_ptr_array_unref(msg->parts); + + kh_destroy(rspamd_url_hash, msg->urls); +} + +struct rspamd_message * +rspamd_message_new(struct rspamd_task *task) +{ + struct rspamd_message *msg; + + msg = rspamd_mempool_alloc0(task->task_pool, sizeof(*msg)); + + msg->raw_headers = rspamd_message_headers_new(); + msg->urls = kh_init(rspamd_url_hash); + msg->parts = g_ptr_array_sized_new(4); + msg->text_parts = g_ptr_array_sized_new(2); + msg->task = task; + + REF_INIT_RETAIN(msg, rspamd_message_dtor); + + return msg; +} + +gboolean +rspamd_message_parse(struct rspamd_task *task) +{ + const gchar *p; + gsize len; + guint i; + GError *err = NULL; + guint64 n[2], seed; + + if (RSPAMD_TASK_IS_EMPTY(task)) { + /* Don't do anything with empty task */ + task->flags |= RSPAMD_TASK_FLAG_SKIP_PROCESS; + return TRUE; + } + + p = task->msg.begin; + len = task->msg.len; + + /* Skip any space characters to avoid some bad messages to be unparsed */ + while (len > 0 && g_ascii_isspace(*p)) { + p++; + len--; + } + + /* + * Exim somehow uses mailbox format for messages being scanned: + * From xxx@xxx.com Fri May 13 19:08:48 2016 + * + * So we check if a task 
has this line to avoid possible issues + */ + if (len > sizeof("From ") - 1) { + if (memcmp(p, "From ", sizeof("From ") - 1) == 0) { + /* Skip to CRLF */ + msg_info_task("mailbox input detected, enable workaround"); + p += sizeof("From ") - 1; + len -= sizeof("From ") - 1; + + while (len > 0 && *p != '\n') { + p++; + len--; + } + while (len > 0 && g_ascii_isspace(*p)) { + p++; + len--; + } + } + } + + task->msg.begin = p; + task->msg.len = len; + + /* Cleanup old message */ + if (task->message) { + rspamd_message_unref(task->message); + } + + task->message = rspamd_message_new(task); + + if (task->flags & RSPAMD_TASK_FLAG_MIME) { + enum rspamd_mime_parse_error ret; + + debug_task("construct mime parser from string length %d", + (gint) task->msg.len); + ret = rspamd_mime_parse_task(task, &err); + + switch (ret) { + case RSPAMD_MIME_PARSE_FATAL: + msg_err_task("cannot construct mime from stream: %e", err); + + if (task->cfg && (!task->cfg->allow_raw_input)) { + msg_err_task("cannot construct mime from stream"); + if (err) { + task->err = err; + } + + return FALSE; + } + else { + task->flags &= ~RSPAMD_TASK_FLAG_MIME; + rspamd_message_from_data(task, p, len); + } + break; + case RSPAMD_MIME_PARSE_NESTING: + msg_warn_task("cannot construct full mime from stream: %e", err); + task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS; + break; + case RSPAMD_MIME_PARSE_OK: + default: + break; + } + + if (err) { + g_error_free(err); + } + } + else { + rspamd_message_from_data(task, p, len); + } + + + if (MESSAGE_FIELD(task, message_id) == NULL) { + MESSAGE_FIELD(task, message_id) = "undef"; + } + + debug_task("found %ud parts in message", MESSAGE_FIELD(task, parts)->len); + if (task->queue_id == NULL) { + task->queue_id = "undef"; + } + + rspamd_received_maybe_fix_task(task); + + struct rspamd_mime_part *part; + + /* Blake2b applied to string 'rspamd' */ + static const guchar RSPAMD_ALIGNED(32) hash_key[] = { + 0xef, + 0x43, + 0xae, + 0x80, + 0xcc, + 0x8d, + 0xc3, + 0x4c, + 0x6f, + 
0x1b, + 0xd6, + 0x18, + 0x1b, + 0xae, + 0x87, + 0x74, + 0x0c, + 0xca, + 0xf7, + 0x8e, + 0x5f, + 0x2e, + 0x54, + 0x32, + 0xf6, + 0x79, + 0xb9, + 0x27, + 0x26, + 0x96, + 0x20, + 0x92, + 0x70, + 0x07, + 0x85, + 0xeb, + 0x83, + 0xf7, + 0x89, + 0xe0, + 0xd7, + 0x32, + 0x2a, + 0xd2, + 0x1a, + 0x64, + 0x41, + 0xef, + 0x49, + 0xff, + 0xc3, + 0x8c, + 0x54, + 0xf9, + 0x67, + 0x74, + 0x30, + 0x1e, + 0x70, + 0x2e, + 0xb7, + 0x12, + 0x09, + 0xfe, + }; + + memcpy(&seed, hash_key, sizeof(seed)); + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part) + { + n[0] = t1ha2_atonce128(&n[1], + part->digest, sizeof(part->digest), + seed); + + seed = n[0] ^ n[1]; + } + + memcpy(MESSAGE_FIELD(task, digest), n, sizeof(n)); + + if (MESSAGE_FIELD(task, subject)) { + p = MESSAGE_FIELD(task, subject); + len = strlen(p); + n[0] = t1ha2_atonce128(&n[1], + p, len, + seed); + memcpy(MESSAGE_FIELD(task, digest), n, sizeof(n)); + } + + if (task->queue_id) { + msg_info_task("loaded message; id: <%s>; queue-id: <%s>; size: %z; " + "checksum: <%*xs>", + MESSAGE_FIELD(task, message_id), task->queue_id, task->msg.len, + (gint) sizeof(MESSAGE_FIELD(task, digest)), MESSAGE_FIELD(task, digest)); + } + else { + msg_info_task("loaded message; id: <%s>; size: %z; " + "checksum: <%*xs>", + MESSAGE_FIELD(task, message_id), task->msg.len, + (gint) sizeof(MESSAGE_FIELD(task, digest)), MESSAGE_FIELD(task, digest)); + } + + return TRUE; +} + + +/* + * A helper structure to store text parts positions, if it was C++, I could just use std::pair, + * but here I have to make it all manually, sigh... 
+ */ +struct rspamd_mime_part_text_position { + unsigned pos; + enum rspamd_message_part_is_text_result res; +}; + +/* Place html parts first during analysis */ +static int +rspamd_mime_text_part_position_compare_func(const void *v1, const void *v2) +{ + const struct rspamd_mime_part_text_position *p1 = (const struct rspamd_mime_part_text_position *) v1; + const struct rspamd_mime_part_text_position *p2 = (const struct rspamd_mime_part_text_position *) v2; + + if (p1->res == p2->res) { + return (int) p2->pos - (int) p1->pos; + } + else { + if (p1->res == RSPAMD_MESSAGE_PART_IS_TEXT_HTML) { + return -1; + } + else { + return 1; + } + } +} + +void rspamd_message_process(struct rspamd_task *task) +{ + guint i; + struct rspamd_mime_text_part *p1, *p2; + gdouble diff, *pdiff; + guint tw, *ptw, dw; + struct rspamd_mime_part *part; + lua_State *L = NULL; + gint magic_func_pos = -1, content_func_pos = -1, old_top = -1, funcs_top = -1; + + if (task->cfg) { + L = task->cfg->lua_state; + } + + rspamd_archives_process(task); + + if (L) { + old_top = lua_gettop(L); + } + + if (L && rspamd_lua_require_function(L, + "lua_magic", "detect_mime_part")) { + magic_func_pos = lua_gettop(L); + } + else { + msg_err_task("cannot require lua_magic.detect_mime_part"); + } + + if (L && rspamd_lua_require_function(L, + "lua_content", "maybe_process_mime_part")) { + content_func_pos = lua_gettop(L); + } + else { + msg_err_task("cannot require lua_content.maybe_process_mime_part"); + } + + if (L) { + funcs_top = lua_gettop(L); + } + + GArray *detected_text_parts = g_array_sized_new(FALSE, FALSE, sizeof(struct rspamd_mime_part_text_position), 2); + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part) + { + if (magic_func_pos != -1 && part->parsed_data.len > 0) { + struct rspamd_mime_part **pmime; + struct rspamd_task **ptask; + + lua_pushcfunction(L, &rspamd_lua_traceback); + gint err_idx = lua_gettop(L); + lua_pushvalue(L, magic_func_pos); + pmime = lua_newuserdata(L, sizeof(struct 
rspamd_mime_part *)); + rspamd_lua_setclass(L, "rspamd{mimepart}", -1); + *pmime = part; + ptask = lua_newuserdata(L, sizeof(struct rspamd_task *)); + rspamd_lua_setclass(L, "rspamd{task}", -1); + *ptask = task; + + if (lua_pcall(L, 2, 2, err_idx) != 0) { + msg_err_task("cannot detect type: %s", lua_tostring(L, -1)); + } + else { + if (lua_istable(L, -1)) { + const gchar *mb; + + /* First returned value */ + part->detected_ext = rspamd_mempool_strdup(task->task_pool, + lua_tostring(L, -2)); + + lua_pushstring(L, "ct"); + lua_gettable(L, -2); + + if (lua_isstring(L, -1)) { + mb = lua_tostring(L, -1); + + if (mb) { + rspamd_ftok_t srch; + + srch.begin = mb; + srch.len = strlen(mb); + part->detected_ct = rspamd_content_type_parse(srch.begin, + srch.len, + task->task_pool); + } + } + + lua_pop(L, 1); + + lua_pushstring(L, "type"); + lua_gettable(L, -2); + + if (lua_isstring(L, -1)) { + part->detected_type = rspamd_mempool_strdup(task->task_pool, + lua_tostring(L, -1)); + } + + lua_pop(L, 1); + + lua_pushstring(L, "no_text"); + lua_gettable(L, -2); + + if (lua_isboolean(L, -1)) { + if (!!lua_toboolean(L, -1)) { + part->flags |= RSPAMD_MIME_PART_NO_TEXT_EXTRACTION; + } + } + + lua_pop(L, 1); + } + } + + lua_settop(L, funcs_top); + } + + /* Now detect content */ + if (content_func_pos != -1 && part->parsed_data.len > 0 && + part->part_type == RSPAMD_MIME_PART_UNDEFINED) { + struct rspamd_mime_part **pmime; + struct rspamd_task **ptask; + + lua_pushcfunction(L, &rspamd_lua_traceback); + gint err_idx = lua_gettop(L); + lua_pushvalue(L, content_func_pos); + pmime = lua_newuserdata(L, sizeof(struct rspamd_mime_part *)); + rspamd_lua_setclass(L, "rspamd{mimepart}", -1); + *pmime = part; + ptask = lua_newuserdata(L, sizeof(struct rspamd_task *)); + rspamd_lua_setclass(L, "rspamd{task}", -1); + *ptask = task; + + if (lua_pcall(L, 2, 0, err_idx) != 0) { + msg_err_task("cannot detect content: %s", lua_tostring(L, -1)); + } + + lua_settop(L, funcs_top); + } + + /* Try to detect 
image before checking for text */ + rspamd_images_process_mime_part_maybe(task, part); + + if (part->part_type == RSPAMD_MIME_PART_UNDEFINED && + !(part->flags & RSPAMD_MIME_PART_NO_TEXT_EXTRACTION)) { + enum rspamd_message_part_is_text_result res = rspamd_message_part_can_be_parsed_as_text(task, part); + + if (res != RSPAMD_MESSAGE_PART_IS_NOT_TEXT) { + struct rspamd_mime_part_text_position p = { + .pos = i, + .res = res}; + g_array_append_val(detected_text_parts, p); + } + } + } + + uint16_t cur_url_order = 0; + g_array_sort(detected_text_parts, rspamd_mime_text_part_position_compare_func); + /* One more iteration to process text parts in a more specific order */ + for (i = 0; i < detected_text_parts->len; i++) { + part = g_ptr_array_index(MESSAGE_FIELD(task, parts), + g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).pos); + rspamd_message_process_text_part_maybe(task, part, + g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).res, &cur_url_order); + } + + g_array_free(detected_text_parts, TRUE); + + if (old_top != -1) { + lua_settop(L, old_top); + } + + /* Parse urls inside Subject header */ + if (MESSAGE_FIELD(task, subject)) { + rspamd_url_find_multiple(task->task_pool, MESSAGE_FIELD(task, subject), + strlen(MESSAGE_FIELD(task, subject)), + RSPAMD_URL_FIND_STRICT, NULL, + rspamd_url_task_subject_callback, + task); + } + + /* Calculate average words length and number of short words */ + struct rspamd_mime_text_part *text_part; + gdouble *var; + guint total_words = 0; + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) + { + if (!text_part->language) { + rspamd_mime_part_detect_language(task, text_part); + } + + rspamd_mime_part_extract_words(task, text_part); + + if (text_part->utf_words) { + total_words += text_part->nwords; + } + } + + /* Calculate distance for 2-parts messages */ + if (i == 2) { + p1 = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 0); + p2 = 
g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 1); + + /* First of all check parent object */ + if (p1->mime_part->parent_part) { + rspamd_ftok_t srch; + + srch.begin = "alternative"; + srch.len = 11; + + if (rspamd_ftok_cmp(&p1->mime_part->parent_part->ct->subtype, &srch) == 0) { + if (!IS_TEXT_PART_EMPTY(p1) && !IS_TEXT_PART_EMPTY(p2) && + p1->normalized_hashes && p2->normalized_hashes) { + /* + * We also detect language on one part and propagate it to + * another one + */ + struct rspamd_mime_text_part *sel; + + /* Prefer HTML as text part is not displayed normally */ + if (IS_TEXT_PART_HTML(p1)) { + sel = p1; + } + else if (IS_TEXT_PART_HTML(p2)) { + sel = p2; + } + else { + if (p1->utf_content.len > p2->utf_content.len) { + sel = p1; + } + else { + sel = p2; + } + } + + if (sel->language && sel->language[0]) { + /* Propagate language */ + if (sel == p1) { + if (p2->languages) { + g_ptr_array_unref(p2->languages); + } + + p2->language = sel->language; + p2->languages = g_ptr_array_ref(sel->languages); + } + else { + if (p1->languages) { + g_ptr_array_unref(p1->languages); + } + + p1->language = sel->language; + p1->languages = g_ptr_array_ref(sel->languages); + } + } + + tw = p1->normalized_hashes->len + p2->normalized_hashes->len; + + if (tw > 0) { + dw = rspamd_words_levenshtein_distance(task, + p1->normalized_hashes, + p2->normalized_hashes); + diff = dw / (gdouble) tw; + + msg_debug_task( + "different words: %d, total words: %d, " + "got diff between parts of %.2f", + dw, tw, + diff); + + pdiff = rspamd_mempool_alloc(task->task_pool, + sizeof(gdouble)); + *pdiff = diff; + rspamd_mempool_set_variable(task->task_pool, + "parts_distance", + pdiff, + NULL); + ptw = rspamd_mempool_alloc(task->task_pool, + sizeof(gint)); + *ptw = tw; + rspamd_mempool_set_variable(task->task_pool, + "total_words", + ptw, + NULL); + } + } + } + } + else { + debug_task( + "message contains two parts but they are in different multi-parts"); + } + } + + if (total_words > 0) { + 
var = rspamd_mempool_get_variable(task->task_pool, + RSPAMD_MEMPOOL_AVG_WORDS_LEN); + + if (var) { + *var /= (double) total_words; + } + + var = rspamd_mempool_get_variable(task->task_pool, + RSPAMD_MEMPOOL_SHORT_WORDS_CNT); + + if (var) { + *var /= (double) total_words; + } + } + + rspamd_images_link(task); + rspamd_tokenize_meta_words(task); +} + + +struct rspamd_message * +rspamd_message_ref(struct rspamd_message *msg) +{ + REF_RETAIN(msg); + + return msg; +} + +void rspamd_message_unref(struct rspamd_message *msg) +{ + if (msg) { + REF_RELEASE(msg); + } +} + +void rspamd_message_update_digest(struct rspamd_message *msg, + const void *input, gsize len) +{ + guint64 n[2]; + /* Sanity */ + G_STATIC_ASSERT(sizeof(n) == sizeof(msg->digest)); + + memcpy(n, msg->digest, sizeof(msg->digest)); + n[0] = t1ha2_atonce128(&n[1], input, len, n[0]); + memcpy(msg->digest, n, sizeof(msg->digest)); +} diff --git a/src/libmime/message.h b/src/libmime/message.h new file mode 100644 index 0000000..52dedab --- /dev/null +++ b/src/libmime/message.h @@ -0,0 +1,239 @@ +/** + * @file message.h + * Message processing functions and structures + */ + +#ifndef RSPAMD_MESSAGE_H +#define RSPAMD_MESSAGE_H + +#include "config.h" + +#include "libmime/email_addr.h" +#include "libutil/addr.h" +#include "libcryptobox/cryptobox.h" +#include "libmime/mime_headers.h" +#include "libmime/content_type.h" +#include "libserver/url.h" +#include "libutil/ref.h" +#include "libutil/str_util.h" + +#include <unicode/uchar.h> +#include <unicode/utext.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_task; +struct controller_session; +struct rspamd_image; +struct rspamd_archive; + +enum rspamd_mime_part_flags { + RSPAMD_MIME_PART_ATTACHEMENT = (1u << 1u), + RSPAMD_MIME_PART_BAD_CTE = (1u << 4u), + RSPAMD_MIME_PART_MISSING_CTE = (1u << 5u), + RSPAMD_MIME_PART_NO_TEXT_EXTRACTION = (1u << 6u), +}; + +enum rspamd_mime_part_type { + RSPAMD_MIME_PART_UNDEFINED = 0, + RSPAMD_MIME_PART_MULTIPART, + 
RSPAMD_MIME_PART_MESSAGE, + RSPAMD_MIME_PART_TEXT, + RSPAMD_MIME_PART_ARCHIVE, + RSPAMD_MIME_PART_IMAGE, + RSPAMD_MIME_PART_CUSTOM_LUA +}; + +#define IS_PART_MULTIPART(part) ((part) && ((part)->part_type == RSPAMD_MIME_PART_MULTIPART)) +#define IS_PART_TEXT(part) ((part) && ((part)->part_type == RSPAMD_MIME_PART_TEXT)) +#define IS_PART_MESSAGE(part) ((part) && ((part)->part_type == RSPAMD_MIME_PART_MESSAGE)) + +enum rspamd_cte { + RSPAMD_CTE_UNKNOWN = 0, + RSPAMD_CTE_7BIT = 1, + RSPAMD_CTE_8BIT = 2, + RSPAMD_CTE_QP = 3, + RSPAMD_CTE_B64 = 4, + RSPAMD_CTE_UUE = 5, +}; + +struct rspamd_mime_text_part; + +struct rspamd_mime_multipart { + GPtrArray *children; + rspamd_ftok_t boundary; +}; + +enum rspamd_lua_specific_type { + RSPAMD_LUA_PART_TEXT, + RSPAMD_LUA_PART_STRING, + RSPAMD_LUA_PART_TABLE, + RSPAMD_LUA_PART_FUNCTION, + RSPAMD_LUA_PART_UNKNOWN, +}; + +struct rspamd_lua_specific_part { + gint cbref; + enum rspamd_lua_specific_type type; +}; + +struct rspamd_mime_part { + struct rspamd_content_type *ct; + struct rspamd_content_type *detected_ct; + gchar *detected_type; + gchar *detected_ext; + struct rspamd_content_disposition *cd; + rspamd_ftok_t raw_data; + rspamd_ftok_t parsed_data; + struct rspamd_mime_part *parent_part; + + struct rspamd_mime_header *headers_order; + struct rspamd_mime_headers_table *raw_headers; + GPtrArray *urls; + + gchar *raw_headers_str; + gsize raw_headers_len; + + enum rspamd_cte cte; + guint flags; + enum rspamd_mime_part_type part_type; + guint part_number; + + union { + struct rspamd_mime_multipart *mp; + struct rspamd_mime_text_part *txt; + struct rspamd_image *img; + struct rspamd_archive *arch; + struct rspamd_lua_specific_part lua_specific; + } specific; + + guchar digest[rspamd_cryptobox_HASHBYTES]; +}; + +#define RSPAMD_MIME_TEXT_PART_FLAG_UTF (1 << 0) +#define RSPAMD_MIME_TEXT_PART_FLAG_EMPTY (1 << 1) +#define RSPAMD_MIME_TEXT_PART_FLAG_HTML (1 << 2) +#define RSPAMD_MIME_TEXT_PART_FLAG_8BIT_RAW (1 << 3) +#define 
RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED (1 << 4) +#define RSPAMD_MIME_TEXT_PART_ATTACHMENT (1 << 5) + +#define IS_TEXT_PART_EMPTY(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_EMPTY) +#define IS_TEXT_PART_UTF(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_UTF) +#define IS_TEXT_PART_HTML(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_HTML) +#define IS_TEXT_PART_ATTACHMENT(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_ATTACHMENT) + + +struct rspamd_mime_text_part { + const gchar *language; + GPtrArray *languages; + const gchar *real_charset; + + /* Raw data in native encoding */ + rspamd_ftok_t raw; + rspamd_ftok_t parsed; /* decoded from mime encodings */ + + /* UTF8 content */ + rspamd_ftok_t utf_content; /* utf8 encoded processed content */ + GByteArray *utf_raw_content; /* utf raw content */ + GByteArray *utf_stripped_content; /* utf content with no newlines */ + GArray *normalized_hashes; /* Array of guint64 */ + GArray *utf_words; /* Array of rspamd_stat_token_t */ + UText utf_stripped_text; /* Used by libicu to represent the utf8 content */ + + GPtrArray *newlines; /**< positions of newlines in text, relative to content*/ + void *html; + GList *exceptions; /**< list of offsets of urls */ + struct rspamd_mime_part *mime_part; + + guint flags; + guint nlines; + guint spaces; + guint nwords; + guint non_ascii_chars; + guint ascii_chars; + guint double_spaces; + guint non_spaces; + guint empty_lines; + guint capital_letters; + guint numeric_characters; + guint unicode_scripts; +}; + +struct rspamd_message_raw_headers_content { + const gchar *begin; + gsize len; + const gchar *body_start; +}; + +struct rspamd_message { + const gchar *message_id; + gchar *subject; + + GPtrArray *parts; /**< list of parsed parts */ + GPtrArray *text_parts; /**< list of text parts */ + struct rspamd_message_raw_headers_content raw_headers_content; + void *received_headers; /**< list of received headers */ + khash_t(rspamd_url_hash) * urls; + struct rspamd_mime_headers_table 
*raw_headers; /**< list of raw headers */ + struct rspamd_mime_header *headers_order; /**< order of raw headers */ + struct rspamd_task *task; + GPtrArray *rcpt_mime; + GPtrArray *from_mime; + guchar digest[16]; + enum rspamd_newlines_type nlines_type; /**< type of newlines (detected on most of headers */ + ref_entry_t ref; +}; + +#define MESSAGE_FIELD(task, field) ((task)->message->field) +#define MESSAGE_FIELD_CHECK(task, field) ((task)->message ? (task)->message->field : (__typeof__((task)->message->field)) NULL) + +/** + * Parse and pre-process mime message + * @param task worker_task object + * @return + */ +gboolean rspamd_message_parse(struct rspamd_task *task); + +/** + * Process content in task (e.g. HTML parsing) + * @param task + */ +void rspamd_message_process(struct rspamd_task *task); + + +/** + * Converts string to cte + * @param str + * @return + */ +enum rspamd_cte rspamd_cte_from_string(const gchar *str); + +/** + * Converts cte to string + * @param ct + * @return + */ +const gchar *rspamd_cte_to_string(enum rspamd_cte ct); + +struct rspamd_message *rspamd_message_new(struct rspamd_task *task); + +struct rspamd_message *rspamd_message_ref(struct rspamd_message *msg); + +void rspamd_message_unref(struct rspamd_message *msg); + +/** + * Updates digest of the message if modified + * @param msg + * @param input + * @param len + */ +void rspamd_message_update_digest(struct rspamd_message *msg, + const void *input, gsize len); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c new file mode 100644 index 0000000..48a97a4 --- /dev/null +++ b/src/libmime/mime_encoding.c @@ -0,0 +1,864 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "libutil/mem_pool.h" +#include "libutil/regexp.h" +#include "libutil/hash.h" +#include "libserver/cfg_file.h" +#include "libserver/task.h" +#include "mime_encoding.h" +#include "message.h" +#include "contrib/fastutf8/fastutf8.h" +#include "contrib/google-ced/ced_c.h" +#include <unicode/ucnv.h> +#if U_ICU_VERSION_MAJOR_NUM >= 44 +#include <unicode/unorm2.h> +#endif +#include <math.h> + +#define UTF8_CHARSET "UTF-8" + +#define RSPAMD_CHARSET_FLAG_UTF (1 << 0) +#define RSPAMD_CHARSET_FLAG_ASCII (1 << 1) + +#define RSPAMD_CHARSET_CACHE_SIZE 32 +#define RSPAMD_CHARSET_MAX_CONTENT 512 + +#define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF) +#define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF) + +static rspamd_regexp_t *utf_compatible_re = NULL; + +struct rspamd_charset_substitution { + const gchar *input; + const gchar *canon; + gint flags; +}; + +#include "mime_encoding_list.h" + +static GHashTable *sub_hash = NULL; + +static const UChar iso_8859_16_map[] = { + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F, + 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, + 0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7, + 0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B, + 0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7, + 0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 
0x0178, 0x017C, + 0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7, + 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, + 0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A, + 0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF, + 0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7, + 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, + 0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B, + 0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF}; + +struct rspamd_charset_converter { + gchar *canon_name; + union { + UConverter *conv; + const UChar *cnv_table; + } d; + gboolean is_internal; +}; + +static GQuark +rspamd_charset_conv_error_quark(void) +{ + return g_quark_from_static_string("charset conversion error"); +} + +static void +rspamd_converter_dtor(gpointer p) +{ + struct rspamd_charset_converter *c = (struct rspamd_charset_converter *) p; + + if (!c->is_internal) { + ucnv_close(c->d.conv); + } + + g_free(c->canon_name); + g_free(c); +} + +int32_t +rspamd_converter_to_uchars(struct rspamd_charset_converter *cnv, + UChar *dest, + int32_t destCapacity, + const char *src, + int32_t srcLength, + UErrorCode *pErrorCode) +{ + if (!cnv->is_internal) { + return ucnv_toUChars(cnv->d.conv, + dest, destCapacity, + src, srcLength, + pErrorCode); + } + else { + UChar *d = dest, *dend = dest + destCapacity; + const guchar *p = src, *end = src + srcLength; + + while (p < end && d < dend) { + if (*p <= 127) { + *d++ = (UChar) *p; + } + else { + *d++ = cnv->d.cnv_table[*p - 128]; + } + + p++; + } + + return d - dest; + } +} + + +struct rspamd_charset_converter * +rspamd_mime_get_converter_cached(const gchar *enc, + rspamd_mempool_t *pool, + gboolean is_canon, + UErrorCode *err) +{ + const gchar *canon_name; + static rspamd_lru_hash_t *cache; + struct rspamd_charset_converter *conv; + + if (cache == NULL) { + cache = rspamd_lru_hash_new_full(RSPAMD_CHARSET_CACHE_SIZE, NULL, + 
rspamd_converter_dtor, rspamd_str_hash, + rspamd_str_equal); + } + + if (enc == NULL) { + return NULL; + } + + if (!is_canon) { + rspamd_ftok_t cset_tok; + + RSPAMD_FTOK_FROM_STR(&cset_tok, enc); + canon_name = rspamd_mime_detect_charset(&cset_tok, pool); + } + else { + canon_name = enc; + } + + if (canon_name == NULL) { + return NULL; + } + + conv = rspamd_lru_hash_lookup(cache, (gpointer) canon_name, 0); + + if (conv == NULL) { + if (!(strcmp(canon_name, "ISO-8859-16") == 0 || + strcmp(canon_name, "latin10") == 0 || + strcmp(canon_name, "iso-ir-226") == 0)) { + conv = g_malloc0(sizeof(*conv)); + conv->d.conv = ucnv_open(canon_name, err); + conv->canon_name = g_strdup(canon_name); + + if (conv->d.conv != NULL) { + ucnv_setToUCallBack(conv->d.conv, + UCNV_TO_U_CALLBACK_SUBSTITUTE, + NULL, + NULL, + NULL, + err); + rspamd_lru_hash_insert(cache, conv->canon_name, conv, 0, 0); + } + else { + g_free(conv); + conv = NULL; + } + } + else { + /* ISO-8859-16 */ + conv = g_malloc0(sizeof(*conv)); + conv->is_internal = TRUE; + conv->d.cnv_table = iso_8859_16_map; + conv->canon_name = g_strdup(canon_name); + + rspamd_lru_hash_insert(cache, conv->canon_name, conv, 0, 0); + } + } + + return conv; +} + +static void +rspamd_mime_encoding_substitute_init(void) +{ + guint i; + + sub_hash = g_hash_table_new(rspamd_strcase_hash, rspamd_strcase_equal); + + for (i = 0; i < G_N_ELEMENTS(sub); i++) { + g_hash_table_insert(sub_hash, (void *) sub[i].input, (void *) &sub[i]); + } +} + +static void +rspamd_charset_normalize(gchar *in) +{ + /* + * This is a simple routine to validate input charset + * we just check that charset starts with alphanumeric and ends + * with alphanumeric + */ + gchar *begin, *end; + gboolean changed = FALSE; + + begin = in; + + while (*begin && !g_ascii_isalnum(*begin)) { + begin++; + changed = TRUE; + } + + end = begin + strlen(begin) - 1; + + while (end > begin && !g_ascii_isalnum(*end)) { + end--; + changed = TRUE; + } + + if (changed) { + memmove(in, begin, 
end - begin + 2); + *(end + 1) = '\0'; + } +} + +const gchar * +rspamd_mime_detect_charset(const rspamd_ftok_t *in, rspamd_mempool_t *pool) +{ + gchar *ret = NULL, *h, *t; + struct rspamd_charset_substitution *s; + const gchar *cset; + rspamd_ftok_t utf8_tok; + UErrorCode uc_err = U_ZERO_ERROR; + + if (sub_hash == NULL) { + rspamd_mime_encoding_substitute_init(); + } + + /* Fast path */ + RSPAMD_FTOK_ASSIGN(&utf8_tok, "utf-8"); + + if (rspamd_ftok_casecmp(in, &utf8_tok) == 0) { + return UTF8_CHARSET; + } + + RSPAMD_FTOK_ASSIGN(&utf8_tok, "utf8"); + + if (rspamd_ftok_casecmp(in, &utf8_tok) == 0) { + return UTF8_CHARSET; + } + + ret = rspamd_mempool_ftokdup(pool, in); + rspamd_charset_normalize(ret); + + if ((in->len > 3 && rspamd_lc_cmp(in->begin, "cp-", 3) == 0) || + (in->len > 4 && (rspamd_lc_cmp(in->begin, "ibm-", 4) == 0))) { + /* Try to remove '-' chars from encoding: e.g. CP-100 to CP100 */ + h = ret; + t = ret; + + while (*h != '\0') { + if (*h != '-') { + *t++ = *h; + } + + h++; + } + + *t = '\0'; + } + + s = g_hash_table_lookup(sub_hash, ret); + + if (s) { + ret = (char *) s->canon; + } + + /* Try different aliases */ + cset = ucnv_getCanonicalName(ret, "MIME", &uc_err); + + if (cset == NULL) { + uc_err = U_ZERO_ERROR; + cset = ucnv_getCanonicalName(ret, "IANA", &uc_err); + } + + if (cset == NULL) { + uc_err = U_ZERO_ERROR; + cset = ucnv_getCanonicalName(ret, "", &uc_err); + } + + if (cset == NULL) { + uc_err = U_ZERO_ERROR; + cset = ucnv_getAlias(ret, 0, &uc_err); + } + + return cset; +} + +gchar * +rspamd_mime_text_to_utf8(rspamd_mempool_t *pool, + gchar *input, gsize len, const gchar *in_enc, + gsize *olen, GError **err) +{ + gchar *d; + gint32 r, clen, dlen; + UChar *tmp_buf; + + UErrorCode uc_err = U_ZERO_ERROR; + UConverter *utf8_converter; + struct rspamd_charset_converter *conv; + rspamd_ftok_t cset_tok; + + /* Check if already utf8 */ + RSPAMD_FTOK_FROM_STR(&cset_tok, in_enc); + + if (rspamd_mime_charset_utf_check(&cset_tok, input, len, + FALSE)) { 
+ d = rspamd_mempool_alloc(pool, len); + memcpy(d, input, len); + if (olen) { + *olen = len; + } + + return d; + } + + conv = rspamd_mime_get_converter_cached(in_enc, pool, TRUE, &uc_err); + utf8_converter = rspamd_get_utf8_converter(); + + if (conv == NULL) { + g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL, + "cannot open converter for %s: %s", + in_enc, u_errorName(uc_err)); + + return NULL; + } + + tmp_buf = g_new(UChar, len + 1); + uc_err = U_ZERO_ERROR; + r = rspamd_converter_to_uchars(conv, tmp_buf, len + 1, input, len, &uc_err); + + if (!U_SUCCESS(uc_err)) { + g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL, + "cannot convert data to unicode from %s: %s", + in_enc, u_errorName(uc_err)); + g_free(tmp_buf); + + return NULL; + } + + /* Now, convert to utf8 */ + clen = ucnv_getMaxCharSize(utf8_converter); + dlen = UCNV_GET_MAX_BYTES_FOR_STRING(r, clen); + d = rspamd_mempool_alloc(pool, dlen); + r = ucnv_fromUChars(utf8_converter, d, dlen, tmp_buf, r, &uc_err); + + if (!U_SUCCESS(uc_err)) { + g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL, + "cannot convert data from unicode from %s: %s", + in_enc, u_errorName(uc_err)); + g_free(tmp_buf); + + return NULL; + } + + msg_debug_pool("converted from %s to UTF-8 inlen: %z, outlen: %d", + in_enc, len, r); + g_free(tmp_buf); + + if (olen) { + *olen = r; + } + + return d; +} + +static gboolean +rspamd_mime_text_part_utf8_convert(struct rspamd_task *task, + struct rspamd_mime_text_part *text_part, + GByteArray *input, + const gchar *charset, + GError **err) +{ + gchar *d; + gint32 r, clen, dlen, uc_len; + UChar *tmp_buf; + UErrorCode uc_err = U_ZERO_ERROR; + UConverter *utf8_converter; + struct rspamd_charset_converter *conv; + + conv = rspamd_mime_get_converter_cached(charset, task->task_pool, + TRUE, &uc_err); + utf8_converter = rspamd_get_utf8_converter(); + + if (conv == NULL) { + g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL, + "cannot open converter for %s: %s", + 
charset, u_errorName(uc_err)); + + return FALSE; + } + + tmp_buf = g_new(UChar, input->len + 1); + uc_err = U_ZERO_ERROR; + uc_len = rspamd_converter_to_uchars(conv, + tmp_buf, + input->len + 1, + input->data, + input->len, + &uc_err); + + if (!U_SUCCESS(uc_err)) { + g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL, + "cannot convert data to unicode from %s: %s", + charset, u_errorName(uc_err)); + g_free(tmp_buf); + + return FALSE; + } + + /* Now, convert to utf8 */ + clen = ucnv_getMaxCharSize(utf8_converter); + dlen = UCNV_GET_MAX_BYTES_FOR_STRING(uc_len, clen); + d = rspamd_mempool_alloc(task->task_pool, dlen); + r = ucnv_fromUChars(utf8_converter, d, dlen, + tmp_buf, uc_len, &uc_err); + + if (!U_SUCCESS(uc_err)) { + g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL, + "cannot convert data from unicode from %s: %s", + charset, u_errorName(uc_err)); + g_free(tmp_buf); + + return FALSE; + } + + if (text_part->mime_part && text_part->mime_part->ct) { + msg_info_task("converted text part from %s ('%T' announced) to UTF-8 inlen: %d, outlen: %d (%d UTF16 chars)", + charset, &text_part->mime_part->ct->charset, input->len, r, uc_len); + } + else { + msg_info_task("converted text part from %s (no charset announced) to UTF-8 inlen: %d, " + "outlen: %d (%d UTF16 chars)", + charset, input->len, r, uc_len); + } + + text_part->utf_raw_content = rspamd_mempool_alloc(task->task_pool, + sizeof(*text_part->utf_raw_content) + sizeof(gpointer) * 4); + text_part->utf_raw_content->data = d; + text_part->utf_raw_content->len = r; + g_free(tmp_buf); + + return TRUE; +} + +gboolean +rspamd_mime_to_utf8_byte_array(GByteArray *in, + GByteArray *out, + rspamd_mempool_t *pool, + const gchar *enc) +{ + gint32 r, clen, dlen; + UChar *tmp_buf; + UErrorCode uc_err = U_ZERO_ERROR; + UConverter *utf8_converter; + struct rspamd_charset_converter *conv; + rspamd_ftok_t charset_tok; + + if (in == NULL || in->len == 0) { + return FALSE; + } + + if (enc == NULL) { + /* Assume utf ? 
*/ + if (rspamd_fast_utf8_validate(in->data, in->len) == 0) { + g_byte_array_set_size(out, in->len); + memcpy(out->data, in->data, out->len); + + return TRUE; + } + else { + /* Bad stuff, keep out */ + return FALSE; + } + } + + RSPAMD_FTOK_FROM_STR(&charset_tok, enc); + + if (rspamd_mime_charset_utf_check(&charset_tok, (gchar *) in->data, in->len, + FALSE)) { + g_byte_array_set_size(out, in->len); + memcpy(out->data, in->data, out->len); + + return TRUE; + } + + utf8_converter = rspamd_get_utf8_converter(); + conv = rspamd_mime_get_converter_cached(enc, pool, TRUE, &uc_err); + + if (conv == NULL) { + return FALSE; + } + + tmp_buf = g_new(UChar, in->len + 1); + uc_err = U_ZERO_ERROR; + r = rspamd_converter_to_uchars(conv, + tmp_buf, in->len + 1, + in->data, in->len, &uc_err); + + if (!U_SUCCESS(uc_err)) { + g_free(tmp_buf); + + return FALSE; + } + + /* Now, convert to utf8 */ + clen = ucnv_getMaxCharSize(utf8_converter); + dlen = UCNV_GET_MAX_BYTES_FOR_STRING(r, clen); + g_byte_array_set_size(out, dlen); + r = ucnv_fromUChars(utf8_converter, out->data, dlen, tmp_buf, r, &uc_err); + + if (!U_SUCCESS(uc_err)) { + g_free(tmp_buf); + + return FALSE; + } + + g_free(tmp_buf); + out->len = r; + + return TRUE; +} + +void rspamd_mime_charset_utf_enforce(gchar *in, gsize len) +{ + gchar *p, *end; + goffset err_offset; + UChar32 uc = 0; + + /* Now we validate input and replace bad characters with '?' 
symbol */ + p = in; + end = in + len; + + while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate(p, len)) > 0) { + err_offset--; /* As it returns it 1 indexed */ + gint32 cur_offset = err_offset; + + while (cur_offset < len) { + gint32 tmp = cur_offset; + + U8_NEXT(p, cur_offset, len, uc); + + if (uc > 0) { + /* Fill string between err_offset and tmp with `?` character */ + memset(p + err_offset, '?', tmp - err_offset); + break; + } + } + + if (uc < 0) { + /* Fill till the end */ + memset(p + err_offset, '?', len - err_offset); + break; + } + + p += cur_offset; + len = end - p; + } +} + +const char * +rspamd_mime_charset_find_by_content(const gchar *in, gsize inlen, + bool check_utf8) +{ + int nconsumed; + bool is_reliable; + const gchar *ced_name; + + if (check_utf8) { + if (rspamd_fast_utf8_validate(in, inlen) == 0) { + return UTF8_CHARSET; + } + } + + + ced_name = ced_encoding_detect(in, inlen, NULL, NULL, + NULL, 0, CED_EMAIL_CORPUS, + false, &nconsumed, &is_reliable); + + if (ced_name) { + + return ced_name; + } + + return NULL; +} + +static const char * +rspamd_mime_charset_find_by_content_maybe_split(const gchar *in, gsize inlen) +{ + if (inlen < RSPAMD_CHARSET_MAX_CONTENT * 3) { + return rspamd_mime_charset_find_by_content(in, inlen, false); + } + else { + const gchar *c1, *c2, *c3; + + c1 = rspamd_mime_charset_find_by_content(in, RSPAMD_CHARSET_MAX_CONTENT, false); + c2 = rspamd_mime_charset_find_by_content(in + inlen / 2, + RSPAMD_CHARSET_MAX_CONTENT, false); + c3 = rspamd_mime_charset_find_by_content(in + inlen - RSPAMD_CHARSET_MAX_CONTENT, + RSPAMD_CHARSET_MAX_CONTENT, false); + + /* 7bit stuff */ + if (c1 && strcmp(c1, "US-ASCII") == 0) { + c1 = NULL; /* Invalid - we have 8 bit there */ + } + if (c2 && strcmp(c2, "US-ASCII") == 0) { + c2 = NULL; /* Invalid - we have 8 bit there */ + } + if (c3 && strcmp(c3, "US-ASCII") == 0) { + c3 = NULL; /* Invalid - we have 8 bit there */ + } + + if (!c1) { + c1 = c2 ? 
c2 : c3; + } + if (!c2) { + c2 = c3 ? c3 : c1; + } + if (!c3) { + c3 = c1 ? c2 : c1; + } + + if (c1 && c2 && c3) { + /* Quorum */ + if (c1 == c2) { + return c1; + } + else if (c2 == c3) { + return c2; + } + else if (c1 == c3) { + return c3; + } + + /* All charsets are distinct. Use the one from the top */ + return c1; + } + + return NULL; + } +} + +gboolean +rspamd_mime_charset_utf_check(rspamd_ftok_t *charset, + gchar *in, gsize len, gboolean content_check) +{ + const gchar *real_charset; + + if (utf_compatible_re == NULL) { + utf_compatible_re = rspamd_regexp_new( + "^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:ansi.*)|(?:CSASCII)$", + "i", NULL); + } + + if (charset->len == 0 || + rspamd_regexp_match(utf_compatible_re, + charset->begin, charset->len, TRUE)) { + /* + * In case of UTF8 charset we still can check the content to find + * corner cases + */ + if (content_check) { + if (rspamd_fast_utf8_validate(in, len) != 0) { + real_charset = rspamd_mime_charset_find_by_content_maybe_split(in, len); + + if (real_charset) { + + if (rspamd_regexp_match(utf_compatible_re, + real_charset, strlen(real_charset), TRUE)) { + RSPAMD_FTOK_ASSIGN(charset, UTF8_CHARSET); + + return TRUE; + } + else { + charset->begin = real_charset; + charset->len = strlen(real_charset); + + return FALSE; + } + } + + rspamd_mime_charset_utf_enforce(in, len); + } + } + + return TRUE; + } + + return FALSE; +} + +void rspamd_mime_text_part_maybe_convert(struct rspamd_task *task, + struct rspamd_mime_text_part *text_part) +{ + GError *err = NULL; + const gchar *charset = NULL; + gboolean checked = FALSE, need_charset_heuristic = TRUE, valid_utf8 = FALSE; + GByteArray *part_content; + rspamd_ftok_t charset_tok; + struct rspamd_mime_part *part = text_part->mime_part; + + if (rspamd_str_has_8bit(text_part->raw.begin, text_part->raw.len)) { + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_RAW; + } + + /* Allocate copy storage */ + part_content = g_byte_array_sized_new(text_part->parsed.len); + 
memcpy(part_content->data, text_part->parsed.begin, text_part->parsed.len); + part_content->len = text_part->parsed.len; + rspamd_mempool_notify_alloc(task->task_pool, + part_content->len); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) g_byte_array_unref, part_content); + + if (rspamd_str_has_8bit(text_part->parsed.begin, text_part->parsed.len)) { + if (rspamd_fast_utf8_validate(text_part->parsed.begin, text_part->parsed.len) == 0) { + /* Valid UTF, likely all good */ + need_charset_heuristic = FALSE; + valid_utf8 = TRUE; + checked = TRUE; + } + + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED; + } + else { + /* All 7bit characters, assume it valid utf */ + need_charset_heuristic = FALSE; + valid_utf8 = TRUE; + checked = TRUE; /* Already valid utf, no need in further checks */ + } + + if (part->ct->charset.len == 0) { + if (need_charset_heuristic) { + charset = rspamd_mime_charset_find_by_content_maybe_split(text_part->parsed.begin, + text_part->parsed.len); + + if (charset != NULL) { + msg_info_task("detected charset %s", charset); + } + + checked = TRUE; + text_part->real_charset = charset; + } + else if (valid_utf8) { + SET_PART_UTF(text_part); + text_part->utf_raw_content = part_content; + text_part->real_charset = UTF8_CHARSET; + + return; + } + } + else { + charset = rspamd_mime_detect_charset(&part->ct->charset, + task->task_pool); + + if (charset == NULL) { + /* We don't know the real charset but can try heuristic */ + if (need_charset_heuristic) { + charset = rspamd_mime_charset_find_by_content_maybe_split(part_content->data, + part_content->len); + msg_info_task("detected charset: %s", charset); + checked = TRUE; + text_part->real_charset = charset; + } + else if (valid_utf8) { + /* We already know that the input is valid utf, so skip heuristic */ + text_part->real_charset = UTF8_CHARSET; + } + } + else { + text_part->real_charset = charset; + + if (strcmp(charset, UTF8_CHARSET) != 0) { + /* + * We have 
detected some charset, but we don't know which one, + * so we need to reset valid utf8 flag and enforce it later + */ + valid_utf8 = FALSE; + } + } + } + + if (text_part->real_charset == NULL) { + msg_info_task("<%s>: has invalid charset; original charset: %T; Content-Type: \"%s\"", + MESSAGE_FIELD_CHECK(task, message_id), &part->ct->charset, + part->ct->cpy); + SET_PART_RAW(text_part); + text_part->utf_raw_content = part_content; + + return; + } + + RSPAMD_FTOK_FROM_STR(&charset_tok, charset); + + if (!valid_utf8) { + if (rspamd_mime_charset_utf_check(&charset_tok, part_content->data, + part_content->len, !checked)) { + SET_PART_UTF(text_part); + text_part->utf_raw_content = part_content; + text_part->real_charset = UTF8_CHARSET; + + return; + } + else { + charset = charset_tok.begin; + + if (!rspamd_mime_text_part_utf8_convert(task, text_part, + part_content, charset, &err)) { + msg_warn_task("<%s>: cannot convert from %s to utf8: %s", + MESSAGE_FIELD(task, message_id), + charset, + err ? err->message : "unknown problem"); + SET_PART_RAW(text_part); + g_error_free(err); + + text_part->utf_raw_content = part_content; + return; + } + + SET_PART_UTF(text_part); + text_part->real_charset = charset; + } + } + else { + SET_PART_UTF(text_part); + text_part->utf_raw_content = part_content; + } +} diff --git a/src/libmime/mime_encoding.h b/src/libmime/mime_encoding.h new file mode 100644 index 0000000..ff81292 --- /dev/null +++ b/src/libmime/mime_encoding.h @@ -0,0 +1,148 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBMIME_MIME_ENCODING_H_ +#define SRC_LIBMIME_MIME_ENCODING_H_ + +#include "config.h" +#include "mem_pool.h" +#include "fstring.h" +#include <unicode/uchar.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_task; +struct rspamd_mime_part; +struct rspamd_mime_text_part; +struct rspamd_charset_converter; + +/** + * Convert charset alias to a canonic charset name + * @param pool pool to store temporary data + * @param in + * @return + */ +const gchar *rspamd_mime_detect_charset(const rspamd_ftok_t *in, + rspamd_mempool_t *pool); + +/** + * Convert text chunk to utf-8. Input encoding is substituted using + * `rspamd_mime_detect_charset`. + * If input encoding is already utf, this function returns input pointer. + * Memory is allocated from pool if a conversion is needed + * @param pool + * @param input + * @param len + * @param in_enc canon charset + * @param olen + * @param err + * @return + */ +gchar *rspamd_mime_text_to_utf8(rspamd_mempool_t *pool, + gchar *input, gsize len, const gchar *in_enc, + gsize *olen, GError **err); + +/** + * Converts data from `in` to `out`, + * returns `FALSE` if `in` is empty, if no converter is available for `enc` + * or if the conversion fails + * + * This function, in fact, copies `in` to `out`, replacing `out` content in + * total. + * @param in + * @param out + * @param enc validated canonical charset name. 
If NULL, then utf8 check is done only + * @return + */ +gboolean rspamd_mime_to_utf8_byte_array(GByteArray *in, + GByteArray *out, + rspamd_mempool_t *pool, + const gchar *enc); + +/** + * Maybe convert part to utf-8 + * @param task + * @param text_part + * @return + */ +void rspamd_mime_text_part_maybe_convert(struct rspamd_task *task, + struct rspamd_mime_text_part *text_part); + +/** + * Checks utf8 charset and normalize/validate utf8 string + * @param charset + * @param in + * @param len + * @return + */ +gboolean rspamd_mime_charset_utf_check(rspamd_ftok_t *charset, + gchar *in, gsize len, + gboolean content_check); + +/** + * Ensure that all characters in string are valid utf8 chars or replace them + * with '?' + * @param in + * @param len + */ +void rspamd_mime_charset_utf_enforce(gchar *in, gsize len); + +/** + * Gets cached converter + * @param enc input encoding + * @param pool pool to use for temporary normalisation + * @param is_canon TRUE if normalisation is needed + * @param err output error + * @return converter + */ +struct rspamd_charset_converter *rspamd_mime_get_converter_cached( + const gchar *enc, + rspamd_mempool_t *pool, + gboolean is_canon, + UErrorCode *err); + +/** + * Performs charset->utf16 conversion + * @param cnv + * @param dest + * @param destCapacity + * @param src + * @param srcLength + * @param pErrorCode + * @return + */ +gint32 +rspamd_converter_to_uchars(struct rspamd_charset_converter *cnv, + UChar *dest, + gint32 destCapacity, + const char *src, + gint32 srcLength, + UErrorCode *pErrorCode); + +/** + * Detect charset in text + * @param in + * @param inlen + * @return detected charset name or NULL + */ +const char *rspamd_mime_charset_find_by_content(const gchar *in, gsize inlen, + bool check_utf8); + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBMIME_MIME_ENCODING_H_ */ diff --git a/src/libmime/mime_encoding_list.h b/src/libmime/mime_encoding_list.h new file mode 100644 index 0000000..b5fc5e1 --- /dev/null +++ 
b/src/libmime/mime_encoding_list.h @@ -0,0 +1,1577 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBMIME_MIME_ENCODING_LIST_H_ +#define SRC_LIBMIME_MIME_ENCODING_LIST_H_ + +static const struct rspamd_charset_substitution sub[] = { + { + .input = "iso-646-us", + .canon = "ansi_x3.4-1986", + .flags = RSPAMD_CHARSET_FLAG_ASCII, + }, + { + .input = "ansi_x3.4-1968", + .canon = "ansi_x3.4-1986", + .flags = RSPAMD_CHARSET_FLAG_ASCII, + }, + { + .input = "iso-ir-6", + .canon = "ansi_x3.4-1986", + .flags = RSPAMD_CHARSET_FLAG_ASCII, + }, + { + .input = "iso_646.irv:1991", + .canon = "ansi_x3.4-1986", + .flags = RSPAMD_CHARSET_FLAG_ASCII, + }, + { + .input = "ascii", + .canon = "ansi_x3.4-1986", + .flags = RSPAMD_CHARSET_FLAG_ASCII, + }, + { + .input = "iso646-us", + .canon = "ansi_x3.4-1986", + .flags = RSPAMD_CHARSET_FLAG_ASCII, + }, + { + .input = "us", + .canon = "ansi_x3.4-1986", + .flags = RSPAMD_CHARSET_FLAG_ASCII, + }, + { + .input = "ibm367", + .canon = "ansi_x3.4-1986", + .flags = RSPAMD_CHARSET_FLAG_ASCII, + }, + { + .input = "cp367", + .canon = "ansi_x3.4-1986", + .flags = RSPAMD_CHARSET_FLAG_ASCII, + }, + { + .input = "csascii", + .canon = "ansi_x3.4-1986", + .flags = RSPAMD_CHARSET_FLAG_ASCII, + }, + { + .input = "ascii7", + .canon = "ansi_x3.4-1986", + .flags = RSPAMD_CHARSET_FLAG_ASCII, + }, + { + .input = "default", + .canon = "ansi_x3.4-1986", + .flags = RSPAMD_CHARSET_FLAG_ASCII, + }, 
+ { + .input = "646", + .canon = "ansi_x3.4-1986", + .flags = RSPAMD_CHARSET_FLAG_ASCII, + }, + { + .input = "iso_646.irv:1983", + .canon = "ansi_x3.4-1986", + .flags = RSPAMD_CHARSET_FLAG_ASCII, + }, + { + .input = "iso969-us", + .canon = "ansi_x3.4-1986", + .flags = RSPAMD_CHARSET_FLAG_ASCII, + }, + { + .input = "tw-big5", + .canon = "big5", + .flags = 0, + }, + { + .input = "csbig5", + .canon = "big5", + .flags = 0, + }, + { + .input = "hkscs-big5", + .canon = "big5-hkscs", + .flags = 0, + }, + { + .input = "big5hk", + .canon = "big5-hkscs", + .flags = 0, + }, + { + .input = "big5-hkscs:unicode", + .canon = "big5-hkscs", + .flags = 0, + }, + { + .input = "extended_unix_code_packed_format_for_japanese", + .canon = "euc-jp", + .flags = 0, + }, + { + .input = "cseucpkdfmtjapanese", + .canon = "euc-jp", + .flags = 0, + }, + { + .input = "x-eucjp", + .canon = "euc-jp", + .flags = 0, + }, + { + .input = "x-euc-jp", + .canon = "euc-jp", + .flags = 0, + }, + { + .input = "unicode-1-1-utf-8", + .canon = "utf-8", + .flags = RSPAMD_CHARSET_FLAG_UTF, + }, + { + .input = "cseuckr", + .canon = "euc-kr", + .flags = 0, + }, + { + .input = "5601", + .canon = "euc-kr", + .flags = 0, + }, + { + .input = "ksc-5601", + .canon = "euc-kr", + .flags = 0, + }, + { + .input = "ksc-5601-1987", + .canon = "euc-kr", + .flags = 0, + }, + { + .input = "ksc-5601_1987", + .canon = "euc-kr", + .flags = 0, + }, + { + .input = "ksc5601", + .canon = "euc-kr", + .flags = 0, + }, + { + .input = "cns11643", + .canon = "euc-tw", + .flags = 0, + }, + { + .input = "ibm-euctw", + .canon = "euc-tw", + .flags = 0, + }, + { + .input = "gb-18030", + .canon = "gb18030", + .flags = 0, + }, + { + .input = "ibm1392", + .canon = "gb18030", + .flags = 0, + }, + { + .input = "ibm-1392", + .canon = "gb18030", + .flags = 0, + }, + { + .input = "gb18030-2000", + .canon = "gb18030", + .flags = 0, + }, + { + .input = "gb-2312", + .canon = "gb2312", + .flags = 0, + }, + { + .input = "csgb2312", + .canon = "gb2312", + 
.flags = 0, + }, + { + .input = "euc_cn", + .canon = "gb2312", + .flags = 0, + }, + { + .input = "euccn", + .canon = "gb2312", + .flags = 0, + }, + { + .input = "euc-cn", + .canon = "gb2312", + .flags = 0, + }, + { + .input = "gb-k", + .canon = "gbk", + .flags = 0, + }, + { + .input = "iso_8859-1:1987", + .canon = "iso-8859-1", + .flags = 0, + }, + { + .input = "iso-ir-100", + .canon = "iso-8859-1", + .flags = 0, + }, + { + .input = "iso_8859-1", + .canon = "iso-8859-1", + .flags = 0, + }, + { + .input = "latin1", + .canon = "iso-8859-1", + .flags = 0, + }, + { + .input = "l1", + .canon = "iso-8859-1", + .flags = 0, + }, + { + .input = "ibm819", + .canon = "iso-8859-1", + .flags = 0, + }, + { + .input = "cp819", + .canon = "iso-8859-1", + .flags = 0, + }, + { + .input = "csisolatin1", + .canon = "iso-8859-1", + .flags = 0, + }, + { + .input = "819", + .canon = "iso-8859-1", + .flags = 0, + }, + { + .input = "cp819", + .canon = "iso-8859-1", + .flags = 0, + }, + { + .input = "iso8859-1", + .canon = "iso-8859-1", + .flags = 0, + }, + { + .input = "8859-1", + .canon = "iso-8859-1", + .flags = 0, + }, + { + .input = "iso8859_1", + .canon = "iso-8859-1", + .flags = 0, + }, + { + .input = "iso_8859_1", + .canon = "iso-8859-1", + .flags = 0, + }, + { + .input = "iso_8859-2:1987", + .canon = "iso-8859-2", + .flags = 0, + }, + { + .input = "iso-ir-101", + .canon = "iso-8859-2", + .flags = 0, + }, + { + .input = "iso_8859-2", + .canon = "iso-8859-2", + .flags = 0, + }, + { + .input = "latin2", + .canon = "iso-8859-2", + .flags = 0, + }, + { + .input = "l2", + .canon = "iso-8859-2", + .flags = 0, + }, + { + .input = "csisolatin2", + .canon = "iso-8859-2", + .flags = 0, + }, + { + .input = "912", + .canon = "iso-8859-2", + .flags = 0, + }, + { + .input = "cp912", + .canon = "iso-8859-2", + .flags = 0, + }, + { + .input = "ibm-912", + .canon = "iso-8859-2", + .flags = 0, + }, + { + .input = "ibm912", + .canon = "iso-8859-2", + .flags = 0, + }, + { + .input = "iso8859-2", + 
.canon = "iso-8859-2", + .flags = 0, + }, + { + .input = "8859-2", + .canon = "iso-8859-2", + .flags = 0, + }, + { + .input = "iso8859_2", + .canon = "iso-8859-2", + .flags = 0, + }, + { + .input = "iso_8859_2", + .canon = "iso-8859-2", + .flags = 0, + }, + { + .input = "iso_8859-3:1988", + .canon = "iso-8859-3", + .flags = 0, + }, + { + .input = "iso-ir-109", + .canon = "iso-8859-3", + .flags = 0, + }, + { + .input = "iso_8859-3", + .canon = "iso-8859-3", + .flags = 0, + }, + { + .input = "latin3", + .canon = "iso-8859-3", + .flags = 0, + }, + { + .input = "l3", + .canon = "iso-8859-3", + .flags = 0, + }, + { + .input = "csisolatin3", + .canon = "iso-8859-3", + .flags = 0, + }, + { + .input = "913", + .canon = "iso-8859-3", + .flags = 0, + }, + { + .input = "cp913", + .canon = "iso-8859-3", + .flags = 0, + }, + { + .input = "ibm-913", + .canon = "iso-8859-3", + .flags = 0, + }, + { + .input = "ibm913", + .canon = "iso-8859-3", + .flags = 0, + }, + { + .input = "iso8859-3", + .canon = "iso-8859-3", + .flags = 0, + }, + { + .input = "8859-3", + .canon = "iso-8859-3", + .flags = 0, + }, + { + .input = "iso8859_3", + .canon = "iso-8859-3", + .flags = 0, + }, + { + .input = "iso_8859_3", + .canon = "iso-8859-3", + .flags = 0, + }, + { + .input = "iso_8859-4:1988", + .canon = "iso-8859-4", + .flags = 0, + }, + { + .input = "iso-ir-110", + .canon = "iso-8859-4", + .flags = 0, + }, + { + .input = "iso_8859-4", + .canon = "iso-8859-4", + .flags = 0, + }, + { + .input = "latin4", + .canon = "iso-8859-4", + .flags = 0, + }, + { + .input = "l4", + .canon = "iso-8859-4", + .flags = 0, + }, + { + .input = "csisolatin4", + .canon = "iso-8859-4", + .flags = 0, + }, + { + .input = "914", + .canon = "iso-8859-4", + .flags = 0, + }, + { + .input = "cp914", + .canon = "iso-8859-4", + .flags = 0, + }, + { + .input = "ibm-914", + .canon = "iso-8859-4", + .flags = 0, + }, + { + .input = "ibm914", + .canon = "iso-8859-4", + .flags = 0, + }, + { + .input = "iso8859-4", + .canon = 
"iso-8859-4", + .flags = 0, + }, + { + .input = "8859-4", + .canon = "iso-8859-4", + .flags = 0, + }, + { + .input = "iso8859_4", + .canon = "iso-8859-4", + .flags = 0, + }, + { + .input = "iso_8859_4", + .canon = "iso-8859-4", + .flags = 0, + }, + { + .input = "iso_8859-5:1988", + .canon = "iso-8859-5", + .flags = 0, + }, + { + .input = "iso-ir-144", + .canon = "iso-8859-5", + .flags = 0, + }, + { + .input = "iso_8859-5", + .canon = "iso-8859-5", + .flags = 0, + }, + { + .input = "cyrillic", + .canon = "iso-8859-5", + .flags = 0, + }, + { + .input = "csisolatincyrillic", + .canon = "iso-8859-5", + .flags = 0, + }, + { + .input = "915", + .canon = "iso-8859-5", + .flags = 0, + }, + { + .input = "cp915", + .canon = "iso-8859-5", + .flags = 0, + }, + { + .input = "ibm-915", + .canon = "iso-8859-5", + .flags = 0, + }, + { + .input = "ibm915", + .canon = "iso-8859-5", + .flags = 0, + }, + { + .input = "iso8859-5", + .canon = "iso-8859-5", + .flags = 0, + }, + { + .input = "8859-5", + .canon = "iso-8859-5", + .flags = 0, + }, + { + .input = "iso8859_5", + .canon = "iso-8859-5", + .flags = 0, + }, + { + .input = "iso_8859_5", + .canon = "iso-8859-5", + .flags = 0, + }, + { + .input = "iso_8859-6:1987", + .canon = "iso-8859-6", + .flags = 0, + }, + { + .input = "iso-ir-127", + .canon = "iso-8859-6", + .flags = 0, + }, + { + .input = "iso_8859-6", + .canon = "iso-8859-6", + .flags = 0, + }, + { + .input = "ecma-114", + .canon = "iso-8859-6", + .flags = 0, + }, + { + .input = "asmo-708", + .canon = "iso-8859-6", + .flags = 0, + }, + { + .input = "arabic", + .canon = "iso-8859-6", + .flags = 0, + }, + { + .input = "csisolatinarabic", + .canon = "iso-8859-6", + .flags = 0, + }, + { + .input = "1089", + .canon = "iso-8859-6", + .flags = 0, + }, + { + .input = "cp1089", + .canon = "iso-8859-6", + .flags = 0, + }, + { + .input = "ibm-1089", + .canon = "iso-8859-6", + .flags = 0, + }, + { + .input = "ibm1089", + .canon = "iso-8859-6", + .flags = 0, + }, + { + .input = 
"iso8859-6", + .canon = "iso-8859-6", + .flags = 0, + }, + { + .input = "8859-6", + .canon = "iso-8859-6", + .flags = 0, + }, + { + .input = "iso8859_6", + .canon = "iso-8859-6", + .flags = 0, + }, + { + .input = "iso_8859_6", + .canon = "iso-8859-6", + .flags = 0, + }, + { + .input = "iso_8859-7:1987", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "iso-ir-126", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "iso_8859-7", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "elot_928", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "ecma-118", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "greek", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "greek8", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "csisolatingreek", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "813", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "cp813", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "ibm-813", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "ibm813", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "iso8859-7", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "8859-7", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "iso8859_7", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "iso_8859_7", + .canon = "iso-8859-7", + .flags = 0, + }, + { + .input = "iso_8859-8:1988", + .canon = "iso-8859-8", + .flags = 0, + }, + { + .input = "iso-ir-138", + .canon = "iso-8859-8", + .flags = 0, + }, + { + .input = "iso_8859-8", + .canon = "iso-8859-8", + .flags = 0, + }, + { + .input = "hebrew", + .canon = "iso-8859-8", + .flags = 0, + }, + { + .input = "csisolatinhebrew", + .canon = "iso-8859-8", + .flags = 0, + }, + { + .input = "916", + .canon = "iso-8859-8", + .flags = 0, + }, + { + .input = "cp916", + .canon = "iso-8859-8", + .flags = 0, + }, + { + .input = "ibm-916", + .canon = "iso-8859-8", + .flags = 0, + }, + { + .input 
= "ibm916", + .canon = "iso-8859-8", + .flags = 0, + }, + { + .input = "iso8859-8", + .canon = "iso-8859-8", + .flags = 0, + }, + { + .input = "8859-8", + .canon = "iso-8859-8", + .flags = 0, + }, + { + .input = "iso8859_8", + .canon = "iso-8859-8", + .flags = 0, + }, + { + .input = "iso_8859_8", + .canon = "iso-8859-8", + .flags = 0, + }, + { + .input = "iso_8859-9:1989", + .canon = "iso-8859-9", + .flags = 0, + }, + { + .input = "iso-ir-148", + .canon = "iso-8859-9", + .flags = 0, + }, + { + .input = "iso_8859-9", + .canon = "iso-8859-9", + .flags = 0, + }, + { + .input = "latin5", + .canon = "iso-8859-9", + .flags = 0, + }, + { + .input = "l5", + .canon = "iso-8859-9", + .flags = 0, + }, + { + .input = "csisolatin5", + .canon = "iso-8859-9", + .flags = 0, + }, + { + .input = "920", + .canon = "iso-8859-9", + .flags = 0, + }, + { + .input = "cp920", + .canon = "iso-8859-9", + .flags = 0, + }, + { + .input = "ibm-920", + .canon = "iso-8859-9", + .flags = 0, + }, + { + .input = "ibm920", + .canon = "iso-8859-9", + .flags = 0, + }, + { + .input = "iso8859-9", + .canon = "iso-8859-9", + .flags = 0, + }, + { + .input = "8859-9", + .canon = "iso-8859-9", + .flags = 0, + }, + { + .input = "iso8859_9", + .canon = "iso-8859-9", + .flags = 0, + }, + { + .input = "iso_8859_9", + .canon = "iso-8859-9", + .flags = 0, + }, + { + .input = "iso_8859-13", + .canon = "iso-8859-13", + .flags = 0, + }, + { + .input = "iso8859-13", + .canon = "iso-8859-13", + .flags = 0, + }, + { + .input = "8859-13", + .canon = "iso-8859-13", + .flags = 0, + }, + { + .input = "iso8859_13", + .canon = "iso-8859-13", + .flags = 0, + }, + { + .input = "iso_8859_13", + .canon = "iso-8859-13", + .flags = 0, + }, + { + .input = "iso-ir-199", + .canon = "iso-8859-14", + .flags = 0, + }, + { + .input = "iso_8859-14:1998", + .canon = "iso-8859-14", + .flags = 0, + }, + { + .input = "iso_8859-14", + .canon = "iso-8859-14", + .flags = 0, + }, + { + .input = "latin8", + .canon = "iso-8859-14", + .flags = 0, + 
}, + { + .input = "iso-celtic", + .canon = "iso-8859-14", + .flags = 0, + }, + { + .input = "l8", + .canon = "iso-8859-14", + .flags = 0, + }, + { + .input = "csisolatin9", + .canon = "iso-8859-15", + .flags = 0, + }, + { + .input = "csisolatin0", + .canon = "iso-8859-15", + .flags = 0, + }, + { + .input = "latin9", + .canon = "iso-8859-15", + .flags = 0, + }, + { + .input = "latin0", + .canon = "iso-8859-15", + .flags = 0, + }, + { + .input = "923", + .canon = "iso-8859-15", + .flags = 0, + }, + { + .input = "cp923", + .canon = "iso-8859-15", + .flags = 0, + }, + { + .input = "ibm-923", + .canon = "iso-8859-15", + .flags = 0, + }, + { + .input = "ibm923", + .canon = "iso-8859-15", + .flags = 0, + }, + { + .input = "iso8859-15", + .canon = "iso-8859-15", + .flags = 0, + }, + { + .input = "iso_8859-15", + .canon = "iso-8859-15", + .flags = 0, + }, + { + .input = "8859-15", + .canon = "iso-8859-15", + .flags = 0, + }, + { + .input = "iso_8859-15_fdis", + .canon = "iso-8859-15", + .flags = 0, + }, + { + .input = "l9", + .canon = "iso-8859-15", + .flags = 0, + }, + { + .input = "koi-8-r", + .canon = "koi8-r", + .flags = 0, + }, + { + .input = "cskoi8r", + .canon = "koi8-r", + .flags = 0, + }, + { + .input = "koi8", + .canon = "koi8-r", + .flags = 0, + }, + { + .input = "koi-8-u", + .canon = "koi8-u", + .flags = 0, + }, + { + .input = "koi-8-t", + .canon = "koi8-t", + .flags = 0, + }, + { + .input = "shiftjis", + .canon = "shift_jis", + .flags = 0, + }, + { + .input = "ms_kanji", + .canon = "shift_jis", + .flags = 0, + }, + { + .input = "csshiftjis", + .canon = "shift_jis", + .flags = 0, + }, + { + .input = "cp-437", + .canon = "ibm437", + .flags = 0, + }, + { + .input = "cp437", + .canon = "ibm437", + .flags = 0, + }, + { + .input = "437", + .canon = "ibm437", + .flags = 0, + }, + { + .input = "cspc8codepage437437", + .canon = "ibm437", + .flags = 0, + }, + { + .input = "cspc8codepage437", + .canon = "ibm437", + .flags = 0, + }, + { + .input = "ibm-437", + .canon = 
"ibm437", + .flags = 0, + }, + { + .input = "cp-850", + .canon = "ibm850", + .flags = 0, + }, + { + .input = "cp850", + .canon = "ibm850", + .flags = 0, + }, + { + .input = "850", + .canon = "ibm850", + .flags = 0, + }, + { + .input = "cspc850multilingual850", + .canon = "ibm850", + .flags = 0, + }, + { + .input = "cspc850multilingual", + .canon = "ibm850", + .flags = 0, + }, + { + .input = "ibm-850", + .canon = "ibm850", + .flags = 0, + }, + { + .input = "cp-851", + .canon = "ibm851", + .flags = 0, + }, + { + .input = "cp851", + .canon = "ibm851", + .flags = 0, + }, + { + .input = "851", + .canon = "ibm851", + .flags = 0, + }, + { + .input = "csibm851", + .canon = "ibm851", + .flags = 0, + }, + { + .input = "cp-852", + .canon = "ibm852", + .flags = 0, + }, + { + .input = "cp852", + .canon = "ibm852", + .flags = 0, + }, + { + .input = "852", + .canon = "ibm852", + .flags = 0, + }, + { + .input = "cspcp852", + .canon = "ibm852", + .flags = 0, + }, + { + .input = "852", + .canon = "ibm852", + .flags = 0, + }, + { + .input = "cspcp852", + .canon = "ibm852", + .flags = 0, + }, + { + .input = "ibm-852", + .canon = "ibm852", + .flags = 0, + }, + { + .input = "cp-855", + .canon = "ibm855", + .flags = 0, + }, + { + .input = "cp855", + .canon = "ibm855", + .flags = 0, + }, + { + .input = "855", + .canon = "ibm855", + .flags = 0, + }, + { + .input = "csibm855", + .canon = "ibm855", + .flags = 0, + }, + { + .input = "cspcp855", + .canon = "ibm855", + .flags = 0, + }, + { + .input = "ibm-855", + .canon = "ibm855", + .flags = 0, + }, + { + .input = "cp-857", + .canon = "ibm857", + .flags = 0, + }, + { + .input = "cp857", + .canon = "ibm857", + .flags = 0, + }, + { + .input = "857", + .canon = "ibm857", + .flags = 0, + }, + { + .input = "csibm857", + .canon = "ibm857", + .flags = 0, + }, + { + .input = "857", + .canon = "ibm857", + .flags = 0, + }, + { + .input = "csibm857", + .canon = "ibm857", + .flags = 0, + }, + { + .input = "ibm-857", + .canon = "ibm857", + .flags = 0, + }, 
+ { + .input = "cp-860", + .canon = "ibm860", + .flags = 0, + }, + { + .input = "cp860", + .canon = "ibm860", + .flags = 0, + }, + { + .input = "860", + .canon = "ibm860", + .flags = 0, + }, + { + .input = "csibm860", + .canon = "ibm860", + .flags = 0, + }, + { + .input = "860", + .canon = "ibm860", + .flags = 0, + }, + { + .input = "csibm860", + .canon = "ibm860", + .flags = 0, + }, + { + .input = "ibm-860", + .canon = "ibm860", + .flags = 0, + }, + { + .input = "cp-861", + .canon = "ibm861", + .flags = 0, + }, + { + .input = "cp861", + .canon = "ibm861", + .flags = 0, + }, + { + .input = "861", + .canon = "ibm861", + .flags = 0, + }, + { + .input = "cp-is", + .canon = "ibm861", + .flags = 0, + }, + { + .input = "csibm861", + .canon = "ibm861", + .flags = 0, + }, + { + .input = "861", + .canon = "ibm861", + .flags = 0, + }, + { + .input = "cp-is", + .canon = "ibm861", + .flags = 0, + }, + { + .input = "csibm861", + .canon = "ibm861", + .flags = 0, + }, + { + .input = "ibm-861", + .canon = "ibm861", + .flags = 0, + }, + { + .input = "cp-862", + .canon = "ibm862", + .flags = 0, + }, + { + .input = "cp862", + .canon = "ibm862", + .flags = 0, + }, + { + .input = "862", + .canon = "ibm862", + .flags = 0, + }, + { + .input = "cspc862latinhebrew862", + .canon = "ibm862", + .flags = 0, + }, + { + .input = "cspc862latinhebrew", + .canon = "ibm862", + .flags = 0, + }, + { + .input = "ibm-862", + .canon = "ibm862", + .flags = 0, + }, + { + .input = "cp-863", + .canon = "ibm863", + .flags = 0, + }, + { + .input = "cp863", + .canon = "ibm863", + .flags = 0, + }, + { + .input = "863", + .canon = "ibm863", + .flags = 0, + }, + { + .input = "csibm863", + .canon = "ibm863", + .flags = 0, + }, + { + .input = "863", + .canon = "ibm863", + .flags = 0, + }, + { + .input = "csibm863", + .canon = "ibm863", + .flags = 0, + }, + { + .input = "ibm-863", + .canon = "ibm863", + .flags = 0, + }, + { + .input = "cp-864", + .canon = "ibm864", + .flags = 0, + }, + { + .input = "cp864", + .canon 
= "ibm864", + .flags = 0, + }, + { + .input = "csibm864", + .canon = "ibm864", + .flags = 0, + }, + { + .input = "csibm864", + .canon = "ibm864", + .flags = 0, + }, + { + .input = "ibm-864", + .canon = "ibm864", + .flags = 0, + }, + { + .input = "cp-865", + .canon = "ibm865", + .flags = 0, + }, + { + .input = "cp865", + .canon = "ibm865", + .flags = 0, + }, + { + .input = "865", + .canon = "ibm865", + .flags = 0, + }, + { + .input = "csibm865", + .canon = "ibm865", + .flags = 0, + }, + { + .input = "865", + .canon = "ibm865", + .flags = 0, + }, + { + .input = "csibm865", + .canon = "ibm865", + .flags = 0, + }, + { + .input = "ibm-865", + .canon = "ibm865", + .flags = 0, + }, + { + .input = "cp-866", + .canon = "ibm866", + .flags = 0, + }, + { + .input = "cp866", + .canon = "ibm866", + .flags = 0, + }, + { + .input = "866", + .canon = "ibm866", + .flags = 0, + }, + { + .input = "csibm866", + .canon = "ibm866", + .flags = 0, + }, + { + .input = "866", + .canon = "ibm866", + .flags = 0, + }, + { + .input = "csibm866", + .canon = "ibm866", + .flags = 0, + }, + { + .input = "ibm-866", + .canon = "ibm866", + .flags = 0, + }, + { + .input = "cp-868", + .canon = "ibm868", + .flags = 0, + }, + { + .input = "cp868", + .canon = "ibm868", + .flags = 0, + }, + { + .input = "cp-ar", + .canon = "ibm868", + .flags = 0, + }, + { + .input = "csibm868", + .canon = "ibm868", + .flags = 0, + }, + { + .input = "ibm-868", + .canon = "ibm868", + .flags = 0, + }, + { + .input = "cp-869", + .canon = "ibm869", + .flags = 0, + }, + { + .input = "cp869", + .canon = "ibm869", + .flags = 0, + }, + { + .input = "869", + .canon = "ibm869", + .flags = 0, + }, + { + .input = "cp-gr", + .canon = "ibm869", + .flags = 0, + }, + { + .input = "csibm869", + .canon = "ibm869", + .flags = 0, + }, + { + .input = "cp-891", + .canon = "ibm891", + .flags = 0, + }, + { + .input = "cp891", + .canon = "ibm891", + .flags = 0, + }, + { + .input = "csibm891", + .canon = "ibm891", + .flags = 0, + }, + { + .input = 
"cp-903", + .canon = "ibm903", + .flags = 0, + }, + { + .input = "cp903", + .canon = "ibm903", + .flags = 0, + }, + { + .input = "csibm903", + .canon = "ibm903", + .flags = 0, + }, + { + .input = "cp-904", + .canon = "ibm904", + .flags = 0, + }, + { + .input = "cp904", + .canon = "ibm904", + .flags = 0, + }, + { + .input = "904", + .canon = "ibm904", + .flags = 0, + }, + { + .input = "csibm904", + .canon = "ibm904", + .flags = 0, + }, + { + .input = "cp-1251", + .canon = "cp1251", + .flags = 0, + }, + { + .input = "windows-1251", + .canon = "cp1251", + .flags = 0, + }, + { + .input = "cp-1255", + .canon = "cp1255", + .flags = 0, + }, + { + .input = "windows-1255", + .canon = "cp1255", + .flags = 0, + }, + { + .input = "tis620.2533", + .canon = "tis-620", + .flags = 0, + }, +}; + +#endif /* SRC_LIBMIME_MIME_ENCODING_LIST_H_ */ diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c new file mode 100644 index 0000000..e51539e --- /dev/null +++ b/src/libmime/mime_expressions.c @@ -0,0 +1,2392 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <contrib/libucl/ucl.h> +#include "config.h" +#include "util.h" +#include "cfg_file.h" +#include "rspamd.h" +#include "message.h" +#include "mime_expressions.h" +#include "libserver/html/html.h" +#include "lua/lua_common.h" +#include "utlist.h" + +gboolean rspamd_compare_encoding(struct rspamd_task *task, + GArray *args, + void *unused); +gboolean rspamd_header_exists(struct rspamd_task *task, + GArray *args, + void *unused); +gboolean rspamd_parts_distance(struct rspamd_task *task, + GArray *args, + void *unused); +gboolean rspamd_recipients_distance(struct rspamd_task *task, + GArray *args, + void *unused); +gboolean rspamd_has_only_html_part(struct rspamd_task *task, + GArray *args, + void *unused); +gboolean rspamd_is_recipients_sorted(struct rspamd_task *task, + GArray *args, + void *unused); +gboolean rspamd_compare_transfer_encoding(struct rspamd_task *task, + GArray *args, + void *unused); +gboolean rspamd_is_html_balanced(struct rspamd_task *task, + GArray *args, + void *unused); +gboolean rspamd_has_html_tag(struct rspamd_task *task, + GArray *args, + void *unused); +gboolean rspamd_has_fake_html(struct rspamd_task *task, + GArray *args, + void *unused); +static gboolean rspamd_raw_header_exists(struct rspamd_task *task, + GArray *args, + void *unused); +static gboolean rspamd_check_smtp_data(struct rspamd_task *task, + GArray *args, + void *unused); +static gboolean rspamd_content_type_is_type(struct rspamd_task *task, + GArray *args, + void *unused); +static gboolean rspamd_content_type_is_subtype(struct rspamd_task *task, + GArray *args, + void *unused); +static gboolean rspamd_content_type_has_param(struct rspamd_task *task, + GArray *args, + void *unused); +static gboolean rspamd_content_type_compare_param(struct rspamd_task *task, + GArray *args, + void *unused); +static gboolean rspamd_has_content_part(struct rspamd_task *task, + GArray *args, + void *unused); +static gboolean rspamd_has_content_part_len(struct rspamd_task *task, + 
GArray *args, + void *unused); +static gboolean rspamd_is_empty_body(struct rspamd_task *task, + GArray *args, + void *unused); +static gboolean rspamd_has_flag_expr(struct rspamd_task *task, + GArray *args, + void *unused); +static gboolean rspamd_has_symbol_expr(struct rspamd_task *task, + GArray *args, + void *unused); + +static rspamd_expression_atom_t *rspamd_mime_expr_parse(const gchar *line, gsize len, + rspamd_mempool_t *pool, gpointer ud, GError **err); +static gdouble rspamd_mime_expr_process(void *ud, rspamd_expression_atom_t *atom); +static gint rspamd_mime_expr_priority(rspamd_expression_atom_t *atom); +static void rspamd_mime_expr_destroy(rspamd_expression_atom_t *atom); + +/** + * Regexp structure + */ +struct rspamd_regexp_atom { + enum rspamd_re_type type; /**< regexp type */ + gchar *regexp_text; /**< regexp text representation */ + rspamd_regexp_t *regexp; /**< regexp structure */ + union { + const gchar *header; /**< header name for header regexps */ + const gchar *selector; /**< selector name for lua selector regexp */ + } extra; + gboolean is_test; /**< true if this expression must be tested */ + gboolean is_strong; /**< true if headers search must be case sensitive */ + gboolean is_multiple; /**< true if we need to match all inclusions of atom */ +}; + +/** + * Rspamd expression function + */ +struct rspamd_function_atom { + gchar *name; /**< name of function */ + GArray *args; /**< its args */ +}; + +enum rspamd_mime_atom_type { + MIME_ATOM_REGEXP = 0, + MIME_ATOM_INTERNAL_FUNCTION, + MIME_ATOM_LUA_FUNCTION, + MIME_ATOM_LOCAL_LUA_FUNCTION, /* New style */ +}; + +struct rspamd_mime_atom { + gchar *str; + union { + struct rspamd_regexp_atom *re; + struct rspamd_function_atom *func; + const gchar *lua_function; + gint lua_cbref; + } d; + enum rspamd_mime_atom_type type; +}; + +/* + * List of internal functions of rspamd + * Sorted by name to use bsearch + */ +static struct _fl { + const gchar *name; + rspamd_internal_func_t func; + void 
*user_data; +} rspamd_functions_list[] = { + {"check_smtp_data", rspamd_check_smtp_data, NULL}, + {"compare_encoding", rspamd_compare_encoding, NULL}, + {"compare_parts_distance", rspamd_parts_distance, NULL}, + {"compare_recipients_distance", rspamd_recipients_distance, NULL}, + {"compare_transfer_encoding", rspamd_compare_transfer_encoding, NULL}, + {"content_type_compare_param", rspamd_content_type_compare_param, NULL}, + {"content_type_has_param", rspamd_content_type_has_param, NULL}, + {"content_type_is_subtype", rspamd_content_type_is_subtype, NULL}, + {"content_type_is_type", rspamd_content_type_is_type, NULL}, + {"has_content_part", rspamd_has_content_part, NULL}, + {"has_content_part_len", rspamd_has_content_part_len, NULL}, + {"has_fake_html", rspamd_has_fake_html, NULL}, + {"has_flag", rspamd_has_flag_expr, NULL}, + {"has_html_tag", rspamd_has_html_tag, NULL}, + {"has_only_html_part", rspamd_has_only_html_part, NULL}, + {"has_symbol", rspamd_has_symbol_expr, NULL}, + {"header_exists", rspamd_header_exists, NULL}, + {"is_empty_body", rspamd_is_empty_body, NULL}, + {"is_html_balanced", rspamd_is_html_balanced, NULL}, + {"is_recipients_sorted", rspamd_is_recipients_sorted, NULL}, + {"raw_header_exists", rspamd_raw_header_exists, NULL}, +}; + +const struct rspamd_atom_subr mime_expr_subr = { + .parse = rspamd_mime_expr_parse, + .process = rspamd_mime_expr_process, + .priority = rspamd_mime_expr_priority, + .destroy = rspamd_mime_expr_destroy}; + +static struct _fl *list_ptr = &rspamd_functions_list[0]; +static guint32 functions_number = sizeof(rspamd_functions_list) / + sizeof(struct _fl); +static gboolean list_allocated = FALSE; + +/* Bsearch routine */ +static gint +fl_cmp(const void *s1, const void *s2) +{ + struct _fl *fl1 = (struct _fl *) s1; + struct _fl *fl2 = (struct _fl *) s2; + return strcmp(fl1->name, fl2->name); +} + +static GQuark +rspamd_mime_expr_quark(void) +{ + return g_quark_from_static_string("mime-expressions"); +} + +#define 
TYPE_CHECK(str, type, len) (sizeof(type) - 1 == (len) && rspamd_lc_cmp((str), (type), (len)) == 0) +static gboolean +rspamd_parse_long_option(const gchar *start, gsize len, + struct rspamd_regexp_atom *a) +{ + gboolean ret = FALSE; + + if (TYPE_CHECK(start, "body", len)) { + ret = TRUE; + a->type = RSPAMD_RE_BODY; + } + else if (TYPE_CHECK(start, "part", len) || + TYPE_CHECK(start, "mime", len)) { + ret = TRUE; + a->type = RSPAMD_RE_MIME; + } + else if (TYPE_CHECK(start, "raw_part", len) || + TYPE_CHECK(start, "raw_mime", len) || + TYPE_CHECK(start, "mime_raw", len)) { + ret = TRUE; + a->type = RSPAMD_RE_RAWMIME; + } + else if (TYPE_CHECK(start, "header", len)) { + ret = TRUE; + a->type = RSPAMD_RE_HEADER; + } + else if (TYPE_CHECK(start, "mime_header", len) || + TYPE_CHECK(start, "header_mime", len)) { + ret = TRUE; + a->type = RSPAMD_RE_MIMEHEADER; + } + else if (TYPE_CHECK(start, "raw_header", len) || + TYPE_CHECK(start, "header_raw", len)) { + ret = TRUE; + a->type = RSPAMD_RE_RAWHEADER; + } + else if (TYPE_CHECK(start, "all_header", len) || + TYPE_CHECK(start, "header_all", len) || + TYPE_CHECK(start, "all_headers", len)) { + ret = TRUE; + a->type = RSPAMD_RE_ALLHEADER; + } + else if (TYPE_CHECK(start, "url", len)) { + ret = TRUE; + a->type = RSPAMD_RE_URL; + } + else if (TYPE_CHECK(start, "email", len)) { + ret = TRUE; + a->type = RSPAMD_RE_EMAIL; + } + else if (TYPE_CHECK(start, "sa_body", len)) { + ret = TRUE; + a->type = RSPAMD_RE_SABODY; + } + else if (TYPE_CHECK(start, "sa_raw_body", len) || + TYPE_CHECK(start, "sa_body_raw", len)) { + ret = TRUE; + a->type = RSPAMD_RE_SARAWBODY; + } + else if (TYPE_CHECK(start, "words", len)) { + ret = TRUE; + a->type = RSPAMD_RE_WORDS; + } + else if (TYPE_CHECK(start, "raw_words", len)) { + ret = TRUE; + a->type = RSPAMD_RE_RAWWORDS; + } + else if (TYPE_CHECK(start, "stem_words", len)) { + ret = TRUE; + a->type = RSPAMD_RE_STEMWORDS; + } + else if (TYPE_CHECK(start, "selector", len)) { + ret = TRUE; + a->type = 
RSPAMD_RE_SELECTOR; + } + + return ret; +} + +/* + * Rspamd regexp utility functions + */ +static struct rspamd_regexp_atom * +rspamd_mime_expr_parse_regexp_atom(rspamd_mempool_t *pool, const gchar *line, + struct rspamd_config *cfg) +{ + const gchar *begin, *end, *p, *src, *start, *brace; + gchar *dbegin, *dend, *extra = NULL; + struct rspamd_regexp_atom *result; + GError *err = NULL; + GString *re_flags; + + if (line == NULL) { + msg_err_pool("cannot parse NULL line"); + return NULL; + } + + src = line; + result = rspamd_mempool_alloc0(pool, sizeof(struct rspamd_regexp_atom)); + /* Skip whitespaces */ + while (g_ascii_isspace(*line)) { + line++; + } + if (*line == '\0') { + msg_warn_pool("got empty regexp"); + return NULL; + } + + result->type = RSPAMD_RE_MAX; + + start = line; + /* First try to find header name */ + begin = strchr(line, '/'); + if (begin != NULL) { + p = begin; + end = NULL; + while (p != line) { + if (*p == '=') { + end = p; + break; + } + p--; + } + + if (end) { + extra = rspamd_mempool_alloc(pool, end - line + 1); + rspamd_strlcpy(extra, line, end - line + 1); + line = end; + } + } + else { + extra = rspamd_mempool_strdup(pool, line); + result->type = RSPAMD_RE_MAX; + line = start; + } + /* Find begin of regexp */ + while (*line && *line != '/') { + line++; + } + if (*line != '\0') { + begin = line + 1; + } + else if (extra == NULL) { + /* Assume that line without // is just a header name */ + extra = rspamd_mempool_strdup(pool, line); + result->type = RSPAMD_RE_HEADER; + return result; + } + else { + /* We got header name earlier but have not found // expression, so it is invalid regexp */ + msg_warn_pool( + "got no header name (eg. 
header=) but without corresponding regexp, %s", + src); + return NULL; + } + /* Find end */ + end = begin; + while (*end && (*end != '/' || *(end - 1) == '\\')) { + end++; + } + if (end == begin || *end != '/') { + msg_warn_pool("no trailing / in regexp %s", src); + return NULL; + } + /* Parse flags */ + p = end + 1; + re_flags = g_string_sized_new(32); + + while (p != NULL) { + switch (*p) { + case 'i': + case 'm': + case 's': + case 'x': + case 'u': + case 'O': + case 'r': + case 'L': + /* Handled by rspamd_regexp_t */ + g_string_append_c(re_flags, *p); + p++; + break; + case 'o': + p++; + break; + /* Type flags */ + case 'H': + result->type = RSPAMD_RE_HEADER; + p++; + break; + case 'R': + result->type = RSPAMD_RE_ALLHEADER; + p++; + break; + case 'B': + result->type = RSPAMD_RE_MIMEHEADER; + p++; + break; + case 'C': + result->type = RSPAMD_RE_SABODY; + p++; + break; + case 'D': + result->type = RSPAMD_RE_SARAWBODY; + p++; + break; + case 'M': + result->type = RSPAMD_RE_BODY; + p++; + break; + case 'P': + result->type = RSPAMD_RE_MIME; + p++; + break; + case 'Q': + result->type = RSPAMD_RE_RAWMIME; + p++; + break; + case 'U': + result->type = RSPAMD_RE_URL; + p++; + break; + case 'X': + result->type = RSPAMD_RE_RAWHEADER; + p++; + break; + case '$': + result->type = RSPAMD_RE_SELECTOR; + p++; + break; + case '{': + /* Long definition */ + if ((brace = strchr(p + 1, '}')) != NULL) { + if (!rspamd_parse_long_option(p + 1, brace - (p + 1), result)) { + msg_warn_pool("invalid long regexp type: %*s in '%s'", + (int) (brace - (p + 1)), p + 1, src); + p = NULL; + } + else { + p = brace + 1; + } + } + else { + p = NULL; + } + break; + /* Other flags */ + case 'T': + result->is_test = TRUE; + p++; + break; + case 'S': + result->is_strong = TRUE; + p++; + break; + case 'A': + result->is_multiple = TRUE; + p++; + break; + /* Stop flags parsing */ + default: + p = NULL; + break; + } + } + + if (result->type >= RSPAMD_RE_MAX) { + if (extra) { + /* Assume header regexp */ + 
result->extra.header = extra; + result->type = RSPAMD_RE_HEADER; + } + else { + msg_err_pool("could not read regexp: %s, unknown type", src); + return NULL; + } + } + + if ((result->type == RSPAMD_RE_HEADER || + result->type == RSPAMD_RE_RAWHEADER || + result->type == RSPAMD_RE_MIMEHEADER)) { + if (extra == NULL) { + msg_err_pool("header regexp: '%s' has no header part", src); + return NULL; + } + else { + result->extra.header = extra; + } + } + + if (result->type == RSPAMD_RE_SELECTOR) { + if (extra == NULL) { + msg_err_pool("selector regexp: '%s' has no selector part", src); + return NULL; + } + else { + result->extra.selector = extra; + } + } + + + result->regexp_text = rspamd_mempool_strdup(pool, start); + dbegin = result->regexp_text + (begin - start); + dend = result->regexp_text + (end - start); + *dend = '\0'; + + result->regexp = rspamd_regexp_new(dbegin, re_flags->str, + &err); + + g_string_free(re_flags, TRUE); + + if (result->regexp == NULL || err != NULL) { + msg_warn_pool("could not read regexp: %s while reading regexp %e", + src, err); + + if (err) { + g_error_free(err); + } + + return NULL; + } + + if (result->is_multiple) { + rspamd_regexp_set_maxhits(result->regexp, 0); + } + else { + rspamd_regexp_set_maxhits(result->regexp, 1); + } + + rspamd_regexp_set_ud(result->regexp, result); + + *dend = '/'; + + return result; +} + +struct rspamd_function_atom * +rspamd_mime_expr_parse_function_atom(rspamd_mempool_t *pool, const gchar *input) +{ + const gchar *obrace, *ebrace, *p, *c; + gchar t, *databuf; + guint len; + struct rspamd_function_atom *res; + struct expression_argument arg; + GError *err = NULL; + enum { + start_read_argument = 0, + in_string, + in_regexp, + got_backslash, + got_comma + } state, + prev_state = 0; + + obrace = strchr(input, '('); + ebrace = strrchr(input, ')'); + + g_assert(obrace != NULL && ebrace != NULL); + + res = rspamd_mempool_alloc0(pool, sizeof(*res)); + res->name = rspamd_mempool_alloc(pool, obrace - input + 1); + 
rspamd_strlcpy(res->name, input, obrace - input + 1); + res->args = g_array_new(FALSE, FALSE, sizeof(struct expression_argument)); + + p = obrace + 1; + c = p; + state = start_read_argument; + + /* Read arguments */ + while (p <= ebrace) { + t = *p; + switch (state) { + case start_read_argument: + if (t == '/') { + state = in_regexp; + c = p; + } + else if (!g_ascii_isspace(t)) { + state = in_string; + + if (t == '\'' || t == '\"') { + c = p + 1; + } + else { + c = p; + } + } + p++; + break; + case in_regexp: + if (t == '\\') { + state = got_backslash; + prev_state = in_regexp; + } + else if (t == ',' || p == ebrace) { + len = p - c + 1; + databuf = rspamd_mempool_alloc(pool, len); + rspamd_strlcpy(databuf, c, len); + arg.type = EXPRESSION_ARGUMENT_REGEXP; + arg.data = rspamd_regexp_cache_create(NULL, databuf, NULL, &err); + + if (arg.data == NULL) { + /* Fallback to string */ + msg_warn("cannot parse slashed argument %s as regexp: %s", + databuf, err->message); + g_error_free(err); + arg.type = EXPRESSION_ARGUMENT_NORMAL; + arg.data = databuf; + } + + g_array_append_val(res->args, arg); + state = got_comma; + } + p++; + break; + case in_string: + if (t == '\\') { + state = got_backslash; + prev_state = in_string; + } + else if (t == ',' || p == ebrace) { + if (*(p - 1) == '\'' || *(p - 1) == '\"') { + len = p - c; + } + else { + len = p - c + 1; + } + + databuf = rspamd_mempool_alloc(pool, len); + rspamd_strlcpy(databuf, c, len); + arg.type = EXPRESSION_ARGUMENT_NORMAL; + arg.data = databuf; + g_array_append_val(res->args, arg); + state = got_comma; + } + p++; + break; + case got_backslash: + state = prev_state; + p++; + break; + case got_comma: + state = start_read_argument; + break; + } + } + + return res; +} + +static rspamd_expression_atom_t * +rspamd_mime_expr_parse(const gchar *line, gsize len, + rspamd_mempool_t *pool, gpointer ud, GError **err) +{ + rspamd_expression_atom_t *a = NULL; + struct rspamd_mime_atom *mime_atom = NULL; + const gchar *p, *end, *c 
= NULL; + struct rspamd_mime_expr_ud *real_ud = (struct rspamd_mime_expr_ud *) ud; + struct rspamd_config *cfg; + rspamd_regexp_t *own_re; + gchar t; + gint type = MIME_ATOM_REGEXP, obraces = 0, ebraces = 0; + enum { + in_header = 0, + got_slash, + in_regexp, + got_backslash, + got_second_slash, + in_flags, + in_flags_brace, + got_obrace, + in_function, + in_local_function, + got_ebrace, + end_atom, + bad_atom + } state = 0, + prev_state = 0; + + p = line; + end = p + len; + cfg = real_ud->cfg; + + while (p < end) { + t = *p; + + switch (state) { + case in_header: + if (t == '/') { + /* Regexp */ + state = got_slash; + } + else if (t == '(') { + /* Function */ + state = got_obrace; + } + else if (!g_ascii_isalnum(t) && t != '_' && t != '-' && t != '=') { + if (t == ':') { + if (p - line == 3 && memcmp(line, "lua", 3) == 0) { + type = MIME_ATOM_LOCAL_LUA_FUNCTION; + state = in_local_function; + c = p + 1; + } + } + else { + /* Likely lua function, identified by just a string */ + type = MIME_ATOM_LUA_FUNCTION; + state = end_atom; + /* Do not increase p */ + continue; + } + } + else if (g_ascii_isspace(t)) { + state = bad_atom; + } + p++; + break; + case got_slash: + state = in_regexp; + break; + case in_regexp: + if (t == '\\') { + state = got_backslash; + prev_state = in_regexp; + } + else if (t == '/') { + state = got_second_slash; + } + p++; + break; + case got_second_slash: + state = in_flags; + break; + case in_flags: + if (t == '{') { + state = in_flags_brace; + p++; + } + else if (!g_ascii_isalpha(t) && t != '$') { + state = end_atom; + } + else { + p++; + } + break; + case in_flags_brace: + if (t == '}') { + state = in_flags; + } + p++; + break; + case got_backslash: + state = prev_state; + p++; + break; + case got_obrace: + state = in_function; + type = MIME_ATOM_INTERNAL_FUNCTION; + obraces++; + break; + case in_function: + if (t == '\\') { + state = got_backslash; + prev_state = in_function; + } + else if (t == '(') { + obraces++; + } + else if (t == ')') 
{ + ebraces++; + if (ebraces == obraces) { + state = got_ebrace; + } + } + p++; + break; + case in_local_function: + if (!(g_ascii_isalnum(t) || t == '-' || t == '_')) { + g_assert(c != NULL); + state = end_atom; + } + else { + p++; + } + break; + case got_ebrace: + state = end_atom; + break; + case bad_atom: + g_set_error(err, rspamd_mime_expr_quark(), 100, "cannot parse" + " mime atom '%s' when reading symbol '%c' at offset %d, " + "near %.*s", + line, t, (gint) (p - line), + (gint) MIN(end - p, 10), p); + return NULL; + case end_atom: + goto set; + } + } +set: + + if (p - line == 0 || (state != got_ebrace && state != got_second_slash && + state != in_flags && state != end_atom)) { + g_set_error(err, rspamd_mime_expr_quark(), 200, "incomplete or empty" + " mime atom"); + return NULL; + } + + mime_atom = rspamd_mempool_alloc(pool, sizeof(*mime_atom)); + mime_atom->type = type; + mime_atom->str = rspamd_mempool_alloc(pool, p - line + 1); + rspamd_strlcpy(mime_atom->str, line, p - line + 1); + + if (type == MIME_ATOM_REGEXP) { + mime_atom->d.re = rspamd_mime_expr_parse_regexp_atom(pool, + mime_atom->str, cfg); + if (mime_atom->d.re == NULL) { + g_set_error(err, rspamd_mime_expr_quark(), 200, + "cannot parse regexp '%s'", + mime_atom->str); + goto err; + } + else { + gint lua_cbref = -1; + + /* Check regexp condition */ + if (real_ud->conf_obj != NULL) { + const ucl_object_t *re_conditions = ucl_object_lookup(real_ud->conf_obj, + "re_conditions"); + + if (re_conditions != NULL) { + if (ucl_object_type(re_conditions) != UCL_OBJECT) { + g_set_error(err, rspamd_mime_expr_quark(), 320, + "re_conditions is not a table for '%s'", + mime_atom->str); + rspamd_regexp_unref(mime_atom->d.re->regexp); + goto err; + } + + const ucl_object_t *function_obj = ucl_object_lookup(re_conditions, + mime_atom->str); + + if (function_obj != NULL) { + if (ucl_object_type(function_obj) != UCL_USERDATA) { + g_set_error(err, rspamd_mime_expr_quark(), 320, + "condition for '%s' is invalid, must 
be function", + mime_atom->str); + rspamd_regexp_unref(mime_atom->d.re->regexp); + goto err; + } + + struct ucl_lua_funcdata *fd = function_obj->value.ud; + + lua_cbref = fd->idx; + } + } + } + + if (lua_cbref != -1) { + msg_info_config("added condition for regexp %s", mime_atom->str); + /* Add SOM_LEFTMOST_FLAG implicitly */ + rspamd_regexp_set_flags(mime_atom->d.re->regexp, rspamd_regexp_get_flags(mime_atom->d.re->regexp) | + RSPAMD_REGEXP_FLAG_LEFTMOST); + } + + /* Register new item in the cache */ + if (mime_atom->d.re->type == RSPAMD_RE_HEADER || + mime_atom->d.re->type == RSPAMD_RE_RAWHEADER || + mime_atom->d.re->type == RSPAMD_RE_MIMEHEADER) { + + if (mime_atom->d.re->extra.header != NULL) { + own_re = mime_atom->d.re->regexp; + mime_atom->d.re->regexp = rspamd_re_cache_add(cfg->re_cache, + mime_atom->d.re->regexp, + mime_atom->d.re->type, + mime_atom->d.re->extra.header, + strlen(mime_atom->d.re->extra.header) + 1, + lua_cbref); + /* Pass ownership to the cache */ + rspamd_regexp_unref(own_re); + } + else { + /* We have header regexp, but no header name is detected */ + g_set_error(err, + rspamd_mime_expr_quark(), + 200, + "no header name in header regexp: '%s'", + mime_atom->str); + rspamd_regexp_unref(mime_atom->d.re->regexp); + goto err; + } + } + else if (mime_atom->d.re->type == RSPAMD_RE_SELECTOR) { + if (mime_atom->d.re->extra.selector != NULL) { + own_re = mime_atom->d.re->regexp; + mime_atom->d.re->regexp = rspamd_re_cache_add(cfg->re_cache, + mime_atom->d.re->regexp, + mime_atom->d.re->type, + mime_atom->d.re->extra.selector, + strlen(mime_atom->d.re->extra.selector) + 1, + lua_cbref); + /* Pass ownership to the cache */ + rspamd_regexp_unref(own_re); + } + else { + /* We have selector regexp, but no selector name is detected */ + g_set_error(err, + rspamd_mime_expr_quark(), + 200, + "no selector name in selector regexp: '%s'", + mime_atom->str); + rspamd_regexp_unref(mime_atom->d.re->regexp); + goto err; + } + } + else { + own_re = 
mime_atom->d.re->regexp; + mime_atom->d.re->regexp = rspamd_re_cache_add(cfg->re_cache, + mime_atom->d.re->regexp, + mime_atom->d.re->type, + NULL, + 0, + lua_cbref); + /* Pass ownership to the cache */ + rspamd_regexp_unref(own_re); + } + } + } + else if (type == MIME_ATOM_LUA_FUNCTION) { + mime_atom->d.lua_function = mime_atom->str; + + lua_getglobal(cfg->lua_state, mime_atom->str); + + if (lua_type(cfg->lua_state, -1) != LUA_TFUNCTION) { + g_set_error(err, rspamd_mime_expr_quark(), 200, + "no such lua function '%s'", + mime_atom->str); + lua_pop(cfg->lua_state, 1); + + goto err; + } + + lua_pop(cfg->lua_state, 1); + } + else if (type == MIME_ATOM_LOCAL_LUA_FUNCTION) { + /* p pointer is set to the start of Lua function name */ + + if (real_ud->conf_obj == NULL) { + g_set_error(err, rspamd_mime_expr_quark(), 300, + "no config object for '%s'", + mime_atom->str); + goto err; + } + + const ucl_object_t *functions = ucl_object_lookup(real_ud->conf_obj, + "functions"); + + if (functions == NULL) { + g_set_error(err, rspamd_mime_expr_quark(), 310, + "no functions defined for '%s'", + mime_atom->str); + goto err; + } + + if (ucl_object_type(functions) != UCL_OBJECT) { + g_set_error(err, rspamd_mime_expr_quark(), 320, + "functions is not a table for '%s'", + mime_atom->str); + goto err; + } + + const ucl_object_t *function_obj; + + function_obj = ucl_object_lookup_len(functions, c, + p - c); + + if (function_obj == NULL) { + g_set_error(err, rspamd_mime_expr_quark(), 320, + "function %.*s is not found for '%s'", + (int) (p - c), c, mime_atom->str); + goto err; + } + + if (ucl_object_type(function_obj) != UCL_USERDATA) { + g_set_error(err, rspamd_mime_expr_quark(), 320, + "function %.*s has invalid type for '%s'", + (int) (p - c), c, mime_atom->str); + goto err; + } + + struct ucl_lua_funcdata *fd = function_obj->value.ud; + + mime_atom->d.lua_cbref = fd->idx; + } + else { + mime_atom->d.func = rspamd_mime_expr_parse_function_atom(pool, + mime_atom->str); + if 
(mime_atom->d.func == NULL) { + g_set_error(err, rspamd_mime_expr_quark(), 200, + "cannot parse function '%s'", + mime_atom->str); + goto err; + } + } + + a = rspamd_mempool_alloc0(pool, sizeof(*a)); + a->len = p - line; + a->priority = 0; + a->data = mime_atom; + + return a; + +err: + + return NULL; +} + +static gint +rspamd_mime_expr_process_regexp(struct rspamd_regexp_atom *re, + struct rspamd_task *task) +{ + gint ret; + + if (re == NULL) { + msg_info_task("invalid regexp passed"); + return 0; + } + + if (re->type == RSPAMD_RE_HEADER || re->type == RSPAMD_RE_RAWHEADER) { + ret = rspamd_re_cache_process(task, + re->regexp, + re->type, + re->extra.header, + strlen(re->extra.header), + re->is_strong); + } + else if (re->type == RSPAMD_RE_SELECTOR) { + ret = rspamd_re_cache_process(task, + re->regexp, + re->type, + re->extra.selector, + strlen(re->extra.selector), + re->is_strong); + } + else { + ret = rspamd_re_cache_process(task, + re->regexp, + re->type, + NULL, + 0, + re->is_strong); + } + + if (re->is_test) { + msg_info_task("test %s regexp '%s' returned %d", + rspamd_re_cache_type_to_string(re->type), + re->regexp_text, ret); + } + + return ret; +} + + +static gint +rspamd_mime_expr_priority(rspamd_expression_atom_t *atom) +{ + struct rspamd_mime_atom *mime_atom = atom->data; + gint ret = 0; + + switch (mime_atom->type) { + case MIME_ATOM_INTERNAL_FUNCTION: + /* Prioritize internal functions slightly */ + ret = RSPAMD_EXPRESSION_MAX_PRIORITY - RSPAMD_EXPRESSION_MAX_PRIORITY / 8; + break; + case MIME_ATOM_LUA_FUNCTION: + case MIME_ATOM_LOCAL_LUA_FUNCTION: + ret = RSPAMD_EXPRESSION_MAX_PRIORITY - RSPAMD_EXPRESSION_MAX_PRIORITY / 4; + break; + case MIME_ATOM_REGEXP: + switch (mime_atom->d.re->type) { + case RSPAMD_RE_HEADER: + case RSPAMD_RE_RAWHEADER: + ret = RSPAMD_EXPRESSION_MAX_PRIORITY - RSPAMD_EXPRESSION_MAX_PRIORITY / 16; + break; + case RSPAMD_RE_URL: + case RSPAMD_RE_EMAIL: + ret = RSPAMD_EXPRESSION_MAX_PRIORITY - RSPAMD_EXPRESSION_MAX_PRIORITY / 8; + 
break; + case RSPAMD_RE_SELECTOR: + ret = RSPAMD_EXPRESSION_MAX_PRIORITY - RSPAMD_EXPRESSION_MAX_PRIORITY / 8; + break; + case RSPAMD_RE_MIME: + case RSPAMD_RE_RAWMIME: + ret = RSPAMD_EXPRESSION_MAX_PRIORITY - RSPAMD_EXPRESSION_MAX_PRIORITY / 2; + break; + case RSPAMD_RE_WORDS: + case RSPAMD_RE_RAWWORDS: + case RSPAMD_RE_STEMWORDS: + default: + /* For expensive regexps */ + ret = 0; + break; + } + } + + return ret; +} + +static void +rspamd_mime_expr_destroy(rspamd_expression_atom_t *atom) +{ + struct rspamd_mime_atom *mime_atom = atom->data; + + if (mime_atom) { + if (mime_atom->type == MIME_ATOM_INTERNAL_FUNCTION) { + /* Need to cleanup arguments */ + g_array_free(mime_atom->d.func->args, TRUE); + } + } +} + +static gboolean +rspamd_mime_expr_process_function(struct rspamd_function_atom *func, + struct rspamd_task *task, + lua_State *L) +{ + struct _fl *selected, key; + + key.name = func->name; + + selected = bsearch(&key, + list_ptr, + functions_number, + sizeof(struct _fl), + fl_cmp); + if (selected == NULL) { + /* Try to check lua function */ + return FALSE; + } + + return selected->func(task, func->args, selected->user_data); +} + +static gdouble +rspamd_mime_expr_process(void *ud, rspamd_expression_atom_t *atom) +{ + struct rspamd_task *task = (struct rspamd_task *) ud; + struct rspamd_mime_atom *mime_atom; + lua_State *L; + gdouble ret = 0; + + g_assert(task != NULL); + g_assert(atom != NULL); + + mime_atom = atom->data; + + if (mime_atom->type == MIME_ATOM_REGEXP) { + ret = rspamd_mime_expr_process_regexp(mime_atom->d.re, task); + } + else if (mime_atom->type == MIME_ATOM_LUA_FUNCTION) { + L = task->cfg->lua_state; + lua_getglobal(L, mime_atom->d.lua_function); + rspamd_lua_task_push(L, task); + + if (lua_pcall(L, 1, 1, 0) != 0) { + msg_info_task("lua call to global function '%s' for atom '%s' failed: %s", + mime_atom->d.lua_function, + mime_atom->str, + lua_tostring(L, -1)); + lua_pop(L, 1); + } + else { + if (lua_type(L, -1) == LUA_TBOOLEAN) { + ret = 
lua_toboolean(L, -1); + } + else if (lua_type(L, -1) == LUA_TNUMBER) { + ret = lua_tonumber(L, 1); + } + else { + msg_err_task("%s returned wrong return type: %s", + mime_atom->str, lua_typename(L, lua_type(L, -1))); + } + /* Remove result */ + lua_pop(L, 1); + } + } + else if (mime_atom->type == MIME_ATOM_LOCAL_LUA_FUNCTION) { + gint err_idx; + + L = task->cfg->lua_state; + lua_pushcfunction(L, &rspamd_lua_traceback); + err_idx = lua_gettop(L); + + lua_rawgeti(L, LUA_REGISTRYINDEX, mime_atom->d.lua_cbref); + rspamd_lua_task_push(L, task); + + if (lua_pcall(L, 1, 1, err_idx) != 0) { + msg_info_task("lua call to local function for atom '%s' failed: %s", + mime_atom->str, + lua_tostring(L, -1)); + } + else { + if (lua_type(L, -1) == LUA_TBOOLEAN) { + ret = lua_toboolean(L, -1); + } + else if (lua_type(L, -1) == LUA_TNUMBER) { + ret = lua_tonumber(L, 1); + } + else { + msg_err_task("%s returned wrong return type: %s", + mime_atom->str, lua_typename(L, lua_type(L, -1))); + } + } + + lua_settop(L, 0); + } + else { + ret = rspamd_mime_expr_process_function(mime_atom->d.func, task, + task->cfg->lua_state); + } + + return ret; +} + +void register_expression_function(const gchar *name, + rspamd_internal_func_t func, + void *user_data) +{ + static struct _fl *new; + + functions_number++; + + new = g_new(struct _fl, functions_number); + memcpy(new, list_ptr, (functions_number - 1) * sizeof(struct _fl)); + if (list_allocated) { + g_free(list_ptr); + } + + list_allocated = TRUE; + new[functions_number - 1].name = name; + new[functions_number - 1].func = func; + new[functions_number - 1].user_data = user_data; + qsort(new, functions_number, sizeof(struct _fl), fl_cmp); + list_ptr = new; +} + +gboolean +rspamd_compare_encoding(struct rspamd_task *task, GArray *args, void *unused) +{ + struct expression_argument *arg; + + if (args == NULL || task == NULL) { + return FALSE; + } + + arg = &g_array_index(args, struct expression_argument, 0); + if (!arg || arg->type != 
EXPRESSION_ARGUMENT_NORMAL) { + msg_warn_task("invalid argument to function is passed"); + return FALSE; + } + + /* XXX: really write this function */ + return TRUE; +} + +gboolean +rspamd_header_exists(struct rspamd_task *task, GArray *args, void *unused) +{ + struct expression_argument *arg; + struct rspamd_mime_header *rh; + + if (args == NULL || task == NULL) { + return FALSE; + } + + arg = &g_array_index(args, struct expression_argument, 0); + if (!arg || arg->type != EXPRESSION_ARGUMENT_NORMAL) { + msg_warn_task("invalid argument to function is passed"); + return FALSE; + } + + rh = rspamd_message_get_header_array(task, + (gchar *) arg->data, FALSE); + + debug_task("try to get header %s: %d", (gchar *) arg->data, + (rh != NULL)); + + if (rh) { + return TRUE; + } + + return FALSE; +} + + +/* + * This function is designed to find difference between text/html and text/plain parts + * It takes one argument: difference threshold, if we have two text parts, compare + * its hashes and check for threshold, if value is greater than threshold, return TRUE + * and return FALSE otherwise. 
+ */ +gboolean +rspamd_parts_distance(struct rspamd_task *task, GArray *args, void *unused) +{ + gint threshold, threshold2 = -1; + struct expression_argument *arg; + gdouble *pdiff, diff; + + if (args == NULL || args->len == 0) { + debug_task("no threshold is specified, assume it 100"); + threshold = 100; + } + else { + errno = 0; + arg = &g_array_index(args, struct expression_argument, 0); + if (!arg || arg->type != EXPRESSION_ARGUMENT_NORMAL) { + msg_warn_task("invalid argument to function is passed"); + return FALSE; + } + + threshold = strtoul((gchar *) arg->data, NULL, 10); + if (errno != 0) { + msg_info_task("bad numeric value for threshold \"%s\", assume it 100", + (gchar *) arg->data); + threshold = 100; + } + if (args->len >= 2) { + arg = &g_array_index(args, struct expression_argument, 1); + if (!arg || arg->type != EXPRESSION_ARGUMENT_NORMAL) { + msg_warn_task("invalid argument to function is passed"); + return FALSE; + } + + errno = 0; + threshold2 = strtoul((gchar *) arg->data, NULL, 10); + if (errno != 0) { + msg_info_task("bad numeric value for threshold \"%s\", ignore it", + (gchar *) arg->data); + threshold2 = -1; + } + } + } + + if ((pdiff = + rspamd_mempool_get_variable(task->task_pool, + "parts_distance")) != NULL) { + diff = (1.0 - (*pdiff)) * 100.0; + + if (diff != -1) { + if (threshold2 > 0) { + if (diff >= MIN(threshold, threshold2) && + diff < MAX(threshold, threshold2)) { + + return TRUE; + } + } + else { + if (diff <= threshold) { + return TRUE; + } + } + return FALSE; + } + else { + return FALSE; + } + } + + return FALSE; +} + +struct addr_list { + const gchar *name; + guint namelen; + const gchar *addr; + guint addrlen; +}; + +static gint +addr_list_cmp_func(const void *a, const void *b) +{ + const struct addr_list *addra = (struct addr_list *) a, + *addrb = (struct addr_list *) b; + + if (addra->addrlen != addrb->addrlen) { + return addra->addrlen - addrb->addrlen; + } + + return memcmp(addra->addr, addrb->addr, addra->addrlen); +} + 
+#define COMPARE_RCPT_LEN 3 +#define MIN_RCPT_TO_COMPARE 7 + +gboolean +rspamd_recipients_distance(struct rspamd_task *task, GArray *args, + void *unused) +{ + struct expression_argument *arg; + struct rspamd_email_address *cur; + double threshold; + struct addr_list *ar; + gint num, i, hits = 0; + + if (args == NULL) { + msg_warn_task("no parameters to function"); + return FALSE; + } + + arg = &g_array_index(args, struct expression_argument, 0); + if (!arg || arg->type != EXPRESSION_ARGUMENT_NORMAL) { + msg_warn_task("invalid argument to function is passed"); + return FALSE; + } + + errno = 0; + threshold = strtod((gchar *) arg->data, NULL); + + if (errno != 0) { + msg_warn_task("invalid numeric value '%s': %s", + (gchar *) arg->data, + strerror(errno)); + return FALSE; + } + + if (!MESSAGE_FIELD(task, rcpt_mime)) { + return FALSE; + } + + num = MESSAGE_FIELD(task, rcpt_mime)->len; + + if (num < MIN_RCPT_TO_COMPARE) { + return FALSE; + } + + ar = rspamd_mempool_alloc0(task->task_pool, num * sizeof(struct addr_list)); + + /* Fill array */ + num = 0; + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, rcpt_mime), i, cur) + { + if (cur->addr_len > COMPARE_RCPT_LEN) { + ar[num].name = cur->addr; + ar[num].namelen = cur->addr_len; + ar[num].addr = cur->domain; + ar[num].addrlen = cur->domain_len; + num++; + } + } + + qsort(ar, num, sizeof(*ar), addr_list_cmp_func); + + /* Cycle all elements in array */ + for (i = 0; i < num; i++) { + if (i < num - 1) { + if (ar[i].namelen == ar[i + 1].namelen) { + if (rspamd_lc_cmp(ar[i].name, ar[i + 1].name, COMPARE_RCPT_LEN) == 0) { + hits++; + } + } + } + } + + if ((hits * num / 2.) 
/ (double) num >= threshold) { + return TRUE; + } + + return FALSE; +} + +gboolean +rspamd_has_only_html_part(struct rspamd_task *task, GArray *args, + void *unused) +{ + struct rspamd_mime_text_part *p; + guint i, cnt_html = 0, cnt_txt = 0; + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, p) + { + if (!IS_TEXT_PART_ATTACHMENT(p)) { + if (IS_TEXT_PART_HTML(p)) { + cnt_html++; + } + else { + cnt_txt++; + } + } + } + + return (cnt_html > 0 && cnt_txt == 0); +} + +static gboolean +is_recipient_list_sorted(GPtrArray *ar) +{ + struct rspamd_email_address *addr; + gboolean res = TRUE; + rspamd_ftok_t cur, prev; + gint i; + + /* Do not check to short address lists */ + if (ar == NULL || ar->len < MIN_RCPT_TO_COMPARE) { + return FALSE; + } + + prev.len = 0; + prev.begin = NULL; + + PTR_ARRAY_FOREACH(ar, i, addr) + { + cur.begin = addr->addr; + cur.len = addr->addr_len; + + if (prev.len != 0) { + if (rspamd_ftok_casecmp(&cur, &prev) <= 0) { + res = FALSE; + break; + } + } + + prev = cur; + } + + return res; +} + +gboolean +rspamd_is_recipients_sorted(struct rspamd_task *task, + GArray *args, + void *unused) +{ + /* Check all types of addresses */ + + if (MESSAGE_FIELD(task, rcpt_mime)) { + return is_recipient_list_sorted(MESSAGE_FIELD(task, rcpt_mime)); + } + + return FALSE; +} + +gboolean +rspamd_compare_transfer_encoding(struct rspamd_task *task, + GArray *args, + void *unused) +{ + struct expression_argument *arg; + guint i; + struct rspamd_mime_part *part; + enum rspamd_cte cte; + + if (args == NULL) { + msg_warn_task("no parameters to function"); + return FALSE; + } + + arg = &g_array_index(args, struct expression_argument, 0); + if (!arg || arg->type != EXPRESSION_ARGUMENT_NORMAL) { + msg_warn_task("invalid argument to function is passed"); + return FALSE; + } + + cte = rspamd_cte_from_string(arg->data); + + if (cte == RSPAMD_CTE_UNKNOWN) { + msg_warn_task("unknown cte: %s", arg->data); + return FALSE; + } + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, 
part) + { + if (IS_PART_TEXT(part)) { + if (part->cte == cte) { + return TRUE; + } + } + } + + return FALSE; +} + +gboolean +rspamd_is_html_balanced(struct rspamd_task *task, GArray *args, void *unused) +{ + /* Totally broken but seems to be never used */ + return TRUE; +} + +gboolean +rspamd_has_html_tag(struct rspamd_task *task, GArray *args, void *unused) +{ + struct rspamd_mime_text_part *p; + struct expression_argument *arg; + guint i; + gboolean res = FALSE; + + if (args == NULL) { + msg_warn_task("no parameters to function"); + return FALSE; + } + + arg = &g_array_index(args, struct expression_argument, 0); + if (!arg || arg->type != EXPRESSION_ARGUMENT_NORMAL) { + msg_warn_task("invalid argument to function is passed"); + return FALSE; + } + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, p) + { + if (IS_TEXT_PART_HTML(p) && p->html) { + res = rspamd_html_tag_seen(p->html, arg->data); + } + + if (res) { + break; + } + } + + return res; +} + +gboolean +rspamd_has_fake_html(struct rspamd_task *task, GArray *args, void *unused) +{ + struct rspamd_mime_text_part *p; + guint i; + gboolean res = FALSE; + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, p) + { + if (IS_TEXT_PART_HTML(p) && (rspamd_html_get_tags_count(p->html) < 2)) { + res = TRUE; + } + + if (res) { + break; + } + } + + return res; +} + +static gboolean +rspamd_raw_header_exists(struct rspamd_task *task, GArray *args, void *unused) +{ + struct expression_argument *arg; + + if (args == NULL || task == NULL) { + return FALSE; + } + + arg = &g_array_index(args, struct expression_argument, 0); + if (!arg || arg->type != EXPRESSION_ARGUMENT_NORMAL) { + msg_warn_task("invalid argument to function is passed"); + return FALSE; + } + + return rspamd_message_get_header_array(task, arg->data, FALSE) != NULL; +} + +static gboolean +match_smtp_data(struct rspamd_task *task, + struct expression_argument *arg, + const gchar *what, gsize len) +{ + rspamd_regexp_t *re; + gint r = 0; + + if 
(arg->type == EXPRESSION_ARGUMENT_REGEXP) { + /* This is a regexp */ + re = arg->data; + if (re == NULL) { + msg_warn_task("cannot compile regexp for function"); + return FALSE; + } + + + if (len > 0) { + r = rspamd_regexp_search(re, what, len, NULL, NULL, FALSE, NULL); + } + + return r; + } + else if (arg->type == EXPRESSION_ARGUMENT_NORMAL && + g_ascii_strncasecmp(arg->data, what, len) == 0) { + return TRUE; + } + + return FALSE; +} + +static gboolean +rspamd_check_smtp_data(struct rspamd_task *task, GArray *args, void *unused) +{ + struct expression_argument *arg; + struct rspamd_email_address *addr = NULL; + GPtrArray *rcpts = NULL; + const gchar *type, *str = NULL; + guint i; + + if (args == NULL) { + msg_warn_task("no parameters to function"); + return FALSE; + } + + arg = &g_array_index(args, struct expression_argument, 0); + + if (!arg || !arg->data || arg->type != EXPRESSION_ARGUMENT_NORMAL) { + msg_warn_task("no parameters to function"); + return FALSE; + } + else { + type = arg->data; + switch (*type) { + case 'f': + case 'F': + if (g_ascii_strcasecmp(type, "from") == 0) { + addr = rspamd_task_get_sender(task); + } + else { + msg_warn_task("bad argument to function: %s", type); + return FALSE; + } + break; + case 'h': + case 'H': + if (g_ascii_strcasecmp(type, "helo") == 0) { + str = task->helo; + } + else { + msg_warn_task("bad argument to function: %s", type); + return FALSE; + } + break; + case 'u': + case 'U': + if (g_ascii_strcasecmp(type, "user") == 0) { + str = task->auth_user; + } + else { + msg_warn_task("bad argument to function: %s", type); + return FALSE; + } + break; + case 's': + case 'S': + if (g_ascii_strcasecmp(type, "subject") == 0) { + str = MESSAGE_FIELD(task, subject); + } + else { + msg_warn_task("bad argument to function: %s", type); + return FALSE; + } + break; + case 'r': + case 'R': + if (g_ascii_strcasecmp(type, "rcpt") == 0) { + rcpts = task->rcpt_envelope; + } + else { + msg_warn_task("bad argument to function: %s", type); + 
return FALSE; + } + break; + default: + msg_warn_task("bad argument to function: %s", type); + return FALSE; + } + } + + if (str == NULL && addr == NULL && rcpts == NULL) { + /* Not enough data so regexp would NOT be found anyway */ + return FALSE; + } + + /* We would process only one more argument, others are ignored */ + if (args->len >= 2) { + arg = &g_array_index(args, struct expression_argument, 1); + + if (arg) { + if (str != NULL) { + return match_smtp_data(task, arg, str, strlen(str)); + } + else if (addr != NULL && addr->addr) { + return match_smtp_data(task, arg, addr->addr, addr->addr_len); + } + else { + if (rcpts != NULL) { + for (i = 0; i < rcpts->len; i++) { + addr = g_ptr_array_index(rcpts, i); + + if (addr && addr->addr && + match_smtp_data(task, arg, + addr->addr, addr->addr_len)) { + return TRUE; + } + } + } + } + } + } + + return FALSE; +} + +static inline gboolean +rspamd_check_ct_attr(const gchar *begin, gsize len, + struct expression_argument *arg_pattern) +{ + rspamd_regexp_t *re; + gboolean r = FALSE; + + if (arg_pattern->type == EXPRESSION_ARGUMENT_REGEXP) { + re = arg_pattern->data; + + if (len > 0) { + r = rspamd_regexp_search(re, + begin, len, + NULL, NULL, FALSE, NULL); + } + + if (r) { + return TRUE; + } + } + else { + /* Just do strcasecmp */ + gsize plen = strlen(arg_pattern->data); + + if (plen == len && + g_ascii_strncasecmp(arg_pattern->data, begin, len) == 0) { + return TRUE; + } + } + + return FALSE; +} + +static gboolean +rspamd_content_type_compare_param(struct rspamd_task *task, + GArray *args, + void *unused) +{ + + struct expression_argument *arg, *arg1, *arg_pattern; + gboolean recursive = FALSE; + struct rspamd_mime_part *cur_part; + guint i; + rspamd_ftok_t srch; + struct rspamd_content_type_param *found = NULL, *cur; + const gchar *param_name; + + if (args == NULL || args->len < 2) { + msg_warn_task("no parameters to function"); + return FALSE; + } + + arg = &g_array_index(args, struct expression_argument, 0); + 
g_assert(arg->type == EXPRESSION_ARGUMENT_NORMAL); + param_name = arg->data; + arg_pattern = &g_array_index(args, struct expression_argument, 1); + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, cur_part) + { + if (args->len >= 3) { + arg1 = &g_array_index(args, struct expression_argument, 2); + if (g_ascii_strncasecmp(arg1->data, "true", + sizeof("true") - 1) == 0) { + recursive = TRUE; + } + } + else { + /* + * If user did not specify argument, let's assume that he wants + * recursive search if mime part is multipart/mixed + */ + if (IS_PART_MULTIPART(cur_part)) { + recursive = TRUE; + } + } + + rspamd_ftok_t lit; + RSPAMD_FTOK_FROM_STR(&srch, param_name); + RSPAMD_FTOK_FROM_STR(&lit, "charset"); + + if (rspamd_ftok_equal(&srch, &lit)) { + if (rspamd_check_ct_attr(cur_part->ct->charset.begin, + cur_part->ct->charset.len, arg_pattern)) { + return TRUE; + } + } + + RSPAMD_FTOK_FROM_STR(&lit, "boundary"); + if (rspamd_ftok_equal(&srch, &lit)) { + if (rspamd_check_ct_attr(cur_part->ct->orig_boundary.begin, + cur_part->ct->orig_boundary.len, arg_pattern)) { + return TRUE; + } + } + + if (cur_part->ct->attrs) { + found = g_hash_table_lookup(cur_part->ct->attrs, &srch); + + if (found) { + DL_FOREACH(found, cur) + { + if (rspamd_check_ct_attr(cur->value.begin, + cur->value.len, arg_pattern)) { + return TRUE; + } + } + } + } + + if (!recursive) { + break; + } + } + + return FALSE; +} + +static gboolean +rspamd_content_type_has_param(struct rspamd_task *task, + GArray *args, + void *unused) +{ + struct expression_argument *arg, *arg1; + gboolean recursive = FALSE; + struct rspamd_mime_part *cur_part; + guint i; + rspamd_ftok_t srch; + struct rspamd_content_type_param *found = NULL; + const gchar *param_name; + + if (args == NULL || args->len < 1) { + msg_warn_task("no parameters to function"); + return FALSE; + } + + arg = &g_array_index(args, struct expression_argument, 0); + g_assert(arg->type == EXPRESSION_ARGUMENT_NORMAL); + param_name = arg->data; + + 
PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, cur_part) + { + if (args->len >= 2) { + arg1 = &g_array_index(args, struct expression_argument, 1); + if (g_ascii_strncasecmp(arg1->data, "true", + sizeof("true") - 1) == 0) { + recursive = TRUE; + } + } + else { + /* + * If user did not specify argument, let's assume that he wants + * recursive search if mime part is multipart/mixed + */ + if (IS_PART_MULTIPART(cur_part)) { + recursive = TRUE; + } + } + + + rspamd_ftok_t lit; + RSPAMD_FTOK_FROM_STR(&srch, param_name); + RSPAMD_FTOK_FROM_STR(&lit, "charset"); + + if (rspamd_ftok_equal(&srch, &lit)) { + if (cur_part->ct->charset.len > 0) { + return TRUE; + } + } + + RSPAMD_FTOK_FROM_STR(&lit, "boundary"); + if (rspamd_ftok_equal(&srch, &lit)) { + if (cur_part->ct->boundary.len > 0) { + return TRUE; + } + } + + if (cur_part->ct->attrs) { + found = g_hash_table_lookup(cur_part->ct->attrs, &srch); + + if (found) { + return TRUE; + } + } + + if (!recursive) { + break; + } + } + + return FALSE; +} + +static gboolean +rspamd_content_type_check(struct rspamd_task *task, + GArray *args, + gboolean check_subtype) +{ + rspamd_ftok_t *param_data, srch; + rspamd_regexp_t *re; + struct expression_argument *arg1, *arg_pattern; + struct rspamd_content_type *ct; + gint r = 0; + guint i; + gboolean recursive = FALSE; + struct rspamd_mime_part *cur_part; + + if (args == NULL || args->len < 1) { + msg_warn_task("no parameters to function"); + return FALSE; + } + + arg_pattern = &g_array_index(args, struct expression_argument, 0); + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, cur_part) + { + ct = cur_part->ct; + + if (args->len >= 2) { + arg1 = &g_array_index(args, struct expression_argument, 1); + if (g_ascii_strncasecmp(arg1->data, "true", + sizeof("true") - 1) == 0) { + recursive = TRUE; + } + } + else { + /* + * If user did not specify argument, let's assume that he wants + * recursive search if mime part is multipart/mixed + */ + if (IS_PART_MULTIPART(cur_part)) { + recursive 
= TRUE; + } + } + + if (check_subtype) { + param_data = &ct->subtype; + } + else { + param_data = &ct->type; + } + + if (arg_pattern->type == EXPRESSION_ARGUMENT_REGEXP) { + re = arg_pattern->data; + + if (param_data->len > 0) { + r = rspamd_regexp_search(re, param_data->begin, param_data->len, + NULL, NULL, FALSE, NULL); + } + + if (r) { + return TRUE; + } + } + else { + /* Just do strcasecmp */ + srch.begin = arg_pattern->data; + srch.len = strlen(arg_pattern->data); + + if (rspamd_ftok_casecmp(param_data, &srch) == 0) { + return TRUE; + } + } + + /* Get next part */ + if (!recursive) { + break; + } + } + + return FALSE; +} + +static gboolean +rspamd_content_type_is_type(struct rspamd_task *task, + GArray *args, + void *unused) +{ + return rspamd_content_type_check(task, args, FALSE); +} + +static gboolean +rspamd_content_type_is_subtype(struct rspamd_task *task, + GArray *args, + void *unused) +{ + return rspamd_content_type_check(task, args, TRUE); +} + +static gboolean +compare_subtype(struct rspamd_task *task, struct rspamd_content_type *ct, + struct expression_argument *subtype) +{ + rspamd_regexp_t *re; + rspamd_ftok_t srch; + gint r = 0; + + if (subtype == NULL || ct == NULL) { + msg_warn_task("invalid parameters passed"); + return FALSE; + } + if (subtype->type == EXPRESSION_ARGUMENT_REGEXP) { + re = subtype->data; + + if (ct->subtype.len > 0) { + r = rspamd_regexp_search(re, ct->subtype.begin, ct->subtype.len, + NULL, NULL, FALSE, NULL); + } + } + else { + srch.begin = subtype->data; + srch.len = strlen(subtype->data); + + /* Just do strcasecmp */ + if (rspamd_ftok_casecmp(&ct->subtype, &srch) == 0) { + return TRUE; + } + } + + return r; +} + +static gboolean +compare_len(struct rspamd_mime_part *part, guint min, guint max) +{ + if (min == 0 && max == 0) { + return TRUE; + } + + if (min == 0) { + return part->parsed_data.len <= max; + } + else if (max == 0) { + return part->parsed_data.len >= min; + } + else { + return part->parsed_data.len >= min && 
part->parsed_data.len <= max; + } +} + +static gboolean +common_has_content_part(struct rspamd_task *task, + struct expression_argument *param_type, + struct expression_argument *param_subtype, + gint min_len, + gint max_len) +{ + rspamd_regexp_t *re; + struct rspamd_mime_part *part; + struct rspamd_content_type *ct; + rspamd_ftok_t srch; + gint r = 0; + guint i; + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part) + { + ct = part->ct; + + if (ct == NULL) { + continue; + } + + if (param_type->type == EXPRESSION_ARGUMENT_REGEXP) { + re = param_type->data; + + if (ct->type.len > 0) { + r = rspamd_regexp_search(re, ct->type.begin, ct->type.len, + NULL, NULL, FALSE, NULL); + } + + /* Also check subtype and length of the part */ + if (r && param_subtype) { + r = compare_len(part, min_len, max_len) && + compare_subtype(task, ct, param_subtype); + + return r; + } + } + else { + /* Just do strcasecmp */ + srch.begin = param_type->data; + srch.len = strlen(param_type->data); + + if (rspamd_ftok_casecmp(&ct->type, &srch) == 0) { + if (param_subtype) { + if (compare_subtype(task, ct, param_subtype)) { + if (compare_len(part, min_len, max_len)) { + return TRUE; + } + } + } + else { + if (compare_len(part, min_len, max_len)) { + return TRUE; + } + } + } + } + } + + return FALSE; +} + +static gboolean +rspamd_has_content_part(struct rspamd_task *task, GArray *args, void *unused) +{ + struct expression_argument *param_type = NULL, *param_subtype = NULL; + + if (args == NULL) { + msg_warn_task("no parameters to function"); + return FALSE; + } + + param_type = &g_array_index(args, struct expression_argument, 0); + if (args->len >= 2) { + param_subtype = &g_array_index(args, struct expression_argument, 1); + } + + return common_has_content_part(task, param_type, param_subtype, 0, 0); +} + +static gboolean +rspamd_has_content_part_len(struct rspamd_task *task, + GArray *args, + void *unused) +{ + struct expression_argument *param_type = NULL, *param_subtype = NULL; + gint min = 
0, max = 0; + struct expression_argument *arg; + + if (args == NULL) { + msg_warn_task("no parameters to function"); + return FALSE; + } + + param_type = &g_array_index(args, struct expression_argument, 0); + + if (args->len >= 2) { + param_subtype = &g_array_index(args, struct expression_argument, 1); + + if (args->len >= 3) { + arg = &g_array_index(args, struct expression_argument, 2); + errno = 0; + min = strtoul(arg->data, NULL, 10); + g_assert(arg->type == EXPRESSION_ARGUMENT_NORMAL); + + if (errno != 0) { + msg_warn_task("invalid numeric value '%s': %s", + (gchar *) arg->data, + strerror(errno)); + return FALSE; + } + + if (args->len >= 4) { + arg = &g_array_index(args, struct expression_argument, 3); + g_assert(arg->type == EXPRESSION_ARGUMENT_NORMAL); + max = strtoul(arg->data, NULL, 10); + + if (errno != 0) { + msg_warn_task("invalid numeric value '%s': %s", + (gchar *) arg->data, + strerror(errno)); + return FALSE; + } + } + } + } + + return common_has_content_part(task, param_type, param_subtype, min, max); +} + +static gboolean +rspamd_is_empty_body(struct rspamd_task *task, + GArray *args, + void *unused) +{ + struct rspamd_mime_part *part; + guint i; + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part) + { + if (part->parsed_data.len > 0) { + return FALSE; + } + } + + return TRUE; +} + +#define TASK_FLAG_READ(flag) \ + do { \ + result = !!(task->flags & (flag)); \ + } while (0) + +#define TASK_GET_FLAG(flag, strname, macro) \ + do { \ + if (!found && strcmp((flag), strname) == 0) { \ + TASK_FLAG_READ((macro)); \ + found = TRUE; \ + } \ + } while (0) + +#define TASK_PROTOCOL_FLAG_READ(flag) \ + do { \ + result = !!(task->protocol_flags & (flag)); \ + } while (0) + +#define TASK_GET_PROTOCOL_FLAG(flag, strname, macro) \ + do { \ + if (!found && strcmp((flag), strname) == 0) { \ + TASK_PROTOCOL_FLAG_READ((macro)); \ + found = TRUE; \ + } \ + } while (0) + + +static gboolean +rspamd_has_flag_expr(struct rspamd_task *task, + GArray *args, + void 
*unused) +{ + gboolean found = FALSE, result = FALSE; + struct expression_argument *flag_arg; + const gchar *flag_str; + + if (args == NULL) { + msg_warn_task("no parameters to function"); + return FALSE; + } + + flag_arg = &g_array_index(args, struct expression_argument, 0); + + if (flag_arg->type != EXPRESSION_ARGUMENT_NORMAL) { + msg_warn_task("invalid parameter to function"); + return FALSE; + } + + flag_str = (const gchar *) flag_arg->data; + + TASK_GET_FLAG(flag_str, "pass_all", RSPAMD_TASK_FLAG_PASS_ALL); + TASK_GET_FLAG(flag_str, "no_log", RSPAMD_TASK_FLAG_NO_LOG); + TASK_GET_FLAG(flag_str, "no_stat", RSPAMD_TASK_FLAG_NO_STAT); + TASK_GET_FLAG(flag_str, "skip", RSPAMD_TASK_FLAG_SKIP); + TASK_GET_PROTOCOL_FLAG(flag_str, "extended_urls", + RSPAMD_TASK_PROTOCOL_FLAG_EXT_URLS); + TASK_GET_FLAG(flag_str, "learn_spam", RSPAMD_TASK_FLAG_LEARN_SPAM); + TASK_GET_FLAG(flag_str, "learn_ham", RSPAMD_TASK_FLAG_LEARN_HAM); + TASK_GET_FLAG(flag_str, "greylisted", RSPAMD_TASK_FLAG_GREYLISTED); + TASK_GET_FLAG(flag_str, "broken_headers", + RSPAMD_TASK_FLAG_BROKEN_HEADERS); + TASK_GET_FLAG(flag_str, "skip_process", + RSPAMD_TASK_FLAG_SKIP_PROCESS); + TASK_GET_PROTOCOL_FLAG(flag_str, "milter", + RSPAMD_TASK_PROTOCOL_FLAG_MILTER); + TASK_GET_FLAG(flag_str, "bad_unicode", + RSPAMD_TASK_FLAG_BAD_UNICODE); + + if (!found) { + msg_warn_task("invalid flag name %s", flag_str); + return FALSE; + } + + return result; +} + +static gboolean +rspamd_has_symbol_expr(struct rspamd_task *task, + GArray *args, + void *unused) +{ + struct expression_argument *sym_arg; + const gchar *symbol_str; + + if (args == NULL) { + msg_warn_task("no parameters to function"); + return FALSE; + } + + sym_arg = &g_array_index(args, struct expression_argument, 0); + + if (sym_arg->type != EXPRESSION_ARGUMENT_NORMAL) { + msg_warn_task("invalid parameter to function"); + return FALSE; + } + + symbol_str = (const gchar *) sym_arg->data; + + if (rspamd_task_find_symbol_result(task, symbol_str, NULL)) { + return 
TRUE; + } + + return FALSE; +} diff --git a/src/libmime/mime_expressions.h b/src/libmime/mime_expressions.h new file mode 100644 index 0000000..a2ea3fe --- /dev/null +++ b/src/libmime/mime_expressions.h @@ -0,0 +1,65 @@ +/** + * @file expressions.h + * Rspamd expressions API + */ + +#ifndef RSPAMD_EXPRESSIONS_H +#define RSPAMD_EXPRESSIONS_H + +#include "config.h" +#include "expression.h" +#include "contrib/libucl/ucl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_task; +struct rspamd_config; + +struct rspamd_mime_expr_ud { + struct rspamd_config *cfg; + const ucl_object_t *conf_obj; +}; + +extern const struct rspamd_atom_subr mime_expr_subr; + +/** + * Function's argument + */ +enum rspamd_expression_type { + EXPRESSION_ARGUMENT_NORMAL = 0, + EXPRESSION_ARGUMENT_BOOL, + EXPRESSION_ARGUMENT_REGEXP +}; +struct expression_argument { + enum rspamd_expression_type type; /**< type of argument (text or other function) */ + void *data; /**< pointer to its data */ +}; + + +typedef gboolean (*rspamd_internal_func_t)(struct rspamd_task *, + GArray *args, void *user_data); + + +/** + * Register specified function to rspamd internal functions list + * @param name name of function + * @param func pointer to function + */ +void register_expression_function(const gchar *name, + rspamd_internal_func_t func, + void *user_data); + +/** + * Set global limit of regexp data size to be processed + * @param limit new limit in bytes + * @return old limit value + */ +guint rspamd_mime_expression_set_re_limit(guint limit); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c new file mode 100644 index 0000000..2bd559d --- /dev/null +++ b/src/libmime/mime_headers.c @@ -0,0 +1,1441 @@ +/* + * Copyright 2024 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mime_headers.h" +#include "smtp_parsers.h" +#include "mime_encoding.h" +#include "received.h" +#include "contrib/uthash/utlist.h" +#include "libserver/mempool_vars_internal.h" +#include "libserver/cfg_file.h" +#include "libutil/util.h" +#include <unicode/utf8.h> + +KHASH_INIT(rspamd_mime_headers_htb, gchar *, + struct rspamd_mime_header *, 1, + rspamd_strcase_hash, rspamd_strcase_equal); + +struct rspamd_mime_headers_table { + khash_t(rspamd_mime_headers_htb) htb; + ref_entry_t ref; +}; + +static void +rspamd_mime_header_check_special(struct rspamd_task *task, + struct rspamd_mime_header *rh) +{ + guint64 h; + const gchar *p, *end; + gchar *id; + gint max_recipients = -1, len; + + if (task->cfg) { + max_recipients = task->cfg->max_recipients; + } + + h = rspamd_icase_hash(rh->name, strlen(rh->name), 0xdeadbabe); + + switch (h) { + case 0x88705DC4D9D61ABULL: /* received */ + if (rspamd_received_header_parse(task, rh->decoded, strlen(rh->decoded), rh)) { + rh->flags |= RSPAMD_HEADER_RECEIVED; + } + break; + case 0x76F31A09F4352521ULL: /* to */ + MESSAGE_FIELD(task, rcpt_mime) = rspamd_email_address_from_mime(task->task_pool, + rh->value, strlen(rh->value), + MESSAGE_FIELD(task, rcpt_mime), max_recipients); + rh->flags |= RSPAMD_HEADER_TO | RSPAMD_HEADER_RCPT | RSPAMD_HEADER_UNIQUE; + break; + case 0x7EB117C1480B76ULL: /* cc */ + MESSAGE_FIELD(task, rcpt_mime) = rspamd_email_address_from_mime(task->task_pool, + rh->value, strlen(rh->value), + MESSAGE_FIELD(task, rcpt_mime), max_recipients); + rh->flags |= RSPAMD_HEADER_CC | 
RSPAMD_HEADER_RCPT | RSPAMD_HEADER_UNIQUE; + break; + case 0xE4923E11C4989C8DULL: /* bcc */ + MESSAGE_FIELD(task, rcpt_mime) = rspamd_email_address_from_mime(task->task_pool, + rh->value, strlen(rh->value), + MESSAGE_FIELD(task, rcpt_mime), max_recipients); + rh->flags |= RSPAMD_HEADER_BCC | RSPAMD_HEADER_RCPT | RSPAMD_HEADER_UNIQUE; + break; + case 0x41E1985EDC1CBDE4ULL: /* from */ + MESSAGE_FIELD(task, from_mime) = rspamd_email_address_from_mime(task->task_pool, + rh->value, strlen(rh->value), + MESSAGE_FIELD(task, from_mime), max_recipients); + rh->flags |= RSPAMD_HEADER_FROM | RSPAMD_HEADER_SENDER | RSPAMD_HEADER_UNIQUE; + break; + case 0x43A558FC7C240226ULL: /* message-id */ { + + rh->flags = RSPAMD_HEADER_MESSAGE_ID | RSPAMD_HEADER_UNIQUE; + p = rh->decoded; + len = rspamd_strip_smtp_comments_inplace(rh->decoded, strlen(p)); + rh->decoded[len] = '\0'; /* Zero terminate after stripping */ + /* Strip surrounding spaces */ + rh->decoded = g_strstrip(rh->decoded); + end = p + len; + + if (*p == '<') { + p++; + } + + if (end > p) { + gchar *d; + + if (*(end - 1) == '>') { + end--; + } + + id = rspamd_mempool_alloc(task->task_pool, end - p + 1); + d = id; + + while (p < end) { + if (g_ascii_isgraph(*p)) { + *d++ = *p++; + } + else { + *d++ = '?'; + p++; + } + } + + *d = '\0'; + + MESSAGE_FIELD(task, message_id) = id; + } + + break; + } + case 0xB91D3910358E8212ULL: /* subject */ + if (MESSAGE_FIELD(task, subject) == NULL) { + MESSAGE_FIELD(task, subject) = rh->decoded; + } + rh->flags = RSPAMD_HEADER_SUBJECT | RSPAMD_HEADER_UNIQUE; + break; + case 0xEE4AA2EAAC61D6F4ULL: /* return-path */ + if (task->from_envelope == NULL) { + task->from_envelope = rspamd_email_address_from_smtp(rh->decoded, + strlen(rh->decoded)); + } + rh->flags = RSPAMD_HEADER_RETURN_PATH | RSPAMD_HEADER_UNIQUE; + break; + case 0xB9EEFAD2E93C2161ULL: /* delivered-to */ + if (task->deliver_to == NULL) { + task->deliver_to = rh->decoded; + } + rh->flags = RSPAMD_HEADER_DELIVERED_TO; + break; + case 
0x2EC3BFF3C393FC10ULL: /* date */ + case 0xAC0DDB1A1D214CAULL: /* sender */ + case 0x54094572367AB695ULL: /* in-reply-to */ + case 0x81CD9E9131AB6A9AULL: /* content-type */ + case 0xC39BD9A75AA25B60ULL: /* content-transfer-encoding */ + case 0xB3F6704CB3AD6589ULL: /* references */ + rh->flags = RSPAMD_HEADER_UNIQUE; + break; + } +} + +static void +rspamd_mime_header_add(struct rspamd_task *task, + khash_t(rspamd_mime_headers_htb) * target, + struct rspamd_mime_header **order_ptr, + struct rspamd_mime_header *rh, + gboolean check_special) +{ + khiter_t k; + struct rspamd_mime_header *ex; + int res; + + k = kh_put(rspamd_mime_headers_htb, target, rh->name, &res); + + if (res == 0) { + ex = kh_value(target, k); + DL_APPEND(ex, rh); + msg_debug_task("append raw header %s: %s", rh->name, rh->value); + } + else { + kh_value(target, k) = rh; + rh->prev = rh; + rh->next = NULL; + msg_debug_task("add new raw header %s: %s", rh->name, rh->value); + } + + LL_PREPEND2(*order_ptr, rh, ord_next); + + if (check_special) { + rspamd_mime_header_check_special(task, rh); + } +} + + +/* Convert raw headers to a list of struct raw_header * */ +void rspamd_mime_headers_process(struct rspamd_task *task, + struct rspamd_mime_headers_table *target, + struct rspamd_mime_header **order_ptr, + const gchar *in, gsize len, + gboolean check_newlines) +{ + struct rspamd_mime_header *nh = NULL; + const gchar *p, *c, *end; + gchar *tmp, *tp; + gint state = 0, l, next_state = 100, err_state = 100, t_state; + gboolean valid_folding = FALSE, shift_by_one = FALSE; + guint nlines_count[RSPAMD_TASK_NEWLINES_MAX]; + guint norder = 0; + + p = in; + end = p + len; + c = p; + memset(nlines_count, 0, sizeof(nlines_count)); + msg_debug_task("start processing headers"); + + while (p < end) { + /* FSM for processing headers */ + switch (state) { + case 0: + /* Begin processing headers */ + if (!g_ascii_isalpha(*p)) { + /* We have some garbage at the beginning of headers, skip this line */ + state = 100; + 
next_state = 0; + } + else { + state = 1; + c = p; + } + break; + case 1: + /* We got something like header's name */ + if (*p == ':') { + nh = rspamd_mempool_alloc0(task->task_pool, + sizeof(struct rspamd_mime_header)); + l = p - c; + tmp = rspamd_mempool_alloc(task->task_pool, l + 1); + rspamd_null_safe_copy(c, l, tmp, l + 1); + nh->name = tmp; + nh->flags |= RSPAMD_HEADER_EMPTY_SEPARATOR; + nh->raw_value = c; + nh->raw_len = p - c; /* Including trailing ':' */ + p++; + state = 2; + c = p; + } + else if (g_ascii_isspace(*p)) { + /* Not header but some garbage */ + if (target == MESSAGE_FIELD(task, raw_headers)) { + /* Do not propagate flag from the attachments */ + task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS; + } + state = 100; + next_state = 0; + } + else { + p++; + } + break; + case 2: + /* We got header's name, so skip any \t or spaces */ + if (*p == '\t') { + nh->flags &= ~RSPAMD_HEADER_EMPTY_SEPARATOR; + nh->flags |= RSPAMD_HEADER_TAB_SEPARATED; + p++; + } + else if (*p == ' ') { + nh->flags &= ~RSPAMD_HEADER_EMPTY_SEPARATOR; + p++; + } + else if (*p == '\n' || *p == '\r') { + + if (check_newlines) { + if (*p == '\n') { + nlines_count[RSPAMD_TASK_NEWLINES_LF]++; + } + else if (p + 1 < end && *(p + 1) == '\n') { + nlines_count[RSPAMD_TASK_NEWLINES_CRLF]++; + } + else { + nlines_count[RSPAMD_TASK_NEWLINES_CR]++; + } + } + + /* Process folding */ + state = 99; + l = p - c; + if (l > 0) { + tmp = rspamd_mempool_alloc(task->task_pool, l + 1); + rspamd_null_safe_copy(c, l, tmp, l + 1); + nh->separator = tmp; + } + next_state = 3; + err_state = 5; + c = p; + } + else { + /* Process value */ + l = p - c; + if (l >= 0) { + tmp = rspamd_mempool_alloc(task->task_pool, l + 1); + rspamd_null_safe_copy(c, l, tmp, l + 1); + nh->separator = tmp; + } + c = p; + state = 3; + } + break; + case 3: + if (*p == '\r' || *p == '\n') { + /* Hold folding */ + if (check_newlines) { + if (*p == '\n') { + nlines_count[RSPAMD_TASK_NEWLINES_LF]++; + } + else if (p + 1 < end && *(p + 1) 
== '\n') { + nlines_count[RSPAMD_TASK_NEWLINES_CRLF]++; + } + else { + nlines_count[RSPAMD_TASK_NEWLINES_CR]++; + } + } + state = 99; + next_state = 3; + err_state = 4; + } + else if (p + 1 == end) { + state = 4; + } + else { + p++; + } + break; + case 4: + /* Copy header's value */ + + /* + * XXX: + * The original decision to use here null terminated + * strings was extremely poor! + */ + l = p - c; + tmp = rspamd_mempool_alloc(task->task_pool, l + 1); + tp = tmp; + t_state = 0; + while (l--) { + if (t_state == 0) { + /* Before folding */ + if (*c == '\n' || *c == '\r') { + t_state = 1; + c++; + *tp++ = ' '; + } + else { + if (*c != '\0') { + *tp++ = *c++; + } + else { + c++; + } + } + } + else if (t_state == 1) { + /* Inside folding */ + if (g_ascii_isspace(*c)) { + c++; + } + else { + t_state = 0; + if (*c != '\0') { + *tp++ = *c++; + } + else { + c++; + } + } + } + } + /* Strip last space that can be added by \r\n parsing */ + if (tp > tmp && *(tp - 1) == ' ') { + tp--; + } + + *tp = '\0'; + /* Strip the initial spaces that could also be added by folding */ + while (*tmp != '\0' && g_ascii_isspace(*tmp)) { + tmp++; + } + + if (p + 1 == end) { + nh->raw_len = end - nh->raw_value; + } + else { + nh->raw_len = p - nh->raw_value; + } + + nh->value = tmp; + + gboolean broken_utf = FALSE; + + nh->decoded = rspamd_mime_header_decode(task->task_pool, + nh->value, strlen(tmp), &broken_utf); + + if (broken_utf) { + task->flags |= RSPAMD_TASK_FLAG_BAD_UNICODE; + } + + if (nh->decoded == NULL) { + /* As we strip comments in place... 
*/ + nh->decoded = rspamd_mempool_strdup(task->task_pool, ""); + } + + /* We also validate utf8 and replace all non-valid utf8 chars */ + rspamd_mime_charset_utf_enforce(nh->decoded, strlen(nh->decoded)); + nh->order = norder++; + rspamd_mime_header_add(task, &target->htb, order_ptr, nh, check_newlines); + nh = NULL; + state = 0; + break; + case 5: + /* Header has only name, no value */ + nh->value = rspamd_mempool_strdup(task->task_pool, ""); + nh->decoded = rspamd_mempool_strdup(task->task_pool, ""); + nh->raw_len = p - nh->raw_value; + if (shift_by_one) { + nh->raw_len++; + } + nh->order = norder++; + rspamd_mime_header_add(task, &target->htb, order_ptr, nh, check_newlines); + nh = NULL; + state = 0; + break; + case 99: + /* Folding state */ + if (p + 1 == end) { + state = err_state; + /* Include the last character into the next header */ + shift_by_one = TRUE; + } + else { + if (*p == '\r' || *p == '\n') { + p++; + valid_folding = FALSE; + } + else if (*p == '\t' || *p == ' ') { + /* Valid folding */ + p++; + valid_folding = TRUE; + } + else { + if (valid_folding) { + debug_task("go to state: %d->%d", state, next_state); + state = next_state; + } + else { + /* Fall back */ + debug_task("go to state: %d->%d", state, err_state); + state = err_state; + } + } + } + break; + case 100: + /* Fail state, skip line */ + + if (*p == '\r') { + if (p + 1 < end && *(p + 1) == '\n') { + nlines_count[RSPAMD_TASK_NEWLINES_CRLF]++; + p++; + } + p++; + state = next_state; + } + else if (*p == '\n') { + nlines_count[RSPAMD_TASK_NEWLINES_LF]++; + + if (p + 1 < end && *(p + 1) == '\r') { + p++; + } + p++; + state = next_state; + } + else if (p + 1 == end) { + state = next_state; + p++; + } + else { + p++; + } + break; + } + } + + /* Since we have prepended headers, we need to reverse the list to get the actual order */ + LL_REVERSE(*order_ptr); + + if (check_newlines) { + guint max_cnt = 0; + gint sel = 0; + rspamd_cryptobox_hash_state_t hs; + guchar 
hout[rspamd_cryptobox_HASHBYTES], *hexout; + + for (gint i = RSPAMD_TASK_NEWLINES_CR; i < RSPAMD_TASK_NEWLINES_MAX; i++) { + if (nlines_count[i] > max_cnt) { + max_cnt = nlines_count[i]; + sel = i; + } + } + + MESSAGE_FIELD(task, nlines_type) = sel; + + rspamd_cryptobox_hash_init(&hs, NULL, 0); + + LL_FOREACH(*order_ptr, nh) + { + if (nh->name && nh->flags != RSPAMD_HEADER_RECEIVED) { + rspamd_cryptobox_hash_update(&hs, nh->name, strlen(nh->name)); + } + } + + rspamd_cryptobox_hash_final(&hs, hout); + hexout = rspamd_mempool_alloc(task->task_pool, sizeof(hout) * 2 + 1); + hexout[sizeof(hout) * 2] = '\0'; + rspamd_encode_hex_buf(hout, sizeof(hout), hexout, + sizeof(hout) * 2 + 1); + rspamd_mempool_set_variable(task->task_pool, + RSPAMD_MEMPOOL_HEADERS_HASH, + hexout, NULL); + } +} + +static void +rspamd_mime_header_maybe_save_token(rspamd_mempool_t *pool, + GString *out, + GByteArray *token, + GByteArray *decoded_token, + rspamd_ftok_t *old_charset, + rspamd_ftok_t *new_charset) +{ + if (new_charset->len == 0) { + g_assert_not_reached(); + } + + if (old_charset->len > 0) { + if (rspamd_ftok_casecmp(new_charset, old_charset) == 0) { + rspamd_ftok_t srch; + + /* + * Special case for iso-2022-jp: + * https://github.com/vstakhov/rspamd/issues/1669 + */ + RSPAMD_FTOK_ASSIGN(&srch, "iso-2022-jp"); + + if (rspamd_ftok_casecmp(new_charset, &srch) != 0) { + /* We can concatenate buffers, just return */ + return; + } + } + } + + /* We need to flush and decode old token to out string */ + if (rspamd_mime_to_utf8_byte_array(token, decoded_token, pool, + rspamd_mime_detect_charset(new_charset, pool))) { + g_string_append_len(out, decoded_token->data, decoded_token->len); + } + + /* We also reset buffer */ + g_byte_array_set_size(token, 0); + /* + * Propagate charset + * + * Here are dragons: we save the original charset to allow buffers concat + * in the condition at the beginning of the function. 
+ * However, it will likely cause unnecessary calls for + * `rspamd_mime_detect_charset` which could be relatively expensive. + * But we ignore that for now... + */ + memcpy(old_charset, new_charset, sizeof(*old_charset)); +} + +static void +rspamd_mime_header_sanity_check(GString *str) +{ + gsize i; + gchar t; + + for (i = 0; i < str->len; i++) { + t = str->str[i]; + if (!((t & 0x80) || g_ascii_isgraph(t))) { + if (g_ascii_isspace(t)) { + /* Replace spaces characters with plain space */ + str->str[i] = ' '; + } + else { + str->str[i] = '?'; + } + } + } +} + +gchar * +rspamd_mime_header_decode(rspamd_mempool_t *pool, const gchar *in, + gsize inlen, gboolean *invalid_utf) +{ + GString *out; + const guchar *c, *p, *end; + const gchar *tok_start = NULL; + gsize tok_len = 0, pos; + GByteArray *token = NULL, *decoded; + rspamd_ftok_t cur_charset = {0, NULL}, old_charset = {0, NULL}; + gint encoding; + gssize r; + guint qmarks = 0; + gchar *ret; + enum { + parse_normal = 0, + got_eqsign, + got_encoded_start, + got_more_qmark, + skip_spaces, + } state = parse_normal; + + g_assert(in != NULL); + + c = in; + p = in; + end = in + inlen; + out = g_string_sized_new(inlen); + token = g_byte_array_sized_new(80); + decoded = g_byte_array_sized_new(122); + + while (p < end) { + switch (state) { + case parse_normal: + if (*p == '=') { + g_string_append_len(out, c, p - c); + c = p; + state = got_eqsign; + } + else if (*p >= 128) { + gint off = 0; + UChar32 uc; + /* Unencoded character */ + g_string_append_len(out, c, p - c); + /* Check if that's valid UTF8 */ + U8_NEXT(p, off, end - p, uc); + + if (uc <= 0) { + c = p + 1; + /* 0xFFFD in UTF8 */ + g_string_append_len(out, " ", 3); + off = 0; + U8_APPEND_UNSAFE(out->str + out->len - 3, + off, 0xfffd); + + if (invalid_utf) { + *invalid_utf = TRUE; + } + } + else { + c = p; + p = p + off; + continue; /* To avoid p ++ after this block */ + } + } + p++; + break; + case got_eqsign: + if (*p == '?') { + state = got_encoded_start; + qmarks = 
0; + } + else { + g_string_append_len(out, c, 1); + c = p; + state = parse_normal; + continue; /* Deal with == case */ + } + p++; + break; + case got_encoded_start: + if (*p == '?') { + state = got_more_qmark; + qmarks++; + + /* Skip multiple ? signs */ + p++; + while (p < end && *p == '?') { + p++; + } + + continue; + } + p++; + break; + case got_more_qmark: + if (*p == '=') { + if (qmarks < 3) { + state = got_encoded_start; + } + else { + /* Finished encoded boundary */ + if (*c == '"') { + /* Quoted string, non-RFC conformant but used by retards */ + c++; + } + if (rspamd_rfc2047_parser(c, p - c + 1, &encoding, + &cur_charset.begin, &cur_charset.len, + &tok_start, &tok_len)) { + /* We have a token, so we can decode it from `encoding` */ + if (token->len > 0) { + if (old_charset.len == 0) { + memcpy(&old_charset, &cur_charset, + sizeof(old_charset)); + } + + rspamd_mime_header_maybe_save_token(pool, out, + token, decoded, + &old_charset, &cur_charset); + } + + qmarks = 0; + pos = token->len; + g_byte_array_set_size(token, pos + tok_len); + + if (encoding == RSPAMD_RFC2047_QP) { + r = rspamd_decode_qp2047_buf(tok_start, tok_len, + token->data + pos, tok_len); + + if (r != -1) { + token->len = pos + r; + } + else { + /* Cannot decode qp */ + token->len -= tok_len; + } + } + else { + if (rspamd_cryptobox_base64_decode(tok_start, tok_len, + token->data + pos, &tok_len)) { + token->len = pos + tok_len; + } + else { + /* Cannot decode */ + token->len -= tok_len; + } + } + + c = p + 1; + state = skip_spaces; + } + else { + /* Not encoded-word */ + old_charset.len = 0; + + if (token->len > 0) { + rspamd_mime_header_maybe_save_token(pool, out, + token, decoded, + &old_charset, &cur_charset); + } + + g_string_append_len(out, c, p - c); + c = p; + state = parse_normal; + } + } /* qmarks >= 3 */ + } /* p == '=' */ + else { + state = got_encoded_start; + } + p++; + break; + case skip_spaces: + if (g_ascii_isspace(*p)) { + p++; + } + else if (*p == '=' && p < end - 1 && p[1] 
== '?') { + /* Next boundary, can glue */ + c = p; + p += 2; + state = got_encoded_start; + } + else { + /* Need to save spaces and decoded token */ + if (token->len > 0) { + old_charset.len = 0; + rspamd_mime_header_maybe_save_token(pool, out, + token, decoded, + &old_charset, &cur_charset); + } + + g_string_append_len(out, c, p - c); + c = p; + state = parse_normal; + } + break; + } + } + + /* Leftover */ + switch (state) { + case skip_spaces: + if (token->len > 0 && cur_charset.len > 0) { + old_charset.len = 0; + rspamd_mime_header_maybe_save_token(pool, out, + token, decoded, + &old_charset, &cur_charset); + } + break; + default: + /* Just copy leftover */ + if (p > c) { + g_string_append_len(out, c, p - c); + } + break; + } + + g_byte_array_free(token, TRUE); + g_byte_array_free(decoded, TRUE); + rspamd_mime_header_sanity_check(out); + rspamd_mempool_notify_alloc(pool, out->len); + ret = g_string_free(out, FALSE); + rspamd_mempool_add_destructor(pool, g_free, ret); + + return ret; +} + +gchar * +rspamd_mime_header_encode(const gchar *in, gsize len) +{ + const gchar *p = in, *end = in + len; + gchar *out, encode_buf[80 * sizeof(guint32)]; + GString *res; + gboolean need_encoding = FALSE; + + /* Check if we need to encode */ + while (p < end) { + if ((((guchar) *p) & 0x80) != 0) { + need_encoding = TRUE; + break; + } + p++; + } + + if (!need_encoding) { + out = g_malloc(len + 1); + rspamd_strlcpy(out, in, len + 1); + } + else { + /* Need encode */ + gsize ulen, pos; + gint r; + const gchar *prev; + /* Choose step: =?UTF-8?Q?<qp>?= should be less than 76 chars */ + guint step = (76 - 12) / 3 + 1; + + ulen = g_utf8_strlen(in, len); + res = g_string_sized_new(len * 2 + 1); + pos = 0; + prev = in; + /* Adjust chunk size for unicode average length */ + step *= 1.0 * ulen / (gdouble) len; + + while (pos < ulen) { + p = g_utf8_offset_to_pointer(in, pos); + + if (p > prev) { + /* Encode and print */ + r = rspamd_encode_qp2047_buf(prev, p - prev, + encode_buf, 
sizeof(encode_buf)); + + if (r != -1) { + if (res->len > 0) { + rspamd_printf_gstring(res, " =?UTF-8?Q?%*s?=", r, + encode_buf); + } + else { + rspamd_printf_gstring(res, "=?UTF-8?Q?%*s?=", r, + encode_buf); + } + } + } + + pos += MIN(step, ulen - pos); + prev = p; + } + + /* Leftover */ + if (prev < end) { + r = rspamd_encode_qp2047_buf(prev, end - prev, + encode_buf, sizeof(encode_buf)); + + if (r != -1) { + if (res->len > 0) { + rspamd_printf_gstring(res, " =?UTF-8?Q?%*s?=", r, + encode_buf); + } + else { + rspamd_printf_gstring(res, "=?UTF-8?Q?%*s?=", r, + encode_buf); + } + } + } + + out = g_string_free(res, FALSE); + } + + return out; +} + +gchar * +rspamd_mime_message_id_generate(const gchar *fqdn) +{ + GString *out; + guint64 rnd, clk; + + out = g_string_sized_new(strlen(fqdn) + 22); + rnd = ottery_rand_uint64(); + clk = rspamd_get_calendar_ticks() * 1e6; + + rspamd_printf_gstring(out, "%*bs.%*bs@%s", + (gint) sizeof(guint64) - 3, (guchar *) &clk, + (gint) sizeof(guint64), (gchar *) &rnd, + fqdn); + + return g_string_free(out, FALSE); +} + +struct rspamd_mime_header * +rspamd_message_get_header_from_hash(struct rspamd_mime_headers_table *hdrs, + const gchar *field, + gboolean need_modified) +{ + if (hdrs == NULL) { + return NULL; + } + + khiter_t k; + khash_t(rspamd_mime_headers_htb) *htb = &hdrs->htb; + struct rspamd_mime_header *hdr; + + if (htb) { + k = kh_get(rspamd_mime_headers_htb, htb, (gchar *) field); + + if (k == kh_end(htb)) { + return NULL; + } + + hdr = kh_value(htb, k); + + if (!need_modified) { + if (hdr->flags & RSPAMD_HEADER_NON_EXISTING) { + return NULL; + } + + return hdr; + } + else { + if (hdr->flags & RSPAMD_HEADER_MODIFIED) { + return hdr->modified_chain; + } + + return hdr; + } + } + + return NULL; +} + +struct rspamd_mime_header * +rspamd_message_get_header_array(struct rspamd_task *task, const gchar *field, + gboolean need_modified) +{ + return rspamd_message_get_header_from_hash( + MESSAGE_FIELD_CHECK(task, raw_headers), + field, 
need_modified); +} + +gsize rspamd_mime_headers_count(struct rspamd_mime_headers_table *hdrs) +{ + if (hdrs) { + return kh_size(&hdrs->htb); + } + + return 0; +} + +bool rspamd_mime_headers_foreach(const struct rspamd_mime_headers_table *hdrs, + rspamd_hdr_traverse_func_t func, void *ud) +{ + const gchar *name; + struct rspamd_mime_header *hdr; + + kh_foreach(&hdrs->htb, name, hdr, { + if (!func(name, hdr, ud)) { + return false; + } + }); + + return true; +} + +static void +rspamd_message_headers_dtor(struct rspamd_mime_headers_table *hdrs) +{ + if (hdrs) { + kfree(hdrs->htb.keys); + kfree(hdrs->htb.vals); + kfree(hdrs->htb.flags); + g_free(hdrs); + } +} + +struct rspamd_mime_headers_table * +rspamd_message_headers_ref(struct rspamd_mime_headers_table *hdrs) +{ + REF_RETAIN(hdrs); + + return hdrs; +} + +void rspamd_message_headers_unref(struct rspamd_mime_headers_table *hdrs) +{ + REF_RELEASE(hdrs); +} + +struct rspamd_mime_headers_table * +rspamd_message_headers_new(void) +{ + struct rspamd_mime_headers_table *nhdrs; + + nhdrs = g_malloc0(sizeof(*nhdrs)); + REF_INIT_RETAIN(nhdrs, rspamd_message_headers_dtor); + + return nhdrs; +} + +gsize rspamd_message_header_unfold_inplace(char *hdr, gsize len) +{ + /* + * t - tortoise (destination) + * h - hare (source) + */ + char *t = hdr, *h = hdr, *end = (hdr + len); + enum { + copy_chars, + folding_cr, + folding_lf, + folding_ws, + } state = copy_chars; + + while (h < end) { + switch (state) { + case copy_chars: + if (*h == '\r') { + state = folding_cr; + h++; + } + else if (*h == '\n') { + state = folding_lf; + h++; + } + else { + *t++ = *h++; + } + break; + case folding_cr: + if (*h == '\n') { + state = folding_lf; + h++; + } + else if (g_ascii_isspace(*h)) { + state = folding_ws; + h++; + } + else { + /* It is weird, not like a folding, so we need to revert back */ + *t++ = '\r'; + state = copy_chars; + } + break; + case folding_lf: + if (g_ascii_isspace(*h)) { + state = folding_ws; + h++; + } + else { + /* It is weird, 
not like a folding, so we need to revert back */ + *t++ = '\n'; + state = copy_chars; + } + break; + case folding_ws: + if (!g_ascii_isspace(*h)) { + *t++ = ' '; + state = copy_chars; + } + else { + h++; + } + break; + } + } + + return t - hdr; +} + +void rspamd_message_set_modified_header(struct rspamd_task *task, + struct rspamd_mime_headers_table *hdrs, + const gchar *hdr_name, + const ucl_object_t *obj, + struct rspamd_mime_header **order_ptr) +{ + khiter_t k; + khash_t(rspamd_mime_headers_htb) *htb = &hdrs->htb; + struct rspamd_mime_header *hdr_elt, *existing_chain; + int i; + + if (htb) { + k = kh_get(rspamd_mime_headers_htb, htb, (gchar *) hdr_name); + + if (k == kh_end(htb)) { + hdr_elt = rspamd_mempool_alloc0(task->task_pool, sizeof(*hdr_elt)); + + hdr_elt->flags |= RSPAMD_HEADER_MODIFIED | RSPAMD_HEADER_NON_EXISTING; + hdr_elt->name = rspamd_mempool_strdup(task->task_pool, hdr_name); + + int r; + k = kh_put(rspamd_mime_headers_htb, htb, hdr_elt->name, &r); + + kh_value(htb, k) = hdr_elt; + + if (order_ptr) { + /* + * This iterates over all headers in O(N), but we have no other options here, as the + * list is already set. 
+ */ + LL_APPEND2(*order_ptr, hdr_elt, ord_next); + } + } + else { + hdr_elt = kh_value(htb, k); + } + } + else { + /* No hash, no modification */ + msg_err_task("internal error: calling for set_modified_header for no headers"); + return; + } + + if (hdr_elt->flags & RSPAMD_HEADER_MODIFIED) { + existing_chain = hdr_elt->modified_chain; + } + else { + existing_chain = hdr_elt; + } + + const ucl_object_t *elt, *cur; + ucl_object_iter_t it; + + /* First, deal with removed headers, copying the relevant headers with remove flag */ + elt = ucl_object_lookup(obj, "remove"); + + /* + * remove: {1, 2 ...} + * where number is the header's position starting from '1' + */ + if (elt && ucl_object_type(elt) == UCL_ARRAY) { + /* First, use a temporary array to keep all headers */ + GPtrArray *existing_ar = g_ptr_array_new(); + struct rspamd_mime_header *cur_hdr; + + /* Exclude removed headers */ + LL_FOREACH(existing_chain, cur_hdr) + { + if (!(cur_hdr->flags & RSPAMD_HEADER_REMOVED)) { + g_ptr_array_add(existing_ar, cur_hdr); + } + } + + it = NULL; + + while ((cur = ucl_object_iterate(elt, &it, true)) != NULL) { + if (ucl_object_type(cur) == UCL_INT) { + int ord = ucl_object_toint(cur); + + if (ord == 0) { + /* Remove all headers in the existing chain */ + PTR_ARRAY_FOREACH(existing_ar, i, cur_hdr) + { + cur_hdr->flags |= RSPAMD_HEADER_MODIFIED | RSPAMD_HEADER_REMOVED; + } + } + else if (ord > 0) { + /* Start from the top */ + + if (ord <= existing_ar->len) { + cur_hdr = g_ptr_array_index(existing_ar, ord - 1); + cur_hdr->flags |= RSPAMD_HEADER_MODIFIED | RSPAMD_HEADER_REMOVED; + } + } + else { + /* Start from the bottom; ord < 0 */ + if ((-ord) <= existing_ar->len) { + cur_hdr = g_ptr_array_index(existing_ar, existing_ar->len + ord); + cur_hdr->flags |= RSPAMD_HEADER_MODIFIED | RSPAMD_HEADER_REMOVED; + } + } + } + } + + /* + * Next, we return all headers modified to the existing chain + * This implies an additional copy of all structures but is safe enough to + * deal with it + 
*/ + hdr_elt->flags |= RSPAMD_HEADER_MODIFIED; + hdr_elt->modified_chain = NULL; + + PTR_ARRAY_FOREACH(existing_ar, i, cur_hdr) + { + if (!(cur_hdr->flags & RSPAMD_HEADER_REMOVED)) { + struct rspamd_mime_header *nhdr = rspamd_mempool_alloc( + task->task_pool, sizeof(*nhdr)); + memcpy(nhdr, cur_hdr, sizeof(*nhdr)); + nhdr->modified_chain = NULL; + nhdr->prev = NULL; + nhdr->next = NULL; + nhdr->ord_next = NULL; + + DL_APPEND(hdr_elt->modified_chain, nhdr); + } + } + + g_ptr_array_free(existing_ar, TRUE); + + /* End of headers removal logic */ + } + + /* We can now deal with headers additions */ + elt = ucl_object_lookup(obj, "add"); + if (elt && ucl_object_type(elt) == UCL_ARRAY) { + if (!(hdr_elt->flags & RSPAMD_HEADER_MODIFIED)) { + /* Copy the header itself to the modified chain */ + struct rspamd_mime_header *nhdr; + hdr_elt->flags |= RSPAMD_HEADER_MODIFIED; + nhdr = rspamd_mempool_alloc( + task->task_pool, sizeof(*nhdr)); + memcpy(nhdr, hdr_elt, sizeof(*hdr_elt)); + nhdr->modified_chain = NULL; + nhdr->next = NULL; + nhdr->ord_next = NULL; + nhdr->prev = nhdr; + hdr_elt->modified_chain = nhdr; + } + + /* + * add: {{1, "foo"}, {-1, "bar"} ...} + * where number is the header's position starting from '1' + */ + it = NULL; + + while ((cur = ucl_object_iterate(elt, &it, true)) != NULL) { + if (ucl_object_type(cur) == UCL_ARRAY) { + const ucl_object_t *order = ucl_array_find_index(cur, 0), + *value = ucl_array_find_index(cur, 1); + + if (order && value && + (ucl_object_type(order) == UCL_INT && + ucl_object_type(value) == UCL_STRING)) { + int ord = ucl_object_toint(order); + const char *raw_value; + gsize raw_len; + + raw_value = ucl_object_tolstring(value, &raw_len); + + if (raw_len == 0) { + continue; + } + + struct rspamd_mime_header *nhdr = rspamd_mempool_alloc0( + task->task_pool, sizeof(*nhdr)); + + nhdr->flags |= RSPAMD_HEADER_ADDED; + nhdr->name = hdr_elt->name; + nhdr->value = rspamd_mempool_alloc(task->task_pool, + raw_len + 1); + /* Strlcpy will ensure 
that value will have no embedded \0 */ + rspamd_strlcpy(nhdr->value, raw_value, raw_len + 1); + gsize value_len = rspamd_message_header_unfold_inplace(nhdr->value, raw_len); + nhdr->value[value_len] = '\0'; + + /* Deal with the raw value */ + size_t namelen = strlen(hdr_elt->name); + char *rawbuf = rspamd_mempool_alloc(task->task_pool, namelen + + raw_len + + sizeof(": \r\n")); + /* Name: value<newline> */ + nhdr->raw_value = rawbuf; + memcpy(rawbuf, hdr_elt->name, namelen); + rawbuf += namelen; + memcpy(rawbuf, ": ", sizeof(": ") - 1); + nhdr->separator = rspamd_mempool_strdup(task->task_pool, " "); + rawbuf += sizeof(": ") - 1; + memcpy(rawbuf, raw_value, raw_len); + nhdr->raw_len = raw_len; + + if (MESSAGE_FIELD(task, nlines_type) == RSPAMD_TASK_NEWLINES_LF) { + rawbuf[raw_len++] = '\n'; + } + else { + rawbuf[raw_len++] = '\r'; + + if (MESSAGE_FIELD(task, nlines_type) == RSPAMD_TASK_NEWLINES_CRLF) { + rawbuf[raw_len++] = '\n'; + } + } + + rawbuf[raw_len] = '\0'; + + nhdr->decoded = rspamd_mime_header_decode(task->task_pool, + raw_value, nhdr->raw_len, + NULL); + + /* Now find a position to insert a value */ + struct rspamd_mime_header **pos = &hdr_elt->modified_chain; + + if (ord == 0) { + DL_PREPEND(hdr_elt->modified_chain, nhdr); + } + else if (ord == -1) { + DL_APPEND(hdr_elt->modified_chain, nhdr); + } + else if (ord > 0) { + while (ord > 0 && (*pos)) { + ord--; + pos = &((*pos)->next); + } + if (*pos) { + /* pos is &(elt)->next */ + nhdr->next = (*pos); + nhdr->prev = (*pos)->prev; + (*pos)->prev = nhdr; + *pos = nhdr; + } + else { + /* Last element */ + DL_APPEND(*pos, nhdr); + } + } + else { + /* NYI: negative order is not defined */ + msg_err_task("internal error: calling for set_modified_header " + "with negative add order header"); + } + } + else { + msg_err_task("internal error: calling for set_modified_header " + "with invalid header"); + } + } + } + } +} + +gsize rspamd_strip_smtp_comments_inplace(gchar *input, gsize len) +{ + enum parser_state { + 
parse_normal, + parse_obrace, + parse_comment, + parse_quoted_copy, + parse_quoted_ignore, + } state = parse_normal, + next_state = parse_normal; + gchar *d = input, *end = input + len, *start = input; + gchar t; + int obraces = 0, ebraces = 0; + + while (input < end) { + t = *input; + switch (state) { + case parse_normal: + if (t == '(') { + state = parse_obrace; + } + else if (t == '\\') { + state = parse_quoted_copy; + next_state = parse_normal; + } + else { + *d++ = t; + } + input++; + break; + case parse_obrace: + obraces++; + if (t == '(') { + obraces++; + } + else if (t == ')') { + ebraces++; + + if (obraces == ebraces) { + obraces = 0; + ebraces = 0; + state = parse_normal; + } + } + else if (t == '\\') { + state = parse_quoted_ignore; + next_state = parse_comment; + } + else { + state = parse_comment; + } + input++; + break; + case parse_comment: + if (t == '(') { + state = parse_obrace; + } + else if (t == ')') { + ebraces++; + + if (obraces == ebraces) { + obraces = 0; + ebraces = 0; + state = parse_normal; + } + } + else if (t == '\\') { + state = parse_quoted_ignore; + next_state = parse_comment; + } + input++; + break; + case parse_quoted_copy: + *d++ = t; + state = next_state; + input++; + break; + case parse_quoted_ignore: + state = next_state; + input++; + break; + } + } + + return (d - start); +}
\ No newline at end of file diff --git a/src/libmime/mime_headers.h b/src/libmime/mime_headers.h new file mode 100644 index 0000000..60015a2 --- /dev/null +++ b/src/libmime/mime_headers.h @@ -0,0 +1,200 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBMIME_MIME_HEADERS_H_ +#define SRC_LIBMIME_MIME_HEADERS_H_ + +#include "config.h" +#include "libutil/mem_pool.h" +#include "libutil/addr.h" +#include "khash.h" +#include "contrib/libucl/ucl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_task; + +enum rspamd_rfc2047_encoding { + RSPAMD_RFC2047_QP = 0, + RSPAMD_RFC2047_BASE64, +}; + +enum rspamd_mime_header_flags { + RSPAMD_HEADER_GENERIC = 0u, + RSPAMD_HEADER_RECEIVED = 1u << 0u, + RSPAMD_HEADER_TO = 1u << 2u, + RSPAMD_HEADER_CC = 1u << 3u, + RSPAMD_HEADER_BCC = 1u << 4u, + RSPAMD_HEADER_FROM = 1u << 5u, + RSPAMD_HEADER_MESSAGE_ID = 1u << 6u, + RSPAMD_HEADER_SUBJECT = 1u << 7u, + RSPAMD_HEADER_RETURN_PATH = 1u << 8u, + RSPAMD_HEADER_DELIVERED_TO = 1u << 9u, + RSPAMD_HEADER_SENDER = 1u << 10u, + RSPAMD_HEADER_RCPT = 1u << 11u, + RSPAMD_HEADER_UNIQUE = 1u << 12u, + RSPAMD_HEADER_EMPTY_SEPARATOR = 1u << 13u, + RSPAMD_HEADER_TAB_SEPARATED = 1u << 14u, + RSPAMD_HEADER_MODIFIED = 1u << 15u, /* Means we need to check modified chain */ + RSPAMD_HEADER_ADDED = 1u << 16u, /* A header has been artificially added */ + RSPAMD_HEADER_REMOVED = 1u << 17u, /* A header has been artificially removed 
*/ + RSPAMD_HEADER_NON_EXISTING = 1u << 18u, /* Header was not in the original message */ +}; + +struct rspamd_mime_header { + const gchar *raw_value; /* As it is in the message (unfolded and unparsed) */ + gsize raw_len; + guint order; + int flags; /* see enum rspamd_mime_header_flags */ + /* These are zero terminated (historically) */ + gchar *name; /* Also used for key */ + gchar *value; + gchar *separator; + gchar *decoded; + struct rspamd_mime_header *modified_chain; /* Headers modified during transform */ + struct rspamd_mime_header *prev, *next; /* Headers with the same name */ + struct rspamd_mime_header *ord_next; /* Overall order of headers, slist */ +}; + +struct rspamd_mime_headers_table; + +/** + * Process headers and store them in `target` + * @param task + * @param target + * @param in + * @param len + * @param check_newlines + */ +void rspamd_mime_headers_process(struct rspamd_task *task, + struct rspamd_mime_headers_table *target, + struct rspamd_mime_header **order_ptr, + const gchar *in, gsize len, + gboolean check_newlines); + +/** + * Perform rfc2047 decoding of a header + * @param pool + * @param in + * @param inlen + * @return + */ +gchar *rspamd_mime_header_decode(rspamd_mempool_t *pool, const gchar *in, + gsize inlen, gboolean *invalid_utf); + +/** + * Encode mime header if needed + * @param in + * @param len + * @return newly allocated encoded header + */ +gchar *rspamd_mime_header_encode(const gchar *in, gsize len); + +/** + * Generate new unique message id + * @param fqdn + * @return + */ +gchar *rspamd_mime_message_id_generate(const gchar *fqdn); + +/** + * Get an array of header's values with specified header's name using raw headers + * @param task worker task structure + * @param field header's name + * @return An array of header's values or NULL. It is NOT permitted to free array or values. 
/**
 * Look up the chain of headers with the specified name in a headers table
 * @param hdrs headers table to search; NULL yields NULL
 * @param field header's name used as the lookup key
 * @param need_modified if FALSE, return the header as stored in the table, or
 * NULL when it is flagged as RSPAMD_HEADER_NON_EXISTING; if TRUE, return the
 * modified chain when the header is flagged as RSPAMD_HEADER_MODIFIED (this
 * chain may be NULL) and the stored header otherwise
 * @return The first header of the chain (headers with the same name are linked
 * via `prev`/`next`) or NULL. It is NOT permitted to free the headers or their values.
 */
struct rspamd_mime_header *
rspamd_message_get_header_from_hash(struct rspamd_mime_headers_table *hdrs,
									const gchar *field,
									gboolean need_modified);
gsize len); + +/** + * Unfold header in place + * @param hdr header value + * @param len length of the header + * @return new unfolded length + */ +gsize rspamd_message_header_unfold_inplace(char *hdr, gsize len); + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBMIME_MIME_HEADERS_H_ */ diff --git a/src/libmime/mime_parser.c b/src/libmime/mime_parser.c new file mode 100644 index 0000000..217f0b8 --- /dev/null +++ b/src/libmime/mime_parser.c @@ -0,0 +1,1758 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include "config.h" +#include "task.h" +#include "mime_parser.h" +#include "mime_headers.h" +#include "message.h" +#include "multipattern.h" +#include "contrib/libottery/ottery.h" +#include "contrib/uthash/utlist.h" +#include <openssl/cms.h> +#include <openssl/pkcs7.h> +#include "contrib/fastutf8/fastutf8.h" + +struct rspamd_mime_parser_lib_ctx { + struct rspamd_multipattern *mp_boundary; + guchar hkey[rspamd_cryptobox_SIPKEYBYTES]; /* Key for hashing */ + guint key_usages; +}; + +struct rspamd_mime_parser_lib_ctx *lib_ctx = NULL; + +static const guint max_nested = 64; +static const guint max_key_usages = 10000; + +#define msg_debug_mime(...) 
rspamd_conditional_debug_fast(NULL, task->from_addr, \ + rspamd_mime_log_id, "mime", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(mime) + +#define RSPAMD_MIME_BOUNDARY_FLAG_CLOSED (1 << 0) +#define RSPAMD_BOUNDARY_IS_CLOSED(b) ((b)->flags & RSPAMD_MIME_BOUNDARY_FLAG_CLOSED) + +struct rspamd_mime_boundary { + goffset boundary; + goffset start; + guint64 hash; + guint64 closed_hash; + gint flags; +}; + +struct rspamd_mime_parser_ctx { + GPtrArray *stack; /* Stack of parts */ + GArray *boundaries; /* Boundaries found in the whole message */ + const gchar *start; + const gchar *pos; + const gchar *end; + struct rspamd_task *task; + guint nesting; +}; + +static enum rspamd_mime_parse_error +rspamd_mime_parse_multipart_part(struct rspamd_task *task, + struct rspamd_mime_part *part, + struct rspamd_mime_parser_ctx *st, + GError **err); +static enum rspamd_mime_parse_error +rspamd_mime_parse_message(struct rspamd_task *task, + struct rspamd_mime_part *part, + struct rspamd_mime_parser_ctx *st, + GError **err); +static enum rspamd_mime_parse_error +rspamd_mime_parse_normal_part(struct rspamd_task *task, + struct rspamd_mime_part *part, + struct rspamd_mime_parser_ctx *st, + struct rspamd_content_type *ct, + GError **err); + +static enum rspamd_mime_parse_error +rspamd_mime_process_multipart_node(struct rspamd_task *task, + struct rspamd_mime_parser_ctx *st, + struct rspamd_mime_part *multipart, + const gchar *start, const gchar *end, + gboolean is_finished, + GError **err); + + +#define RSPAMD_MIME_QUARK (rspamd_mime_parser_quark()) +static GQuark +rspamd_mime_parser_quark(void) +{ + return g_quark_from_static_string("mime-parser"); +} + +const gchar * +rspamd_cte_to_string(enum rspamd_cte ct) +{ + const gchar *ret = "unknown"; + + switch (ct) { + case RSPAMD_CTE_7BIT: + ret = "7bit"; + break; + case RSPAMD_CTE_8BIT: + ret = "8bit"; + break; + case RSPAMD_CTE_QP: + ret = "quoted-printable"; + break; + case RSPAMD_CTE_B64: + ret = 
"base64"; + break; + case RSPAMD_CTE_UUE: + ret = "X-uuencode"; + break; + default: + break; + } + + return ret; +} + +enum rspamd_cte +rspamd_cte_from_string(const gchar *str) +{ + enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN; + + g_assert(str != NULL); + + if (strcmp(str, "7bit") == 0) { + ret = RSPAMD_CTE_7BIT; + } + else if (strcmp(str, "8bit") == 0) { + ret = RSPAMD_CTE_8BIT; + } + else if (strcmp(str, "quoted-printable") == 0) { + ret = RSPAMD_CTE_QP; + } + else if (strcmp(str, "base64") == 0) { + ret = RSPAMD_CTE_B64; + } + else if (strcmp(str, "X-uuencode") == 0) { + ret = RSPAMD_CTE_UUE; + } + else if (strcmp(str, "uuencode") == 0) { + ret = RSPAMD_CTE_UUE; + } + else if (strcmp(str, "X-uue") == 0) { + ret = RSPAMD_CTE_UUE; + } + + return ret; +} + +static void +rspamd_mime_parser_init_lib(void) +{ + lib_ctx = g_malloc0(sizeof(*lib_ctx)); + lib_ctx->mp_boundary = rspamd_multipattern_create(RSPAMD_MULTIPATTERN_DEFAULT); + g_assert(lib_ctx->mp_boundary != NULL); + rspamd_multipattern_add_pattern(lib_ctx->mp_boundary, "\r--", 0); + rspamd_multipattern_add_pattern(lib_ctx->mp_boundary, "\n--", 0); + + GError *err = NULL; + if (!rspamd_multipattern_compile(lib_ctx->mp_boundary, &err)) { + msg_err("fatal error: cannot compile multipattern for mime parser boundaries: %e", err); + g_error_free(err); + g_abort(); + } + ottery_rand_bytes(lib_ctx->hkey, sizeof(lib_ctx->hkey)); +} + +static enum rspamd_cte +rspamd_mime_parse_cte(const gchar *in, gsize len) +{ + guint64 h; + enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN; + + in = rspamd_string_len_strip(in, &len, " \t;,.+-#!`~'"); + h = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64, + in, len, 0xdeadbabe); + + switch (h) { + case 0xCEDAA7056B4753F7ULL: /* 7bit */ + ret = RSPAMD_CTE_7BIT; + break; + case 0x42E0745448B39FC1ULL: /* 8bit */ + case 0x6B169E6B155BADC0ULL: /* binary */ + ret = RSPAMD_CTE_8BIT; + break; + case 0x6D69A5BB02A633B0ULL: /* quoted-printable */ + ret = RSPAMD_CTE_QP; + break; + case 
0x96305588A76DC9A9ULL: /* base64 */ + case 0x171029DE1B0423A9ULL: /* base-64 */ + ret = RSPAMD_CTE_B64; + break; + case 0x420b54dc00d13cecULL: /* uuencode */ + case 0x8df6700b8f6c4cf9ULL: /* x-uuencode */ + case 0x41f725ec544356d3ULL: /* x-uue */ + ret = RSPAMD_CTE_UUE; + break; + } + + return ret; +} + +static enum rspamd_cte +rspamd_mime_part_get_cte_heuristic(struct rspamd_task *task, + struct rspamd_mime_part *part) +{ + const guint check_len = 128; + guint real_len, nspaces = 0, neqsign = 0, n8bit = 0, nqpencoded = 0, + padeqsign = 0, nupper = 0, nlower = 0; + gboolean b64_chars = TRUE; + const guchar *p, *end; + enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN; + + real_len = MIN(check_len, part->raw_data.len); + p = (const guchar *) part->raw_data.begin; + end = p + part->raw_data.len; + + while (p < end && g_ascii_isspace(*p)) { + p++; + } + + if (end - p > sizeof("begin-base64 ")) { + const guchar *uue_start; + + if (memcmp(p, "begin ", sizeof("begin ") - 1) == 0) { + uue_start = p + sizeof("begin ") - 1; + + while (uue_start < end && g_ascii_isspace(*uue_start)) { + uue_start++; + } + + if (uue_start < end && g_ascii_isdigit(*uue_start)) { + return RSPAMD_CTE_UUE; + } + } + else if (memcmp(p, "begin-base64 ", sizeof("begin-base64 ") - 1) == 0) { + uue_start = p + sizeof("begin ") - 1; + + while (uue_start < end && g_ascii_isspace(*uue_start)) { + uue_start++; + } + + if (uue_start < end && g_ascii_isdigit(*uue_start)) { + return RSPAMD_CTE_UUE; + } + } + } + + /* Skip trailing spaces */ + while (end > p && g_ascii_isspace(*(end - 1))) { + end--; + } + + if (end > p + 2) { + if (*(end - 1) == '=') { + padeqsign++; + end--; + } + + if (*(end - 1) == '=') { + padeqsign++; + end--; + } + } + + /* Adjust end to analyse only first characters */ + if (end - p > real_len) { + end = p + real_len; + } + + while (p < end) { + if (*p == ' ') { + nspaces++; + } + else if (*p == '=') { + b64_chars = FALSE; /* Eqsign must not be inside base64 */ + neqsign++; + p++; + + if (p + 
2 < end && g_ascii_isxdigit(*p) && g_ascii_isxdigit(*(p + 1))) { + p++; + nqpencoded++; + } + + continue; + } + else if (*p >= 0x80) { + n8bit++; + b64_chars = FALSE; + } + else if (!(g_ascii_isalnum(*p) || *p == '/' || *p == '+')) { + b64_chars = FALSE; + } + else if (g_ascii_isupper(*p)) { + nupper++; + } + else if (g_ascii_islower(*p)) { + nlower++; + } + + p++; + } + + if (b64_chars && neqsign <= 2 && nspaces == 0) { + /* Need more thinking */ + + if (part->raw_data.len > 80) { + if (padeqsign > 0) { + ret = RSPAMD_CTE_B64; + } + else { + /* We have a large piece of data with no spaces and base64 + * symbols only, no padding is detected as well... + * + * There is a small chance that our first 128 characters + * are either some garbage or it is a base64 with no padding + * (e.g. when it is not needed) + */ + if (nupper > 1 && nlower > 1) { + /* + * We have both uppercase and lowercase letters, so it can be + * base64 + */ + ret = RSPAMD_CTE_B64; + } + else { + ret = RSPAMD_CTE_7BIT; + } + } + } + else { + + if (((end - (const guchar *) part->raw_data.begin) + padeqsign) % 4 == 0) { + if (padeqsign == 0) { + /* + * It can be either base64 or plain text, hard to say + * Let's assume that if we have > 1 uppercase it is + * likely base64 + */ + if (nupper > 1 && nlower > 1) { + ret = RSPAMD_CTE_B64; + } + else { + ret = RSPAMD_CTE_7BIT; + } + } + else { + ret = RSPAMD_CTE_B64; + } + } + else { + /* No way */ + if (padeqsign == 1 || padeqsign == 2) { + ret = RSPAMD_CTE_B64; + } + else { + ret = RSPAMD_CTE_7BIT; + } + } + } + } + else if (n8bit == 0) { + if (neqsign > 2 && nqpencoded > 2) { + ret = RSPAMD_CTE_QP; + } + else { + ret = RSPAMD_CTE_7BIT; + } + } + else { + ret = RSPAMD_CTE_8BIT; + } + + msg_debug_mime("detected cte: %s", rspamd_cte_to_string(ret)); + + return ret; +} + +static void +rspamd_mime_part_get_cte(struct rspamd_task *task, + struct rspamd_mime_headers_table *hdrs, + struct rspamd_mime_part *part, + gboolean apply_heuristic) +{ + struct 
rspamd_mime_header *hdr, *cur; + enum rspamd_cte cte = RSPAMD_CTE_UNKNOWN; + gboolean parent_propagated = FALSE; + + hdr = rspamd_message_get_header_from_hash(hdrs, "Content-Transfer-Encoding", FALSE); + + if (hdr == NULL) { + if (part->parent_part && part->parent_part->cte != RSPAMD_CTE_UNKNOWN && + !(part->parent_part->flags & RSPAMD_MIME_PART_MISSING_CTE)) { + part->cte = part->parent_part->cte; + parent_propagated = TRUE; + + goto check_cte; + } + + if (apply_heuristic) { + part->cte = rspamd_mime_part_get_cte_heuristic(task, part); + msg_info_task("detected missing CTE for part as: %s", + rspamd_cte_to_string(part->cte)); + } + + part->flags |= RSPAMD_MIME_PART_MISSING_CTE; + } + else { + DL_FOREACH(hdr, cur) + { + gsize hlen; + gchar lc_buf[128]; + + hlen = rspamd_snprintf(lc_buf, sizeof(lc_buf), "%s", cur->value); + rspamd_str_lc(lc_buf, hlen); + cte = rspamd_mime_parse_cte(lc_buf, hlen); + + if (cte != RSPAMD_CTE_UNKNOWN) { + part->cte = cte; + break; + } + } + + check_cte: + if (apply_heuristic) { + if (part->cte == RSPAMD_CTE_UNKNOWN) { + part->cte = rspamd_mime_part_get_cte_heuristic(task, part); + + msg_info_task("corrected bad CTE for part to: %s", + rspamd_cte_to_string(part->cte)); + } + else if (part->cte == RSPAMD_CTE_B64 || + part->cte == RSPAMD_CTE_QP) { + /* Additionally check sanity */ + cte = rspamd_mime_part_get_cte_heuristic(task, part); + + if (cte == RSPAMD_CTE_8BIT) { + msg_info_task( + "incorrect cte specified for part: %s, %s detected", + rspamd_cte_to_string(part->cte), + rspamd_cte_to_string(cte)); + part->cte = cte; + part->flags |= RSPAMD_MIME_PART_BAD_CTE; + } + else if (cte != part->cte && parent_propagated) { + part->cte = cte; + msg_info_task("detected missing CTE for part as: %s", + rspamd_cte_to_string(part->cte)); + } + } + else { + msg_debug_mime("processed cte: %s", + rspamd_cte_to_string(cte)); + } + } + else { + msg_debug_mime("processed cte: %s", rspamd_cte_to_string(cte)); + } + } +} +static void 
+rspamd_mime_part_get_cd(struct rspamd_task *task, struct rspamd_mime_part *part) +{ + struct rspamd_mime_header *hdr, *cur; + struct rspamd_content_disposition *cd = NULL; + rspamd_ftok_t srch; + struct rspamd_content_type_param *found; + + hdr = rspamd_message_get_header_from_hash(part->raw_headers, + "Content-Disposition", FALSE); + + + if (hdr == NULL) { + cd = rspamd_mempool_alloc0(task->task_pool, sizeof(*cd)); + cd->type = RSPAMD_CT_INLINE; + + /* We can also have content disposition definitions in Content-Type */ + if (part->ct && part->ct->attrs) { + RSPAMD_FTOK_ASSIGN(&srch, "name"); + found = g_hash_table_lookup(part->ct->attrs, &srch); + + if (!found) { + RSPAMD_FTOK_ASSIGN(&srch, "filename"); + found = g_hash_table_lookup(part->ct->attrs, &srch); + } + + if (found) { + cd->type = RSPAMD_CT_ATTACHMENT; + memcpy(&cd->filename, &found->value, sizeof(cd->filename)); + } + } + } + else { + DL_FOREACH(hdr, cur) + { + gsize hlen; + cd = NULL; + + if (cur->value) { + hlen = strlen(cur->value); + cd = rspamd_content_disposition_parse(cur->value, hlen, + task->task_pool); + } + + if (cd) { + /* We still need to check filename */ + if (cd->filename.len == 0) { + if (part->ct && part->ct->attrs) { + RSPAMD_FTOK_ASSIGN(&srch, "name"); + found = g_hash_table_lookup(part->ct->attrs, &srch); + + if (!found) { + RSPAMD_FTOK_ASSIGN(&srch, "filename"); + found = g_hash_table_lookup(part->ct->attrs, &srch); + } + + if (found) { + cd->type = RSPAMD_CT_ATTACHMENT; + memcpy(&cd->filename, &found->value, + sizeof(cd->filename)); + } + } + } + + msg_debug_mime("processed content disposition: %s, file: \"%T\"", + cd->lc_data, &cd->filename); + break; + } + else if (part->ct) { + /* + * Even in case of malformed Content-Disposition, we can still + * fall back to Content-Type + */ + cd = rspamd_mempool_alloc0(task->task_pool, sizeof(*cd)); + cd->type = RSPAMD_CT_INLINE; + + /* We can also have content disposition definitions in Content-Type */ + if (part->ct->attrs) { + 
RSPAMD_FTOK_ASSIGN(&srch, "name"); + found = g_hash_table_lookup(part->ct->attrs, &srch); + + if (!found) { + RSPAMD_FTOK_ASSIGN(&srch, "filename"); + found = g_hash_table_lookup(part->ct->attrs, &srch); + } + + if (found) { + cd->type = RSPAMD_CT_ATTACHMENT; + memcpy(&cd->filename, &found->value, sizeof(cd->filename)); + } + } + } + } + } + + part->cd = cd; +} + +void rspamd_mime_parser_calc_digest(struct rspamd_mime_part *part) +{ + /* Blake2b applied to string 'rspamd' */ + static const guchar hash_key[] = { + 0xef, + 0x43, + 0xae, + 0x80, + 0xcc, + 0x8d, + 0xc3, + 0x4c, + 0x6f, + 0x1b, + 0xd6, + 0x18, + 0x1b, + 0xae, + 0x87, + 0x74, + 0x0c, + 0xca, + 0xf7, + 0x8e, + 0x5f, + 0x2e, + 0x54, + 0x32, + 0xf6, + 0x79, + 0xb9, + 0x27, + 0x26, + 0x96, + 0x20, + 0x92, + 0x70, + 0x07, + 0x85, + 0xeb, + 0x83, + 0xf7, + 0x89, + 0xe0, + 0xd7, + 0x32, + 0x2a, + 0xd2, + 0x1a, + 0x64, + 0x41, + 0xef, + 0x49, + 0xff, + 0xc3, + 0x8c, + 0x54, + 0xf9, + 0x67, + 0x74, + 0x30, + 0x1e, + 0x70, + 0x2e, + 0xb7, + 0x12, + 0x09, + 0xfe, + }; + + if (part->parsed_data.len > 0) { + rspamd_cryptobox_hash(part->digest, + part->parsed_data.begin, part->parsed_data.len, + hash_key, sizeof(hash_key)); + } +} + +static enum rspamd_mime_parse_error +rspamd_mime_parse_normal_part(struct rspamd_task *task, + struct rspamd_mime_part *part, + struct rspamd_mime_parser_ctx *st, + struct rspamd_content_type *ct, + GError **err) +{ + rspamd_fstring_t *parsed; + gssize r; + + g_assert(part != NULL); + + rspamd_mime_part_get_cte(task, part->raw_headers, part, + part->ct && !(part->ct->flags & RSPAMD_CONTENT_TYPE_MESSAGE)); + rspamd_mime_part_get_cd(task, part); + + switch (part->cte) { + case RSPAMD_CTE_7BIT: + case RSPAMD_CTE_8BIT: + case RSPAMD_CTE_UNKNOWN: + if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) { + if (part->cte != RSPAMD_CTE_7BIT) { + /* We have something that has a missing content-type, + * but it has non-7bit characters. 
+ * + * In theory, it is very unsafe to process it as a text part + * as we unlikely get some sane result + */ + + /* + * On the other hand, there is an evidence that some + * emails actually rely on that. + * So we apply an expensive hack here: + * if there are no 8bit characters -OR- the content is valid + * UTF8, we can still imply Content-Type == text/plain + */ + + if (rspamd_str_has_8bit(part->raw_data.begin, part->raw_data.len) && + !rspamd_fast_utf8_validate(part->raw_data.begin, part->raw_data.len)) { + part->ct->flags &= ~RSPAMD_CONTENT_TYPE_TEXT; + part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN; + } + } + } + + if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) { + /* Need to copy text as we have couple of in-place change functions */ + parsed = rspamd_fstring_sized_new(part->raw_data.len); + parsed->len = part->raw_data.len; + memcpy(parsed->str, part->raw_data.begin, parsed->len); + part->parsed_data.begin = parsed->str; + part->parsed_data.len = parsed->len; + rspamd_mempool_notify_alloc(task->task_pool, parsed->len); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed); + } + else { + part->parsed_data.begin = part->raw_data.begin; + part->parsed_data.len = part->raw_data.len; + } + break; + case RSPAMD_CTE_QP: + parsed = rspamd_fstring_sized_new(part->raw_data.len); + r = rspamd_decode_qp_buf(part->raw_data.begin, part->raw_data.len, + parsed->str, parsed->allocated); + if (r != -1) { + parsed->len = r; + part->parsed_data.begin = parsed->str; + part->parsed_data.len = parsed->len; + rspamd_mempool_notify_alloc(task->task_pool, parsed->len); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed); + } + else { + msg_err_task("invalid quoted-printable encoded part, assume 8bit"); + if (part->ct) { + part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN; + } + part->cte = RSPAMD_CTE_8BIT; + memcpy(parsed->str, part->raw_data.begin, 
part->raw_data.len); + parsed->len = part->raw_data.len; + part->parsed_data.begin = parsed->str; + part->parsed_data.len = parsed->len; + rspamd_mempool_notify_alloc(task->task_pool, parsed->len); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed); + } + break; + case RSPAMD_CTE_B64: + parsed = rspamd_fstring_sized_new(part->raw_data.len / 4 * 3 + 12); + rspamd_cryptobox_base64_decode(part->raw_data.begin, + part->raw_data.len, + parsed->str, &parsed->len); + part->parsed_data.begin = parsed->str; + part->parsed_data.len = parsed->len; + rspamd_mempool_notify_alloc(task->task_pool, parsed->len); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed); + break; + case RSPAMD_CTE_UUE: + parsed = rspamd_fstring_sized_new(part->raw_data.len / 4 * 3 + 12); + r = rspamd_decode_uue_buf(part->raw_data.begin, part->raw_data.len, + parsed->str, parsed->allocated); + rspamd_mempool_notify_alloc(task->task_pool, parsed->len); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed); + if (r != -1) { + parsed->len = r; + part->parsed_data.begin = parsed->str; + part->parsed_data.len = parsed->len; + } + else { + msg_err_task("invalid uuencoding in encoded part, assume 8bit"); + if (part->ct) { + part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN; + } + part->cte = RSPAMD_CTE_8BIT; + parsed->len = MIN(part->raw_data.len, parsed->allocated); + memcpy(parsed->str, part->raw_data.begin, parsed->len); + rspamd_mempool_notify_alloc(task->task_pool, parsed->len); + part->parsed_data.begin = parsed->str; + part->parsed_data.len = parsed->len; + } + break; + default: + g_assert_not_reached(); + } + + part->part_number = MESSAGE_FIELD(task, parts)->len; + part->urls = g_ptr_array_new(); + g_ptr_array_add(MESSAGE_FIELD(task, parts), part); + msg_debug_mime("parsed data part %T/%T of length %z (%z orig), %s cte", + &part->ct->type, 
&part->ct->subtype, part->parsed_data.len, + part->raw_data.len, rspamd_cte_to_string(part->cte)); + rspamd_mime_parser_calc_digest(part); + + if (ct && (ct->flags & RSPAMD_CONTENT_TYPE_SMIME)) { + CMS_ContentInfo *cms; + const unsigned char *der_beg = part->parsed_data.begin; + cms = d2i_CMS_ContentInfo(NULL, &der_beg, part->parsed_data.len); + + if (cms) { + const ASN1_OBJECT *asn_ct = CMS_get0_eContentType(cms); + int ct_nid = OBJ_obj2nid(asn_ct); + + if (ct_nid == NID_pkcs7_data) { + BIO *bio = BIO_new_mem_buf(part->parsed_data.begin, + part->parsed_data.len); + + PKCS7 *p7; + p7 = d2i_PKCS7_bio(bio, NULL); + + if (p7) { + ct_nid = OBJ_obj2nid(p7->type); + + if (ct_nid == NID_pkcs7_signed) { + PKCS7 *p7_signed_content = p7->d.sign->contents; + + ct_nid = OBJ_obj2nid(p7_signed_content->type); + + if (ct_nid == NID_pkcs7_data && p7_signed_content->d.data) { + int ret; + + msg_debug_mime("found an additional part inside of " + "smime structure of type %T/%T; length=%d", + &ct->type, &ct->subtype, p7_signed_content->d.data->length); + /* + * Since ASN.1 structures are freed, we need to copy + * the content + */ + gchar *cpy = rspamd_mempool_alloc(task->task_pool, + p7_signed_content->d.data->length); + memcpy(cpy, p7_signed_content->d.data->data, + p7_signed_content->d.data->length); + ret = rspamd_mime_process_multipart_node(task, + st, NULL, + cpy, cpy + p7_signed_content->d.data->length, + TRUE, err); + + PKCS7_free(p7); + BIO_free(bio); + CMS_ContentInfo_free(cms); + + return ret; + } + } + + PKCS7_free(p7); + } + + BIO_free(bio); + } + + CMS_ContentInfo_free(cms); + } + } + + return RSPAMD_MIME_PARSE_OK; +} + +struct rspamd_mime_multipart_cbdata { + struct rspamd_task *task; + struct rspamd_mime_part *multipart; + struct rspamd_mime_parser_ctx *st; + const gchar *part_start; + rspamd_ftok_t *cur_boundary; + guint64 bhash; + GError **err; +}; + +static enum rspamd_mime_parse_error +rspamd_mime_process_multipart_node(struct rspamd_task *task, + struct 
rspamd_mime_parser_ctx *st, + struct rspamd_mime_part *multipart, + const gchar *start, const gchar *end, + gboolean is_finished, + GError **err) +{ + struct rspamd_content_type *ct, *sel = NULL; + struct rspamd_mime_header *hdr = NULL, *cur; + struct rspamd_mime_part *npart; + GString str; + goffset hdr_pos, body_pos; + enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_FATAL; + + + str.str = (gchar *) start; + str.len = end - start; + + if (*start == '\n' || *start == '\r') { + /* + * We have a part that starts from newline which means that + * there are completely no headers in this part, + * hence we assume it as a text part + */ + hdr_pos = 0; + body_pos = 0; + + if (!is_finished) { + /* Ignore garbage */ + const gchar *p = start; + gboolean seen_something = FALSE; + + while (p < end) { + if (g_ascii_isalnum(*p)) { + seen_something = TRUE; + break; + } + p++; + } + + if (!seen_something) { + return RSPAMD_MIME_PARSE_NO_PART; + } + } + } + else { + hdr_pos = rspamd_string_find_eoh(&str, &body_pos); + } + + npart = rspamd_mempool_alloc0(task->task_pool, + sizeof(struct rspamd_mime_part)); + npart->parent_part = multipart; + npart->raw_headers = rspamd_message_headers_new(); + npart->headers_order = NULL; + + if (multipart) { + if (multipart->specific.mp->children == NULL) { + multipart->specific.mp->children = g_ptr_array_sized_new(2); + } + + g_ptr_array_add(multipart->specific.mp->children, npart); + } + + if (hdr_pos > 0 && hdr_pos < str.len) { + npart->raw_headers_str = str.str; + npart->raw_headers_len = hdr_pos; + npart->raw_data.begin = start + body_pos; + npart->raw_data.len = (end - start) - body_pos; + + if (npart->raw_headers_len > 0) { + rspamd_mime_headers_process(task, npart->raw_headers, + &npart->headers_order, + npart->raw_headers_str, + npart->raw_headers_len, + FALSE); + + /* Preserve the natural order */ + if (npart->headers_order) { + LL_REVERSE2(npart->headers_order, ord_next); + } + } + + hdr = 
rspamd_message_get_header_from_hash(npart->raw_headers, + "Content-Type", FALSE); + } + else { + npart->raw_headers_str = 0; + npart->raw_headers_len = 0; + npart->raw_data.begin = start; + npart->raw_data.len = end - start; + } + + + if (hdr != NULL) { + + DL_FOREACH(hdr, cur) + { + ct = rspamd_content_type_parse(cur->value, strlen(cur->value), + task->task_pool); + + /* Here we prefer multipart content-type or any content-type */ + if (ct) { + if (sel == NULL) { + sel = ct; + } + else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) { + sel = ct; + } + } + } + } + + if (sel == NULL) { + sel = rspamd_mempool_alloc0(task->task_pool, sizeof(*sel)); + RSPAMD_FTOK_ASSIGN(&sel->type, "text"); + RSPAMD_FTOK_ASSIGN(&sel->subtype, "plain"); + } + + npart->ct = sel; + + if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) { + st->nesting++; + g_ptr_array_add(st->stack, npart); + npart->part_type = RSPAMD_MIME_PART_MULTIPART; + npart->specific.mp = rspamd_mempool_alloc0(task->task_pool, + sizeof(struct rspamd_mime_multipart)); + memcpy(&npart->specific.mp->boundary, &sel->orig_boundary, + sizeof(rspamd_ftok_t)); + ret = rspamd_mime_parse_multipart_part(task, npart, st, err); + } + else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) { + st->nesting++; + g_ptr_array_add(st->stack, npart); + npart->part_type = RSPAMD_MIME_PART_MESSAGE; + + if ((ret = rspamd_mime_parse_normal_part(task, npart, st, sel, err)) == RSPAMD_MIME_PARSE_OK) { + ret = rspamd_mime_parse_message(task, npart, st, err); + } + } + else { + ret = rspamd_mime_parse_normal_part(task, npart, st, sel, err); + } + + return ret; +} + +static enum rspamd_mime_parse_error +rspamd_mime_parse_multipart_cb(struct rspamd_task *task, + struct rspamd_mime_part *multipart, + struct rspamd_mime_parser_ctx *st, + struct rspamd_mime_multipart_cbdata *cb, + struct rspamd_mime_boundary *b) +{ + const gchar *pos = st->start + b->boundary; + enum rspamd_mime_parse_error ret; + + task = cb->task; + + /* Now check boundary */ + if 
(!cb->part_start) { + cb->part_start = st->start + b->start; + st->pos = cb->part_start; + } + else { + /* + * We have seen the start of the boundary, + * but it might be unsuitable (e.g. in broken headers) + */ + if (cb->part_start < pos && cb->cur_boundary) { + + if ((ret = rspamd_mime_process_multipart_node(task, cb->st, + cb->multipart, cb->part_start, pos, TRUE, cb->err)) != RSPAMD_MIME_PARSE_OK) { + return ret; + } + + if (b->start > 0) { + /* Go towards the next part */ + cb->part_start = st->start + b->start; + cb->st->pos = cb->part_start; + } + } + else { + /* We have an empty boundary, do nothing */ + } + } + + return RSPAMD_MIME_PARSE_OK; +} + +static enum rspamd_mime_parse_error +rspamd_multipart_boundaries_filter(struct rspamd_task *task, + struct rspamd_mime_part *multipart, + struct rspamd_mime_parser_ctx *st, + struct rspamd_mime_multipart_cbdata *cb) +{ + struct rspamd_mime_boundary *cur; + goffset last_offset; + guint i, sel = 0; + enum rspamd_mime_parse_error ret; + + last_offset = (multipart->raw_data.begin - st->start) + + multipart->raw_data.len; + + /* Find the first offset suitable for this part */ + for (i = 0; i < st->boundaries->len; i++) { + cur = &g_array_index(st->boundaries, struct rspamd_mime_boundary, i); + + if (cur->start >= multipart->raw_data.begin - st->start) { + if (cb->cur_boundary) { + /* Check boundary */ + msg_debug_mime("compare %L and %L (and %L)", + cb->bhash, cur->hash, cur->closed_hash); + + if (cb->bhash == cur->hash) { + sel = i; + break; + } + else if (cb->bhash == cur->closed_hash) { + /* Not a closing element in fact */ + cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED); + cur->hash = cur->closed_hash; + sel = i; + break; + } + } + else { + /* Set current boundary */ + cb->cur_boundary = rspamd_mempool_alloc(task->task_pool, + sizeof(rspamd_ftok_t)); + cb->cur_boundary->begin = st->start + cur->boundary; + cb->cur_boundary->len = 0; + cb->bhash = cur->hash; + sel = i; + break; + } + } + } + + /* Now we can go 
forward with boundaries that are same to what we have */ + for (i = sel; i < st->boundaries->len; i++) { + cur = &g_array_index(st->boundaries, struct rspamd_mime_boundary, i); + + if (cur->boundary > last_offset) { + break; + } + + if (cur->hash == cb->bhash || cur->closed_hash == cb->bhash) { + if ((ret = rspamd_mime_parse_multipart_cb(task, multipart, st, + cb, cur)) != RSPAMD_MIME_PARSE_OK) { + return ret; + } + + if (cur->closed_hash == cb->bhash) { + /* We have again fake closed hash */ + cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED); + cur->hash = cur->closed_hash; + } + + if (RSPAMD_BOUNDARY_IS_CLOSED(cur)) { + /* We also might check the next boundary... */ + if (i < st->boundaries->len - 1) { + cur = &g_array_index(st->boundaries, + struct rspamd_mime_boundary, i + 1); + + if (cur->hash == cb->bhash) { + continue; + } + else if (cur->closed_hash == cb->bhash) { + /* We have again fake closed hash */ + cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED); + cur->hash = cur->closed_hash; + continue; + } + } + + break; + } + } + } + + if (i == st->boundaries->len && cb->cur_boundary) { + /* Process the last part */ + struct rspamd_mime_boundary fb; + + fb.boundary = last_offset; + fb.start = -1; + + if ((ret = rspamd_mime_parse_multipart_cb(task, multipart, st, + cb, &fb)) != RSPAMD_MIME_PARSE_OK) { + return ret; + } + } + + return RSPAMD_MIME_PARSE_OK; +} + +static enum rspamd_mime_parse_error +rspamd_mime_parse_multipart_part(struct rspamd_task *task, + struct rspamd_mime_part *part, + struct rspamd_mime_parser_ctx *st, + GError **err) +{ + struct rspamd_mime_multipart_cbdata cbdata; + enum rspamd_mime_parse_error ret; + + if (st->nesting > max_nested) { + g_set_error(err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d", + st->nesting); + return RSPAMD_MIME_PARSE_NESTING; + } + + part->part_number = MESSAGE_FIELD(task, parts)->len; + part->urls = g_ptr_array_new(); + g_ptr_array_add(MESSAGE_FIELD(task, parts), part); + st->nesting++; + 
rspamd_mime_part_get_cte(task, part->raw_headers, part, FALSE); + + st->pos = part->raw_data.begin; + cbdata.multipart = part; + cbdata.task = task; + cbdata.st = st; + cbdata.part_start = NULL; + cbdata.err = err; + + if (part->ct->boundary.len > 0) { + /* We know our boundary */ + cbdata.cur_boundary = &part->ct->boundary; + rspamd_cryptobox_siphash((guchar *) &cbdata.bhash, + cbdata.cur_boundary->begin, cbdata.cur_boundary->len, + lib_ctx->hkey); + msg_debug_mime("hash: %T -> %L", cbdata.cur_boundary, cbdata.bhash); + } + else { + /* Guess boundary */ + cbdata.cur_boundary = NULL; + cbdata.bhash = 0; + } + + ret = rspamd_multipart_boundaries_filter(task, part, st, &cbdata); + /* Cleanup stack */ + st->nesting--; + g_ptr_array_remove_index_fast(st->stack, st->stack->len - 1); + + return ret; +} + +/* Process boundary like structures in a message */ +static gint +rspamd_mime_preprocess_cb(struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) +{ + const gchar *end = text + len, *p = text + match_pos, *bend; + gsize blen; + gboolean closing = FALSE; + struct rspamd_mime_boundary b; + struct rspamd_mime_parser_ctx *st = context; + struct rspamd_task *task; + + task = st->task; + + if (G_LIKELY(p < end)) { + + blen = 0; + + while (p < end) { + if (*p == '\r' || *p == '\n') { + break; + } + + blen++; + p++; + } + + if (blen > 0) { + /* We have found something like boundary */ + p = text + match_pos; + bend = p + blen - 1; + + if (*bend == '-') { + /* We need to verify last -- */ + if (bend > p + 1 && *(bend - 1) == '-') { + closing = TRUE; + bend--; + blen -= 2; + } + else { + /* Not a closing boundary somehow, e.g. 
if a boundary=='-' */ + bend++; + } + } + else { + bend++; + } + + while (bend < end) { + if (*bend == '\r') { + bend++; + + /* \r\n */ + if (bend < end && *bend == '\n') { + bend++; + } + } + else if (*bend == '\n') { + /* \n */ + bend++; + } + else if (g_ascii_isspace(*bend)) { + /* Spaces in the same line, skip them */ + bend++; + continue; + } + + break; + } + + b.boundary = p - st->start - 2; + b.start = bend - st->start; + + /* Small optimisation as boundaries are usually short strings */ + gchar *lc_copy, lc_copy_buf[128]; + + if (blen + 2 < sizeof(lc_copy_buf)) { + lc_copy = lc_copy_buf; + } + else { + lc_copy = g_malloc(blen + 2); + } + + if (closing) { + memcpy(lc_copy, p, blen + 2); + rspamd_str_lc(lc_copy, blen + 2); + } + else { + memcpy(lc_copy, p, blen); + rspamd_str_lc(lc_copy, blen); + } + + rspamd_cryptobox_siphash((guchar *) &b.hash, lc_copy, blen, + lib_ctx->hkey); + msg_debug_mime("normal hash: %*s -> %L, %d boffset, %d data offset", + (gint) blen, lc_copy, b.hash, (int) b.boundary, (int) b.start); + + if (closing) { + b.flags = RSPAMD_MIME_BOUNDARY_FLAG_CLOSED; + rspamd_cryptobox_siphash((guchar *) &b.closed_hash, lc_copy, + blen + 2, + lib_ctx->hkey); + msg_debug_mime("closing hash: %*s -> %L, %d boffset, %d data offset", + (gint) blen + 2, lc_copy, + b.closed_hash, + (int) b.boundary, (int) b.start); + } + else { + b.flags = 0; + b.closed_hash = 0; + } + + /* Check if a string has been allocated on the heap */ + if (blen + 2 >= sizeof(lc_copy_buf)) { + g_free(lc_copy); + } + g_array_append_val(st->boundaries, b); + } + } + + return 0; +} + +static goffset +rspamd_mime_parser_headers_heuristic(GString *input, goffset *body_start) +{ + const gsize default_max_len = 76; + gsize max_len = MIN(input->len, default_max_len); + const gchar *p, *end; + enum { + st_before_colon = 0, + st_colon, + st_spaces_after_colon, + st_value, + st_error + } state = st_before_colon; + + p = input->str; + end = p + max_len; + + while (p < end) { + switch (state) { 
+ case st_before_colon: + if (G_UNLIKELY(*p == ':')) { + state = st_colon; + } + else if (G_UNLIKELY(!g_ascii_isgraph(*p))) { + state = st_error; + } + + p++; + break; + case st_colon: + if (g_ascii_isspace(*p)) { + state = st_spaces_after_colon; + } + else { + state = st_value; + } + p++; + break; + case st_spaces_after_colon: + if (!g_ascii_isspace(*p)) { + state = st_value; + } + p++; + break; + case st_value: + /* We accept any value */ + goto end; + break; + case st_error: + return (-1); + break; + } + } + +end: + if (state == st_value) { + if (body_start) { + *body_start = input->len; + } + + return input->len; + } + + return (-1); +} + +static void +rspamd_mime_preprocess_message(struct rspamd_task *task, + struct rspamd_mime_part *top, + struct rspamd_mime_parser_ctx *st) +{ + + if (top->raw_data.begin >= st->pos) { + rspamd_multipattern_lookup(lib_ctx->mp_boundary, + top->raw_data.begin - 1, + top->raw_data.len + 1, + rspamd_mime_preprocess_cb, st, NULL); + } + else { + rspamd_multipattern_lookup(lib_ctx->mp_boundary, + st->pos, + st->end - st->pos, + rspamd_mime_preprocess_cb, st, NULL); + } +} + +static void +rspamd_mime_parse_stack_free(struct rspamd_mime_parser_ctx *st) +{ + if (st) { + g_ptr_array_free(st->stack, TRUE); + g_array_free(st->boundaries, TRUE); + g_free(st); + } +} + +static enum rspamd_mime_parse_error +rspamd_mime_parse_message(struct rspamd_task *task, + struct rspamd_mime_part *part, + struct rspamd_mime_parser_ctx *st, + GError **err) +{ + struct rspamd_content_type *ct, *sel = NULL; + struct rspamd_mime_header *hdr = NULL, *cur; + const gchar *pbegin, *p; + gsize plen, len; + struct rspamd_mime_part *npart; + goffset hdr_pos, body_pos; + guint i; + enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK; + GString str; + struct rspamd_mime_parser_ctx *nst = st; + + if (st->nesting > max_nested) { + g_set_error(err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d", + st->nesting); + return RSPAMD_MIME_PARSE_NESTING; + } + + 
/* Allocate real part */ + npart = rspamd_mempool_alloc0(task->task_pool, + sizeof(struct rspamd_mime_part)); + + if (part == NULL) { + /* Top level message */ + p = task->msg.begin; + len = task->msg.len; + + str.str = (gchar *) p; + str.len = len; + + hdr_pos = rspamd_string_find_eoh(&str, &body_pos); + + if (hdr_pos > 0 && hdr_pos < str.len) { + + MESSAGE_FIELD(task, raw_headers_content).begin = str.str; + MESSAGE_FIELD(task, raw_headers_content).len = hdr_pos; + MESSAGE_FIELD(task, raw_headers_content).body_start = str.str + body_pos; + + if (MESSAGE_FIELD(task, raw_headers_content).len > 0) { + rspamd_mime_headers_process(task, + MESSAGE_FIELD(task, raw_headers), + &MESSAGE_FIELD(task, headers_order), + MESSAGE_FIELD(task, raw_headers_content).begin, + MESSAGE_FIELD(task, raw_headers_content).len, + TRUE); + npart->raw_headers = rspamd_message_headers_ref( + MESSAGE_FIELD(task, raw_headers)); + + /* Preserve the natural order */ + if (MESSAGE_FIELD(task, headers_order)) { + LL_REVERSE2(MESSAGE_FIELD(task, headers_order), ord_next); + } + } + + hdr = rspamd_message_get_header_from_hash( + MESSAGE_FIELD(task, raw_headers), + "Content-Type", FALSE); + } + else { + /* First apply heuristic, maybe we have just headers */ + hdr_pos = rspamd_mime_parser_headers_heuristic(&str, &body_pos); + + if (hdr_pos > 0 && hdr_pos <= str.len) { + MESSAGE_FIELD(task, raw_headers_content).begin = str.str; + MESSAGE_FIELD(task, raw_headers_content).len = hdr_pos; + MESSAGE_FIELD(task, raw_headers_content).body_start = str.str + + body_pos; + + if (MESSAGE_FIELD(task, raw_headers_content).len > 0) { + rspamd_mime_headers_process(task, + MESSAGE_FIELD(task, raw_headers), + &MESSAGE_FIELD(task, headers_order), + MESSAGE_FIELD(task, raw_headers_content).begin, + MESSAGE_FIELD(task, raw_headers_content).len, + TRUE); + npart->raw_headers = rspamd_message_headers_ref( + MESSAGE_FIELD(task, raw_headers)); + + /* Preserve the natural order */ + if (MESSAGE_FIELD(task, headers_order)) { + 
LL_REVERSE2(MESSAGE_FIELD(task, headers_order), ord_next); + } + } + + hdr = rspamd_message_get_header_from_hash( + MESSAGE_FIELD(task, raw_headers), + "Content-Type", FALSE); + task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS; + } + else { + body_pos = 0; + } + } + + pbegin = st->start + body_pos; + plen = st->end - pbegin; + npart->headers_order = NULL; + } + else { + /* + * Here are dragons: + * We allocate new parser context as we need to shift pointers + */ + nst = g_malloc0(sizeof(*st)); + nst->stack = g_ptr_array_sized_new(4); + nst->boundaries = g_array_sized_new(FALSE, FALSE, + sizeof(struct rspamd_mime_boundary), 8); + nst->start = part->parsed_data.begin; + nst->end = nst->start + part->parsed_data.len; + nst->pos = nst->start; + nst->task = st->task; + nst->nesting = st->nesting; + st->nesting++; + + str.str = (gchar *) part->parsed_data.begin; + str.len = part->parsed_data.len; + + hdr_pos = rspamd_string_find_eoh(&str, &body_pos); + npart->raw_headers = rspamd_message_headers_new(); + npart->headers_order = NULL; + + if (hdr_pos > 0 && hdr_pos < str.len) { + npart->raw_headers_str = str.str; + npart->raw_headers_len = hdr_pos; + npart->raw_data.begin = str.str + body_pos; + + if (npart->raw_headers_len > 0) { + rspamd_mime_headers_process(task, + npart->raw_headers, + &npart->headers_order, + npart->raw_headers_str, + npart->raw_headers_len, + FALSE); + + /* Preserve the natural order */ + if (npart->headers_order) { + LL_REVERSE2(npart->headers_order, ord_next); + } + } + + hdr = rspamd_message_get_header_from_hash(npart->raw_headers, + "Content-Type", FALSE); + } + else { + body_pos = 0; + } + + pbegin = part->parsed_data.begin + body_pos; + plen = part->parsed_data.len - body_pos; + } + + npart->raw_data.begin = pbegin; + npart->raw_data.len = plen; + npart->parent_part = part; + + if (hdr == NULL) { + sel = NULL; + } + else { + DL_FOREACH(hdr, cur) + { + ct = rspamd_content_type_parse(cur->value, strlen(cur->value), + task->task_pool); + + /* Here 
we prefer multipart content-type or any content-type */ + if (ct) { + if (sel == NULL) { + sel = ct; + } + else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) { + sel = ct; + } + } + } + } + + if (sel == NULL) { + /* For messages we automatically assume plaintext */ + msg_info_task("cannot find content-type for a message, assume text/plain"); + sel = rspamd_mempool_alloc0(task->task_pool, sizeof(*sel)); + sel->flags = RSPAMD_CONTENT_TYPE_TEXT | RSPAMD_CONTENT_TYPE_MISSING; + RSPAMD_FTOK_ASSIGN(&sel->type, "text"); + RSPAMD_FTOK_ASSIGN(&sel->subtype, "plain"); + } + + npart->ct = sel; + + if ((part == NULL || nst != st) && + (sel->flags & (RSPAMD_CONTENT_TYPE_MULTIPART | RSPAMD_CONTENT_TYPE_MESSAGE))) { + /* Not a trivial message, need to preprocess */ + rspamd_mime_preprocess_message(task, npart, nst); + } + + if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) { + g_ptr_array_add(nst->stack, npart); + nst->nesting++; + npart->part_type = RSPAMD_MIME_PART_MULTIPART; + npart->specific.mp = rspamd_mempool_alloc0(task->task_pool, + sizeof(struct rspamd_mime_multipart)); + memcpy(&npart->specific.mp->boundary, &sel->orig_boundary, + sizeof(rspamd_ftok_t)); + ret = rspamd_mime_parse_multipart_part(task, npart, nst, err); + } + else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) { + if ((ret = rspamd_mime_parse_normal_part(task, npart, nst, sel, err)) == RSPAMD_MIME_PARSE_OK) { + npart->part_type = RSPAMD_MIME_PART_MESSAGE; + ret = rspamd_mime_parse_message(task, npart, nst, err); + } + } + else { + ret = rspamd_mime_parse_normal_part(task, npart, nst, sel, err); + } + + if (ret != RSPAMD_MIME_PARSE_OK) { + return ret; + } + + if (part && st->stack->len > 0) { + /* Remove message part from the parent stack */ + g_ptr_array_remove_index_fast(st->stack, st->stack->len - 1); + st->nesting--; + } + + /* Process leftovers for boundaries */ + if (nst->boundaries) { + struct rspamd_mime_boundary *boundary, *start_boundary = NULL, + *end_boundary = NULL; + goffset cur_offset = 
nst->pos - nst->start, + end_offset = st->end - st->start; + guint sel_idx = 0; + + for (;;) { + start_boundary = NULL; + + for (i = sel_idx; i < nst->boundaries->len; i++) { + boundary = &g_array_index(nst->boundaries, + struct rspamd_mime_boundary, i); + + if (boundary->start > cur_offset && + boundary->boundary < end_offset && + !RSPAMD_BOUNDARY_IS_CLOSED(boundary)) { + start_boundary = boundary; + sel_idx = i; + break; + } + } + + if (start_boundary) { + const gchar *start, *end; + + if (nst->boundaries->len > sel_idx + 1) { + end_boundary = &g_array_index(nst->boundaries, + struct rspamd_mime_boundary, sel_idx + 1); + end = nst->start + end_boundary->boundary; + } + else { + end = nst->end; + } + + sel_idx++; + + start = nst->start + start_boundary->start; + + if (end > start && + (ret = rspamd_mime_process_multipart_node(task, nst, + NULL, start, end, FALSE, err)) != RSPAMD_MIME_PARSE_OK) { + + if (nst != st) { + rspamd_mime_parse_stack_free(nst); + } + + if (ret == RSPAMD_MIME_PARSE_NO_PART) { + return RSPAMD_MIME_PARSE_OK; + } + + return ret; + } + } + else { + break; + } + } + } + + if (nst != st) { + rspamd_mime_parse_stack_free(nst); + } + + return ret; +} + +enum rspamd_mime_parse_error +rspamd_mime_parse_task(struct rspamd_task *task, GError **err) +{ + struct rspamd_mime_parser_ctx *st; + enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK; + + if (lib_ctx == NULL) { + rspamd_mime_parser_init_lib(); + } + + if (++lib_ctx->key_usages > max_key_usages) { + /* Regenerate siphash key */ + ottery_rand_bytes(lib_ctx->hkey, sizeof(lib_ctx->hkey)); + lib_ctx->key_usages = 0; + } + + st = g_malloc0(sizeof(*st)); + st->stack = g_ptr_array_sized_new(4); + st->pos = MESSAGE_FIELD(task, raw_headers_content).body_start; + st->end = task->msg.begin + task->msg.len; + st->boundaries = g_array_sized_new(FALSE, FALSE, + sizeof(struct rspamd_mime_boundary), 8); + st->task = task; + + if (st->pos == NULL) { + st->pos = task->msg.begin; + } + + st->start = 
task->msg.begin; + ret = rspamd_mime_parse_message(task, NULL, st, err); + rspamd_mime_parse_stack_free(st); + + return ret; +} diff --git a/src/libmime/mime_parser.h b/src/libmime/mime_parser.h new file mode 100644 index 0000000..aa77b2b --- /dev/null +++ b/src/libmime/mime_parser.h @@ -0,0 +1,46 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBMIME_MIME_PARSER_H_ +#define SRC_LIBMIME_MIME_PARSER_H_ + +#include "config.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_task; +struct rspamd_mime_part; + +enum rspamd_mime_parse_error { + RSPAMD_MIME_PARSE_OK = 0, + RSPAMD_MIME_PARSE_FATAL, + RSPAMD_MIME_PARSE_NESTING, + RSPAMD_MIME_PARSE_NO_PART, +}; + +enum rspamd_mime_parse_error rspamd_mime_parse_task(struct rspamd_task *task, + GError **err); + +void rspamd_mime_parser_calc_digest(struct rspamd_mime_part *part); + + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBMIME_MIME_PARSER_H_ */ diff --git a/src/libmime/mime_string.cxx b/src/libmime/mime_string.cxx new file mode 100644 index 0000000..e818e64 --- /dev/null +++ b/src/libmime/mime_string.cxx @@ -0,0 +1,167 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" +#include "mime_string.hxx" +#include "unicode/uchar.h" + +TEST_SUITE("mime_string") +{ + using namespace rspamd::mime; + TEST_CASE("mime_string unfiltered ctors") + { + SUBCASE("empty") + { + mime_string st; + CHECK(st.size() == 0); + CHECK(st == ""); + } + SUBCASE("unfiltered valid") + { + mime_string st{std::string_view("abcd")}; + CHECK(st == "abcd"); + } + SUBCASE("unfiltered zero character") + { + mime_string st{"abc\0d", 5}; + CHECK(st.has_zeroes()); + CHECK(st == "abcd"); + } + SUBCASE("unfiltered invalid character - middle") + { + mime_string st{std::string("abc\234d")}; + CHECK(st.has_invalid()); + CHECK(st == "abc\uFFFDd"); + } + SUBCASE("unfiltered invalid character - end") + { + mime_string st{std::string("abc\234")}; + CHECK(st.has_invalid()); + CHECK(st == "abc\uFFFD"); + } + SUBCASE("unfiltered invalid character - start") + { + mime_string st{std::string("\234abc")}; + CHECK(st.has_invalid()); + CHECK(st == "\uFFFDabc"); + } + } + + TEST_CASE("mime_string filtered ctors") + { + auto print_filter = [](UChar32 inp) -> UChar32 { + if (!u_isprint(inp)) { + return 0; + } + + return inp; + }; + + auto tolower_filter = [](UChar32 inp) -> UChar32 { + return u_tolower(inp); + }; + + SUBCASE("empty") + { + mime_string st{std::string_view(""), tolower_filter}; + CHECK(st.size() == 0); + CHECK(st == ""); + } + SUBCASE("filtered valid") + { + mime_string st{std::string("AbCdУ"), tolower_filter}; + CHECK(st == "abcdу"); + } + SUBCASE("filtered invalid + filtered") 
+ { + mime_string st{std::string("abcd\234\1"), print_filter}; + CHECK(st == "abcd\uFFFD"); + } + } + TEST_CASE("mime_string assign") + { + SUBCASE("assign from valid") + { + mime_string st; + + CHECK(st.assign_if_valid(std::string("test"))); + CHECK(st == "test"); + } + SUBCASE("assign from invalid") + { + mime_string st; + + CHECK(!st.assign_if_valid(std::string("test\234t"))); + CHECK(st == ""); + } + } + + TEST_CASE("mime_string iterators") + { + + SUBCASE("unfiltered iterator ascii") + { + auto in = std::string("abcd"); + mime_string st{in}; + CHECK(st == "abcd"); + + int i = 0; + for (auto &&c: st) { + CHECK(c == in[i++]); + } + } + + SUBCASE("unfiltered iterator utf8") + { + auto in = std::string("тест"); + UChar32 ucs[4] = {1090, 1077, 1089, 1090}; + mime_string st{in}; + CHECK(st == "тест"); + + int i = 0; + for (auto &&c: st) { + CHECK(c == ucs[i++]); + } + CHECK(i == sizeof(ucs) / sizeof(ucs[0])); + } + + SUBCASE("unfiltered raw iterator ascii") + { + auto in = std::string("abcd"); + mime_string st{in}; + CHECK(st == "abcd"); + + int i = 0; + for (auto it = st.raw_begin(); it != st.raw_end(); ++it) { + CHECK(*it == in[i++]); + } + } + + SUBCASE("unfiltered raw iterator utf8") + { + auto in = std::string("тест"); + mime_string st{in}; + CHECK(st == "тест"); + + int i = 0; + for (auto it = st.raw_begin(); it != st.raw_end(); ++it) { + CHECK(*it == in[i++]); + } + CHECK(i == in.size()); + } + } +}
\ No newline at end of file diff --git a/src/libmime/mime_string.hxx b/src/libmime/mime_string.hxx new file mode 100644 index 0000000..7476816 --- /dev/null +++ b/src/libmime/mime_string.hxx @@ -0,0 +1,670 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef RSPAMD_MIME_STRING_HXX +#define RSPAMD_MIME_STRING_HXX +#pragma once + +#include <algorithm> +#include <string> +#include <string_view> +#include <memory> +#include <cstdint> +#include <cstdlib> +#include <cstring> +#include <iosfwd> +#include "libutil/mem_pool.h" +#include "function2/function2.hpp" +#include "unicode/utf8.h" +#include "contrib/fastutf8/fastutf8.h" + +namespace rspamd::mime { +/* + * The motivation for another string is to have utf8 valid string replacing + * all bad things with FFFFD replacement character and filtering \0 and other + * strange stuff defined by policies. + * This string always exclude \0 characters and ignore them! This is how MUA acts, + * and we also store a flag about bad characters. + * Mime string iterators are always const, so the underlying storage should not + * be modified externally. 
+ */ +template<class T = char, class Allocator = std::allocator<T>, + class Functor = fu2::function_view<UChar32(UChar32)>> +class basic_mime_string; + +using mime_string = basic_mime_string<char>; +using mime_pool_string = basic_mime_string<char, mempool_allocator<char>>; + +/* Helpers for type safe flags */ +enum class mime_string_flags : std::uint8_t { + MIME_STRING_DEFAULT = 0, + MIME_STRING_SEEN_ZEROES = 0x1 << 0, + MIME_STRING_SEEN_INVALID = 0x1 << 1, +}; + +constexpr mime_string_flags operator|(mime_string_flags lhs, mime_string_flags rhs) +{ + using ut = std::underlying_type<mime_string_flags>::type; + return static_cast<mime_string_flags>(static_cast<ut>(lhs) | static_cast<ut>(rhs)); +} + +constexpr mime_string_flags operator&(mime_string_flags lhs, mime_string_flags rhs) +{ + using ut = std::underlying_type<mime_string_flags>::type; + return static_cast<mime_string_flags>(static_cast<ut>(lhs) & static_cast<ut>(rhs)); +} + +constexpr bool operator!(mime_string_flags fl) +{ + return fl == mime_string_flags::MIME_STRING_DEFAULT; +} + +// Codepoint iterator base class +template<typename Container, bool Raw = false> +struct iterator_base { + template<typename, typename, typename> + friend class basic_mime_string; + +public: + using value_type = typename Container::value_type; + using difference_type = typename Container::difference_type; + using codepoint_type = typename Container::codepoint_type; + using reference_type = codepoint_type; + using iterator_category = std::bidirectional_iterator_tag; + + bool operator==(const iterator_base &it) const noexcept + { + return idx == it.idx; + } + + bool operator!=(const iterator_base &it) const noexcept + { + return idx != it.idx; + } + + iterator_base(difference_type index, Container *instance) noexcept + : idx(index), cont_instance(instance) + { + } + iterator_base() noexcept = default; + iterator_base(const iterator_base &) noexcept = default; + + iterator_base &operator=(const iterator_base &) noexcept = default; 
+ + Container *get_instance() const noexcept + { + return cont_instance; + } + + codepoint_type get_value() const noexcept + { + auto i = idx; + codepoint_type uc; + U8_NEXT_UNSAFE(cont_instance->data(), i, uc); + return uc; + } + +protected: + difference_type idx; + Container *cont_instance = nullptr; + +protected: + void advance(difference_type n) noexcept + { + if (n > 0) { + U8_FWD_N_UNSAFE(cont_instance->data(), idx, n); + } + else if (n < 0) { + U8_BACK_N_UNSAFE(cont_instance->data(), idx, (-n)); + } + } + void increment() noexcept + { + codepoint_type uc; + U8_NEXT_UNSAFE(cont_instance->data(), idx, uc); + } + + void decrement() noexcept + { + codepoint_type uc; + U8_PREV_UNSAFE(cont_instance->data(), idx, uc); + } +}; + +// Partial spec for raw Byte-based iterator base +template<typename Container> +struct iterator_base<Container, true> { + template<typename, typename, typename> + friend class basic_string; + +public: + using value_type = typename Container::value_type; + using difference_type = typename Container::difference_type; + using reference_type = value_type; + using iterator_category = std::bidirectional_iterator_tag; + + bool operator==(const iterator_base &it) const noexcept + { + return idx == it.idx; + } + bool operator!=(const iterator_base &it) const noexcept + { + return idx != it.idx; + } + + iterator_base(difference_type index, Container *instance) noexcept + : idx(index), cont_instance(instance) + { + } + + iterator_base() noexcept = default; + iterator_base(const iterator_base &) noexcept = default; + iterator_base &operator=(const iterator_base &) noexcept = default; + Container *get_instance() const noexcept + { + return cont_instance; + } + + value_type get_value() const noexcept + { + return cont_instance->get_storage().at(idx); + } + +protected: + difference_type idx; + Container *cont_instance = nullptr; + +protected: + //! Advance the iterator n times (negative values allowed!) 
+ void advance(difference_type n) noexcept + { + idx += n; + } + + void increment() noexcept + { + idx++; + } + void decrement() noexcept + { + idx--; + } +}; + +template<typename Container, bool Raw> +struct iterator; +template<typename Container, bool Raw> +struct const_iterator; + +template<typename Container, bool Raw = false> +struct iterator : iterator_base<Container, Raw> { + iterator(typename iterator_base<Container, Raw>::difference_type index, Container *instance) noexcept + : iterator_base<Container, Raw>(index, instance) + { + } + iterator() noexcept = default; + iterator(const iterator &) noexcept = default; + + iterator &operator=(const iterator &) noexcept = default; + /* Disallow creating from const_iterator */ + iterator(const const_iterator<Container, Raw> &) = delete; + + /* Prefix */ + iterator &operator++() noexcept + { + this->increment(); + return *this; + } + + /* Postfix */ + iterator operator++(int) noexcept + { + iterator tmp{this->idx, this->cont_instance}; + this->increment(); + return tmp; + } + + /* Prefix */ + iterator &operator--() noexcept + { + this->decrement(); + return *this; + } + + /* Postfix */ + iterator operator--(int) noexcept + { + iterator tmp{this->idx, this->cont_instance}; + this->decrement(); + return tmp; + } + + iterator operator+(typename iterator_base<Container, Raw>::difference_type n) const noexcept + { + iterator it{*this}; + it.advance(n); + return it; + } + + iterator &operator+=(typename iterator_base<Container, Raw>::difference_type n) noexcept + { + this->advance(n); + return *this; + } + + iterator operator-(typename iterator_base<Container, Raw>::difference_type n) const noexcept + { + iterator it{*this}; + it.advance(-n); + return it; + } + + iterator &operator-=(typename iterator_base<Container, Raw>::difference_type n) noexcept + { + this->advance(-n); + return *this; + } + + typename iterator::reference_type operator*() const noexcept + { + return this->get_value(); + } +}; + +template<class CharT, 
class Allocator, class Functor> +class basic_mime_string : private Allocator { +public: + using storage_type = std::basic_string<CharT, std::char_traits<CharT>, Allocator>; + using view_type = std::basic_string_view<CharT, std::char_traits<CharT>>; + using filter_type = Functor; + using codepoint_type = UChar32; + using value_type = CharT; + using difference_type = std::ptrdiff_t; + using iterator = rspamd::mime::iterator<basic_mime_string, false>; + using raw_iterator = rspamd::mime::iterator<basic_mime_string, true>; + /* Ctors */ + basic_mime_string() noexcept + : Allocator() + { + } + explicit basic_mime_string(const Allocator &alloc) noexcept + : Allocator(alloc) + { + } + explicit basic_mime_string(filter_type &&filt, const Allocator &alloc = Allocator()) noexcept + : Allocator(alloc), filter_func(std::move(filt)) + { + } + + basic_mime_string(const CharT *str, std::size_t sz, const Allocator &alloc = Allocator()) noexcept + : Allocator(alloc) + { + append_c_string_unfiltered(str, sz); + } + + basic_mime_string(const storage_type &st, + const Allocator &alloc = Allocator()) noexcept + : basic_mime_string(st.data(), st.size(), alloc) + { + } + + basic_mime_string(const view_type &st, + const Allocator &alloc = Allocator()) noexcept + : basic_mime_string(st.data(), st.size(), alloc) + { + } + /* Explicit move ctor */ + basic_mime_string(basic_mime_string &&other) noexcept + { + *this = std::move(other); + } + + + /** + * Creates a string with a filter function. 
It is calee responsibility to + * ensure that the filter functor survives long enough to work with a string + * @param str + * @param sz + * @param filt + * @param alloc + */ + basic_mime_string(const CharT *str, std::size_t sz, + filter_type &&filt, + const Allocator &alloc = Allocator()) noexcept + : Allocator(alloc), + filter_func(std::move(filt)) + { + append_c_string_filtered(str, sz); + } + + basic_mime_string(const storage_type &st, + filter_type &&filt, + const Allocator &alloc = Allocator()) noexcept + : basic_mime_string(st.data(), st.size(), std::move(filt), alloc) + { + } + basic_mime_string(const view_type &st, + filter_type &&filt, + const Allocator &alloc = Allocator()) noexcept + : basic_mime_string(st.data(), st.size(), std::move(filt), alloc) + { + } + + /* It seems some libc++ implementations still perform copy, this might fix them */ + basic_mime_string &operator=(basic_mime_string &&other) + { + storage = std::move(other.storage); + filter_func = std::move(other.filter_func); + + return *this; + } + + constexpr auto size() const noexcept -> std::size_t + { + return storage.size(); + } + + constexpr auto data() const noexcept -> const CharT * + { + return storage.data(); + } + + constexpr auto has_zeroes() const noexcept -> bool + { + return !!(flags & mime_string_flags::MIME_STRING_SEEN_ZEROES); + } + + constexpr auto has_invalid() const noexcept -> bool + { + return !!(flags & mime_string_flags::MIME_STRING_SEEN_INVALID); + } + + /** + * Assign mime string from another string using move operation if a source string + * is utf8 valid. 
+ * If this function returns false, then ownership has not been transferred + * and the `other` string is unmodified as well as the storage + * @param other + * @return + */ + [[nodiscard]] auto assign_if_valid(storage_type &&other) -> bool + { + if (filter_func) { + /* No way */ + return false; + } + if (rspamd_fast_utf8_validate((const unsigned char *) other.data(), other.size()) == 0) { + std::swap(storage, other); + + return true; + } + + return false; + } + + /** + * Copy to the internal storage discarding the contained value + * @param other + * @return + */ + auto assign_copy(const view_type &other) + { + storage.clear(); + + if (filter_func) { + append_c_string_filtered(other.data(), other.size()); + } + else { + append_c_string_unfiltered(other.data(), other.size()); + } + } + auto assign_copy(const storage_type &other) + { + storage.clear(); + + if (filter_func) { + append_c_string_filtered(other.data(), other.size()); + } + else { + append_c_string_unfiltered(other.data(), other.size()); + } + } + auto assign_copy(const basic_mime_string &other) + { + storage.clear(); + + if (filter_func) { + append_c_string_filtered(other.data(), other.size()); + } + else { + append_c_string_unfiltered(other.data(), other.size()); + } + } + + /* Mutators */ + auto append(const CharT *str, std::size_t size) -> std::size_t + { + if (filter_func) { + return append_c_string_filtered(str, size); + } + else { + return append_c_string_unfiltered(str, size); + } + } + auto append(const storage_type &other) -> std::size_t + { + return append(other.data(), other.size()); + } + auto append(const view_type &other) -> std::size_t + { + return append(other.data(), other.size()); + } + + auto ltrim(const view_type &what) -> void + { + auto it = std::find_if(storage.begin(), storage.end(), + [&what](CharT c) { + return !std::any_of(what.begin(), what.end(), [&c](CharT sc) { return sc == c; }); + }); + storage.erase(storage.begin(), it); + } + + auto rtrim(const view_type &what) -> void 
+ { + auto it = std::find_if(storage.rbegin(), storage.rend(), + [&what](CharT c) { + return !std::any_of(what.begin(), what.end(), [&c](CharT sc) { return sc == c; }); + }); + storage.erase(it.base(), storage.end()); + } + + auto trim(const view_type &what) -> void + { + ltrim(what); + rtrim(what); + } + + /* Comparison */ + auto operator==(const basic_mime_string &other) + { + return other.storage == storage; + } + auto operator==(const storage_type &other) + { + return other == storage; + } + auto operator==(const view_type &other) + { + return other == storage; + } + auto operator==(const CharT *other) + { + if (other == NULL) { + return false; + } + auto olen = strlen(other); + if (storage.size() == olen) { + return memcmp(storage.data(), other, olen) == 0; + } + + return false; + } + + /* Iterators */ + inline auto begin() noexcept -> iterator + { + return {0, this}; + } + + inline auto raw_begin() noexcept -> raw_iterator + { + return {0, this}; + } + + inline auto end() noexcept -> iterator + { + return {(difference_type) size(), this}; + } + + inline auto raw_end() noexcept -> raw_iterator + { + return {(difference_type) size(), this}; + } + + /* Utility */ + inline auto get_storage() const noexcept -> const storage_type & + { + return storage; + } + + inline auto as_view() const noexcept -> view_type + { + return view_type{storage}; + } + + constexpr CharT operator[](std::size_t pos) const noexcept + { + return storage[pos]; + } + constexpr CharT at(std::size_t pos) const + { + return storage.at(pos); + } + constexpr bool empty() const noexcept + { + return storage.empty(); + } + + + /* For doctest stringify */ + friend std::ostream &operator<<(std::ostream &os, const CharT &value) + { + os << value.storage; + return os; + } + +private: + mime_string_flags flags = mime_string_flags::MIME_STRING_DEFAULT; + storage_type storage; + filter_type filter_func; + + auto append_c_string_unfiltered(const CharT *str, std::size_t len) -> std::size_t + { + /* This is 
fast path */ + const auto *p = str; + const auto *end = str + len; + std::int32_t err_offset;// We have to use int32_t here as old libicu is brain-damaged + auto orig_size = storage.size(); + + storage.reserve(len + storage.size()); + + if (memchr(str, 0, len) != NULL) { + /* Fallback to slow path */ + flags = flags | mime_string_flags::MIME_STRING_SEEN_ZEROES; + return append_c_string_filtered(str, len); + } + + while (p < end && len > 0 && + (err_offset = rspamd_fast_utf8_validate((const unsigned char *) p, len)) > 0) { + auto cur_offset = err_offset - 1; + storage.append(p, cur_offset); + + while (cur_offset < len) { + auto tmp = cur_offset; + UChar32 uc; + + U8_NEXT(p, cur_offset, len, uc); + + if (uc < 0) { + storage.append("\uFFFD"); + flags = flags | mime_string_flags::MIME_STRING_SEEN_INVALID; + } + else { + cur_offset = tmp; + break; + } + } + + p += cur_offset; + len = end - p; + } + + storage.append(p, len); + return storage.size() - orig_size; + } + + auto append_c_string_filtered(const CharT *str, std::size_t len) -> std::size_t + { + std::int32_t i = 0;// We have to use int32_t here as old libicu is brain-damaged + UChar32 uc; + char tmp[4]; + auto orig_size = storage.size(); + /* Slow path */ + + storage.reserve(len + storage.size()); + + while (i < len) { + U8_NEXT(str, i, len, uc); + + if (uc < 0) { + /* Replace with 0xFFFD */ + storage.append("\uFFFD"); + flags = flags | mime_string_flags::MIME_STRING_SEEN_INVALID; + } + else { + if (filter_func) { + uc = filter_func(uc); + } + + if (uc == 0) { + /* Special case, ignore it */ + flags = flags | mime_string_flags::MIME_STRING_SEEN_ZEROES; + } + else { + std::int32_t o = 0; + U8_APPEND_UNSAFE(tmp, o, uc); + storage.append(tmp, o); + } + } + } + + return storage.size() - orig_size; + } +}; + +}// namespace rspamd::mime + + +#endif//RSPAMD_MIME_STRING_HXX diff --git a/src/libmime/received.cxx b/src/libmime/received.cxx new file mode 100644 index 0000000..dc16d9b --- /dev/null +++ 
b/src/libmime/received.cxx @@ -0,0 +1,1017 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "libserver/url.h" +#include "lua/lua_common.h" +#include "libserver/cfg_file.h" +#include "libserver/mempool_vars_internal.h" +#include "mime_string.hxx" +#include "smtp_parsers.h" +#include "message.h" +#include "received.hxx" +#include "frozen/string.h" +#include "frozen/unordered_map.h" + +namespace rspamd::mime { + +enum class received_part_type { + RSPAMD_RECEIVED_PART_FROM, + RSPAMD_RECEIVED_PART_BY, + RSPAMD_RECEIVED_PART_FOR, + RSPAMD_RECEIVED_PART_WITH, + RSPAMD_RECEIVED_PART_ID, + RSPAMD_RECEIVED_PART_UNKNOWN, +}; + +struct received_part { + received_part_type type; + mime_string data; + std::vector<mime_string> comments; + + explicit received_part(received_part_type t) + : type(t), + data(received_char_filter) + { + } +}; + +static inline auto +received_part_set_or_append(const gchar *begin, + gsize len, + mime_string &dest) -> void +{ + if (len == 0) { + return; + } + + dest.append(begin, len); + dest.trim(" \t"); +} + +static auto +received_process_part(const std::string_view &data, + received_part_type type, + std::ptrdiff_t &last, + received_part &npart) -> bool +{ + auto obraces = 0, ebraces = 0; + auto seen_tcpinfo = false; + enum _parse_state { + skip_spaces, + in_comment, + read_data, + read_tcpinfo, + all_done + } state, + next_state; + + /* In this function, we just 
process comments and data separately */ + const auto *p = data.data(); + const auto *end = p + data.size(); + const auto *c = p; + + state = skip_spaces; + next_state = read_data; + + while (p < end) { + switch (state) { + case skip_spaces: + if (!g_ascii_isspace(*p)) { + c = p; + state = next_state; + } + else { + p++; + } + break; + case in_comment: + if (*p == '(') { + obraces++; + } + else if (*p == ')') { + ebraces++; + + if (ebraces >= obraces) { + if (type != received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN) { + if (p > c) { + npart.comments.emplace_back(received_char_filter); + auto &comment = npart.comments.back(); + received_part_set_or_append(c, p - c, + comment); + } + } + + p++; + c = p; + state = skip_spaces; + next_state = read_data; + + continue; + } + } + + p++; + break; + case read_data: + if (*p == '(') { + if (p > c) { + if (type != received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN) { + received_part_set_or_append(c, p - c, + npart.data); + } + } + + state = in_comment; + obraces = 1; + ebraces = 0; + p++; + c = p; + } + else if (g_ascii_isspace(*p)) { + if (p > c) { + if (type != received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN) { + received_part_set_or_append(c, p - c, + npart.data); + } + } + + state = skip_spaces; + next_state = read_data; + c = p; + } + else if (*p == ';') { + /* It is actually delimiter of date part if not in the comments */ + if (p > c) { + if (type != received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN) { + received_part_set_or_append(c, p - c, + npart.data); + } + } + + state = all_done; + continue; + } + else if (npart.data.size() > 0) { + /* We have already received data and find something with no ( */ + if (!seen_tcpinfo && type == received_part_type::RSPAMD_RECEIVED_PART_FROM) { + /* Check if we have something special here, such as TCPinfo */ + if (*c == '[') { + state = read_tcpinfo; + p++; + } + else { + state = all_done; + continue; + } + } + else { + state = all_done; + continue; + } + } + else { + p++; + } + break; 
+ case read_tcpinfo: + if (*p == ']') { + received_part_set_or_append(c, p - c + 1, + npart.data); + seen_tcpinfo = TRUE; + state = skip_spaces; + next_state = read_data; + c = p; + } + p++; + break; + case all_done: + if (p > data.data()) { + last = p - data.data(); + return true; + } + else { + /* Empty element */ + return false; + } + break; + } + } + + /* Leftover */ + switch (state) { + case read_data: + if (p > c) { + if (type != received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN) { + received_part_set_or_append(c, p - c, + npart.data); + } + + last = p - data.data(); + + return true; + } + break; + case skip_spaces: + if (p > data.data()) { + last = p - data.data(); + + return true; + } + default: + break; + } + + return false; +} + +template<std::size_t N> +constexpr auto lit_compare_lowercase(const char lit[N], const char *in) -> bool +{ + for (auto i = 0; i < N; i++) { + if (lc_map[(unsigned char) in[i]] != lit[i]) { + return false; + } + } + + return true; +} + +static auto +received_spill(const std::string_view &in, + std::ptrdiff_t &date_pos) -> std::vector<received_part> +{ + std::vector<received_part> parts; + std::ptrdiff_t pos = 0; + auto seen_from = false, seen_by = false; + + const auto *p = in.data(); + const auto *end = p + in.size(); + + auto skip_spaces = [&p, end]() { + while (p < end && g_ascii_isspace(*p)) { + p++; + } + }; + + skip_spaces(); + + /* Skip SMTP comments */ + if (*p == '(') { + auto obraces = 0, ebraces = 0; + + while (p < end) { + if (*p == ')') { + ebraces++; + } + else if (*p == '(') { + obraces++; + } + + p++; + + if (obraces == ebraces) { + /* Skip spaces after */ + skip_spaces(); + break; + } + } + } + + auto len = end - p; + + if (len == 0) { + return parts; + } + + auto maybe_process_part = [&](received_part_type what) -> bool { + parts.emplace_back(what); + auto &rcvd_part = parts.back(); + auto chunk = std::string_view{p, (std::size_t)(end - p)}; + + if (!received_process_part(chunk, what, pos, rcvd_part)) { + 
parts.pop_back(); + + return false; + } + + return true; + }; + + if (len > 4 && lit_compare_lowercase<4>("from", p)) { + p += sizeof("from") - 1; + + /* We can now store from part */ + if (!maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_FROM)) { + /* Do not accept malformed from */ + return {}; + } + + g_assert(pos != 0); + p += pos; + len = end > p ? end - p : 0; + seen_from = true; + } + + if (len > 2 && lit_compare_lowercase<2>("by", p)) { + p += sizeof("by") - 1; + + if (!maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_BY)) { + return {}; + } + + g_assert(pos != 0); + p += pos; + len = end > p ? end - p : 0; + seen_by = true; + } + + if (!seen_from && !seen_by) { + /* Useless received */ + return {}; + } + + while (p < end) { + bool got_part = false; + if (*p == ';') { + /* We are at the date separator, stop here */ + date_pos = p - in.data() + 1; + break; + } + else { + if (len > sizeof("with") && lit_compare_lowercase<4>("with", p)) { + p += sizeof("with") - 1; + + got_part = maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_WITH); + } + else if (len > sizeof("for") && lit_compare_lowercase<3>("for", p)) { + p += sizeof("for") - 1; + got_part = maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_FOR); + } + else if (len > sizeof("id") && lit_compare_lowercase<2>("id", p)) { + p += sizeof("id") - 1; + got_part = maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_ID); + } + else { + while (p < end) { + if (!(g_ascii_isspace(*p) || *p == '(' || *p == ';')) { + p++; + } + else { + break; + } + } + + if (p == end) { + return {}; + } + else if (*p == ';') { + date_pos = p - in.data() + 1; + break; + } + else { + got_part = maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN); + } + } + + if (!got_part) { + p++; + len = end > p ? end - p : 0; + } + else { + g_assert(pos != 0); + p += pos; + len = end > p ? 
end - p : 0; + } + } + } + + return parts; +} + +#define RSPAMD_INET_ADDRESS_PARSE_RECEIVED \ + (rspamd_inet_address_parse_flags)(RSPAMD_INET_ADDRESS_PARSE_REMOTE | RSPAMD_INET_ADDRESS_PARSE_NO_UNIX) + +static auto +received_process_rdns(rspamd_mempool_t *pool, + const std::string_view &in, + mime_string &dest) -> bool +{ + auto seen_dot = false; + + const auto *p = in.data(); + const auto *end = p + in.size(); + + if (in.empty()) { + return false; + } + + if (*p == '[' && *(end - 1) == ']' && in.size() > 2) { + /* We have enclosed ip address */ + auto *addr = rspamd_parse_inet_address_pool(p + 1, + (end - p) - 2, + pool, + RSPAMD_INET_ADDRESS_PARSE_RECEIVED); + + if (addr) { + const gchar *addr_str; + + if (rspamd_inet_address_get_port(addr) != 0) { + addr_str = rspamd_inet_address_to_string_pretty(addr); + } + else { + addr_str = rspamd_inet_address_to_string(addr); + } + + dest.assign_copy(std::string_view{addr_str}); + + return true; + } + } + + auto hlen = 0u; + + while (p < end) { + if (!g_ascii_isspace(*p) && rspamd_url_is_domain(*p)) { + if (*p == '.') { + seen_dot = true; + } + + hlen++; + } + else { + break; + } + + p++; + } + + if (hlen > 0) { + if (p == end || (seen_dot && (g_ascii_isspace(*p) || *p == '[' || *p == '('))) { + /* All data looks like a hostname */ + dest.assign_copy(std::string_view{in.data(), hlen}); + + return true; + } + } + + return false; +} + +static auto +received_process_host_tcpinfo(rspamd_mempool_t *pool, + received_header &rh, + const std::string_view &in) -> bool +{ + rspamd_inet_addr_t *addr = nullptr; + auto ret = false; + + if (in.empty()) { + return false; + } + + if (in[0] == '[') { + /* Likely Exim version */ + + auto brace_pos = in.find(']'); + + if (brace_pos != std::string_view::npos) { + auto substr_addr = in.substr(1, brace_pos - 1); + addr = rspamd_parse_inet_address_pool(substr_addr.data(), + substr_addr.size(), + pool, + RSPAMD_INET_ADDRESS_PARSE_RECEIVED); + + if (addr) { + rh.addr = addr; + 
rh.real_ip.assign_copy(std::string_view(rspamd_inet_address_to_string(addr))); + } + } + } + else { + if (g_ascii_isxdigit(in[0])) { + /* Try to parse IP address */ + addr = rspamd_parse_inet_address_pool(in.data(), + in.size(), pool, RSPAMD_INET_ADDRESS_PARSE_RECEIVED); + if (addr) { + rh.addr = addr; + rh.real_ip.assign_copy(std::string_view(rspamd_inet_address_to_string(addr))); + } + } + + if (!addr) { + /* Try canonical Postfix version: rdns [ip] */ + auto obrace_pos = in.find('['); + + if (obrace_pos != std::string_view::npos) { + auto ebrace_pos = in.rfind(']'); + + if (ebrace_pos != std::string_view::npos && ebrace_pos > obrace_pos) { + auto substr_addr = in.substr(obrace_pos + 1, + ebrace_pos - obrace_pos - 1); + addr = rspamd_parse_inet_address_pool(substr_addr.data(), + substr_addr.size(), + pool, + RSPAMD_INET_ADDRESS_PARSE_RECEIVED); + + if (addr) { + rh.addr = addr; + rh.real_ip.assign_copy(std::string_view(rspamd_inet_address_to_string(addr))); + + /* Process with rDNS */ + auto rdns_substr = in.substr(0, obrace_pos); + + if (received_process_rdns(pool, rdns_substr, rh.real_hostname)) { + ret = true; + } + } + } + } + else { + /* Hostname or some crap, sigh... */ + if (received_process_rdns(pool, in, rh.real_hostname)) { + ret = true; + } + } + } + } + + return ret; +} + +static void +received_process_from(rspamd_mempool_t *pool, + const received_part &rpart, + received_header &rh) +{ + if (rpart.data.size() > 0) { + /* We have seen multiple cases: + * - [ip] (hostname/unknown [real_ip]) + * - helo (hostname/unknown [real_ip]) + * - [ip] + * - hostname + * - hostname ([ip]:port helo=xxx) + * Maybe more... 
+ */ + auto seen_ip_in_data = false; + + if (!rpart.comments.empty()) { + /* We can have info within comment as part of RFC */ + received_process_host_tcpinfo( + pool, rh, + rpart.comments[0].as_view()); + } + + if (rh.real_ip.size() == 0) { + /* Try to do the same with data */ + if (received_process_host_tcpinfo( + pool, rh, + rpart.data.as_view())) { + seen_ip_in_data = true; + } + } + + if (!seen_ip_in_data) { + if (rh.real_ip.size() != 0) { + /* Get announced hostname (usually helo) */ + received_process_rdns(pool, + rpart.data.as_view(), + rh.from_hostname); + } + else { + received_process_host_tcpinfo(pool, + rh, rpart.data.as_view()); + } + } + } + else { + /* rpart->dlen = 0 */ + if (!rpart.comments.empty()) { + received_process_host_tcpinfo( + pool, rh, + rpart.comments[0].as_view()); + } + } +} + +static auto +received_header_parse(received_header_chain &chain, rspamd_mempool_t *pool, + const std::string_view &in, + struct rspamd_mime_header *hdr) -> bool +{ + std::ptrdiff_t date_pos = -1; + + static constexpr const auto protos_map = frozen::make_unordered_map<frozen::string, received_flags>({{"smtp", received_flags::SMTP}, + {"esmtp", received_flags::ESMTP}, + {"esmtpa", received_flags::ESMTPA | + received_flags::AUTHENTICATED}, + {"esmtpsa", received_flags::ESMTPSA | + received_flags::SSL | + received_flags::AUTHENTICATED}, + {"esmtps", received_flags::ESMTPS | + received_flags::SSL}, + {"lmtp", received_flags::LMTP}, + {"imap", received_flags::IMAP}, + {"imaps", received_flags::IMAP | + received_flags::SSL}, + {"http", received_flags::HTTP}, + {"https", received_flags::HTTP | + received_flags::SSL}, + {"local", received_flags::LOCAL}}); + + auto parts = received_spill(in, date_pos); + + if (parts.empty()) { + return false; + } + + auto &rh = chain.new_received(); + + rh.flags = received_flags::UNKNOWN; + rh.hdr = hdr; + + for (const auto &part: parts) { + switch (part.type) { + case received_part_type::RSPAMD_RECEIVED_PART_FROM: + 
received_process_from(pool, part, rh); + break; + case received_part_type::RSPAMD_RECEIVED_PART_BY: + received_process_rdns(pool, + part.data.as_view(), + rh.by_hostname); + break; + case received_part_type::RSPAMD_RECEIVED_PART_WITH: + if (part.data.size() > 0) { + auto proto_flag_it = protos_map.find(part.data.as_view()); + + if (proto_flag_it != protos_map.end()) { + rh.flags = proto_flag_it->second; + } + } + break; + case received_part_type::RSPAMD_RECEIVED_PART_FOR: + rh.for_mbox.assign_copy(part.data); + rh.for_addr = rspamd_email_address_from_smtp(rh.for_mbox.data(), + rh.for_mbox.size()); + break; + default: + /* Do nothing */ + break; + } + } + + if (!rh.real_hostname.empty() && rh.from_hostname.empty()) { + rh.from_hostname.assign_copy(rh.real_hostname); + } + + if (date_pos > 0 && date_pos < in.size()) { + auto date_sub = in.substr(date_pos); + rh.timestamp = rspamd_parse_smtp_date((const unsigned char *) date_sub.data(), + date_sub.size(), nullptr); + } + + return true; +} + +static auto +received_maybe_fix_task(struct rspamd_task *task) -> bool +{ + auto *recv_chain_ptr = static_cast<received_header_chain *>(MESSAGE_FIELD(task, received_headers)); + + if (recv_chain_ptr) { + auto need_recv_correction = false; + + auto top_recv_maybe = recv_chain_ptr->get_received(0); + + if (top_recv_maybe.has_value()) { + auto &top_recv = top_recv_maybe.value().get(); + + const auto *raddr = top_recv.addr; + if (top_recv.real_ip.size() == 0 || (task->cfg && task->cfg->ignore_received)) { + need_recv_correction = true; + } + else if (!(task->flags & RSPAMD_TASK_FLAG_NO_IP) && task->from_addr) { + if (!raddr) { + need_recv_correction = true; + } + else { + if (rspamd_inet_address_compare(raddr, task->from_addr, FALSE) != 0) { + need_recv_correction = true; + } + } + } + + if (need_recv_correction && !(task->flags & RSPAMD_TASK_FLAG_NO_IP) && task->from_addr) { + msg_debug_task("the first received seems to be" + " not ours, prepend it with fake one"); + + auto &trecv = 
recv_chain_ptr->new_received(received_header_chain::append_type::append_head); + trecv.flags |= received_flags::ARTIFICIAL; + + if (task->flags & RSPAMD_TASK_FLAG_SSL) { + trecv.flags |= received_flags::SSL; + } + + if (task->auth_user) { + trecv.flags |= received_flags::AUTHENTICATED; + } + + trecv.real_ip.assign_copy(std::string_view(rspamd_inet_address_to_string(task->from_addr))); + + const auto *mta_name = (const char *) rspamd_mempool_get_variable(task->task_pool, + RSPAMD_MEMPOOL_MTA_NAME); + + if (mta_name) { + trecv.by_hostname.assign_copy(std::string_view(mta_name)); + } + trecv.addr = rspamd_inet_address_copy(task->from_addr, + task->task_pool); + + if (task->hostname) { + trecv.real_hostname.assign_copy(std::string_view(task->hostname)); + trecv.from_hostname.assign_copy(trecv.real_hostname); + } + + return true; + } + + /* Extract data from received header if we were not given IP */ + if (!need_recv_correction && (task->flags & RSPAMD_TASK_FLAG_NO_IP) && + (task->cfg && !task->cfg->ignore_received)) { + if (!top_recv.real_ip.empty()) { + if (!rspamd_parse_inet_address(&task->from_addr, + top_recv.real_ip.data(), + top_recv.real_ip.size(), + RSPAMD_INET_ADDRESS_PARSE_NO_UNIX)) { + msg_warn_task("cannot get IP from received header: '%s'", + top_recv.real_ip.data()); + task->from_addr = nullptr; + } + } + if (!top_recv.real_hostname.empty()) { + task->hostname = top_recv.real_hostname.data(); + } + + return true; + } + } + } + + return false; +} + +static auto +received_export_to_lua(received_header_chain *chain, lua_State *L) -> bool +{ + if (chain == nullptr) { + return false; + } + + lua_createtable(L, chain->size(), 0); + + auto push_flag = [L](const received_header &rh, received_flags fl, const char *name) { + lua_pushboolean(L, !!(rh.flags & fl)); + lua_setfield(L, -2, name); + }; + + auto i = 1; + + for (const auto &rh: chain->as_vector()) { + lua_createtable(L, 0, 10); + + if (rh.hdr && rh.hdr->decoded) { + rspamd_lua_table_set(L, "raw", 
rh.hdr->decoded); + } + + lua_createtable(L, 0, 3); + push_flag(rh, received_flags::ARTIFICIAL, "artificial"); + push_flag(rh, received_flags::AUTHENTICATED, "authenticated"); + push_flag(rh, received_flags::SSL, "ssl"); + lua_setfield(L, -2, "flags"); + + auto push_nullable_string = [L](const mime_string &st, const char *field) { + if (st.empty()) { + lua_pushnil(L); + } + else { + lua_pushlstring(L, st.data(), st.size()); + } + lua_setfield(L, -2, field); + }; + + push_nullable_string(rh.from_hostname, "from_hostname"); + push_nullable_string(rh.real_hostname, "real_hostname"); + push_nullable_string(rh.real_ip, "from_ip"); + push_nullable_string(rh.by_hostname, "by_hostname"); + push_nullable_string(rh.for_mbox, "for"); + + if (rh.addr) { + rspamd_lua_ip_push(L, rh.addr); + } + else { + lua_pushnil(L); + } + lua_setfield(L, -2, "real_ip"); + + lua_pushstring(L, received_protocol_to_string(rh.flags)); + lua_setfield(L, -2, "proto"); + + lua_pushinteger(L, rh.timestamp); + lua_setfield(L, -2, "timestamp"); + + lua_rawseti(L, -2, i++); + } + + return true; +} + +}// namespace rspamd::mime + +bool rspamd_received_header_parse(struct rspamd_task *task, + const char *data, size_t sz, + struct rspamd_mime_header *hdr) +{ + auto *recv_chain_ptr = static_cast<rspamd::mime::received_header_chain *>(MESSAGE_FIELD(task, received_headers)); + + if (recv_chain_ptr == nullptr) { + /* This constructor automatically registers dtor in mempool */ + recv_chain_ptr = new rspamd::mime::received_header_chain(task); + MESSAGE_FIELD(task, received_headers) = (void *) recv_chain_ptr; + } + return rspamd::mime::received_header_parse(*recv_chain_ptr, task->task_pool, + std::string_view{data, sz}, hdr); +} + +bool rspamd_received_maybe_fix_task(struct rspamd_task *task) +{ + return rspamd::mime::received_maybe_fix_task(task); +} + +bool rspamd_received_export_to_lua(struct rspamd_task *task, lua_State *L) +{ + return rspamd::mime::received_export_to_lua( + 
static_cast<rspamd::mime::received_header_chain *>(MESSAGE_FIELD(task, received_headers)), + L); +} + +/* Tests part */ +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" + +TEST_SUITE("received") +{ + TEST_CASE("parse received") + { + using namespace std::string_view_literals; + using map_type = ankerl::unordered_dense::map<std::string_view, std::string_view>; + std::vector<std::pair<std::string_view, map_type>> cases{ + // Simple received + {"from smtp11.mailtrack.pl (smtp11.mailtrack.pl [185.243.30.90])"sv, + {{"real_ip", "185.243.30.90"}, + {"real_hostname", "smtp11.mailtrack.pl"}, + {"from_hostname", "smtp11.mailtrack.pl"}}}, + // Real Postfix IPv6 received + {"from server.chat-met-vreemden.nl (unknown [IPv6:2a01:7c8:aab6:26d:5054:ff:fed1:1da2])\n" + "\t(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))\n" + "\t(Client did not present a certificate)\n" + "\tby mx1.freebsd.org (Postfix) with ESMTPS id CF0171862\n" + "\tfor <test@example.com>; Mon, 6 Jul 2015 09:01:20 +0000 (UTC)\n" + "\t(envelope-from upwest201diana@outlook.com)"sv, + {{"real_ip", "2a01:7c8:aab6:26d:5054:ff:fed1:1da2"}, + {"from_hostname", "server.chat-met-vreemden.nl"}, + {"by_hostname", "mx1.freebsd.org"}, + {"for_mbox", "<test@example.com>"}}}, + // Exim IPv4 received + {"from localhost ([127.0.0.1]:49019 helo=hummus.csx.cam.ac.uk)\n" + " by hummus.csx.cam.ac.uk with esmtp (Exim 4.91-pdpfix1)\n" + " (envelope-from <exim-dev-bounces@exim.org>)\n" + " id 1fZ55o-0006DP-3H\n" + " for <xxx@xxx.xxx>; Sat, 30 Jun 2018 02:54:28 +0100"sv, + { + {"from_hostname", "localhost"}, + {"real_ip", "127.0.0.1"}, + {"for_mbox", "<xxx@xxx.xxx>"}, + {"by_hostname", "hummus.csx.cam.ac.uk"}, + }}, + // Exim IPv6 received + {"from smtp.spodhuis.org ([2a02:898:31:0:48:4558:736d:7470]:38689\n" + " helo=mx.spodhuis.org)\n" + " by hummus.csx.cam.ac.uk with esmtpsa (TLSv1.3:TLS_AES_256_GCM_SHA384:256)\n" + " (Exim 4.91-pdpfix1+cc) (envelope-from <xxx@exim.org>)\n" + " id 
1fZ55k-0006CO-9M\n" + " for exim-dev@exim.org; Sat, 30 Jun 2018 02:54:24 +0100"sv, + { + {"from_hostname", "smtp.spodhuis.org"}, + {"real_ip", "2a02:898:31:0:48:4558:736d:7470"}, + {"for_mbox", "exim-dev@exim.org"}, + {"by_hostname", "hummus.csx.cam.ac.uk"}, + }}, + // Haraka received + {"from aaa.cn ([1.1.1.1]) by localhost.localdomain (Haraka/2.8.18) with " + "ESMTPA id 349C9C2B-491A-4925-A687-3EF14038C344.1 envelope-from <huxin@xxx.com> " + "(authenticated bits=0); Tue, 03 Jul 2018 14:18:13 +0200"sv, + { + {"from_hostname", "aaa.cn"}, + {"real_ip", "1.1.1.1"}, + {"by_hostname", "localhost.localdomain"}, + }}, + // Invalid by + {"from [192.83.172.101] (HELLO 148.251.238.35) (148.251.238.35) " + "by guovswzqkvry051@sohu.com with gg login " + "by AOL 6.0 for Windows US sub 008 SMTP ; Tue, 03 Jul 2018 09:01:47 -0300"sv, + { + {"from_hostname", "192.83.172.101"}, + {"real_ip", "192.83.172.101"}, + }}, + // Invalid hostinfo + {"from example.com ([]) by example.com with ESMTP id 2019091111 ;" + " Thu, 26 Sep 2019 11:19:07 +0200"sv, + { + {"by_hostname", "example.com"}, + {"from_hostname", "example.com"}, + {"real_hostname", "example.com"}, + }}, + // Different real and announced hostnames + broken crap + {"from 171-29.br (1-1-1-1.z.com.br [1.1.1.1]) by x.com.br (Postfix) " + "with;ESMTP id 44QShF6xj4z1X for <hey@y.br>; Thu, 21 Mar 2019 23:45:46 -0300 " + ": <g @yi.br>"sv, + { + {"real_ip", "1.1.1.1"}, + {"from_hostname", "171-29.br"}, + {"real_hostname", "1-1-1-1.z.com.br"}, + {"by_hostname", "x.com.br"}, + }}, + // Different real and announced ips + no hostname + {"from [127.0.0.1] ([127.0.0.2]) by smtp.gmail.com with ESMTPSA id xxxololo"sv, + { + {"real_ip", "127.0.0.2"}, + {"from_hostname", "127.0.0.1"}, + {"by_hostname", "smtp.gmail.com"}, + }}, + // Different real and hostanes + {"from 185.118.166.127 (steven2.zhou01.pserver.ru [185.118.166.127]) " + "by mail.832zsu.cn (Postfix) with ESMTPA id AAD722133E34"sv, + { + {"real_ip", "185.118.166.127"}, + 
{"from_hostname", "185.118.166.127"}, + {"real_hostname", "steven2.zhou01.pserver.ru"}, + {"by_hostname", "mail.832zsu.cn"}, + }}, + // \0 in received must be filtered + {"from smtp11.mailt\0rack.pl (smtp11.mail\0track.pl [1\085.243.30.90])"sv, + {{"real_ip", "185.243.30.90"}, + {"real_hostname", "smtp11.mailtrack.pl"}, + {"from_hostname", "smtp11.mailtrack.pl"}}}, + // No from part + {"by mail.832zsu.cn (Postfix) with ESMTPA id AAD722133E34"sv, + { + {"by_hostname", "mail.832zsu.cn"}, + }}, + // From part is in the comment + {"(from asterisk@localhost)\n" + " by pbx.xxx.com (8.14.7/8.14.7/Submit) id 076Go4wD014562;\n" + " Thu, 6 Aug 2020 11:50:04 -0500"sv, + { + {"by_hostname", "pbx.xxx.com"}, + }}, + }; + rspamd_mempool_t *pool = rspamd_mempool_new_default("rcvd test", 0); + + for (auto &&c: cases) { + SUBCASE(c.first.data()) + { + rspamd::mime::received_header_chain chain; + auto ret = rspamd::mime::received_header_parse(chain, pool, + c.first, nullptr); + CHECK(ret == true); + auto &&rh = chain.get_received(0); + CHECK(rh.has_value()); + auto res = rh.value().get().as_map(); + + for (const auto &expected: c.second) { + CHECK_MESSAGE(res.contains(expected.first), expected.first.data()); + CHECK(res[expected.first] == expected.second); + } + for (const auto &existing: res) { + CHECK_MESSAGE(c.second.contains(existing.first), existing.first.data()); + CHECK(c.second[existing.first] == existing.second); + } + } + } + + rspamd_mempool_delete(pool); + } +}
\ No newline at end of file diff --git a/src/libmime/received.h b/src/libmime/received.h new file mode 100644 index 0000000..46608a3 --- /dev/null +++ b/src/libmime/received.h @@ -0,0 +1,68 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef RSPAMD_RECEIVED_H +#define RSPAMD_RECEIVED_H + +#include "config.h" +#include "libutil/addr.h" + +#ifdef __cplusplus +extern "C" { +#endif +/* + * C bindings for C++ received code + */ + +struct rspamd_email_address; +struct rspamd_received_header_chain; +struct rspamd_mime_header; + +/** + * Parse received header from an input header data + * @param task + * @param data + * @param sz + * @param hdr + * @return + */ +bool rspamd_received_header_parse(struct rspamd_task *task, + const char *data, size_t sz, struct rspamd_mime_header *hdr); + + +/** + * Process task data and the most top received and fix either part if needed + * @param task + * @return + */ +bool rspamd_received_maybe_fix_task(struct rspamd_task *task); + +struct lua_State; +/** + * Push received headers chain to lua + * @param task + * @param L + * @return + */ +bool rspamd_received_export_to_lua(struct rspamd_task *task, struct lua_State *L); + +#ifdef __cplusplus +} +#endif + + +#endif//RSPAMD_RECEIVED_H diff --git a/src/libmime/received.hxx b/src/libmime/received.hxx new file mode 100644 index 0000000..4f423f1 --- /dev/null +++ b/src/libmime/received.hxx @@ -0,0 +1,314 @@ +/*- + * Copyright 
/**
 * Bit flags attached to a received header.
 *
 * The values selected by received_type_apply_protocols_mask() name the
 * transport protocol; ARTIFICIAL, SSL and AUTHENTICATED are modifiers that
 * are combined with a protocol. UNKNOWN is not part of the protocol mask.
 */
enum class received_flags {
	DEFAULT = 0,
	SMTP = 1u << 0u,
	ESMTP = 1u << 1u,
	ESMTPA = 1u << 2u,
	ESMTPS = 1u << 3u,
	ESMTPSA = 1u << 4u,
	LMTP = 1u << 5u,
	IMAP = 1u << 6u,
	LOCAL = 1u << 7u,
	HTTP = 1u << 8u,
	MAPI = 1u << 9u,
	UNKNOWN = 1u << 10u,
	ARTIFICIAL = (1u << 11u),
	SSL = (1u << 12u),
	AUTHENTICATED = (1u << 13u),
};

/** Bitwise union of two flag sets */
constexpr received_flags operator|(received_flags lhs, received_flags rhs)
{
	using ut = std::underlying_type<received_flags>::type;
	return static_cast<received_flags>(static_cast<ut>(lhs) | static_cast<ut>(rhs));
}

/**
 * Add `rhs` to `lhs` in place.
 * Returns a reference to `lhs`, as the built-in compound assignment does, so
 * that chained expressions keep operating on the original object.
 */
constexpr received_flags &operator|=(received_flags &lhs, const received_flags rhs)
{
	lhs = lhs | rhs;
	return lhs;
}

/** Bitwise intersection of two flag sets */
constexpr received_flags operator&(received_flags lhs, received_flags rhs)
{
	using ut = std::underlying_type<received_flags>::type;
	return static_cast<received_flags>(static_cast<ut>(lhs) & static_cast<ut>(rhs));
}

/** True when no flag is set; allows the `!!(flags & bit)` test idiom */
constexpr bool operator!(received_flags fl)
{
	return fl == received_flags::DEFAULT;
}

/** Strip everything but the protocol bits from `fl` */
constexpr received_flags received_type_apply_protocols_mask(received_flags fl)
{
	return fl & (received_flags::SMTP |
				 received_flags::ESMTP |
				 received_flags::ESMTPA |
				 received_flags::ESMTPS |
				 received_flags::ESMTPSA |
				 received_flags::IMAP |
				 received_flags::HTTP |
				 received_flags::LOCAL |
				 received_flags::MAPI |
				 received_flags::LMTP);
}

/**
 * Lowercase protocol name for the protocol bits of `fl`.
 * @return a static string; "unknown" when the protocol bits do not match
 *         exactly one known protocol
 */
constexpr const char *received_protocol_to_string(received_flags fl)
{
	const auto *proto = "unknown";

	switch (received_type_apply_protocols_mask(fl)) {
	case received_flags::SMTP:
		proto = "smtp";
		break;
	case received_flags::ESMTP:
		proto = "esmtp";
		break;
	case received_flags::ESMTPS:
		proto = "esmtps";
		break;
	case received_flags::ESMTPA:
		proto = "esmtpa";
		break;
	case received_flags::ESMTPSA:
		proto = "esmtpsa";
		break;
	case received_flags::LMTP:
		proto = "lmtp";
		break;
	case received_flags::IMAP:
		proto = "imap";
		break;
	case received_flags::HTTP:
		proto = "http";
		break;
	case received_flags::LOCAL:
		proto = "local";
		break;
	case received_flags::MAPI:
		proto = "mapi";
		break;
	default:
		break;
	}

	return proto;
}
have raw C pointers, so copy is explicitly disabled */ + received_header(const received_header &other) = delete; + received_header(received_header &&other) noexcept + { + *this = std::move(other); + } + + received_header &operator=(received_header &&other) noexcept + { + if (this != &other) { + from_hostname = std::move(other.from_hostname); + real_hostname = std::move(other.real_hostname); + real_ip = std::move(other.real_ip); + by_hostname = std::move(other.by_hostname); + for_mbox = std::move(other.for_mbox); + timestamp = other.timestamp; + flags = other.flags; + std::swap(for_addr, other.for_addr); + std::swap(addr, other.addr); + std::swap(hdr, other.hdr); + } + return *this; + } + + /* Unit tests helper */ + static auto from_map(const ankerl::unordered_dense::map<std::string_view, std::string_view> &map) -> received_header + { + using namespace std::string_view_literals; + received_header rh; + + if (map.contains("from_hostname")) { + rh.from_hostname.assign_copy(map.at("from_hostname"sv)); + } + if (map.contains("real_hostname")) { + rh.real_hostname.assign_copy(map.at("real_hostname"sv)); + } + if (map.contains("by_hostname")) { + rh.by_hostname.assign_copy(map.at("by_hostname"sv)); + } + if (map.contains("real_ip")) { + rh.real_ip.assign_copy(map.at("real_ip"sv)); + } + if (map.contains("for_mbox")) { + rh.for_mbox.assign_copy(map.at("for_mbox"sv)); + } + + return rh; + } + + auto as_map() const -> ankerl::unordered_dense::map<std::string_view, std::string_view> + { + ankerl::unordered_dense::map<std::string_view, std::string_view> map; + + if (!from_hostname.empty()) { + map["from_hostname"] = from_hostname.as_view(); + } + if (!real_hostname.empty()) { + map["real_hostname"] = real_hostname.as_view(); + } + if (!by_hostname.empty()) { + map["by_hostname"] = by_hostname.as_view(); + } + if (!real_ip.empty()) { + map["real_ip"] = real_ip.as_view(); + } + if (!for_mbox.empty()) { + map["for_mbox"] = for_mbox.as_view(); + } + + return map; + } + + 
~received_header() + { + if (for_addr) { + rspamd_email_address_free(for_addr); + } + } +}; + +class received_header_chain { +public: + explicit received_header_chain(struct rspamd_task *task) + { + headers.reserve(2); + rspamd_mempool_add_destructor(task->task_pool, + received_header_chain::received_header_chain_pool_dtor, this); + } + explicit received_header_chain() + { + headers.reserve(2); + } + + enum class append_type { + append_tail, + append_head + }; + + auto new_received(append_type how = append_type::append_tail) -> received_header & + { + if (how == append_type::append_tail) { + headers.emplace_back(); + + return headers.back(); + } + else { + headers.insert(std::begin(headers), received_header()); + + return headers.front(); + } + } + auto new_received(received_header &&hdr, append_type how = append_type::append_tail) -> received_header & + { + if (how == append_type::append_tail) { + headers.emplace_back(std::move(hdr)); + + return headers.back(); + } + else { + headers.insert(std::begin(headers), std::move(hdr)); + + return headers.front(); + } + } + auto get_received(std::size_t nth) -> std::optional<std::reference_wrapper<received_header>> + { + if (nth < headers.size()) { + return headers[nth]; + } + + return std::nullopt; + } + auto size() const -> std::size_t + { + return headers.size(); + } + constexpr auto as_vector() const -> const std::vector<received_header> & + { + return headers; + } + +private: + static auto received_header_chain_pool_dtor(void *ptr) -> void + { + delete static_cast<received_header_chain *>(ptr); + } + std::vector<received_header> headers; +}; + +}// namespace rspamd::mime + +#endif//RSPAMD_RECEIVED_HXX diff --git a/src/libmime/scan_result.c b/src/libmime/scan_result.c new file mode 100644 index 0000000..a6bc0cb --- /dev/null +++ b/src/libmime/scan_result.c @@ -0,0 +1,1106 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "mem_pool.h" +#include "scan_result.h" +#include "rspamd.h" +#include "message.h" +#include "lua/lua_common.h" +#include "libserver/cfg_file_private.h" +#include "libmime/scan_result_private.h" +#include "contrib/fastutf8/fastutf8.h" +#include <math.h> +#include "contrib/uthash/utlist.h" + +#define msg_debug_metric(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_metric_log_id, "metric", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(metric) + +/* Average symbols count to optimize hash allocation */ +static struct rspamd_counter_data symbols_count; + +static void +rspamd_scan_result_dtor(gpointer d) +{ + struct rspamd_scan_result *r = (struct rspamd_scan_result *) d; + struct rspamd_symbol_result *sres; + + rspamd_set_counter_ema(&symbols_count, kh_size(r->symbols), 0.5); + + if (r->symbol_cbref != -1) { + luaL_unref(r->task->cfg->lua_state, LUA_REGISTRYINDEX, r->symbol_cbref); + } + + kh_foreach_value(r->symbols, sres, { + if (sres->options) { + kh_destroy(rspamd_options_hash, sres->options); + } + }); + + kh_destroy(rspamd_symbols_hash, r->symbols); + kh_destroy(rspamd_symbols_group_hash, r->sym_groups); +} + +static void +rspamd_metric_actions_foreach_cb(int i, struct rspamd_action *act, void *cbd) +{ + struct rspamd_scan_result *metric_res = (struct rspamd_scan_result *) cbd; + metric_res->actions_config[i].flags = RSPAMD_ACTION_RESULT_DEFAULT; + if (!(act->flags & RSPAMD_ACTION_NO_THRESHOLD)) { + 
metric_res->actions_config[i].cur_limit = act->threshold; + } + else { + metric_res->actions_config[i].flags |= RSPAMD_ACTION_RESULT_NO_THRESHOLD; + } + metric_res->actions_config[i].action = act; +} + +struct rspamd_scan_result * +rspamd_create_metric_result(struct rspamd_task *task, + const gchar *name, gint lua_sym_cbref) +{ + struct rspamd_scan_result *metric_res; + + metric_res = rspamd_mempool_alloc0(task->task_pool, + sizeof(struct rspamd_scan_result)); + metric_res->symbols = kh_init(rspamd_symbols_hash); + metric_res->sym_groups = kh_init(rspamd_symbols_group_hash); + + if (name) { + metric_res->name = rspamd_mempool_strdup(task->task_pool, name); + } + else { + metric_res->name = NULL; + } + + metric_res->symbol_cbref = lua_sym_cbref; + metric_res->task = task; + + /* Optimize allocation */ + kh_resize(rspamd_symbols_group_hash, metric_res->sym_groups, 4); + + if (symbols_count.mean > 4) { + kh_resize(rspamd_symbols_hash, metric_res->symbols, symbols_count.mean); + } + else { + kh_resize(rspamd_symbols_hash, metric_res->symbols, 4); + } + + if (task->cfg) { + size_t nact = rspamd_config_actions_size(task->cfg); + metric_res->actions_config = rspamd_mempool_alloc0(task->task_pool, + sizeof(struct rspamd_action_config) * nact); + rspamd_config_actions_foreach_enumerate(task->cfg, rspamd_metric_actions_foreach_cb, metric_res); + metric_res->nactions = nact; + } + + rspamd_mempool_add_destructor(task->task_pool, + rspamd_scan_result_dtor, + metric_res); + DL_APPEND(task->result, metric_res); + + return metric_res; +} + +static inline int +rspamd_pr_sort(const struct rspamd_passthrough_result *pra, + const struct rspamd_passthrough_result *prb) +{ + return prb->priority - pra->priority; +} + +bool rspamd_add_passthrough_result(struct rspamd_task *task, + struct rspamd_action *action, + guint priority, + double target_score, + const gchar *message, + const gchar *module, + uint flags, + struct rspamd_scan_result *scan_result) +{ + struct 
rspamd_passthrough_result *pr; + + if (scan_result == NULL) { + scan_result = task->result; + } + + /* Find the specific action config */ + struct rspamd_action_config *action_config = NULL; + + for (unsigned int i = 0; i < scan_result->nactions; i++) { + struct rspamd_action_config *cur = &scan_result->actions_config[i]; + + /* We assume that all action pointers are static */ + if (cur->action == action) { + action_config = cur; + break; + } + } + + if (action_config && (action_config->flags & RSPAMD_ACTION_RESULT_DISABLED)) { + msg_info_task("<%s>: NOT set pre-result to '%s' %s(%.2f): '%s' from %s(%d); action is disabled", + MESSAGE_FIELD_CHECK(task, message_id), action->name, + flags & RSPAMD_PASSTHROUGH_LEAST ? "*least " : "", + target_score, + message, module, priority); + + return false; + } + + pr = rspamd_mempool_alloc(task->task_pool, sizeof(*pr)); + pr->action = action; + pr->priority = priority; + pr->message = message; + pr->module = module; + pr->target_score = target_score; + pr->flags = flags; + + DL_APPEND(scan_result->passthrough_result, pr); + DL_SORT(scan_result->passthrough_result, rspamd_pr_sort); + + if (!isnan(target_score)) { + + msg_info_task("<%s>: set pre-result to '%s' %s(%.2f): '%s' from %s(%d)", + MESSAGE_FIELD_CHECK(task, message_id), action->name, + flags & RSPAMD_PASSTHROUGH_LEAST ? "*least " : "", + target_score, + message, module, priority); + } + else { + msg_info_task("<%s>: set pre-result to '%s' %s(no score): '%s' from %s(%d)", + MESSAGE_FIELD_CHECK(task, message_id), action->name, + flags & RSPAMD_PASSTHROUGH_LEAST ? 
"*least " : "", + message, module, priority); + } + + scan_result->nresults++; + + return true; +} + +static inline gdouble +rspamd_check_group_score(struct rspamd_task *task, + const gchar *symbol, + struct rspamd_symbols_group *gr, + gdouble *group_score, + gdouble w) +{ + if (gr != NULL && group_score && gr->max_score > 0.0 && w > 0.0) { + if (*group_score >= gr->max_score && w > 0) { + msg_info_task("maximum group score %.2f for group %s has been reached," + " ignoring symbol %s with weight %.2f", + gr->max_score, + gr->name, symbol, w); + return NAN; + } + else if (*group_score + w > gr->max_score) { + w = gr->max_score - *group_score; + } + } + + return w; +} + +#ifndef DBL_EPSILON +#define DBL_EPSILON 2.2204460492503131e-16 +#endif + +static struct rspamd_symbol_result * +insert_metric_result(struct rspamd_task *task, + const gchar *symbol, + double weight, + const gchar *opt, + struct rspamd_scan_result *metric_res, + enum rspamd_symbol_insert_flags flags, + bool *new_sym) +{ + struct rspamd_symbol_result *symbol_result = NULL; + gdouble final_score, *gr_score = NULL, next_gf = 1.0, diff; + struct rspamd_symbol *sdef; + struct rspamd_symbols_group *gr = NULL; + const ucl_object_t *mobj, *sobj; + gint max_shots = G_MAXINT, ret; + guint i; + khiter_t k; + gboolean single = !!(flags & RSPAMD_SYMBOL_INSERT_SINGLE); + gchar *sym_cpy; + + if (!isfinite(weight)) { + msg_warn_task("detected %s score for symbol %s, replace it with zero", + isnan(weight) ? 
"NaN" : "infinity", symbol); + weight = 0.0; + } + + msg_debug_metric("want to insert symbol %s, initial weight %.2f", + symbol, weight); + + sdef = g_hash_table_lookup(task->cfg->symbols, symbol); + if (sdef == NULL) { + if (flags & RSPAMD_SYMBOL_INSERT_ENFORCE) { + final_score = 1.0 * weight; /* Enforce static weight to 1.0 */ + } + else { + final_score = 0.0; + } + + msg_debug_metric("no symbol definition for %s; final multiplier %.2f", + symbol, final_score); + } + else { + if (sdef->cache_item) { + /* Check if we can insert this symbol at all */ + if (!rspamd_symcache_is_item_allowed(task, sdef->cache_item, FALSE)) { + msg_debug_metric("symbol %s is not allowed to be inserted due to settings", + symbol); + return NULL; + } + } + + final_score = (*sdef->weight_ptr) * weight; + + PTR_ARRAY_FOREACH(sdef->groups, i, gr) + { + k = kh_get(rspamd_symbols_group_hash, metric_res->sym_groups, gr); + + if (k == kh_end(metric_res->sym_groups)) { + k = kh_put(rspamd_symbols_group_hash, metric_res->sym_groups, + gr, &ret); + kh_value(metric_res->sym_groups, k) = 0; + } + } + + msg_debug_metric("metric multiplier for %s is %.2f", + symbol, *sdef->weight_ptr); + } + + if (task->settings) { + gdouble corr; + mobj = ucl_object_lookup(task->settings, "scores"); + + if (!mobj) { + /* Legacy */ + mobj = task->settings; + } + else { + msg_debug_metric("found scores in the settings"); + } + + sobj = ucl_object_lookup(mobj, symbol); + if (sobj != NULL && ucl_object_todouble_safe(sobj, &corr)) { + msg_debug_metric("settings: changed weight of symbol %s from %.2f " + "to %.2f * %.2f", + symbol, final_score, corr, weight); + final_score = corr * weight; + } + } + + k = kh_get(rspamd_symbols_hash, metric_res->symbols, symbol); + if (k != kh_end(metric_res->symbols)) { + /* Existing metric score */ + symbol_result = kh_value(metric_res->symbols, k); + if (single) { + max_shots = 1; + } + else { + if (sdef) { + if (sdef->groups) { + PTR_ARRAY_FOREACH(sdef->groups, i, gr) + { + if 
(gr->flags & RSPAMD_SYMBOL_GROUP_ONE_SHOT) { + max_shots = 1; + } + } + } + + max_shots = MIN(max_shots, sdef->nshots); + } + else { + max_shots = task->cfg->default_max_shots; + } + } + + msg_debug_metric("nshots: %d for symbol %s", max_shots, symbol); + + if (!single && (max_shots > 0 && (symbol_result->nshots >= max_shots))) { + single = TRUE; + } + + symbol_result->nshots++; + + if (opt) { + rspamd_task_add_result_option(task, symbol_result, opt, strlen(opt)); + } + + /* Adjust diff */ + if (!single) { + diff = final_score; + msg_debug_metric("symbol %s can be inserted multiple times: %.2f weight", + symbol, diff); + } + else { + if (fabs(symbol_result->score) < fabs(final_score) && + signbit(symbol_result->score) == signbit(final_score)) { + /* Replace less significant weight with a more significant one */ + diff = final_score - symbol_result->score; + msg_debug_metric("symbol %s can be inserted single time;" + " weight adjusted %.2f + %.2f", + symbol, symbol_result->score, diff); + } + else { + diff = 0; + } + } + + if (diff) { + /* Handle grow factor */ + if (metric_res->grow_factor && diff > 0) { + diff *= metric_res->grow_factor; + next_gf *= task->cfg->grow_factor; + } + else if (diff > 0) { + next_gf = task->cfg->grow_factor; + } + + msg_debug_metric("adjust grow factor to %.2f for symbol %s (%.2f final)", + next_gf, symbol, diff); + + if (sdef) { + PTR_ARRAY_FOREACH(sdef->groups, i, gr) + { + gdouble cur_diff; + + k = kh_get(rspamd_symbols_group_hash, + metric_res->sym_groups, gr); + g_assert(k != kh_end(metric_res->sym_groups)); + gr_score = &kh_value(metric_res->sym_groups, k); + cur_diff = rspamd_check_group_score(task, symbol, gr, + gr_score, diff); + + if (isnan(cur_diff)) { + /* Limit reached, do not add result */ + msg_debug_metric( + "group limit %.2f is reached for %s when inserting symbol %s;" + " drop score %.2f", + *gr_score, gr->name, symbol, diff); + + diff = NAN; + break; + } + else if (gr_score) { + *gr_score += cur_diff; + + if 
(cur_diff < diff) { + /* Reduce */ + msg_debug_metric( + "group limit %.2f is reached for %s when inserting symbol %s;" + " reduce score %.2f - %.2f", + *gr_score, gr->name, symbol, diff, cur_diff); + diff = cur_diff; + } + } + } + } + + if (!isnan(diff)) { + metric_res->score += diff; + metric_res->grow_factor = next_gf; + + if (single) { + msg_debug_metric("final score for single symbol %s = %.2f; %.2f diff", + symbol, final_score, diff); + symbol_result->score = final_score; + } + else { + msg_debug_metric("increase final score for multiple symbol %s += %.2f = %.2f", + symbol, symbol_result->score, diff); + symbol_result->score += diff; + } + } + } + } + else { + /* New result */ + if (new_sym) { + *new_sym = true; + } + + sym_cpy = rspamd_mempool_strdup(task->task_pool, symbol); + k = kh_put(rspamd_symbols_hash, metric_res->symbols, + sym_cpy, &ret); + g_assert(ret > 0); + symbol_result = rspamd_mempool_alloc0(task->task_pool, sizeof(*symbol_result)); + kh_value(metric_res->symbols, k) = symbol_result; + + /* Handle grow factor */ + if (metric_res->grow_factor && final_score > 0) { + final_score *= metric_res->grow_factor; + next_gf *= task->cfg->grow_factor; + } + else if (final_score > 0) { + next_gf = task->cfg->grow_factor; + } + + msg_debug_metric("adjust grow factor to %.2f for symbol %s (%.2f final)", + next_gf, symbol, final_score); + + symbol_result->name = sym_cpy; + symbol_result->sym = sdef; + symbol_result->nshots = 1; + + if (sdef) { + /* Check group limits */ + PTR_ARRAY_FOREACH(sdef->groups, i, gr) + { + gdouble cur_score; + + k = kh_get(rspamd_symbols_group_hash, metric_res->sym_groups, gr); + g_assert(k != kh_end(metric_res->sym_groups)); + gr_score = &kh_value(metric_res->sym_groups, k); + cur_score = rspamd_check_group_score(task, symbol, gr, + gr_score, final_score); + + if (isnan(cur_score)) { + /* Limit reached, do not add result */ + msg_debug_metric( + "group limit %.2f is reached for %s when inserting symbol %s;" + " drop score %.2f", 
+ *gr_score, gr->name, symbol, final_score); + final_score = NAN; + break; + } + else if (gr_score) { + *gr_score += cur_score; + + if (cur_score < final_score) { + /* Reduce */ + msg_debug_metric( + "group limit %.2f is reached for %s when inserting symbol %s;" + " reduce score %.2f - %.2f", + *gr_score, gr->name, symbol, final_score, cur_score); + final_score = cur_score; + } + } + } + } + + if (!isnan(final_score)) { + const double epsilon = DBL_EPSILON; + + metric_res->score += final_score; + metric_res->grow_factor = next_gf; + symbol_result->score = final_score; + + if (final_score > epsilon) { + metric_res->npositive++; + metric_res->positive_score += final_score; + } + else if (final_score < -epsilon) { + metric_res->nnegative++; + metric_res->negative_score += fabs(final_score); + } + } + else { + symbol_result->score = 0; + } + + if (opt) { + rspamd_task_add_result_option(task, symbol_result, opt, strlen(opt)); + } + } + + msg_debug_metric("final insertion for symbol %s, score %.2f, factor: %f", + symbol, + symbol_result->score, + final_score); + metric_res->nresults++; + + return symbol_result; +} + +struct rspamd_symbol_result * +rspamd_task_insert_result_full(struct rspamd_task *task, + const gchar *symbol, + double weight, + const gchar *opt, + enum rspamd_symbol_insert_flags flags, + struct rspamd_scan_result *result) +{ + struct rspamd_symbol_result *symbol_result = NULL, *ret = NULL; + struct rspamd_scan_result *mres; + + /* + * We allow symbols to be inserted for skipped tasks, as it might be a + * race condition before some symbol is finished and skip flag being set. 
+ */ + if (!RSPAMD_TASK_IS_SKIPPED(task) && (task->processed_stages & (RSPAMD_TASK_STAGE_IDEMPOTENT >> 1))) { + msg_err_task("cannot insert symbol %s on idempotent phase", + symbol); + + return NULL; + } + + if (result == NULL) { + /* Insert everywhere */ + DL_FOREACH(task->result, mres) + { + if (mres->symbol_cbref != -1) { + /* Check if we can insert this symbol to this symbol result */ + GError *err = NULL; + lua_State *L = (lua_State *) task->cfg->lua_state; + + if (!rspamd_lua_universal_pcall(L, mres->symbol_cbref, + G_STRLOC, 1, "uss", &err, + "rspamd{task}", task, symbol, mres->name ? mres->name : "default")) { + msg_warn_task("cannot call for symbol_cbref for result %s: %e", + mres->name ? mres->name : "default", err); + g_error_free(err); + + continue; + } + else { + if (!lua_toboolean(L, -1)) { + /* Skip symbol */ + msg_debug_metric("skip symbol %s for result %s due to Lua return value", + symbol, mres->name); + lua_pop(L, 1); /* Remove result */ + + continue; + } + + lua_pop(L, 1); /* Remove result */ + } + } + + bool new_symbol = false; + + symbol_result = insert_metric_result(task, + symbol, + weight, + opt, + mres, + flags, + &new_symbol); + + if (mres->name == NULL) { + /* Default result */ + ret = symbol_result; + + /* Process cache item */ + if (symbol_result && task->cfg->cache && symbol_result->sym && symbol_result->nshots == 1) { + rspamd_symcache_inc_frequency(task->cfg->cache, + symbol_result->sym->cache_item, + symbol_result->sym->name); + } + } + else if (new_symbol) { + /* O(N) but we normally don't have any shadow results */ + LL_APPEND(ret, symbol_result); + } + } + } + else { + /* Specific insertion */ + symbol_result = insert_metric_result(task, + symbol, + weight, + opt, + result, + flags, + NULL); + ret = symbol_result; + + if (result->name == NULL) { + /* Process cache item */ + if (symbol_result && task->cfg->cache && symbol_result->sym && symbol_result->nshots == 1) { + rspamd_symcache_inc_frequency(task->cfg->cache, + 
symbol_result->sym->cache_item, + symbol_result->sym->name); + } + } + } + + return ret; +} + +static gchar * +rspamd_task_option_safe_copy(struct rspamd_task *task, + const gchar *val, + gsize vlen, + gsize *outlen) +{ + const gchar *p, *end; + + p = val; + end = val + vlen; + vlen = 0; /* Reuse */ + + while (p < end) { + if (*p & 0x80) { + UChar32 uc; + gint off = 0; + + U8_NEXT(p, off, end - p, uc); + + if (uc > 0) { + if (u_isprint(uc)) { + vlen += off; + } + else { + /* We will replace it with 0xFFFD */ + vlen += MAX(off, 3); + } + } + else { + vlen += MAX(off, 3); + } + + p += off; + } + else if (!g_ascii_isprint(*p)) { + /* Another 0xFFFD */ + vlen += 3; + p++; + } + else { + p++; + vlen++; + } + } + + gchar *dest, *d; + + dest = rspamd_mempool_alloc(task->task_pool, vlen + 1); + d = dest; + p = val; + + while (p < end) { + if (*p & 0x80) { + UChar32 uc; + gint off = 0; + + U8_NEXT(p, off, end - p, uc); + + if (uc > 0) { + if (u_isprint(uc)) { + memcpy(d, p, off); + d += off; + } + else { + /* We will replace it with 0xFFFD */ + *d++ = '\357'; + *d++ = '\277'; + *d++ = '\275'; + } + } + else { + *d++ = '\357'; + *d++ = '\277'; + *d++ = '\275'; + } + + p += off; + } + else if (!g_ascii_isprint(*p)) { + /* Another 0xFFFD */ + *d++ = '\357'; + *d++ = '\277'; + *d++ = '\275'; + p++; + } + else { + *d++ = *p++; + } + } + + *d = '\0'; + *(outlen) = d - dest; + + return dest; +} + +gboolean +rspamd_task_add_result_option(struct rspamd_task *task, + struct rspamd_symbol_result *s, + const gchar *val, + gsize vlen) +{ + struct rspamd_symbol_option *opt, srch; + gboolean ret = FALSE; + gchar *opt_cpy = NULL; + gsize cpy_len; + khiter_t k; + gint r; + struct rspamd_symbol_result *cur; + + if (s && val) { + /* + * Here we assume that this function is all the time called with the + * symbol from the default result, not some shadow result, or + * the option insertion will be wrong + */ + LL_FOREACH(s, cur) + { + if (cur->opts_len < 0) { + /* Cannot add more options, give 
up */ + msg_debug_task("cannot add more options to symbol %s when adding option %s", + cur->name, val); + ret = FALSE; + continue; + } + + if (!cur->options) { + cur->options = kh_init(rspamd_options_hash); + } + + if (vlen + cur->opts_len > task->cfg->max_opts_len) { + /* Add truncated option */ + msg_info_task("cannot add more options to symbol %s when adding option %s", + cur->name, val); + val = "..."; + vlen = 3; + cur->opts_len = -1; + } + + if (!(cur->sym && (cur->sym->flags & RSPAMD_SYMBOL_FLAG_ONEPARAM))) { + + srch.option = (gchar *) val; + srch.optlen = vlen; + k = kh_get(rspamd_options_hash, cur->options, &srch); + + if (k == kh_end(cur->options)) { + opt_cpy = rspamd_task_option_safe_copy(task, val, vlen, &cpy_len); + if (cpy_len != vlen) { + srch.option = (gchar *) opt_cpy; + srch.optlen = cpy_len; + k = kh_get(rspamd_options_hash, cur->options, &srch); + } + /* Append new options */ + if (k == kh_end(cur->options)) { + opt = rspamd_mempool_alloc0(task->task_pool, sizeof(*opt)); + opt->optlen = cpy_len; + opt->option = opt_cpy; + + kh_put(rspamd_options_hash, cur->options, opt, &r); + DL_APPEND(cur->opts_head, opt); + + if (s == cur) { + ret = TRUE; + } + } + } + } + else { + /* Skip addition */ + if (s == cur) { + ret = FALSE; + } + } + + if (ret && cur->opts_len >= 0) { + cur->opts_len += vlen; + } + } + } + else if (!val) { + ret = TRUE; + } + + task->result->nresults++; + + return ret; +} + +struct rspamd_action_config * +rspamd_find_action_config_for_action(struct rspamd_scan_result *scan_result, + struct rspamd_action *act) +{ + for (unsigned int i = 0; i < scan_result->nactions; i++) { + struct rspamd_action_config *cur = &scan_result->actions_config[i]; + + if (act == cur->action) { + return cur; + } + } + + return NULL; +} + +struct rspamd_action * +rspamd_check_action_metric(struct rspamd_task *task, + struct rspamd_passthrough_result **ppr, + struct rspamd_scan_result *scan_result) +{ + struct rspamd_action_config *action_lim, + *noaction = 
NULL; + struct rspamd_action *selected_action = NULL, *least_action = NULL; + struct rspamd_passthrough_result *pr, *sel_pr = NULL; + double max_score = -(G_MAXDOUBLE), sc; + gboolean seen_least = FALSE; + + if (scan_result == NULL) { + scan_result = task->result; + } + + if (scan_result->passthrough_result != NULL) { + DL_FOREACH(scan_result->passthrough_result, pr) + { + struct rspamd_action_config *act_config = + rspamd_find_action_config_for_action(scan_result, pr->action); + + /* Skip disabled actions */ + if (act_config && (act_config->flags & RSPAMD_ACTION_RESULT_DISABLED)) { + continue; + } + + if (!seen_least || !(pr->flags & RSPAMD_PASSTHROUGH_LEAST)) { + sc = pr->target_score; + selected_action = pr->action; + + if (!(pr->flags & RSPAMD_PASSTHROUGH_LEAST)) { + if (!isnan(sc)) { + if (pr->action->action_type == METRIC_ACTION_NOACTION) { + scan_result->score = MIN(sc, scan_result->score); + } + else { + scan_result->score = sc; + } + } + + if (ppr) { + *ppr = pr; + } + + return selected_action; + } + else { + seen_least = true; + least_action = selected_action; + + if (isnan(sc)) { + + if (selected_action->flags & RSPAMD_ACTION_NO_THRESHOLD) { + /* + * In this case, we have a passthrough action that + * is `least` action, however, there is no threshold + * on it. 
+ * + * Hence, we imply the following logic: + * + * - we leave score unchanged + * - we apply passthrough no threshold action unless + * score based action *is not* reject, otherwise + * we apply reject action + */ + } + else { + sc = selected_action->threshold; + max_score = sc; + sel_pr = pr; + } + } + else { + max_score = sc; + sel_pr = pr; + } + } + } + } + } + + /* + * Select result by score + * + * Iterate from the last action config down to the first one. The counter + * is decremented inside the condition: `nactions` is an unsigned int, so + * `nactions - 1` would wrap to UINT_MAX (not SIZE_MAX) when there are no + * actions and the loop would index far outside of `actions_config`. + */ + for (size_t i = scan_result->nactions; i-- > 0;) { + action_lim = &scan_result->actions_config[i]; + sc = action_lim->cur_limit; + + if (action_lim->action->action_type == METRIC_ACTION_NOACTION) { + noaction = action_lim; + } + + if ((action_lim->flags & (RSPAMD_ACTION_RESULT_DISABLED | RSPAMD_ACTION_RESULT_NO_THRESHOLD))) { + continue; + } + + if (isnan(sc) || + (action_lim->action->flags & (RSPAMD_ACTION_NO_THRESHOLD | RSPAMD_ACTION_HAM))) { + continue; + } + + if (scan_result->score >= sc && sc > max_score) { + selected_action = action_lim->action; + max_score = sc; + } + } + + /* Fall back to `no action` only if such an action config has been seen */ + if (selected_action == NULL && noaction != NULL) { + selected_action = noaction->action; + } + + if (selected_action) { + + if (seen_least) { + /* Adjust least action */ + if (least_action->flags & RSPAMD_ACTION_NO_THRESHOLD) { + if (selected_action->action_type != METRIC_ACTION_REJECT && + selected_action->action_type != METRIC_ACTION_DISCARD) { + /* Override score based action with least action */ + selected_action = least_action; + + if (ppr) { + *ppr = sel_pr; + } + } + } + else { + /* Adjust score if needed */ + if (max_score > scan_result->score) { + if (ppr) { + *ppr = sel_pr; + } + + scan_result->score = max_score; + } + } + } + + return selected_action; + } + + if (ppr) { + *ppr = sel_pr; + } + + /* Nothing has been selected and there is no `no action` config: return NULL instead of dereferencing it */ + return noaction ? noaction->action : NULL; +} + +struct rspamd_symbol_result * +rspamd_task_find_symbol_result(struct rspamd_task *task, const char *sym, + struct rspamd_scan_result *result) +{ + struct rspamd_symbol_result *res = NULL; + khiter_t k; + + if (result == NULL) { + /* Use default result */ +
result = task->result; + } + + k = kh_get(rspamd_symbols_hash, result->symbols, sym); + + if (k != kh_end(result->symbols)) { + res = kh_value(result->symbols, k); + } + + return res; +} + +/* + * Removes `symbol` from `result` (or from the default task result if `result` + * is NULL): subtracts its score from the total score and from the score of + * every group tracked in `sym_groups`, then deletes it from the symbols hash. + * Returns the removed symbol result or NULL if the symbol was not inserted. + */ +struct rspamd_symbol_result *rspamd_task_remove_symbol_result( + struct rspamd_task *task, + const gchar *symbol, + struct rspamd_scan_result *result) +{ + struct rspamd_symbol_result *res = NULL; + khiter_t k; + + if (result == NULL) { + /* Use default result */ + result = task->result; + } + + k = kh_get(rspamd_symbols_hash, result->symbols, symbol); + + if (k != kh_end(result->symbols)) { + res = kh_value(result->symbols, k); + + if (!isnan(res->score)) { + /* Remove score from the result */ + result->score -= res->score; + + /* Also check the group limit */ + if (result->sym_groups && res->sym) { + /* Same group type and index type as used on insertion */ + struct rspamd_symbols_group *gr; + guint i; + khiter_t k_groups; + + PTR_ARRAY_FOREACH(res->sym->groups, i, gr) + { + gdouble *gr_score; + + k_groups = kh_get(rspamd_symbols_group_hash, + result->sym_groups, gr); + + if (k_groups != kh_end(result->sym_groups)) { + gr_score = &kh_value(result->sym_groups, k_groups); + + if (gr_score) { + *gr_score -= res->score; + } + } + } + } + } + + kh_del(rspamd_symbols_hash, result->symbols, k); + } + else { + return NULL; + } + + return res; +} + +void rspamd_task_symbol_result_foreach(struct rspamd_task *task, + struct rspamd_scan_result *result, GHFunc func, + gpointer ud) +{ + const gchar *kk; + struct rspamd_symbol_result *res; + + if (result == NULL) { + /* Use default result */ + result = task->result; + } + + if (func) { + kh_foreach(result->symbols, kk, res, { + func((gpointer) kk, (gpointer) res, ud); + }); + } +} + +struct rspamd_scan_result * +rspamd_find_metric_result(struct rspamd_task *task, + const gchar *name) +{ + struct rspamd_scan_result *res; + + if (name == NULL || strcmp(name, "default") == 0) { + return task->result; + } + + DL_FOREACH(task->result, res) + { + if (res->name && strcmp(res->name, name) == 0) { + return res;
+ } + } + + return NULL; +} diff --git a/src/libmime/scan_result.h b/src/libmime/scan_result.h new file mode 100644 index 0000000..46c2de8 --- /dev/null +++ b/src/libmime/scan_result.h @@ -0,0 +1,250 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file scan_result.h + * Scan result holder + */ + +#ifndef RSPAMD_SCAN_RESULT_H +#define RSPAMD_SCAN_RESULT_H + +#include "config.h" +#include "rspamd_symcache.h" +#include "task.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_task; +struct rspamd_settings; +struct rspamd_classifier_config; + +struct rspamd_symbol_option { + gchar *option; + gsize optlen; + struct rspamd_symbol_option *prev, *next; +}; + +enum rspamd_symbol_result_flags { + RSPAMD_SYMBOL_RESULT_NORMAL = 0, + RSPAMD_SYMBOL_RESULT_IGNORED = (1 << 0) +}; + +struct kh_rspamd_options_hash_s; + +/** + * Rspamd symbol + */ +struct rspamd_symbol_result { + double score; /**< symbol's score */ + struct kh_rspamd_options_hash_s *options; /**< list of symbol's options */ + struct rspamd_symbol_option *opts_head; /**< head of linked list of options */ + const gchar *name; + struct rspamd_symbol *sym; /**< symbol configuration */ + gssize opts_len; /**< total size of all options (negative if truncated option is added) */ + guint nshots; + int flags; + struct rspamd_symbol_result *next; /**< for shadow results */ +}; + + +#define RSPAMD_PASSTHROUGH_NORMAL 1 +#define 
RSPAMD_PASSTHROUGH_LOW 0 +#define RSPAMD_PASSTHROUGH_HIGH 2 +#define RSPAMD_PASSTHROUGH_CRITICAL 3 + +#define RSPAMD_PASSTHROUGH_LEAST (1u << 0u) +#define RSPAMD_PASSTHROUGH_NO_SMTP_MESSAGE (1u << 1u) +#define RSPAMD_PASSTHROUGH_PROCESS_ALL (1u << 2u) + +struct rspamd_passthrough_result { + struct rspamd_action *action; + guint priority; + guint flags; + double target_score; + const gchar *message; + const gchar *module; + struct rspamd_passthrough_result *prev, *next; +}; + + +enum rspamd_action_config_flags { + RSPAMD_ACTION_RESULT_DEFAULT = 0, + RSPAMD_ACTION_RESULT_NO_THRESHOLD = (1u << 0u), + RSPAMD_ACTION_RESULT_DISABLED = (1u << 1u), +}; +struct rspamd_action_config { + gdouble cur_limit; + int flags; + struct rspamd_action *action; +}; + +struct kh_rspamd_symbols_hash_s; +struct kh_rspamd_symbols_group_hash_s; + + +struct rspamd_scan_result { + double score; /**< total score */ + double grow_factor; /**< current grow factor */ + struct rspamd_passthrough_result *passthrough_result; + double positive_score; + double negative_score; + struct kh_rspamd_symbols_hash_s *symbols; /**< symbols of metric */ + struct kh_rspamd_symbols_group_hash_s *sym_groups; /**< groups of symbols */ + struct rspamd_action_config *actions_config; + const gchar *name; /**< for named results, NULL is the default result */ + struct rspamd_task *task; /**< back reference */ + gint symbol_cbref; /**< lua function that defines if a symbol can be inserted, -1 if unused */ + guint nactions; + guint npositive; + guint nnegative; + guint nresults; /**< all results: positive, negative, passthrough etc */ + guint nresults_postfilters; /**< how many results are there before postfilters stage */ + struct rspamd_scan_result *prev, *next; /**< double linked list of results */ +}; + +/** + * Create or return existing result for the specified metric name + * @param task task object + * @return metric result or NULL if metric `name` has not been found + */ +struct rspamd_scan_result 
*rspamd_create_metric_result(struct rspamd_task *task, + const gchar *name, gint lua_sym_cbref); + +/** + * Find result with a specific name (NULL means the default result) + * @param task + * @param name + * @return + */ +struct rspamd_scan_result *rspamd_find_metric_result(struct rspamd_task *task, + const gchar *name); + +/** + * Adds a new passthrough result to a task + * @param task + * @param action + * @param priority + * @param target_score + * @param message + * @param module + */ +bool rspamd_add_passthrough_result(struct rspamd_task *task, + struct rspamd_action *action, guint priority, + double target_score, const gchar *message, + const gchar *module, guint flags, + struct rspamd_scan_result *scan_result); + +enum rspamd_symbol_insert_flags { + RSPAMD_SYMBOL_INSERT_DEFAULT = 0, + RSPAMD_SYMBOL_INSERT_SINGLE = (1 << 0), + RSPAMD_SYMBOL_INSERT_ENFORCE = (1 << 1), +}; + +/** + * Insert a result to task + * @param task worker's task that present message from user + * @param metric_name metric's name to which we need to insert result + * @param symbol symbol to insert + * @param weight numeric weight for symbol + * @param opts list of symbol's options + */ +struct rspamd_symbol_result *rspamd_task_insert_result_full(struct rspamd_task *task, + const gchar *symbol, + double weight, + const gchar *opts, + enum rspamd_symbol_insert_flags flags, + struct rspamd_scan_result *result); + +#define rspamd_task_insert_result_single(task, symbol, weight, opts) \ + rspamd_task_insert_result_full((task), (symbol), (weight), (opts), RSPAMD_SYMBOL_INSERT_SINGLE, NULL) +#define rspamd_task_insert_result(task, symbol, weight, opts) \ + rspamd_task_insert_result_full((task), (symbol), (weight), (opts), RSPAMD_SYMBOL_INSERT_DEFAULT, NULL) + +/** + * Removes a symbol from a specific symbol result + * @param task + * @param symbol + * @param result + * @return + */ +struct rspamd_symbol_result *rspamd_task_remove_symbol_result( + struct rspamd_task *task, + const gchar *symbol, 
+ struct rspamd_scan_result *result); +/** + * Adds new option to symbol + * @param task + * @param s + * @param opt + */ +gboolean rspamd_task_add_result_option(struct rspamd_task *task, + struct rspamd_symbol_result *s, + const gchar *opt, + gsize vlen); + +/** + * Finds symbol result + * @param task + * @param sym + * @return + */ +struct rspamd_symbol_result * +rspamd_task_find_symbol_result(struct rspamd_task *task, const char *sym, + struct rspamd_scan_result *result); + +/** + * Compatibility function to iterate on symbols hash + * @param task + * @param func + * @param ud + */ +void rspamd_task_symbol_result_foreach(struct rspamd_task *task, + struct rspamd_scan_result *result, + GHFunc func, + gpointer ud); + +/** + * Default consolidation function for metric, it get all symbols and multiply symbol + * weight by some factor that is specified in config. Default factor is 1. + * @param task worker's task that present message from user + * @param metric_name name of metric + * @return result metric weight + */ +double rspamd_factor_consolidation_func(struct rspamd_task *task, + const gchar *metric_name, + const gchar *unused); + + +/** + * Check thresholds and return action for a task + * @param task + * @return + */ +struct rspamd_action *rspamd_check_action_metric(struct rspamd_task *task, + struct rspamd_passthrough_result **ppr, + struct rspamd_scan_result *scan_result); + +struct rspamd_action_config *rspamd_find_action_config_for_action(struct rspamd_scan_result *scan_result, + struct rspamd_action *act); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libmime/scan_result_private.h b/src/libmime/scan_result_private.h new file mode 100644 index 0000000..cf0c0c5 --- /dev/null +++ b/src/libmime/scan_result_private.h @@ -0,0 +1,55 @@ +// +// Created by Vsevolod Stakhov on 2019-01-14. 
+// + +#ifndef RSPAMD_SCAN_RESULT_PRIVATE_H +#define RSPAMD_SCAN_RESULT_PRIVATE_H + +#include "scan_result.h" +#include "contrib/libucl/khash.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define RSPAMD_OPTS_SEED 0x9f1f608628a4fefbULL +#define rspamd_symopt_hash(opt) (rspamd_cryptobox_fast_hash( \ + ((struct rspamd_symbol_option *) opt)->option, \ + ((struct rspamd_symbol_option *) opt)->optlen, RSPAMD_OPTS_SEED)) +static inline bool +rspamd_symopt_equal(const struct rspamd_symbol_option *o1, + const struct rspamd_symbol_option *o2) +{ + if (o1->optlen == o2->optlen) { + return (memcmp(o1->option, o2->option, o1->optlen) == 0); + } + + return false; +} + +KHASH_INIT(rspamd_options_hash, struct rspamd_symbol_option *, char, + 0, rspamd_symopt_hash, rspamd_symopt_equal); +/** + * Result of metric processing + */ +KHASH_MAP_INIT_STR(rspamd_symbols_hash, struct rspamd_symbol_result *); +#if UINTPTR_MAX <= UINT_MAX +/* 32 bit */ +#define rspamd_ptr_hash_func(key) (khint32_t)(((uintptr_t) (key)) >> 1) +#else +/* likely 64 bit */ +#define rspamd_ptr_hash_func(key) (khint32_t)(((uintptr_t) (key)) >> 3) +#endif +#define rspamd_ptr_equal_func(a, b) ((a) == (b)) +KHASH_INIT(rspamd_symbols_group_hash, + void *, + double, + 1, + rspamd_ptr_hash_func, + rspamd_ptr_equal_func); + +#ifdef __cplusplus +} +#endif + +#endif//RSPAMD_SCAN_RESULT_PRIVATE_H diff --git a/src/libmime/smtp_parsers.h b/src/libmime/smtp_parsers.h new file mode 100644 index 0000000..e188b63 --- /dev/null +++ b/src/libmime/smtp_parsers.h @@ -0,0 +1,51 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBMIME_SMTP_PARSERS_H_ +#define SRC_LIBMIME_SMTP_PARSERS_H_ + +#include "config.h" +#include "email_addr.h" +#include "content_type.h" +#include "task.h" +#include "message.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +int rspamd_smtp_addr_parse(const char *data, size_t len, + struct rspamd_email_address *addr); + +gboolean rspamd_content_disposition_parser(const char *data, size_t len, + struct rspamd_content_disposition *cd, + rspamd_mempool_t *pool); + +gboolean +rspamd_rfc2047_parser(const gchar *in, gsize len, gint *pencoding, + const gchar **charset, gsize *charset_len, + const gchar **encoded, gsize *encoded_len); + +rspamd_inet_addr_t *rspamd_parse_smtp_ip(const char *data, size_t len, + rspamd_mempool_t *pool); + +guint64 rspamd_parse_smtp_date(const unsigned char *data, size_t len, GError **err); + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBMIME_SMTP_PARSERS_H_ */ |