summary refs log tree commit diff stats
path: root/src/libmime
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/libmime/CMakeLists.txt19
-rw-r--r--src/libmime/archives.c2057
-rw-r--r--src/libmime/archives.h72
-rw-r--r--src/libmime/content_type.c884
-rw-r--r--src/libmime/content_type.h130
-rw-r--r--src/libmime/email_addr.c563
-rw-r--r--src/libmime/email_addr.h97
-rw-r--r--src/libmime/images.c718
-rw-r--r--src/libmime/images.h76
-rw-r--r--src/libmime/lang_detection.c2103
-rw-r--r--src/libmime/lang_detection.h110
-rw-r--r--src/libmime/lang_detection_fasttext.cxx269
-rw-r--r--src/libmime/lang_detection_fasttext.h91
-rw-r--r--src/libmime/message.c1732
-rw-r--r--src/libmime/message.h239
-rw-r--r--src/libmime/mime_encoding.c864
-rw-r--r--src/libmime/mime_encoding.h148
-rw-r--r--src/libmime/mime_encoding_list.h1577
-rw-r--r--src/libmime/mime_expressions.c2392
-rw-r--r--src/libmime/mime_expressions.h65
-rw-r--r--src/libmime/mime_headers.c1441
-rw-r--r--src/libmime/mime_headers.h200
-rw-r--r--src/libmime/mime_parser.c1758
-rw-r--r--src/libmime/mime_parser.h46
-rw-r--r--src/libmime/mime_string.cxx167
-rw-r--r--src/libmime/mime_string.hxx670
-rw-r--r--src/libmime/received.cxx1017
-rw-r--r--src/libmime/received.h68
-rw-r--r--src/libmime/received.hxx314
-rw-r--r--src/libmime/scan_result.c1106
-rw-r--r--src/libmime/scan_result.h250
-rw-r--r--src/libmime/scan_result_private.h55
-rw-r--r--src/libmime/smtp_parsers.h51
33 files changed, 21349 insertions, 0 deletions
diff --git a/src/libmime/CMakeLists.txt b/src/libmime/CMakeLists.txt
new file mode 100644
index 0000000..09e5dbf
--- /dev/null
+++ b/src/libmime/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Librspamd mime: list of the C/C++ sources that make up the MIME library
+SET(LIBRSPAMDMIMESRC
+ ${CMAKE_CURRENT_SOURCE_DIR}/received.cxx
+ ${CMAKE_CURRENT_SOURCE_DIR}/email_addr.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/mime_expressions.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/scan_result.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/images.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/message.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/archives.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/content_type.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/mime_headers.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection_fasttext.cxx
+ ${CMAKE_CURRENT_SOURCE_DIR}/mime_string.cxx
+ )
+# Export the source list to the parent CMake scope under the name RSPAMD_MIME
+SET(RSPAMD_MIME ${LIBRSPAMDMIMESRC} PARENT_SCOPE) \ No newline at end of file
diff --git a/src/libmime/archives.c b/src/libmime/archives.c
new file mode 100644
index 0000000..ea0ea55
--- /dev/null
+++ b/src/libmime/archives.c
@@ -0,0 +1,2057 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#include "message.h"
+#include "task.h"
+#include "archives.h"
+#include "libmime/mime_encoding.h"
+#include <unicode/uchar.h>
+#include <unicode/utf8.h>
+#include <unicode/utf16.h>
+#include <unicode/ucnv.h>
+/*
+ * Debug logging helper for the "archive" log module.
+ * The expansion reads `task->task_pool->tag.uid`, so a variable named `task`
+ * must be in scope wherever this macro (or a macro built on it) is used.
+ */
+#define msg_debug_archive(...) rspamd_conditional_debug_fast(NULL, NULL, \
+ rspamd_archive_log_id, "archive", task->task_pool->tag.uid, \
+ G_STRFUNC, \
+ __VA_ARGS__)
+/* Sets up the "archive" log module; presumably this is what defines rspamd_archive_log_id used above - confirm */
+INIT_LOG_MODULE(archive)
+
+/**
+ * Memory pool destructor for a parsed archive description.
+ * Releases every file entry (together with its name string) and then the
+ * pointer array that held the entries. The archive structure itself is not
+ * freed here.
+ * @param p pointer to the struct rspamd_archive being torn down
+ */
+static void
+rspamd_archive_dtor(gpointer p)
+{
+	struct rspamd_archive *archive = p;
+	guint idx = 0;
+
+	while (idx < archive->files->len) {
+		struct rspamd_archive_file *entry =
+			g_ptr_array_index(archive->files, idx);
+
+		if (entry->fname != NULL) {
+			g_string_free(entry->fname, TRUE);
+		}
+
+		g_free(entry);
+		idx++;
+	}
+
+	g_ptr_array_free(archive->files, TRUE);
+}
+
+/**
+ * Stores a printable version of an archive member name in fentry->fname and
+ * flags suspicious names as obfuscated.
+ *
+ * When a charset can be detected for the raw name, the name is converted to
+ * UTF-16 and then to UTF-8; zero-width spaces and control characters mark the
+ * entry with RSPAMD_ARCHIVE_FILE_OBFUSCATED. When no charset is detected,
+ * every non-graphic byte is replaced with '?' and ASCII control bytes also
+ * mark the entry as obfuscated.
+ *
+ * @param task   task providing the memory pool and the logging context
+ * @param arch   archive being parsed (its name is only used in log messages)
+ * @param fentry file entry to fill; fname is set on every return path
+ * @param in     raw file name bytes
+ * @param inlen  length of `in` in bytes
+ * @return true when the name was converted or sanitised; false when a charset
+ *         conversion step failed and the raw bytes were stored instead
+ */
+static bool
+rspamd_archive_file_try_utf(struct rspamd_task *task,
+							struct rspamd_archive *arch,
+							struct rspamd_archive_file *fentry,
+							const gchar *in, gsize inlen)
+{
+	const gchar *charset = NULL, *p, *end;
+	GString *res;
+
+	charset = rspamd_mime_charset_find_by_content(in, inlen, TRUE);
+
+	if (charset) {
+		UChar *tmp;
+		UErrorCode uc_err = U_ZERO_ERROR;
+		gint32 r, clen, dlen;
+		struct rspamd_charset_converter *conv;
+		UConverter *utf8_converter;
+
+		conv = rspamd_mime_get_converter_cached(charset, task->task_pool,
+												TRUE, &uc_err);
+		utf8_converter = rspamd_get_utf8_converter();
+
+		if (conv == NULL) {
+			msg_info_task("cannot open converter for %s: %s",
+						  charset, u_errorName(uc_err));
+			fentry->flags |= RSPAMD_ARCHIVE_FILE_OBFUSCATED;
+			fentry->fname = g_string_new_len(in, inlen);
+
+			return false;
+		}
+
+		/* Step 1: source charset -> UTF-16 */
+		tmp = g_malloc(sizeof(*tmp) * (inlen + 1));
+		r = rspamd_converter_to_uchars(conv, tmp, inlen + 1,
+									   in, inlen, &uc_err);
+		if (!U_SUCCESS(uc_err)) {
+			msg_info_task("cannot convert data to unicode from %s: %s",
+						  charset, u_errorName(uc_err));
+			g_free(tmp);
+
+			fentry->flags |= RSPAMD_ARCHIVE_FILE_OBFUSCATED;
+			fentry->fname = g_string_new_len(in, inlen);
+
+			/* The function returns bool: report failure as false, not NULL */
+			return false;
+		}
+
+		/* Step 2: look for characters commonly used to disguise a name */
+		int i = 0;
+
+		while (i < r) {
+			UChar32 uc;
+
+			U16_NEXT(tmp, i, r, uc);
+
+			if (IS_ZERO_WIDTH_SPACE(uc) || u_iscntrl(uc)) {
+				msg_info_task("control character in archive file name found: 0x%02xd "
+							  "(filename=%T)",
+							  uc, arch->archive_name);
+				fentry->flags |= RSPAMD_ARCHIVE_FILE_OBFUSCATED;
+				break;
+			}
+		}
+
+		/* Step 3: UTF-16 -> UTF-8 into a buffer sized for the worst case */
+		clen = ucnv_getMaxCharSize(utf8_converter);
+		dlen = UCNV_GET_MAX_BYTES_FOR_STRING(r, clen);
+		res = g_string_sized_new(dlen);
+		r = ucnv_fromUChars(utf8_converter, res->str, dlen, tmp, r, &uc_err);
+
+		if (!U_SUCCESS(uc_err)) {
+			msg_info_task("cannot convert data from unicode from %s: %s",
+						  charset, u_errorName(uc_err));
+			g_free(tmp);
+			g_string_free(res, TRUE);
+			fentry->flags |= RSPAMD_ARCHIVE_FILE_OBFUSCATED;
+			fentry->fname = g_string_new_len(in, inlen);
+
+			return false;
+		}
+
+		g_free(tmp);
+		res->len = r;
+
+		msg_debug_archive("converted from %s to UTF-8 inlen: %z, outlen: %d",
+						  charset, inlen, r);
+		fentry->fname = res;
+	}
+	else {
+		/* Convert unsafe characters to '?' */
+		res = g_string_sized_new(inlen);
+		p = in;
+		end = in + inlen;
+
+		while (p < end) {
+			if (g_ascii_isgraph(*p)) {
+				g_string_append_c(res, *p);
+			}
+			else {
+				g_string_append_c(res, '?');
+
+				if (*p < 0x7f && (g_ascii_iscntrl(*p) || *p == '\0')) {
+					/* Log only the first suspicious byte of a name */
+					if (!(fentry->flags & RSPAMD_ARCHIVE_FILE_OBFUSCATED)) {
+						msg_info_task("suspicious character in archive file name found: 0x%02xd "
+									  "(filename=%T)",
+									  (int) *p, arch->archive_name);
+						fentry->flags |= RSPAMD_ARCHIVE_FILE_OBFUSCATED;
+					}
+				}
+			}
+
+			p++;
+		}
+		fentry->fname = res;
+	}
+
+	return true;
+}
+
+/**
+ * Parses the central directory of a zip archive stored in a MIME part.
+ *
+ * The end-of-central-directory (EOCD) record is searched backwards from the
+ * end of the data (at most `max_processed` steps), then every central
+ * directory record is read to collect the file name, sizes and encryption
+ * hints. On success the part is marked as RSPAMD_MIME_PART_ARCHIVE and
+ * part->specific.arch is set; on any inconsistency the function logs and
+ * returns without marking the part.
+ *
+ * @param task task providing the memory pool and the logging context
+ * @param part MIME part whose parsed_data holds the candidate zip file
+ */
+static void
+rspamd_archive_process_zip(struct rspamd_task *task,
+						   struct rspamd_mime_part *part)
+{
+	const guchar *p, *start, *end, *eocd = NULL, *cd;
+	const guint32 eocd_magic = 0x06054b50, cd_basic_len = 46;
+	const guchar cd_magic[] = {0x50, 0x4b, 0x01, 0x02};
+	const guint max_processed = 1024;
+	/* Size of an EOCD record that carries no comment */
+	const gsize eocd_min_len = 22;
+	guint32 cd_offset, cd_size, comp_size, uncomp_size, processed = 0;
+	guint16 extra_len, fname_len, comment_len;
+	struct rspamd_archive *arch;
+	struct rspamd_archive_file *f = NULL;
+
+	if (part->parsed_data.len < eocd_min_len) {
+		/*
+		 * Too short to hold an EOCD; bail out before any pointer is
+		 * formed in front of the buffer.
+		 */
+		msg_info_task("zip archive is invalid (no EOCD)");
+
+		return;
+	}
+
+	/* Zip files have interesting data at the end of archive */
+	p = part->parsed_data.begin + part->parsed_data.len - 1;
+	start = part->parsed_data.begin;
+	end = p;
+
+	/* Search for EOCD:
+	 * 22 bytes is a typical size of eocd without a comment and
+	 * end points at the last byte of the data
+	 */
+	p -= 21;
+
+	while (p > start + sizeof(guint32)) {
+		guint32 t;
+
+		if (processed > max_processed) {
+			break;
+		}
+
+		/* XXX: not an efficient approach */
+		memcpy(&t, p, sizeof(t));
+
+		if (GUINT32_FROM_LE(t) == eocd_magic) {
+			eocd = p;
+			break;
+		}
+
+		p--;
+		processed++;
+	}
+
+
+	if (eocd == NULL) {
+		/* Not a zip file */
+		msg_info_task("zip archive is invalid (no EOCD)");
+
+		return;
+	}
+
+	if (end - eocd < 21) {
+		msg_info_task("zip archive is invalid (short EOCD)");
+
+		return;
+	}
+
+
+	memcpy(&cd_size, eocd + 12, sizeof(cd_size));
+	cd_size = GUINT32_FROM_LE(cd_size);
+	memcpy(&cd_offset, eocd + 16, sizeof(cd_offset));
+	cd_offset = GUINT32_FROM_LE(cd_offset);
+
+	/*
+	 * We need to check sanity as well. Both values come from the file, so
+	 * add them in 64 bits: a 32 bit sum could wrap and pass the check.
+	 */
+	if ((guint64) cd_offset + cd_size > (guint64) (eocd - start)) {
+		msg_info_task("zip archive is invalid (bad size/offset for CD)");
+
+		return;
+	}
+
+	cd = start + cd_offset;
+
+	arch = rspamd_mempool_alloc0(task->task_pool, sizeof(*arch));
+	arch->files = g_ptr_array_new();
+	arch->type = RSPAMD_ARCHIVE_ZIP;
+	if (part->cd) {
+		arch->archive_name = &part->cd->filename;
+	}
+	/* From here on the pool owns the file list, so early returns are safe */
+	rspamd_mempool_add_destructor(task->task_pool, rspamd_archive_dtor,
+								  arch);
+
+	while (cd < start + cd_offset + cd_size) {
+		guint16 flags;
+		gsize record_len;
+
+		/* Read central directory record */
+		if ((gsize) (eocd - cd) < cd_basic_len ||
+			memcmp(cd, cd_magic, sizeof(cd_magic)) != 0) {
+			msg_info_task("zip archive is invalid (bad cd record)");
+
+			return;
+		}
+
+		memcpy(&flags, cd + 8, sizeof(guint16));
+		flags = GUINT16_FROM_LE(flags);
+		memcpy(&comp_size, cd + 20, sizeof(guint32));
+		comp_size = GUINT32_FROM_LE(comp_size);
+		memcpy(&uncomp_size, cd + 24, sizeof(guint32));
+		uncomp_size = GUINT32_FROM_LE(uncomp_size);
+		memcpy(&fname_len, cd + 28, sizeof(fname_len));
+		fname_len = GUINT16_FROM_LE(fname_len);
+		memcpy(&extra_len, cd + 30, sizeof(extra_len));
+		extra_len = GUINT16_FROM_LE(extra_len);
+		memcpy(&comment_len, cd + 32, sizeof(comment_len));
+		comment_len = GUINT16_FROM_LE(comment_len);
+
+		/* Compare lengths instead of forming a pointer past the buffer */
+		record_len = (gsize) fname_len + comment_len + extra_len + cd_basic_len;
+
+		if (record_len > (gsize) (eocd - cd)) {
+			msg_info_task("zip archive is invalid (too large cd record)");
+
+			return;
+		}
+
+		f = g_malloc0(sizeof(*f));
+		rspamd_archive_file_try_utf(task, arch, f, cd + cd_basic_len, fname_len);
+
+		f->compressed_size = comp_size;
+		f->uncompressed_size = uncomp_size;
+
+		if (flags & 0x41u) {
+			f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
+		}
+
+		if (f->fname) {
+			if (f->flags & RSPAMD_ARCHIVE_FILE_OBFUSCATED) {
+				arch->flags |= RSPAMD_ARCHIVE_HAS_OBFUSCATED_FILES;
+			}
+
+			g_ptr_array_add(arch->files, f);
+			msg_debug_archive("found file in zip archive: %v", f->fname);
+		}
+		else {
+			g_free(f);
+
+			return;
+		}
+
+		/* Process extra fields */
+		const guchar *extra = cd + fname_len + cd_basic_len;
+		p = extra;
+
+		while (p + sizeof(guint16) * 2 < extra + extra_len) {
+			guint16 hid, hlen;
+
+			memcpy(&hid, p, sizeof(guint16));
+			hid = GUINT16_FROM_LE(hid);
+			memcpy(&hlen, p + sizeof(guint16), sizeof(guint16));
+			hlen = GUINT16_FROM_LE(hlen);
+
+			/* Header id 0x0017 is treated as an encryption marker */
+			if (hid == 0x0017) {
+				f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
+			}
+
+			p += hlen + sizeof(guint16) * 2;
+		}
+
+		cd += record_len;
+	}
+
+	part->part_type = RSPAMD_MIME_PART_ARCHIVE;
+	part->specific.arch = arch;
+
+	arch->size = part->parsed_data.len;
+}
+
+/**
+ * Decodes a RAR5 variable length integer.
+ *
+ * @param start  first byte of the encoded value
+ * @param remain number of readable bytes at `start`
+ * @param res    receives the decoded value on success
+ * @return number of bytes consumed, or -1 when the buffer ends before the
+ *         value is terminated or the encoding is longer than 9 bytes
+ */
+static inline gint
+rspamd_archive_rar_read_vint(const guchar *start, gsize remain, guint64 *res)
+{
+	/*
+	 * From http://www.rarlab.com/technote.htm:
+	 * Variable length integer. Can include one or more bytes, where
+	 * lower 7 bits of every byte contain integer data and highest bit
+	 * in every byte is the continuation flag.
+	 * If highest bit is 0, this is the last byte in sequence.
+	 * So first byte contains 7 least significant bits of integer and
+	 * continuation flag. Second byte, if present, contains next 7 bits and so on.
+	 */
+	guint64 t = 0;
+	guint shift = 0;
+	const guchar *p = start;
+
+	while (remain > 0 && shift <= 57) {
+		guchar c = *p++;
+
+		t |= ((guint64) (c & 0x7f)) << shift;
+
+		if (!(c & 0x80)) {
+			/*
+			 * The value is assembled arithmetically, so it is already in
+			 * host byte order and needs no endianness conversion.
+			 */
+			*res = t;
+
+			return p - start;
+		}
+
+		shift += 7;
+		remain--;
+	}
+
+	/* Either ran out of input or the sequence never terminated */
+	return -1;
+}
+/*
+ * Helper macros for the RAR parsers below. All of them expect `p` (read
+ * cursor), `end` (one past the last readable byte) and `task` (for logging)
+ * in the calling scope, and all of them `return;` from the calling function
+ * on error, so they are only usable inside functions returning void.
+ *
+ * RAR_SKIP_BYTES(n): advance `p` by n bytes; fails when n <= 0 or when fewer
+ * than n bytes remain before `end`.
+ */
+#define RAR_SKIP_BYTES(n) \
+ do { \
+ if ((n) <= 0) { \
+ msg_debug_archive("rar archive is invalid (bad skip value)"); \
+ return; \
+ } \
+ if ((gsize) (end - p) < (n)) { \
+ msg_debug_archive("rar archive is invalid (truncated)"); \
+ return; \
+ } \
+ p += (n); \
+ } while (0)
+/*
+ * RAR_READ_VINT(): decode a vint at `p` into `vint` without advancing `p`;
+ * the byte count is left in `r` (both variables must exist in the caller).
+ * Fails when the decoder returns -1 or 0.
+ */
+#define RAR_READ_VINT() \
+ do { \
+ r = rspamd_archive_rar_read_vint(p, end - p, &vint); \
+ if (r == -1) { \
+ msg_debug_archive("rar archive is invalid (bad vint)"); \
+ return; \
+ } \
+ else if (r == 0) { \
+ msg_debug_archive("rar archive is invalid (BAD vint offset)"); \
+ return; \
+ } \
+ } while (0)
+/*
+ * RAR_READ_VINT_SKIP(): decode a vint at `p` into `vint` and advance `p`
+ * past it; uses `r` from the calling scope for the byte count.
+ */
+#define RAR_READ_VINT_SKIP() \
+ do { \
+ r = rspamd_archive_rar_read_vint(p, end - p, &vint); \
+ if (r == -1) { \
+ msg_debug_archive("rar archive is invalid (bad vint)"); \
+ return; \
+ } \
+ p += r; \
+ } while (0)
+/*
+ * RAR_READ_UINT16(n): read two bytes at `p` as a little-endian value into n
+ * and advance `p` by two.
+ */
+#define RAR_READ_UINT16(n) \
+ do { \
+ if (end - p < (glong) sizeof(guint16)) { \
+ msg_debug_archive("rar archive is invalid (bad int16)"); \
+ return; \
+ } \
+ n = p[0] + (p[1] << 8); \
+ p += sizeof(guint16); \
+ } while (0)
+/*
+ * RAR_READ_UINT32(n): read four bytes at `p` as a little-endian value into n
+ * and advance `p` by four.
+ */
+#define RAR_READ_UINT32(n) \
+ do { \
+ if (end - p < (glong) sizeof(guint32)) { \
+ msg_debug_archive("rar archive is invalid (bad int32)"); \
+ return; \
+ } \
+ n = (guint) p[0] + ((guint) p[1] << 8) + ((guint) p[2] << 16) + ((guint) p[3] << 24); \
+ p += sizeof(guint32); \
+ } while (0)
+
+/**
+ * Parses the block list of a RAR v4 archive.
+ *
+ * Each block starts with a 2 byte CRC, a 1 byte type, 2 bytes of flags and a
+ * 2 byte size (extended by a 4 byte ADD_SIZE when flag 0x8000 is set). File
+ * header blocks (type 0x74) yield one file entry each; a main header
+ * (type 0x73) with flag 0x80 marks the whole archive as encrypted and stops
+ * the scan. Malformed data makes the function return without marking the
+ * part as an archive.
+ *
+ * @param task  task providing the memory pool and the logging context
+ * @param start first byte after the RAR v4 signature
+ * @param end   one past the last byte of the archive data
+ * @param part  MIME part that receives the parsed archive description
+ */
+static void
+rspamd_archive_process_rar_v4(struct rspamd_task *task, const guchar *start,
+							  const guchar *end, struct rspamd_mime_part *part)
+{
+	const guchar *p = start, *start_section;
+	guint8 type;
+	guint flags;
+	guint64 sz, comp_sz = 0, uncomp_sz = 0;
+	struct rspamd_archive *arch;
+	struct rspamd_archive_file *f;
+
+	arch = rspamd_mempool_alloc0(task->task_pool, sizeof(*arch));
+	arch->files = g_ptr_array_new();
+	arch->type = RSPAMD_ARCHIVE_RAR;
+	if (part->cd) {
+		arch->archive_name = &part->cd->filename;
+	}
+	rspamd_mempool_add_destructor(task->task_pool, rspamd_archive_dtor,
+								  arch);
+
+	while (p < end) {
+		/* Crc16 */
+		start_section = p;
+		RAR_SKIP_BYTES(sizeof(guint16));
+		type = *p;
+		p++;
+		RAR_READ_UINT16(flags);
+
+		if (type == 0x73) {
+			/* Main header, check for encryption */
+			if (flags & 0x80) {
+				arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
+				goto end;
+			}
+		}
+
+		RAR_READ_UINT16(sz);
+
+		if (flags & 0x8000) {
+			/* We also need to read ADD_SIZE element */
+			guint32 tmp;
+
+			RAR_READ_UINT32(tmp);
+			sz += tmp;
+			/* This is also used as PACK_SIZE */
+			comp_sz = tmp;
+		}
+
+		if (sz == 0) {
+			/* Zero sized block - error */
+			msg_debug_archive("rar archive is invalid (zero size block)");
+
+			return;
+		}
+
+		if (type == 0x74) {
+			guint fname_len;
+
+			/* File header */
+			/* Uncompressed size */
+			RAR_READ_UINT32(uncomp_sz);
+			/* Skip to NAME_SIZE element */
+			RAR_SKIP_BYTES(11);
+			RAR_READ_UINT16(fname_len);
+
+			if (fname_len == 0 || fname_len > (gsize) (end - p)) {
+				msg_debug_archive("rar archive is invalid (bad filename size: %d)",
+								  fname_len);
+
+				return;
+			}
+
+			/* Attrs */
+			RAR_SKIP_BYTES(4);
+
+			if (flags & 0x100) {
+				/* We also need to read HIGH_PACK_SIZE */
+				guint32 tmp;
+
+				RAR_READ_UINT32(tmp);
+				sz += tmp;
+				comp_sz += tmp;
+				/* HIGH_UNP_SIZE */
+				RAR_READ_UINT32(tmp);
+				uncomp_sz += tmp;
+			}
+
+			/*
+			 * The cursor moved by up to 12 bytes since the first length
+			 * check, so verify again that the whole name is readable.
+			 */
+			if (fname_len > (gsize) (end - p)) {
+				msg_debug_archive("rar archive is invalid (bad filename size: %d)",
+								  fname_len);
+
+				return;
+			}
+
+			f = g_malloc0(sizeof(*f));
+
+			if (flags & 0x200) {
+				/* We have unicode + normal version */
+				const guchar *tmp;
+
+				tmp = memchr(p, '\0', fname_len);
+
+				if (tmp != NULL) {
+					/* Just use ASCII version */
+					rspamd_archive_file_try_utf(task, arch, f, p, tmp - p);
+					msg_debug_archive("found ascii filename in rarv4 archive: %v",
+									  f->fname);
+				}
+				else {
+					/* We have UTF8 filename, use it as is */
+					rspamd_archive_file_try_utf(task, arch, f, p, fname_len);
+					msg_debug_archive("found utf filename in rarv4 archive: %v",
+									  f->fname);
+				}
+			}
+			else {
+				rspamd_archive_file_try_utf(task, arch, f, p, fname_len);
+				msg_debug_archive("found ascii (old) filename in rarv4 archive: %v",
+								  f->fname);
+			}
+
+			f->compressed_size = comp_sz;
+			f->uncompressed_size = uncomp_sz;
+
+			if (flags & 0x4) {
+				f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
+			}
+
+			if (f->fname) {
+				if (f->flags & RSPAMD_ARCHIVE_FILE_OBFUSCATED) {
+					arch->flags |= RSPAMD_ARCHIVE_HAS_OBFUSCATED_FILES;
+				}
+				g_ptr_array_add(arch->files, f);
+			}
+			else {
+				g_free(f);
+			}
+		}
+
+		/* Jump to the next block relative to the start of this one */
+		p = start_section;
+		RAR_SKIP_BYTES(sz);
+	}
+
+end:
+	part->part_type = RSPAMD_MIME_PART_ARCHIVE;
+	part->specific.arch = arch;
+	arch->size = part->parsed_data.len;
+}
+
+/**
+ * Parses a RAR archive stored in a MIME part.
+ *
+ * The signature decides the format: v4 data is handed over to
+ * rspamd_archive_process_rar_v4(), v5 data is parsed here. For v5 the first
+ * header must be either an encryption header (the archive is then flagged as
+ * encrypted and nothing else is read) or the main header; after that every
+ * file header contributes one file entry, and the extra area of a file
+ * header is scanned for an encryption record. Malformed data makes the
+ * function return without marking the part as an archive.
+ *
+ * @param task task providing the memory pool and the logging context
+ * @param part MIME part whose parsed_data holds the candidate rar file
+ */
+static void
+rspamd_archive_process_rar(struct rspamd_task *task,
+						   struct rspamd_mime_part *part)
+{
+	const guchar *p, *end, *section_start;
+	const guchar rar_v5_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x01, 0x00},
+				 rar_v4_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x00};
+	const guint rar_encrypted_header = 4, rar_main_header = 1,
+				rar_file_header = 2;
+	guint64 vint, sz, comp_sz = 0, uncomp_sz = 0, flags = 0, type = 0,
+			extra_sz = 0;
+	struct rspamd_archive *arch;
+	struct rspamd_archive_file *f;
+	gint r;
+
+	p = part->parsed_data.begin;
+	end = p + part->parsed_data.len;
+
+	if ((gsize) (end - p) <= sizeof(rar_v5_magic)) {
+		msg_debug_archive("rar archive is invalid (too small)");
+
+		return;
+	}
+
+	if (memcmp(p, rar_v5_magic, sizeof(rar_v5_magic)) == 0) {
+		p += sizeof(rar_v5_magic);
+	}
+	else if (memcmp(p, rar_v4_magic, sizeof(rar_v4_magic)) == 0) {
+		p += sizeof(rar_v4_magic);
+
+		rspamd_archive_process_rar_v4(task, p, end, part);
+		return;
+	}
+	else {
+		msg_debug_archive("rar archive is invalid (no rar magic)");
+
+		return;
+	}
+
+	/* Rar v5 format */
+	arch = rspamd_mempool_alloc0(task->task_pool, sizeof(*arch));
+	arch->files = g_ptr_array_new();
+	arch->type = RSPAMD_ARCHIVE_RAR;
+	if (part->cd) {
+		arch->archive_name = &part->cd->filename;
+	}
+	rspamd_mempool_add_destructor(task->task_pool, rspamd_archive_dtor,
+								  arch);
+
+	/* Now we can have either encryption header or archive header */
+	/* Crc 32 */
+	RAR_SKIP_BYTES(sizeof(guint32));
+	/* Size */
+	RAR_READ_VINT_SKIP();
+	sz = vint;
+	/* Type */
+	section_start = p;
+	RAR_READ_VINT_SKIP();
+	type = vint;
+	/* Header flags */
+	RAR_READ_VINT_SKIP();
+	flags = vint;
+
+	if (flags & 0x1) {
+		/* Have extra zone */
+		RAR_READ_VINT_SKIP();
+	}
+	if (flags & 0x2) {
+		/* Data zone is presented */
+		RAR_READ_VINT_SKIP();
+		sz += vint;
+	}
+
+	if (type == rar_encrypted_header) {
+		/* We can't read any further information as archive is encrypted */
+		arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
+		goto end;
+	}
+	else if (type != rar_main_header) {
+		msg_debug_archive("rar archive is invalid (bad main header)");
+
+		return;
+	}
+
+	/* Nothing useful in main header */
+	p = section_start;
+	RAR_SKIP_BYTES(sz);
+
+	while (p < end) {
+		gboolean has_extra = FALSE;
+		/* Read the next header */
+		/* Crc 32 */
+		RAR_SKIP_BYTES(sizeof(guint32));
+		/* Size */
+		RAR_READ_VINT_SKIP();
+
+		sz = vint;
+		if (sz == 0) {
+			/* Zero sized block - error */
+			msg_debug_archive("rar archive is invalid (zero size block)");
+
+			return;
+		}
+
+		section_start = p;
+		/* Type */
+		RAR_READ_VINT_SKIP();
+		type = vint;
+		/* Header flags */
+		RAR_READ_VINT_SKIP();
+		flags = vint;
+
+		if (flags & 0x1) {
+			/* Have extra zone */
+			RAR_READ_VINT_SKIP();
+			extra_sz = vint;
+			has_extra = TRUE;
+		}
+
+		if (flags & 0x2) {
+			/* Data zone is presented */
+			RAR_READ_VINT_SKIP();
+			sz += vint;
+			comp_sz = vint;
+		}
+
+		if (type != rar_file_header) {
+			p = section_start;
+			RAR_SKIP_BYTES(sz);
+		}
+		else {
+			/* We have a file header, go forward */
+			guint64 fname_len;
+			bool is_directory = false;
+
+			/* File header specific flags */
+			RAR_READ_VINT_SKIP();
+			flags = vint;
+
+			/* Unpacked size */
+			RAR_READ_VINT_SKIP();
+			uncomp_sz = vint;
+			/* Attributes */
+			RAR_READ_VINT_SKIP();
+
+			if (flags & 0x2) {
+				/* Unix mtime */
+				RAR_SKIP_BYTES(sizeof(guint32));
+			}
+			if (flags & 0x4) {
+				/* Crc32 */
+				RAR_SKIP_BYTES(sizeof(guint32));
+			}
+			if (flags & 0x1) {
+				/* Ignore directories for sanity purposes */
+				is_directory = true;
+				msg_debug_archive("skip directory record in a rar archive");
+			}
+
+			if (!is_directory) {
+				/* Compression */
+				RAR_READ_VINT_SKIP();
+				/* Host OS */
+				RAR_READ_VINT_SKIP();
+				/* Filename length (finally!) */
+				RAR_READ_VINT_SKIP();
+				fname_len = vint;
+
+				if (fname_len == 0 || fname_len > (gsize) (end - p)) {
+					msg_debug_archive("rar archive is invalid (bad filename size)");
+
+					return;
+				}
+
+				f = g_malloc0(sizeof(*f));
+				f->uncompressed_size = uncomp_sz;
+				f->compressed_size = comp_sz;
+				rspamd_archive_file_try_utf(task, arch, f, p, fname_len);
+
+				if (f->fname) {
+					msg_debug_archive("added rarv5 file: %v", f->fname);
+					g_ptr_array_add(arch->files, f);
+					if (f->flags & RSPAMD_ARCHIVE_FILE_OBFUSCATED) {
+						arch->flags |= RSPAMD_ARCHIVE_HAS_OBFUSCATED_FILES;
+					}
+				}
+				else {
+					g_free(f);
+					f = NULL;
+				}
+
+				/*
+				 * The extra area follows the file name. Compare sizes
+				 * rather than pointers: extra_sz comes from the file and
+				 * adding it to a pointer could overflow.
+				 */
+				if (f && has_extra && extra_sz > 0 &&
+					extra_sz < (guint64) (end - p) - fname_len) {
+					/* Try to find encryption record in extra field */
+					const guchar *ex = p + fname_len;
+					const guchar *ex_end = ex + extra_sz;
+
+					while (ex < ex_end) {
+						const guchar *t;
+						guint64 cur_sz = 0, sec_type = 0;
+
+						/* Record size, counted from the type field */
+						r = rspamd_archive_rar_read_vint(ex, ex_end - ex, &cur_sz);
+						if (r == -1) {
+							msg_debug_archive("rar archive is invalid (bad vint)");
+							return;
+						}
+
+						t = ex + r;
+
+						r = rspamd_archive_rar_read_vint(t, ex_end - t, &sec_type);
+						if (r == -1) {
+							msg_debug_archive("rar archive is invalid (bad vint)");
+							return;
+						}
+
+						if (sec_type == 0x01) {
+							f->flags |= RSPAMD_ARCHIVE_FILE_ENCRYPTED;
+							arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
+							break;
+						}
+
+						if (cur_sz == 0 || cur_sz > (guint64) (ex_end - t)) {
+							/*
+							 * A record that makes no progress or runs past
+							 * the extra area: stop scanning instead of
+							 * looping forever or reading foreign data.
+							 */
+							break;
+						}
+
+						/* Next record starts after the size field + data */
+						ex = t + cur_sz;
+					}
+				}
+			}
+
+			/* Restore p to the beginning of the header */
+			p = section_start;
+			RAR_SKIP_BYTES(sz);
+		}
+	}
+
+end:
+	part->part_type = RSPAMD_MIME_PART_ARCHIVE;
+	part->specific.arch = arch;
+	arch->size = part->parsed_data.len;
+}
+
+/**
+ * Decodes a 7zip variable length UINT64.
+ *
+ * @param start  first byte of the encoded value
+ * @param remain number of readable bytes at `start`
+ * @param res    receives the decoded value on success
+ * @return number of bytes consumed (1..9), or -1 when the buffer is too
+ *         short for the length announced by the first byte
+ */
+static inline gint
+rspamd_archive_7zip_read_vint(const guchar *start, gsize remain, guint64 *res)
+{
+	/*
+	 * REAL_UINT64 means real UINT64.
+	 * UINT64 means real UINT64 encoded with the following scheme:
+	 *
+	 * Size of encoding sequence depends from first byte:
+	 * First_Byte Extra_Bytes Value
+	 * (binary)
+	 * 0xxxxxxx : ( xxxxxxx )
+	 * 10xxxxxx BYTE y[1] : ( xxxxxx << (8 * 1)) + y
+	 * 110xxxxx BYTE y[2] : ( xxxxx << (8 * 2)) + y
+	 * ...
+	 * 1111110x BYTE y[6] : ( x << (8 * 6)) + y
+	 * 11111110 BYTE y[7] : y
+	 * 11111111 BYTE y[8] : y
+	 */
+	guchar first, mask = 0x80;
+	guint64 value = 0;
+	guint nextra;
+
+	if (remain == 0) {
+		return -1;
+	}
+
+	first = *start;
+
+	/*
+	 * Every leading 1 bit of the first byte announces one extra byte; the
+	 * extra bytes are little-endian and the bits of the first byte below
+	 * the terminating 0 bit form the most significant part.
+	 */
+	for (nextra = 0; nextra < sizeof(guint64); nextra++) {
+		if ((first & mask) == 0) {
+			value |= ((guint64) (first & (mask - 1))) << (NBBY * nextra);
+			*res = value;
+
+			return nextra + 1;
+		}
+
+		if (remain < (gsize) nextra + 2) {
+			/* The announced extra byte is not available */
+			return -1;
+		}
+
+		value |= ((guint64) start[nextra + 1]) << (NBBY * nextra);
+		mask >>= 1;
+	}
+
+	/* First byte was 0xFF: the value is the eight extra bytes */
+	*res = value;
+
+	return sizeof(guint64) + 1;
+}
+/*
+ * Helper macros for the 7zip parsers below. They expect `p` (read cursor),
+ * `end` (one past the last readable byte) and `task` (for logging) in the
+ * calling scope and leave the calling function on error.
+ *
+ * SZ_READ_VINT_SKIP(): decode a 7zip number at `p` into `vint`, using `r`
+ * from the calling scope, and advance `p`; on error it executes `return;`,
+ * so it fits functions returning void.
+ * SZ_READ_VINT(var): same, but stores into `var`, uses its own local `r`
+ * and executes `return NULL;`, so it fits functions returning a pointer.
+ */
+#define SZ_READ_VINT_SKIP() \
+ do { \
+ r = rspamd_archive_7zip_read_vint(p, end - p, &vint); \
+ if (r == -1) { \
+ msg_debug_archive("7z archive is invalid (bad vint)"); \
+ return; \
+ } \
+ p += r; \
+ } while (0)
+#define SZ_READ_VINT(var) \
+ do { \
+ int r; \
+ r = rspamd_archive_7zip_read_vint(p, end - p, &(var)); \
+ if (r == -1) { \
+ msg_debug_archive("7z archive is invalid (bad vint): %s", G_STRLOC); \
+ return NULL; \
+ } \
+ p += r; \
+ } while (0)
+/*
+ * SZ_READ_UINT64(n): read eight bytes at `p` as a little-endian value into n
+ * and advance `p`; executes `return;` on error (void functions).
+ * SZ_SKIP_BYTES(n): advance `p` by n bytes when that many are available;
+ * otherwise log and execute `return NULL;` (pointer-returning functions).
+ */
+#define SZ_READ_UINT64(n) \
+ do { \
+ if (end - p < (goffset) sizeof(guint64)) { \
+ msg_debug_archive("7zip archive is invalid (bad uint64): %s", G_STRLOC); \
+ return; \
+ } \
+ memcpy(&(n), p, sizeof(guint64)); \
+ n = GUINT64_FROM_LE(n); \
+ p += sizeof(guint64); \
+ } while (0)
+#define SZ_SKIP_BYTES(n) \
+ do { \
+ if (end - p >= (n)) { \
+ p += (n); \
+ } \
+ else { \
+ msg_debug_archive("7zip archive is invalid (truncated); wanted to read %d bytes, %d avail: %s", (gint) (n), (gint) (end - p), G_STRLOC); \
+ return NULL; \
+ } \
+ } while (0)
+/*
+ * One-byte markers read from a 7zip header stream. The parsers below read a
+ * byte and switch on these values to decide which section follows; kEnd
+ * terminates the current section.
+ */
+enum rspamd_7zip_header_mark {
+ kEnd = 0x00,
+ kHeader = 0x01,
+ kArchiveProperties = 0x02,
+ kAdditionalStreamsInfo = 0x03,
+ kMainStreamsInfo = 0x04,
+ kFilesInfo = 0x05,
+ kPackInfo = 0x06,
+ kUnPackInfo = 0x07,
+ kSubStreamsInfo = 0x08,
+ kSize = 0x09,
+ kCRC = 0x0A,
+ kFolder = 0x0B,
+ kCodersUnPackSize = 0x0C,
+ kNumUnPackStream = 0x0D,
+ kEmptyStream = 0x0E,
+ kEmptyFile = 0x0F,
+ kAnti = 0x10,
+ kName = 0x11,
+ kCTime = 0x12,
+ kATime = 0x13,
+ kMTime = 0x14,
+ kWinAttributes = 0x15,
+ kComment = 0x16,
+ kEncodedHeader = 0x17,
+ kStartPos = 0x18,
+ kDummy = 0x19,
+};
+
+/* 7zip codec identifiers that are treated as encryption methods */
+#define _7Z_CRYPTO_MAIN_ZIP 0x06F10101 /* Main Zip crypto algo */
+#define _7Z_CRYPTO_RAR_29 0x06F10303 /* Rar29 AES-128 + (modified SHA-1) */
+#define _7Z_CRYPTO_AES_256_SHA_256 0x06F10701 /* AES-256 + SHA-256 */
+/* True when codec_id equals one of the crypto codec identifiers above */
+#define IS_SZ_ENCRYPTED(codec_id) (((codec_id) == _7Z_CRYPTO_MAIN_ZIP) || \
+ ((codec_id) == _7Z_CRYPTO_RAR_29) || \
+ ((codec_id) == _7Z_CRYPTO_AES_256_SHA_256))
+
+/**
+ * Reads a bit vector of `nbits` bits (most significant bit of each byte
+ * first) and counts the bits that are set.
+ *
+ * @param task      task providing the logging context
+ * @param p         read cursor
+ * @param end       one past the last readable byte
+ * @param arch      archive being parsed (not used here)
+ * @param nbits     number of bits to read
+ * @param pbits_set when not NULL, incremented once per set bit
+ * @return the cursor after the bit vector, or NULL when the data is truncated
+ */
+static const guchar *
+rspamd_7zip_read_bits(struct rspamd_task *task,
+					  const guchar *p, const guchar *end,
+					  struct rspamd_archive *arch, guint nbits,
+					  guint *pbits_set)
+{
+	unsigned mask = 0, avail = 0, i;
+	gboolean bit_set = 0;
+
+	for (i = 0; i < nbits; i++) {
+		if (mask == 0) {
+			/*
+			 * Let the macro verify that a byte is available before it is
+			 * read; the byte just skipped is then at p[-1].
+			 */
+			SZ_SKIP_BYTES(1);
+			avail = p[-1];
+			mask = 0x80;
+		}
+
+		bit_set = (avail & mask) ? 1 : 0;
+
+		if (bit_set && pbits_set) {
+			(*pbits_set)++;
+		}
+
+		mask >>= 1;
+	}
+
+	return p;
+}
+
+/**
+ * Skips a 7zip digests section: an "all defined" byte, an optional bit
+ * vector saying which streams have a CRC, and one 32 bit CRC per defined
+ * stream.
+ *
+ * @param task         task providing the logging context
+ * @param p            read cursor
+ * @param end          one past the last readable byte
+ * @param arch         archive being parsed (passed through to helpers)
+ * @param num_streams  number of streams the section describes
+ * @param pdigest_read when not NULL, receives the number of CRCs skipped
+ * @return the cursor after the section, or NULL on truncated or implausible
+ *         data
+ */
+static const guchar *
+rspamd_7zip_read_digest(struct rspamd_task *task,
+						const guchar *p, const guchar *end,
+						struct rspamd_archive *arch,
+						guint64 num_streams,
+						guint *pdigest_read)
+{
+	guchar all_defined;
+	guint64 i;
+	guint num_defined = 0;
+	/*
+	 * BYTE AllAreDefined
+	 * if (AllAreDefined == 0)
+	 * {
+	 * for(NumStreams)
+	 * BIT Defined
+	 * }
+	 * UINT32 CRCs[NumDefined]
+	 */
+	/* Bounds are checked first; the byte just skipped is then at p[-1] */
+	SZ_SKIP_BYTES(1);
+	all_defined = p[-1];
+
+	if (all_defined) {
+		num_defined = num_streams;
+	}
+	else {
+		if (num_streams > 8192) {
+			/* Gah */
+			return NULL;
+		}
+
+		p = rspamd_7zip_read_bits(task, p, end, arch, num_streams, &num_defined);
+
+		if (p == NULL) {
+			return NULL;
+		}
+	}
+
+	for (i = 0; i < num_defined; i++) {
+		SZ_SKIP_BYTES(sizeof(guint32));
+	}
+
+	if (pdigest_read) {
+		*pdigest_read = num_defined;
+	}
+
+	return p;
+}
+
+/**
+ * Walks over a 7zip PackInfo section. Nothing from the section is stored;
+ * the function only moves the cursor past it.
+ *
+ * Layout:
+ *   UINT64 PackPos
+ *   UINT64 NumPackStreams
+ *   [ BYTE NID::kSize (0x09), UINT64 PackSizes[NumPackStreams] ]
+ *   [ BYTE NID::kCRC (0x0A), PackStreamDigests[NumPackStreams] ]
+ *   BYTE NID::kEnd
+ *
+ * @param task task providing the logging context
+ * @param p    read cursor
+ * @param end  one past the last readable byte
+ * @param arch archive being parsed (passed through to helpers)
+ * @return the cursor after the section, or NULL on malformed data
+ */
+static const guchar *
+rspamd_7zip_read_pack_info(struct rspamd_task *task,
+						   const guchar *p, const guchar *end,
+						   struct rspamd_archive *arch)
+{
+	guint64 first_pack_pos = 0, npack_streams = 0, stream_size;
+	guint digests_seen = 0;
+
+	SZ_READ_VINT(first_pack_pos);
+	SZ_READ_VINT(npack_streams);
+
+	for (;;) {
+		guchar marker;
+
+		if (p == NULL || p >= end) {
+			break;
+		}
+
+		marker = *p;
+		SZ_SKIP_BYTES(1);
+		msg_debug_archive("7zip: read pack info %xc", marker);
+
+		if (marker == kSize) {
+			/* One size per pack stream, all of them ignored */
+			for (guint64 n = 0; n < npack_streams; n++) {
+				SZ_READ_VINT(stream_size);
+			}
+		}
+		else if (marker == kCRC) {
+			/* Digests have their own sub-format */
+			p = rspamd_7zip_read_digest(task, p, end, arch, npack_streams,
+										&digests_seen);
+		}
+		else if (marker == kEnd) {
+			return p;
+		}
+		else {
+			msg_debug_archive("bad 7zip type: %xc; %s", marker, G_STRLOC);
+
+			return NULL;
+		}
+	}
+
+	return p;
+}
+
+/**
+ * Reads one 7zip folder description: its coders, bind pairs and packed
+ * stream indexes. A coder whose codec id is one of the known crypto codecs
+ * marks the archive as encrypted.
+ *
+ * @param task      task providing the logging context
+ * @param p         read cursor
+ * @param end       one past the last readable byte
+ * @param arch      archive being parsed; flags may get
+ *                  RSPAMD_ARCHIVE_ENCRYPTED
+ * @param pnstreams receives the number of output streams of the folder
+ * @param ndigests  incremented by the number of packed streams
+ * @return the cursor after the folder, or NULL on truncated data
+ */
+static const guchar *
+rspamd_7zip_read_folder(struct rspamd_task *task,
+						const guchar *p, const guchar *end,
+						struct rspamd_archive *arch, guint *pnstreams, guint *ndigests)
+{
+	guint64 ncoders = 0, i, j, noutstreams = 0, ninstreams = 0;
+
+	SZ_READ_VINT(ncoders);
+
+	for (i = 0; i < ncoders && p != NULL && p < end; i++) {
+		guint64 sz, tmp;
+		guchar t;
+		const guchar *codec_id;
+		/*
+		 * BYTE
+		 * {
+		 * 0:3 CodecIdSize
+		 * 4: Is Complex Coder
+		 * 5: There Are Attributes
+		 * 6: Reserved
+		 * 7: There are more alternative methods. (Not used anymore, must be 0).
+		 * }
+		 * BYTE CodecId[CodecIdSize]
+		 * if (Is Complex Coder)
+		 * {
+		 * UINT64 NumInStreams;
+		 * UINT64 NumOutStreams;
+		 * }
+		 * if (There Are Attributes)
+		 * {
+		 * UINT64 PropertiesSize
+		 * BYTE Properties[PropertiesSize]
+		 * }
+		 */
+		t = *p;
+		SZ_SKIP_BYTES(1);
+		sz = t & 0xF;
+
+		/*
+		 * Codec ID: make sure all of its bytes are available before they
+		 * are read, then decode them from the saved position.
+		 */
+		codec_id = p;
+		SZ_SKIP_BYTES(sz);
+
+		tmp = 0;
+		for (j = 0; j < sz; j++) {
+			tmp <<= 8;
+			tmp += codec_id[j];
+		}
+
+		msg_debug_archive("7zip: read codec id: %L", tmp);
+
+		if (IS_SZ_ENCRYPTED(tmp)) {
+			arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
+		}
+
+		if (t & (1u << 4)) {
+			/* Complex */
+			SZ_READ_VINT(tmp); /* InStreams */
+			ninstreams += tmp;
+			SZ_READ_VINT(tmp); /* OutStreams */
+			noutstreams += tmp;
+		}
+		else {
+			/* XXX: is it correct ? */
+			noutstreams++;
+			ninstreams++;
+		}
+		if (t & (1u << 5)) {
+			/* Attributes ... */
+			SZ_READ_VINT(tmp); /* Size of attrs */
+			SZ_SKIP_BYTES(tmp);
+		}
+	}
+
+	if (noutstreams > 1) {
+		/* BindPairs: two numbers per pair, values are not needed */
+		for (i = 0; i < noutstreams - 1; i++) {
+			guint64 tmp;
+
+			SZ_READ_VINT(tmp);
+			SZ_READ_VINT(tmp);
+		}
+	}
+
+	gint64 npacked = (gint64) ninstreams - (gint64) noutstreams + 1;
+	msg_debug_archive("7zip: instreams=%L, outstreams=%L, packed=%L",
+					  ninstreams, noutstreams, npacked);
+
+	if (npacked > 1) {
+		/* Packed stream indexes are only listed when there are several */
+		for (i = 0; i < npacked; i++) {
+			guint64 tmp;
+
+			SZ_READ_VINT(tmp);
+		}
+	}
+
+	*pnstreams = noutstreams;
+	(*ndigests) += npacked;
+
+	return p;
+}
+
+/**
+ * Walks over a 7zip coders (UnPackInfo) section and reports how many folders
+ * it describes and how many digests were announced but not present.
+ *
+ * The per-folder stream counts are kept in task pool memory, so the early
+ * returns hidden in the SZ_* macros cannot leak them.
+ *
+ * @param task          task providing the memory pool and logging context
+ * @param p             read cursor
+ * @param end           one past the last readable byte
+ * @param arch          archive being parsed (passed through to helpers)
+ * @param pnum_folders  when not NULL, receives the number of folders
+ * @param pnum_nodigest when not NULL, receives the number of expected
+ *                      digests minus the number actually read
+ * @return the cursor after the section, or NULL on malformed data; the
+ *         output parameters are not written when a SZ_* macro bails out
+ */
+static const guchar *
+rspamd_7zip_read_coders_info(struct rspamd_task *task,
+							 const guchar *p, const guchar *end,
+							 struct rspamd_archive *arch,
+							 guint *pnum_folders, guint *pnum_nodigest)
+{
+	guint64 num_folders = 0, i, tmp;
+	guchar t, external;
+	guint *folder_nstreams = NULL, num_digests = 0, digests_read = 0;
+
+	while (p != NULL && p < end) {
+		/*
+		 * BYTE NID::kFolder (0x0B)
+		 * UINT64 NumFolders
+		 * BYTE External
+		 * switch(External)
+		 * {
+		 * case 0:
+		 * Folders[NumFolders]
+		 * case 1:
+		 * UINT64 DataStreamIndex
+		 * }
+		 * BYTE ID::kCodersUnPackSize (0x0C)
+		 * for(Folders)
+		 * for(Folder.NumOutStreams)
+		 * UINT64 UnPackSize;
+		 * []
+		 * BYTE NID::kCRC (0x0A)
+		 * UnPackDigests[NumFolders]
+		 * []
+		 * BYTE NID::kEnd
+		 */
+
+		t = *p;
+		SZ_SKIP_BYTES(1);
+		msg_debug_archive("7zip: read coders info %xc", t);
+
+		switch (t) {
+		case kFolder:
+			SZ_READ_VINT(num_folders);
+			msg_debug_archive("7zip: nfolders=%L", num_folders);
+
+			/*
+			 * Check that the External byte exists before reading it; the
+			 * byte just skipped is then at p[-1].
+			 */
+			SZ_SKIP_BYTES(1);
+			external = p[-1];
+
+			/* Counts from an earlier kFolder section are no longer valid */
+			folder_nstreams = NULL;
+
+			if (external != 0) {
+				/* External folders */
+				SZ_READ_VINT(tmp);
+			}
+			else {
+				if (num_folders > 8192) {
+					/* Gah */
+					return NULL;
+				}
+
+				if (num_folders > 0) {
+					folder_nstreams = rspamd_mempool_alloc0(task->task_pool,
+						sizeof(*folder_nstreams) * num_folders);
+				}
+
+				for (i = 0; i < num_folders && p != NULL && p < end; i++) {
+					p = rspamd_7zip_read_folder(task, p, end, arch,
+												&folder_nstreams[i], &num_digests);
+				}
+			}
+			break;
+		case kCodersUnPackSize:
+			if (folder_nstreams == NULL) {
+				/*
+				 * Without stream counts the sizes cannot be skipped; report
+				 * it once instead of once per announced folder.
+				 */
+				if (num_folders > 0) {
+					msg_err_task("internal 7zip error");
+				}
+				break;
+			}
+
+			for (i = 0; i < num_folders && p != NULL && p < end; i++) {
+				for (guint j = 0; j < folder_nstreams[i]; j++) {
+					SZ_READ_VINT(tmp); /* Unpacked size */
+					msg_debug_archive("7zip: unpacked size "
+									  "(folder=%d, stream=%d) = %L",
+									  (gint) i, j, tmp);
+				}
+			}
+			break;
+		case kCRC:
+			/*
+			 * The specification says that up to nfolders digests may
+			 * follow. According to the reference implementation, folders
+			 * with several out streams carry one digest per out stream,
+			 * and in practice the digests usually live in the substreams
+			 * section instead, which in turn needs the number of digests
+			 * that were not found here. That number is reported through
+			 * pnum_nodigest.
+			 */
+			p = rspamd_7zip_read_digest(task, p, end, arch, num_digests,
+										&digests_read);
+			break;
+		case kEnd:
+			goto end;
+			break;
+		default:
+			p = NULL;
+			msg_debug_archive("bad 7zip type: %xc; %s", t, G_STRLOC);
+			goto end;
+			break;
+		}
+	}
+
+end:
+
+	if (pnum_nodigest) {
+		*pnum_nodigest = num_digests - digests_read;
+	}
+	if (pnum_folders) {
+		*pnum_folders = num_folders;
+	}
+
+	return p;
+}
+
+/**
+ * Walks over a 7zip SubStreamsInfo section. Nothing from the section is
+ * kept; the function only moves the cursor past it.
+ *
+ * Layout:
+ *   [ BYTE NID::kNumUnPackStream (0x0D),
+ *     UINT64 NumUnPackStreamsInFolders[NumFolders] ]
+ *   [ BYTE NID::kSize (0x09), UINT64 UnPackSizes[??] ]
+ *   [ BYTE NID::kCRC (0x0A), Digests[Number of streams with unknown CRC] ]
+ *
+ * @param task         task providing the logging context
+ * @param p            read cursor
+ * @param end          one past the last readable byte
+ * @param arch         archive being parsed (passed through to helpers)
+ * @param num_folders  number of folders found in the coders section
+ * @param num_nodigest number of streams whose digest is still unknown
+ * @return the cursor after the section, or NULL on malformed data
+ */
+static const guchar *
+rspamd_7zip_read_substreams_info(struct rspamd_task *task,
+								 const guchar *p, const guchar *end,
+								 struct rspamd_archive *arch,
+								 guint num_folders, guint num_nodigest)
+{
+	guint64 *streams_per_folder;
+
+	if (num_folders > 8192) {
+		/* Refuse implausible folder counts before touching the stack */
+		return NULL;
+	}
+
+	streams_per_folder = g_alloca(sizeof(guint64) * num_folders);
+	memset(streams_per_folder, 0, sizeof(guint64) * num_folders);
+
+	while (p != NULL && p < end) {
+		guchar marker = *p;
+
+		SZ_SKIP_BYTES(1);
+
+		msg_debug_archive("7zip: read substream info %xc", marker);
+
+		if (marker == kNumUnPackStream) {
+			for (guint fi = 0; fi < num_folders; fi++) {
+				guint64 nstreams;
+
+				SZ_READ_VINT(nstreams);
+				streams_per_folder[fi] = nstreams;
+			}
+		}
+		else if (marker == kCRC) {
+			/* Same digest format as in the coders section */
+			p = rspamd_7zip_read_digest(task, p, end, arch, num_nodigest,
+										NULL);
+		}
+		else if (marker == kSize) {
+			/*
+			 * One size per stream of every folder, using the counts read
+			 * from kNumUnPackStream (zero when that section was absent).
+			 */
+			for (guint fi = 0; fi < num_folders; fi++) {
+				for (guint si = 0; si < streams_per_folder[fi]; si++) {
+					guint64 ignored_size;
+
+					SZ_READ_VINT(ignored_size); /* Who cares indeed */
+				}
+			}
+		}
+		else if (marker == kEnd) {
+			return p;
+		}
+		else {
+			msg_debug_archive("bad 7zip type: %xc; %s", marker, G_STRLOC);
+
+			return NULL;
+		}
+	}
+
+	return p;
+}
+
+/*
+ * Read a StreamsInfo block of a 7zip header:
+ *
+ *   [PackInfo] [CodersInfo] [SubStreamsInfo] BYTE NID::kEnd
+ *
+ * The folder count and the number of digests missing from the coders info
+ * are carried over to the substreams reader.
+ *
+ * Returns the cursor after the kEnd marker (or wherever the input ran out)
+ * and NULL when a nested reader fails or an unknown record id is met.
+ */
+static const guchar *
+rspamd_7zip_read_main_streams_info(struct rspamd_task *task,
+	const guchar *p, const guchar *end,
+	struct rspamd_archive *arch)
+{
+	guint folders_count = 0, missing_digests = 0;
+	guchar record_id;
+
+	for (; p != NULL && p < end;) {
+		record_id = *p;
+		SZ_SKIP_BYTES(1);
+		msg_debug_archive("7zip: read main streams info %xc", record_id);
+
+		if (record_id == kPackInfo) {
+			p = rspamd_7zip_read_pack_info(task, p, end, arch);
+		}
+		else if (record_id == kUnPackInfo) {
+			p = rspamd_7zip_read_coders_info(task, p, end, arch,
+				&folders_count, &missing_digests);
+		}
+		else if (record_id == kSubStreamsInfo) {
+			p = rspamd_7zip_read_substreams_info(task, p, end, arch,
+				folders_count, missing_digests);
+		}
+		else if (record_id == kEnd) {
+			return p;
+		}
+		else {
+			msg_debug_archive("bad 7zip type: %xc; %s", record_id, G_STRLOC);
+			return NULL;
+		}
+	}
+
+	return p;
+}
+
+/*
+ * Skip the ArchiveProperties section of a 7zip header:
+ *
+ * for (;;)
+ * {
+ *   BYTE PropertyType;
+ *   if (aType == 0)
+ *     break;
+ *   UINT64 PropertySize;
+ *   BYTE PropertyData[PropertySize];
+ * }
+ *
+ * The properties themselves are not interpreted, `arch` is left untouched.
+ *
+ * Returns the cursor after the terminating zero type byte, NULL when the
+ * section is truncated, and NULL unchanged when called with p == NULL.
+ */
+static const guchar *
+rspamd_7zip_read_archive_props(struct rspamd_task *task,
+	const guchar *p, const guchar *end,
+	struct rspamd_archive *arch)
+{
+	guchar proptype;
+	guint64 proplen;
+
+	if (p == NULL) {
+		return p;
+	}
+
+	if (p >= end) {
+		/* Nothing left to read: do not touch the byte past the buffer */
+		return NULL;
+	}
+
+	proptype = *p;
+	SZ_SKIP_BYTES(1);
+
+	while (proptype != 0) {
+		SZ_READ_VINT(proplen);
+
+		/*
+		 * proplen comes from the (untrusted) archive and is 64 bit wide,
+		 * so compare it with the remaining length instead of computing
+		 * `p + proplen`, which may overflow the pointer. Strictly less:
+		 * one more byte is needed for the next property type.
+		 */
+		if (proplen >= (guint64) (end - p)) {
+			return NULL;
+		}
+
+		p += proplen;
+
+		proptype = *p;
+		SZ_SKIP_BYTES(1);
+	}
+
+	return p;
+}
+
+/*
+ * Convert the UTF-16 file name stored in [p, end) to a newly allocated
+ * UTF-8 GString. An odd trailing byte is ignored and zero code points are
+ * dropped.
+ *
+ * The input is an arbitrary position inside the message body, so the code
+ * units are assembled byte by byte (little-endian, as 7zip stores names)
+ * into a temporary array instead of casting the byte pointer to guint16 *:
+ * the cast was a potentially misaligned access and decoded the name in host
+ * byte order.
+ *
+ * Returns NULL if the name cannot be encoded as UTF-8; the caller owns the
+ * returned string otherwise.
+ */
+static GString *
+rspamd_7zip_ucs2_to_utf8(struct rspamd_task *task, const guchar *p,
+	const guchar *end)
+{
+	GString *res;
+	goffset dest_pos = 0, src_pos = 0;
+	const gsize len = (end - p) / sizeof(guint16);
+	guint16 *up;
+	UChar32 wc;
+	UBool is_error = 0;
+
+	/* One UTF-16 unit (2 bytes) expands to at most 3 UTF-8 bytes */
+	res = g_string_sized_new((end - p) * 3 / 2 + sizeof(wc) + 1);
+
+	/* +1 keeps the allocation non-empty even for a zero length name */
+	up = g_malloc(sizeof(guint16) * (len + 1));
+
+	for (gsize i = 0; i < len; i++) {
+		up[i] = (guint16) (p[i * 2] | ((guint) p[i * 2 + 1] << 8));
+	}
+
+	while (src_pos < len) {
+		U16_NEXT(up, src_pos, len, wc);
+
+		if (wc > 0) {
+			U8_APPEND(res->str, dest_pos,
+				res->allocated_len - 1,
+				wc, is_error);
+		}
+
+		if (is_error) {
+			g_free(up);
+			g_string_free(res, TRUE);
+
+			return NULL;
+		}
+	}
+
+	g_free(up);
+
+	g_assert(dest_pos < res->allocated_len);
+
+	res->len = dest_pos;
+	res->str[dest_pos] = '\0';
+
+	return res;
+}
+
+/*
+ * Read the FilesInfo section of a 7zip header and append every file name
+ * found in the kName record to arch->files.
+ *
+ * The section starts with the number of files, followed by records of the
+ * form `BYTE type; UINT64 size; data`. All records except kName are skipped
+ * using their size. External names (stored elsewhere) are not supported:
+ * only their index is consumed.
+ *
+ * Returns the cursor after the kEnd marker (or at the point where parsing
+ * stopped because of a bad name), NULL on truncated data or an unknown
+ * record type.
+ */
+static const guchar *
+rspamd_7zip_read_files_info(struct rspamd_task *task,
+	const guchar *p, const guchar *end,
+	struct rspamd_archive *arch)
+{
+	guint64 nfiles = 0, sz, i;
+	guchar t, b;
+	struct rspamd_archive_file *fentry;
+
+	SZ_READ_VINT(nfiles);
+
+	for (; p != NULL && p < end;) {
+		t = *p;
+		SZ_SKIP_BYTES(1);
+
+		msg_debug_archive("7zip: read file data type %xc", t);
+
+		if (t == kEnd) {
+			goto end;
+		}
+
+		/* Unlike other sections, each record here is prefixed by its size */
+		SZ_READ_VINT(sz);
+
+		switch (t) {
+		case kEmptyStream:
+		case kEmptyFile:
+		case kAnti: /* AntiFile */
+			/* We don't care about these bits */
+		case kCTime:
+		case kATime:
+		case kMTime:
+			/* Not interesting either, but the payload has to be stepped over */
+			if (sz > 0) {
+				SZ_SKIP_BYTES(sz);
+			}
+			break;
+		case kName:
+			/*
+			 * The record may end right after its size field: make sure the
+			 * 'external' flag is really inside the buffer before reading it.
+			 */
+			if (p >= end) {
+				msg_debug_archive("bad 7zip name; %s", G_STRLOC);
+				p = NULL;
+				goto end;
+			}
+
+			b = *p; /* External flag */
+			SZ_SKIP_BYTES(1);
+
+			if (b) {
+				/* TODO: external filenames are not handled, skip the index */
+				guint64 tmp;
+
+				SZ_READ_VINT(tmp);
+			}
+			else {
+				for (i = 0; i < nfiles; i++) {
+					/* Names are zero terminated UTF-16 strings */
+					const guchar *fend = NULL, *tp = p;
+					GString *res;
+
+					/* First, find the two-byte terminator */
+					while (tp < end - 1) {
+						if (*tp == 0 && *(tp + 1) == 0) {
+							fend = tp;
+							break;
+						}
+
+						tp += 2;
+					}
+
+					if (fend == NULL || fend - p == 0) {
+						/* No terminator or an empty name: stop parsing */
+						msg_debug_archive("bad 7zip name; %s", G_STRLOC);
+						goto end;
+					}
+
+					res = rspamd_7zip_ucs2_to_utf8(task, p, fend);
+
+					if (res != NULL) {
+						fentry = g_malloc0(sizeof(*fentry));
+						fentry->fname = res;
+						g_ptr_array_add(arch->files, fentry);
+						msg_debug_archive("7zip: found file %v", res);
+					}
+					else {
+						msg_debug_archive("bad 7zip name; %s", G_STRLOC);
+					}
+					/* Skip zero terminating character */
+					p = fend + 2;
+				}
+			}
+			break;
+		case kDummy:
+		case kWinAttributes:
+			if (sz > 0) {
+				SZ_SKIP_BYTES(sz);
+			}
+			break;
+		default:
+			p = NULL;
+			msg_debug_archive("bad 7zip type: %xc; %s", t, G_STRLOC);
+			goto end;
+			break;
+		}
+	}
+
+end:
+	return p;
+}
+
+/*
+ * Read one top level section of a 7zip header and dispatch to the matching
+ * reader.
+ *
+ * Returns the cursor to continue from, or NULL when parsing must stop: end
+ * of input, the final kEnd marker, an encoded (packed) header, which also
+ * sets RSPAMD_ARCHIVE_CANNOT_READ on the archive, an unknown section id or
+ * a failure in a nested reader.
+ */
+static const guchar *
+rspamd_7zip_read_next_section(struct rspamd_task *task,
+	const guchar *p, const guchar *end,
+	struct rspamd_archive *arch)
+{
+	guchar t;
+
+	/*
+	 * The previous section may have consumed the buffer completely, so the
+	 * section id must not be read before checking that a byte is available.
+	 */
+	if (p == NULL || p >= end) {
+		return NULL;
+	}
+
+	t = *p;
+
+	SZ_SKIP_BYTES(1);
+
+	msg_debug_archive("7zip: read section %xc", t);
+
+	switch (t) {
+	case kHeader:
+		/* We just skip byte and go further */
+		break;
+	case kEncodedHeader:
+		/*
+		 * In fact, headers are just packed, but we assume it as
+		 * encrypted to distinguish from the normal archives
+		 */
+		msg_debug_archive("7zip: encoded header, needs to be uncompressed");
+		arch->flags |= RSPAMD_ARCHIVE_CANNOT_READ;
+		p = NULL; /* Cannot get anything useful */
+		break;
+	case kArchiveProperties:
+		p = rspamd_7zip_read_archive_props(task, p, end, arch);
+		break;
+	case kMainStreamsInfo:
+		p = rspamd_7zip_read_main_streams_info(task, p, end, arch);
+		break;
+	case kAdditionalStreamsInfo:
+		/* Same layout as the main streams info */
+		p = rspamd_7zip_read_main_streams_info(task, p, end, arch);
+		break;
+	case kFilesInfo:
+		p = rspamd_7zip_read_files_info(task, p, end, arch);
+		break;
+	case kEnd:
+		p = NULL;
+		msg_debug_archive("7zip: read final section");
+		break;
+	default:
+		p = NULL;
+		msg_debug_archive("bad 7zip type: %xc; %s", t, G_STRLOC);
+		break;
+	}
+
+	return p;
+}
+
+/*
+ * Try to treat a mime part as a 7zip archive.
+ *
+ * After the signature check an archive object is allocated from the task
+ * pool (released by rspamd_archive_dtor), the fixed start header is stepped
+ * over and the header sections found at the announced offset are parsed one
+ * by one. The part is marked as an archive once the section offset has been
+ * validated, whatever the outcome of the section parsing is.
+ */
+static void
+rspamd_archive_process_7zip(struct rspamd_task *task,
+	struct rspamd_mime_part *part)
+{
+	const guchar sz_magic[] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C};
+	const guchar *start, *p, *end;
+	struct rspamd_archive *arch;
+	guint64 hdr_offset = 0, hdr_length = 0;
+
+	start = part->parsed_data.begin;
+	p = start;
+	end = p + part->parsed_data.len;
+
+	if (end - p <= sizeof(guint64) + sizeof(guint32) ||
+		memcmp(p, sz_magic, sizeof(sz_magic)) != 0) {
+		msg_debug_archive("7z archive is invalid (no 7z magic)");
+
+		return;
+	}
+
+	arch = rspamd_mempool_alloc0(task->task_pool, sizeof(*arch));
+	arch->files = g_ptr_array_new();
+	arch->type = RSPAMD_ARCHIVE_7ZIP;
+	rspamd_mempool_add_destructor(task->task_pool, rspamd_archive_dtor,
+		arch);
+
+	/* Magic (6 bytes) + version (2 bytes) + crc32 (4 bytes) */
+	p += sizeof(guint64) + sizeof(guint32);
+
+	/* Location and size of the header relative to the end of this prologue */
+	SZ_READ_UINT64(hdr_offset);
+	SZ_READ_UINT64(hdr_length);
+
+	/* The crc of the header follows; it is not verified */
+	if (end - p <= sizeof(guint32)) {
+		msg_debug_archive("7z archive is invalid (truncated crc)");
+
+		return;
+	}
+
+	p += sizeof(guint32);
+
+	if (end - p <= hdr_offset) {
+		msg_debug_archive("7z archive is invalid (incorrect section offset)");
+
+		return;
+	}
+
+	p += hdr_offset;
+
+	/* Consume sections until one of the readers tells us to stop */
+	do {
+		p = rspamd_7zip_read_next_section(task, p, end, arch);
+	} while (p != NULL);
+
+	part->part_type = RSPAMD_MIME_PART_ARCHIVE;
+	part->specific.arch = arch;
+
+	if (part->cd != NULL) {
+		arch->archive_name = &part->cd->filename;
+	}
+
+	arch->size = part->parsed_data.len;
+}
+
+/*
+ * Try to treat a mime part as a gzip stream and figure out the name of the
+ * single file it contains.
+ *
+ * The name is taken from the gzip header when the corresponding flag bit is
+ * set; otherwise it is derived from the Content-Disposition filename of the
+ * part. The part is marked as an archive (label `set`) only when a name
+ * source was found; all other paths return leaving the part untouched (the
+ * archive object allocated from the task pool is then released by its pool
+ * destructor).
+ */
+static void
+rspamd_archive_process_gzip(struct rspamd_task *task,
+ struct rspamd_mime_part *part)
+{
+ struct rspamd_archive *arch;
+ const guchar *start, *p, *end;
+ const guchar gz_magic[] = {0x1F, 0x8B};
+ guchar flags;
+
+ start = part->parsed_data.begin;
+ p = start;
+ end = p + part->parsed_data.len;
+
+ /* More than 10 bytes are required and the data must start with the magic */
+ if (end - p <= 10 || memcmp(p, gz_magic, sizeof(gz_magic)) != 0) {
+ msg_debug_archive("gzip archive is invalid (no gzip magic)");
+
+ return;
+ }
+
+ arch = rspamd_mempool_alloc0(task->task_pool, sizeof(*arch));
+ arch->files = g_ptr_array_sized_new(1);
+ arch->type = RSPAMD_ARCHIVE_GZIP;
+ if (part->cd) {
+ arch->archive_name = &part->cd->filename;
+ }
+ rspamd_mempool_add_destructor(task->task_pool, rspamd_archive_dtor,
+ arch);
+
+ /* The flags byte is the fourth byte of the header */
+ flags = p[3];
+
+ /* Bit 5 is treated as the 'encrypted' marker */
+ if (flags & (1u << 5)) {
+ arch->flags |= RSPAMD_ARCHIVE_ENCRYPTED;
+ }
+
+ if (flags & (1u << 3)) {
+ /* We have file name presented in archive, try to use it */
+ if (flags & (1u << 1)) {
+ /* Multipart */
+ p += 12;
+ }
+ else {
+ p += 10;
+ }
+
+ if (flags & (1u << 2)) {
+ /* Optional section */
+ guint16 optlen = 0;
+
+ /* NOTE(review): RAR_READ_UINT16 is defined outside this chunk; presumably it bounds-checks and advances p - confirm */
+ RAR_READ_UINT16(optlen);
+
+ if (end <= p + optlen) {
+ msg_debug_archive("gzip archive is invalid, bad extra length: %d",
+ (int) optlen);
+
+ return;
+ }
+
+ p += optlen;
+ }
+
+ /* Read file name */
+ const guchar *fname_start = p;
+
+ /* Look for the zero byte terminating a non-empty name */
+ while (p < end) {
+ if (*p == '\0') {
+ if (p > fname_start) {
+ struct rspamd_archive_file *f;
+
+ f = g_malloc0(sizeof(*f));
+
+ rspamd_archive_file_try_utf(task, arch, f,
+ fname_start, p - fname_start);
+
+ if (f->fname) {
+ g_ptr_array_add(arch->files, f);
+
+ if (f->flags & RSPAMD_ARCHIVE_FILE_OBFUSCATED) {
+ arch->flags |= RSPAMD_ARCHIVE_HAS_OBFUSCATED_FILES;
+ }
+ }
+ else {
+ /* Invalid filename, skip */
+ g_free(f);
+ }
+
+ /* The part is an archive even if the name was rejected */
+ goto set;
+ }
+ }
+
+ p++;
+ }
+
+ /* Wrong filename, not zero terminated */
+ msg_debug_archive("gzip archive is invalid, bad filename at pos %d",
+ (int) (p - start));
+
+ return;
+ }
+
+ /* Fallback, we need to extract file name from archive name if possible */
+ if (part->cd && part->cd->filename.len > 0) {
+ const gchar *dot_pos, *slash_pos;
+
+ /* Last dot of the attachment name, i.e. the start of its extension */
+ dot_pos = rspamd_memrchr(part->cd->filename.begin, '.',
+ part->cd->filename.len);
+
+ if (dot_pos) {
+ struct rspamd_archive_file *f;
+
+ slash_pos = rspamd_memrchr(part->cd->filename.begin, '/',
+ part->cd->filename.len);
+
+ if (slash_pos && slash_pos < dot_pos) {
+ /* Use what is between the last slash and the last dot */
+ f = g_malloc0(sizeof(*f));
+ f->fname = g_string_sized_new(dot_pos - slash_pos);
+ g_string_append_len(f->fname, slash_pos + 1,
+ dot_pos - slash_pos - 1);
+
+ msg_debug_archive("fallback to gzip filename based on cd: %v",
+ f->fname);
+
+ g_ptr_array_add(arch->files, f);
+
+ goto set;
+ }
+ else {
+ const gchar *fname_start = part->cd->filename.begin;
+
+ f = g_malloc0(sizeof(*f));
+
+ /* The first dot differs from the last one: there are several dots */
+ if (memchr(fname_start, '.', part->cd->filename.len) != dot_pos) {
+ /* Double dots, something like foo.exe.gz */
+ f->fname = g_string_sized_new(dot_pos - fname_start);
+ g_string_append_len(f->fname, fname_start,
+ dot_pos - fname_start);
+ }
+ else {
+ /* Single dot, something like foo.gzz */
+ f->fname = g_string_sized_new(part->cd->filename.len);
+ g_string_append_len(f->fname, fname_start,
+ part->cd->filename.len);
+ }
+
+ msg_debug_archive("fallback to gzip filename based on cd: %v",
+ f->fname);
+
+ g_ptr_array_add(arch->files, f);
+
+ goto set;
+ }
+ }
+ }
+
+ /* No usable name source: the part is not marked as an archive */
+ return;
+
+set:
+ /* Set archive data */
+ part->part_type = RSPAMD_MIME_PART_ARCHIVE;
+ part->specific.arch = arch;
+ arch->size = part->parsed_data.len;
+}
+
+/*
+ * Cheap check whether a part looks like an archive of the given kind.
+ *
+ * `str` is the short type name ("zip", "rar", ...). It is searched for in
+ * the subtype of an `application/...` content type and compared with the
+ * extension of the Content-Disposition filename. When either of them hints
+ * at the type and a magic is supplied, the verdict is decided by the magic
+ * alone (see #1848); without any hint, a matching magic is still enough.
+ *
+ * Returns TRUE if the part should be handed to the corresponding parser.
+ */
+static gboolean
+rspamd_archive_cheat_detect(struct rspamd_mime_part *part, const gchar *str,
+	const guchar *magic_start, gsize magic_len)
+{
+	struct rspamd_content_type *ct = part->ct;
+	const gsize ext_len = strlen(str);
+	rspamd_ftok_t app_tok;
+	gboolean magic_ok = FALSE;
+
+	/* The data has to be strictly longer than the magic it starts with */
+	if (magic_start != NULL && part->parsed_data.len > magic_len) {
+		magic_ok = (memcmp(part->parsed_data.begin,
+			magic_start, magic_len) == 0);
+	}
+
+	RSPAMD_FTOK_ASSIGN(&app_tok, "application");
+
+	/* Hint #1: content type application/<something containing str> */
+	if (ct && ct->type.len && ct->subtype.len > 0 &&
+		rspamd_ftok_cmp(&ct->type, &app_tok) == 0 &&
+		rspamd_substring_search_caseless(ct->subtype.begin, ct->subtype.len,
+			str, ext_len) != -1) {
+		/* With a magic known, a missing magic refuses this type of archive */
+		return magic_start != NULL ? magic_ok : TRUE;
+	}
+
+	/* Hint #2: attachment name ending with ".<str>" */
+	if (part->cd) {
+		const rspamd_ftok_t *fname = &part->cd->filename;
+
+		if (fname->len > ext_len) {
+			const gchar *tail = fname->begin + fname->len - ext_len;
+
+			if (rspamd_lc_cmp(tail, str, ext_len) == 0 &&
+				*(tail - 1) == '.') {
+				return magic_start != NULL ? magic_ok : TRUE;
+			}
+		}
+	}
+
+	/* No hints at all: rely on the magic only */
+	return magic_ok;
+}
+
+/*
+ * Go through the parts of a message whose type is still undefined and try
+ * to parse the non-empty ones as zip, rar, 7zip or gzip archives (in this
+ * order, the first matching detector wins). A part that turned out to be
+ * an archive while being declared as text gets its content type flagged
+ * as broken, unless the content type was missing at all.
+ */
+void rspamd_archives_process(struct rspamd_task *task)
+{
+	const guchar zip_magic[] = {0x50, 0x4b, 0x03, 0x04};
+	const guchar rar_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07};
+	const guchar sz_magic[] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C};
+	const guchar gz_magic[] = {0x1F, 0x8B, 0x08};
+	struct rspamd_mime_part *part;
+	guint i;
+
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part)
+	{
+		if (part->part_type == RSPAMD_MIME_PART_UNDEFINED &&
+			part->parsed_data.len > 0) {
+
+			if (rspamd_archive_cheat_detect(part, "zip",
+					zip_magic, sizeof(zip_magic))) {
+				rspamd_archive_process_zip(task, part);
+			}
+			else if (rspamd_archive_cheat_detect(part, "rar",
+						 rar_magic, sizeof(rar_magic))) {
+				rspamd_archive_process_rar(task, part);
+			}
+			else if (rspamd_archive_cheat_detect(part, "7z",
+						 sz_magic, sizeof(sz_magic))) {
+				rspamd_archive_process_7zip(task, part);
+			}
+			else if (rspamd_archive_cheat_detect(part, "gz",
+						 gz_magic, sizeof(gz_magic))) {
+				rspamd_archive_process_gzip(task, part);
+			}
+
+			/* An archive hidden behind a textual content type is suspicious */
+			if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT) &&
+				part->part_type == RSPAMD_MIME_PART_ARCHIVE &&
+				part->specific.arch) {
+				struct rspamd_archive *found = part->specific.arch;
+
+				msg_info_task("found %s archive with incorrect content-type: %T/%T",
+					rspamd_archive_type_str(found->type),
+					&part->ct->type, &part->ct->subtype);
+
+				if (!(part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) {
+					part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
+				}
+			}
+		}
+	}
+}
+
+
+/*
+ * Map an archive type to its short constant name; values that are not
+ * listed in the enumeration yield "unknown".
+ */
+const gchar *
+rspamd_archive_type_str(enum rspamd_archive_type type)
+{
+	switch (type) {
+	case RSPAMD_ARCHIVE_ZIP:
+		return "zip";
+	case RSPAMD_ARCHIVE_RAR:
+		return "rar";
+	case RSPAMD_ARCHIVE_7ZIP:
+		return "7z";
+	case RSPAMD_ARCHIVE_GZIP:
+		return "gz";
+	}
+
+	return "unknown";
+}
diff --git a/src/libmime/archives.h b/src/libmime/archives.h
new file mode 100644
index 0000000..56beb62
--- /dev/null
+++ b/src/libmime/archives.h
@@ -0,0 +1,72 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_LIBMIME_ARCHIVES_H_
+#define SRC_LIBMIME_ARCHIVES_H_
+
+#include "config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Container formats recognised by the archive processor */
+enum rspamd_archive_type {
+ RSPAMD_ARCHIVE_ZIP,
+ RSPAMD_ARCHIVE_RAR,
+ RSPAMD_ARCHIVE_7ZIP,
+ RSPAMD_ARCHIVE_GZIP,
+};
+
+/* Bit flags describing an archive as a whole */
+enum rspamd_archive_flags {
+ RSPAMD_ARCHIVE_ENCRYPTED = (1u << 0u),
+ /* Content listing is not available, e.g. a 7zip archive with an encoded header */
+ RSPAMD_ARCHIVE_CANNOT_READ = (1u << 1u),
+ /* At least one file has RSPAMD_ARCHIVE_FILE_OBFUSCATED set */
+ RSPAMD_ARCHIVE_HAS_OBFUSCATED_FILES = (1u << 2u),
+};
+
+/* Bit flags describing a single file inside of an archive */
+enum rspamd_archive_file_flags {
+ RSPAMD_ARCHIVE_FILE_ENCRYPTED = (1u << 0u),
+ RSPAMD_ARCHIVE_FILE_OBFUSCATED = (1u << 1u),
+};
+
+/* A single file entry found in an archive */
+struct rspamd_archive_file {
+ GString *fname; /* File name as stored by the format parser */
+ gsize compressed_size; /* NOTE(review): presumably bytes; not every format parser fills it - confirm */
+ gsize uncompressed_size; /* NOTE(review): presumably bytes; not every format parser fills it - confirm */
+ enum rspamd_archive_file_flags flags;
+};
+
+/* An archive detected in a mime part */
+struct rspamd_archive {
+ enum rspamd_archive_type type;
+ const rspamd_ftok_t *archive_name; /* Content-Disposition filename of the part, if there is one */
+ gsize size; /* Length of the parsed data of the part */
+ enum rspamd_archive_flags flags;
+ GPtrArray *files; /* Array of struct rspamd_archive_file */
+};
+
+/**
+ * Process archives from a worker task
+ */
+void rspamd_archives_process(struct rspamd_task *task);
+
+/**
+ * Get textual representation of an archive's type
+ * (returns "unknown" for values outside of the enumeration)
+ */
+const gchar *rspamd_archive_type_str(enum rspamd_archive_type type);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SRC_LIBMIME_ARCHIVES_H_ */
diff --git a/src/libmime/content_type.c b/src/libmime/content_type.c
new file mode 100644
index 0000000..765cb87
--- /dev/null
+++ b/src/libmime/content_type.c
@@ -0,0 +1,884 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "libmime/content_type.h"
+#include "smtp_parsers.h"
+#include "utlist.h"
+#include "libserver/url.h"
+#include "libmime/mime_encoding.h"
+
+/*
+ * Decode an RFC 2231 extended parameter value located in
+ * [value_start, value_end). The buffer is percent-decoded in place.
+ *
+ * Without any single quote the whole value is treated as plain percent
+ * encoding. Otherwise the value has the form charset'language'data (or
+ * charset'data): the language is ignored and the data is converted to
+ * UTF-8 using the named charset, falling back to a content based guess.
+ *
+ * On success param->value is set, RSPAMD_CONTENT_PARAM_RFC2231 is added to
+ * the flags and TRUE is returned; FALSE means that no charset could be
+ * found or the conversion failed.
+ */
+static gboolean
+rspamd_rfc2231_decode(rspamd_mempool_t *pool,
+	struct rspamd_content_type_param *param,
+	gchar *value_start, gchar *value_end)
+{
+	gchar *sep;
+	const gchar *charset = NULL;
+	rspamd_ftok_t charset_tok;
+	GError *err = NULL;
+	gsize decoded_len;
+
+	sep = memchr(value_start, '\'', value_end - value_start);
+
+	if (sep == NULL) {
+		/* Plain percent encoding */
+		decoded_len = rspamd_url_decode(value_start, value_start,
+			value_end - value_start);
+		param->value.begin = value_start;
+		param->value.len = decoded_len;
+		param->flags |= RSPAMD_CONTENT_PARAM_RFC2231;
+
+		return TRUE;
+	}
+
+	/* Everything before the first quote names the charset */
+	charset_tok.begin = value_start;
+	charset_tok.len = sep - value_start;
+
+	if (charset_tok.len > 0) {
+		charset = rspamd_mime_detect_charset(&charset_tok, pool);
+	}
+
+	/* An optional language tag may follow: step over it if it is there */
+	value_start = sep + 1;
+	sep = memchr(value_start, '\'', value_end - value_start);
+
+	if (sep) {
+		value_start = sep + 1;
+	}
+
+	/* Perform percent decoding */
+	decoded_len = rspamd_url_decode(value_start, value_start,
+		value_end - value_start);
+
+	if (charset == NULL) {
+		/* Try heuristic */
+		charset = rspamd_mime_charset_find_by_content(value_start,
+			decoded_len, TRUE);
+	}
+
+	if (charset == NULL) {
+		msg_warn_pool("cannot convert parameter from charset %T", &charset_tok);
+
+		return FALSE;
+	}
+
+	param->value.begin = rspamd_mime_text_to_utf8(pool,
+		value_start, decoded_len,
+		charset, &param->value.len, &err);
+
+	if (param->value.begin == NULL) {
+		msg_warn_pool("cannot convert parameter from charset %s: %e",
+			charset, err);
+
+		if (err) {
+			g_error_free(err);
+		}
+
+		return FALSE;
+	}
+
+	param->flags |= RSPAMD_CONTENT_PARAM_RFC2231;
+
+	return TRUE;
+}
+
+/*
+ * Recognise RFC 2231 style parameter names and fill `param` accordingly.
+ * Three shapes are handled:
+ *   name*        - extended value, decoded immediately
+ *   name*N       - piece number N of a split value, no extended encoding
+ *   name*N*      - piece number N of a split and extended value
+ * Pieces only get their number, name and raw value recorded here; they are
+ * joined (and decoded) later.
+ *
+ * Returns FALSE if the name has no asterisk or the piece number is not
+ * accepted by rspamd_strtoul, so that the caller stores the parameter as is.
+ * NOTE(review): for `name*` TRUE is returned even when decoding fails, the
+ * name of the parameter being left unset in that case - confirm this is
+ * intended.
+ */
+static gboolean
+rspamd_param_maybe_rfc2231_process(rspamd_mempool_t *pool,
+	struct rspamd_content_type_param *param,
+	gchar *name_start, gchar *name_end,
+	gchar *value_start, gchar *value_end)
+{
+	const gchar *star = memchr(name_start, '*', name_end - name_start);
+	gboolean extended;
+	gulong piece_id;
+	gsize digits_len;
+
+	if (star == NULL) {
+		return FALSE;
+	}
+
+	if (star == name_end - 1) {
+		/* name*: the only asterisk is the last character */
+		if (rspamd_rfc2231_decode(pool, param, value_start, value_end)) {
+			param->name.begin = name_start;
+			param->name.len = name_end - name_start - 1;
+		}
+
+		return TRUE;
+	}
+
+	/* name*N or name*N*: the number sits between the asterisks */
+	extended = (*(name_end - 1) == '*');
+	digits_len = name_end - star - 1;
+
+	if (extended) {
+		digits_len--;
+	}
+
+	if (!rspamd_strtoul(star + 1, digits_len, &piece_id)) {
+		return FALSE;
+	}
+
+	param->flags |= RSPAMD_CONTENT_PARAM_PIECEWISE;
+
+	if (extended) {
+		/* The value is decoded once all the pieces have been joined */
+		param->flags |= RSPAMD_CONTENT_PARAM_RFC2231;
+	}
+
+	param->rfc2231_id = piece_id;
+	param->name.begin = name_start;
+	param->name.len = star - name_start;
+	param->value.begin = value_start;
+	param->value.len = value_end - value_start;
+
+	return TRUE;
+}
+
+/*
+ * Ordering callback for DL_SORT: arranges the pieces of a split parameter
+ * by their piece number. Returns a negative value, zero or a positive value
+ * if p1 goes before, is equal to or goes after p2.
+ *
+ * The numbers are taken from the message, so they are compared explicitly:
+ * the difference of two ids does not fit a gint32 when they are far apart
+ * and would yield a result with the wrong sign.
+ */
+static gint32
+rspamd_cmp_pieces(struct rspamd_content_type_param *p1, struct rspamd_content_type_param *p2)
+{
+	return (p1->rfc2231_id > p2->rfc2231_id) - (p1->rfc2231_id < p2->rfc2231_id);
+}
+
+/*
+ * Finalise all the parameters collected in `htb` and hand each of them to
+ * `proc` (called as proc(pool, param, procd)).
+ *
+ * Parameters split into pieces are sorted by the piece number and joined
+ * into a single value allocated from `pool`; the joined value is then
+ * RFC 2231 decoded if the first piece was flagged so. After that every
+ * non-empty value goes through rspamd_mime_header_decode. Parameters that
+ * fail to decode or contain invalid UTF-8 get RSPAMD_CONTENT_PARAM_BROKEN.
+ *
+ * Does nothing when `htb` is NULL.
+ */
+static void
+rspamd_postprocess_ct_attributes(rspamd_mempool_t *pool,
+ GHashTable *htb,
+ void (*proc)(rspamd_mempool_t *, struct rspamd_content_type_param *, gpointer ud),
+ gpointer procd)
+{
+ GHashTableIter it;
+ gpointer k, v;
+ struct rspamd_content_type_param *param, *sorted, *cur;
+
+ if (htb == NULL) {
+ return;
+ }
+
+ g_hash_table_iter_init(&it, htb);
+
+ while (g_hash_table_iter_next(&it, &k, &v)) {
+ param = (struct rspamd_content_type_param *) v;
+
+ if (param->flags & RSPAMD_CONTENT_PARAM_PIECEWISE) {
+ /* Reconstruct param */
+ gsize tlen = 0;
+ gchar *ndata, *pos;
+
+ /* Sort a local head: the table keeps pointing to `param` */
+ sorted = param;
+ DL_SORT(sorted, rspamd_cmp_pieces);
+
+ /* First pass: total length of all the pieces */
+ DL_FOREACH(sorted, cur)
+ {
+ tlen += cur->value.len;
+ }
+
+ ndata = rspamd_mempool_alloc(pool, tlen);
+ pos = ndata;
+
+ /* Second pass: concatenate the pieces in the sorted order */
+ DL_FOREACH(sorted, cur)
+ {
+ memcpy(pos, cur->value.begin, cur->value.len);
+ pos += cur->value.len;
+ }
+
+ if (param->flags & RSPAMD_CONTENT_PARAM_RFC2231) {
+ if (!rspamd_rfc2231_decode(pool, param,
+ ndata, pos)) {
+ /* Keep the joined but undecoded data */
+ param->flags |= RSPAMD_CONTENT_PARAM_BROKEN;
+ param->value.begin = ndata;
+ param->value.len = tlen;
+ }
+ }
+ else {
+ param->value.begin = ndata;
+ param->value.len = tlen;
+ }
+
+ /* Detach from list */
+ param->next = NULL;
+ param->prev = param;
+ }
+
+ gboolean invalid_utf = FALSE;
+
+ if (param->value.begin != NULL && param->value.len > 0) {
+ param->value.begin = rspamd_mime_header_decode(pool, param->value.begin,
+ param->value.len, &invalid_utf);
+ /* NOTE(review): relies on the decoder returning a zero terminated string - confirm */
+ param->value.len = strlen(param->value.begin);
+ }
+
+ if (invalid_utf) {
+ param->flags |= RSPAMD_CONTENT_PARAM_BROKEN;
+ }
+
+ proc(pool, param, procd);
+ }
+}
+
+/*
+ * Per parameter callback for Content-Type headers; `ud` is the
+ * struct rspamd_content_type being filled.
+ *
+ *  - charset:  referenced from ct->charset
+ *  - boundary: the original value is referenced from ct->orig_boundary and
+ *              a lowercased copy (allocated from `pool`) from ct->boundary
+ *  - name:     left untouched
+ *  - the rest: value is lowercased in place
+ */
+static void
+rspamd_content_type_postprocess(rspamd_mempool_t *pool,
+	struct rspamd_content_type_param *param,
+	gpointer ud)
+{
+	struct rspamd_content_type *ct = (struct rspamd_content_type *) ud;
+	rspamd_ftok_t key;
+	gboolean handled = FALSE;
+
+	RSPAMD_FTOK_ASSIGN(&key, "charset");
+
+	if (rspamd_ftok_icase_equal(&param->name, &key)) {
+		/* Adjust charset */
+		handled = TRUE;
+		ct->charset.begin = param->value.begin;
+		ct->charset.len = param->value.len;
+	}
+
+	RSPAMD_FTOK_ASSIGN(&key, "boundary");
+
+	if (rspamd_ftok_icase_equal(&param->name, &key)) {
+		gchar *lowered;
+
+		handled = TRUE;
+
+		/* Lowercased copy of the boundary */
+		lowered = rspamd_mempool_alloc(pool, param->value.len);
+		memcpy(lowered, param->value.begin, param->value.len);
+		rspamd_str_lc(lowered, param->value.len);
+		ct->boundary.begin = lowered;
+		ct->boundary.len = param->value.len;
+
+		/* Preserve original (case sensitive) boundary */
+		ct->orig_boundary.begin = param->value.begin;
+		ct->orig_boundary.len = param->value.len;
+	}
+
+	if (handled) {
+		return;
+	}
+
+	RSPAMD_FTOK_ASSIGN(&key, "name");
+
+	if (!rspamd_ftok_icase_equal(&param->name, &key)) {
+		/* Just lowercase */
+		rspamd_str_lc_utf8((gchar *) param->value.begin, param->value.len);
+	}
+}
+
+/*
+ * Per parameter callback for Content-Disposition headers; `ud` is the
+ * struct rspamd_content_disposition being filled. Only the `filename`
+ * parameter is of interest: its value is referenced (not copied) from
+ * cd->filename.
+ */
+static void
+rspamd_content_disposition_postprocess(rspamd_mempool_t *pool,
+	struct rspamd_content_type_param *param,
+	gpointer ud)
+{
+	struct rspamd_content_disposition *cd = (struct rspamd_content_disposition *) ud;
+	rspamd_ftok_t key;
+
+	RSPAMD_FTOK_ASSIGN(&key, "filename");
+
+	if (!rspamd_ftok_icase_equal(&param->name, &key)) {
+		return;
+	}
+
+	/* Adjust filename */
+	cd->filename.begin = param->value.begin;
+	cd->filename.len = param->value.len;
+}
+
+/*
+ * Register one `name=value` parameter of a content type.
+ *
+ * The name is lowercased in place. RFC 2231 style names are processed
+ * first; any other parameter simply references the given name and value
+ * ranges. Parameters are kept in ct->attrs (created on demand) keyed by
+ * name; those sharing a name are chained into a list of which only the
+ * first element is stored in the table.
+ */
+void rspamd_content_type_add_param(rspamd_mempool_t *pool,
+	struct rspamd_content_type *ct,
+	gchar *name_start, gchar *name_end,
+	gchar *value_start, gchar *value_end)
+{
+	struct rspamd_content_type_param *fresh, *head = NULL;
+	rspamd_ftok_t key;
+	gboolean first_with_name;
+
+	g_assert(ct != NULL);
+
+	fresh = rspamd_mempool_alloc0(pool, sizeof(*fresh));
+	rspamd_str_lc(name_start, name_end - name_start);
+
+	if (!rspamd_param_maybe_rfc2231_process(pool, fresh, name_start,
+			name_end, value_start, value_end)) {
+		/* Ordinary parameter: take it verbatim */
+		fresh->name.begin = name_start;
+		fresh->name.len = name_end - name_start;
+		fresh->value.begin = value_start;
+		fresh->value.len = value_end - value_start;
+	}
+
+	key.begin = fresh->name.begin;
+	key.len = fresh->name.len;
+
+	if (ct->attrs) {
+		head = g_hash_table_lookup(ct->attrs, &key);
+	}
+	else {
+		ct->attrs = g_hash_table_new(rspamd_ftok_icase_hash,
+			rspamd_ftok_icase_equal);
+	}
+
+	/* Remember the lookup result: DL_APPEND updates `head` */
+	first_with_name = (head == NULL);
+	DL_APPEND(head, fresh);
+
+	if (first_with_name) {
+		g_hash_table_insert(ct->attrs, &fresh->name, fresh);
+	}
+}
+
+/*
+ * State machine parsing a Content-Type like value: `type/subtype` followed
+ * by `name=value` parameters, with support of quoted values and nested
+ * (comments in parentheses).
+ *
+ * `in` is scanned with `p` while `c` marks the start of the current token;
+ * parameters are registered using rspamd_content_type_add_param, which
+ * references (and lowercases) parts of the input buffer.
+ *
+ * Returns a structure allocated from `pool` with lowercased copies of the
+ * type and the subtype, or NULL if no type has been found.
+ */
+static struct rspamd_content_type *
+rspamd_content_type_parser(gchar *in, gsize len, rspamd_mempool_t *pool)
+{
+ /* obraces/ebraces count '(' and ')' of a comment, qlen is the length of the last quoted string */
+ guint obraces = 0, ebraces = 0, qlen = 0;
+ gchar *p, *c, *end, *pname_start = NULL, *pname_end = NULL;
+ struct rspamd_content_type *res = NULL, val;
+ gboolean eqsign_seen = FALSE;
+ enum {
+ parse_type,
+ parse_subtype,
+ parse_after_subtype,
+ parse_param_name,
+ parse_param_after_name,
+ parse_param_value,
+ parse_param_value_after_quote,
+ parse_space,
+ parse_quoted,
+ parse_comment,
+ } state = parse_space,
+ next_state = parse_type;
+
+ p = in;
+ c = p;
+ end = p + len;
+ memset(&val, 0, sizeof(val));
+ val.cpy = in;
+
+ while (p < end) {
+ switch (state) {
+ case parse_type:
+ if (g_ascii_isspace(*p) || *p == ';') {
+ /* We have type without subtype */
+ val.type.begin = c;
+ val.type.len = p - c;
+ state = parse_after_subtype;
+ }
+ else if (*p == '/') {
+ val.type.begin = c;
+ val.type.len = p - c;
+ state = parse_space;
+ next_state = parse_subtype;
+ p++;
+ }
+ else {
+ p++;
+ }
+ break;
+ case parse_subtype:
+ if (g_ascii_isspace(*p) || *p == ';') {
+ val.subtype.begin = c;
+ val.subtype.len = p - c;
+ state = parse_after_subtype;
+ }
+ else {
+ p++;
+ }
+ break;
+ case parse_after_subtype:
+ /* Skip separators until a comment or a parameter name starts */
+ if (*p == ';' || g_ascii_isspace(*p)) {
+ p++;
+ }
+ else if (*p == '(') {
+ c = p;
+ state = parse_comment;
+ next_state = parse_param_name;
+ obraces = 1;
+ ebraces = 0;
+ pname_start = NULL;
+ pname_end = NULL;
+ eqsign_seen = FALSE;
+ p++;
+ }
+ else {
+ c = p;
+ state = parse_param_name;
+ pname_start = NULL;
+ pname_end = NULL;
+ eqsign_seen = FALSE;
+ }
+ break;
+ case parse_param_name:
+ /* The name lasts until '=' or a space */
+ if (*p == '=') {
+ pname_start = c;
+ pname_end = p;
+ state = parse_param_after_name;
+ eqsign_seen = TRUE;
+ p++;
+ }
+ else if (g_ascii_isspace(*p)) {
+ pname_start = c;
+ pname_end = p;
+ state = parse_param_after_name;
+ }
+ else {
+ p++;
+ }
+ break;
+ case parse_param_after_name:
+ if (g_ascii_isspace(*p)) {
+ p++;
+ }
+ else if (*p == '=') {
+ if (eqsign_seen) {
+ /* Treat as value start */
+ c = p;
+ eqsign_seen = FALSE;
+ state = parse_param_value;
+ p++;
+ }
+ else {
+ eqsign_seen = TRUE;
+ p++;
+ }
+ }
+ else {
+ if (eqsign_seen) {
+ state = parse_param_value;
+ c = p;
+ }
+ else {
+ /* Invalid parameter without value */
+ c = p;
+ state = parse_param_name;
+ pname_start = NULL;
+ pname_end = NULL;
+ }
+ }
+ break;
+ case parse_param_value:
+ /* An unquoted value ends at a space, a comment or ';' */
+ if (*p == '"') {
+ p++;
+ c = p;
+ state = parse_quoted;
+ next_state = parse_param_value_after_quote;
+ }
+ else if (g_ascii_isspace(*p)) {
+ if (pname_start && pname_end && pname_end > pname_start) {
+ rspamd_content_type_add_param(pool, &val, pname_start,
+ pname_end, c, p);
+ }
+
+ state = parse_space;
+ next_state = parse_param_name;
+ pname_start = NULL;
+ pname_end = NULL;
+ }
+ else if (*p == '(') {
+ if (pname_start && pname_end && pname_end > pname_start) {
+ rspamd_content_type_add_param(pool, &val, pname_start,
+ pname_end, c, p);
+ }
+
+ obraces = 1;
+ ebraces = 0;
+ p++;
+ state = parse_comment;
+ next_state = parse_param_name;
+ pname_start = NULL;
+ pname_end = NULL;
+ }
+ else if (*p == ';') {
+ if (pname_start && pname_end && pname_end > pname_start) {
+ rspamd_content_type_add_param(pool, &val, pname_start,
+ pname_end, c, p);
+ }
+
+ p++;
+ state = parse_space;
+ next_state = parse_param_name;
+ pname_start = NULL;
+ pname_end = NULL;
+ }
+ else {
+ p++;
+ }
+ break;
+ case parse_param_value_after_quote:
+ /* p is at the closing quote: the value is the qlen bytes starting at c */
+ if (pname_start && pname_end && pname_end > pname_start) {
+ rspamd_content_type_add_param(pool, &val, pname_start,
+ pname_end, c, c + qlen);
+ }
+
+ if (*p == '"') {
+ p++;
+
+ if (p == end) {
+ /* Last quote: done... */
+ state = parse_space;
+ break;
+ }
+
+ if (*p == ';') {
+ p++;
+ state = parse_space;
+ next_state = parse_param_name;
+ pname_start = NULL;
+ pname_end = NULL;
+ continue;
+ }
+ }
+
+ /* We should not normally be here in fact */
+ if (g_ascii_isspace(*p)) {
+ state = parse_space;
+ next_state = parse_param_name;
+ pname_start = NULL;
+ pname_end = NULL;
+ }
+ else if (*p == '(') {
+ obraces = 1;
+ ebraces = 0;
+ p++;
+ state = parse_comment;
+ next_state = parse_param_name;
+ pname_start = NULL;
+ pname_end = NULL;
+ }
+ else {
+ state = parse_param_name;
+ pname_start = NULL;
+ pname_end = NULL;
+ c = p;
+ }
+ break;
+ case parse_quoted:
+ if (*p == '\\') {
+ /* Quoted pair */
+ if (p + 1 < end) {
+ p += 2;
+ }
+ else {
+ p++;
+ }
+ }
+ else if (*p == '"') {
+ /* The closing quote is not consumed here: next_state does it */
+ qlen = p - c;
+ state = next_state;
+ }
+ else {
+ p++;
+ }
+ break;
+ case parse_comment:
+ /* The comment is over when closing parentheses match the opening ones */
+ if (*p == '(') {
+ obraces++;
+ p++;
+ }
+ else if (*p == ')') {
+ ebraces++;
+ p++;
+
+ if (ebraces == obraces && p < end) {
+ if (g_ascii_isspace(*p)) {
+ state = parse_space;
+ }
+ else {
+ c = p;
+ state = next_state;
+ }
+ }
+ }
+ else {
+ p++;
+ }
+ break;
+ case parse_space:
+ /* Skip spaces (and comments), then switch to next_state */
+ if (g_ascii_isspace(*p)) {
+ p++;
+ }
+ else if (*p == '(') {
+ obraces = 1;
+ ebraces = 0;
+ p++;
+ state = parse_comment;
+ }
+ else {
+ c = p;
+ state = next_state;
+ }
+ break;
+ }
+ }
+
+ /* Process leftover */
+ switch (state) {
+ case parse_type:
+ val.type.begin = c;
+ val.type.len = p - c;
+ break;
+ case parse_subtype:
+ val.subtype.begin = c;
+ val.subtype.len = p - c;
+ break;
+ case parse_param_value:
+ if (pname_start && pname_end && pname_end > pname_start) {
+ /* Do not include a trailing ';' into the value */
+ if (p > c && *(p - 1) == ';') {
+ p--;
+ }
+
+ rspamd_content_type_add_param(pool, &val, pname_start,
+ pname_end, c, p);
+ }
+ break;
+ case parse_param_value_after_quote:
+ if (pname_start && pname_end && pname_end > pname_start) {
+ rspamd_content_type_add_param(pool, &val, pname_start,
+ pname_end, c, c + qlen);
+ }
+ break;
+ default:
+ break;
+ }
+
+ /* A result is produced only if some type has been seen */
+ if (val.type.len > 0) {
+ gchar *tmp;
+
+ res = rspamd_mempool_alloc(pool, sizeof(val));
+ memcpy(res, &val, sizeof(val));
+
+ /*
+ * Lowercase type and subtype as they are specified as case insensitive
+ * in rfc2045 section 5.1
+ */
+ tmp = rspamd_mempool_alloc(pool, val.type.len);
+ memcpy(tmp, val.type.begin, val.type.len);
+ rspamd_str_lc(tmp, val.type.len);
+ res->type.begin = tmp;
+
+ if (val.subtype.len > 0) {
+ tmp = rspamd_mempool_alloc(pool, val.subtype.len);
+ memcpy(tmp, val.subtype.begin, val.subtype.len);
+ rspamd_str_lc(tmp, val.subtype.len);
+ res->subtype.begin = tmp;
+ }
+ }
+
+ return res;
+}
+
+/**
+ * Parses a Content-Type header value and classifies the result.
+ *
+ * The input is copied (NUL terminated) into `pool` and the parser works on
+ * that copy. After a successful parse the attributes, if any, are
+ * post-processed, a few well known broken values are repaired and `flags`
+ * is filled in from the resulting type/subtype.
+ *
+ * @param in header value
+ * @param len length of `in` in bytes
+ * @param pool memory pool used for every allocation made here
+ * @return parsed content type, or NULL if the value cannot be parsed
+ */
+struct rspamd_content_type *
+rspamd_content_type_parse(const gchar *in,
+						  gsize len, rspamd_mempool_t *pool)
+{
+	struct rspamd_content_type *res = NULL;
+	rspamd_ftok_t srch;
+	gchar *cpy;
+
+	cpy = rspamd_mempool_alloc(pool, len + 1);
+	rspamd_strlcpy(cpy, in, len + 1);
+
+	if ((res = rspamd_content_type_parser(cpy, len, pool)) != NULL) {
+		if (res->attrs) {
+			/* The attributes table is released together with the pool */
+			rspamd_mempool_add_destructor(pool,
+				(rspamd_mempool_destruct_t) g_hash_table_unref, res->attrs);
+
+			rspamd_postprocess_ct_attributes(pool, res->attrs,
+				rspamd_content_type_postprocess, res);
+		}
+
+		/* Now do some hacks to work with broken content types */
+		if (res->subtype.len == 0) {
+			res->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
+			RSPAMD_FTOK_ASSIGN(&srch, "text");
+
+			if (rspamd_ftok_casecmp(&res->type, &srch) == 0) {
+				/*
+				 * Workaround for Content-Type: text
+				 * Assume text/plain: the subtype itself must be set here,
+				 * not the scratch search token
+				 */
+				RSPAMD_FTOK_ASSIGN(&res->subtype, "plain");
+			}
+			else {
+				RSPAMD_FTOK_ASSIGN(&srch, "html");
+
+				if (rspamd_ftok_casecmp(&res->type, &srch) == 0) {
+					/* Workaround for Content-Type: html */
+					RSPAMD_FTOK_ASSIGN(&res->type, "text");
+					RSPAMD_FTOK_ASSIGN(&res->subtype, "html");
+				}
+				else {
+					RSPAMD_FTOK_ASSIGN(&srch, "application");
+
+					if (rspamd_ftok_casecmp(&res->type, &srch) == 0) {
+						/* Bare `application` is treated as an opaque blob */
+						RSPAMD_FTOK_ASSIGN(&res->subtype, "octet-stream");
+					}
+				}
+			}
+		}
+		else {
+			/* Common misspelling of `alternative` */
+			RSPAMD_FTOK_ASSIGN(&srch, "alternate");
+
+			if (rspamd_ftok_casecmp(&res->subtype, &srch) == 0) {
+				res->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
+				RSPAMD_FTOK_ASSIGN(&res->subtype, "alternative");
+			}
+
+			/* PKCS7 smime */
+			RSPAMD_FTOK_ASSIGN(&srch, "pkcs7-mime");
+			if (rspamd_substring_search(res->subtype.begin, res->subtype.len,
+					srch.begin, srch.len) != -1) {
+				res->flags |= RSPAMD_CONTENT_TYPE_SMIME;
+			}
+		}
+
+		/* Classify by the (possibly repaired) top-level type */
+		RSPAMD_FTOK_ASSIGN(&srch, "multipart");
+
+		if (rspamd_ftok_casecmp(&res->type, &srch) == 0) {
+			res->flags |= RSPAMD_CONTENT_TYPE_MULTIPART;
+
+			RSPAMD_FTOK_ASSIGN(&srch, "encrypted");
+			if (rspamd_ftok_casecmp(&res->subtype, &srch) == 0) {
+				res->flags |= RSPAMD_CONTENT_TYPE_ENCRYPTED;
+			}
+		}
+		else {
+			RSPAMD_FTOK_ASSIGN(&srch, "text");
+
+			if (rspamd_ftok_casecmp(&res->type, &srch) == 0) {
+				res->flags |= RSPAMD_CONTENT_TYPE_TEXT;
+			}
+			else {
+				RSPAMD_FTOK_ASSIGN(&srch, "message");
+
+				if (rspamd_ftok_casecmp(&res->type, &srch) == 0) {
+					RSPAMD_FTOK_ASSIGN(&srch, "delivery-status");
+
+					if (rspamd_ftok_casecmp(&res->subtype, &srch) == 0) {
+						res->flags |= RSPAMD_CONTENT_TYPE_TEXT | RSPAMD_CONTENT_TYPE_DSN;
+					}
+					else {
+						/* Any `*notification*` subtype is handled like a DSN */
+						RSPAMD_FTOK_ASSIGN(&srch, "notification");
+
+						if (rspamd_substring_search_caseless(res->subtype.begin,
+								res->subtype.len, srch.begin, srch.len) != -1) {
+							res->flags |= RSPAMD_CONTENT_TYPE_TEXT |
+										  RSPAMD_CONTENT_TYPE_DSN;
+						}
+						else {
+							res->flags |= RSPAMD_CONTENT_TYPE_MESSAGE;
+						}
+					}
+				}
+			}
+		}
+	}
+	else {
+		msg_warn_pool("cannot parse content type: %*s", (gint) len, cpy);
+	}
+
+	return res;
+}
+
+/**
+ * Stores one `name=value` parameter of a Content-Disposition header in
+ * `cd->attrs`, creating the table on first use. Name and value are copied
+ * into `pool`, the name copy is lowercased, and the pair is offered to the
+ * RFC 2231 handler before being stored verbatim. Parameters sharing a name
+ * are chained into one list; only the list head is inserted into the table.
+ */
+void rspamd_content_disposition_add_param(rspamd_mempool_t *pool,
+										  struct rspamd_content_disposition *cd,
+										  const gchar *name_start, const gchar *name_end,
+										  const gchar *value_start, const gchar *value_end)
+{
+	struct rspamd_content_type_param *head = NULL, *param;
+	gchar *name_buf, *name_buf_end, *value_buf, *value_buf_end;
+	rspamd_ftok_t key;
+	gboolean first_with_name;
+
+	g_assert(cd != NULL);
+
+	/* Private copies: the caller's buffers are const */
+	name_buf = rspamd_mempool_alloc(pool, name_end - name_start);
+	memcpy(name_buf, name_start, name_end - name_start);
+	name_buf_end = name_buf + (name_end - name_start);
+
+	value_buf = rspamd_mempool_alloc(pool, value_end - value_start);
+	memcpy(value_buf, value_start, value_end - value_start);
+	value_buf_end = value_buf + (value_end - value_start);
+
+	param = rspamd_mempool_alloc0(pool, sizeof(*param));
+	rspamd_str_lc(name_buf, name_buf_end - name_buf);
+
+	if (!rspamd_param_maybe_rfc2231_process(pool, param, name_buf,
+			name_buf_end, value_buf, value_buf_end)) {
+		/* Not handled by the RFC 2231 code: store the raw pair */
+		param->name.begin = name_buf;
+		param->name.len = name_buf_end - name_buf;
+		param->value.begin = value_buf;
+		param->value.len = value_buf_end - value_buf;
+	}
+
+	key.begin = param->name.begin;
+	key.len = param->name.len;
+
+	if (cd->attrs) {
+		head = g_hash_table_lookup(cd->attrs, &key);
+	}
+	else {
+		cd->attrs = g_hash_table_new(rspamd_ftok_icase_hash,
+			rspamd_ftok_icase_equal);
+	}
+
+	/* Remember whether a list already existed before appending to it */
+	first_with_name = (head == NULL);
+	DL_APPEND(head, param);
+
+	if (first_with_name) {
+		g_hash_table_insert(cd->attrs, &param->name, param);
+	}
+}
+
+/**
+ * Parses a Content-Disposition header value.
+ *
+ * On success the result is copied into `pool`, an unknown disposition type
+ * is replaced by `attachment`, a lowercased copy of the raw value is kept in
+ * `lc_data` and the attributes (if any) are post-processed.
+ *
+ * @return parsed disposition, or NULL (after logging a warning) on failure
+ */
+struct rspamd_content_disposition *
+rspamd_content_disposition_parse(const gchar *in,
+								 gsize len, rspamd_mempool_t *pool)
+{
+	struct rspamd_content_disposition parsed, *out;
+
+	if (!rspamd_content_disposition_parser(in, len, &parsed, pool)) {
+		msg_warn_pool("cannot parse content disposition: %*s",
+			(gint) len, in);
+
+		return NULL;
+	}
+
+	if (parsed.type == RSPAMD_CT_UNKNOWN) {
+		/* 'Fix' type to attachment as MUA does */
+		parsed.type = RSPAMD_CT_ATTACHMENT;
+	}
+
+	out = rspamd_mempool_alloc(pool, sizeof(parsed));
+	memcpy(out, &parsed, sizeof(parsed));
+
+	/* Keep a lowercased, NUL terminated copy of the whole header value */
+	out->lc_data = rspamd_mempool_alloc(pool, len + 1);
+	rspamd_strlcpy(out->lc_data, in, len + 1);
+	rspamd_str_lc(out->lc_data, len);
+
+	if (out->attrs) {
+		rspamd_postprocess_ct_attributes(pool, out->attrs,
+			rspamd_content_disposition_postprocess, out);
+		rspamd_mempool_add_destructor(pool,
+			(rspamd_mempool_destruct_t) g_hash_table_unref, out->attrs);
+	}
+
+	return out;
+}
diff --git a/src/libmime/content_type.h b/src/libmime/content_type.h
new file mode 100644
index 0000000..ac49bdc
--- /dev/null
+++ b/src/libmime/content_type.h
@@ -0,0 +1,130 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_LIBMIME_CONTENT_TYPE_H_
+#define SRC_LIBMIME_CONTENT_TYPE_H_
+
+#include "config.h"
+#include "libutil/fstring.h"
+#include "libutil/mem_pool.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Bit flags describing a parsed Content-Type (`rspamd_content_type.flags`).
+ * The notes below describe how rspamd_content_type_parse() sets them.
+ */
+enum rspamd_content_type_flags {
+ RSPAMD_CONTENT_TYPE_VALID = 0,
+ /* Subtype was missing, or was the misspelled `alternate` */
+ RSPAMD_CONTENT_TYPE_BROKEN = 1 << 0,
+ /* Type is `multipart` */
+ RSPAMD_CONTENT_TYPE_MULTIPART = 1 << 1,
+ /* Type is `text`; also set together with the DSN flag */
+ RSPAMD_CONTENT_TYPE_TEXT = 1 << 2,
+ /* Type is `message` and the subtype is not a DSN/notification one */
+ RSPAMD_CONTENT_TYPE_MESSAGE = 1 << 3,
+ /* `message/delivery-status` or a `message` subtype containing `notification` */
+ RSPAMD_CONTENT_TYPE_DSN = 1 << 4,
+ /* NOTE(review): not set by rspamd_content_type_parse(); presumably set by callers when the header is absent - confirm */
+ RSPAMD_CONTENT_TYPE_MISSING = 1 << 5,
+ /* `multipart/encrypted` */
+ RSPAMD_CONTENT_TYPE_ENCRYPTED = 1 << 6,
+ /* Subtype contains `pkcs7-mime` */
+ RSPAMD_CONTENT_TYPE_SMIME = 1 << 7,
+};
+
+/**
+ * Bit flags stored in `rspamd_content_type_param.flags`.
+ * NOTE(review): the code assigning these flags is not in this header; the
+ * names suggest they record RFC 2231 encoded / continued parameters and
+ * malformed ones - confirm against the parameter processing code.
+ */
+enum rspamd_content_param_flags {
+ RSPAMD_CONTENT_PARAM_NORMAL = 0,
+ RSPAMD_CONTENT_PARAM_RFC2231 = (1 << 0),
+ RSPAMD_CONTENT_PARAM_PIECEWISE = (1 << 1),
+ RSPAMD_CONTENT_PARAM_BROKEN = (1 << 2),
+};
+
+/**
+ * One `name=value` parameter of a Content-Type or Content-Disposition
+ * header. Parameters that share a name are chained through `prev`/`next`
+ * (appended with DL_APPEND) and only the first one is stored in the
+ * `attrs` hash table.
+ */
+struct rspamd_content_type_param {
+ rspamd_ftok_t name;
+ rspamd_ftok_t value;
+ /* NOTE(review): presumably the RFC 2231 continuation index - confirm */
+ guint rfc2231_id;
+ enum rspamd_content_param_flags flags;
+ struct rspamd_content_type_param *prev, *next;
+};
+
+/**
+ * Parsed Content-Type header. `type` and `subtype` point to lowercased
+ * copies allocated from the memory pool (or to string literals when the
+ * parser substitutes a repaired value).
+ */
+struct rspamd_content_type {
+ /* NOTE(review): not assigned by the code visible around this header - confirm who sets it */
+ gchar *cpy;
+ rspamd_ftok_t type;
+ rspamd_ftok_t subtype;
+ /* NOTE(review): charset/boundary/orig_boundary are presumably filled by attribute post-processing - confirm */
+ rspamd_ftok_t charset;
+ rspamd_ftok_t boundary;
+ rspamd_ftok_t orig_boundary;
+ enum rspamd_content_type_flags flags;
+ GHashTable *attrs; /* Can be empty */
+};
+
+/**
+ * Disposition kinds. rspamd_content_disposition_parse() never returns
+ * RSPAMD_CT_UNKNOWN: it rewrites that value to RSPAMD_CT_ATTACHMENT.
+ */
+enum rspamd_content_disposition_type {
+ RSPAMD_CT_UNKNOWN = 0,
+ RSPAMD_CT_INLINE = 1,
+ RSPAMD_CT_ATTACHMENT = 2,
+};
+
+/**
+ * Parsed Content-Disposition header.
+ */
+struct rspamd_content_disposition {
+ /* Lowercased, NUL terminated copy of the whole header value */
+ gchar *lc_data;
+ enum rspamd_content_disposition_type type;
+ /* NOTE(review): presumably filled from the `filename` attribute during post-processing - confirm */
+ rspamd_ftok_t filename;
+ GHashTable *attrs; /* Can be empty */
+};
+
+/**
+ * Adds new parameter to content type structure.
+ * Unlike the content disposition variant, name and value are passed as
+ * non-const buffers.
+ * @param pool memory pool handed to the implementation
+ * @param ct content type to add the parameter to
+ * @param name_start (can be modified)
+ * @param name_end
+ * @param value_start (can be modified)
+ * @param value_end
+ */
+void rspamd_content_type_add_param(rspamd_mempool_t *pool,
+ struct rspamd_content_type *ct,
+ gchar *name_start, gchar *name_end,
+ gchar *value_start, gchar *value_end);
+
+/**
+ * Parse content type from the header (performs copy + lowercase)
+ * @param in header value
+ * @param len length of `in` in bytes
+ * @param pool memory pool used for the copy and for the result
+ * @return parsed content type or NULL if the value cannot be parsed
+ */
+struct rspamd_content_type *rspamd_content_type_parse(const gchar *in,
+ gsize len, rspamd_mempool_t *pool);
+
+/**
+ * Adds new param for content disposition header.
+ * Name and value are copied into `pool` (the name copy is lowercased), so
+ * the input buffers are not modified.
+ * @param pool memory pool used for the copies and the parameter itself
+ * @param cd content disposition to update (must not be NULL)
+ * @param name_start start of the parameter name
+ * @param name_end end of the parameter name (exclusive)
+ * @param value_start start of the parameter value
+ * @param value_end end of the parameter value (exclusive)
+ */
+void rspamd_content_disposition_add_param(rspamd_mempool_t *pool,
+ struct rspamd_content_disposition *cd,
+ const gchar *name_start, const gchar *name_end,
+ const gchar *value_start, const gchar *value_end);
+
+/**
+ * Parse content-disposition header
+ * @param in header value
+ * @param len length of `in` in bytes
+ * @param pool memory pool used for the result
+ * @return parsed disposition (an unknown type is reported as attachment)
+ * or NULL if the value cannot be parsed
+ */
+struct rspamd_content_disposition *rspamd_content_disposition_parse(const gchar *in,
+ gsize len,
+ rspamd_mempool_t *pool);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SRC_LIBMIME_CONTENT_TYPE_H_ */
diff --git a/src/libmime/email_addr.c b/src/libmime/email_addr.c
new file mode 100644
index 0000000..0af7388
--- /dev/null
+++ b/src/libmime/email_addr.c
@@ -0,0 +1,563 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#include "email_addr.h"
+#include "message.h"
+#include "printf.h"
+#include "smtp_parsers.h"
+
+/**
+ * Replaces `addr->user` with a heap allocated copy in which quoted-pair
+ * escapes are resolved: a backslash is dropped and the character following
+ * it is kept literally, so `\\` yields a single backslash and `\"` yields a
+ * quote. A lone trailing backslash is dropped.
+ *
+ * The new buffer is not NUL terminated; `user_len` is updated to its length
+ * and RSPAMD_EMAIL_ADDR_USER_ALLOCATED is set so that it is freed later.
+ * Nothing is done for an empty user part.
+ */
+static void
+rspamd_email_address_unescape(struct rspamd_email_address *addr)
+{
+	const char *h, *end;
+	char *t, *d;
+
+	if (addr->user_len == 0) {
+		return;
+	}
+
+	/* The unescaped form is never longer than the escaped one */
+	d = g_malloc(addr->user_len);
+	t = d;
+	h = addr->user;
+	end = h + addr->user_len;
+
+	while (h < end) {
+		if (*h == '\\') {
+			/* Skip the escape itself; the next byte is taken as is */
+			h++;
+
+			if (h == end) {
+				break;
+			}
+		}
+
+		*t++ = *h++;
+	}
+
+	addr->user = d;
+	addr->user_len = t - d;
+	addr->flags |= RSPAMD_EMAIL_ADDR_USER_ALLOCATED;
+}
+
+/**
+ * Builds a heap allocated address from a single SMTP address string.
+ * Returns NULL for empty input or when the SMTP parser does not mark the
+ * address as valid. For a quoted address starting with `"` the `addr` field
+ * is rebuilt as `user@domain` (the user part is unescaped first when it
+ * contains backslashes).
+ */
+struct rspamd_email_address *
+rspamd_email_address_from_smtp(const gchar *str, guint len)
+{
+	struct rspamd_email_address parsed, *out;
+	gsize buflen;
+
+	if (str == NULL || len == 0) {
+		return NULL;
+	}
+
+	rspamd_smtp_addr_parse(str, len, &parsed);
+
+	if (!(parsed.flags & RSPAMD_EMAIL_ADDR_VALID)) {
+		return NULL;
+	}
+
+	out = g_malloc(sizeof(*out));
+	memcpy(out, &parsed, sizeof(parsed));
+
+	if (!(out->flags & RSPAMD_EMAIL_ADDR_QUOTED) || out->addr[0] != '"') {
+		/* Nothing to unquote */
+		return out;
+	}
+
+	if (out->flags & RSPAMD_EMAIL_ADDR_HAS_BACKSLASH) {
+		/* We also need to unquote user */
+		rspamd_email_address_unescape(out);
+	}
+
+	/* We need to unquote addr */
+	buflen = out->domain_len + out->user_len + 2;
+	out->addr = g_malloc(buflen + 1);
+	out->addr_len = rspamd_snprintf((char *) out->addr, buflen, "%*s@%*s",
+		(gint) out->user_len, out->user,
+		(gint) out->domain_len, out->domain);
+	out->flags |= RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED;
+
+	return out;
+}
+
+/**
+ * Releases an address and the `addr`/`user` buffers it owns (ownership is
+ * recorded by the *_ALLOCATED flags). NULL is accepted and ignored.
+ */
+void rspamd_email_address_free(struct rspamd_email_address *addr)
+{
+	if (!addr) {
+		return;
+	}
+
+	if (addr->flags & RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED) {
+		g_free((void *) addr->addr);
+	}
+
+	if (addr->flags & RSPAMD_EMAIL_ADDR_USER_ALLOCATED) {
+		g_free((void *) addr->user);
+	}
+
+	g_free(addr);
+}
+
+/**
+ * Appends a heap allocated copy of `addr` to `ar`. When `addr` is NULL a
+ * placeholder entry (`<>`, flagged RSPAMD_EMAIL_ADDR_EMPTY) is added
+ * instead. Quoted addresses are unquoted the same way as for SMTP input,
+ * and a non-empty `name` is stripped and MIME-decoded into the entry.
+ * Heap allocations are reported to `pool` for accounting.
+ */
+static inline void
+rspamd_email_address_add(rspamd_mempool_t *pool,
+						 GPtrArray *ar,
+						 struct rspamd_email_address *addr,
+						 GString *name)
+{
+	struct rspamd_email_address *entry;
+	guint buflen;
+
+	entry = g_malloc0(sizeof(*entry));
+	rspamd_mempool_notify_alloc(pool, sizeof(*entry));
+
+	if (addr == NULL) {
+		/* Fake empty address */
+		entry->addr = "";
+		entry->domain = "";
+		entry->raw = "<>";
+		entry->raw_len = 2;
+		entry->user = "";
+		entry->flags |= RSPAMD_EMAIL_ADDR_EMPTY;
+	}
+	else {
+		memcpy(entry, addr, sizeof(*addr));
+	}
+
+	if ((entry->flags & RSPAMD_EMAIL_ADDR_QUOTED) && entry->addr[0] == '"') {
+		if (entry->flags & RSPAMD_EMAIL_ADDR_HAS_BACKSLASH) {
+			/* We also need to unquote user */
+			rspamd_email_address_unescape(entry);
+		}
+
+		/* We need to unquote addr */
+		buflen = entry->domain_len + entry->user_len + 2;
+		entry->addr = g_malloc(buflen + 1);
+		rspamd_mempool_notify_alloc(pool, buflen + 1);
+		entry->addr_len = rspamd_snprintf((char *) entry->addr, buflen, "%*s@%*s",
+			(gint) entry->user_len, entry->user,
+			(gint) entry->domain_len, entry->domain);
+		entry->flags |= RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED;
+	}
+
+	if (name->len > 0) {
+		rspamd_gstring_strip(name, " \t\v");
+		entry->name = rspamd_mime_header_decode(pool, name->str, name->len, NULL);
+	}
+
+	rspamd_mempool_notify_alloc(pool, name->len);
+	g_ptr_array_add(ar, entry);
+}
+
+/*
+ * Tries to parse an email address that doesn't conform RFC.
+ *
+ * The result only references `data`, nothing is copied. For input starting
+ * with `<` the address is the text up to the first `>`; otherwise the whole
+ * input is the address. User and domain are split at the last `@` when a
+ * non-empty domain follows it. RSPAMD_EMAIL_ADDR_HAS_8BIT is set if the
+ * address contains 8 bit characters.
+ *
+ * Returns FALSE only for empty input (in which case `data` is not read).
+ */
+static gboolean
+rspamd_email_address_parse_heuristic(const char *data, size_t len,
+									 struct rspamd_email_address *addr)
+{
+	const gchar *p = data, *at = NULL, *end = data + len;
+	gboolean ret = FALSE;
+
+	memset(addr, 0, sizeof(*addr));
+
+	/* Check the length first so that an empty buffer is never dereferenced */
+	if (len > 1 && *p == '<') {
+		/* Angled address */
+		addr->addr_len = rspamd_memcspn(p + 1, ">", len - 1);
+		addr->addr = p + 1;
+		addr->raw = p;
+		addr->raw_len = len;
+		ret = TRUE;
+
+		/* Continue with the part inside the angle brackets only */
+		p = p + 1;
+		len = addr->addr_len;
+		end = p + len;
+	}
+	else if (len > 0) {
+		addr->addr = p;
+		addr->addr_len = len;
+		addr->raw = p;
+		addr->raw_len = len;
+		ret = TRUE;
+	}
+
+	if (ret) {
+		at = rspamd_memrchr(p, '@', len);
+
+		if (at != NULL && at + 1 < end) {
+			addr->domain = at + 1;
+			addr->domain_len = end - (at + 1);
+			addr->user = p;
+			addr->user_len = at - p;
+		}
+
+		if (rspamd_str_has_8bit(p, len)) {
+			addr->flags |= RSPAMD_EMAIL_ADDR_HAS_8BIT;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * Parses `start[0..len)` as one address and appends it to `res`, trying the
+ * strict SMTP parser first and the heuristic parser second.
+ *
+ * @return 1 if an address was added, 0 if neither parser accepted the input,
+ * -1 if `res` already holds `max_elements` entries (when the limit is > 0)
+ */
+static inline int
+rspamd_email_address_check_and_add(const gchar *start, gsize len,
+								   GPtrArray *res,
+								   rspamd_mempool_t *pool,
+								   GString *ns,
+								   gint max_elements)
+{
+	struct rspamd_email_address parsed;
+
+	g_assert(res != NULL);
+
+	if (max_elements > 0 && res->len >= max_elements) {
+		msg_info_pool_check("reached maximum number of elements %d when adding %v",
+			max_elements,
+			ns);
+
+		return -1;
+	}
+
+	/* The whole email is likely address */
+	memset(&parsed, 0, sizeof(parsed));
+	rspamd_smtp_addr_parse(start, len, &parsed);
+
+	if (!(parsed.flags & RSPAMD_EMAIL_ADDR_VALID)) {
+		/* Try heuristic */
+		if (!rspamd_email_address_parse_heuristic(start, len, &parsed)) {
+			return 0;
+		}
+	}
+
+	rspamd_email_address_add(pool, res, &parsed, ns);
+
+	return 1;
+}
+
+/*
+ * Parses a list of addresses from a mime header value.
+ *
+ * Works in two passes: first the header is copied into `cpy` with
+ * parenthesised comments removed (quoted strings are copied verbatim), then
+ * a small state machine walks `cpy`, collecting the display name into `ns`
+ * and passing address candidates to rspamd_email_address_check_and_add().
+ *
+ * If `src` is NULL a new array is created and registered for destruction
+ * with `pool`; otherwise addresses are appended to `src`. When
+ * `max_elements` is positive, parsing stops once the array is full.
+ * Returns the array that was used.
+ */
+GPtrArray *
+rspamd_email_address_from_mime(rspamd_mempool_t *pool, const gchar *hdr,
+ guint len,
+ GPtrArray *src,
+ gint max_elements)
+{
+ GPtrArray *res = src;
+ gboolean seen_at = FALSE, seen_obrace = FALSE;
+
+ const gchar *p = hdr, *end = hdr + len, *c = hdr, *t;
+ GString *ns, *cpy;
+ gint obraces, ebraces;
+ enum {
+ parse_name = 0,
+ parse_quoted,
+ parse_addr,
+ skip_spaces
+ } state = parse_name,
+ next_state = parse_name;
+
+ if (res == NULL) {
+ res = g_ptr_array_sized_new(2);
+ rspamd_mempool_add_destructor(pool, rspamd_email_address_list_destroy,
+ res);
+ }
+ else if (max_elements > 0 && res->len >= max_elements) {
+ msg_info_pool_check("reached maximum number of elements %d", max_elements);
+
+ return res;
+ }
+
+ /* `ns` accumulates the display name, `cpy` the comment-free header */
+ ns = g_string_sized_new(len);
+ cpy = g_string_sized_new(len);
+
+ /* `cpy` is released with the pool; `ns` is freed before returning */
+ rspamd_mempool_add_destructor(pool, rspamd_gstring_free_hard, cpy);
+
+ /* First, we need to remove all comments as they are terrible */
+ obraces = 0;
+ ebraces = 0;
+
+ while (p < end) {
+ if (state == parse_name) {
+ if (*p == '\\') {
+ if (obraces == 0) {
+ g_string_append_c(cpy, *p);
+ }
+
+ p++;
+ }
+ else {
+ if (*p == '"') {
+ state = parse_quoted;
+ }
+ else if (*p == '(') {
+ obraces++; /* To avoid ) itself being copied */
+ }
+ else if (*p == ')') {
+ ebraces++;
+ p++;
+ }
+
+ /* Balanced braces: the comment is over, resume copying */
+ if (obraces == ebraces) {
+ obraces = 0;
+ ebraces = 0;
+ }
+ }
+
+ if (p < end && obraces == 0) {
+ g_string_append_c(cpy, *p);
+ }
+ }
+ else {
+ /* Quoted elt */
+ if (*p == '\\') {
+ g_string_append_c(cpy, *p);
+ p++;
+ }
+ else {
+ if (*p == '"') {
+ state = parse_name;
+ }
+ }
+
+ if (p < end) {
+ g_string_append_c(cpy, *p);
+ }
+ }
+
+ p++;
+ }
+
+ /* Second pass: parse the comment-free copy */
+ state = parse_name;
+
+ p = cpy->str;
+ c = p;
+ end = p + cpy->len;
+
+ while (p < end) {
+ switch (state) {
+ case parse_name:
+ if (*p == '"') {
+ /* We need to strip last spaces and update `ns` */
+ if (p > c) {
+ guint nspaces = 0;
+
+ t = p - 1;
+
+ while (t > c && g_ascii_isspace(*t)) {
+ t--;
+ nspaces++;
+ }
+
+ g_string_append_len(ns, c, t - c + 1);
+
+ if (nspaces > 0) {
+ g_string_append_c(ns, ' ');
+ }
+ }
+
+ state = parse_quoted;
+ c = p + 1;
+ }
+ else if (*p == '<') {
+ /* Text before `<` is part of the name; the address starts at `<` */
+ if (p > c) {
+ t = p - 1;
+
+ while (t > c && g_ascii_isspace(*t)) {
+ t--;
+ }
+
+ g_string_append_len(ns, c, t - c + 1);
+ }
+
+ c = p;
+ state = parse_addr;
+ }
+ else if (*p == ',') {
+ if (p > c && seen_at) {
+ /*
+ * Last token must be the address:
+ * e.g. Some name name@domain.com
+ */
+ t = p - 1;
+
+ while (t > c && g_ascii_isspace(*t)) {
+ t--;
+ }
+
+ int check = rspamd_email_address_check_and_add(c, t - c + 1,
+ res, pool, ns, max_elements);
+
+ if (check == 0 && res->len == 0) {
+ /* Insert fake address */
+ rspamd_email_address_add(pool, res, NULL, ns);
+ }
+ else if (check != 1) {
+ goto end;
+ }
+
+ /* Cleanup for the next use */
+ g_string_set_size(ns, 0);
+ seen_at = FALSE;
+ }
+
+ state = skip_spaces;
+ next_state = parse_name;
+ }
+ else if (*p == '@') {
+ seen_at = TRUE;
+ }
+
+ p++;
+ break;
+ case parse_quoted:
+ if (*p == '\\') {
+ /* Flush what we have and skip the escape character itself */
+ if (p > c) {
+ g_string_append_len(ns, c, p - c);
+ }
+
+ p++;
+ c = p;
+ }
+ else if (*p == '"') {
+ if (p > c) {
+ g_string_append_len(ns, c, p - c);
+ }
+
+ if (p + 1 < end && g_ascii_isspace(p[1])) {
+ g_string_append_c(ns, ' ');
+ }
+
+ state = skip_spaces;
+ next_state = parse_name;
+ }
+ else if (*p == '@' && seen_obrace) {
+ seen_at = TRUE;
+ }
+ else if (*p == '<') {
+ seen_obrace = TRUE;
+ }
+ p++;
+ break;
+ case parse_addr:
+ if (*p == '>') {
+ /* The candidate includes both angle brackets */
+ int check = rspamd_email_address_check_and_add(c, p - c + 1,
+ res, pool, ns, max_elements);
+ if (check == 0 && res->len == 0) {
+ /* Insert a fake address */
+ rspamd_email_address_add(pool, res, NULL, ns);
+ }
+ else if (check != 1) {
+ goto end;
+ }
+
+ /* Cleanup for the next use */
+ g_string_set_size(ns, 0);
+ seen_at = FALSE;
+ state = skip_spaces;
+ next_state = parse_name;
+ }
+ else if (*p == '@') {
+ seen_at = TRUE;
+ }
+ p++;
+ break;
+ case skip_spaces:
+ if (!g_ascii_isspace(*p)) {
+ c = p;
+ state = next_state;
+ }
+ else {
+ p++;
+ }
+ break;
+ }
+ }
+
+ /* Handle leftover */
+ switch (state) {
+ case parse_name:
+ /* Assume the whole header as name (bad thing) */
+ if (p > c) {
+ /*
+ * NOTE(review): p equals `end` here, so `*p` is the byte just past
+ * the parsed data; trailing spaces do not look like they are
+ * stripped by this loop - confirm.
+ */
+ while (p > c && g_ascii_isspace(*p)) {
+ p--;
+ }
+
+ if (p > c) {
+ if (seen_at) {
+ /* The whole email is likely address */
+ int check = rspamd_email_address_check_and_add(c, p - c,
+ res, pool, ns, max_elements);
+ if (check == 0 && res->len == 0) {
+ /* Insert a fake address */
+ rspamd_email_address_add(pool, res, NULL, ns);
+ }
+ else if (check != 1) {
+ goto end;
+ }
+ }
+ else {
+ /* No @ seen */
+ g_string_append_len(ns, c, p - c);
+
+ if (res->len == 0) {
+ rspamd_email_address_add(pool, res, NULL, ns);
+ }
+ }
+ }
+ else if (res->len == 0) {
+ rspamd_email_address_add(pool, res, NULL, ns);
+ }
+ }
+ break;
+ case parse_addr:
+ /* Unterminated `<...`: still try to use it as an address */
+ if (p > c) {
+ if (rspamd_email_address_check_and_add(c, p - c,
+ res, pool, ns, max_elements) == 0) {
+ if (res->len == 0) {
+ rspamd_email_address_add(pool, res, NULL, ns);
+ }
+ }
+ }
+ break;
+ case parse_quoted:
+ /* Unfinished quoted string or a comment */
+ /* If we have seen obrace + at, then we still can try to resolve address */
+ if (seen_at && seen_obrace) {
+ p = rspamd_memrchr(cpy->str, '<', cpy->len);
+ g_assert(p != NULL);
+ if (rspamd_email_address_check_and_add(p, end - p,
+ res, pool, ns, max_elements) == 0) {
+ if (res->len == 0) {
+ rspamd_email_address_add(pool, res, NULL, ns);
+ }
+ }
+ }
+ break;
+ default:
+ /* Do nothing */
+ break;
+ }
+end:
+ /* Account for the copy kept until the pool is destroyed */
+ rspamd_mempool_notify_alloc(pool, cpy->len);
+ g_string_free(ns, TRUE);
+
+ return res;
+}
+
+/*
+ * Frees every address stored in the GPtrArray `ptr` with
+ * rspamd_email_address_free() and then the array itself. The gpointer
+ * signature lets it be registered as a memory pool destructor.
+ */
+void rspamd_email_address_list_destroy(gpointer ptr)
+{
+ GPtrArray *ar = ptr;
+ guint i;
+ struct rspamd_email_address *addr;
+
+ PTR_ARRAY_FOREACH(ar, i, addr)
+ {
+ rspamd_email_address_free(addr);
+ }
+
+ g_ptr_array_free(ar, TRUE);
+} \ No newline at end of file
diff --git a/src/libmime/email_addr.h b/src/libmime/email_addr.h
new file mode 100644
index 0000000..ed00722
--- /dev/null
+++ b/src/libmime/email_addr.h
@@ -0,0 +1,97 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_LIBMIME_EMAIL_ADDR_H_
+#define SRC_LIBMIME_EMAIL_ADDR_H_
+
+#include "config.h"
+#include "libutil/mem_pool.h"
+#include "libutil/ref.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct rspamd_mime_header;
+
+/*
+ * Bit flags stored in `rspamd_email_address.flags`. Notes are given only
+ * for the flags whose use is visible in email_addr.c.
+ */
+enum rspamd_email_address_flags {
+ /* Address was accepted by the SMTP address parser */
+ RSPAMD_EMAIL_ADDR_VALID = (1 << 0),
+ RSPAMD_EMAIL_ADDR_IP = (1 << 1),
+ RSPAMD_EMAIL_ADDR_BRACED = (1 << 2),
+ /* Together with a leading `"` in `addr` this triggers unquoting of `addr` */
+ RSPAMD_EMAIL_ADDR_QUOTED = (1 << 3),
+ /* Placeholder entry added when no address could be parsed */
+ RSPAMD_EMAIL_ADDR_EMPTY = (1 << 4),
+ /* User part has backslash escapes that are removed when unquoting */
+ RSPAMD_EMAIL_ADDR_HAS_BACKSLASH = (1 << 5),
+ /* `addr` is heap allocated and released by rspamd_email_address_free() */
+ RSPAMD_EMAIL_ADDR_ADDR_ALLOCATED = (1 << 6),
+ /* `user` is heap allocated and released by rspamd_email_address_free() */
+ RSPAMD_EMAIL_ADDR_USER_ALLOCATED = (1 << 7),
+ /* Set by the heuristic parser when the address has 8 bit characters */
+ RSPAMD_EMAIL_ADDR_HAS_8BIT = (1 << 8),
+ RSPAMD_EMAIL_ADDR_ALIASED = (1 << 9),
+ RSPAMD_EMAIL_ADDR_ORIGINAL = (1 << 10),
+};
+
+/*
+ * Structure that represents email address in a convenient way.
+ * The string fields are length-delimited by the matching `*_len` members;
+ * they normally point into the parsed input and are owned by this structure
+ * only when the corresponding *_ALLOCATED flag is set.
+ */
+struct rspamd_email_address {
+ /* Input as it was passed to the parser */
+ const gchar *raw;
+ /* The address itself (without angle brackets for the heuristic parser) */
+ const gchar *addr;
+ /* Part before the last `@` */
+ const gchar *user;
+ /* Part after the last `@` */
+ const gchar *domain;
+ /* Decoded display name (set for addresses parsed from mime headers) */
+ const gchar *name;
+
+ guint raw_len;
+ guint addr_len;
+ guint domain_len;
+ guint user_len;
+ /* Combination of enum rspamd_email_address_flags */
+ guint flags;
+};
+
+struct rspamd_task;
+
+/**
+ * Create email address from a single rfc822 address (e.g. from mail from:)
+ * @param str string to use
+ * @param len length of string
+ * @return newly allocated address that must be released with
+ * rspamd_email_address_free(), or NULL if the input is empty or invalid
+ */
+struct rspamd_email_address *rspamd_email_address_from_smtp(const gchar *str, guint len);
+
+/**
+ * Parses email address from the mime header, decodes names and return the array
+ * of `rspamd_email_address`. If `src` is NULL, then this function creates a new
+ * array and adds a destructor to remove elements when `pool` is destroyed.
+ * Otherwise, addresses are appended to `src`.
+ * @param pool memory pool used for decoded names and destructors
+ * @param hdr header value
+ * @param len length of `hdr`
+ * @param src array to append to, or NULL to create a new one
+ * @param max_elements if positive, parsing stops once the array holds this
+ * many elements
+ * @return `src` if it was given, otherwise the newly created array
+ */
+GPtrArray *
+rspamd_email_address_from_mime(rspamd_mempool_t *pool, const gchar *hdr, guint len,
+ GPtrArray *src, gint max_elements);
+
+/**
+ * Destroys list of email addresses: every element is freed with
+ * rspamd_email_address_free() and then the array itself is freed
+ * @param ptr GPtrArray of `struct rspamd_email_address *`
+ */
+void rspamd_email_address_list_destroy(gpointer ptr);
+
+/**
+ * Frees an address together with the `addr`/`user` buffers it owns
+ * @param addr address to free (NULL is ignored)
+ */
+void rspamd_email_address_free(struct rspamd_email_address *addr);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SRC_LIBMIME_EMAIL_ADDR_H_ */
diff --git a/src/libmime/images.c b/src/libmime/images.c
new file mode 100644
index 0000000..1344d91
--- /dev/null
+++ b/src/libmime/images.c
@@ -0,0 +1,718 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "config.h"
+#include "images.h"
+#include "task.h"
+#include "message.h"
+#include "libserver/html/html.h"
+
+#define msg_debug_images(...) rspamd_conditional_debug_fast(NULL, NULL, \
+ rspamd_images_log_id, "images", task->task_pool->tag.uid, \
+ G_STRFUNC, \
+ __VA_ARGS__)
+
+INIT_LOG_MODULE(images)
+
+#ifdef USABLE_GD
+#include "gd.h"
+#include "hash.h"
+#include <math.h>
+
+#define RSPAMD_NORMALIZED_DIM 64
+
+static rspamd_lru_hash_t *images_hash = NULL;
+#endif
+
+static const guint8 png_signature[] = {137, 80, 78, 71, 13, 10, 26, 10};
+static const guint8 jpg_sig1[] = {0xff, 0xd8};
+static const guint8 jpg_sig_jfif[] = {0xff, 0xe0};
+static const guint8 jpg_sig_exif[] = {0xff, 0xe1};
+static const guint8 gif_signature[] = {'G', 'I', 'F', '8'};
+static const guint8 bmp_signature[] = {'B', 'M'};
+
+static bool process_image(struct rspamd_task *task, struct rspamd_mime_part *part);
+
+
+/*
+ * Runs image processing for `part` if it has not been classified yet
+ * (part type is still undefined), its detected type is "image" and it has
+ * parsed data. Returns the result of process_image() in that case and
+ * false otherwise.
+ */
+bool rspamd_images_process_mime_part_maybe(struct rspamd_task *task,
+										   struct rspamd_mime_part *part)
+{
+	if (part->part_type != RSPAMD_MIME_PART_UNDEFINED) {
+		return false;
+	}
+
+	if (!part->detected_type) {
+		return false;
+	}
+
+	if (strcmp(part->detected_type, "image") != 0) {
+		return false;
+	}
+
+	if (!(part->parsed_data.len > 0)) {
+		return false;
+	}
+
+	return process_image(task, part);
+}
+
+/*
+ * Offers every mime part of the task's message to
+ * rspamd_images_process_mime_part_maybe(); the per-part result is ignored.
+ */
+void rspamd_images_process(struct rspamd_task *task)
+{
+ guint i;
+ struct rspamd_mime_part *part;
+
+ PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part)
+ {
+ rspamd_images_process_mime_part_maybe(task, part);
+ }
+}
+
+/*
+ * Guesses the image format from the leading magic bytes of `data`.
+ * Formats are tried in the order PNG, JPEG (JFIF or EXIF), GIF, BMP; each
+ * check requires the data to be longer than the signature being compared
+ * (longer than 10 bytes for JPEG). Returns IMAGE_TYPE_UNKNOWN if nothing
+ * matches.
+ */
+static enum rspamd_image_type
+detect_image_type(rspamd_ftok_t *data)
+{
+	if (data->len > sizeof(png_signature) / sizeof(png_signature[0]) &&
+		memcmp(data->begin, png_signature, sizeof(png_signature)) == 0) {
+		return IMAGE_TYPE_PNG;
+	}
+
+	if (data->len > 10 &&
+		memcmp(data->begin, jpg_sig1, sizeof(jpg_sig1)) == 0 &&
+		(memcmp(data->begin + 2, jpg_sig_jfif, sizeof(jpg_sig_jfif)) == 0 ||
+		 memcmp(data->begin + 2, jpg_sig_exif, sizeof(jpg_sig_exif)) == 0)) {
+		/* SOI marker followed by a JFIF or EXIF application segment */
+		return IMAGE_TYPE_JPG;
+	}
+
+	if (data->len > sizeof(gif_signature) / sizeof(gif_signature[0]) &&
+		memcmp(data->begin, gif_signature, sizeof(gif_signature)) == 0) {
+		return IMAGE_TYPE_GIF;
+	}
+
+	if (data->len > sizeof(bmp_signature) / sizeof(bmp_signature[0]) &&
+		memcmp(data->begin, bmp_signature, sizeof(bmp_signature)) == 0) {
+		return IMAGE_TYPE_BMP;
+	}
+
+	return IMAGE_TYPE_UNKNOWN;
+}
+
+
+/*
+ * Reads width and height from the IHDR chunk of a PNG image.
+ * The chunk type is expected at offset 12, followed by two big-endian
+ * 32 bit values (width, then height). Returns NULL if the data is shorter
+ * than 24 bytes or the IHDR tag is missing.
+ */
+static struct rspamd_image *
+process_png_image(rspamd_mempool_t *pool, rspamd_ftok_t *data)
+{
+	struct rspamd_image *img;
+	const guint8 *ihdr;
+	guint32 be_value;
+
+	if (data->len < 24) {
+		msg_info_pool("bad png detected (maybe striped)");
+		return NULL;
+	}
+
+	/* In png we should find iHDR section and get data from it */
+	/* Skip signature and read header section */
+	ihdr = data->begin + 12;
+
+	if (memcmp(ihdr, "IHDR", 4) != 0) {
+		msg_info_pool("png doesn't begins with IHDR section");
+		return NULL;
+	}
+
+	img = rspamd_mempool_alloc0(pool, sizeof(struct rspamd_image));
+	img->type = IMAGE_TYPE_PNG;
+	img->data = data;
+
+	/* memcpy avoids unaligned reads from the raw buffer */
+	memcpy(&be_value, ihdr + 4, sizeof(guint32));
+	img->width = ntohl(be_value);
+	memcpy(&be_value, ihdr + 8, sizeof(guint32));
+	img->height = ntohl(be_value);
+
+	return img;
+}
+
+/*
+ * Extracts the dimensions of a JPEG image from its first start-of-frame
+ * (SOF) segment.
+ *
+ * The data is scanned for `0xFF <marker>` pairs starting right after the
+ * two byte SOI signature. For a SOF marker the big-endian 16 bit height and
+ * width follow the two length bytes and the precision byte; any other
+ * segment is skipped using its declared length.
+ *
+ * Returns the image description, or NULL if no SOF segment is found.
+ */
+static struct rspamd_image *
+process_jpg_image(rspamd_mempool_t *pool, rspamd_ftok_t *data)
+{
+	const guint8 *p, *end;
+	struct rspamd_image *img;
+
+	img = rspamd_mempool_alloc0(pool, sizeof(struct rspamd_image));
+	img->type = IMAGE_TYPE_JPG;
+	img->data = data;
+
+	p = data->begin;
+	/* Leave room for the 8 bytes read after a marker byte */
+	end = p + data->len - 8;
+	p += 2;
+
+	while (p < end) {
+		if (p[0] == 0xFF && p[1] != 0xFF) {
+			guint len = p[2] * 256 + p[3];
+
+			/* Now `p` points to the marker code itself */
+			p++;
+
+			if (*p == 0xc0 || *p == 0xc1 || *p == 0xc2 || *p == 0xc3 ||
+				*p == 0xc9 || *p == 0xca || *p == 0xcb) {
+				/* Both dimensions are big-endian: high byte * 256 + low byte */
+				img->height = p[4] * 256 + p[5];
+				img->width = p[6] * 256 + p[7];
+
+				return img;
+			}
+
+			p += len;
+		}
+		else {
+			p++;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Reads width and height of a GIF image: two little-endian 16 bit values
+ * stored at offsets 6 and 8. Returns NULL if the data is shorter than
+ * 10 bytes.
+ */
+static struct rspamd_image *
+process_gif_image(rspamd_mempool_t *pool, rspamd_ftok_t *data)
+{
+	struct rspamd_image *img;
+	const guint8 *dims;
+	guint16 le_value;
+
+	if (data->len < 10) {
+		msg_info_pool("bad gif detected (maybe striped)");
+		return NULL;
+	}
+
+	img = rspamd_mempool_alloc0(pool, sizeof(struct rspamd_image));
+	img->type = IMAGE_TYPE_GIF;
+	img->data = data;
+
+	/* Dimensions follow the 6 byte signature/version */
+	dims = data->begin + 6;
+
+	memcpy(&le_value, dims, sizeof(guint16));
+	img->width = GUINT16_FROM_LE(le_value);
+	memcpy(&le_value, dims + 2, sizeof(guint16));
+	img->height = GUINT16_FROM_LE(le_value);
+
+	return img;
+}
+
+/*
+ * Reads width and height of a BMP image: two little-endian 32 bit values
+ * stored at offsets 18 and 22. Returns NULL if the data is shorter than
+ * 28 bytes.
+ */
+static struct rspamd_image *
+process_bmp_image(rspamd_mempool_t *pool, rspamd_ftok_t *data)
+{
+	struct rspamd_image *img;
+	const guint8 *dims;
+	gint32 le_value;
+
+	if (data->len < 28) {
+		msg_info_pool("bad bmp detected (maybe striped)");
+		return NULL;
+	}
+
+	img = rspamd_mempool_alloc0(pool, sizeof(struct rspamd_image));
+	img->type = IMAGE_TYPE_BMP;
+	img->data = data;
+
+	dims = data->begin + 18;
+
+	memcpy(&le_value, dims, sizeof(guint32));
+	img->width = GUINT32_FROM_LE(le_value);
+	memcpy(&le_value, dims + 4, sizeof(gint32));
+	img->height = GUINT32_FROM_LE(le_value);
+
+	return img;
+}
+
+#ifdef USABLE_GD
+/*
+ * DCT from Emil Mikulic.
+ * http://unix4lyfe.org/dct/
+ *
+ * Integer (fixed point) 8x8 DCT. The same butterfly stages are applied
+ * twice: the first loop transforms `pixels` into the intermediate `rows`
+ * table, the second loop transforms `rows` and writes the 64 rounded
+ * coefficients into `out` (as doubles, `out[i * 8 + k]`).
+ */
+static void
+rspamd_image_dct_block(gint pixels[8][8], gdouble *out)
+{
+ gint i;
+ gint rows[8][8];
+
+ /* Trigonometric constants pre-scaled to fixed point */
+ static const gint c1 = 1004 /* cos(pi/16) << 10 */,
+ s1 = 200 /* sin(pi/16) << 10 */,
+ c3 = 851 /* cos(3pi/16) << 10 */,
+ s3 = 569 /* sin(3pi/16) << 10 */,
+ r2c6 = 554 /* sqrt(2)*cos(6pi/16) << 10 */,
+ r2s6 = 1337 /* sqrt(2)*sin(6pi/16) << 10 */,
+ r2 = 181; /* sqrt(2) << 7*/
+
+ gint x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+ /* transform rows */
+ for (i = 0; i < 8; i++) {
+ x0 = pixels[0][i];
+ x1 = pixels[1][i];
+ x2 = pixels[2][i];
+ x3 = pixels[3][i];
+ x4 = pixels[4][i];
+ x5 = pixels[5][i];
+ x6 = pixels[6][i];
+ x7 = pixels[7][i];
+
+ /* Stage 1 */
+ x8 = x7 + x0;
+ x0 -= x7;
+ x7 = x1 + x6;
+ x1 -= x6;
+ x6 = x2 + x5;
+ x2 -= x5;
+ x5 = x3 + x4;
+ x3 -= x4;
+
+ /* Stage 2 */
+ x4 = x8 + x5;
+ x8 -= x5;
+ x5 = x7 + x6;
+ x7 -= x6;
+ x6 = c1 * (x1 + x2);
+ x2 = (-s1 - c1) * x2 + x6;
+ x1 = (s1 - c1) * x1 + x6;
+ x6 = c3 * (x0 + x3);
+ x3 = (-s3 - c3) * x3 + x6;
+ x0 = (s3 - c3) * x0 + x6;
+
+ /* Stage 3 */
+ x6 = x4 + x5;
+ x4 -= x5;
+ x5 = r2c6 * (x7 + x8);
+ x7 = (-r2s6 - r2c6) * x7 + x5;
+ x8 = (r2s6 - r2c6) * x8 + x5;
+ x5 = x0 + x2;
+ x0 -= x2;
+ x2 = x3 + x1;
+ x3 -= x1;
+
+ /* Stage 4 and output */
+ /* Shifts remove the fixed point scaling introduced by the constants */
+ rows[i][0] = x6;
+ rows[i][4] = x4;
+ rows[i][2] = x8 >> 10;
+ rows[i][6] = x7 >> 10;
+ rows[i][7] = (x2 - x5) >> 10;
+ rows[i][1] = (x2 + x5) >> 10;
+ rows[i][3] = (x3 * r2) >> 17;
+ rows[i][5] = (x0 * r2) >> 17;
+ }
+
+ /* transform columns */
+ for (i = 0; i < 8; i++) {
+ x0 = rows[0][i];
+ x1 = rows[1][i];
+ x2 = rows[2][i];
+ x3 = rows[3][i];
+ x4 = rows[4][i];
+ x5 = rows[5][i];
+ x6 = rows[6][i];
+ x7 = rows[7][i];
+
+ /* Stage 1 */
+ x8 = x7 + x0;
+ x0 -= x7;
+ x7 = x1 + x6;
+ x1 -= x6;
+ x6 = x2 + x5;
+ x2 -= x5;
+ x5 = x3 + x4;
+ x3 -= x4;
+
+ /* Stage 2 */
+ x4 = x8 + x5;
+ x8 -= x5;
+ x5 = x7 + x6;
+ x7 -= x6;
+ x6 = c1 * (x1 + x2);
+ x2 = (-s1 - c1) * x2 + x6;
+ x1 = (s1 - c1) * x1 + x6;
+ x6 = c3 * (x0 + x3);
+ x3 = (-s3 - c3) * x3 + x6;
+ x0 = (s3 - c3) * x0 + x6;
+
+ /* Stage 3 */
+ x6 = x4 + x5;
+ x4 -= x5;
+ x5 = r2c6 * (x7 + x8);
+ x7 = (-r2s6 - r2c6) * x7 + x5;
+ x8 = (r2s6 - r2c6) * x8 + x5;
+ x5 = x0 + x2;
+ x0 -= x2;
+ x2 = x3 + x1;
+ x3 -= x1;
+
+ /* Stage 4 and output */
+ /* The added constants round to nearest before the final shift */
+ out[i * 8] = (double) ((x6 + 16) >> 3);
+ out[i * 8 + 1] = (double) ((x4 + 16) >> 3);
+ out[i * 8 + 2] = (double) ((x8 + 16384) >> 13);
+ out[i * 8 + 3] = (double) ((x7 + 16384) >> 13);
+ out[i * 8 + 4] = (double) ((x2 - x5 + 16384) >> 13);
+ out[i * 8 + 5] = (double) ((x2 + x5 + 16384) >> 13);
+ out[i * 8 + 6] = (double) (((x3 >> 8) * r2 + 8192) >> 12);
+ out[i * 8 + 7] = (double) (((x0 >> 8) * r2 + 8192) >> 12);
+ }
+}
+
+/*
+ * Value stored in the images LRU cache: a copy of the owning mime part's
+ * digest (which also serves as the cache key) plus the packed signature bits.
+ */
+struct rspamd_image_cache_entry {
+ guchar digest[64];
+ guchar dct[RSPAMD_DCT_LEN / NBBY];
+};
+
+/* Value destructor for the images LRU cache: frees a g_malloc'ed entry */
+static void
+rspamd_image_cache_entry_dtor(gpointer p)
+{
+	g_free((struct rspamd_image_cache_entry *) p);
+}
+
+/* Hash callback for the images LRU cache: keys are fixed-size digests */
+static guint32
+rspamd_image_dct_hash(gconstpointer p)
+{
+	const gsize digest_len = rspamd_cryptobox_HASHBYTES;
+
+	return rspamd_cryptobox_fast_hash(p, digest_len, rspamd_hash_seed());
+}
+
+/* Equality callback for the images LRU cache: byte-wise digest comparison */
+static gboolean
+rspamd_image_dct_equal(gconstpointer a, gconstpointer b)
+{
+	const gsize digest_len = rspamd_cryptobox_HASHBYTES;
+
+	if (memcmp(a, b, digest_len) != 0) {
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Create the global images LRU cache sized by cfg->images_cache_size.
+ * Keys are not owned by the cache (NULL key destructor); values are released
+ * by rspamd_image_cache_entry_dtor.
+ */
+static void
+rspamd_image_create_cache(struct rspamd_config *cfg)
+{
+	images_hash = rspamd_lru_hash_new_full(
+		cfg->images_cache_size,
+		NULL,
+		rspamd_image_cache_entry_dtor,
+		rspamd_image_dct_hash,
+		rspamd_image_dct_equal);
+}
+
+/*
+ * Look up the image's parent part digest in the images cache (creating the
+ * cache on first use). On a hit the cached signature is copied into a fresh
+ * buffer owned by the task pool and the image is marked as normalized.
+ *
+ * Returns TRUE on a cache hit, FALSE otherwise.
+ */
+static gboolean
+rspamd_image_check_hash(struct rspamd_task *task, struct rspamd_image *img)
+{
+	const gsize dct_bytes = RSPAMD_DCT_LEN / NBBY;
+	struct rspamd_image_cache_entry *cached;
+
+	if (images_hash == NULL) {
+		rspamd_image_create_cache(task->cfg);
+	}
+
+	cached = rspamd_lru_hash_lookup(images_hash, img->parent->digest,
+		task->tv.tv_sec);
+
+	if (cached == NULL) {
+		return FALSE;
+	}
+
+	/* Take a private copy, as the cached entry could be destroyed by LRU */
+	img->dct = g_malloc(dct_bytes);
+	rspamd_mempool_add_destructor(task->task_pool, g_free,
+		img->dct);
+	memcpy(img->dct, cached->dct, dct_bytes);
+	img->is_normalized = TRUE;
+
+	return TRUE;
+}
+
+/*
+ * Store the signature of a normalized image in the images cache, keyed by
+ * the parent part digest, unless an entry for that digest already exists.
+ */
+static void
+rspamd_image_save_hash(struct rspamd_task *task, struct rspamd_image *img)
+{
+	struct rspamd_image_cache_entry *entry;
+
+	if (!img->is_normalized) {
+		return;
+	}
+
+	if (rspamd_lru_hash_lookup(images_hash, img->parent->digest,
+			task->tv.tv_sec)) {
+		/* Already cached */
+		return;
+	}
+
+	entry = g_malloc0(sizeof(*entry));
+	memcpy(entry->dct, img->dct, RSPAMD_DCT_LEN / NBBY);
+	memcpy(entry->digest, img->parent->digest, sizeof(entry->digest));
+
+	/* The key points inside the entry, so both live and die together */
+	rspamd_lru_hash_insert(images_hash, entry->digest, entry,
+		task->tv.tv_sec, 0);
+}
+
+#endif
+
+/*
+ * Compute a bit signature for an image (only when built with USABLE_GD;
+ * otherwise this function does nothing).
+ *
+ * Images that are empty, larger than G_MAXINT32 or cfg->max_pic_size bytes,
+ * or not larger than RSPAMD_NORMALIZED_DIM in both dimensions are skipped.
+ * A cached signature for the parent part digest is reused when available.
+ * Otherwise the image is decoded with libgd, scaled to
+ * RSPAMD_NORMALIZED_DIM x RSPAMD_NORMALIZED_DIM, converted to grayscale and
+ * processed in 8x8 tiles. On success img->dct holds RSPAMD_DCT_LEN bits
+ * (owned by the task pool) and img->is_normalized is set.
+ *
+ * @param task current task (configuration, memory pool, logging)
+ * @param img image to normalize; img->data and img->parent must be set
+ */
+void rspamd_image_normalize(struct rspamd_task *task, struct rspamd_image *img)
+{
+#ifdef USABLE_GD
+	gdImagePtr src = NULL, dst = NULL;
+	guint i, j, k, l;
+	gdouble *dct;
+
+	if (img->data->len == 0 || img->data->len > G_MAXINT32) {
+		return;
+	}
+
+	if (img->height <= RSPAMD_NORMALIZED_DIM ||
+		img->width <= RSPAMD_NORMALIZED_DIM) {
+		return;
+	}
+
+	if (img->data->len > task->cfg->max_pic_size) {
+		return;
+	}
+
+	if (rspamd_image_check_hash(task, img)) {
+		return;
+	}
+
+	switch (img->type) {
+	case IMAGE_TYPE_JPG:
+		src = gdImageCreateFromJpegPtr(img->data->len, (void *) img->data->begin);
+		break;
+	case IMAGE_TYPE_PNG:
+		src = gdImageCreateFromPngPtr(img->data->len, (void *) img->data->begin);
+		break;
+	case IMAGE_TYPE_GIF:
+		src = gdImageCreateFromGifPtr(img->data->len, (void *) img->data->begin);
+		break;
+	case IMAGE_TYPE_BMP:
+		src = gdImageCreateFromBmpPtr(img->data->len, (void *) img->data->begin);
+		break;
+	default:
+		return;
+	}
+
+	if (src == NULL) {
+		msg_info_task("cannot load image of type %s from %T",
+			rspamd_image_type_str(img->type), img->filename);
+		return;
+	}
+
+	gdImageSetInterpolationMethod(src, GD_BILINEAR_FIXED);
+	dst = gdImageScale(src, RSPAMD_NORMALIZED_DIM, RSPAMD_NORMALIZED_DIM);
+	gdImageDestroy(src);
+
+	if (dst == NULL) {
+		/* Scaling can fail; never pass NULL on to the pixel accessors */
+		msg_info_task("cannot scale image of type %s from %T",
+			rspamd_image_type_str(img->type), img->filename);
+		return;
+	}
+
+	gdImageGrayScale(dst);
+
+	img->is_normalized = TRUE;
+	dct = g_malloc0(sizeof(gdouble) * RSPAMD_DCT_LEN);
+	img->dct = g_malloc0(RSPAMD_DCT_LEN / NBBY);
+	rspamd_mempool_add_destructor(task->task_pool, g_free,
+		img->dct);
+
+	/*
+	 * Walk the scaled image in 8x8 tiles. For each tile:
+	 *  1. copy its pixels and run the block DCT, which writes 64 consecutive
+	 *     values starting at dct[i * RSPAMD_NORMALIZED_DIM + j];
+	 *  2. compute the running mean of these 64 values;
+	 *  3. set the signature bit of every value that is >= the mean.
+	 *
+	 * NOTE(review): tile bases are 8 apart in j while each tile occupies 64
+	 * consecutive slots, so the windows of neighbouring tiles overlap and
+	 * bits are only ever set, never cleared - confirm this is intended.
+	 */
+	for (i = 0; i < RSPAMD_NORMALIZED_DIM; i += 8) {
+		for (j = 0; j < RSPAMD_NORMALIZED_DIM; j += 8) {
+			const guint base = i * RSPAMD_NORMALIZED_DIM + j;
+			gint p[8][8];
+			gdouble avg = 0.0;
+
+			for (k = 0; k < 8; k++) {
+				for (l = 0; l < 8; l++) {
+					p[k][l] = gdImageGetPixel(dst, i + k, j + l);
+				}
+			}
+
+			rspamd_image_dct_block(p, dct + base);
+
+			for (k = 0; k < 8; k++) {
+				for (l = 0; l < 8; l++) {
+					gdouble x = dct[base + k * 8 + l];
+
+					/* Incremental mean over the values seen so far */
+					avg += (x - avg) / (gdouble) (k * 8 + l + 1);
+				}
+			}
+
+			for (k = 0; k < 8; k++) {
+				for (l = 0; l < 8; l++) {
+					guint idx = base + k * 8 + l;
+
+					if (dct[idx] >= avg) {
+						setbit(img->dct, idx);
+					}
+				}
+			}
+		}
+	}
+
+	gdImageDestroy(dst);
+	g_free(dct);
+	rspamd_image_save_hash(task, img);
+#endif
+}
+
+/*
+ * Detect the image format of `data` and parse its header with the matching
+ * format-specific routine. Returns NULL when the format is not recognised
+ * (or when the format-specific routine returns NULL).
+ */
+struct rspamd_image *
+rspamd_maybe_process_image(rspamd_mempool_t *pool,
+	rspamd_ftok_t *data)
+{
+	switch (detect_image_type(data)) {
+	case IMAGE_TYPE_PNG:
+		return process_png_image(pool, data);
+	case IMAGE_TYPE_JPG:
+		return process_jpg_image(pool, data);
+	case IMAGE_TYPE_GIF:
+		return process_gif_image(pool, data);
+	case IMAGE_TYPE_BMP:
+		return process_bmp_image(pool, data);
+	default:
+		/* IMAGE_TYPE_UNKNOWN or anything unexpected */
+		return NULL;
+	}
+}
+
+/*
+ * Try to parse a mime part as an image. On success the part is marked as
+ * RSPAMD_MIME_PART_IMAGE, the image is attached to it and linked back to the
+ * part, and the Content-Disposition filename (if any) is recorded.
+ *
+ * Returns true when the part was recognised as an image.
+ */
+static bool
+process_image(struct rspamd_task *task, struct rspamd_mime_part *part)
+{
+	struct rspamd_image *img =
+		rspamd_maybe_process_image(task->task_pool, &part->parsed_data);
+
+	if (img == NULL) {
+		return false;
+	}
+
+	msg_debug_images("detected %s image of size %ud x %ud",
+		rspamd_image_type_str(img->type),
+		img->width, img->height);
+
+	if (part->cd) {
+		img->filename = &part->cd->filename;
+	}
+
+	img->parent = part;
+	part->part_type = RSPAMD_MIME_PART_IMAGE;
+	part->specific.img = img;
+
+	return true;
+}
+
+/*
+ * Map an image type to its printable name; any value without a dedicated
+ * name yields "unknown".
+ */
+const gchar *
+rspamd_image_type_str(enum rspamd_image_type type)
+{
+	switch (type) {
+	case IMAGE_TYPE_PNG:
+		return "PNG";
+	case IMAGE_TYPE_JPG:
+		return "JPEG";
+	case IMAGE_TYPE_GIF:
+		return "GIF";
+	case IMAGE_TYPE_BMP:
+		return "BMP";
+	default:
+		return "unknown";
+	}
+}
+
+/*
+ * Link an image part with the HTML <img> tags that reference it by
+ * Content-Id. Surrounding angle brackets of the Content-Id are ignored.
+ * For every HTML text part containing a matching embedded image, both
+ * objects are cross-linked and missing HTML dimensions are filled in from
+ * the real image.
+ */
+static void
+rspamd_image_process_part(struct rspamd_task *task, struct rspamd_mime_part *part)
+{
+	struct rspamd_image *img = (struct rspamd_image *) part->specific.img;
+	struct rspamd_mime_header *rh;
+	struct rspamd_mime_text_part *tp;
+	struct html_image *himg;
+	const gchar *cid;
+	guint cid_len, i;
+
+	if (!img) {
+		return;
+	}
+
+	/* Check Content-Id */
+	rh = rspamd_message_get_header_from_hash(part->raw_headers,
+		"Content-Id", FALSE);
+
+	if (!rh) {
+		return;
+	}
+
+	cid = rh->decoded;
+
+	/* Strip the leading '<' */
+	if (*cid == '<') {
+		cid++;
+	}
+
+	cid_len = strlen(cid);
+
+	if (cid_len == 0) {
+		return;
+	}
+
+	/* Strip the trailing '>' */
+	if (cid[cid_len - 1] == '>') {
+		cid_len--;
+	}
+
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, tp)
+	{
+		if (!IS_TEXT_PART_HTML(tp) || tp->html == NULL) {
+			continue;
+		}
+
+		himg = rspamd_html_find_embedded_image(tp->html, cid, cid_len);
+
+		if (himg == NULL) {
+			continue;
+		}
+
+		img->html_image = himg;
+		himg->embedded_image = img;
+
+		msg_debug_images("found linked image by cid: <%s>",
+			cid);
+
+		/* Inherit real dimensions when the HTML does not specify them */
+		if (himg->height == 0) {
+			himg->height = img->height;
+		}
+
+		if (himg->width == 0) {
+			himg->width = img->width;
+		}
+	}
+}
+
+/* Link every image part of the message to the HTML parts referencing it */
+void rspamd_images_link(struct rspamd_task *task)
+{
+	struct rspamd_mime_part *cur_part;
+	guint idx;
+
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), idx, cur_part)
+	{
+		if (cur_part->part_type != RSPAMD_MIME_PART_IMAGE) {
+			continue;
+		}
+
+		rspamd_image_process_part(task, cur_part);
+	}
+} \ No newline at end of file
diff --git a/src/libmime/images.h b/src/libmime/images.h
new file mode 100644
index 0000000..bf8b3be
--- /dev/null
+++ b/src/libmime/images.h
@@ -0,0 +1,76 @@
+#ifndef IMAGES_H_
+#define IMAGES_H_
+
+#include "config.h"
+#include "fstring.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct html_image;
+struct rspamd_task;
+struct rspamd_mime_part;
+
+/* Number of bits in an image signature (rspamd_image.dct is RSPAMD_DCT_LEN / NBBY bytes) */
+#define RSPAMD_DCT_LEN (64 * 64)
+
+/*
+ * Supported image container formats; IMAGE_TYPE_UNKNOWN marks data that was
+ * not recognised as an image.
+ */
+enum rspamd_image_type {
+ IMAGE_TYPE_PNG = 0,
+ IMAGE_TYPE_JPG,
+ IMAGE_TYPE_GIF,
+ IMAGE_TYPE_BMP,
+ IMAGE_TYPE_UNKNOWN
+};
+
+/*
+ * Parsed image found in a message
+ */
+struct rspamd_image {
+ struct rspamd_mime_part *parent; /* mime part that contains the image */
+ rspamd_ftok_t *data; /* raw image bytes */
+ rspamd_ftok_t *filename; /* Content-Disposition filename, if the part has one */
+ struct html_image *html_image; /* HTML image linked by Content-Id, if any */
+ enum rspamd_image_type type;
+ guint32 width;
+ guint32 height;
+ gboolean is_normalized; /* TRUE once `dct` holds a valid signature */
+ guchar *dct; /* packed signature bits, RSPAMD_DCT_LEN / NBBY bytes */
+};
+
+/*
+ * Process images from a worker task
+ */
+void rspamd_images_process(struct rspamd_task *task);
+
+/**
+ * Process image if possible in a single mime part
+ * @param task
+ * @param part
+ * @return
+ */
+bool rspamd_images_process_mime_part_maybe(struct rspamd_task *task,
+ struct rspamd_mime_part *part);
+
+/*
+ * Link embedded images to the HTML parts
+ */
+void rspamd_images_link(struct rspamd_task *task);
+
+/**
+ * Processes image in raw data
+ * @param pool memory pool passed to the format-specific parsers
+ * @param data raw image bytes
+ * @return parsed image or NULL if the data is not a recognised image
+ */
+struct rspamd_image *rspamd_maybe_process_image(rspamd_mempool_t *pool,
+ rspamd_ftok_t *data);
+
+/*
+ * Get textual representation of an image's type
+ */
+const gchar *rspamd_image_type_str(enum rspamd_image_type type);
+
+/**
+ * Computes the image signature (`dct`) and sets `is_normalized` on success;
+ * does nothing when built without USABLE_GD
+ * @param task
+ * @param img
+ */
+void rspamd_image_normalize(struct rspamd_task *task, struct rspamd_image *img);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* IMAGES_H_ */
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
new file mode 100644
index 0000000..bdd0aad
--- /dev/null
+++ b/src/libmime/lang_detection.c
@@ -0,0 +1,2103 @@
+/*
+ * Copyright 2024 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "lang_detection.h"
+#include "lang_detection_fasttext.h"
+#include "libserver/logger.h"
+#include "libcryptobox/cryptobox.h"
+#include "libutil/multipattern.h"
+#include "ucl.h"
+#include "khash.h"
+#include "libstemmer.h"
+
+#include <glob.h>
+#include <unicode/utf8.h>
+#include <unicode/utf16.h>
+#include <unicode/ucnv.h>
+#include <unicode/uchar.h>
+#include <unicode/ustring.h>
+#include <math.h>
+
+/* Default for the `short_text_limit` option of the lang_detection section */
+static const gsize default_short_text_limit = 10;
+/* NOTE(review): consumers of the next two constants are outside this chunk */
+static const gsize default_words = 80;
+static const gdouble update_prob = 0.6;
+/* Default directory with language files, overridden by the `languages` option */
+static const gchar *default_languages_path = RSPAMD_SHAREDIR "/languages";
+
+#undef EXTRA_LANGDET_DEBUG
+
+/* Pairs a language code with the RSPAMD_UNICODE_* script flag that identifies it */
+struct rspamd_language_unicode_match {
+ const gchar *lang;
+ gint unicode_code;
+};
+
+/*
+ * List of languages detected by unicode scripts
+ */
+static const struct rspamd_language_unicode_match unicode_langs[] = {
+ {"el", RSPAMD_UNICODE_GREEK},
+ {"ml", RSPAMD_UNICODE_MALAYALAM},
+ {"te", RSPAMD_UNICODE_TELUGU},
+ {"ta", RSPAMD_UNICODE_TAMIL},
+ {"gu", RSPAMD_UNICODE_GUJARATI},
+ {"th", RSPAMD_UNICODE_THAI},
+ {"ka", RSPAMD_UNICODE_GEORGIAN},
+ {"si", RSPAMD_UNICODE_SINHALA},
+ {"hy", RSPAMD_UNICODE_ARMENIAN},
+ {"ja", RSPAMD_UNICODE_JP},
+ {"ko", RSPAMD_UNICODE_HANGUL},
+};
+
+/*
+ * Top languages: a language whose name is listed in tier0_langs gets the
+ * RS_LANGUAGE_TIER0 flag when loaded, one listed in tier1_langs gets
+ * RS_LANGUAGE_TIER1
+ */
+static const gchar *tier0_langs[] = {
+ "en",
+};
+static const gchar *tier1_langs[] = {
+ "fr", "it", "de", "es", "nl",
+ "pt", "ru", "pl", "tk", "th", "ar"};
+
+/*
+ * Script family of a language; used as an index into the per-category
+ * trigram tables and stop word matchers. RSPAMD_LANGUAGE_MAX is the number
+ * of categories.
+ */
+enum rspamd_language_category {
+ RSPAMD_LANGUAGE_LATIN = 0,
+ RSPAMD_LANGUAGE_CYRILLIC,
+ RSPAMD_LANGUAGE_DEVANAGARI,
+ RSPAMD_LANGUAGE_ARAB,
+ RSPAMD_LANGUAGE_MAX,
+};
+
+/* Description of a single language loaded from a language file */
+struct rspamd_language_elt {
+ const gchar *name; /* e.g. "en" or "ru" */
+ gint flags; /* enum rspamd_language_elt_flags */
+ enum rspamd_language_category category;
+ guint trigrams_words; /* third element of the file's `n_words` array */
+ guint stop_words; /* number of stop word patterns added for this language */
+ gdouble mean; /* mean of the frequencies listed in the file */
+ gdouble std; /* standard deviation of those frequencies */
+ guint occurrences; /* total number of parts with this language */
+};
+
+/* One language candidate for an ngramm: prob is frequency / total for that language */
+struct rspamd_ngramm_elt {
+ struct rspamd_language_elt *elt;
+ gdouble prob;
+};
+
+/* All language candidates sharing one ngramm (value type of the trigram hash) */
+struct rspamd_ngramm_chain {
+ GPtrArray *languages; /* of struct rspamd_ngramm_elt * */
+ gdouble mean; /* mean of candidates' probabilities */
+ gdouble std; /* standard deviation of candidates' probabilities */
+ gchar *utf; /* UTF-8 text of the ngramm */
+};
+
+/*
+ * Range of multipattern indices owned by one language: `start` is the
+ * pattern count before the language's stop words were added, `stop` is the
+ * count afterwards
+ */
+struct rspamd_stop_word_range {
+ guint start;
+ guint stop;
+ struct rspamd_language_elt *elt;
+};
+
+/* Stop word matcher for one language category plus the pattern -> language map */
+struct rspamd_stop_word_elt {
+ struct rspamd_multipattern *mp;
+ GArray *ranges; /* of rspamd_stop_word_range */
+};
+
+/* Debug logging for the "langdet" module; requires `task` to be in scope */
+#define msg_debug_lang_det(...) rspamd_conditional_debug_fast(NULL, NULL, \
+ rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \
+ G_STRFUNC, \
+ __VA_ARGS__)
+/* Same as above for configuration time; requires `cfg` to be in scope */
+#define msg_debug_lang_det_cfg(...) rspamd_conditional_debug_fast(NULL, NULL, \
+ rspamd_langdet_log_id, "langdet", cfg->cfg_pool->tag.uid, \
+ G_STRFUNC, \
+ __VA_ARGS__)
+
+INIT_LOG_MODULE_PUBLIC(langdet)
+
+/*
+ * Linear search for a language code in a table of unicode matches.
+ * Returns the matching entry or NULL if `key` is not listed.
+ */
+static const struct rspamd_language_unicode_match *
+rspamd_language_search_unicode_match(const gchar *key,
+	const struct rspamd_language_unicode_match *elts, size_t nelts)
+{
+	size_t idx = 0;
+
+	while (idx < nelts) {
+		const struct rspamd_language_unicode_match *candidate = &elts[idx];
+
+		if (strcmp(candidate->lang, key) == 0) {
+			return candidate;
+		}
+
+		idx++;
+	}
+
+	return NULL;
+}
+
+/* Returns TRUE if `key` equals (case-sensitively) one of the `nelts` strings */
+static gboolean
+rspamd_language_search_str(const gchar *key, const gchar *elts[], size_t nelts)
+{
+	size_t idx = 0;
+
+	while (idx < nelts) {
+		if (strcmp(elts[idx], key) == 0) {
+			return TRUE;
+		}
+
+		idx++;
+	}
+
+	return FALSE;
+}
+
+/* Hash callback for trigram keys: three consecutive UChar32 code points */
+static guint
+rspamd_trigram_hash_func(gconstpointer key)
+{
+	const gsize trigram_bytes = 3 * sizeof(UChar32);
+
+	return rspamd_cryptobox_fast_hash(key, trigram_bytes, rspamd_hash_seed());
+}
+
+/* Equality callback for trigram keys: compares three UChar32 code points */
+static gboolean
+rspamd_trigram_equal_func(gconstpointer v, gconstpointer v2)
+{
+	const gsize trigram_bytes = 3 * sizeof(UChar32);
+
+	if (memcmp(v, v2, trigram_bytes) != 0) {
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+/* Map: trigram (3 x UChar32) -> chain of candidate languages */
+KHASH_INIT(rspamd_trigram_hash, const UChar32 *, struct rspamd_ngramm_chain, true,
+ rspamd_trigram_hash_func, rspamd_trigram_equal_func);
+/* Map: language name -> detection result */
+KHASH_INIT(rspamd_candidates_hash, const gchar *,
+ struct rspamd_lang_detector_res *, true,
+ rspamd_str_hash, rspamd_str_equal);
+/* Set of stop word tokens (no values are stored) */
+KHASH_INIT(rspamd_stopwords_hash, rspamd_ftok_t *,
+ char, false,
+ rspamd_ftok_hash, rspamd_ftok_equal);
+
+/* Map: language name -> language description */
+KHASH_INIT(rspamd_languages_hash, const gchar *, struct rspamd_language_elt *, true,
+ rspamd_str_hash, rspamd_str_equal);
+/* Language detector state built at configuration time (reference counted) */
+struct rspamd_lang_detector {
+ khash_t(rspamd_languages_hash) * languages; /* loaded languages by name */
+ khash_t(rspamd_trigram_hash) * trigrams[RSPAMD_LANGUAGE_MAX]; /* trigrams frequencies */
+ struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX]; /* per category matchers */
+ khash_t(rspamd_stopwords_hash) * stop_words_norm; /* stemmed stop words of all languages */
+ UConverter *uchar_converter; /* converter used to decode UTF-8 keys */
+ gsize short_text_limit;
+ bool prefer_fasttext;
+ gsize total_occurrences; /* number of all languages found */
+ gpointer fasttext_detector;
+ ref_entry_t ref;
+};
+
+/* Lowercase `len` code points in place using ICU's u_tolower */
+static void
+rspamd_language_detector_ucs_lowercase(UChar32 *s, gsize len)
+{
+	gsize remain = len;
+
+	while (remain > 0) {
+		*s = u_tolower(*s);
+		s++;
+		remain--;
+	}
+}
+
+/*
+ * Returns TRUE when every code point is below 128 and is either an ASCII
+ * alphanumeric character or a space (an empty input yields TRUE)
+ */
+static gboolean
+rspamd_language_detector_ucs_is_latin(const UChar32 *s, gsize len)
+{
+	gsize idx;
+
+	for (idx = 0; idx < len; idx++) {
+		if (s[idx] >= 128) {
+			return FALSE;
+		}
+
+		if (!g_ascii_isalnum(s[idx]) && s[idx] != ' ') {
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/* Temporary ngramm record used while loading a language file */
+struct rspamd_language_ucs_elt {
+ guint freq; /* frequency read from the file */
+ const gchar *utf; /* original UTF-8 key */
+ UChar32 s[0]; /* decoded code points, allocated together with the struct */
+};
+
+/*
+ * Register one trigram of a language in the trigram table `htb`.
+ *
+ * If the trigram is new, a chain is created for it (its language array is
+ * released with the config pool). The language is then added to the chain
+ * with probability freq / total; if the language is already present in the
+ * chain, the probability is accumulated instead.
+ *
+ * @param cfg configuration whose pool owns all allocations
+ * @param d detector (unused here)
+ * @param lelt language the trigram belongs to
+ * @param ucs decoded trigram; ucs->s is stored as the hash key, so it must
+ *            outlive the table
+ * @param len number of code points in the ngramm; only 3 is supported
+ * @param freq trigram frequency in this language
+ * @param total sum of accepted frequencies in this language
+ * @param htb trigram table of the language's category
+ */
+static void
+rspamd_language_detector_init_ngramm(struct rspamd_config *cfg,
+	struct rspamd_lang_detector *d,
+	struct rspamd_language_elt *lelt,
+	struct rspamd_language_ucs_elt *ucs,
+	guint len,
+	guint freq,
+	guint total,
+	khash_t(rspamd_trigram_hash) * htb)
+{
+	struct rspamd_ngramm_chain *chain = NULL, st_chain;
+	struct rspamd_ngramm_elt *elt;
+	const gdouble prob = ((gdouble) freq) / ((gdouble) total);
+	khiter_t k;
+	guint i;
+	int put_ret; /* kh_put status; kept separate from the loop index */
+
+	switch (len) {
+	case 1:
+	case 2:
+		g_assert_not_reached();
+		break;
+	case 3:
+		k = kh_get(rspamd_trigram_hash, htb, ucs->s);
+		if (k != kh_end(htb)) {
+			chain = &kh_value(htb, k);
+		}
+		break;
+	default:
+		g_assert_not_reached();
+		break;
+	}
+
+	if (chain != NULL) {
+		/* Known trigram: accumulate if this language is already listed */
+		PTR_ARRAY_FOREACH(chain->languages, i, elt)
+		{
+			if (strcmp(elt->elt->name, lelt->name) == 0) {
+				elt->prob += prob;
+
+				return;
+			}
+		}
+	}
+	else {
+		/* New element */
+		chain = &st_chain;
+		memset(chain, 0, sizeof(st_chain));
+		chain->languages = g_ptr_array_sized_new(32);
+		rspamd_mempool_add_destructor(cfg->cfg_pool, rspamd_ptr_array_free_hard,
+			chain->languages);
+		chain->utf = rspamd_mempool_strdup(cfg->cfg_pool, ucs->utf);
+
+		/* The table stores a copy of the chain sharing the same array */
+		k = kh_put(rspamd_trigram_hash, htb, ucs->s, &put_ret);
+		kh_value(htb, k) = *chain;
+	}
+
+	elt = rspamd_mempool_alloc(cfg->cfg_pool, sizeof(*elt));
+	elt->elt = lelt;
+	elt->prob = prob;
+	g_ptr_array_add(chain->languages, elt);
+}
+
+/*
+ * Map unicode script flags to a language category. Cyrillic takes precedence
+ * over Devanagari, which takes precedence over Arabic; everything else is
+ * treated as Latin.
+ */
+static inline enum rspamd_language_category
+rspamd_language_detector_get_category(guint uflags)
+{
+	if (uflags & RSPAMD_UNICODE_CYRILLIC) {
+		return RSPAMD_LANGUAGE_CYRILLIC;
+	}
+
+	if (uflags & RSPAMD_UNICODE_DEVANAGARI) {
+		return RSPAMD_LANGUAGE_DEVANAGARI;
+	}
+
+	if (uflags & RSPAMD_UNICODE_ARABIC) {
+		return RSPAMD_LANGUAGE_ARAB;
+	}
+
+	return RSPAMD_LANGUAGE_LATIN;
+}
+
+/*
+ * Render the tier1/tier0/latin flags of a language as a comma separated
+ * list. The result lives in a static buffer that is overwritten by the next
+ * call.
+ */
+static const gchar *
+rspamd_language_detector_print_flags(struct rspamd_language_elt *elt)
+{
+	static gchar flags_buf[256];
+	static const struct {
+		gint flag;
+		const gchar *label;
+	} printable[] = {
+		{RS_LANGUAGE_TIER1, "tier1,"},
+		{RS_LANGUAGE_TIER0, "tier0,"},
+		{RS_LANGUAGE_LATIN, "latin,"},
+	};
+	goffset used = 0;
+	guint idx;
+
+	for (idx = 0; idx < G_N_ELEMENTS(printable); idx++) {
+		if (elt->flags & printable[idx].flag) {
+			used += rspamd_snprintf(flags_buf + used, sizeof(flags_buf) - used,
+				"%s", printable[idx].label);
+		}
+	}
+
+	/* Overwrite the trailing comma, or terminate an empty string */
+	if (used > 0) {
+		used--;
+	}
+
+	flags_buf[used] = '\0';
+
+	return flags_buf;
+}
+
+/*
+ * Comparator for an array of struct rspamd_language_ucs_elt pointers:
+ * orders elements by descending frequency.
+ *
+ * The frequencies are unsigned, so they are compared directly instead of
+ * subtracting values cast to gint, which could overflow for large counts.
+ */
+static gint
+rspamd_language_detector_cmp_ngramm(gconstpointer a, gconstpointer b)
+{
+	const struct rspamd_language_ucs_elt *e1 = *(struct rspamd_language_ucs_elt *const *) a;
+	const struct rspamd_language_ucs_elt *e2 = *(struct rspamd_language_ucs_elt *const *) b;
+
+	if (e2->freq > e1->freq) {
+		return 1;
+	}
+
+	if (e2->freq < e1->freq) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Load one language file and register the language in the detector.
+ *
+ * The file must contain a `freq` object (ngramm -> frequency), an `n_words`
+ * array of three elements and a `type` string ("latin", "cyrillic", "arab"
+ * or "devanagari"); an optional `flags` array may list "diacritics" and
+ * "ascii". The language name is the file's basename without extension.
+ * If the file cannot be parsed or one of the mandatory keys is missing or
+ * malformed, the language is skipped with a log message.
+ *
+ * @param cfg configuration whose pool owns all allocations
+ * @param d detector being populated
+ * @param path path of the language file; must contain a '/' and its
+ *             basename must contain a '.'
+ * @param stop_words optional object mapping language names to stop words
+ */
+static void
+rspamd_language_detector_read_file(struct rspamd_config *cfg,
+	struct rspamd_lang_detector *d,
+	const gchar *path,
+	const ucl_object_t *stop_words)
+{
+	struct ucl_parser *parser;
+	ucl_object_t *top;
+	const ucl_object_t *freqs, *n_words, *cur, *type, *flags;
+	ucl_object_iter_t it = NULL;
+	UErrorCode uc_err = U_ZERO_ERROR;
+	struct rspamd_language_elt *nelt;
+	struct rspamd_language_ucs_elt *ucs_elt;
+	khash_t(rspamd_trigram_hash) *htb = NULL;
+	gchar *pos;
+	guint total = 0, total_latin = 0, total_ngramms = 0, i, skipped,
+		  loaded;
+	gdouble mean = 0, std = 0, delta = 0, delta2 = 0, m2 = 0;
+	enum rspamd_language_category cat = RSPAMD_LANGUAGE_MAX;
+
+	parser = ucl_parser_new(UCL_PARSER_NO_FILEVARS);
+	if (!ucl_parser_add_file(parser, path)) {
+		msg_warn_config("cannot parse file %s: %s", path,
+			ucl_parser_get_error(parser));
+		ucl_parser_free(parser);
+
+		return;
+	}
+
+	top = ucl_parser_get_object(parser);
+	ucl_parser_free(parser);
+
+	freqs = ucl_object_lookup(top, "freq");
+
+	if (freqs == NULL) {
+		msg_warn_config("file %s has no 'freq' key", path);
+		ucl_object_unref(top);
+
+		return;
+	}
+
+	pos = strrchr(path, '/');
+	g_assert(pos != NULL);
+	nelt = rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(*nelt));
+	nelt->name = rspamd_mempool_strdup(cfg->cfg_pool, pos + 1);
+	/* Remove extension */
+	pos = strchr(nelt->name, '.');
+	g_assert(pos != NULL);
+	*pos = '\0';
+
+	n_words = ucl_object_lookup(top, "n_words");
+
+	if (n_words == NULL || ucl_object_type(n_words) != UCL_ARRAY ||
+		n_words->len != 3) {
+		msg_warn_config("cannot find n_words in language %s", nelt->name);
+		ucl_object_unref(top);
+
+		return;
+	}
+	else {
+		nelt->trigrams_words = ucl_object_toint(ucl_array_find_index(n_words,
+			2));
+	}
+
+	type = ucl_object_lookup(top, "type");
+
+	if (type == NULL || ucl_object_type(type) != UCL_STRING) {
+		msg_debug_config("cannot find type in language %s", nelt->name);
+		ucl_object_unref(top);
+
+		return;
+	}
+	else {
+		const gchar *stype = ucl_object_tostring(type);
+
+		if (strcmp(stype, "latin") == 0) {
+			cat = RSPAMD_LANGUAGE_LATIN;
+		}
+		else if (strcmp(stype, "cyrillic") == 0) {
+			cat = RSPAMD_LANGUAGE_CYRILLIC;
+		}
+		else if (strcmp(stype, "arab") == 0) {
+			cat = RSPAMD_LANGUAGE_ARAB;
+		}
+		else if (strcmp(stype, "devanagari") == 0) {
+			cat = RSPAMD_LANGUAGE_DEVANAGARI;
+		}
+		else {
+			msg_debug_config("unknown type %s of language %s", stype, nelt->name);
+			ucl_object_unref(top);
+
+			return;
+		}
+	}
+
+	flags = ucl_object_lookup(top, "flags");
+
+	if (flags != NULL && ucl_object_type(flags) == UCL_ARRAY) {
+		ucl_object_iter_t flags_it = NULL;
+		const ucl_object_t *flag;
+
+		while ((flag = ucl_object_iterate(flags, &flags_it, true)) != NULL) {
+			const gchar *fl = ucl_object_tostring(flag);
+
+			/* Non-string elements yield NULL and must not reach strcmp */
+			if (fl) {
+				if (strcmp(fl, "diacritics") == 0) {
+					nelt->flags |= RS_LANGUAGE_DIACRITICS;
+				}
+				else if (strcmp(fl, "ascii") == 0) {
+					nelt->flags |= RS_LANGUAGE_ASCII;
+				}
+				else {
+					msg_debug_config("unknown flag %s of language %s", fl, nelt->name);
+				}
+			}
+			else {
+				msg_debug_config("unknown flags type of language %s", nelt->name);
+			}
+		}
+	}
+
+	if (stop_words) {
+		const ucl_object_t *specific_stop_words;
+
+		specific_stop_words = ucl_object_lookup(stop_words, nelt->name);
+
+		if (specific_stop_words) {
+			struct sb_stemmer *stem = NULL;
+			it = NULL;
+			const ucl_object_t *w;
+			guint start, stop;
+
+			stem = sb_stemmer_new(nelt->name, "UTF_8");
+			/* Patterns [start, stop) of the category matcher belong to this language */
+			start = rspamd_multipattern_get_npatterns(d->stop_words[cat].mp);
+
+			while ((w = ucl_object_iterate(specific_stop_words, &it, true)) != NULL) {
+				gsize wlen;
+				const char *word = ucl_object_tolstring(w, &wlen);
+				const char *saved;
+				guint mp_flags = RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8;
+
+				if (rspamd_multipattern_has_hyperscan()) {
+					mp_flags |= RSPAMD_MULTIPATTERN_RE;
+				}
+
+				rspamd_multipattern_add_pattern_len(d->stop_words[cat].mp,
+					word, wlen,
+					mp_flags);
+				nelt->stop_words++;
+
+				/* Also lemmatise and store normalised */
+				if (stem) {
+					const char *nw = sb_stemmer_stem(stem, word, wlen);
+
+					if (nw) {
+						saved = nw;
+						wlen = strlen(nw);
+					}
+					else {
+						saved = word;
+					}
+				}
+				else {
+					saved = word;
+				}
+
+				if (saved) {
+					gint rc;
+					rspamd_ftok_t *tok;
+					gchar *dst;
+
+					/* Token header and its text share one pool allocation */
+					tok = rspamd_mempool_alloc(cfg->cfg_pool,
+						sizeof(*tok) + wlen + 1);
+					dst = ((gchar *) tok) + sizeof(*tok);
+					rspamd_strlcpy(dst, saved, wlen + 1);
+					tok->begin = dst;
+					tok->len = wlen;
+
+					kh_put(rspamd_stopwords_hash, d->stop_words_norm,
+						tok, &rc);
+				}
+			}
+
+			if (stem) {
+				sb_stemmer_delete(stem);
+			}
+
+			stop = rspamd_multipattern_get_npatterns(d->stop_words[cat].mp);
+
+			struct rspamd_stop_word_range r;
+
+			r.start = start;
+			r.stop = stop;
+			r.elt = nelt;
+
+			g_array_append_val(d->stop_words[cat].ranges, r);
+			it = NULL;
+		}
+	}
+
+	nelt->category = cat;
+	htb = d->trigrams[cat];
+
+	GPtrArray *ngramms;
+	guint nsym;
+
+	if (rspamd_language_search_str(nelt->name, tier1_langs,
+			G_N_ELEMENTS(tier1_langs))) {
+		nelt->flags |= RS_LANGUAGE_TIER1;
+	}
+
+	if (rspamd_language_search_str(nelt->name, tier0_langs,
+			G_N_ELEMENTS(tier0_langs))) {
+		nelt->flags |= RS_LANGUAGE_TIER0;
+	}
+
+	it = NULL;
+	ngramms = g_ptr_array_sized_new(freqs->len);
+	i = 0;
+	skipped = 0;
+	loaded = 0;
+
+	while ((cur = ucl_object_iterate(freqs, &it, true)) != NULL) {
+		const gchar *key;
+		gsize keylen;
+		guint freq;
+
+		key = ucl_object_keyl(cur, &keylen);
+		freq = ucl_object_toint(cur);
+
+		/* Running mean and sum of squared deviations over all frequencies */
+		i++;
+		delta = freq - mean;
+		mean += delta / i;
+		delta2 = freq - mean;
+		m2 += delta * delta2;
+
+		if (key != NULL) {
+			UChar32 *cur_ucs;
+			const char *end = key + keylen, *cur_utf = key;
+
+			/* A UTF-8 key never decodes to more code points than bytes */
+			ucs_elt = rspamd_mempool_alloc(cfg->cfg_pool,
+				sizeof(*ucs_elt) + (keylen + 1) * sizeof(UChar32));
+
+			cur_ucs = ucs_elt->s;
+			nsym = 0;
+			uc_err = U_ZERO_ERROR;
+
+			while (cur_utf < end) {
+				*cur_ucs++ = ucnv_getNextUChar(d->uchar_converter, &cur_utf,
+					end, &uc_err);
+				if (!U_SUCCESS(uc_err)) {
+					break;
+				}
+
+				nsym++;
+			}
+
+			if (!U_SUCCESS(uc_err)) {
+				msg_warn_config("cannot convert key %*s to unicode: %s",
+					(gint) keylen, key, u_errorName(uc_err));
+
+				continue;
+			}
+
+			ucs_elt->utf = key;
+			rspamd_language_detector_ucs_lowercase(ucs_elt->s, nsym);
+
+			/* Only trigrams are used */
+			if (nsym == 3) {
+				g_ptr_array_add(ngramms, ucs_elt);
+			}
+			else {
+				continue;
+			}
+
+			if (rspamd_language_detector_ucs_is_latin(ucs_elt->s, nsym)) {
+				total_latin++;
+			}
+
+			ucs_elt->freq = freq;
+
+			total_ngramms++;
+		}
+	}
+
+	/* Sample standard deviation needs at least two samples */
+	std = (i > 1) ? sqrt(m2 / (i - 1)) : 0.0;
+
+	if (total_latin >= total_ngramms / 3) {
+		nelt->flags |= RS_LANGUAGE_LATIN;
+	}
+
+	nsym = 3;
+
+	total = 0;
+	PTR_ARRAY_FOREACH(ngramms, i, ucs_elt)
+	{
+
+		if (!(nelt->flags & RS_LANGUAGE_LATIN) &&
+			rspamd_language_detector_ucs_is_latin(ucs_elt->s, nsym)) {
+			ucs_elt->freq = 0;
+			/* Skip latin ngramm for non-latin language to avoid garbage */
+			skipped++;
+			continue;
+		}
+
+		/* Now, discriminate low frequency ngramms */
+
+		total += ucs_elt->freq;
+		loaded++;
+	}
+
+	g_ptr_array_sort(ngramms, rspamd_language_detector_cmp_ngramm);
+
+	PTR_ARRAY_FOREACH(ngramms, i, ucs_elt)
+	{
+		if (ucs_elt->freq > 0) {
+			rspamd_language_detector_init_ngramm(cfg, d,
+				nelt, ucs_elt, nsym,
+				ucs_elt->freq, total, htb);
+		}
+	}
+
+#ifdef EXTRA_LANGDET_DEBUG
+	/* Useful for debug */
+	for (i = 0; i < 10; i++) {
+		ucs_elt = g_ptr_array_index(ngramms, i);
+
+		msg_debug_lang_det_cfg("%s -> %s: %d", nelt->name,
+			ucs_elt->utf, ucs_elt->freq);
+	}
+#endif
+
+	g_ptr_array_free(ngramms, TRUE);
+	nelt->mean = mean;
+	nelt->std = std;
+
+	msg_debug_lang_det_cfg("loaded %s language, %d trigrams, "
+						   "%d ngramms loaded; "
+						   "std=%.2f, mean=%.2f, skipped=%d, loaded=%d, stop_words=%d; "
+						   "(%s)",
+		nelt->name,
+		(gint) nelt->trigrams_words,
+		total,
+		std, mean,
+		skipped, loaded, nelt->stop_words,
+		rspamd_language_detector_print_flags(nelt));
+
+	int ret;
+	khiter_t k = kh_put(rspamd_languages_hash, d->languages, nelt->name, &ret);
+	g_assert(ret > 0); /* must be unique */
+	kh_value(d->languages, k) = nelt;
+	ucl_object_unref(top);
+}
+
+/*
+ * Returns TRUE if the UCL array `ar` contains a string element equal to
+ * `str` (compared with rspamd_strcase_equal). A NULL or empty array never
+ * matches.
+ */
+static gboolean
+rspamd_ucl_array_find_str(const gchar *str, const ucl_object_t *ar)
+{
+	ucl_object_iter_t iter = NULL;
+	const ucl_object_t *item;
+
+	if (ar == NULL || ar->len == 0) {
+		return FALSE;
+	}
+
+	for (item = ucl_object_iterate(ar, &iter, true);
+		 item != NULL;
+		 item = ucl_object_iterate(ar, &iter, true)) {
+		if (ucl_object_type(item) != UCL_STRING) {
+			continue;
+		}
+
+		if (rspamd_strcase_equal(ucl_object_tostring(item), str)) {
+			return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
+/*
+ * Post-process the candidate languages of one ngramm.
+ *
+ * Chains with more than three languages get the mean and standard deviation
+ * of their probabilities computed, and every candidate whose probability is
+ * below the mean is removed. Chains with three or fewer languages are
+ * considered distinctive and the probability of each candidate is
+ * multiplied by 4.
+ */
+static void
+rspamd_language_detector_process_chain(struct rspamd_config *cfg,
+	struct rspamd_ngramm_chain *chain)
+{
+	struct rspamd_ngramm_elt *elt;
+	guint i;
+	gdouble delta, mean = 0, delta2, m2 = 0, std;
+
+	if (chain->languages->len > 3) {
+		PTR_ARRAY_FOREACH(chain->languages, i, elt)
+		{
+			delta = elt->prob - mean;
+			mean += delta / (i + 1);
+			delta2 = elt->prob - mean;
+			m2 += delta * delta2;
+		}
+
+		/* Here i equals the number of candidates, which is > 3 */
+		std = sqrt(m2 / (i - 1));
+		chain->mean = mean;
+		chain->std = std;
+
+		/*
+		 * Now, filter elements that are lower than mean.
+		 * Walk backwards: g_ptr_array_remove_index_fast moves the last
+		 * element into the freed slot, and in a backward walk that element
+		 * has already been examined, so no candidate is skipped.
+		 */
+		for (i = chain->languages->len; i > 0; i--) {
+			elt = g_ptr_array_index(chain->languages, i - 1);
+
+			if (elt->prob < mean) {
+				g_ptr_array_remove_index_fast(chain->languages, i - 1);
+#ifdef EXTRA_LANGDET_DEBUG
+				msg_debug_lang_det_cfg("remove %s from %s; prob: %.4f; mean: %.4f, std: %.4f",
+					elt->elt->name, chain->utf, elt->prob, mean, std);
+#endif
+			}
+		}
+	}
+	else {
+		/* We have a unique ngramm, increase its weight */
+		PTR_ARRAY_FOREACH(chain->languages, i, elt)
+		{
+			elt->prob *= 4.0;
+#ifdef EXTRA_LANGDET_DEBUG
+			msg_debug_lang_det_cfg("increase weight of %s in %s; prob: %.4f",
+				elt->elt->name, chain->utf, elt->prob);
+#endif
+		}
+	}
+}
+
+/*
+ * Release everything the detector owns outside the config pool: per-category
+ * trigram tables and stop word matchers, the languages table, the normalised
+ * stop words set and the fasttext detector. A NULL detector is ignored.
+ */
+static void
+rspamd_language_detector_dtor(struct rspamd_lang_detector *d)
+{
+	guint cat;
+
+	if (d == NULL) {
+		return;
+	}
+
+	for (cat = 0; cat < RSPAMD_LANGUAGE_MAX; cat++) {
+		kh_destroy(rspamd_trigram_hash, d->trigrams[cat]);
+		rspamd_multipattern_destroy(d->stop_words[cat].mp);
+		g_array_free(d->stop_words[cat].ranges, TRUE);
+	}
+
+	if (d->languages) {
+		kh_destroy(rspamd_languages_hash, d->languages);
+	}
+
+	kh_destroy(rspamd_stopwords_hash, d->stop_words_norm);
+	rspamd_lang_detection_fasttext_destroy(d->fasttext_detector);
+}
+
+/*
+ * Build the language detector from the configuration.
+ *
+ * Options are read from the `lang_detection` section:
+ *  - languages: directory with language files (default: shared data dir)
+ *  - short_text_limit: threshold for short texts
+ *  - languages_enable / languages_disable: arrays matched against the
+ *    basenames of the language files; a file is loaded when it is not
+ *    disabled and, if an enable list is given, is listed there
+ *  - prefer_fasttext: whether fasttext is preferred (default: true)
+ *
+ * Stop words are read from `<languages>/stop_words` and every
+ * `<languages>/` file with a `.json` extension is loaded as a language.
+ *
+ * @param cfg configuration; its pool owns the detector
+ * @return new detector (released with the config pool) or NULL if no
+ *         language files could be found
+ */
+struct rspamd_lang_detector *
+rspamd_language_detector_init(struct rspamd_config *cfg)
+{
+	const ucl_object_t *section, *elt, *languages_enable = NULL,
+								   *languages_disable = NULL;
+	const gchar *languages_path = default_languages_path;
+	glob_t gl;
+	size_t i, short_text_limit = default_short_text_limit, total = 0;
+	UErrorCode uc_err = U_ZERO_ERROR;
+	GString *languages_pattern;
+	struct rspamd_ngramm_chain *chain, schain;
+	gchar *fname;
+	struct rspamd_lang_detector *ret = NULL;
+	struct ucl_parser *parser;
+	ucl_object_t *stop_words;
+	bool prefer_fasttext = true;
+
+	section = ucl_object_lookup(cfg->cfg_ucl_obj, "lang_detection");
+
+	if (section != NULL) {
+		elt = ucl_object_lookup(section, "languages");
+
+		if (elt) {
+			const gchar *custom_path = ucl_object_tostring(elt);
+
+			/* Keep the default if the option is not a string */
+			if (custom_path) {
+				languages_path = custom_path;
+			}
+		}
+
+		elt = ucl_object_lookup(section, "short_text_limit");
+
+		if (elt) {
+			short_text_limit = ucl_object_toint(elt);
+		}
+
+		languages_enable = ucl_object_lookup(section, "languages_enable");
+		languages_disable = ucl_object_lookup(section, "languages_disable");
+
+		elt = ucl_object_lookup(section, "prefer_fasttext");
+		if (elt) {
+			prefer_fasttext = ucl_object_toboolean(elt);
+		}
+	}
+
+	languages_pattern = g_string_sized_new(PATH_MAX);
+	rspamd_printf_gstring(languages_pattern, "%s/stop_words", languages_path);
+	parser = ucl_parser_new(UCL_PARSER_DEFAULT);
+
+	if (ucl_parser_add_file(parser, languages_pattern->str)) {
+		stop_words = ucl_parser_get_object(parser);
+	}
+	else {
+		msg_err_config("cannot read stop words from %s: %s",
+			languages_pattern->str,
+			ucl_parser_get_error(parser));
+		stop_words = NULL;
+	}
+
+	ucl_parser_free(parser);
+	languages_pattern->len = 0;
+
+	rspamd_printf_gstring(languages_pattern, "%s/*.json", languages_path);
+	memset(&gl, 0, sizeof(gl));
+
+	if (glob(languages_pattern->str, 0, NULL, &gl) != 0) {
+		msg_err_config("cannot read any files matching %v", languages_pattern);
+		goto end;
+	}
+
+	ret = rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(*ret));
+	ret->languages = kh_init(rspamd_languages_hash);
+	kh_resize(rspamd_languages_hash, ret->languages, gl.gl_pathc);
+	ret->uchar_converter = rspamd_get_utf8_converter();
+	ret->short_text_limit = short_text_limit;
+	ret->stop_words_norm = kh_init(rspamd_stopwords_hash);
+	ret->prefer_fasttext = prefer_fasttext;
+
+	/* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */
+	for (i = 0; i < RSPAMD_LANGUAGE_MAX; i++) {
+		ret->trigrams[i] = kh_init(rspamd_trigram_hash);
+#ifdef WITH_HYPERSCAN
+		ret->stop_words[i].mp = rspamd_multipattern_create(
+			RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8 |
+			RSPAMD_MULTIPATTERN_RE);
+#else
+		ret->stop_words[i].mp = rspamd_multipattern_create(
+			RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8);
+#endif
+
+		ret->stop_words[i].ranges = g_array_new(FALSE, FALSE,
+			sizeof(struct rspamd_stop_word_range));
+	}
+
+	g_assert(uc_err == U_ZERO_ERROR);
+
+	for (i = 0; i < gl.gl_pathc; i++) {
+		gboolean disabled, enabled;
+
+		fname = g_path_get_basename(gl.gl_pathv[i]);
+
+		/*
+		 * Both lists must agree: a disabled file is always skipped and,
+		 * when an enable list exists, unlisted files are skipped as well
+		 */
+		disabled = rspamd_ucl_array_find_str(fname, languages_disable);
+		enabled = (languages_enable == NULL ||
+				   rspamd_ucl_array_find_str(fname, languages_enable));
+
+		if (!disabled && enabled) {
+			rspamd_language_detector_read_file(cfg, ret, gl.gl_pathv[i],
+				stop_words);
+		}
+		else {
+			msg_info_config("skip language file %s: disabled", fname);
+		}
+
+		g_free(fname);
+	}
+
+	for (i = 0; i < RSPAMD_LANGUAGE_MAX; i++) {
+		GError *err = NULL;
+
+		/*
+		 * NOTE(review): each chain is processed through the local copy
+		 * `schain`; changes to the shared languages array persist, but the
+		 * mean/std written by process_chain are not stored back - confirm
+		 */
+		kh_foreach_value(ret->trigrams[i], schain, {
+			chain = &schain;
+			rspamd_language_detector_process_chain(cfg, chain);
+		});
+
+		if (!rspamd_multipattern_compile(ret->stop_words[i].mp, &err)) {
+			msg_err_config("cannot compile stop words for %z language group: %e",
+				i, err);
+			g_error_free(err);
+		}
+
+		total += kh_size(ret->trigrams[i]);
+	}
+
+	ret->fasttext_detector = rspamd_lang_detection_fasttext_init(cfg);
+	char *fasttext_status = rspamd_lang_detection_fasttext_show_info(ret->fasttext_detector);
+
+	msg_info_config("loaded %d languages, "
+					"%d trigrams; %s",
+		(gint) kh_size(ret->languages),
+		(gint) total, fasttext_status);
+	g_free(fasttext_status);
+
+	REF_INIT_RETAIN(ret, rspamd_language_detector_dtor);
+	rspamd_mempool_add_destructor(cfg->cfg_pool,
+		(rspamd_mempool_destruct_t) rspamd_language_detector_unref,
+		ret);
+
+end:
+	/* Stop words are only needed while loading; release on every path */
+	if (stop_words) {
+		ucl_object_unref(stop_words);
+	}
+
+	if (gl.gl_pathc > 0) {
+		globfree(&gl);
+	}
+
+	g_string_free(languages_pattern, TRUE);
+
+	return ret;
+}
+
+/*
+ * Picks `nwords` token indices out of `ucs_tokens` and stores them in
+ * `offsets_out[0 .. nwords - 1]`.
+ *
+ * The input is split into `nwords` consecutive slices and one index is drawn
+ * from each slice, so the sample covers different parts of the text:
+ *
+ *   nwords=2, input_len=5
+ *
+ *   w1 w2 w3   w4 w5
+ *   ^          ^
+ *   slice 1    slice 2
+ *      vv         vv
+ *      w2         w5
+ *
+ * This is not a uniform distribution over the whole input. The first slice
+ * also absorbs the remainder of the division and its pick is not filtered;
+ * picks for the remaining slices skip tokens that are shorter than two code
+ * points, flagged as exceptions, or do not start and end with a letter.
+ *
+ * @param ucs_tokens array of rspamd_stat_token_t, at least `nwords` long
+ * @param nwords number of indices to produce, must be non-zero
+ * @param offsets_out output array with room for `nwords` entries
+ * @param seed PRNG state, advanced by every draw
+ */
+static void
+rspamd_language_detector_random_select(GArray *ucs_tokens, guint nwords,
+                                       goffset *offsets_out,
+                                       guint64 *seed)
+{
+    guint step_len, remainder, i, out_idx;
+    guint64 coin, sel;
+    rspamd_stat_token_t *tok;
+
+    g_assert(nwords != 0);
+    g_assert(offsets_out != NULL);
+    g_assert(ucs_tokens->len >= nwords);
+
+    step_len = ucs_tokens->len / nwords;
+    remainder = ucs_tokens->len % nwords;
+
+    out_idx = 0;
+    coin = rspamd_random_uint64_fast_seed(seed);
+    sel = coin % (step_len + remainder);
+    /*
+     * Advance the output cursor here: the loop below runs nwords - 1 times,
+     * so without the increment it would overwrite this first pick and leave
+     * the last output slot unwritten.
+     */
+    offsets_out[out_idx++] = sel;
+
+    for (i = step_len + remainder; i < ucs_tokens->len;
+         i += step_len, out_idx++) {
+        guint ntries = 0;
+        coin = rspamd_random_uint64_fast_seed(seed);
+        sel = (coin % step_len) + i;
+
+        for (;;) {
+            tok = &g_array_index(ucs_tokens, rspamd_stat_token_t, sel);
+            /* Filter bad tokens */
+
+            if (tok->unicode.len >= 2 &&
+                !(tok->flags & RSPAMD_STAT_TOKEN_FLAG_EXCEPTION) &&
+                u_isalpha(tok->unicode.begin[0]) &&
+                u_isalpha(tok->unicode.begin[tok->unicode.len - 1])) {
+                offsets_out[out_idx] = sel;
+                break;
+            }
+            else {
+                ntries++;
+                coin = rspamd_random_uint64_fast_seed(seed);
+
+                if (ntries < step_len) {
+                    /* Retry inside the current slice first */
+                    sel = (coin % step_len) + i;
+                }
+                else if (ntries < ucs_tokens->len) {
+                    /* Slice looks hopeless: try anywhere in the input */
+                    sel = coin % ucs_tokens->len;
+                }
+                else {
+                    /* Give up and keep the last candidate, good or not */
+                    offsets_out[out_idx] = sel;
+                    break;
+                }
+            }
+        }
+    }
+}
+
+/*
+ * Fills `window` with the next n-gram of `tok` and returns the offset to pass
+ * on the following call, or -1 when the token is exhausted.
+ *
+ * For `wlen > 1` the word is treated as if it was surrounded by spaces: the
+ * first n-gram (cur_off == 0) is a space followed by the first `wlen - 1`
+ * code points, and the n-gram that runs one position past the end of the word
+ * is padded with a trailing space. For `wlen == 1` the code points are
+ * returned one by one.
+ *
+ * @param tok token whose `unicode` code points are read
+ * @param window output buffer with room for `wlen` code points
+ * @param wlen n-gram length
+ * @param cur_off offset returned by the previous call, 0 to start
+ * @return next offset or -1 if no n-gram was written
+ */
+static goffset
+rspamd_language_detector_next_ngramm(rspamd_stat_token_t *tok, UChar32 *window,
+                                     guint wlen, goffset cur_off)
+{
+    guint i;
+
+    if (wlen > 1) {
+        /* Deal with spaces at the beginning and ending */
+
+        if (cur_off == 0) {
+            if (tok->unicode.len < wlen - 1) {
+                /* Too short to fill the window without reading past the end */
+                return -1;
+            }
+
+            window[0] = (UChar32) ' ';
+
+            for (i = 0; i < wlen - 1; i++) {
+                window[i + 1] = tok->unicode.begin[i];
+            }
+        }
+        else if (cur_off + wlen == tok->unicode.len + 1) {
+            /* Add trailing space */
+            for (i = 0; i < wlen - 1; i++) {
+                window[i] = tok->unicode.begin[cur_off + i];
+            }
+            window[wlen - 1] = (UChar32) ' ';
+        }
+        else if (cur_off + wlen > tok->unicode.len + 1) {
+            /* No more fun */
+            return -1;
+        }
+        else {
+            /* Normal case */
+            for (i = 0; i < wlen; i++) {
+                window[i] = tok->unicode.begin[cur_off + i];
+            }
+        }
+    }
+    else {
+        /*
+         * Bound the read by the length of the array that is actually
+         * indexed (`unicode`), not by the length of `normalized`.
+         */
+        if (tok->unicode.len <= cur_off) {
+            return -1;
+        }
+
+        window[0] = tok->unicode.begin[cur_off];
+    }
+
+    return cur_off + 1;
+}
+
+/*
+ * Scores a single n-gram against every language known for it.
+ *
+ * `window` is looked up in `trigrams`; for each language in the matching
+ * chain whose probability is not below the chain mean, that probability is
+ * added to the language's entry in `candidates`. Entries are created on first
+ * use from the task memory pool and keyed by the language name.
+ */
+static void
+rspamd_language_detector_process_ngramm_full(struct rspamd_task *task,
+                                             struct rspamd_lang_detector *d,
+                                             UChar32 *window,
+                                             khash_t(rspamd_candidates_hash) * candidates,
+                                             khash_t(rspamd_trigram_hash) * trigrams)
+{
+    struct rspamd_ngramm_chain *found;
+    struct rspamd_ngramm_elt *lang_elt;
+    struct rspamd_lang_detector_res *res;
+    khiter_t it;
+    guint idx;
+    gint put_rc;
+    gdouble weight;
+
+    it = kh_get(rspamd_trigram_hash, trigrams, window);
+
+    if (it == kh_end(trigrams)) {
+        /* Unknown n-gram: nothing to score */
+        return;
+    }
+
+    found = &kh_value(trigrams, it);
+
+    PTR_ARRAY_FOREACH(found->languages, idx, lang_elt)
+    {
+        weight = lang_elt->prob;
+
+        if (weight < found->mean) {
+            continue;
+        }
+
+        it = kh_get(rspamd_candidates_hash, candidates, lang_elt->elt->name);
+        res = (it != kh_end(candidates)) ? kh_value(candidates, it) : NULL;
+
+#ifdef NGRAMMS_DEBUG
+        msg_err("gramm: %s, lang: %s, prob: %.3f", found->utf,
+                lang_elt->elt->name, log2(lang_elt->prob));
+#endif
+        if (res != NULL) {
+            /* Update guess */
+            res->prob += weight;
+            continue;
+        }
+
+        /* First hit for this language: register a new candidate */
+        res = rspamd_mempool_alloc(task->task_pool, sizeof(*res));
+        res->elt = lang_elt->elt;
+        res->lang = lang_elt->elt->name;
+        res->prob = weight;
+
+        it = kh_put(rspamd_candidates_hash, candidates, lang_elt->elt->name,
+                    &put_rc);
+        kh_value(candidates, it) = res;
+    }
+}
+
+/*
+ * Walks over all trigrams of a single word (including the space-padded ones
+ * at both ends) and feeds each of them to the candidates scoring.
+ */
+static void
+rspamd_language_detector_detect_word(struct rspamd_task *task,
+                                     struct rspamd_lang_detector *d,
+                                     rspamd_stat_token_t *tok,
+                                     khash_t(rspamd_candidates_hash) * candidates,
+                                     khash_t(rspamd_trigram_hash) * trigrams)
+{
+    const guint ngram_len = 3;
+    UChar32 ngram[3];
+    goffset pos;
+
+    /* Split words */
+    for (pos = rspamd_language_detector_next_ngramm(tok, ngram, ngram_len, 0);
+         pos != -1;
+         pos = rspamd_language_detector_next_ngramm(tok, ngram, ngram_len, pos)) {
+        rspamd_language_detector_process_ngramm_full(task,
+                                                     d, ngram, candidates, trigrams);
+    }
+}
+
+/* Candidates whose log2 score is below this value are discarded */
+static const gdouble cutoff_limit = -8.0;
+
+/*
+ * First filtering pass over one candidate.
+ *
+ * Replaces the accumulated score by its base-2 logarithm, marks candidates
+ * that are exactly zero or fall below `cutoff_limit` as discarded (NaN) and
+ * counts them in `*filtered`, and tracks the best surviving score in
+ * `*max_prob`. Candidates that are already NaN are left alone.
+ */
+static inline void
+rspamd_language_detector_filter_step1(struct rspamd_task *task,
+                                      struct rspamd_lang_detector_res *cand,
+                                      gdouble *max_prob, guint *filtered)
+{
+    if (isnan(cand->prob)) {
+        return;
+    }
+
+    if (cand->prob == 0) {
+        cand->prob = NAN;
+        msg_debug_lang_det(
+            "exclude language %s",
+            cand->lang);
+        (*filtered)++;
+        return;
+    }
+
+    cand->prob = log2(cand->prob);
+
+    if (cand->prob < cutoff_limit) {
+        msg_debug_lang_det(
+            "exclude language %s: %.3f, cutoff limit: %.3f",
+            cand->lang, cand->prob, cutoff_limit);
+        cand->prob = NAN;
+        (*filtered)++;
+        return;
+    }
+
+    if (cand->prob > *max_prob) {
+        *max_prob = cand->prob;
+    }
+}
+
+/*
+ * Second filtering pass over one candidate.
+ *
+ * Scores are base-2 logarithms at this point, so a gap of more than 1 to the
+ * best score means the candidate is more than two times less likely than the
+ * leader. Such candidates are marked as discarded (NaN) and counted in
+ * `*filtered`.
+ */
+static inline void
+rspamd_language_detector_filter_step2(struct rspamd_task *task,
+                                      struct rspamd_lang_detector_res *cand,
+                                      gdouble max_prob, guint *filtered)
+{
+    if (isnan(cand->prob)) {
+        return;
+    }
+
+    if (max_prob - cand->prob > 1) {
+        msg_debug_lang_det("exclude language %s: %.3f (%.3f max)",
+                           cand->lang, cand->prob, max_prob);
+        cand->prob = NAN;
+        (*filtered)++;
+    }
+}
+
+/*
+ * Runs both filtering passes over all candidates: the first one converts
+ * scores to logarithms and finds the best one, the second one drops
+ * everything that is too far behind it.
+ */
+static void
+rspamd_language_detector_filter_negligible(struct rspamd_task *task,
+                                           khash_t(rspamd_candidates_hash) * candidates)
+{
+    struct rspamd_lang_detector_res *cur;
+    gdouble best = -(G_MAXDOUBLE);
+    guint ndropped = 0;
+
+    kh_foreach_value(candidates, cur, {
+        rspamd_language_detector_filter_step1(task, cur, &best, &ndropped);
+    });
+
+    kh_foreach_value(candidates, cur, {
+        rspamd_language_detector_filter_step2(task, cur, best, &ndropped);
+    });
+
+    msg_debug_lang_det("removed %d languages", ndropped);
+}
+
+/*
+ * Samples up to `nwords` words from `words`, scores the trigrams of every
+ * sampled word of at least three code points against the trigram table of
+ * category `cat`, and finally filters the negligible candidates.
+ *
+ * The PRNG is seeded from the digest of the MIME part, so the same part always
+ * yields the same sample.
+ *
+ * @param task task used for logging and pool allocations
+ * @param nwords upper bound on the number of words to sample
+ * @param d detector holding the per-category trigram tables
+ * @param words array of rspamd_stat_token_t
+ * @param cat language category selecting the trigram table
+ * @param candidates hash that receives the scored candidates
+ * @param part text part the words belong to
+ */
+static void
+rspamd_language_detector_detect_type(struct rspamd_task *task,
+                                     guint nwords,
+                                     struct rspamd_lang_detector *d,
+                                     GArray *words,
+                                     enum rspamd_language_category cat,
+                                     khash_t(rspamd_candidates_hash) * candidates,
+                                     struct rspamd_mime_text_part *part)
+{
+    guint nparts = MIN(words->len, nwords);
+    goffset *selected_words;
+    rspamd_stat_token_t *tok;
+    guint i;
+    guint64 seed;
+
+    /*
+     * The selector asserts on an empty request, so skip the sampling
+     * entirely when there is nothing to sample from.
+     */
+    if (nparts > 0) {
+        /* Seed PRNG with part digest to provide some sort of determinism */
+        memcpy(&seed, part->mime_part->digest, sizeof(seed));
+        selected_words = g_new0(goffset, nparts);
+        rspamd_language_detector_random_select(words, nparts, selected_words, &seed);
+        msg_debug_lang_det("randomly selected %d words", nparts);
+
+        for (i = 0; i < nparts; i++) {
+            tok = &g_array_index(words, rspamd_stat_token_t,
+                                 selected_words[i]);
+
+            if (tok->unicode.len >= 3) {
+                rspamd_language_detector_detect_word(task, d, tok, candidates,
+                                                     d->trigrams[cat]);
+            }
+        }
+
+        g_free(selected_words);
+    }
+
+    /* Filter negligible candidates */
+    rspamd_language_detector_filter_negligible(task, candidates);
+}
+
+/*
+ * Array sort callback: orders candidates by descending probability.
+ * `a` and `b` point to array slots holding candidate pointers.
+ */
+static gint
+rspamd_language_detector_cmp(gconstpointer a, gconstpointer b)
+{
+    const struct rspamd_lang_detector_res *lhs =
+        *(const struct rspamd_lang_detector_res **) a;
+    const struct rspamd_lang_detector_res *rhs =
+        *(const struct rspamd_lang_detector_res **) b;
+
+    /* -1 when lhs is more probable, 1 when rhs is, 0 otherwise */
+    return (rhs->prob > lhs->prob) - (lhs->prob > rhs->prob);
+}
+
+enum rspamd_language_detected_type { /* outcome of a detection pass, by number of usable candidates */
+ rs_detect_none = 0, /* no candidate left */
+ rs_detect_single, /* exactly one candidate left */
+ rs_detect_multiple, /* more than one candidate left */
+};
+
+/*
+ * Runs the trigram detection and classifies the outcome by the number of
+ * candidates that were not discarded (i.e. whose probability is not NaN).
+ */
+static enum rspamd_language_detected_type
+rspamd_language_detector_try_ngramm(struct rspamd_task *task,
+                                    guint nwords,
+                                    struct rspamd_lang_detector *d,
+                                    GArray *ucs_tokens,
+                                    enum rspamd_language_category cat,
+                                    khash_t(rspamd_candidates_hash) * candidates,
+                                    struct rspamd_mime_text_part *part)
+{
+    struct rspamd_lang_detector_res *cur;
+    guint nalive = 0;
+
+    rspamd_language_detector_detect_type(task, nwords, d, ucs_tokens, cat,
+                                         candidates, part);
+
+    kh_foreach_value(candidates, cur, {
+        if (!isnan(cur->prob)) {
+            nalive++;
+        }
+    });
+
+    switch (nalive) {
+    case 0:
+        return rs_detect_none;
+    case 1:
+        return rs_detect_single;
+    default:
+        return rs_detect_multiple;
+    }
+}
+
+enum rspamd_language_sort_flags { /* modifiers for the heuristic candidate sorting */
+ RSPAMD_LANG_FLAG_DEFAULT = 0, /* no modifiers */
+ RSPAMD_LANG_FLAG_SHORT = 1 << 0, /* short text: the tier bonuses are multiplied */
+};
+
+struct rspamd_frequency_sort_cbdata { /* user data passed to the heuristic sort comparator */
+ struct rspamd_lang_detector *d; /* detector, source of total_occurrences */
+ enum rspamd_language_sort_flags flags; /* sorting modifiers */
+ gdouble std; /* spread of candidate scores; every bonus is scaled by it */
+ gdouble mean; /* mean of candidate scores */
+};
+
+static const gdouble tier0_adjustment = 1.2; /* bonus factor for RS_LANGUAGE_TIER0 languages */
+static const gdouble tier1_adjustment = 0.8; /* bonus factor for RS_LANGUAGE_TIER1 languages */
+static const gdouble frequency_adjustment = 0.8; /* bonus factor applied to the observed language frequency */
+
+/*
+ * Returns the score of `cand` with the heuristic bonuses added on top.
+ *
+ * All bonuses are scaled by `cbd->std`:
+ *  - frequency bonus (only when `use_freq` is set): frequency_adjustment * freq
+ *  - tier1 bonus: tier1_adjustment, doubled for short texts
+ *  - tier0 bonus: tier0_adjustment, multiplied by 16 for short texts
+ */
+static inline gdouble
+rspamd_language_detector_boosted_prob(const struct rspamd_lang_detector_res *cand,
+                                      const struct rspamd_frequency_sort_cbdata *cbd,
+                                      gboolean use_freq, gdouble freq)
+{
+    const gboolean is_short = (cbd->flags & RSPAMD_LANG_FLAG_SHORT) != 0;
+    gdouble boosted = cand->prob;
+
+    if (use_freq) {
+        boosted += cbd->std * (frequency_adjustment * freq);
+    }
+
+    if (cand->elt->flags & RS_LANGUAGE_TIER1) {
+        boosted += cbd->std * (is_short ? tier1_adjustment * 2.0 : tier1_adjustment);
+    }
+
+    if (cand->elt->flags & RS_LANGUAGE_TIER0) {
+        boosted += cbd->std * (is_short ? tier0_adjustment * 16.0 : tier0_adjustment);
+    }
+
+    return boosted;
+}
+
+/*
+ * Sort callback that orders candidates by descending score after applying
+ * frequency and tier bonuses. `ud` is a struct rspamd_frequency_sort_cbdata.
+ *
+ * When the detector has no occurrence statistics yet, the plain comparison is
+ * used instead. The frequency bonus is applied only if both frequencies are
+ * normal floating point values.
+ *
+ * NOTE(review): the boosted scores are written back to the candidates, so a
+ * candidate gets the bonuses added again on every comparison it takes part
+ * in. Avoiding that needs cooperation from the caller.
+ */
+static gint
+rspamd_language_detector_cmp_heuristic(gconstpointer a, gconstpointer b,
+                                       gpointer ud)
+{
+    struct rspamd_frequency_sort_cbdata *cbd = ud;
+    struct rspamd_lang_detector_res *lhs = *(struct rspamd_lang_detector_res **) a;
+    struct rspamd_lang_detector_res *rhs = *(struct rspamd_lang_detector_res **) b;
+    gdouble freq_lhs, freq_rhs, boosted_lhs, boosted_rhs;
+    gboolean use_freq;
+
+    if (cbd->d->total_occurrences == 0) {
+        /* Not enough data, compare directly */
+        return rspamd_language_detector_cmp(a, b);
+    }
+
+    freq_lhs = ((gdouble) lhs->elt->occurrences) /
+               (gdouble) cbd->d->total_occurrences;
+    freq_rhs = ((gdouble) rhs->elt->occurrences) /
+               (gdouble) cbd->d->total_occurrences;
+    use_freq = isnormal(freq_lhs) && isnormal(freq_rhs);
+
+    boosted_lhs = rspamd_language_detector_boosted_prob(lhs, cbd, use_freq, freq_lhs);
+    boosted_rhs = rspamd_language_detector_boosted_prob(rhs, cbd, use_freq, freq_rhs);
+
+    /* Hack: adjust probability directly */
+    lhs->prob = boosted_lhs;
+    rhs->prob = boosted_rhs;
+
+    return (boosted_rhs > boosted_lhs) - (boosted_lhs > boosted_rhs);
+}
+
+/*
+ * Scans the stripped UTF-8 content of a text part and records which Unicode
+ * scripts occur in it.
+ *
+ * Every alphabetic code point is mapped to its Unicode block; the matching
+ * RSPAMD_UNICODE_* flag is OR-ed into `part->unicode_scripts` and one of three
+ * counters is bumped: latin, chinese (CJK blocks) or special (all other
+ * recognised blocks). The scan stops at the first invalid sequence, or early
+ * once enough non-latin letters were seen.
+ *
+ * @param task task used for logging
+ * @param part text part; `utf_stripped_content` is read, `unicode_scripts` is
+ *        updated
+ * @param pchinese receives the number of CJK letters seen
+ * @param pspecial receives the number of letters from other non-latin blocks
+ */
+static void
+rspamd_language_detector_unicode_scripts(struct rspamd_task *task,
+                                         struct rspamd_mime_text_part *part,
+                                         guint *pchinese,
+                                         guint *pspecial)
+{
+    const gchar *p = part->utf_stripped_content->data, *end;
+    guint i = 0, cnt = 0;
+    end = p + part->utf_stripped_content->len;
+    gint32 uc, sc;
+    guint nlatin = 0, nchinese = 0, nspecial = 0;
+    /* Number of non-latin letters after which the scan may stop early */
+    const guint script_cutoff = 32;
+
+    while (p + i < end) {
+        U8_NEXT(p, i, part->utf_stripped_content->len, uc);
+
+        if (((gint32) uc) < 0) {
+            /* Invalid UTF-8 sequence */
+            break;
+        }
+
+        if (u_isalpha(uc)) {
+            sc = ublock_getCode(uc);
+            cnt++;
+
+            switch (sc) {
+            case UBLOCK_BASIC_LATIN:
+            case UBLOCK_LATIN_1_SUPPLEMENT:
+                part->unicode_scripts |= RSPAMD_UNICODE_LATIN;
+                nlatin++;
+                break;
+            case UBLOCK_HEBREW:
+                part->unicode_scripts |= RSPAMD_UNICODE_HEBREW;
+                nspecial++;
+                break;
+            case UBLOCK_GREEK:
+                part->unicode_scripts |= RSPAMD_UNICODE_GREEK;
+                nspecial++;
+                break;
+            case UBLOCK_CYRILLIC:
+                part->unicode_scripts |= RSPAMD_UNICODE_CYRILLIC;
+                nspecial++;
+                break;
+            case UBLOCK_CJK_UNIFIED_IDEOGRAPHS:
+            case UBLOCK_CJK_COMPATIBILITY:
+            case UBLOCK_CJK_RADICALS_SUPPLEMENT:
+            case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
+            case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
+                part->unicode_scripts |= RSPAMD_UNICODE_CJK;
+                nchinese++;
+                break;
+            case UBLOCK_HIRAGANA:
+            case UBLOCK_KATAKANA:
+                part->unicode_scripts |= RSPAMD_UNICODE_JP;
+                nspecial++;
+                break;
+            case UBLOCK_HANGUL_JAMO:
+            case UBLOCK_HANGUL_COMPATIBILITY_JAMO:
+                part->unicode_scripts |= RSPAMD_UNICODE_HANGUL;
+                nspecial++;
+                break;
+            case UBLOCK_ARABIC:
+                part->unicode_scripts |= RSPAMD_UNICODE_ARABIC;
+                nspecial++;
+                break;
+            case UBLOCK_DEVANAGARI:
+                part->unicode_scripts |= RSPAMD_UNICODE_DEVANAGARI;
+                nspecial++;
+                break;
+            case UBLOCK_ARMENIAN:
+                part->unicode_scripts |= RSPAMD_UNICODE_ARMENIAN;
+                nspecial++;
+                break;
+            case UBLOCK_GEORGIAN:
+                part->unicode_scripts |= RSPAMD_UNICODE_GEORGIAN;
+                nspecial++;
+                break;
+            case UBLOCK_GUJARATI:
+                part->unicode_scripts |= RSPAMD_UNICODE_GUJARATI;
+                nspecial++;
+                break;
+            case UBLOCK_TELUGU:
+                part->unicode_scripts |= RSPAMD_UNICODE_TELUGU;
+                nspecial++;
+                break;
+            case UBLOCK_TAMIL:
+                part->unicode_scripts |= RSPAMD_UNICODE_TAMIL;
+                nspecial++;
+                break;
+            case UBLOCK_THAI:
+                part->unicode_scripts |= RSPAMD_UNICODE_THAI;
+                nspecial++;
+                break;
+            /*
+             * The labels must be ICU block codes: `sc` comes from
+             * ublock_getCode(), so the RSPAMD_UNICODE_* flag bits can
+             * never match it.
+             */
+            case UBLOCK_MALAYALAM:
+                part->unicode_scripts |= RSPAMD_UNICODE_MALAYALAM;
+                nspecial++;
+                break;
+            case UBLOCK_SINHALA:
+                part->unicode_scripts |= RSPAMD_UNICODE_SINHALA;
+                nspecial++;
+                break;
+            default:
+                /* Letter from a block we do not track */
+                break;
+            }
+        }
+
+        if (nspecial > script_cutoff && nspecial > nlatin) {
+            break;
+        }
+        else if (nchinese > script_cutoff && nchinese > nlatin) {
+            if (nspecial > 0) {
+                /* Likely japanese */
+                break;
+            }
+        }
+    }
+
+    msg_debug_lang_det("stop after checking %d characters, "
+                       "%d latin, %d special, %d chinese",
+                       cnt, nlatin, nspecial, nchinese);
+
+    *pchinese = nchinese;
+    *pspecial = nspecial;
+}
+
+/*
+ * Records `code` as the language of `part`: a result with probability 1.0 is
+ * allocated from the task pool and appended to `part->languages` (created on
+ * demand), and `part->language` is pointed at `code`. `elt` may be NULL.
+ */
+static inline void
+rspamd_language_detector_set_language(struct rspamd_task *task,
+                                      struct rspamd_mime_text_part *part,
+                                      const gchar *code,
+                                      struct rspamd_language_elt *elt)
+{
+    struct rspamd_lang_detector_res *res =
+        rspamd_mempool_alloc0(task->task_pool, sizeof(*res));
+
+    res->elt = elt;
+    res->lang = code;
+    res->prob = 1.0;
+
+    if (!part->languages) {
+        part->languages = g_ptr_array_sized_new(1);
+    }
+
+    part->language = code;
+    g_ptr_array_add(part->languages, res);
+}
+
+/*
+ * Tries to pick the language from the Unicode scripts found in the part.
+ *
+ * The first entry of `unicode_langs` whose script is present wins. The
+ * Japanese entry is a special case: it is taken only when there are at most 5
+ * CJK letters or fewer than five CJK letters per kana/other special letter;
+ * otherwise the text is considered too ideograph-heavy for Japanese and the
+ * search goes on. If nothing matched but CJK ideographs are present, the
+ * language is set to "zh-CN".
+ *
+ * @return TRUE if a language was set
+ */
+static gboolean
+rspamd_language_detector_try_uniscript(struct rspamd_task *task,
+                                       struct rspamd_mime_text_part *part,
+                                       guint nchinese,
+                                       guint nspecial)
+{
+    guint idx;
+
+    for (idx = 0; idx < G_N_ELEMENTS(unicode_langs); idx++) {
+        if (!(unicode_langs[idx].unicode_code & part->unicode_scripts)) {
+            continue;
+        }
+
+        /* Japanese <-> Chinese guess */
+        if (unicode_langs[idx].unicode_code == RSPAMD_UNICODE_JP &&
+            nchinese > 5 && nchinese >= nspecial * 5) {
+            continue;
+        }
+
+        msg_debug_lang_det("set language based on unicode script %s",
+                           unicode_langs[idx].lang);
+        rspamd_language_detector_set_language(task, part,
+                                              unicode_langs[idx].lang, NULL);
+
+        return TRUE;
+    }
+
+    if (!(part->unicode_scripts & RSPAMD_UNICODE_CJK)) {
+        return FALSE;
+    }
+
+    msg_debug_lang_det("guess chinese based on CJK characters: %d chinese, %d special",
+                       nchinese, nspecial);
+    rspamd_language_detector_set_language(task, part,
+                                          "zh-CN", NULL);
+
+    return TRUE;
+}
+
+/* Hash callback for language elements: hashes the element's name */
+static guint
+rspamd_langelt_hash_func(gconstpointer key)
+{
+    const struct rspamd_language_elt *lang = (const struct rspamd_language_elt *) key;
+    const gsize name_len = strlen(lang->name);
+
+    return rspamd_cryptobox_fast_hash(lang->name, name_len,
+                                      rspamd_hash_seed());
+}
+
+/* Equality callback for language elements: two elements match if their names do */
+static gboolean
+rspamd_langelt_equal_func(gconstpointer v, gconstpointer v2)
+{
+    const struct rspamd_language_elt *lhs = (const struct rspamd_language_elt *) v;
+    const struct rspamd_language_elt *rhs = (const struct rspamd_language_elt *) v2;
+    const int diff = strcmp(lhs->name, rhs->name);
+
+    return diff == 0;
+}
+
+/*
+ * This hash set stores a word index in the language to avoid duplicate stop
+ * words: the key is the number of the matched pattern, there are no values.
+ */
+KHASH_INIT(rspamd_sw_res_set, int, char, 0, kh_int_hash_func, kh_int_hash_equal);
+
+KHASH_INIT(rspamd_sw_hash, struct rspamd_language_elt *, khash_t(rspamd_sw_res_set) *, 1,
+ rspamd_langelt_hash_func, rspamd_langelt_equal_func); /* language element (compared by name) -> set of its matched stop words */
+
+struct rspamd_sw_cbdata { /* context handed to the stop words match callback */
+ struct rspamd_task *task; /* task the callback picks up before logging */
+ khash_t(rspamd_sw_hash) * res; /* per-language sets of matched stop words, filled by the callback */
+ GArray *ranges; /* rspamd_stop_word_range array, searched by pattern number to find the owning language */
+};
+
+/*
+ * bsearch() callback: `k` is a pattern number packed into a pointer, `memb`
+ * is a stop word range. A number matches a range when it lies in
+ * [start, stop).
+ */
+static gint
+rspamd_ranges_cmp(const void *k, const void *memb)
+{
+    const struct rspamd_stop_word_range *range = (struct rspamd_stop_word_range *) memb;
+    gint needle = GPOINTER_TO_INT(k);
+
+    if (needle < range->start) {
+        return -1;
+    }
+
+    if (needle >= range->stop) {
+        return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Multipattern match callback for stop words.
+ *
+ * A match counts only if it sits on word boundaries, i.e. the bytes just
+ * before and just after it (when present) are ASCII whitespace or
+ * punctuation. The language owning pattern `strnum` is found through the
+ * ranges array, and the pattern number is added to that language's set of
+ * matched stop words in `cbdata->res`.
+ *
+ * @return 1 once a language already known to the result hash has more than
+ *         80 distinct stop words, 0 otherwise
+ */
+static gint
+rspamd_language_detector_sw_cb(struct rspamd_multipattern *mp,
+                               guint strnum,
+                               gint match_start,
+                               gint match_pos,
+                               const gchar *text,
+                               gsize len,
+                               void *context)
+{
+    static const gsize max_stop_words = 80;
+    struct rspamd_sw_cbdata *cbdata = (struct rspamd_sw_cbdata *) context;
+    const gchar *prev = text, *next = text + len;
+    struct rspamd_stop_word_range *r;
+    struct rspamd_task *task;
+    khash_t(rspamd_sw_res_set) * seen;
+    khiter_t k;
+    gint nwords = 1;
+    gint put_rc;
+
+    /* Check if boundary */
+    if (match_start > 0) {
+        prev = text + match_start - 1;
+
+        if (!g_ascii_isspace(*prev) && !g_ascii_ispunct(*prev)) {
+            return 0;
+        }
+    }
+
+    if (match_pos < len) {
+        next = text + match_pos;
+
+        if (!g_ascii_isspace(*next) && !g_ascii_ispunct(*next)) {
+            return 0;
+        }
+    }
+
+    /* We have a word on the boundary, check range */
+    task = cbdata->task;
+    r = bsearch(GINT_TO_POINTER(strnum), cbdata->ranges->data,
+                cbdata->ranges->len, sizeof(*r), rspamd_ranges_cmp);
+
+    g_assert(r != NULL);
+
+    k = kh_get(rspamd_sw_hash, cbdata->res, r->elt);
+
+    if (k == kh_end(cbdata->res)) {
+        /* First stop word of this language: create its set */
+        k = kh_put(rspamd_sw_hash, cbdata->res, r->elt, &put_rc);
+        seen = kh_init(rspamd_sw_res_set);
+        kh_value(cbdata->res, k) = seen;
+        kh_put(rspamd_sw_res_set, seen, strnum, &put_rc);
+
+        msg_debug_lang_det("found new word %*s from %s language (%d stop words found so far)",
+                           (int) (next - prev - 1), prev + 1, r->elt->name, nwords);
+
+        return 0;
+    }
+
+    seen = kh_value(cbdata->res, k);
+    /* Count taken before the possible insertion below */
+    nwords = kh_size(seen);
+
+    if (kh_get(rspamd_sw_res_set, seen, strnum) == kh_end(seen)) {
+        /* New word */
+        kh_put(rspamd_sw_res_set, seen, strnum, &put_rc);
+        msg_debug_lang_det("found new word %*s from %s language (%d stop words found so far)",
+                           (int) (next - prev - 1), prev + 1, r->elt->name, nwords);
+    }
+
+    if (nwords > max_stop_words) {
+        return 1;
+    }
+
+    return 0;
+}
+/*
+ * Tries to detect the language of a text part from the stop words it contains.
+ *
+ * The stop words multipattern of category `cat` is run over the stripped
+ * UTF-8 content; the callback collects, per language, the set of distinct stop
+ * words that matched. The language with the highest ratio of matched stop
+ * words to the total number of its stop words is then selected, provided it
+ * has at least `stop_words_threshold` matches.
+ *
+ * Returns TRUE if a language was set on the part, FALSE otherwise.
+ */
+static gboolean
+rspamd_language_detector_try_stop_words(struct rspamd_task *task,
+ struct rspamd_lang_detector *d,
+ struct rspamd_mime_text_part *part,
+ enum rspamd_language_category cat)
+{
+ struct rspamd_stop_word_elt *elt;
+ struct rspamd_sw_cbdata cbdata;
+ gboolean ret = FALSE;
+ static const int stop_words_threshold = 4, /* minimum stop words count */
+ strong_confidence_threshold = 10 /* we are sure that this is enough */;
+
+ elt = &d->stop_words[cat];
+ cbdata.res = kh_init(rspamd_sw_hash);
+ cbdata.ranges = elt->ranges;
+ cbdata.task = task;
+
+ rspamd_multipattern_lookup(elt->mp, part->utf_stripped_content->data,
+ part->utf_stripped_content->len, rspamd_language_detector_sw_cb,
+ &cbdata, NULL);
+
+ if (kh_size(cbdata.res) > 0) {
+ khash_t(rspamd_sw_res_set) * cur_res;
+ double max_rate = G_MINDOUBLE;
+ struct rspamd_language_elt *cur_lang, *sel = NULL;
+ gboolean ignore_ascii = FALSE, ignore_latin = FALSE;
+
+ again: /* the scan restarts from here whenever one of the ignore flags gets set */
+ kh_foreach(cbdata.res, cur_lang, cur_res, {
+ int cur_matches = kh_size(cur_res); /* distinct stop words matched for this language */
+
+ if (!ignore_ascii && (cur_lang->flags & RS_LANGUAGE_DIACRITICS)) {
+ /*
+ * Restart matches: a language with diacritics is among the
+ * results, so drop what was selected so far and rescan with
+ * ignore_ascii set
+ */
+ ignore_ascii = TRUE;
+ sel = NULL;
+ max_rate = G_MINDOUBLE;
+ msg_debug_lang_det("ignore ascii after finding %d stop words from %s",
+ cur_matches, cur_lang->name);
+ goto again;
+ }
+
+ if (!ignore_latin && cur_lang->category != RSPAMD_LANGUAGE_LATIN) {
+ /*
+ * Restart matches: a non-latin language is among the results,
+ * so drop what was selected so far and rescan with
+ * ignore_latin set
+ */
+ ignore_latin = TRUE;
+ sel = NULL;
+ max_rate = G_MINDOUBLE;
+ msg_debug_lang_det("ignore latin after finding stop %d words from %s",
+ cur_matches, cur_lang->name);
+ goto again;
+ }
+
+ if (cur_matches < stop_words_threshold) {
+ continue;
+ }
+
+ if (cur_matches < strong_confidence_threshold) {
+ /* Ignore mixed languages when not enough confidence */
+ if (ignore_ascii && (cur_lang->flags & RS_LANGUAGE_ASCII)) {
+ continue;
+ }
+
+ if (ignore_latin && cur_lang->category == RSPAMD_LANGUAGE_LATIN) {
+ continue;
+ }
+ }
+
+ double rate = (double) cur_matches / (double) cur_lang->stop_words; /* share of the language's stop words that matched */
+
+ if (rate > max_rate) {
+ max_rate = rate;
+ sel = cur_lang;
+ }
+
+ msg_debug_lang_det("found %d stop words from %s: %3f rate",
+ cur_matches, cur_lang->name, rate);
+ });
+
+ /* Cleanup: the per-language sets were created by the match callback */
+ kh_foreach(cbdata.res, cur_lang, cur_res, {
+ kh_destroy(rspamd_sw_res_set, cur_res);
+ });
+
+ if (max_rate > 0 && sel) {
+ msg_debug_lang_det("set language based on stop words script %s, %.3f found",
+ sel->name, max_rate);
+ rspamd_language_detector_set_language(task, part,
+ sel->name, sel);
+
+ ret = TRUE;
+ }
+ }
+ else {
+ msg_debug_lang_det("found no stop words in a text");
+ }
+
+ kh_destroy(rspamd_sw_hash, cbdata.res);
+
+ return ret;
+}
+
+/*
+ * Detects the language(s) of a text part and stores the result in
+ * `part->languages` / `part->language`.
+ *
+ * The stages are tried in this order:
+ *  1. Unicode scripts and stop words heuristics (skipped when fasttext is
+ *     enabled and preferred);
+ *  2. fasttext prediction, if the fasttext detector is enabled;
+ *  3. for short texts, a default language chosen by the script category;
+ *     otherwise trigram statistics, optionally sorted with the frequency
+ *     heuristic.
+ * If nothing produced a language, "en" is set as a fallback.
+ *
+ * Unless the frequency heuristic was applied, the occurrence counters of the
+ * winning language and of the detector are incremented.
+ *
+ * NOTE(review): the short text check uses `default_short_text_limit`; confirm
+ * whether the configured `short_text_limit` stored in the detector was meant
+ * to be used here instead.
+ *
+ * @param task task used for logging and pool allocations
+ * @param d language detector
+ * @param part text part to analyse
+ * @return TRUE if a language was detected by one of the stages, FALSE if the
+ *         part has no stripped content or only a fallback was set
+ */
+gboolean
+rspamd_language_detector_detect(struct rspamd_task *task,
+                                struct rspamd_lang_detector *d,
+                                struct rspamd_mime_text_part *part)
+{
+    khash_t(rspamd_candidates_hash) * candidates;
+    GPtrArray *result;
+    gdouble mean, std, start_ticks, end_ticks;
+    guint cand_len;
+    enum rspamd_language_category cat;
+    struct rspamd_lang_detector_res *cand;
+    enum rspamd_language_detected_type r;
+    struct rspamd_frequency_sort_cbdata cbd;
+    /* Check if we have sorted candidates based on frequency */
+    gboolean frequency_heuristic_applied = FALSE, ret = FALSE;
+
+    if (!part->utf_stripped_content) {
+        return FALSE;
+    }
+
+    start_ticks = rspamd_get_ticks(TRUE);
+
+    guint nchinese = 0, nspecial = 0;
+    rspamd_language_detector_unicode_scripts(task, part, &nchinese, &nspecial);
+
+    /*
+     * The category is needed by the short text and trigram fallbacks below
+     * even when the internal heuristics are skipped, so compute it
+     * unconditionally instead of leaving it uninitialised on that path.
+     */
+    cat = rspamd_language_detector_get_category(part->unicode_scripts);
+
+    /* Disable internal language detection heuristics if we have fasttext */
+    if (!rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector) || !d->prefer_fasttext) {
+        /* Apply unicode scripts heuristic */
+        if (rspamd_language_detector_try_uniscript(task, part, nchinese, nspecial)) {
+            ret = TRUE;
+        }
+
+        if (!ret && rspamd_language_detector_try_stop_words(task, d, part, cat)) {
+            ret = TRUE;
+        }
+    }
+
+    if (!ret) {
+        unsigned ndetected = 0;
+        if (rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) {
+            rspamd_fasttext_predict_result_t fasttext_predict_result =
+                rspamd_lang_detection_fasttext_detect(d->fasttext_detector, task,
+                                                      part->utf_words, 4);
+
+            ndetected = rspamd_lang_detection_fasttext_get_nlangs(fasttext_predict_result);
+
+            if (ndetected > 0) {
+                candidates = kh_init(rspamd_candidates_hash);
+                kh_resize(rspamd_candidates_hash, candidates, ndetected);
+
+                /* Now fill all results where probability is above threshold */
+                float max_prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, 0);
+
+                for (unsigned int i = 0; i < ndetected; i++) {
+                    float prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, i);
+                    if (prob > max_prob * 0.75) {
+                        char *lang = rspamd_mempool_strdup(task->task_pool,
+                                                           rspamd_lang_detection_fasttext_get_lang(fasttext_predict_result, i));
+                        int tmp;
+                        khiter_t k = kh_put(rspamd_candidates_hash, candidates, lang, &tmp);
+
+                        kh_value(candidates, k) = rspamd_mempool_alloc0(task->task_pool, sizeof(*cand));
+                        cand = kh_value(candidates, k);
+                        cand->lang = lang;
+                        cand->prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, i);
+
+                        /* Find the corresponding language elt */
+                        k = kh_get(rspamd_languages_hash, d->languages, lang);
+                        if (k != kh_end(d->languages)) {
+                            cand->elt = kh_value(d->languages, k);
+                        }
+                    }
+                }
+
+                if (kh_size(candidates) == 1) {
+                    r = rs_detect_single;
+                }
+                else if (kh_size(candidates) > 1) {
+                    r = rs_detect_multiple;
+                }
+                else {
+                    r = rs_detect_none;
+                }
+            }
+
+            rspamd_fasttext_predict_result_destroy(fasttext_predict_result);
+        }
+        if (ndetected == 0) {
+            if (part->utf_words->len < default_short_text_limit) {
+                r = rs_detect_none;
+                msg_debug_lang_det("text is too short for trigrams detection: "
+                                   "%d words; at least %d words required",
+                                   (int) part->utf_words->len,
+                                   (int) default_short_text_limit);
+                switch (cat) {
+                case RSPAMD_LANGUAGE_CYRILLIC:
+                    rspamd_language_detector_set_language(task, part, "ru", NULL);
+                    break;
+                case RSPAMD_LANGUAGE_DEVANAGARI:
+                    rspamd_language_detector_set_language(task, part, "hi", NULL);
+                    break;
+                case RSPAMD_LANGUAGE_ARAB:
+                    rspamd_language_detector_set_language(task, part, "ar", NULL);
+                    break;
+                default:
+                case RSPAMD_LANGUAGE_LATIN:
+                    rspamd_language_detector_set_language(task, part, "en", NULL);
+                    break;
+                }
+                msg_debug_lang_det("set %s language based on symbols category",
+                                   part->language);
+
+                /* Empty hash, only so that the common cleanup below works */
+                candidates = kh_init(rspamd_candidates_hash);
+            }
+            else {
+                candidates = kh_init(rspamd_candidates_hash);
+                kh_resize(rspamd_candidates_hash, candidates, 32);
+
+                r = rspamd_language_detector_try_ngramm(task,
+                                                        default_words,
+                                                        d,
+                                                        part->utf_words,
+                                                        cat,
+                                                        candidates,
+                                                        part);
+
+                if (r == rs_detect_none) {
+                    msg_debug_lang_det("no trigrams found, fallback to english");
+                    rspamd_language_detector_set_language(task, part, "en", NULL);
+                }
+                else if (r == rs_detect_multiple) {
+                    /* Check our guess */
+
+                    mean = 0.0;
+                    std = 0.0;
+                    cand_len = 0;
+
+                    /* Check distribution */
+                    kh_foreach_value(candidates, cand, {
+                        if (!isnan(cand->prob)) {
+                            mean += cand->prob;
+                            cand_len++;
+                        }
+                    });
+
+                    if (cand_len > 0) {
+                        mean /= cand_len;
+
+                        /* `std` is the mean absolute deviation from the mean */
+                        kh_foreach_value(candidates, cand, {
+                            gdouble err;
+                            if (!isnan(cand->prob)) {
+                                err = cand->prob - mean;
+                                std += fabs(err);
+                            }
+                        });
+
+                        std /= cand_len;
+                    }
+
+                    msg_debug_lang_det("trigrams checked, %d candidates, %.3f mean, %.4f stddev",
+                                       cand_len, mean, std);
+
+                    /* Scores are close to each other: let the heuristic decide */
+                    if (cand_len > 0 && std / fabs(mean) < 0.25) {
+                        msg_debug_lang_det("apply frequency heuristic sorting");
+                        frequency_heuristic_applied = TRUE;
+                        cbd.d = d;
+                        cbd.mean = mean;
+                        cbd.std = std;
+                        cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
+
+                        if (part->nwords < default_words / 2) {
+                            cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
+                        }
+                    }
+                }
+            }
+        }
+
+        /* Now, convert hash to array and sort it */
+        if (r != rs_detect_none && kh_size(candidates) > 0) {
+            result = g_ptr_array_sized_new(kh_size(candidates));
+
+            kh_foreach_value(candidates, cand, {
+                if (!isnan(cand->prob)) {
+                    msg_debug_lang_det("pre-sorting probability %s -> %.2f", cand->lang,
+                                       cand->prob);
+                    g_ptr_array_add(result, cand);
+                }
+            });
+
+            if (frequency_heuristic_applied) {
+                g_ptr_array_sort_with_data(result,
+                                           rspamd_language_detector_cmp_heuristic,
+                                           (gpointer) &cbd);
+            }
+            else {
+                g_ptr_array_sort(result, rspamd_language_detector_cmp);
+            }
+
+            int i;
+            PTR_ARRAY_FOREACH(result, i, cand)
+            {
+                msg_debug_lang_det("final probability %s -> %.2f", cand->lang,
+                                   cand->prob);
+            }
+
+            if (part->languages != NULL) {
+                g_ptr_array_unref(part->languages);
+            }
+
+            part->languages = result;
+            part->language = ((struct rspamd_lang_detector_res *) g_ptr_array_index(result, 0))->lang;
+            ret = TRUE;
+        }
+        else if (part->languages == NULL) {
+            rspamd_language_detector_set_language(task, part, "en", NULL);
+        }
+
+        kh_destroy(rspamd_candidates_hash, candidates);
+    }
+
+    /* Update internal stat */
+    if (part->languages != NULL && part->languages->len > 0 && !frequency_heuristic_applied) {
+        cand = g_ptr_array_index(part->languages, 0);
+        if (cand->elt) {
+            cand->elt->occurrences++;
+            d->total_occurrences++;
+
+            msg_debug_lang_det("updated stat for %s: %d occurrences, %z total detected",
+                               cand->elt->name, cand->elt->occurrences,
+                               d->total_occurrences);
+        }
+    }
+
+    end_ticks = rspamd_get_ticks(TRUE);
+    msg_debug_lang_det("detected languages in %.0f ticks",
+                       (end_ticks - start_ticks));
+
+    return ret;
+}
+
+/*
+ * Takes a reference on the detector through REF_RETAIN and returns the same
+ * pointer, so the call can be used inline in an assignment.
+ */
+struct rspamd_lang_detector *
+rspamd_language_detector_ref(struct rspamd_lang_detector *d)
+{
+ REF_RETAIN(d);
+
+ return d;
+}
+/*
+ * Drops a reference on the detector through REF_RELEASE. This function is
+ * also registered as a destructor on the config memory pool when the detector
+ * is created.
+ */
+void rspamd_language_detector_unref(struct rspamd_lang_detector *d)
+{
+ REF_RELEASE(d);
+}
+
+/*
+ * Checks whether `word` (of `wlen` bytes) is present in the detector's hash
+ * of normalised stop words.
+ */
+gboolean
+rspamd_language_detector_is_stop_word(struct rspamd_lang_detector *d,
+                                      const gchar *word, gsize wlen)
+{
+    rspamd_ftok_t needle;
+    khiter_t it;
+
+    needle.len = wlen;
+    needle.begin = word;
+
+    it = kh_get(rspamd_stopwords_hash, d->stop_words_norm, &needle);
+
+    return (it != kh_end(d->stop_words_norm)) ? TRUE : FALSE;
+}
+
+/* Returns the flags of a language element, or 0 when `elt` is NULL */
+gint rspamd_language_detector_elt_flags(const struct rspamd_language_elt *elt)
+{
+    if (elt == NULL) {
+        return 0;
+    }
+
+    return elt->flags;
+} \ No newline at end of file
diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h
new file mode 100644
index 0000000..5423c13
--- /dev/null
+++ b/src/libmime/lang_detection.h
@@ -0,0 +1,110 @@
+/*-
+ * Copyright 2017 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_LANG_DETECTION_H
+#define RSPAMD_LANG_DETECTION_H
+
+#include "config.h"
+#include "libserver/cfg_file.h"
+#include "libstat/stat_api.h"
+#include "libmime/message.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct rspamd_lang_detector;
+struct rspamd_language_elt;
+struct rspamd_task;
+
+/**
+ * Unicode scripts; each value occupies a distinct bit, so several
+ * scripts can be combined in a single bit mask
+ */
+enum rspamd_unicode_scripts {
+ RSPAMD_UNICODE_LATIN = (1 << 0),
+ RSPAMD_UNICODE_GREEK = (1 << 1),
+ RSPAMD_UNICODE_CYRILLIC = (1 << 2),
+ RSPAMD_UNICODE_HEBREW = (1 << 3),
+ RSPAMD_UNICODE_CJK = (1 << 4),
+ RSPAMD_UNICODE_JP = (1 << 5),
+ RSPAMD_UNICODE_ARABIC = (1 << 6),
+ RSPAMD_UNICODE_DEVANAGARI = (1 << 7),
+ RSPAMD_UNICODE_THAI = (1 << 8),
+ RSPAMD_UNICODE_ARMENIAN = (1 << 9),
+ RSPAMD_UNICODE_GEORGIAN = (1 << 10),
+ RSPAMD_UNICODE_GUJARATI = (1 << 11),
+ RSPAMD_UNICODE_TAMIL = (1 << 12),
+ RSPAMD_UNICODE_TELUGU = (1 << 13),
+ RSPAMD_UNICODE_MALAYALAM = (1 << 14),
+ RSPAMD_UNICODE_SINHALA = (1 << 15),
+ RSPAMD_UNICODE_HANGUL = (1 << 16),
+};
+
+/**
+ * Bit flags of a language element, as returned by
+ * rspamd_language_detector_elt_flags(); bits 1 and 2 are not assigned here
+ */
+enum rspamd_language_elt_flags {
+ RS_LANGUAGE_DEFAULT = 0,
+ RS_LANGUAGE_LATIN = (1 << 0),
+ RS_LANGUAGE_TIER1 = (1 << 3),
+ RS_LANGUAGE_TIER0 = (1 << 4),
+ RS_LANGUAGE_DIACRITICS = (1 << 5),
+ RS_LANGUAGE_ASCII = (1 << 6),
+};
+
+/**
+ * A single language candidate produced by the detector
+ */
+struct rspamd_lang_detector_res {
+ gdouble prob; /* probability assigned to this candidate */
+ const gchar *lang; /* language name */
+ struct rspamd_language_elt *elt; /* matching language element, may be NULL */
+};
+
+/**
+ * Create new language detector object using configuration object
+ * @param cfg
+ * @return
+ */
+struct rspamd_lang_detector *rspamd_language_detector_init(struct rspamd_config *cfg);
+
+struct rspamd_lang_detector *rspamd_language_detector_ref(struct rspamd_lang_detector *d);
+
+void rspamd_language_detector_unref(struct rspamd_lang_detector *d);
+
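+/**
+ * Try to detect the language of a text part
+ * @param task task being processed
+ * @param d language detector
+ * @param part text part to analyse; on success the candidates (array of
+ * struct rspamd_lang_detector_res) are stored in `part->languages` and the
+ * best one in `part->language`
+ * @return TRUE if the language has been detected
+ */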
+gboolean rspamd_language_detector_detect(struct rspamd_task *task,
+ struct rspamd_lang_detector *d,
+ struct rspamd_mime_text_part *part);
+
+/**
+ * Returns TRUE if the specified word is known to be a stop word
+ * @param d
+ * @param word
+ * @param wlen
+ * @return
+ */
+gboolean rspamd_language_detector_is_stop_word(struct rspamd_lang_detector *d,
+ const gchar *word, gsize wlen);
+
+/**
+ * Return language flags for a specific language elt
+ * @param elt
+ * @return
+ */
+gint rspamd_language_detector_elt_flags(const struct rspamd_language_elt *elt);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/libmime/lang_detection_fasttext.cxx b/src/libmime/lang_detection_fasttext.cxx
new file mode 100644
index 0000000..c973ed7
--- /dev/null
+++ b/src/libmime/lang_detection_fasttext.cxx
@@ -0,0 +1,269 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "lang_detection_fasttext.h"
+
+#ifdef WITH_FASTTEXT
+#include "fasttext/fasttext.h"
+#include "libserver/cfg_file.h"
+#include "libserver/logger.h"
+#include "fmt/core.h"
+#include "stat_api.h"
+#include <exception>
+#include <string_view>
+#include <vector>
+#endif
+
+#ifdef WITH_FASTTEXT
+
+EXTERN_LOG_MODULE_DEF(langdet);
+#define msg_debug_lang_det(...) rspamd_conditional_debug_fast(nullptr, nullptr, \
+ rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \
+ __FUNCTION__, \
+ __VA_ARGS__)
+
+namespace rspamd::langdet {
+/**
+ * Language detector backed by a fasttext model.
+ *
+ * The model file name is read from the `lang_detection.fasttext_model`
+ * configuration key. If the key is missing, is not a string, or the model
+ * cannot be loaded, the object stays in the "not loaded" state and all
+ * detection methods become no-ops.
+ */
+class fasttext_langdet {
+private:
+	fasttext::FastText ft;
+	std::string model_fname;
+	bool loaded = false;
+
+public:
+	explicit fasttext_langdet(struct rspamd_config *cfg)
+	{
+		const auto *ucl_obj = cfg->cfg_ucl_obj;
+		const auto *opts_section = ucl_object_find_key(ucl_obj, "lang_detection");
+
+		if (opts_section) {
+			const auto *model = ucl_object_find_key(opts_section, "fasttext_model");
+
+			if (model) {
+				/* NULL is returned for non-string objects: never build std::string from it */
+				const char *model_path = ucl_object_tostring(model);
+
+				if (model_path == nullptr) {
+					msg_err_config("%s", "cannot load fasttext model: fasttext_model must be a string");
+				}
+				else {
+					try {
+						ft.loadModel(model_path);
+						model_fname = std::string{model_path};
+						loaded = true;
+					} catch (std::exception &e) {
+						auto err_message = fmt::format("cannot load fasttext model: {}", e.what());
+						msg_err_config("%s", err_message.c_str());
+						loaded = false;
+					}
+				}
+			}
+		}
+	}
+
+	/* Disallow multiple initialisation */
+	fasttext_langdet() = delete;
+	fasttext_langdet(const fasttext_langdet &) = delete;
+	fasttext_langdet(fasttext_langdet &&) = delete;
+
+	~fasttext_langdet() = default;
+
+	/** Returns true if a model has been successfully loaded */
+	auto is_enabled() const -> bool
+	{
+		return loaded;
+	}
+
+	/**
+	 * Appends fasttext token ids for a single word to `word_ngramms`
+	 * @param in start of the word
+	 * @param len length of the word in bytes
+	 * @param word_ngramms output vector, ids are appended to it
+	 */
+	auto word2vec(const char *in, std::size_t len, std::vector<std::int32_t> &word_ngramms) const
+	{
+		if (!loaded) {
+			return;
+		}
+
+		std::string tok{in, len};
+		const auto &dic = ft.getDictionary();
+		auto h = dic->hash(tok);
+		auto wid = dic->getId(tok, h);
+		auto type = wid < 0 ? dic->getType(tok) : dic->getType(wid);
+
+		if (type == fasttext::entry_type::word) {
+			if (wid < 0) {
+				/* Word is not in the dictionary: use its subwords */
+				auto pipelined_word = fmt::format("{}{}{}", fasttext::Dictionary::BOW, tok, fasttext::Dictionary::EOW);
+				dic->computeSubwords(pipelined_word, word_ngramms);
+			}
+			else {
+				if (ft.getArgs().maxn <= 0) {
+					word_ngramms.push_back(wid);
+				}
+				else {
+					const auto ngrams = dic->getSubwords(wid);
+					word_ngramms.insert(word_ngramms.end(), ngrams.cbegin(), ngrams.cend());
+				}
+			}
+		}
+	}
+
+	/**
+	 * Runs the prediction over the token ids
+	 * @param words token ids produced by word2vec
+	 * @param k maximum number of predictions
+	 * @return heap allocated vector of (probability, label) pairs owned by the
+	 * caller, or nullptr if the model is not loaded
+	 */
+	auto detect_language(std::vector<std::int32_t> &words, int k)
+		-> std::vector<std::pair<fasttext::real, std::string>> *
+	{
+		if (!loaded) {
+			return nullptr;
+		}
+
+		/*
+		 * Collect results in a local vector, so nothing is leaked if
+		 * prediction throws; the heap object is created only at the end
+		 */
+		std::vector<std::pair<fasttext::real, std::string>> predictions;
+		predictions.reserve(k);
+		fasttext::Predictions line_predictions;
+		line_predictions.reserve(k);
+		ft.predict(k, words, line_predictions, 0.0f);
+		const auto *dict = ft.getDictionary().get();
+
+		for (const auto &pred: line_predictions) {
+			/* pred.first is exponentiated to obtain the reported probability */
+			predictions.push_back(std::make_pair(std::exp(pred.first), dict->getLabel(pred.second)));
+		}
+
+		auto *result = new std::vector<std::pair<fasttext::real, std::string>>;
+		result->swap(predictions);
+
+		return result;
+	}
+
+	/** Returns a human readable description of the loaded model */
+	auto model_info(void) const -> const std::string
+	{
+		if (!loaded) {
+			static const auto not_loaded = std::string{"fasttext model is not loaded"};
+			return not_loaded;
+		}
+		else {
+			return fmt::format("fasttext model {}: {} languages, {} tokens", model_fname,
+							   ft.getDictionary()->nlabels(), ft.getDictionary()->ntokens());
+		}
+	}
+};
+}// namespace rspamd::langdet
+#endif
+
+/* C API part */
+G_BEGIN_DECLS
+
+#define FASTTEXT_MODEL_TO_C_API(p) reinterpret_cast<rspamd::langdet::fasttext_langdet *>(p)
+#define FASTTEXT_RESULT_TO_C_API(res) reinterpret_cast<std::vector<std::pair<fasttext::real, std::string>> *>(res)
+
+void *rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg)
+{
+#ifndef WITH_FASTTEXT
+ return nullptr;
+#else
+ return (void *) new rspamd::langdet::fasttext_langdet(cfg);
+#endif
+}
+
+/* Returns a g_strdup'ed description of the fasttext model */
+char *rspamd_lang_detection_fasttext_show_info(void *ud)
+{
+#ifdef WITH_FASTTEXT
+	const auto description = FASTTEXT_MODEL_TO_C_API(ud)->model_info();
+
+	return g_strdup(description.c_str());
+#else
+	return g_strdup("fasttext is not compiled in");
+#endif
+}
+
+/* Returns true only if fasttext is compiled in, `ud` is not NULL and a model is loaded */
+bool rspamd_lang_detection_fasttext_is_enabled(void *ud)
+{
+#ifdef WITH_FASTTEXT
+	const auto *model = FASTTEXT_MODEL_TO_C_API(ud);
+
+	return model != nullptr && model->is_enabled();
+#else
+	return false;
+#endif
+}
+
+/*
+ * Converts words to fasttext token ids and runs the prediction.
+ * Returns an opaque result to be released with
+ * rspamd_fasttext_predict_result_destroy, or NULL if fasttext is not
+ * compiled in, the detector or the words array is NULL, the model is not
+ * loaded, or the prediction has failed.
+ */
+rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
+																	   struct rspamd_task *task,
+																	   GArray *utf_words,
+																	   int k)
+{
+#ifndef WITH_FASTTEXT
+	return nullptr;
+#else
+	/* Avoid too long inputs */
+	static const guint max_fasttext_input_len = 1024 * 1024;
+	auto *real_model = FASTTEXT_MODEL_TO_C_API(ud);
+
+	/* Same NULL tolerance as rspamd_lang_detection_fasttext_is_enabled */
+	if (real_model == nullptr || utf_words == nullptr) {
+		return nullptr;
+	}
+
+	const guint nwords = std::min(utf_words->len, max_fasttext_input_len);
+
+	/* C++ exceptions must not propagate to the C callers */
+	try {
+		std::vector<std::int32_t> words_vec;
+		words_vec.reserve(nwords);
+
+		for (guint i = 0; i < nwords; i++) {
+			const auto *w = &g_array_index(utf_words, rspamd_stat_token_t, i);
+			if (w->original.len > 0) {
+				real_model->word2vec(w->original.begin, w->original.len, words_vec);
+			}
+		}
+
+		msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), utf_words->len);
+
+		auto *res = real_model->detect_language(words_vec, k);
+
+		return (rspamd_fasttext_predict_result_t) res;
+	} catch (const std::exception &e) {
+		msg_debug_lang_det("fasttext: cannot detect language: %s", e.what());
+
+		return nullptr;
+	}
+#endif
+}
+
+/* Deletes the detector object; does nothing when fasttext is not compiled in */
+void rspamd_lang_detection_fasttext_destroy(void *ud)
+{
+#ifdef WITH_FASTTEXT
+ delete FASTTEXT_MODEL_TO_C_API(ud);
+#endif
+}
+
+
+/* Returns the number of predictions in the result, 0 for a NULL result */
+guint rspamd_lang_detection_fasttext_get_nlangs(rspamd_fasttext_predict_result_t res)
+{
+#ifdef WITH_FASTTEXT
+	const auto *predictions = FASTTEXT_RESULT_TO_C_API(res);
+
+	return predictions != nullptr ? predictions->size() : 0;
+#else
+	return 0;
+#endif
+}
+
+/*
+ * Returns the language name of the prediction number `idx`, or NULL if the
+ * result is NULL or `idx` is out of range. The returned string is owned by
+ * the result object.
+ */
+const char *
+rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res, unsigned int idx)
+{
+#ifdef WITH_FASTTEXT
+	auto *real_res = FASTTEXT_RESULT_TO_C_API(res);
+
+	if (real_res && real_res->size() > idx) {
+		/* Fasttext returns result in form __label__<lang>, so we need to remove __label__ prefix */
+		static constexpr std::string_view label_prefix{"__label__"};
+		auto lang = std::string_view{real_res->at(idx).second};
+
+		/* Compare against the prefix length, not sizeof (which also counts the NUL) */
+		if (lang.size() > label_prefix.size() && lang.substr(0, label_prefix.size()) == label_prefix) {
+			lang.remove_prefix(label_prefix.size());
+		}
+
+		/* Only the prefix is removed, so the view still ends at the string's NUL */
+		return lang.data();
+	}
+#endif
+	return nullptr;
+}
+
+/* Returns the probability of the prediction number `idx`, 0 if there is no such prediction */
+float rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res, unsigned int idx)
+{
+#ifdef WITH_FASTTEXT
+	const auto *predictions = FASTTEXT_RESULT_TO_C_API(res);
+
+	if (predictions == nullptr || idx >= predictions->size()) {
+		return 0.0f;
+	}
+
+	return predictions->at(idx).first;
+#else
+	return 0.0f;
+#endif
+}
+
+/* Deletes a prediction result; NULL is accepted */
+void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res)
+{
+#ifdef WITH_FASTTEXT
+	delete FASTTEXT_RESULT_TO_C_API(res);
+#endif
+}
+
+G_END_DECLS \ No newline at end of file
diff --git a/src/libmime/lang_detection_fasttext.h b/src/libmime/lang_detection_fasttext.h
new file mode 100644
index 0000000..c8710d3
--- /dev/null
+++ b/src/libmime/lang_detection_fasttext.h
@@ -0,0 +1,91 @@
+/*-
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RSPAMD_LANG_DETECTION_FASTTEXT_H
+#define RSPAMD_LANG_DETECTION_FASTTEXT_H
+
+#include "config.h"
+
+G_BEGIN_DECLS
+struct rspamd_config;
+struct rspamd_task; /* for logging */
+/**
+ * Initialize fasttext language detector
+ * @param cfg
+ * @return opaque pointer
+ */
+void *rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg);
+
+/**
+ * Check if fasttext language detector is enabled
+ * @param ud
+ * @return
+ */
+bool rspamd_lang_detection_fasttext_is_enabled(void *ud);
+
+/**
+ * Show info about fasttext language detector
+ * @param ud
+ * @return
+ */
+char *rspamd_lang_detection_fasttext_show_info(void *ud);
+
+
+typedef void *rspamd_fasttext_predict_result_t;
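+/**
+ * Detect language using fasttext
+ * @param ud opaque pointer returned by rspamd_lang_detection_fasttext_init
+ * @param task task being processed (used for logging)
+ * @param utf_words array of rspamd_stat_token_t words to classify
+ * @param k number of results to return
+ * @return opaque prediction result (may be NULL) that must be released with
+ * rspamd_fasttext_predict_result_destroy
+ */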
+rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
+ struct rspamd_task *task, GArray *utf_words, int k);
+
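+/**
+ * Get number of languages detected
+ * @param ud prediction result returned by rspamd_lang_detection_fasttext_detect
+ * @return number of predictions in the result, 0 if the result is NULL
+ */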
+guint rspamd_lang_detection_fasttext_get_nlangs(rspamd_fasttext_predict_result_t ud);
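+/**
+ * Get language from fasttext result
+ * @param res prediction result
+ * @param idx index of the prediction
+ * @return language name owned by the result, or NULL if there is no such prediction
+ */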
+const char *rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res, unsigned int idx);
+
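+/**
+ * Get probability from fasttext result
+ * @param res prediction result
+ * @param idx index of the prediction
+ * @return probability of the prediction, or 0 if there is no such prediction
+ */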
+float rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res, unsigned int idx);
+
+/**
+ * Destroy fasttext result
+ * @param res
+ */
+void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res);
+
+/**
+ * Destroy fasttext language detector
+ */
+void rspamd_lang_detection_fasttext_destroy(void *ud);
+
+
+G_END_DECLS
+#endif /* RSPAMD_LANG_DETECTION_FASTTEXT_H */
diff --git a/src/libmime/message.c b/src/libmime/message.c
new file mode 100644
index 0000000..3acc935
--- /dev/null
+++ b/src/libmime/message.c
@@ -0,0 +1,1732 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "config.h"
+#include "util.h"
+#include "rspamd.h"
+#include "message.h"
+#include "libserver/html/html.h"
+#include "images.h"
+#include "archives.h"
+#include "tokenizers/tokenizers.h"
+#include "smtp_parsers.h"
+#include "mime_parser.h"
+#include "mime_encoding.h"
+#include "lang_detection.h"
+#include "libutil/multipattern.h"
+#include "libserver/mempool_vars_internal.h"
+
+#ifdef WITH_SNOWBALL
+#include "libstemmer.h"
+#endif
+
+#include <math.h>
+#include <unicode/uchar.h>
+#include "sodium.h"
+#include "libserver/cfg_file_private.h"
+#include "lua/lua_common.h"
+#include "contrib/uthash/utlist.h"
+#include "contrib/t1ha/t1ha.h"
+#include "received.h"
+
+#define GTUBE_SYMBOL "GTUBE"
+
+#define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF)
+#define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)
+
+static const gchar gtube_pattern_reject[] = "XJS*C4JDBQADN1.NSBN3*2IDNEN*"
+ "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
+static const gchar gtube_pattern_add_header[] = "YJS*C4JDBQADN1.NSBN3*2IDNEN*"
+ "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
+static const gchar gtube_pattern_rewrite_subject[] = "ZJS*C4JDBQADN1.NSBN3*2IDNEN*"
+ "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
+static const gchar gtube_pattern_no_action[] = "AJS*C4JDBQADN1.NSBN3*2IDNEN*"
+ "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
+struct rspamd_multipattern *gtube_matcher = NULL;
+static const guint64 words_hash_seed = 0xdeadbabe;
+
+/* Destructor callback: frees a GByteArray along with its data segment */
+static void
+free_byte_array_callback(void *pointer)
+{
+	g_byte_array_free((GByteArray *) pointer, TRUE);
+}
+
+/**
+ * Stem the words of a text part, store a hash for each non-empty stemmed
+ * word in `part->normalized_hashes` and update the words statistics.
+ *
+ * The total length of the stemmed words and the number of short words
+ * (3 bytes or less) are accumulated in the task pool variables
+ * RSPAMD_MEMPOOL_AVG_WORDS_LEN and RSPAMD_MEMPOOL_SHORT_WORDS_CNT.
+ * @param task task being processed
+ * @param part text part; nothing is done if it has no `utf_words`
+ */
+static void
+rspamd_mime_part_extract_words(struct rspamd_task *task,
+							   struct rspamd_mime_text_part *part)
+{
+	rspamd_stat_token_t *w;
+	guint i, total_len = 0, short_len = 0;
+
+	if (part->utf_words) {
+		rspamd_stem_words(part->utf_words, task->task_pool, part->language,
+						  task->lang_det);
+
+		for (i = 0; i < part->utf_words->len; i++) {
+			guint64 h;
+
+			w = &g_array_index(part->utf_words, rspamd_stat_token_t, i);
+
+			if (w->stemmed.len > 0) {
+				/*
+				 * We use static hash seed if we would want to use that in shingles
+				 * computation in future
+				 */
+				h = rspamd_cryptobox_fast_hash_specific(
+					RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
+					w->stemmed.begin, w->stemmed.len, words_hash_seed);
+				g_array_append_val(part->normalized_hashes, h);
+				total_len += w->stemmed.len;
+
+				if (w->stemmed.len <= 3) {
+					short_len++;
+				}
+
+				if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT &&
+					!(w->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) {
+					part->nwords++;
+				}
+			}
+
+			if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE |
+							RSPAMD_STAT_TOKEN_FLAG_NORMALISED |
+							RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES)) {
+				task->flags |= RSPAMD_TASK_FLAG_BAD_UNICODE;
+			}
+		}
+
+		if (part->utf_words->len) {
+			gdouble *avg_len_p, *short_len_p;
+
+			avg_len_p = rspamd_mempool_get_variable(task->task_pool,
+													RSPAMD_MEMPOOL_AVG_WORDS_LEN);
+
+			if (avg_len_p == NULL) {
+				avg_len_p = rspamd_mempool_alloc(task->task_pool,
+												 sizeof(double));
+				*avg_len_p = total_len;
+				rspamd_mempool_set_variable(task->task_pool,
+											RSPAMD_MEMPOOL_AVG_WORDS_LEN, avg_len_p, NULL);
+			}
+			else {
+				*avg_len_p += total_len;
+			}
+
+			short_len_p = rspamd_mempool_get_variable(task->task_pool,
+													  RSPAMD_MEMPOOL_SHORT_WORDS_CNT);
+
+			if (short_len_p == NULL) {
+				short_len_p = rspamd_mempool_alloc(task->task_pool,
+												   sizeof(double));
+				*short_len_p = short_len;
+				/*
+				 * Register the short words counter itself: storing avg_len_p
+				 * here would make both variables alias the same value
+				 */
+				rspamd_mempool_set_variable(task->task_pool,
+											RSPAMD_MEMPOOL_SHORT_WORDS_CNT, short_len_p, NULL);
+			}
+			else {
+				*short_len_p += short_len;
+			}
+		}
+	}
+}
+
+/**
+ * Tokenize the stripped content of a text part into `part->utf_words`.
+ *
+ * UTF parts are tokenized in the UTF mode and all other parts in the raw mode.
+ * If words are produced, `part->normalized_hashes` is allocated with room for
+ * one hash per word and the words are normalized.
+ * @param task task being processed
+ * @param part text part with `utf_stripped_content` already filled
+ */
+static void
+rspamd_mime_part_create_words(struct rspamd_task *task,
+ struct rspamd_mime_text_part *part)
+{
+ enum rspamd_tokenize_type tok_type;
+
+ if (IS_TEXT_PART_UTF(part)) {
+
+#if U_ICU_VERSION_MAJOR_NUM < 50
+ /* Hack to prevent hang with Thai in old libicu */
+ const gchar *p = part->utf_stripped_content->data, *end;
+ guint i = 0;
+ end = p + part->utf_stripped_content->len;
+ gint32 uc, sc;
+
+ tok_type = RSPAMD_TOKENIZE_UTF;
+
+ /* Scan code points: fall back to raw mode on invalid input or Thai letters */
+ while (p + i < end) {
+ U8_NEXT(p, i, part->utf_stripped_content->len, uc);
+
+ if (((gint32) uc) < 0) {
+ tok_type = RSPAMD_TOKENIZE_RAW;
+ break;
+ }
+
+ if (u_isalpha(uc)) {
+ sc = ublock_getCode(uc);
+
+ if (sc == UBLOCK_THAI) {
+ msg_info_task("enable workaround for Thai characters for old libicu");
+ tok_type = RSPAMD_TOKENIZE_RAW;
+ break;
+ }
+ }
+ }
+#else
+ tok_type = RSPAMD_TOKENIZE_UTF;
+#endif
+ }
+ else {
+ tok_type = RSPAMD_TOKENIZE_RAW;
+ }
+
+ part->utf_words = rspamd_tokenize_text(
+ part->utf_stripped_content->data,
+ part->utf_stripped_content->len,
+ &part->utf_stripped_text,
+ tok_type, task->cfg,
+ part->exceptions,
+ NULL,
+ NULL,
+ task->task_pool);
+
+
+ if (part->utf_words) {
+ /* Reserve one hash slot per word */
+ part->normalized_hashes = g_array_sized_new(FALSE, FALSE,
+ sizeof(guint64), part->utf_words->len);
+ rspamd_normalize_words(part->utf_words, task->task_pool);
+ }
+}
+
+/**
+ * Detect the language of a text part and store it in `part->language`.
+ *
+ * Nothing is done for empty parts, for parts without words or when the task
+ * has no language detector. If detection fails, "en" is used.
+ * @param task task being processed
+ * @param part text part to analyse
+ */
+static void
+rspamd_mime_part_detect_language(struct rspamd_task *task,
+								 struct rspamd_mime_text_part *part)
+{
+	if (IS_TEXT_PART_EMPTY(part) || !part->utf_words ||
+		part->utf_words->len == 0 || !task->lang_det) {
+		return;
+	}
+
+	if (!rspamd_language_detector_detect(task, task->lang_det, part)) {
+		part->language = "en"; /* Safe fallback */
+		return;
+	}
+
+	/* The first candidate is taken as the part language */
+	struct rspamd_lang_detector_res *best = g_ptr_array_index(part->languages, 0);
+
+	part->language = best->lang;
+	msg_info_task("detected part language: %s", part->language);
+}
+
+static void
+rspamd_strip_newlines_parse(struct rspamd_task *task,
+ const gchar *begin, const gchar *pe,
+ struct rspamd_mime_text_part *part)
+{
+ const gchar *p = begin, *c = begin;
+ gboolean crlf_added = FALSE, is_utf = IS_TEXT_PART_UTF(part);
+ gboolean url_open_bracket = FALSE;
+ UChar32 uc;
+
+ enum {
+ normal_char,
+ seen_cr,
+ seen_lf,
+ } state = normal_char;
+
+ while (p < pe) {
+ if (U8_IS_LEAD(*p) && is_utf) {
+ gint32 off = p - begin;
+ U8_NEXT(begin, off, pe - begin, uc);
+
+ if (uc != -1) {
+ while (p < pe && off < (pe - begin)) {
+ if (IS_ZERO_WIDTH_SPACE(uc)) {
+ /* Invisible space ! */
+ task->flags |= RSPAMD_TASK_FLAG_BAD_UNICODE;
+ part->spaces++;
+
+ if (p > c) {
+ g_byte_array_append(part->utf_stripped_content,
+ (const guint8 *) c, p - c);
+ c = begin + off;
+ p = c;
+ }
+
+ U8_NEXT(begin, off, pe - begin, uc);
+
+ if (!IS_ZERO_WIDTH_SPACE(uc)) {
+ break;
+ }
+
+ part->double_spaces++;
+ p = begin + off;
+ c = p;
+ }
+ else {
+ break;
+ }
+ }
+ }
+ }
+
+ if (G_UNLIKELY(p >= pe)) {
+ /*
+ * This is reached when there is a utf8 part and we
+ * have zero width spaces at the end of the text
+ * So we just check overflow and refuse to access *p if it is
+ * after our real content.
+ */
+ break;
+ }
+ else if (*p == '\r') {
+ switch (state) {
+ case normal_char:
+ state = seen_cr;
+ if (p > c) {
+ g_byte_array_append(part->utf_stripped_content,
+ (const guint8 *) c, p - c);
+ }
+
+ crlf_added = FALSE;
+ c = p + 1;
+ break;
+ case seen_cr:
+ /* Double \r\r */
+ if (!crlf_added) {
+ g_byte_array_append(part->utf_stripped_content,
+ (const guint8 *) " ", 1);
+ crlf_added = TRUE;
+ g_ptr_array_add(part->newlines,
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
+ }
+
+ part->nlines++;
+ part->empty_lines++;
+ c = p + 1;
+ break;
+ case seen_lf:
+ /* Likely \r\n\r...*/
+ state = seen_cr;
+ c = p + 1;
+ break;
+ }
+
+ url_open_bracket = FALSE;
+
+ p++;
+ }
+ else if (*p == '\n') {
+ switch (state) {
+ case normal_char:
+ state = seen_lf;
+
+ if (p > c) {
+ g_byte_array_append(part->utf_stripped_content,
+ (const guint8 *) c, p - c);
+ }
+
+ c = p + 1;
+
+ if (IS_TEXT_PART_HTML(part) || !url_open_bracket) {
+ g_byte_array_append(part->utf_stripped_content,
+ (const guint8 *) " ", 1);
+ g_ptr_array_add(part->newlines,
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
+ crlf_added = TRUE;
+ }
+ else {
+ crlf_added = FALSE;
+ }
+
+ break;
+ case seen_cr:
+ /* \r\n */
+ if (!crlf_added) {
+ if (IS_TEXT_PART_HTML(part) || !url_open_bracket) {
+ g_byte_array_append(part->utf_stripped_content,
+ (const guint8 *) " ", 1);
+ crlf_added = TRUE;
+ }
+
+ g_ptr_array_add(part->newlines,
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
+ }
+
+ c = p + 1;
+ state = seen_lf;
+
+ break;
+ case seen_lf:
+ /* Double \n\n */
+ if (!crlf_added) {
+ g_byte_array_append(part->utf_stripped_content,
+ (const guint8 *) " ", 1);
+ crlf_added = TRUE;
+ g_ptr_array_add(part->newlines,
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
+ }
+
+ part->nlines++;
+ part->empty_lines++;
+
+ c = p + 1;
+ break;
+ }
+ url_open_bracket = FALSE;
+
+ p++;
+ }
+ else {
+ if ((*p) == '<') {
+ url_open_bracket = TRUE;
+ }
+ else if ((*p) == '>') {
+ url_open_bracket = FALSE;
+ }
+
+ switch (state) {
+ case normal_char:
+ if (*p == ' ') {
+ part->spaces++;
+
+ if (p > begin && *(p - 1) == ' ') {
+ part->double_spaces++;
+ }
+ }
+ else {
+ part->non_spaces++;
+
+ if ((*p) & 0x80) {
+ part->non_ascii_chars++;
+ }
+ else {
+ if (g_ascii_isupper(*p)) {
+ part->capital_letters++;
+ }
+ else if (g_ascii_isdigit(*p)) {
+ part->numeric_characters++;
+ }
+
+ part->ascii_chars++;
+ }
+ }
+ break;
+ case seen_cr:
+ case seen_lf:
+ part->nlines++;
+
+ if (!crlf_added) {
+ g_ptr_array_add(part->newlines,
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
+ }
+
+ /* Skip initial spaces */
+ if (*p == ' ') {
+ if (!crlf_added) {
+ g_byte_array_append(part->utf_stripped_content,
+ (const guint8 *) " ", 1);
+ }
+
+ while (p < pe && *p == ' ') {
+ p++;
+ c++;
+ part->spaces++;
+ }
+
+ if (p < pe && (*p == '\r' || *p == '\n')) {
+ part->empty_lines++;
+ }
+ }
+
+ state = normal_char;
+ continue;
+ }
+
+ p++;
+ }
+ }
+
+ /* Leftover */
+ if (p > c) {
+ if (p > pe) {
+ p = pe;
+ }
+
+ switch (state) {
+ case normal_char:
+ g_byte_array_append(part->utf_stripped_content,
+ (const guint8 *) c, p - c);
+
+ while (c < p) {
+ if (*c == ' ') {
+ part->spaces++;
+
+ if (c > begin && *(c - 1) == ' ') {
+ part->double_spaces++;
+ }
+ }
+ else {
+ part->non_spaces++;
+
+ if ((*c) & 0x80) {
+ part->non_ascii_chars++;
+ }
+ else {
+ part->ascii_chars++;
+ }
+ }
+
+ c++;
+ }
+ break;
+ default:
+
+ if (!crlf_added) {
+ g_byte_array_append(part->utf_stripped_content,
+ (const guint8 *) " ", 1);
+ g_ptr_array_add(part->newlines,
+ (((gpointer) (goffset) (part->utf_stripped_content->len))));
+ }
+
+ part->nlines++;
+ break;
+ }
+ }
+}
+
+/* Destructor callback: closes the UText object passed as `p` */
+static void
+rspamd_u_text_dtor(void *p)
+{
+ utext_close((UText *) p);
+}
+
+/**
+ * Build the stripped representation of a text part.
+ *
+ * Fills `part->utf_stripped_content` and `part->newlines`, adds an exception
+ * of type RSPAMD_EXCEPTION_NEWLINE for each recorded newline and, for UTF
+ * parts, opens `part->utf_stripped_text` over the stripped content.
+ * The allocated objects are released by destructors registered in the task pool.
+ * @param task task being processed
+ * @param part text part to normalize
+ */
+static void
+rspamd_normalize_text_part(struct rspamd_task *task,
+ struct rspamd_mime_text_part *part)
+{
+ const gchar *p, *end;
+ guint i;
+ goffset off;
+ struct rspamd_process_exception *ex;
+ UErrorCode uc_err = U_ZERO_ERROR;
+
+ part->newlines = g_ptr_array_sized_new(128);
+
+ if (IS_TEXT_PART_EMPTY(part)) {
+ part->utf_stripped_content = g_byte_array_new();
+ }
+ else {
+ part->utf_stripped_content = g_byte_array_sized_new(part->utf_content.len);
+
+ p = (const gchar *) part->utf_content.begin;
+ end = p + part->utf_content.len;
+
+ rspamd_strip_newlines_parse(task, p, end, part);
+
+ /*
+ * At this point `newlines` holds offsets in the stripped content:
+ * record an exception for each offset and replace the offset with
+ * a pointer into the stripped data
+ */
+ for (i = 0; i < part->newlines->len; i++) {
+ ex = rspamd_mempool_alloc(task->task_pool, sizeof(*ex));
+ off = (goffset) g_ptr_array_index(part->newlines, i);
+ g_ptr_array_index(part->newlines, i) = (gpointer) (goffset) (part->utf_stripped_content->data + off);
+ ex->pos = off;
+ ex->len = 0;
+ ex->type = RSPAMD_EXCEPTION_NEWLINE;
+ part->exceptions = g_list_prepend(part->exceptions, ex);
+ }
+ }
+
+ if (IS_TEXT_PART_UTF(part)) {
+ utext_openUTF8(&part->utf_stripped_text,
+ part->utf_stripped_content->data,
+ part->utf_stripped_content->len,
+ &uc_err);
+
+ if (!U_SUCCESS(uc_err)) {
+ msg_warn_task("cannot open text from utf content");
+ /* Probably, should be an assertion */
+ }
+ else {
+ /* Close the UText only if it has been opened successfully */
+ rspamd_mempool_add_destructor(task->task_pool,
+ rspamd_u_text_dtor,
+ &part->utf_stripped_text);
+ }
+ }
+
+ rspamd_mempool_add_destructor(task->task_pool,
+ (rspamd_mempool_destruct_t) free_byte_array_callback,
+ part->utf_stripped_content);
+ rspamd_mempool_notify_alloc(task->task_pool,
+ part->utf_stripped_content->len);
+ rspamd_mempool_add_destructor(task->task_pool,
+ (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
+ part->newlines);
+}
+
+#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
+
+/**
+ * Compute the edit distance between two arrays of word hashes.
+ *
+ * Insertion and deletion cost 1 and replacement costs 2. If both arrays have
+ * more than 8192 elements in total, the absolute difference of their lengths
+ * is returned instead of the exact distance.
+ * @param task task being processed (used for logging)
+ * @param w1 first array of guint64 hashes
+ * @param w2 second array of guint64 hashes
+ * @return distance between the arrays
+ */
+static guint
+rspamd_words_levenshtein_distance(struct rspamd_task *task,
+ GArray *w1, GArray *w2)
+{
+ guint s1len, s2len, x, y, lastdiag, olddiag;
+ guint *column, ret;
+ guint64 h1, h2;
+ gint eq;
+ static const guint max_words = 8192;
+
+ s1len = w1->len;
+ s2len = w2->len;
+
+ if (s1len + s2len > max_words) {
+ msg_info_task("cannot direct compare multipart/alternative parts with more than %ud words in total: "
+ "(%ud words in one part and %ud in another)",
+ max_words, s1len, s2len);
+
+ /* Use approximate comparison of number of words */
+ if (s1len > s2len) {
+ return s1len - s2len;
+ }
+ else {
+ return s2len - s1len;
+ }
+ }
+
+ /* Single column of the distance matrix, reused for each element of w2 */
+ column = g_malloc0((s1len + 1) * sizeof(guint));
+
+ for (y = 1; y <= s1len; y++) {
+ column[y] = y;
+ }
+
+ for (x = 1; x <= s2len; x++) {
+ column[0] = x;
+
+ /* lastdiag keeps the value of the previous column at row y - 1 */
+ for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
+ olddiag = column[y];
+ h1 = g_array_index(w1, guint64, y - 1);
+ h2 = g_array_index(w2, guint64, x - 1);
+ eq = (h1 == h2) ? 1 : 0;
+ /*
+ * Cost of replacement is twice higher than cost of add/delete
+ * to calculate percentage properly
+ */
+ column[y] = MIN3(column[y] + 1, column[y - 1] + 1,
+ lastdiag + (eq * 2));
+ lastdiag = olddiag;
+ }
+ }
+
+ ret = column[s1len];
+ g_free(column);
+
+ return ret;
+}
+
+/**
+ * Multipattern callback for gtube patterns.
+ *
+ * The first pattern (number 0) always matches. Other patterns match only if
+ * the gtube policy is RSPAMD_GTUBE_ALL, otherwise 0 is returned for them.
+ * @param context the task being processed
+ * @return pattern number plus one, or 0 if the match is ignored
+ */
+static gint
+rspamd_multipattern_gtube_cb(struct rspamd_multipattern *mp,
+							 guint strnum,
+							 gint match_start,
+							 gint match_pos,
+							 const gchar *text,
+							 gsize len,
+							 void *context)
+{
+	struct rspamd_task *task = (struct rspamd_task *) context;
+
+	if (strnum == 0 ||
+		task->cfg->gtube_patterns_policy == RSPAMD_GTUBE_ALL) {
+		/* To distinguish from zero */
+		return strnum + 1;
+	}
+
+	return 0;
+}
+
+/**
+ * Search a text part for one of the gtube test patterns.
+ *
+ * The global matcher is created on the first call unless the policy is
+ * RSPAMD_GTUBE_DISABLED. Only parts whose length is between
+ * sizeof(gtube_pattern_reject) and 8K are checked. When a pattern is found,
+ * the task gets the RSPAMD_TASK_FLAG_SKIP and RSPAMD_TASK_FLAG_GTUBE flags.
+ * @param task task being processed
+ * @param part text part to check, must not be NULL
+ * @return action for the matched pattern, METRIC_ACTION_NOACTION if nothing is found
+ */
+static enum rspamd_action_type
+rspamd_check_gtube(struct rspamd_task *task, struct rspamd_mime_text_part *part)
+{
+ static const gsize max_check_size = 8 * 1024;
+ gint ret;
+ enum rspamd_action_type act = METRIC_ACTION_NOACTION;
+ /* Without a config the policy defaults to RSPAMD_GTUBE_REJECT */
+ enum rspamd_gtube_patterns_policy policy = task->cfg ? task->cfg->gtube_patterns_policy : RSPAMD_GTUBE_REJECT;
+ g_assert(part != NULL);
+
+ if (gtube_matcher == NULL && policy != RSPAMD_GTUBE_DISABLED) {
+ gtube_matcher = rspamd_multipattern_create(RSPAMD_MULTIPATTERN_DEFAULT);
+
+ /* The order of patterns defines the return codes handled in the switch below */
+ rspamd_multipattern_add_pattern(gtube_matcher,
+ gtube_pattern_reject,
+ RSPAMD_MULTIPATTERN_DEFAULT);
+ rspamd_multipattern_add_pattern(gtube_matcher,
+ gtube_pattern_add_header,
+ RSPAMD_MULTIPATTERN_DEFAULT);
+ rspamd_multipattern_add_pattern(gtube_matcher,
+ gtube_pattern_rewrite_subject,
+ RSPAMD_MULTIPATTERN_DEFAULT);
+ rspamd_multipattern_add_pattern(gtube_matcher,
+ gtube_pattern_no_action,
+ RSPAMD_MULTIPATTERN_DEFAULT);
+
+ GError *err = NULL;
+ rspamd_multipattern_compile(gtube_matcher, &err);
+
+ if (err != NULL) {
+ /* It will be expensive, but I don't care, still better than to abort */
+ msg_err("cannot compile gtube matcher: %s", err->message);
+ g_error_free(err);
+ }
+ }
+
+ if (part->utf_content.len >= sizeof(gtube_pattern_reject) &&
+ part->utf_content.len <= max_check_size &&
+ policy != RSPAMD_GTUBE_DISABLED) {
+ if ((ret = rspamd_multipattern_lookup(gtube_matcher, part->utf_content.begin,
+ part->utf_content.len,
+ rspamd_multipattern_gtube_cb, task, NULL)) > 0) {
+
+ /* ret is the pattern number plus one, see rspamd_multipattern_gtube_cb */
+ switch (ret) {
+ case 1:
+ act = METRIC_ACTION_REJECT;
+ break;
+ case 2:
+ act = METRIC_ACTION_ADD_HEADER;
+ break;
+ case 3:
+ act = METRIC_ACTION_REWRITE_SUBJECT;
+ break;
+ case 4:
+ act = METRIC_ACTION_NOACTION;
+ break;
+ }
+
+ if (ret != 0) {
+ task->flags |= RSPAMD_TASK_FLAG_SKIP;
+ task->flags |= RSPAMD_TASK_FLAG_GTUBE;
+ msg_info_task(
+ "gtube %s pattern has been found in part of length %uz",
+ rspamd_action_to_str(act),
+ part->utf_content.len);
+ }
+ }
+ }
+
+ return act;
+}
+
+/**
+ * Comparison function ordering process exceptions by their position
+ * @param a pointer to the first struct rspamd_process_exception
+ * @param b pointer to the second struct rspamd_process_exception
+ * @return negative, zero or positive value if `a` is before, at the same
+ * position as, or after `b`
+ */
+static gint
+exceptions_compare_func(gconstpointer a, gconstpointer b)
+{
+	const struct rspamd_process_exception *ea = a, *eb = b;
+
+	/*
+	 * Compare instead of subtracting: a difference of positions converted
+	 * to gint can be truncated and get a wrong sign
+	 */
+	return (ea->pos > eb->pos) - (ea->pos < eb->pos);
+}
+
+/**
+ * Prepare the content of a plain text part.
+ *
+ * A part without parsed data is flagged as empty. Otherwise the part is
+ * converted if needed and `utf_content` is pointed at the converted data.
+ * @param task task being processed
+ * @param text_part text part to process
+ * @return FALSE if the part has no converted content, TRUE otherwise
+ */
+static gboolean
+rspamd_message_process_plain_text_part(struct rspamd_task *task,
+									   struct rspamd_mime_text_part *text_part)
+{
+	if (text_part->parsed.len == 0) {
+		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
+
+		return TRUE;
+	}
+
+	rspamd_mime_text_part_maybe_convert(task, text_part);
+
+	if (text_part->utf_raw_content == NULL) {
+		/*
+		 * We ignore unconverted parts from now as it is dangerous
+		 * to treat them as text parts
+		 */
+		text_part->utf_content.len = 0;
+		text_part->utf_content.begin = NULL;
+
+		return FALSE;
+	}
+
+	/* Just have the same content */
+	text_part->utf_content.len = text_part->utf_raw_content->len;
+	text_part->utf_content.begin = (const gchar *) text_part->utf_raw_content->data;
+
+	return TRUE;
+}
+
+/**
+ * Prepare the content of an HTML text part.
+ *
+ * The part is always flagged as HTML. A part without parsed data is flagged
+ * as empty. Otherwise the part is converted if needed, processed as HTML and
+ * `utf_content` is set to the parsed content of the HTML document.
+ * @param task task being processed
+ * @param text_part text part to process
+ * @param cur_url_order passed through to the HTML processing
+ * @return FALSE if the part has no converted content, TRUE otherwise
+ */
+static gboolean
+rspamd_message_process_html_text_part(struct rspamd_task *task,
+ struct rspamd_mime_text_part *text_part,
+ uint16_t *cur_url_order)
+{
+ text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML;
+
+ if (text_part->parsed.len == 0) {
+ text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
+
+ return TRUE;
+ }
+
+ rspamd_mime_text_part_maybe_convert(task, text_part);
+
+ if (text_part->utf_raw_content == NULL) {
+ return FALSE;
+ }
+
+
+ /* The CSS parser is enabled when the task has no config */
+ text_part->html = rspamd_html_process_part_full(
+ task,
+ text_part->utf_raw_content,
+ &text_part->exceptions,
+ MESSAGE_FIELD(task, urls),
+ text_part->mime_part->urls,
+ task->cfg ? task->cfg->enable_css_parser : true,
+ cur_url_order);
+ rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
+
+ /* HTML with no parsed content is treated as an empty part */
+ if (text_part->utf_content.len == 0) {
+ text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
+ }
+
+ return TRUE;
+}
+
+/*
+ * Verdict of rspamd_message_part_can_be_parsed_as_text(): how (and whether)
+ * a mime part should be handled as text.
+ */
+enum rspamd_message_part_is_text_result {
+ RSPAMD_MESSAGE_PART_IS_TEXT_PLAIN = 0,
+ RSPAMD_MESSAGE_PART_IS_TEXT_HTML,
+ RSPAMD_MESSAGE_PART_IS_NOT_TEXT
+};
+
+/*
+ * Decides whether a mime part should be processed as text.
+ *
+ * A part is a text candidate when its content type carries the text flag or
+ * when the detected type is "text". Such a part is treated as HTML when the
+ * content type subtype is "html" or "xhtml" (case insensitive) or when the
+ * detected extension is "html"; otherwise it is plain text.
+ *
+ * Text candidates that are attachments are rejected unless the
+ * check_text_attachements option is enabled in the configuration.
+ *
+ * @param task current task (its configuration is consulted for attachments)
+ * @param mime_part part to classify
+ * @return one of enum rspamd_message_part_is_text_result
+ */
+static enum rspamd_message_part_is_text_result
+rspamd_message_part_can_be_parsed_as_text(struct rspamd_task *task,
+		struct rspamd_mime_part *mime_part)
+{
+	enum rspamd_message_part_is_text_result res = RSPAMD_MESSAGE_PART_IS_NOT_TEXT;
+
+	if ((mime_part->ct && (mime_part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) ||
+		(mime_part->detected_type && strcmp(mime_part->detected_type, "text") == 0)) {
+
+		res = RSPAMD_MESSAGE_PART_IS_TEXT_PLAIN;
+		rspamd_ftok_t html_tok, xhtml_tok;
+
+		html_tok.begin = "html";
+		html_tok.len = 4;
+		xhtml_tok.begin = "xhtml";
+		xhtml_tok.len = 5;
+
+		/*
+		 * This branch can be reached through detected_type alone, so the
+		 * content type may be absent: check it before looking at the subtype
+		 */
+		if ((mime_part->ct &&
+			 (rspamd_ftok_casecmp(&mime_part->ct->subtype, &html_tok) == 0 ||
+			  rspamd_ftok_casecmp(&mime_part->ct->subtype, &xhtml_tok) == 0)) ||
+			(mime_part->detected_ext &&
+			 strcmp(mime_part->detected_ext, "html") == 0)) {
+			res = RSPAMD_MESSAGE_PART_IS_TEXT_HTML;
+		}
+	}
+
+	/* Skip attachments */
+	if (res != RSPAMD_MESSAGE_PART_IS_NOT_TEXT &&
+		(mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT)) {
+		if (!task->cfg->check_text_attachements) {
+			debug_task("skip attachments for checking as text parts");
+			return RSPAMD_MESSAGE_PART_IS_NOT_TEXT;
+		}
+	}
+
+	return res;
+}
+
+/*
+ * Creates a text part for `mime_part` and runs the text pipeline on it.
+ *
+ * The part is allocated from the task pool, processed as HTML or plain text
+ * according to `is_text`, registered in the message text parts and linked
+ * with the mime part. If the GTUBE check requests an action, the GTUBE
+ * result is recorded and the remaining steps (normalisation, url extraction,
+ * word creation) are skipped.
+ *
+ * Returns FALSE when the HTML/plain processing step fails (the part is then
+ * not registered), TRUE otherwise.
+ */
+static gboolean
+rspamd_message_process_text_part_maybe(struct rspamd_task *task,
+		struct rspamd_mime_part *mime_part,
+		enum rspamd_message_part_is_text_result is_text,
+		uint16_t *cur_url_order)
+{
+	struct rspamd_mime_text_part *text_part;
+	enum rspamd_action_type act;
+	gboolean processed, strict_urls;
+
+	text_part = rspamd_mempool_alloc0(task->task_pool, sizeof(*text_part));
+	text_part->mime_part = mime_part;
+	text_part->raw.begin = mime_part->raw_data.begin;
+	text_part->raw.len = mime_part->raw_data.len;
+	text_part->parsed.begin = mime_part->parsed_data.begin;
+	text_part->parsed.len = mime_part->parsed_data.len;
+	text_part->utf_stripped_text = (UText) UTEXT_INITIALIZER;
+
+	/* Attachments are still processed here, they are merely marked */
+	if (mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT) {
+		text_part->flags |= RSPAMD_MIME_TEXT_PART_ATTACHMENT;
+	}
+
+	if (is_text == RSPAMD_MESSAGE_PART_IS_TEXT_HTML) {
+		processed = rspamd_message_process_html_text_part(task, text_part,
+			cur_url_order);
+	}
+	else {
+		processed = rspamd_message_process_plain_text_part(task, text_part);
+	}
+
+	if (!processed) {
+		return FALSE;
+	}
+
+	mime_part->specific.txt = text_part;
+	mime_part->part_type = RSPAMD_MIME_PART_TEXT;
+	g_ptr_array_add(MESSAGE_FIELD(task, text_parts), text_part);
+
+	act = rspamd_check_gtube(task, text_part);
+
+	if (act != METRIC_ACTION_NOACTION) {
+		struct rspamd_action *action =
+			rspamd_config_get_action_by_type(task->cfg, act);
+
+		if (action != NULL) {
+			gdouble score = action->threshold;
+
+			rspamd_add_passthrough_result(task, action,
+				RSPAMD_PASSTHROUGH_CRITICAL,
+				score, "Gtube pattern",
+				"GTUBE", 0, NULL);
+		}
+
+		rspamd_task_insert_result(task, GTUBE_SYMBOL, 0, NULL);
+
+		return TRUE;
+	}
+
+	/* Post process part */
+	rspamd_normalize_text_part(task, text_part);
+
+	if (IS_TEXT_PART_HTML(text_part)) {
+		strict_urls = TRUE;
+	}
+	else {
+		struct rspamd_mime_part *parent = mime_part->parent_part;
+
+		/*
+		 * A plain part inside a two-children multipart uses strict mode:
+		 * missing urls will be extracted from an html part if needed
+		 */
+		strict_urls = (IS_PART_MULTIPART(parent) &&
+			parent->specific.mp->children->len == 2);
+	}
+
+	if (strict_urls) {
+		rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order,
+			RSPAMD_URL_FIND_STRICT);
+	}
+	else {
+		/* Fall back to full text extraction using TLD patterns */
+		rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order,
+			RSPAMD_URL_FIND_ALL);
+	}
+
+	if (text_part->exceptions) {
+		/* Keep exceptions ordered; the list itself is freed with the pool */
+		text_part->exceptions = g_list_sort(text_part->exceptions,
+			exceptions_compare_func);
+		rspamd_mempool_add_destructor(task->task_pool,
+			(rspamd_mempool_destruct_t) g_list_free,
+			text_part->exceptions);
+	}
+
+	rspamd_mime_part_create_words(task, text_part);
+
+	return TRUE;
+}
+
+/*
+ * Creates a single-part message from raw (non-mime) data.
+ *
+ * One mime part covering [start, start + len) is allocated from the task pool
+ * and used both as raw and as parsed data. Its content type is taken from the
+ * "Content-Type" request header when present; otherwise, if the task has a
+ * configuration with libs_ctx, lua_magic.detect_mime_part is called and the
+ * "ct" field of the table it returns is used, falling back to
+ * "application/octet-stream". A content disposition of "inline" is
+ * synthesised, carrying the "Filename" request header as filename when given.
+ *
+ * The part is appended to the message parts, its digest is calculated, and a
+ * freshly generated message id is stored both as the message id and as the
+ * task queue id.
+ *
+ * @param task current task, task->message must already exist
+ * @param start beginning of the data, must not be NULL
+ * @param len length of the data
+ */
+static void
+rspamd_message_from_data(struct rspamd_task *task, const guchar *start,
+ gsize len)
+{
+ struct rspamd_content_type *ct = NULL;
+ struct rspamd_mime_part *part;
+ const char *mb = "application/octet-stream";
+ gchar *mid;
+ rspamd_ftok_t srch, *tok;
+ gchar cdbuf[1024];
+
+ g_assert(start != NULL);
+
+ part = rspamd_mempool_alloc0(task->task_pool, sizeof(*part));
+
+ part->raw_data.begin = start;
+ part->raw_data.len = len;
+ part->parsed_data.begin = start;
+ part->parsed_data.len = len;
+ part->part_number = MESSAGE_FIELD(task, parts)->len;
+ part->urls = g_ptr_array_new();
+ part->raw_headers = rspamd_message_headers_new();
+ part->headers_order = NULL;
+
+ tok = rspamd_task_get_request_header(task, "Content-Type");
+
+ if (tok) {
+ /* We have Content-Type defined */
+ ct = rspamd_content_type_parse(tok->begin, tok->len,
+ task->task_pool);
+ part->ct = ct;
+ }
+ else if (task->cfg && task->cfg->libs_ctx) {
+ lua_State *L = task->cfg->lua_state;
+
+ if (rspamd_lua_require_function(L,
+ "lua_magic", "detect_mime_part")) {
+
+ struct rspamd_mime_part **pmime;
+ struct rspamd_task **ptask;
+
+ /* Arguments of the detector: the mime part and the task */
+ pmime = lua_newuserdata(L, sizeof(struct rspamd_mime_part *));
+ rspamd_lua_setclass(L, "rspamd{mimepart}", -1);
+ *pmime = part;
+ ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
+ rspamd_lua_setclass(L, "rspamd{task}", -1);
+ *ptask = task;
+
+ if (lua_pcall(L, 2, 2, 0) != 0) {
+ msg_err_task("cannot detect type: %s", lua_tostring(L, -1));
+ }
+ else {
+ /* Only the second returned value (a table) is inspected here */
+ if (lua_istable(L, -1)) {
+ lua_pushstring(L, "ct");
+ lua_gettable(L, -2);
+
+ if (lua_isstring(L, -1)) {
+ /* Copy to the pool: the Lua string dies with the stack reset below */
+ mb = rspamd_mempool_strdup(task->task_pool,
+ lua_tostring(L, -1));
+ }
+ }
+ }
+
+ /* NOTE(review): this clears the whole Lua stack, not only the values pushed here */
+ lua_settop(L, 0);
+ }
+ else {
+ msg_err_task("cannot require lua_magic.detect_mime_part");
+ }
+
+ if (mb) {
+ srch.begin = mb;
+ srch.len = strlen(mb);
+ ct = rspamd_content_type_parse(srch.begin, srch.len,
+ task->task_pool);
+
+ /*
+ * NOTE(review): the part was zero-allocated and this branch is only
+ * taken when no Content-Type header was supplied, so part->ct looks
+ * to be NULL here unless the Lua detector assigned it - confirm
+ * whether the `else` arm below is reachable.
+ */
+ if (!part->ct) {
+ msg_info_task("construct fake mime of type: %s", mb);
+ part->ct = ct;
+ }
+ else {
+ /* Check sanity */
+ if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) {
+ RSPAMD_FTOK_FROM_STR(&srch, "application");
+
+ if (rspamd_ftok_cmp(&ct->type, &srch) == 0) {
+ msg_info_task("construct fake mime of type: %s", mb);
+ part->ct = ct;
+ }
+ }
+ else {
+ msg_info_task("construct fake mime of type: %T/%T, detected %s",
+ &part->ct->type, &part->ct->subtype, mb);
+ }
+ }
+
+ part->detected_ct = ct;
+ }
+ }
+
+
+ tok = rspamd_task_get_request_header(task, "Filename");
+
+ /* Build a Content-Disposition value to be parsed like a real header */
+ if (tok) {
+ rspamd_snprintf(cdbuf, sizeof(cdbuf), "inline; filename=\"%T\"", tok);
+ }
+ else {
+ rspamd_snprintf(cdbuf, sizeof(cdbuf), "inline");
+ }
+
+ part->cd = rspamd_content_disposition_parse(cdbuf, strlen(cdbuf),
+ task->task_pool);
+
+ g_ptr_array_add(MESSAGE_FIELD(task, parts), part);
+ rspamd_mime_parser_calc_digest(part);
+
+ /* Generate message ID */
+ mid = rspamd_mime_message_id_generate("localhost.localdomain");
+ /* mid is released with g_free when the task pool is destroyed */
+ rspamd_mempool_add_destructor(task->task_pool,
+ (rspamd_mempool_destruct_t) g_free, mid);
+ MESSAGE_FIELD(task, message_id) = mid;
+ task->queue_id = mid;
+}
+
+/*
+ * Destructor installed by rspamd_message_new(): releases everything owned by
+ * a message that does not live in the task memory pool, i.e. per-part header
+ * tables, multipart children arrays, Lua registry references and url arrays,
+ * per-text-part word/hash/language arrays and finally the message level
+ * headers, part arrays and url hash.
+ */
+static void
+rspamd_message_dtor(struct rspamd_message *msg)
+{
+	guint idx;
+	struct rspamd_mime_part *part;
+	struct rspamd_mime_text_part *tpart;
+
+	PTR_ARRAY_FOREACH(msg->parts, idx, part)
+	{
+		if (part->raw_headers) {
+			rspamd_message_headers_unref(part->raw_headers);
+		}
+
+		if (IS_PART_MULTIPART(part) && part->specific.mp->children) {
+			g_ptr_array_free(part->specific.mp->children, TRUE);
+		}
+
+		/* A cbref of -1 means that no registry reference is held */
+		if (part->part_type == RSPAMD_MIME_PART_CUSTOM_LUA &&
+			part->specific.lua_specific.cbref != -1) {
+			luaL_unref(msg->task->cfg->lua_state,
+				LUA_REGISTRYINDEX,
+				part->specific.lua_specific.cbref);
+		}
+
+		if (part->urls) {
+			g_ptr_array_unref(part->urls);
+		}
+	}
+
+	PTR_ARRAY_FOREACH(msg->text_parts, idx, tpart)
+	{
+		if (tpart->utf_words) {
+			g_array_free(tpart->utf_words, TRUE);
+		}
+
+		if (tpart->normalized_hashes) {
+			g_array_free(tpart->normalized_hashes, TRUE);
+		}
+
+		if (tpart->languages) {
+			g_ptr_array_unref(tpart->languages);
+		}
+	}
+
+	/* Message level containers */
+	rspamd_message_headers_unref(msg->raw_headers);
+	g_ptr_array_unref(msg->text_parts);
+	g_ptr_array_unref(msg->parts);
+	kh_destroy(rspamd_url_hash, msg->urls);
+}
+
+/*
+ * Allocates a zeroed message from the task memory pool and equips it with an
+ * empty headers table, url hash and part arrays. rspamd_message_dtor() is
+ * registered as the destructor through REF_INIT_RETAIN.
+ */
+struct rspamd_message *
+rspamd_message_new(struct rspamd_task *task)
+{
+	struct rspamd_message *msg = rspamd_mempool_alloc0(task->task_pool,
+		sizeof(struct rspamd_message));
+
+	msg->task = task;
+
+	/* Containers released by the destructor */
+	msg->raw_headers = rspamd_message_headers_new();
+	msg->urls = kh_init(rspamd_url_hash);
+	msg->parts = g_ptr_array_sized_new(4);
+	msg->text_parts = g_ptr_array_sized_new(2);
+
+	REF_INIT_RETAIN(msg, rspamd_message_dtor);
+
+	return msg;
+}
+
+/*
+ * Parses the raw message of a task into task->message.
+ *
+ * Leading whitespace and an mbox style "From " line are skipped, any previous
+ * message object is released and a new one is created. Tasks flagged as mime
+ * are handed to the mime parser; when it fails fatally the data is either
+ * rejected or, if raw input is allowed, wrapped into a single fake part.
+ * Non-mime tasks are always wrapped into a single part. Afterwards the
+ * message digest is derived from the digests of all parts (and from the
+ * subject, when present).
+ *
+ * @param task task to process
+ * @return TRUE on success, including an empty task which is only flagged to
+ * skip processing; FALSE when the mime parser fails fatally and the
+ * configuration does not allow raw input (the error, if any, is then stored
+ * in task->err)
+ */
+gboolean
+rspamd_message_parse(struct rspamd_task *task)
+{
+	const gchar *p;
+	gsize len;
+	guint i;
+	GError *err = NULL;
+	/*
+	 * Zero initialised: the loop over parts below is the only writer, so
+	 * without it an empty parts array would copy indeterminate bytes into
+	 * the message digest
+	 */
+	guint64 n[2] = {0, 0}, seed;
+
+	if (RSPAMD_TASK_IS_EMPTY(task)) {
+		/* Don't do anything with empty task */
+		task->flags |= RSPAMD_TASK_FLAG_SKIP_PROCESS;
+		return TRUE;
+	}
+
+	p = task->msg.begin;
+	len = task->msg.len;
+
+	/* Skip any space characters to avoid some bad messages to be unparsed */
+	while (len > 0 && g_ascii_isspace(*p)) {
+		p++;
+		len--;
+	}
+
+	/*
+	 * Exim somehow uses mailbox format for messages being scanned:
+	 * From xxx@xxx.com Fri May 13 19:08:48 2016
+	 *
+	 * So we check if a task has this line to avoid possible issues
+	 */
+	if (len > sizeof("From ") - 1) {
+		if (memcmp(p, "From ", sizeof("From ") - 1) == 0) {
+			/* Skip to CRLF */
+			msg_info_task("mailbox input detected, enable workaround");
+			p += sizeof("From ") - 1;
+			len -= sizeof("From ") - 1;
+
+			while (len > 0 && *p != '\n') {
+				p++;
+				len--;
+			}
+			while (len > 0 && g_ascii_isspace(*p)) {
+				p++;
+				len--;
+			}
+		}
+	}
+
+	task->msg.begin = p;
+	task->msg.len = len;
+
+	/* Cleanup old message */
+	if (task->message) {
+		rspamd_message_unref(task->message);
+	}
+
+	task->message = rspamd_message_new(task);
+
+	if (task->flags & RSPAMD_TASK_FLAG_MIME) {
+		enum rspamd_mime_parse_error ret;
+
+		debug_task("construct mime parser from string length %d",
+			(gint) task->msg.len);
+		ret = rspamd_mime_parse_task(task, &err);
+
+		switch (ret) {
+		case RSPAMD_MIME_PARSE_FATAL:
+			msg_err_task("cannot construct mime from stream: %e", err);
+
+			if (task->cfg && (!task->cfg->allow_raw_input)) {
+				msg_err_task("cannot construct mime from stream");
+				if (err) {
+					/* Ownership of the error moves to the task */
+					task->err = err;
+				}
+
+				return FALSE;
+			}
+			else {
+				task->flags &= ~RSPAMD_TASK_FLAG_MIME;
+				rspamd_message_from_data(task, p, len);
+			}
+			break;
+		case RSPAMD_MIME_PARSE_NESTING:
+			msg_warn_task("cannot construct full mime from stream: %e", err);
+			task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
+			break;
+		case RSPAMD_MIME_PARSE_OK:
+		default:
+			break;
+		}
+
+		if (err) {
+			g_error_free(err);
+		}
+	}
+	else {
+		rspamd_message_from_data(task, p, len);
+	}
+
+
+	if (MESSAGE_FIELD(task, message_id) == NULL) {
+		MESSAGE_FIELD(task, message_id) = "undef";
+	}
+
+	debug_task("found %ud parts in message", MESSAGE_FIELD(task, parts)->len);
+	if (task->queue_id == NULL) {
+		task->queue_id = "undef";
+	}
+
+	rspamd_received_maybe_fix_task(task);
+
+	struct rspamd_mime_part *part;
+
+	/* Blake2b applied to string 'rspamd' */
+	static const guchar RSPAMD_ALIGNED(32) hash_key[] = {
+		0xef, 0x43, 0xae, 0x80, 0xcc, 0x8d, 0xc3, 0x4c,
+		0x6f, 0x1b, 0xd6, 0x18, 0x1b, 0xae, 0x87, 0x74,
+		0x0c, 0xca, 0xf7, 0x8e, 0x5f, 0x2e, 0x54, 0x32,
+		0xf6, 0x79, 0xb9, 0x27, 0x26, 0x96, 0x20, 0x92,
+		0x70, 0x07, 0x85, 0xeb, 0x83, 0xf7, 0x89, 0xe0,
+		0xd7, 0x32, 0x2a, 0xd2, 0x1a, 0x64, 0x41, 0xef,
+		0x49, 0xff, 0xc3, 0x8c, 0x54, 0xf9, 0x67, 0x74,
+		0x30, 0x1e, 0x70, 0x2e, 0xb7, 0x12, 0x09, 0xfe,
+	};
+
+	/* Only the leading sizeof(seed) bytes of the key seed the hash chain */
+	memcpy(&seed, hash_key, sizeof(seed));
+
+	/* Chain the digests of all parts: each round reseeds the next one */
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part)
+	{
+		n[0] = t1ha2_atonce128(&n[1],
+			part->digest, sizeof(part->digest),
+			seed);
+
+		seed = n[0] ^ n[1];
+	}
+
+	memcpy(MESSAGE_FIELD(task, digest), n, sizeof(n));
+
+	if (MESSAGE_FIELD(task, subject)) {
+		/* The subject is hashed with the seed left by the parts chain */
+		p = MESSAGE_FIELD(task, subject);
+		len = strlen(p);
+		n[0] = t1ha2_atonce128(&n[1],
+			p, len,
+			seed);
+		memcpy(MESSAGE_FIELD(task, digest), n, sizeof(n));
+	}
+
+	if (task->queue_id) {
+		msg_info_task("loaded message; id: <%s>; queue-id: <%s>; size: %z; "
+					  "checksum: <%*xs>",
+			MESSAGE_FIELD(task, message_id), task->queue_id, task->msg.len,
+			(gint) sizeof(MESSAGE_FIELD(task, digest)), MESSAGE_FIELD(task, digest));
+	}
+	else {
+		msg_info_task("loaded message; id: <%s>; size: %z; "
+					  "checksum: <%*xs>",
+			MESSAGE_FIELD(task, message_id), task->msg.len,
+			(gint) sizeof(MESSAGE_FIELD(task, digest)), MESSAGE_FIELD(task, digest));
+	}
+
+	return TRUE;
+}
+
+
+/*
+ * A helper pair used by rspamd_message_process() to remember which parts were
+ * detected as text: `pos` is the index of the part in the message parts array
+ * and `res` is the kind of text (plain or html) detected for it.
+ * In C++ this could simply be a std::pair; in C it has to be spelled out.
+ */
+struct rspamd_mime_part_text_position {
+ unsigned pos;
+ enum rspamd_message_part_is_text_result res;
+};
+
+/*
+ * Comparator for detected text part positions: html entries are placed before
+ * entries of any other kind; entries of the same kind are ordered by
+ * descending position.
+ */
+static int
+rspamd_mime_text_part_position_compare_func(const void *v1, const void *v2)
+{
+	const struct rspamd_mime_part_text_position *lhs = v1, *rhs = v2;
+
+	if (lhs->res != rhs->res) {
+		/* Kinds differ: whichever side is html wins */
+		return lhs->res == RSPAMD_MESSAGE_PART_IS_TEXT_HTML ? -1 : 1;
+	}
+
+	return (int) rhs->pos - (int) lhs->pos;
+}
+
+/*
+ * Runs content detection and text extraction on an already parsed message.
+ *
+ * Steps, in order:
+ *  - archives are processed;
+ *  - for every part with non-empty parsed data lua_magic.detect_mime_part is
+ *    called (filling detected_ext, detected_ct, detected_type and possibly the
+ *    no-text-extraction flag), then lua_content.maybe_process_mime_part is
+ *    called for parts whose type is still undefined, then image detection is
+ *    tried; parts that remain undefined and may be parsed as text are collected;
+ *  - collected parts are sorted (html first) and turned into text parts;
+ *  - urls are extracted from the subject;
+ *  - language detection and word extraction run for every text part;
+ *  - for a message with two text parts that share a parent whose subtype is
+ *    "alternative", the language is propagated between the parts and the
+ *    "parts_distance" and "total_words" pool variables are set;
+ *  - the average word length and short words counters are divided by the
+ *    total number of words;
+ *  - images are linked and meta words are tokenized.
+ *
+ * The Lua stack is restored to its initial height before the subject urls are
+ * processed (when a Lua state is available).
+ */
+void rspamd_message_process(struct rspamd_task *task)
+{
+ guint i;
+ struct rspamd_mime_text_part *p1, *p2;
+ gdouble diff, *pdiff;
+ guint tw, *ptw, dw;
+ struct rspamd_mime_part *part;
+ lua_State *L = NULL;
+ gint magic_func_pos = -1, content_func_pos = -1, old_top = -1, funcs_top = -1;
+
+ if (task->cfg) {
+ L = task->cfg->lua_state;
+ }
+
+ rspamd_archives_process(task);
+
+ if (L) {
+ old_top = lua_gettop(L);
+ }
+
+ /* Both helper functions stay on the Lua stack and are re-pushed per part */
+ if (L && rspamd_lua_require_function(L,
+ "lua_magic", "detect_mime_part")) {
+ magic_func_pos = lua_gettop(L);
+ }
+ else {
+ msg_err_task("cannot require lua_magic.detect_mime_part");
+ }
+
+ if (L && rspamd_lua_require_function(L,
+ "lua_content", "maybe_process_mime_part")) {
+ content_func_pos = lua_gettop(L);
+ }
+ else {
+ msg_err_task("cannot require lua_content.maybe_process_mime_part");
+ }
+
+ if (L) {
+ /* Stack height to return to after each per-part call */
+ funcs_top = lua_gettop(L);
+ }
+
+ GArray *detected_text_parts = g_array_sized_new(FALSE, FALSE, sizeof(struct rspamd_mime_part_text_position), 2);
+
+ PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part)
+ {
+ if (magic_func_pos != -1 && part->parsed_data.len > 0) {
+ struct rspamd_mime_part **pmime;
+ struct rspamd_task **ptask;
+
+ lua_pushcfunction(L, &rspamd_lua_traceback);
+ gint err_idx = lua_gettop(L);
+ lua_pushvalue(L, magic_func_pos);
+ pmime = lua_newuserdata(L, sizeof(struct rspamd_mime_part *));
+ rspamd_lua_setclass(L, "rspamd{mimepart}", -1);
+ *pmime = part;
+ ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
+ rspamd_lua_setclass(L, "rspamd{task}", -1);
+ *ptask = task;
+
+ if (lua_pcall(L, 2, 2, err_idx) != 0) {
+ msg_err_task("cannot detect type: %s", lua_tostring(L, -1));
+ }
+ else {
+ /* Two results: the extension at -2 and a description table at -1 */
+ if (lua_istable(L, -1)) {
+ const gchar *mb;
+
+ /* First returned value */
+ part->detected_ext = rspamd_mempool_strdup(task->task_pool,
+ lua_tostring(L, -2));
+
+ lua_pushstring(L, "ct");
+ lua_gettable(L, -2);
+
+ if (lua_isstring(L, -1)) {
+ mb = lua_tostring(L, -1);
+
+ if (mb) {
+ rspamd_ftok_t srch;
+
+ srch.begin = mb;
+ srch.len = strlen(mb);
+ part->detected_ct = rspamd_content_type_parse(srch.begin,
+ srch.len,
+ task->task_pool);
+ }
+ }
+
+ lua_pop(L, 1);
+
+ lua_pushstring(L, "type");
+ lua_gettable(L, -2);
+
+ if (lua_isstring(L, -1)) {
+ part->detected_type = rspamd_mempool_strdup(task->task_pool,
+ lua_tostring(L, -1));
+ }
+
+ lua_pop(L, 1);
+
+ lua_pushstring(L, "no_text");
+ lua_gettable(L, -2);
+
+ if (lua_isboolean(L, -1)) {
+ if (!!lua_toboolean(L, -1)) {
+ part->flags |= RSPAMD_MIME_PART_NO_TEXT_EXTRACTION;
+ }
+ }
+
+ lua_pop(L, 1);
+ }
+ }
+
+ lua_settop(L, funcs_top);
+ }
+
+ /* Now detect content */
+ if (content_func_pos != -1 && part->parsed_data.len > 0 &&
+ part->part_type == RSPAMD_MIME_PART_UNDEFINED) {
+ struct rspamd_mime_part **pmime;
+ struct rspamd_task **ptask;
+
+ lua_pushcfunction(L, &rspamd_lua_traceback);
+ gint err_idx = lua_gettop(L);
+ lua_pushvalue(L, content_func_pos);
+ pmime = lua_newuserdata(L, sizeof(struct rspamd_mime_part *));
+ rspamd_lua_setclass(L, "rspamd{mimepart}", -1);
+ *pmime = part;
+ ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
+ rspamd_lua_setclass(L, "rspamd{task}", -1);
+ *ptask = task;
+
+ if (lua_pcall(L, 2, 0, err_idx) != 0) {
+ msg_err_task("cannot detect content: %s", lua_tostring(L, -1));
+ }
+
+ lua_settop(L, funcs_top);
+ }
+
+ /* Try to detect image before checking for text */
+ rspamd_images_process_mime_part_maybe(task, part);
+
+ /* Only parts nobody has claimed so far may become text parts */
+ if (part->part_type == RSPAMD_MIME_PART_UNDEFINED &&
+ !(part->flags & RSPAMD_MIME_PART_NO_TEXT_EXTRACTION)) {
+ enum rspamd_message_part_is_text_result res = rspamd_message_part_can_be_parsed_as_text(task, part);
+
+ if (res != RSPAMD_MESSAGE_PART_IS_NOT_TEXT) {
+ struct rspamd_mime_part_text_position p = {
+ .pos = i,
+ .res = res};
+ g_array_append_val(detected_text_parts, p);
+ }
+ }
+ }
+
+ uint16_t cur_url_order = 0;
+ g_array_sort(detected_text_parts, rspamd_mime_text_part_position_compare_func);
+ /* One more iteration to process text parts in a more specific order */
+ for (i = 0; i < detected_text_parts->len; i++) {
+ part = g_ptr_array_index(MESSAGE_FIELD(task, parts),
+ g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).pos);
+ rspamd_message_process_text_part_maybe(task, part,
+ g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).res, &cur_url_order);
+ }
+
+ g_array_free(detected_text_parts, TRUE);
+
+ if (old_top != -1) {
+ lua_settop(L, old_top);
+ }
+
+ /* Parse urls inside Subject header */
+ if (MESSAGE_FIELD(task, subject)) {
+ rspamd_url_find_multiple(task->task_pool, MESSAGE_FIELD(task, subject),
+ strlen(MESSAGE_FIELD(task, subject)),
+ RSPAMD_URL_FIND_STRICT, NULL,
+ rspamd_url_task_subject_callback,
+ task);
+ }
+
+ /* Calculate average words length and number of short words */
+ struct rspamd_mime_text_part *text_part;
+ gdouble *var;
+ guint total_words = 0;
+
+ PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
+ {
+ if (!text_part->language) {
+ rspamd_mime_part_detect_language(task, text_part);
+ }
+
+ rspamd_mime_part_extract_words(task, text_part);
+
+ if (text_part->utf_words) {
+ total_words += text_part->nwords;
+ }
+ }
+
+ /* Calculate distance for 2-parts messages */
+ /*
+ * NOTE(review): `i` is the counter left behind by PTR_ARRAY_FOREACH above,
+ * presumably equal to the number of text parts - confirm against the macro.
+ */
+ if (i == 2) {
+ p1 = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 0);
+ p2 = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 1);
+
+ /* First of all check parent object */
+ if (p1->mime_part->parent_part) {
+ rspamd_ftok_t srch;
+
+ srch.begin = "alternative";
+ srch.len = 11;
+
+ /* NOTE(review): the parent content type is dereferenced without a NULL check */
+ if (rspamd_ftok_cmp(&p1->mime_part->parent_part->ct->subtype, &srch) == 0) {
+ if (!IS_TEXT_PART_EMPTY(p1) && !IS_TEXT_PART_EMPTY(p2) &&
+ p1->normalized_hashes && p2->normalized_hashes) {
+ /*
+ * We also detect language on one part and propagate it to
+ * another one
+ */
+ struct rspamd_mime_text_part *sel;
+
+ /* Prefer HTML as text part is not displayed normally */
+ if (IS_TEXT_PART_HTML(p1)) {
+ sel = p1;
+ }
+ else if (IS_TEXT_PART_HTML(p2)) {
+ sel = p2;
+ }
+ else {
+ if (p1->utf_content.len > p2->utf_content.len) {
+ sel = p1;
+ }
+ else {
+ sel = p2;
+ }
+ }
+
+ if (sel->language && sel->language[0]) {
+ /* Propagate language */
+ if (sel == p1) {
+ if (p2->languages) {
+ g_ptr_array_unref(p2->languages);
+ }
+
+ p2->language = sel->language;
+ p2->languages = g_ptr_array_ref(sel->languages);
+ }
+ else {
+ if (p1->languages) {
+ g_ptr_array_unref(p1->languages);
+ }
+
+ p1->language = sel->language;
+ p1->languages = g_ptr_array_ref(sel->languages);
+ }
+ }
+
+ tw = p1->normalized_hashes->len + p2->normalized_hashes->len;
+
+ if (tw > 0) {
+ dw = rspamd_words_levenshtein_distance(task,
+ p1->normalized_hashes,
+ p2->normalized_hashes);
+ /* Distance relative to the combined number of hashes */
+ diff = dw / (gdouble) tw;
+
+ msg_debug_task(
+ "different words: %d, total words: %d, "
+ "got diff between parts of %.2f",
+ dw, tw,
+ diff);
+
+ pdiff = rspamd_mempool_alloc(task->task_pool,
+ sizeof(gdouble));
+ *pdiff = diff;
+ rspamd_mempool_set_variable(task->task_pool,
+ "parts_distance",
+ pdiff,
+ NULL);
+ ptw = rspamd_mempool_alloc(task->task_pool,
+ sizeof(gint));
+ *ptw = tw;
+ rspamd_mempool_set_variable(task->task_pool,
+ "total_words",
+ ptw,
+ NULL);
+ }
+ }
+ }
+ }
+ else {
+ debug_task(
+ "message contains two parts but they are in different multi-parts");
+ }
+ }
+
+ if (total_words > 0) {
+ /* The pool variables are divided in place by the total word count */
+ var = rspamd_mempool_get_variable(task->task_pool,
+ RSPAMD_MEMPOOL_AVG_WORDS_LEN);
+
+ if (var) {
+ *var /= (double) total_words;
+ }
+
+ var = rspamd_mempool_get_variable(task->task_pool,
+ RSPAMD_MEMPOOL_SHORT_WORDS_CNT);
+
+ if (var) {
+ *var /= (double) total_words;
+ }
+ }
+
+ rspamd_images_link(task);
+ rspamd_tokenize_meta_words(task);
+}
+
+
+/*
+ * Takes one more reference on a message.
+ *
+ * NULL is accepted and returned unchanged, mirroring rspamd_message_unref()
+ * which also ignores NULL.
+ *
+ * @param msg message to retain, may be NULL
+ * @return msg itself
+ */
+struct rspamd_message *
+rspamd_message_ref(struct rspamd_message *msg)
+{
+	if (msg) {
+		REF_RETAIN(msg);
+	}
+
+	return msg;
+}
+
+/*
+ * Drops one reference from a message; a NULL message is silently ignored.
+ */
+void rspamd_message_unref(struct rspamd_message *msg)
+{
+	if (msg == NULL) {
+		return;
+	}
+
+	REF_RELEASE(msg);
+}
+
+/*
+ * Mixes `len` bytes of `input` into the message digest: the digest is read as
+ * two 64 bit words, the first word seeds a 128 bit t1ha2 hash of the input
+ * and the hash output replaces the digest.
+ */
+void rspamd_message_update_digest(struct rspamd_message *msg,
+		const void *input, gsize len)
+{
+	guint64 state[2];
+
+	/* The digest must be exactly as wide as the two hash words */
+	G_STATIC_ASSERT(sizeof(state) == sizeof(msg->digest));
+
+	memcpy(state, msg->digest, sizeof(state));
+	state[0] = t1ha2_atonce128(&state[1], input, len, state[0]);
+	memcpy(msg->digest, state, sizeof(state));
+}
diff --git a/src/libmime/message.h b/src/libmime/message.h
new file mode 100644
index 0000000..52dedab
--- /dev/null
+++ b/src/libmime/message.h
@@ -0,0 +1,239 @@
+/**
+ * @file message.h
+ * Message processing functions and structures
+ */
+
+#ifndef RSPAMD_MESSAGE_H
+#define RSPAMD_MESSAGE_H
+
+#include "config.h"
+
+#include "libmime/email_addr.h"
+#include "libutil/addr.h"
+#include "libcryptobox/cryptobox.h"
+#include "libmime/mime_headers.h"
+#include "libmime/content_type.h"
+#include "libserver/url.h"
+#include "libutil/ref.h"
+#include "libutil/str_util.h"
+
+#include <unicode/uchar.h>
+#include <unicode/utext.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct rspamd_task;
+struct controller_session;
+struct rspamd_image;
+struct rspamd_archive;
+
+/**
+ * Bit flags for a mime part (message.c stores them in rspamd_mime_part::flags).
+ * The bit positions are not contiguous.
+ */
+enum rspamd_mime_part_flags {
+ RSPAMD_MIME_PART_ATTACHEMENT = (1u << 1u),
+ RSPAMD_MIME_PART_BAD_CTE = (1u << 4u),
+ RSPAMD_MIME_PART_MISSING_CTE = (1u << 5u),
+ RSPAMD_MIME_PART_NO_TEXT_EXTRACTION = (1u << 6u), /**< the part must not be considered for text extraction */
+};
+
+/**
+ * Kind of content a mime part has been recognised as. A zero-initialised part
+ * is RSPAMD_MIME_PART_UNDEFINED. The type tells which member of
+ * rspamd_mime_part::specific is in use (e.g. `mp` for multipart, `txt` for
+ * text, `lua_specific` for custom Lua parts).
+ */
+enum rspamd_mime_part_type {
+ RSPAMD_MIME_PART_UNDEFINED = 0,
+ RSPAMD_MIME_PART_MULTIPART,
+ RSPAMD_MIME_PART_MESSAGE,
+ RSPAMD_MIME_PART_TEXT,
+ RSPAMD_MIME_PART_ARCHIVE,
+ RSPAMD_MIME_PART_IMAGE,
+ RSPAMD_MIME_PART_CUSTOM_LUA
+};
+
+/* Part type predicates; all of them are safe to use on a NULL part */
+#define IS_PART_MULTIPART(part) ((part) && ((part)->part_type == RSPAMD_MIME_PART_MULTIPART))
+#define IS_PART_TEXT(part) ((part) && ((part)->part_type == RSPAMD_MIME_PART_TEXT))
+#define IS_PART_MESSAGE(part) ((part) && ((part)->part_type == RSPAMD_MIME_PART_MESSAGE))
+
+/**
+ * Transfer encoding of a mime part (presumably the value of the
+ * Content-Transfer-Encoding header; see rspamd_cte_from_string()).
+ */
+enum rspamd_cte {
+ RSPAMD_CTE_UNKNOWN = 0,
+ RSPAMD_CTE_7BIT = 1,
+ RSPAMD_CTE_8BIT = 2,
+ RSPAMD_CTE_QP = 3,
+ RSPAMD_CTE_B64 = 4,
+ RSPAMD_CTE_UUE = 5,
+};
+
+struct rspamd_mime_text_part;
+
+/**
+ * Data specific to a multipart mime part
+ */
+struct rspamd_mime_multipart {
+ GPtrArray *children; /**< child parts; the array is freed by the message destructor */
+ rspamd_ftok_t boundary;
+};
+
+/**
+ * Kind of the Lua value attached to a custom Lua part
+ */
+enum rspamd_lua_specific_type {
+ RSPAMD_LUA_PART_TEXT,
+ RSPAMD_LUA_PART_STRING,
+ RSPAMD_LUA_PART_TABLE,
+ RSPAMD_LUA_PART_FUNCTION,
+ RSPAMD_LUA_PART_UNKNOWN,
+};
+
+/**
+ * Data specific to a part of type RSPAMD_MIME_PART_CUSTOM_LUA
+ */
+struct rspamd_lua_specific_part {
+ gint cbref; /**< reference in the Lua registry, -1 when no reference is held */
+ enum rspamd_lua_specific_type type;
+};
+
+/**
+ * A single mime part of a message
+ */
+struct rspamd_mime_part {
+ struct rspamd_content_type *ct; /**< content type of the part, may be NULL */
+ struct rspamd_content_type *detected_ct; /**< content type reported by lua_magic detection */
+ gchar *detected_type; /**< "type" field reported by lua_magic detection */
+ gchar *detected_ext; /**< extension reported by lua_magic detection */
+ struct rspamd_content_disposition *cd;
+ rspamd_ftok_t raw_data; /**< data in its native form */
+ rspamd_ftok_t parsed_data; /**< data decoded from mime encodings */
+ struct rspamd_mime_part *parent_part; /**< enclosing part, NULL if there is none */
+
+ struct rspamd_mime_header *headers_order;
+ struct rspamd_mime_headers_table *raw_headers; /**< unreferenced by the message destructor */
+ GPtrArray *urls; /**< unreferenced by the message destructor */
+
+ gchar *raw_headers_str;
+ gsize raw_headers_len;
+
+ enum rspamd_cte cte;
+ guint flags; /**< enum rspamd_mime_part_flags bits */
+ enum rspamd_mime_part_type part_type;
+ guint part_number;
+
+ /* Type specific data, the active member depends on part_type */
+ union {
+ struct rspamd_mime_multipart *mp;
+ struct rspamd_mime_text_part *txt;
+ struct rspamd_image *img;
+ struct rspamd_archive *arch;
+ struct rspamd_lua_specific_part lua_specific;
+ } specific;
+
+ guchar digest[rspamd_cryptobox_HASHBYTES]; /**< digest of the part, folded into the message digest */
+};
+
+/* Bit flags stored in rspamd_mime_text_part::flags */
+#define RSPAMD_MIME_TEXT_PART_FLAG_UTF (1 << 0)
+#define RSPAMD_MIME_TEXT_PART_FLAG_EMPTY (1 << 1)
+#define RSPAMD_MIME_TEXT_PART_FLAG_HTML (1 << 2)
+#define RSPAMD_MIME_TEXT_PART_FLAG_8BIT_RAW (1 << 3)
+#define RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED (1 << 4)
+#define RSPAMD_MIME_TEXT_PART_ATTACHMENT (1 << 5)
+
+/*
+ * Flag tests for text parts: each yields the masked bit (zero or non-zero,
+ * not normalised to 0/1) and requires a non-NULL part
+ */
+#define IS_TEXT_PART_EMPTY(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_EMPTY)
+#define IS_TEXT_PART_UTF(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_UTF)
+#define IS_TEXT_PART_HTML(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_HTML)
+#define IS_TEXT_PART_ATTACHMENT(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_ATTACHMENT)
+
+
+/**
+ * Text specific view of a mime part (plain text or html)
+ */
+struct rspamd_mime_text_part {
+ const gchar *language; /**< selected language; may be shared with a sibling part */
+ GPtrArray *languages; /**< unreferenced by the message destructor */
+ const gchar *real_charset;
+
+ /* Raw data in native encoding */
+ rspamd_ftok_t raw;
+ rspamd_ftok_t parsed; /* decoded from mime encodings */
+
+ /* UTF8 content */
+ rspamd_ftok_t utf_content; /* utf8 encoded processed content */
+ GByteArray *utf_raw_content; /* utf raw content */
+ GByteArray *utf_stripped_content; /* utf content with no newlines */
+ GArray *normalized_hashes; /* Array of guint64 */
+ GArray *utf_words; /* Array of rspamd_stat_token_t */
+ UText utf_stripped_text; /* Used by libicu to represent the utf8 content */
+
+ GPtrArray *newlines; /**< positions of newlines in text, relative to content*/
+ void *html; /**< result of html processing, set for html parts only */
+ GList *exceptions; /**< list of offsets of urls */
+ struct rspamd_mime_part *mime_part; /**< mime part this text part was built from */
+
+ guint flags; /**< RSPAMD_MIME_TEXT_PART_* bits */
+ guint nlines;
+ guint spaces;
+ guint nwords;
+ guint non_ascii_chars;
+ guint ascii_chars;
+ guint double_spaces;
+ guint non_spaces;
+ guint empty_lines;
+ guint capital_letters;
+ guint numeric_characters;
+ guint unicode_scripts;
+};
+
+/*
+ * NOTE(review): presumably describes the raw headers block of a message
+ * (`begin`/`len`) and the position where the body starts (`body_start`);
+ * the structure is filled outside of message.c - confirm against the parser.
+ */
+struct rspamd_message_raw_headers_content {
+ const gchar *begin;
+ gsize len;
+ const gchar *body_start;
+};
+
+/**
+ * Parsed representation of a message. Objects are reference counted: they are
+ * created by rspamd_message_new() and shared or released with
+ * rspamd_message_ref() / rspamd_message_unref().
+ */
+struct rspamd_message {
+ const gchar *message_id; /**< message id, "undef" if none was found */
+ gchar *subject;
+
+ GPtrArray *parts; /**< list of parsed parts */
+ GPtrArray *text_parts; /**< list of text parts */
+ struct rspamd_message_raw_headers_content raw_headers_content;
+ void *received_headers; /**< list of received headers */
+ khash_t(rspamd_url_hash) * urls;
+ struct rspamd_mime_headers_table *raw_headers; /**< list of raw headers */
+ struct rspamd_mime_header *headers_order; /**< order of raw headers */
+ struct rspamd_task *task; /**< task owning the message */
+ GPtrArray *rcpt_mime;
+ GPtrArray *from_mime;
+ guchar digest[16]; /**< 128 bit digest derived from the part digests and the subject */
+ enum rspamd_newlines_type nlines_type; /**< type of newlines (detected on most of headers) */
+ ref_entry_t ref;
+};
+
+/* Accesses a field of task->message; task->message must not be NULL */
+#define MESSAGE_FIELD(task, field) ((task)->message->field)
+/* Same as MESSAGE_FIELD but evaluates to a NULL of the field type when the task has no message */
+#define MESSAGE_FIELD_CHECK(task, field) ((task)->message ? (task)->message->field : (__typeof__((task)->message->field)) NULL)
+
+/**
+ * Parse and pre-process mime message
+ * @param task worker_task object
+ * @return TRUE on success (an empty task also yields TRUE and is merely
+ * flagged to skip processing); FALSE if the mime parser fails fatally and
+ * the configuration does not allow raw input, the error (if any) is then
+ * stored in task->err
+ */
+gboolean rspamd_message_parse(struct rspamd_task *task);
+
+/**
+ * Process content in task (e.g. HTML parsing): detects part types, builds
+ * text parts, extracts urls and words and computes per-message text
+ * statistics. The message must already be parsed.
+ * @param task task with a parsed message
+ */
+void rspamd_message_process(struct rspamd_task *task);
+
+
+/**
+ * Converts string to cte
+ * @param str
+ * @return
+ */
+enum rspamd_cte rspamd_cte_from_string(const gchar *str);
+
+/**
+ * Converts cte to string
+ * @param ct
+ * @return
+ */
+const gchar *rspamd_cte_to_string(enum rspamd_cte ct);
+
+/**
+ * Creates a new empty message allocated from the task memory pool
+ * @param task task that owns the message
+ * @return new message with its reference counter initialised
+ */
+struct rspamd_message *rspamd_message_new(struct rspamd_task *task);
+
+/**
+ * Takes a reference on a message
+ * @param msg message to retain
+ * @return the same message
+ */
+struct rspamd_message *rspamd_message_ref(struct rspamd_message *msg);
+
+/**
+ * Releases a reference on a message
+ * @param msg message to release, NULL is ignored
+ */
+void rspamd_message_unref(struct rspamd_message *msg);
+
+/**
+ * Updates digest of the message if modified
+ * @param msg
+ * @param input
+ * @param len
+ */
+void rspamd_message_update_digest(struct rspamd_message *msg,
+ const void *input, gsize len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
new file mode 100644
index 0000000..48a97a4
--- /dev/null
+++ b/src/libmime/mime_encoding.c
@@ -0,0 +1,864 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#include "libutil/mem_pool.h"
+#include "libutil/regexp.h"
+#include "libutil/hash.h"
+#include "libserver/cfg_file.h"
+#include "libserver/task.h"
+#include "mime_encoding.h"
+#include "message.h"
+#include "contrib/fastutf8/fastutf8.h"
+#include "contrib/google-ced/ced_c.h"
+#include <unicode/ucnv.h>
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+#include <unicode/unorm2.h>
+#endif
+#include <math.h>
+
+/* Canonical charset name this file returns and compares against for UTF-8 */
+#define UTF8_CHARSET "UTF-8"
+
+/* Bits used in the `flags` member of struct rspamd_charset_substitution */
+#define RSPAMD_CHARSET_FLAG_UTF (1 << 0)
+#define RSPAMD_CHARSET_FLAG_ASCII (1 << 1)
+
+/* Size passed to the LRU cache of charset converters */
+#define RSPAMD_CHARSET_CACHE_SIZE 32
+/* Length (in bytes) of one sample used when detecting charset of a large input */
+#define RSPAMD_CHARSET_MAX_CONTENT 512
+
+/* Clear / set the UTF flag of a text part */
+#define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF)
+#define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)
+
+/* Compiled on first use in rspamd_mime_charset_utf_check */
+static rspamd_regexp_t *utf_compatible_re = NULL;
+
+/* One alias -> canonical name mapping; the table itself lives in mime_encoding_list.h */
+struct rspamd_charset_substitution {
+ const gchar *input; /* alias, used as the key of sub_hash */
+ const gchar *canon; /* name substituted for the alias */
+ gint flags; /* RSPAMD_CHARSET_FLAG_* bits */
+};
+
+#include "mime_encoding_list.h"
+
+/* Case-insensitive alias -> substitution lookup, filled on first use */
+static GHashTable *sub_hash = NULL;
+
+/*
+ * Table used by the internal (non-ICU) ISO-8859-16 converter: entry `i` is the
+ * UTF-16 code unit emitted for input byte `128 + i`. Bytes <= 127 do not use
+ * the table (see rspamd_converter_to_uchars).
+ */
+static const UChar iso_8859_16_map[] = {
+ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
+ 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
+ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
+ 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
+ 0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7,
+ 0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B,
+ 0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7,
+ 0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C,
+ 0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7,
+ 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
+ 0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A,
+ 0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF,
+ 0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7,
+ 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
+ 0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B,
+ 0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF};
+
+/*
+ * Cached converter: either a handle opened with ucnv_open or a pointer to a
+ * built-in byte -> UChar table, discriminated by `is_internal`.
+ */
+struct rspamd_charset_converter {
+ gchar *canon_name; /* g_strdup'ed name, also used as the cache key */
+ union {
+ UConverter *conv; /* valid when is_internal is FALSE */
+ const UChar *cnv_table; /* valid when is_internal is TRUE */
+ } d;
+ gboolean is_internal;
+};
+
+/* Returns the GQuark registered for the "charset conversion error" string */
+static GQuark
+rspamd_charset_conv_error_quark(void)
+{
+	GQuark domain = g_quark_from_static_string("charset conversion error");
+
+	return domain;
+}
+
+/*
+ * Destructor registered with the converters cache: releases the ICU handle
+ * (if the converter owns one), the duplicated name and the structure itself.
+ */
+static void
+rspamd_converter_dtor(gpointer p)
+{
+	struct rspamd_charset_converter *cnv = (struct rspamd_charset_converter *) p;
+
+	if (cnv->is_internal) {
+		/* Table based converters only point to static data */
+	}
+	else {
+		ucnv_close(cnv->d.conv);
+	}
+
+	g_free(cnv->canon_name);
+	g_free(cnv);
+}
+
+/*
+ * Converts `srcLength` bytes at `src` to UTF-16 code units stored in `dest`.
+ * ICU backed converters delegate to ucnv_toUChars; table based converters copy
+ * bytes <= 127 as is and map the remaining ones via the converter's table,
+ * stopping when either the input or `destCapacity` is exhausted. The table
+ * path never touches `pErrorCode`. Returns the number of code units written
+ * (for the ICU path - whatever ucnv_toUChars returns).
+ */
+int32_t
+rspamd_converter_to_uchars(struct rspamd_charset_converter *cnv,
+						   UChar *dest,
+						   int32_t destCapacity,
+						   const char *src,
+						   int32_t srcLength,
+						   UErrorCode *pErrorCode)
+{
+	if (cnv->is_internal) {
+		const guchar *bytes = (const guchar *) src;
+		int32_t n;
+
+		for (n = 0; n < srcLength && n < destCapacity; n++) {
+			guchar ch = bytes[n];
+
+			dest[n] = (ch <= 127) ? (UChar) ch : cnv->d.cnv_table[ch - 128];
+		}
+
+		return n;
+	}
+
+	return ucnv_toUChars(cnv->d.conv,
+						 dest, destCapacity,
+						 src, srcLength,
+						 pErrorCode);
+}
+
+
+/*
+ * Returns a converter for `enc`, creating and caching it on first use.
+ *
+ * When `is_canon` is FALSE the name is first resolved through
+ * rspamd_mime_detect_charset (using `pool`); otherwise `enc` is used as is.
+ * ISO-8859-16 (also "latin10" / "iso-ir-226") is served by a built-in table,
+ * every other charset by ucnv_open. Returned converters are owned by the
+ * cache and are destroyed by rspamd_converter_dtor.
+ *
+ * Returns NULL if `enc` is NULL, the name cannot be resolved, or ucnv_open
+ * fails (in the latter case `*err` is set by ICU).
+ */
+struct rspamd_charset_converter *
+rspamd_mime_get_converter_cached(const gchar *enc,
+								 rspamd_mempool_t *pool,
+								 gboolean is_canon,
+								 UErrorCode *err)
+{
+	const gchar *canon_name;
+	static rspamd_lru_hash_t *cache;
+	struct rspamd_charset_converter *conv;
+
+	if (cache == NULL) {
+		cache = rspamd_lru_hash_new_full(RSPAMD_CHARSET_CACHE_SIZE, NULL,
+										 rspamd_converter_dtor, rspamd_str_hash,
+										 rspamd_str_equal);
+	}
+
+	if (enc == NULL) {
+		return NULL;
+	}
+
+	if (!is_canon) {
+		rspamd_ftok_t cset_tok;
+
+		RSPAMD_FTOK_FROM_STR(&cset_tok, enc);
+		canon_name = rspamd_mime_detect_charset(&cset_tok, pool);
+	}
+	else {
+		canon_name = enc;
+	}
+
+	if (canon_name == NULL) {
+		return NULL;
+	}
+
+	conv = rspamd_lru_hash_lookup(cache, (gpointer) canon_name, 0);
+
+	if (conv != NULL) {
+		return conv;
+	}
+
+	if (!(strcmp(canon_name, "ISO-8859-16") == 0 ||
+		  strcmp(canon_name, "latin10") == 0 ||
+		  strcmp(canon_name, "iso-ir-226") == 0)) {
+		/*
+		 * Open the ICU converter before allocating anything, so a failure
+		 * leaves nothing to release (previously the duplicated name leaked).
+		 */
+		UConverter *uconv = ucnv_open(canon_name, err);
+
+		if (uconv == NULL) {
+			return NULL;
+		}
+
+		ucnv_setToUCallBack(uconv,
+							UCNV_TO_U_CALLBACK_SUBSTITUTE,
+							NULL,
+							NULL,
+							NULL,
+							err);
+
+		conv = g_malloc0(sizeof(*conv));
+		conv->d.conv = uconv;
+		conv->canon_name = g_strdup(canon_name);
+	}
+	else {
+		/* ISO-8859-16 */
+		conv = g_malloc0(sizeof(*conv));
+		conv->is_internal = TRUE;
+		conv->d.cnv_table = iso_8859_16_map;
+		conv->canon_name = g_strdup(canon_name);
+	}
+
+	/* The cache key is the converter's own copy of the name */
+	rspamd_lru_hash_insert(cache, conv->canon_name, conv, 0, 0);
+
+	return conv;
+}
+
+/*
+ * Builds `sub_hash`: a case-insensitive map from each alias in `sub[]` to its
+ * table entry. Keys and values point into the static table.
+ */
+static void
+rspamd_mime_encoding_substitute_init(void)
+{
+	const struct rspamd_charset_substitution *cur;
+	const struct rspamd_charset_substitution *last = sub + G_N_ELEMENTS(sub);
+
+	sub_hash = g_hash_table_new(rspamd_strcase_hash, rspamd_strcase_equal);
+
+	for (cur = sub; cur < last; cur++) {
+		g_hash_table_insert(sub_hash, (void *) cur->input, (void *) cur);
+	}
+}
+
+/*
+ * Trims `in` in place so that it starts and ends with an ASCII alphanumeric
+ * character; a string without any alphanumeric character becomes empty.
+ *
+ * The terminator is written relative to the start of the buffer after the
+ * kept part has been moved there. Writing it at the pre-move position left
+ * trailing garbage when both leading and trailing characters were stripped
+ * (e.g. " abc-" became "abc-").
+ */
+static void
+rspamd_charset_normalize(gchar *in)
+{
+	gchar *begin, *end;
+	gsize kept;
+
+	begin = in;
+
+	while (*begin && !g_ascii_isalnum(*begin)) {
+		begin++;
+	}
+
+	/* `end` points one past the last kept character */
+	end = begin + strlen(begin);
+
+	while (end > begin && !g_ascii_isalnum(*(end - 1))) {
+		end--;
+	}
+
+	kept = end - begin;
+
+	if (begin != in) {
+		memmove(in, begin, kept);
+	}
+
+	in[kept] = '\0';
+}
+
+/*
+ * Resolves a charset name as found in a message to a canonical name.
+ *
+ * "utf-8" / "utf8" (any case) short-circuit to UTF8_CHARSET. Otherwise a pool
+ * copy of the name is normalized, dashes are removed for "cp-" / "ibm-"
+ * prefixed names, the result is looked up in the substitutions table and
+ * finally resolved through ICU (MIME, IANA and default standards, then the
+ * first alias).
+ *
+ * The prefix test is done on the normalized copy rather than on the raw
+ * token, so leading junk in the announced name does not disable the
+ * dash-removal step.
+ *
+ * Returns the resolved name or NULL if ICU does not know the charset.
+ */
+const gchar *
+rspamd_mime_detect_charset(const rspamd_ftok_t *in, rspamd_mempool_t *pool)
+{
+	gchar *ret = NULL, *h, *t;
+	struct rspamd_charset_substitution *s;
+	const gchar *cset;
+	rspamd_ftok_t utf8_tok;
+	UErrorCode uc_err = U_ZERO_ERROR;
+	gsize norm_len;
+
+	if (sub_hash == NULL) {
+		rspamd_mime_encoding_substitute_init();
+	}
+
+	/* Fast path */
+	RSPAMD_FTOK_ASSIGN(&utf8_tok, "utf-8");
+
+	if (rspamd_ftok_casecmp(in, &utf8_tok) == 0) {
+		return UTF8_CHARSET;
+	}
+
+	RSPAMD_FTOK_ASSIGN(&utf8_tok, "utf8");
+
+	if (rspamd_ftok_casecmp(in, &utf8_tok) == 0) {
+		return UTF8_CHARSET;
+	}
+
+	ret = rspamd_mempool_ftokdup(pool, in);
+	rspamd_charset_normalize(ret);
+	norm_len = strlen(ret);
+
+	if ((norm_len > 3 && rspamd_lc_cmp(ret, "cp-", 3) == 0) ||
+		(norm_len > 4 && (rspamd_lc_cmp(ret, "ibm-", 4) == 0))) {
+		/* Try to remove '-' chars from encoding: e.g. CP-100 to CP100 */
+		h = ret;
+		t = ret;
+
+		while (*h != '\0') {
+			if (*h != '-') {
+				*t++ = *h;
+			}
+
+			h++;
+		}
+
+		*t = '\0';
+	}
+
+	s = g_hash_table_lookup(sub_hash, ret);
+
+	if (s) {
+		ret = (char *) s->canon;
+	}
+
+	/* Try different aliases; the error code is reset before each attempt */
+	cset = ucnv_getCanonicalName(ret, "MIME", &uc_err);
+
+	if (cset == NULL) {
+		uc_err = U_ZERO_ERROR;
+		cset = ucnv_getCanonicalName(ret, "IANA", &uc_err);
+	}
+
+	if (cset == NULL) {
+		uc_err = U_ZERO_ERROR;
+		cset = ucnv_getCanonicalName(ret, "", &uc_err);
+	}
+
+	if (cset == NULL) {
+		uc_err = U_ZERO_ERROR;
+		cset = ucnv_getAlias(ret, 0, &uc_err);
+	}
+
+	return cset;
+}
+
+/*
+ * Converts `len` bytes at `input` from `in_enc` to UTF-8.
+ *
+ * If rspamd_mime_charset_utf_check (without content check) accepts `in_enc`,
+ * the input is copied to `pool` unchanged. Otherwise the data goes through an
+ * intermediate UTF-16 buffer: in_enc -> UChar -> UTF-8. `in_enc` is passed to
+ * the converters cache as an already canonical name.
+ *
+ * The result is allocated from `pool`; its length is stored in `*olen` when
+ * `olen` is not NULL. On failure NULL is returned and `err` is set.
+ */
+gchar *
+rspamd_mime_text_to_utf8(rspamd_mempool_t *pool,
+ gchar *input, gsize len, const gchar *in_enc,
+ gsize *olen, GError **err)
+{
+ gchar *d;
+ gint32 r, clen, dlen;
+ UChar *tmp_buf;
+
+ UErrorCode uc_err = U_ZERO_ERROR;
+ UConverter *utf8_converter;
+ struct rspamd_charset_converter *conv;
+ rspamd_ftok_t cset_tok;
+
+ /* Check if already utf8 */
+ RSPAMD_FTOK_FROM_STR(&cset_tok, in_enc);
+
+ if (rspamd_mime_charset_utf_check(&cset_tok, input, len,
+ FALSE)) {
+ d = rspamd_mempool_alloc(pool, len);
+ memcpy(d, input, len);
+ if (olen) {
+ *olen = len;
+ }
+
+ return d;
+ }
+
+ conv = rspamd_mime_get_converter_cached(in_enc, pool, TRUE, &uc_err);
+ utf8_converter = rspamd_get_utf8_converter();
+
+ if (conv == NULL) {
+ g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL,
+ "cannot open converter for %s: %s",
+ in_enc, u_errorName(uc_err));
+
+ return NULL;
+ }
+
+ /* Temporary UTF-16 buffer: one code unit per input byte plus one spare */
+ tmp_buf = g_new(UChar, len + 1);
+ uc_err = U_ZERO_ERROR;
+ r = rspamd_converter_to_uchars(conv, tmp_buf, len + 1, input, len, &uc_err);
+
+ if (!U_SUCCESS(uc_err)) {
+ g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL,
+ "cannot convert data to unicode from %s: %s",
+ in_enc, u_errorName(uc_err));
+ g_free(tmp_buf);
+
+ return NULL;
+ }
+
+ /* Now, convert to utf8 */
+ clen = ucnv_getMaxCharSize(utf8_converter);
+ dlen = UCNV_GET_MAX_BYTES_FOR_STRING(r, clen);
+ d = rspamd_mempool_alloc(pool, dlen);
+ r = ucnv_fromUChars(utf8_converter, d, dlen, tmp_buf, r, &uc_err);
+
+ if (!U_SUCCESS(uc_err)) {
+ g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL,
+ "cannot convert data from unicode from %s: %s",
+ in_enc, u_errorName(uc_err));
+ g_free(tmp_buf);
+
+ return NULL;
+ }
+
+ msg_debug_pool("converted from %s to UTF-8 inlen: %z, outlen: %d",
+ in_enc, len, r);
+ g_free(tmp_buf);
+
+ if (olen) {
+ *olen = r;
+ }
+
+ return d;
+}
+
+/*
+ * Converts `input` from `charset` (treated as a canonical name) to UTF-8 via
+ * an intermediate UTF-16 buffer and stores the result in
+ * `text_part->utf_raw_content`. Both the output bytes and the array header
+ * are allocated from the task pool.
+ *
+ * Returns TRUE on success; on failure returns FALSE, sets `err` and leaves
+ * `text_part->utf_raw_content` untouched.
+ */
+static gboolean
+rspamd_mime_text_part_utf8_convert(struct rspamd_task *task,
+ struct rspamd_mime_text_part *text_part,
+ GByteArray *input,
+ const gchar *charset,
+ GError **err)
+{
+ gchar *d;
+ gint32 r, clen, dlen, uc_len;
+ UChar *tmp_buf;
+ UErrorCode uc_err = U_ZERO_ERROR;
+ UConverter *utf8_converter;
+ struct rspamd_charset_converter *conv;
+
+ conv = rspamd_mime_get_converter_cached(charset, task->task_pool,
+ TRUE, &uc_err);
+ utf8_converter = rspamd_get_utf8_converter();
+
+ if (conv == NULL) {
+ g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL,
+ "cannot open converter for %s: %s",
+ charset, u_errorName(uc_err));
+
+ return FALSE;
+ }
+
+ /* Temporary UTF-16 buffer: one code unit per input byte plus one spare */
+ tmp_buf = g_new(UChar, input->len + 1);
+ uc_err = U_ZERO_ERROR;
+ uc_len = rspamd_converter_to_uchars(conv,
+ tmp_buf,
+ input->len + 1,
+ input->data,
+ input->len,
+ &uc_err);
+
+ if (!U_SUCCESS(uc_err)) {
+ g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL,
+ "cannot convert data to unicode from %s: %s",
+ charset, u_errorName(uc_err));
+ g_free(tmp_buf);
+
+ return FALSE;
+ }
+
+ /* Now, convert to utf8 */
+ clen = ucnv_getMaxCharSize(utf8_converter);
+ dlen = UCNV_GET_MAX_BYTES_FOR_STRING(uc_len, clen);
+ d = rspamd_mempool_alloc(task->task_pool, dlen);
+ r = ucnv_fromUChars(utf8_converter, d, dlen,
+ tmp_buf, uc_len, &uc_err);
+
+ if (!U_SUCCESS(uc_err)) {
+ g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL,
+ "cannot convert data from unicode from %s: %s",
+ charset, u_errorName(uc_err));
+ g_free(tmp_buf);
+
+ return FALSE;
+ }
+
+ if (text_part->mime_part && text_part->mime_part->ct) {
+ msg_info_task("converted text part from %s ('%T' announced) to UTF-8 inlen: %d, outlen: %d (%d UTF16 chars)",
+ charset, &text_part->mime_part->ct->charset, input->len, r, uc_len);
+ }
+ else {
+ msg_info_task("converted text part from %s (no charset announced) to UTF-8 inlen: %d, "
+ "outlen: %d (%d UTF16 chars)",
+ charset, input->len, r, uc_len);
+ }
+
+ /*
+ * The array header is carved from the pool instead of being created by
+ * GLib. NOTE(review): the extra `sizeof(gpointer) * 4` is presumably slack
+ * for GLib's private array fields - confirm.
+ */
+ text_part->utf_raw_content = rspamd_mempool_alloc(task->task_pool,
+ sizeof(*text_part->utf_raw_content) + sizeof(gpointer) * 4);
+ text_part->utf_raw_content->data = d;
+ text_part->utf_raw_content->len = r;
+ g_free(tmp_buf);
+
+ return TRUE;
+}
+
+/*
+ * Fills `out` with the UTF-8 representation of `in`.
+ *
+ * - `in` NULL or empty: returns FALSE.
+ * - `enc` NULL: `in` is copied only if it validates as UTF-8, otherwise FALSE.
+ * - `enc` accepted by rspamd_mime_charset_utf_check (no content check): plain copy.
+ * - otherwise: conversion enc -> UTF-16 -> UTF-8 using the cached converter
+ *   (`enc` is treated as a canonical name).
+ *
+ * Returns FALSE if the converter cannot be obtained or ICU reports an error;
+ * `out` may already have been resized in the latter case.
+ */
+gboolean
+rspamd_mime_to_utf8_byte_array(GByteArray *in,
+ GByteArray *out,
+ rspamd_mempool_t *pool,
+ const gchar *enc)
+{
+ gint32 r, clen, dlen;
+ UChar *tmp_buf;
+ UErrorCode uc_err = U_ZERO_ERROR;
+ UConverter *utf8_converter;
+ struct rspamd_charset_converter *conv;
+ rspamd_ftok_t charset_tok;
+
+ if (in == NULL || in->len == 0) {
+ return FALSE;
+ }
+
+ if (enc == NULL) {
+ /* Assume utf ? */
+ if (rspamd_fast_utf8_validate(in->data, in->len) == 0) {
+ g_byte_array_set_size(out, in->len);
+ memcpy(out->data, in->data, out->len);
+
+ return TRUE;
+ }
+ else {
+ /* Bad stuff, keep out */
+ return FALSE;
+ }
+ }
+
+ RSPAMD_FTOK_FROM_STR(&charset_tok, enc);
+
+ if (rspamd_mime_charset_utf_check(&charset_tok, (gchar *) in->data, in->len,
+ FALSE)) {
+ g_byte_array_set_size(out, in->len);
+ memcpy(out->data, in->data, out->len);
+
+ return TRUE;
+ }
+
+ utf8_converter = rspamd_get_utf8_converter();
+ conv = rspamd_mime_get_converter_cached(enc, pool, TRUE, &uc_err);
+
+ if (conv == NULL) {
+ return FALSE;
+ }
+
+ /* Temporary UTF-16 buffer: one code unit per input byte plus one spare */
+ tmp_buf = g_new(UChar, in->len + 1);
+ uc_err = U_ZERO_ERROR;
+ r = rspamd_converter_to_uchars(conv,
+ tmp_buf, in->len + 1,
+ in->data, in->len, &uc_err);
+
+ if (!U_SUCCESS(uc_err)) {
+ g_free(tmp_buf);
+
+ return FALSE;
+ }
+
+ /* Now, convert to utf8 */
+ clen = ucnv_getMaxCharSize(utf8_converter);
+ dlen = UCNV_GET_MAX_BYTES_FOR_STRING(r, clen);
+ g_byte_array_set_size(out, dlen);
+ r = ucnv_fromUChars(utf8_converter, out->data, dlen, tmp_buf, r, &uc_err);
+
+ if (!U_SUCCESS(uc_err)) {
+ g_free(tmp_buf);
+
+ return FALSE;
+ }
+
+ g_free(tmp_buf);
+ /* Shrink the logical length from the worst-case estimate to the real size */
+ out->len = r;
+
+ return TRUE;
+}
+
+/*
+ * Makes `in` valid UTF-8 in place by overwriting invalid bytes with '?'.
+ *
+ * Each iteration asks rspamd_fast_utf8_validate for the position of the first
+ * error in the remaining data, scans forward with U8_NEXT until a code point
+ * > 0 is decoded, replaces everything between the error and that code point,
+ * and restarts validation right after it. If the scan ends on an invalid
+ * sequence, the rest of the buffer is replaced.
+ */
+void rspamd_mime_charset_utf_enforce(gchar *in, gsize len)
+{
+ gchar *p, *end;
+ goffset err_offset;
+ UChar32 uc = 0;
+
+ /* Now we validate input and replace bad characters with '?' symbol */
+ p = in;
+ end = in + len;
+
+ while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate(p, len)) > 0) {
+ err_offset--; /* As it returns it 1 indexed */
+ gint32 cur_offset = err_offset;
+
+ while (cur_offset < len) {
+ /* Start of the sequence U8_NEXT is about to consume */
+ gint32 tmp = cur_offset;
+
+ U8_NEXT(p, cur_offset, len, uc);
+
+ if (uc > 0) {
+ /* Fill string between err_offset and tmp with `?` character */
+ memset(p + err_offset, '?', tmp - err_offset);
+ break;
+ }
+ }
+
+ if (uc < 0) {
+ /* Fill till the end */
+ memset(p + err_offset, '?', len - err_offset);
+ break;
+ }
+
+ /* Continue validation after the last decoded code point */
+ p += cur_offset;
+ len = end - p;
+ }
+}
+
+/*
+ * Guesses the charset of `in` from its content.
+ * With `check_utf8` set, input that validates as UTF-8 yields UTF8_CHARSET
+ * without running the detector; otherwise the result of ced_encoding_detect
+ * (tuned for the email corpus) is returned, which may be NULL.
+ */
+const char *
+rspamd_mime_charset_find_by_content(const gchar *in, gsize inlen,
+									bool check_utf8)
+{
+	int bytes_consumed;
+	bool reliable;
+
+	if (check_utf8 && rspamd_fast_utf8_validate(in, inlen) == 0) {
+		return UTF8_CHARSET;
+	}
+
+	/* The consumed bytes count and reliability flag are not used */
+	return ced_encoding_detect(in, inlen, NULL, NULL,
+							   NULL, 0, CED_EMAIL_CORPUS,
+							   false, &bytes_consumed, &reliable);
+}
+
+/*
+ * Content based charset detection that avoids scanning large inputs in full.
+ *
+ * Inputs shorter than 3 * RSPAMD_CHARSET_MAX_CONTENT are passed to the
+ * detector as a whole. Larger inputs are sampled with three windows of
+ * RSPAMD_CHARSET_MAX_CONTENT bytes (start, middle, end); "US-ASCII" answers
+ * are discarded, missing answers are filled from the other samples and the
+ * result is chosen by majority, falling back to the first sample.
+ * Returns NULL if no sample produced a usable answer.
+ */
+static const char *
+rspamd_mime_charset_find_by_content_maybe_split(const gchar *in, gsize inlen)
+{
+ if (inlen < RSPAMD_CHARSET_MAX_CONTENT * 3) {
+ return rspamd_mime_charset_find_by_content(in, inlen, false);
+ }
+ else {
+ const gchar *c1, *c2, *c3;
+
+ c1 = rspamd_mime_charset_find_by_content(in, RSPAMD_CHARSET_MAX_CONTENT, false);
+ c2 = rspamd_mime_charset_find_by_content(in + inlen / 2,
+ RSPAMD_CHARSET_MAX_CONTENT, false);
+ c3 = rspamd_mime_charset_find_by_content(in + inlen - RSPAMD_CHARSET_MAX_CONTENT,
+ RSPAMD_CHARSET_MAX_CONTENT, false);
+
+ /* 7bit stuff */
+ if (c1 && strcmp(c1, "US-ASCII") == 0) {
+ c1 = NULL; /* Invalid - we have 8 bit there */
+ }
+ if (c2 && strcmp(c2, "US-ASCII") == 0) {
+ c2 = NULL; /* Invalid - we have 8 bit there */
+ }
+ if (c3 && strcmp(c3, "US-ASCII") == 0) {
+ c3 = NULL; /* Invalid - we have 8 bit there */
+ }
+
+ /* Back-fill missing answers from the other samples */
+ if (!c1) {
+ c1 = c2 ? c2 : c3;
+ }
+ if (!c2) {
+ c2 = c3 ? c3 : c1;
+ }
+ if (!c3) {
+ /*
+ * NOTE(review): at this point c2 is non-NULL whenever c1 is, so this
+ * always picks c2 (or NULL when every sample failed) - confirm that
+ * `c1 ? c1 : c2` was not intended.
+ */
+ c3 = c1 ? c2 : c1;
+ }
+
+ if (c1 && c2 && c3) {
+ /*
+ * Quorum. Names are compared by pointer, not by content.
+ * NOTE(review): presumably the detector returns static strings, so equal
+ * names share a pointer - confirm.
+ */
+ if (c1 == c2) {
+ return c1;
+ }
+ else if (c2 == c3) {
+ return c2;
+ }
+ else if (c1 == c3) {
+ return c3;
+ }
+
+ /* All charsets are distinct. Use the one from the top */
+ return c1;
+ }
+
+ return NULL;
+ }
+}
+
+/*
+ * Tells whether data announced as `charset` can be used as UTF-8.
+ *
+ * Returns FALSE straight away if `charset` is non-empty and does not match
+ * the UTF/ASCII compatible names regexp. Otherwise, without `content_check`,
+ * returns TRUE. With `content_check` and data that fails UTF-8 validation the
+ * real charset is guessed from content:
+ *  - guess is UTF/ASCII compatible: `charset` is set to UTF8_CHARSET, TRUE;
+ *  - guess is something else: `charset` is pointed at the guess, FALSE;
+ *  - no guess: `in` is sanitised in place (invalid bytes become '?'), TRUE.
+ */
+gboolean
+rspamd_mime_charset_utf_check(rspamd_ftok_t *charset,
+ gchar *in, gsize len, gboolean content_check)
+{
+ const gchar *real_charset;
+
+ /* Compiled once and kept for the lifetime of the process */
+ if (utf_compatible_re == NULL) {
+ utf_compatible_re = rspamd_regexp_new(
+ "^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:ansi.*)|(?:CSASCII)$",
+ "i", NULL);
+ }
+
+ if (charset->len == 0 ||
+ rspamd_regexp_match(utf_compatible_re,
+ charset->begin, charset->len, TRUE)) {
+ /*
+ * In case of UTF8 charset we still can check the content to find
+ * corner cases
+ */
+ if (content_check) {
+ if (rspamd_fast_utf8_validate(in, len) != 0) {
+ real_charset = rspamd_mime_charset_find_by_content_maybe_split(in, len);
+
+ if (real_charset) {
+
+ if (rspamd_regexp_match(utf_compatible_re,
+ real_charset, strlen(real_charset), TRUE)) {
+ RSPAMD_FTOK_ASSIGN(charset, UTF8_CHARSET);
+
+ return TRUE;
+ }
+ else {
+ /* Report the detected charset back to the caller */
+ charset->begin = real_charset;
+ charset->len = strlen(real_charset);
+
+ return FALSE;
+ }
+ }
+
+ /* Nothing detected: keep the data but make it valid utf8 */
+ rspamd_mime_charset_utf_enforce(in, len);
+ }
+ }
+
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
+/*
+ * Decides which charset a text part really uses and, when needed, converts
+ * its parsed content to UTF-8.
+ *
+ * Outcome, always stored in `text_part`:
+ *  - `utf_raw_content`: either a private copy of the parsed content (its
+ *    release is registered with the task pool) or a pool allocated
+ *    conversion result;
+ *  - `real_charset`: announced, detected or UTF8_CHARSET;
+ *  - the UTF flag: set when the content is (or was converted to) UTF-8,
+ *    cleared when the charset is unknown or conversion failed;
+ *  - 8BIT_RAW / 8BIT_ENCODED flags when the raw / parsed data has 8bit bytes.
+ */
+void rspamd_mime_text_part_maybe_convert(struct rspamd_task *task,
+ struct rspamd_mime_text_part *text_part)
+{
+ GError *err = NULL;
+ const gchar *charset = NULL;
+ gboolean checked = FALSE, need_charset_heuristic = TRUE, valid_utf8 = FALSE;
+ GByteArray *part_content;
+ rspamd_ftok_t charset_tok;
+ struct rspamd_mime_part *part = text_part->mime_part;
+
+ if (rspamd_str_has_8bit(text_part->raw.begin, text_part->raw.len)) {
+ text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_RAW;
+ }
+
+ /* Allocate copy storage */
+ part_content = g_byte_array_sized_new(text_part->parsed.len);
+ memcpy(part_content->data, text_part->parsed.begin, text_part->parsed.len);
+ part_content->len = text_part->parsed.len;
+ rspamd_mempool_notify_alloc(task->task_pool,
+ part_content->len);
+ rspamd_mempool_add_destructor(task->task_pool,
+ (rspamd_mempool_destruct_t) g_byte_array_unref, part_content);
+
+ if (rspamd_str_has_8bit(text_part->parsed.begin, text_part->parsed.len)) {
+ if (rspamd_fast_utf8_validate(text_part->parsed.begin, text_part->parsed.len) == 0) {
+ /* Valid UTF, likely all good */
+ need_charset_heuristic = FALSE;
+ valid_utf8 = TRUE;
+ checked = TRUE;
+ }
+
+ text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED;
+ }
+ else {
+ /* All 7bit characters, assume it valid utf */
+ need_charset_heuristic = FALSE;
+ valid_utf8 = TRUE;
+ checked = TRUE; /* Already valid utf, no need in further checks */
+ }
+
+ /* NOTE(review): part->ct is dereferenced without a NULL check here - confirm callers guarantee it */
+ if (part->ct->charset.len == 0) {
+ /* No charset announced */
+ if (need_charset_heuristic) {
+ charset = rspamd_mime_charset_find_by_content_maybe_split(text_part->parsed.begin,
+ text_part->parsed.len);
+
+ if (charset != NULL) {
+ msg_info_task("detected charset %s", charset);
+ }
+
+ checked = TRUE;
+ text_part->real_charset = charset;
+ }
+ else if (valid_utf8) {
+ SET_PART_UTF(text_part);
+ text_part->utf_raw_content = part_content;
+ text_part->real_charset = UTF8_CHARSET;
+
+ return;
+ }
+ }
+ else {
+ /* Charset announced: resolve it to a canonical name first */
+ charset = rspamd_mime_detect_charset(&part->ct->charset,
+ task->task_pool);
+
+ if (charset == NULL) {
+ /* We don't know the real charset but can try heuristic */
+ if (need_charset_heuristic) {
+ charset = rspamd_mime_charset_find_by_content_maybe_split(part_content->data,
+ part_content->len);
+ /* NOTE(review): unlike the branch above, `charset` may be NULL when logged here */
+ msg_info_task("detected charset: %s", charset);
+ checked = TRUE;
+ text_part->real_charset = charset;
+ }
+ else if (valid_utf8) {
+ /* We already know that the input is valid utf, so skip heuristic */
+ text_part->real_charset = UTF8_CHARSET;
+ }
+ }
+ else {
+ text_part->real_charset = charset;
+
+ if (strcmp(charset, UTF8_CHARSET) != 0) {
+ /*
+ * We have detected some charset, but we don't know which one,
+ * so we need to reset valid utf8 flag and enforce it later
+ */
+ valid_utf8 = FALSE;
+ }
+ }
+ }
+
+ if (text_part->real_charset == NULL) {
+ /* Neither announced nor detected: keep the bytes as they are */
+ msg_info_task("<%s>: has invalid charset; original charset: %T; Content-Type: \"%s\"",
+ MESSAGE_FIELD_CHECK(task, message_id), &part->ct->charset,
+ part->ct->cpy);
+ SET_PART_RAW(text_part);
+ text_part->utf_raw_content = part_content;
+
+ return;
+ }
+
+ RSPAMD_FTOK_FROM_STR(&charset_tok, charset);
+
+ if (!valid_utf8) {
+ /* Content is checked only if no heuristic/validation has been done above */
+ if (rspamd_mime_charset_utf_check(&charset_tok, part_content->data,
+ part_content->len, !checked)) {
+ SET_PART_UTF(text_part);
+ text_part->utf_raw_content = part_content;
+ text_part->real_charset = UTF8_CHARSET;
+
+ return;
+ }
+ else {
+ /* The check may have replaced the charset with a detected one */
+ charset = charset_tok.begin;
+
+ if (!rspamd_mime_text_part_utf8_convert(task, text_part,
+ part_content, charset, &err)) {
+ /* NOTE(review): uses MESSAGE_FIELD while the log above uses MESSAGE_FIELD_CHECK */
+ msg_warn_task("<%s>: cannot convert from %s to utf8: %s",
+ MESSAGE_FIELD(task, message_id),
+ charset,
+ err ? err->message : "unknown problem");
+ SET_PART_RAW(text_part);
+ /* NOTE(review): `err` is treated as possibly NULL above but freed unconditionally */
+ g_error_free(err);
+
+ text_part->utf_raw_content = part_content;
+ return;
+ }
+
+ SET_PART_UTF(text_part);
+ text_part->real_charset = charset;
+ }
+ }
+ else {
+ SET_PART_UTF(text_part);
+ text_part->utf_raw_content = part_content;
+ }
+}
diff --git a/src/libmime/mime_encoding.h b/src/libmime/mime_encoding.h
new file mode 100644
index 0000000..ff81292
--- /dev/null
+++ b/src/libmime/mime_encoding.h
@@ -0,0 +1,148 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_LIBMIME_MIME_ENCODING_H_
+#define SRC_LIBMIME_MIME_ENCODING_H_
+
+#include "config.h"
+#include "mem_pool.h"
+#include "fstring.h"
+#include <unicode/uchar.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct rspamd_task;
+struct rspamd_mime_part;
+struct rspamd_mime_text_part;
+struct rspamd_charset_converter;
+
+/**
+ * Convert charset alias to a canonic charset name
+ * @param pool pool to store temporary data
+ * @param in charset name as found in a message
+ * @return canonic name or NULL if the charset is unknown
+ */
+const gchar *rspamd_mime_detect_charset(const rspamd_ftok_t *in,
+ rspamd_mempool_t *pool);
+
+/**
+ * Convert text chunk to utf-8. Input encoding must already be a canonic name
+ * (e.g. as returned by `rspamd_mime_detect_charset`); no substitution is
+ * performed by this function.
+ * If input encoding is already utf compatible, a copy of the input is
+ * returned. The result is always allocated from pool.
+ * @param pool
+ * @param input
+ * @param len
+ * @param in_enc canon charset
+ * @param olen if not NULL, length of the result is stored here
+ * @param err set on failure
+ * @return converted data or NULL on failure
+ */
+gchar *rspamd_mime_text_to_utf8(rspamd_mempool_t *pool,
+ gchar *input, gsize len, const gchar *in_enc,
+ gsize *olen, GError **err);
+
+/**
+ * Converts data from `in` to `out`,
+ * returns `FALSE` if `in` is NULL or empty, if a converter for `enc` cannot
+ * be obtained or if the conversion fails
+ *
+ * This function, in fact, copies `in` to `out` replacing out content in
+ * total.
+ * @param in
+ * @param out
+ * @param pool pool passed to the converters cache
+ * @param enc validated canonical charset name. If NULL, then utf8 check is done only
+ * @return
+ */
+gboolean rspamd_mime_to_utf8_byte_array(GByteArray *in,
+ GByteArray *out,
+ rspamd_mempool_t *pool,
+ const gchar *enc);
+
+/**
+ * Maybe convert part to utf-8. The outcome is stored in `text_part`
+ * (utf content, real charset and flags); nothing is returned.
+ * @param task
+ * @param text_part
+ */
+void rspamd_mime_text_part_maybe_convert(struct rspamd_task *task,
+ struct rspamd_mime_text_part *text_part);
+
+/**
+ * Checks utf8 charset and normalize/validate utf8 string
+ * @param charset charset name; can be replaced with the charset detected from
+ * content if `content_check` is TRUE
+ * @param in data to check; can be modified in place (invalid bytes replaced)
+ * if `content_check` is TRUE
+ * @param len
+ * @param content_check if TRUE, data announced as utf8 is validated as well
+ * @return TRUE if data can be treated as utf8
+ */
+gboolean rspamd_mime_charset_utf_check(rspamd_ftok_t *charset,
+ gchar *in, gsize len,
+ gboolean content_check);
+
+/**
+ * Ensure that all characters in string are valid utf8 chars or replace them
+ * with '?'. The string is modified in place.
+ * @param in
+ * @param len
+ */
+void rspamd_mime_charset_utf_enforce(gchar *in, gsize len);
+
+/**
+ * Gets cached converter
+ * @param enc input encoding
+ * @param pool pool to use for temporary normalisation
+ * @param is_canon TRUE if `enc` is already a canonic name, so no normalisation
+ * is performed; FALSE if `enc` has to be normalised first
+ * @param err output error
+ * @return converter or NULL on failure
+ */
+struct rspamd_charset_converter *rspamd_mime_get_converter_cached(
+ const gchar *enc,
+ rspamd_mempool_t *pool,
+ gboolean is_canon,
+ UErrorCode *err);
+
+/**
+ * Performs charset->utf16 conversion
+ * @param cnv converter obtained from `rspamd_mime_get_converter_cached`
+ * @param dest output buffer
+ * @param destCapacity capacity of `dest` in UChar units
+ * @param src input data
+ * @param srcLength length of input in bytes
+ * @param pErrorCode ICU error code (not touched by built-in table converters)
+ * @return number of UChar units produced
+ */
+gint32
+rspamd_converter_to_uchars(struct rspamd_charset_converter *cnv,
+ UChar *dest,
+ gint32 destCapacity,
+ const char *src,
+ gint32 srcLength,
+ UErrorCode *pErrorCode);
+
+/**
+ * Detect charset in text
+ * @param in
+ * @param inlen
+ * @param check_utf8 if true, valid utf8 input is reported as UTF-8 without
+ * running the detector
+ * @return detected charset name or NULL
+ */
+const char *rspamd_mime_charset_find_by_content(const gchar *in, gsize inlen,
+ bool check_utf8);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SRC_LIBMIME_MIME_ENCODING_H_ */
diff --git a/src/libmime/mime_encoding_list.h b/src/libmime/mime_encoding_list.h
new file mode 100644
index 0000000..b5fc5e1
--- /dev/null
+++ b/src/libmime/mime_encoding_list.h
@@ -0,0 +1,1577 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_LIBMIME_MIME_ENCODING_LIST_H_
+#define SRC_LIBMIME_MIME_ENCODING_LIST_H_
+
+static const struct rspamd_charset_substitution sub[] = {
+ {
+ .input = "iso-646-us",
+ .canon = "ansi_x3.4-1986",
+ .flags = RSPAMD_CHARSET_FLAG_ASCII,
+ },
+ {
+ .input = "ansi_x3.4-1968",
+ .canon = "ansi_x3.4-1986",
+ .flags = RSPAMD_CHARSET_FLAG_ASCII,
+ },
+ {
+ .input = "iso-ir-6",
+ .canon = "ansi_x3.4-1986",
+ .flags = RSPAMD_CHARSET_FLAG_ASCII,
+ },
+ {
+ .input = "iso_646.irv:1991",
+ .canon = "ansi_x3.4-1986",
+ .flags = RSPAMD_CHARSET_FLAG_ASCII,
+ },
+ {
+ .input = "ascii",
+ .canon = "ansi_x3.4-1986",
+ .flags = RSPAMD_CHARSET_FLAG_ASCII,
+ },
+ {
+ .input = "iso646-us",
+ .canon = "ansi_x3.4-1986",
+ .flags = RSPAMD_CHARSET_FLAG_ASCII,
+ },
+ {
+ .input = "us",
+ .canon = "ansi_x3.4-1986",
+ .flags = RSPAMD_CHARSET_FLAG_ASCII,
+ },
+ {
+ .input = "ibm367",
+ .canon = "ansi_x3.4-1986",
+ .flags = RSPAMD_CHARSET_FLAG_ASCII,
+ },
+ {
+ .input = "cp367",
+ .canon = "ansi_x3.4-1986",
+ .flags = RSPAMD_CHARSET_FLAG_ASCII,
+ },
+ {
+ .input = "csascii",
+ .canon = "ansi_x3.4-1986",
+ .flags = RSPAMD_CHARSET_FLAG_ASCII,
+ },
+ {
+ .input = "ascii7",
+ .canon = "ansi_x3.4-1986",
+ .flags = RSPAMD_CHARSET_FLAG_ASCII,
+ },
+ {
+ .input = "default",
+ .canon = "ansi_x3.4-1986",
+ .flags = RSPAMD_CHARSET_FLAG_ASCII,
+ },
+ {
+ .input = "646",
+ .canon = "ansi_x3.4-1986",
+ .flags = RSPAMD_CHARSET_FLAG_ASCII,
+ },
+ {
+ .input = "iso_646.irv:1983",
+ .canon = "ansi_x3.4-1986",
+ .flags = RSPAMD_CHARSET_FLAG_ASCII,
+ },
+ {
+ .input = "iso969-us",
+ .canon = "ansi_x3.4-1986",
+ .flags = RSPAMD_CHARSET_FLAG_ASCII,
+ },
+ {
+ .input = "tw-big5",
+ .canon = "big5",
+ .flags = 0,
+ },
+ {
+ .input = "csbig5",
+ .canon = "big5",
+ .flags = 0,
+ },
+ {
+ .input = "hkscs-big5",
+ .canon = "big5-hkscs",
+ .flags = 0,
+ },
+ {
+ .input = "big5hk",
+ .canon = "big5-hkscs",
+ .flags = 0,
+ },
+ {
+ .input = "big5-hkscs:unicode",
+ .canon = "big5-hkscs",
+ .flags = 0,
+ },
+ {
+ .input = "extended_unix_code_packed_format_for_japanese",
+ .canon = "euc-jp",
+ .flags = 0,
+ },
+ {
+ .input = "cseucpkdfmtjapanese",
+ .canon = "euc-jp",
+ .flags = 0,
+ },
+ {
+ .input = "x-eucjp",
+ .canon = "euc-jp",
+ .flags = 0,
+ },
+ {
+ .input = "x-euc-jp",
+ .canon = "euc-jp",
+ .flags = 0,
+ },
+ {
+ .input = "unicode-1-1-utf-8",
+ .canon = "utf-8",
+ .flags = RSPAMD_CHARSET_FLAG_UTF,
+ },
+ {
+ .input = "cseuckr",
+ .canon = "euc-kr",
+ .flags = 0,
+ },
+ {
+ .input = "5601",
+ .canon = "euc-kr",
+ .flags = 0,
+ },
+ {
+ .input = "ksc-5601",
+ .canon = "euc-kr",
+ .flags = 0,
+ },
+ {
+ .input = "ksc-5601-1987",
+ .canon = "euc-kr",
+ .flags = 0,
+ },
+ {
+ .input = "ksc-5601_1987",
+ .canon = "euc-kr",
+ .flags = 0,
+ },
+ {
+ .input = "ksc5601",
+ .canon = "euc-kr",
+ .flags = 0,
+ },
+ {
+ .input = "cns11643",
+ .canon = "euc-tw",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-euctw",
+ .canon = "euc-tw",
+ .flags = 0,
+ },
+ {
+ .input = "gb-18030",
+ .canon = "gb18030",
+ .flags = 0,
+ },
+ {
+ .input = "ibm1392",
+ .canon = "gb18030",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-1392",
+ .canon = "gb18030",
+ .flags = 0,
+ },
+ {
+ .input = "gb18030-2000",
+ .canon = "gb18030",
+ .flags = 0,
+ },
+ {
+ .input = "gb-2312",
+ .canon = "gb2312",
+ .flags = 0,
+ },
+ {
+ .input = "csgb2312",
+ .canon = "gb2312",
+ .flags = 0,
+ },
+ {
+ .input = "euc_cn",
+ .canon = "gb2312",
+ .flags = 0,
+ },
+ {
+ .input = "euccn",
+ .canon = "gb2312",
+ .flags = 0,
+ },
+ {
+ .input = "euc-cn",
+ .canon = "gb2312",
+ .flags = 0,
+ },
+ {
+ .input = "gb-k",
+ .canon = "gbk",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-1:1987",
+ .canon = "iso-8859-1",
+ .flags = 0,
+ },
+ {
+ .input = "iso-ir-100",
+ .canon = "iso-8859-1",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-1",
+ .canon = "iso-8859-1",
+ .flags = 0,
+ },
+ {
+ .input = "latin1",
+ .canon = "iso-8859-1",
+ .flags = 0,
+ },
+ {
+ .input = "l1",
+ .canon = "iso-8859-1",
+ .flags = 0,
+ },
+ {
+ .input = "ibm819",
+ .canon = "iso-8859-1",
+ .flags = 0,
+ },
+ {
+ .input = "cp819",
+ .canon = "iso-8859-1",
+ .flags = 0,
+ },
+ {
+ .input = "csisolatin1",
+ .canon = "iso-8859-1",
+ .flags = 0,
+ },
+ {
+ .input = "819",
+ .canon = "iso-8859-1",
+ .flags = 0,
+ },
+ {
+ .input = "cp819",
+ .canon = "iso-8859-1",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859-1",
+ .canon = "iso-8859-1",
+ .flags = 0,
+ },
+ {
+ .input = "8859-1",
+ .canon = "iso-8859-1",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859_1",
+ .canon = "iso-8859-1",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859_1",
+ .canon = "iso-8859-1",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-2:1987",
+ .canon = "iso-8859-2",
+ .flags = 0,
+ },
+ {
+ .input = "iso-ir-101",
+ .canon = "iso-8859-2",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-2",
+ .canon = "iso-8859-2",
+ .flags = 0,
+ },
+ {
+ .input = "latin2",
+ .canon = "iso-8859-2",
+ .flags = 0,
+ },
+ {
+ .input = "l2",
+ .canon = "iso-8859-2",
+ .flags = 0,
+ },
+ {
+ .input = "csisolatin2",
+ .canon = "iso-8859-2",
+ .flags = 0,
+ },
+ {
+ .input = "912",
+ .canon = "iso-8859-2",
+ .flags = 0,
+ },
+ {
+ .input = "cp912",
+ .canon = "iso-8859-2",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-912",
+ .canon = "iso-8859-2",
+ .flags = 0,
+ },
+ {
+ .input = "ibm912",
+ .canon = "iso-8859-2",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859-2",
+ .canon = "iso-8859-2",
+ .flags = 0,
+ },
+ {
+ .input = "8859-2",
+ .canon = "iso-8859-2",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859_2",
+ .canon = "iso-8859-2",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859_2",
+ .canon = "iso-8859-2",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-3:1988",
+ .canon = "iso-8859-3",
+ .flags = 0,
+ },
+ {
+ .input = "iso-ir-109",
+ .canon = "iso-8859-3",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-3",
+ .canon = "iso-8859-3",
+ .flags = 0,
+ },
+ {
+ .input = "latin3",
+ .canon = "iso-8859-3",
+ .flags = 0,
+ },
+ {
+ .input = "l3",
+ .canon = "iso-8859-3",
+ .flags = 0,
+ },
+ {
+ .input = "csisolatin3",
+ .canon = "iso-8859-3",
+ .flags = 0,
+ },
+ {
+ .input = "913",
+ .canon = "iso-8859-3",
+ .flags = 0,
+ },
+ {
+ .input = "cp913",
+ .canon = "iso-8859-3",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-913",
+ .canon = "iso-8859-3",
+ .flags = 0,
+ },
+ {
+ .input = "ibm913",
+ .canon = "iso-8859-3",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859-3",
+ .canon = "iso-8859-3",
+ .flags = 0,
+ },
+ {
+ .input = "8859-3",
+ .canon = "iso-8859-3",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859_3",
+ .canon = "iso-8859-3",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859_3",
+ .canon = "iso-8859-3",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-4:1988",
+ .canon = "iso-8859-4",
+ .flags = 0,
+ },
+ {
+ .input = "iso-ir-110",
+ .canon = "iso-8859-4",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-4",
+ .canon = "iso-8859-4",
+ .flags = 0,
+ },
+ {
+ .input = "latin4",
+ .canon = "iso-8859-4",
+ .flags = 0,
+ },
+ {
+ .input = "l4",
+ .canon = "iso-8859-4",
+ .flags = 0,
+ },
+ {
+ .input = "csisolatin4",
+ .canon = "iso-8859-4",
+ .flags = 0,
+ },
+ {
+ .input = "914",
+ .canon = "iso-8859-4",
+ .flags = 0,
+ },
+ {
+ .input = "cp914",
+ .canon = "iso-8859-4",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-914",
+ .canon = "iso-8859-4",
+ .flags = 0,
+ },
+ {
+ .input = "ibm914",
+ .canon = "iso-8859-4",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859-4",
+ .canon = "iso-8859-4",
+ .flags = 0,
+ },
+ {
+ .input = "8859-4",
+ .canon = "iso-8859-4",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859_4",
+ .canon = "iso-8859-4",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859_4",
+ .canon = "iso-8859-4",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-5:1988",
+ .canon = "iso-8859-5",
+ .flags = 0,
+ },
+ {
+ .input = "iso-ir-144",
+ .canon = "iso-8859-5",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-5",
+ .canon = "iso-8859-5",
+ .flags = 0,
+ },
+ {
+ .input = "cyrillic",
+ .canon = "iso-8859-5",
+ .flags = 0,
+ },
+ {
+ .input = "csisolatincyrillic",
+ .canon = "iso-8859-5",
+ .flags = 0,
+ },
+ {
+ .input = "915",
+ .canon = "iso-8859-5",
+ .flags = 0,
+ },
+ {
+ .input = "cp915",
+ .canon = "iso-8859-5",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-915",
+ .canon = "iso-8859-5",
+ .flags = 0,
+ },
+ {
+ .input = "ibm915",
+ .canon = "iso-8859-5",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859-5",
+ .canon = "iso-8859-5",
+ .flags = 0,
+ },
+ {
+ .input = "8859-5",
+ .canon = "iso-8859-5",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859_5",
+ .canon = "iso-8859-5",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859_5",
+ .canon = "iso-8859-5",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-6:1987",
+ .canon = "iso-8859-6",
+ .flags = 0,
+ },
+ {
+ .input = "iso-ir-127",
+ .canon = "iso-8859-6",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-6",
+ .canon = "iso-8859-6",
+ .flags = 0,
+ },
+ {
+ .input = "ecma-114",
+ .canon = "iso-8859-6",
+ .flags = 0,
+ },
+ {
+ .input = "asmo-708",
+ .canon = "iso-8859-6",
+ .flags = 0,
+ },
+ {
+ .input = "arabic",
+ .canon = "iso-8859-6",
+ .flags = 0,
+ },
+ {
+ .input = "csisolatinarabic",
+ .canon = "iso-8859-6",
+ .flags = 0,
+ },
+ {
+ .input = "1089",
+ .canon = "iso-8859-6",
+ .flags = 0,
+ },
+ {
+ .input = "cp1089",
+ .canon = "iso-8859-6",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-1089",
+ .canon = "iso-8859-6",
+ .flags = 0,
+ },
+ {
+ .input = "ibm1089",
+ .canon = "iso-8859-6",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859-6",
+ .canon = "iso-8859-6",
+ .flags = 0,
+ },
+ {
+ .input = "8859-6",
+ .canon = "iso-8859-6",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859_6",
+ .canon = "iso-8859-6",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859_6",
+ .canon = "iso-8859-6",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-7:1987",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "iso-ir-126",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-7",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "elot_928",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "ecma-118",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "greek",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "greek8",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "csisolatingreek",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "813",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "cp813",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-813",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "ibm813",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859-7",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "8859-7",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859_7",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859_7",
+ .canon = "iso-8859-7",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-8:1988",
+ .canon = "iso-8859-8",
+ .flags = 0,
+ },
+ {
+ .input = "iso-ir-138",
+ .canon = "iso-8859-8",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-8",
+ .canon = "iso-8859-8",
+ .flags = 0,
+ },
+ {
+ .input = "hebrew",
+ .canon = "iso-8859-8",
+ .flags = 0,
+ },
+ {
+ .input = "csisolatinhebrew",
+ .canon = "iso-8859-8",
+ .flags = 0,
+ },
+ {
+ .input = "916",
+ .canon = "iso-8859-8",
+ .flags = 0,
+ },
+ {
+ .input = "cp916",
+ .canon = "iso-8859-8",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-916",
+ .canon = "iso-8859-8",
+ .flags = 0,
+ },
+ {
+ .input = "ibm916",
+ .canon = "iso-8859-8",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859-8",
+ .canon = "iso-8859-8",
+ .flags = 0,
+ },
+ {
+ .input = "8859-8",
+ .canon = "iso-8859-8",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859_8",
+ .canon = "iso-8859-8",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859_8",
+ .canon = "iso-8859-8",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-9:1989",
+ .canon = "iso-8859-9",
+ .flags = 0,
+ },
+ {
+ .input = "iso-ir-148",
+ .canon = "iso-8859-9",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-9",
+ .canon = "iso-8859-9",
+ .flags = 0,
+ },
+ {
+ .input = "latin5",
+ .canon = "iso-8859-9",
+ .flags = 0,
+ },
+ {
+ .input = "l5",
+ .canon = "iso-8859-9",
+ .flags = 0,
+ },
+ {
+ .input = "csisolatin5",
+ .canon = "iso-8859-9",
+ .flags = 0,
+ },
+ {
+ .input = "920",
+ .canon = "iso-8859-9",
+ .flags = 0,
+ },
+ {
+ .input = "cp920",
+ .canon = "iso-8859-9",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-920",
+ .canon = "iso-8859-9",
+ .flags = 0,
+ },
+ {
+ .input = "ibm920",
+ .canon = "iso-8859-9",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859-9",
+ .canon = "iso-8859-9",
+ .flags = 0,
+ },
+ {
+ .input = "8859-9",
+ .canon = "iso-8859-9",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859_9",
+ .canon = "iso-8859-9",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859_9",
+ .canon = "iso-8859-9",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-13",
+ .canon = "iso-8859-13",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859-13",
+ .canon = "iso-8859-13",
+ .flags = 0,
+ },
+ {
+ .input = "8859-13",
+ .canon = "iso-8859-13",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859_13",
+ .canon = "iso-8859-13",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859_13",
+ .canon = "iso-8859-13",
+ .flags = 0,
+ },
+ {
+ .input = "iso-ir-199",
+ .canon = "iso-8859-14",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-14:1998",
+ .canon = "iso-8859-14",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-14",
+ .canon = "iso-8859-14",
+ .flags = 0,
+ },
+ {
+ .input = "latin8",
+ .canon = "iso-8859-14",
+ .flags = 0,
+ },
+ {
+ .input = "iso-celtic",
+ .canon = "iso-8859-14",
+ .flags = 0,
+ },
+ {
+ .input = "l8",
+ .canon = "iso-8859-14",
+ .flags = 0,
+ },
+ {
+ .input = "csisolatin9",
+ .canon = "iso-8859-15",
+ .flags = 0,
+ },
+ {
+ .input = "csisolatin0",
+ .canon = "iso-8859-15",
+ .flags = 0,
+ },
+ {
+ .input = "latin9",
+ .canon = "iso-8859-15",
+ .flags = 0,
+ },
+ {
+ .input = "latin0",
+ .canon = "iso-8859-15",
+ .flags = 0,
+ },
+ {
+ .input = "923",
+ .canon = "iso-8859-15",
+ .flags = 0,
+ },
+ {
+ .input = "cp923",
+ .canon = "iso-8859-15",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-923",
+ .canon = "iso-8859-15",
+ .flags = 0,
+ },
+ {
+ .input = "ibm923",
+ .canon = "iso-8859-15",
+ .flags = 0,
+ },
+ {
+ .input = "iso8859-15",
+ .canon = "iso-8859-15",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-15",
+ .canon = "iso-8859-15",
+ .flags = 0,
+ },
+ {
+ .input = "8859-15",
+ .canon = "iso-8859-15",
+ .flags = 0,
+ },
+ {
+ .input = "iso_8859-15_fdis",
+ .canon = "iso-8859-15",
+ .flags = 0,
+ },
+ {
+ .input = "l9",
+ .canon = "iso-8859-15",
+ .flags = 0,
+ },
+ {
+ .input = "koi-8-r",
+ .canon = "koi8-r",
+ .flags = 0,
+ },
+ {
+ .input = "cskoi8r",
+ .canon = "koi8-r",
+ .flags = 0,
+ },
+ {
+ .input = "koi8",
+ .canon = "koi8-r",
+ .flags = 0,
+ },
+ {
+ .input = "koi-8-u",
+ .canon = "koi8-u",
+ .flags = 0,
+ },
+ {
+ .input = "koi-8-t",
+ .canon = "koi8-t",
+ .flags = 0,
+ },
+ {
+ .input = "shiftjis",
+ .canon = "shift_jis",
+ .flags = 0,
+ },
+ {
+ .input = "ms_kanji",
+ .canon = "shift_jis",
+ .flags = 0,
+ },
+ {
+ .input = "csshiftjis",
+ .canon = "shift_jis",
+ .flags = 0,
+ },
+ {
+ .input = "cp-437",
+ .canon = "ibm437",
+ .flags = 0,
+ },
+ {
+ .input = "cp437",
+ .canon = "ibm437",
+ .flags = 0,
+ },
+ {
+ .input = "437",
+ .canon = "ibm437",
+ .flags = 0,
+ },
+ {
+ .input = "cspc8codepage437437",
+ .canon = "ibm437",
+ .flags = 0,
+ },
+ {
+ .input = "cspc8codepage437",
+ .canon = "ibm437",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-437",
+ .canon = "ibm437",
+ .flags = 0,
+ },
+ {
+ .input = "cp-850",
+ .canon = "ibm850",
+ .flags = 0,
+ },
+ {
+ .input = "cp850",
+ .canon = "ibm850",
+ .flags = 0,
+ },
+ {
+ .input = "850",
+ .canon = "ibm850",
+ .flags = 0,
+ },
+ {
+ .input = "cspc850multilingual850",
+ .canon = "ibm850",
+ .flags = 0,
+ },
+ {
+ .input = "cspc850multilingual",
+ .canon = "ibm850",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-850",
+ .canon = "ibm850",
+ .flags = 0,
+ },
+ {
+ .input = "cp-851",
+ .canon = "ibm851",
+ .flags = 0,
+ },
+ {
+ .input = "cp851",
+ .canon = "ibm851",
+ .flags = 0,
+ },
+ {
+ .input = "851",
+ .canon = "ibm851",
+ .flags = 0,
+ },
+ {
+ .input = "csibm851",
+ .canon = "ibm851",
+ .flags = 0,
+ },
+ {
+ .input = "cp-852",
+ .canon = "ibm852",
+ .flags = 0,
+ },
+ {
+ .input = "cp852",
+ .canon = "ibm852",
+ .flags = 0,
+ },
+ {
+ .input = "852",
+ .canon = "ibm852",
+ .flags = 0,
+ },
+ {
+ .input = "cspcp852",
+ .canon = "ibm852",
+ .flags = 0,
+ },
+ {
+ .input = "852",
+ .canon = "ibm852",
+ .flags = 0,
+ },
+ {
+ .input = "cspcp852",
+ .canon = "ibm852",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-852",
+ .canon = "ibm852",
+ .flags = 0,
+ },
+ {
+ .input = "cp-855",
+ .canon = "ibm855",
+ .flags = 0,
+ },
+ {
+ .input = "cp855",
+ .canon = "ibm855",
+ .flags = 0,
+ },
+ {
+ .input = "855",
+ .canon = "ibm855",
+ .flags = 0,
+ },
+ {
+ .input = "csibm855",
+ .canon = "ibm855",
+ .flags = 0,
+ },
+ {
+ .input = "cspcp855",
+ .canon = "ibm855",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-855",
+ .canon = "ibm855",
+ .flags = 0,
+ },
+ {
+ .input = "cp-857",
+ .canon = "ibm857",
+ .flags = 0,
+ },
+ {
+ .input = "cp857",
+ .canon = "ibm857",
+ .flags = 0,
+ },
+ {
+ .input = "857",
+ .canon = "ibm857",
+ .flags = 0,
+ },
+ {
+ .input = "csibm857",
+ .canon = "ibm857",
+ .flags = 0,
+ },
+ {
+ .input = "857",
+ .canon = "ibm857",
+ .flags = 0,
+ },
+ {
+ .input = "csibm857",
+ .canon = "ibm857",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-857",
+ .canon = "ibm857",
+ .flags = 0,
+ },
+ {
+ .input = "cp-860",
+ .canon = "ibm860",
+ .flags = 0,
+ },
+ {
+ .input = "cp860",
+ .canon = "ibm860",
+ .flags = 0,
+ },
+ {
+ .input = "860",
+ .canon = "ibm860",
+ .flags = 0,
+ },
+ {
+ .input = "csibm860",
+ .canon = "ibm860",
+ .flags = 0,
+ },
+ {
+ .input = "860",
+ .canon = "ibm860",
+ .flags = 0,
+ },
+ {
+ .input = "csibm860",
+ .canon = "ibm860",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-860",
+ .canon = "ibm860",
+ .flags = 0,
+ },
+ {
+ .input = "cp-861",
+ .canon = "ibm861",
+ .flags = 0,
+ },
+ {
+ .input = "cp861",
+ .canon = "ibm861",
+ .flags = 0,
+ },
+ {
+ .input = "861",
+ .canon = "ibm861",
+ .flags = 0,
+ },
+ {
+ .input = "cp-is",
+ .canon = "ibm861",
+ .flags = 0,
+ },
+ {
+ .input = "csibm861",
+ .canon = "ibm861",
+ .flags = 0,
+ },
+ {
+ .input = "861",
+ .canon = "ibm861",
+ .flags = 0,
+ },
+ {
+ .input = "cp-is",
+ .canon = "ibm861",
+ .flags = 0,
+ },
+ {
+ .input = "csibm861",
+ .canon = "ibm861",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-861",
+ .canon = "ibm861",
+ .flags = 0,
+ },
+ {
+ .input = "cp-862",
+ .canon = "ibm862",
+ .flags = 0,
+ },
+ {
+ .input = "cp862",
+ .canon = "ibm862",
+ .flags = 0,
+ },
+ {
+ .input = "862",
+ .canon = "ibm862",
+ .flags = 0,
+ },
+ {
+ .input = "cspc862latinhebrew862",
+ .canon = "ibm862",
+ .flags = 0,
+ },
+ {
+ .input = "cspc862latinhebrew",
+ .canon = "ibm862",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-862",
+ .canon = "ibm862",
+ .flags = 0,
+ },
+ {
+ .input = "cp-863",
+ .canon = "ibm863",
+ .flags = 0,
+ },
+ {
+ .input = "cp863",
+ .canon = "ibm863",
+ .flags = 0,
+ },
+ {
+ .input = "863",
+ .canon = "ibm863",
+ .flags = 0,
+ },
+ {
+ .input = "csibm863",
+ .canon = "ibm863",
+ .flags = 0,
+ },
+ {
+ .input = "863",
+ .canon = "ibm863",
+ .flags = 0,
+ },
+ {
+ .input = "csibm863",
+ .canon = "ibm863",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-863",
+ .canon = "ibm863",
+ .flags = 0,
+ },
+ {
+ .input = "cp-864",
+ .canon = "ibm864",
+ .flags = 0,
+ },
+ {
+ .input = "cp864",
+ .canon = "ibm864",
+ .flags = 0,
+ },
+ {
+ .input = "csibm864",
+ .canon = "ibm864",
+ .flags = 0,
+ },
+ {
+ .input = "csibm864",
+ .canon = "ibm864",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-864",
+ .canon = "ibm864",
+ .flags = 0,
+ },
+ {
+ .input = "cp-865",
+ .canon = "ibm865",
+ .flags = 0,
+ },
+ {
+ .input = "cp865",
+ .canon = "ibm865",
+ .flags = 0,
+ },
+ {
+ .input = "865",
+ .canon = "ibm865",
+ .flags = 0,
+ },
+ {
+ .input = "csibm865",
+ .canon = "ibm865",
+ .flags = 0,
+ },
+ {
+ .input = "865",
+ .canon = "ibm865",
+ .flags = 0,
+ },
+ {
+ .input = "csibm865",
+ .canon = "ibm865",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-865",
+ .canon = "ibm865",
+ .flags = 0,
+ },
+ {
+ .input = "cp-866",
+ .canon = "ibm866",
+ .flags = 0,
+ },
+ {
+ .input = "cp866",
+ .canon = "ibm866",
+ .flags = 0,
+ },
+ {
+ .input = "866",
+ .canon = "ibm866",
+ .flags = 0,
+ },
+ {
+ .input = "csibm866",
+ .canon = "ibm866",
+ .flags = 0,
+ },
+ {
+ .input = "866",
+ .canon = "ibm866",
+ .flags = 0,
+ },
+ {
+ .input = "csibm866",
+ .canon = "ibm866",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-866",
+ .canon = "ibm866",
+ .flags = 0,
+ },
+ {
+ .input = "cp-868",
+ .canon = "ibm868",
+ .flags = 0,
+ },
+ {
+ .input = "cp868",
+ .canon = "ibm868",
+ .flags = 0,
+ },
+ {
+ .input = "cp-ar",
+ .canon = "ibm868",
+ .flags = 0,
+ },
+ {
+ .input = "csibm868",
+ .canon = "ibm868",
+ .flags = 0,
+ },
+ {
+ .input = "ibm-868",
+ .canon = "ibm868",
+ .flags = 0,
+ },
+ {
+ .input = "cp-869",
+ .canon = "ibm869",
+ .flags = 0,
+ },
+ {
+ .input = "cp869",
+ .canon = "ibm869",
+ .flags = 0,
+ },
+ {
+ .input = "869",
+ .canon = "ibm869",
+ .flags = 0,
+ },
+ {
+ .input = "cp-gr",
+ .canon = "ibm869",
+ .flags = 0,
+ },
+ {
+ .input = "csibm869",
+ .canon = "ibm869",
+ .flags = 0,
+ },
+ {
+ .input = "cp-891",
+ .canon = "ibm891",
+ .flags = 0,
+ },
+ {
+ .input = "cp891",
+ .canon = "ibm891",
+ .flags = 0,
+ },
+ {
+ .input = "csibm891",
+ .canon = "ibm891",
+ .flags = 0,
+ },
+ {
+ .input = "cp-903",
+ .canon = "ibm903",
+ .flags = 0,
+ },
+ {
+ .input = "cp903",
+ .canon = "ibm903",
+ .flags = 0,
+ },
+ {
+ .input = "csibm903",
+ .canon = "ibm903",
+ .flags = 0,
+ },
+ {
+ .input = "cp-904",
+ .canon = "ibm904",
+ .flags = 0,
+ },
+ {
+ .input = "cp904",
+ .canon = "ibm904",
+ .flags = 0,
+ },
+ {
+ .input = "904",
+ .canon = "ibm904",
+ .flags = 0,
+ },
+ {
+ .input = "csibm904",
+ .canon = "ibm904",
+ .flags = 0,
+ },
+ {
+ .input = "cp-1251",
+ .canon = "cp1251",
+ .flags = 0,
+ },
+ {
+ .input = "windows-1251",
+ .canon = "cp1251",
+ .flags = 0,
+ },
+ {
+ .input = "cp-1255",
+ .canon = "cp1255",
+ .flags = 0,
+ },
+ {
+ .input = "windows-1255",
+ .canon = "cp1255",
+ .flags = 0,
+ },
+ {
+ .input = "tis620.2533",
+ .canon = "tis-620",
+ .flags = 0,
+ },
+};
+
+#endif /* SRC_LIBMIME_MIME_ENCODING_LIST_H_ */
diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c
new file mode 100644
index 0000000..e51539e
--- /dev/null
+++ b/src/libmime/mime_expressions.c
@@ -0,0 +1,2392 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <contrib/libucl/ucl.h>
+#include "config.h"
+#include "util.h"
+#include "cfg_file.h"
+#include "rspamd.h"
+#include "message.h"
+#include "mime_expressions.h"
+#include "libserver/html/html.h"
+#include "lua/lua_common.h"
+#include "utlist.h"
+/*
+ * Forward declarations. The first group (through rspamd_has_symbol_expr) are
+ * the built-in expression functions that rspamd_functions_list maps from
+ * their textual names; all of them share the (task, args, unused) signature.
+ * The last four are the callbacks referenced by mime_expr_subr.
+ */
+gboolean rspamd_compare_encoding(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+gboolean rspamd_header_exists(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+gboolean rspamd_parts_distance(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+gboolean rspamd_recipients_distance(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+gboolean rspamd_has_only_html_part(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+gboolean rspamd_is_recipients_sorted(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+gboolean rspamd_compare_transfer_encoding(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+gboolean rspamd_is_html_balanced(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+gboolean rspamd_has_html_tag(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+gboolean rspamd_has_fake_html(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+static gboolean rspamd_raw_header_exists(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+static gboolean rspamd_check_smtp_data(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+static gboolean rspamd_content_type_is_type(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+static gboolean rspamd_content_type_is_subtype(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+static gboolean rspamd_content_type_has_param(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+static gboolean rspamd_content_type_compare_param(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+static gboolean rspamd_has_content_part(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+static gboolean rspamd_has_content_part_len(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+static gboolean rspamd_is_empty_body(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+static gboolean rspamd_has_flag_expr(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+static gboolean rspamd_has_symbol_expr(struct rspamd_task *task,
+ GArray *args,
+ void *unused);
+
+static rspamd_expression_atom_t *rspamd_mime_expr_parse(const gchar *line, gsize len,
+ rspamd_mempool_t *pool, gpointer ud, GError **err);
+static gdouble rspamd_mime_expr_process(void *ud, rspamd_expression_atom_t *atom);
+static gint rspamd_mime_expr_priority(rspamd_expression_atom_t *atom);
+static void rspamd_mime_expr_destroy(rspamd_expression_atom_t *atom);
+
+/**
+ * Regexp structure
+ *
+ * Allocated from the memory pool and filled by
+ * rspamd_mime_expr_parse_regexp_atom() from text of the form
+ * [name=]/pattern/flags.
+ */
+struct rspamd_regexp_atom {
+ enum rspamd_re_type type; /**< regexp type, chosen by the type flags or a {long} option */
+ gchar *regexp_text; /**< regexp text representation (pool copy of the atom text) */
+ rspamd_regexp_t *regexp; /**< regexp structure compiled from the pattern and its flags */
+ union {
+ const gchar *header; /**< header name for header regexps */
+ const gchar *selector; /**< selector name for lua selector regexp */
+ } extra;
+ gboolean is_test; /**< true if this expression must be tested ('T' flag) */
+ gboolean is_strong; /**< true if headers search must be case sensitive ('S' flag) */
+ gboolean is_multiple; /**< true if we need to match all inclusions of atom ('A' flag) */
+};
+
+/**
+ * Rspamd expression function
+ *
+ * Built by rspamd_mime_expr_parse_function_atom() from text of the form
+ * name(arg1, arg2, ...).
+ */
+struct rspamd_function_atom {
+ gchar *name; /**< name of function (the text before the first '(') */
+ GArray *args; /**< its args, a GArray of struct expression_argument */
+};
+
+enum rspamd_mime_atom_type {
+ MIME_ATOM_REGEXP = 0, /* default type that rspamd_mime_expr_parse() starts with */
+ MIME_ATOM_INTERNAL_FUNCTION, /* selected once the parser meets '(' after a name */
+ MIME_ATOM_LUA_FUNCTION, /* plain string atom, likely naming a lua function */
+ MIME_ATOM_LOCAL_LUA_FUNCTION, /* New style: atom text starts with "lua:" */
+};
+
+struct rspamd_mime_atom {
+ gchar *str; /* NOTE(review): presumably the atom's source text - confirm */
+ union {
+ struct rspamd_regexp_atom *re; /* parsed regexp atom */
+ struct rspamd_function_atom *func; /* parsed function call atom */
+ const gchar *lua_function; /* NOTE(review): presumably a lua function name - confirm */
+ gint lua_cbref; /* NOTE(review): presumably a lua registry reference - confirm */
+ } d;
+ enum rspamd_mime_atom_type type; /* NOTE(review): presumably selects the valid member of d - confirm */
+};
+
+/*
+ * List of internal functions of rspamd
+ * Sorted by name to use bsearch
+ *
+ * Each entry maps the name used in an expression to its C implementation;
+ * user_data is NULL for every built-in listed here. New entries must keep
+ * the strcmp() order of the name field, because fl_cmp() orders entries by
+ * that field.
+ */
+static struct _fl {
+ const gchar *name;
+ rspamd_internal_func_t func;
+ void *user_data;
+} rspamd_functions_list[] = {
+ {"check_smtp_data", rspamd_check_smtp_data, NULL},
+ {"compare_encoding", rspamd_compare_encoding, NULL},
+ {"compare_parts_distance", rspamd_parts_distance, NULL},
+ {"compare_recipients_distance", rspamd_recipients_distance, NULL},
+ {"compare_transfer_encoding", rspamd_compare_transfer_encoding, NULL},
+ {"content_type_compare_param", rspamd_content_type_compare_param, NULL},
+ {"content_type_has_param", rspamd_content_type_has_param, NULL},
+ {"content_type_is_subtype", rspamd_content_type_is_subtype, NULL},
+ {"content_type_is_type", rspamd_content_type_is_type, NULL},
+ {"has_content_part", rspamd_has_content_part, NULL},
+ {"has_content_part_len", rspamd_has_content_part_len, NULL},
+ {"has_fake_html", rspamd_has_fake_html, NULL},
+ {"has_flag", rspamd_has_flag_expr, NULL},
+ {"has_html_tag", rspamd_has_html_tag, NULL},
+ {"has_only_html_part", rspamd_has_only_html_part, NULL},
+ {"has_symbol", rspamd_has_symbol_expr, NULL},
+ {"header_exists", rspamd_header_exists, NULL},
+ {"is_empty_body", rspamd_is_empty_body, NULL},
+ {"is_html_balanced", rspamd_is_html_balanced, NULL},
+ {"is_recipients_sorted", rspamd_is_recipients_sorted, NULL},
+ {"raw_header_exists", rspamd_raw_header_exists, NULL},
+};
+/*
+ * Callback table of type struct rspamd_atom_subr wiring this file's
+ * parse/process/priority/destroy implementations for mime expression atoms.
+ */
+const struct rspamd_atom_subr mime_expr_subr = {
+ .parse = rspamd_mime_expr_parse,
+ .process = rspamd_mime_expr_process,
+ .priority = rspamd_mime_expr_priority,
+ .destroy = rspamd_mime_expr_destroy};
+
+static struct _fl *list_ptr = &rspamd_functions_list[0]; /* initially the built-in table */
+static guint32 functions_number = sizeof(rspamd_functions_list) /
+ sizeof(struct _fl); /* element count of rspamd_functions_list */
+static gboolean list_allocated = FALSE; /* NOTE(review): presumably set once list_ptr refers to a heap copy - confirm */
+
+/*
+ * Ordering callback for the functions table: compares two struct _fl
+ * entries by their name fields using strcmp(). Suitable for bsearch().
+ */
+static gint
+fl_cmp(const void *s1, const void *s2)
+{
+	const struct _fl *lhs = s1;
+	const struct _fl *rhs = s2;
+
+	return strcmp(lhs->name, rhs->name);
+}
+
+/*
+ * Returns the GQuark interned for the static string "mime-expressions".
+ * NOTE(review): presumably used as the GError domain of this file - confirm.
+ */
+static GQuark
+rspamd_mime_expr_quark(void)
+{
+	const GQuark domain = g_quark_from_static_string("mime-expressions");
+
+	return domain;
+}
+
+#define TYPE_CHECK(str, type, len) (sizeof(type) - 1 == (len) && rspamd_lc_cmp((str), (type), (len)) == 0)
+static gboolean
+rspamd_parse_long_option(const gchar *start, gsize len,
+ struct rspamd_regexp_atom *a)
+{
+ gboolean ret = FALSE;
+
+ if (TYPE_CHECK(start, "body", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_BODY;
+ }
+ else if (TYPE_CHECK(start, "part", len) ||
+ TYPE_CHECK(start, "mime", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_MIME;
+ }
+ else if (TYPE_CHECK(start, "raw_part", len) ||
+ TYPE_CHECK(start, "raw_mime", len) ||
+ TYPE_CHECK(start, "mime_raw", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_RAWMIME;
+ }
+ else if (TYPE_CHECK(start, "header", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_HEADER;
+ }
+ else if (TYPE_CHECK(start, "mime_header", len) ||
+ TYPE_CHECK(start, "header_mime", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_MIMEHEADER;
+ }
+ else if (TYPE_CHECK(start, "raw_header", len) ||
+ TYPE_CHECK(start, "header_raw", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_RAWHEADER;
+ }
+ else if (TYPE_CHECK(start, "all_header", len) ||
+ TYPE_CHECK(start, "header_all", len) ||
+ TYPE_CHECK(start, "all_headers", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_ALLHEADER;
+ }
+ else if (TYPE_CHECK(start, "url", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_URL;
+ }
+ else if (TYPE_CHECK(start, "email", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_EMAIL;
+ }
+ else if (TYPE_CHECK(start, "sa_body", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_SABODY;
+ }
+ else if (TYPE_CHECK(start, "sa_raw_body", len) ||
+ TYPE_CHECK(start, "sa_body_raw", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_SARAWBODY;
+ }
+ else if (TYPE_CHECK(start, "words", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_WORDS;
+ }
+ else if (TYPE_CHECK(start, "raw_words", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_RAWWORDS;
+ }
+ else if (TYPE_CHECK(start, "stem_words", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_STEMWORDS;
+ }
+ else if (TYPE_CHECK(start, "selector", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_SELECTOR;
+ }
+
+ return ret;
+}
+
+/*
+ * Rspamd regexp utility functions
+ */
+
+/*
+ * Parses a regexp atom of the form [name=]/pattern/flags.
+ *
+ * - Leading whitespace is skipped.
+ * - If an '=' precedes the first '/', the text before it is kept as the
+ *   header (or selector) name.
+ * - The pattern runs from the first '/' to the next '/' that is not
+ *   preceded by a backslash; an empty or unterminated pattern is rejected.
+ * - Flags i, m, s, x, u, O, r and L are passed to rspamd_regexp_new();
+ *   'o' is accepted and ignored; H, R, B, C, D, M, P, Q, U, X and $ select
+ *   the regexp type; {name} selects the type via rspamd_parse_long_option();
+ *   T, S and A set is_test, is_strong and is_multiple. Any other character
+ *   stops flag parsing.
+ * - When no type was selected but a name is present, a header regexp is
+ *   assumed. Header, raw header, mime header and selector regexps require
+ *   the name part.
+ *
+ * @param pool memory pool that owns the returned atom and its strings
+ * @param line NUL terminated atom text (may be NULL, which is an error)
+ * @param cfg  configuration (not used by this function)
+ * @return the parsed atom, or NULL when the text cannot be parsed or the
+ *         regexp cannot be compiled
+ */
+static struct rspamd_regexp_atom *
+rspamd_mime_expr_parse_regexp_atom(rspamd_mempool_t *pool, const gchar *line,
+								   struct rspamd_config *cfg)
+{
+	const gchar *begin, *end, *p, *src, *start, *brace;
+	gchar *dbegin, *dend, *extra = NULL;
+	struct rspamd_regexp_atom *result;
+	GError *err = NULL;
+	GString *re_flags;
+
+	if (line == NULL) {
+		msg_err_pool("cannot parse NULL line");
+		return NULL;
+	}
+
+	src = line;
+	result = rspamd_mempool_alloc0(pool, sizeof(struct rspamd_regexp_atom));
+	/* Skip whitespaces */
+	while (g_ascii_isspace(*line)) {
+		line++;
+	}
+	if (*line == '\0') {
+		msg_warn_pool("got empty regexp");
+		return NULL;
+	}
+
+	result->type = RSPAMD_RE_MAX;
+
+	start = line;
+	/* First try to find header name */
+	begin = strchr(line, '/');
+	if (begin != NULL) {
+		/* Walk back from the first slash looking for the '=' separator */
+		p = begin;
+		end = NULL;
+		while (p != line) {
+			if (*p == '=') {
+				end = p;
+				break;
+			}
+			p--;
+		}
+
+		if (end) {
+			extra = rspamd_mempool_alloc(pool, end - line + 1);
+			rspamd_strlcpy(extra, line, end - line + 1);
+			line = end;
+		}
+	}
+	else {
+		extra = rspamd_mempool_strdup(pool, line);
+		result->type = RSPAMD_RE_MAX;
+		line = start;
+	}
+	/* Find begin of regexp */
+	while (*line && *line != '/') {
+		line++;
+	}
+	if (*line != '\0') {
+		begin = line + 1;
+	}
+	else if (extra == NULL) {
+		/* Assume that line without // is just a header name */
+		extra = rspamd_mempool_strdup(pool, line);
+		result->type = RSPAMD_RE_HEADER;
+		return result;
+	}
+	else {
+		/* We got header name earlier but have not found // expression, so it is invalid regexp */
+		msg_warn_pool(
+			"got no header name (eg. header=) but without corresponding regexp, %s",
+			src);
+		return NULL;
+	}
+	/* Find end: a slash preceded by a backslash does not end the pattern */
+	end = begin;
+	while (*end && (*end != '/' || *(end - 1) == '\\')) {
+		end++;
+	}
+	if (end == begin || *end != '/') {
+		msg_warn_pool("no trailing / in regexp %s", src);
+		return NULL;
+	}
+	/* Parse flags */
+	p = end + 1;
+	/* From here on every exit path must release re_flags */
+	re_flags = g_string_sized_new(32);
+
+	while (p != NULL) {
+		switch (*p) {
+		case 'i':
+		case 'm':
+		case 's':
+		case 'x':
+		case 'u':
+		case 'O':
+		case 'r':
+		case 'L':
+			/* Handled by rspamd_regexp_t */
+			g_string_append_c(re_flags, *p);
+			p++;
+			break;
+		case 'o':
+			p++;
+			break;
+		/* Type flags */
+		case 'H':
+			result->type = RSPAMD_RE_HEADER;
+			p++;
+			break;
+		case 'R':
+			result->type = RSPAMD_RE_ALLHEADER;
+			p++;
+			break;
+		case 'B':
+			result->type = RSPAMD_RE_MIMEHEADER;
+			p++;
+			break;
+		case 'C':
+			result->type = RSPAMD_RE_SABODY;
+			p++;
+			break;
+		case 'D':
+			result->type = RSPAMD_RE_SARAWBODY;
+			p++;
+			break;
+		case 'M':
+			result->type = RSPAMD_RE_BODY;
+			p++;
+			break;
+		case 'P':
+			result->type = RSPAMD_RE_MIME;
+			p++;
+			break;
+		case 'Q':
+			result->type = RSPAMD_RE_RAWMIME;
+			p++;
+			break;
+		case 'U':
+			result->type = RSPAMD_RE_URL;
+			p++;
+			break;
+		case 'X':
+			result->type = RSPAMD_RE_RAWHEADER;
+			p++;
+			break;
+		case '$':
+			result->type = RSPAMD_RE_SELECTOR;
+			p++;
+			break;
+		case '{':
+			/* Long definition */
+			if ((brace = strchr(p + 1, '}')) != NULL) {
+				if (!rspamd_parse_long_option(p + 1, brace - (p + 1), result)) {
+					msg_warn_pool("invalid long regexp type: %*s in '%s'",
+								  (int) (brace - (p + 1)), p + 1, src);
+					p = NULL;
+				}
+				else {
+					p = brace + 1;
+				}
+			}
+			else {
+				p = NULL;
+			}
+			break;
+		/* Other flags */
+		case 'T':
+			result->is_test = TRUE;
+			p++;
+			break;
+		case 'S':
+			result->is_strong = TRUE;
+			p++;
+			break;
+		case 'A':
+			result->is_multiple = TRUE;
+			p++;
+			break;
+		/* Stop flags parsing */
+		default:
+			p = NULL;
+			break;
+		}
+	}
+
+	if (result->type >= RSPAMD_RE_MAX) {
+		if (extra) {
+			/* Assume header regexp */
+			result->extra.header = extra;
+			result->type = RSPAMD_RE_HEADER;
+		}
+		else {
+			msg_err_pool("could not read regexp: %s, unknown type", src);
+			g_string_free(re_flags, TRUE);
+			return NULL;
+		}
+	}
+
+	if ((result->type == RSPAMD_RE_HEADER ||
+		 result->type == RSPAMD_RE_RAWHEADER ||
+		 result->type == RSPAMD_RE_MIMEHEADER)) {
+		if (extra == NULL) {
+			msg_err_pool("header regexp: '%s' has no header part", src);
+			g_string_free(re_flags, TRUE);
+			return NULL;
+		}
+		else {
+			result->extra.header = extra;
+		}
+	}
+
+	if (result->type == RSPAMD_RE_SELECTOR) {
+		if (extra == NULL) {
+			msg_err_pool("selector regexp: '%s' has no selector part", src);
+			g_string_free(re_flags, TRUE);
+			return NULL;
+		}
+		else {
+			result->extra.selector = extra;
+		}
+	}
+
+
+	result->regexp_text = rspamd_mempool_strdup(pool, start);
+	dbegin = result->regexp_text + (begin - start);
+	dend = result->regexp_text + (end - start);
+	/* Temporarily cut the copy at the closing slash to compile the pattern only */
+	*dend = '\0';
+
+	result->regexp = rspamd_regexp_new(dbegin, re_flags->str,
+									   &err);
+
+	g_string_free(re_flags, TRUE);
+
+	if (result->regexp == NULL || err != NULL) {
+		msg_warn_pool("could not read regexp: %s while reading regexp %e",
+					  src, err);
+
+		if (err) {
+			g_error_free(err);
+		}
+
+		return NULL;
+	}
+
+	if (result->is_multiple) {
+		rspamd_regexp_set_maxhits(result->regexp, 0);
+	}
+	else {
+		rspamd_regexp_set_maxhits(result->regexp, 1);
+	}
+
+	rspamd_regexp_set_ud(result->regexp, result);
+
+	/* Restore the closing slash so regexp_text holds the full atom text again */
+	*dend = '/';
+
+	return result;
+}
+
+/*
+ * Parses a function atom of the form name(arg1, arg2, ...).
+ *
+ * The name is the text before the first '('; arguments are read between
+ * that brace and the last ')'. Both braces must be present (asserted).
+ * Arguments are separated by commas:
+ * - an argument starting with '/' is compiled with
+ *   rspamd_regexp_cache_create() and stored as EXPRESSION_ARGUMENT_REGEXP;
+ *   if compilation fails it is stored as a plain string instead;
+ * - any other argument is stored as EXPRESSION_ARGUMENT_NORMAL, with a
+ *   leading and a trailing quote character (' or ") dropped;
+ * - a backslash makes the following character part of the argument, so an
+ *   escaped comma does not end it.
+ *
+ * @param pool  memory pool used for the atom, its name and argument strings
+ * @param input NUL terminated text of the function call
+ * @return the parsed atom; its args member is a GArray created with
+ *         g_array_new() and is not released by this function
+ */
+struct rspamd_function_atom *
+rspamd_mime_expr_parse_function_atom(rspamd_mempool_t *pool, const gchar *input)
+{
+	const gchar *obrace, *ebrace, *p, *c;
+	gchar t, *databuf;
+	guint len;
+	struct rspamd_function_atom *res;
+	struct expression_argument arg;
+	GError *err = NULL;
+	enum {
+		start_read_argument = 0,
+		in_string,
+		in_regexp,
+		got_backslash,
+		got_comma
+	} state,
+		prev_state = 0;
+
+	obrace = strchr(input, '(');
+	ebrace = strrchr(input, ')');
+
+	g_assert(obrace != NULL && ebrace != NULL);
+
+	res = rspamd_mempool_alloc0(pool, sizeof(*res));
+	res->name = rspamd_mempool_alloc(pool, obrace - input + 1);
+	rspamd_strlcpy(res->name, input, obrace - input + 1);
+	res->args = g_array_new(FALSE, FALSE, sizeof(struct expression_argument));
+
+	p = obrace + 1;
+	c = p;
+	state = start_read_argument;
+
+	/* Read arguments */
+	while (p <= ebrace) {
+		t = *p;
+		switch (state) {
+		case start_read_argument:
+			if (t == '/') {
+				state = in_regexp;
+				c = p;
+			}
+			else if (!g_ascii_isspace(t)) {
+				state = in_string;
+
+				/* Skip the opening quote, if any */
+				if (t == '\'' || t == '\"') {
+					c = p + 1;
+				}
+				else {
+					c = p;
+				}
+			}
+			p++;
+			break;
+		case in_regexp:
+			if (t == '\\') {
+				state = got_backslash;
+				prev_state = in_regexp;
+			}
+			else if (t == ',' || p == ebrace) {
+				len = p - c + 1;
+				databuf = rspamd_mempool_alloc(pool, len);
+				rspamd_strlcpy(databuf, c, len);
+				arg.type = EXPRESSION_ARGUMENT_REGEXP;
+				arg.data = rspamd_regexp_cache_create(NULL, databuf, NULL, &err);
+
+				if (arg.data == NULL) {
+					/* Fallback to string */
+					msg_warn("cannot parse slashed argument %s as regexp: %s",
+							 databuf,
+							 err != NULL ? err->message : "unknown error");
+					/*
+					 * Free and reset the error: a later invalid argument
+					 * must not see a stale, already freed GError.
+					 */
+					g_clear_error(&err);
+					arg.type = EXPRESSION_ARGUMENT_NORMAL;
+					arg.data = databuf;
+				}
+
+				g_array_append_val(res->args, arg);
+				state = got_comma;
+			}
+			p++;
+			break;
+		case in_string:
+			if (t == '\\') {
+				state = got_backslash;
+				prev_state = in_string;
+			}
+			else if (t == ',' || p == ebrace) {
+				/* Drop the closing quote, if any */
+				if (*(p - 1) == '\'' || *(p - 1) == '\"') {
+					len = p - c;
+				}
+				else {
+					len = p - c + 1;
+				}
+
+				databuf = rspamd_mempool_alloc(pool, len);
+				rspamd_strlcpy(databuf, c, len);
+				arg.type = EXPRESSION_ARGUMENT_NORMAL;
+				arg.data = databuf;
+				g_array_append_val(res->args, arg);
+				state = got_comma;
+			}
+			p++;
+			break;
+		case got_backslash:
+			state = prev_state;
+			p++;
+			break;
+		case got_comma:
+			/* The separator was already consumed, so p is not advanced here */
+			state = start_read_argument;
+			break;
+		}
+	}
+
+	return res;
+}
+
+/*
+ * Parse one mime expression atom from `line` (at most `len` bytes are read).
+ *
+ * The scanner recognises four kinds of atoms:
+ *  - MIME_ATOM_REGEXP (the default): text containing `/.../` followed by
+ *    optional flags (letters, `$`, or a `{...}` group);
+ *  - MIME_ATOM_INTERNAL_FUNCTION: `name(...)` with balanced parentheses;
+ *  - MIME_ATOM_LOCAL_LUA_FUNCTION: `lua:name`, resolved through the
+ *    "functions" table of real_ud->conf_obj;
+ *  - MIME_ATOM_LUA_FUNCTION: a bare identifier terminated by any other
+ *    character, resolved as a Lua global function.
+ *
+ * `ud` must point to a struct rspamd_mime_expr_ud. On success a pool-allocated
+ * atom is returned whose `len` is the number of bytes consumed and whose
+ * `data` is the struct rspamd_mime_atom; on failure NULL is returned and
+ * `err` is set.
+ */
+static rspamd_expression_atom_t *
+rspamd_mime_expr_parse(const gchar *line, gsize len,
+ rspamd_mempool_t *pool, gpointer ud, GError **err)
+{
+ rspamd_expression_atom_t *a = NULL;
+ struct rspamd_mime_atom *mime_atom = NULL;
+ const gchar *p, *end, *c = NULL;
+ struct rspamd_mime_expr_ud *real_ud = (struct rspamd_mime_expr_ud *) ud;
+ struct rspamd_config *cfg;
+ rspamd_regexp_t *own_re;
+ gchar t;
+ gint type = MIME_ATOM_REGEXP, obraces = 0, ebraces = 0;
+ /* Scanner states; the got_* states only switch state and do not consume input */
+ enum {
+ in_header = 0,
+ got_slash,
+ in_regexp,
+ got_backslash,
+ got_second_slash,
+ in_flags,
+ in_flags_brace,
+ got_obrace,
+ in_function,
+ in_local_function,
+ got_ebrace,
+ end_atom,
+ bad_atom
+ } state = 0,
+ prev_state = 0;
+
+ p = line;
+ end = p + len;
+ cfg = real_ud->cfg;
+
+ while (p < end) {
+ t = *p;
+
+ switch (state) {
+ case in_header:
+ if (t == '/') {
+ /* Regexp */
+ state = got_slash;
+ }
+ else if (t == '(') {
+ /* Function */
+ state = got_obrace;
+ }
+ else if (!g_ascii_isalnum(t) && t != '_' && t != '-' && t != '=') {
+ if (t == ':') {
+ /* Only the exact prefix "lua:" starts a local function name */
+ if (p - line == 3 && memcmp(line, "lua", 3) == 0) {
+ type = MIME_ATOM_LOCAL_LUA_FUNCTION;
+ state = in_local_function;
+ c = p + 1;
+ }
+ }
+ else {
+ /* Likely lua function, identified by just a string */
+ type = MIME_ATOM_LUA_FUNCTION;
+ state = end_atom;
+ /* Do not increase p */
+ continue;
+ }
+ }
+ /*
+ * NOTE(review): whitespace is neither alphanumeric nor one of `_-=`,
+ * so it is already handled by the branch above; this branch (and
+ * therefore the bad_atom state) looks unreachable.
+ */
+ else if (g_ascii_isspace(t)) {
+ state = bad_atom;
+ }
+ p++;
+ break;
+ case got_slash:
+ state = in_regexp;
+ break;
+ case in_regexp:
+ if (t == '\\') {
+ state = got_backslash;
+ prev_state = in_regexp;
+ }
+ else if (t == '/') {
+ state = got_second_slash;
+ }
+ p++;
+ break;
+ case got_second_slash:
+ state = in_flags;
+ break;
+ case in_flags:
+ if (t == '{') {
+ state = in_flags_brace;
+ p++;
+ }
+ else if (!g_ascii_isalpha(t) && t != '$') {
+ /* First non-flag character terminates the atom; it is not consumed */
+ state = end_atom;
+ }
+ else {
+ p++;
+ }
+ break;
+ case in_flags_brace:
+ if (t == '}') {
+ state = in_flags;
+ }
+ p++;
+ break;
+ case got_backslash:
+ /* Skip the escaped character and return to the interrupted state */
+ state = prev_state;
+ p++;
+ break;
+ case got_obrace:
+ state = in_function;
+ type = MIME_ATOM_INTERNAL_FUNCTION;
+ obraces++;
+ break;
+ case in_function:
+ if (t == '\\') {
+ state = got_backslash;
+ prev_state = in_function;
+ }
+ else if (t == '(') {
+ obraces++;
+ }
+ else if (t == ')') {
+ ebraces++;
+ /* The function atom ends when parentheses are balanced */
+ if (ebraces == obraces) {
+ state = got_ebrace;
+ }
+ }
+ p++;
+ break;
+ case in_local_function:
+ if (!(g_ascii_isalnum(t) || t == '-' || t == '_')) {
+ g_assert(c != NULL);
+ state = end_atom;
+ }
+ else {
+ p++;
+ }
+ break;
+ case got_ebrace:
+ state = end_atom;
+ break;
+ case bad_atom:
+ g_set_error(err, rspamd_mime_expr_quark(), 100, "cannot parse"
+ " mime atom '%s' when reading symbol '%c' at offset %d, "
+ "near %.*s",
+ line, t, (gint) (p - line),
+ (gint) MIN(end - p, 10), p);
+ return NULL;
+ case end_atom:
+ goto set;
+ }
+ }
+set:
+
+ /*
+ * Reached either via end_atom or because the input ran out. Only states in
+ * which a complete atom has been seen are accepted; running out of input in
+ * any other state (including in_header) is an error.
+ */
+ if (p - line == 0 || (state != got_ebrace && state != got_second_slash &&
+ state != in_flags && state != end_atom)) {
+ g_set_error(err, rspamd_mime_expr_quark(), 200, "incomplete or empty"
+ " mime atom");
+ return NULL;
+ }
+
+ /* Keep a NUL-terminated copy of the consumed text in the pool */
+ mime_atom = rspamd_mempool_alloc(pool, sizeof(*mime_atom));
+ mime_atom->type = type;
+ mime_atom->str = rspamd_mempool_alloc(pool, p - line + 1);
+ rspamd_strlcpy(mime_atom->str, line, p - line + 1);
+
+ if (type == MIME_ATOM_REGEXP) {
+ mime_atom->d.re = rspamd_mime_expr_parse_regexp_atom(pool,
+ mime_atom->str, cfg);
+ if (mime_atom->d.re == NULL) {
+ g_set_error(err, rspamd_mime_expr_quark(), 200,
+ "cannot parse regexp '%s'",
+ mime_atom->str);
+ goto err;
+ }
+ else {
+ /* -1 means that no Lua condition is attached to this regexp */
+ gint lua_cbref = -1;
+
+ /* Check regexp condition */
+ if (real_ud->conf_obj != NULL) {
+ const ucl_object_t *re_conditions = ucl_object_lookup(real_ud->conf_obj,
+ "re_conditions");
+
+ if (re_conditions != NULL) {
+ if (ucl_object_type(re_conditions) != UCL_OBJECT) {
+ g_set_error(err, rspamd_mime_expr_quark(), 320,
+ "re_conditions is not a table for '%s'",
+ mime_atom->str);
+ rspamd_regexp_unref(mime_atom->d.re->regexp);
+ goto err;
+ }
+
+ /* Conditions are keyed by the full atom text */
+ const ucl_object_t *function_obj = ucl_object_lookup(re_conditions,
+ mime_atom->str);
+
+ if (function_obj != NULL) {
+ if (ucl_object_type(function_obj) != UCL_USERDATA) {
+ g_set_error(err, rspamd_mime_expr_quark(), 320,
+ "condition for '%s' is invalid, must be function",
+ mime_atom->str);
+ rspamd_regexp_unref(mime_atom->d.re->regexp);
+ goto err;
+ }
+
+ struct ucl_lua_funcdata *fd = function_obj->value.ud;
+
+ lua_cbref = fd->idx;
+ }
+ }
+ }
+
+ if (lua_cbref != -1) {
+ msg_info_config("added condition for regexp %s", mime_atom->str);
+ /* Add SOM_LEFTMOST_FLAG implicitly */
+ rspamd_regexp_set_flags(mime_atom->d.re->regexp, rspamd_regexp_get_flags(mime_atom->d.re->regexp) |
+ RSPAMD_REGEXP_FLAG_LEFTMOST);
+ }
+
+ /* Register new item in the cache */
+ if (mime_atom->d.re->type == RSPAMD_RE_HEADER ||
+ mime_atom->d.re->type == RSPAMD_RE_RAWHEADER ||
+ mime_atom->d.re->type == RSPAMD_RE_MIMEHEADER) {
+
+ if (mime_atom->d.re->extra.header != NULL) {
+ /* The atom keeps the regexp returned by the cache, not its own */
+ own_re = mime_atom->d.re->regexp;
+ mime_atom->d.re->regexp = rspamd_re_cache_add(cfg->re_cache,
+ mime_atom->d.re->regexp,
+ mime_atom->d.re->type,
+ mime_atom->d.re->extra.header,
+ strlen(mime_atom->d.re->extra.header) + 1,
+ lua_cbref);
+ /* Pass ownership to the cache */
+ rspamd_regexp_unref(own_re);
+ }
+ else {
+ /* We have header regexp, but no header name is detected */
+ g_set_error(err,
+ rspamd_mime_expr_quark(),
+ 200,
+ "no header name in header regexp: '%s'",
+ mime_atom->str);
+ rspamd_regexp_unref(mime_atom->d.re->regexp);
+ goto err;
+ }
+ }
+ else if (mime_atom->d.re->type == RSPAMD_RE_SELECTOR) {
+ if (mime_atom->d.re->extra.selector != NULL) {
+ own_re = mime_atom->d.re->regexp;
+ mime_atom->d.re->regexp = rspamd_re_cache_add(cfg->re_cache,
+ mime_atom->d.re->regexp,
+ mime_atom->d.re->type,
+ mime_atom->d.re->extra.selector,
+ strlen(mime_atom->d.re->extra.selector) + 1,
+ lua_cbref);
+ /* Pass ownership to the cache */
+ rspamd_regexp_unref(own_re);
+ }
+ else {
+ /* We have selector regexp, but no selector name is detected */
+ g_set_error(err,
+ rspamd_mime_expr_quark(),
+ 200,
+ "no selector name in selector regexp: '%s'",
+ mime_atom->str);
+ rspamd_regexp_unref(mime_atom->d.re->regexp);
+ goto err;
+ }
+ }
+ else {
+ /* All remaining regexp types carry no extra type data */
+ own_re = mime_atom->d.re->regexp;
+ mime_atom->d.re->regexp = rspamd_re_cache_add(cfg->re_cache,
+ mime_atom->d.re->regexp,
+ mime_atom->d.re->type,
+ NULL,
+ 0,
+ lua_cbref);
+ /* Pass ownership to the cache */
+ rspamd_regexp_unref(own_re);
+ }
+ }
+ }
+ else if (type == MIME_ATOM_LUA_FUNCTION) {
+ mime_atom->d.lua_function = mime_atom->str;
+
+ /* Only verify that a global function with this name exists */
+ lua_getglobal(cfg->lua_state, mime_atom->str);
+
+ if (lua_type(cfg->lua_state, -1) != LUA_TFUNCTION) {
+ g_set_error(err, rspamd_mime_expr_quark(), 200,
+ "no such lua function '%s'",
+ mime_atom->str);
+ lua_pop(cfg->lua_state, 1);
+
+ goto err;
+ }
+
+ lua_pop(cfg->lua_state, 1);
+ }
+ else if (type == MIME_ATOM_LOCAL_LUA_FUNCTION) {
+ /* c points to the start of the Lua function name (after "lua:"), p to one past its end */
+
+ if (real_ud->conf_obj == NULL) {
+ g_set_error(err, rspamd_mime_expr_quark(), 300,
+ "no config object for '%s'",
+ mime_atom->str);
+ goto err;
+ }
+
+ const ucl_object_t *functions = ucl_object_lookup(real_ud->conf_obj,
+ "functions");
+
+ if (functions == NULL) {
+ g_set_error(err, rspamd_mime_expr_quark(), 310,
+ "no functions defined for '%s'",
+ mime_atom->str);
+ goto err;
+ }
+
+ if (ucl_object_type(functions) != UCL_OBJECT) {
+ g_set_error(err, rspamd_mime_expr_quark(), 320,
+ "functions is not a table for '%s'",
+ mime_atom->str);
+ goto err;
+ }
+
+ const ucl_object_t *function_obj;
+
+ function_obj = ucl_object_lookup_len(functions, c,
+ p - c);
+
+ if (function_obj == NULL) {
+ g_set_error(err, rspamd_mime_expr_quark(), 320,
+ "function %.*s is not found for '%s'",
+ (int) (p - c), c, mime_atom->str);
+ goto err;
+ }
+
+ if (ucl_object_type(function_obj) != UCL_USERDATA) {
+ g_set_error(err, rspamd_mime_expr_quark(), 320,
+ "function %.*s has invalid type for '%s'",
+ (int) (p - c), c, mime_atom->str);
+ goto err;
+ }
+
+ struct ucl_lua_funcdata *fd = function_obj->value.ud;
+
+ /* Remember the callback index stored in the UCL userdata */
+ mime_atom->d.lua_cbref = fd->idx;
+ }
+ else {
+ /* MIME_ATOM_INTERNAL_FUNCTION: split the text into name and arguments */
+ mime_atom->d.func = rspamd_mime_expr_parse_function_atom(pool,
+ mime_atom->str);
+ if (mime_atom->d.func == NULL) {
+ g_set_error(err, rspamd_mime_expr_quark(), 200,
+ "cannot parse function '%s'",
+ mime_atom->str);
+ goto err;
+ }
+ }
+
+ a = rspamd_mempool_alloc0(pool, sizeof(*a));
+ a->len = p - line;
+ a->priority = 0;
+ a->data = mime_atom;
+
+ return a;
+
+err:
+
+ return NULL;
+}
+
+/*
+ * Run a regexp atom against `task` through the regexp cache.
+ *
+ * Header and raw-header regexps pass the header name (and its strlen) as type
+ * data, selector regexps pass the selector name, every other type passes
+ * NULL/0. Returns the value produced by rspamd_re_cache_process(), or 0 when
+ * `re` is NULL. Atoms flagged `is_test` additionally log the outcome.
+ */
+static gint
+rspamd_mime_expr_process_regexp(struct rspamd_regexp_atom *re,
+								struct rspamd_task *task)
+{
+	gint matched;
+
+	if (re == NULL) {
+		msg_info_task("invalid regexp passed");
+		return 0;
+	}
+
+	switch (re->type) {
+	case RSPAMD_RE_HEADER:
+	case RSPAMD_RE_RAWHEADER:
+		matched = rspamd_re_cache_process(task, re->regexp, re->type,
+										  re->extra.header,
+										  strlen(re->extra.header),
+										  re->is_strong);
+		break;
+	case RSPAMD_RE_SELECTOR:
+		matched = rspamd_re_cache_process(task, re->regexp, re->type,
+										  re->extra.selector,
+										  strlen(re->extra.selector),
+										  re->is_strong);
+		break;
+	default:
+		/* No per-type data for the remaining regexp classes */
+		matched = rspamd_re_cache_process(task, re->regexp, re->type,
+										  NULL, 0, re->is_strong);
+		break;
+	}
+
+	if (re->is_test) {
+		msg_info_task("test %s regexp '%s' returned %d",
+					  rspamd_re_cache_type_to_string(re->type),
+					  re->regexp_text, matched);
+	}
+
+	return matched;
+}
+
+
+/*
+ * Compute the evaluation priority of an atom as a fraction of
+ * RSPAMD_EXPRESSION_MAX_PRIORITY. Header regexps get the highest value,
+ * followed by internal functions and url/email/selector regexps, then Lua
+ * functions, then mime regexps; all remaining regexp types (and unknown atom
+ * types) get 0.
+ */
+static gint
+rspamd_mime_expr_priority(rspamd_expression_atom_t *atom)
+{
+	struct rspamd_mime_atom *mime_atom = atom->data;
+
+	if (mime_atom->type == MIME_ATOM_INTERNAL_FUNCTION) {
+		/* Prioritize internal functions slightly */
+		return RSPAMD_EXPRESSION_MAX_PRIORITY - RSPAMD_EXPRESSION_MAX_PRIORITY / 8;
+	}
+
+	if (mime_atom->type == MIME_ATOM_LUA_FUNCTION ||
+		mime_atom->type == MIME_ATOM_LOCAL_LUA_FUNCTION) {
+		return RSPAMD_EXPRESSION_MAX_PRIORITY - RSPAMD_EXPRESSION_MAX_PRIORITY / 4;
+	}
+
+	if (mime_atom->type != MIME_ATOM_REGEXP) {
+		return 0;
+	}
+
+	switch (mime_atom->d.re->type) {
+	case RSPAMD_RE_HEADER:
+	case RSPAMD_RE_RAWHEADER:
+		return RSPAMD_EXPRESSION_MAX_PRIORITY - RSPAMD_EXPRESSION_MAX_PRIORITY / 16;
+	case RSPAMD_RE_URL:
+	case RSPAMD_RE_EMAIL:
+	case RSPAMD_RE_SELECTOR:
+		return RSPAMD_EXPRESSION_MAX_PRIORITY - RSPAMD_EXPRESSION_MAX_PRIORITY / 8;
+	case RSPAMD_RE_MIME:
+	case RSPAMD_RE_RAWMIME:
+		return RSPAMD_EXPRESSION_MAX_PRIORITY - RSPAMD_EXPRESSION_MAX_PRIORITY / 2;
+	case RSPAMD_RE_WORDS:
+	case RSPAMD_RE_RAWWORDS:
+	case RSPAMD_RE_STEMWORDS:
+	default:
+		/* For expensive regexps */
+		return 0;
+	}
+}
+
+/*
+ * Release per-atom resources. Only internal function atoms own something
+ * outside the memory pool: the GArray holding their parsed arguments.
+ */
+static void
+rspamd_mime_expr_destroy(rspamd_expression_atom_t *atom)
+{
+	struct rspamd_mime_atom *mime_atom = atom->data;
+
+	if (mime_atom == NULL || mime_atom->type != MIME_ATOM_INTERNAL_FUNCTION) {
+		return;
+	}
+
+	/* Need to cleanup arguments */
+	g_array_free(mime_atom->d.func->args, TRUE);
+}
+
+/*
+ * Look up an internal function by name in the sorted function list
+ * (`list_ptr`, `functions_number` entries) and invoke it with the atom's
+ * arguments. Returns FALSE when no function with that name is registered.
+ * The `L` argument is currently unused.
+ */
+static gboolean
+rspamd_mime_expr_process_function(struct rspamd_function_atom *func,
+								  struct rspamd_task *task,
+								  lua_State *L)
+{
+	struct _fl needle, *match;
+
+	/* Only the name is used for the search key */
+	needle.name = func->name;
+	match = bsearch(&needle, list_ptr, functions_number,
+					sizeof(struct _fl), fl_cmp);
+
+	if (match != NULL) {
+		return match->func(task, func->args, match->user_data);
+	}
+
+	/* Try to check lua function */
+	return FALSE;
+}
+
+/*
+ * Evaluate a mime atom for the task passed in `ud` and return its result.
+ *
+ *  - regexp atoms are run through rspamd_mime_expr_process_regexp();
+ *  - global Lua function atoms are looked up by name and called with the task;
+ *  - local Lua function atoms are fetched from the registry by their stored
+ *    callback reference and called with the task under a traceback handler;
+ *  - everything else is dispatched as an internal function.
+ *
+ * For Lua atoms a boolean or numeric return value becomes the result; any
+ * other type (or a failed call) is logged and yields 0.
+ */
+static gdouble
+rspamd_mime_expr_process(void *ud, rspamd_expression_atom_t *atom)
+{
+	struct rspamd_task *task = (struct rspamd_task *) ud;
+	struct rspamd_mime_atom *mime_atom;
+	lua_State *L;
+	gdouble ret = 0;
+
+	g_assert(task != NULL);
+	g_assert(atom != NULL);
+
+	mime_atom = atom->data;
+
+	if (mime_atom->type == MIME_ATOM_REGEXP) {
+		ret = rspamd_mime_expr_process_regexp(mime_atom->d.re, task);
+	}
+	else if (mime_atom->type == MIME_ATOM_LUA_FUNCTION) {
+		L = task->cfg->lua_state;
+		lua_getglobal(L, mime_atom->d.lua_function);
+		rspamd_lua_task_push(L, task);
+
+		if (lua_pcall(L, 1, 1, 0) != 0) {
+			msg_info_task("lua call to global function '%s' for atom '%s' failed: %s",
+						  mime_atom->d.lua_function,
+						  mime_atom->str,
+						  lua_tostring(L, -1));
+			lua_pop(L, 1);
+		}
+		else {
+			if (lua_type(L, -1) == LUA_TBOOLEAN) {
+				ret = lua_toboolean(L, -1);
+			}
+			else if (lua_type(L, -1) == LUA_TNUMBER) {
+				/*
+				 * The result sits on top of the stack; absolute index 1 is
+				 * whatever happened to be at the bottom before the call.
+				 */
+				ret = lua_tonumber(L, -1);
+			}
+			else {
+				msg_err_task("%s returned wrong return type: %s",
+							 mime_atom->str, lua_typename(L, lua_type(L, -1)));
+			}
+			/* Remove result */
+			lua_pop(L, 1);
+		}
+	}
+	else if (mime_atom->type == MIME_ATOM_LOCAL_LUA_FUNCTION) {
+		gint err_idx;
+
+		L = task->cfg->lua_state;
+		lua_pushcfunction(L, &rspamd_lua_traceback);
+		err_idx = lua_gettop(L);
+
+		lua_rawgeti(L, LUA_REGISTRYINDEX, mime_atom->d.lua_cbref);
+		rspamd_lua_task_push(L, task);
+
+		if (lua_pcall(L, 1, 1, err_idx) != 0) {
+			msg_info_task("lua call to local function for atom '%s' failed: %s",
+						  mime_atom->str,
+						  lua_tostring(L, -1));
+		}
+		else {
+			if (lua_type(L, -1) == LUA_TBOOLEAN) {
+				ret = lua_toboolean(L, -1);
+			}
+			else if (lua_type(L, -1) == LUA_TNUMBER) {
+				/*
+				 * Read the value that was just type-checked; index 1 may be
+				 * the traceback handler pushed above, not the result.
+				 */
+				ret = lua_tonumber(L, -1);
+			}
+			else {
+				msg_err_task("%s returned wrong return type: %s",
+							 mime_atom->str, lua_typename(L, lua_type(L, -1)));
+			}
+		}
+
+		/* Drops the result/error message and the traceback handler */
+		lua_settop(L, 0);
+	}
+	else {
+		ret = rspamd_mime_expr_process_function(mime_atom->d.func, task,
+												task->cfg->lua_state);
+	}
+
+	return ret;
+}
+
+/*
+ * Register an additional internal expression function.
+ *
+ * A new array one element larger than the current list is allocated, the
+ * existing entries are copied over, the new entry is appended and the array
+ * is re-sorted with fl_cmp so that lookups can use bsearch(). The previous
+ * list is freed only if it was heap-allocated by an earlier registration.
+ */
+void register_expression_function(const gchar *name,
+								  rspamd_internal_func_t func,
+								  void *user_data)
+{
+	static struct _fl *grown;
+	struct _fl *slot;
+
+	functions_number++;
+	grown = g_new(struct _fl, functions_number);
+
+	/* Carry over everything that was registered so far */
+	memcpy(grown, list_ptr, (functions_number - 1) * sizeof(struct _fl));
+
+	if (list_allocated) {
+		g_free(list_ptr);
+	}
+	list_allocated = TRUE;
+
+	slot = &grown[functions_number - 1];
+	slot->name = name;
+	slot->func = func;
+	slot->user_data = user_data;
+
+	qsort(grown, functions_number, sizeof(struct _fl), fl_cmp);
+	list_ptr = grown;
+}
+
+/*
+ * Placeholder for an encoding comparison function: it only validates that the
+ * first argument is a plain string and then always returns TRUE.
+ */
+gboolean
+rspamd_compare_encoding(struct rspamd_task *task, GArray *args, void *unused)
+{
+	struct expression_argument *first;
+
+	if (task == NULL || args == NULL) {
+		return FALSE;
+	}
+
+	first = &g_array_index(args, struct expression_argument, 0);
+
+	if (first == NULL || first->type != EXPRESSION_ARGUMENT_NORMAL) {
+		msg_warn_task("invalid argument to function is passed");
+		return FALSE;
+	}
+
+	/* XXX: really write this function */
+	return TRUE;
+}
+
+/*
+ * Return TRUE if the message has a header named by the first (plain string)
+ * argument, as reported by rspamd_message_get_header_array().
+ */
+gboolean
+rspamd_header_exists(struct rspamd_task *task, GArray *args, void *unused)
+{
+	struct expression_argument *name_arg;
+	struct rspamd_mime_header *hdr;
+
+	if (task == NULL || args == NULL) {
+		return FALSE;
+	}
+
+	name_arg = &g_array_index(args, struct expression_argument, 0);
+
+	if (name_arg == NULL || name_arg->type != EXPRESSION_ARGUMENT_NORMAL) {
+		msg_warn_task("invalid argument to function is passed");
+		return FALSE;
+	}
+
+	hdr = rspamd_message_get_header_array(task,
+										  (gchar *) name_arg->data, FALSE);
+
+	debug_task("try to get header %s: %d", (gchar *) name_arg->data,
+			   (hdr != NULL));
+
+	return hdr != NULL ? TRUE : FALSE;
+}
+
+
+/*
+ * Check the precomputed distance between the text parts of a message.
+ *
+ * The value is read from the "parts_distance" task pool variable and converted
+ * to a percentage: diff = (1.0 - distance) * 100. Arguments:
+ *  - none: the threshold defaults to 100;
+ *  - one:  TRUE is returned when diff <= threshold;
+ *  - two (second one positive): TRUE is returned when diff lies in the
+ *    half-open range [min(t1, t2), max(t1, t2)).
+ * FALSE is returned when the variable is not set or an argument is not a
+ * plain string.
+ */
+gboolean
+rspamd_parts_distance(struct rspamd_task *task, GArray *args, void *unused)
+{
+	gint threshold, threshold2 = -1;
+	struct expression_argument *arg;
+	gdouble *stored, diff;
+
+	if (args == NULL || args->len == 0) {
+		debug_task("no threshold is specified, assume it 100");
+		threshold = 100;
+	}
+	else {
+		errno = 0;
+		arg = &g_array_index(args, struct expression_argument, 0);
+
+		if (arg == NULL || arg->type != EXPRESSION_ARGUMENT_NORMAL) {
+			msg_warn_task("invalid argument to function is passed");
+			return FALSE;
+		}
+
+		threshold = strtoul((gchar *) arg->data, NULL, 10);
+
+		if (errno != 0) {
+			msg_info_task("bad numeric value for threshold \"%s\", assume it 100",
+						  (gchar *) arg->data);
+			threshold = 100;
+		}
+
+		if (args->len >= 2) {
+			arg = &g_array_index(args, struct expression_argument, 1);
+
+			if (arg == NULL || arg->type != EXPRESSION_ARGUMENT_NORMAL) {
+				msg_warn_task("invalid argument to function is passed");
+				return FALSE;
+			}
+
+			errno = 0;
+			threshold2 = strtoul((gchar *) arg->data, NULL, 10);
+
+			if (errno != 0) {
+				msg_info_task("bad numeric value for threshold \"%s\", ignore it",
+							  (gchar *) arg->data);
+				threshold2 = -1;
+			}
+		}
+	}
+
+	stored = rspamd_mempool_get_variable(task->task_pool, "parts_distance");
+
+	if (stored == NULL) {
+		return FALSE;
+	}
+
+	diff = (1.0 - (*stored)) * 100.0;
+
+	if (diff == -1) {
+		return FALSE;
+	}
+
+	if (threshold2 > 0) {
+		/* Range check between the two thresholds, upper bound excluded */
+		if (diff >= MIN(threshold, threshold2) &&
+			diff < MAX(threshold, threshold2)) {
+			return TRUE;
+		}
+
+		return FALSE;
+	}
+
+	if (diff <= threshold) {
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+/*
+ * Scratch record used by rspamd_recipients_distance(): one entry per
+ * recipient, pointing into the recipient's address data (nothing is copied).
+ */
+struct addr_list {
+ /* Full address (filled from the recipient's `addr`) */
+ const gchar *name;
+ /* Length of `name` in bytes */
+ guint namelen;
+ /* Domain part (filled from the recipient's `domain`); used as the sort key */
+ const gchar *addr;
+ /* Length of `addr` in bytes */
+ guint addrlen;
+};
+
+/*
+ * qsort() comparator for struct addr_list: orders entries by the length of
+ * their `addr` field first and by its bytes (memcmp) when the lengths match.
+ */
+static gint
+addr_list_cmp_func(const void *a, const void *b)
+{
+	const struct addr_list *lhs = (struct addr_list *) a;
+	const struct addr_list *rhs = (struct addr_list *) b;
+
+	if (lhs->addrlen == rhs->addrlen) {
+		return memcmp(lhs->addr, rhs->addr, lhs->addrlen);
+	}
+
+	return lhs->addrlen - rhs->addrlen;
+}
+
+/*
+ * Number of leading address characters compared between neighbouring
+ * recipients; addresses that are not longer than this are ignored.
+ */
+#define COMPARE_RCPT_LEN 3
+/* Recipient lists with fewer entries than this are not analysed */
+#define MIN_RCPT_TO_COMPARE 7
+
+/*
+ * Detect recipient lists that contain many look-alike addresses.
+ *
+ * The first argument is a numeric threshold. The function needs at least
+ * MIN_RCPT_TO_COMPARE mime recipients; it collects every address longer than
+ * COMPARE_RCPT_LEN, sorts the entries by domain and counts neighbouring
+ * entries whose addresses have the same length and whose first
+ * COMPARE_RCPT_LEN characters compare equal with rspamd_lc_cmp(). TRUE is
+ * returned when (hits * num / 2) / num reaches the threshold.
+ */
+gboolean
+rspamd_recipients_distance(struct rspamd_task *task, GArray *args,
+						   void *unused)
+{
+	struct expression_argument *arg;
+	struct rspamd_email_address *cur;
+	double threshold;
+	struct addr_list *entries;
+	gint num, i, hits = 0;
+
+	if (args == NULL) {
+		msg_warn_task("no parameters to function");
+		return FALSE;
+	}
+
+	arg = &g_array_index(args, struct expression_argument, 0);
+
+	if (arg == NULL || arg->type != EXPRESSION_ARGUMENT_NORMAL) {
+		msg_warn_task("invalid argument to function is passed");
+		return FALSE;
+	}
+
+	errno = 0;
+	threshold = strtod((gchar *) arg->data, NULL);
+
+	if (errno != 0) {
+		msg_warn_task("invalid numeric value '%s': %s",
+					  (gchar *) arg->data,
+					  strerror(errno));
+		return FALSE;
+	}
+
+	if (!MESSAGE_FIELD(task, rcpt_mime)) {
+		return FALSE;
+	}
+
+	num = MESSAGE_FIELD(task, rcpt_mime)->len;
+
+	if (num < MIN_RCPT_TO_COMPARE) {
+		return FALSE;
+	}
+
+	entries = rspamd_mempool_alloc0(task->task_pool,
+									num * sizeof(struct addr_list));
+
+	/* Fill array, from here on `num` counts only the accepted addresses */
+	num = 0;
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, rcpt_mime), i, cur)
+	{
+		if (cur->addr_len <= COMPARE_RCPT_LEN) {
+			continue;
+		}
+
+		entries[num].name = cur->addr;
+		entries[num].namelen = cur->addr_len;
+		entries[num].addr = cur->domain;
+		entries[num].addrlen = cur->domain_len;
+		num++;
+	}
+
+	qsort(entries, num, sizeof(*entries), addr_list_cmp_func);
+
+	/* Compare every element with its right-hand neighbour */
+	for (i = 0; i + 1 < num; i++) {
+		if (entries[i].namelen == entries[i + 1].namelen &&
+			rspamd_lc_cmp(entries[i].name, entries[i + 1].name,
+						  COMPARE_RCPT_LEN) == 0) {
+			hits++;
+		}
+	}
+
+	if ((hits * num / 2.) / (double) num >= threshold) {
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+/*
+ * Return TRUE when the message has at least one non-attachment HTML text part
+ * and no non-attachment text part of any other kind.
+ */
+gboolean
+rspamd_has_only_html_part(struct rspamd_task *task, GArray *args,
+						  void *unused)
+{
+	struct rspamd_mime_text_part *p;
+	guint i, html_parts = 0, other_parts = 0;
+
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, p)
+	{
+		if (IS_TEXT_PART_ATTACHMENT(p)) {
+			/* Attachments do not count either way */
+			continue;
+		}
+
+		if (IS_TEXT_PART_HTML(p)) {
+			html_parts++;
+		}
+		else {
+			other_parts++;
+		}
+	}
+
+	return (html_parts > 0 && other_parts == 0);
+}
+
+/*
+ * Check whether the addresses in `ar` are in strictly ascending order
+ * according to rspamd_ftok_casecmp(). Lists that are NULL or shorter than
+ * MIN_RCPT_TO_COMPARE are never reported as sorted. An entry with a
+ * zero-length address is not compared against its successor.
+ */
+static gboolean
+is_recipient_list_sorted(GPtrArray *ar)
+{
+	struct rspamd_email_address *addr;
+	rspamd_ftok_t cur, prev;
+	gint i;
+
+	/* Do not check too short address lists */
+	if (ar == NULL || ar->len < MIN_RCPT_TO_COMPARE) {
+		return FALSE;
+	}
+
+	prev.begin = NULL;
+	prev.len = 0;
+
+	PTR_ARRAY_FOREACH(ar, i, addr)
+	{
+		cur.begin = addr->addr;
+		cur.len = addr->addr_len;
+
+		/* Equal or descending neighbours break the ordering */
+		if (prev.len != 0 && rspamd_ftok_casecmp(&cur, &prev) <= 0) {
+			return FALSE;
+		}
+
+		prev = cur;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Expression function: TRUE when the mime recipients of the message form a
+ * sorted list as defined by is_recipient_list_sorted(). Arguments are ignored.
+ */
+gboolean
+rspamd_is_recipients_sorted(struct rspamd_task *task,
+							GArray *args,
+							void *unused)
+{
+	/* Check all types of addresses */
+
+	if (!MESSAGE_FIELD(task, rcpt_mime)) {
+		return FALSE;
+	}
+
+	return is_recipient_list_sorted(MESSAGE_FIELD(task, rcpt_mime));
+}
+
+/*
+ * Return TRUE if any text part of the message uses the content transfer
+ * encoding named by the first (plain string) argument. Unknown encoding names
+ * are logged and yield FALSE.
+ */
+gboolean
+rspamd_compare_transfer_encoding(struct rspamd_task *task,
+								 GArray *args,
+								 void *unused)
+{
+	struct expression_argument *arg;
+	struct rspamd_mime_part *part;
+	enum rspamd_cte wanted;
+	guint i;
+
+	if (args == NULL) {
+		msg_warn_task("no parameters to function");
+		return FALSE;
+	}
+
+	arg = &g_array_index(args, struct expression_argument, 0);
+
+	if (arg == NULL || arg->type != EXPRESSION_ARGUMENT_NORMAL) {
+		msg_warn_task("invalid argument to function is passed");
+		return FALSE;
+	}
+
+	wanted = rspamd_cte_from_string(arg->data);
+
+	if (wanted == RSPAMD_CTE_UNKNOWN) {
+		msg_warn_task("unknown cte: %s", arg->data);
+		return FALSE;
+	}
+
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part)
+	{
+		if (IS_PART_TEXT(part) && part->cte == wanted) {
+			return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
+/*
+ * Stub kept for compatibility: performs no check at all and unconditionally
+ * reports the HTML as balanced.
+ */
+gboolean
+rspamd_is_html_balanced(struct rspamd_task *task, GArray *args, void *unused)
+{
+	const gboolean balanced = TRUE;
+
+	/* Totally broken but seems to be never used */
+	return balanced;
+}
+
+/*
+ * Return a true value if any HTML text part with parsed HTML content has seen
+ * the tag named by the first (plain string) argument, as reported by
+ * rspamd_html_tag_seen().
+ */
+gboolean
+rspamd_has_html_tag(struct rspamd_task *task, GArray *args, void *unused)
+{
+	struct rspamd_mime_text_part *p;
+	struct expression_argument *tag_arg;
+	gboolean seen;
+	guint i;
+
+	if (args == NULL) {
+		msg_warn_task("no parameters to function");
+		return FALSE;
+	}
+
+	tag_arg = &g_array_index(args, struct expression_argument, 0);
+
+	if (tag_arg == NULL || tag_arg->type != EXPRESSION_ARGUMENT_NORMAL) {
+		msg_warn_task("invalid argument to function is passed");
+		return FALSE;
+	}
+
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, p)
+	{
+		if (!IS_TEXT_PART_HTML(p) || !p->html) {
+			continue;
+		}
+
+		seen = rspamd_html_tag_seen(p->html, tag_arg->data);
+
+		if (seen) {
+			/* The first part that has the tag decides the outcome */
+			return seen;
+		}
+	}
+
+	return FALSE;
+}
+
+/*
+ * Return TRUE if some text part is marked as HTML but
+ * rspamd_html_get_tags_count() reports fewer than two tags for it.
+ *
+ * NOTE(review): `p->html` is passed on without a NULL check here, unlike in
+ * rspamd_has_html_tag(); presumably the callee copes with NULL - confirm.
+ */
+gboolean
+rspamd_has_fake_html(struct rspamd_task *task, GArray *args, void *unused)
+{
+	struct rspamd_mime_text_part *p;
+	guint i;
+
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, p)
+	{
+		if (IS_TEXT_PART_HTML(p) && (rspamd_html_get_tags_count(p->html) < 2)) {
+			return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
+/*
+ * Return TRUE if rspamd_message_get_header_array() finds a header named by the
+ * first (plain string) argument.
+ */
+static gboolean
+rspamd_raw_header_exists(struct rspamd_task *task, GArray *args, void *unused)
+{
+	struct expression_argument *name_arg;
+
+	if (task == NULL || args == NULL) {
+		return FALSE;
+	}
+
+	name_arg = &g_array_index(args, struct expression_argument, 0);
+
+	if (name_arg == NULL || name_arg->type != EXPRESSION_ARGUMENT_NORMAL) {
+		msg_warn_task("invalid argument to function is passed");
+		return FALSE;
+	}
+
+	return rspamd_message_get_header_array(task, name_arg->data, FALSE) != NULL;
+}
+
+/*
+ * Match one piece of SMTP data against an expression argument.
+ *
+ * `what`/`len` describe the value to test (it does not need to be
+ * NUL-terminated). A regexp argument is searched in the value (an empty value
+ * never matches). A plain string argument matches only when it is equal to the
+ * whole value, ignoring ASCII case.
+ */
+static gboolean
+match_smtp_data(struct rspamd_task *task,
+				struct expression_argument *arg,
+				const gchar *what, gsize len)
+{
+	if (arg->type == EXPRESSION_ARGUMENT_REGEXP) {
+		/* This is a regexp */
+		rspamd_regexp_t *re = arg->data;
+
+		if (re == NULL) {
+			msg_warn_task("cannot compile regexp for function");
+			return FALSE;
+		}
+
+		if (len == 0) {
+			return FALSE;
+		}
+
+		return rspamd_regexp_search(re, what, len, NULL, NULL, FALSE, NULL);
+	}
+
+	if (arg->type == EXPRESSION_ARGUMENT_NORMAL && arg->data != NULL) {
+		const gchar *pattern = arg->data;
+
+		/*
+		 * Comparing just the first `len` bytes would accept any pattern that
+		 * merely starts with the value (and everything for an empty value),
+		 * so the lengths have to agree as well.
+		 */
+		if (strlen(pattern) == len &&
+			g_ascii_strncasecmp(pattern, what, len) == 0) {
+			return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
+/*
+ * Expression function that matches SMTP-level data against a pattern.
+ *
+ * The first argument selects the data (case-insensitive): "from" (task
+ * sender), "helo", "user" (authenticated user), "subject" or "rcpt" (envelope
+ * recipients). The second argument is the pattern handed to
+ * match_smtp_data(); further arguments are ignored. For "rcpt" the function
+ * succeeds as soon as one recipient matches. FALSE is returned when the
+ * selected data is absent or no pattern is given.
+ */
+static gboolean
+rspamd_check_smtp_data(struct rspamd_task *task, GArray *args, void *unused)
+{
+	struct expression_argument *arg;
+	struct rspamd_email_address *addr = NULL;
+	GPtrArray *rcpts = NULL;
+	const gchar *type, *str = NULL;
+	guint i;
+
+	if (args == NULL) {
+		msg_warn_task("no parameters to function");
+		return FALSE;
+	}
+
+	arg = &g_array_index(args, struct expression_argument, 0);
+
+	if (arg == NULL || arg->data == NULL ||
+		arg->type != EXPRESSION_ARGUMENT_NORMAL) {
+		msg_warn_task("no parameters to function");
+		return FALSE;
+	}
+
+	type = arg->data;
+
+	/* Select the data source by its (case-insensitive) name */
+	if (g_ascii_strcasecmp(type, "from") == 0) {
+		addr = rspamd_task_get_sender(task);
+	}
+	else if (g_ascii_strcasecmp(type, "helo") == 0) {
+		str = task->helo;
+	}
+	else if (g_ascii_strcasecmp(type, "user") == 0) {
+		str = task->auth_user;
+	}
+	else if (g_ascii_strcasecmp(type, "subject") == 0) {
+		str = MESSAGE_FIELD(task, subject);
+	}
+	else if (g_ascii_strcasecmp(type, "rcpt") == 0) {
+		rcpts = task->rcpt_envelope;
+	}
+	else {
+		msg_warn_task("bad argument to function: %s", type);
+		return FALSE;
+	}
+
+	if (str == NULL && addr == NULL && rcpts == NULL) {
+		/* Not enough data so regexp would NOT be found anyway */
+		return FALSE;
+	}
+
+	/* We would process only one more argument, others are ignored */
+	if (args->len < 2) {
+		return FALSE;
+	}
+
+	arg = &g_array_index(args, struct expression_argument, 1);
+
+	if (arg == NULL) {
+		return FALSE;
+	}
+
+	if (str != NULL) {
+		return match_smtp_data(task, arg, str, strlen(str));
+	}
+
+	if (addr != NULL && addr->addr) {
+		return match_smtp_data(task, arg, addr->addr, addr->addr_len);
+	}
+
+	if (rcpts != NULL) {
+		for (i = 0; i < rcpts->len; i++) {
+			addr = g_ptr_array_index(rcpts, i);
+
+			if (addr && addr->addr &&
+				match_smtp_data(task, arg, addr->addr, addr->addr_len)) {
+				return TRUE;
+			}
+		}
+	}
+
+	return FALSE;
+}
+
+/*
+ * Test a content-type attribute value (`begin`, `len`) against a pattern
+ * argument: a regexp argument is searched in the value (an empty value never
+ * matches); any other argument must equal the whole value, ignoring ASCII
+ * case.
+ */
+static inline gboolean
+rspamd_check_ct_attr(const gchar *begin, gsize len,
+					 struct expression_argument *arg_pattern)
+{
+	if (arg_pattern->type == EXPRESSION_ARGUMENT_REGEXP) {
+		rspamd_regexp_t *re = arg_pattern->data;
+
+		if (len > 0 &&
+			rspamd_regexp_search(re, begin, len, NULL, NULL, FALSE, NULL)) {
+			return TRUE;
+		}
+
+		return FALSE;
+	}
+
+	/* Just do strcasecmp */
+	if (strlen(arg_pattern->data) != len) {
+		return FALSE;
+	}
+
+	return g_ascii_strncasecmp(arg_pattern->data, begin, len) == 0 ? TRUE : FALSE;
+}
+
+/*
+ * Expression function: compare a Content-Type parameter with a pattern.
+ *
+ * Arguments: parameter name (plain string, asserted), pattern (string or
+ * regexp) and an optional third argument that enables scanning of all parts
+ * when it starts with "true". Without the third argument all parts are
+ * scanned once a multipart part has been seen. "charset" and "boundary" are
+ * checked against the dedicated content-type fields, and every name is also
+ * looked up in the generic attribute table. Parts that carry no content type
+ * are skipped.
+ */
+static gboolean
+rspamd_content_type_compare_param(struct rspamd_task *task,
+								  GArray *args,
+								  void *unused)
+{
+
+	struct expression_argument *arg, *arg1, *arg_pattern;
+	gboolean recursive = FALSE;
+	struct rspamd_mime_part *cur_part;
+	struct rspamd_content_type *ct;
+	guint i;
+	rspamd_ftok_t srch, lit;
+	struct rspamd_content_type_param *found = NULL, *cur;
+	const gchar *param_name;
+
+	if (args == NULL || args->len < 2) {
+		msg_warn_task("no parameters to function");
+		return FALSE;
+	}
+
+	arg = &g_array_index(args, struct expression_argument, 0);
+	g_assert(arg->type == EXPRESSION_ARGUMENT_NORMAL);
+	param_name = arg->data;
+	arg_pattern = &g_array_index(args, struct expression_argument, 1);
+
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, cur_part)
+	{
+		if (args->len >= 3) {
+			arg1 = &g_array_index(args, struct expression_argument, 2);
+			if (g_ascii_strncasecmp(arg1->data, "true",
+									sizeof("true") - 1) == 0) {
+				recursive = TRUE;
+			}
+		}
+		else {
+			/*
+			 * If user did not specify argument, let's assume that he wants
+			 * recursive search if mime part is multipart/mixed
+			 */
+			if (IS_PART_MULTIPART(cur_part)) {
+				recursive = TRUE;
+			}
+		}
+
+		ct = cur_part->ct;
+
+		/* A part without a content type has nothing to compare */
+		if (ct != NULL) {
+			RSPAMD_FTOK_FROM_STR(&srch, param_name);
+			RSPAMD_FTOK_FROM_STR(&lit, "charset");
+
+			if (rspamd_ftok_equal(&srch, &lit)) {
+				if (rspamd_check_ct_attr(ct->charset.begin,
+										 ct->charset.len, arg_pattern)) {
+					return TRUE;
+				}
+			}
+
+			RSPAMD_FTOK_FROM_STR(&lit, "boundary");
+			if (rspamd_ftok_equal(&srch, &lit)) {
+				if (rspamd_check_ct_attr(ct->orig_boundary.begin,
+										 ct->orig_boundary.len, arg_pattern)) {
+					return TRUE;
+				}
+			}
+
+			if (ct->attrs) {
+				found = g_hash_table_lookup(ct->attrs, &srch);
+
+				if (found) {
+					/* An attribute may occur several times, check each value */
+					DL_FOREACH(found, cur)
+					{
+						if (rspamd_check_ct_attr(cur->value.begin,
+												 cur->value.len, arg_pattern)) {
+							return TRUE;
+						}
+					}
+				}
+			}
+		}
+
+		if (!recursive) {
+			break;
+		}
+	}
+
+	return FALSE;
+}
+
+/*
+ * Expression function: check whether a Content-Type parameter is present.
+ *
+ * Arguments: parameter name (plain string, asserted) and an optional second
+ * argument that enables scanning of all parts when it starts with "true".
+ * Without it all parts are scanned once a multipart part has been seen.
+ * "charset" and "boundary" are answered from the dedicated content-type
+ * fields, and every name is also looked up in the generic attribute table.
+ * Parts that carry no content type are skipped.
+ */
+static gboolean
+rspamd_content_type_has_param(struct rspamd_task *task,
+							  GArray *args,
+							  void *unused)
+{
+	struct expression_argument *arg, *arg1;
+	gboolean recursive = FALSE;
+	struct rspamd_mime_part *cur_part;
+	struct rspamd_content_type *ct;
+	guint i;
+	rspamd_ftok_t srch, lit;
+	struct rspamd_content_type_param *found = NULL;
+	const gchar *param_name;
+
+	if (args == NULL || args->len < 1) {
+		msg_warn_task("no parameters to function");
+		return FALSE;
+	}
+
+	arg = &g_array_index(args, struct expression_argument, 0);
+	g_assert(arg->type == EXPRESSION_ARGUMENT_NORMAL);
+	param_name = arg->data;
+
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, cur_part)
+	{
+		if (args->len >= 2) {
+			arg1 = &g_array_index(args, struct expression_argument, 1);
+			if (g_ascii_strncasecmp(arg1->data, "true",
+									sizeof("true") - 1) == 0) {
+				recursive = TRUE;
+			}
+		}
+		else {
+			/*
+			 * If user did not specify argument, let's assume that he wants
+			 * recursive search if mime part is multipart/mixed
+			 */
+			if (IS_PART_MULTIPART(cur_part)) {
+				recursive = TRUE;
+			}
+		}
+
+		ct = cur_part->ct;
+
+		/* A part without a content type cannot have any parameter */
+		if (ct != NULL) {
+			RSPAMD_FTOK_FROM_STR(&srch, param_name);
+			RSPAMD_FTOK_FROM_STR(&lit, "charset");
+
+			if (rspamd_ftok_equal(&srch, &lit)) {
+				if (ct->charset.len > 0) {
+					return TRUE;
+				}
+			}
+
+			RSPAMD_FTOK_FROM_STR(&lit, "boundary");
+			if (rspamd_ftok_equal(&srch, &lit)) {
+				if (ct->boundary.len > 0) {
+					return TRUE;
+				}
+			}
+
+			if (ct->attrs) {
+				found = g_hash_table_lookup(ct->attrs, &srch);
+
+				if (found) {
+					return TRUE;
+				}
+			}
+		}
+
+		if (!recursive) {
+			break;
+		}
+	}
+
+	return FALSE;
+}
+
+/*
+ * Shared implementation of the content type / subtype checks.
+ *
+ * The first argument is the pattern (regexp or plain string, the latter
+ * compared case-insensitively with rspamd_ftok_casecmp()); an optional second
+ * argument enables scanning of all parts when it starts with "true". Without
+ * it all parts are scanned once a multipart part has been seen.
+ * `check_subtype` selects whether the subtype or the main type of each part
+ * is tested. Parts that carry no content type are skipped.
+ */
+static gboolean
+rspamd_content_type_check(struct rspamd_task *task,
+						  GArray *args,
+						  gboolean check_subtype)
+{
+	rspamd_ftok_t *param_data, srch;
+	rspamd_regexp_t *re;
+	struct expression_argument *arg1, *arg_pattern;
+	struct rspamd_content_type *ct;
+	gint r = 0;
+	guint i;
+	gboolean recursive = FALSE;
+	struct rspamd_mime_part *cur_part;
+
+	if (args == NULL || args->len < 1) {
+		msg_warn_task("no parameters to function");
+		return FALSE;
+	}
+
+	arg_pattern = &g_array_index(args, struct expression_argument, 0);
+
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, cur_part)
+	{
+		ct = cur_part->ct;
+
+		if (args->len >= 2) {
+			arg1 = &g_array_index(args, struct expression_argument, 1);
+			if (g_ascii_strncasecmp(arg1->data, "true",
+									sizeof("true") - 1) == 0) {
+				recursive = TRUE;
+			}
+		}
+		else {
+			/*
+			 * If user did not specify argument, let's assume that he wants
+			 * recursive search if mime part is multipart/mixed
+			 */
+			if (IS_PART_MULTIPART(cur_part)) {
+				recursive = TRUE;
+			}
+		}
+
+		/* A part without a content type has neither type nor subtype */
+		if (ct != NULL) {
+			if (check_subtype) {
+				param_data = &ct->subtype;
+			}
+			else {
+				param_data = &ct->type;
+			}
+
+			if (arg_pattern->type == EXPRESSION_ARGUMENT_REGEXP) {
+				re = arg_pattern->data;
+
+				if (param_data->len > 0) {
+					r = rspamd_regexp_search(re, param_data->begin, param_data->len,
+											 NULL, NULL, FALSE, NULL);
+				}
+
+				if (r) {
+					return TRUE;
+				}
+			}
+			else {
+				/* Just do strcasecmp */
+				srch.begin = arg_pattern->data;
+				srch.len = strlen(arg_pattern->data);
+
+				if (rspamd_ftok_casecmp(param_data, &srch) == 0) {
+					return TRUE;
+				}
+			}
+		}
+
+		/* Get next part */
+		if (!recursive) {
+			break;
+		}
+	}
+
+	return FALSE;
+}
+
+/*
+ * Expression function: match the main content type of the message parts
+ * against the first argument (see rspamd_content_type_check()).
+ */
+static gboolean
+rspamd_content_type_is_type(struct rspamd_task *task,
+							GArray *args,
+							void *unused)
+{
+	const gboolean check_subtype = FALSE;
+
+	return rspamd_content_type_check(task, args, check_subtype);
+}
+
+/*
+ * Expression function: match the content subtype of the message parts
+ * against the first argument (see rspamd_content_type_check()).
+ */
+static gboolean
+rspamd_content_type_is_subtype(struct rspamd_task *task,
+							   GArray *args,
+							   void *unused)
+{
+	const gboolean check_subtype = TRUE;
+
+	return rspamd_content_type_check(task, args, check_subtype);
+}
+
+/*
+ * Match the subtype of a content type against an argument: a regexp argument
+ * is searched in the subtype (an empty subtype never matches), a plain string
+ * is compared with rspamd_ftok_casecmp(). NULL inputs are logged and yield
+ * FALSE.
+ */
+static gboolean
+compare_subtype(struct rspamd_task *task, struct rspamd_content_type *ct,
+				struct expression_argument *subtype)
+{
+	rspamd_ftok_t wanted;
+	gint found = 0;
+
+	if (subtype == NULL || ct == NULL) {
+		msg_warn_task("invalid parameters passed");
+		return FALSE;
+	}
+
+	if (subtype->type != EXPRESSION_ARGUMENT_REGEXP) {
+		/* Just do strcasecmp */
+		wanted.begin = subtype->data;
+		wanted.len = strlen(subtype->data);
+
+		return rspamd_ftok_casecmp(&ct->subtype, &wanted) == 0 ? TRUE : FALSE;
+	}
+
+	if (ct->subtype.len > 0) {
+		rspamd_regexp_t *re = subtype->data;
+
+		found = rspamd_regexp_search(re, ct->subtype.begin, ct->subtype.len,
+									 NULL, NULL, FALSE, NULL);
+	}
+
+	return found;
+}
+
+/*
+ * Check that the parsed data length of `part` lies within [min, max].
+ * A bound equal to 0 is treated as "no bound"; with both bounds 0 every part
+ * passes.
+ */
+static gboolean
+compare_len(struct rspamd_mime_part *part, guint min, guint max)
+{
+	gboolean lower_ok = (min == 0) || (part->parsed_data.len >= min);
+	gboolean upper_ok = (max == 0) || (part->parsed_data.len <= max);
+
+	return lower_ok && upper_ok;
+}
+
+/*
+ * Return TRUE if the message has at least one MIME part whose content type
+ * matches `param_type`, whose subtype matches `param_subtype` (when it is not
+ * NULL) and whose parsed length satisfies compare_len(min_len, max_len).
+ *
+ * `param_type` may be a regexp (searched in the type token) or a plain string
+ * (compared with rspamd_ftok_casecmp()). Parts without a parsed content type
+ * are skipped.
+ *
+ * Both kinds of type argument follow the same rules: every part is examined
+ * until one satisfies all conditions. The match state is computed per part,
+ * so a regexp match on one part can no longer leak into the next one, and a
+ * regexp type without a subtype argument can match.
+ */
+static gboolean
+common_has_content_part(struct rspamd_task *task,
+						struct expression_argument *param_type,
+						struct expression_argument *param_subtype,
+						gint min_len,
+						gint max_len)
+{
+	struct rspamd_mime_part *part;
+	guint i;
+
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part)
+	{
+		struct rspamd_content_type *ct = part->ct;
+		gboolean type_matched = FALSE;
+
+		if (ct == NULL) {
+			continue;
+		}
+
+		if (param_type->type == EXPRESSION_ARGUMENT_REGEXP) {
+			rspamd_regexp_t *re = param_type->data;
+
+			/* An empty type token never matches a regexp */
+			if (ct->type.len > 0) {
+				type_matched = rspamd_regexp_search(re, ct->type.begin,
+													ct->type.len,
+													NULL, NULL, FALSE, NULL);
+			}
+		}
+		else {
+			/* Just do strcasecmp */
+			rspamd_ftok_t srch;
+
+			srch.begin = param_type->data;
+			srch.len = strlen(param_type->data);
+			type_matched = (rspamd_ftok_casecmp(&ct->type, &srch) == 0);
+		}
+
+		if (!type_matched) {
+			continue;
+		}
+
+		/* Also check subtype (if requested) and length of the part */
+		if (param_subtype != NULL &&
+			!compare_subtype(task, ct, param_subtype)) {
+			continue;
+		}
+
+		if (compare_len(part, min_len, max_len)) {
+			return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
+/*
+ * Expression function: has_content_part(type[, subtype]).
+ *
+ * args[0] is the content type pattern, optional args[1] the subtype pattern.
+ * Returns the result of common_has_content_part() without length limits, or
+ * FALSE (with a warning) when no arguments were supplied.
+ */
+static gboolean
+rspamd_has_content_part(struct rspamd_task *task, GArray *args, void *unused)
+{
+	struct expression_argument *param_type = NULL, *param_subtype = NULL;
+
+	/* An empty array must be rejected too: element 0 is read below */
+	if (args == NULL || args->len < 1) {
+		msg_warn_task("no parameters to function");
+		return FALSE;
+	}
+
+	param_type = &g_array_index(args, struct expression_argument, 0);
+	if (args->len >= 2) {
+		param_subtype = &g_array_index(args, struct expression_argument, 1);
+	}
+
+	return common_has_content_part(task, param_type, param_subtype, 0, 0);
+}
+
+/*
+ * Parse a non-negative decimal length limit from expression argument `arg`.
+ *
+ * On success stores the value in `*out` and returns TRUE. Logs a warning and
+ * returns FALSE if the argument is not a plain string, contains no digits,
+ * or does not fit into a gint.
+ */
+static gboolean
+rspamd_has_content_part_len_parse(struct rspamd_task *task,
+								  struct expression_argument *arg,
+								  gint *out)
+{
+	gchar *endptr = NULL;
+	gulong val;
+
+	/* The type must be checked first: only plain arguments hold a string */
+	if (arg->type != EXPRESSION_ARGUMENT_NORMAL) {
+		msg_warn_task("invalid parameter to function");
+		return FALSE;
+	}
+
+	errno = 0;
+	val = strtoul(arg->data, &endptr, 10);
+
+	if (errno == 0) {
+		if (endptr == (gchar *) arg->data) {
+			/* strtoul is not required to set errno when nothing is parsed */
+			errno = EINVAL;
+		}
+		else if (val > (gulong) G_MAXINT) {
+			/* Also catches negative input, which strtoul wraps around */
+			errno = ERANGE;
+		}
+	}
+
+	if (errno != 0) {
+		msg_warn_task("invalid numeric value '%s': %s",
+					  (gchar *) arg->data,
+					  strerror(errno));
+		return FALSE;
+	}
+
+	*out = (gint) val;
+
+	return TRUE;
+}
+
+/*
+ * Expression function: has_content_part_len(type[, subtype[, min[, max]]]).
+ *
+ * args[0] is the content type pattern, args[1] the subtype pattern, args[2]
+ * and args[3] the minimum and maximum part length (0 means "no limit").
+ * Returns FALSE with a warning when no arguments are given or a length
+ * argument is invalid; otherwise the result of common_has_content_part().
+ */
+static gboolean
+rspamd_has_content_part_len(struct rspamd_task *task,
+							GArray *args,
+							void *unused)
+{
+	struct expression_argument *param_type = NULL, *param_subtype = NULL;
+	gint min = 0, max = 0;
+
+	/* An empty array must be rejected too: element 0 is read below */
+	if (args == NULL || args->len < 1) {
+		msg_warn_task("no parameters to function");
+		return FALSE;
+	}
+
+	param_type = &g_array_index(args, struct expression_argument, 0);
+
+	if (args->len >= 2) {
+		param_subtype = &g_array_index(args, struct expression_argument, 1);
+	}
+
+	if (args->len >= 3) {
+		if (!rspamd_has_content_part_len_parse(task,
+				&g_array_index(args, struct expression_argument, 2), &min)) {
+			return FALSE;
+		}
+	}
+
+	if (args->len >= 4) {
+		if (!rspamd_has_content_part_len_parse(task,
+				&g_array_index(args, struct expression_argument, 3), &max)) {
+			return FALSE;
+		}
+	}
+
+	return common_has_content_part(task, param_type, param_subtype, min, max);
+}
+
+/*
+ * Expression function: TRUE when no MIME part of the message has any parsed
+ * data (this includes a message without parts). Arguments are ignored.
+ */
+static gboolean
+rspamd_is_empty_body(struct rspamd_task *task,
+					 GArray *args,
+					 void *unused)
+{
+	struct rspamd_mime_part *cur;
+	guint idx;
+	gboolean empty = TRUE;
+
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), idx, cur)
+	{
+		if (cur->parsed_data.len > 0) {
+			/* One non-empty part is enough to decide */
+			empty = FALSE;
+			break;
+		}
+	}
+
+	return empty;
+}
+
+/*
+ * Helper macros for rspamd_has_flag_expr(). They expect the local variables
+ * `task`, `found` and `result` to be in scope at the expansion site.
+ *
+ * TASK_FLAG_READ / TASK_PROTOCOL_FLAG_READ store in `result` whether the
+ * given bit is set in task->flags / task->protocol_flags.
+ *
+ * TASK_GET_FLAG / TASK_GET_PROTOCOL_FLAG compare the string `flag` with
+ * `strname`; on the first match (while `found` is still FALSE) they read the
+ * bit `macro` into `result` and set `found`.
+ */
+#define TASK_FLAG_READ(flag) \
+ do { \
+ result = !!(task->flags & (flag)); \
+ } while (0)
+
+#define TASK_GET_FLAG(flag, strname, macro) \
+ do { \
+ if (!found && strcmp((flag), strname) == 0) { \
+ TASK_FLAG_READ((macro)); \
+ found = TRUE; \
+ } \
+ } while (0)
+
+#define TASK_PROTOCOL_FLAG_READ(flag) \
+ do { \
+ result = !!(task->protocol_flags & (flag)); \
+ } while (0)
+
+#define TASK_GET_PROTOCOL_FLAG(flag, strname, macro) \
+ do { \
+ if (!found && strcmp((flag), strname) == 0) { \
+ TASK_PROTOCOL_FLAG_READ((macro)); \
+ found = TRUE; \
+ } \
+ } while (0)
+
+
+/*
+ * Expression function: has_flag(name).
+ *
+ * args[0] must be a plain string naming a task flag (see the list below).
+ * Returns whether that flag is set on the task. Returns FALSE with a warning
+ * when the argument is missing, is not a plain string, or names an unknown
+ * flag.
+ */
+static gboolean
+rspamd_has_flag_expr(struct rspamd_task *task,
+					 GArray *args,
+					 void *unused)
+{
+	gboolean found = FALSE, result = FALSE;
+	struct expression_argument *flag_arg;
+	const gchar *flag_str;
+
+	/* An empty array must be rejected too: element 0 is read below */
+	if (args == NULL || args->len < 1) {
+		msg_warn_task("no parameters to function");
+		return FALSE;
+	}
+
+	flag_arg = &g_array_index(args, struct expression_argument, 0);
+
+	if (flag_arg->type != EXPRESSION_ARGUMENT_NORMAL) {
+		msg_warn_task("invalid parameter to function");
+		return FALSE;
+	}
+
+	flag_str = (const gchar *) flag_arg->data;
+
+	/* Each macro is a no-op once `found` has been set by an earlier one */
+	TASK_GET_FLAG(flag_str, "pass_all", RSPAMD_TASK_FLAG_PASS_ALL);
+	TASK_GET_FLAG(flag_str, "no_log", RSPAMD_TASK_FLAG_NO_LOG);
+	TASK_GET_FLAG(flag_str, "no_stat", RSPAMD_TASK_FLAG_NO_STAT);
+	TASK_GET_FLAG(flag_str, "skip", RSPAMD_TASK_FLAG_SKIP);
+	TASK_GET_PROTOCOL_FLAG(flag_str, "extended_urls",
+						   RSPAMD_TASK_PROTOCOL_FLAG_EXT_URLS);
+	TASK_GET_FLAG(flag_str, "learn_spam", RSPAMD_TASK_FLAG_LEARN_SPAM);
+	TASK_GET_FLAG(flag_str, "learn_ham", RSPAMD_TASK_FLAG_LEARN_HAM);
+	TASK_GET_FLAG(flag_str, "greylisted", RSPAMD_TASK_FLAG_GREYLISTED);
+	TASK_GET_FLAG(flag_str, "broken_headers",
+				  RSPAMD_TASK_FLAG_BROKEN_HEADERS);
+	TASK_GET_FLAG(flag_str, "skip_process",
+				  RSPAMD_TASK_FLAG_SKIP_PROCESS);
+	TASK_GET_PROTOCOL_FLAG(flag_str, "milter",
+						   RSPAMD_TASK_PROTOCOL_FLAG_MILTER);
+	TASK_GET_FLAG(flag_str, "bad_unicode",
+				  RSPAMD_TASK_FLAG_BAD_UNICODE);
+
+	if (!found) {
+		msg_warn_task("invalid flag name %s", flag_str);
+		return FALSE;
+	}
+
+	return result;
+}
+
+/*
+ * Expression function: has_symbol(name).
+ *
+ * args[0] must be a plain string with a symbol name. Returns TRUE when
+ * rspamd_task_find_symbol_result() finds that symbol for the task. Returns
+ * FALSE with a warning when the argument is missing or is not a plain string.
+ */
+static gboolean
+rspamd_has_symbol_expr(struct rspamd_task *task,
+					   GArray *args,
+					   void *unused)
+{
+	struct expression_argument *sym_arg;
+	const gchar *symbol_str;
+
+	/* An empty array must be rejected too: element 0 is read below */
+	if (args == NULL || args->len < 1) {
+		msg_warn_task("no parameters to function");
+		return FALSE;
+	}
+
+	sym_arg = &g_array_index(args, struct expression_argument, 0);
+
+	if (sym_arg->type != EXPRESSION_ARGUMENT_NORMAL) {
+		msg_warn_task("invalid parameter to function");
+		return FALSE;
+	}
+
+	symbol_str = (const gchar *) sym_arg->data;
+
+	if (rspamd_task_find_symbol_result(task, symbol_str, NULL)) {
+		return TRUE;
+	}
+
+	return FALSE;
+}
diff --git a/src/libmime/mime_expressions.h b/src/libmime/mime_expressions.h
new file mode 100644
index 0000000..a2ea3fe
--- /dev/null
+++ b/src/libmime/mime_expressions.h
@@ -0,0 +1,65 @@
+/**
+ * @file mime_expressions.h
+ * Rspamd MIME expressions API
+ */
+
+#ifndef RSPAMD_EXPRESSIONS_H
+#define RSPAMD_EXPRESSIONS_H
+
+#include "config.h"
+#include "expression.h"
+#include "contrib/libucl/ucl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct rspamd_task;
+struct rspamd_config;
+
+/**
+ * Bundle of a configuration pointer and a UCL object.
+ * NOTE(review): presumably the user data handed to the mime_expr_subr
+ * callbacks when an expression is parsed - confirm against the callers.
+ */
+struct rspamd_mime_expr_ud {
+ struct rspamd_config *cfg;
+ const ucl_object_t *conf_obj;
+};
+
+extern const struct rspamd_atom_subr mime_expr_subr;
+
+/**
+ * Function's argument: the kind tag and the argument itself
+ */
+enum rspamd_expression_type {
+ EXPRESSION_ARGUMENT_NORMAL = 0,
+ EXPRESSION_ARGUMENT_BOOL,
+ EXPRESSION_ARGUMENT_REGEXP
+};
+struct expression_argument {
+ enum rspamd_expression_type type; /**< kind of argument, one of enum rspamd_expression_type */
+ void *data; /**< pointer to its data: the built-in functions read it as a NUL-terminated string for EXPRESSION_ARGUMENT_NORMAL and as rspamd_regexp_t * for EXPRESSION_ARGUMENT_REGEXP */
+};
+
+
+/**
+ * Signature of an expression function: receives the task, a GArray of
+ * struct expression_argument and an opaque user data pointer, and returns
+ * the boolean result of the function.
+ */
+typedef gboolean (*rspamd_internal_func_t)(struct rspamd_task *,
+ GArray *args, void *user_data);
+
+
+/**
+ * Register specified function to rspamd internal functions list
+ * @param name name of function
+ * @param func pointer to function
+ * @param user_data opaque pointer stored with the function (presumably
+ * passed back as the `user_data` argument of `func` - confirm in the
+ * implementation)
+ */
+void register_expression_function(const gchar *name,
+ rspamd_internal_func_t func,
+ void *user_data);
+
+/**
+ * Set global limit of regexp data size to be processed
+ * @param limit new limit in bytes
+ * @return old limit value
+ */
+guint rspamd_mime_expression_set_re_limit(guint limit);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c
new file mode 100644
index 0000000..2bd559d
--- /dev/null
+++ b/src/libmime/mime_headers.c
@@ -0,0 +1,1441 @@
+/*
+ * Copyright 2024 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "mime_headers.h"
+#include "smtp_parsers.h"
+#include "mime_encoding.h"
+#include "received.h"
+#include "contrib/uthash/utlist.h"
+#include "libserver/mempool_vars_internal.h"
+#include "libserver/cfg_file.h"
+#include "libutil/util.h"
+#include <unicode/utf8.h>
+
+/*
+ * khash map type `rspamd_mime_headers_htb`: header name (gchar *) ->
+ * struct rspamd_mime_header *, using rspamd_strcase_hash and
+ * rspamd_strcase_equal for hashing and key comparison.
+ */
+KHASH_INIT(rspamd_mime_headers_htb, gchar *,
+ struct rspamd_mime_header *, 1,
+ rspamd_strcase_hash, rspamd_strcase_equal);
+
+/*
+ * Reference-counted headers table: the hash itself is embedded by value,
+ * `ref` carries the reference counter used by the ref/unref helpers.
+ */
+struct rspamd_mime_headers_table {
+ khash_t(rspamd_mime_headers_htb) htb;
+ ref_entry_t ref;
+};
+
+/*
+ * Inspect a parsed header and, for well-known header names, set the matching
+ * RSPAMD_HEADER_* flags on `rh` and fill the related task/message fields
+ * (MIME recipients and senders, message id, subject, envelope sender,
+ * delivered-to).
+ *
+ * The header is recognised by a case-insensitive hash of its name
+ * (rspamd_icase_hash with a fixed seed); the constants in the switch are the
+ * hash values for the names given in the trailing comments.
+ *
+ * NOTE(review): the to/cc/bcc/from/received branches OR their flags into
+ * rh->flags, while the remaining branches assign and therefore drop any
+ * flags set earlier (e.g. separator flags). This looks unintentional but is
+ * kept as is - confirm before changing.
+ */
+static void
+rspamd_mime_header_check_special(struct rspamd_task *task,
+								 struct rspamd_mime_header *rh)
+{
+	guint64 h;
+	const gchar *p, *end;
+	gchar *id;
+	gint max_recipients = -1, len;
+
+	if (task->cfg) {
+		max_recipients = task->cfg->max_recipients;
+	}
+
+	h = rspamd_icase_hash(rh->name, strlen(rh->name), 0xdeadbabe);
+
+	switch (h) {
+	case 0x88705DC4D9D61ABULL: /* received */
+		if (rspamd_received_header_parse(task, rh->decoded, strlen(rh->decoded), rh)) {
+			rh->flags |= RSPAMD_HEADER_RECEIVED;
+		}
+		break;
+	case 0x76F31A09F4352521ULL: /* to */
+		MESSAGE_FIELD(task, rcpt_mime) = rspamd_email_address_from_mime(task->task_pool,
+			rh->value, strlen(rh->value),
+			MESSAGE_FIELD(task, rcpt_mime), max_recipients);
+		rh->flags |= RSPAMD_HEADER_TO | RSPAMD_HEADER_RCPT | RSPAMD_HEADER_UNIQUE;
+		break;
+	case 0x7EB117C1480B76ULL: /* cc */
+		MESSAGE_FIELD(task, rcpt_mime) = rspamd_email_address_from_mime(task->task_pool,
+			rh->value, strlen(rh->value),
+			MESSAGE_FIELD(task, rcpt_mime), max_recipients);
+		rh->flags |= RSPAMD_HEADER_CC | RSPAMD_HEADER_RCPT | RSPAMD_HEADER_UNIQUE;
+		break;
+	case 0xE4923E11C4989C8DULL: /* bcc */
+		MESSAGE_FIELD(task, rcpt_mime) = rspamd_email_address_from_mime(task->task_pool,
+			rh->value, strlen(rh->value),
+			MESSAGE_FIELD(task, rcpt_mime), max_recipients);
+		rh->flags |= RSPAMD_HEADER_BCC | RSPAMD_HEADER_RCPT | RSPAMD_HEADER_UNIQUE;
+		break;
+	case 0x41E1985EDC1CBDE4ULL: /* from */
+		MESSAGE_FIELD(task, from_mime) = rspamd_email_address_from_mime(task->task_pool,
+			rh->value, strlen(rh->value),
+			MESSAGE_FIELD(task, from_mime), max_recipients);
+		rh->flags |= RSPAMD_HEADER_FROM | RSPAMD_HEADER_SENDER | RSPAMD_HEADER_UNIQUE;
+		break;
+	case 0x43A558FC7C240226ULL: /* message-id */ {
+
+		rh->flags = RSPAMD_HEADER_MESSAGE_ID | RSPAMD_HEADER_UNIQUE;
+		len = rspamd_strip_smtp_comments_inplace(rh->decoded, strlen(rh->decoded));
+		rh->decoded[len] = '\0'; /* Zero terminate after stripping */
+		/* Strip surrounding spaces */
+		rh->decoded = g_strstrip(rh->decoded);
+		/*
+		 * g_strstrip() works in place and may shorten the string, so the
+		 * end must be taken from the stripped string and not from `len`;
+		 * otherwise bytes past the terminator would leak into the id.
+		 */
+		p = rh->decoded;
+		end = p + strlen(p);
+
+		if (*p == '<') {
+			p++;
+		}
+
+		if (end > p) {
+			gchar *d;
+
+			if (*(end - 1) == '>') {
+				end--;
+			}
+
+			id = rspamd_mempool_alloc(task->task_pool, end - p + 1);
+			d = id;
+
+			/* Copy the id, replacing non-printable characters with '?' */
+			while (p < end) {
+				if (g_ascii_isgraph(*p)) {
+					*d++ = *p++;
+				}
+				else {
+					*d++ = '?';
+					p++;
+				}
+			}
+
+			*d = '\0';
+
+			MESSAGE_FIELD(task, message_id) = id;
+		}
+
+		break;
+	}
+	case 0xB91D3910358E8212ULL: /* subject */
+		/* Only the first Subject header becomes the message subject */
+		if (MESSAGE_FIELD(task, subject) == NULL) {
+			MESSAGE_FIELD(task, subject) = rh->decoded;
+		}
+		rh->flags = RSPAMD_HEADER_SUBJECT | RSPAMD_HEADER_UNIQUE;
+		break;
+	case 0xEE4AA2EAAC61D6F4ULL: /* return-path */
+		/* Used as the envelope sender only if none is known yet */
+		if (task->from_envelope == NULL) {
+			task->from_envelope = rspamd_email_address_from_smtp(rh->decoded,
+				strlen(rh->decoded));
+		}
+		rh->flags = RSPAMD_HEADER_RETURN_PATH | RSPAMD_HEADER_UNIQUE;
+		break;
+	case 0xB9EEFAD2E93C2161ULL: /* delivered-to */
+		if (task->deliver_to == NULL) {
+			task->deliver_to = rh->decoded;
+		}
+		rh->flags = RSPAMD_HEADER_DELIVERED_TO;
+		break;
+	case 0x2EC3BFF3C393FC10ULL: /* date */
+	case 0xAC0DDB1A1D214CAULL:  /* sender */
+	case 0x54094572367AB695ULL: /* in-reply-to */
+	case 0x81CD9E9131AB6A9AULL: /* content-type */
+	case 0xC39BD9A75AA25B60ULL: /* content-transfer-encoding */
+	case 0xB3F6704CB3AD6589ULL: /* references */
+		rh->flags = RSPAMD_HEADER_UNIQUE;
+		break;
+	default:
+		/* Not a special header */
+		break;
+	}
+}
+
+/*
+ * Insert header `rh` into the hash `target` and into the ordered list
+ * `*order_ptr`.
+ *
+ * Headers sharing a name are chained: the hash value is the first header
+ * with that name and later ones are appended to its doubly linked list.
+ * The ordered list is built by prepending. When `check_special` is set the
+ * header is also passed to rspamd_mime_header_check_special().
+ */
+static void
+rspamd_mime_header_add(struct rspamd_task *task,
+					   khash_t(rspamd_mime_headers_htb) * target,
+					   struct rspamd_mime_header **order_ptr,
+					   struct rspamd_mime_header *rh,
+					   gboolean check_special)
+{
+	struct rspamd_mime_header *chain_head;
+	khiter_t slot;
+	int put_res;
+
+	slot = kh_put(rspamd_mime_headers_htb, target, rh->name, &put_res);
+
+	if (put_res != 0) {
+		/* Name not seen before: `rh` becomes a one-element chain */
+		kh_value(target, slot) = rh;
+		rh->prev = rh;
+		rh->next = NULL;
+		msg_debug_task("add new raw header %s: %s", rh->name, rh->value);
+	}
+	else {
+		/* Name already present: append to the existing chain */
+		chain_head = kh_value(target, slot);
+		DL_APPEND(chain_head, rh);
+		msg_debug_task("append raw header %s: %s", rh->name, rh->value);
+	}
+
+	LL_PREPEND2(*order_ptr, rh, ord_next);
+
+	if (check_special) {
+		rspamd_mime_header_check_special(task, rh);
+	}
+}
+
+
+/*
+ * Parse the raw header block `in` (`len` bytes) into struct
+ * rspamd_mime_header objects allocated from the task pool. Each header is
+ * added to the hash in `target` and to the ordered list `*order_ptr` (which
+ * ends up in the original order of appearance).
+ *
+ * When `check_newlines` is set, the function additionally counts the line
+ * ending styles and stores the most frequent one in the message, runs the
+ * special-header processing for each header, and stores a hex-encoded hash
+ * of all header names except Received ones in the task pool variable
+ * RSPAMD_MEMPOOL_HEADERS_HASH.
+ *
+ * The parser is a state machine:
+ *   0   - start of a header line
+ *   1   - inside a header name
+ *   2   - separator between ':' and the value
+ *   3   - inside a header value
+ *   4   - value is complete: unfold, decode and store the header
+ *   5   - header with a name but no value
+ *   99  - line break seen: decide between folding and end of header
+ *   100 - skip the rest of a broken line
+ */
+void rspamd_mime_headers_process(struct rspamd_task *task,
+								 struct rspamd_mime_headers_table *target,
+								 struct rspamd_mime_header **order_ptr,
+								 const gchar *in, gsize len,
+								 gboolean check_newlines)
+{
+	struct rspamd_mime_header *nh = NULL;
+	const gchar *p, *c, *end;
+	gchar *tmp, *tp;
+	gint state = 0, l, next_state = 100, err_state = 100, t_state;
+	gboolean valid_folding = FALSE, shift_by_one = FALSE;
+	guint nlines_count[RSPAMD_TASK_NEWLINES_MAX];
+	guint norder = 0;
+
+	p = in;
+	end = p + len;
+	c = p;
+	memset(nlines_count, 0, sizeof(nlines_count));
+	msg_debug_task("start processing headers");
+
+	while (p < end) {
+		/* FSM for processing headers */
+		switch (state) {
+		case 0:
+			/* Begin processing headers */
+			if (!g_ascii_isalpha(*p)) {
+				/* We have some garbage at the beginning of headers, skip this line */
+				state = 100;
+				next_state = 0;
+			}
+			else {
+				state = 1;
+				c = p;
+			}
+			break;
+		case 1:
+			/* We got something like header's name */
+			if (*p == ':') {
+				nh = rspamd_mempool_alloc0(task->task_pool,
+										   sizeof(struct rspamd_mime_header));
+				l = p - c;
+				tmp = rspamd_mempool_alloc(task->task_pool, l + 1);
+				rspamd_null_safe_copy(c, l, tmp, l + 1);
+				nh->name = tmp;
+				nh->flags |= RSPAMD_HEADER_EMPTY_SEPARATOR;
+				nh->raw_value = c;
+				/* Name length for now; replaced by the full length in states 4/5 */
+				nh->raw_len = p - c;
+				p++;
+				state = 2;
+				c = p;
+			}
+			else if (g_ascii_isspace(*p)) {
+				/* Not header but some garbage */
+				if (target == MESSAGE_FIELD(task, raw_headers)) {
+					/* Do not propagate flag from the attachments */
+					task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
+				}
+				state = 100;
+				next_state = 0;
+			}
+			else {
+				p++;
+			}
+			break;
+		case 2:
+			/* We got header's name, so skip any \t or spaces */
+			if (*p == '\t') {
+				nh->flags &= ~RSPAMD_HEADER_EMPTY_SEPARATOR;
+				nh->flags |= RSPAMD_HEADER_TAB_SEPARATED;
+				p++;
+			}
+			else if (*p == ' ') {
+				nh->flags &= ~RSPAMD_HEADER_EMPTY_SEPARATOR;
+				p++;
+			}
+			else if (*p == '\n' || *p == '\r') {
+
+				if (check_newlines) {
+					if (*p == '\n') {
+						nlines_count[RSPAMD_TASK_NEWLINES_LF]++;
+					}
+					else if (p + 1 < end && *(p + 1) == '\n') {
+						nlines_count[RSPAMD_TASK_NEWLINES_CRLF]++;
+					}
+					else {
+						nlines_count[RSPAMD_TASK_NEWLINES_CR]++;
+					}
+				}
+
+				/* Process folding */
+				state = 99;
+				l = p - c;
+				if (l > 0) {
+					tmp = rspamd_mempool_alloc(task->task_pool, l + 1);
+					rspamd_null_safe_copy(c, l, tmp, l + 1);
+					nh->separator = tmp;
+				}
+				next_state = 3;
+				err_state = 5;
+				c = p;
+			}
+			else {
+				/* Process value */
+				l = p - c;
+				if (l >= 0) {
+					tmp = rspamd_mempool_alloc(task->task_pool, l + 1);
+					rspamd_null_safe_copy(c, l, tmp, l + 1);
+					nh->separator = tmp;
+				}
+				c = p;
+				state = 3;
+			}
+			break;
+		case 3:
+			if (*p == '\r' || *p == '\n') {
+				/* Hold folding */
+				if (check_newlines) {
+					if (*p == '\n') {
+						nlines_count[RSPAMD_TASK_NEWLINES_LF]++;
+					}
+					else if (p + 1 < end && *(p + 1) == '\n') {
+						nlines_count[RSPAMD_TASK_NEWLINES_CRLF]++;
+					}
+					else {
+						nlines_count[RSPAMD_TASK_NEWLINES_CR]++;
+					}
+				}
+				state = 99;
+				next_state = 3;
+				err_state = 4;
+			}
+			else if (p + 1 == end) {
+				state = 4;
+			}
+			else {
+				p++;
+			}
+			break;
+		case 4:
+			/* Copy header's value */
+
+			/*
+			 * XXX:
+			 * The original decision to use here null terminated
+			 * strings was extremely poor!
+			 *
+			 * NOTE(review): when this state is entered from state 3 at the
+			 * last input byte, `p - c` appears to exclude that byte from
+			 * the value - confirm whether this is intended.
+			 */
+			l = p - c;
+			tmp = rspamd_mempool_alloc(task->task_pool, l + 1);
+			tp = tmp;
+			t_state = 0;
+			while (l--) {
+				if (t_state == 0) {
+					/* Before folding */
+					if (*c == '\n' || *c == '\r') {
+						t_state = 1;
+						c++;
+						*tp++ = ' ';
+					}
+					else {
+						/* Embedded NUL bytes are dropped */
+						if (*c != '\0') {
+							*tp++ = *c++;
+						}
+						else {
+							c++;
+						}
+					}
+				}
+				else if (t_state == 1) {
+					/* Inside folding */
+					if (g_ascii_isspace(*c)) {
+						c++;
+					}
+					else {
+						t_state = 0;
+						if (*c != '\0') {
+							*tp++ = *c++;
+						}
+						else {
+							c++;
+						}
+					}
+				}
+			}
+			/* Strip last space that can be added by \r\n parsing */
+			if (tp > tmp && *(tp - 1) == ' ') {
+				tp--;
+			}
+
+			*tp = '\0';
+			/* Strip the initial spaces that could also be added by folding */
+			while (*tmp != '\0' && g_ascii_isspace(*tmp)) {
+				tmp++;
+			}
+
+			if (p + 1 == end) {
+				nh->raw_len = end - nh->raw_value;
+			}
+			else {
+				nh->raw_len = p - nh->raw_value;
+			}
+
+			nh->value = tmp;
+
+			gboolean broken_utf = FALSE;
+
+			nh->decoded = rspamd_mime_header_decode(task->task_pool,
+													nh->value, strlen(tmp), &broken_utf);
+
+			if (broken_utf) {
+				task->flags |= RSPAMD_TASK_FLAG_BAD_UNICODE;
+			}
+
+			if (nh->decoded == NULL) {
+				/* As we strip comments in place... */
+				nh->decoded = rspamd_mempool_strdup(task->task_pool, "");
+			}
+
+			/* We also validate utf8 and replace all non-valid utf8 chars */
+			rspamd_mime_charset_utf_enforce(nh->decoded, strlen(nh->decoded));
+			nh->order = norder++;
+			rspamd_mime_header_add(task, &target->htb, order_ptr, nh, check_newlines);
+			nh = NULL;
+			state = 0;
+			break;
+		case 5:
+			/* Header has only name, no value */
+			nh->value = rspamd_mempool_strdup(task->task_pool, "");
+			nh->decoded = rspamd_mempool_strdup(task->task_pool, "");
+			nh->raw_len = p - nh->raw_value;
+			if (shift_by_one) {
+				nh->raw_len++;
+			}
+			nh->order = norder++;
+			rspamd_mime_header_add(task, &target->htb, order_ptr, nh, check_newlines);
+			nh = NULL;
+			state = 0;
+			break;
+		case 99:
+			/* Folding state */
+			if (p + 1 == end) {
+				state = err_state;
+				/* Include the last character into the next header */
+				shift_by_one = TRUE;
+			}
+			else {
+				if (*p == '\r' || *p == '\n') {
+					p++;
+					valid_folding = FALSE;
+				}
+				else if (*p == '\t' || *p == ' ') {
+					/* Valid folding */
+					p++;
+					valid_folding = TRUE;
+				}
+				else {
+					if (valid_folding) {
+						debug_task("go to state: %d->%d", state, next_state);
+						state = next_state;
+					}
+					else {
+						/* Fall back */
+						debug_task("go to state: %d->%d", state, err_state);
+						state = err_state;
+					}
+				}
+			}
+			break;
+		case 100:
+			/* Fail state, skip line */
+
+			if (*p == '\r') {
+				if (p + 1 < end && *(p + 1) == '\n') {
+					nlines_count[RSPAMD_TASK_NEWLINES_CRLF]++;
+					p++;
+				}
+				p++;
+				state = next_state;
+			}
+			else if (*p == '\n') {
+				nlines_count[RSPAMD_TASK_NEWLINES_LF]++;
+
+				if (p + 1 < end && *(p + 1) == '\r') {
+					p++;
+				}
+				p++;
+				state = next_state;
+			}
+			else if (p + 1 == end) {
+				state = next_state;
+				p++;
+			}
+			else {
+				p++;
+			}
+			break;
+		}
+	}
+
+	/* Since we have prepended headers, we need to reverse the list to get the actual order */
+	LL_REVERSE(*order_ptr);
+
+	if (check_newlines) {
+		guint max_cnt = 0;
+		gint sel = 0;
+		rspamd_cryptobox_hash_state_t hs;
+		guchar hout[rspamd_cryptobox_HASHBYTES], *hexout;
+
+		/* Select the most frequent newline style */
+		for (gint i = RSPAMD_TASK_NEWLINES_CR; i < RSPAMD_TASK_NEWLINES_MAX; i++) {
+			if (nlines_count[i] > max_cnt) {
+				max_cnt = nlines_count[i];
+				sel = i;
+			}
+		}
+
+		MESSAGE_FIELD(task, nlines_type) = sel;
+
+		rspamd_cryptobox_hash_init(&hs, NULL, 0);
+
+		LL_FOREACH(*order_ptr, nh)
+		{
+			/*
+			 * `flags` is a bit mask, so Received headers must be detected
+			 * by testing the bit: they can also carry separator flags.
+			 */
+			if (nh->name && !(nh->flags & RSPAMD_HEADER_RECEIVED)) {
+				rspamd_cryptobox_hash_update(&hs, nh->name, strlen(nh->name));
+			}
+		}
+
+		rspamd_cryptobox_hash_final(&hs, hout);
+		hexout = rspamd_mempool_alloc(task->task_pool, sizeof(hout) * 2 + 1);
+		hexout[sizeof(hout) * 2] = '\0';
+		rspamd_encode_hex_buf(hout, sizeof(hout), hexout,
+							  sizeof(hout) * 2 + 1);
+		rspamd_mempool_set_variable(task->task_pool,
+									RSPAMD_MEMPOOL_HEADERS_HASH,
+									hexout, NULL);
+	}
+}
+
+/*
+ * Decide whether the bytes accumulated in `token` must be flushed before a
+ * chunk in `new_charset` is appended.
+ *
+ * If `old_charset` is set and equals `new_charset` (and the charset is not
+ * iso-2022-jp), nothing is done so the caller can keep concatenating.
+ * Otherwise `token` is converted to UTF-8 with the charset detected from
+ * `new_charset`, the result is appended to `out`, `token` is emptied and
+ * `old_charset` is overwritten with `new_charset`.
+ *
+ * An empty `new_charset` is treated as an unreachable condition.
+ */
+static void
+rspamd_mime_header_maybe_save_token(rspamd_mempool_t *pool,
+									GString *out,
+									GByteArray *token,
+									GByteArray *decoded_token,
+									rspamd_ftok_t *old_charset,
+									rspamd_ftok_t *new_charset)
+{
+	if (new_charset->len == 0) {
+		g_assert_not_reached();
+	}
+
+	if (old_charset->len > 0 &&
+		rspamd_ftok_casecmp(new_charset, old_charset) == 0) {
+		rspamd_ftok_t iso2022jp;
+
+		/*
+		 * Special case for iso-2022-jp:
+		 * https://github.com/vstakhov/rspamd/issues/1669
+		 */
+		RSPAMD_FTOK_ASSIGN(&iso2022jp, "iso-2022-jp");
+
+		if (rspamd_ftok_casecmp(new_charset, &iso2022jp) != 0) {
+			/* Same charset: buffers can be concatenated */
+			return;
+		}
+	}
+
+	/* Flush: decode the pending token and append it to the output */
+	if (rspamd_mime_to_utf8_byte_array(token, decoded_token, pool,
+									   rspamd_mime_detect_charset(new_charset, pool))) {
+		g_string_append_len(out, decoded_token->data, decoded_token->len);
+	}
+
+	/* Start a fresh token buffer */
+	g_byte_array_set_size(token, 0);
+
+	/*
+	 * Remember the charset so the equality check above can allow
+	 * concatenation next time. This may cause repeated (and relatively
+	 * expensive) rspamd_mime_detect_charset() calls, which is accepted.
+	 */
+	*old_charset = *new_charset;
+}
+
+/*
+ * Sanitise a decoded header in place: bytes with the high bit set and
+ * printable ASCII are kept, ASCII whitespace becomes a plain space and any
+ * other byte becomes '?'.
+ */
+static void
+rspamd_mime_header_sanity_check(GString *str)
+{
+	gsize idx = 0;
+
+	while (idx < str->len) {
+		gchar *cur = &str->str[idx++];
+		gchar ch = *cur;
+
+		if ((ch & 0x80) || g_ascii_isgraph(ch)) {
+			/* Non-ASCII or printable: leave untouched */
+			continue;
+		}
+
+		*cur = g_ascii_isspace(ch) ? ' ' : '?';
+	}
+}
+
+/*
+ * Decode a header value that may contain RFC 2047 encoded-words.
+ *
+ * Plain text is copied as is, encoded-words recognised by
+ * rspamd_rfc2047_parser() are QP- or base64-decoded into `token` and
+ * converted to UTF-8 via rspamd_mime_header_maybe_save_token(). Raw 8-bit
+ * bytes that are not valid UTF-8 are replaced with U+FFFD and, if
+ * `invalid_utf` is not NULL, `*invalid_utf` is set to TRUE.
+ *
+ * The result is passed through rspamd_mime_header_sanity_check() and
+ * returned as a NUL-terminated string; g_free() for it is registered as a
+ * destructor in `pool`, so the caller must not free it.
+ *
+ * `in` must not be NULL (asserted).
+ */
+gchar *
+rspamd_mime_header_decode(rspamd_mempool_t *pool, const gchar *in,
+ gsize inlen, gboolean *invalid_utf)
+{
+ GString *out;
+ const guchar *c, *p, *end;
+ const gchar *tok_start = NULL;
+ gsize tok_len = 0, pos;
+ GByteArray *token = NULL, *decoded;
+ rspamd_ftok_t cur_charset = {0, NULL}, old_charset = {0, NULL};
+ gint encoding;
+ gssize r;
+ guint qmarks = 0;
+ gchar *ret;
+ /*
+ * parse_normal: plain text; got_eqsign: '=' seen; got_encoded_start:
+ * inside "=?..."; got_more_qmark: a '?' seen inside the candidate word;
+ * skip_spaces: after a decoded word, looking for an adjacent one.
+ */
+ enum {
+ parse_normal = 0,
+ got_eqsign,
+ got_encoded_start,
+ got_more_qmark,
+ skip_spaces,
+ } state = parse_normal;
+
+ g_assert(in != NULL);
+
+ /* `c` marks the start of the not yet emitted input, `p` is the cursor */
+ c = in;
+ p = in;
+ end = in + inlen;
+ out = g_string_sized_new(inlen);
+ token = g_byte_array_sized_new(80);
+ decoded = g_byte_array_sized_new(122);
+
+ while (p < end) {
+ switch (state) {
+ case parse_normal:
+ if (*p == '=') {
+ g_string_append_len(out, c, p - c);
+ c = p;
+ state = got_eqsign;
+ }
+ else if (*p >= 128) {
+ gint off = 0;
+ UChar32 uc;
+ /* Unencoded character */
+ g_string_append_len(out, c, p - c);
+ /* Check if that's valid UTF8 */
+ U8_NEXT(p, off, end - p, uc);
+
+ if (uc <= 0) {
+ c = p + 1;
+ /* 0xFFFD in UTF8 */
+ /*
+ * NOTE(review): 3 bytes are appended as a placeholder and then
+ * overwritten below, so the literal must provide at least 3 bytes.
+ * It shows a single space here - verify against the upstream file,
+ * whitespace may have been collapsed in this copy.
+ */
+ g_string_append_len(out, " ", 3);
+ off = 0;
+ U8_APPEND_UNSAFE(out->str + out->len - 3,
+ off, 0xfffd);
+
+ if (invalid_utf) {
+ *invalid_utf = TRUE;
+ }
+ }
+ else {
+ c = p;
+ p = p + off;
+ continue; /* To avoid p ++ after this block */
+ }
+ }
+ p++;
+ break;
+ case got_eqsign:
+ if (*p == '?') {
+ state = got_encoded_start;
+ qmarks = 0;
+ }
+ else {
+ g_string_append_len(out, c, 1);
+ c = p;
+ state = parse_normal;
+ continue; /* Deal with == case */
+ }
+ p++;
+ break;
+ case got_encoded_start:
+ if (*p == '?') {
+ state = got_more_qmark;
+ qmarks++;
+
+ /* Skip multiple ? signs */
+ p++;
+ while (p < end && *p == '?') {
+ p++;
+ }
+
+ continue;
+ }
+ p++;
+ break;
+ case got_more_qmark:
+ if (*p == '=') {
+ /* Fewer than three '?' groups: cannot be the closing "?=" yet */
+ if (qmarks < 3) {
+ state = got_encoded_start;
+ }
+ else {
+ /* Finished encoded boundary */
+ if (*c == '"') {
+ /* Quoted string, non-RFC conformant but used by retards */
+ c++;
+ }
+ if (rspamd_rfc2047_parser(c, p - c + 1, &encoding,
+ &cur_charset.begin, &cur_charset.len,
+ &tok_start, &tok_len)) {
+ /* We have a token, so we can decode it from `encoding` */
+ if (token->len > 0) {
+ if (old_charset.len == 0) {
+ memcpy(&old_charset, &cur_charset,
+ sizeof(old_charset));
+ }
+
+ rspamd_mime_header_maybe_save_token(pool, out,
+ token, decoded,
+ &old_charset, &cur_charset);
+ }
+
+ qmarks = 0;
+ /* Reserve room for the encoded size; shrunk after decoding */
+ pos = token->len;
+ g_byte_array_set_size(token, pos + tok_len);
+
+ if (encoding == RSPAMD_RFC2047_QP) {
+ r = rspamd_decode_qp2047_buf(tok_start, tok_len,
+ token->data + pos, tok_len);
+
+ if (r != -1) {
+ token->len = pos + r;
+ }
+ else {
+ /* Cannot decode qp */
+ token->len -= tok_len;
+ }
+ }
+ else {
+ if (rspamd_cryptobox_base64_decode(tok_start, tok_len,
+ token->data + pos, &tok_len)) {
+ token->len = pos + tok_len;
+ }
+ else {
+ /* Cannot decode */
+ token->len -= tok_len;
+ }
+ }
+
+ c = p + 1;
+ state = skip_spaces;
+ }
+ else {
+ /* Not encoded-word */
+ old_charset.len = 0;
+
+ if (token->len > 0) {
+ rspamd_mime_header_maybe_save_token(pool, out,
+ token, decoded,
+ &old_charset, &cur_charset);
+ }
+
+ g_string_append_len(out, c, p - c);
+ c = p;
+ state = parse_normal;
+ }
+ } /* qmarks >= 3 */
+ } /* p == '=' */
+ else {
+ state = got_encoded_start;
+ }
+ p++;
+ break;
+ case skip_spaces:
+ if (g_ascii_isspace(*p)) {
+ p++;
+ }
+ else if (*p == '=' && p < end - 1 && p[1] == '?') {
+ /* Next boundary, can glue */
+ c = p;
+ p += 2;
+ state = got_encoded_start;
+ }
+ else {
+ /* Need to save spaces and decoded token */
+ if (token->len > 0) {
+ /* Clearing old_charset forces the flush in maybe_save_token */
+ old_charset.len = 0;
+ rspamd_mime_header_maybe_save_token(pool, out,
+ token, decoded,
+ &old_charset, &cur_charset);
+ }
+
+ g_string_append_len(out, c, p - c);
+ c = p;
+ state = parse_normal;
+ }
+ break;
+ }
+ }
+
+ /* Leftover */
+ switch (state) {
+ case skip_spaces:
+ if (token->len > 0 && cur_charset.len > 0) {
+ old_charset.len = 0;
+ rspamd_mime_header_maybe_save_token(pool, out,
+ token, decoded,
+ &old_charset, &cur_charset);
+ }
+ break;
+ default:
+ /* Just copy leftover */
+ if (p > c) {
+ g_string_append_len(out, c, p - c);
+ }
+ break;
+ }
+
+ g_byte_array_free(token, TRUE);
+ g_byte_array_free(decoded, TRUE);
+ rspamd_mime_header_sanity_check(out);
+ rspamd_mempool_notify_alloc(pool, out->len);
+ /* Keep the character data; it is released by the pool destructor below */
+ ret = g_string_free(out, FALSE);
+ rspamd_mempool_add_destructor(pool, g_free, ret);
+
+ return ret;
+}
+
+/*
+ * Q-encode `chunk_len` bytes starting at `chunk` and append them to `res`
+ * as one =?UTF-8?Q?...?= word; every word after the first is preceded by a
+ * space. Nothing is appended when the encoder reports failure (-1).
+ */
+static void
+rspamd_mime_header_encode_append_chunk(GString *res, const gchar *chunk,
+									   gsize chunk_len)
+{
+	gchar qp_buf[80 * sizeof(guint32)];
+	gint enc_len;
+
+	enc_len = rspamd_encode_qp2047_buf(chunk, chunk_len,
+									   qp_buf, sizeof(qp_buf));
+
+	if (enc_len == -1) {
+		return;
+	}
+
+	if (res->len > 0) {
+		rspamd_printf_gstring(res, " =?UTF-8?Q?%*s?=", enc_len,
+							  qp_buf);
+	}
+	else {
+		rspamd_printf_gstring(res, "=?UTF-8?Q?%*s?=", enc_len,
+							  qp_buf);
+	}
+}
+
+/*
+ * Encode header value `in` (`len` bytes) for use in a message.
+ *
+ * If no byte has the high bit set, a plain NUL-terminated copy is returned.
+ * Otherwise the input is split on UTF-8 character boundaries into chunks and
+ * each chunk is emitted as a separate =?UTF-8?Q?...?= encoded-word.
+ *
+ * The returned string is allocated by glib and must be released with
+ * g_free().
+ */
+gchar *
+rspamd_mime_header_encode(const gchar *in, gsize len)
+{
+	const gchar *scan, *end = in + len;
+	gboolean has_8bit = FALSE;
+
+	/* Check if we need to encode */
+	for (scan = in; scan < end; scan++) {
+		if ((((guchar) *scan) & 0x80) != 0) {
+			has_8bit = TRUE;
+			break;
+		}
+	}
+
+	if (!has_8bit) {
+		gchar *copy = g_malloc(len + 1);
+
+		rspamd_strlcpy(copy, in, len + 1);
+
+		return copy;
+	}
+
+	/* Choose step: =?UTF-8?Q?<qp>?= should be less than 76 chars */
+	guint step = (76 - 12) / 3 + 1;
+	gsize ulen = g_utf8_strlen(in, len);
+	GString *res = g_string_sized_new(len * 2 + 1);
+	const gchar *prev = in;
+	gsize pos = 0;
+
+	/* Adjust chunk size for unicode average length */
+	step *= 1.0 * ulen / (gdouble) len;
+
+	while (pos < ulen) {
+		const gchar *cur = g_utf8_offset_to_pointer(in, pos);
+
+		if (cur > prev) {
+			/* Encode and print everything since the previous boundary */
+			rspamd_mime_header_encode_append_chunk(res, prev, cur - prev);
+		}
+
+		pos += MIN(step, ulen - pos);
+		prev = cur;
+	}
+
+	/* Leftover */
+	if (prev < end) {
+		rspamd_mime_header_encode_append_chunk(res, prev, end - prev);
+	}
+
+	return g_string_free(res, FALSE);
+}
+
+/*
+ * Build a message id of the form "<clock>.<random>@<fqdn>" from the current
+ * calendar ticks (scaled by 1e6) and a random 64-bit value; both are printed
+ * with the "%*bs" conversion of rspamd_printf_gstring() using 5 and 8 bytes
+ * of the respective integers.
+ *
+ * The returned string is allocated by glib and must be released with
+ * g_free(). `fqdn` must not be NULL (it is passed to strlen()).
+ */
+gchar *
+rspamd_mime_message_id_generate(const gchar *fqdn)
+{
+	GString *id = g_string_sized_new(strlen(fqdn) + 22);
+	guint64 rnd = ottery_rand_uint64();
+	guint64 clk = rspamd_get_calendar_ticks() * 1e6;
+	const gint clk_bytes = (gint) sizeof(guint64) - 3;
+	const gint rnd_bytes = (gint) sizeof(guint64);
+
+	rspamd_printf_gstring(id, "%*bs.%*bs@%s",
+						  clk_bytes, (guchar *) &clk,
+						  rnd_bytes, (gchar *) &rnd,
+						  fqdn);
+
+	return g_string_free(id, FALSE);
+}
+
+/*
+ * Look up header `field` in the table `hdrs`.
+ *
+ * With `need_modified` FALSE the stored header is returned, unless it is
+ * flagged RSPAMD_HEADER_NON_EXISTING (then NULL). With `need_modified` TRUE
+ * the modified chain is returned for headers flagged RSPAMD_HEADER_MODIFIED
+ * and the stored header otherwise.
+ *
+ * Returns NULL when `hdrs` is NULL or the header is not present.
+ */
+struct rspamd_mime_header *
+rspamd_message_get_header_from_hash(struct rspamd_mime_headers_table *hdrs,
+									const gchar *field,
+									gboolean need_modified)
+{
+	khash_t(rspamd_mime_headers_htb) * htb;
+	struct rspamd_mime_header *found;
+	khiter_t it;
+
+	if (hdrs == NULL) {
+		return NULL;
+	}
+
+	/* The hash is embedded in the table, so its address is never NULL */
+	htb = &hdrs->htb;
+	it = kh_get(rspamd_mime_headers_htb, htb, (gchar *) field);
+
+	if (it == kh_end(htb)) {
+		return NULL;
+	}
+
+	found = kh_value(htb, it);
+
+	if (need_modified) {
+		return (found->flags & RSPAMD_HEADER_MODIFIED) ? found->modified_chain : found;
+	}
+
+	return (found->flags & RSPAMD_HEADER_NON_EXISTING) ? NULL : found;
+}
+
+/*
+ * Look up header `field` in the raw headers table of the task's message by
+ * delegating to rspamd_message_get_header_from_hash(); see that function for
+ * the meaning of `need_modified` and the return value.
+ */
+struct rspamd_mime_header *
+rspamd_message_get_header_array(struct rspamd_task *task, const gchar *field,
+ gboolean need_modified)
+{
+ return rspamd_message_get_header_from_hash(
+ MESSAGE_FIELD_CHECK(task, raw_headers),
+ field, need_modified);
+}
+
+/*
+ * Number of entries in the headers hash (distinct header names), or 0 when
+ * `hdrs` is NULL.
+ */
+gsize rspamd_mime_headers_count(struct rspamd_mime_headers_table *hdrs)
+{
+	if (hdrs == NULL) {
+		return 0;
+	}
+
+	return kh_size(&hdrs->htb);
+}
+
+/*
+ * Call `func(name, hdr, ud)` for every entry of the headers hash.
+ *
+ * Traversal stops as soon as `func` returns false, in which case false is
+ * returned. Returns true when all entries were visited; a NULL table is
+ * treated as empty, like in the other accessors of this file.
+ */
+bool rspamd_mime_headers_foreach(const struct rspamd_mime_headers_table *hdrs,
+								 rspamd_hdr_traverse_func_t func, void *ud)
+{
+	const gchar *name;
+	struct rspamd_mime_header *hdr;
+
+	if (hdrs == NULL) {
+		/* Nothing to traverse, so nothing interrupted the traversal */
+		return true;
+	}
+
+	kh_foreach(&hdrs->htb, name, hdr, {
+		if (!func(name, hdr, ud)) {
+			return false;
+		}
+	});
+
+	return true;
+}
+
+/*
+ * Destructor installed by rspamd_message_headers_new(): frees the internal
+ * arrays of the embedded hash and the table itself. The headers referenced
+ * by the hash are not freed here. NULL is accepted and ignored.
+ */
+static void
+rspamd_message_headers_dtor(struct rspamd_mime_headers_table *hdrs)
+{
+	if (hdrs == NULL) {
+		return;
+	}
+
+	kfree(hdrs->htb.keys);
+	kfree(hdrs->htb.vals);
+	kfree(hdrs->htb.flags);
+	g_free(hdrs);
+}
+
+/*
+ * Take one more reference on the headers table (REF_RETAIN) and return the
+ * same pointer for convenience.
+ */
+struct rspamd_mime_headers_table *
+rspamd_message_headers_ref(struct rspamd_mime_headers_table *hdrs)
+{
+ REF_RETAIN(hdrs);
+
+ return hdrs;
+}
+
+/*
+ * Drop one reference on the headers table (REF_RELEASE).
+ * NOTE(review): presumably the destructor registered in
+ * rspamd_message_headers_new() runs when the last reference is dropped -
+ * confirm against the REF_RELEASE definition.
+ */
+void rspamd_message_headers_unref(struct rspamd_mime_headers_table *hdrs)
+{
+ REF_RELEASE(hdrs);
+}
+
+/*
+ * Allocate a zero-initialised headers table and initialise its reference
+ * counter with rspamd_message_headers_dtor as the destructor
+ * (REF_INIT_RETAIN). Release it with rspamd_message_headers_unref().
+ */
+struct rspamd_mime_headers_table *
+rspamd_message_headers_new(void)
+{
+ struct rspamd_mime_headers_table *nhdrs;
+
+ nhdrs = g_malloc0(sizeof(*nhdrs));
+ REF_INIT_RETAIN(nhdrs, rspamd_message_headers_dtor);
+
+ return nhdrs;
+}
+
+/*
+ * Unfold a header value in place: a line break (CR, LF or CRLF) followed by
+ * whitespace is replaced by a single space. A line break that is not
+ * followed by whitespace is written back as one '\r' or '\n' (the last line
+ * break character seen).
+ *
+ * The output is written over the input buffer and is never longer than the
+ * input; it is not NUL-terminated by this function.
+ *
+ * Returns the length of the unfolded data.
+ */
+gsize rspamd_message_header_unfold_inplace(char *hdr, gsize len)
+{
+	enum {
+		st_copy,
+		st_after_cr,
+		st_after_lf,
+		st_in_ws,
+	} st = st_copy;
+	/* `dst` never gets ahead of `src`, so in-place writing is safe */
+	char *dst = hdr, *src = hdr, *stop = hdr + len;
+
+	while (src < stop) {
+		if (st == st_copy) {
+			if (*src == '\r') {
+				st = st_after_cr;
+				src++;
+			}
+			else if (*src == '\n') {
+				st = st_after_lf;
+				src++;
+			}
+			else {
+				*dst++ = *src++;
+			}
+		}
+		else if (st == st_after_cr) {
+			if (*src == '\n') {
+				st = st_after_lf;
+				src++;
+			}
+			else if (g_ascii_isspace(*src)) {
+				st = st_in_ws;
+				src++;
+			}
+			else {
+				/* Not a folding: restore the CR and reprocess this char */
+				*dst++ = '\r';
+				st = st_copy;
+			}
+		}
+		else if (st == st_after_lf) {
+			if (g_ascii_isspace(*src)) {
+				st = st_in_ws;
+				src++;
+			}
+			else {
+				/* Not a folding: restore the LF and reprocess this char */
+				*dst++ = '\n';
+				st = st_copy;
+			}
+		}
+		else {
+			/* st_in_ws: swallow the folding whitespace */
+			if (g_ascii_isspace(*src)) {
+				src++;
+			}
+			else {
+				*dst++ = ' ';
+				st = st_copy;
+			}
+		}
+	}
+
+	return dst - hdr;
+}
+
+/*
+ * Applies a modification description `obj` to the header named `hdr_name`.
+ *
+ * If the header is not present in `hdrs`, a placeholder element flagged
+ * MODIFIED | NON_EXISTING is allocated from the task pool, stored in the hash
+ * and, when `order_ptr` is given, appended to that ordered list.
+ *
+ * `obj` may contain two optional arrays:
+ *  - "remove": integers. 0 marks every live header of the chain as removed,
+ *    N > 0 marks the N-th live header counted from the top, N < 0 the |N|-th
+ *    counted from the bottom. The surviving headers are then copied into a
+ *    fresh modified chain.
+ *  - "add": arrays {position, "value"}. Position 0 prepends, -1 appends and
+ *    N > 0 inserts after the N-th element of the modified chain (appending
+ *    when the chain is shorter). Other negative positions only log an error.
+ *    Empty values are skipped.
+ *
+ * All header structures and strings created here live in task->task_pool.
+ */
+void rspamd_message_set_modified_header(struct rspamd_task *task,
+										struct rspamd_mime_headers_table *hdrs,
+										const gchar *hdr_name,
+										const ucl_object_t *obj,
+										struct rspamd_mime_header **order_ptr)
+{
+	khiter_t k;
+	khash_t(rspamd_mime_headers_htb) *htb = &hdrs->htb;
+	struct rspamd_mime_header *hdr_elt, *existing_chain;
+	int i;
+
+	if (htb) {
+		k = kh_get(rspamd_mime_headers_htb, htb, (gchar *) hdr_name);
+
+		if (k == kh_end(htb)) {
+			hdr_elt = rspamd_mempool_alloc0(task->task_pool, sizeof(*hdr_elt));
+
+			hdr_elt->flags |= RSPAMD_HEADER_MODIFIED | RSPAMD_HEADER_NON_EXISTING;
+			hdr_elt->name = rspamd_mempool_strdup(task->task_pool, hdr_name);
+
+			int r;
+			k = kh_put(rspamd_mime_headers_htb, htb, hdr_elt->name, &r);
+
+			kh_value(htb, k) = hdr_elt;
+
+			if (order_ptr) {
+				/*
+				 * This iterates over all headers in O(N), but we have no other options here, as the
+				 * list is already set.
+				 */
+				LL_APPEND2(*order_ptr, hdr_elt, ord_next);
+			}
+		}
+		else {
+			hdr_elt = kh_value(htb, k);
+		}
+	}
+	else {
+		/* No hash, no modification */
+		msg_err_task("internal error: calling for set_modified_header for no headers");
+		return;
+	}
+
+	if (hdr_elt->flags & RSPAMD_HEADER_MODIFIED) {
+		existing_chain = hdr_elt->modified_chain;
+	}
+	else {
+		existing_chain = hdr_elt;
+	}
+
+	const ucl_object_t *elt, *cur;
+	ucl_object_iter_t it;
+
+	/* First, deal with removed headers, copying the relevant headers with remove flag */
+	elt = ucl_object_lookup(obj, "remove");
+
+	/*
+	 * remove: {1, 2 ...}
+	 * where number is the header's position starting from '1'
+	 */
+	if (elt && ucl_object_type(elt) == UCL_ARRAY) {
+		/* First, use a temporary array to keep all headers */
+		GPtrArray *existing_ar = g_ptr_array_new();
+		struct rspamd_mime_header *cur_hdr;
+
+		/* Exclude removed headers */
+		LL_FOREACH(existing_chain, cur_hdr)
+		{
+			if (!(cur_hdr->flags & RSPAMD_HEADER_REMOVED)) {
+				g_ptr_array_add(existing_ar, cur_hdr);
+			}
+		}
+
+		it = NULL;
+
+		while ((cur = ucl_object_iterate(elt, &it, true)) != NULL) {
+			if (ucl_object_type(cur) == UCL_INT) {
+				int ord = ucl_object_toint(cur);
+
+				if (ord == 0) {
+					/* Remove all headers in the existing chain */
+					PTR_ARRAY_FOREACH(existing_ar, i, cur_hdr)
+					{
+						cur_hdr->flags |= RSPAMD_HEADER_MODIFIED | RSPAMD_HEADER_REMOVED;
+					}
+				}
+				else if (ord > 0) {
+					/* Start from the top */
+
+					if (ord <= existing_ar->len) {
+						cur_hdr = g_ptr_array_index(existing_ar, ord - 1);
+						cur_hdr->flags |= RSPAMD_HEADER_MODIFIED | RSPAMD_HEADER_REMOVED;
+					}
+				}
+				else {
+					/* Start from the bottom; ord < 0 */
+					if ((-ord) <= existing_ar->len) {
+						cur_hdr = g_ptr_array_index(existing_ar, existing_ar->len + ord);
+						cur_hdr->flags |= RSPAMD_HEADER_MODIFIED | RSPAMD_HEADER_REMOVED;
+					}
+				}
+			}
+		}
+
+		/*
+		 * Next, we return all headers modified to the existing chain
+		 * This implies an additional copy of all structures but is safe enough to
+		 * deal with it
+		 */
+		hdr_elt->flags |= RSPAMD_HEADER_MODIFIED;
+		hdr_elt->modified_chain = NULL;
+
+		PTR_ARRAY_FOREACH(existing_ar, i, cur_hdr)
+		{
+			if (!(cur_hdr->flags & RSPAMD_HEADER_REMOVED)) {
+				struct rspamd_mime_header *nhdr = rspamd_mempool_alloc(
+					task->task_pool, sizeof(*nhdr));
+				memcpy(nhdr, cur_hdr, sizeof(*nhdr));
+				nhdr->modified_chain = NULL;
+				nhdr->prev = NULL;
+				nhdr->next = NULL;
+				nhdr->ord_next = NULL;
+
+				DL_APPEND(hdr_elt->modified_chain, nhdr);
+			}
+		}
+
+		g_ptr_array_free(existing_ar, TRUE);
+
+		/* End of headers removal logic */
+	}
+
+	/* We can now deal with headers additions */
+	elt = ucl_object_lookup(obj, "add");
+	if (elt && ucl_object_type(elt) == UCL_ARRAY) {
+		if (!(hdr_elt->flags & RSPAMD_HEADER_MODIFIED)) {
+			/* Copy the header itself to the modified chain */
+			struct rspamd_mime_header *nhdr;
+			hdr_elt->flags |= RSPAMD_HEADER_MODIFIED;
+			nhdr = rspamd_mempool_alloc(
+				task->task_pool, sizeof(*nhdr));
+			memcpy(nhdr, hdr_elt, sizeof(*hdr_elt));
+			nhdr->modified_chain = NULL;
+			nhdr->next = NULL;
+			nhdr->ord_next = NULL;
+			/* Single-element DL list: the head's prev points to the tail, i.e. itself */
+			nhdr->prev = nhdr;
+			hdr_elt->modified_chain = nhdr;
+		}
+
+		/*
+		 * add: {{1, "foo"}, {-1, "bar"} ...}
+		 * where number is the header's position starting from '1'
+		 */
+		it = NULL;
+
+		while ((cur = ucl_object_iterate(elt, &it, true)) != NULL) {
+			if (ucl_object_type(cur) == UCL_ARRAY) {
+				const ucl_object_t *order = ucl_array_find_index(cur, 0),
+								   *value = ucl_array_find_index(cur, 1);
+
+				if (order && value &&
+					(ucl_object_type(order) == UCL_INT &&
+					 ucl_object_type(value) == UCL_STRING)) {
+					int ord = ucl_object_toint(order);
+					const char *raw_value;
+					gsize raw_len;
+
+					raw_value = ucl_object_tolstring(value, &raw_len);
+
+					if (raw_len == 0) {
+						continue;
+					}
+
+					struct rspamd_mime_header *nhdr = rspamd_mempool_alloc0(
+						task->task_pool, sizeof(*nhdr));
+
+					nhdr->flags |= RSPAMD_HEADER_ADDED;
+					nhdr->name = hdr_elt->name;
+					nhdr->value = rspamd_mempool_alloc(task->task_pool,
+													   raw_len + 1);
+					/* Strlcpy will ensure that value will have no embedded \0 */
+					rspamd_strlcpy(nhdr->value, raw_value, raw_len + 1);
+					gsize value_len = rspamd_message_header_unfold_inplace(nhdr->value, raw_len);
+					nhdr->value[value_len] = '\0';
+
+					/* Deal with the raw value */
+					size_t namelen = strlen(hdr_elt->name);
+					/* Room for "Name", ": ", the value, up to two newline bytes and NUL */
+					char *rawbuf = rspamd_mempool_alloc(task->task_pool, namelen +
+																			 raw_len +
+																			 sizeof(": \r\n"));
+					/* Name: value<newline> */
+					nhdr->raw_value = rawbuf;
+					memcpy(rawbuf, hdr_elt->name, namelen);
+					rawbuf += namelen;
+					memcpy(rawbuf, ": ", sizeof(": ") - 1);
+					nhdr->separator = rspamd_mempool_strdup(task->task_pool, " ");
+					rawbuf += sizeof(": ") - 1;
+					memcpy(rawbuf, raw_value, raw_len);
+					nhdr->raw_len = raw_len;
+
+					/* Terminate the raw line with the newline style of the message */
+					if (MESSAGE_FIELD(task, nlines_type) == RSPAMD_TASK_NEWLINES_LF) {
+						rawbuf[raw_len++] = '\n';
+					}
+					else {
+						rawbuf[raw_len++] = '\r';
+
+						if (MESSAGE_FIELD(task, nlines_type) == RSPAMD_TASK_NEWLINES_CRLF) {
+							rawbuf[raw_len++] = '\n';
+						}
+					}
+
+					rawbuf[raw_len] = '\0';
+
+					nhdr->decoded = rspamd_mime_header_decode(task->task_pool,
+															  raw_value, nhdr->raw_len,
+															  NULL);
+
+					/* Now find a position to insert a value */
+					struct rspamd_mime_header **pos = &hdr_elt->modified_chain;
+
+					if (ord == 0) {
+						DL_PREPEND(hdr_elt->modified_chain, nhdr);
+					}
+					else if (ord == -1) {
+						DL_APPEND(hdr_elt->modified_chain, nhdr);
+					}
+					else if (ord > 0) {
+						while (ord > 0 && (*pos)) {
+							ord--;
+							pos = &((*pos)->next);
+						}
+						if (*pos) {
+							/* pos is &(elt)->next */
+							nhdr->next = (*pos);
+							nhdr->prev = (*pos)->prev;
+							(*pos)->prev = nhdr;
+							*pos = nhdr;
+						}
+						else {
+							/*
+							 * We ran past the tail. Append through the real list
+							 * head: DL_APPEND on `*pos` (the NULL `next` slot of
+							 * the last element) would treat it as an empty list,
+							 * leaving nhdr->prev pointing to itself and the
+							 * head's tail pointer stale, so a later append would
+							 * drop this element.
+							 */
+							DL_APPEND(hdr_elt->modified_chain, nhdr);
+						}
+					}
+					else {
+						/* NYI: negative order is not defined */
+						msg_err_task("internal error: calling for set_modified_header "
+									 "with negative add order header");
+					}
+				}
+				else {
+					msg_err_task("internal error: calling for set_modified_header "
+								 "with invalid header");
+				}
+			}
+		}
+	}
+}
+
+/*
+ * Removes parenthesised comments from `input` in place and returns the new
+ * length (the result is not NUL-terminated here).
+ *
+ *  - Text between a '(' and its matching ')' is dropped; comments may nest,
+ *    and the nesting depth is tracked with a single counter.
+ *  - Inside a comment a backslash hides the next character, so "\)" does not
+ *    close the comment and "\(" does not open a nested one.
+ *  - Outside comments a backslash is dropped and the next character is copied
+ *    verbatim (so "\(" yields a literal '(').
+ *  - An unterminated comment swallows the rest of the input.
+ */
+gsize rspamd_strip_smtp_comments_inplace(gchar *input, gsize len)
+{
+	enum parser_state {
+		parse_normal,
+		parse_comment,
+		parse_quoted_copy,
+		parse_quoted_ignore,
+	} state = parse_normal;
+	gchar *d = input, *end = input + len, *start = input;
+	gchar t;
+	gsize depth = 0; /* number of currently open '(' */
+
+	while (input < end) {
+		t = *input++;
+
+		switch (state) {
+		case parse_normal:
+			if (t == '(') {
+				depth = 1;
+				state = parse_comment;
+			}
+			else if (t == '\\') {
+				state = parse_quoted_copy;
+			}
+			else {
+				*d++ = t;
+			}
+			break;
+		case parse_comment:
+			if (t == '(') {
+				depth++;
+			}
+			else if (t == ')') {
+				depth--;
+
+				if (depth == 0) {
+					state = parse_normal;
+				}
+			}
+			else if (t == '\\') {
+				state = parse_quoted_ignore;
+			}
+			break;
+		case parse_quoted_copy:
+			/* Escaped character outside a comment: keep it literally */
+			*d++ = t;
+			state = parse_normal;
+			break;
+		case parse_quoted_ignore:
+			/* Escaped character inside a comment: skip it */
+			state = parse_comment;
+			break;
+		}
+	}
+
+	return (d - start);
+} \ No newline at end of file
diff --git a/src/libmime/mime_headers.h b/src/libmime/mime_headers.h
new file mode 100644
index 0000000..60015a2
--- /dev/null
+++ b/src/libmime/mime_headers.h
@@ -0,0 +1,200 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_LIBMIME_MIME_HEADERS_H_
+#define SRC_LIBMIME_MIME_HEADERS_H_
+
+#include "config.h"
+#include "libutil/mem_pool.h"
+#include "libutil/addr.h"
+#include "khash.h"
+#include "contrib/libucl/ucl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct rspamd_task;
+
+/* Encoding kinds for RFC 2047 handling: quoted-printable and base64 */
+enum rspamd_rfc2047_encoding {
+ RSPAMD_RFC2047_QP = 0,
+ RSPAMD_RFC2047_BASE64,
+};
+
+/*
+ * Bit flags kept in `struct rspamd_mime_header::flags`.
+ * Note that bit 1 (1u << 1u) is not assigned to any flag.
+ */
+enum rspamd_mime_header_flags {
+ RSPAMD_HEADER_GENERIC = 0u,
+ RSPAMD_HEADER_RECEIVED = 1u << 0u,
+ RSPAMD_HEADER_TO = 1u << 2u,
+ RSPAMD_HEADER_CC = 1u << 3u,
+ RSPAMD_HEADER_BCC = 1u << 4u,
+ RSPAMD_HEADER_FROM = 1u << 5u,
+ RSPAMD_HEADER_MESSAGE_ID = 1u << 6u,
+ RSPAMD_HEADER_SUBJECT = 1u << 7u,
+ RSPAMD_HEADER_RETURN_PATH = 1u << 8u,
+ RSPAMD_HEADER_DELIVERED_TO = 1u << 9u,
+ RSPAMD_HEADER_SENDER = 1u << 10u,
+ RSPAMD_HEADER_RCPT = 1u << 11u,
+ RSPAMD_HEADER_UNIQUE = 1u << 12u,
+ RSPAMD_HEADER_EMPTY_SEPARATOR = 1u << 13u,
+ RSPAMD_HEADER_TAB_SEPARATED = 1u << 14u,
+ RSPAMD_HEADER_MODIFIED = 1u << 15u, /* Means we need to check modified chain */
+ RSPAMD_HEADER_ADDED = 1u << 16u, /* A header has been artificially added */
+ RSPAMD_HEADER_REMOVED = 1u << 17u, /* A header has been artificially removed */
+ RSPAMD_HEADER_NON_EXISTING = 1u << 18u, /* Header was not in the original message */
+};
+
+/*
+ * A single header occurrence.
+ * Occurrences sharing a name are linked through `prev`/`next`; every header
+ * is additionally linked through `ord_next`. When RSPAMD_HEADER_MODIFIED is
+ * set in `flags`, `modified_chain` holds the headers to use instead.
+ */
+struct rspamd_mime_header {
+ const gchar *raw_value; /* As it is in the message (unfolded and unparsed) */
+ gsize raw_len;
+ guint order;
+ int flags; /* see enum rspamd_mime_header_flags */
+ /* These are zero terminated (historically) */
+ gchar *name; /* Also used for key */
+ gchar *value;
+ gchar *separator;
+ gchar *decoded;
+ struct rspamd_mime_header *modified_chain; /* Headers modified during transform */
+ struct rspamd_mime_header *prev, *next; /* Headers with the same name */
+ struct rspamd_mime_header *ord_next; /* Overall order of headers, slist */
+};
+
+struct rspamd_mime_headers_table;
+
+/**
+ * Process headers and store them in `target`
+ * @param task
+ * @param target headers table that receives the parsed headers
+ * @param order_ptr NOTE(review): presumably the head of the overall header
+ *                  order list (linked through `ord_next`) - confirm against
+ *                  the implementation
+ * @param in input buffer
+ * @param len length of the input buffer
+ * @param check_newlines
+ */
+void rspamd_mime_headers_process(struct rspamd_task *task,
+ struct rspamd_mime_headers_table *target,
+ struct rspamd_mime_header **order_ptr,
+ const gchar *in, gsize len,
+ gboolean check_newlines);
+
+/**
+ * Perform rfc2047 decoding of a header
+ * @param pool memory pool passed by the caller
+ * @param in input to decode
+ * @param inlen length of the input
+ * @param invalid_utf optional output flag; callers may pass NULL.
+ *                    NOTE(review): presumably set when the input contains
+ *                    invalid UTF-8 - confirm against the implementation
+ * @return decoded string
+ */
+gchar *rspamd_mime_header_decode(rspamd_mempool_t *pool, const gchar *in,
+ gsize inlen, gboolean *invalid_utf);
+
+/**
+ * Encode mime header if needed
+ * @param in header text to encode
+ * @param len length of the input
+ * @return newly allocated encoded header
+ *         (NOTE(review): the allocator, and so the matching free function,
+ *         is not visible from this header - confirm before freeing)
+ */
+gchar *rspamd_mime_header_encode(const gchar *in, gsize len);
+
+/**
+ * Generate new unique message id
+ * @param fqdn domain name to use for the id
+ *             (NOTE(review): presumably the right-hand side of the id - confirm)
+ * @return generated message id string
+ */
+gchar *rspamd_mime_message_id_generate(const gchar *fqdn);
+
+/**
+ * Get the headers with the specified name from the raw headers of the task's
+ * message. This is a wrapper over rspamd_message_get_header_from_hash().
+ * @param task worker task structure
+ * @param field header's name
+ * @param need_modified passed through to rspamd_message_get_header_from_hash()
+ * @return first header with that name (further ones are linked through
+ *         `next`) or NULL. It is NOT permitted to free the headers or values.
+ */
+struct rspamd_mime_header *
+rspamd_message_get_header_array(struct rspamd_task *task,
+ const gchar *field,
+ gboolean need_modified);
+
+/**
+ * Get the headers with the specified name from a headers table
+ * @param hdrs headers table indexed by header name
+ * @param field header's name
+ * @param need_modified NOTE(review): presumably selects the modified chain of
+ *                      a header flagged RSPAMD_HEADER_MODIFIED - confirm
+ *                      against the implementation
+ * @return first header with that name (further ones are linked through
+ *         `next`) or NULL. It is NOT permitted to free the headers or values.
+ */
+struct rspamd_mime_header *
+rspamd_message_get_header_from_hash(struct rspamd_mime_headers_table *hdrs,
+ const gchar *field,
+ gboolean need_modified);
+
+/**
+ * Modifies a header (or insert one if not found)
+ * @param task task whose memory pool is used for all new headers and strings
+ * @param hdrs headers table to modify
+ * @param hdr_name name of the header to modify
+ * @param obj object with optional "remove" (array of integer positions) and
+ *            "add" (array of {position, "value"} pairs) members
+ * @param order_ptr optional head of the ordered headers list; a header that
+ *                  did not exist before is appended to it
+ *
+ */
+void rspamd_message_set_modified_header(struct rspamd_task *task,
+ struct rspamd_mime_headers_table *hdrs,
+ const gchar *hdr_name,
+ const ucl_object_t *obj,
+ struct rspamd_mime_header **order_ptr);
+
+/**
+ * Drops one reference on the headers table; the table's destructor frees
+ * its hash storage and the table itself
+ * @param hdrs headers table
+ */
+void rspamd_message_headers_unref(struct rspamd_mime_headers_table *hdrs);
+
+/**
+ * Takes one more reference on the headers table
+ * @param hdrs headers table
+ * @return the same `hdrs` pointer
+ */
+struct rspamd_mime_headers_table *rspamd_message_headers_ref(struct rspamd_mime_headers_table *hdrs);
+
+/**
+ * Init headers hash
+ * @return newly allocated, empty headers table holding one reference;
+ *         release it with rspamd_message_headers_unref()
+ */
+struct rspamd_mime_headers_table *rspamd_message_headers_new(void);
+
+/**
+ * Returns size for a headers table
+ * @param hdrs headers table, may be NULL
+ * @return number of entries in the table's hash, 0 for a NULL table
+ */
+gsize rspamd_mime_headers_count(struct rspamd_mime_headers_table *hdrs);
+
+/*
+ * Callback for rspamd_mime_headers_foreach(): receives the header name, the
+ * header stored under that name and the user data; returning false stops
+ * the traversal.
+ */
+typedef bool(rspamd_hdr_traverse_func_t)(const gchar *, const struct rspamd_mime_header *, void *);
+/**
+ * Traverse all headers in a table
+ * @param func callback invoked for every entry
+ * @param ud opaque user data passed to `func`
+ * @return false if `func` stopped the traversal, true otherwise
+ */
+bool rspamd_mime_headers_foreach(const struct rspamd_mime_headers_table *,
+ rspamd_hdr_traverse_func_t func, void *ud);
+
+/**
+ * Strip rfc822 comments (parenthesised sequences) from a string in place.
+ * Outside comments a backslash is dropped and the character after it is
+ * copied verbatim; inside comments a backslash hides the next character.
+ * The result is not NUL-terminated by this function.
+ * @param input input
+ * @param len length of the input
+ * @return new length of the input
+ */
+gsize rspamd_strip_smtp_comments_inplace(gchar *input, gsize len);
+
+/**
+ * Unfold header in place: a line break (CR, LF or CRLF) followed by
+ * whitespace is replaced, together with that whitespace run, by one space.
+ * Line breaks at the very end of the input are dropped.
+ * The result is not NUL-terminated by this function.
+ * @param hdr header value
+ * @param len length of the header
+ * @return new unfolded length
+ */
+gsize rspamd_message_header_unfold_inplace(char *hdr, gsize len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SRC_LIBMIME_MIME_HEADERS_H_ */
diff --git a/src/libmime/mime_parser.c b/src/libmime/mime_parser.c
new file mode 100644
index 0000000..217f0b8
--- /dev/null
+++ b/src/libmime/mime_parser.c
@@ -0,0 +1,1758 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "config.h"
+#include "task.h"
+#include "mime_parser.h"
+#include "mime_headers.h"
+#include "message.h"
+#include "multipattern.h"
+#include "contrib/libottery/ottery.h"
+#include "contrib/uthash/utlist.h"
+#include <openssl/cms.h>
+#include <openssl/pkcs7.h>
+#include "contrib/fastutf8/fastutf8.h"
+
+/*
+ * Library-wide state of the MIME parser, allocated and filled by
+ * rspamd_mime_parser_init_lib().
+ */
+struct rspamd_mime_parser_lib_ctx {
+ /* Compiled multipattern holding the "\r--" and "\n--" patterns */
+ struct rspamd_multipattern *mp_boundary;
+ /* Filled with random bytes at init time */
+ guchar hkey[rspamd_cryptobox_SIPKEYBYTES]; /* Key for hashing */
+ /* NOTE(review): presumably counts uses of hkey against max_key_usages - confirm */
+ guint key_usages;
+};
+
+/* Shared parser state; NULL until rspamd_mime_parser_init_lib() allocates it */
+struct rspamd_mime_parser_lib_ctx *lib_ctx = NULL;
+
+/* NOTE(review): presumably the limit for rspamd_mime_parser_ctx::nesting - confirm */
+static const guint max_nested = 64;
+/* NOTE(review): presumably the limit for lib_ctx->key_usages before the key is renewed - confirm */
+static const guint max_key_usages = 10000;
+
+/*
+ * Debug logging for the "mime" log module.
+ * The macro expands to code that uses a variable named `task`, so it can
+ * only be used where such a variable is in scope.
+ */
+#define msg_debug_mime(...) rspamd_conditional_debug_fast(NULL, task->from_addr, \
+ rspamd_mime_log_id, "mime", task->task_pool->tag.uid, \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+
+INIT_LOG_MODULE(mime)
+
+/* Flag bit for rspamd_mime_boundary::flags and the matching test macro */
+#define RSPAMD_MIME_BOUNDARY_FLAG_CLOSED (1 << 0)
+#define RSPAMD_BOUNDARY_IS_CLOSED(b) ((b)->flags & RSPAMD_MIME_BOUNDARY_FLAG_CLOSED)
+
+/*
+ * One boundary candidate found in a message.
+ * NOTE(review): `boundary` and `start` look like offsets into the message
+ * and `hash`/`closed_hash` like hashes of the boundary text in its open and
+ * closing forms - confirm against the code that fills this structure.
+ */
+struct rspamd_mime_boundary {
+ goffset boundary;
+ goffset start;
+ guint64 hash;
+ guint64 closed_hash;
+ gint flags; /* RSPAMD_MIME_BOUNDARY_FLAG_* */
+};
+
+/*
+ * State of one parsing run; passed as `st` to the parsing routines.
+ */
+struct rspamd_mime_parser_ctx {
+ GPtrArray *stack; /* Stack of parts */
+ GArray *boundaries; /* Boundaries found in the whole message */
+ /* NOTE(review): presumably the start, current position and end of the parsed buffer - confirm */
+ const gchar *start;
+ const gchar *pos;
+ const gchar *end;
+ struct rspamd_task *task;
+ /* NOTE(review): presumably the current nesting depth, see max_nested - confirm */
+ guint nesting;
+};
+
+/*
+ * Forward declarations of the static parsing routines.
+ * Each of them takes the task, the part being processed and the parser
+ * state, and reports failures through the returned
+ * enum rspamd_mime_parse_error and the `err` output argument.
+ */
+static enum rspamd_mime_parse_error
+rspamd_mime_parse_multipart_part(struct rspamd_task *task,
+ struct rspamd_mime_part *part,
+ struct rspamd_mime_parser_ctx *st,
+ GError **err);
+static enum rspamd_mime_parse_error
+rspamd_mime_parse_message(struct rspamd_task *task,
+ struct rspamd_mime_part *part,
+ struct rspamd_mime_parser_ctx *st,
+ GError **err);
+static enum rspamd_mime_parse_error
+rspamd_mime_parse_normal_part(struct rspamd_task *task,
+ struct rspamd_mime_part *part,
+ struct rspamd_mime_parser_ctx *st,
+ struct rspamd_content_type *ct,
+ GError **err);
+
+static enum rspamd_mime_parse_error
+rspamd_mime_process_multipart_node(struct rspamd_task *task,
+ struct rspamd_mime_parser_ctx *st,
+ struct rspamd_mime_part *multipart,
+ const gchar *start, const gchar *end,
+ gboolean is_finished,
+ GError **err);
+
+
+/* Error domain of the MIME parser, for use with GError */
+#define RSPAMD_MIME_QUARK (rspamd_mime_parser_quark())
+
+/* Returns the quark registered for the static string "mime-parser" */
+static GQuark
+rspamd_mime_parser_quark(void)
+{
+	const GQuark q = g_quark_from_static_string("mime-parser");
+
+	return q;
+}
+
+/*
+ * Maps a content transfer encoding value to its textual name.
+ * Values without a dedicated name yield "unknown".
+ */
+const gchar *
+rspamd_cte_to_string(enum rspamd_cte ct)
+{
+	switch (ct) {
+	case RSPAMD_CTE_7BIT:
+		return "7bit";
+	case RSPAMD_CTE_8BIT:
+		return "8bit";
+	case RSPAMD_CTE_QP:
+		return "quoted-printable";
+	case RSPAMD_CTE_B64:
+		return "base64";
+	case RSPAMD_CTE_UUE:
+		return "X-uuencode";
+	default:
+		break;
+	}
+
+	return "unknown";
+}
+
+/*
+ * Maps a textual name to a content transfer encoding value.
+ * The comparison is exact (case sensitive); "X-uuencode", "uuencode" and
+ * "X-uue" all map to RSPAMD_CTE_UUE. Unrecognised names yield
+ * RSPAMD_CTE_UNKNOWN. `str` must not be NULL (asserted).
+ */
+enum rspamd_cte
+rspamd_cte_from_string(const gchar *str)
+{
+	static const struct {
+		const gchar *name;
+		enum rspamd_cte cte;
+	} known[] = {
+		{"7bit", RSPAMD_CTE_7BIT},
+		{"8bit", RSPAMD_CTE_8BIT},
+		{"quoted-printable", RSPAMD_CTE_QP},
+		{"base64", RSPAMD_CTE_B64},
+		{"X-uuencode", RSPAMD_CTE_UUE},
+		{"uuencode", RSPAMD_CTE_UUE},
+		{"X-uue", RSPAMD_CTE_UUE},
+	};
+	guint idx;
+
+	g_assert(str != NULL);
+
+	for (idx = 0; idx < sizeof(known) / sizeof(known[0]); idx++) {
+		if (strcmp(str, known[idx].name) == 0) {
+			return known[idx].cte;
+		}
+	}
+
+	return RSPAMD_CTE_UNKNOWN;
+}
+
+/*
+ * Allocates the library-wide parser context: builds and compiles the
+ * multipattern with the "\r--" and "\n--" patterns and fills the hashing
+ * key with random bytes. A compilation failure is logged and aborts the
+ * process.
+ */
+static void
+rspamd_mime_parser_init_lib(void)
+{
+	GError *compile_err = NULL;
+
+	lib_ctx = g_malloc0(sizeof(*lib_ctx));
+
+	lib_ctx->mp_boundary = rspamd_multipattern_create(RSPAMD_MULTIPATTERN_DEFAULT);
+	g_assert(lib_ctx->mp_boundary != NULL);
+
+	rspamd_multipattern_add_pattern(lib_ctx->mp_boundary, "\r--", 0);
+	rspamd_multipattern_add_pattern(lib_ctx->mp_boundary, "\n--", 0);
+
+	if (!rspamd_multipattern_compile(lib_ctx->mp_boundary, &compile_err)) {
+		msg_err("fatal error: cannot compile multipattern for mime parser boundaries: %e", compile_err);
+		g_error_free(compile_err);
+		g_abort();
+	}
+
+	ottery_rand_bytes(lib_ctx->hkey, sizeof(lib_ctx->hkey));
+}
+
+/*
+ * Recognises a content transfer encoding name.
+ * The input is stripped of the characters " \t;,.+-#!`~'" with
+ * rspamd_string_len_strip(), hashed, and the hash is compared with the
+ * precomputed hashes of the known names. The caller in this file
+ * lower-cases the input beforehand.
+ */
+static enum rspamd_cte
+rspamd_mime_parse_cte(const gchar *in, gsize len)
+{
+	guint64 digest;
+
+	in = rspamd_string_len_strip(in, &len, " \t;,.+-#!`~'");
+	digest = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64,
+												 in, len, 0xdeadbabe);
+
+	switch (digest) {
+	case 0xCEDAA7056B4753F7ULL: /* 7bit */
+		return RSPAMD_CTE_7BIT;
+	case 0x42E0745448B39FC1ULL: /* 8bit */
+	case 0x6B169E6B155BADC0ULL: /* binary */
+		return RSPAMD_CTE_8BIT;
+	case 0x6D69A5BB02A633B0ULL: /* quoted-printable */
+		return RSPAMD_CTE_QP;
+	case 0x96305588A76DC9A9ULL: /* base64 */
+	case 0x171029DE1B0423A9ULL: /* base-64 */
+		return RSPAMD_CTE_B64;
+	case 0x420b54dc00d13cecULL: /* uuencode */
+	case 0x8df6700b8f6c4cf9ULL: /* x-uuencode */
+	case 0x41f725ec544356d3ULL: /* x-uue */
+		return RSPAMD_CTE_UUE;
+	default:
+		return RSPAMD_CTE_UNKNOWN;
+	}
+}
+
+/*
+ * Guesses the content transfer encoding of a part from its raw data.
+ *
+ *  1. Data starting (after leading whitespace) with "begin " or
+ *     "begin-base64 " followed by a digit is reported as RSPAMD_CTE_UUE.
+ *  2. Otherwise trailing whitespace and up to two trailing '=' are set
+ *     aside and at most the first 128 bytes are classified: spaces, '='
+ *     signs, "=XX" hex escapes, bytes >= 0x80, upper/lower case letters and
+ *     characters outside the base64 alphabet are counted.
+ *  3. Only base64 characters and no spaces: base64 or 7bit, depending on
+ *     padding, length and the mix of upper and lower case letters.
+ *     No 8bit bytes: quoted-printable if there are more than two '=' and
+ *     more than two hex escapes, 7bit otherwise. Anything else: 8bit.
+ */
+static enum rspamd_cte
+rspamd_mime_part_get_cte_heuristic(struct rspamd_task *task,
+								   struct rspamd_mime_part *part)
+{
+	const guint check_len = 128;
+	guint real_len, nspaces = 0, neqsign = 0, n8bit = 0, nqpencoded = 0,
+					padeqsign = 0, nupper = 0, nlower = 0;
+	gboolean b64_chars = TRUE;
+	const guchar *p, *end;
+	enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
+
+	real_len = MIN(check_len, part->raw_data.len);
+	p = (const guchar *) part->raw_data.begin;
+	end = p + part->raw_data.len;
+
+	while (p < end && g_ascii_isspace(*p)) {
+		p++;
+	}
+
+	if (end - p > sizeof("begin-base64 ")) {
+		gsize prefix_len = 0;
+
+		if (memcmp(p, "begin ", sizeof("begin ") - 1) == 0) {
+			prefix_len = sizeof("begin ") - 1;
+		}
+		else if (memcmp(p, "begin-base64 ", sizeof("begin-base64 ") - 1) == 0) {
+			/*
+			 * Skip the whole "begin-base64 " prefix: skipping only the
+			 * length of "begin " would leave us on "base64 ..." and the
+			 * mode digit would never be found.
+			 */
+			prefix_len = sizeof("begin-base64 ") - 1;
+		}
+
+		if (prefix_len > 0) {
+			const guchar *uue_start = p + prefix_len;
+
+			while (uue_start < end && g_ascii_isspace(*uue_start)) {
+				uue_start++;
+			}
+
+			/* The prefix must be followed by the file mode digits */
+			if (uue_start < end && g_ascii_isdigit(*uue_start)) {
+				return RSPAMD_CTE_UUE;
+			}
+		}
+	}
+
+	/* Skip trailing spaces */
+	while (end > p && g_ascii_isspace(*(end - 1))) {
+		end--;
+	}
+
+	if (end > p + 2) {
+		if (*(end - 1) == '=') {
+			padeqsign++;
+			end--;
+		}
+
+		if (*(end - 1) == '=') {
+			padeqsign++;
+			end--;
+		}
+	}
+
+	/* Adjust end to analyse only first characters */
+	if (end - p > real_len) {
+		end = p + real_len;
+	}
+
+	while (p < end) {
+		if (*p == ' ') {
+			nspaces++;
+		}
+		else if (*p == '=') {
+			b64_chars = FALSE; /* Eqsign must not be inside base64 */
+			neqsign++;
+			p++;
+
+			if (p + 2 < end && g_ascii_isxdigit(*p) && g_ascii_isxdigit(*(p + 1))) {
+				p++;
+				nqpencoded++;
+			}
+
+			continue;
+		}
+		else if (*p >= 0x80) {
+			n8bit++;
+			b64_chars = FALSE;
+		}
+		else if (!(g_ascii_isalnum(*p) || *p == '/' || *p == '+')) {
+			b64_chars = FALSE;
+		}
+		else if (g_ascii_isupper(*p)) {
+			nupper++;
+		}
+		else if (g_ascii_islower(*p)) {
+			nlower++;
+		}
+
+		p++;
+	}
+
+	if (b64_chars && neqsign <= 2 && nspaces == 0) {
+		/* Need more thinking */
+
+		if (part->raw_data.len > 80) {
+			if (padeqsign > 0) {
+				ret = RSPAMD_CTE_B64;
+			}
+			else {
+				/* We have a large piece of data with no spaces and base64
+				 * symbols only, no padding is detected as well...
+				 *
+				 * There is a small chance that our first 128 characters
+				 * are either some garbage or it is a base64 with no padding
+				 * (e.g. when it is not needed)
+				 */
+				if (nupper > 1 && nlower > 1) {
+					/*
+					 * We have both uppercase and lowercase letters, so it can be
+					 * base64
+					 */
+					ret = RSPAMD_CTE_B64;
+				}
+				else {
+					ret = RSPAMD_CTE_7BIT;
+				}
+			}
+		}
+		else {
+
+			if (((end - (const guchar *) part->raw_data.begin) + padeqsign) % 4 == 0) {
+				if (padeqsign == 0) {
+					/*
+					 * It can be either base64 or plain text, hard to say
+					 * Let's assume that if we have > 1 uppercase it is
+					 * likely base64
+					 */
+					if (nupper > 1 && nlower > 1) {
+						ret = RSPAMD_CTE_B64;
+					}
+					else {
+						ret = RSPAMD_CTE_7BIT;
+					}
+				}
+				else {
+					ret = RSPAMD_CTE_B64;
+				}
+			}
+			else {
+				/* No way */
+				if (padeqsign == 1 || padeqsign == 2) {
+					ret = RSPAMD_CTE_B64;
+				}
+				else {
+					ret = RSPAMD_CTE_7BIT;
+				}
+			}
+		}
+	}
+	else if (n8bit == 0) {
+		if (neqsign > 2 && nqpencoded > 2) {
+			ret = RSPAMD_CTE_QP;
+		}
+		else {
+			ret = RSPAMD_CTE_7BIT;
+		}
+	}
+	else {
+		ret = RSPAMD_CTE_8BIT;
+	}
+
+	msg_debug_mime("detected cte: %s", rspamd_cte_to_string(ret));
+
+	return ret;
+}
+
+/*
+ * Determines part->cte from the Content-Transfer-Encoding header found in
+ * `hdrs`, from the parent part or, when `apply_heuristic` is set, from the
+ * part's raw data.
+ *
+ *  - No header: if the parent part has a known cte that was not itself
+ *    flagged as missing, it is inherited and control jumps to `check_cte`.
+ *    Otherwise the heuristic is applied (if allowed) and the part is
+ *    flagged RSPAMD_MIME_PART_MISSING_CTE.
+ *  - Header present: each value is lower-cased in a 128-byte buffer and
+ *    parsed; the first recognised value wins.
+ *  - check_cte (heuristic allowed): an unknown cte is replaced by the
+ *    heuristic result. A base64/quoted-printable cte is cross-checked:
+ *    if the heuristic says 8bit, the cte is overridden and the part is
+ *    flagged RSPAMD_MIME_PART_BAD_CTE; if it merely differs and the cte was
+ *    inherited from the parent, the heuristic result is used.
+ */
+static void
+rspamd_mime_part_get_cte(struct rspamd_task *task,
+ struct rspamd_mime_headers_table *hdrs,
+ struct rspamd_mime_part *part,
+ gboolean apply_heuristic)
+{
+ struct rspamd_mime_header *hdr, *cur;
+ enum rspamd_cte cte = RSPAMD_CTE_UNKNOWN;
+ gboolean parent_propagated = FALSE;
+
+ hdr = rspamd_message_get_header_from_hash(hdrs, "Content-Transfer-Encoding", FALSE);
+
+ if (hdr == NULL) {
+ if (part->parent_part && part->parent_part->cte != RSPAMD_CTE_UNKNOWN &&
+ !(part->parent_part->flags & RSPAMD_MIME_PART_MISSING_CTE)) {
+ part->cte = part->parent_part->cte;
+ parent_propagated = TRUE;
+
+ /* Jumps into the else-branch below to share the sanity checks */
+ goto check_cte;
+ }
+
+ if (apply_heuristic) {
+ part->cte = rspamd_mime_part_get_cte_heuristic(task, part);
+ msg_info_task("detected missing CTE for part as: %s",
+ rspamd_cte_to_string(part->cte));
+ }
+
+ part->flags |= RSPAMD_MIME_PART_MISSING_CTE;
+ }
+ else {
+ DL_FOREACH(hdr, cur)
+ {
+ gsize hlen;
+ gchar lc_buf[128];
+
+ hlen = rspamd_snprintf(lc_buf, sizeof(lc_buf), "%s", cur->value);
+ rspamd_str_lc(lc_buf, hlen);
+ cte = rspamd_mime_parse_cte(lc_buf, hlen);
+
+ if (cte != RSPAMD_CTE_UNKNOWN) {
+ part->cte = cte;
+ break;
+ }
+ }
+
+ check_cte:
+ if (apply_heuristic) {
+ if (part->cte == RSPAMD_CTE_UNKNOWN) {
+ part->cte = rspamd_mime_part_get_cte_heuristic(task, part);
+
+ msg_info_task("corrected bad CTE for part to: %s",
+ rspamd_cte_to_string(part->cte));
+ }
+ else if (part->cte == RSPAMD_CTE_B64 ||
+ part->cte == RSPAMD_CTE_QP) {
+ /* Additionally check sanity */
+ cte = rspamd_mime_part_get_cte_heuristic(task, part);
+
+ if (cte == RSPAMD_CTE_8BIT) {
+ msg_info_task(
+ "incorrect cte specified for part: %s, %s detected",
+ rspamd_cte_to_string(part->cte),
+ rspamd_cte_to_string(cte));
+ part->cte = cte;
+ part->flags |= RSPAMD_MIME_PART_BAD_CTE;
+ }
+ else if (cte != part->cte && parent_propagated) {
+ part->cte = cte;
+ msg_info_task("detected missing CTE for part as: %s",
+ rspamd_cte_to_string(part->cte));
+ }
+ }
+ else {
+ /*
+ * NOTE(review): on the parent-propagated path `cte` is still
+ * RSPAMD_CTE_UNKNOWN here, so "unknown" is logged rather than part->cte
+ */
+ msg_debug_mime("processed cte: %s",
+ rspamd_cte_to_string(cte));
+ }
+ }
+ else {
+ /* NOTE(review): same remark as above about the logged value */
+ msg_debug_mime("processed cte: %s", rspamd_cte_to_string(cte));
+ }
+ }
+}
+/*
+ * Fills `cd` from the Content-Type attributes of `part`: if the content
+ * type has a "name" attribute (or, failing that, a "filename" one), the
+ * disposition becomes an attachment and the attribute value becomes its
+ * file name. Does nothing when the part has no content type or attributes.
+ */
+static void
+rspamd_mime_part_cd_from_ct(struct rspamd_mime_part *part,
+							struct rspamd_content_disposition *cd)
+{
+	rspamd_ftok_t srch;
+	struct rspamd_content_type_param *found;
+
+	if (part->ct == NULL || part->ct->attrs == NULL) {
+		return;
+	}
+
+	RSPAMD_FTOK_ASSIGN(&srch, "name");
+	found = g_hash_table_lookup(part->ct->attrs, &srch);
+
+	if (!found) {
+		RSPAMD_FTOK_ASSIGN(&srch, "filename");
+		found = g_hash_table_lookup(part->ct->attrs, &srch);
+	}
+
+	if (found) {
+		cd->type = RSPAMD_CT_ATTACHMENT;
+		memcpy(&cd->filename, &found->value, sizeof(cd->filename));
+	}
+}
+
+/*
+ * Determines the content disposition of a part and stores it in part->cd.
+ *
+ *  - No Content-Disposition header: an inline disposition is synthesised
+ *    and completed from the Content-Type attributes.
+ *  - Otherwise the header values are tried in order; the first one that
+ *    parses is used (its file name is taken from the Content-Type
+ *    attributes when the header has none). For a value that does not parse,
+ *    an inline disposition is synthesised from the Content-Type if the part
+ *    has one, and the next value is tried.
+ *
+ * part->cd may end up NULL if no value parses and the part has no content
+ * type.
+ */
+static void
+rspamd_mime_part_get_cd(struct rspamd_task *task, struct rspamd_mime_part *part)
+{
+	struct rspamd_mime_header *hdr, *cur;
+	struct rspamd_content_disposition *cd = NULL;
+
+	hdr = rspamd_message_get_header_from_hash(part->raw_headers,
+											  "Content-Disposition", FALSE);
+
+	if (hdr == NULL) {
+		cd = rspamd_mempool_alloc0(task->task_pool, sizeof(*cd));
+		cd->type = RSPAMD_CT_INLINE;
+
+		/* We can also have content disposition definitions in Content-Type */
+		rspamd_mime_part_cd_from_ct(part, cd);
+	}
+	else {
+		DL_FOREACH(hdr, cur)
+		{
+			gsize hlen;
+			cd = NULL;
+
+			if (cur->value) {
+				hlen = strlen(cur->value);
+				cd = rspamd_content_disposition_parse(cur->value, hlen,
+													  task->task_pool);
+			}
+
+			if (cd) {
+				/* We still need to check filename */
+				if (cd->filename.len == 0) {
+					rspamd_mime_part_cd_from_ct(part, cd);
+				}
+
+				msg_debug_mime("processed content disposition: %s, file: \"%T\"",
+							   cd->lc_data, &cd->filename);
+				break;
+			}
+			else if (part->ct) {
+				/*
+				 * Even in case of malformed Content-Disposition, we can still
+				 * fall back to Content-Type
+				 */
+				cd = rspamd_mempool_alloc0(task->task_pool, sizeof(*cd));
+				cd->type = RSPAMD_CT_INLINE;
+
+				/* We can also have content disposition definitions in Content-Type */
+				rspamd_mime_part_cd_from_ct(part, cd);
+			}
+		}
+	}
+
+	part->cd = cd;
+}
+
+/*
+ * Computes part->digest over the parsed data of a part with
+ * rspamd_cryptobox_hash(), keyed with a fixed built-in key.
+ * Parts whose parsed data is empty are left untouched.
+ */
+void rspamd_mime_parser_calc_digest(struct rspamd_mime_part *part)
+{
+	/* Blake2b applied to string 'rspamd' */
+	static const guchar hash_key[] = {
+		0xef, 0x43, 0xae, 0x80, 0xcc, 0x8d, 0xc3, 0x4c,
+		0x6f, 0x1b, 0xd6, 0x18, 0x1b, 0xae, 0x87, 0x74,
+		0x0c, 0xca, 0xf7, 0x8e, 0x5f, 0x2e, 0x54, 0x32,
+		0xf6, 0x79, 0xb9, 0x27, 0x26, 0x96, 0x20, 0x92,
+		0x70, 0x07, 0x85, 0xeb, 0x83, 0xf7, 0x89, 0xe0,
+		0xd7, 0x32, 0x2a, 0xd2, 0x1a, 0x64, 0x41, 0xef,
+		0x49, 0xff, 0xc3, 0x8c, 0x54, 0xf9, 0x67, 0x74,
+		0x30, 0x1e, 0x70, 0x2e, 0xb7, 0x12, 0x09, 0xfe,
+	};
+
+	if (!(part->parsed_data.len > 0)) {
+		/* Nothing to hash */
+		return;
+	}
+
+	rspamd_cryptobox_hash(part->digest,
+						  part->parsed_data.begin, part->parsed_data.len,
+						  hash_key, sizeof(hash_key));
+}
+
+static enum rspamd_mime_parse_error
+rspamd_mime_parse_normal_part(struct rspamd_task *task,
+ struct rspamd_mime_part *part,
+ struct rspamd_mime_parser_ctx *st,
+ struct rspamd_content_type *ct,
+ GError **err)
+{
+ rspamd_fstring_t *parsed;
+ gssize r;
+
+ g_assert(part != NULL);
+
+ rspamd_mime_part_get_cte(task, part->raw_headers, part,
+ part->ct && !(part->ct->flags & RSPAMD_CONTENT_TYPE_MESSAGE));
+ rspamd_mime_part_get_cd(task, part);
+
+ switch (part->cte) {
+ case RSPAMD_CTE_7BIT:
+ case RSPAMD_CTE_8BIT:
+ case RSPAMD_CTE_UNKNOWN:
+ if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) {
+ if (part->cte != RSPAMD_CTE_7BIT) {
+ /* We have something that has a missing content-type,
+ * but it has non-7bit characters.
+ *
+ * In theory, it is very unsafe to process it as a text part
+ * as we unlikely get some sane result
+ */
+
+ /*
+ * On the other hand, there is an evidence that some
+ * emails actually rely on that.
+ * So we apply an expensive hack here:
+ * if there are no 8bit characters -OR- the content is valid
+ * UTF8, we can still imply Content-Type == text/plain
+ */
+
+ if (rspamd_str_has_8bit(part->raw_data.begin, part->raw_data.len) &&
+ !rspamd_fast_utf8_validate(part->raw_data.begin, part->raw_data.len)) {
+ part->ct->flags &= ~RSPAMD_CONTENT_TYPE_TEXT;
+ part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
+ }
+ }
+ }
+
+ if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) {
+ /* Need to copy text as we have couple of in-place change functions */
+ parsed = rspamd_fstring_sized_new(part->raw_data.len);
+ parsed->len = part->raw_data.len;
+ memcpy(parsed->str, part->raw_data.begin, parsed->len);
+ part->parsed_data.begin = parsed->str;
+ part->parsed_data.len = parsed->len;
+ rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
+ rspamd_mempool_add_destructor(task->task_pool,
+ (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
+ }
+ else {
+ part->parsed_data.begin = part->raw_data.begin;
+ part->parsed_data.len = part->raw_data.len;
+ }
+ break;
+ case RSPAMD_CTE_QP:
+ parsed = rspamd_fstring_sized_new(part->raw_data.len);
+ r = rspamd_decode_qp_buf(part->raw_data.begin, part->raw_data.len,
+ parsed->str, parsed->allocated);
+ if (r != -1) {
+ parsed->len = r;
+ part->parsed_data.begin = parsed->str;
+ part->parsed_data.len = parsed->len;
+ rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
+ rspamd_mempool_add_destructor(task->task_pool,
+ (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
+ }
+ else {
+ msg_err_task("invalid quoted-printable encoded part, assume 8bit");
+ if (part->ct) {
+ part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
+ }
+ part->cte = RSPAMD_CTE_8BIT;
+ memcpy(parsed->str, part->raw_data.begin, part->raw_data.len);
+ parsed->len = part->raw_data.len;
+ part->parsed_data.begin = parsed->str;
+ part->parsed_data.len = parsed->len;
+ rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
+ rspamd_mempool_add_destructor(task->task_pool,
+ (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
+ }
+ break;
+ case RSPAMD_CTE_B64:
+ parsed = rspamd_fstring_sized_new(part->raw_data.len / 4 * 3 + 12);
+ rspamd_cryptobox_base64_decode(part->raw_data.begin,
+ part->raw_data.len,
+ parsed->str, &parsed->len);
+ part->parsed_data.begin = parsed->str;
+ part->parsed_data.len = parsed->len;
+ rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
+ rspamd_mempool_add_destructor(task->task_pool,
+ (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
+ break;
+ case RSPAMD_CTE_UUE:
+ parsed = rspamd_fstring_sized_new(part->raw_data.len / 4 * 3 + 12);
+ r = rspamd_decode_uue_buf(part->raw_data.begin, part->raw_data.len,
+ parsed->str, parsed->allocated);
+ rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
+ rspamd_mempool_add_destructor(task->task_pool,
+ (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
+ if (r != -1) {
+ parsed->len = r;
+ part->parsed_data.begin = parsed->str;
+ part->parsed_data.len = parsed->len;
+ }
+ else {
+ msg_err_task("invalid uuencoding in encoded part, assume 8bit");
+ if (part->ct) {
+ part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
+ }
+ part->cte = RSPAMD_CTE_8BIT;
+ parsed->len = MIN(part->raw_data.len, parsed->allocated);
+ memcpy(parsed->str, part->raw_data.begin, parsed->len);
+ rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
+ part->parsed_data.begin = parsed->str;
+ part->parsed_data.len = parsed->len;
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ part->part_number = MESSAGE_FIELD(task, parts)->len;
+ part->urls = g_ptr_array_new();
+ g_ptr_array_add(MESSAGE_FIELD(task, parts), part);
+ msg_debug_mime("parsed data part %T/%T of length %z (%z orig), %s cte",
+ &part->ct->type, &part->ct->subtype, part->parsed_data.len,
+ part->raw_data.len, rspamd_cte_to_string(part->cte));
+ rspamd_mime_parser_calc_digest(part);
+
+ if (ct && (ct->flags & RSPAMD_CONTENT_TYPE_SMIME)) {
+ CMS_ContentInfo *cms;
+ const unsigned char *der_beg = part->parsed_data.begin;
+ cms = d2i_CMS_ContentInfo(NULL, &der_beg, part->parsed_data.len);
+
+ if (cms) {
+ const ASN1_OBJECT *asn_ct = CMS_get0_eContentType(cms);
+ int ct_nid = OBJ_obj2nid(asn_ct);
+
+ if (ct_nid == NID_pkcs7_data) {
+ BIO *bio = BIO_new_mem_buf(part->parsed_data.begin,
+ part->parsed_data.len);
+
+ PKCS7 *p7;
+ p7 = d2i_PKCS7_bio(bio, NULL);
+
+ if (p7) {
+ ct_nid = OBJ_obj2nid(p7->type);
+
+ if (ct_nid == NID_pkcs7_signed) {
+ PKCS7 *p7_signed_content = p7->d.sign->contents;
+
+ ct_nid = OBJ_obj2nid(p7_signed_content->type);
+
+ if (ct_nid == NID_pkcs7_data && p7_signed_content->d.data) {
+ int ret;
+
+ msg_debug_mime("found an additional part inside of "
+ "smime structure of type %T/%T; length=%d",
+ &ct->type, &ct->subtype, p7_signed_content->d.data->length);
+ /*
+ * Since ASN.1 structures are freed, we need to copy
+ * the content
+ */
+ gchar *cpy = rspamd_mempool_alloc(task->task_pool,
+ p7_signed_content->d.data->length);
+ memcpy(cpy, p7_signed_content->d.data->data,
+ p7_signed_content->d.data->length);
+ ret = rspamd_mime_process_multipart_node(task,
+ st, NULL,
+ cpy, cpy + p7_signed_content->d.data->length,
+ TRUE, err);
+
+ PKCS7_free(p7);
+ BIO_free(bio);
+ CMS_ContentInfo_free(cms);
+
+ return ret;
+ }
+ }
+
+ PKCS7_free(p7);
+ }
+
+ BIO_free(bio);
+ }
+
+ CMS_ContentInfo_free(cms);
+ }
+ }
+
+ return RSPAMD_MIME_PARSE_OK;
+}
+
+/*
+ * State shared between the boundary filter and the per-boundary callback
+ * while the children of a single multipart part are being split out.
+ */
+struct rspamd_mime_multipart_cbdata {
+ struct rspamd_task *task; /* task that owns the message being parsed */
+ struct rspamd_mime_part *multipart; /* multipart part whose children are being collected */
+ struct rspamd_mime_parser_ctx *st; /* parser context the boundaries are taken from */
+ const gchar *part_start; /* start of the current child's data; NULL until the first boundary is seen */
+ rspamd_ftok_t *cur_boundary; /* boundary in use; NULL means it still has to be guessed */
+ guint64 bhash; /* hash of the boundary in use, compared with the hashes of scanned boundaries */
+ GError **err; /* error output handed to nested parsing calls */
+};
+
+/*
+ * Parse the chunk of data located in [start, end) as a single MIME node.
+ *
+ * The chunk is split into headers and body, the Content-Type header (if any)
+ * selects the handler, and the new part is dispatched to the multipart,
+ * message or normal part parser. When no Content-Type can be parsed the part
+ * is labelled text/plain.
+ *
+ * @param task current task; the new part is allocated from its memory pool
+ * @param st parser context (nesting counter and stack of open containers)
+ * @param multipart parent multipart part or NULL if there is no parent
+ * @param start first byte of the chunk
+ * @param end one past the last byte of the chunk
+ * @param is_finished when FALSE, a chunk without headers that has no
+ *        alphanumeric characters is ignored as garbage
+ * @param err error output passed to the nested parsers
+ * @return RSPAMD_MIME_PARSE_NO_PART if the chunk has been ignored, otherwise
+ *         the result of the selected nested parser
+ */
+static enum rspamd_mime_parse_error
+rspamd_mime_process_multipart_node(struct rspamd_task *task,
+	struct rspamd_mime_parser_ctx *st,
+	struct rspamd_mime_part *multipart,
+	const gchar *start, const gchar *end,
+	gboolean is_finished,
+	GError **err)
+{
+	struct rspamd_content_type *ct, *sel = NULL;
+	struct rspamd_mime_header *hdr = NULL, *cur;
+	struct rspamd_mime_part *npart;
+	GString str;
+	goffset hdr_pos, body_pos;
+	enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_FATAL;
+
+	str.str = (gchar *) start;
+	str.len = end - start;
+
+	/*
+	 * The bounds are checked before *start is read: an empty chunk has no
+	 * byte to look at, so it is handled like a chunk without headers.
+	 */
+	if (start >= end || *start == '\n' || *start == '\r') {
+		/*
+		 * We have a part that starts from newline which means that
+		 * there are completely no headers in this part,
+		 * hence we assume it as a text part
+		 */
+		hdr_pos = 0;
+		body_pos = 0;
+
+		if (!is_finished) {
+			/* Ignore garbage */
+			const gchar *p = start;
+			gboolean seen_something = FALSE;
+
+			while (p < end) {
+				if (g_ascii_isalnum(*p)) {
+					seen_something = TRUE;
+					break;
+				}
+				p++;
+			}
+
+			if (!seen_something) {
+				return RSPAMD_MIME_PARSE_NO_PART;
+			}
+		}
+	}
+	else {
+		hdr_pos = rspamd_string_find_eoh(&str, &body_pos);
+	}
+
+	npart = rspamd_mempool_alloc0(task->task_pool,
+		sizeof(struct rspamd_mime_part));
+	npart->parent_part = multipart;
+	npart->raw_headers = rspamd_message_headers_new();
+	npart->headers_order = NULL;
+
+	if (multipart) {
+		/* Register the new part as a child of its container */
+		if (multipart->specific.mp->children == NULL) {
+			multipart->specific.mp->children = g_ptr_array_sized_new(2);
+		}
+
+		g_ptr_array_add(multipart->specific.mp->children, npart);
+	}
+
+	if (hdr_pos > 0 && hdr_pos < str.len) {
+		npart->raw_headers_str = str.str;
+		npart->raw_headers_len = hdr_pos;
+		npart->raw_data.begin = start + body_pos;
+		npart->raw_data.len = (end - start) - body_pos;
+
+		if (npart->raw_headers_len > 0) {
+			rspamd_mime_headers_process(task, npart->raw_headers,
+				&npart->headers_order,
+				npart->raw_headers_str,
+				npart->raw_headers_len,
+				FALSE);
+
+			/* Preserve the natural order */
+			if (npart->headers_order) {
+				LL_REVERSE2(npart->headers_order, ord_next);
+			}
+		}
+
+		hdr = rspamd_message_get_header_from_hash(npart->raw_headers,
+			"Content-Type", FALSE);
+	}
+	else {
+		/* No headers: the whole chunk is the body */
+		npart->raw_headers_str = 0;
+		npart->raw_headers_len = 0;
+		npart->raw_data.begin = start;
+		npart->raw_data.len = end - start;
+	}
+
+
+	if (hdr != NULL) {
+
+		DL_FOREACH(hdr, cur)
+		{
+			ct = rspamd_content_type_parse(cur->value, strlen(cur->value),
+				task->task_pool);
+
+			/* Here we prefer multipart content-type or any content-type */
+			if (ct) {
+				if (sel == NULL) {
+					sel = ct;
+				}
+				else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
+					sel = ct;
+				}
+			}
+		}
+	}
+
+	if (sel == NULL) {
+		/* Nothing usable has been found: label the part as text/plain */
+		sel = rspamd_mempool_alloc0(task->task_pool, sizeof(*sel));
+		RSPAMD_FTOK_ASSIGN(&sel->type, "text");
+		RSPAMD_FTOK_ASSIGN(&sel->subtype, "plain");
+	}
+
+	npart->ct = sel;
+
+	if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
+		st->nesting++;
+		g_ptr_array_add(st->stack, npart);
+		npart->part_type = RSPAMD_MIME_PART_MULTIPART;
+		npart->specific.mp = rspamd_mempool_alloc0(task->task_pool,
+			sizeof(struct rspamd_mime_multipart));
+		memcpy(&npart->specific.mp->boundary, &sel->orig_boundary,
+			sizeof(rspamd_ftok_t));
+		ret = rspamd_mime_parse_multipart_part(task, npart, st, err);
+	}
+	else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
+		st->nesting++;
+		g_ptr_array_add(st->stack, npart);
+		npart->part_type = RSPAMD_MIME_PART_MESSAGE;
+
+		/* Decode the part first, then parse the decoded data as a message */
+		if ((ret = rspamd_mime_parse_normal_part(task, npart, st, sel, err)) == RSPAMD_MIME_PARSE_OK) {
+			ret = rspamd_mime_parse_message(task, npart, st, err);
+		}
+	}
+	else {
+		ret = rspamd_mime_parse_normal_part(task, npart, st, sel, err);
+	}
+
+	return ret;
+}
+
+/*
+ * Handle one boundary that belongs to the multipart being parsed.
+ *
+ * The first boundary only marks where the first child starts. Every later
+ * boundary terminates the current child, which is parsed as a separate node,
+ * and then (if the boundary has data after it) starts the next child.
+ *
+ * Returns RSPAMD_MIME_PARSE_OK or the error reported for the child node.
+ */
+static enum rspamd_mime_parse_error
+rspamd_mime_parse_multipart_cb(struct rspamd_task *task,
+	struct rspamd_mime_part *multipart,
+	struct rspamd_mime_parser_ctx *st,
+	struct rspamd_mime_multipart_cbdata *cb,
+	struct rspamd_mime_boundary *b)
+{
+	const gchar *boundary_pos = st->start + b->boundary;
+	enum rspamd_mime_parse_error rc;
+
+	task = cb->task;
+
+	if (cb->part_start == NULL) {
+		/* Nothing is open yet: remember where the first child begins */
+		cb->part_start = st->start + b->start;
+		st->pos = cb->part_start;
+
+		return RSPAMD_MIME_PARSE_OK;
+	}
+
+	/*
+	 * We have seen the start of the boundary,
+	 * but it might be unsuitable (e.g. in broken headers)
+	 */
+	if (cb->part_start >= boundary_pos || cb->cur_boundary == NULL) {
+		/* We have an empty boundary, do nothing */
+		return RSPAMD_MIME_PARSE_OK;
+	}
+
+	rc = rspamd_mime_process_multipart_node(task, cb->st,
+		cb->multipart, cb->part_start, boundary_pos, TRUE, cb->err);
+
+	if (rc != RSPAMD_MIME_PARSE_OK) {
+		return rc;
+	}
+
+	if (b->start > 0) {
+		/* Go towards the next part */
+		cb->part_start = st->start + b->start;
+		cb->st->pos = cb->part_start;
+	}
+
+	return RSPAMD_MIME_PARSE_OK;
+}
+
+/*
+ * Walk the boundaries collected in the parser context and pass those that
+ * belong to `multipart` to rspamd_mime_parse_multipart_cb(), which splits the
+ * multipart body into child parts.
+ *
+ * If cb->cur_boundary is already set, boundaries are matched by comparing
+ * their hashes with cb->bhash; otherwise the first boundary found at or after
+ * the start of the multipart body is adopted as the boundary of this part.
+ *
+ * Returns RSPAMD_MIME_PARSE_OK or the first error reported by the callback.
+ */
+static enum rspamd_mime_parse_error
+rspamd_multipart_boundaries_filter(struct rspamd_task *task,
+ struct rspamd_mime_part *multipart,
+ struct rspamd_mime_parser_ctx *st,
+ struct rspamd_mime_multipart_cbdata *cb)
+{
+ struct rspamd_mime_boundary *cur;
+ goffset last_offset;
+ guint i, sel = 0;
+ enum rspamd_mime_parse_error ret;
+
+ /* Offset (relative to st->start) of the end of the multipart body */
+ last_offset = (multipart->raw_data.begin - st->start) +
+ multipart->raw_data.len;
+
+ /* Find the first offset suitable for this part */
+ for (i = 0; i < st->boundaries->len; i++) {
+ cur = &g_array_index(st->boundaries, struct rspamd_mime_boundary, i);
+
+ /* Only boundaries located inside or after the body are of interest */
+ if (cur->start >= multipart->raw_data.begin - st->start) {
+ if (cb->cur_boundary) {
+ /* Check boundary */
+ msg_debug_mime("compare %L and %L (and %L)",
+ cb->bhash, cur->hash, cur->closed_hash);
+
+ if (cb->bhash == cur->hash) {
+ sel = i;
+ break;
+ }
+ else if (cb->bhash == cur->closed_hash) {
+ /*
+ * Not a closing element in fact: our boundary matches the string
+ * together with its trailing dashes, so treat it as a normal one
+ */
+ cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
+ cur->hash = cur->closed_hash;
+ sel = i;
+ break;
+ }
+ }
+ else {
+ /* Set current boundary */
+ cb->cur_boundary = rspamd_mempool_alloc(task->task_pool,
+ sizeof(rspamd_ftok_t));
+ cb->cur_boundary->begin = st->start + cur->boundary;
+ cb->cur_boundary->len = 0;
+ cb->bhash = cur->hash;
+ sel = i;
+ break;
+ }
+ }
+ }
+
+ /* Now we can go forward with boundaries that are same to what we have */
+ for (i = sel; i < st->boundaries->len; i++) {
+ cur = &g_array_index(st->boundaries, struct rspamd_mime_boundary, i);
+
+ /* Stop at the first boundary that lies beyond the multipart body */
+ if (cur->boundary > last_offset) {
+ break;
+ }
+
+ if (cur->hash == cb->bhash || cur->closed_hash == cb->bhash) {
+ if ((ret = rspamd_mime_parse_multipart_cb(task, multipart, st,
+ cb, cur)) != RSPAMD_MIME_PARSE_OK) {
+ return ret;
+ }
+
+ if (cur->closed_hash == cb->bhash) {
+ /* We have again fake closed hash */
+ cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
+ cur->hash = cur->closed_hash;
+ }
+
+ if (RSPAMD_BOUNDARY_IS_CLOSED(cur)) {
+ /* We also might check the next boundary... */
+ if (i < st->boundaries->len - 1) {
+ cur = &g_array_index(st->boundaries,
+ struct rspamd_mime_boundary, i + 1);
+
+ /* Keep going if the same boundary is used again after the closing one */
+ if (cur->hash == cb->bhash) {
+ continue;
+ }
+ else if (cur->closed_hash == cb->bhash) {
+ /* We have again fake closed hash */
+ cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
+ cur->hash = cur->closed_hash;
+ continue;
+ }
+ }
+
+ break;
+ }
+ }
+ }
+
+ /*
+ * All boundaries have been consumed without leaving the loop early:
+ * finish the pending child using the end of the body as a fake boundary
+ */
+ if (i == st->boundaries->len && cb->cur_boundary) {
+ /* Process the last part */
+ struct rspamd_mime_boundary fb;
+
+ /* Only these two fields are read by the callback */
+ fb.boundary = last_offset;
+ fb.start = -1;
+
+ if ((ret = rspamd_mime_parse_multipart_cb(task, multipart, st,
+ cb, &fb)) != RSPAMD_MIME_PARSE_OK) {
+ return ret;
+ }
+ }
+
+ return RSPAMD_MIME_PARSE_OK;
+}
+
+/*
+ * Register a multipart part in the task and split its body into children.
+ *
+ * The boundary is taken from the part's content type when it is known;
+ * otherwise the boundary filter is left to guess it. On return the topmost
+ * entry of the parser stack is removed and the nesting counter is restored.
+ *
+ * Returns RSPAMD_MIME_PARSE_NESTING (and sets err) if the nesting limit is
+ * exceeded, otherwise the result of the boundary filter.
+ */
+static enum rspamd_mime_parse_error
+rspamd_mime_parse_multipart_part(struct rspamd_task *task,
+	struct rspamd_mime_part *part,
+	struct rspamd_mime_parser_ctx *st,
+	GError **err)
+{
+	struct rspamd_mime_multipart_cbdata mp_ctx;
+	enum rspamd_mime_parse_error rc;
+
+	if (st->nesting > max_nested) {
+		g_set_error(err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
+			st->nesting);
+		return RSPAMD_MIME_PARSE_NESTING;
+	}
+
+	/* The part number is its index in the list of all parts of the message */
+	part->part_number = MESSAGE_FIELD(task, parts)->len;
+	part->urls = g_ptr_array_new();
+	g_ptr_array_add(MESSAGE_FIELD(task, parts), part);
+	st->nesting++;
+	rspamd_mime_part_get_cte(task, part->raw_headers, part, FALSE);
+
+	st->pos = part->raw_data.begin;
+
+	mp_ctx.task = task;
+	mp_ctx.multipart = part;
+	mp_ctx.st = st;
+	mp_ctx.err = err;
+	mp_ctx.part_start = NULL;
+
+	/* Guess boundary unless the content type tells it */
+	mp_ctx.cur_boundary = NULL;
+	mp_ctx.bhash = 0;
+
+	if (part->ct->boundary.len > 0) {
+		/* We know our boundary */
+		mp_ctx.cur_boundary = &part->ct->boundary;
+		rspamd_cryptobox_siphash((guchar *) &mp_ctx.bhash,
+			mp_ctx.cur_boundary->begin, mp_ctx.cur_boundary->len,
+			lib_ctx->hkey);
+		msg_debug_mime("hash: %T -> %L", mp_ctx.cur_boundary, mp_ctx.bhash);
+	}
+
+	rc = rspamd_multipart_boundaries_filter(task, part, st, &mp_ctx);
+
+	/* Cleanup stack */
+	st->nesting--;
+	g_ptr_array_remove_index_fast(st->stack, st->stack->len - 1);
+
+	return rc;
+}
+
+/*
+ * Process boundary like structures in a message
+ *
+ * Multipattern callback: `text + match_pos` is treated as the first character
+ * of a boundary candidate that lasts until the end of the line. For each
+ * non-empty candidate a struct rspamd_mime_boundary is appended to
+ * st->boundaries (st is passed as `context`) with:
+ *  - boundary: offset of the candidate minus 2 relative to st->start
+ *    (NOTE(review): presumably the two bytes are the leading dashes matched
+ *    by the pattern - confirm against the mp_boundary patterns);
+ *  - start: offset of the data that follows the boundary line;
+ *  - hash: hash of the lowercased candidate without a trailing "--";
+ *  - closed_hash: hash of the lowercased candidate including the trailing
+ *    "--", or 0 if the candidate does not end with "--".
+ *
+ * Always returns 0.
+ */
+static gint
+rspamd_mime_preprocess_cb(struct rspamd_multipattern *mp,
+ guint strnum,
+ gint match_start,
+ gint match_pos,
+ const gchar *text,
+ gsize len,
+ void *context)
+{
+ const gchar *end = text + len, *p = text + match_pos, *bend;
+ gsize blen;
+ gboolean closing = FALSE;
+ struct rspamd_mime_boundary b;
+ struct rspamd_mime_parser_ctx *st = context;
+ struct rspamd_task *task;
+
+ task = st->task;
+
+ if (G_LIKELY(p < end)) {
+
+ blen = 0;
+
+ /* Measure the candidate: everything up to the end of the line */
+ while (p < end) {
+ if (*p == '\r' || *p == '\n') {
+ break;
+ }
+
+ blen++;
+ p++;
+ }
+
+ if (blen > 0) {
+ /* We have found something like boundary */
+ p = text + match_pos;
+ bend = p + blen - 1;
+
+ if (*bend == '-') {
+ /* We need to verify last -- */
+ if (bend > p + 1 && *(bend - 1) == '-') {
+ /* blen now excludes the trailing "--"; bend points at its first dash */
+ closing = TRUE;
+ bend--;
+ blen -= 2;
+ }
+ else {
+ /* Not a closing boundary somehow, e.g. if a boundary=='-' */
+ bend++;
+ }
+ }
+ else {
+ bend++;
+ }
+
+ /* Skip trailing spaces and a single line ending after the boundary */
+ while (bend < end) {
+ if (*bend == '\r') {
+ bend++;
+
+ /* \r\n */
+ if (bend < end && *bend == '\n') {
+ bend++;
+ }
+ }
+ else if (*bend == '\n') {
+ /* \n */
+ bend++;
+ }
+ else if (g_ascii_isspace(*bend)) {
+ /* Spaces in the same line, skip them */
+ bend++;
+ continue;
+ }
+
+ break;
+ }
+
+ b.boundary = p - st->start - 2;
+ b.start = bend - st->start;
+
+ /* Small optimisation as boundaries are usually short strings */
+ gchar *lc_copy, lc_copy_buf[128];
+
+ /* blen + 2 is the largest amount copied below (closing case) */
+ if (blen + 2 < sizeof(lc_copy_buf)) {
+ lc_copy = lc_copy_buf;
+ }
+ else {
+ lc_copy = g_malloc(blen + 2);
+ }
+
+ if (closing) {
+ /* Copy the trailing "--" as well, it is needed for closed_hash */
+ memcpy(lc_copy, p, blen + 2);
+ rspamd_str_lc(lc_copy, blen + 2);
+ }
+ else {
+ memcpy(lc_copy, p, blen);
+ rspamd_str_lc(lc_copy, blen);
+ }
+
+ rspamd_cryptobox_siphash((guchar *) &b.hash, lc_copy, blen,
+ lib_ctx->hkey);
+ msg_debug_mime("normal hash: %*s -> %L, %d boffset, %d data offset",
+ (gint) blen, lc_copy, b.hash, (int) b.boundary, (int) b.start);
+
+ if (closing) {
+ b.flags = RSPAMD_MIME_BOUNDARY_FLAG_CLOSED;
+ rspamd_cryptobox_siphash((guchar *) &b.closed_hash, lc_copy,
+ blen + 2,
+ lib_ctx->hkey);
+ msg_debug_mime("closing hash: %*s -> %L, %d boffset, %d data offset",
+ (gint) blen + 2, lc_copy,
+ b.closed_hash,
+ (int) b.boundary, (int) b.start);
+ }
+ else {
+ b.flags = 0;
+ b.closed_hash = 0;
+ }
+
+ /* Check if a string has been allocated on the heap */
+ if (blen + 2 >= sizeof(lc_copy_buf)) {
+ g_free(lc_copy);
+ }
+ g_array_append_val(st->boundaries, b);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Guess whether the input consists of headers only.
+ *
+ * At most the first 76 bytes are examined. The input is accepted if it
+ * starts with printable non-space characters followed by a colon and by at
+ * least one more character that begins a value (optionally preceded by
+ * spaces).
+ *
+ * @param input data to check
+ * @param body_start if not NULL, set to input->len when the input is accepted
+ * @return input->len if the input is accepted, -1 otherwise
+ */
+static goffset
+rspamd_mime_parser_headers_heuristic(GString *input, goffset *body_start)
+{
+	const gsize default_max_len = 76;
+	gsize max_len = MIN(input->len, default_max_len);
+	const gchar *cursor, *limit;
+	enum {
+		st_before_colon = 0,
+		st_colon,
+		st_spaces_after_colon,
+		st_value,
+		st_error
+	} state = st_before_colon;
+
+	limit = input->str + max_len;
+
+	/* Both st_value and st_error are final: no more input is consumed */
+	for (cursor = input->str;
+		 cursor < limit && state != st_value && state != st_error;
+		 cursor++) {
+		switch (state) {
+		case st_before_colon:
+			if (G_UNLIKELY(*cursor == ':')) {
+				state = st_colon;
+			}
+			else if (G_UNLIKELY(!g_ascii_isgraph(*cursor))) {
+				state = st_error;
+			}
+			break;
+		case st_colon:
+			state = g_ascii_isspace(*cursor) ? st_spaces_after_colon : st_value;
+			break;
+		case st_spaces_after_colon:
+			if (!g_ascii_isspace(*cursor)) {
+				state = st_value;
+			}
+			break;
+		case st_value:
+		case st_error:
+			/* Excluded by the loop condition */
+			break;
+		}
+	}
+
+	if (state != st_value) {
+		return (-1);
+	}
+
+	/* We accept any value */
+	if (body_start) {
+		*body_start = input->len;
+	}
+
+	return input->len;
+}
+
+/*
+ * Scan a message for boundary like strings and collect them in the parser
+ * context by means of rspamd_mime_preprocess_cb().
+ *
+ * If the body of `top` starts at or after the current parser position, the
+ * body is scanned; otherwise the scan covers everything from the current
+ * position up to the end of the parser's data.
+ */
+static void
+rspamd_mime_preprocess_message(struct rspamd_task *task,
+	struct rspamd_mime_part *top,
+	struct rspamd_mime_parser_ctx *st)
+{
+	const gchar *scan_begin;
+	gsize scan_len;
+
+	if (top->raw_data.begin >= st->pos) {
+		/*
+		 * The scan starts one byte before the body.
+		 * NOTE(review): presumably this lets a pattern that includes the
+		 * preceding newline match a boundary placed at the very start of the
+		 * body - confirm against the mp_boundary patterns.
+		 */
+		scan_begin = top->raw_data.begin - 1;
+		scan_len = top->raw_data.len + 1;
+	}
+	else {
+		scan_begin = st->pos;
+		scan_len = st->end - st->pos;
+	}
+
+	rspamd_multipattern_lookup(lib_ctx->mp_boundary,
+		scan_begin,
+		scan_len,
+		rspamd_mime_preprocess_cb, st, NULL);
+}
+
+/* Release a parser context together with its stack and boundaries arrays */
+static void
+rspamd_mime_parse_stack_free(struct rspamd_mime_parser_ctx *st)
+{
+	if (st == NULL) {
+		return;
+	}
+
+	g_ptr_array_free(st->stack, TRUE);
+	g_array_free(st->boundaries, TRUE);
+	g_free(st);
+}
+
+/*
+ * Parse a message: either the top-level message of the task (part == NULL)
+ * or a message embedded into `part`, in which case the decoded data of
+ * `part` is parsed using a separate, temporary parser context.
+ *
+ * The headers are separated from the body, the Content-Type header selects
+ * the body parser (multipart, nested message or normal part; text/plain is
+ * assumed when there is no usable Content-Type), and finally the data that
+ * follows not yet consumed, non-closing boundaries is parsed as extra nodes.
+ *
+ * @param task current task
+ * @param part enclosing message part, or NULL for the top-level message
+ * @param st parser context of the caller
+ * @param err error output
+ * @return RSPAMD_MIME_PARSE_OK on success, RSPAMD_MIME_PARSE_NESTING if the
+ *         nesting limit is exceeded, or the error of a nested parser
+ */
+static enum rspamd_mime_parse_error
+rspamd_mime_parse_message(struct rspamd_task *task,
+	struct rspamd_mime_part *part,
+	struct rspamd_mime_parser_ctx *st,
+	GError **err)
+{
+	struct rspamd_content_type *ct, *sel = NULL;
+	struct rspamd_mime_header *hdr = NULL, *cur;
+	const gchar *pbegin, *p;
+	gsize plen, len;
+	struct rspamd_mime_part *npart;
+	goffset hdr_pos, body_pos;
+	guint i;
+	enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
+	GString str;
+	struct rspamd_mime_parser_ctx *nst = st;
+
+	if (st->nesting > max_nested) {
+		g_set_error(err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
+			st->nesting);
+		return RSPAMD_MIME_PARSE_NESTING;
+	}
+
+	/* Allocate real part */
+	npart = rspamd_mempool_alloc0(task->task_pool,
+		sizeof(struct rspamd_mime_part));
+
+	if (part == NULL) {
+		/* Top level message */
+		p = task->msg.begin;
+		len = task->msg.len;
+
+		str.str = (gchar *) p;
+		str.len = len;
+
+		hdr_pos = rspamd_string_find_eoh(&str, &body_pos);
+
+		if (hdr_pos > 0 && hdr_pos < str.len) {
+
+			MESSAGE_FIELD(task, raw_headers_content).begin = str.str;
+			MESSAGE_FIELD(task, raw_headers_content).len = hdr_pos;
+			MESSAGE_FIELD(task, raw_headers_content).body_start = str.str + body_pos;
+
+			if (MESSAGE_FIELD(task, raw_headers_content).len > 0) {
+				rspamd_mime_headers_process(task,
+					MESSAGE_FIELD(task, raw_headers),
+					&MESSAGE_FIELD(task, headers_order),
+					MESSAGE_FIELD(task, raw_headers_content).begin,
+					MESSAGE_FIELD(task, raw_headers_content).len,
+					TRUE);
+				npart->raw_headers = rspamd_message_headers_ref(
+					MESSAGE_FIELD(task, raw_headers));
+
+				/* Preserve the natural order */
+				if (MESSAGE_FIELD(task, headers_order)) {
+					LL_REVERSE2(MESSAGE_FIELD(task, headers_order), ord_next);
+				}
+			}
+
+			hdr = rspamd_message_get_header_from_hash(
+				MESSAGE_FIELD(task, raw_headers),
+				"Content-Type", FALSE);
+		}
+		else {
+			/* First apply heuristic, maybe we have just headers */
+			hdr_pos = rspamd_mime_parser_headers_heuristic(&str, &body_pos);
+
+			if (hdr_pos > 0 && hdr_pos <= str.len) {
+				MESSAGE_FIELD(task, raw_headers_content).begin = str.str;
+				MESSAGE_FIELD(task, raw_headers_content).len = hdr_pos;
+				MESSAGE_FIELD(task, raw_headers_content).body_start = str.str +
+					body_pos;
+
+				if (MESSAGE_FIELD(task, raw_headers_content).len > 0) {
+					rspamd_mime_headers_process(task,
+						MESSAGE_FIELD(task, raw_headers),
+						&MESSAGE_FIELD(task, headers_order),
+						MESSAGE_FIELD(task, raw_headers_content).begin,
+						MESSAGE_FIELD(task, raw_headers_content).len,
+						TRUE);
+					npart->raw_headers = rspamd_message_headers_ref(
+						MESSAGE_FIELD(task, raw_headers));
+
+					/* Preserve the natural order */
+					if (MESSAGE_FIELD(task, headers_order)) {
+						LL_REVERSE2(MESSAGE_FIELD(task, headers_order), ord_next);
+					}
+				}
+
+				hdr = rspamd_message_get_header_from_hash(
+					MESSAGE_FIELD(task, raw_headers),
+					"Content-Type", FALSE);
+				task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
+			}
+			else {
+				/* No headers at all: the whole message is the body */
+				body_pos = 0;
+			}
+		}
+
+		pbegin = st->start + body_pos;
+		plen = st->end - pbegin;
+		npart->headers_order = NULL;
+	}
+	else {
+		/*
+		 * Here are dragons:
+		 * We allocate new parser context as we need to shift pointers
+		 */
+		nst = g_malloc0(sizeof(*st));
+		nst->stack = g_ptr_array_sized_new(4);
+		nst->boundaries = g_array_sized_new(FALSE, FALSE,
+			sizeof(struct rspamd_mime_boundary), 8);
+		nst->start = part->parsed_data.begin;
+		nst->end = nst->start + part->parsed_data.len;
+		nst->pos = nst->start;
+		nst->task = st->task;
+		nst->nesting = st->nesting;
+		st->nesting++;
+
+		str.str = (gchar *) part->parsed_data.begin;
+		str.len = part->parsed_data.len;
+
+		hdr_pos = rspamd_string_find_eoh(&str, &body_pos);
+		npart->raw_headers = rspamd_message_headers_new();
+		npart->headers_order = NULL;
+
+		if (hdr_pos > 0 && hdr_pos < str.len) {
+			npart->raw_headers_str = str.str;
+			npart->raw_headers_len = hdr_pos;
+			npart->raw_data.begin = str.str + body_pos;
+
+			if (npart->raw_headers_len > 0) {
+				rspamd_mime_headers_process(task,
+					npart->raw_headers,
+					&npart->headers_order,
+					npart->raw_headers_str,
+					npart->raw_headers_len,
+					FALSE);
+
+				/* Preserve the natural order */
+				if (npart->headers_order) {
+					LL_REVERSE2(npart->headers_order, ord_next);
+				}
+			}
+
+			hdr = rspamd_message_get_header_from_hash(npart->raw_headers,
+				"Content-Type", FALSE);
+		}
+		else {
+			body_pos = 0;
+		}
+
+		pbegin = part->parsed_data.begin + body_pos;
+		plen = part->parsed_data.len - body_pos;
+	}
+
+	npart->raw_data.begin = pbegin;
+	npart->raw_data.len = plen;
+	npart->parent_part = part;
+
+	if (hdr == NULL) {
+		sel = NULL;
+	}
+	else {
+		DL_FOREACH(hdr, cur)
+		{
+			ct = rspamd_content_type_parse(cur->value, strlen(cur->value),
+				task->task_pool);
+
+			/* Here we prefer multipart content-type or any content-type */
+			if (ct) {
+				if (sel == NULL) {
+					sel = ct;
+				}
+				else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
+					sel = ct;
+				}
+			}
+		}
+	}
+
+	if (sel == NULL) {
+		/* For messages we automatically assume plaintext */
+		msg_info_task("cannot find content-type for a message, assume text/plain");
+		sel = rspamd_mempool_alloc0(task->task_pool, sizeof(*sel));
+		sel->flags = RSPAMD_CONTENT_TYPE_TEXT | RSPAMD_CONTENT_TYPE_MISSING;
+		RSPAMD_FTOK_ASSIGN(&sel->type, "text");
+		RSPAMD_FTOK_ASSIGN(&sel->subtype, "plain");
+	}
+
+	npart->ct = sel;
+
+	if ((part == NULL || nst != st) &&
+		(sel->flags & (RSPAMD_CONTENT_TYPE_MULTIPART | RSPAMD_CONTENT_TYPE_MESSAGE))) {
+		/* Not a trivial message, need to preprocess */
+		rspamd_mime_preprocess_message(task, npart, nst);
+	}
+
+	if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
+		g_ptr_array_add(nst->stack, npart);
+		nst->nesting++;
+		npart->part_type = RSPAMD_MIME_PART_MULTIPART;
+		npart->specific.mp = rspamd_mempool_alloc0(task->task_pool,
+			sizeof(struct rspamd_mime_multipart));
+		memcpy(&npart->specific.mp->boundary, &sel->orig_boundary,
+			sizeof(rspamd_ftok_t));
+		ret = rspamd_mime_parse_multipart_part(task, npart, nst, err);
+	}
+	else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
+		if ((ret = rspamd_mime_parse_normal_part(task, npart, nst, sel, err)) == RSPAMD_MIME_PARSE_OK) {
+			npart->part_type = RSPAMD_MIME_PART_MESSAGE;
+			ret = rspamd_mime_parse_message(task, npart, nst, err);
+		}
+	}
+	else {
+		ret = rspamd_mime_parse_normal_part(task, npart, nst, sel, err);
+	}
+
+	if (ret != RSPAMD_MIME_PARSE_OK) {
+		/*
+		 * The temporary context is owned by this call and is not stored
+		 * anywhere else, so it has to be released on the error path as well
+		 */
+		if (nst != st) {
+			rspamd_mime_parse_stack_free(nst);
+		}
+
+		return ret;
+	}
+
+	if (part && st->stack->len > 0) {
+		/* Remove message part from the parent stack */
+		g_ptr_array_remove_index_fast(st->stack, st->stack->len - 1);
+		st->nesting--;
+	}
+
+	/* Process leftovers for boundaries */
+	if (nst->boundaries) {
+		struct rspamd_mime_boundary *boundary, *start_boundary = NULL,
+									*end_boundary = NULL;
+		/*
+		 * NOTE(review): the offsets of nst->boundaries are relative to
+		 * nst->start while end_offset is computed from st; both are the same
+		 * object for the top-level message - confirm that this is intended
+		 * for nested messages.
+		 */
+		goffset cur_offset = nst->pos - nst->start,
+				end_offset = st->end - st->start;
+		guint sel_idx = 0;
+
+		for (;;) {
+			start_boundary = NULL;
+
+			/* Find the next non-closing boundary after the parsed data */
+			for (i = sel_idx; i < nst->boundaries->len; i++) {
+				boundary = &g_array_index(nst->boundaries,
+					struct rspamd_mime_boundary, i);
+
+				if (boundary->start > cur_offset &&
+					boundary->boundary < end_offset &&
+					!RSPAMD_BOUNDARY_IS_CLOSED(boundary)) {
+					start_boundary = boundary;
+					sel_idx = i;
+					break;
+				}
+			}
+
+			if (start_boundary) {
+				const gchar *start, *end;
+
+				/* The chunk lasts until the next boundary or the end of data */
+				if (nst->boundaries->len > sel_idx + 1) {
+					end_boundary = &g_array_index(nst->boundaries,
+						struct rspamd_mime_boundary, sel_idx + 1);
+					end = nst->start + end_boundary->boundary;
+				}
+				else {
+					end = nst->end;
+				}
+
+				sel_idx++;
+
+				start = nst->start + start_boundary->start;
+
+				if (end > start &&
+					(ret = rspamd_mime_process_multipart_node(task, nst,
+						 NULL, start, end, FALSE, err)) != RSPAMD_MIME_PARSE_OK) {
+
+					if (nst != st) {
+						rspamd_mime_parse_stack_free(nst);
+					}
+
+					/* Garbage after a boundary is not an error */
+					if (ret == RSPAMD_MIME_PARSE_NO_PART) {
+						return RSPAMD_MIME_PARSE_OK;
+					}
+
+					return ret;
+				}
+			}
+			else {
+				break;
+			}
+		}
+	}
+
+	if (nst != st) {
+		rspamd_mime_parse_stack_free(nst);
+	}
+
+	return ret;
+}
+
+/*
+ * Parse the message of a task into MIME parts.
+ *
+ * The parser library context is created on the first call, and the key used
+ * for hashing boundaries is regenerated once it has been used more than
+ * max_key_usages times. A temporary parser context covering the whole message
+ * is created for the call and released before returning.
+ *
+ * @param task task holding the message in task->msg
+ * @param err error output
+ * @return result of parsing the top-level message
+ */
+enum rspamd_mime_parse_error
+rspamd_mime_parse_task(struct rspamd_task *task, GError **err)
+{
+	struct rspamd_mime_parser_ctx *ctx;
+	enum rspamd_mime_parse_error rc = RSPAMD_MIME_PARSE_OK;
+
+	if (lib_ctx == NULL) {
+		rspamd_mime_parser_init_lib();
+	}
+
+	lib_ctx->key_usages++;
+
+	if (lib_ctx->key_usages > max_key_usages) {
+		/* Regenerate siphash key */
+		ottery_rand_bytes(lib_ctx->hkey, sizeof(lib_ctx->hkey));
+		lib_ctx->key_usages = 0;
+	}
+
+	ctx = g_malloc0(sizeof(*ctx));
+	ctx->stack = g_ptr_array_sized_new(4);
+	ctx->boundaries = g_array_sized_new(FALSE, FALSE,
+		sizeof(struct rspamd_mime_boundary), 8);
+	ctx->task = task;
+	ctx->start = task->msg.begin;
+	ctx->end = task->msg.begin + task->msg.len;
+
+	/* Start from the body if its position is already known */
+	ctx->pos = MESSAGE_FIELD(task, raw_headers_content).body_start;
+
+	if (ctx->pos == NULL) {
+		ctx->pos = task->msg.begin;
+	}
+
+	rc = rspamd_mime_parse_message(task, NULL, ctx, err);
+	rspamd_mime_parse_stack_free(ctx);
+
+	return rc;
+}
diff --git a/src/libmime/mime_parser.h b/src/libmime/mime_parser.h
new file mode 100644
index 0000000..aa77b2b
--- /dev/null
+++ b/src/libmime/mime_parser.h
@@ -0,0 +1,46 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_LIBMIME_MIME_PARSER_H_
+#define SRC_LIBMIME_MIME_PARSER_H_
+
+#include "config.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct rspamd_task;
+struct rspamd_mime_part;
+
+/* Result codes of the MIME parser */
+enum rspamd_mime_parse_error {
+ RSPAMD_MIME_PARSE_OK = 0, /* parsing has succeeded */
+ RSPAMD_MIME_PARSE_FATAL, /* parsing has failed */
+ RSPAMD_MIME_PARSE_NESTING, /* the nesting limit has been exceeded */
+ RSPAMD_MIME_PARSE_NO_PART, /* a chunk of data has been ignored as it contains no part */
+};
+
+/**
+ * Parse the message of a task into MIME parts
+ * @param task task that holds the message to parse
+ * @param err error output, set e.g. when the nesting level is too high
+ * @return RSPAMD_MIME_PARSE_OK on success or another parse error code
+ */
+enum rspamd_mime_parse_error rspamd_mime_parse_task(struct rspamd_task *task,
+ GError **err);
+
+/**
+ * Calculate digest for a part
+ * NOTE(review): presumably the digest is computed over the decoded content
+ * and stored in the part itself - confirm against the definition
+ * @param part part to process
+ */
+void rspamd_mime_parser_calc_digest(struct rspamd_mime_part *part);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SRC_LIBMIME_MIME_PARSER_H_ */
diff --git a/src/libmime/mime_string.cxx b/src/libmime/mime_string.cxx
new file mode 100644
index 0000000..e818e64
--- /dev/null
+++ b/src/libmime/mime_string.cxx
@@ -0,0 +1,167 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+#include "mime_string.hxx"
+#include "unicode/uchar.h"
+
+// Unit tests for rspamd::mime::mime_string: construction with and without
+// a filter, conditional assignment and iteration
+TEST_SUITE("mime_string")
+{
+ using namespace rspamd::mime;
+ // Construction without a filter: zero bytes are dropped and invalid
+ // bytes are replaced with U+FFFD, both are reported by the flags
+ TEST_CASE("mime_string unfiltered ctors")
+ {
+ SUBCASE("empty")
+ {
+ mime_string st;
+ CHECK(st.size() == 0);
+ CHECK(st == "");
+ }
+ SUBCASE("unfiltered valid")
+ {
+ mime_string st{std::string_view("abcd")};
+ CHECK(st == "abcd");
+ }
+ SUBCASE("unfiltered zero character")
+ {
+ mime_string st{"abc\0d", 5};
+ CHECK(st.has_zeroes());
+ CHECK(st == "abcd");
+ }
+ SUBCASE("unfiltered invalid character - middle")
+ {
+ mime_string st{std::string("abc\234d")};
+ CHECK(st.has_invalid());
+ CHECK(st == "abc\uFFFDd");
+ }
+ SUBCASE("unfiltered invalid character - end")
+ {
+ mime_string st{std::string("abc\234")};
+ CHECK(st.has_invalid());
+ CHECK(st == "abc\uFFFD");
+ }
+ SUBCASE("unfiltered invalid character - start")
+ {
+ mime_string st{std::string("\234abc")};
+ CHECK(st.has_invalid());
+ CHECK(st == "\uFFFDabc");
+ }
+ }
+
+ // Construction with a per code point filter
+ TEST_CASE("mime_string filtered ctors")
+ {
+ // Maps non-printable code points to 0; the expectation below shows
+ // that such code points are removed from the result
+ auto print_filter = [](UChar32 inp) -> UChar32 {
+ if (!u_isprint(inp)) {
+ return 0;
+ }
+
+ return inp;
+ };
+
+ // Converts every code point to lower case
+ auto tolower_filter = [](UChar32 inp) -> UChar32 {
+ return u_tolower(inp);
+ };
+
+ SUBCASE("empty")
+ {
+ mime_string st{std::string_view(""), tolower_filter};
+ CHECK(st.size() == 0);
+ CHECK(st == "");
+ }
+ SUBCASE("filtered valid")
+ {
+ mime_string st{std::string("AbCdУ"), tolower_filter};
+ CHECK(st == "abcdу");
+ }
+ SUBCASE("filtered invalid + filtered")
+ {
+ mime_string st{std::string("abcd\234\1"), print_filter};
+ CHECK(st == "abcd\uFFFD");
+ }
+ }
+ // assign_if_valid() must leave the string untouched on invalid input
+ TEST_CASE("mime_string assign")
+ {
+ SUBCASE("assign from valid")
+ {
+ mime_string st;
+
+ CHECK(st.assign_if_valid(std::string("test")));
+ CHECK(st == "test");
+ }
+ SUBCASE("assign from invalid")
+ {
+ mime_string st;
+
+ CHECK(!st.assign_if_valid(std::string("test\234t")));
+ CHECK(st == "");
+ }
+ }
+
+ // Range-for iteration yields code points, raw iteration yields bytes
+ TEST_CASE("mime_string iterators")
+ {
+
+ SUBCASE("unfiltered iterator ascii")
+ {
+ auto in = std::string("abcd");
+ mime_string st{in};
+ CHECK(st == "abcd");
+
+ int i = 0;
+ for (auto &&c: st) {
+ CHECK(c == in[i++]);
+ }
+ }
+
+ SUBCASE("unfiltered iterator utf8")
+ {
+ auto in = std::string("тест");
+ // Expected code points of the string above
+ UChar32 ucs[4] = {1090, 1077, 1089, 1090};
+ mime_string st{in};
+ CHECK(st == "тест");
+
+ int i = 0;
+ for (auto &&c: st) {
+ CHECK(c == ucs[i++]);
+ }
+ CHECK(i == sizeof(ucs) / sizeof(ucs[0]));
+ }
+
+ SUBCASE("unfiltered raw iterator ascii")
+ {
+ auto in = std::string("abcd");
+ mime_string st{in};
+ CHECK(st == "abcd");
+
+ int i = 0;
+ for (auto it = st.raw_begin(); it != st.raw_end(); ++it) {
+ CHECK(*it == in[i++]);
+ }
+ }
+
+ SUBCASE("unfiltered raw iterator utf8")
+ {
+ auto in = std::string("тест");
+ mime_string st{in};
+ CHECK(st == "тест");
+
+ int i = 0;
+ for (auto it = st.raw_begin(); it != st.raw_end(); ++it) {
+ CHECK(*it == in[i++]);
+ }
+ // The raw iterator must visit every byte of the input
+ CHECK(i == in.size());
+ }
+ }
+} \ No newline at end of file
diff --git a/src/libmime/mime_string.hxx b/src/libmime/mime_string.hxx
new file mode 100644
index 0000000..7476816
--- /dev/null
+++ b/src/libmime/mime_string.hxx
@@ -0,0 +1,670 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RSPAMD_MIME_STRING_HXX
+#define RSPAMD_MIME_STRING_HXX
+#pragma once
+
+#include <algorithm>
+#include <string>
+#include <string_view>
+#include <memory>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iosfwd>
+#include "libutil/mem_pool.h"
+#include "function2/function2.hpp"
+#include "unicode/utf8.h"
+#include "contrib/fastutf8/fastutf8.h"
+
+namespace rspamd::mime {
+/*
+ * The motivation for another string type is to always hold valid UTF-8: every
+ * invalid sequence is replaced with the U+FFFD replacement character, and \0
+ * and other unwanted code points are filtered out as defined by policies.
+ * This string always excludes \0 characters and ignores them! This is how MUAs
+ * act, and we also store flags telling that bad characters have been seen.
+ * Mime string iterators are always const, so the underlying storage should not
+ * be modified externally.
+ */
+template<class T = char, class Allocator = std::allocator<T>,
+ class Functor = fu2::function_view<UChar32(UChar32)>>
+class basic_mime_string;
+
+using mime_string = basic_mime_string<char>;
+using mime_pool_string = basic_mime_string<char, mempool_allocator<char>>;
+
+/* Helpers for type safe flags */
+enum class mime_string_flags : std::uint8_t {
+ MIME_STRING_DEFAULT = 0,
+ MIME_STRING_SEEN_ZEROES = 0x1 << 0,
+ MIME_STRING_SEEN_INVALID = 0x1 << 1,
+};
+
+constexpr mime_string_flags operator|(mime_string_flags lhs, mime_string_flags rhs)
+{
+	using raw_t = std::underlying_type_t<mime_string_flags>; /* integer type behind the scoped enum */
+	return static_cast<mime_string_flags>(static_cast<raw_t>(lhs) | static_cast<raw_t>(rhs)); /* union of both bit sets */
+}
+
+constexpr mime_string_flags operator&(mime_string_flags lhs, mime_string_flags rhs)
+{
+	using raw_t = std::underlying_type_t<mime_string_flags>; /* integer type behind the scoped enum */
+	return static_cast<mime_string_flags>(static_cast<raw_t>(lhs) & static_cast<raw_t>(rhs)); /* bits set in both operands */
+}
+
+constexpr bool operator!(mime_string_flags fl)
+{
+	return !(fl != mime_string_flags::MIME_STRING_DEFAULT); /* true only when no flag bit is set */
+}
+
+// Codepoint iterator base class: `idx` is a byte offset into the container's UTF-8 data
+template<typename Container, bool Raw = false>
+struct iterator_base {
+	template<typename, typename, typename>
+	friend class basic_mime_string;
+
+public:
+	using value_type = typename Container::value_type;
+	using difference_type = typename Container::difference_type;
+	using codepoint_type = typename Container::codepoint_type;
+	using reference_type = codepoint_type; // dereferencing yields a decoded code point by value
+	using iterator_category = std::bidirectional_iterator_tag;
+
+	bool operator==(const iterator_base &it) const noexcept
+	{
+		return idx == it.idx; // only the offset is compared, not the container
+	}
+
+	bool operator!=(const iterator_base &it) const noexcept
+	{
+		return idx != it.idx;
+	}
+
+	iterator_base(difference_type index, Container *instance) noexcept
+		: idx(index), cont_instance(instance)
+	{
+	}
+	iterator_base() noexcept = default;
+	iterator_base(const iterator_base &) noexcept = default;
+
+	iterator_base &operator=(const iterator_base &) noexcept = default;
+
+	Container *get_instance() const noexcept
+	{
+		return cont_instance;
+	}
+
+	codepoint_type get_value() const noexcept
+	{
+		auto i = idx; // decode from a copy so that reading does not move the iterator
+		codepoint_type uc;
+		U8_NEXT_UNSAFE(cont_instance->data(), i, uc);
+		return uc;
+	}
+
+protected:
+	difference_type idx = 0; // zero-initialised: a default-constructed iterator must be safe to compare
+	Container *cont_instance = nullptr;
+
+protected:
+	void advance(difference_type n) noexcept
+	{
+		if (n > 0) {
+			U8_FWD_N_UNSAFE(cont_instance->data(), idx, n);
+		}
+		else if (n < 0) {
+			U8_BACK_N_UNSAFE(cont_instance->data(), idx, (-n));
+		}
+	}
+	void increment() noexcept
+	{
+		codepoint_type uc; // decoded value is discarded, only idx moves
+		U8_NEXT_UNSAFE(cont_instance->data(), idx, uc);
+	}
+
+	void decrement() noexcept
+	{
+		codepoint_type uc; // decoded value is discarded, only idx moves
+		U8_PREV_UNSAFE(cont_instance->data(), idx, uc);
+	}
+};
+
+// Partial spec for raw Byte-based iterator base: `idx` is a plain byte index into the storage
+template<typename Container>
+struct iterator_base<Container, true> {
+	template<typename, typename, typename>
+	friend class basic_mime_string;
+
+public:
+	using value_type = typename Container::value_type;
+	using difference_type = typename Container::difference_type;
+	using reference_type = value_type; // dereferencing yields a raw storage unit by value
+	using iterator_category = std::bidirectional_iterator_tag;
+
+	bool operator==(const iterator_base &it) const noexcept
+	{
+		return idx == it.idx; // only the offset is compared, not the container
+	}
+	bool operator!=(const iterator_base &it) const noexcept
+	{
+		return idx != it.idx;
+	}
+
+	iterator_base(difference_type index, Container *instance) noexcept
+		: idx(index), cont_instance(instance)
+	{
+	}
+
+	iterator_base() noexcept = default;
+	iterator_base(const iterator_base &) noexcept = default;
+	iterator_base &operator=(const iterator_base &) noexcept = default;
+	Container *get_instance() const noexcept
+	{
+		return cont_instance;
+	}
+
+	value_type get_value() const noexcept
+	{
+		return cont_instance->get_storage().at(idx); // bounds-checked; a throw here terminates due to noexcept
+	}
+
+protected:
+	difference_type idx = 0; // zero-initialised: a default-constructed iterator must be safe to compare
+	Container *cont_instance = nullptr;
+
+protected:
+	//! Advance the iterator n times (negative values allowed!)
+	void advance(difference_type n) noexcept
+	{
+		idx += n;
+	}
+
+	void increment() noexcept
+	{
+		idx++;
+	}
+	void decrement() noexcept
+	{
+		idx--;
+	}
+};
+
+template<typename Container, bool Raw>
+struct iterator;
+template<typename Container, bool Raw>
+struct const_iterator;
+
+template<typename Container, bool Raw = false>
+struct iterator : iterator_base<Container, Raw> {
+	iterator(typename iterator_base<Container, Raw>::difference_type index, Container *instance) noexcept
+		: iterator_base<Container, Raw>(index, instance)
+	{
+	}
+	iterator() noexcept = default;
+	iterator(const iterator &) noexcept = default;
+
+	iterator &operator=(const iterator &) noexcept = default;
+	/* A const_iterator must never convert into a mutable iterator */
+	iterator(const const_iterator<Container, Raw> &) = delete;
+
+	/* Pre-increment: step forward, yield the updated iterator */
+	iterator &operator++() noexcept
+	{
+		this->increment();
+		return *this;
+	}
+
+	/* Post-increment: step forward, yield the position held before */
+	iterator operator++(int) noexcept
+	{
+		iterator before{*this};
+		this->increment();
+		return before;
+	}
+
+	/* Pre-decrement: step back, yield the updated iterator */
+	iterator &operator--() noexcept
+	{
+		this->decrement();
+		return *this;
+	}
+
+	/* Post-decrement: step back, yield the position held before */
+	iterator operator--(int) noexcept
+	{
+		iterator before{*this};
+		this->decrement();
+		return before;
+	}
+
+	iterator operator+(typename iterator_base<Container, Raw>::difference_type n) const noexcept
+	{
+		iterator shifted{*this};
+		shifted.advance(n);
+		return shifted;
+	}
+
+	iterator &operator+=(typename iterator_base<Container, Raw>::difference_type n) noexcept
+	{
+		this->advance(n);
+		return *this;
+	}
+
+	iterator operator-(typename iterator_base<Container, Raw>::difference_type n) const noexcept
+	{
+		/* Moving back by n is the same as moving
+		 * a copy forward by -n */
+		return *this + (-n);
+	}
+
+	iterator &operator-=(typename iterator_base<Container, Raw>::difference_type n) noexcept
+	{
+		/* In-place counterpart of operator-: advance by -n */
+		return *this += -n;
+	}
+
+	typename iterator::reference_type operator*() const noexcept
+	{
+		return this->get_value();
+	}
+};
+
+template<class CharT, class Allocator, class Functor>
+class basic_mime_string : private Allocator {
+public:
+ using storage_type = std::basic_string<CharT, std::char_traits<CharT>, Allocator>;
+ using view_type = std::basic_string_view<CharT, std::char_traits<CharT>>;
+ using filter_type = Functor;
+ using codepoint_type = UChar32;
+ using value_type = CharT;
+ using difference_type = std::ptrdiff_t;
+ using iterator = rspamd::mime::iterator<basic_mime_string, false>;
+ using raw_iterator = rspamd::mime::iterator<basic_mime_string, true>;
+ /* Ctors */
+ basic_mime_string() noexcept
+ : Allocator()
+ {
+ }
+ explicit basic_mime_string(const Allocator &alloc) noexcept
+ : Allocator(alloc)
+ {
+ }
+ explicit basic_mime_string(filter_type &&filt, const Allocator &alloc = Allocator()) noexcept
+ : Allocator(alloc), filter_func(std::move(filt))
+ {
+ }
+
+ basic_mime_string(const CharT *str, std::size_t sz, const Allocator &alloc = Allocator()) noexcept
+ : Allocator(alloc)
+ {
+ append_c_string_unfiltered(str, sz);
+ }
+
+ basic_mime_string(const storage_type &st,
+ const Allocator &alloc = Allocator()) noexcept
+ : basic_mime_string(st.data(), st.size(), alloc)
+ {
+ }
+
+ basic_mime_string(const view_type &st,
+ const Allocator &alloc = Allocator()) noexcept
+ : basic_mime_string(st.data(), st.size(), alloc)
+ {
+ }
+ /* Explicit move ctor */
+ basic_mime_string(basic_mime_string &&other) noexcept
+ {
+ *this = std::move(other);
+ }
+
+
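+ /**
+ * Creates a string with a filter function. It is the caller's responsibility to
+ * ensure that the filter functor survives long enough to work with a string
+ * @param str input bytes; invalid UTF-8 sequences are replaced with U+FFFD
+ * @param sz number of bytes in `str`
+ * @param filt filter applied to each decoded code point; a result of 0 drops it
+ * @param alloc allocator instance
+ */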
+ basic_mime_string(const CharT *str, std::size_t sz,
+ filter_type &&filt,
+ const Allocator &alloc = Allocator()) noexcept
+ : Allocator(alloc),
+ filter_func(std::move(filt))
+ {
+ append_c_string_filtered(str, sz);
+ }
+
+ basic_mime_string(const storage_type &st,
+ filter_type &&filt,
+ const Allocator &alloc = Allocator()) noexcept
+ : basic_mime_string(st.data(), st.size(), std::move(filt), alloc)
+ {
+ }
+ basic_mime_string(const view_type &st,
+ filter_type &&filt,
+ const Allocator &alloc = Allocator()) noexcept
+ : basic_mime_string(st.data(), st.size(), std::move(filt), alloc)
+ {
+ }
+
+	/* Move assignment (explicit: some libc++ implementations still perform copy); takes storage, filter and flags */
+	basic_mime_string &operator=(basic_mime_string &&other)
+	{
+		storage = std::move(other.storage);
+		filter_func = std::move(other.filter_func);
+		flags = other.flags; /* keep has_zeroes()/has_invalid() truthful for the moved-in content */
+		return *this;
+	}
+
+ constexpr auto size() const noexcept -> std::size_t
+ {
+ return storage.size();
+ }
+
+ constexpr auto data() const noexcept -> const CharT *
+ {
+ return storage.data();
+ }
+
+ constexpr auto has_zeroes() const noexcept -> bool
+ {
+ return !!(flags & mime_string_flags::MIME_STRING_SEEN_ZEROES);
+ }
+
+ constexpr auto has_invalid() const noexcept -> bool
+ {
+ return !!(flags & mime_string_flags::MIME_STRING_SEEN_INVALID);
+ }
+
+ /**
+ * Assign mime string from another string using move operation if a source string
+ * is utf8 valid.
+ * If this function returns false, then ownership has not been transferred
+ * and the `other` string is unmodified as well as the storage
+ * @param other
+ * @return
+ */
+ [[nodiscard]] auto assign_if_valid(storage_type &&other) -> bool
+ {
+ if (filter_func) {
+ /* No way */
+ return false;
+ }
+ if (rspamd_fast_utf8_validate((const unsigned char *) other.data(), other.size()) == 0) {
+ std::swap(storage, other);
+
+ return true;
+ }
+
+ return false;
+ }
+
+	/**
+	 * Replaces the stored value with a copy of `other`; the copy goes through
+	 * the filter when one is installed
+	 * @param other bytes to copy
+	 * @return nothing
+	 */
+	auto assign_copy(const view_type &other)
+	{
+		/* Drop the old value first, before `other` is read */
+		storage.clear();
+
+		/* append() selects the filtered or the unfiltered copy routine
+		 * depending on whether a filter is installed; the byte count it
+		 * returns is not needed here */
+		append(other.data(), other.size());
+	}
+	auto assign_copy(const storage_type &other)
+	{
+		/* Replaces the stored value with a copy of `other`: the old
+		 * value is dropped first, before `other` is read */
+		storage.clear();
+
+		/* append() selects the filtered or the unfiltered copy routine
+		 * depending on whether a filter is installed; the byte count it
+		 * returns is not needed here */
+		append(other.data(), other.size());
+	}
+	auto assign_copy(const basic_mime_string &other)
+	{
+		/* Replaces the stored value with a copy of `other`: the old
+		 * value is dropped first, before `other` is read */
+		storage.clear();
+
+		/* append() selects the filtered or the unfiltered copy routine
+		 * depending on whether a filter is installed; the byte count it
+		 * returns is not needed here */
+		append(other.data(), other.size());
+	}
+
+ /* Mutators */
+ auto append(const CharT *str, std::size_t size) -> std::size_t
+ {
+ if (filter_func) {
+ return append_c_string_filtered(str, size);
+ }
+ else {
+ return append_c_string_unfiltered(str, size);
+ }
+ }
+ auto append(const storage_type &other) -> std::size_t
+ {
+ return append(other.data(), other.size());
+ }
+ auto append(const view_type &other) -> std::size_t
+ {
+ return append(other.data(), other.size());
+ }
+
+ auto ltrim(const view_type &what) -> void
+ {
+ auto it = std::find_if(storage.begin(), storage.end(),
+ [&what](CharT c) {
+ return !std::any_of(what.begin(), what.end(), [&c](CharT sc) { return sc == c; });
+ });
+ storage.erase(storage.begin(), it);
+ }
+
+ auto rtrim(const view_type &what) -> void
+ {
+ auto it = std::find_if(storage.rbegin(), storage.rend(),
+ [&what](CharT c) {
+ return !std::any_of(what.begin(), what.end(), [&c](CharT sc) { return sc == c; });
+ });
+ storage.erase(it.base(), storage.end());
+ }
+
+ auto trim(const view_type &what) -> void
+ {
+ ltrim(what);
+ rtrim(what);
+ }
+
+ /* Comparison */
+	auto operator==(const basic_mime_string &other) const -> bool
+	{
+		return other.storage == storage; /* byte-wise equality; flags and filter are not compared */
+	}
+	auto operator==(const storage_type &other) const -> bool
+	{
+		return other == storage;
+	}
+	auto operator==(const view_type &other) const -> bool
+	{
+		return other == storage;
+	}
+	auto operator==(const CharT *other) const -> bool
+	{
+		if (other == NULL) { /* a null pointer never equals a string, not even an empty one */
+			return false;
+		}
+		auto olen = strlen(other); /* `other` must be NUL-terminated */
+		if (storage.size() == olen) {
+			return memcmp(storage.data(), other, olen) == 0;
+		}
+
+		return false;
+	}
+
+ /* Iterators */
+ inline auto begin() noexcept -> iterator
+ {
+ return {0, this};
+ }
+
+ inline auto raw_begin() noexcept -> raw_iterator
+ {
+ return {0, this};
+ }
+
+ inline auto end() noexcept -> iterator
+ {
+ return {(difference_type) size(), this};
+ }
+
+ inline auto raw_end() noexcept -> raw_iterator
+ {
+ return {(difference_type) size(), this};
+ }
+
+ /* Utility */
+ inline auto get_storage() const noexcept -> const storage_type &
+ {
+ return storage;
+ }
+
+ inline auto as_view() const noexcept -> view_type
+ {
+ return view_type{storage};
+ }
+
+ constexpr CharT operator[](std::size_t pos) const noexcept
+ {
+ return storage[pos];
+ }
+ constexpr CharT at(std::size_t pos) const
+ {
+ return storage.at(pos);
+ }
+ constexpr bool empty() const noexcept
+ {
+ return storage.empty();
+ }
+
+
+	/* For doctest stringify: writes the stored bytes verbatim to `os` and returns `os` */
+	friend std::ostream &operator<<(std::ostream &os, const basic_mime_string &value)
+	{
+		os << value.storage;
+		return os;
+	}
+
+private:
+ mime_string_flags flags = mime_string_flags::MIME_STRING_DEFAULT;
+ storage_type storage;
+ filter_type filter_func;
+
+	auto append_c_string_unfiltered(const CharT *str, std::size_t len) -> std::size_t
+	{
+		/* Fast path: bulk-append valid UTF-8 runs, patch invalid ones; returns the number of bytes added */
+		const auto *p = str;
+		const auto *end = str + len;
+		std::int32_t err_offset;// We have to use int32_t here as old libicu is brain-damaged
+		auto orig_size = storage.size();
+
+		storage.reserve(len + storage.size());
+
+		if (len > 0 && memchr(str, 0, len) != NULL) { /* len check: empty input may come with a null `str` */
+			/* Fallback to slow path */
+			flags = flags | mime_string_flags::MIME_STRING_SEEN_ZEROES;
+			return append_c_string_filtered(str, len);
+		}
+
+		while (p < end && len > 0 &&
+			   (err_offset = rspamd_fast_utf8_validate((const unsigned char *) p, len)) > 0) {
+			auto cur_offset = err_offset - 1; /* the validator result is used as a 1-based error position */
+			storage.append(p, cur_offset);
+
+			while (cur_offset < len) {
+				auto tmp = cur_offset;
+				UChar32 uc;
+
+				U8_NEXT(p, cur_offset, len, uc);
+
+				if (uc < 0) {
+					storage.append("\uFFFD");
+					flags = flags | mime_string_flags::MIME_STRING_SEEN_INVALID;
+				}
+				else {
+					cur_offset = tmp; /* rewind: this code point is valid and is left for the next pass */
+					break;
+				}
+			}
+
+			p += cur_offset;
+			len = end - p;
+		}
+
+		storage.append(p, len);
+		return storage.size() - orig_size;
+	}
+
+ auto append_c_string_filtered(const CharT *str, std::size_t len) -> std::size_t
+ { /* Decodes `str` one code point at a time, filters it and re-encodes it into storage; returns the bytes added */
+ std::int32_t i = 0;// We have to use int32_t here as old libicu is brain-damaged
+ UChar32 uc;
+ char tmp[4]; /* scratch space for the UTF-8 encoding of a single code point */
+ auto orig_size = storage.size();
+ /* Slow path: every code point is decoded, checked and written individually */
+
+ storage.reserve(len + storage.size());
+
+ while (i < len) {
+ U8_NEXT(str, i, len, uc);
+
+ if (uc < 0) {
+ /* Invalid sequence: replace with U+FFFD and remember that we have seen it */
+ storage.append("\uFFFD");
+ flags = flags | mime_string_flags::MIME_STRING_SEEN_INVALID;
+ }
+ else {
+ if (filter_func) {
+ uc = filter_func(uc);
+ }
+
+ if (uc == 0) {
+ /* Special case: \0 (from the input or returned by the filter) is dropped and only flagged */
+ flags = flags | mime_string_flags::MIME_STRING_SEEN_ZEROES;
+ }
+ else {
+ std::int32_t o = 0;
+ U8_APPEND_UNSAFE(tmp, o, uc);
+ storage.append(tmp, o);
+ }
+ }
+ }
+
+ return storage.size() - orig_size;
+ }
+};
+
+}// namespace rspamd::mime
+
+
+#endif//RSPAMD_MIME_STRING_HXX
diff --git a/src/libmime/received.cxx b/src/libmime/received.cxx
new file mode 100644
index 0000000..dc16d9b
--- /dev/null
+++ b/src/libmime/received.cxx
@@ -0,0 +1,1017 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#include "libserver/url.h"
+#include "lua/lua_common.h"
+#include "libserver/cfg_file.h"
+#include "libserver/mempool_vars_internal.h"
+#include "mime_string.hxx"
+#include "smtp_parsers.h"
+#include "message.h"
+#include "received.hxx"
+#include "frozen/string.h"
+#include "frozen/unordered_map.h"
+
+namespace rspamd::mime {
+
+enum class received_part_type {
+ RSPAMD_RECEIVED_PART_FROM,
+ RSPAMD_RECEIVED_PART_BY,
+ RSPAMD_RECEIVED_PART_FOR,
+ RSPAMD_RECEIVED_PART_WITH,
+ RSPAMD_RECEIVED_PART_ID,
+ RSPAMD_RECEIVED_PART_UNKNOWN,
+};
+
+struct received_part {
+ received_part_type type;
+ mime_string data;
+ std::vector<mime_string> comments;
+
+ explicit received_part(received_part_type t)
+ : type(t),
+ data(received_char_filter)
+ {
+ }
+};
+
+static inline auto
+received_part_set_or_append(const gchar *begin,
+							gsize len,
+							mime_string &dest) -> void
+{
+	/* Appends `len` bytes starting at `begin` to `dest`, then strips spaces
+	 * and tabs from both ends of the accumulated value; empty input is a no-op */
+	if (len != 0) {
+		dest.append(begin, len);
+		dest.trim(" \t");
+	}
+}
+
+static auto
+received_process_part(const std::string_view &data,
+ received_part_type type,
+ std::ptrdiff_t &last,
+ received_part &npart) -> bool
+{ /* Parses one part of a Received header from `data` into `npart`; on success sets `last` to the consumed byte count */
+ auto obraces = 0, ebraces = 0; /* '(' and ')' seen inside the current comment */
+ auto seen_tcpinfo = false;
+ enum _parse_state {
+ skip_spaces, /* skip whitespace, then switch to next_state */
+ in_comment, /* inside a (possibly nested) parenthesised comment */
+ read_data, /* reading a data token */
+ read_tcpinfo, /* reading a token that starts with '[' up to the closing ']' */
+ all_done /* finished: report how much input has been consumed */
+ } state,
+ next_state;
+
+ /* In this function, we just process comments and data separately; nothing is stored for UNKNOWN parts */
+ const auto *p = data.data();
+ const auto *end = p + data.size();
+ const auto *c = p; /* start of the token that is currently being collected */
+
+ state = skip_spaces;
+ next_state = read_data;
+
+ while (p < end) {
+ switch (state) {
+ case skip_spaces:
+ if (!g_ascii_isspace(*p)) {
+ c = p;
+ state = next_state;
+ }
+ else {
+ p++;
+ }
+ break;
+ case in_comment:
+ if (*p == '(') {
+ obraces++;
+ }
+ else if (*p == ')') {
+ ebraces++;
+
+ if (ebraces >= obraces) { /* the outermost comment is closed */
+ if (type != received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN) {
+ if (p > c) {
+ npart.comments.emplace_back(received_char_filter);
+ auto &comment = npart.comments.back();
+ received_part_set_or_append(c, p - c,
+ comment);
+ }
+ }
+
+ p++;
+ c = p;
+ state = skip_spaces;
+ next_state = read_data;
+
+ continue;
+ }
+ }
+
+ p++;
+ break;
+ case read_data:
+ if (*p == '(') {
+ if (p > c) {
+ if (type != received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN) {
+ received_part_set_or_append(c, p - c,
+ npart.data);
+ }
+ }
+
+ state = in_comment;
+ obraces = 1;
+ ebraces = 0;
+ p++;
+ c = p;
+ }
+ else if (g_ascii_isspace(*p)) {
+ if (p > c) {
+ if (type != received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN) {
+ received_part_set_or_append(c, p - c,
+ npart.data);
+ }
+ }
+
+ state = skip_spaces;
+ next_state = read_data;
+ c = p;
+ }
+ else if (*p == ';') {
+ /* It is actually delimiter of date part if not in the comments */
+ if (p > c) {
+ if (type != received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN) {
+ received_part_set_or_append(c, p - c,
+ npart.data);
+ }
+ }
+
+ state = all_done;
+ continue;
+ }
+ else if (npart.data.size() > 0) {
+ /* We have already received data and find something with no ( */
+ if (!seen_tcpinfo && type == received_part_type::RSPAMD_RECEIVED_PART_FROM) {
+ /* Check if we have something special here, such as TCPinfo */
+ if (*c == '[') {
+ state = read_tcpinfo;
+ p++;
+ }
+ else {
+ state = all_done;
+ continue;
+ }
+ }
+ else {
+ state = all_done;
+ continue;
+ }
+ }
+ else {
+ p++;
+ }
+ break;
+ case read_tcpinfo:
+ if (*p == ']') {
+ received_part_set_or_append(c, p - c + 1,
+ npart.data);
+ seen_tcpinfo = TRUE;
+ state = skip_spaces;
+ next_state = read_data;
+ c = p;
+ }
+ p++;
+ break;
+ case all_done:
+ if (p > data.data()) {
+ last = p - data.data();
+ return true;
+ }
+ else {
+ /* Empty element */
+ return false;
+ }
+ break;
+ }
+ }
+
+ /* Leftover: the input ended before the all_done state was handled */
+ switch (state) {
+ case read_data:
+ if (p > c) {
+ if (type != received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN) {
+ received_part_set_or_append(c, p - c,
+ npart.data);
+ }
+
+ last = p - data.data();
+
+ return true;
+ }
+ break;
+ case skip_spaces:
+ if (p > data.data()) {
+ last = p - data.data();
+
+ return true;
+ } /* otherwise nothing was consumed: falls through to default and returns false */
+ default:
+ break;
+ }
+
+ return false;
+}
+
+template<std::size_t N>
+constexpr auto lit_compare_lowercase(const char lit[N], const char *in) -> bool
+{
+	/* True when the first N bytes of `in`, each mapped through lc_map, equal `lit`; `in` must hold at least N bytes */
+	for (std::size_t i = 0; i < N; i++) {
+		if (lc_map[(unsigned char) in[i]] != lit[i]) {
+			return false;
+		}
+	}
+	return true;
+}
+
+static auto
+received_spill(const std::string_view &in,
+			   std::ptrdiff_t &date_pos) -> std::vector<received_part>
+{
+	/* Splits a Received header value into typed parts (from, by, with, for, id, unknown).
+	 * An empty result means nothing usable; `date_pos` is set just past the ';' date separator if one is met. */
+	std::vector<received_part> parts;
+	std::ptrdiff_t pos = 0;
+	auto seen_from = false, seen_by = false;
+	const auto *p = in.data();
+	const auto *end = p + in.size();
+
+	auto skip_spaces = [&p, end]() {
+		while (p < end && g_ascii_isspace(*p)) {
+			p++;
+		}
+	};
+	skip_spaces();
+
+	/* Skip SMTP comments; `p` may already be at `end` for empty or all-space input, so check before reading */
+	if (p < end && *p == '(') {
+		auto obraces = 0, ebraces = 0;
+
+		while (p < end) {
+			if (*p == ')') {
+				ebraces++;
+			}
+			else if (*p == '(') {
+				obraces++;
+			}
+
+			p++;
+
+			if (obraces == ebraces) {
+				/* Skip spaces after */
+				skip_spaces();
+				break;
+			}
+		}
+	}
+
+	auto len = end - p;
+
+	if (len == 0) {
+		return parts;
+	}
+
+	auto maybe_process_part = [&](received_part_type what) -> bool {
+		parts.emplace_back(what);
+		auto &rcvd_part = parts.back();
+		auto chunk = std::string_view{p, (std::size_t)(end - p)};
+
+		if (!received_process_part(chunk, what, pos, rcvd_part)) {
+			parts.pop_back();
+
+			return false;
+		}
+
+		return true;
+	};
+
+	if (len > 4 && lit_compare_lowercase<4>("from", p)) {
+		p += sizeof("from") - 1;
+
+		/* We can now store from part */
+		if (!maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_FROM)) {
+			/* Do not accept malformed from */
+			return {};
+		}
+
+		g_assert(pos != 0);
+		p += pos;
+		len = end > p ? end - p : 0;
+		seen_from = true;
+	}
+
+	if (len > 2 && lit_compare_lowercase<2>("by", p)) {
+		p += sizeof("by") - 1;
+
+		if (!maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_BY)) {
+			return {};
+		}
+
+		g_assert(pos != 0);
+		p += pos;
+		len = end > p ? end - p : 0;
+		seen_by = true;
+	}
+
+	if (!seen_from && !seen_by) {
+		/* Useless received */
+		return {};
+	}
+
+	while (p < end) {
+		bool got_part = false;
+		if (*p == ';') {
+			/* We are at the date separator, stop here */
+			date_pos = p - in.data() + 1;
+			break;
+		}
+		else {
+			if (len > sizeof("with") && lit_compare_lowercase<4>("with", p)) {
+				p += sizeof("with") - 1;
+
+				got_part = maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_WITH);
+			}
+			else if (len > sizeof("for") && lit_compare_lowercase<3>("for", p)) {
+				p += sizeof("for") - 1;
+				got_part = maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_FOR);
+			}
+			else if (len > sizeof("id") && lit_compare_lowercase<2>("id", p)) {
+				p += sizeof("id") - 1;
+				got_part = maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_ID);
+			}
+			else {
+				while (p < end) {
+					if (!(g_ascii_isspace(*p) || *p == '(' || *p == ';')) {
+						p++;
+					}
+					else {
+						break;
+					}
+				}
+
+				if (p == end) {
+					return {};
+				}
+				else if (*p == ';') {
+					date_pos = p - in.data() + 1;
+					break;
+				}
+				else {
+					got_part = maybe_process_part(received_part_type::RSPAMD_RECEIVED_PART_UNKNOWN);
+				}
+			}
+
+			if (!got_part) {
+				p++;
+				len = end > p ? end - p : 0;
+			}
+			else {
+				g_assert(pos != 0);
+				p += pos;
+				len = end > p ? end - p : 0;
+			}
+		}
+	}
+
+	return parts;
+}
+
+#define RSPAMD_INET_ADDRESS_PARSE_RECEIVED \
+ (rspamd_inet_address_parse_flags)(RSPAMD_INET_ADDRESS_PARSE_REMOTE | RSPAMD_INET_ADDRESS_PARSE_NO_UNIX)
+
+static auto
+received_process_rdns(rspamd_mempool_t *pool,
+ const std::string_view &in,
+ mime_string &dest) -> bool
+{ /* Stores either the address from a "[ip]" literal or a leading hostname from `in` into `dest`; returns true if `dest` was set */
+ auto seen_dot = false;
+
+ const auto *p = in.data();
+ const auto *end = p + in.size();
+
+ if (in.empty()) {
+ return false;
+ }
+
+ if (*p == '[' && *(end - 1) == ']' && in.size() > 2) {
+ /* We have enclosed ip address: parse what is between the brackets */
+ auto *addr = rspamd_parse_inet_address_pool(p + 1,
+ (end - p) - 2,
+ pool,
+ RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
+
+ if (addr) {
+ const gchar *addr_str;
+
+ if (rspamd_inet_address_get_port(addr) != 0) {
+ addr_str = rspamd_inet_address_to_string_pretty(addr);
+ }
+ else {
+ addr_str = rspamd_inet_address_to_string(addr);
+ }
+
+ dest.assign_copy(std::string_view{addr_str});
+
+ return true;
+ }
+ }
+
+ auto hlen = 0u; /* length of the leading run of domain characters */
+
+ while (p < end) {
+ if (!g_ascii_isspace(*p) && rspamd_url_is_domain(*p)) {
+ if (*p == '.') {
+ seen_dot = true;
+ }
+
+ hlen++;
+ }
+ else {
+ break;
+ }
+
+ p++;
+ }
+
+ if (hlen > 0) {
+ if (p == end || (seen_dot && (g_ascii_isspace(*p) || *p == '[' || *p == '('))) {
+ /* All data looks like a hostname, or a dotted name is followed by a space, '[' or '(' */
+ dest.assign_copy(std::string_view{in.data(), hlen});
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static auto
+received_process_host_tcpinfo(rspamd_mempool_t *pool,
+ received_header &rh,
+ const std::string_view &in) -> bool
+{ /* Fills rh.addr and rh.real_ip (and possibly rh.real_hostname) from `in` */
+ rspamd_inet_addr_t *addr = nullptr;
+ auto ret = false; /* NOTE: becomes true only when a hostname is stored, not when just an IP is parsed */
+
+ if (in.empty()) {
+ return false;
+ }
+
+ if (in[0] == '[') {
+ /* Likely Exim version */
+
+ auto brace_pos = in.find(']');
+
+ if (brace_pos != std::string_view::npos) {
+ auto substr_addr = in.substr(1, brace_pos - 1);
+ addr = rspamd_parse_inet_address_pool(substr_addr.data(),
+ substr_addr.size(),
+ pool,
+ RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
+
+ if (addr) {
+ rh.addr = addr;
+ rh.real_ip.assign_copy(std::string_view(rspamd_inet_address_to_string(addr)));
+ }
+ }
+ }
+ else {
+ if (g_ascii_isxdigit(in[0])) {
+ /* Try to parse IP address: the whole input is treated as a bare address */
+ addr = rspamd_parse_inet_address_pool(in.data(),
+ in.size(), pool, RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
+ if (addr) {
+ rh.addr = addr;
+ rh.real_ip.assign_copy(std::string_view(rspamd_inet_address_to_string(addr)));
+ }
+ }
+
+ if (!addr) {
+ /* Try canonical Postfix version: rdns [ip] */
+ auto obrace_pos = in.find('[');
+
+ if (obrace_pos != std::string_view::npos) {
+ auto ebrace_pos = in.rfind(']');
+
+ if (ebrace_pos != std::string_view::npos && ebrace_pos > obrace_pos) {
+ auto substr_addr = in.substr(obrace_pos + 1,
+ ebrace_pos - obrace_pos - 1);
+ addr = rspamd_parse_inet_address_pool(substr_addr.data(),
+ substr_addr.size(),
+ pool,
+ RSPAMD_INET_ADDRESS_PARSE_RECEIVED);
+
+ if (addr) {
+ rh.addr = addr;
+ rh.real_ip.assign_copy(std::string_view(rspamd_inet_address_to_string(addr)));
+
+ /* Process with rDNS: everything before '[' is tried as the hostname */
+ auto rdns_substr = in.substr(0, obrace_pos);
+
+ if (received_process_rdns(pool, rdns_substr, rh.real_hostname)) {
+ ret = true;
+ }
+ }
+ }
+ }
+ else {
+ /* Hostname or some crap, sigh... */
+ if (received_process_rdns(pool, in, rh.real_hostname)) {
+ ret = true;
+ }
+ }
+ }
+ }
+
+ return ret;
+}
+
+static void
+received_process_from(rspamd_mempool_t *pool,
+ const received_part &rpart,
+ received_header &rh)
+{ /* Fills the address and hostname fields of `rh` from the data and the first comment of a `from` part */
+ if (rpart.data.size() > 0) {
+ /* We have seen multiple cases:
+ * - [ip] (hostname/unknown [real_ip])
+ * - helo (hostname/unknown [real_ip])
+ * - [ip]
+ * - hostname
+ * - hostname ([ip]:port helo=xxx)
+ * Maybe more...
+ */
+ auto seen_ip_in_data = false;
+
+ if (!rpart.comments.empty()) {
+ /* We can have info within comment as part of RFC; only the first comment is inspected */
+ received_process_host_tcpinfo(
+ pool, rh,
+ rpart.comments[0].as_view());
+ }
+
+ if (rh.real_ip.size() == 0) {
+ /* Try to do the same with data */
+ if (received_process_host_tcpinfo(
+ pool, rh,
+ rpart.data.as_view())) {
+ seen_ip_in_data = true;
+ }
+ }
+
+ if (!seen_ip_in_data) {
+ if (rh.real_ip.size() != 0) {
+ /* Get announced hostname (usually helo) */
+ received_process_rdns(pool,
+ rpart.data.as_view(),
+ rh.from_hostname);
+ }
+ else {
+ received_process_host_tcpinfo(pool,
+ rh, rpart.data.as_view());
+ }
+ }
+ }
+ else {
+ /* No data in this part: only the first comment can be inspected */
+ if (!rpart.comments.empty()) {
+ received_process_host_tcpinfo(
+ pool, rh,
+ rpart.comments[0].as_view());
+ }
+ }
+}
+
+static auto
+received_header_parse(received_header_chain &chain, rspamd_mempool_t *pool,
+ const std::string_view &in,
+ struct rspamd_mime_header *hdr) -> bool
+{ /* Parses one Received header value `in` into a new entry of `chain`; returns false when no parts can be extracted */
+ std::ptrdiff_t date_pos = -1; /* stays -1 unless received_spill reports a date separator */
+
+ static constexpr const auto protos_map = frozen::make_unordered_map<frozen::string, received_flags>({{"smtp", received_flags::SMTP},
+ {"esmtp", received_flags::ESMTP},
+ {"esmtpa", received_flags::ESMTPA |
+ received_flags::AUTHENTICATED},
+ {"esmtpsa", received_flags::ESMTPSA |
+ received_flags::SSL |
+ received_flags::AUTHENTICATED},
+ {"esmtps", received_flags::ESMTPS |
+ received_flags::SSL},
+ {"lmtp", received_flags::LMTP},
+ {"imap", received_flags::IMAP},
+ {"imaps", received_flags::IMAP |
+ received_flags::SSL},
+ {"http", received_flags::HTTP},
+ {"https", received_flags::HTTP |
+ received_flags::SSL},
+ {"local", received_flags::LOCAL}});
+
+ auto parts = received_spill(in, date_pos);
+
+ if (parts.empty()) {
+ return false;
+ }
+
+ auto &rh = chain.new_received();
+
+ rh.flags = received_flags::UNKNOWN;
+ rh.hdr = hdr;
+
+ for (const auto &part: parts) {
+ switch (part.type) {
+ case received_part_type::RSPAMD_RECEIVED_PART_FROM:
+ received_process_from(pool, part, rh);
+ break;
+ case received_part_type::RSPAMD_RECEIVED_PART_BY:
+ received_process_rdns(pool,
+ part.data.as_view(),
+ rh.by_hostname);
+ break;
+ case received_part_type::RSPAMD_RECEIVED_PART_WITH:
+ if (part.data.size() > 0) {
+ auto proto_flag_it = protos_map.find(part.data.as_view());
+
+ if (proto_flag_it != protos_map.end()) {
+ rh.flags = proto_flag_it->second; /* replaces the UNKNOWN default set above */
+ }
+ }
+ break;
+ case received_part_type::RSPAMD_RECEIVED_PART_FOR:
+ rh.for_mbox.assign_copy(part.data);
+ rh.for_addr = rspamd_email_address_from_smtp(rh.for_mbox.data(),
+ rh.for_mbox.size());
+ break;
+ default:
+ /* Do nothing: other part types are not used here */
+ break;
+ }
+ }
+
+ if (!rh.real_hostname.empty() && rh.from_hostname.empty()) {
+ rh.from_hostname.assign_copy(rh.real_hostname);
+ }
+
+ if (date_pos > 0 && date_pos < in.size()) {
+ auto date_sub = in.substr(date_pos); /* everything after the ';' separator */
+ rh.timestamp = rspamd_parse_smtp_date((const unsigned char *) date_sub.data(),
+ date_sub.size(), nullptr);
+ }
+
+ return true;
+}
+
+static auto
+received_maybe_fix_task(struct rspamd_task *task) -> bool
+{ /* Reconciles the top Received header with the task's connection data; returns true when one of the two branches below ran */
+ auto *recv_chain_ptr = static_cast<received_header_chain *>(MESSAGE_FIELD(task, received_headers));
+
+ if (recv_chain_ptr) {
+ auto need_recv_correction = false;
+
+ auto top_recv_maybe = recv_chain_ptr->get_received(0);
+
+ if (top_recv_maybe.has_value()) {
+ auto &top_recv = top_recv_maybe.value().get();
+
+ const auto *raddr = top_recv.addr;
+ if (top_recv.real_ip.size() == 0 || (task->cfg && task->cfg->ignore_received)) {
+ need_recv_correction = true;
+ }
+ else if (!(task->flags & RSPAMD_TASK_FLAG_NO_IP) && task->from_addr) {
+ if (!raddr) {
+ need_recv_correction = true;
+ }
+ else {
+ if (rspamd_inet_address_compare(raddr, task->from_addr, FALSE) != 0) {
+ need_recv_correction = true;
+ }
+ }
+ }
+
+ if (need_recv_correction && !(task->flags & RSPAMD_TASK_FLAG_NO_IP) && task->from_addr) {
+ msg_debug_task("the first received seems to be"
+ " not ours, prepend it with fake one");
+
+ auto &trecv = recv_chain_ptr->new_received(received_header_chain::append_type::append_head);
+ trecv.flags |= received_flags::ARTIFICIAL;
+
+ if (task->flags & RSPAMD_TASK_FLAG_SSL) {
+ trecv.flags |= received_flags::SSL;
+ }
+
+ if (task->auth_user) {
+ trecv.flags |= received_flags::AUTHENTICATED;
+ }
+
+ trecv.real_ip.assign_copy(std::string_view(rspamd_inet_address_to_string(task->from_addr)));
+
+ const auto *mta_name = (const char *) rspamd_mempool_get_variable(task->task_pool,
+ RSPAMD_MEMPOOL_MTA_NAME);
+
+ if (mta_name) {
+ trecv.by_hostname.assign_copy(std::string_view(mta_name));
+ }
+ trecv.addr = rspamd_inet_address_copy(task->from_addr,
+ task->task_pool);
+
+ if (task->hostname) {
+ trecv.real_hostname.assign_copy(std::string_view(task->hostname));
+ trecv.from_hostname.assign_copy(trecv.real_hostname);
+ }
+
+ return true;
+ }
+
+ /* Extract data from received header if we were not given IP */
+ if (!need_recv_correction && (task->flags & RSPAMD_TASK_FLAG_NO_IP) &&
+ (task->cfg && !task->cfg->ignore_received)) {
+ if (!top_recv.real_ip.empty()) {
+ if (!rspamd_parse_inet_address(&task->from_addr,
+ top_recv.real_ip.data(),
+ top_recv.real_ip.size(),
+ RSPAMD_INET_ADDRESS_PARSE_NO_UNIX)) {
+ msg_warn_task("cannot get IP from received header: '%s'",
+ top_recv.real_ip.data());
+ task->from_addr = nullptr;
+ }
+ }
+ if (!top_recv.real_hostname.empty()) {
+ task->hostname = top_recv.real_hostname.data(); /* points into the header's own string, no copy is made */
+ }
+
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/**
+ * Push the whole chain of received headers to Lua as an array of tables.
+ *
+ * Each element has the fields `raw` (only when the decoded header is
+ * available), `flags` (table of booleans), `from_hostname`, `real_hostname`,
+ * `from_ip`, `by_hostname`, `for`, `real_ip`, `proto` and `timestamp`.
+ *
+ * @param chain chain to export; may be nullptr
+ * @param L Lua state
+ * @return false (nothing is pushed) if `chain` is nullptr, true otherwise
+ */
+static auto
+received_export_to_lua(received_header_chain *chain, lua_State *L) -> bool
+{
+	if (chain == nullptr) {
+		return false;
+	}
+
+	/* Stores a boolean flag into the table on the top of the stack */
+	auto set_flag = [L](const received_header &rh, received_flags fl, const char *name) {
+		lua_pushboolean(L, (rh.flags & fl) != received_flags::DEFAULT);
+		lua_setfield(L, -2, name);
+	};
+
+	/* Stores either a string or nil (for empty strings) into the table on the top */
+	auto set_string_or_nil = [L](const mime_string &st, const char *field) {
+		if (!st.empty()) {
+			lua_pushlstring(L, st.data(), st.size());
+		}
+		else {
+			lua_pushnil(L);
+		}
+		lua_setfield(L, -2, field);
+	};
+
+	lua_createtable(L, chain->size(), 0);
+
+	auto idx = 0;
+
+	for (const auto &rh: chain->as_vector()) {
+		lua_createtable(L, 0, 10);
+
+		if (rh.hdr && rh.hdr->decoded) {
+			rspamd_lua_table_set(L, "raw", rh.hdr->decoded);
+		}
+
+		/* Nested table with boolean flags */
+		lua_createtable(L, 0, 3);
+		set_flag(rh, received_flags::ARTIFICIAL, "artificial");
+		set_flag(rh, received_flags::AUTHENTICATED, "authenticated");
+		set_flag(rh, received_flags::SSL, "ssl");
+		lua_setfield(L, -2, "flags");
+
+		set_string_or_nil(rh.from_hostname, "from_hostname");
+		set_string_or_nil(rh.real_hostname, "real_hostname");
+		set_string_or_nil(rh.real_ip, "from_ip");
+		set_string_or_nil(rh.by_hostname, "by_hostname");
+		set_string_or_nil(rh.for_mbox, "for");
+
+		if (rh.addr) {
+			rspamd_lua_ip_push(L, rh.addr);
+		}
+		else {
+			lua_pushnil(L);
+		}
+		lua_setfield(L, -2, "real_ip");
+
+		lua_pushstring(L, received_protocol_to_string(rh.flags));
+		lua_setfield(L, -2, "proto");
+
+		lua_pushinteger(L, rh.timestamp);
+		lua_setfield(L, -2, "timestamp");
+
+		/* Lua arrays are 1-based */
+		lua_rawseti(L, -2, ++idx);
+	}
+
+	return true;
+}
+
+}// namespace rspamd::mime
+
+/*
+ * C entry point: parses one Received header value into the task's chain,
+ * creating the chain on first use.
+ */
+bool rspamd_received_header_parse(struct rspamd_task *task,
+								  const char *data, size_t sz,
+								  struct rspamd_mime_header *hdr)
+{
+	using rspamd::mime::received_header_chain;
+
+	auto *chain = static_cast<received_header_chain *>(MESSAGE_FIELD(task, received_headers));
+
+	if (!chain) {
+		/* This constructor automatically registers dtor in mempool */
+		chain = new received_header_chain(task);
+		MESSAGE_FIELD(task, received_headers) = (void *) chain;
+	}
+
+	const std::string_view input{data, sz};
+
+	return rspamd::mime::received_header_parse(*chain, task->task_pool, input, hdr);
+}
+
+/* C entry point forwarding to rspamd::mime::received_maybe_fix_task */
+bool rspamd_received_maybe_fix_task(struct rspamd_task *task)
+{
+	using rspamd::mime::received_maybe_fix_task;
+
+	return received_maybe_fix_task(task);
+}
+
+/* C entry point: exports the task's received headers chain (if any) to Lua */
+bool rspamd_received_export_to_lua(struct rspamd_task *task, lua_State *L)
+{
+	auto *chain = static_cast<rspamd::mime::received_header_chain *>(MESSAGE_FIELD(task, received_headers));
+
+	return rspamd::mime::received_export_to_lua(chain, L);
+}
+
+/* Tests part */
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+/*
+ * Unit tests for the Received header parser.
+ * Each case is a pair of a raw header value and the exact set of non-empty
+ * fields (as reported by received_header::as_map) the parser must produce.
+ */
+TEST_SUITE("received")
+{
+ TEST_CASE("parse received")
+ {
+ using namespace std::string_view_literals;
+ using map_type = ankerl::unordered_dense::map<std::string_view, std::string_view>;
+ std::vector<std::pair<std::string_view, map_type>> cases{
+ // Simple received
+ {"from smtp11.mailtrack.pl (smtp11.mailtrack.pl [185.243.30.90])"sv,
+ {{"real_ip", "185.243.30.90"},
+ {"real_hostname", "smtp11.mailtrack.pl"},
+ {"from_hostname", "smtp11.mailtrack.pl"}}},
+ // Real Postfix IPv6 received
+ {"from server.chat-met-vreemden.nl (unknown [IPv6:2a01:7c8:aab6:26d:5054:ff:fed1:1da2])\n"
+ "\t(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))\n"
+ "\t(Client did not present a certificate)\n"
+ "\tby mx1.freebsd.org (Postfix) with ESMTPS id CF0171862\n"
+ "\tfor <test@example.com>; Mon, 6 Jul 2015 09:01:20 +0000 (UTC)\n"
+ "\t(envelope-from upwest201diana@outlook.com)"sv,
+ {{"real_ip", "2a01:7c8:aab6:26d:5054:ff:fed1:1da2"},
+ {"from_hostname", "server.chat-met-vreemden.nl"},
+ {"by_hostname", "mx1.freebsd.org"},
+ {"for_mbox", "<test@example.com>"}}},
+ // Exim IPv4 received
+ {"from localhost ([127.0.0.1]:49019 helo=hummus.csx.cam.ac.uk)\n"
+ " by hummus.csx.cam.ac.uk with esmtp (Exim 4.91-pdpfix1)\n"
+ " (envelope-from <exim-dev-bounces@exim.org>)\n"
+ " id 1fZ55o-0006DP-3H\n"
+ " for <xxx@xxx.xxx>; Sat, 30 Jun 2018 02:54:28 +0100"sv,
+ {
+ {"from_hostname", "localhost"},
+ {"real_ip", "127.0.0.1"},
+ {"for_mbox", "<xxx@xxx.xxx>"},
+ {"by_hostname", "hummus.csx.cam.ac.uk"},
+ }},
+ // Exim IPv6 received
+ {"from smtp.spodhuis.org ([2a02:898:31:0:48:4558:736d:7470]:38689\n"
+ " helo=mx.spodhuis.org)\n"
+ " by hummus.csx.cam.ac.uk with esmtpsa (TLSv1.3:TLS_AES_256_GCM_SHA384:256)\n"
+ " (Exim 4.91-pdpfix1+cc) (envelope-from <xxx@exim.org>)\n"
+ " id 1fZ55k-0006CO-9M\n"
+ " for exim-dev@exim.org; Sat, 30 Jun 2018 02:54:24 +0100"sv,
+ {
+ {"from_hostname", "smtp.spodhuis.org"},
+ {"real_ip", "2a02:898:31:0:48:4558:736d:7470"},
+ {"for_mbox", "exim-dev@exim.org"},
+ {"by_hostname", "hummus.csx.cam.ac.uk"},
+ }},
+ // Haraka received
+ {"from aaa.cn ([1.1.1.1]) by localhost.localdomain (Haraka/2.8.18) with "
+ "ESMTPA id 349C9C2B-491A-4925-A687-3EF14038C344.1 envelope-from <huxin@xxx.com> "
+ "(authenticated bits=0); Tue, 03 Jul 2018 14:18:13 +0200"sv,
+ {
+ {"from_hostname", "aaa.cn"},
+ {"real_ip", "1.1.1.1"},
+ {"by_hostname", "localhost.localdomain"},
+ }},
+ // Invalid by
+ {"from [192.83.172.101] (HELLO 148.251.238.35) (148.251.238.35) "
+ "by guovswzqkvry051@sohu.com with gg login "
+ "by AOL 6.0 for Windows US sub 008 SMTP ; Tue, 03 Jul 2018 09:01:47 -0300"sv,
+ {
+ {"from_hostname", "192.83.172.101"},
+ {"real_ip", "192.83.172.101"},
+ }},
+ // Invalid hostinfo
+ {"from example.com ([]) by example.com with ESMTP id 2019091111 ;"
+ " Thu, 26 Sep 2019 11:19:07 +0200"sv,
+ {
+ {"by_hostname", "example.com"},
+ {"from_hostname", "example.com"},
+ {"real_hostname", "example.com"},
+ }},
+ // Different real and announced hostnames + broken crap
+ {"from 171-29.br (1-1-1-1.z.com.br [1.1.1.1]) by x.com.br (Postfix) "
+ "with;ESMTP id 44QShF6xj4z1X for <hey@y.br>; Thu, 21 Mar 2019 23:45:46 -0300 "
+ ": <g @yi.br>"sv,
+ {
+ {"real_ip", "1.1.1.1"},
+ {"from_hostname", "171-29.br"},
+ {"real_hostname", "1-1-1-1.z.com.br"},
+ {"by_hostname", "x.com.br"},
+ }},
+ // Different real and announced ips + no hostname
+ {"from [127.0.0.1] ([127.0.0.2]) by smtp.gmail.com with ESMTPSA id xxxololo"sv,
+ {
+ {"real_ip", "127.0.0.2"},
+ {"from_hostname", "127.0.0.1"},
+ {"by_hostname", "smtp.gmail.com"},
+ }},
+ // Different real and announced hostnames (the announced one is an IP)
+ {"from 185.118.166.127 (steven2.zhou01.pserver.ru [185.118.166.127]) "
+ "by mail.832zsu.cn (Postfix) with ESMTPA id AAD722133E34"sv,
+ {
+ {"real_ip", "185.118.166.127"},
+ {"from_hostname", "185.118.166.127"},
+ {"real_hostname", "steven2.zhou01.pserver.ru"},
+ {"by_hostname", "mail.832zsu.cn"},
+ }},
+ // \0 in received must be filtered
+ {"from smtp11.mailt\0rack.pl (smtp11.mail\0track.pl [1\085.243.30.90])"sv,
+ {{"real_ip", "185.243.30.90"},
+ {"real_hostname", "smtp11.mailtrack.pl"},
+ {"from_hostname", "smtp11.mailtrack.pl"}}},
+ // No from part
+ {"by mail.832zsu.cn (Postfix) with ESMTPA id AAD722133E34"sv,
+ {
+ {"by_hostname", "mail.832zsu.cn"},
+ }},
+ // From part is in the comment
+ {"(from asterisk@localhost)\n"
+ " by pbx.xxx.com (8.14.7/8.14.7/Submit) id 076Go4wD014562;\n"
+ " Thu, 6 Aug 2020 11:50:04 -0500"sv,
+ {
+ {"by_hostname", "pbx.xxx.com"},
+ }},
+ };
+ rspamd_mempool_t *pool = rspamd_mempool_new_default("rcvd test", 0);
+
+ for (auto &&c: cases) {
+ SUBCASE(c.first.data())
+ {
+ // A standalone chain (default constructor) is not bound to any task
+ rspamd::mime::received_header_chain chain;
+ auto ret = rspamd::mime::received_header_parse(chain, pool,
+ c.first, nullptr);
+ CHECK(ret == true);
+ auto &&rh = chain.get_received(0);
+ CHECK(rh.has_value());
+ auto res = rh.value().get().as_map();
+
+ // Every expected field must be present with the expected value...
+ for (const auto &expected: c.second) {
+ CHECK_MESSAGE(res.contains(expected.first), expected.first.data());
+ CHECK(res[expected.first] == expected.second);
+ }
+ // ...and the parser must not produce any field that is not expected
+ for (const auto &existing: res) {
+ CHECK_MESSAGE(c.second.contains(existing.first), existing.first.data());
+ CHECK(c.second[existing.first] == existing.second);
+ }
+ }
+ }
+
+ rspamd_mempool_delete(pool);
+ }
+} \ No newline at end of file
diff --git a/src/libmime/received.h b/src/libmime/received.h
new file mode 100644
index 0000000..46608a3
--- /dev/null
+++ b/src/libmime/received.h
@@ -0,0 +1,68 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#ifndef RSPAMD_RECEIVED_H
+#define RSPAMD_RECEIVED_H
+
+#include "config.h"
+#include "libutil/addr.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * C bindings for C++ received code
+ */
+
+struct rspamd_email_address;
+struct rspamd_received_header_chain;
+struct rspamd_mime_header;
+
+/**
+ * Parse a single Received header value and store the result in the chain of
+ * received headers attached to the task (the chain is created on first use)
+ * @param task task that owns the chain of received headers
+ * @param data header value to parse; its length is passed explicitly in `sz`
+ * @param sz length of `data` in bytes
+ * @param hdr MIME header structure that is passed to the parser along with the data
+ * @return the value returned by the underlying C++ parser
+ */
+bool rspamd_received_header_parse(struct rspamd_task *task,
+ const char *data, size_t sz, struct rspamd_mime_header *hdr);
+
+
+/**
+ * Compare the task connection data with the topmost received header and fix
+ * whichever side needs it: either prepend an artificial received header built
+ * from the task data, or fill the task address/hostname from the header
+ * @param task task to process
+ * @return true if either the task or its received headers were adjusted
+ */
+bool rspamd_received_maybe_fix_task(struct rspamd_task *task);
+
+struct lua_State;
+/**
+ * Push received headers chain to lua as an array of tables (one per header)
+ * @param task task whose received headers are exported
+ * @param L lua state to push the result to
+ * @return false if the task has no received headers chain (nothing is pushed
+ * in this case), true otherwise
+ */
+bool rspamd_received_export_to_lua(struct rspamd_task *task, struct lua_State *L);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif//RSPAMD_RECEIVED_H
diff --git a/src/libmime/received.hxx b/src/libmime/received.hxx
new file mode 100644
index 0000000..4f423f1
--- /dev/null
+++ b/src/libmime/received.hxx
@@ -0,0 +1,314 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#ifndef RSPAMD_RECEIVED_HXX
+#define RSPAMD_RECEIVED_HXX
+#pragma once
+
+#include "config.h"
+#include "received.h"
+#include "mime_string.hxx"
+#include "libmime/email_addr.h"
+#include "libserver/task.h"
+#include "contrib/ankerl/unordered_dense.h"
+#include <vector>
+#include <string_view>
+#include <utility>
+#include <optional>
+
+namespace rspamd::mime {
+
+/**
+ * Character filter used for the string fields of a received header:
+ * printable code points are converted to lower case, anything else is
+ * mapped to 0.
+ */
+static inline auto
+received_char_filter(UChar32 uc) -> UChar32
+{
+	return u_isprint(uc) ? u_tolower(uc) : 0;
+}
+
+/**
+ * Bit flags describing a received header.
+ */
+enum class received_flags {
+	DEFAULT = 0,
+	/* Protocols: these bits are selected by received_type_apply_protocols_mask */
+	SMTP = 1u << 0u,
+	ESMTP = 1u << 1u,
+	ESMTPA = 1u << 2u,
+	ESMTPS = 1u << 3u,
+	ESMTPSA = 1u << 4u,
+	LMTP = 1u << 5u,
+	IMAP = 1u << 6u,
+	LOCAL = 1u << 7u,
+	HTTP = 1u << 8u,
+	MAPI = 1u << 9u,
+	/* The remaining bits are not a part of the protocols mask */
+	UNKNOWN = 1u << 10u,
+	ARTIFICIAL = 1u << 11u,
+	SSL = 1u << 12u,
+	AUTHENTICATED = 1u << 13u,
+};
+
+/* Bitwise union of two flag sets */
+constexpr received_flags operator|(received_flags lhs, received_flags rhs)
+{
+	using raw_type = std::underlying_type_t<received_flags>;
+
+	const auto merged = static_cast<raw_type>(lhs) | static_cast<raw_type>(rhs);
+
+	return static_cast<received_flags>(merged);
+}
+
+/* Adds the bits of `rhs` to `lhs` in place and returns the updated value */
+constexpr received_flags operator|=(received_flags &lhs, const received_flags rhs)
+{
+	using raw_type = std::underlying_type_t<received_flags>;
+
+	const auto merged = static_cast<raw_type>(lhs) | static_cast<raw_type>(rhs);
+	lhs = static_cast<received_flags>(merged);
+
+	return lhs;
+}
+
+/* Bitwise intersection of two flag sets */
+constexpr received_flags operator&(received_flags lhs, received_flags rhs)
+{
+	using raw_type = std::underlying_type_t<received_flags>;
+
+	const auto common = static_cast<raw_type>(lhs) & static_cast<raw_type>(rhs);
+
+	return static_cast<received_flags>(common);
+}
+
+/* True when no flag bit is set, i.e. the value equals received_flags::DEFAULT */
+constexpr bool operator!(received_flags fl)
+{
+	return static_cast<std::underlying_type_t<received_flags>>(fl) == 0;
+}
+
+/**
+ * Strip everything but the protocol bits from a flag set
+ * @param fl flags to filter
+ * @return `fl` restricted to the protocol bits
+ */
+constexpr received_flags received_type_apply_protocols_mask(received_flags fl)
+{
+	constexpr auto protocols_mask = received_flags::SMTP |
+									received_flags::ESMTP |
+									received_flags::ESMTPA |
+									received_flags::ESMTPS |
+									received_flags::ESMTPSA |
+									received_flags::IMAP |
+									received_flags::HTTP |
+									received_flags::LOCAL |
+									received_flags::MAPI |
+									received_flags::LMTP;
+
+	return fl & protocols_mask;
+}
+
+/**
+ * Convert the protocol part of a flag set to its textual name
+ * @param fl flags of a received header (non-protocol bits are ignored)
+ * @return static string with the protocol name, or "unknown" if the protocol
+ * bits do not match exactly one known protocol
+ */
+constexpr const char *received_protocol_to_string(received_flags fl)
+{
+	switch (received_type_apply_protocols_mask(fl)) {
+	case received_flags::SMTP:
+		return "smtp";
+	case received_flags::ESMTP:
+		return "esmtp";
+	case received_flags::ESMTPS:
+		return "esmtps";
+	case received_flags::ESMTPA:
+		return "esmtpa";
+	case received_flags::ESMTPSA:
+		return "esmtpsa";
+	case received_flags::LMTP:
+		return "lmtp";
+	case received_flags::IMAP:
+		return "imap";
+	case received_flags::HTTP:
+		return "http";
+	case received_flags::LOCAL:
+		return "local";
+	case received_flags::MAPI:
+		return "mapi";
+	default:
+		break;
+	}
+
+	return "unknown";
+}
+
+/**
+ * A single parsed Received header.
+ * The structure is move-only: it keeps raw C pointers, and `for_addr` is
+ * released in the destructor.
+ */
+struct received_header {
+	mime_string from_hostname;
+	mime_string real_hostname;
+	mime_string real_ip;
+	mime_string by_hostname;
+	mime_string for_mbox;
+	struct rspamd_email_address *for_addr = nullptr;
+	rspamd_inet_addr_t *addr = nullptr;
+	struct rspamd_mime_header *hdr = nullptr;
+	time_t timestamp = 0;
+	received_flags flags = received_flags::DEFAULT; /* Combination of received_flags bits */
+
+	/* Host and IP fields are created with received_char_filter, for_mbox is left unfiltered */
+	received_header() noexcept
+		: from_hostname(received_char_filter),
+		  real_hostname(received_char_filter),
+		  real_ip(received_char_filter),
+		  by_hostname(received_char_filter),
+		  for_mbox()
+	{
+	}
+	/* We have raw C pointers, so copy is explicitly disabled */
+	received_header(const received_header &other) = delete;
+	/* Move construction is expressed via move assignment */
+	received_header(received_header &&other) noexcept
+	{
+		*this = std::move(other);
+	}
+
+	received_header &operator=(received_header &&other) noexcept
+	{
+		if (this == &other) {
+			return *this;
+		}
+
+		from_hostname = std::move(other.from_hostname);
+		real_hostname = std::move(other.real_hostname);
+		real_ip = std::move(other.real_ip);
+		by_hostname = std::move(other.by_hostname);
+		for_mbox = std::move(other.for_mbox);
+		timestamp = other.timestamp;
+		flags = other.flags;
+		/* Pointers are swapped, so `other` takes over whatever we held before */
+		std::swap(for_addr, other.for_addr);
+		std::swap(addr, other.addr);
+		std::swap(hdr, other.hdr);
+
+		return *this;
+	}
+
+	/* Unit tests helper */
+	static auto from_map(const ankerl::unordered_dense::map<std::string_view, std::string_view> &map) -> received_header
+	{
+		received_header rh;
+
+		/* Copies map[key] to the destination string if the key is present */
+		auto fill_from = [&map](std::string_view key, mime_string &dest) {
+			if (auto found = map.find(key); found != map.end()) {
+				dest.assign_copy(found->second);
+			}
+		};
+
+		fill_from("from_hostname", rh.from_hostname);
+		fill_from("real_hostname", rh.real_hostname);
+		fill_from("by_hostname", rh.by_hostname);
+		fill_from("real_ip", rh.real_ip);
+		fill_from("for_mbox", rh.for_mbox);
+
+		return rh;
+	}
+
+	/* Returns views of all non-empty string fields keyed by the field name */
+	auto as_map() const -> ankerl::unordered_dense::map<std::string_view, std::string_view>
+	{
+		ankerl::unordered_dense::map<std::string_view, std::string_view> map;
+
+		auto put_if_set = [&map](std::string_view key, const mime_string &value) {
+			if (!value.empty()) {
+				map[key] = value.as_view();
+			}
+		};
+
+		put_if_set("from_hostname", from_hostname);
+		put_if_set("real_hostname", real_hostname);
+		put_if_set("by_hostname", by_hostname);
+		put_if_set("real_ip", real_ip);
+		put_if_set("for_mbox", for_mbox);
+
+		return map;
+	}
+
+	~received_header()
+	{
+		if (for_addr != nullptr) {
+			rspamd_email_address_free(for_addr);
+		}
+	}
+};
+
+/**
+ * Ordered collection of received headers.
+ */
+class received_header_chain {
+public:
+	/*
+	 * Creates a chain bound to a task: a destructor that deletes this object
+	 * is registered in the task pool.
+	 */
+	explicit received_header_chain(struct rspamd_task *task)
+	{
+		headers.reserve(2);
+		rspamd_mempool_add_destructor(task->task_pool,
+									  received_header_chain::received_header_chain_pool_dtor, this);
+	}
+	/* Creates a standalone chain with no pool destructor registered */
+	explicit received_header_chain()
+	{
+		headers.reserve(2);
+	}
+
+	enum class append_type {
+		append_tail,
+		append_head
+	};
+
+	/* Adds an empty header to the tail (default) or to the head and returns it */
+	auto new_received(append_type how = append_type::append_tail) -> received_header &
+	{
+		if (how != append_type::append_tail) {
+			headers.insert(headers.begin(), received_header());
+
+			return headers.front();
+		}
+
+		headers.emplace_back();
+
+		return headers.back();
+	}
+	/* Moves an existing header to the tail (default) or to the head and returns it */
+	auto new_received(received_header &&hdr, append_type how = append_type::append_tail) -> received_header &
+	{
+		if (how != append_type::append_tail) {
+			headers.insert(headers.begin(), std::move(hdr));
+
+			return headers.front();
+		}
+
+		headers.emplace_back(std::move(hdr));
+
+		return headers.back();
+	}
+	/* Returns a reference to the nth header or nullopt if the index is out of range */
+	auto get_received(std::size_t nth) -> std::optional<std::reference_wrapper<received_header>>
+	{
+		if (nth >= headers.size()) {
+			return std::nullopt;
+		}
+
+		return headers[nth];
+	}
+	auto size() const -> std::size_t
+	{
+		return headers.size();
+	}
+	constexpr auto as_vector() const -> const std::vector<received_header> &
+	{
+		return headers;
+	}
+
+private:
+	/* Pool destructor callback registered by the task-bound constructor */
+	static auto received_header_chain_pool_dtor(void *ptr) -> void
+	{
+		delete static_cast<received_header_chain *>(ptr);
+	}
+	std::vector<received_header> headers;
+};
+
+}// namespace rspamd::mime
+
+#endif//RSPAMD_RECEIVED_HXX
diff --git a/src/libmime/scan_result.c b/src/libmime/scan_result.c
new file mode 100644
index 0000000..a6bc0cb
--- /dev/null
+++ b/src/libmime/scan_result.c
@@ -0,0 +1,1106 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "config.h"
+#include "mem_pool.h"
+#include "scan_result.h"
+#include "rspamd.h"
+#include "message.h"
+#include "lua/lua_common.h"
+#include "libserver/cfg_file_private.h"
+#include "libmime/scan_result_private.h"
+#include "contrib/fastutf8/fastutf8.h"
+#include <math.h>
+#include "contrib/uthash/utlist.h"
+
+#define msg_debug_metric(...) rspamd_conditional_debug_fast(NULL, NULL, \
+ rspamd_metric_log_id, "metric", task->task_pool->tag.uid, \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+
+INIT_LOG_MODULE(metric)
+
+/* Average symbols count to optimize hash allocation */
+static struct rspamd_counter_data symbols_count;
+
+/*
+ * Memory pool destructor for a scan result: updates the symbols count
+ * statistics, drops the Lua callback reference (if any) and destroys the hash
+ * tables owned by the result.
+ */
+static void
+rspamd_scan_result_dtor(gpointer d)
+{
+	struct rspamd_scan_result *res = (struct rspamd_scan_result *) d;
+	struct rspamd_symbol_result *cur;
+
+	/* Remember how many symbols we had to size the next hash better */
+	rspamd_set_counter_ema(&symbols_count, kh_size(res->symbols), 0.5);
+
+	if (res->symbol_cbref != -1) {
+		luaL_unref(res->task->cfg->lua_state, LUA_REGISTRYINDEX, res->symbol_cbref);
+	}
+
+	/* Options hashes are owned by the individual symbol results */
+	kh_foreach_value(res->symbols, cur, {
+		if (cur->options != NULL) {
+			kh_destroy(rspamd_options_hash, cur->options);
+		}
+	});
+
+	kh_destroy(rspamd_symbols_hash, res->symbols);
+	kh_destroy(rspamd_symbols_group_hash, res->sym_groups);
+}
+
+/*
+ * Callback for actions enumeration: initialises the i-th per-result action
+ * config from the configured action `act`; `cbd` is the scan result.
+ */
+static void
+rspamd_metric_actions_foreach_cb(int i, struct rspamd_action *act, void *cbd)
+{
+	struct rspamd_scan_result *res = (struct rspamd_scan_result *) cbd;
+	struct rspamd_action_config *slot = &res->actions_config[i];
+
+	slot->flags = RSPAMD_ACTION_RESULT_DEFAULT;
+
+	if (act->flags & RSPAMD_ACTION_NO_THRESHOLD) {
+		slot->flags |= RSPAMD_ACTION_RESULT_NO_THRESHOLD;
+	}
+	else {
+		/* Start from the threshold defined in the configuration */
+		slot->cur_limit = act->threshold;
+	}
+
+	slot->action = act;
+}
+
+/*
+ * Allocate a new scan result in the task pool, initialise its hash tables and
+ * per-action configuration, register its destructor and append it to the
+ * list of the task results.
+ *
+ * `name` may be NULL; `lua_sym_cbref` is stored as is (the destructor treats
+ * -1 as "no reference").
+ */
+struct rspamd_scan_result *
+rspamd_create_metric_result(struct rspamd_task *task,
+							const gchar *name, gint lua_sym_cbref)
+{
+	struct rspamd_scan_result *res;
+
+	res = rspamd_mempool_alloc0(task->task_pool, sizeof(*res));
+	res->symbols = kh_init(rspamd_symbols_hash);
+	res->sym_groups = kh_init(rspamd_symbols_group_hash);
+	res->name = name ? rspamd_mempool_strdup(task->task_pool, name) : NULL;
+	res->symbol_cbref = lua_sym_cbref;
+	res->task = task;
+
+	/* Optimize allocation */
+	kh_resize(rspamd_symbols_group_hash, res->sym_groups, 4);
+	/* Use the observed average number of symbols, but not less than 4 */
+	kh_resize(rspamd_symbols_hash, res->symbols,
+			  symbols_count.mean > 4 ? symbols_count.mean : 4);
+
+	if (task->cfg) {
+		size_t nact = rspamd_config_actions_size(task->cfg);
+
+		res->actions_config = rspamd_mempool_alloc0(task->task_pool,
+													sizeof(struct rspamd_action_config) * nact);
+		rspamd_config_actions_foreach_enumerate(task->cfg, rspamd_metric_actions_foreach_cb, res);
+		res->nactions = nact;
+	}
+
+	rspamd_mempool_add_destructor(task->task_pool,
+								  rspamd_scan_result_dtor,
+								  res);
+	DL_APPEND(task->result, res);
+
+	return res;
+}
+
+/*
+ * Comparator for DL_SORT: orders passthrough results by priority, highest
+ * priority first.
+ *
+ * The result is computed with comparisons instead of a subtraction: the
+ * difference of two priorities does not necessarily fit into `int` (it wraps
+ * for unsigned priorities), which could yield a wrong sign.
+ *
+ * Returns a negative value if `pra` must go before `prb`, a positive value if
+ * it must go after, and 0 if the priorities are equal.
+ */
+static inline int
+rspamd_pr_sort(const struct rspamd_passthrough_result *pra,
+			   const struct rspamd_passthrough_result *prb)
+{
+	return (prb->priority > pra->priority) - (prb->priority < pra->priority);
+}
+
+/*
+ * Register a passthrough (pre-result) action for a scan result.
+ *
+ * If `scan_result` is NULL, the default result of the task is used. Nothing
+ * is added (and false is returned) when the matching action is disabled in
+ * this result. Otherwise the new element is appended to the list of
+ * passthrough results, the list is re-sorted by priority and true is
+ * returned. `message` and `module` are stored by pointer, not copied.
+ */
+bool rspamd_add_passthrough_result(struct rspamd_task *task,
+								   struct rspamd_action *action,
+								   guint priority,
+								   double target_score,
+								   const gchar *message,
+								   const gchar *module,
+								   uint flags,
+								   struct rspamd_scan_result *scan_result)
+{
+	struct rspamd_scan_result *target = scan_result != NULL ? scan_result : task->result;
+	const char *least_mark = flags & RSPAMD_PASSTHROUGH_LEAST ? "*least " : "";
+
+	/* Find the specific action config */
+	struct rspamd_action_config *matched = NULL;
+
+	for (unsigned int idx = 0; idx < target->nactions && matched == NULL; idx++) {
+		/* We assume that all action pointers are static */
+		if (target->actions_config[idx].action == action) {
+			matched = &target->actions_config[idx];
+		}
+	}
+
+	if (matched != NULL && (matched->flags & RSPAMD_ACTION_RESULT_DISABLED)) {
+		msg_info_task("<%s>: NOT set pre-result to '%s' %s(%.2f): '%s' from %s(%d); action is disabled",
+					  MESSAGE_FIELD_CHECK(task, message_id), action->name,
+					  least_mark,
+					  target_score,
+					  message, module, priority);
+
+		return false;
+	}
+
+	struct rspamd_passthrough_result *pr = rspamd_mempool_alloc(task->task_pool, sizeof(*pr));
+
+	pr->action = action;
+	pr->priority = priority;
+	pr->message = message;
+	pr->module = module;
+	pr->target_score = target_score;
+	pr->flags = flags;
+
+	DL_APPEND(target->passthrough_result, pr);
+	DL_SORT(target->passthrough_result, rspamd_pr_sort);
+
+	if (isnan(target_score)) {
+		msg_info_task("<%s>: set pre-result to '%s' %s(no score): '%s' from %s(%d)",
+					  MESSAGE_FIELD_CHECK(task, message_id), action->name,
+					  least_mark,
+					  message, module, priority);
+	}
+	else {
+		msg_info_task("<%s>: set pre-result to '%s' %s(%.2f): '%s' from %s(%d)",
+					  MESSAGE_FIELD_CHECK(task, message_id), action->name,
+					  least_mark,
+					  target_score,
+					  message, module, priority);
+	}
+
+	target->nresults++;
+
+	return true;
+}
+
+/*
+ * Apply the group score limit to a positive weight `w` that is about to be
+ * added to the group `gr`, whose current score is `*group_score`.
+ *
+ * Returns:
+ *  - `w` unchanged if there is no group, no score pointer, no positive limit
+ *    or `w` is not positive;
+ *  - NAN if the group limit has already been reached;
+ *  - the remaining room (limit - current score) if adding `w` would exceed
+ *    the limit.
+ */
+static inline gdouble
+rspamd_check_group_score(struct rspamd_task *task,
+						 const gchar *symbol,
+						 struct rspamd_symbols_group *gr,
+						 gdouble *group_score,
+						 gdouble w)
+{
+	/* Limits are only applicable to positive weights of limited groups */
+	if (gr == NULL || !group_score || !(gr->max_score > 0.0) || !(w > 0.0)) {
+		return w;
+	}
+
+	if (*group_score >= gr->max_score) {
+		msg_info_task("maximum group score %.2f for group %s has been reached,"
+					  " ignoring symbol %s with weight %.2f",
+					  gr->max_score,
+					  gr->name, symbol, w);
+		return NAN;
+	}
+
+	if (*group_score + w > gr->max_score) {
+		/* Only a part of the weight fits under the limit */
+		return gr->max_score - *group_score;
+	}
+
+	return w;
+}
+
+#ifndef DBL_EPSILON
+#define DBL_EPSILON 2.2204460492503131e-16
+#endif
+
+/*
+ * Insert a symbol into a scan result or register one more hit of a symbol
+ * that is already there, updating the total score of the result.
+ *
+ * The score of a hit is `weight` multiplied by the configured symbol weight
+ * (or by the per-task override from the settings). For symbols without a
+ * definition the multiplier is 1.0 if RSPAMD_SYMBOL_INSERT_ENFORCE is set and
+ * 0 otherwise. Positive scores are additionally subject to the grow factor
+ * and to the score limits of the groups the symbol belongs to.
+ *
+ * `opt` (if not NULL) is added to the options of the symbol.
+ * `*new_sym` (if `new_sym` is not NULL) is set to true when the symbol has
+ * not been in the result before; it is not touched otherwise.
+ *
+ * Returns the symbol result, or NULL if the symbol is not allowed to be
+ * inserted for this task.
+ */
+static struct rspamd_symbol_result *
+insert_metric_result(struct rspamd_task *task,
+ const gchar *symbol,
+ double weight,
+ const gchar *opt,
+ struct rspamd_scan_result *metric_res,
+ enum rspamd_symbol_insert_flags flags,
+ bool *new_sym)
+{
+ struct rspamd_symbol_result *symbol_result = NULL;
+ gdouble final_score, *gr_score = NULL, next_gf = 1.0, diff;
+ struct rspamd_symbol *sdef;
+ struct rspamd_symbols_group *gr = NULL;
+ const ucl_object_t *mobj, *sobj;
+ gint max_shots = G_MAXINT, ret;
+ guint i;
+ khiter_t k;
+ gboolean single = !!(flags & RSPAMD_SYMBOL_INSERT_SINGLE);
+ gchar *sym_cpy;
+
+ /* NaN and infinite weights are treated as zero */
+ if (!isfinite(weight)) {
+ msg_warn_task("detected %s score for symbol %s, replace it with zero",
+ isnan(weight) ? "NaN" : "infinity", symbol);
+ weight = 0.0;
+ }
+
+ msg_debug_metric("want to insert symbol %s, initial weight %.2f",
+ symbol, weight);
+
+ sdef = g_hash_table_lookup(task->cfg->symbols, symbol);
+ if (sdef == NULL) {
+ if (flags & RSPAMD_SYMBOL_INSERT_ENFORCE) {
+ final_score = 1.0 * weight; /* Enforce static weight to 1.0 */
+ }
+ else {
+ final_score = 0.0;
+ }
+
+ msg_debug_metric("no symbol definition for %s; final multiplier %.2f",
+ symbol, final_score);
+ }
+ else {
+ if (sdef->cache_item) {
+ /* Check if we can insert this symbol at all */
+ if (!rspamd_symcache_is_item_allowed(task, sdef->cache_item, FALSE)) {
+ msg_debug_metric("symbol %s is not allowed to be inserted due to settings",
+ symbol);
+ return NULL;
+ }
+ }
+
+ final_score = (*sdef->weight_ptr) * weight;
+
+ /* Make sure that every group of the symbol has a score slot in this result */
+ PTR_ARRAY_FOREACH(sdef->groups, i, gr)
+ {
+ k = kh_get(rspamd_symbols_group_hash, metric_res->sym_groups, gr);
+
+ if (k == kh_end(metric_res->sym_groups)) {
+ k = kh_put(rspamd_symbols_group_hash, metric_res->sym_groups,
+ gr, &ret);
+ kh_value(metric_res->sym_groups, k) = 0;
+ }
+ }
+
+ msg_debug_metric("metric multiplier for %s is %.2f",
+ symbol, *sdef->weight_ptr);
+ }
+
+ /* Per-task settings may override the configured symbol weight */
+ if (task->settings) {
+ gdouble corr;
+ mobj = ucl_object_lookup(task->settings, "scores");
+
+ if (!mobj) {
+ /* Legacy */
+ mobj = task->settings;
+ }
+ else {
+ msg_debug_metric("found scores in the settings");
+ }
+
+ sobj = ucl_object_lookup(mobj, symbol);
+ if (sobj != NULL && ucl_object_todouble_safe(sobj, &corr)) {
+ msg_debug_metric("settings: changed weight of symbol %s from %.2f "
+ "to %.2f * %.2f",
+ symbol, final_score, corr, weight);
+ final_score = corr * weight;
+ }
+ }
+
+ k = kh_get(rspamd_symbols_hash, metric_res->symbols, symbol);
+ if (k != kh_end(metric_res->symbols)) {
+ /* Existing metric score */
+ symbol_result = kh_value(metric_res->symbols, k);
+ /* Find out how many hits of this symbol are allowed to add score */
+ if (single) {
+ max_shots = 1;
+ }
+ else {
+ if (sdef) {
+ if (sdef->groups) {
+ PTR_ARRAY_FOREACH(sdef->groups, i, gr)
+ {
+ if (gr->flags & RSPAMD_SYMBOL_GROUP_ONE_SHOT) {
+ max_shots = 1;
+ }
+ }
+ }
+
+ max_shots = MIN(max_shots, sdef->nshots);
+ }
+ else {
+ max_shots = task->cfg->default_max_shots;
+ }
+ }
+
+ msg_debug_metric("nshots: %d for symbol %s", max_shots, symbol);
+
+ /* Once the shots limit is reached, the symbol is handled as a single one */
+ if (!single && (max_shots > 0 && (symbol_result->nshots >= max_shots))) {
+ single = TRUE;
+ }
+
+ symbol_result->nshots++;
+
+ if (opt) {
+ rspamd_task_add_result_option(task, symbol_result, opt, strlen(opt));
+ }
+
+ /* Adjust diff */
+ if (!single) {
+ diff = final_score;
+ msg_debug_metric("symbol %s can be inserted multiple times: %.2f weight",
+ symbol, diff);
+ }
+ else {
+ if (fabs(symbol_result->score) < fabs(final_score) &&
+ signbit(symbol_result->score) == signbit(final_score)) {
+ /* Replace less significant weight with a more significant one */
+ diff = final_score - symbol_result->score;
+ msg_debug_metric("symbol %s can be inserted single time;"
+ " weight adjusted %.2f + %.2f",
+ symbol, symbol_result->score, diff);
+ }
+ else {
+ diff = 0;
+ }
+ }
+
+ if (diff) {
+ /* Handle grow factor */
+ if (metric_res->grow_factor && diff > 0) {
+ diff *= metric_res->grow_factor;
+ next_gf *= task->cfg->grow_factor;
+ }
+ else if (diff > 0) {
+ next_gf = task->cfg->grow_factor;
+ }
+
+ msg_debug_metric("adjust grow factor to %.2f for symbol %s (%.2f final)",
+ next_gf, symbol, diff);
+
+ if (sdef) {
+ /* Apply the score limits of all groups of the symbol */
+ PTR_ARRAY_FOREACH(sdef->groups, i, gr)
+ {
+ gdouble cur_diff;
+
+ k = kh_get(rspamd_symbols_group_hash,
+ metric_res->sym_groups, gr);
+ g_assert(k != kh_end(metric_res->sym_groups));
+ gr_score = &kh_value(metric_res->sym_groups, k);
+ cur_diff = rspamd_check_group_score(task, symbol, gr,
+ gr_score, diff);
+
+ if (isnan(cur_diff)) {
+ /* Limit reached, do not add result */
+ msg_debug_metric(
+ "group limit %.2f is reached for %s when inserting symbol %s;"
+ " drop score %.2f",
+ *gr_score, gr->name, symbol, diff);
+
+ diff = NAN;
+ break;
+ }
+ else if (gr_score) {
+ *gr_score += cur_diff;
+
+ if (cur_diff < diff) {
+ /* Reduce */
+ msg_debug_metric(
+ "group limit %.2f is reached for %s when inserting symbol %s;"
+ " reduce score %.2f - %.2f",
+ *gr_score, gr->name, symbol, diff, cur_diff);
+ diff = cur_diff;
+ }
+ }
+ }
+ }
+
+ /* NaN here means that some group limit has dropped the score completely */
+ if (!isnan(diff)) {
+ metric_res->score += diff;
+ metric_res->grow_factor = next_gf;
+
+ if (single) {
+ msg_debug_metric("final score for single symbol %s = %.2f; %.2f diff",
+ symbol, final_score, diff);
+ symbol_result->score = final_score;
+ }
+ else {
+ /* NOTE(review): the arguments look swapped relative to the "+= ... = ..." format text (the old score is printed first, then the diff) - confirm */
+ msg_debug_metric("increase final score for multiple symbol %s += %.2f = %.2f",
+ symbol, symbol_result->score, diff);
+ symbol_result->score += diff;
+ }
+ }
+ }
+ }
+ else {
+ /* New result */
+ if (new_sym) {
+ *new_sym = true;
+ }
+
+ /* The hash key must outlive `symbol`, so it is copied to the task pool */
+ sym_cpy = rspamd_mempool_strdup(task->task_pool, symbol);
+ k = kh_put(rspamd_symbols_hash, metric_res->symbols,
+ sym_cpy, &ret);
+ g_assert(ret > 0);
+ symbol_result = rspamd_mempool_alloc0(task->task_pool, sizeof(*symbol_result));
+ kh_value(metric_res->symbols, k) = symbol_result;
+
+ /* Handle grow factor */
+ if (metric_res->grow_factor && final_score > 0) {
+ final_score *= metric_res->grow_factor;
+ next_gf *= task->cfg->grow_factor;
+ }
+ else if (final_score > 0) {
+ next_gf = task->cfg->grow_factor;
+ }
+
+ msg_debug_metric("adjust grow factor to %.2f for symbol %s (%.2f final)",
+ next_gf, symbol, final_score);
+
+ symbol_result->name = sym_cpy;
+ symbol_result->sym = sdef;
+ symbol_result->nshots = 1;
+
+ if (sdef) {
+ /* Check group limits */
+ PTR_ARRAY_FOREACH(sdef->groups, i, gr)
+ {
+ gdouble cur_score;
+
+ k = kh_get(rspamd_symbols_group_hash, metric_res->sym_groups, gr);
+ g_assert(k != kh_end(metric_res->sym_groups));
+ gr_score = &kh_value(metric_res->sym_groups, k);
+ cur_score = rspamd_check_group_score(task, symbol, gr,
+ gr_score, final_score);
+
+ if (isnan(cur_score)) {
+ /* Limit reached, do not add result */
+ msg_debug_metric(
+ "group limit %.2f is reached for %s when inserting symbol %s;"
+ " drop score %.2f",
+ *gr_score, gr->name, symbol, final_score);
+ final_score = NAN;
+ break;
+ }
+ else if (gr_score) {
+ *gr_score += cur_score;
+
+ if (cur_score < final_score) {
+ /* Reduce */
+ msg_debug_metric(
+ "group limit %.2f is reached for %s when inserting symbol %s;"
+ " reduce score %.2f - %.2f",
+ *gr_score, gr->name, symbol, final_score, cur_score);
+ final_score = cur_score;
+ }
+ }
+ }
+ }
+
+ if (!isnan(final_score)) {
+ const double epsilon = DBL_EPSILON;
+
+ metric_res->score += final_score;
+ metric_res->grow_factor = next_gf;
+ symbol_result->score = final_score;
+
+ /* Scores within epsilon of zero are counted neither as positive nor as negative */
+ if (final_score > epsilon) {
+ metric_res->npositive++;
+ metric_res->positive_score += final_score;
+ }
+ else if (final_score < -epsilon) {
+ metric_res->nnegative++;
+ metric_res->negative_score += fabs(final_score);
+ }
+ }
+ else {
+ /* The score has been dropped by a group limit, but the symbol is still inserted */
+ symbol_result->score = 0;
+ }
+
+ if (opt) {
+ rspamd_task_add_result_option(task, symbol_result, opt, strlen(opt));
+ }
+ }
+
+ /* NOTE(review): the value printed as "factor" is final_score, not a grow factor - confirm the intent */
+ msg_debug_metric("final insertion for symbol %s, score %.2f, factor: %f",
+ symbol,
+ symbol_result->score,
+ final_score);
+ metric_res->nresults++;
+
+ return symbol_result;
+}
+
+/**
+ * Insert a symbol into one scan result or into every result of a task
+ *
+ * With `result == NULL` the symbol is offered to each result reachable from
+ * `task->result`. A result that has a Lua `symbol_cbref` is asked first and is
+ * skipped when the callback fails or returns a false value.
+ *
+ * @param task task being processed
+ * @param symbol symbol name
+ * @param weight weight passed down to insert_metric_result
+ * @param opt optional option passed down to insert_metric_result (may be NULL)
+ * @param flags insertion flags passed down to insert_metric_result
+ * @param result result to insert into, NULL to insert everywhere
+ * @return for a specific insertion the symbol result created/updated in `result`;
+ * for an insertion everywhere the symbol result of the default result with
+ * newly created shadow symbol results appended to its `next` chain;
+ * NULL if the insertion is refused because of the idempotent phase
+ */
+struct rspamd_symbol_result *
+rspamd_task_insert_result_full(struct rspamd_task *task,
+		const gchar *symbol,
+		double weight,
+		const gchar *opt,
+		enum rspamd_symbol_insert_flags flags,
+		struct rspamd_scan_result *result)
+{
+	struct rspamd_symbol_result *symbol_result = NULL, *ret = NULL;
+	struct rspamd_scan_result *mres;
+
+	/*
+	 * We allow symbols to be inserted for skipped tasks, as it might be a
+	 * race condition before some symbol is finished and skip flag being set.
+	 * The stage tested is the one right below RSPAMD_TASK_STAGE_IDEMPOTENT.
+	 */
+	if (!RSPAMD_TASK_IS_SKIPPED(task) && (task->processed_stages & (RSPAMD_TASK_STAGE_IDEMPOTENT >> 1))) {
+		msg_err_task("cannot insert symbol %s on idempotent phase",
+				symbol);
+
+		return NULL;
+	}
+
+	if (result == NULL) {
+		/* Insert everywhere */
+		DL_FOREACH(task->result, mres)
+		{
+			/* The default result has no name, use a printable one everywhere */
+			const gchar *res_name = mres->name ? mres->name : "default";
+
+			if (mres->symbol_cbref != -1) {
+				/* Check if we can insert this symbol to this symbol result */
+				GError *err = NULL;
+				lua_State *L = (lua_State *) task->cfg->lua_state;
+				gboolean allowed;
+
+				if (!rspamd_lua_universal_pcall(L, mres->symbol_cbref,
+						G_STRLOC, 1, "uss", &err,
+						"rspamd{task}", task, symbol, res_name)) {
+					msg_warn_task("cannot call for symbol_cbref for result %s: %e",
+							res_name, err);
+					g_error_free(err);
+
+					continue;
+				}
+
+				allowed = lua_toboolean(L, -1);
+				lua_pop(L, 1); /* Remove result */
+
+				if (!allowed) {
+					/* Skip symbol */
+					msg_debug_metric("skip symbol %s for result %s due to Lua return value",
+							symbol, res_name);
+
+					continue;
+				}
+			}
+
+			bool new_symbol = false;
+
+			symbol_result = insert_metric_result(task,
+					symbol,
+					weight,
+					opt,
+					mres,
+					flags,
+					&new_symbol);
+
+			if (mres->name == NULL) {
+				/* Default result */
+				ret = symbol_result;
+
+				/* Process cache item: only the first shot bumps the frequency */
+				if (symbol_result && task->cfg->cache && symbol_result->sym && symbol_result->nshots == 1) {
+					rspamd_symcache_inc_frequency(task->cfg->cache,
+							symbol_result->sym->cache_item,
+							symbol_result->sym->name);
+				}
+			}
+			else if (new_symbol) {
+				/* O(N) but we normally don't have any shadow results */
+				LL_APPEND(ret, symbol_result);
+			}
+		}
+	}
+	else {
+		/* Specific insertion */
+		symbol_result = insert_metric_result(task,
+				symbol,
+				weight,
+				opt,
+				result,
+				flags,
+				NULL);
+		ret = symbol_result;
+
+		if (result->name == NULL) {
+			/* Process cache item: only the first shot bumps the frequency */
+			if (symbol_result && task->cfg->cache && symbol_result->sym && symbol_result->nshots == 1) {
+				rspamd_symcache_inc_frequency(task->cfg->cache,
+						symbol_result->sym->cache_item,
+						symbol_result->sym->name);
+			}
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Make a pool-allocated, NUL-terminated copy of an option where every byte
+ * sequence that is not a printable character is replaced with U+FFFD.
+ *
+ * The input is scanned twice: first to compute an upper bound for the output
+ * size, then to emit the bytes. The number of bytes written (without the
+ * terminating NUL) is stored in `outlen`.
+ */
+static gchar *
+rspamd_task_option_safe_copy(struct rspamd_task *task,
+		const gchar *val,
+		gsize vlen,
+		gsize *outlen)
+{
+	/* UTF-8 encoding of U+FFFD */
+	static const gchar replacement[3] = {'\357', '\277', '\275'};
+	const gchar *const end = val + vlen;
+	const gchar *p;
+	gsize need = 0;
+	gchar *dest, *d;
+
+	/* Pass 1: how much room do we need */
+	for (p = val; p < end;) {
+		if (*p & 0x80) {
+			UChar32 uc;
+			gint off = 0;
+
+			U8_NEXT(p, off, end - p, uc);
+
+			if (uc > 0 && u_isprint(uc)) {
+				need += off;
+			}
+			else {
+				/* Replaced by 0xFFFD, reserve at least its three bytes */
+				need += MAX(off, 3);
+			}
+
+			p += off;
+		}
+		else {
+			/* Non-printable ASCII also turns into 0xFFFD */
+			need += g_ascii_isprint(*p) ? 1 : 3;
+			p++;
+		}
+	}
+
+	dest = rspamd_mempool_alloc(task->task_pool, need + 1);
+	d = dest;
+
+	/* Pass 2: emit either the original bytes or the replacement */
+	for (p = val; p < end;) {
+		gint off = 1;
+		gboolean keep;
+
+		if (*p & 0x80) {
+			UChar32 uc;
+
+			off = 0;
+			U8_NEXT(p, off, end - p, uc);
+			keep = (uc > 0 && u_isprint(uc));
+		}
+		else {
+			keep = g_ascii_isprint(*p);
+		}
+
+		if (keep) {
+			memcpy(d, p, off);
+			d += off;
+		}
+		else {
+			memcpy(d, replacement, sizeof(replacement));
+			d += sizeof(replacement);
+		}
+
+		p += off;
+	}
+
+	*d = '\0';
+	*(outlen) = d - dest;
+
+	return dest;
+}
+
+/**
+ * Add an option to a symbol result and to every shadow result chained to it
+ *
+ * The option is stored as a sanitized pool copy (see
+ * rspamd_task_option_safe_copy). Once the accumulated option length of a
+ * result would exceed `task->cfg->max_opts_len`, a single "..." option is
+ * stored instead and that result refuses further options.
+ *
+ * Each result in the chain is processed independently: truncation of one
+ * result does not change what is offered to the others, and `opts_len` of a
+ * result grows only when an option has really been stored in that result.
+ *
+ * The `nresults` counter of the default task result is incremented on every
+ * call.
+ *
+ * @param task task object
+ * @param s symbol result, expected to belong to the default result
+ * @param val option value
+ * @param vlen length of `val` in bytes
+ * @return TRUE if `val` is NULL or the option has been stored in `s` itself,
+ * FALSE otherwise
+ */
+gboolean
+rspamd_task_add_result_option(struct rspamd_task *task,
+		struct rspamd_symbol_result *s,
+		const gchar *val,
+		gsize vlen)
+{
+	struct rspamd_symbol_option *opt, srch;
+	gboolean ret = FALSE;
+	khiter_t k;
+	gint r;
+	struct rspamd_symbol_result *cur;
+
+	if (s && val) {
+		/*
+		 * Here we assume that this function is all the time called with the
+		 * symbol from the default result, not some shadow result, or
+		 * the option insertion will be wrong
+		 */
+		LL_FOREACH(s, cur)
+		{
+			/* Per-result view of the option: truncation must not leak further */
+			const gchar *cur_val = val;
+			gsize cur_vlen = vlen;
+			gboolean added = FALSE;
+
+			if (cur->opts_len < 0) {
+				/* Cannot add more options, give up (ret describes `s` only) */
+				msg_debug_task("cannot add more options to symbol %s when adding option %s",
+						cur->name, val);
+				continue;
+			}
+
+			if (!cur->options) {
+				cur->options = kh_init(rspamd_options_hash);
+			}
+
+			if (cur_vlen + cur->opts_len > task->cfg->max_opts_len) {
+				/* Add truncated option */
+				msg_info_task("cannot add more options to symbol %s when adding option %s",
+						cur->name, val);
+				cur_val = "...";
+				cur_vlen = 3;
+				cur->opts_len = -1;
+			}
+
+			if (cur->sym && (cur->sym->flags & RSPAMD_SYMBOL_FLAG_ONEPARAM)) {
+				/* Skip addition */
+				continue;
+			}
+
+			/* Cheap check with the raw value to avoid a copy for duplicates */
+			srch.option = (gchar *) cur_val;
+			srch.optlen = cur_vlen;
+			k = kh_get(rspamd_options_hash, cur->options, &srch);
+
+			if (k == kh_end(cur->options)) {
+				gsize cpy_len;
+				gchar *opt_cpy = rspamd_task_option_safe_copy(task, cur_val,
+						cur_vlen, &cpy_len);
+
+				/*
+				 * Stored keys are sanitized, so repeat the lookup whenever
+				 * sanitizing changed the value, including same-length changes
+				 */
+				if (cpy_len != cur_vlen || memcmp(opt_cpy, cur_val, cpy_len) != 0) {
+					srch.option = opt_cpy;
+					srch.optlen = cpy_len;
+					k = kh_get(rspamd_options_hash, cur->options, &srch);
+				}
+
+				/* Append new options */
+				if (k == kh_end(cur->options)) {
+					opt = rspamd_mempool_alloc0(task->task_pool, sizeof(*opt));
+					opt->optlen = cpy_len;
+					opt->option = opt_cpy;
+
+					kh_put(rspamd_options_hash, cur->options, opt, &r);
+					DL_APPEND(cur->opts_head, opt);
+					added = TRUE;
+
+					if (s == cur) {
+						ret = TRUE;
+					}
+				}
+			}
+
+			if (added && cur->opts_len >= 0) {
+				cur->opts_len += cur_vlen;
+			}
+		}
+	}
+	else if (!val) {
+		ret = TRUE;
+	}
+
+	task->result->nresults++;
+
+	return ret;
+}
+
+/**
+ * Linear search for the per-result configuration entry that refers to `act`
+ * @return pointer into `scan_result->actions_config` or NULL if there is no
+ * entry for this action
+ */
+struct rspamd_action_config *
+rspamd_find_action_config_for_action(struct rspamd_scan_result *scan_result,
+		struct rspamd_action *act)
+{
+	unsigned int idx = 0;
+
+	while (idx < scan_result->nactions) {
+		if (scan_result->actions_config[idx].action == act) {
+			return &scan_result->actions_config[idx];
+		}
+
+		idx++;
+	}
+
+	return NULL;
+}
+
+/**
+ * Select the action for a scan result
+ *
+ * Passthrough results are examined first (entries whose action is disabled
+ * in this result are ignored):
+ * - the first passthrough result without RSPAMD_PASSTHROUGH_LEAST wins
+ * immediately and may overwrite `scan_result->score` with its target score;
+ * - the first passthrough result with RSPAMD_PASSTHROUGH_LEAST is remembered
+ * as the "least" action together with its score (target score or, if that
+ * is NaN, the action threshold unless the action has no threshold).
+ *
+ * After that the action is selected by comparing `scan_result->score` with
+ * the limits in `scan_result->actions_config`, and the remembered least
+ * action is applied on top of that selection.
+ *
+ * @param task task object (used for the default result)
+ * @param ppr if not NULL, receives the passthrough result that defined the
+ * outcome (where there is one)
+ * @param scan_result result to check, NULL means `task->result`
+ * @return selected action; NULL only if nothing has been selected and the
+ * result has no METRIC_ACTION_NOACTION entry to fall back to
+ */
+struct rspamd_action *
+rspamd_check_action_metric(struct rspamd_task *task,
+		struct rspamd_passthrough_result **ppr,
+		struct rspamd_scan_result *scan_result)
+{
+	struct rspamd_action_config *action_lim,
+			*noaction = NULL;
+	struct rspamd_action *selected_action = NULL, *least_action = NULL;
+	struct rspamd_passthrough_result *pr, *sel_pr = NULL;
+	double max_score = -(G_MAXDOUBLE), sc;
+	gboolean seen_least = FALSE;
+
+	if (scan_result == NULL) {
+		scan_result = task->result;
+	}
+
+	if (scan_result->passthrough_result != NULL) {
+		DL_FOREACH(scan_result->passthrough_result, pr)
+		{
+			struct rspamd_action_config *act_config =
+					rspamd_find_action_config_for_action(scan_result, pr->action);
+
+			/* Skip disabled actions */
+			if (act_config && (act_config->flags & RSPAMD_ACTION_RESULT_DISABLED)) {
+				continue;
+			}
+
+			if (!seen_least || !(pr->flags & RSPAMD_PASSTHROUGH_LEAST)) {
+				sc = pr->target_score;
+				selected_action = pr->action;
+
+				if (!(pr->flags & RSPAMD_PASSTHROUGH_LEAST)) {
+					if (!isnan(sc)) {
+						if (pr->action->action_type == METRIC_ACTION_NOACTION) {
+							scan_result->score = MIN(sc, scan_result->score);
+						}
+						else {
+							scan_result->score = sc;
+						}
+					}
+
+					if (ppr) {
+						*ppr = pr;
+					}
+
+					return selected_action;
+				}
+				else {
+					seen_least = TRUE;
+					least_action = selected_action;
+
+					if (isnan(sc)) {
+
+						if (selected_action->flags & RSPAMD_ACTION_NO_THRESHOLD) {
+							/*
+							 * In this case, we have a passthrough action that
+							 * is `least` action, however, there is no threshold
+							 * on it.
+							 *
+							 * Hence, we imply the following logic:
+							 *
+							 * - we leave score unchanged
+							 * - the least action overrides the score based
+							 *   action unless the score based action is
+							 *   reject or discard (see the adjustment below)
+							 */
+						}
+						else {
+							sc = selected_action->threshold;
+							max_score = sc;
+							sel_pr = pr;
+						}
+					}
+					else {
+						max_score = sc;
+						sel_pr = pr;
+					}
+				}
+			}
+		}
+	}
+
+	/*
+	 * Select result by score.
+	 * Iterate from the last entry down to the first one; counting down with
+	 * `i-- > 0` is safe for an empty array, whereas `nactions - 1` would wrap
+	 * around in unsigned int and never compare equal to (size_t) -1.
+	 */
+	for (size_t i = scan_result->nactions; i-- > 0;) {
+		action_lim = &scan_result->actions_config[i];
+		sc = action_lim->cur_limit;
+
+		if (action_lim->action->action_type == METRIC_ACTION_NOACTION) {
+			noaction = action_lim;
+		}
+
+		if ((action_lim->flags & (RSPAMD_ACTION_RESULT_DISABLED | RSPAMD_ACTION_RESULT_NO_THRESHOLD))) {
+			continue;
+		}
+
+		if (isnan(sc) ||
+			(action_lim->action->flags & (RSPAMD_ACTION_NO_THRESHOLD | RSPAMD_ACTION_HAM))) {
+			continue;
+		}
+
+		if (scan_result->score >= sc && sc > max_score) {
+			selected_action = action_lim->action;
+			max_score = sc;
+		}
+	}
+
+	/* Fall back to `no action`, which might be missing in this result */
+	if (selected_action == NULL && noaction != NULL) {
+		selected_action = noaction->action;
+	}
+
+	if (selected_action) {
+
+		if (seen_least) {
+			/* Adjust least action */
+			if (least_action->flags & RSPAMD_ACTION_NO_THRESHOLD) {
+				if (selected_action->action_type != METRIC_ACTION_REJECT &&
+					selected_action->action_type != METRIC_ACTION_DISCARD) {
+					/* Override score based action with least action */
+					selected_action = least_action;
+
+					if (ppr) {
+						*ppr = sel_pr;
+					}
+				}
+			}
+			else {
+				/* Adjust score if needed */
+				if (max_score > scan_result->score) {
+					if (ppr) {
+						*ppr = sel_pr;
+					}
+
+					scan_result->score = max_score;
+				}
+			}
+		}
+
+		return selected_action;
+	}
+
+	if (ppr) {
+		*ppr = sel_pr;
+	}
+
+	return noaction != NULL ? noaction->action : NULL;
+}
+
+/**
+ * Look a symbol up in the symbols hash of a scan result
+ * @param result result to search, NULL means the default result of the task
+ * @return symbol result or NULL if the symbol is not in the hash
+ */
+struct rspamd_symbol_result *
+rspamd_task_find_symbol_result(struct rspamd_task *task, const char *sym,
+		struct rspamd_scan_result *result)
+{
+	khiter_t it;
+
+	if (result == NULL) {
+		/* Use default result */
+		result = task->result;
+	}
+
+	it = kh_get(rspamd_symbols_hash, result->symbols, sym);
+
+	if (it == kh_end(result->symbols)) {
+		return NULL;
+	}
+
+	return kh_value(result->symbols, it);
+}
+
+/**
+ * Remove a symbol from a scan result
+ *
+ * Unless the symbol score is NaN, it is subtracted from the total score of
+ * the result and from the score of every group of the symbol that is tracked
+ * in `result->sym_groups`.
+ *
+ * @param result result to modify, NULL means the default result of the task
+ * @return the removed symbol result or NULL if the symbol has not been found
+ */
+struct rspamd_symbol_result *rspamd_task_remove_symbol_result(
+		struct rspamd_task *task,
+		const gchar *symbol,
+		struct rspamd_scan_result *result)
+{
+	struct rspamd_symbol_result *removed;
+	khiter_t pos;
+
+	if (result == NULL) {
+		/* Use default result */
+		result = task->result;
+	}
+
+	pos = kh_get(rspamd_symbols_hash, result->symbols, symbol);
+
+	if (pos == kh_end(result->symbols)) {
+		/* Nothing to remove */
+		return NULL;
+	}
+
+	removed = kh_value(result->symbols, pos);
+
+	if (!isnan(removed->score)) {
+		/* Remove score from the result */
+		result->score -= removed->score;
+
+		/* Also give the score back to the group limits */
+		if (result->sym_groups && removed->sym) {
+			struct rspamd_symbol_group *gr;
+			gint i;
+
+			PTR_ARRAY_FOREACH(removed->sym->groups, i, gr)
+			{
+				khiter_t gpos = kh_get(rspamd_symbols_group_hash,
+						result->sym_groups, gr);
+
+				if (gpos != kh_end(result->sym_groups)) {
+					gdouble *slot = &kh_value(result->sym_groups, gpos);
+
+					*slot -= removed->score;
+				}
+			}
+		}
+	}
+
+	kh_del(rspamd_symbols_hash, result->symbols, pos);
+
+	return removed;
+}
+
+/**
+ * Call `func(name, symbol_result, ud)` for every entry of the symbols hash
+ * @param result result to walk, NULL means the default result of the task
+ * @param func callback, nothing is done if it is NULL
+ * @param ud opaque pointer passed to the callback
+ */
+void rspamd_task_symbol_result_foreach(struct rspamd_task *task,
+		struct rspamd_scan_result *result, GHFunc func,
+		gpointer ud)
+{
+	const gchar *sym_name;
+	struct rspamd_symbol_result *sym_res;
+
+	if (result == NULL) {
+		/* Use default result */
+		result = task->result;
+	}
+
+	if (func == NULL) {
+		return;
+	}
+
+	kh_foreach(result->symbols, sym_name, sym_res, {
+		func((gpointer) sym_name, (gpointer) sym_res, ud);
+	});
+}
+
+/**
+ * Find a scan result by name
+ * @param name result name; NULL and "default" both select `task->result`
+ * @return matching result or NULL if no named result has this name
+ */
+struct rspamd_scan_result *
+rspamd_find_metric_result(struct rspamd_task *task,
+		const gchar *name)
+{
+	struct rspamd_scan_result *cur;
+
+	if (name != NULL && strcmp(name, "default") != 0) {
+		/* Named result: walk the list of results */
+		DL_FOREACH(task->result, cur)
+		{
+			if (cur->name != NULL && strcmp(cur->name, name) == 0) {
+				return cur;
+			}
+		}
+
+		return NULL;
+	}
+
+	return task->result;
+}
diff --git a/src/libmime/scan_result.h b/src/libmime/scan_result.h
new file mode 100644
index 0000000..46c2de8
--- /dev/null
+++ b/src/libmime/scan_result.h
@@ -0,0 +1,250 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file scan_result.h
+ * Scan result holder
+ */
+
+#ifndef RSPAMD_SCAN_RESULT_H
+#define RSPAMD_SCAN_RESULT_H
+
+#include "config.h"
+#include "rspamd_symcache.h"
+#include "task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct rspamd_task;
+struct rspamd_settings;
+struct rspamd_classifier_config;
+
+struct rspamd_symbol_option {
+ gchar *option; /**< option text; rspamd_task_add_result_option stores a sanitized pool copy here */
+ gsize optlen; /**< length of `option` in bytes (without the terminating NUL) */
+ struct rspamd_symbol_option *prev, *next; /**< links of the doubly linked list headed by rspamd_symbol_result::opts_head */
+};
+
+enum rspamd_symbol_result_flags {
+ RSPAMD_SYMBOL_RESULT_NORMAL = 0,
+ RSPAMD_SYMBOL_RESULT_IGNORED = (1 << 0)
+};
+
+struct kh_rspamd_options_hash_s;
+
+/**
+ * Result of a single symbol inside one scan result
+ */
+struct rspamd_symbol_result {
+ double score; /**< symbol's score */
+ struct kh_rspamd_options_hash_s *options; /**< hash set of symbol's options (created on the first added option, used to reject duplicates) */
+ struct rspamd_symbol_option *opts_head; /**< head of linked list of options */
+ const gchar *name; /**< symbol name (pool copy made when the result is created) */
+ struct rspamd_symbol *sym; /**< symbol configuration */
+ gssize opts_len; /**< total size of all options (negative if truncated option is added) */
+ guint nshots; /**< set to 1 when the result is created; presumably the number of insertions - confirm in insert_metric_result */
+ int flags; /**< presumably a mask of enum rspamd_symbol_result_flags - confirm against users */
+ struct rspamd_symbol_result *next; /**< for shadow results */
+};
+
+
+#define RSPAMD_PASSTHROUGH_NORMAL 1
+#define RSPAMD_PASSTHROUGH_LOW 0
+#define RSPAMD_PASSTHROUGH_HIGH 2
+#define RSPAMD_PASSTHROUGH_CRITICAL 3
+
+#define RSPAMD_PASSTHROUGH_LEAST (1u << 0u)
+#define RSPAMD_PASSTHROUGH_NO_SMTP_MESSAGE (1u << 1u)
+#define RSPAMD_PASSTHROUGH_PROCESS_ALL (1u << 2u)
+
+struct rspamd_passthrough_result {
+ struct rspamd_action *action; /**< action requested by this passthrough result */
+ guint priority; /**< presumably one of RSPAMD_PASSTHROUGH_LOW..RSPAMD_PASSTHROUGH_CRITICAL - confirm against rspamd_add_passthrough_result */
+ guint flags; /**< mask of RSPAMD_PASSTHROUGH_LEAST and the other (1u << n) flags above */
+ double target_score; /**< score to enforce, NaN if the score should not be set from this result */
+ const gchar *message;
+ const gchar *module;
+ struct rspamd_passthrough_result *prev, *next; /**< double linked list of passthrough results */
+};
+
+
+enum rspamd_action_config_flags {
+ RSPAMD_ACTION_RESULT_DEFAULT = 0,
+ RSPAMD_ACTION_RESULT_NO_THRESHOLD = (1u << 0u), /**< entry is ignored by the score based selection in rspamd_check_action_metric */
+ RSPAMD_ACTION_RESULT_DISABLED = (1u << 1u), /**< entry is ignored by the score based selection and passthrough results for its action are skipped */
+};
+struct rspamd_action_config {
+ gdouble cur_limit; /**< score limit of the action in this result, NaN entries are skipped */
+ int flags; /**< mask of enum rspamd_action_config_flags */
+ struct rspamd_action *action; /**< action this entry configures */
+};
+
+struct kh_rspamd_symbols_hash_s;
+struct kh_rspamd_symbols_group_hash_s;
+
+
+struct rspamd_scan_result {
+ double score; /**< total score */
+ double grow_factor; /**< current grow factor */
+ struct rspamd_passthrough_result *passthrough_result; /**< double linked list of passthrough results, NULL if there are none */
+ double positive_score; /**< sum of scores of the symbols inserted with a positive score */
+ double negative_score; /**< sum of absolute scores of the symbols inserted with a negative score */
+ struct kh_rspamd_symbols_hash_s *symbols; /**< symbols of metric: symbol name -> symbol result */
+ struct kh_rspamd_symbols_group_hash_s *sym_groups; /**< groups of symbols: group pointer -> accumulated group score */
+ struct rspamd_action_config *actions_config; /**< array of `nactions` per-result action settings */
+ const gchar *name; /**< for named results, NULL is the default result */
+ struct rspamd_task *task; /**< back reference */
+ gint symbol_cbref; /**< lua function that defines if a symbol can be inserted, -1 if unused */
+ guint nactions; /**< number of elements in `actions_config` */
+ guint npositive; /**< number of symbols inserted with a positive score */
+ guint nnegative; /**< number of symbols inserted with a negative score */
+ guint nresults; /**< all results: positive, negative, passthrough etc */
+ guint nresults_postfilters; /**< how many results are there before postfilters stage */
+ struct rspamd_scan_result *prev, *next; /**< double linked list of results */
+};
+
+/**
+ * Create or return existing result for the specified metric name
+ * @param task task object
+ * @return metric result or NULL if metric `name` has not been found
+ */
+struct rspamd_scan_result *rspamd_create_metric_result(struct rspamd_task *task,
+ const gchar *name, gint lua_sym_cbref);
+
+/**
+ * Find result with a specific name (NULL or "default" means the default result)
+ * @param task task object
+ * @param name name of the result to find
+ * @return the matching result or NULL if there is no result with this name
+ */
+struct rspamd_scan_result *rspamd_find_metric_result(struct rspamd_task *task,
+ const gchar *name);
+
+/**
+ * Adds a new passthrough result to a task
+ * @param task
+ * @param action
+ * @param priority
+ * @param target_score
+ * @param message
+ * @param module
+ */
+bool rspamd_add_passthrough_result(struct rspamd_task *task,
+ struct rspamd_action *action, guint priority,
+ double target_score, const gchar *message,
+ const gchar *module, guint flags,
+ struct rspamd_scan_result *scan_result);
+
+enum rspamd_symbol_insert_flags {
+ RSPAMD_SYMBOL_INSERT_DEFAULT = 0,
+ RSPAMD_SYMBOL_INSERT_SINGLE = (1 << 0),
+ RSPAMD_SYMBOL_INSERT_ENFORCE = (1 << 1),
+};
+
+/**
+ * Insert a result to task
+ * @param task worker's task that present message from user
+ * @param symbol symbol to insert
+ * @param weight numeric weight for symbol
+ * @param opts optional option to attach to the symbol (may be NULL)
+ * @param flags insertion flags, see enum rspamd_symbol_insert_flags
+ * @param result scan result to insert into; NULL inserts the symbol into all results of the task
+ * @return symbol result, or NULL if the insertion is refused on the idempotent phase
+ */
+struct rspamd_symbol_result *rspamd_task_insert_result_full(struct rspamd_task *task,
+ const gchar *symbol,
+ double weight,
+ const gchar *opts,
+ enum rspamd_symbol_insert_flags flags,
+ struct rspamd_scan_result *result);
+
+#define rspamd_task_insert_result_single(task, symbol, weight, opts) \
+ rspamd_task_insert_result_full((task), (symbol), (weight), (opts), RSPAMD_SYMBOL_INSERT_SINGLE, NULL)
+#define rspamd_task_insert_result(task, symbol, weight, opts) \
+ rspamd_task_insert_result_full((task), (symbol), (weight), (opts), RSPAMD_SYMBOL_INSERT_DEFAULT, NULL)
+
+/**
+ * Removes a symbol from a specific symbol result; unless the symbol score is
+ * NaN it is also subtracted from the total score and from the scores of the
+ * symbol's groups tracked by that result
+ * @param task task object
+ * @param symbol name of the symbol to remove
+ * @param result scan result to modify, NULL means the default result
+ * @return removed symbol result or NULL if the symbol has not been found
+ */
+struct rspamd_symbol_result *rspamd_task_remove_symbol_result(
+ struct rspamd_task *task,
+ const gchar *symbol,
+ struct rspamd_scan_result *result);
+/**
+ * Adds new option to symbol (and to the shadow results chained via `next`)
+ * @param task task object
+ * @param s symbol result, expected to belong to the default result
+ * @param opt option value; the stored copy has non-printable characters replaced with U+FFFD
+ * @param vlen length of `opt` in bytes
+ * @return TRUE if `opt` is NULL or the option has been added, FALSE if nothing
+ * has been added (e.g. duplicate option or no more options are accepted)
+ */
+gboolean rspamd_task_add_result_option(struct rspamd_task *task,
+ struct rspamd_symbol_result *s,
+ const gchar *opt,
+ gsize vlen);
+
+/**
+ * Finds symbol result
+ * @param task
+ * @param sym
+ * @return
+ */
+struct rspamd_symbol_result *
+rspamd_task_find_symbol_result(struct rspamd_task *task, const char *sym,
+ struct rspamd_scan_result *result);
+
+/**
+ * Compatibility function to iterate on symbols hash
+ * @param task
+ * @param func
+ * @param ud
+ */
+void rspamd_task_symbol_result_foreach(struct rspamd_task *task,
+ struct rspamd_scan_result *result,
+ GHFunc func,
+ gpointer ud);
+
+/**
+ * Default consolidation function for metric, it get all symbols and multiply symbol
+ * weight by some factor that is specified in config. Default factor is 1.
+ * @param task worker's task that present message from user
+ * @param metric_name name of metric
+ * @return result metric weight
+ */
+double rspamd_factor_consolidation_func(struct rspamd_task *task,
+ const gchar *metric_name,
+ const gchar *unused);
+
+
+/**
+ * Check thresholds and return action for a task; passthrough results are
+ * taken into account and may adjust the score stored in the scan result
+ * @param task task object
+ * @param ppr if not NULL, receives the passthrough result that defined the action (where there is one)
+ * @param scan_result scan result to check, NULL means the default result of the task
+ * @return selected action
+ */
+struct rspamd_action *rspamd_check_action_metric(struct rspamd_task *task,
+ struct rspamd_passthrough_result **ppr,
+ struct rspamd_scan_result *scan_result);
+
+struct rspamd_action_config *rspamd_find_action_config_for_action(struct rspamd_scan_result *scan_result,
+ struct rspamd_action *act);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/libmime/scan_result_private.h b/src/libmime/scan_result_private.h
new file mode 100644
index 0000000..cf0c0c5
--- /dev/null
+++ b/src/libmime/scan_result_private.h
@@ -0,0 +1,55 @@
+//
+// Created by Vsevolod Stakhov on 2019-01-14.
+//
+
+#ifndef RSPAMD_SCAN_RESULT_PRIVATE_H
+#define RSPAMD_SCAN_RESULT_PRIVATE_H
+
+#include "scan_result.h"
+#include "contrib/libucl/khash.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RSPAMD_OPTS_SEED 0x9f1f608628a4fefbULL
+#define rspamd_symopt_hash(opt) (rspamd_cryptobox_fast_hash( \
+ ((struct rspamd_symbol_option *) opt)->option, \
+ ((struct rspamd_symbol_option *) opt)->optlen, RSPAMD_OPTS_SEED))
+/* Two options are equal when they have the same length and the same bytes */
+static inline bool
+rspamd_symopt_equal(const struct rspamd_symbol_option *o1,
+		const struct rspamd_symbol_option *o2)
+{
+	if (o1->optlen != o2->optlen) {
+		return false;
+	}
+
+	return memcmp(o1->option, o2->option, o1->optlen) == 0;
+}
+
+KHASH_INIT(rspamd_options_hash, struct rspamd_symbol_option *, char,
+ 0, rspamd_symopt_hash, rspamd_symopt_equal); /* set (kh_is_map == 0) of options compared by content */
+/**
+ * Symbols of a scan result: symbol name (C string) -> symbol result
+ */
+KHASH_MAP_INIT_STR(rspamd_symbols_hash, struct rspamd_symbol_result *);
+#if UINTPTR_MAX <= UINT_MAX
+/* 32 bit: hash a pointer by its value without the lowest bit */
+#define rspamd_ptr_hash_func(key) (khint32_t)(((uintptr_t) (key)) >> 1)
+#else
+/* likely 64 bit: hash a pointer by its value without the three lowest bits, truncated to 32 bits */
+#define rspamd_ptr_hash_func(key) (khint32_t)(((uintptr_t) (key)) >> 3)
+#endif
+#define rspamd_ptr_equal_func(a, b) ((a) == (b))
+KHASH_INIT(rspamd_symbols_group_hash,
+ void *,
+ double,
+ 1,
+ rspamd_ptr_hash_func,
+ rspamd_ptr_equal_func); /* map (kh_is_map == 1): pointer key compared by identity -> double */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif//RSPAMD_SCAN_RESULT_PRIVATE_H
diff --git a/src/libmime/smtp_parsers.h b/src/libmime/smtp_parsers.h
new file mode 100644
index 0000000..e188b63
--- /dev/null
+++ b/src/libmime/smtp_parsers.h
@@ -0,0 +1,51 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_LIBMIME_SMTP_PARSERS_H_
+#define SRC_LIBMIME_SMTP_PARSERS_H_
+
+#include "config.h"
+#include "email_addr.h"
+#include "content_type.h"
+#include "task.h"
+#include "message.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int rspamd_smtp_addr_parse(const char *data, size_t len,
+ struct rspamd_email_address *addr);
+
+gboolean rspamd_content_disposition_parser(const char *data, size_t len,
+ struct rspamd_content_disposition *cd,
+ rspamd_mempool_t *pool);
+
+gboolean
+rspamd_rfc2047_parser(const gchar *in, gsize len, gint *pencoding,
+ const gchar **charset, gsize *charset_len,
+ const gchar **encoded, gsize *encoded_len);
+
+rspamd_inet_addr_t *rspamd_parse_smtp_ip(const char *data, size_t len,
+ rspamd_mempool_t *pool);
+
+guint64 rspamd_parse_smtp_date(const unsigned char *data, size_t len, GError **err);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SRC_LIBMIME_SMTP_PARSERS_H_ */