summaryrefslogtreecommitdiffstats
path: root/src/libmime/mime_parser.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/libmime/mime_parser.c1758
1 files changed, 1758 insertions, 0 deletions
diff --git a/src/libmime/mime_parser.c b/src/libmime/mime_parser.c
new file mode 100644
index 0000000..217f0b8
--- /dev/null
+++ b/src/libmime/mime_parser.c
@@ -0,0 +1,1758 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "config.h"
+#include "task.h"
+#include "mime_parser.h"
+#include "mime_headers.h"
+#include "message.h"
+#include "multipattern.h"
+#include "contrib/libottery/ottery.h"
+#include "contrib/uthash/utlist.h"
+#include <openssl/cms.h>
+#include <openssl/pkcs7.h>
+#include "contrib/fastutf8/fastutf8.h"
+
+struct rspamd_mime_parser_lib_ctx {
+ struct rspamd_multipattern *mp_boundary;
+ guchar hkey[rspamd_cryptobox_SIPKEYBYTES]; /* Key for hashing */
+ guint key_usages;
+};
+
+struct rspamd_mime_parser_lib_ctx *lib_ctx = NULL;
+
+static const guint max_nested = 64;
+static const guint max_key_usages = 10000;
+
+#define msg_debug_mime(...) rspamd_conditional_debug_fast(NULL, task->from_addr, \
+ rspamd_mime_log_id, "mime", task->task_pool->tag.uid, \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+
+INIT_LOG_MODULE(mime)
+
+#define RSPAMD_MIME_BOUNDARY_FLAG_CLOSED (1 << 0)
+#define RSPAMD_BOUNDARY_IS_CLOSED(b) ((b)->flags & RSPAMD_MIME_BOUNDARY_FLAG_CLOSED)
+
+struct rspamd_mime_boundary {
+ goffset boundary;
+ goffset start;
+ guint64 hash;
+ guint64 closed_hash;
+ gint flags;
+};
+
+struct rspamd_mime_parser_ctx {
+ GPtrArray *stack; /* Stack of parts */
+ GArray *boundaries; /* Boundaries found in the whole message */
+ const gchar *start;
+ const gchar *pos;
+ const gchar *end;
+ struct rspamd_task *task;
+ guint nesting;
+};
+
+static enum rspamd_mime_parse_error
+rspamd_mime_parse_multipart_part(struct rspamd_task *task,
+ struct rspamd_mime_part *part,
+ struct rspamd_mime_parser_ctx *st,
+ GError **err);
+static enum rspamd_mime_parse_error
+rspamd_mime_parse_message(struct rspamd_task *task,
+ struct rspamd_mime_part *part,
+ struct rspamd_mime_parser_ctx *st,
+ GError **err);
+static enum rspamd_mime_parse_error
+rspamd_mime_parse_normal_part(struct rspamd_task *task,
+ struct rspamd_mime_part *part,
+ struct rspamd_mime_parser_ctx *st,
+ struct rspamd_content_type *ct,
+ GError **err);
+
+static enum rspamd_mime_parse_error
+rspamd_mime_process_multipart_node(struct rspamd_task *task,
+ struct rspamd_mime_parser_ctx *st,
+ struct rspamd_mime_part *multipart,
+ const gchar *start, const gchar *end,
+ gboolean is_finished,
+ GError **err);
+
+
+#define RSPAMD_MIME_QUARK (rspamd_mime_parser_quark())
+static GQuark
+rspamd_mime_parser_quark(void)
+{
+ return g_quark_from_static_string("mime-parser");
+}
+
+const gchar *
+rspamd_cte_to_string(enum rspamd_cte ct)
+{
+ const gchar *ret = "unknown";
+
+ switch (ct) {
+ case RSPAMD_CTE_7BIT:
+ ret = "7bit";
+ break;
+ case RSPAMD_CTE_8BIT:
+ ret = "8bit";
+ break;
+ case RSPAMD_CTE_QP:
+ ret = "quoted-printable";
+ break;
+ case RSPAMD_CTE_B64:
+ ret = "base64";
+ break;
+ case RSPAMD_CTE_UUE:
+ ret = "X-uuencode";
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+enum rspamd_cte
+rspamd_cte_from_string(const gchar *str)
+{
+ enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
+
+ g_assert(str != NULL);
+
+ if (strcmp(str, "7bit") == 0) {
+ ret = RSPAMD_CTE_7BIT;
+ }
+ else if (strcmp(str, "8bit") == 0) {
+ ret = RSPAMD_CTE_8BIT;
+ }
+ else if (strcmp(str, "quoted-printable") == 0) {
+ ret = RSPAMD_CTE_QP;
+ }
+ else if (strcmp(str, "base64") == 0) {
+ ret = RSPAMD_CTE_B64;
+ }
+ else if (strcmp(str, "X-uuencode") == 0) {
+ ret = RSPAMD_CTE_UUE;
+ }
+ else if (strcmp(str, "uuencode") == 0) {
+ ret = RSPAMD_CTE_UUE;
+ }
+ else if (strcmp(str, "X-uue") == 0) {
+ ret = RSPAMD_CTE_UUE;
+ }
+
+ return ret;
+}
+
+static void
+rspamd_mime_parser_init_lib(void)
+{
+ lib_ctx = g_malloc0(sizeof(*lib_ctx));
+ lib_ctx->mp_boundary = rspamd_multipattern_create(RSPAMD_MULTIPATTERN_DEFAULT);
+ g_assert(lib_ctx->mp_boundary != NULL);
+ rspamd_multipattern_add_pattern(lib_ctx->mp_boundary, "\r--", 0);
+ rspamd_multipattern_add_pattern(lib_ctx->mp_boundary, "\n--", 0);
+
+ GError *err = NULL;
+ if (!rspamd_multipattern_compile(lib_ctx->mp_boundary, &err)) {
+ msg_err("fatal error: cannot compile multipattern for mime parser boundaries: %e", err);
+ g_error_free(err);
+ g_abort();
+ }
+ ottery_rand_bytes(lib_ctx->hkey, sizeof(lib_ctx->hkey));
+}
+
+static enum rspamd_cte
+rspamd_mime_parse_cte(const gchar *in, gsize len)
+{
+ guint64 h;
+ enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
+
+ in = rspamd_string_len_strip(in, &len, " \t;,.+-#!`~'");
+ h = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64,
+ in, len, 0xdeadbabe);
+
+ switch (h) {
+ case 0xCEDAA7056B4753F7ULL: /* 7bit */
+ ret = RSPAMD_CTE_7BIT;
+ break;
+ case 0x42E0745448B39FC1ULL: /* 8bit */
+ case 0x6B169E6B155BADC0ULL: /* binary */
+ ret = RSPAMD_CTE_8BIT;
+ break;
+ case 0x6D69A5BB02A633B0ULL: /* quoted-printable */
+ ret = RSPAMD_CTE_QP;
+ break;
+ case 0x96305588A76DC9A9ULL: /* base64 */
+ case 0x171029DE1B0423A9ULL: /* base-64 */
+ ret = RSPAMD_CTE_B64;
+ break;
+ case 0x420b54dc00d13cecULL: /* uuencode */
+ case 0x8df6700b8f6c4cf9ULL: /* x-uuencode */
+ case 0x41f725ec544356d3ULL: /* x-uue */
+ ret = RSPAMD_CTE_UUE;
+ break;
+ }
+
+ return ret;
+}
+
+static enum rspamd_cte
+rspamd_mime_part_get_cte_heuristic(struct rspamd_task *task,
+ struct rspamd_mime_part *part)
+{
+ const guint check_len = 128;
+ guint real_len, nspaces = 0, neqsign = 0, n8bit = 0, nqpencoded = 0,
+ padeqsign = 0, nupper = 0, nlower = 0;
+ gboolean b64_chars = TRUE;
+ const guchar *p, *end;
+ enum rspamd_cte ret = RSPAMD_CTE_UNKNOWN;
+
+ real_len = MIN(check_len, part->raw_data.len);
+ p = (const guchar *) part->raw_data.begin;
+ end = p + part->raw_data.len;
+
+ while (p < end && g_ascii_isspace(*p)) {
+ p++;
+ }
+
+ if (end - p > sizeof("begin-base64 ")) {
+ const guchar *uue_start;
+
+ if (memcmp(p, "begin ", sizeof("begin ") - 1) == 0) {
+ uue_start = p + sizeof("begin ") - 1;
+
+ while (uue_start < end && g_ascii_isspace(*uue_start)) {
+ uue_start++;
+ }
+
+ if (uue_start < end && g_ascii_isdigit(*uue_start)) {
+ return RSPAMD_CTE_UUE;
+ }
+ }
+ else if (memcmp(p, "begin-base64 ", sizeof("begin-base64 ") - 1) == 0) {
+ uue_start = p + sizeof("begin ") - 1;
+
+ while (uue_start < end && g_ascii_isspace(*uue_start)) {
+ uue_start++;
+ }
+
+ if (uue_start < end && g_ascii_isdigit(*uue_start)) {
+ return RSPAMD_CTE_UUE;
+ }
+ }
+ }
+
+ /* Skip trailing spaces */
+ while (end > p && g_ascii_isspace(*(end - 1))) {
+ end--;
+ }
+
+ if (end > p + 2) {
+ if (*(end - 1) == '=') {
+ padeqsign++;
+ end--;
+ }
+
+ if (*(end - 1) == '=') {
+ padeqsign++;
+ end--;
+ }
+ }
+
+ /* Adjust end to analyse only first characters */
+ if (end - p > real_len) {
+ end = p + real_len;
+ }
+
+ while (p < end) {
+ if (*p == ' ') {
+ nspaces++;
+ }
+ else if (*p == '=') {
+ b64_chars = FALSE; /* Eqsign must not be inside base64 */
+ neqsign++;
+ p++;
+
+ if (p + 2 < end && g_ascii_isxdigit(*p) && g_ascii_isxdigit(*(p + 1))) {
+ p++;
+ nqpencoded++;
+ }
+
+ continue;
+ }
+ else if (*p >= 0x80) {
+ n8bit++;
+ b64_chars = FALSE;
+ }
+ else if (!(g_ascii_isalnum(*p) || *p == '/' || *p == '+')) {
+ b64_chars = FALSE;
+ }
+ else if (g_ascii_isupper(*p)) {
+ nupper++;
+ }
+ else if (g_ascii_islower(*p)) {
+ nlower++;
+ }
+
+ p++;
+ }
+
+ if (b64_chars && neqsign <= 2 && nspaces == 0) {
+ /* Need more thinking */
+
+ if (part->raw_data.len > 80) {
+ if (padeqsign > 0) {
+ ret = RSPAMD_CTE_B64;
+ }
+ else {
+ /* We have a large piece of data with no spaces and base64
+ * symbols only, no padding is detected as well...
+ *
+ * There is a small chance that our first 128 characters
+ * are either some garbage or it is a base64 with no padding
+ * (e.g. when it is not needed)
+ */
+ if (nupper > 1 && nlower > 1) {
+ /*
+ * We have both uppercase and lowercase letters, so it can be
+ * base64
+ */
+ ret = RSPAMD_CTE_B64;
+ }
+ else {
+ ret = RSPAMD_CTE_7BIT;
+ }
+ }
+ }
+ else {
+
+ if (((end - (const guchar *) part->raw_data.begin) + padeqsign) % 4 == 0) {
+ if (padeqsign == 0) {
+ /*
+ * It can be either base64 or plain text, hard to say
+ * Let's assume that if we have > 1 uppercase it is
+ * likely base64
+ */
+ if (nupper > 1 && nlower > 1) {
+ ret = RSPAMD_CTE_B64;
+ }
+ else {
+ ret = RSPAMD_CTE_7BIT;
+ }
+ }
+ else {
+ ret = RSPAMD_CTE_B64;
+ }
+ }
+ else {
+ /* No way */
+ if (padeqsign == 1 || padeqsign == 2) {
+ ret = RSPAMD_CTE_B64;
+ }
+ else {
+ ret = RSPAMD_CTE_7BIT;
+ }
+ }
+ }
+ }
+ else if (n8bit == 0) {
+ if (neqsign > 2 && nqpencoded > 2) {
+ ret = RSPAMD_CTE_QP;
+ }
+ else {
+ ret = RSPAMD_CTE_7BIT;
+ }
+ }
+ else {
+ ret = RSPAMD_CTE_8BIT;
+ }
+
+ msg_debug_mime("detected cte: %s", rspamd_cte_to_string(ret));
+
+ return ret;
+}
+
+static void
+rspamd_mime_part_get_cte(struct rspamd_task *task,
+ struct rspamd_mime_headers_table *hdrs,
+ struct rspamd_mime_part *part,
+ gboolean apply_heuristic)
+{
+ struct rspamd_mime_header *hdr, *cur;
+ enum rspamd_cte cte = RSPAMD_CTE_UNKNOWN;
+ gboolean parent_propagated = FALSE;
+
+ hdr = rspamd_message_get_header_from_hash(hdrs, "Content-Transfer-Encoding", FALSE);
+
+ if (hdr == NULL) {
+ if (part->parent_part && part->parent_part->cte != RSPAMD_CTE_UNKNOWN &&
+ !(part->parent_part->flags & RSPAMD_MIME_PART_MISSING_CTE)) {
+ part->cte = part->parent_part->cte;
+ parent_propagated = TRUE;
+
+ goto check_cte;
+ }
+
+ if (apply_heuristic) {
+ part->cte = rspamd_mime_part_get_cte_heuristic(task, part);
+ msg_info_task("detected missing CTE for part as: %s",
+ rspamd_cte_to_string(part->cte));
+ }
+
+ part->flags |= RSPAMD_MIME_PART_MISSING_CTE;
+ }
+ else {
+ DL_FOREACH(hdr, cur)
+ {
+ gsize hlen;
+ gchar lc_buf[128];
+
+ hlen = rspamd_snprintf(lc_buf, sizeof(lc_buf), "%s", cur->value);
+ rspamd_str_lc(lc_buf, hlen);
+ cte = rspamd_mime_parse_cte(lc_buf, hlen);
+
+ if (cte != RSPAMD_CTE_UNKNOWN) {
+ part->cte = cte;
+ break;
+ }
+ }
+
+ check_cte:
+ if (apply_heuristic) {
+ if (part->cte == RSPAMD_CTE_UNKNOWN) {
+ part->cte = rspamd_mime_part_get_cte_heuristic(task, part);
+
+ msg_info_task("corrected bad CTE for part to: %s",
+ rspamd_cte_to_string(part->cte));
+ }
+ else if (part->cte == RSPAMD_CTE_B64 ||
+ part->cte == RSPAMD_CTE_QP) {
+ /* Additionally check sanity */
+ cte = rspamd_mime_part_get_cte_heuristic(task, part);
+
+ if (cte == RSPAMD_CTE_8BIT) {
+ msg_info_task(
+ "incorrect cte specified for part: %s, %s detected",
+ rspamd_cte_to_string(part->cte),
+ rspamd_cte_to_string(cte));
+ part->cte = cte;
+ part->flags |= RSPAMD_MIME_PART_BAD_CTE;
+ }
+ else if (cte != part->cte && parent_propagated) {
+ part->cte = cte;
+ msg_info_task("detected missing CTE for part as: %s",
+ rspamd_cte_to_string(part->cte));
+ }
+ }
+ else {
+ msg_debug_mime("processed cte: %s",
+ rspamd_cte_to_string(cte));
+ }
+ }
+ else {
+ msg_debug_mime("processed cte: %s", rspamd_cte_to_string(cte));
+ }
+ }
+}
+static void
+rspamd_mime_part_get_cd(struct rspamd_task *task, struct rspamd_mime_part *part)
+{
+ struct rspamd_mime_header *hdr, *cur;
+ struct rspamd_content_disposition *cd = NULL;
+ rspamd_ftok_t srch;
+ struct rspamd_content_type_param *found;
+
+ hdr = rspamd_message_get_header_from_hash(part->raw_headers,
+ "Content-Disposition", FALSE);
+
+
+ if (hdr == NULL) {
+ cd = rspamd_mempool_alloc0(task->task_pool, sizeof(*cd));
+ cd->type = RSPAMD_CT_INLINE;
+
+ /* We can also have content disposition definitions in Content-Type */
+ if (part->ct && part->ct->attrs) {
+ RSPAMD_FTOK_ASSIGN(&srch, "name");
+ found = g_hash_table_lookup(part->ct->attrs, &srch);
+
+ if (!found) {
+ RSPAMD_FTOK_ASSIGN(&srch, "filename");
+ found = g_hash_table_lookup(part->ct->attrs, &srch);
+ }
+
+ if (found) {
+ cd->type = RSPAMD_CT_ATTACHMENT;
+ memcpy(&cd->filename, &found->value, sizeof(cd->filename));
+ }
+ }
+ }
+ else {
+ DL_FOREACH(hdr, cur)
+ {
+ gsize hlen;
+ cd = NULL;
+
+ if (cur->value) {
+ hlen = strlen(cur->value);
+ cd = rspamd_content_disposition_parse(cur->value, hlen,
+ task->task_pool);
+ }
+
+ if (cd) {
+ /* We still need to check filename */
+ if (cd->filename.len == 0) {
+ if (part->ct && part->ct->attrs) {
+ RSPAMD_FTOK_ASSIGN(&srch, "name");
+ found = g_hash_table_lookup(part->ct->attrs, &srch);
+
+ if (!found) {
+ RSPAMD_FTOK_ASSIGN(&srch, "filename");
+ found = g_hash_table_lookup(part->ct->attrs, &srch);
+ }
+
+ if (found) {
+ cd->type = RSPAMD_CT_ATTACHMENT;
+ memcpy(&cd->filename, &found->value,
+ sizeof(cd->filename));
+ }
+ }
+ }
+
+ msg_debug_mime("processed content disposition: %s, file: \"%T\"",
+ cd->lc_data, &cd->filename);
+ break;
+ }
+ else if (part->ct) {
+ /*
+ * Even in case of malformed Content-Disposition, we can still
+ * fall back to Content-Type
+ */
+ cd = rspamd_mempool_alloc0(task->task_pool, sizeof(*cd));
+ cd->type = RSPAMD_CT_INLINE;
+
+ /* We can also have content disposition definitions in Content-Type */
+ if (part->ct->attrs) {
+ RSPAMD_FTOK_ASSIGN(&srch, "name");
+ found = g_hash_table_lookup(part->ct->attrs, &srch);
+
+ if (!found) {
+ RSPAMD_FTOK_ASSIGN(&srch, "filename");
+ found = g_hash_table_lookup(part->ct->attrs, &srch);
+ }
+
+ if (found) {
+ cd->type = RSPAMD_CT_ATTACHMENT;
+ memcpy(&cd->filename, &found->value, sizeof(cd->filename));
+ }
+ }
+ }
+ }
+ }
+
+ part->cd = cd;
+}
+
+void rspamd_mime_parser_calc_digest(struct rspamd_mime_part *part)
+{
+ /* Blake2b applied to string 'rspamd' */
+ static const guchar hash_key[] = {
+ 0xef,
+ 0x43,
+ 0xae,
+ 0x80,
+ 0xcc,
+ 0x8d,
+ 0xc3,
+ 0x4c,
+ 0x6f,
+ 0x1b,
+ 0xd6,
+ 0x18,
+ 0x1b,
+ 0xae,
+ 0x87,
+ 0x74,
+ 0x0c,
+ 0xca,
+ 0xf7,
+ 0x8e,
+ 0x5f,
+ 0x2e,
+ 0x54,
+ 0x32,
+ 0xf6,
+ 0x79,
+ 0xb9,
+ 0x27,
+ 0x26,
+ 0x96,
+ 0x20,
+ 0x92,
+ 0x70,
+ 0x07,
+ 0x85,
+ 0xeb,
+ 0x83,
+ 0xf7,
+ 0x89,
+ 0xe0,
+ 0xd7,
+ 0x32,
+ 0x2a,
+ 0xd2,
+ 0x1a,
+ 0x64,
+ 0x41,
+ 0xef,
+ 0x49,
+ 0xff,
+ 0xc3,
+ 0x8c,
+ 0x54,
+ 0xf9,
+ 0x67,
+ 0x74,
+ 0x30,
+ 0x1e,
+ 0x70,
+ 0x2e,
+ 0xb7,
+ 0x12,
+ 0x09,
+ 0xfe,
+ };
+
+ if (part->parsed_data.len > 0) {
+ rspamd_cryptobox_hash(part->digest,
+ part->parsed_data.begin, part->parsed_data.len,
+ hash_key, sizeof(hash_key));
+ }
+}
+
+static enum rspamd_mime_parse_error
+rspamd_mime_parse_normal_part(struct rspamd_task *task,
+ struct rspamd_mime_part *part,
+ struct rspamd_mime_parser_ctx *st,
+ struct rspamd_content_type *ct,
+ GError **err)
+{
+ rspamd_fstring_t *parsed;
+ gssize r;
+
+ g_assert(part != NULL);
+
+ rspamd_mime_part_get_cte(task, part->raw_headers, part,
+ part->ct && !(part->ct->flags & RSPAMD_CONTENT_TYPE_MESSAGE));
+ rspamd_mime_part_get_cd(task, part);
+
+ switch (part->cte) {
+ case RSPAMD_CTE_7BIT:
+ case RSPAMD_CTE_8BIT:
+ case RSPAMD_CTE_UNKNOWN:
+ if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) {
+ if (part->cte != RSPAMD_CTE_7BIT) {
+ /* We have something that has a missing content-type,
+ * but it has non-7bit characters.
+ *
+ * In theory, it is very unsafe to process it as a text part
+ * as we unlikely get some sane result
+ */
+
+ /*
+ * On the other hand, there is an evidence that some
+ * emails actually rely on that.
+ * So we apply an expensive hack here:
+ * if there are no 8bit characters -OR- the content is valid
+ * UTF8, we can still imply Content-Type == text/plain
+ */
+
+ if (rspamd_str_has_8bit(part->raw_data.begin, part->raw_data.len) &&
+ !rspamd_fast_utf8_validate(part->raw_data.begin, part->raw_data.len)) {
+ part->ct->flags &= ~RSPAMD_CONTENT_TYPE_TEXT;
+ part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
+ }
+ }
+ }
+
+ if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) {
+ /* Need to copy text as we have couple of in-place change functions */
+ parsed = rspamd_fstring_sized_new(part->raw_data.len);
+ parsed->len = part->raw_data.len;
+ memcpy(parsed->str, part->raw_data.begin, parsed->len);
+ part->parsed_data.begin = parsed->str;
+ part->parsed_data.len = parsed->len;
+ rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
+ rspamd_mempool_add_destructor(task->task_pool,
+ (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
+ }
+ else {
+ part->parsed_data.begin = part->raw_data.begin;
+ part->parsed_data.len = part->raw_data.len;
+ }
+ break;
+ case RSPAMD_CTE_QP:
+ parsed = rspamd_fstring_sized_new(part->raw_data.len);
+ r = rspamd_decode_qp_buf(part->raw_data.begin, part->raw_data.len,
+ parsed->str, parsed->allocated);
+ if (r != -1) {
+ parsed->len = r;
+ part->parsed_data.begin = parsed->str;
+ part->parsed_data.len = parsed->len;
+ rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
+ rspamd_mempool_add_destructor(task->task_pool,
+ (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
+ }
+ else {
+ msg_err_task("invalid quoted-printable encoded part, assume 8bit");
+ if (part->ct) {
+ part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
+ }
+ part->cte = RSPAMD_CTE_8BIT;
+ memcpy(parsed->str, part->raw_data.begin, part->raw_data.len);
+ parsed->len = part->raw_data.len;
+ part->parsed_data.begin = parsed->str;
+ part->parsed_data.len = parsed->len;
+ rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
+ rspamd_mempool_add_destructor(task->task_pool,
+ (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
+ }
+ break;
+ case RSPAMD_CTE_B64:
+ parsed = rspamd_fstring_sized_new(part->raw_data.len / 4 * 3 + 12);
+ rspamd_cryptobox_base64_decode(part->raw_data.begin,
+ part->raw_data.len,
+ parsed->str, &parsed->len);
+ part->parsed_data.begin = parsed->str;
+ part->parsed_data.len = parsed->len;
+ rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
+ rspamd_mempool_add_destructor(task->task_pool,
+ (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
+ break;
+ case RSPAMD_CTE_UUE:
+ parsed = rspamd_fstring_sized_new(part->raw_data.len / 4 * 3 + 12);
+ r = rspamd_decode_uue_buf(part->raw_data.begin, part->raw_data.len,
+ parsed->str, parsed->allocated);
+ rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
+ rspamd_mempool_add_destructor(task->task_pool,
+ (rspamd_mempool_destruct_t) rspamd_fstring_free, parsed);
+ if (r != -1) {
+ parsed->len = r;
+ part->parsed_data.begin = parsed->str;
+ part->parsed_data.len = parsed->len;
+ }
+ else {
+ msg_err_task("invalid uuencoding in encoded part, assume 8bit");
+ if (part->ct) {
+ part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
+ }
+ part->cte = RSPAMD_CTE_8BIT;
+ parsed->len = MIN(part->raw_data.len, parsed->allocated);
+ memcpy(parsed->str, part->raw_data.begin, parsed->len);
+ rspamd_mempool_notify_alloc(task->task_pool, parsed->len);
+ part->parsed_data.begin = parsed->str;
+ part->parsed_data.len = parsed->len;
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ part->part_number = MESSAGE_FIELD(task, parts)->len;
+ part->urls = g_ptr_array_new();
+ g_ptr_array_add(MESSAGE_FIELD(task, parts), part);
+ msg_debug_mime("parsed data part %T/%T of length %z (%z orig), %s cte",
+ &part->ct->type, &part->ct->subtype, part->parsed_data.len,
+ part->raw_data.len, rspamd_cte_to_string(part->cte));
+ rspamd_mime_parser_calc_digest(part);
+
+ if (ct && (ct->flags & RSPAMD_CONTENT_TYPE_SMIME)) {
+ CMS_ContentInfo *cms;
+ const unsigned char *der_beg = part->parsed_data.begin;
+ cms = d2i_CMS_ContentInfo(NULL, &der_beg, part->parsed_data.len);
+
+ if (cms) {
+ const ASN1_OBJECT *asn_ct = CMS_get0_eContentType(cms);
+ int ct_nid = OBJ_obj2nid(asn_ct);
+
+ if (ct_nid == NID_pkcs7_data) {
+ BIO *bio = BIO_new_mem_buf(part->parsed_data.begin,
+ part->parsed_data.len);
+
+ PKCS7 *p7;
+ p7 = d2i_PKCS7_bio(bio, NULL);
+
+ if (p7) {
+ ct_nid = OBJ_obj2nid(p7->type);
+
+ if (ct_nid == NID_pkcs7_signed) {
+ PKCS7 *p7_signed_content = p7->d.sign->contents;
+
+ ct_nid = OBJ_obj2nid(p7_signed_content->type);
+
+ if (ct_nid == NID_pkcs7_data && p7_signed_content->d.data) {
+ int ret;
+
+ msg_debug_mime("found an additional part inside of "
+ "smime structure of type %T/%T; length=%d",
+ &ct->type, &ct->subtype, p7_signed_content->d.data->length);
+ /*
+ * Since ASN.1 structures are freed, we need to copy
+ * the content
+ */
+ gchar *cpy = rspamd_mempool_alloc(task->task_pool,
+ p7_signed_content->d.data->length);
+ memcpy(cpy, p7_signed_content->d.data->data,
+ p7_signed_content->d.data->length);
+ ret = rspamd_mime_process_multipart_node(task,
+ st, NULL,
+ cpy, cpy + p7_signed_content->d.data->length,
+ TRUE, err);
+
+ PKCS7_free(p7);
+ BIO_free(bio);
+ CMS_ContentInfo_free(cms);
+
+ return ret;
+ }
+ }
+
+ PKCS7_free(p7);
+ }
+
+ BIO_free(bio);
+ }
+
+ CMS_ContentInfo_free(cms);
+ }
+ }
+
+ return RSPAMD_MIME_PARSE_OK;
+}
+
+struct rspamd_mime_multipart_cbdata {
+ struct rspamd_task *task;
+ struct rspamd_mime_part *multipart;
+ struct rspamd_mime_parser_ctx *st;
+ const gchar *part_start;
+ rspamd_ftok_t *cur_boundary;
+ guint64 bhash;
+ GError **err;
+};
+
+static enum rspamd_mime_parse_error
+rspamd_mime_process_multipart_node(struct rspamd_task *task,
+ struct rspamd_mime_parser_ctx *st,
+ struct rspamd_mime_part *multipart,
+ const gchar *start, const gchar *end,
+ gboolean is_finished,
+ GError **err)
+{
+ struct rspamd_content_type *ct, *sel = NULL;
+ struct rspamd_mime_header *hdr = NULL, *cur;
+ struct rspamd_mime_part *npart;
+ GString str;
+ goffset hdr_pos, body_pos;
+ enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_FATAL;
+
+
+ str.str = (gchar *) start;
+ str.len = end - start;
+
+ if (*start == '\n' || *start == '\r') {
+ /*
+ * We have a part that starts from newline which means that
+ * there are completely no headers in this part,
+ * hence we assume it as a text part
+ */
+ hdr_pos = 0;
+ body_pos = 0;
+
+ if (!is_finished) {
+ /* Ignore garbage */
+ const gchar *p = start;
+ gboolean seen_something = FALSE;
+
+ while (p < end) {
+ if (g_ascii_isalnum(*p)) {
+ seen_something = TRUE;
+ break;
+ }
+ p++;
+ }
+
+ if (!seen_something) {
+ return RSPAMD_MIME_PARSE_NO_PART;
+ }
+ }
+ }
+ else {
+ hdr_pos = rspamd_string_find_eoh(&str, &body_pos);
+ }
+
+ npart = rspamd_mempool_alloc0(task->task_pool,
+ sizeof(struct rspamd_mime_part));
+ npart->parent_part = multipart;
+ npart->raw_headers = rspamd_message_headers_new();
+ npart->headers_order = NULL;
+
+ if (multipart) {
+ if (multipart->specific.mp->children == NULL) {
+ multipart->specific.mp->children = g_ptr_array_sized_new(2);
+ }
+
+ g_ptr_array_add(multipart->specific.mp->children, npart);
+ }
+
+ if (hdr_pos > 0 && hdr_pos < str.len) {
+ npart->raw_headers_str = str.str;
+ npart->raw_headers_len = hdr_pos;
+ npart->raw_data.begin = start + body_pos;
+ npart->raw_data.len = (end - start) - body_pos;
+
+ if (npart->raw_headers_len > 0) {
+ rspamd_mime_headers_process(task, npart->raw_headers,
+ &npart->headers_order,
+ npart->raw_headers_str,
+ npart->raw_headers_len,
+ FALSE);
+
+ /* Preserve the natural order */
+ if (npart->headers_order) {
+ LL_REVERSE2(npart->headers_order, ord_next);
+ }
+ }
+
+ hdr = rspamd_message_get_header_from_hash(npart->raw_headers,
+ "Content-Type", FALSE);
+ }
+ else {
+ npart->raw_headers_str = 0;
+ npart->raw_headers_len = 0;
+ npart->raw_data.begin = start;
+ npart->raw_data.len = end - start;
+ }
+
+
+ if (hdr != NULL) {
+
+ DL_FOREACH(hdr, cur)
+ {
+ ct = rspamd_content_type_parse(cur->value, strlen(cur->value),
+ task->task_pool);
+
+ /* Here we prefer multipart content-type or any content-type */
+ if (ct) {
+ if (sel == NULL) {
+ sel = ct;
+ }
+ else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
+ sel = ct;
+ }
+ }
+ }
+ }
+
+ if (sel == NULL) {
+ sel = rspamd_mempool_alloc0(task->task_pool, sizeof(*sel));
+ RSPAMD_FTOK_ASSIGN(&sel->type, "text");
+ RSPAMD_FTOK_ASSIGN(&sel->subtype, "plain");
+ }
+
+ npart->ct = sel;
+
+ if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
+ st->nesting++;
+ g_ptr_array_add(st->stack, npart);
+ npart->part_type = RSPAMD_MIME_PART_MULTIPART;
+ npart->specific.mp = rspamd_mempool_alloc0(task->task_pool,
+ sizeof(struct rspamd_mime_multipart));
+ memcpy(&npart->specific.mp->boundary, &sel->orig_boundary,
+ sizeof(rspamd_ftok_t));
+ ret = rspamd_mime_parse_multipart_part(task, npart, st, err);
+ }
+ else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
+ st->nesting++;
+ g_ptr_array_add(st->stack, npart);
+ npart->part_type = RSPAMD_MIME_PART_MESSAGE;
+
+ if ((ret = rspamd_mime_parse_normal_part(task, npart, st, sel, err)) == RSPAMD_MIME_PARSE_OK) {
+ ret = rspamd_mime_parse_message(task, npart, st, err);
+ }
+ }
+ else {
+ ret = rspamd_mime_parse_normal_part(task, npart, st, sel, err);
+ }
+
+ return ret;
+}
+
+static enum rspamd_mime_parse_error
+rspamd_mime_parse_multipart_cb(struct rspamd_task *task,
+ struct rspamd_mime_part *multipart,
+ struct rspamd_mime_parser_ctx *st,
+ struct rspamd_mime_multipart_cbdata *cb,
+ struct rspamd_mime_boundary *b)
+{
+ const gchar *pos = st->start + b->boundary;
+ enum rspamd_mime_parse_error ret;
+
+ task = cb->task;
+
+ /* Now check boundary */
+ if (!cb->part_start) {
+ cb->part_start = st->start + b->start;
+ st->pos = cb->part_start;
+ }
+ else {
+ /*
+ * We have seen the start of the boundary,
+ * but it might be unsuitable (e.g. in broken headers)
+ */
+ if (cb->part_start < pos && cb->cur_boundary) {
+
+ if ((ret = rspamd_mime_process_multipart_node(task, cb->st,
+ cb->multipart, cb->part_start, pos, TRUE, cb->err)) != RSPAMD_MIME_PARSE_OK) {
+ return ret;
+ }
+
+ if (b->start > 0) {
+ /* Go towards the next part */
+ cb->part_start = st->start + b->start;
+ cb->st->pos = cb->part_start;
+ }
+ }
+ else {
+ /* We have an empty boundary, do nothing */
+ }
+ }
+
+ return RSPAMD_MIME_PARSE_OK;
+}
+
+static enum rspamd_mime_parse_error
+rspamd_multipart_boundaries_filter(struct rspamd_task *task,
+ struct rspamd_mime_part *multipart,
+ struct rspamd_mime_parser_ctx *st,
+ struct rspamd_mime_multipart_cbdata *cb)
+{
+ struct rspamd_mime_boundary *cur;
+ goffset last_offset;
+ guint i, sel = 0;
+ enum rspamd_mime_parse_error ret;
+
+ last_offset = (multipart->raw_data.begin - st->start) +
+ multipart->raw_data.len;
+
+ /* Find the first offset suitable for this part */
+ for (i = 0; i < st->boundaries->len; i++) {
+ cur = &g_array_index(st->boundaries, struct rspamd_mime_boundary, i);
+
+ if (cur->start >= multipart->raw_data.begin - st->start) {
+ if (cb->cur_boundary) {
+ /* Check boundary */
+ msg_debug_mime("compare %L and %L (and %L)",
+ cb->bhash, cur->hash, cur->closed_hash);
+
+ if (cb->bhash == cur->hash) {
+ sel = i;
+ break;
+ }
+ else if (cb->bhash == cur->closed_hash) {
+ /* Not a closing element in fact */
+ cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
+ cur->hash = cur->closed_hash;
+ sel = i;
+ break;
+ }
+ }
+ else {
+ /* Set current boundary */
+ cb->cur_boundary = rspamd_mempool_alloc(task->task_pool,
+ sizeof(rspamd_ftok_t));
+ cb->cur_boundary->begin = st->start + cur->boundary;
+ cb->cur_boundary->len = 0;
+ cb->bhash = cur->hash;
+ sel = i;
+ break;
+ }
+ }
+ }
+
+ /* Now we can go forward with boundaries that are same to what we have */
+ for (i = sel; i < st->boundaries->len; i++) {
+ cur = &g_array_index(st->boundaries, struct rspamd_mime_boundary, i);
+
+ if (cur->boundary > last_offset) {
+ break;
+ }
+
+ if (cur->hash == cb->bhash || cur->closed_hash == cb->bhash) {
+ if ((ret = rspamd_mime_parse_multipart_cb(task, multipart, st,
+ cb, cur)) != RSPAMD_MIME_PARSE_OK) {
+ return ret;
+ }
+
+ if (cur->closed_hash == cb->bhash) {
+ /* We have again fake closed hash */
+ cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
+ cur->hash = cur->closed_hash;
+ }
+
+ if (RSPAMD_BOUNDARY_IS_CLOSED(cur)) {
+ /* We also might check the next boundary... */
+ if (i < st->boundaries->len - 1) {
+ cur = &g_array_index(st->boundaries,
+ struct rspamd_mime_boundary, i + 1);
+
+ if (cur->hash == cb->bhash) {
+ continue;
+ }
+ else if (cur->closed_hash == cb->bhash) {
+ /* We have again fake closed hash */
+ cur->flags &= ~(RSPAMD_MIME_BOUNDARY_FLAG_CLOSED);
+ cur->hash = cur->closed_hash;
+ continue;
+ }
+ }
+
+ break;
+ }
+ }
+ }
+
+ if (i == st->boundaries->len && cb->cur_boundary) {
+ /* Process the last part */
+ struct rspamd_mime_boundary fb;
+
+ fb.boundary = last_offset;
+ fb.start = -1;
+
+ if ((ret = rspamd_mime_parse_multipart_cb(task, multipart, st,
+ cb, &fb)) != RSPAMD_MIME_PARSE_OK) {
+ return ret;
+ }
+ }
+
+ return RSPAMD_MIME_PARSE_OK;
+}
+
+static enum rspamd_mime_parse_error
+rspamd_mime_parse_multipart_part(struct rspamd_task *task,
+ struct rspamd_mime_part *part,
+ struct rspamd_mime_parser_ctx *st,
+ GError **err)
+{
+ struct rspamd_mime_multipart_cbdata cbdata;
+ enum rspamd_mime_parse_error ret;
+
+ if (st->nesting > max_nested) {
+ g_set_error(err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
+ st->nesting);
+ return RSPAMD_MIME_PARSE_NESTING;
+ }
+
+ part->part_number = MESSAGE_FIELD(task, parts)->len;
+ part->urls = g_ptr_array_new();
+ g_ptr_array_add(MESSAGE_FIELD(task, parts), part);
+ st->nesting++;
+ rspamd_mime_part_get_cte(task, part->raw_headers, part, FALSE);
+
+ st->pos = part->raw_data.begin;
+ cbdata.multipart = part;
+ cbdata.task = task;
+ cbdata.st = st;
+ cbdata.part_start = NULL;
+ cbdata.err = err;
+
+ if (part->ct->boundary.len > 0) {
+ /* We know our boundary */
+ cbdata.cur_boundary = &part->ct->boundary;
+ rspamd_cryptobox_siphash((guchar *) &cbdata.bhash,
+ cbdata.cur_boundary->begin, cbdata.cur_boundary->len,
+ lib_ctx->hkey);
+ msg_debug_mime("hash: %T -> %L", cbdata.cur_boundary, cbdata.bhash);
+ }
+ else {
+ /* Guess boundary */
+ cbdata.cur_boundary = NULL;
+ cbdata.bhash = 0;
+ }
+
+ ret = rspamd_multipart_boundaries_filter(task, part, st, &cbdata);
+ /* Cleanup stack */
+ st->nesting--;
+ g_ptr_array_remove_index_fast(st->stack, st->stack->len - 1);
+
+ return ret;
+}
+
+/* Process boundary like structures in a message */
+static gint
+rspamd_mime_preprocess_cb(struct rspamd_multipattern *mp,
+ guint strnum,
+ gint match_start,
+ gint match_pos,
+ const gchar *text,
+ gsize len,
+ void *context)
+{
+ const gchar *end = text + len, *p = text + match_pos, *bend;
+ gsize blen;
+ gboolean closing = FALSE;
+ struct rspamd_mime_boundary b;
+ struct rspamd_mime_parser_ctx *st = context;
+ struct rspamd_task *task;
+
+ task = st->task;
+
+ if (G_LIKELY(p < end)) {
+
+ blen = 0;
+
+ while (p < end) {
+ if (*p == '\r' || *p == '\n') {
+ break;
+ }
+
+ blen++;
+ p++;
+ }
+
+ if (blen > 0) {
+ /* We have found something like boundary */
+ p = text + match_pos;
+ bend = p + blen - 1;
+
+ if (*bend == '-') {
+ /* We need to verify last -- */
+ if (bend > p + 1 && *(bend - 1) == '-') {
+ closing = TRUE;
+ bend--;
+ blen -= 2;
+ }
+ else {
+ /* Not a closing boundary somehow, e.g. if a boundary=='-' */
+ bend++;
+ }
+ }
+ else {
+ bend++;
+ }
+
+ while (bend < end) {
+ if (*bend == '\r') {
+ bend++;
+
+ /* \r\n */
+ if (bend < end && *bend == '\n') {
+ bend++;
+ }
+ }
+ else if (*bend == '\n') {
+ /* \n */
+ bend++;
+ }
+ else if (g_ascii_isspace(*bend)) {
+ /* Spaces in the same line, skip them */
+ bend++;
+ continue;
+ }
+
+ break;
+ }
+
+ b.boundary = p - st->start - 2;
+ b.start = bend - st->start;
+
+ /* Small optimisation as boundaries are usually short strings */
+ gchar *lc_copy, lc_copy_buf[128];
+
+ if (blen + 2 < sizeof(lc_copy_buf)) {
+ lc_copy = lc_copy_buf;
+ }
+ else {
+ lc_copy = g_malloc(blen + 2);
+ }
+
+ if (closing) {
+ memcpy(lc_copy, p, blen + 2);
+ rspamd_str_lc(lc_copy, blen + 2);
+ }
+ else {
+ memcpy(lc_copy, p, blen);
+ rspamd_str_lc(lc_copy, blen);
+ }
+
+ rspamd_cryptobox_siphash((guchar *) &b.hash, lc_copy, blen,
+ lib_ctx->hkey);
+ msg_debug_mime("normal hash: %*s -> %L, %d boffset, %d data offset",
+ (gint) blen, lc_copy, b.hash, (int) b.boundary, (int) b.start);
+
+ if (closing) {
+ b.flags = RSPAMD_MIME_BOUNDARY_FLAG_CLOSED;
+ rspamd_cryptobox_siphash((guchar *) &b.closed_hash, lc_copy,
+ blen + 2,
+ lib_ctx->hkey);
+ msg_debug_mime("closing hash: %*s -> %L, %d boffset, %d data offset",
+ (gint) blen + 2, lc_copy,
+ b.closed_hash,
+ (int) b.boundary, (int) b.start);
+ }
+ else {
+ b.flags = 0;
+ b.closed_hash = 0;
+ }
+
+ /* Check if a string has been allocated on the heap */
+ if (blen + 2 >= sizeof(lc_copy_buf)) {
+ g_free(lc_copy);
+ }
+ g_array_append_val(st->boundaries, b);
+ }
+ }
+
+ return 0;
+}
+
+static goffset
+rspamd_mime_parser_headers_heuristic(GString *input, goffset *body_start)
+{
+ const gsize default_max_len = 76;
+ gsize max_len = MIN(input->len, default_max_len);
+ const gchar *p, *end;
+ enum {
+ st_before_colon = 0,
+ st_colon,
+ st_spaces_after_colon,
+ st_value,
+ st_error
+ } state = st_before_colon;
+
+ p = input->str;
+ end = p + max_len;
+
+ while (p < end) {
+ switch (state) {
+ case st_before_colon:
+ if (G_UNLIKELY(*p == ':')) {
+ state = st_colon;
+ }
+ else if (G_UNLIKELY(!g_ascii_isgraph(*p))) {
+ state = st_error;
+ }
+
+ p++;
+ break;
+ case st_colon:
+ if (g_ascii_isspace(*p)) {
+ state = st_spaces_after_colon;
+ }
+ else {
+ state = st_value;
+ }
+ p++;
+ break;
+ case st_spaces_after_colon:
+ if (!g_ascii_isspace(*p)) {
+ state = st_value;
+ }
+ p++;
+ break;
+ case st_value:
+ /* We accept any value */
+ goto end;
+ break;
+ case st_error:
+ return (-1);
+ break;
+ }
+ }
+
+end:
+ if (state == st_value) {
+ if (body_start) {
+ *body_start = input->len;
+ }
+
+ return input->len;
+ }
+
+ return (-1);
+}
+
+static void
+rspamd_mime_preprocess_message(struct rspamd_task *task,
+ struct rspamd_mime_part *top,
+ struct rspamd_mime_parser_ctx *st)
+{
+
+ if (top->raw_data.begin >= st->pos) {
+ rspamd_multipattern_lookup(lib_ctx->mp_boundary,
+ top->raw_data.begin - 1,
+ top->raw_data.len + 1,
+ rspamd_mime_preprocess_cb, st, NULL);
+ }
+ else {
+ rspamd_multipattern_lookup(lib_ctx->mp_boundary,
+ st->pos,
+ st->end - st->pos,
+ rspamd_mime_preprocess_cb, st, NULL);
+ }
+}
+
+static void
+rspamd_mime_parse_stack_free(struct rspamd_mime_parser_ctx *st)
+{
+ if (st) {
+ g_ptr_array_free(st->stack, TRUE);
+ g_array_free(st->boundaries, TRUE);
+ g_free(st);
+ }
+}
+
+static enum rspamd_mime_parse_error
+rspamd_mime_parse_message(struct rspamd_task *task,
+ struct rspamd_mime_part *part,
+ struct rspamd_mime_parser_ctx *st,
+ GError **err)
+{
+ struct rspamd_content_type *ct, *sel = NULL;
+ struct rspamd_mime_header *hdr = NULL, *cur;
+ const gchar *pbegin, *p;
+ gsize plen, len;
+ struct rspamd_mime_part *npart;
+ goffset hdr_pos, body_pos;
+ guint i;
+ enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
+ GString str;
+ struct rspamd_mime_parser_ctx *nst = st;
+
+ if (st->nesting > max_nested) {
+ g_set_error(err, RSPAMD_MIME_QUARK, E2BIG, "Nesting level is too high: %d",
+ st->nesting);
+ return RSPAMD_MIME_PARSE_NESTING;
+ }
+
+ /* Allocate real part */
+ npart = rspamd_mempool_alloc0(task->task_pool,
+ sizeof(struct rspamd_mime_part));
+
+ if (part == NULL) {
+ /* Top level message */
+ p = task->msg.begin;
+ len = task->msg.len;
+
+ str.str = (gchar *) p;
+ str.len = len;
+
+ hdr_pos = rspamd_string_find_eoh(&str, &body_pos);
+
+ if (hdr_pos > 0 && hdr_pos < str.len) {
+
+ MESSAGE_FIELD(task, raw_headers_content).begin = str.str;
+ MESSAGE_FIELD(task, raw_headers_content).len = hdr_pos;
+ MESSAGE_FIELD(task, raw_headers_content).body_start = str.str + body_pos;
+
+ if (MESSAGE_FIELD(task, raw_headers_content).len > 0) {
+ rspamd_mime_headers_process(task,
+ MESSAGE_FIELD(task, raw_headers),
+ &MESSAGE_FIELD(task, headers_order),
+ MESSAGE_FIELD(task, raw_headers_content).begin,
+ MESSAGE_FIELD(task, raw_headers_content).len,
+ TRUE);
+ npart->raw_headers = rspamd_message_headers_ref(
+ MESSAGE_FIELD(task, raw_headers));
+
+ /* Preserve the natural order */
+ if (MESSAGE_FIELD(task, headers_order)) {
+ LL_REVERSE2(MESSAGE_FIELD(task, headers_order), ord_next);
+ }
+ }
+
+ hdr = rspamd_message_get_header_from_hash(
+ MESSAGE_FIELD(task, raw_headers),
+ "Content-Type", FALSE);
+ }
+ else {
+ /* First apply heuristic, maybe we have just headers */
+ hdr_pos = rspamd_mime_parser_headers_heuristic(&str, &body_pos);
+
+ if (hdr_pos > 0 && hdr_pos <= str.len) {
+ MESSAGE_FIELD(task, raw_headers_content).begin = str.str;
+ MESSAGE_FIELD(task, raw_headers_content).len = hdr_pos;
+ MESSAGE_FIELD(task, raw_headers_content).body_start = str.str +
+ body_pos;
+
+ if (MESSAGE_FIELD(task, raw_headers_content).len > 0) {
+ rspamd_mime_headers_process(task,
+ MESSAGE_FIELD(task, raw_headers),
+ &MESSAGE_FIELD(task, headers_order),
+ MESSAGE_FIELD(task, raw_headers_content).begin,
+ MESSAGE_FIELD(task, raw_headers_content).len,
+ TRUE);
+ npart->raw_headers = rspamd_message_headers_ref(
+ MESSAGE_FIELD(task, raw_headers));
+
+ /* Preserve the natural order */
+ if (MESSAGE_FIELD(task, headers_order)) {
+ LL_REVERSE2(MESSAGE_FIELD(task, headers_order), ord_next);
+ }
+ }
+
+ hdr = rspamd_message_get_header_from_hash(
+ MESSAGE_FIELD(task, raw_headers),
+ "Content-Type", FALSE);
+ task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
+ }
+ else {
+ body_pos = 0;
+ }
+ }
+
+ pbegin = st->start + body_pos;
+ plen = st->end - pbegin;
+ npart->headers_order = NULL;
+ }
+ else {
+ /*
+ * Here are dragons:
+ * We allocate new parser context as we need to shift pointers
+ */
+ nst = g_malloc0(sizeof(*st));
+ nst->stack = g_ptr_array_sized_new(4);
+ nst->boundaries = g_array_sized_new(FALSE, FALSE,
+ sizeof(struct rspamd_mime_boundary), 8);
+ nst->start = part->parsed_data.begin;
+ nst->end = nst->start + part->parsed_data.len;
+ nst->pos = nst->start;
+ nst->task = st->task;
+ nst->nesting = st->nesting;
+ st->nesting++;
+
+ str.str = (gchar *) part->parsed_data.begin;
+ str.len = part->parsed_data.len;
+
+ hdr_pos = rspamd_string_find_eoh(&str, &body_pos);
+ npart->raw_headers = rspamd_message_headers_new();
+ npart->headers_order = NULL;
+
+ if (hdr_pos > 0 && hdr_pos < str.len) {
+ npart->raw_headers_str = str.str;
+ npart->raw_headers_len = hdr_pos;
+ npart->raw_data.begin = str.str + body_pos;
+
+ if (npart->raw_headers_len > 0) {
+ rspamd_mime_headers_process(task,
+ npart->raw_headers,
+ &npart->headers_order,
+ npart->raw_headers_str,
+ npart->raw_headers_len,
+ FALSE);
+
+ /* Preserve the natural order */
+ if (npart->headers_order) {
+ LL_REVERSE2(npart->headers_order, ord_next);
+ }
+ }
+
+ hdr = rspamd_message_get_header_from_hash(npart->raw_headers,
+ "Content-Type", FALSE);
+ }
+ else {
+ body_pos = 0;
+ }
+
+ pbegin = part->parsed_data.begin + body_pos;
+ plen = part->parsed_data.len - body_pos;
+ }
+
+ npart->raw_data.begin = pbegin;
+ npart->raw_data.len = plen;
+ npart->parent_part = part;
+
+ if (hdr == NULL) {
+ sel = NULL;
+ }
+ else {
+ DL_FOREACH(hdr, cur)
+ {
+ ct = rspamd_content_type_parse(cur->value, strlen(cur->value),
+ task->task_pool);
+
+ /* Here we prefer multipart content-type or any content-type */
+ if (ct) {
+ if (sel == NULL) {
+ sel = ct;
+ }
+ else if (ct->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
+ sel = ct;
+ }
+ }
+ }
+ }
+
+ if (sel == NULL) {
+ /* For messages we automatically assume plaintext */
+ msg_info_task("cannot find content-type for a message, assume text/plain");
+ sel = rspamd_mempool_alloc0(task->task_pool, sizeof(*sel));
+ sel->flags = RSPAMD_CONTENT_TYPE_TEXT | RSPAMD_CONTENT_TYPE_MISSING;
+ RSPAMD_FTOK_ASSIGN(&sel->type, "text");
+ RSPAMD_FTOK_ASSIGN(&sel->subtype, "plain");
+ }
+
+ npart->ct = sel;
+
+ if ((part == NULL || nst != st) &&
+ (sel->flags & (RSPAMD_CONTENT_TYPE_MULTIPART | RSPAMD_CONTENT_TYPE_MESSAGE))) {
+ /* Not a trivial message, need to preprocess */
+ rspamd_mime_preprocess_message(task, npart, nst);
+ }
+
+ if (sel->flags & RSPAMD_CONTENT_TYPE_MULTIPART) {
+ g_ptr_array_add(nst->stack, npart);
+ nst->nesting++;
+ npart->part_type = RSPAMD_MIME_PART_MULTIPART;
+ npart->specific.mp = rspamd_mempool_alloc0(task->task_pool,
+ sizeof(struct rspamd_mime_multipart));
+ memcpy(&npart->specific.mp->boundary, &sel->orig_boundary,
+ sizeof(rspamd_ftok_t));
+ ret = rspamd_mime_parse_multipart_part(task, npart, nst, err);
+ }
+ else if (sel->flags & RSPAMD_CONTENT_TYPE_MESSAGE) {
+ if ((ret = rspamd_mime_parse_normal_part(task, npart, nst, sel, err)) == RSPAMD_MIME_PARSE_OK) {
+ npart->part_type = RSPAMD_MIME_PART_MESSAGE;
+ ret = rspamd_mime_parse_message(task, npart, nst, err);
+ }
+ }
+ else {
+ ret = rspamd_mime_parse_normal_part(task, npart, nst, sel, err);
+ }
+
+ if (ret != RSPAMD_MIME_PARSE_OK) {
+ return ret;
+ }
+
+ if (part && st->stack->len > 0) {
+ /* Remove message part from the parent stack */
+ g_ptr_array_remove_index_fast(st->stack, st->stack->len - 1);
+ st->nesting--;
+ }
+
+ /* Process leftovers for boundaries */
+ if (nst->boundaries) {
+ struct rspamd_mime_boundary *boundary, *start_boundary = NULL,
+ *end_boundary = NULL;
+ goffset cur_offset = nst->pos - nst->start,
+ end_offset = st->end - st->start;
+ guint sel_idx = 0;
+
+ for (;;) {
+ start_boundary = NULL;
+
+ for (i = sel_idx; i < nst->boundaries->len; i++) {
+ boundary = &g_array_index(nst->boundaries,
+ struct rspamd_mime_boundary, i);
+
+ if (boundary->start > cur_offset &&
+ boundary->boundary < end_offset &&
+ !RSPAMD_BOUNDARY_IS_CLOSED(boundary)) {
+ start_boundary = boundary;
+ sel_idx = i;
+ break;
+ }
+ }
+
+ if (start_boundary) {
+ const gchar *start, *end;
+
+ if (nst->boundaries->len > sel_idx + 1) {
+ end_boundary = &g_array_index(nst->boundaries,
+ struct rspamd_mime_boundary, sel_idx + 1);
+ end = nst->start + end_boundary->boundary;
+ }
+ else {
+ end = nst->end;
+ }
+
+ sel_idx++;
+
+ start = nst->start + start_boundary->start;
+
+ if (end > start &&
+ (ret = rspamd_mime_process_multipart_node(task, nst,
+ NULL, start, end, FALSE, err)) != RSPAMD_MIME_PARSE_OK) {
+
+ if (nst != st) {
+ rspamd_mime_parse_stack_free(nst);
+ }
+
+ if (ret == RSPAMD_MIME_PARSE_NO_PART) {
+ return RSPAMD_MIME_PARSE_OK;
+ }
+
+ return ret;
+ }
+ }
+ else {
+ break;
+ }
+ }
+ }
+
+ if (nst != st) {
+ rspamd_mime_parse_stack_free(nst);
+ }
+
+ return ret;
+}
+
+enum rspamd_mime_parse_error
+rspamd_mime_parse_task(struct rspamd_task *task, GError **err)
+{
+ struct rspamd_mime_parser_ctx *st;
+ enum rspamd_mime_parse_error ret = RSPAMD_MIME_PARSE_OK;
+
+ if (lib_ctx == NULL) {
+ rspamd_mime_parser_init_lib();
+ }
+
+ if (++lib_ctx->key_usages > max_key_usages) {
+ /* Regenerate siphash key */
+ ottery_rand_bytes(lib_ctx->hkey, sizeof(lib_ctx->hkey));
+ lib_ctx->key_usages = 0;
+ }
+
+ st = g_malloc0(sizeof(*st));
+ st->stack = g_ptr_array_sized_new(4);
+ st->pos = MESSAGE_FIELD(task, raw_headers_content).body_start;
+ st->end = task->msg.begin + task->msg.len;
+ st->boundaries = g_array_sized_new(FALSE, FALSE,
+ sizeof(struct rspamd_mime_boundary), 8);
+ st->task = task;
+
+ if (st->pos == NULL) {
+ st->pos = task->msg.begin;
+ }
+
+ st->start = task->msg.begin;
+ ret = rspamd_mime_parse_message(task, NULL, st, err);
+ rspamd_mime_parse_stack_free(st);
+
+ return ret;
+}