/* src/lib-mail/message-snippet.c */

/* Copyright (c) 2015-2018 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "buffer.h"
#include "str.h"
#include "istream.h"
#include "mail-html2text.h"
#include "message-parser.h"
#include "message-decoder.h"
#include "message-snippet.h"

#include <ctype.h>

/* Line-oriented parsing state of the snippet generator. The state decides
   whether incoming text is collected into the normal snippet or into the
   quoted-text snippet. NEWLINE is 0, so a zeroed snippet_context starts at
   the beginning of a line. */
enum snippet_state {
	/* beginning of the line - the next byte decides whether the line is
	   quoted (starts with '>') or normal text */
	SNIPPET_STATE_NEWLINE = 0,
	/* within normal text - content is added to the main snippet */
	SNIPPET_STATE_NORMAL,
	/* within quoted text - content is added to the quoted snippet until
	   the next newline resets the state */
	SNIPPET_STATE_QUOTED
};

/* A single snippet being built together with its remaining size budget. */
struct snippet_data {
	/* text collected so far */
	string_t *snippet;
	/* how many more characters may still be added; decremented once for
	   each appended character and once for each inserted space */
	unsigned int chars_left;
};

/* Working state shared by the snippet generation functions. */
struct snippet_context {
	/* snippet built from non-quoted text - preferred for the result */
	struct snippet_data snippet;
	/* snippet built from lines starting with '>' - used for the result
	   only when the non-quoted snippet is left empty */
	struct snippet_data quoted_snippet;
	/* current line state, see enum snippet_state */
	enum snippet_state state;
	/* TRUE when whitespace was seen after some content; a single space
	   is then emitted before the next added character */
	bool add_whitespace;
	/* HTML-to-text converter; when non-NULL the input is passed through
	   it before being scanned */
	struct mail_html2text *html2text;
	/* scratch buffer receiving the converter's plain text output */
	buffer_t *plain_output;
};

/* Process the character found at the beginning of data[0..size-1] and, if it
   is printable content, append it to target->snippet.

   - A 3-byte BOM sequence is consumed without being added.
   - NUL bytes are ignored and don't use up any of the snippet budget.
   - Whitespace is never added directly. It only arms ctx->add_whitespace
     (when the target already has content), and a '\n' switches ctx->state
     back to SNIPPET_STATE_NEWLINE.
   - Any other character uses up one unit of target->chars_left, plus one
     more for the pending separator space, and is appended.

   *count_r is written only when a BOM or a regular character is consumed. In
   all other cases it keeps the value the caller stored in it beforehand
   (snippet_generate() presets it to 1). */
static void snippet_add_content(struct snippet_context *ctx,
				struct snippet_data *target,
				const unsigned char *data, size_t size,
				size_t *count_r)
{
	unsigned char first;

	i_assert(target != NULL);
	if (size == 0)
		return;

	/* byte order mark, in either byte order: skip it entirely */
	if (size >= 3 && data[1] == 0xBB &&
	    ((data[0] == 0xEF && data[2] == 0xBF) ||
	     (data[0] == 0xBF && data[2] == 0xEF))) {
		*count_r = 3;
		return;
	}

	first = data[0];
	if (first == '\0') {
		/* NULs are dropped and cost nothing */
		return;
	}

	if (i_isspace(first)) {
		if (first == '\n')
			ctx->state = SNIPPET_STATE_NEWLINE;
		/* whitespace before any content is dropped; later runs of
		   whitespace collapse into one pending space */
		if (str_len(target->snippet) > 0)
			ctx->add_whitespace = TRUE;
		return;
	}

	if (target->chars_left == 0)
		return;

	if (!ctx->add_whitespace) {
		target->chars_left--;
	} else if (target->chars_left == 1) {
		/* only the space would fit: use up the budget rather than
		   leaving trailing whitespace in the snippet */
		target->chars_left = 0;
		return;
	} else {
		/* one unit for the space, one for the character itself */
		str_append_c(target->snippet, ' ');
		ctx->add_whitespace = FALSE;
		target->chars_left -= 2;
	}

	*count_r = uni_utf8_char_bytes(first);
	i_assert(*count_r <= size);
	str_append_data(target->snippet, data, *count_r);
}

/* Feed one block of decoded body data into the snippet being built.

   If an HTML converter is active, the block is first converted and the
   converter's output is scanned instead. Lines whose first byte is '>' are
   collected into ctx->quoted_snippet, everything else into ctx->snippet.

   Returns FALSE once the non-quoted snippet has no budget left, so the
   caller can stop reading; TRUE when more input may still be useful. */
static bool snippet_generate(struct snippet_context *ctx,
			     const unsigned char *data, size_t size)
{
	struct snippet_data *target;
	size_t pos = 0;

	if (ctx->html2text != NULL) {
		buffer_set_used_size(ctx->plain_output, 0);
		mail_html2text_more(ctx->html2text, data, size,
				    ctx->plain_output);
		data = ctx->plain_output->data;
		size = ctx->plain_output->used;
	}

	/* a previous block may have ended in the middle of a quoted line */
	target = ctx->state == SNIPPET_STATE_QUOTED ?
		&ctx->quoted_snippet : &ctx->snippet;

	/* message-decoder should feed us only valid and complete
	   UTF-8 input */
	while (pos < size) {
		/* bytes to advance; snippet_add_content() overrides this
		   when it consumes a multi-byte sequence */
		size_t consumed = 1;

		if (ctx->state == SNIPPET_STATE_NEWLINE) {
			/* first byte of a line chooses the destination */
			if (data[pos] != '>') {
				ctx->state = SNIPPET_STATE_NORMAL;
				target = &ctx->snippet;
			} else {
				ctx->state = SNIPPET_STATE_QUOTED;
				target = &ctx->quoted_snippet;
				/* the '>' itself isn't part of the content */
				pos++;
			}
		}

		snippet_add_content(ctx, target, CONST_PTR_OFFSET(data, pos),
				    size - pos, &consumed);
		/* Only the non-quoted snippet can end the scan. The quoted
		   one is just a fallback for when the real snippet stays
		   empty, so running out of quoted budget doesn't stop us. */
		if (ctx->snippet.chars_left == 0)
			return FALSE;
		pos += consumed;
	}
	return TRUE;
}

/* Append src to dst, leaving out any whitespace at the beginning of src. */
static void snippet_copy(const char *src, string_t *dst)
{
	const char *p;

	for (p = src; *p != '\0'; p++) {
		if (!i_isspace(*p))
			break;
	}
	str_append(dst, p);
}

/* Generate a short text preview ("snippet") of the message read from input
   and append it to the snippet string.

   The message is parsed and decoded block by block. Parts whose Content-Type
   is neither HTML (as matched by mail_html2text_content_type_match()) nor
   "text/..." are skipped; a part without a Content-Type is handled as plain
   text. HTML parts are converted to text before scanning. Once a part has
   produced any snippet text, later parts are not examined.

   Non-quoted text is preferred. If only quoted text (lines beginning with
   '>') was found, the result is '>' followed by the quoted text. Leading
   whitespace is removed from the collected text in both cases.

   max_snippet_chars limits how many characters are collected; for the quoted
   fallback one character of that budget is reserved for the '>' prefix.

   Returns 0 on success, -1 if input->stream_errno is set after reading. */
int message_snippet_generate(struct istream *input,
			     unsigned int max_snippet_chars,
			     string_t *snippet)
{
	const struct message_parser_settings parser_set = { .flags = 0 };
	struct message_parser_ctx *parser;
	struct message_part *parts;
	struct message_part *skip_part = NULL;
	struct message_decoder_context *decoder;
	struct message_block raw_block, block;
	struct snippet_context ctx;
	pool_t pool;
	int ret;

	i_zero(&ctx);
	pool = pool_alloconly_create("message snippet", 2048);
	ctx.snippet.snippet = str_new(pool, max_snippet_chars);
	ctx.snippet.chars_left = max_snippet_chars;
	ctx.quoted_snippet.snippet = str_new(pool, max_snippet_chars);
	/* -1 for '>'. Don't let a zero limit wrap around to a huge unsigned
	   budget, which would allow quoted text into the result even though
	   no characters were asked for. */
	ctx.quoted_snippet.chars_left =
		max_snippet_chars > 0 ? max_snippet_chars - 1 : 0;
	parser = message_parser_init(pool_datastack_create(), input, &parser_set);
	decoder = message_decoder_init(NULL, 0);
	while ((ret = message_parser_parse_next_block(parser, &raw_block)) > 0) {
		if (raw_block.part == skip_part)
			continue;
		if (!message_decoder_decode_next_block(decoder, &raw_block, &block))
			continue;
		if (block.size == 0) {
			const char *ct;

			/* header lines themselves are of no interest */
			if (block.hdr != NULL)
				continue;

			/* We already have a snippet, don't look for more in
			   subsequent parts. */
			if (ctx.snippet.snippet->used != 0 ||
			    ctx.quoted_snippet.snippet->used != 0)
				break;

			skip_part = NULL;

			/* The converter is chosen separately for every part.
			   Drop one left over from an earlier HTML part that
			   produced no text, so that it isn't applied to a
			   following non-HTML part. */
			mail_html2text_deinit(&ctx.html2text);

			/* end of headers - verify that we can use this
			   Content-Type. */
			ct = message_decoder_current_content_type(decoder);
			if (ct == NULL)
				/* text/plain */ ;
			else if (mail_html2text_content_type_match(ct)) {
				ctx.html2text = mail_html2text_init(0);
				if (ctx.plain_output == NULL) {
					ctx.plain_output =
						buffer_create_dynamic(pool, 1024);
				}
			} else if (strncasecmp(ct, "text/", 5) != 0)
				skip_part = raw_block.part;
		} else if (!snippet_generate(&ctx, block.data, block.size))
			break;
	}
	/* The loop ends either via break (ret > 0) or with a negative ret. */
	i_assert(ret != 0);
	message_decoder_deinit(&decoder);
	message_parser_deinit(&parser, &parts);
	mail_html2text_deinit(&ctx.html2text);
	if (ctx.snippet.snippet->used != 0)
		snippet_copy(str_c(ctx.snippet.snippet), snippet);
	else if (ctx.quoted_snippet.snippet->used != 0) {
		/* only quoted text was found - mark it as such */
		str_append_c(snippet, '>');
		snippet_copy(str_c(ctx.quoted_snippet.snippet), snippet);
	}
	pool_unref(&pool);
	return input->stream_errno == 0 ? 0 : -1;
}