summaryrefslogtreecommitdiffstats
path: root/src/lib-mail/message-snippet.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib-mail/message-snippet.c')
-rw-r--r--src/lib-mail/message-snippet.c207
1 files changed, 207 insertions, 0 deletions
diff --git a/src/lib-mail/message-snippet.c b/src/lib-mail/message-snippet.c
new file mode 100644
index 0000000..2982e2e
--- /dev/null
+++ b/src/lib-mail/message-snippet.c
@@ -0,0 +1,207 @@
+/* Copyright (c) 2015-2018 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "buffer.h"
+#include "str.h"
+#include "istream.h"
+#include "mail-html2text.h"
+#include "message-parser.h"
+#include "message-decoder.h"
+#include "message-snippet.h"
+
+#include <ctype.h>
+
+enum snippet_state {
+ /* beginning of the line */
+ SNIPPET_STATE_NEWLINE = 0,
+ /* within normal text */
+ SNIPPET_STATE_NORMAL,
+ /* within quoted text - skip until EOL */
+ SNIPPET_STATE_QUOTED
+};
+
+struct snippet_data {
+ string_t *snippet;
+ unsigned int chars_left;
+};
+
+struct snippet_context {
+ struct snippet_data snippet;
+ struct snippet_data quoted_snippet;
+ enum snippet_state state;
+ bool add_whitespace;
+ struct mail_html2text *html2text;
+ buffer_t *plain_output;
+};
+
+static void snippet_add_content(struct snippet_context *ctx,
+ struct snippet_data *target,
+ const unsigned char *data, size_t size,
+ size_t *count_r)
+{
+ i_assert(target != NULL);
+ if (size == 0)
+ return;
+ if (size >= 3 &&
+ ((data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) ||
+ (data[0] == 0xBF && data[1] == 0xBB && data[2] == 0xEF))) {
+ *count_r = 3;
+ return;
+ }
+ if (data[0] == '\0') {
+ /* skip NULs without increasing snippet size */
+ return;
+ }
+ if (i_isspace(*data)) {
+ /* skip any leading whitespace */
+ if (str_len(target->snippet) > 0)
+ ctx->add_whitespace = TRUE;
+ if (data[0] == '\n')
+ ctx->state = SNIPPET_STATE_NEWLINE;
+ return;
+ }
+ if (target->chars_left == 0)
+ return;
+ target->chars_left--;
+ if (ctx->add_whitespace) {
+ if (target->chars_left == 0) {
+ /* don't add a trailing whitespace */
+ return;
+ }
+ str_append_c(target->snippet, ' ');
+ ctx->add_whitespace = FALSE;
+ target->chars_left--;
+ }
+ *count_r = uni_utf8_char_bytes(data[0]);
+ i_assert(*count_r <= size);
+ str_append_data(target->snippet, data, *count_r);
+}
+
+static bool snippet_generate(struct snippet_context *ctx,
+ const unsigned char *data, size_t size)
+{
+ size_t i, count;
+ struct snippet_data *target;
+
+ if (ctx->html2text != NULL) {
+ buffer_set_used_size(ctx->plain_output, 0);
+ mail_html2text_more(ctx->html2text, data, size,
+ ctx->plain_output);
+ data = ctx->plain_output->data;
+ size = ctx->plain_output->used;
+ }
+
+ if (ctx->state == SNIPPET_STATE_QUOTED)
+ target = &ctx->quoted_snippet;
+ else
+ target = &ctx->snippet;
+
+ /* message-decoder should feed us only valid and complete
+ UTF-8 input */
+
+ for (i = 0; i < size; i += count) {
+ count = 1;
+ switch (ctx->state) {
+ case SNIPPET_STATE_NEWLINE:
+ if (data[i] == '>') {
+ ctx->state = SNIPPET_STATE_QUOTED;
+ i++;
+ target = &ctx->quoted_snippet;
+ } else {
+ ctx->state = SNIPPET_STATE_NORMAL;
+ target = &ctx->snippet;
+ }
+ /* fallthrough */
+ case SNIPPET_STATE_NORMAL:
+ case SNIPPET_STATE_QUOTED:
+ snippet_add_content(ctx, target, CONST_PTR_OFFSET(data, i),
+ size-i, &count);
+ /* break here if we have enough non-quoted data,
+ quoted data does not need to break here as it's
+ only used if the actual snippet is left empty. */
+ if (ctx->snippet.chars_left == 0)
+ return FALSE;
+ break;
+ }
+ }
+ return TRUE;
+}
+
+static void snippet_copy(const char *src, string_t *dst)
+{
+ while (*src != '\0' && i_isspace(*src)) src++;
+ str_append(dst, src);
+}
+
+int message_snippet_generate(struct istream *input,
+ unsigned int max_snippet_chars,
+ string_t *snippet)
+{
+ const struct message_parser_settings parser_set = { .flags = 0 };
+ struct message_parser_ctx *parser;
+ struct message_part *parts;
+ struct message_part *skip_part = NULL;
+ struct message_decoder_context *decoder;
+ struct message_block raw_block, block;
+ struct snippet_context ctx;
+ pool_t pool;
+ int ret;
+
+ i_zero(&ctx);
+ pool = pool_alloconly_create("message snippet", 2048);
+ ctx.snippet.snippet = str_new(pool, max_snippet_chars);
+ ctx.snippet.chars_left = max_snippet_chars;
+ ctx.quoted_snippet.snippet = str_new(pool, max_snippet_chars);
+ ctx.quoted_snippet.chars_left = max_snippet_chars - 1; /* -1 for '>' */
+ parser = message_parser_init(pool_datastack_create(), input, &parser_set);
+ decoder = message_decoder_init(NULL, 0);
+ while ((ret = message_parser_parse_next_block(parser, &raw_block)) > 0) {
+ if (raw_block.part == skip_part)
+ continue;
+ if (!message_decoder_decode_next_block(decoder, &raw_block, &block))
+ continue;
+ if (block.size == 0) {
+ const char *ct;
+
+ if (block.hdr != NULL)
+ continue;
+
+ /* We already have a snippet, don't look for more in
+ subsequent parts. */
+ if (ctx.snippet.snippet->used != 0 ||
+ ctx.quoted_snippet.snippet->used != 0)
+ break;
+
+ skip_part = NULL;
+
+ /* end of headers - verify that we can use this
+ Content-Type. we get here only once, because we
+ always handle only one non-multipart MIME part. */
+ ct = message_decoder_current_content_type(decoder);
+ if (ct == NULL)
+ /* text/plain */ ;
+ else if (mail_html2text_content_type_match(ct)) {
+ mail_html2text_deinit(&ctx.html2text);
+ ctx.html2text = mail_html2text_init(0);
+ if (ctx.plain_output == NULL) {
+ ctx.plain_output =
+ buffer_create_dynamic(pool, 1024);
+ }
+ } else if (strncasecmp(ct, "text/", 5) != 0)
+ skip_part = raw_block.part;
+ } else if (!snippet_generate(&ctx, block.data, block.size))
+ break;
+ }
+ i_assert(ret != 0);
+ message_decoder_deinit(&decoder);
+ message_parser_deinit(&parser, &parts);
+ mail_html2text_deinit(&ctx.html2text);
+ if (ctx.snippet.snippet->used != 0)
+ snippet_copy(str_c(ctx.snippet.snippet), snippet);
+ else if (ctx.quoted_snippet.snippet->used != 0) {
+ str_append_c(snippet, '>');
+ snippet_copy(str_c(ctx.quoted_snippet.snippet), snippet);
+ }
+ pool_unref(&pool);
+ return input->stream_errno == 0 ? 0 : -1;
+}