summaryrefslogtreecommitdiffstats
path: root/src/plugins/fts/fts-parser.c
blob: c0eac80d25acc0fd72651e3868bdf340826f20a8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/* Copyright (c) 2011-2018 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "buffer.h"
#include "unichar.h"
#include "message-parser.h"
#include "fts-parser.h"

static const struct fts_parser_vfuncs *parsers[] = {
	&fts_parser_html,
	&fts_parser_script,
	&fts_parser_tika
};

static const char *plaintext_content_types[] = {
	"text/plain",
	"message/delivery-status",
	"message/disposition-notification",
	"application/pgp-signature",
	NULL
};

bool fts_parser_init(struct fts_parser_context *parser_context,
		     struct fts_parser **parser_r)
{
	unsigned int i;
	i_assert(parser_context->user != NULL);
	i_assert(parser_context->content_type != NULL);

	if (str_array_find(plaintext_content_types, parser_context->content_type)) {
		/* we probably don't want/need to allow parsers to handle
		   plaintext? */
		return FALSE;
	}

	for (i = 0; i < N_ELEMENTS(parsers); i++) {
		*parser_r = parsers[i]->try_init(parser_context);
		if (*parser_r != NULL)
			return TRUE;
	}
	return FALSE;
}

struct fts_parser *fts_parser_text_init(void)
{
	return i_new(struct fts_parser, 1);
}

static bool data_has_nuls(const unsigned char *data, size_t size)
{
	size_t i;

	for (i = 0; i < size; i++) {
		if (data[i] == '\0')
			return TRUE;
	}
	return FALSE;
}

static void replace_nul_bytes(buffer_t *buf)
{
	unsigned char *data;
	size_t i, size;

	data = buffer_get_modifiable_data(buf, &size);
	for (i = 0; i < size; i++) {
		if (data[i] == '\0')
			data[i] = ' ';
	}
}

void fts_parser_more(struct fts_parser *parser, struct message_block *block)
{
	if (parser->v.more != NULL)
		parser->v.more(parser, block);

	if (!uni_utf8_data_is_valid(block->data, block->size) ||
	    data_has_nuls(block->data, block->size)) {
		/* output isn't valid UTF-8. make it. */
		if (parser->utf8_output == NULL) {
			parser->utf8_output =
				buffer_create_dynamic(default_pool, 4096);
		} else {
			buffer_set_used_size(parser->utf8_output, 0);
		}
		if (uni_utf8_get_valid_data(block->data, block->size,
					    parser->utf8_output)) {
			/* valid UTF-8, but there were NULs */
			buffer_append(parser->utf8_output, block->data,
				      block->size);
		}
		replace_nul_bytes(parser->utf8_output);
		block->data = parser->utf8_output->data;
		block->size = parser->utf8_output->used;
	}
}

int fts_parser_deinit(struct fts_parser **_parser, const char **retriable_err_msg_r)
{
	struct fts_parser *parser = *_parser;
	int ret = 1;

	*_parser = NULL;

	buffer_free(&parser->utf8_output);
	if (parser->v.deinit != NULL) {
		const char *error = NULL;
		ret = parser->v.deinit(parser, &error);
		if (ret == 0) {
			i_assert(error != NULL);
			if (retriable_err_msg_r != NULL)
				*retriable_err_msg_r = error;
		}
	} else
		i_free(parser);
	return ret;
}

void fts_parsers_unload(void)
{
	unsigned int i;

	for (i = 0; i < N_ELEMENTS(parsers); i++) {
		if (parsers[i]->unload != NULL)
			parsers[i]->unload();
	}
}