summaryrefslogtreecommitdiffstats
path: root/src/plugins/fts/fts-parser.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/fts/fts-parser.c')
-rw-r--r--src/plugins/fts/fts-parser.c127
1 files changed, 127 insertions, 0 deletions
diff --git a/src/plugins/fts/fts-parser.c b/src/plugins/fts/fts-parser.c
new file mode 100644
index 0000000..c0eac80
--- /dev/null
+++ b/src/plugins/fts/fts-parser.c
@@ -0,0 +1,127 @@
+/* Copyright (c) 2011-2018 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "buffer.h"
+#include "unichar.h"
+#include "message-parser.h"
+#include "fts-parser.h"
+
+#include <string.h>
+
+/* Parser implementations known to this file. fts_parser_init() calls each
+   entry's try_init() in the order listed here and uses the first one that
+   returns non-NULL; fts_parsers_unload() calls each entry's unload() when
+   it is set. */
+static const struct fts_parser_vfuncs *parsers[] = {
+ &fts_parser_html,
+ &fts_parser_script,
+ &fts_parser_tika
+};
+
+/* NULL-terminated list of content types that fts_parser_init() refuses
+   up front: when the context's content_type is found here, no parser from
+   parsers[] is tried and FALSE is returned. */
+static const char *plaintext_content_types[] = {
+ "text/plain",
+ "message/delivery-status",
+ "message/disposition-notification",
+ "application/pgp-signature",
+ NULL
+};
+
+/* Find a parser for the given context.
+
+   Both parser_context->user and parser_context->content_type must be
+   non-NULL (asserted). Content types listed in plaintext_content_types are
+   rejected immediately without consulting any parser and without touching
+   *parser_r. Otherwise every entry of parsers[] gets a try_init() call, in
+   table order, and its result is stored in *parser_r.
+
+   Returns TRUE as soon as one try_init() yields a non-NULL parser, FALSE
+   when none does (in which case *parser_r holds the last NULL result). */
+bool fts_parser_init(struct fts_parser_context *parser_context,
+		     struct fts_parser **parser_r)
+{
+	unsigned int idx = 0;
+
+	i_assert(parser_context->user != NULL);
+	i_assert(parser_context->content_type != NULL);
+
+	/* plaintext-like parts are never handed to a parser */
+	if (str_array_find(plaintext_content_types,
+			   parser_context->content_type))
+		return FALSE;
+
+	while (idx < N_ELEMENTS(parsers)) {
+		*parser_r = parsers[idx]->try_init(parser_context);
+		if (*parser_r != NULL)
+			return TRUE;
+		idx++;
+	}
+	return FALSE;
+}
+
+/* Allocate one struct fts_parser with i_new() and return it as-is; no
+   vfuncs or other fields are assigned here.
+   NOTE(review): presumably i_new() zero-fills, so v.more / v.deinit are
+   NULL and fts_parser_more() / fts_parser_deinit() take their
+   "no callback" paths - confirm against the allocator. */
+struct fts_parser *fts_parser_text_init(void)
+{
+	struct fts_parser *text_parser;
+
+	text_parser = i_new(struct fts_parser, 1);
+	return text_parser;
+}
+
+/* Report whether the first `size` bytes of `data` contain a NUL byte.
+
+   Returns true when at least one of data[0..size-1] is '\0', false
+   otherwise. An empty range (size == 0) never contains a NUL; in that case
+   `data` is not dereferenced or passed on, so it may be NULL. */
+static bool data_has_nuls(const unsigned char *data, size_t size)
+{
+	/* memchr() replaces the hand-written byte loop; the size check keeps
+	   a possibly-NULL pointer away from it for empty input. */
+	return size > 0 && memchr(data, '\0', size) != NULL;
+}
+
+/* Overwrite every NUL byte in the buffer's data with a space (' '), in
+   place. The buffer's length is left as reported by
+   buffer_get_modifiable_data(); only byte values change. */
+static void replace_nul_bytes(buffer_t *buf)
+{
+	size_t len, pos = 0;
+	unsigned char *bytes;
+
+	bytes = buffer_get_modifiable_data(buf, &len);
+	while (pos < len) {
+		if (bytes[pos] == '\0')
+			bytes[pos] = ' ';
+		pos++;
+	}
+}
+
+/* Run one block through the parser and make sure what comes out is valid
+   UTF-8 without NUL bytes.
+
+   If the parser has a more() callback it is invoked first and may change
+   `block`. When the resulting data is valid UTF-8 and NUL-free, `block` is
+   left alone. Otherwise a sanitized copy is built in parser->utf8_output
+   (created on first use, reset on later uses) and block->data /
+   block->size are redirected to that buffer. */
+void fts_parser_more(struct fts_parser *parser, struct message_block *block)
+{
+	if (parser->v.more != NULL)
+		parser->v.more(parser, block);
+
+	/* nothing to fix: valid UTF-8 and no NULs */
+	if (uni_utf8_data_is_valid(block->data, block->size) &&
+	    !data_has_nuls(block->data, block->size))
+		return;
+
+	/* get an empty scratch buffer, reusing the previous one if any */
+	if (parser->utf8_output != NULL)
+		buffer_set_used_size(parser->utf8_output, 0);
+	else
+		parser->utf8_output = buffer_create_dynamic(default_pool, 4096);
+
+	if (uni_utf8_get_valid_data(block->data, block->size,
+				    parser->utf8_output)) {
+		/* the input was already valid UTF-8, so we only got here
+		   because of NULs: copy the data verbatim */
+		buffer_append(parser->utf8_output, block->data, block->size);
+	}
+	replace_nul_bytes(parser->utf8_output);
+
+	block->data = parser->utf8_output->data;
+	block->size = parser->utf8_output->used;
+}
+
+/* Tear down a parser and clear the caller's pointer.
+
+   *_parser is set to NULL and the parser's utf8_output buffer is freed.
+   Without a deinit() callback the parser struct is released with i_free()
+   and 1 is returned. With a callback, the parser is handed to it instead
+   (this function does not free it in that case) and the callback's return
+   value is passed through.
+
+   When the callback returns 0 it must have provided an error string
+   (asserted); that string is stored in *retriable_err_msg_r if the caller
+   supplied a non-NULL pointer. For any other return value
+   *retriable_err_msg_r is not written. */
+int fts_parser_deinit(struct fts_parser **_parser, const char **retriable_err_msg_r)
+{
+	struct fts_parser *parser = *_parser;
+	const char *error = NULL;
+	int ret;
+
+	*_parser = NULL;
+	buffer_free(&parser->utf8_output);
+
+	if (parser->v.deinit == NULL) {
+		/* no parser-specific cleanup: just release the struct */
+		i_free(parser);
+		return 1;
+	}
+
+	ret = parser->v.deinit(parser, &error);
+	if (ret == 0) {
+		i_assert(error != NULL);
+		if (retriable_err_msg_r != NULL)
+			*retriable_err_msg_r = error;
+	}
+	return ret;
+}
+
+/* Call unload() on every entry of parsers[] that provides one, in table
+   order. Entries whose unload is NULL are skipped. */
+void fts_parsers_unload(void)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < N_ELEMENTS(parsers); idx++) {
+		const struct fts_parser_vfuncs *vfuncs = parsers[idx];
+
+		if (vfuncs->unload == NULL)
+			continue;
+		vfuncs->unload();
+	}
+}