diff options
Diffstat (limited to 'src/plugins/fts/fts-parser.c')
-rw-r--r-- | src/plugins/fts/fts-parser.c | 127 |
1 files changed, 127 insertions, 0 deletions
diff --git a/src/plugins/fts/fts-parser.c b/src/plugins/fts/fts-parser.c new file mode 100644 index 0000000..c0eac80 --- /dev/null +++ b/src/plugins/fts/fts-parser.c @@ -0,0 +1,127 @@ +/* Copyright (c) 2011-2018 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "buffer.h" +#include "unichar.h" +#include "message-parser.h" +#include "fts-parser.h" + +static const struct fts_parser_vfuncs *parsers[] = { + &fts_parser_html, + &fts_parser_script, + &fts_parser_tika +}; + +static const char *plaintext_content_types[] = { + "text/plain", + "message/delivery-status", + "message/disposition-notification", + "application/pgp-signature", + NULL +}; + +bool fts_parser_init(struct fts_parser_context *parser_context, + struct fts_parser **parser_r) +{ + unsigned int i; + i_assert(parser_context->user != NULL); + i_assert(parser_context->content_type != NULL); + + if (str_array_find(plaintext_content_types, parser_context->content_type)) { + /* we probably don't want/need to allow parsers to handle + plaintext? */ + return FALSE; + } + + for (i = 0; i < N_ELEMENTS(parsers); i++) { + *parser_r = parsers[i]->try_init(parser_context); + if (*parser_r != NULL) + return TRUE; + } + return FALSE; +} + +struct fts_parser *fts_parser_text_init(void) +{ + return i_new(struct fts_parser, 1); +} + +static bool data_has_nuls(const unsigned char *data, size_t size) +{ + size_t i; + + for (i = 0; i < size; i++) { + if (data[i] == '\0') + return TRUE; + } + return FALSE; +} + +static void replace_nul_bytes(buffer_t *buf) +{ + unsigned char *data; + size_t i, size; + + data = buffer_get_modifiable_data(buf, &size); + for (i = 0; i < size; i++) { + if (data[i] == '\0') + data[i] = ' '; + } +} + +void fts_parser_more(struct fts_parser *parser, struct message_block *block) +{ + if (parser->v.more != NULL) + parser->v.more(parser, block); + + if (!uni_utf8_data_is_valid(block->data, block->size) || + data_has_nuls(block->data, block->size)) { + /* output isn't valid UTF-8. make it. */ + if (parser->utf8_output == NULL) { + parser->utf8_output = + buffer_create_dynamic(default_pool, 4096); + } else { + buffer_set_used_size(parser->utf8_output, 0); + } + if (uni_utf8_get_valid_data(block->data, block->size, + parser->utf8_output)) { + /* valid UTF-8, but there were NULs */ + buffer_append(parser->utf8_output, block->data, + block->size); + } + replace_nul_bytes(parser->utf8_output); + block->data = parser->utf8_output->data; + block->size = parser->utf8_output->used; + } +} + +int fts_parser_deinit(struct fts_parser **_parser, const char **retriable_err_msg_r) +{ + struct fts_parser *parser = *_parser; + int ret = 1; + + *_parser = NULL; + + buffer_free(&parser->utf8_output); + if (parser->v.deinit != NULL) { + const char *error = NULL; + ret = parser->v.deinit(parser, &error); + if (ret == 0) { + i_assert(error != NULL); + if (retriable_err_msg_r != NULL) + *retriable_err_msg_r = error; + } + } else + i_free(parser); + return ret; +} + +void fts_parsers_unload(void) +{ + unsigned int i; + + for (i = 0; i < N_ELEMENTS(parsers); i++) { + if (parsers[i]->unload != NULL) + parsers[i]->unload(); + } +} |