diff options
Diffstat (limited to '')
-rw-r--r-- | src/plugins/fts/fts-search-args.c | 258 |
1 files changed, 258 insertions, 0 deletions
diff --git a/src/plugins/fts/fts-search-args.c b/src/plugins/fts/fts-search-args.c new file mode 100644 index 0000000..b58b238 --- /dev/null +++ b/src/plugins/fts/fts-search-args.c @@ -0,0 +1,258 @@ +/* Copyright (c) 2015-2018 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "array.h" +#include "mail-namespace.h" +#include "mail-search.h" +#include "fts-api-private.h" +#include "fts-tokenizer.h" +#include "fts-filter.h" +#include "fts-user.h" +#include "fts-search-args.h" + +static void strings_deduplicate(ARRAY_TYPE(const_string) *arr) +{ + const char *const *strings; + unsigned int i, count; + + strings = array_get(arr, &count); + for (i = 1; i < count; ) { + if (strcmp(strings[i-1], strings[i]) == 0) { + array_delete(arr, i, 1); + strings = array_get(arr, &count); + } else { + i++; + } + } +} + +static struct mail_search_arg * +fts_search_arg_create_or(const struct mail_search_arg *orig_arg, pool_t pool, + const ARRAY_TYPE(const_string) *tokens) +{ + struct mail_search_arg *arg, *or_arg, **argp; + const char *token; + + /* create the OR arg first as the parent */ + or_arg = p_new(pool, struct mail_search_arg, 1); + or_arg->type = SEARCH_OR; + + /* now create all the child args for the OR */ + argp = &or_arg->value.subargs; + array_foreach_elem(tokens, token) { + arg = p_new(pool, struct mail_search_arg, 1); + *arg = *orig_arg; + arg->match_not = FALSE; /* we copied this to the root OR */ + arg->next = NULL; + arg->value.str = p_strdup(pool, token); + + *argp = arg; + argp = &arg->next; + } + return or_arg; +} + +static int +fts_backend_dovecot_expand_tokens(struct fts_filter *filter, + pool_t pool, + struct mail_search_arg *parent_arg, + const struct mail_search_arg *orig_arg, + const char *orig_token, const char *token, + const char **error_r) +{ + struct mail_search_arg *arg; + ARRAY_TYPE(const_string) tokens; + const char *token2, *error; + int ret; + + t_array_init(&tokens, 4); + /* first add the word exactly as it without any tokenization */ + array_push_back(&tokens, &orig_token); + /* then add it tokenized, but without filtering */ + array_push_back(&tokens, &token); + + /* add the word filtered */ + if (filter != NULL) { + token2 = t_strdup(token); + ret = fts_filter_filter(filter, &token2, &error); + if (ret > 0) { + token2 = t_strdup(token2); + array_push_back(&tokens, &token2); + } else if (ret < 0) { + *error_r = t_strdup_printf("Couldn't filter search token: %s", error); + return -1; + } else { + /* The filter dropped the token, which means it was + never even indexed. Ignore this word entirely in the + search query. */ + return 0; + } + } + array_sort(&tokens, i_strcmp_p); + strings_deduplicate(&tokens); + + arg = fts_search_arg_create_or(orig_arg, pool, &tokens); + arg->next = parent_arg->value.subargs; + parent_arg->value.subargs = arg; + return 0; +} + +static int +fts_backend_dovecot_tokenize_lang(struct fts_user_language *user_lang, + pool_t pool, struct mail_search_arg *or_arg, + struct mail_search_arg *orig_arg, + const char *orig_token, const char **error_r) +{ + size_t orig_token_len = strlen(orig_token); + struct mail_search_arg *and_arg, *orig_or_args = or_arg->value.subargs; + const char *token, *error; + int ret; + + /* we want all the tokens found from the string to be found, so create + a parent AND and place all the filtered token alternatives under + it */ + and_arg = p_new(pool, struct mail_search_arg, 1); + and_arg->type = SEARCH_SUB; + and_arg->next = orig_or_args; + or_arg->value.subargs = and_arg; + + /* reset tokenizer between search args in case there's any state left + from some previous failure */ + fts_tokenizer_reset(user_lang->search_tokenizer); + while ((ret = fts_tokenizer_next(user_lang->search_tokenizer, + (const void *)orig_token, + orig_token_len, &token, &error)) > 0) { + if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool, + and_arg, orig_arg, orig_token, + token, error_r) < 0) + return -1; + } + while (ret >= 0 && + (ret = fts_tokenizer_final(user_lang->search_tokenizer, &token, &error)) > 0) { + if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool, + and_arg, orig_arg, orig_token, + token, error_r) < 0) + return -1; + } + if (ret < 0) { + *error_r = t_strdup_printf("Couldn't tokenize search args: %s", error); + return -1; + } + if (and_arg->value.subargs == NULL) { + /* nothing was actually expanded, remove the empty and_arg */ + or_arg->value.subargs = orig_or_args; + } + return 0; +} + +static int fts_search_arg_expand(struct fts_backend *backend, pool_t pool, + struct mail_search_arg **argp) +{ + const ARRAY_TYPE(fts_user_language) *languages; + struct fts_user_language *lang; + struct mail_search_arg *or_arg, *orig_arg = *argp; + const char *error, *orig_token = orig_arg->value.str; + + if (((*argp)->type == SEARCH_HEADER || + (*argp)->type == SEARCH_HEADER_ADDRESS || + (*argp)->type == SEARCH_HEADER_COMPRESS_LWSP) && + !fts_header_has_language((*argp)->hdr_field_name)) { + /* use only the data-language */ + languages = fts_user_get_data_languages(backend->ns->user); + } else { + languages = fts_user_get_all_languages(backend->ns->user); + } + + /* OR together all the different expansions for different languages. + it's enough for one of them to match. */ + or_arg = p_new(pool, struct mail_search_arg, 1); + or_arg->type = SEARCH_OR; + or_arg->match_not = orig_arg->match_not; + or_arg->next = orig_arg->next; + + array_foreach_elem(languages, lang) { + if (fts_backend_dovecot_tokenize_lang(lang, pool, or_arg, + orig_arg, orig_token, &error) < 0) { + i_error("fts: %s", error); + return -1; + } + } + + if (or_arg->value.subargs == NULL) { + /* we couldn't parse any tokens from the input */ + or_arg->type = SEARCH_ALL; + or_arg->match_not = !or_arg->match_not; + } + *argp = or_arg; + return 0; +} + +static int +fts_search_args_expand_tree(struct fts_backend *backend, pool_t pool, + struct mail_search_arg **argp) +{ + int ret; + + for (; *argp != NULL; argp = &(*argp)->next) { + switch ((*argp)->type) { + case SEARCH_OR: + case SEARCH_SUB: + case SEARCH_INTHREAD: + if (fts_search_args_expand_tree(backend, pool, + &(*argp)->value.subargs) < 0) + return -1; + break; + case SEARCH_HEADER: + case SEARCH_HEADER_ADDRESS: + case SEARCH_HEADER_COMPRESS_LWSP: + if ((*argp)->value.str[0] == '\0') { + /* we're testing for the existence of + the header */ + break; + } + /* fall through */ + case SEARCH_BODY: + case SEARCH_TEXT: + T_BEGIN { + ret = fts_search_arg_expand(backend, pool, argp); + } T_END; + if (ret < 0) + return -1; + break; + default: + break; + } + } + return 0; +} + +int fts_search_args_expand(struct fts_backend *backend, + struct mail_search_args *args) +{ + struct mail_search_arg *args_dup, *orig_args = args->args; + + /* don't keep re-expanding every time the search args are used. + this is especially important to avoid an assert-crash in + index_search_result_update_flags(). */ + if (args->fts_expanded) + return 0; + args->fts_expanded = TRUE; + + /* duplicate the args, so if expansion fails we haven't changed + anything */ + args_dup = mail_search_arg_dup(args->pool, args->args); + + if (fts_search_args_expand_tree(backend, args->pool, &args_dup) < 0) + return -1; + + /* we'll need to re-simplify the args if we changed anything */ + args->simplified = FALSE; + args->args = args_dup; + mail_search_args_simplify(args); + + /* duplicated args aren't initialized */ + i_assert(args->init_refcount > 0); + mail_search_arg_init(args, args_dup); + mail_search_arg_deinit(orig_args); + return 0; +} |