diff options
Diffstat (limited to 'src/plugins/fts/fts-search-args.c')
-rw-r--r-- | src/plugins/fts/fts-search-args.c | 426 |
1 files changed, 426 insertions, 0 deletions
diff --git a/src/plugins/fts/fts-search-args.c b/src/plugins/fts/fts-search-args.c new file mode 100644 index 0000000..6b83573 --- /dev/null +++ b/src/plugins/fts/fts-search-args.c @@ -0,0 +1,426 @@ +/* Copyright (c) 2015-2018 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "array.h" +#include "mail-namespace.h" +#include "mail-search.h" +#include "fts-api-private.h" +#include "fts-tokenizer.h" +#include "fts-filter-private.h" +#include "fts-user.h" +#include "fts-search-args.h" +#include "fts-language.h" + +#define STOPWORDS_WORKAROUND_KEY "fts_stopwords_workaround" + +static void strings_deduplicate(ARRAY_TYPE(const_string) *arr) +{ + const char *const *strings; + unsigned int i, count; + + strings = array_get(arr, &count); + for (i = 1; i < count; ) { + if (strcmp(strings[i-1], strings[i]) == 0) { + array_delete(arr, i, 1); + strings = array_get(arr, &count); + } else { + i++; + } + } +} + +static struct mail_search_arg * +fts_search_arg_create_or(const struct mail_search_arg *orig_arg, pool_t pool, + const ARRAY_TYPE(const_string) *tokens) +{ + struct mail_search_arg *arg, *or_arg, **argp; + const char *token; + + /* create the OR arg first as the parent */ + or_arg = p_new(pool, struct mail_search_arg, 1); + or_arg->type = SEARCH_OR; + + /* now create all the child args for the OR */ + argp = &or_arg->value.subargs; + array_foreach_elem(tokens, token) { + arg = p_new(pool, struct mail_search_arg, 1); + *arg = *orig_arg; + arg->match_not = FALSE; /* we copied this to the root OR */ + arg->next = NULL; + arg->value.str = p_strdup(pool, token); + + *argp = arg; + argp = &arg->next; + } + return or_arg; +} + +static int +fts_backend_dovecot_expand_tokens(struct fts_filter *filter, + pool_t pool, + struct mail_search_arg *parent_arg, + const struct mail_search_arg *orig_arg, + const char *orig_token, const char *token, + const char **error_r) +{ + struct mail_search_arg *arg; + ARRAY_TYPE(const_string) tokens; + const char *token2, *error; + int ret; + + t_array_init(&tokens, 4); + /* first add the word exactly as it without any tokenization */ + array_push_back(&tokens, &orig_token); + /* then add it tokenized, but without filtering */ + array_push_back(&tokens, &token); + + /* add the word filtered */ + if (filter != NULL) { + token2 = t_strdup(token); + ret = fts_filter_filter(filter, &token2, &error); + if (ret > 0) { + token2 = t_strdup(token2); + array_push_back(&tokens, &token2); + } else if (ret < 0) { + *error_r = t_strdup_printf("Couldn't filter search token: %s", error); + return -1; + } else { + /* The filter dropped the token, which means it was + never even indexed. Ignore this word entirely in the + search query. */ + return 0; + } + } + array_sort(&tokens, i_strcmp_p); + strings_deduplicate(&tokens); + + arg = fts_search_arg_create_or(orig_arg, pool, &tokens); + arg->next = parent_arg->value.subargs; + parent_arg->value.subargs = arg; + return 0; +} + +#define BOTTOM_LANGUAGE_EXPANSION TRUE +#define TOP_LANGUAGE_EXPANSION FALSE + +static int +fts_backend_dovecot_tokenize_lang(struct fts_user_language *user_lang, + pool_t pool, struct mail_search_arg *or_arg, + struct mail_search_arg *orig_arg, + const char *orig_token, + bool bottom_language_expansion, + const char **error_r) +{ + size_t orig_token_len = strlen(orig_token); + struct mail_search_arg *and_arg, *orig_or_args = or_arg->value.subargs; + const char *token, *error; + int ret; + + /* we want all the tokens found from the string to be found, so create + a parent AND and place all the filtered token alternatives under + it */ + and_arg = p_new(pool, struct mail_search_arg, 1); + and_arg->type = SEARCH_SUB; + and_arg->next = orig_or_args; + or_arg->value.subargs = and_arg; + + /* reset tokenizer between search args in case there's any state left + from some previous failure */ + fts_tokenizer_reset(user_lang->search_tokenizer); + while ((ret = fts_tokenizer_next(user_lang->search_tokenizer, + (const void *)orig_token, + orig_token_len, &token, &error)) > 0) { + if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool, + and_arg, orig_arg, orig_token, + token, error_r) < 0) + return -1; + } + while (ret >= 0 && + (ret = fts_tokenizer_final(user_lang->search_tokenizer, &token, &error)) > 0) { + if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool, + and_arg, orig_arg, orig_token, + token, error_r) < 0) + return -1; + } + if (ret < 0) { + *error_r = t_strdup_printf("Couldn't tokenize search args: %s", error); + return -1; + } + if (and_arg->value.subargs == NULL) { + if (bottom_language_expansion) { + /* remove this empty term entirely */ + or_arg->value.subargs = orig_or_args; + } else { + /* The simplifier will propagate the NIL to the + upper operators, if required, and remove it at + the appropriate level */ + and_arg->type = SEARCH_NIL; + } + } + return 0; +} + +static int fts_search_arg_expand(struct fts_backend *backend, pool_t pool, + struct fts_user_language *lang, + bool bottom_language_expansion, + struct mail_search_arg **argp) +{ + const ARRAY_TYPE(fts_user_language) *languages; + ARRAY_TYPE(fts_user_language) langs; + struct mail_search_arg *or_arg, *orig_arg = *argp; + const char *error, *orig_token = orig_arg->value.str; + + /* If we are invoked with no lang (null), we are operating in a bottom + language expansion, which is iterated here. In this case we also expect + to be removing NILs terms. + + Otherwise, if we are invoked with a specific lang, we are working in + a top level language expansion (done above us), and we do NOT want to + remove the NIL terms. */ + i_assert(bottom_language_expansion == (lang == NULL)); + + if (((*argp)->type == SEARCH_HEADER || + (*argp)->type == SEARCH_HEADER_ADDRESS || + (*argp)->type == SEARCH_HEADER_COMPRESS_LWSP) && + !fts_header_has_language((*argp)->hdr_field_name)) { + /* use only the data-language */ + lang = fts_user_get_data_lang(backend->ns->user); + } + if (lang != NULL) { + /* getting here either in case of bottom language expansion OR + in case of language-less headers ... */ + t_array_init(&langs, 1); + array_push_back(&langs, &lang); + languages = &langs; + } else { + /* ... otherwise getting here in case of top language expansion */ + languages = fts_user_get_all_languages(backend->ns->user); + } + + /* OR together all the different expansions for different languages. + it's enough for one of them to match. */ + or_arg = p_new(pool, struct mail_search_arg, 1); + or_arg->type = SEARCH_OR; + or_arg->match_not = orig_arg->match_not; + or_arg->next = orig_arg->next; + + /* this reduces to one single iteration on top language expansion or + languageless headers */ + array_foreach_elem(languages, lang) { + if (fts_backend_dovecot_tokenize_lang(lang, pool, or_arg, + orig_arg, orig_token, + bottom_language_expansion, + &error) < 0) { + i_error("fts: %s", error); + return -1; + } + } + + if (or_arg->value.subargs == NULL) { + /* We couldn't parse any tokens from the input. + This can happen only in bottom level expansion, + as in top level we grant that expansion always + produces at least a NIL term */ + or_arg->type = SEARCH_ALL; + or_arg->match_not = !or_arg->match_not; + } + *argp = or_arg; + return 0; +} + +static int +fts_search_args_expand_tree(struct fts_backend *backend, pool_t pool, + struct fts_user_language *lang, + bool bottom_language_expansion, + struct mail_search_arg **argp) +{ + int ret; + + for (; *argp != NULL; argp = &(*argp)->next) { + switch ((*argp)->type) { + case SEARCH_OR: + case SEARCH_SUB: + case SEARCH_INTHREAD: + if (fts_search_args_expand_tree(backend, pool, lang, + bottom_language_expansion, + &(*argp)->value.subargs) < 0) + return -1; + break; + case SEARCH_HEADER: + case SEARCH_HEADER_ADDRESS: + case SEARCH_HEADER_COMPRESS_LWSP: + if ((*argp)->value.str[0] == '\0') { + /* we're testing for the existence of + the header */ + break; + } + /* fall through */ + case SEARCH_BODY: + case SEARCH_TEXT: + T_BEGIN { + ret = fts_search_arg_expand(backend, pool, lang, + bottom_language_expansion, + argp); + } T_END; + if (ret < 0) + return -1; + break; + default: + break; + } + } + return 0; +} + +/* Takes in input the whole expression tree, as an implicit AND of argp-list terms. + Replaces the input expression tree with a single OR term, containing one AND + entry for each language, each AND entry containing a copy of the original + argp-list of terms, Then it expands each AND subargs-list according to the language. + + Input: implicit-AND(argp-list) + Output: OR(lang1(AND(argp-list-copy), lang2(AND(argp-list-copy)) ...) */ +static int +fts_search_args_expand_languages(struct fts_backend *backend, pool_t pool, + struct mail_search_arg **argp) +{ + if (*argp == NULL) + return 0; + + /* ensure there is an explicit top node wich has onyl a single term, + be it either an AND or an OR node */ + bool top_is_or = (*argp)->type == SEARCH_OR && (*argp)->next == NULL; + struct mail_search_arg *top_arg; + if (top_is_or) { + /* we already have a single top entry of type OR, reuse it */ + top_arg = (*argp); + } else { + /* create a single top entry of type AND with the original args */ + top_arg = p_new(pool, struct mail_search_arg, 1); + top_arg->value.subargs = (*argp); + } + + /* the top node will be populated from scratch with the language expansions */ + struct mail_search_arg *top_subargs = top_arg->value.subargs; + top_arg->value.subargs = NULL; + + int direct = 0, negated = 0; + for (struct mail_search_arg *arg = top_subargs; arg != NULL; arg = arg->next, ++direct) + if (arg->match_not) ++negated, --direct; + + #define XOR != /* '!=' is the boolean equivalent of bitwise xor '^' */ + + /* likely we might want a simplification that pushes all the negations + toward the root of the node before doing this, rather than the current + one that pushes them toward the leaves ? */ + + /* THIS CASE IS THE GREY ZONE ---------------------------------|______________| */ + bool want_invert = negated == 0 ? FALSE : direct == 0 ? TRUE : negated > direct; + bool invert = want_invert XOR top_arg->match_not; + + if (invert) { + top_arg->type = top_arg->type != SEARCH_OR ? SEARCH_OR : SEARCH_SUB; + for (struct mail_search_arg *arg = top_subargs; arg != NULL; arg = arg->next) + arg->match_not = !arg->match_not; + } + + const ARRAY_TYPE(fts_user_language) *languages = + fts_user_get_all_languages(backend->ns->user); + struct fts_user_language *lang; + array_foreach_elem(languages, lang) { + struct mail_search_arg *lang_arg = p_new(pool, struct mail_search_arg, 1); + lang_arg->type = top_is_or XOR invert ? SEARCH_OR : SEARCH_SUB; + lang_arg->match_not = invert; + lang_arg->value.subargs = mail_search_arg_dup(pool, top_subargs); + + if (fts_search_args_expand_tree(backend, pool, lang, + TOP_LANGUAGE_EXPANSION, + &lang_arg->value.subargs) < 0) + return -1; + + lang_arg->next = top_arg->value.subargs; + top_arg->value.subargs = lang_arg; + } + + *argp = top_arg; + return 0; +} + + +static bool fts_lang_has_stopwords(const struct fts_user_language *lang) +{ + struct fts_filter *filter; + for (filter = lang->filter; filter != NULL; filter = filter->parent) + if (strcmp(filter->class_name, "stopwords") == 0) + return TRUE; + return FALSE; +} + +static bool +fts_search_args_expand_language_top_level(struct fts_backend *backend) +{ + const char *setting = mail_user_plugin_getenv( + backend->ns->user, STOPWORDS_WORKAROUND_KEY); + + if (setting != NULL) { + if (strcasecmp(setting, "no") == 0) return FALSE; + if (strcasecmp(setting, "yes") == 0) return TRUE; + /* otherwise we imply auto */ + } + + struct fts_user_language *lang; + const ARRAY_TYPE(fts_user_language) *languages = + fts_user_get_all_languages(backend->ns->user); + + unsigned int langs_count = 0; + bool stopwords = FALSE; + array_foreach_elem(languages, lang) { + if (strcmp(lang->lang->name, "data") == 0) + continue; + + if (fts_lang_has_stopwords(lang)) + stopwords = TRUE; + + if (stopwords && ++langs_count > 1) + return TRUE; + } + return FALSE; +} + +int fts_search_args_expand(struct fts_backend *backend, + struct mail_search_args *args) +{ + struct mail_search_arg *args_dup, *orig_args = args->args; + + /* don't keep re-expanding every time the search args are used. + this is especially important to avoid an assert-crash in + index_search_result_update_flags(). */ + if (args->fts_expanded) + return 0; + args->fts_expanded = TRUE; + + /* duplicate the args, so if expansion fails we haven't changed + anything */ + args_dup = mail_search_arg_dup(args->pool, args->args); + + if (fts_search_args_expand_language_top_level(backend)) { + if (fts_search_args_expand_languages( + backend, args->pool, &args_dup) < 0) + return -1; + } else { + if (fts_search_args_expand_tree( + backend, args->pool, NULL, BOTTOM_LANGUAGE_EXPANSION, + &args_dup) < 0) + return -1; + } + + /* we'll need to re-simplify the args if we changed anything */ + args->simplified = FALSE; + args->args = args_dup; + mail_search_args_simplify(args); + + /* duplicated args aren't initialized */ + i_assert(args->init_refcount > 0); + mail_search_arg_init(args, args_dup); + mail_search_arg_deinit(orig_args); + return 0; +} |