summaryrefslogtreecommitdiffstats
path: root/src/plugins/fts/fts-search-args.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 17:36:47 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 17:36:47 +0000
commit0441d265f2bb9da249c7abf333f0f771fadb4ab5 (patch)
tree3f3789daa2f6db22da6e55e92bee0062a7d613fe /src/plugins/fts/fts-search-args.c
parentInitial commit. (diff)
downloaddovecot-0441d265f2bb9da249c7abf333f0f771fadb4ab5.tar.xz
dovecot-0441d265f2bb9da249c7abf333f0f771fadb4ab5.zip
Adding upstream version 1:2.3.21+dfsg1.upstream/1%2.3.21+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/plugins/fts/fts-search-args.c')
-rw-r--r--src/plugins/fts/fts-search-args.c426
1 files changed, 426 insertions, 0 deletions
diff --git a/src/plugins/fts/fts-search-args.c b/src/plugins/fts/fts-search-args.c
new file mode 100644
index 0000000..6b83573
--- /dev/null
+++ b/src/plugins/fts/fts-search-args.c
@@ -0,0 +1,426 @@
+/* Copyright (c) 2015-2018 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "array.h"
+#include "mail-namespace.h"
+#include "mail-search.h"
+#include "fts-api-private.h"
+#include "fts-tokenizer.h"
+#include "fts-filter-private.h"
+#include "fts-user.h"
+#include "fts-search-args.h"
+#include "fts-language.h"
+
+#define STOPWORDS_WORKAROUND_KEY "fts_stopwords_workaround"
+
+static void strings_deduplicate(ARRAY_TYPE(const_string) *arr)
+{
+ const char *const *strings;
+ unsigned int i, count;
+
+ strings = array_get(arr, &count);
+ for (i = 1; i < count; ) {
+ if (strcmp(strings[i-1], strings[i]) == 0) {
+ array_delete(arr, i, 1);
+ strings = array_get(arr, &count);
+ } else {
+ i++;
+ }
+ }
+}
+
+static struct mail_search_arg *
+fts_search_arg_create_or(const struct mail_search_arg *orig_arg, pool_t pool,
+ const ARRAY_TYPE(const_string) *tokens)
+{
+ struct mail_search_arg *arg, *or_arg, **argp;
+ const char *token;
+
+ /* create the OR arg first as the parent */
+ or_arg = p_new(pool, struct mail_search_arg, 1);
+ or_arg->type = SEARCH_OR;
+
+ /* now create all the child args for the OR */
+ argp = &or_arg->value.subargs;
+ array_foreach_elem(tokens, token) {
+ arg = p_new(pool, struct mail_search_arg, 1);
+ *arg = *orig_arg;
+ arg->match_not = FALSE; /* we copied this to the root OR */
+ arg->next = NULL;
+ arg->value.str = p_strdup(pool, token);
+
+ *argp = arg;
+ argp = &arg->next;
+ }
+ return or_arg;
+}
+
+static int
+fts_backend_dovecot_expand_tokens(struct fts_filter *filter,
+ pool_t pool,
+ struct mail_search_arg *parent_arg,
+ const struct mail_search_arg *orig_arg,
+ const char *orig_token, const char *token,
+ const char **error_r)
+{
+ struct mail_search_arg *arg;
+ ARRAY_TYPE(const_string) tokens;
+ const char *token2, *error;
+ int ret;
+
+ t_array_init(&tokens, 4);
+ /* first add the word exactly as it without any tokenization */
+ array_push_back(&tokens, &orig_token);
+ /* then add it tokenized, but without filtering */
+ array_push_back(&tokens, &token);
+
+ /* add the word filtered */
+ if (filter != NULL) {
+ token2 = t_strdup(token);
+ ret = fts_filter_filter(filter, &token2, &error);
+ if (ret > 0) {
+ token2 = t_strdup(token2);
+ array_push_back(&tokens, &token2);
+ } else if (ret < 0) {
+ *error_r = t_strdup_printf("Couldn't filter search token: %s", error);
+ return -1;
+ } else {
+ /* The filter dropped the token, which means it was
+ never even indexed. Ignore this word entirely in the
+ search query. */
+ return 0;
+ }
+ }
+ array_sort(&tokens, i_strcmp_p);
+ strings_deduplicate(&tokens);
+
+ arg = fts_search_arg_create_or(orig_arg, pool, &tokens);
+ arg->next = parent_arg->value.subargs;
+ parent_arg->value.subargs = arg;
+ return 0;
+}
+
+#define BOTTOM_LANGUAGE_EXPANSION TRUE
+#define TOP_LANGUAGE_EXPANSION FALSE
+
+static int
+fts_backend_dovecot_tokenize_lang(struct fts_user_language *user_lang,
+ pool_t pool, struct mail_search_arg *or_arg,
+ struct mail_search_arg *orig_arg,
+ const char *orig_token,
+ bool bottom_language_expansion,
+ const char **error_r)
+{
+ size_t orig_token_len = strlen(orig_token);
+ struct mail_search_arg *and_arg, *orig_or_args = or_arg->value.subargs;
+ const char *token, *error;
+ int ret;
+
+ /* we want all the tokens found from the string to be found, so create
+ a parent AND and place all the filtered token alternatives under
+ it */
+ and_arg = p_new(pool, struct mail_search_arg, 1);
+ and_arg->type = SEARCH_SUB;
+ and_arg->next = orig_or_args;
+ or_arg->value.subargs = and_arg;
+
+ /* reset tokenizer between search args in case there's any state left
+ from some previous failure */
+ fts_tokenizer_reset(user_lang->search_tokenizer);
+ while ((ret = fts_tokenizer_next(user_lang->search_tokenizer,
+ (const void *)orig_token,
+ orig_token_len, &token, &error)) > 0) {
+ if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool,
+ and_arg, orig_arg, orig_token,
+ token, error_r) < 0)
+ return -1;
+ }
+ while (ret >= 0 &&
+ (ret = fts_tokenizer_final(user_lang->search_tokenizer, &token, &error)) > 0) {
+ if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool,
+ and_arg, orig_arg, orig_token,
+ token, error_r) < 0)
+ return -1;
+ }
+ if (ret < 0) {
+ *error_r = t_strdup_printf("Couldn't tokenize search args: %s", error);
+ return -1;
+ }
+ if (and_arg->value.subargs == NULL) {
+ if (bottom_language_expansion) {
+ /* remove this empty term entirely */
+ or_arg->value.subargs = orig_or_args;
+ } else {
+ /* The simplifier will propagate the NIL to the
+ upper operators, if required, and remove it at
+ the appropriate level */
+ and_arg->type = SEARCH_NIL;
+ }
+ }
+ return 0;
+}
+
+static int fts_search_arg_expand(struct fts_backend *backend, pool_t pool,
+ struct fts_user_language *lang,
+ bool bottom_language_expansion,
+ struct mail_search_arg **argp)
+{
+ const ARRAY_TYPE(fts_user_language) *languages;
+ ARRAY_TYPE(fts_user_language) langs;
+ struct mail_search_arg *or_arg, *orig_arg = *argp;
+ const char *error, *orig_token = orig_arg->value.str;
+
+ /* If we are invoked with no lang (null), we are operating in a bottom
+ language expansion, which is iterated here. In this case we also expect
+ to be removing NILs terms.
+
+ Otherwise, if we are invoked with a specific lang, we are working in
+ a top level language expansion (done above us), and we do NOT want to
+ remove the NIL terms. */
+ i_assert(bottom_language_expansion == (lang == NULL));
+
+ if (((*argp)->type == SEARCH_HEADER ||
+ (*argp)->type == SEARCH_HEADER_ADDRESS ||
+ (*argp)->type == SEARCH_HEADER_COMPRESS_LWSP) &&
+ !fts_header_has_language((*argp)->hdr_field_name)) {
+ /* use only the data-language */
+ lang = fts_user_get_data_lang(backend->ns->user);
+ }
+ if (lang != NULL) {
+ /* getting here either in case of bottom language expansion OR
+ in case of language-less headers ... */
+ t_array_init(&langs, 1);
+ array_push_back(&langs, &lang);
+ languages = &langs;
+ } else {
+ /* ... otherwise getting here in case of top language expansion */
+ languages = fts_user_get_all_languages(backend->ns->user);
+ }
+
+ /* OR together all the different expansions for different languages.
+ it's enough for one of them to match. */
+ or_arg = p_new(pool, struct mail_search_arg, 1);
+ or_arg->type = SEARCH_OR;
+ or_arg->match_not = orig_arg->match_not;
+ or_arg->next = orig_arg->next;
+
+ /* this reduces to one single iteration on top language expansion or
+ languageless headers */
+ array_foreach_elem(languages, lang) {
+ if (fts_backend_dovecot_tokenize_lang(lang, pool, or_arg,
+ orig_arg, orig_token,
+ bottom_language_expansion,
+ &error) < 0) {
+ i_error("fts: %s", error);
+ return -1;
+ }
+ }
+
+ if (or_arg->value.subargs == NULL) {
+ /* We couldn't parse any tokens from the input.
+ This can happen only in bottom level expansion,
+ as in top level we grant that expansion always
+ produces at least a NIL term */
+ or_arg->type = SEARCH_ALL;
+ or_arg->match_not = !or_arg->match_not;
+ }
+ *argp = or_arg;
+ return 0;
+}
+
+static int
+fts_search_args_expand_tree(struct fts_backend *backend, pool_t pool,
+ struct fts_user_language *lang,
+ bool bottom_language_expansion,
+ struct mail_search_arg **argp)
+{
+ int ret;
+
+ for (; *argp != NULL; argp = &(*argp)->next) {
+ switch ((*argp)->type) {
+ case SEARCH_OR:
+ case SEARCH_SUB:
+ case SEARCH_INTHREAD:
+ if (fts_search_args_expand_tree(backend, pool, lang,
+ bottom_language_expansion,
+ &(*argp)->value.subargs) < 0)
+ return -1;
+ break;
+ case SEARCH_HEADER:
+ case SEARCH_HEADER_ADDRESS:
+ case SEARCH_HEADER_COMPRESS_LWSP:
+ if ((*argp)->value.str[0] == '\0') {
+ /* we're testing for the existence of
+ the header */
+ break;
+ }
+ /* fall through */
+ case SEARCH_BODY:
+ case SEARCH_TEXT:
+ T_BEGIN {
+ ret = fts_search_arg_expand(backend, pool, lang,
+ bottom_language_expansion,
+ argp);
+ } T_END;
+ if (ret < 0)
+ return -1;
+ break;
+ default:
+ break;
+ }
+ }
+ return 0;
+}
+
+/* Takes in input the whole expression tree, as an implicit AND of argp-list terms.
+ Replaces the input expression tree with a single OR term, containing one AND
+ entry for each language, each AND entry containing a copy of the original
+ argp-list of terms, Then it expands each AND subargs-list according to the language.
+
+ Input: implicit-AND(argp-list)
+ Output: OR(lang1(AND(argp-list-copy), lang2(AND(argp-list-copy)) ...) */
+static int
+fts_search_args_expand_languages(struct fts_backend *backend, pool_t pool,
+ struct mail_search_arg **argp)
+{
+ if (*argp == NULL)
+ return 0;
+
+ /* ensure there is an explicit top node wich has onyl a single term,
+ be it either an AND or an OR node */
+ bool top_is_or = (*argp)->type == SEARCH_OR && (*argp)->next == NULL;
+ struct mail_search_arg *top_arg;
+ if (top_is_or) {
+ /* we already have a single top entry of type OR, reuse it */
+ top_arg = (*argp);
+ } else {
+ /* create a single top entry of type AND with the original args */
+ top_arg = p_new(pool, struct mail_search_arg, 1);
+ top_arg->value.subargs = (*argp);
+ }
+
+ /* the top node will be populated from scratch with the language expansions */
+ struct mail_search_arg *top_subargs = top_arg->value.subargs;
+ top_arg->value.subargs = NULL;
+
+ int direct = 0, negated = 0;
+ for (struct mail_search_arg *arg = top_subargs; arg != NULL; arg = arg->next, ++direct)
+ if (arg->match_not) ++negated, --direct;
+
+ #define XOR != /* '!=' is the boolean equivalent of bitwise xor '^' */
+
+ /* likely we might want a simplification that pushes all the negations
+ toward the root of the node before doing this, rather than the current
+ one that pushes them toward the leaves ? */
+
+ /* THIS CASE IS THE GREY ZONE ---------------------------------|______________| */
+ bool want_invert = negated == 0 ? FALSE : direct == 0 ? TRUE : negated > direct;
+ bool invert = want_invert XOR top_arg->match_not;
+
+ if (invert) {
+ top_arg->type = top_arg->type != SEARCH_OR ? SEARCH_OR : SEARCH_SUB;
+ for (struct mail_search_arg *arg = top_subargs; arg != NULL; arg = arg->next)
+ arg->match_not = !arg->match_not;
+ }
+
+ const ARRAY_TYPE(fts_user_language) *languages =
+ fts_user_get_all_languages(backend->ns->user);
+ struct fts_user_language *lang;
+ array_foreach_elem(languages, lang) {
+ struct mail_search_arg *lang_arg = p_new(pool, struct mail_search_arg, 1);
+ lang_arg->type = top_is_or XOR invert ? SEARCH_OR : SEARCH_SUB;
+ lang_arg->match_not = invert;
+ lang_arg->value.subargs = mail_search_arg_dup(pool, top_subargs);
+
+ if (fts_search_args_expand_tree(backend, pool, lang,
+ TOP_LANGUAGE_EXPANSION,
+ &lang_arg->value.subargs) < 0)
+ return -1;
+
+ lang_arg->next = top_arg->value.subargs;
+ top_arg->value.subargs = lang_arg;
+ }
+
+ *argp = top_arg;
+ return 0;
+}
+
+
+static bool fts_lang_has_stopwords(const struct fts_user_language *lang)
+{
+ struct fts_filter *filter;
+ for (filter = lang->filter; filter != NULL; filter = filter->parent)
+ if (strcmp(filter->class_name, "stopwords") == 0)
+ return TRUE;
+ return FALSE;
+}
+
+static bool
+fts_search_args_expand_language_top_level(struct fts_backend *backend)
+{
+ const char *setting = mail_user_plugin_getenv(
+ backend->ns->user, STOPWORDS_WORKAROUND_KEY);
+
+ if (setting != NULL) {
+ if (strcasecmp(setting, "no") == 0) return FALSE;
+ if (strcasecmp(setting, "yes") == 0) return TRUE;
+ /* otherwise we imply auto */
+ }
+
+ struct fts_user_language *lang;
+ const ARRAY_TYPE(fts_user_language) *languages =
+ fts_user_get_all_languages(backend->ns->user);
+
+ unsigned int langs_count = 0;
+ bool stopwords = FALSE;
+ array_foreach_elem(languages, lang) {
+ if (strcmp(lang->lang->name, "data") == 0)
+ continue;
+
+ if (fts_lang_has_stopwords(lang))
+ stopwords = TRUE;
+
+ if (stopwords && ++langs_count > 1)
+ return TRUE;
+ }
+ return FALSE;
+}
+
+int fts_search_args_expand(struct fts_backend *backend,
+ struct mail_search_args *args)
+{
+ struct mail_search_arg *args_dup, *orig_args = args->args;
+
+ /* don't keep re-expanding every time the search args are used.
+ this is especially important to avoid an assert-crash in
+ index_search_result_update_flags(). */
+ if (args->fts_expanded)
+ return 0;
+ args->fts_expanded = TRUE;
+
+ /* duplicate the args, so if expansion fails we haven't changed
+ anything */
+ args_dup = mail_search_arg_dup(args->pool, args->args);
+
+ if (fts_search_args_expand_language_top_level(backend)) {
+ if (fts_search_args_expand_languages(
+ backend, args->pool, &args_dup) < 0)
+ return -1;
+ } else {
+ if (fts_search_args_expand_tree(
+ backend, args->pool, NULL, BOTTOM_LANGUAGE_EXPANSION,
+ &args_dup) < 0)
+ return -1;
+ }
+
+ /* we'll need to re-simplify the args if we changed anything */
+ args->simplified = FALSE;
+ args->args = args_dup;
+ mail_search_args_simplify(args);
+
+ /* duplicated args aren't initialized */
+ i_assert(args->init_refcount > 0);
+ mail_search_arg_init(args, args_dup);
+ mail_search_arg_deinit(orig_args);
+ return 0;
+}