/* Copyright (c) 2015-2018 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "array.h"
#include "mail-namespace.h"
#include "mail-search.h"
#include "fts-api-private.h"
#include "fts-tokenizer.h"
#include "fts-filter-private.h"
#include "fts-user.h"
#include "fts-search-args.h"
#include "fts-language.h"

#define STOPWORDS_WORKAROUND_KEY "fts_stopwords_workaround"

static void strings_deduplicate(ARRAY_TYPE(const_string) *arr)
{
	const char *const *strings;
	unsigned int i, count;

	strings = array_get(arr, &count);
	for (i = 1; i < count; ) {
		if (strcmp(strings[i-1], strings[i]) == 0) {
			array_delete(arr, i, 1);
			strings = array_get(arr, &count);
		} else {
			i++;
		}
	}
}

static struct mail_search_arg *
fts_search_arg_create_or(const struct mail_search_arg *orig_arg, pool_t pool,
			 const ARRAY_TYPE(const_string) *tokens)
{
	struct mail_search_arg *arg, *or_arg, **argp;
	const char *token;

	/* create the OR arg first as the parent */
	or_arg = p_new(pool, struct mail_search_arg, 1);
	or_arg->type = SEARCH_OR;

	/* now create all the child args for the OR */
	argp = &or_arg->value.subargs;
	array_foreach_elem(tokens, token) {
		arg = p_new(pool, struct mail_search_arg, 1);
		*arg = *orig_arg;
		arg->match_not = FALSE; /* we copied this to the root OR */
		arg->next = NULL;
		arg->value.str = p_strdup(pool, token);

		*argp = arg;
		argp = &arg->next;
	}
	return or_arg;
}

static int
fts_backend_dovecot_expand_tokens(struct fts_filter *filter,
				  pool_t pool,
				  struct mail_search_arg *parent_arg,
				  const struct mail_search_arg *orig_arg,
				  const char *orig_token, const char *token,
				  const char **error_r)
{
	struct mail_search_arg *arg;
	ARRAY_TYPE(const_string) tokens;
	const char *token2, *error;
	int ret;

	t_array_init(&tokens, 4);
	/* first add the word exactly as it without any tokenization */
	array_push_back(&tokens, &orig_token);
	/* then add it tokenized, but without filtering */
	array_push_back(&tokens, &token);

	/* add the word filtered */
	if (filter != NULL) {
		token2 = t_strdup(token);
		ret = fts_filter_filter(filter, &token2, &error);
		if (ret > 0) {
			token2 = t_strdup(token2);
			array_push_back(&tokens, &token2);
		} else if (ret < 0) {
			*error_r = t_strdup_printf("Couldn't filter search token: %s", error);
			return -1;
		} else {
			/* The filter dropped the token, which means it was
			   never even indexed. Ignore this word entirely in the
			   search query. */
			return 0;
		}
	}
	array_sort(&tokens, i_strcmp_p);
	strings_deduplicate(&tokens);

	arg = fts_search_arg_create_or(orig_arg, pool, &tokens);
	arg->next = parent_arg->value.subargs;
	parent_arg->value.subargs = arg;
	return 0;
}

#define BOTTOM_LANGUAGE_EXPANSION TRUE
#define TOP_LANGUAGE_EXPANSION FALSE

static int
fts_backend_dovecot_tokenize_lang(struct fts_user_language *user_lang,
				  pool_t pool, struct mail_search_arg *or_arg,
				  struct mail_search_arg *orig_arg,
				  const char *orig_token,
				  bool bottom_language_expansion,
				  const char **error_r)
{
	size_t orig_token_len = strlen(orig_token);
	struct mail_search_arg *and_arg, *orig_or_args = or_arg->value.subargs;
	const char *token, *error;
	int ret;

	/* we want all the tokens found from the string to be found, so create
	   a parent AND and place all the filtered token alternatives under
	   it */
	and_arg = p_new(pool, struct mail_search_arg, 1);
	and_arg->type = SEARCH_SUB;
	and_arg->next = orig_or_args;
	or_arg->value.subargs = and_arg;

	/* reset tokenizer between search args in case there's any state left
	   from some previous failure */
	fts_tokenizer_reset(user_lang->search_tokenizer);
	while ((ret = fts_tokenizer_next(user_lang->search_tokenizer,
					 (const void *)orig_token,
					 orig_token_len, &token, &error)) > 0) {
		if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool,
						      and_arg, orig_arg, orig_token,
						      token, error_r) < 0)
			return -1;
	}
	while (ret >= 0 &&
	       (ret = fts_tokenizer_final(user_lang->search_tokenizer, &token, &error)) > 0) {
		if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool,
						      and_arg, orig_arg, orig_token,
						      token, error_r) < 0)
			return -1;
	}
	if (ret < 0) {
		*error_r = t_strdup_printf("Couldn't tokenize search args: %s", error);
		return -1;
	}
	if (and_arg->value.subargs == NULL) {
		if (bottom_language_expansion) {
			/* remove this empty term entirely */
			or_arg->value.subargs = orig_or_args;
		} else {
			/* The simplifier will propagate the NIL to the
			   upper operators, if required, and remove it at
			   the appropriate level */
			and_arg->type = SEARCH_NIL;
		}
	}
	return 0;
}

static int fts_search_arg_expand(struct fts_backend *backend, pool_t pool,
				 struct fts_user_language *lang,
				 bool bottom_language_expansion,
				 struct mail_search_arg **argp)
{
	const ARRAY_TYPE(fts_user_language) *languages;
	ARRAY_TYPE(fts_user_language) langs;
	struct mail_search_arg *or_arg, *orig_arg = *argp;
	const char *error, *orig_token = orig_arg->value.str;

	/* If we are invoked with no lang (null), we are operating in a bottom
	   language expansion, which is iterated here. In this case we also expect
	   to be removing NILs terms.

	   Otherwise, if we are invoked with a specific lang, we are working in
	   a top level language expansion (done above us), and we do NOT want to
	   remove the NIL terms. */
	i_assert(bottom_language_expansion == (lang == NULL));

	if (((*argp)->type == SEARCH_HEADER ||
	     (*argp)->type == SEARCH_HEADER_ADDRESS ||
	     (*argp)->type == SEARCH_HEADER_COMPRESS_LWSP) &&
	    !fts_header_has_language((*argp)->hdr_field_name)) {
		/* use only the data-language */
		lang = fts_user_get_data_lang(backend->ns->user);
	}
	if (lang != NULL) {
		/* getting here either in case of bottom language expansion OR
		   in case of language-less headers ... */
		t_array_init(&langs, 1);
		array_push_back(&langs, &lang);
		languages = &langs;
	} else {
		/* ... otherwise getting here in case of top language expansion */
		languages = fts_user_get_all_languages(backend->ns->user);
	}

	/* OR together all the different expansions for different languages.
	   it's enough for one of them to match. */
	or_arg = p_new(pool, struct mail_search_arg, 1);
	or_arg->type = SEARCH_OR;
	or_arg->match_not = orig_arg->match_not;
	or_arg->next = orig_arg->next;

	/* this reduces to one single iteration on top language expansion or
	   languageless headers */
	array_foreach_elem(languages, lang) {
		if (fts_backend_dovecot_tokenize_lang(lang, pool, or_arg,
						      orig_arg, orig_token,
						      bottom_language_expansion,
						      &error) < 0) {
			i_error("fts: %s", error);
			return -1;
		}
	}

	if (or_arg->value.subargs == NULL) {
		/* We couldn't parse any tokens from the input.
		   This can happen only in bottom level expansion,
		   as in top level we grant that expansion always
		   produces at least a NIL term */
		or_arg->type = SEARCH_ALL;
		or_arg->match_not = !or_arg->match_not;
	}
	*argp = or_arg;
	return 0;
}

static int
fts_search_args_expand_tree(struct fts_backend *backend, pool_t pool,
			    struct fts_user_language *lang,
			    bool bottom_language_expansion,
			    struct mail_search_arg **argp)
{
	int ret;

	for (; *argp != NULL; argp = &(*argp)->next) {
		switch ((*argp)->type) {
		case SEARCH_OR:
		case SEARCH_SUB:
		case SEARCH_INTHREAD:
			if (fts_search_args_expand_tree(backend, pool, lang,
							bottom_language_expansion,
							&(*argp)->value.subargs) < 0)
				return -1;
			break;
		case SEARCH_HEADER:
		case SEARCH_HEADER_ADDRESS:
		case SEARCH_HEADER_COMPRESS_LWSP:
			if ((*argp)->value.str[0] == '\0') {
				/* we're testing for the existence of
				   the header */
				break;
			}
			/* fall through */
		case SEARCH_BODY:
		case SEARCH_TEXT:
			T_BEGIN {
				ret = fts_search_arg_expand(backend, pool, lang,
							    bottom_language_expansion,
							    argp);
			} T_END;
			if (ret < 0)
				return -1;
			break;
		default:
			break;
		}
	}
	return 0;
}

/* Takes in input the whole expression tree, as an implicit AND of argp-list terms.
   Replaces the input expression tree with a single OR term, containing one AND
   entry for each language, each AND entry containing a copy of the original
   argp-list of terms, Then it expands each AND subargs-list according to the language.

   Input: implicit-AND(argp-list)
   Output: OR(lang1(AND(argp-list-copy), lang2(AND(argp-list-copy)) ...) */
static int
fts_search_args_expand_languages(struct fts_backend *backend, pool_t pool,
			         struct mail_search_arg **argp)
{
	if (*argp == NULL)
		return 0;

	/* ensure there is an explicit top node wich has onyl a single term,
	   be it either an AND or an OR node */
	bool top_is_or = (*argp)->type == SEARCH_OR && (*argp)->next == NULL;
	struct mail_search_arg *top_arg;
	if (top_is_or) {
		/* we already have a single top entry of type OR, reuse it */
		top_arg = (*argp);
	} else {
		/* create a single top entry of type AND with the original args */
		top_arg = p_new(pool, struct mail_search_arg, 1);
		top_arg->value.subargs = (*argp);
	}

	/* the top node will be populated from scratch with the language expansions */
	struct mail_search_arg *top_subargs = top_arg->value.subargs;
	top_arg->value.subargs = NULL;

	int direct = 0, negated = 0;
	for (struct mail_search_arg *arg = top_subargs; arg != NULL; arg = arg->next, ++direct)
		if (arg->match_not) ++negated, --direct;

	#define XOR != /* '!=' is the boolean equivalent of bitwise xor '^' */

	/* likely we might want a simplification that pushes all the negations
	   toward the root of the node before doing this, rather than the current
	   one that pushes them toward the leaves ? */

	/* THIS CASE IS THE GREY ZONE ---------------------------------|______________| */
	bool want_invert = negated == 0 ? FALSE : direct == 0 ? TRUE : negated > direct;
	bool invert = want_invert XOR top_arg->match_not;

	if (invert) {
		top_arg->type = top_arg->type != SEARCH_OR ? SEARCH_OR : SEARCH_SUB;
		for (struct mail_search_arg *arg = top_subargs; arg != NULL; arg = arg->next)
			arg->match_not = !arg->match_not;
	}

	const ARRAY_TYPE(fts_user_language) *languages =
		fts_user_get_all_languages(backend->ns->user);
	struct fts_user_language *lang;
	array_foreach_elem(languages, lang) {
		struct mail_search_arg *lang_arg = p_new(pool, struct mail_search_arg, 1);
		lang_arg->type = top_is_or XOR invert ? SEARCH_OR : SEARCH_SUB;
		lang_arg->match_not = invert;
		lang_arg->value.subargs = mail_search_arg_dup(pool, top_subargs);

		if (fts_search_args_expand_tree(backend, pool, lang,
						TOP_LANGUAGE_EXPANSION,
						&lang_arg->value.subargs) < 0)
			return -1;

		lang_arg->next = top_arg->value.subargs;
		top_arg->value.subargs = lang_arg;
	}

	*argp = top_arg;
	return 0;
}


static bool fts_lang_has_stopwords(const struct fts_user_language *lang)
{
	struct fts_filter *filter;
	for (filter = lang->filter; filter != NULL; filter = filter->parent)
		if (strcmp(filter->class_name, "stopwords") == 0)
			return TRUE;
	return FALSE;
}

static bool
fts_search_args_expand_language_top_level(struct fts_backend *backend)
{
	const char *setting = mail_user_plugin_getenv(
		backend->ns->user, STOPWORDS_WORKAROUND_KEY);

	if (setting != NULL) {
		if (strcasecmp(setting, "no") == 0) return FALSE;
		if (strcasecmp(setting, "yes") == 0) return TRUE;
		/* otherwise we imply auto */
	}

	struct fts_user_language *lang;
	const ARRAY_TYPE(fts_user_language) *languages =
		fts_user_get_all_languages(backend->ns->user);

	unsigned int langs_count = 0;
	bool stopwords = FALSE;
	array_foreach_elem(languages, lang) {
		if (strcmp(lang->lang->name, "data") == 0)
			continue;

		if (fts_lang_has_stopwords(lang))
			stopwords = TRUE;

		if (stopwords && ++langs_count > 1)
			return TRUE;
	}
	return FALSE;
}

int fts_search_args_expand(struct fts_backend *backend,
			   struct mail_search_args *args)
{
	struct mail_search_arg *args_dup, *orig_args = args->args;

	/* don't keep re-expanding every time the search args are used.
	   this is especially important to avoid an assert-crash in
	   index_search_result_update_flags(). */
	if (args->fts_expanded)
		return 0;
	args->fts_expanded = TRUE;

	/* duplicate the args, so if expansion fails we haven't changed
	   anything */
	args_dup = mail_search_arg_dup(args->pool, args->args);

	if (fts_search_args_expand_language_top_level(backend)) {
		if (fts_search_args_expand_languages(
			backend, args->pool, &args_dup) < 0)
			return -1;
	} else {
		if (fts_search_args_expand_tree(
			backend, args->pool, NULL, BOTTOM_LANGUAGE_EXPANSION,
			&args_dup) < 0)
			return -1;
	}

	/* we'll need to re-simplify the args if we changed anything */
	args->simplified = FALSE;
	args->args = args_dup;
	mail_search_args_simplify(args);

	/* duplicated args aren't initialized */
	i_assert(args->init_refcount > 0);
	mail_search_arg_init(args, args_dup);
	mail_search_arg_deinit(orig_args);
	return 0;
}