summaryrefslogtreecommitdiffstats
path: root/src/plugins/fts-lucene/lucene-wrapper.cc
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 09:51:24 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 09:51:24 +0000
commitf7548d6d28c313cf80e6f3ef89aed16a19815df1 (patch)
treea3f6f2a3f247293bee59ecd28e8cd8ceb6ca064a /src/plugins/fts-lucene/lucene-wrapper.cc
parentInitial commit. (diff)
downloaddovecot-upstream.tar.xz
dovecot-upstream.zip
Adding upstream version 1:2.3.19.1+dfsg1.upstream/1%2.3.19.1+dfsg1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/plugins/fts-lucene/lucene-wrapper.cc')
-rw-r--r--src/plugins/fts-lucene/lucene-wrapper.cc1639
1 files changed, 1639 insertions, 0 deletions
diff --git a/src/plugins/fts-lucene/lucene-wrapper.cc b/src/plugins/fts-lucene/lucene-wrapper.cc
new file mode 100644
index 0000000..7446693
--- /dev/null
+++ b/src/plugins/fts-lucene/lucene-wrapper.cc
@@ -0,0 +1,1639 @@
+/* Copyright (c) 2006-2018 Dovecot authors, see the included COPYING file */
+
+extern "C" {
+#include "lib.h"
+#include "array.h"
+#include "unichar.h"
+#include "hash.h"
+#include "hex-binary.h"
+#include "ioloop.h"
+#include "unlink-directory.h"
+#include "ioloop.h"
+#include "mail-index.h"
+#include "mail-search.h"
+#include "mail-namespace.h"
+#include "mailbox-list-private.h"
+#include "mail-storage.h"
+#include "fts-expunge-log.h"
+#include "fts-lucene-plugin.h"
+#include "lucene-wrapper.h"
+
+#include <sys/stat.h>
+#ifdef HAVE_LIBEXTTEXTCAT_TEXTCAT_H
+# include <libexttextcat/textcat.h>
+#elif defined (HAVE_LIBTEXTCAT_TEXTCAT_H)
+# include <libtextcat/textcat.h>
+#elif defined (HAVE_FTS_TEXTCAT)
+# include <textcat.h>
+#endif
+};
+#include <CLucene.h>
+#include <CLucene/util/CLStreams.h>
+#include <CLucene/search/MultiPhraseQuery.h>
+#include "SnowballAnalyzer.h"
+
+/* Lucene's default is 10000. Use it here also.. */
+#define MAX_TERMS_PER_DOCUMENT 10000
+#define FTS_LUCENE_MAX_SEARCH_TERMS 1000
+
+#define LUCENE_LOCK_OVERRIDE_SECS 60
+#define LUCENE_INDEX_CLOSE_TIMEOUT_MSECS (120*1000)
+
+using namespace lucene::document;
+using namespace lucene::index;
+using namespace lucene::search;
+using namespace lucene::queryParser;
+using namespace lucene::analysis;
+using namespace lucene::analysis;
+using namespace lucene::util;
+
+struct lucene_query {
+ Query *query;
+ BooleanClause::Occur occur;
+};
+ARRAY_DEFINE_TYPE(lucene_query, struct lucene_query);
+
+struct lucene_analyzer {
+ char *lang;
+ Analyzer *analyzer;
+};
+
+struct lucene_index {
+ char *path;
+ struct mailbox_list *list;
+ struct fts_lucene_settings set;
+ normalizer_func_t *normalizer;
+
+ wchar_t mailbox_guid[MAILBOX_GUID_HEX_LENGTH + 1];
+
+ IndexReader *reader;
+ IndexWriter *writer;
+ IndexSearcher *searcher;
+ struct timeout *to_close;
+
+ buffer_t *normalizer_buf;
+ Analyzer *default_analyzer, *cur_analyzer;
+ ARRAY(struct lucene_analyzer) analyzers;
+
+ Document *doc;
+ uint32_t prev_uid, prev_part_idx;
+ bool no_analyzer;
+};
+
+struct rescan_context {
+ struct lucene_index *index;
+
+ struct mailbox *box;
+ guid_128_t box_guid;
+ int box_ret;
+
+ pool_t pool;
+ HASH_TABLE(uint8_t *, uint8_t *) seen_mailbox_guids;
+
+ ARRAY_TYPE(seq_range) uids;
+ struct seq_range_iter uids_iter;
+ unsigned int uids_iter_n;
+
+ uint32_t last_existing_uid;
+ bool warned;
+};
+
+static void *textcat = NULL;
+#ifdef HAVE_FTS_TEXTCAT
+static bool textcat_broken = FALSE;
+#endif
+static int textcat_refcount = 0;
+
+static void lucene_handle_error(struct lucene_index *index, CLuceneError &err,
+ const char *msg);
+static void rescan_clear_unseen_mailboxes(struct lucene_index *index,
+ struct rescan_context *rescan_ctx);
+
+struct lucene_index *lucene_index_init(const char *path,
+ struct mailbox_list *list,
+ const struct fts_lucene_settings *set)
+{
+ struct lucene_index *index;
+
+ index = i_new(struct lucene_index, 1);
+ index->path = i_strdup(path);
+ index->list = list;
+ if (set != NULL) {
+ index->set = *set;
+ index->normalizer = !set->normalize ? NULL :
+ mailbox_list_get_namespace(list)->user->default_normalizer;
+ } else {
+ /* this is valid only for doveadm dump, so it doesn't matter */
+ index->set.default_language = "";
+ }
+ if (index->set.use_libfts) {
+ index->default_analyzer = _CLNEW KeywordAnalyzer();
+ } else
+#ifdef HAVE_FTS_STEMMER
+ if (set == NULL || !set->no_snowball) {
+ index->default_analyzer =
+ _CLNEW snowball::SnowballAnalyzer(index->normalizer,
+ index->set.default_language);
+ } else
+#endif
+ {
+ index->default_analyzer = _CLNEW standard::StandardAnalyzer();
+ if (index->normalizer != NULL) {
+ index->normalizer_buf =
+ buffer_create_dynamic(default_pool, 1024);
+ }
+ }
+
+ i_array_init(&index->analyzers, 32);
+ textcat_refcount++;
+
+ return index;
+}
+
+void lucene_index_close(struct lucene_index *index)
+{
+ timeout_remove(&index->to_close);
+
+ _CLDELETE(index->searcher);
+ if (index->writer != NULL) {
+ try {
+ index->writer->close();
+ } catch (CLuceneError &err) {
+ lucene_handle_error(index, err, "IndexWriter::close");
+ }
+ _CLDELETE(index->writer);
+ }
+ if (index->reader != NULL) {
+ try {
+ index->reader->close();
+ } catch (CLuceneError &err) {
+ lucene_handle_error(index, err, "IndexReader::close");
+ }
+ _CLDELETE(index->reader);
+ }
+}
+
+void lucene_index_deinit(struct lucene_index *index)
+{
+ struct lucene_analyzer *a;
+
+ lucene_index_close(index);
+ array_foreach_modifiable(&index->analyzers, a) {
+ i_free(a->lang);
+ _CLDELETE(a->analyzer);
+ }
+ array_free(&index->analyzers);
+ if (--textcat_refcount == 0 && textcat != NULL) {
+#ifdef HAVE_FTS_TEXTCAT
+ textcat_Done(textcat);
+#endif
+ textcat = NULL;
+ }
+ _CLDELETE(index->default_analyzer);
+ if (index->normalizer_buf != NULL)
+ buffer_free(&index->normalizer_buf);
+ i_free(index->path);
+ i_free(index);
+}
+
+static void lucene_data_translate(struct lucene_index *index,
+ wchar_t *data, unsigned int len)
+{
+ const char *whitespace_chars = index->set.whitespace_chars;
+ unsigned int i;
+
+ if (*whitespace_chars == '\0' || index->set.use_libfts)
+ return;
+
+ for (i = 0; i < len; i++) {
+ if (strchr(whitespace_chars, data[i]) != NULL)
+ data[i] = ' ';
+ }
+}
+
+void lucene_utf8_n_to_tchar(const unsigned char *src, size_t srcsize,
+ wchar_t *dest, size_t destsize)
+{
+ ARRAY_TYPE(unichars) dest_arr;
+ buffer_t buf = { { 0, 0 } };
+
+ i_assert(sizeof(wchar_t) == sizeof(unichar_t));
+
+ buffer_create_from_data(&buf, dest, sizeof(wchar_t) * destsize);
+ array_create_from_buffer(&dest_arr, &buf, sizeof(wchar_t));
+ if (uni_utf8_to_ucs4_n(src, srcsize, &dest_arr) < 0)
+ i_unreached();
+ i_assert(array_count(&dest_arr)+1 == destsize);
+ dest[destsize-1] = 0;
+}
+
+static const wchar_t *
+t_lucene_utf8_to_tchar(struct lucene_index *index, const char *str)
+{
+ ARRAY_TYPE(unichars) dest_arr;
+ const unichar_t *chars;
+ wchar_t *ret;
+ unsigned int len;
+
+ i_assert(sizeof(wchar_t) == sizeof(unichar_t));
+
+ t_array_init(&dest_arr, strlen(str) + 1);
+ if (uni_utf8_to_ucs4(str, &dest_arr) < 0)
+ i_unreached();
+ (void)array_append_space(&dest_arr);
+
+ chars = array_get_modifiable(&dest_arr, &len);
+ ret = (wchar_t *)chars;
+ lucene_data_translate(index, ret, len - 1);
+ return ret;
+}
+
+void lucene_index_select_mailbox(struct lucene_index *index,
+ const wchar_t guid[MAILBOX_GUID_HEX_LENGTH])
+{
+ memcpy(index->mailbox_guid, guid,
+ MAILBOX_GUID_HEX_LENGTH * sizeof(wchar_t));
+ index->mailbox_guid[MAILBOX_GUID_HEX_LENGTH] = '\0';
+}
+
+void lucene_index_unselect_mailbox(struct lucene_index *index)
+{
+ memset(index->mailbox_guid, 0, sizeof(index->mailbox_guid));
+}
+
+static void lucene_handle_error(struct lucene_index *index, CLuceneError &err,
+ const char *msg)
+{
+ const char *error, *what = err.what();
+
+ i_error("lucene index %s: %s failed (#%d): %s",
+ index->path, msg, err.number(), what);
+
+ if (index->list != NULL &&
+ (err.number() == CL_ERR_CorruptIndex ||
+ err.number() == CL_ERR_IO)) {
+ /* delete corrupted index. most IO errors are also about
+ missing files and other such corruption.. */
+ if (unlink_directory(index->path, (enum unlink_directory_flags)0, &error) < 0)
+ i_error("unlink_directory(%s) failed: %s", index->path, error);
+ rescan_clear_unseen_mailboxes(index, NULL);
+ }
+}
+
+static int lucene_index_open(struct lucene_index *index)
+{
+ if (index->reader != NULL) {
+ i_assert(index->to_close != NULL);
+ timeout_reset(index->to_close);
+ return 1;
+ }
+
+ if (!IndexReader::indexExists(index->path))
+ return 0;
+
+ try {
+ index->reader = IndexReader::open(index->path);
+ } catch (CLuceneError &err) {
+ lucene_handle_error(index, err, "IndexReader::open()");
+ return -1;
+ }
+ i_assert(index->to_close == NULL);
+ index->to_close = timeout_add(LUCENE_INDEX_CLOSE_TIMEOUT_MSECS,
+ lucene_index_close, index);
+ return 1;
+}
+
+static int lucene_index_open_search(struct lucene_index *index)
+{
+ int ret;
+
+ if (index->searcher != NULL)
+ return 1;
+
+ if ((ret = lucene_index_open(index)) <= 0)
+ return ret;
+
+ index->searcher = _CLNEW IndexSearcher(index->reader);
+ return 1;
+}
+
+static int
+lucene_doc_get_uid(struct lucene_index *index, Document *doc, uint32_t *uid_r)
+{
+ Field *field = doc->getField(_T("uid"));
+ const TCHAR *uid = field == NULL ? NULL : field->stringValue();
+ if (uid == NULL) {
+ i_error("lucene: Corrupted FTS index %s: No UID for document",
+ index->path);
+ return -1;
+ }
+
+ uint32_t num = 0;
+ while (*uid != 0) {
+ num = num*10 + (*uid - '0');
+ uid++;
+ }
+ *uid_r = num;
+ return 0;
+}
+
+static uint32_t
+lucene_doc_get_part(struct lucene_index *index, Document *doc)
+{
+ Field *field = doc->getField(_T("part"));
+ const TCHAR *part = field == NULL ? NULL : field->stringValue();
+ if (part == NULL)
+ return 0;
+
+ uint32_t num = 0;
+ while (*part != 0) {
+ num = num*10 + (*part - '0');
+ part++;
+ }
+ return num;
+}
+
+int lucene_index_get_last_uid(struct lucene_index *index, uint32_t *last_uid_r)
+{
+ int ret = 0;
+
+ *last_uid_r = 0;
+
+ if ((ret = lucene_index_open_search(index)) <= 0)
+ return ret;
+
+ Term mailbox_term(_T("box"), index->mailbox_guid);
+ TermQuery query(&mailbox_term);
+
+ uint32_t last_uid = 0;
+ try {
+ Hits *hits = index->searcher->search(&query);
+
+ for (size_t i = 0; i < hits->length(); i++) {
+ uint32_t uid;
+
+ if (lucene_doc_get_uid(index, &hits->doc(i),
+ &uid) < 0) {
+ ret = -1;
+ break;
+ }
+
+ if (uid > last_uid)
+ last_uid = uid;
+ }
+ _CLDELETE(hits);
+ } catch (CLuceneError &err) {
+ lucene_handle_error(index, err, "last_uid search");
+ ret = -1;
+ }
+ *last_uid_r = last_uid;
+ return ret;
+}
+
+int lucene_index_get_doc_count(struct lucene_index *index, uint32_t *count_r)
+{
+ int ret;
+
+ if (index->reader == NULL) {
+ lucene_index_close(index);
+ if ((ret = lucene_index_open(index)) < 0)
+ return -1;
+ if (ret == 0) {
+ *count_r = 0;
+ return 0;
+ }
+ }
+ *count_r = index->reader->numDocs();
+ return 0;
+}
+
+static int lucene_settings_check(struct lucene_index *index)
+{
+ uint32_t set_checksum;
+ const char *error;
+ int ret = 0;
+
+ set_checksum = fts_lucene_settings_checksum(&index->set);
+ ret = fts_index_have_compatible_settings(index->list, set_checksum);
+ if (ret != 0)
+ return ret;
+
+ i_warning("fts-lucene: Settings have changed, rebuilding index for mailbox");
+
+ /* settings changed, rebuild index */
+ if (unlink_directory(index->path, (enum unlink_directory_flags)0, &error) < 0) {
+ i_error("unlink_directory(%s) failed: %s", index->path, error);
+ ret = -1;
+ } else {
+ rescan_clear_unseen_mailboxes(index, NULL);
+ }
+ return ret;
+}
+
+int lucene_index_build_init(struct lucene_index *index)
+{
+ const char *lock_path;
+ struct stat st;
+
+ lucene_index_close(index);
+
+ lock_path = t_strdup_printf("%s/write.lock", index->path);
+ if (stat(lock_path, &st) == 0 &&
+ st.st_mtime < time(NULL) - LUCENE_LOCK_OVERRIDE_SECS) {
+ if (unlink(lock_path) < 0)
+ i_error("unlink(%s) failed: %m", lock_path);
+ }
+
+ if (lucene_settings_check(index) < 0)
+ return -1;
+
+ bool exists = IndexReader::indexExists(index->path);
+ try {
+ index->writer = _CLNEW IndexWriter(index->path,
+ index->default_analyzer,
+ !exists);
+ } catch (CLuceneError &err) {
+ lucene_handle_error(index, err, "IndexWriter()");
+ return -1;
+ }
+ index->writer->setMaxFieldLength(MAX_TERMS_PER_DOCUMENT);
+ return 0;
+}
+
+#ifdef HAVE_FTS_TEXTCAT
+static Analyzer *get_analyzer(struct lucene_index *index, const char *lang)
+{
+ normalizer_func_t *normalizer = index->normalizer;
+ const struct lucene_analyzer *a;
+ struct lucene_analyzer new_analyzer;
+ Analyzer *analyzer;
+
+ array_foreach(&index->analyzers, a) {
+ if (strcmp(a->lang, lang) == 0)
+ return a->analyzer;
+ }
+
+ memset(&new_analyzer, 0, sizeof(new_analyzer));
+ new_analyzer.lang = i_strdup(lang);
+ new_analyzer.analyzer =
+ _CLNEW snowball::SnowballAnalyzer(normalizer, lang);
+ array_append_i(&index->analyzers.arr, &new_analyzer, 1);
+ return new_analyzer.analyzer;
+}
+
+static void *textcat_init(struct lucene_index *index)
+{
+ const char *textcat_dir = index->set.textcat_dir;
+ unsigned int len;
+
+ if (textcat_dir == NULL)
+ return NULL;
+
+ /* textcat really wants the '/' suffix */
+ len = strlen(textcat_dir);
+ if (len > 0 && textcat_dir[len-1] != '/')
+ textcat_dir = t_strconcat(textcat_dir, "/", NULL);
+
+ return special_textcat_Init(index->set.textcat_conf, textcat_dir);
+}
+
+static Analyzer *
+guess_analyzer(struct lucene_index *index, const void *data, size_t size)
+{
+ const char *lang;
+
+ if (textcat_broken)
+ return NULL;
+
+ if (textcat == NULL) {
+ textcat = textcat_init(index);
+ if (textcat == NULL) {
+ textcat_broken = TRUE;
+ return NULL;
+ }
+ }
+
+ /* try to guess the language */
+ lang = textcat_Classify(textcat, (const char *)data,
+ I_MIN(size, 500));
+ const char *p = strchr(lang, ']');
+ if (lang[0] != '[' || p == NULL)
+ return NULL;
+ lang = t_strdup_until(lang+1, p);
+ if (strcmp(lang, index->set.default_language) == 0)
+ return index->default_analyzer;
+
+ return get_analyzer(index, lang);
+}
+#else
+static Analyzer *
+guess_analyzer(struct lucene_index *index ATTR_UNUSED,
+ const void *data ATTR_UNUSED, size_t size ATTR_UNUSED)
+{
+ return NULL;
+}
+#endif
+
+static int lucene_index_build_flush(struct lucene_index *index)
+{
+ int ret = 0;
+
+ if (index->doc == NULL)
+ return 0;
+
+ try {
+ CL_NS(analysis)::Analyzer *analyzer = NULL;
+
+ if (!index->set.use_libfts) {
+ analyzer = index->cur_analyzer != NULL ?
+ index->cur_analyzer : index->default_analyzer;
+ }
+ index->writer->addDocument(index->doc, analyzer);
+ } catch (CLuceneError &err) {
+ lucene_handle_error(index, err, "IndexWriter::addDocument()");
+ ret = -1;
+ }
+
+ _CLDELETE(index->doc);
+ index->doc = NULL;
+ index->cur_analyzer = NULL;
+ return ret;
+}
+
+int lucene_index_build_more(struct lucene_index *index, uint32_t uid,
+ uint32_t part_idx, const unsigned char *data,
+ size_t size, const char *hdr_name)
+{
+ wchar_t id[MAX_INT_STRLEN];
+ size_t namesize, datasize;
+
+ if (uid != index->prev_uid || part_idx != index->prev_part_idx) {
+ if (lucene_index_build_flush(index) < 0)
+ return -1;
+ index->prev_uid = uid;
+ index->prev_part_idx = part_idx;
+
+ index->doc = _CLNEW Document();
+ swprintf(id, N_ELEMENTS(id), L"%u", uid);
+ index->doc->add(*_CLNEW Field(_T("uid"), id, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
+ if (part_idx != 0) {
+ swprintf(id, N_ELEMENTS(id), L"%u", part_idx);
+ index->doc->add(*_CLNEW Field(_T("part"), id, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
+ }
+ index->doc->add(*_CLNEW Field(_T("box"), index->mailbox_guid, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
+ }
+
+ if (index->normalizer_buf != NULL && !index->set.use_libfts) {
+ buffer_set_used_size(index->normalizer_buf, 0);
+ index->normalizer(data, size, index->normalizer_buf);
+ data = (const unsigned char *)index->normalizer_buf->data;
+ size = index->normalizer_buf->used;
+ }
+
+ datasize = uni_utf8_strlen_n(data, size) + 1;
+ wchar_t *dest, *dest_free = NULL;
+ if (datasize < 4096)
+ dest = t_new(wchar_t, datasize);
+ else
+ dest = dest_free = i_new(wchar_t, datasize);
+ lucene_utf8_n_to_tchar(data, size, dest, datasize);
+ lucene_data_translate(index, dest, datasize-1);
+
+ int token_flag = index->set.use_libfts ?
+ Field::INDEX_UNTOKENIZED : Field::INDEX_TOKENIZED;
+ if (hdr_name != NULL) {
+ /* hdr_name should be ASCII, but don't break in case it isn't */
+ hdr_name = t_str_lcase(hdr_name);
+ namesize = uni_utf8_strlen(hdr_name) + 1;
+ wchar_t wname[namesize];
+ lucene_utf8_n_to_tchar((const unsigned char *)hdr_name,
+ strlen(hdr_name), wname, namesize);
+ if (!index->set.use_libfts)
+ index->doc->add(*_CLNEW Field(_T("hdr"), wname, Field::STORE_NO | token_flag));
+ index->doc->add(*_CLNEW Field(_T("hdr"), dest, Field::STORE_NO | token_flag));
+
+ if (fts_header_want_indexed(hdr_name))
+ index->doc->add(*_CLNEW Field(wname, dest, Field::STORE_NO | token_flag));
+ } else if (size > 0) {
+ if (index->cur_analyzer == NULL && !index->set.use_libfts)
+ index->cur_analyzer = guess_analyzer(index, data, size);
+ index->doc->add(*_CLNEW Field(_T("body"), dest, Field::STORE_NO | token_flag));
+ }
+ i_free(dest_free);
+ return 0;
+}
+
+int lucene_index_build_deinit(struct lucene_index *index)
+{
+ int ret = 0;
+
+ if (index->prev_uid == 0) {
+ /* no changes. */
+ return 0;
+ }
+ index->prev_uid = 0;
+ index->prev_part_idx = 0;
+
+ if (index->writer == NULL) {
+ lucene_index_close(index);
+ return -1;
+ }
+
+ if (lucene_index_build_flush(index) < 0)
+ ret = -1;
+
+ try {
+ index->writer->close();
+ } catch (CLuceneError &err) {
+ lucene_handle_error(index, err, "IndexWriter::close()");
+ ret = -1;
+ }
+
+ lucene_index_close(index);
+ return ret;
+}
+
+static int
+wcharguid_to_guid(guid_128_t dest, const wchar_t *src)
+{
+ buffer_t buf = { { 0, 0 } };
+ char src_chars[GUID_128_SIZE*2 + 1];
+ unsigned int i;
+
+ for (i = 0; i < sizeof(src_chars)-1; i++) {
+ if ((src[i] >= '0' && src[i] <= '9') ||
+ (src[i] >= 'a' && src[i] <= 'f'))
+ src_chars[i] = src[i];
+ else
+ return -1;
+ }
+ if (src[i] != '\0')
+ return -1;
+ src_chars[i] = '\0';
+
+ buffer_create_from_data(&buf, dest, GUID_128_SIZE);
+ return hex_to_binary(src_chars, &buf);
+}
+
+static int
+rescan_get_uids(struct mailbox *box, ARRAY_TYPE(seq_range) *uids)
+{
+ struct mailbox_status status;
+
+ if (mailbox_get_status(box, STATUS_MESSAGES, &status) < 0)
+ return -1;
+
+ if (status.messages > 0) T_BEGIN {
+ ARRAY_TYPE(seq_range) seqs;
+
+ t_array_init(&seqs, 2);
+ seq_range_array_add_range(&seqs, 1, status.messages);
+ mailbox_get_uid_range(box, &seqs, uids);
+ } T_END;
+ return 0;
+}
+
+static int rescan_finish(struct rescan_context *ctx)
+{
+ int ret;
+
+ ret = fts_index_set_last_uid(ctx->box, ctx->last_existing_uid);
+ mailbox_free(&ctx->box);
+ return ret;
+}
+
+static int
+fts_lucene_get_mailbox_guid(struct lucene_index *index, Document *doc,
+ guid_128_t guid_r)
+{
+ Field *field = doc->getField(_T("box"));
+ const TCHAR *box_guid = field == NULL ? NULL : field->stringValue();
+ if (box_guid == NULL) {
+ i_error("lucene: Corrupted FTS index %s: No mailbox for document",
+ index->path);
+ return -1;
+ }
+
+ if (wcharguid_to_guid(guid_r, box_guid) < 0) {
+ i_error("lucene: Corrupted FTS index %s: "
+ "box field not in expected format", index->path);
+ return -1;
+ }
+ return 0;
+}
+
+static int
+rescan_open_mailbox(struct rescan_context *ctx, Document *doc)
+{
+ guid_128_t guid, *guidp;
+ int ret;
+
+ if (fts_lucene_get_mailbox_guid(ctx->index, doc, guid) < 0)
+ return 0;
+
+ if (memcmp(guid, ctx->box_guid, sizeof(guid)) == 0) {
+ /* same as last one */
+ return ctx->box_ret;
+ }
+ memcpy(ctx->box_guid, guid, sizeof(ctx->box_guid));
+
+ guidp = p_new(ctx->pool, guid_128_t, 1);
+ memcpy(guidp, guid, sizeof(*guidp));
+ hash_table_insert(ctx->seen_mailbox_guids, guidp, guidp);
+
+ if (ctx->box != NULL)
+ rescan_finish(ctx);
+ ctx->box = mailbox_alloc_guid(ctx->index->list, guid,
+ (enum mailbox_flags)0);
+ if (mailbox_open(ctx->box) < 0) {
+ enum mail_error error;
+ const char *errstr;
+
+ errstr = mailbox_get_last_internal_error(ctx->box, &error);
+ if (error == MAIL_ERROR_NOTFOUND)
+ ret = 0;
+ else {
+ i_error("lucene: Couldn't open mailbox %s: %s",
+ mailbox_get_vname(ctx->box), errstr);
+ ret = -1;
+ }
+ mailbox_free(&ctx->box);
+ ctx->box_ret = ret;
+ return ret;
+ }
+ if (mailbox_sync(ctx->box, (enum mailbox_sync_flags)0) < 0) {
+ i_error("lucene: Failed to sync mailbox %s: %s",
+ mailbox_get_vname(ctx->box),
+ mailbox_get_last_internal_error(ctx->box, NULL));
+ mailbox_free(&ctx->box);
+ ctx->box_ret = -1;
+ return -1;
+ }
+
+ array_clear(&ctx->uids);
+ rescan_get_uids(ctx->box, &ctx->uids);
+
+ ctx->warned = FALSE;
+ ctx->last_existing_uid = 0;
+ ctx->uids_iter_n = 0;
+ seq_range_array_iter_init(&ctx->uids_iter, &ctx->uids);
+
+ ctx->box_ret = 1;
+ return 1;
+}
+
+static int
+rescan_next(struct rescan_context *ctx, Document *doc)
+{
+ uint32_t lucene_uid, idx_uid;
+
+ if (lucene_doc_get_uid(ctx->index, doc, &lucene_uid) < 0)
+ return 0;
+
+ if (seq_range_array_iter_nth(&ctx->uids_iter, ctx->uids_iter_n,
+ &idx_uid)) {
+ if (idx_uid == lucene_uid) {
+ ctx->uids_iter_n++;
+ ctx->last_existing_uid = idx_uid;
+ return 1;
+ }
+ if (idx_uid < lucene_uid) {
+ /* lucene is missing an UID from the middle. delete
+ the rest of the messages from this mailbox and
+ reindex. */
+ if (!ctx->warned) {
+ i_warning("lucene: Mailbox %s "
+ "missing UIDs in the middle",
+ mailbox_get_vname(ctx->box));
+ ctx->warned = TRUE;
+ }
+ } else {
+ /* UID has been expunged from index. delete from
+ lucene as well. */
+ }
+ return 0;
+ } else {
+ /* the rest of the messages have been expunged from index */
+ return 0;
+ }
+}
+
+static void
+rescan_clear_unseen_mailbox(struct lucene_index *index,
+ struct rescan_context *rescan_ctx,
+ const char *vname,
+ const struct fts_index_header *hdr)
+{
+ struct mailbox *box;
+ struct mailbox_metadata metadata;
+
+ box = mailbox_alloc(index->list, vname,
+ (enum mailbox_flags)0);
+ if (mailbox_open(box) == 0 &&
+ mailbox_get_metadata(box, MAILBOX_METADATA_GUID,
+ &metadata) == 0 &&
+ (rescan_ctx == NULL ||
+ hash_table_lookup(rescan_ctx->seen_mailbox_guids,
+ metadata.guid) == NULL)) {
+ /* this mailbox had no records in lucene index.
+ make sure its last indexed uid is 0 */
+ (void)fts_index_set_header(box, hdr);
+ }
+ mailbox_free(&box);
+}
+
+static void rescan_clear_unseen_mailboxes(struct lucene_index *index,
+ struct rescan_context *rescan_ctx)
+{
+ const enum mailbox_list_iter_flags iter_flags =
+ (enum mailbox_list_iter_flags)
+ (MAILBOX_LIST_ITER_NO_AUTO_BOXES |
+ MAILBOX_LIST_ITER_RETURN_NO_FLAGS);
+ struct mailbox_list_iterate_context *iter;
+ const struct mailbox_info *info;
+ struct fts_index_header hdr;
+ struct mail_namespace *ns = index->list->ns;
+ const char *vname;
+
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.settings_checksum = fts_lucene_settings_checksum(&index->set);
+
+ iter = mailbox_list_iter_init(index->list, "*", iter_flags);
+ while ((info = mailbox_list_iter_next(iter)) != NULL)
+ rescan_clear_unseen_mailbox(index, rescan_ctx, info->vname, &hdr);
+ (void)mailbox_list_iter_deinit(&iter);
+
+ if (ns->prefix_len > 0 &&
+ ns->prefix[ns->prefix_len-1] == mail_namespace_get_sep(ns)) {
+ /* namespace prefix itself isn't returned by the listing */
+ vname = t_strndup(index->list->ns->prefix,
+ index->list->ns->prefix_len-1);
+ rescan_clear_unseen_mailbox(index, rescan_ctx, vname, &hdr);
+ }
+}
+
+int lucene_index_rescan(struct lucene_index *index)
+{
+ static const TCHAR *sort_fields[] = { _T("box"), _T("uid"), NULL };
+ struct rescan_context ctx;
+ bool failed = false;
+ int ret;
+
+ i_assert(index->list != NULL);
+
+ if ((ret = lucene_index_open_search(index)) < 0)
+ return ret;
+
+ Term term(_T("box"), _T("*"));
+ WildcardQuery query(&term);
+ Sort sort(sort_fields);
+
+ memset(&ctx, 0, sizeof(ctx));
+ ctx.index = index;
+ ctx.pool = pool_alloconly_create("guids", 1024);
+ hash_table_create(&ctx.seen_mailbox_guids, ctx.pool, 0,
+ guid_128_hash, guid_128_cmp);
+ i_array_init(&ctx.uids, 128);
+
+ if (ret > 0) try {
+ Hits *hits = index->searcher->search(&query, &sort);
+
+ for (size_t i = 0; i < hits->length(); i++) {
+ ret = rescan_open_mailbox(&ctx, &hits->doc(i));
+ if (ret > 0)
+ ret = rescan_next(&ctx, &hits->doc(i));
+ if (ret < 0)
+ failed = true;
+ else if (ret == 0)
+ index->reader->deleteDocument(hits->id(i));
+ }
+ _CLDELETE(hits);
+ } catch (CLuceneError &err) {
+ lucene_handle_error(index, err, "rescan search");
+ failed = true;
+ }
+ lucene_index_close(index);
+ if (ctx.box != NULL)
+ rescan_finish(&ctx);
+ array_free(&ctx.uids);
+
+ rescan_clear_unseen_mailboxes(index, &ctx);
+ hash_table_destroy(&ctx.seen_mailbox_guids);
+ pool_unref(&ctx.pool);
+ return failed ? -1 : 0;
+}
+
+static void guid128_to_wguid(const guid_128_t guid,
+ wchar_t wguid_hex[MAILBOX_GUID_HEX_LENGTH + 1])
+{
+ buffer_t buf = { { 0, 0 } };
+ unsigned char guid_hex[MAILBOX_GUID_HEX_LENGTH];
+ unsigned int i;
+
+ buffer_create_from_data(&buf, guid_hex, MAILBOX_GUID_HEX_LENGTH);
+ binary_to_hex_append(&buf, guid, GUID_128_SIZE);
+ for (i = 0; i < MAILBOX_GUID_HEX_LENGTH; i++)
+ wguid_hex[i] = guid_hex[i];
+ wguid_hex[i] = '\0';
+}
+
+static bool
+lucene_index_add_uid_filter(BooleanQuery *query,
+ const struct fts_expunge_log_read_record *rec)
+{
+ struct seq_range_iter iter;
+ wchar_t wuid[MAX_INT_STRLEN];
+ unsigned int n;
+ uint32_t uid;
+
+ /* RangeQuery and WildcardQuery work by enumerating through all terms
+ that match them, and then adding TermQueries for them. So we can
+ simply do the same directly, and if it looks like there are too
+ many terms just go through everything. */
+
+ if (seq_range_count(&rec->uids) > FTS_LUCENE_MAX_SEARCH_TERMS)
+ return false;
+
+ seq_range_array_iter_init(&iter, &rec->uids); n = 0;
+ while (seq_range_array_iter_nth(&iter, n++, &uid)) {
+ swprintf(wuid, N_ELEMENTS(wuid), L"%u", uid);
+
+ Term *term = _CLNEW Term(_T("uid"), wuid);
+ query->add(_CLNEW TermQuery(term), true, BooleanClause::SHOULD);
+ _CLDECDELETE(term);
+ }
+ return true;
+}
+
+static int
+lucene_index_expunge_record(struct lucene_index *index,
+ const struct fts_expunge_log_read_record *rec)
+{
+ int ret;
+
+ if ((ret = lucene_index_open_search(index)) <= 0)
+ return ret;
+
+ BooleanQuery query;
+ BooleanQuery uids_query;
+
+ if (lucene_index_add_uid_filter(&uids_query, rec))
+ query.add(&uids_query, BooleanClause::MUST);
+
+ wchar_t wguid[MAILBOX_GUID_HEX_LENGTH + 1];
+ guid128_to_wguid(rec->mailbox_guid, wguid);
+ Term term(_T("box"), wguid);
+ TermQuery mailbox_query(&term);
+ query.add(&mailbox_query, BooleanClause::MUST);
+
+ try {
+ Hits *hits = index->searcher->search(&query);
+
+ for (size_t i = 0; i < hits->length(); i++) {
+ uint32_t uid;
+
+ if (lucene_doc_get_uid(index, &hits->doc(i),
+ &uid) < 0 ||
+ seq_range_exists(&rec->uids, uid))
+ index->reader->deleteDocument(hits->id(i));
+ }
+ _CLDELETE(hits);
+ } catch (CLuceneError &err) {
+ lucene_handle_error(index, err, "expunge search");
+ ret = -1;
+ }
+ return ret < 0 ? -1 : 0;
+}
+
+int lucene_index_expunge_from_log(struct lucene_index *index,
+ struct fts_expunge_log *log)
+{
+ struct fts_expunge_log_read_ctx *ctx;
+ const struct fts_expunge_log_read_record *rec;
+ int ret = 0, ret2;
+
+ ctx = fts_expunge_log_read_begin(log);
+ while ((rec = fts_expunge_log_read_next(ctx)) != NULL) {
+ if (lucene_index_expunge_record(index, rec) < 0) {
+ ret = -1;
+ break;
+ }
+ }
+
+ lucene_index_close(index);
+
+ ret2 = fts_expunge_log_read_end(&ctx);
+ if (ret < 0 || ret2 < 0)
+ return -1;
+ return ret2;
+}
+
+int lucene_index_optimize(struct lucene_index *index)
+{
+ int ret = 0;
+
+ if (!IndexReader::indexExists(index->path))
+ return 0;
+ if (IndexReader::isLocked(index->path))
+ IndexReader::unlock(index->path);
+
+ IndexWriter *writer = NULL;
+ try {
+ writer = _CLNEW IndexWriter(index->path, index->default_analyzer, false);
+ writer->optimize();
+ } catch (CLuceneError &err) {
+ lucene_handle_error(index, err, "IndexWriter::optimize()");
+ ret = -1;
+ }
+ try {
+ writer->close();
+ } catch (CLuceneError &err) {
+ lucene_handle_error(index, err, "IndexWriter::close()");
+ ret = -1;
+ }
+ if (writer != NULL)
+ _CLDELETE(writer);
+ return ret;
+}
+
+// Mostly copy&pasted from CLucene's QueryParser
+static Query* getFieldQuery(Analyzer *analyzer, const TCHAR* _field, const TCHAR* queryText, bool fuzzy) {
+ // Use the analyzer to get all the tokens, and then build a TermQuery,
+ // PhraseQuery, or nothing based on the term count
+
+ StringReader reader(queryText);
+ TokenStream* source = analyzer->tokenStream(_field, &reader);
+
+ CLVector<CL_NS(analysis)::Token*, Deletor::Object<CL_NS(analysis)::Token> > v;
+ CL_NS(analysis)::Token* t = NULL;
+ int32_t positionCount = 0;
+ bool severalTokensAtSamePosition = false;
+
+ while (true) {
+ t = _CLNEW Token();
+ try {
+ Token* _t = source->next(t);
+ if (_t == NULL) _CLDELETE(t);
+ }_CLCATCH_ERR(CL_ERR_IO, _CLLDELETE(source);_CLLDELETE(t);,{
+ t = NULL;
+ });
+ if (t == NULL)
+ break;
+ v.push_back(t);
+ if (t->getPositionIncrement() != 0)
+ positionCount += t->getPositionIncrement();
+ else
+ severalTokensAtSamePosition = true;
+ }
+ try {
+ source->close();
+ }
+ _CLCATCH_ERR_CLEANUP(CL_ERR_IO, {_CLLDELETE(source);_CLLDELETE(t);} ); /* cleanup */
+ _CLLDELETE(source);
+
+ if (v.size() == 0)
+ return NULL;
+ else if (v.size() == 1) {
+ Term* tm = _CLNEW Term(_field, v.at(0)->termBuffer());
+ Query* ret;
+ if (fuzzy)
+ ret = _CLNEW FuzzyQuery( tm );
+ else
+ ret = _CLNEW TermQuery( tm );
+ _CLDECDELETE(tm);
+ return ret;
+ } else {
+ if (severalTokensAtSamePosition) {
+ if (positionCount == 1) {
+ // no phrase query:
+ BooleanQuery* q = _CLNEW BooleanQuery(true);
+ for(size_t i=0; i<v.size(); i++ ){
+ Term* tm = _CLNEW Term(_field, v.at(i)->termBuffer());
+ q->add(_CLNEW TermQuery(tm), true, BooleanClause::SHOULD);
+ _CLDECDELETE(tm);
+ }
+ return q;
+ }else {
+ MultiPhraseQuery* mpq = _CLNEW MultiPhraseQuery();
+ CLArrayList<Term*> multiTerms;
+ int32_t position = -1;
+ for (size_t i = 0; i < v.size(); i++) {
+ t = v.at(i);
+ if (t->getPositionIncrement() > 0 && multiTerms.size() > 0) {
+ ValueArray<Term*> termsArray(multiTerms.size());
+ multiTerms.toArray(termsArray.values);
+ mpq->add(&termsArray,position);
+ multiTerms.clear();
+ }
+ position += t->getPositionIncrement();
+ multiTerms.push_back(_CLNEW Term(_field, t->termBuffer()));
+ }
+ ValueArray<Term*> termsArray(multiTerms.size());
+ multiTerms.toArray(termsArray.values);
+ mpq->add(&termsArray,position);
+ return mpq;
+ }
+ }else {
+ PhraseQuery* pq = _CLNEW PhraseQuery();
+ int32_t position = -1;
+
+ for (size_t i = 0; i < v.size(); i++) {
+ t = v.at(i);
+ Term* tm = _CLNEW Term(_field, t->termBuffer());
+ position += t->getPositionIncrement();
+ pq->add(tm,position);
+ _CLDECDELETE(tm);
+ }
+ return pq;
+ }
+ }
+}
+
+static Query *
+lucene_get_query_str(struct lucene_index *index,
+ const TCHAR *key, const char *str, bool fuzzy)
+{
+ const TCHAR *wvalue;
+ Analyzer *analyzer;
+
+ if (index->set.use_libfts) {
+ const wchar_t *wstr = t_lucene_utf8_to_tchar(index, str);
+ Term* tm = _CLNEW Term(key, wstr);
+ Query* ret;
+ if (fuzzy)
+ ret = _CLNEW FuzzyQuery( tm );
+ else
+ ret = _CLNEW TermQuery( tm );
+ _CLDECDELETE(tm);
+ return ret;
+ }
+
+ if (index->normalizer_buf != NULL) {
+ buffer_set_used_size(index->normalizer_buf, 0);
+ index->normalizer(str, strlen(str), index->normalizer_buf);
+ buffer_append_c(index->normalizer_buf, '\0');
+ str = (const char *)index->normalizer_buf->data;
+ }
+
+ wvalue = t_lucene_utf8_to_tchar(index, str);
+ analyzer = guess_analyzer(index, str, strlen(str));
+ if (analyzer == NULL) {
+ analyzer = index->default_analyzer;
+ i_assert(analyzer != NULL);
+ }
+
+ return getFieldQuery(analyzer, key, wvalue, fuzzy);
+}
+
+static Query *
+lucene_get_query(struct lucene_index *index,
+ const TCHAR *key, const struct mail_search_arg *arg)
+{
+ return lucene_get_query_str(index, key, arg->value.str, arg->fuzzy);
+}
+
+static bool
+lucene_add_definite_query(struct lucene_index *index,
+ ARRAY_TYPE(lucene_query) &queries,
+ struct mail_search_arg *arg,
+ enum fts_lookup_flags flags)
+{
+ bool and_args = (flags & FTS_LOOKUP_FLAG_AND_ARGS) != 0;
+ Query *q;
+
+ if (arg->no_fts)
+ return false;
+
+ if (arg->match_not && !and_args) {
+ /* FIXME: we could handle this by doing multiple queries.. */
+ return false;
+ }
+
+ switch (arg->type) {
+ case SEARCH_TEXT: {
+ Query *q1 = lucene_get_query(index, _T("hdr"), arg);
+ Query *q2 = lucene_get_query(index, _T("body"), arg);
+
+ if (q1 == NULL && q2 == NULL)
+ q = NULL;
+ else {
+ BooleanQuery *bq = _CLNEW BooleanQuery();
+ if (q1 != NULL)
+ bq->add(q1, true, BooleanClause::SHOULD);
+ if (q2 != NULL)
+ bq->add(q2, true, BooleanClause::SHOULD);
+ q = bq;
+ }
+ break;
+ }
+ case SEARCH_BODY:
+ q = lucene_get_query(index, _T("body"), arg);
+ break;
+ case SEARCH_HEADER:
+ case SEARCH_HEADER_ADDRESS:
+ case SEARCH_HEADER_COMPRESS_LWSP:
+ if (!fts_header_want_indexed(arg->hdr_field_name) ||
+ *arg->value.str == '\0')
+ return false;
+
+ q = lucene_get_query(index,
+ t_lucene_utf8_to_tchar(index, t_str_lcase(arg->hdr_field_name)),
+ arg);
+ break;
+ default:
+ return false;
+ }
+
+ if (q == NULL) {
+ /* couldn't handle this search after all (e.g. trying to search
+ a stop word) */
+ return false;
+ }
+
+ struct lucene_query *lq = array_append_space(&queries);
+ lq->query = q;
+ if (!and_args)
+ lq->occur = BooleanClause::SHOULD;
+ else if (!arg->match_not)
+ lq->occur = BooleanClause::MUST;
+ else
+ lq->occur = BooleanClause::MUST_NOT;
+ return true;
+}
+
+static bool
+lucene_add_maybe_query(struct lucene_index *index,
+ ARRAY_TYPE(lucene_query) &queries,
+ struct mail_search_arg *arg,
+ enum fts_lookup_flags flags)
+{
+ bool and_args = (flags & FTS_LOOKUP_FLAG_AND_ARGS) != 0;
+ Query *q = NULL;
+
+ if (arg->no_fts)
+ return false;
+
+ if (arg->match_not) {
+ /* FIXME: we could handle this by doing multiple queries.. */
+ return false;
+ }
+
+ switch (arg->type) {
+ case SEARCH_HEADER:
+ case SEARCH_HEADER_ADDRESS:
+ case SEARCH_HEADER_COMPRESS_LWSP:
+ if (*arg->value.str == '\0' && !index->set.use_libfts) {
+ /* checking potential existence of the header name */
+ q = lucene_get_query_str(index, _T("hdr"),
+ t_str_lcase(arg->hdr_field_name), FALSE);
+ break;
+ }
+
+ if (fts_header_want_indexed(arg->hdr_field_name))
+ return false;
+
+ /* we can check if the search key exists in some header and
+ filter out the messages that have no chance of matching */
+ q = lucene_get_query(index, _T("hdr"), arg);
+ break;
+ default:
+ return false;
+ }
+
+ if (q == NULL) {
+ /* couldn't handle this search after all (e.g. trying to search
+ a stop word) */
+ return false;
+ }
+ struct lucene_query *lq = array_append_space(&queries);
+ lq->query = q;
+ if (!and_args)
+ lq->occur = BooleanClause::SHOULD;
+ else if (!arg->match_not)
+ lq->occur = BooleanClause::MUST;
+ else
+ lq->occur = BooleanClause::MUST_NOT;
+ return true;
+}
+
+static bool queries_have_non_must_nots(ARRAY_TYPE(lucene_query) &queries)
+{
+ const struct lucene_query *lq;
+
+ array_foreach(&queries, lq) {
+ if (lq->occur != BooleanClause::MUST_NOT)
+ return TRUE;
+ }
+ return FALSE;
+}
+
+static void search_query_add(BooleanQuery &query,
+ ARRAY_TYPE(lucene_query) &queries)
+{
+ BooleanQuery *search_query = _CLNEW BooleanQuery();
+ const struct lucene_query *lq;
+
+ if (queries_have_non_must_nots(queries)) {
+ array_foreach(&queries, lq)
+ search_query->add(lq->query, true, lq->occur);
+ query.add(search_query, true, BooleanClause::MUST);
+ } else {
+ array_foreach(&queries, lq)
+ search_query->add(lq->query, true, BooleanClause::SHOULD);
+ query.add(search_query, true, BooleanClause::MUST_NOT);
+ }
+}
+
+static int
+lucene_index_search(struct lucene_index *index,
+ ARRAY_TYPE(lucene_query) &queries,
+ struct fts_result *result, ARRAY_TYPE(seq_range) *uids_r)
+{
+ struct fts_score_map *score;
+ int ret = 0;
+
+ BooleanQuery query;
+ search_query_add(query, queries);
+
+ Term mailbox_term(_T("box"), index->mailbox_guid);
+ TermQuery mailbox_query(&mailbox_term);
+ query.add(&mailbox_query, BooleanClause::MUST);
+
+ try {
+ Hits *hits = index->searcher->search(&query);
+
+ uint32_t last_uid = 0;
+ if (result != NULL)
+ result->scores_sorted = true;
+
+ for (size_t i = 0; i < hits->length(); i++) {
+ uint32_t uid;
+
+ if (lucene_doc_get_uid(index, &hits->doc(i),
+ &uid) < 0) {
+ ret = -1;
+ break;
+ }
+
+ if (seq_range_array_add(uids_r, uid)) {
+ /* duplicate result */
+ } else if (result != NULL) {
+ if (uid < last_uid)
+ result->scores_sorted = false;
+ last_uid = uid;
+
+ score = array_append_space(&result->scores);
+ score->uid = uid;
+ score->score = hits->score(i);
+ }
+ }
+ _CLDELETE(hits);
+ return ret;
+ } catch (CLuceneError &err) {
+ lucene_handle_error(index, err, "search");
+ return -1;
+ }
+}
+
+int lucene_index_lookup(struct lucene_index *index,
+ struct mail_search_arg *args,
+ enum fts_lookup_flags flags,
+ struct fts_result *result)
+{
+ struct mail_search_arg *arg;
+
+ if (lucene_index_open_search(index) <= 0)
+ return -1;
+
+ ARRAY_TYPE(lucene_query) def_queries;
+ t_array_init(&def_queries, 16);
+ bool have_definites = false;
+
+ for (arg = args; arg != NULL; arg = arg->next) {
+ if (lucene_add_definite_query(index, def_queries, arg, flags)) {
+ arg->match_always = true;
+ have_definites = true;
+ }
+ }
+
+ if (have_definites) {
+ ARRAY_TYPE(seq_range) *uids_arr =
+ (flags & FTS_LOOKUP_FLAG_NO_AUTO_FUZZY) == 0 ?
+ &result->definite_uids : &result->maybe_uids;
+ if (lucene_index_search(index, def_queries, result,
+ uids_arr) < 0)
+ return -1;
+ }
+
+ if (have_definites) {
+ /* FIXME: mixing up definite + maybe queries is broken. if the
+ definite query matched, it'll just assume that the maybe
+ queries matched as well */
+ return 0;
+ }
+
+ ARRAY_TYPE(lucene_query) maybe_queries;
+ t_array_init(&maybe_queries, 16);
+ bool have_maybies = false;
+
+ for (arg = args; arg != NULL; arg = arg->next) {
+ if (lucene_add_maybe_query(index, maybe_queries, arg, flags)) {
+ arg->match_always = true;
+ have_maybies = true;
+ }
+ }
+
+ if (have_maybies) {
+ if (lucene_index_search(index, maybe_queries, NULL,
+ &result->maybe_uids) < 0)
+ return -1;
+ }
+ return 0;
+}
+
+static int
+lucene_index_search_multi(struct lucene_index *index,
+ HASH_TABLE_TYPE(wguid_result) guids,
+ ARRAY_TYPE(lucene_query) &queries,
+ enum fts_lookup_flags flags,
+ struct fts_multi_result *result)
+{
+ struct fts_score_map *score;
+ int ret = 0;
+
+ BooleanQuery query;
+ search_query_add(query, queries);
+
+ BooleanQuery mailbox_query;
+ struct hash_iterate_context *iter;
+ void *key, *value;
+ iter = hash_table_iterate_init(guids);
+ while (hash_table_iterate(iter, guids, &key, &value)) {
+ Term *term = _CLNEW Term(_T("box"), (wchar_t *)key);
+ TermQuery *q = _CLNEW TermQuery(term);
+ mailbox_query.add(q, true, BooleanClause::SHOULD);
+ }
+ hash_table_iterate_deinit(&iter);
+
+ query.add(&mailbox_query, BooleanClause::MUST);
+ try {
+ Hits *hits = index->searcher->search(&query);
+
+ for (size_t i = 0; i < hits->length(); i++) {
+ uint32_t uid;
+
+ Field *field = hits->doc(i).getField(_T("box"));
+ const TCHAR *box_guid = field == NULL ? NULL : field->stringValue();
+ if (box_guid == NULL) {
+ i_error("lucene: Corrupted FTS index %s: No mailbox for document",
+ index->path);
+ ret = -1;
+ break;
+ }
+ struct fts_result *br =
+ hash_table_lookup(guids, box_guid);
+ if (br == NULL) {
+ i_warning("lucene: Returned unexpected mailbox with GUID %ls", box_guid);
+ continue;
+ }
+
+ if (lucene_doc_get_uid(index, &hits->doc(i),
+ &uid) < 0) {
+ ret = -1;
+ break;
+ }
+
+ ARRAY_TYPE(seq_range) *uids_arr =
+ (flags & FTS_LOOKUP_FLAG_NO_AUTO_FUZZY) == 0 ?
+ &br->maybe_uids : &br->definite_uids;
+ if (!array_is_created(uids_arr)) {
+ p_array_init(uids_arr, result->pool, 32);
+ p_array_init(&br->scores, result->pool, 32);
+ }
+ if (seq_range_array_add(uids_arr, uid)) {
+ /* duplicate result */
+ } else {
+ score = array_append_space(&br->scores);
+ score->uid = uid;
+ score->score = hits->score(i);
+ }
+ }
+ _CLDELETE(hits);
+ return ret;
+ } catch (CLuceneError &err) {
+ lucene_handle_error(index, err, "multi search");
+ return -1;
+ }
+}
+
+int lucene_index_lookup_multi(struct lucene_index *index,
+ HASH_TABLE_TYPE(wguid_result) guids,
+ struct mail_search_arg *args,
+ enum fts_lookup_flags flags,
+ struct fts_multi_result *result)
+{
+ struct mail_search_arg *arg;
+
+ if (lucene_index_open_search(index) <= 0)
+ return -1;
+
+ ARRAY_TYPE(lucene_query) def_queries;
+ t_array_init(&def_queries, 16);
+ bool have_definites = false;
+
+ for (arg = args; arg != NULL; arg = arg->next) {
+ if (lucene_add_definite_query(index, def_queries, arg, flags)) {
+ arg->match_always = true;
+ have_definites = true;
+ }
+ }
+
+ if (have_definites) {
+ if (lucene_index_search_multi(index, guids, def_queries, flags,
+ result) < 0)
+ return -1;
+ }
+ return 0;
+}
+
+struct lucene_index_iter {
+ struct lucene_index *index;
+ struct lucene_index_record rec;
+
+ Term *term;
+ WildcardQuery *query;
+ Sort *sort;
+
+ Hits *hits;
+ size_t i;
+ bool failed;
+};
+
+struct lucene_index_iter *
+lucene_index_iter_init(struct lucene_index *index)
+{
+ static const TCHAR *sort_fields[] = { _T("box"), _T("uid"), NULL };
+ struct lucene_index_iter *iter;
+ int ret;
+
+ iter = i_new(struct lucene_index_iter, 1);
+ iter->index = index;
+ if ((ret = lucene_index_open_search(index)) <= 0) {
+ if (ret < 0)
+ iter->failed = true;
+ return iter;
+ }
+
+ iter->term = _CLNEW Term(_T("box"), _T("*"));
+ iter->query = _CLNEW WildcardQuery(iter->term);
+ iter->sort = _CLNEW Sort(sort_fields);
+
+ try {
+ iter->hits = index->searcher->search(iter->query, iter->sort);
+ } catch (CLuceneError &err) {
+ lucene_handle_error(index, err, "rescan search");
+ iter->failed = true;
+ }
+ return iter;
+}
+
+const struct lucene_index_record *
+lucene_index_iter_next(struct lucene_index_iter *iter)
+{
+ if (iter->hits == NULL)
+ return NULL;
+ if (iter->i == iter->hits->length())
+ return NULL;
+
+ Document *doc = &iter->hits->doc(iter->i);
+ iter->i++;
+
+ memset(&iter->rec, 0, sizeof(iter->rec));
+ (void)fts_lucene_get_mailbox_guid(iter->index, doc,
+ iter->rec.mailbox_guid);
+ (void)lucene_doc_get_uid(iter->index, doc, &iter->rec.uid);
+ iter->rec.part_num = lucene_doc_get_part(iter->index, doc);
+ return &iter->rec;
+}
+
+int lucene_index_iter_deinit(struct lucene_index_iter **_iter)
+{
+ struct lucene_index_iter *iter = *_iter;
+ int ret = iter->failed ? -1 : 0;
+
+ *_iter = NULL;
+ if (iter->hits != NULL)
+ _CLDELETE(iter->hits);
+ if (iter->query != NULL) {
+ _CLDELETE(iter->query);
+ _CLDELETE(iter->sort);
+ _CLDELETE(iter->term);
+ }
+ i_free(iter);
+ return ret;
+}
+
+void lucene_shutdown(void)
+{
+ _lucene_shutdown();
+}