summaryrefslogtreecommitdiffstats
path: root/src/lib-storage/index/index-thread-finish.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib-storage/index/index-thread-finish.c')
-rw-r--r--src/lib-storage/index/index-thread-finish.c682
1 files changed, 682 insertions, 0 deletions
diff --git a/src/lib-storage/index/index-thread-finish.c b/src/lib-storage/index/index-thread-finish.c
new file mode 100644
index 0000000..42326b6
--- /dev/null
+++ b/src/lib-storage/index/index-thread-finish.c
@@ -0,0 +1,682 @@
+/* Copyright (c) 2002-2018 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "array.h"
+#include "hash.h"
+#include "imap-base-subject.h"
+#include "mail-storage-private.h"
+#include "index-thread-private.h"
+
+
+struct mail_thread_shadow_node {
+ uint32_t first_child_idx, next_sibling_idx;
+};
+
+struct mail_thread_root_node {
+ /* node.idx usually points to indexes from mail hash. However
+ REFERENCES step (5) may add temporary dummy roots. They use larger
+ index numbers than exist in the hash. */
+ struct mail_thread_child_node node;
+
+ /* Used temporarily by (5)(B) base subject gathering.
+ root_idx1 is node's index in roots[] array + 1.
+ parent_root_idx points to root_idx1, or 0 for root. */
+ unsigned int root_idx1;
+ uint32_t parent_root_idx1;
+
+ /* subject contained a Re: or Fwd: */
+ bool reply_or_forward:1;
+ /* a dummy node */
+ bool dummy:1;
+ /* ignore this node - it's a dummy without children */
+ bool ignore:1;
+};
+
+struct thread_finish_context {
+ unsigned int refcount;
+
+ struct mail *tmp_mail;
+ struct mail_thread_cache *cache;
+
+ ARRAY(struct mail_thread_root_node) roots;
+ ARRAY(struct mail_thread_shadow_node) shadow_nodes;
+ unsigned int next_new_root_idx;
+
+ bool use_sent_date:1;
+ bool return_seqs:1;
+};
+
+struct mail_thread_iterate_context {
+ struct thread_finish_context *ctx;
+
+ ARRAY_TYPE(mail_thread_child_node) children;
+ unsigned int next_idx;
+ bool failed;
+};
+
+struct subject_gather_context {
+ struct thread_finish_context *ctx;
+
+ pool_t subject_pool;
+ HASH_TABLE(char *, struct mail_thread_root_node *) subject_hash;
+};
+
+static void
+add_base_subject(struct subject_gather_context *ctx, const char *subject,
+ struct mail_thread_root_node *node)
+{
+ struct mail_thread_root_node *hash_node;
+ char *hash_subject;
+ bool is_reply_or_forward;
+
+ subject = imap_get_base_subject_cased(pool_datastack_create(), subject,
+ &is_reply_or_forward);
+ /* (ii) If the thread subject is empty, skip this message. */
+ if (*subject == '\0')
+ return;
+
+ /* (iii) Look up the message associated with the thread
+ subject in the subject table. */
+ if (!hash_table_lookup_full(ctx->subject_hash, subject, &hash_subject,
+ &hash_node)) {
+ /* (iv) If there is no message in the subject table with the
+ thread subject, add the current message and the thread
+ subject to the subject table. */
+ hash_subject = p_strdup(ctx->subject_pool, subject);
+ hash_table_insert(ctx->subject_hash, hash_subject, node);
+ } else {
+ /* Otherwise, if the message in the subject table is not a
+ dummy, AND either of the following criteria are true:
+
+ The current message is a dummy, OR
+
+ The message in the subject table is a reply or forward
+ and the current message is not.
+
+ then replace the message in the subject table with the
+ current message. */
+ if (!hash_node->dummy &&
+ (node->dummy ||
+ (hash_node->reply_or_forward && !is_reply_or_forward))) {
+ hash_node->parent_root_idx1 = node->root_idx1;
+ hash_table_update(ctx->subject_hash, hash_subject, node);
+ } else {
+ node->parent_root_idx1 = hash_node->root_idx1;
+ }
+ }
+
+ node->reply_or_forward = is_reply_or_forward;
+}
+
+static int mail_thread_child_node_cmp(const struct mail_thread_child_node *c1,
+ const struct mail_thread_child_node *c2)
+{
+ if (c1->sort_date < c2->sort_date)
+ return -1;
+ if (c1->sort_date > c2->sort_date)
+ return 1;
+
+ if (c1->uid < c2->uid)
+ return -1;
+ if (c1->uid > c2->uid)
+ return 1;
+ return 0;
+}
+
+static int mail_thread_root_node_cmp(const struct mail_thread_root_node *r1,
+ const struct mail_thread_root_node *r2)
+{
+ return mail_thread_child_node_cmp(&r1->node, &r2->node);
+}
+
+static uint32_t
+thread_lookup_existing(struct thread_finish_context *ctx, uint32_t idx)
+{
+ const struct mail_thread_node *node;
+
+ node = array_idx(&ctx->cache->thread_nodes, idx);
+ i_assert(MAIL_THREAD_NODE_EXISTS(node));
+ i_assert(node->uid != 0);
+ return node->uid;
+}
+
+static void
+thread_child_node_fill(struct thread_finish_context *ctx,
+ struct mail_thread_child_node *child)
+{
+ int tz;
+
+ child->uid = thread_lookup_existing(ctx, child->idx);
+
+ if (!mail_set_uid(ctx->tmp_mail, child->uid)) {
+ /* the UID should have existed. we would have rebuild
+ the thread tree otherwise. */
+ i_unreached();
+ }
+
+ /* get sent date if we want to use it and if it's valid */
+ if (!ctx->use_sent_date)
+ child->sort_date = 0;
+ else if (mail_get_date(ctx->tmp_mail, &child->sort_date, &tz) < 0)
+ child->sort_date = 0;
+
+ if (child->sort_date == 0) {
+ /* fallback to received date */
+ (void)mail_get_received_date(ctx->tmp_mail, &child->sort_date);
+ }
+}
+
+static void
+thread_sort_children(struct thread_finish_context *ctx, uint32_t parent_idx,
+ ARRAY_TYPE(mail_thread_child_node) *sorted_children)
+{
+ const struct mail_thread_shadow_node *shadows;
+ struct mail_thread_child_node child;
+ unsigned int count;
+
+ i_zero(&child);
+ array_clear(sorted_children);
+
+ /* add all child indexes to the array */
+ shadows = array_get(&ctx->shadow_nodes, &count);
+ child.idx = shadows[parent_idx].first_child_idx;
+ i_assert(child.idx != 0);
+ if (shadows[child.idx].next_sibling_idx == 0) {
+ /* only child - don't bother setting sort date */
+ child.uid = thread_lookup_existing(ctx, child.idx);
+
+ array_push_back(sorted_children, &child);
+ return;
+ }
+ while (child.idx != 0) {
+ thread_child_node_fill(ctx, &child);
+
+ array_push_back(sorted_children, &child);
+ child.idx = shadows[child.idx].next_sibling_idx;
+ }
+
+ /* sort the children */
+ array_sort(sorted_children, mail_thread_child_node_cmp);
+}
+
+static void gather_base_subjects(struct thread_finish_context *ctx)
+{
+ struct subject_gather_context gather_ctx;
+ struct mail_thread_root_node *roots;
+ const char *subject;
+ unsigned int i, count;
+ ARRAY_TYPE(mail_thread_child_node) sorted_children;
+ const struct mail_thread_child_node *children;
+ uint32_t idx, uid;
+
+ i_zero(&gather_ctx);
+ gather_ctx.ctx = ctx;
+
+ roots = array_get_modifiable(&ctx->roots, &count);
+ if (count == 0)
+ return;
+ gather_ctx.subject_pool =
+ pool_alloconly_create(MEMPOOL_GROWING"base subjects",
+ nearest_power(count * 20));
+ hash_table_create(&gather_ctx.subject_hash, gather_ctx.subject_pool,
+ count * 2, str_hash, strcmp);
+
+ i_array_init(&sorted_children, 64);
+ for (i = 0; i < count; i++) {
+ roots[i].root_idx1 = i + 1;
+ if (!roots[i].dummy)
+ idx = roots[i].node.idx;
+ else if (!roots[i].ignore) {
+ /* find the oldest child */
+ thread_sort_children(ctx, roots[i].node.idx,
+ &sorted_children);
+ children = array_front(&sorted_children);
+ idx = children[0].idx;
+ } else {
+ /* dummy without children */
+ continue;
+ }
+
+ uid = thread_lookup_existing(ctx, idx);
+ if (!mail_set_uid(ctx->tmp_mail, uid)) {
+ /* the UID should have existed. we would have rebuild
+ the thread tree otherwise. */
+ i_unreached();
+ }
+ if (mail_get_first_header(ctx->tmp_mail, HDR_SUBJECT,
+ &subject) > 0) T_BEGIN {
+ add_base_subject(&gather_ctx, subject, &roots[i]);
+ } T_END;
+ }
+ i_assert(roots[count-1].parent_root_idx1 <= count);
+ array_free(&sorted_children);
+ hash_table_destroy(&gather_ctx.subject_hash);
+ pool_unref(&gather_ctx.subject_pool);
+}
+
+static void thread_add_shadow_child(struct thread_finish_context *ctx,
+ uint32_t parent_idx, uint32_t child_idx)
+{
+ struct mail_thread_shadow_node *parent_shadow, *child_shadow;
+
+ parent_shadow = array_idx_get_space(&ctx->shadow_nodes, parent_idx);
+ child_shadow = array_idx_modifiable(&ctx->shadow_nodes, child_idx);
+
+ child_shadow->next_sibling_idx = parent_shadow->first_child_idx;
+ parent_shadow->first_child_idx = child_idx;
+}
+
+static void mail_thread_root_thread_merge(struct thread_finish_context *ctx,
+ struct mail_thread_root_node *cur)
+{
+ struct mail_thread_root_node *roots, *root, new_root;
+ struct mail_thread_shadow_node *shadows;
+ unsigned int count;
+ uint32_t idx, next_idx;
+
+ i_assert(cur->parent_root_idx1 != 0);
+
+ /* The highest parent is the same as the current message in the
+ subject table. */
+ roots = array_get_modifiable(&ctx->roots, &count);
+ root = cur;
+ do {
+ i_assert(root->parent_root_idx1 <= count);
+ root = &roots[root->parent_root_idx1 - 1];
+ } while (root->parent_root_idx1 != 0);
+ i_assert(!root->ignore);
+
+ shadows = array_front_modifiable(&ctx->shadow_nodes);
+ if (cur->dummy) {
+ /* If both messages are dummies, append the current
+ message's children to the children of the message in
+ the subject table (the children of both messages
+ become siblings), and then delete the current message. */
+ i_assert(root->dummy);
+
+ idx = shadows[cur->node.idx].first_child_idx;
+ while (idx != 0) {
+ next_idx = shadows[idx].next_sibling_idx;
+ thread_add_shadow_child(ctx, root->node.idx, idx);
+ idx = next_idx;
+ }
+
+ shadows[cur->node.idx].first_child_idx = 0;
+ cur->ignore = TRUE;
+ } else if (root->dummy || (cur->reply_or_forward &&
+ !root->reply_or_forward)) {
+ /* a) If the message in the subject table is a dummy and the
+ current message is not, make the current message a
+ child of the message in the subject table (a sibling
+ of its children).
+
+ b) If the current message is a reply or forward and the
+ message in the subject table is not, make the current
+ message a child of the message in the subject table (a
+ sibling of its children). */
+ thread_add_shadow_child(ctx, root->node.idx, cur->node.idx);
+ cur->ignore = TRUE;
+ } else {
+ /* Otherwise, create a new dummy message and make both
+ the current message and the message in the subject
+ table children of the dummy. Then replace the message
+ in the subject table with the dummy message. */
+ i_zero(&new_root);
+ new_root.root_idx1 = array_count(&ctx->roots) + 1;
+ new_root.node.idx = ctx->next_new_root_idx++;
+ new_root.dummy = TRUE;
+
+ thread_add_shadow_child(ctx, new_root.node.idx, root->node.idx);
+ thread_add_shadow_child(ctx, new_root.node.idx, cur->node.idx);
+
+ root->parent_root_idx1 = new_root.root_idx1;
+ root->ignore = TRUE;
+ cur->ignore = TRUE;
+
+ /* append last, since it breaks root and cur pointers */
+ array_push_back(&ctx->roots, &new_root);
+
+ /* make sure all shadow indexes are accessible directly */
+ (void)array_idx_modifiable(&ctx->shadow_nodes,
+ new_root.node.idx);
+ }
+}
+
+static bool merge_subject_threads(struct thread_finish_context *ctx)
+{
+ struct mail_thread_root_node *roots;
+ unsigned int i, count;
+ bool changed = FALSE;
+
+ roots = array_get_modifiable(&ctx->roots, &count);
+ for (i = 0; i < count; i++) {
+ if (roots[i].parent_root_idx1 != 0 && !roots[i].ignore) {
+ mail_thread_root_thread_merge(ctx, &roots[i]);
+ /* more roots may have been added */
+ roots = array_front_modifiable(&ctx->roots);
+ changed = TRUE;
+ }
+ }
+
+ return changed;
+}
+
+static void sort_root_nodes(struct thread_finish_context *ctx)
+{
+ ARRAY_TYPE(mail_thread_child_node) sorted_children;
+ const struct mail_thread_child_node *children;
+ const struct mail_thread_shadow_node *shadows;
+ struct mail_thread_root_node *roots;
+ unsigned int i, count, child_count;
+
+ i_array_init(&sorted_children, 64);
+ shadows = array_front(&ctx->shadow_nodes);
+ roots = array_get_modifiable(&ctx->roots, &count);
+ for (i = 0; i < count; i++) {
+ if (roots[i].ignore)
+ continue;
+ if (roots[i].dummy) {
+ /* sort by the first child */
+ if (shadows[roots[i].node.idx].first_child_idx == 0) {
+ /* childless dummy node */
+ roots[i].ignore = TRUE;
+ continue;
+ }
+ thread_sort_children(ctx, roots[i].node.idx,
+ &sorted_children);
+ children = array_get(&sorted_children, &child_count);
+ if (child_count == 1) {
+ /* only one child - deferred step (3).
+ promote the child to the root. */
+ roots[i].node = children[0];
+ thread_child_node_fill(ctx, &roots[i].node);
+ roots[i].dummy = FALSE;
+ } else {
+ roots[i].node.uid = children[0].uid;
+ roots[i].node.sort_date = children[0].sort_date;
+ }
+ } else {
+ thread_child_node_fill(ctx, &roots[i].node);
+ }
+ }
+ array_free(&sorted_children);
+ array_sort(&ctx->roots, mail_thread_root_node_cmp);
+}
+
+static int mail_thread_root_node_idx_cmp(const void *key, const void *value)
+{
+ const uint32_t *idx = key;
+ const struct mail_thread_root_node *root = value;
+
+ return *idx < root->node.idx ? -1 :
+ *idx > root->node.idx ? 1 : 0;
+}
+
+static void sort_root_nodes_ref2(struct thread_finish_context *ctx,
+ uint32_t record_count)
+{
+ const struct mail_thread_node *node;
+ struct mail_thread_root_node *roots, *root;
+ struct mail_thread_child_node child;
+ const struct mail_thread_shadow_node *shadows;
+ unsigned int root_count;
+ uint32_t idx, parent_idx;
+
+ roots = array_get_modifiable(&ctx->roots, &root_count);
+
+ /* drop childless dummy nodes */
+ shadows = array_front(&ctx->shadow_nodes);
+ for (idx = 1; idx < root_count; idx++) {
+ if (roots[idx].dummy &&
+ shadows[roots[idx].node.idx].first_child_idx == 0)
+ roots[idx].ignore = TRUE;
+ }
+
+ for (idx = 1; idx < record_count; idx++) {
+ node = array_idx(&ctx->cache->thread_nodes, idx);
+ if (!MAIL_THREAD_NODE_EXISTS(node))
+ continue;
+
+ child.idx = idx;
+ thread_child_node_fill(ctx, &child);
+
+ parent_idx = idx;
+ while (node->parent_idx != 0) {
+ parent_idx = node->parent_idx;
+ node = array_idx(&ctx->cache->thread_nodes,
+ node->parent_idx);
+ }
+ root = bsearch(&parent_idx, roots, root_count, sizeof(*roots),
+ mail_thread_root_node_idx_cmp);
+ i_assert(root != NULL);
+
+ if (root->node.sort_date < child.sort_date)
+ root->node.sort_date = child.sort_date;
+ }
+ array_sort(&ctx->roots, mail_thread_root_node_cmp);
+}
+
+static void mail_thread_create_shadows(struct thread_finish_context *ctx,
+ uint32_t record_count)
+{
+ const struct mail_thread_node *node, *parent;
+ struct mail_thread_root_node root;
+ struct mail_thread_child_node child;
+ uint32_t idx, parent_idx;
+
+ ctx->use_sent_date = FALSE;
+
+ i_zero(&root);
+ i_zero(&child);
+
+ /* We may see dummy messages without parents or children. We can't
+ free them since the nodes are in an array, but they may get reused
+ later so just leave them be. With the current algorithm when this
+ happens all the struct fields are always zero at that point, so
+ we don't even have to try to zero them. */
+ for (idx = 1; idx < record_count; idx++) {
+ node = array_idx(&ctx->cache->thread_nodes, idx);
+
+ if (node->parent_idx == 0) {
+ /* root node - add to roots list */
+ root.node.idx = idx;
+ if (!MAIL_THREAD_NODE_EXISTS(node)) {
+ root.dummy = TRUE;
+ root.node.uid = 0;
+ } else {
+ root.dummy = FALSE;
+ root.node.uid = node->uid;
+ }
+ array_push_back(&ctx->roots, &root);
+ continue;
+ }
+ i_assert(node->parent_idx < record_count);
+
+ if (!MAIL_THREAD_NODE_EXISTS(node)) {
+ /* dummy node */
+ continue;
+ }
+
+ /* Find the node's first non-dummy parent and add the
+ node as its child. If there are no non-dummy
+ parents, add it as the highest dummy's child. */
+ parent_idx = node->parent_idx;
+ parent = array_idx(&ctx->cache->thread_nodes, parent_idx);
+ while (!MAIL_THREAD_NODE_EXISTS(parent) &&
+ parent->parent_idx != 0) {
+ parent_idx = parent->parent_idx;
+ parent = array_idx(&ctx->cache->thread_nodes,
+ parent_idx);
+ }
+ thread_add_shadow_child(ctx, parent_idx, idx);
+ }
+}
+
+static void mail_thread_finish(struct thread_finish_context *ctx,
+ enum mail_thread_type thread_type)
+{
+ unsigned int record_count = array_count(&ctx->cache->thread_nodes);
+
+ ctx->next_new_root_idx = record_count + 1;
+
+ /* (2) save root nodes and (3) remove dummy messages */
+ i_array_init(&ctx->roots, I_MIN(128, record_count));
+ i_array_init(&ctx->shadow_nodes, record_count);
+ /* make sure all shadow indexes are accessible directly. */
+ (void)array_idx_get_space(&ctx->shadow_nodes, record_count);
+
+ mail_thread_create_shadows(ctx, record_count);
+
+ /* (4) */
+ ctx->use_sent_date = TRUE;
+ switch (thread_type) {
+ case MAIL_THREAD_REFERENCES:
+ sort_root_nodes(ctx);
+ /* (5) Gather together messages under the root that have
+ the same base subject text. */
+ gather_base_subjects(ctx);
+ /* (5.C) Merge threads with the same thread subject. */
+ if (merge_subject_threads(ctx)) {
+ /* root ordering may have changed, sort them again. */
+ sort_root_nodes(ctx);
+ }
+ break;
+ case MAIL_THREAD_REFS:
+ sort_root_nodes_ref2(ctx, record_count);
+ break;
+ default:
+ i_unreached();
+ }
+}
+
+static void
+nodes_change_uids_to_seqs(struct mail_thread_iterate_context *iter, bool root)
+{
+ struct mail_thread_child_node *children;
+ struct mailbox *box = iter->ctx->tmp_mail->box;
+ unsigned int i, count;
+ uint32_t uid, seq;
+
+ children = array_get_modifiable(&iter->children, &count);
+ for (i = 0; i < count; i++) {
+ uid = children[i].uid;
+ if (uid == 0) {
+ /* dummy root */
+ if (root)
+ continue;
+ i_unreached();
+ } else {
+ mailbox_get_seq_range(box, uid, uid, &seq, &seq);
+ i_assert(seq != 0);
+ }
+ children[i].uid = seq;
+ }
+}
+
+static void
+mail_thread_iterate_fill_root(struct mail_thread_iterate_context *iter)
+{
+ struct mail_thread_root_node *roots;
+ unsigned int i, count;
+
+ roots = array_get_modifiable(&iter->ctx->roots, &count);
+ i_array_init(&iter->children, count);
+ for (i = 0; i < count; i++) {
+ if (!roots[i].ignore) {
+ if (roots[i].dummy)
+ roots[i].node.uid = 0;
+ array_push_back(&iter->children, &roots[i].node);
+ }
+ }
+}
+
+static struct mail_thread_iterate_context *
+mail_thread_iterate_children(struct mail_thread_iterate_context *parent_iter,
+ uint32_t parent_idx)
+{
+ struct mail_thread_iterate_context *child_iter;
+
+ child_iter = i_new(struct mail_thread_iterate_context, 1);
+ child_iter->ctx = parent_iter->ctx;
+ child_iter->ctx->refcount++;
+
+ i_array_init(&child_iter->children, 8);
+ struct event_reason *reason = event_reason_begin("mailbox:thread");
+ thread_sort_children(child_iter->ctx, parent_idx,
+ &child_iter->children);
+ if (child_iter->ctx->return_seqs)
+ nodes_change_uids_to_seqs(child_iter, FALSE);
+ event_reason_end(&reason);
+ return child_iter;
+}
+
+struct mail_thread_iterate_context *
+mail_thread_iterate_init_full(struct mail_thread_cache *cache,
+ struct mail *tmp_mail,
+ enum mail_thread_type thread_type,
+ bool return_seqs)
+{
+ struct mail_thread_iterate_context *iter;
+ struct thread_finish_context *ctx;
+
+ iter = i_new(struct mail_thread_iterate_context, 1);
+ ctx = iter->ctx = i_new(struct thread_finish_context, 1);
+ ctx->refcount = 1;
+ ctx->cache = cache;
+ ctx->tmp_mail = tmp_mail;
+ ctx->return_seqs = return_seqs;
+
+ struct event_reason *reason = event_reason_begin("mailbox:thread");
+ mail_thread_finish(ctx, thread_type);
+
+ mail_thread_iterate_fill_root(iter);
+ if (return_seqs)
+ nodes_change_uids_to_seqs(iter, TRUE);
+ event_reason_end(&reason);
+ return iter;
+}
+
+const struct mail_thread_child_node *
+mail_thread_iterate_next(struct mail_thread_iterate_context *iter,
+ struct mail_thread_iterate_context **child_iter_r)
+{
+ const struct mail_thread_child_node *children, *child;
+ const struct mail_thread_shadow_node *shadow;
+ unsigned int count;
+
+ children = array_get(&iter->children, &count);
+ if (iter->next_idx >= count)
+ return NULL;
+
+ child = &children[iter->next_idx++];
+ shadow = array_idx(&iter->ctx->shadow_nodes, child->idx);
+ *child_iter_r = shadow->first_child_idx == 0 ? NULL :
+ mail_thread_iterate_children(iter, child->idx);
+ if (child->uid == 0 && *child_iter_r == NULL) {
+ /* this is a dummy node without children,
+ there's no point in returning it */
+ return mail_thread_iterate_next(iter, child_iter_r);
+ }
+ return child;
+}
+
+unsigned int mail_thread_iterate_count(struct mail_thread_iterate_context *iter)
+{
+ return array_count(&iter->children);
+}
+
+int mail_thread_iterate_deinit(struct mail_thread_iterate_context **_iter)
+{
+ struct mail_thread_iterate_context *iter = *_iter;
+
+ *_iter = NULL;
+
+ if (--iter->ctx->refcount == 0) {
+ array_free(&iter->ctx->roots);
+ array_free(&iter->ctx->shadow_nodes);
+ i_free(iter->ctx);
+ }
+ array_free(&iter->children);
+ i_free(iter);
+ return 0;
+}