diff options
Diffstat (limited to 'src/lib-storage/index/index-thread-finish.c')
-rw-r--r-- | src/lib-storage/index/index-thread-finish.c | 682 |
1 files changed, 682 insertions, 0 deletions
diff --git a/src/lib-storage/index/index-thread-finish.c b/src/lib-storage/index/index-thread-finish.c new file mode 100644 index 0000000..42326b6 --- /dev/null +++ b/src/lib-storage/index/index-thread-finish.c @@ -0,0 +1,682 @@ +/* Copyright (c) 2002-2018 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "array.h" +#include "hash.h" +#include "imap-base-subject.h" +#include "mail-storage-private.h" +#include "index-thread-private.h" + + +struct mail_thread_shadow_node { + uint32_t first_child_idx, next_sibling_idx; +}; + +struct mail_thread_root_node { + /* node.idx usually points to indexes from mail hash. However + REFERENCES step (5) may add temporary dummy roots. They use larger + index numbers than exist in the hash. */ + struct mail_thread_child_node node; + + /* Used temporarily by (5)(B) base subject gathering. + root_idx1 is node's index in roots[] array + 1. + parent_root_idx points to root_idx1, or 0 for root. */ + unsigned int root_idx1; + uint32_t parent_root_idx1; + + /* subject contained a Re: or Fwd: */ + bool reply_or_forward:1; + /* a dummy node */ + bool dummy:1; + /* ignore this node - it's a dummy without children */ + bool ignore:1; +}; + +struct thread_finish_context { + unsigned int refcount; + + struct mail *tmp_mail; + struct mail_thread_cache *cache; + + ARRAY(struct mail_thread_root_node) roots; + ARRAY(struct mail_thread_shadow_node) shadow_nodes; + unsigned int next_new_root_idx; + + bool use_sent_date:1; + bool return_seqs:1; +}; + +struct mail_thread_iterate_context { + struct thread_finish_context *ctx; + + ARRAY_TYPE(mail_thread_child_node) children; + unsigned int next_idx; + bool failed; +}; + +struct subject_gather_context { + struct thread_finish_context *ctx; + + pool_t subject_pool; + HASH_TABLE(char *, struct mail_thread_root_node *) subject_hash; +}; + +static void +add_base_subject(struct subject_gather_context *ctx, const char *subject, + struct mail_thread_root_node *node) +{ + struct mail_thread_root_node *hash_node; + char *hash_subject; + bool is_reply_or_forward; + + subject = imap_get_base_subject_cased(pool_datastack_create(), subject, + &is_reply_or_forward); + /* (ii) If the thread subject is empty, skip this message. */ + if (*subject == '\0') + return; + + /* (iii) Look up the message associated with the thread + subject in the subject table. */ + if (!hash_table_lookup_full(ctx->subject_hash, subject, &hash_subject, + &hash_node)) { + /* (iv) If there is no message in the subject table with the + thread subject, add the current message and the thread + subject to the subject table. */ + hash_subject = p_strdup(ctx->subject_pool, subject); + hash_table_insert(ctx->subject_hash, hash_subject, node); + } else { + /* Otherwise, if the message in the subject table is not a + dummy, AND either of the following criteria are true: + + The current message is a dummy, OR + + The message in the subject table is a reply or forward + and the current message is not. + + then replace the message in the subject table with the + current message. */ + if (!hash_node->dummy && + (node->dummy || + (hash_node->reply_or_forward && !is_reply_or_forward))) { + hash_node->parent_root_idx1 = node->root_idx1; + hash_table_update(ctx->subject_hash, hash_subject, node); + } else { + node->parent_root_idx1 = hash_node->root_idx1; + } + } + + node->reply_or_forward = is_reply_or_forward; +} + +static int mail_thread_child_node_cmp(const struct mail_thread_child_node *c1, + const struct mail_thread_child_node *c2) +{ + if (c1->sort_date < c2->sort_date) + return -1; + if (c1->sort_date > c2->sort_date) + return 1; + + if (c1->uid < c2->uid) + return -1; + if (c1->uid > c2->uid) + return 1; + return 0; +} + +static int mail_thread_root_node_cmp(const struct mail_thread_root_node *r1, + const struct mail_thread_root_node *r2) +{ + return mail_thread_child_node_cmp(&r1->node, &r2->node); +} + +static uint32_t +thread_lookup_existing(struct thread_finish_context *ctx, uint32_t idx) +{ + const struct mail_thread_node *node; + + node = array_idx(&ctx->cache->thread_nodes, idx); + i_assert(MAIL_THREAD_NODE_EXISTS(node)); + i_assert(node->uid != 0); + return node->uid; +} + +static void +thread_child_node_fill(struct thread_finish_context *ctx, + struct mail_thread_child_node *child) +{ + int tz; + + child->uid = thread_lookup_existing(ctx, child->idx); + + if (!mail_set_uid(ctx->tmp_mail, child->uid)) { + /* the UID should have existed. we would have rebuild + the thread tree otherwise. */ + i_unreached(); + } + + /* get sent date if we want to use it and if it's valid */ + if (!ctx->use_sent_date) + child->sort_date = 0; + else if (mail_get_date(ctx->tmp_mail, &child->sort_date, &tz) < 0) + child->sort_date = 0; + + if (child->sort_date == 0) { + /* fallback to received date */ + (void)mail_get_received_date(ctx->tmp_mail, &child->sort_date); + } +} + +static void +thread_sort_children(struct thread_finish_context *ctx, uint32_t parent_idx, + ARRAY_TYPE(mail_thread_child_node) *sorted_children) +{ + const struct mail_thread_shadow_node *shadows; + struct mail_thread_child_node child; + unsigned int count; + + i_zero(&child); + array_clear(sorted_children); + + /* add all child indexes to the array */ + shadows = array_get(&ctx->shadow_nodes, &count); + child.idx = shadows[parent_idx].first_child_idx; + i_assert(child.idx != 0); + if (shadows[child.idx].next_sibling_idx == 0) { + /* only child - don't bother setting sort date */ + child.uid = thread_lookup_existing(ctx, child.idx); + + array_push_back(sorted_children, &child); + return; + } + while (child.idx != 0) { + thread_child_node_fill(ctx, &child); + + array_push_back(sorted_children, &child); + child.idx = shadows[child.idx].next_sibling_idx; + } + + /* sort the children */ + array_sort(sorted_children, mail_thread_child_node_cmp); +} + +static void gather_base_subjects(struct thread_finish_context *ctx) +{ + struct subject_gather_context gather_ctx; + struct mail_thread_root_node *roots; + const char *subject; + unsigned int i, count; + ARRAY_TYPE(mail_thread_child_node) sorted_children; + const struct mail_thread_child_node *children; + uint32_t idx, uid; + + i_zero(&gather_ctx); + gather_ctx.ctx = ctx; + + roots = array_get_modifiable(&ctx->roots, &count); + if (count == 0) + return; + gather_ctx.subject_pool = + pool_alloconly_create(MEMPOOL_GROWING"base subjects", + nearest_power(count * 20)); + hash_table_create(&gather_ctx.subject_hash, gather_ctx.subject_pool, + count * 2, str_hash, strcmp); + + i_array_init(&sorted_children, 64); + for (i = 0; i < count; i++) { + roots[i].root_idx1 = i + 1; + if (!roots[i].dummy) + idx = roots[i].node.idx; + else if (!roots[i].ignore) { + /* find the oldest child */ + thread_sort_children(ctx, roots[i].node.idx, + &sorted_children); + children = array_front(&sorted_children); + idx = children[0].idx; + } else { + /* dummy without children */ + continue; + } + + uid = thread_lookup_existing(ctx, idx); + if (!mail_set_uid(ctx->tmp_mail, uid)) { + /* the UID should have existed. we would have rebuild + the thread tree otherwise. */ + i_unreached(); + } + if (mail_get_first_header(ctx->tmp_mail, HDR_SUBJECT, + &subject) > 0) T_BEGIN { + add_base_subject(&gather_ctx, subject, &roots[i]); + } T_END; + } + i_assert(roots[count-1].parent_root_idx1 <= count); + array_free(&sorted_children); + hash_table_destroy(&gather_ctx.subject_hash); + pool_unref(&gather_ctx.subject_pool); +} + +static void thread_add_shadow_child(struct thread_finish_context *ctx, + uint32_t parent_idx, uint32_t child_idx) +{ + struct mail_thread_shadow_node *parent_shadow, *child_shadow; + + parent_shadow = array_idx_get_space(&ctx->shadow_nodes, parent_idx); + child_shadow = array_idx_modifiable(&ctx->shadow_nodes, child_idx); + + child_shadow->next_sibling_idx = parent_shadow->first_child_idx; + parent_shadow->first_child_idx = child_idx; +} + +static void mail_thread_root_thread_merge(struct thread_finish_context *ctx, + struct mail_thread_root_node *cur) +{ + struct mail_thread_root_node *roots, *root, new_root; + struct mail_thread_shadow_node *shadows; + unsigned int count; + uint32_t idx, next_idx; + + i_assert(cur->parent_root_idx1 != 0); + + /* The highest parent is the same as the current message in the + subject table. */ + roots = array_get_modifiable(&ctx->roots, &count); + root = cur; + do { + i_assert(root->parent_root_idx1 <= count); + root = &roots[root->parent_root_idx1 - 1]; + } while (root->parent_root_idx1 != 0); + i_assert(!root->ignore); + + shadows = array_front_modifiable(&ctx->shadow_nodes); + if (cur->dummy) { + /* If both messages are dummies, append the current + message's children to the children of the message in + the subject table (the children of both messages + become siblings), and then delete the current message. */ + i_assert(root->dummy); + + idx = shadows[cur->node.idx].first_child_idx; + while (idx != 0) { + next_idx = shadows[idx].next_sibling_idx; + thread_add_shadow_child(ctx, root->node.idx, idx); + idx = next_idx; + } + + shadows[cur->node.idx].first_child_idx = 0; + cur->ignore = TRUE; + } else if (root->dummy || (cur->reply_or_forward && + !root->reply_or_forward)) { + /* a) If the message in the subject table is a dummy and the + current message is not, make the current message a + child of the message in the subject table (a sibling + of its children). + + b) If the current message is a reply or forward and the + message in the subject table is not, make the current + message a child of the message in the subject table (a + sibling of its children). */ + thread_add_shadow_child(ctx, root->node.idx, cur->node.idx); + cur->ignore = TRUE; + } else { + /* Otherwise, create a new dummy message and make both + the current message and the message in the subject + table children of the dummy. Then replace the message + in the subject table with the dummy message. */ + i_zero(&new_root); + new_root.root_idx1 = array_count(&ctx->roots) + 1; + new_root.node.idx = ctx->next_new_root_idx++; + new_root.dummy = TRUE; + + thread_add_shadow_child(ctx, new_root.node.idx, root->node.idx); + thread_add_shadow_child(ctx, new_root.node.idx, cur->node.idx); + + root->parent_root_idx1 = new_root.root_idx1; + root->ignore = TRUE; + cur->ignore = TRUE; + + /* append last, since it breaks root and cur pointers */ + array_push_back(&ctx->roots, &new_root); + + /* make sure all shadow indexes are accessible directly */ + (void)array_idx_modifiable(&ctx->shadow_nodes, + new_root.node.idx); + } +} + +static bool merge_subject_threads(struct thread_finish_context *ctx) +{ + struct mail_thread_root_node *roots; + unsigned int i, count; + bool changed = FALSE; + + roots = array_get_modifiable(&ctx->roots, &count); + for (i = 0; i < count; i++) { + if (roots[i].parent_root_idx1 != 0 && !roots[i].ignore) { + mail_thread_root_thread_merge(ctx, &roots[i]); + /* more roots may have been added */ + roots = array_front_modifiable(&ctx->roots); + changed = TRUE; + } + } + + return changed; +} + +static void sort_root_nodes(struct thread_finish_context *ctx) +{ + ARRAY_TYPE(mail_thread_child_node) sorted_children; + const struct mail_thread_child_node *children; + const struct mail_thread_shadow_node *shadows; + struct mail_thread_root_node *roots; + unsigned int i, count, child_count; + + i_array_init(&sorted_children, 64); + shadows = array_front(&ctx->shadow_nodes); + roots = array_get_modifiable(&ctx->roots, &count); + for (i = 0; i < count; i++) { + if (roots[i].ignore) + continue; + if (roots[i].dummy) { + /* sort by the first child */ + if (shadows[roots[i].node.idx].first_child_idx == 0) { + /* childless dummy node */ + roots[i].ignore = TRUE; + continue; + } + thread_sort_children(ctx, roots[i].node.idx, + &sorted_children); + children = array_get(&sorted_children, &child_count); + if (child_count == 1) { + /* only one child - deferred step (3). + promote the child to the root. */ + roots[i].node = children[0]; + thread_child_node_fill(ctx, &roots[i].node); + roots[i].dummy = FALSE; + } else { + roots[i].node.uid = children[0].uid; + roots[i].node.sort_date = children[0].sort_date; + } + } else { + thread_child_node_fill(ctx, &roots[i].node); + } + } + array_free(&sorted_children); + array_sort(&ctx->roots, mail_thread_root_node_cmp); +} + +static int mail_thread_root_node_idx_cmp(const void *key, const void *value) +{ + const uint32_t *idx = key; + const struct mail_thread_root_node *root = value; + + return *idx < root->node.idx ? -1 : + *idx > root->node.idx ? 1 : 0; +} + +static void sort_root_nodes_ref2(struct thread_finish_context *ctx, + uint32_t record_count) +{ + const struct mail_thread_node *node; + struct mail_thread_root_node *roots, *root; + struct mail_thread_child_node child; + const struct mail_thread_shadow_node *shadows; + unsigned int root_count; + uint32_t idx, parent_idx; + + roots = array_get_modifiable(&ctx->roots, &root_count); + + /* drop childless dummy nodes */ + shadows = array_front(&ctx->shadow_nodes); + for (idx = 1; idx < root_count; idx++) { + if (roots[idx].dummy && + shadows[roots[idx].node.idx].first_child_idx == 0) + roots[idx].ignore = TRUE; + } + + for (idx = 1; idx < record_count; idx++) { + node = array_idx(&ctx->cache->thread_nodes, idx); + if (!MAIL_THREAD_NODE_EXISTS(node)) + continue; + + child.idx = idx; + thread_child_node_fill(ctx, &child); + + parent_idx = idx; + while (node->parent_idx != 0) { + parent_idx = node->parent_idx; + node = array_idx(&ctx->cache->thread_nodes, + node->parent_idx); + } + root = bsearch(&parent_idx, roots, root_count, sizeof(*roots), + mail_thread_root_node_idx_cmp); + i_assert(root != NULL); + + if (root->node.sort_date < child.sort_date) + root->node.sort_date = child.sort_date; + } + array_sort(&ctx->roots, mail_thread_root_node_cmp); +} + +static void mail_thread_create_shadows(struct thread_finish_context *ctx, + uint32_t record_count) +{ + const struct mail_thread_node *node, *parent; + struct mail_thread_root_node root; + struct mail_thread_child_node child; + uint32_t idx, parent_idx; + + ctx->use_sent_date = FALSE; + + i_zero(&root); + i_zero(&child); + + /* We may see dummy messages without parents or children. We can't + free them since the nodes are in an array, but they may get reused + later so just leave them be. With the current algorithm when this + happens all the struct fields are always zero at that point, so + we don't even have to try to zero them. */ + for (idx = 1; idx < record_count; idx++) { + node = array_idx(&ctx->cache->thread_nodes, idx); + + if (node->parent_idx == 0) { + /* root node - add to roots list */ + root.node.idx = idx; + if (!MAIL_THREAD_NODE_EXISTS(node)) { + root.dummy = TRUE; + root.node.uid = 0; + } else { + root.dummy = FALSE; + root.node.uid = node->uid; + } + array_push_back(&ctx->roots, &root); + continue; + } + i_assert(node->parent_idx < record_count); + + if (!MAIL_THREAD_NODE_EXISTS(node)) { + /* dummy node */ + continue; + } + + /* Find the node's first non-dummy parent and add the + node as its child. If there are no non-dummy + parents, add it as the highest dummy's child. */ + parent_idx = node->parent_idx; + parent = array_idx(&ctx->cache->thread_nodes, parent_idx); + while (!MAIL_THREAD_NODE_EXISTS(parent) && + parent->parent_idx != 0) { + parent_idx = parent->parent_idx; + parent = array_idx(&ctx->cache->thread_nodes, + parent_idx); + } + thread_add_shadow_child(ctx, parent_idx, idx); + } +} + +static void mail_thread_finish(struct thread_finish_context *ctx, + enum mail_thread_type thread_type) +{ + unsigned int record_count = array_count(&ctx->cache->thread_nodes); + + ctx->next_new_root_idx = record_count + 1; + + /* (2) save root nodes and (3) remove dummy messages */ + i_array_init(&ctx->roots, I_MIN(128, record_count)); + i_array_init(&ctx->shadow_nodes, record_count); + /* make sure all shadow indexes are accessible directly. */ + (void)array_idx_get_space(&ctx->shadow_nodes, record_count); + + mail_thread_create_shadows(ctx, record_count); + + /* (4) */ + ctx->use_sent_date = TRUE; + switch (thread_type) { + case MAIL_THREAD_REFERENCES: + sort_root_nodes(ctx); + /* (5) Gather together messages under the root that have + the same base subject text. */ + gather_base_subjects(ctx); + /* (5.C) Merge threads with the same thread subject. */ + if (merge_subject_threads(ctx)) { + /* root ordering may have changed, sort them again. */ + sort_root_nodes(ctx); + } + break; + case MAIL_THREAD_REFS: + sort_root_nodes_ref2(ctx, record_count); + break; + default: + i_unreached(); + } +} + +static void +nodes_change_uids_to_seqs(struct mail_thread_iterate_context *iter, bool root) +{ + struct mail_thread_child_node *children; + struct mailbox *box = iter->ctx->tmp_mail->box; + unsigned int i, count; + uint32_t uid, seq; + + children = array_get_modifiable(&iter->children, &count); + for (i = 0; i < count; i++) { + uid = children[i].uid; + if (uid == 0) { + /* dummy root */ + if (root) + continue; + i_unreached(); + } else { + mailbox_get_seq_range(box, uid, uid, &seq, &seq); + i_assert(seq != 0); + } + children[i].uid = seq; + } +} + +static void +mail_thread_iterate_fill_root(struct mail_thread_iterate_context *iter) +{ + struct mail_thread_root_node *roots; + unsigned int i, count; + + roots = array_get_modifiable(&iter->ctx->roots, &count); + i_array_init(&iter->children, count); + for (i = 0; i < count; i++) { + if (!roots[i].ignore) { + if (roots[i].dummy) + roots[i].node.uid = 0; + array_push_back(&iter->children, &roots[i].node); + } + } +} + +static struct mail_thread_iterate_context * +mail_thread_iterate_children(struct mail_thread_iterate_context *parent_iter, + uint32_t parent_idx) +{ + struct mail_thread_iterate_context *child_iter; + + child_iter = i_new(struct mail_thread_iterate_context, 1); + child_iter->ctx = parent_iter->ctx; + child_iter->ctx->refcount++; + + i_array_init(&child_iter->children, 8); + struct event_reason *reason = event_reason_begin("mailbox:thread"); + thread_sort_children(child_iter->ctx, parent_idx, + &child_iter->children); + if (child_iter->ctx->return_seqs) + nodes_change_uids_to_seqs(child_iter, FALSE); + event_reason_end(&reason); + return child_iter; +} + +struct mail_thread_iterate_context * +mail_thread_iterate_init_full(struct mail_thread_cache *cache, + struct mail *tmp_mail, + enum mail_thread_type thread_type, + bool return_seqs) +{ + struct mail_thread_iterate_context *iter; + struct thread_finish_context *ctx; + + iter = i_new(struct mail_thread_iterate_context, 1); + ctx = iter->ctx = i_new(struct thread_finish_context, 1); + ctx->refcount = 1; + ctx->cache = cache; + ctx->tmp_mail = tmp_mail; + ctx->return_seqs = return_seqs; + + struct event_reason *reason = event_reason_begin("mailbox:thread"); + mail_thread_finish(ctx, thread_type); + + mail_thread_iterate_fill_root(iter); + if (return_seqs) + nodes_change_uids_to_seqs(iter, TRUE); + event_reason_end(&reason); + return iter; +} + +const struct mail_thread_child_node * +mail_thread_iterate_next(struct mail_thread_iterate_context *iter, + struct mail_thread_iterate_context **child_iter_r) +{ + const struct mail_thread_child_node *children, *child; + const struct mail_thread_shadow_node *shadow; + unsigned int count; + + children = array_get(&iter->children, &count); + if (iter->next_idx >= count) + return NULL; + + child = &children[iter->next_idx++]; + shadow = array_idx(&iter->ctx->shadow_nodes, child->idx); + *child_iter_r = shadow->first_child_idx == 0 ? NULL : + mail_thread_iterate_children(iter, child->idx); + if (child->uid == 0 && *child_iter_r == NULL) { + /* this is a dummy node without children, + there's no point in returning it */ + return mail_thread_iterate_next(iter, child_iter_r); + } + return child; +} + +unsigned int mail_thread_iterate_count(struct mail_thread_iterate_context *iter) +{ + return array_count(&iter->children); +} + +int mail_thread_iterate_deinit(struct mail_thread_iterate_context **_iter) +{ + struct mail_thread_iterate_context *iter = *_iter; + + *_iter = NULL; + + if (--iter->ctx->refcount == 0) { + array_free(&iter->ctx->roots); + array_free(&iter->ctx->shadow_nodes); + i_free(iter->ctx); + } + array_free(&iter->children); + i_free(iter); + return 0; +} |