diff options
Diffstat (limited to 'src/k_merge_tree.h')
-rw-r--r-- | src/k_merge_tree.h | 471 |
1 files changed, 471 insertions, 0 deletions
diff --git a/src/k_merge_tree.h b/src/k_merge_tree.h new file mode 100644 index 0000000..2310628 --- /dev/null +++ b/src/k_merge_tree.h @@ -0,0 +1,471 @@ +/* + + K-Way Merge Template + By Jordan Zimmerman + +*/ + +#ifndef KMERGE_TREE_H +#define KMERGE_TREE_H + +/* + +K-Way Merge + +An implementation of "k-Way Merging" as described in "Fundamentals of Data Structures" by Horowitz/Sahni. + +The idea is to merge k sorted arrays limiting the number of comparisons. A tree is built containing the +results of comparing the heads of each array. The top most node is always the smallest entry. Then, its +corresponding leaf in the tree is refilled and the tree is processed again. It's easier to see in the +following example: + +Imagine 4 sorted arrays: +{5, 10, 15, 20} +{10, 13, 16, 19} +{2, 19, 26, 40} +{18, 22, 23, 24} + +The initial tree looks like this: + + + 2 + / \ + 2 5 + / \ / \ + 18 2 10 5 + +The '/' and '\' represent links. The bottom row has the leaves and they contain the heads of the arrays. +The rows above the leaves represent the smaller of the two child nodes. Thus, the top node is the smallest. +To process the next iteration, the top node gets popped and its leaf gets refilled. Then, the new leaf's +associated nodes are processed. So, after the 2 is taken, it is filled with 19 (the next head of its array). +After processing, the tree looks like this: + + + 5 + / \ + 18 5 + / \ / \ + 18 19 10 5 + +So, you can see how the number of comparisons is reduced. + +A good use of this is when you have a very large array that needs to be sorted. Break it up into n small +arrays and sort those. Then use this merge sort for the final sort. This can also be done with files. If +you have n sorted files, you can merge them into one sorted file. K Way Merging works best when comparing +is somewhat expensive. + +*/ + +#include <math.h> + +#include <functional> + +template <class T, class owner_t, class iterator_t, class comparitor = std::less<T> > +class kmerge_tree_c +{ + +public: + // create the tree with the given number of buckets. Call add() for each of the buckets + // and then call execute() to build things. Call get_top() then next() until get_top returns + // false. + kmerge_tree_c(long bucket_qty); + ~kmerge_tree_c(); + + // add a sorted collection to the tree + // + // begin/end - start end of a collection + void add(owner_t *owner, iterator_t begin, iterator_t end); + + // process the first sort + void execute(void); + + // advance to the next entry + void next(void); + + // return the next entry without re-processing the tree + // if false is returned, the merge is complete + bool get_top(owner_t *&owner, iterator_t& iterator) + { + if (top_node_ptr_mbr->has_iterator) { + owner = top_node_ptr_mbr->owner_ptr; + iterator = top_node_ptr_mbr->current_iterator; + return iterator != top_node_ptr_mbr->end_iterator; + } + else { + return false; + } + } + +private: + class node_rec + { + + public: + node_rec(void) : + left_child_ptr(NULL), + right_child_ptr(NULL), + parent_ptr(NULL), + next_ptr(NULL), + previous_ptr(NULL), + next_leaf_ptr(NULL), + source_node_ptr(NULL), + owner_ptr(NULL), + has_iterator(false) + { + } + ~node_rec() + { + delete left_child_ptr; + delete right_child_ptr; + } + + node_rec* left_child_ptr; // owned the left child of this node + node_rec* right_child_ptr; // owned the right child of this node + node_rec* parent_ptr; // copy the parent of this node + node_rec* next_ptr; // copy the next sibling + node_rec* previous_ptr; // copy the previous sibling + node_rec* next_leaf_ptr; // copy only for the bottom rows, a linked list of the buckets + node_rec* source_node_ptr; // copy ptr back to the node from whence this iterator came + owner_t *owner_ptr; + int has_iterator; + iterator_t current_iterator; + iterator_t end_iterator; + + private: + node_rec(const node_rec&); + node_rec& operator=(const node_rec&); + + }; + + void build_tree(void); + node_rec* build_levels(long number_of_levels); + void build_left_siblings(kmerge_tree_c::node_rec* node_ptr); + void build_right_siblings(kmerge_tree_c::node_rec* node_ptr); + void compare_nodes(kmerge_tree_c::node_rec* node_ptr); + + comparitor comparitor_mbr; + long bucket_qty_mbr; + long number_of_levels_mbr; + node_rec* top_node_ptr_mbr; // owned + node_rec* first_leaf_ptr; // copy + node_rec* last_leaf_ptr; // copy + +}; + +inline long kmerge_tree_brute_log2(long value) +{ + + long square = 2; + long count = 1; + while ( square < value ) + { + square *= 2; + ++count; + } + + return count; + +} // kmerge_tree_brute_log2 + +//~~~~~~~~~~class kmerge_tree_c + +template <class T, class owner_t, class iterator_t, class comparitor> +kmerge_tree_c<T, owner_t, iterator_t, comparitor>::kmerge_tree_c(long bucket_qty) : + bucket_qty_mbr(bucket_qty), + number_of_levels_mbr(bucket_qty ? ::kmerge_tree_brute_log2(bucket_qty_mbr) : 0), // don't add one - build_levels is zero based + top_node_ptr_mbr(NULL), + first_leaf_ptr(NULL), + last_leaf_ptr(NULL) +{ + + build_tree(); + +} + +template <class T, class owner_t, class iterator_t, class comparitor> +kmerge_tree_c<T, owner_t, iterator_t, comparitor>::~kmerge_tree_c() +{ + + delete top_node_ptr_mbr; + +} + +/* + Unlike the book, I'm going to make my life easy + by maintaining every possible link to each node that I might need +*/ +template <class T, class owner_t, class iterator_t, class comparitor> +void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_tree(void) +{ + + // the book says that the number of levels is (log2 * k) + 1 + top_node_ptr_mbr = build_levels(number_of_levels_mbr); + if ( top_node_ptr_mbr ) + { + build_left_siblings(top_node_ptr_mbr); + build_right_siblings(top_node_ptr_mbr); + } + +} + +/* + highly recursive tree builder + as long as number_of_levels isn't zero, each node builds its own children + and updates the parent link for them. + + If no children are to be built (i.e. number_of_levels is 0), then the leaf linked list is created +*/ +template <class T, class owner_t, class iterator_t, class comparitor> +typename kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec * +kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_levels(long number_of_levels) +{ + + node_rec* node_ptr = new node_rec; + + if ( number_of_levels ) + { + node_ptr->left_child_ptr = build_levels(number_of_levels - 1); + if ( node_ptr->left_child_ptr ) + { + node_ptr->left_child_ptr->parent_ptr = node_ptr; + node_ptr->right_child_ptr = build_levels(number_of_levels - 1); + if ( node_ptr->right_child_ptr ) + { + node_ptr->right_child_ptr->parent_ptr = node_ptr; + } + } + } + else + { + if ( last_leaf_ptr ) + { + last_leaf_ptr->next_leaf_ptr = node_ptr; + last_leaf_ptr = node_ptr; + } + else + { + first_leaf_ptr = last_leaf_ptr = node_ptr; + } + } + + return node_ptr; + +} + +/* + when we process a comparison, we'll have to compare two siblings + this code matches each link with its sibling + + To get every node correct, I had to write two routines: one which works + with left nodes and one which works with right nodes. build_tree() starts it + off with the top node's children and then these two swing back and forth + between each other. +*/ + +template <class T, class owner_t, class iterator_t, class comparitor> +void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_left_siblings(kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec* node_ptr) +{ + + if ( node_ptr->parent_ptr ) + { + if ( node_ptr->parent_ptr->right_child_ptr ) + { + node_ptr->next_ptr = node_ptr->parent_ptr->right_child_ptr; + node_ptr->parent_ptr->right_child_ptr->previous_ptr = node_ptr; + } + } + + if ( node_ptr->left_child_ptr ) + { + build_left_siblings(node_ptr->left_child_ptr); + } + + if ( node_ptr->right_child_ptr ) + { + build_right_siblings(node_ptr->right_child_ptr); + } + +} + +template <class T, class owner_t, class iterator_t, class comparitor> +void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_right_siblings(kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec* node_ptr) +{ + + if ( node_ptr->parent_ptr ) + { + if ( node_ptr->parent_ptr->left_child_ptr ) + { + node_ptr->previous_ptr = node_ptr->parent_ptr->left_child_ptr; + node_ptr->parent_ptr->left_child_ptr->next_ptr = node_ptr; + } + } + + if ( node_ptr->right_child_ptr ) + { + build_right_siblings(node_ptr->right_child_ptr); + } + + if ( node_ptr->left_child_ptr ) + { + build_left_siblings(node_ptr->left_child_ptr); + } + +} + +// fill the leaf linked list +template <class T, class owner_t, class iterator_t, class comparitor> +void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::add(owner_t *owner, iterator_t begin, iterator_t end) +{ + + if ( begin == end ) + { + return; + } + + node_rec* node_ptr = first_leaf_ptr; + while ( node_ptr ) + { + if ( node_ptr->has_iterator ) + { + node_ptr = node_ptr->next_leaf_ptr; + } + else + { + node_ptr->has_iterator = true; + node_ptr->owner_ptr = owner; + node_ptr->current_iterator = begin; + node_ptr->end_iterator = end; + break; + } + } +} + +/* + fill the initial tree by comparing each sibling in the + leaf linked list and then factoring that up to the parents + This is only done once so it doesn't have to be that efficient +*/ +template <class T, class owner_t, class iterator_t, class comparitor> +void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::execute(void) +{ + + for ( long parent_level = 0; parent_level < number_of_levels_mbr; ++parent_level ) + { + // the number of nodes to skip is (parent level + 1) ^ 2 - 1 + long skip_nodes = 2; + for ( long skip = 0; skip < parent_level; ++skip ) + { + skip_nodes *= 2; + } + --skip_nodes; + + node_rec* node_ptr = first_leaf_ptr; + while ( node_ptr ) + { + node_rec* parent_ptr = node_ptr; + long i; + for ( i = 0; i < parent_level; ++i ) + { + parent_ptr = parent_ptr->parent_ptr; + } + + compare_nodes(parent_ptr); + + for ( i = 0; i < skip_nodes; ++i ) + { + node_ptr = node_ptr->next_leaf_ptr; + } + + node_ptr = node_ptr->next_leaf_ptr; + } + } + +} + +// compare the given node and its sibling and bubble the result up to the parent +template <class T, class owner_t, class iterator_t, class comparitor> +void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::compare_nodes(kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec* node_ptr) +{ + +// assert(node_ptr->parent_ptr); + + node_rec* node1_ptr = NULL; + node_rec* node2_ptr = NULL; + if ( node_ptr->next_ptr ) + { + node1_ptr = node_ptr; + node2_ptr = node_ptr->next_ptr; + } + else + { + node1_ptr = node_ptr->previous_ptr; + node2_ptr = node_ptr; + } + + long result; + if ( !node2_ptr->has_iterator ) { + result = -1; + } + else if ( !node1_ptr->has_iterator) { + result = 1; + } + else if ( (node1_ptr->current_iterator != node1_ptr->end_iterator) && (node2_ptr->current_iterator != node2_ptr->end_iterator) ) + { + // no need to check for exact equality - we just want the lesser of the two + result = comparitor_mbr(*node1_ptr->current_iterator, *node2_ptr->current_iterator) ? -1 : 1; + } + else if ( node1_ptr->current_iterator != node1_ptr->end_iterator ) + { + result = -1; + } + else // if ( node2_ptr->current_iterator != node2_ptr->end_iterator ) + { + result = 1; + } + + node_rec* winner_ptr = (result <= 0) ? node1_ptr : node2_ptr; + node_rec* parent_ptr = node_ptr->parent_ptr; + + parent_ptr->owner_ptr = winner_ptr->owner_ptr; + parent_ptr->has_iterator = winner_ptr->has_iterator; + parent_ptr->current_iterator = winner_ptr->current_iterator; + parent_ptr->end_iterator = winner_ptr->end_iterator; + parent_ptr->source_node_ptr = winner_ptr; + +} + +/* + pop the top node, factor it down the list to find its + leaf, replace its leaf and then factor that back up +*/ +template <class T, class owner_t, class iterator_t, class comparitor> +void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::next(void) +{ + + if ( top_node_ptr_mbr->current_iterator == top_node_ptr_mbr->end_iterator ) + { + return; + } + + // find the source leaf ptr + node_rec* node_ptr = top_node_ptr_mbr; + while ( node_ptr->source_node_ptr ) + { + node_ptr = node_ptr->source_node_ptr; + } + +// assert(!node_ptr->left_child_ptr && !node_ptr->right_child_ptr); +// assert(top_node_ptr_mbr->current_iterator == node_ptr->current_iterator); + + ++node_ptr->current_iterator; + + while ( node_ptr->parent_ptr ) + { + compare_nodes(node_ptr); + node_ptr = node_ptr->parent_ptr; + } + +} + +#endif // KMERGE_TREE_H + |