summaryrefslogtreecommitdiffstats
path: root/src/k_merge_tree.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/k_merge_tree.h')
-rw-r--r--src/k_merge_tree.h471
1 files changed, 471 insertions, 0 deletions
diff --git a/src/k_merge_tree.h b/src/k_merge_tree.h
new file mode 100644
index 0000000..2310628
--- /dev/null
+++ b/src/k_merge_tree.h
@@ -0,0 +1,471 @@
+/*
+
+ K-Way Merge Template
+ By Jordan Zimmerman
+
+*/
+
+#ifndef KMERGE_TREE_H
+#define KMERGE_TREE_H
+
+/*
+
+K-Way Merge
+
+An implementation of "k-Way Merging" as described in "Fundamentals of Data Structures" by Horowitz/Sahni.
+
+The idea is to merge k sorted arrays limiting the number of comparisons. A tree is built containing the
+results of comparing the heads of each array. The top most node is always the smallest entry. Then, its
+corresponding leaf in the tree is refilled and the tree is processed again. It's easier to see in the
+following example:
+
+Imagine 4 sorted arrays:
+{5, 10, 15, 20}
+{10, 13, 16, 19}
+{2, 19, 26, 40}
+{18, 22, 23, 24}
+
+The initial tree looks like this:
+
+
+ 2
+ / \
+ 2 5
+ / \ / \
+ 18 2 10 5
+
+The '/' and '\' represent links. The bottom row has the leaves and they contain the heads of the arrays.
+The rows above the leaves represent the smaller of the two child nodes. Thus, the top node is the smallest.
+To process the next iteration, the top node gets popped and its leaf gets refilled. Then, the new leaf's
+associated nodes are processed. So, after the 2 is taken, it is filled with 19 (the next head of its array).
+After processing, the tree looks like this:
+
+
+ 5
+ / \
+ 18 5
+ / \ / \
+ 18 19 10 5
+
+So, you can see how the number of comparisons is reduced.
+
+A good use of this is when you have a very large array that needs to be sorted. Break it up into n small
+arrays and sort those. Then use this merge sort for the final sort. This can also be done with files. If
+you have n sorted files, you can merge them into one sorted file. K Way Merging works best when comparing
+is somewhat expensive.
+
+*/
+
+#include <math.h>
+
+#include <functional>
+
+template <class T, class owner_t, class iterator_t, class comparitor = std::less<T> >
+class kmerge_tree_c
+{
+
+public:
+ // create the tree with the given number of buckets. Call add() for each of the buckets
+ // and then call execute() to build things. Call get_top() then next() until get_top returns
+ // false.
+ kmerge_tree_c(long bucket_qty);
+ ~kmerge_tree_c();
+
+ // add a sorted collection to the tree
+ //
+ // begin/end - start end of a collection
+ void add(owner_t *owner, iterator_t begin, iterator_t end);
+
+ // process the first sort
+ void execute(void);
+
+ // advance to the next entry
+ void next(void);
+
+ // return the next entry without re-processing the tree
+ // if false is returned, the merge is complete
+ bool get_top(owner_t *&owner, iterator_t& iterator)
+ {
+ if (top_node_ptr_mbr->has_iterator) {
+ owner = top_node_ptr_mbr->owner_ptr;
+ iterator = top_node_ptr_mbr->current_iterator;
+ return iterator != top_node_ptr_mbr->end_iterator;
+ }
+ else {
+ return false;
+ }
+ }
+
+private:
+ class node_rec
+ {
+
+ public:
+ node_rec(void) :
+ left_child_ptr(NULL),
+ right_child_ptr(NULL),
+ parent_ptr(NULL),
+ next_ptr(NULL),
+ previous_ptr(NULL),
+ next_leaf_ptr(NULL),
+ source_node_ptr(NULL),
+ owner_ptr(NULL),
+ has_iterator(false)
+ {
+ }
+ ~node_rec()
+ {
+ delete left_child_ptr;
+ delete right_child_ptr;
+ }
+
+ node_rec* left_child_ptr; // owned the left child of this node
+ node_rec* right_child_ptr; // owned the right child of this node
+ node_rec* parent_ptr; // copy the parent of this node
+ node_rec* next_ptr; // copy the next sibling
+ node_rec* previous_ptr; // copy the previous sibling
+ node_rec* next_leaf_ptr; // copy only for the bottom rows, a linked list of the buckets
+ node_rec* source_node_ptr; // copy ptr back to the node from whence this iterator came
+ owner_t *owner_ptr;
+ int has_iterator;
+ iterator_t current_iterator;
+ iterator_t end_iterator;
+
+ private:
+ node_rec(const node_rec&);
+ node_rec& operator=(const node_rec&);
+
+ };
+
+ void build_tree(void);
+ node_rec* build_levels(long number_of_levels);
+ void build_left_siblings(kmerge_tree_c::node_rec* node_ptr);
+ void build_right_siblings(kmerge_tree_c::node_rec* node_ptr);
+ void compare_nodes(kmerge_tree_c::node_rec* node_ptr);
+
+ comparitor comparitor_mbr;
+ long bucket_qty_mbr;
+ long number_of_levels_mbr;
+ node_rec* top_node_ptr_mbr; // owned
+ node_rec* first_leaf_ptr; // copy
+ node_rec* last_leaf_ptr; // copy
+
+};
+
+inline long kmerge_tree_brute_log2(long value)
+{
+
+ long square = 2;
+ long count = 1;
+ while ( square < value )
+ {
+ square *= 2;
+ ++count;
+ }
+
+ return count;
+
+} // kmerge_tree_brute_log2
+
+//~~~~~~~~~~class kmerge_tree_c
+
+template <class T, class owner_t, class iterator_t, class comparitor>
+kmerge_tree_c<T, owner_t, iterator_t, comparitor>::kmerge_tree_c(long bucket_qty) :
+ bucket_qty_mbr(bucket_qty),
+ number_of_levels_mbr(bucket_qty ? ::kmerge_tree_brute_log2(bucket_qty_mbr) : 0), // don't add one - build_levels is zero based
+ top_node_ptr_mbr(NULL),
+ first_leaf_ptr(NULL),
+ last_leaf_ptr(NULL)
+{
+
+ build_tree();
+
+}
+
+template <class T, class owner_t, class iterator_t, class comparitor>
+kmerge_tree_c<T, owner_t, iterator_t, comparitor>::~kmerge_tree_c()
+{
+
+ delete top_node_ptr_mbr;
+
+}
+
+/*
+ Unlike the book, I'm going to make my life easy
+ by maintaining every possible link to each node that I might need
+*/
+template <class T, class owner_t, class iterator_t, class comparitor>
+void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_tree(void)
+{
+
+ // the book says that the number of levels is (log2 * k) + 1
+ top_node_ptr_mbr = build_levels(number_of_levels_mbr);
+ if ( top_node_ptr_mbr )
+ {
+ build_left_siblings(top_node_ptr_mbr);
+ build_right_siblings(top_node_ptr_mbr);
+ }
+
+}
+
+/*
+ highly recursive tree builder
+ as long as number_of_levels isn't zero, each node builds its own children
+ and updates the parent link for them.
+
+ If no children are to be built (i.e. number_of_levels is 0), then the leaf linked list is created
+*/
+template <class T, class owner_t, class iterator_t, class comparitor>
+typename kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec *
+kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_levels(long number_of_levels)
+{
+
+ node_rec* node_ptr = new node_rec;
+
+ if ( number_of_levels )
+ {
+ node_ptr->left_child_ptr = build_levels(number_of_levels - 1);
+ if ( node_ptr->left_child_ptr )
+ {
+ node_ptr->left_child_ptr->parent_ptr = node_ptr;
+ node_ptr->right_child_ptr = build_levels(number_of_levels - 1);
+ if ( node_ptr->right_child_ptr )
+ {
+ node_ptr->right_child_ptr->parent_ptr = node_ptr;
+ }
+ }
+ }
+ else
+ {
+ if ( last_leaf_ptr )
+ {
+ last_leaf_ptr->next_leaf_ptr = node_ptr;
+ last_leaf_ptr = node_ptr;
+ }
+ else
+ {
+ first_leaf_ptr = last_leaf_ptr = node_ptr;
+ }
+ }
+
+ return node_ptr;
+
+}
+
+/*
+ when we process a comparison, we'll have to compare two siblings
+ this code matches each link with its sibling
+
+ To get every node correct, I had to write two routines: one which works
+ with left nodes and one which works with right nodes. build_tree() starts it
+ off with the top node's children and then these two swing back and forth
+ between each other.
+*/
+
+template <class T, class owner_t, class iterator_t, class comparitor>
+void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_left_siblings(kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec* node_ptr)
+{
+
+ if ( node_ptr->parent_ptr )
+ {
+ if ( node_ptr->parent_ptr->right_child_ptr )
+ {
+ node_ptr->next_ptr = node_ptr->parent_ptr->right_child_ptr;
+ node_ptr->parent_ptr->right_child_ptr->previous_ptr = node_ptr;
+ }
+ }
+
+ if ( node_ptr->left_child_ptr )
+ {
+ build_left_siblings(node_ptr->left_child_ptr);
+ }
+
+ if ( node_ptr->right_child_ptr )
+ {
+ build_right_siblings(node_ptr->right_child_ptr);
+ }
+
+}
+
+template <class T, class owner_t, class iterator_t, class comparitor>
+void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_right_siblings(kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec* node_ptr)
+{
+
+ if ( node_ptr->parent_ptr )
+ {
+ if ( node_ptr->parent_ptr->left_child_ptr )
+ {
+ node_ptr->previous_ptr = node_ptr->parent_ptr->left_child_ptr;
+ node_ptr->parent_ptr->left_child_ptr->next_ptr = node_ptr;
+ }
+ }
+
+ if ( node_ptr->right_child_ptr )
+ {
+ build_right_siblings(node_ptr->right_child_ptr);
+ }
+
+ if ( node_ptr->left_child_ptr )
+ {
+ build_left_siblings(node_ptr->left_child_ptr);
+ }
+
+}
+
+// fill the leaf linked list
+template <class T, class owner_t, class iterator_t, class comparitor>
+void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::add(owner_t *owner, iterator_t begin, iterator_t end)
+{
+
+ if ( begin == end )
+ {
+ return;
+ }
+
+ node_rec* node_ptr = first_leaf_ptr;
+ while ( node_ptr )
+ {
+ if ( node_ptr->has_iterator )
+ {
+ node_ptr = node_ptr->next_leaf_ptr;
+ }
+ else
+ {
+ node_ptr->has_iterator = true;
+ node_ptr->owner_ptr = owner;
+ node_ptr->current_iterator = begin;
+ node_ptr->end_iterator = end;
+ break;
+ }
+ }
+}
+
+/*
+ fill the initial tree by comparing each sibling in the
+ leaf linked list and then factoring that up to the parents
+ This is only done once so it doesn't have to be that efficient
+*/
+template <class T, class owner_t, class iterator_t, class comparitor>
+void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::execute(void)
+{
+
+ for ( long parent_level = 0; parent_level < number_of_levels_mbr; ++parent_level )
+ {
+ // the number of nodes to skip is (parent level + 1) ^ 2 - 1
+ long skip_nodes = 2;
+ for ( long skip = 0; skip < parent_level; ++skip )
+ {
+ skip_nodes *= 2;
+ }
+ --skip_nodes;
+
+ node_rec* node_ptr = first_leaf_ptr;
+ while ( node_ptr )
+ {
+ node_rec* parent_ptr = node_ptr;
+ long i;
+ for ( i = 0; i < parent_level; ++i )
+ {
+ parent_ptr = parent_ptr->parent_ptr;
+ }
+
+ compare_nodes(parent_ptr);
+
+ for ( i = 0; i < skip_nodes; ++i )
+ {
+ node_ptr = node_ptr->next_leaf_ptr;
+ }
+
+ node_ptr = node_ptr->next_leaf_ptr;
+ }
+ }
+
+}
+
+// compare the given node and its sibling and bubble the result up to the parent
+template <class T, class owner_t, class iterator_t, class comparitor>
+void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::compare_nodes(kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec* node_ptr)
+{
+
+// assert(node_ptr->parent_ptr);
+
+ node_rec* node1_ptr = NULL;
+ node_rec* node2_ptr = NULL;
+ if ( node_ptr->next_ptr )
+ {
+ node1_ptr = node_ptr;
+ node2_ptr = node_ptr->next_ptr;
+ }
+ else
+ {
+ node1_ptr = node_ptr->previous_ptr;
+ node2_ptr = node_ptr;
+ }
+
+ long result;
+ if ( !node2_ptr->has_iterator ) {
+ result = -1;
+ }
+ else if ( !node1_ptr->has_iterator) {
+ result = 1;
+ }
+ else if ( (node1_ptr->current_iterator != node1_ptr->end_iterator) && (node2_ptr->current_iterator != node2_ptr->end_iterator) )
+ {
+ // no need to check for exact equality - we just want the lesser of the two
+ result = comparitor_mbr(*node1_ptr->current_iterator, *node2_ptr->current_iterator) ? -1 : 1;
+ }
+ else if ( node1_ptr->current_iterator != node1_ptr->end_iterator )
+ {
+ result = -1;
+ }
+ else // if ( node2_ptr->current_iterator != node2_ptr->end_iterator )
+ {
+ result = 1;
+ }
+
+ node_rec* winner_ptr = (result <= 0) ? node1_ptr : node2_ptr;
+ node_rec* parent_ptr = node_ptr->parent_ptr;
+
+ parent_ptr->owner_ptr = winner_ptr->owner_ptr;
+ parent_ptr->has_iterator = winner_ptr->has_iterator;
+ parent_ptr->current_iterator = winner_ptr->current_iterator;
+ parent_ptr->end_iterator = winner_ptr->end_iterator;
+ parent_ptr->source_node_ptr = winner_ptr;
+
+}
+
+/*
+ pop the top node, factor it down the list to find its
+ leaf, replace its leaf and then factor that back up
+*/
+template <class T, class owner_t, class iterator_t, class comparitor>
+void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::next(void)
+{
+
+ if ( top_node_ptr_mbr->current_iterator == top_node_ptr_mbr->end_iterator )
+ {
+ return;
+ }
+
+ // find the source leaf ptr
+ node_rec* node_ptr = top_node_ptr_mbr;
+ while ( node_ptr->source_node_ptr )
+ {
+ node_ptr = node_ptr->source_node_ptr;
+ }
+
+// assert(!node_ptr->left_child_ptr && !node_ptr->right_child_ptr);
+// assert(top_node_ptr_mbr->current_iterator == node_ptr->current_iterator);
+
+ ++node_ptr->current_iterator;
+
+ while ( node_ptr->parent_ptr )
+ {
+ compare_nodes(node_ptr);
+ node_ptr = node_ptr->parent_ptr;
+ }
+
+}
+
+#endif // KMERGE_TREE_H
+