summaryrefslogtreecommitdiffstats
path: root/src/k_merge_tree.h
blob: 2310628d82db07f3ff7b991d0b3d596a7ca5b0a7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
/*

	K-Way Merge Template
	By Jordan Zimmerman
	
*/

#ifndef KMERGE_TREE_H
#define KMERGE_TREE_H

/*

K-Way Merge

An implementation of "k-Way Merging" as described in "Fundamentals of Data Structures" by Horowitz/Sahni.

The idea is to merge k sorted arrays limiting the number of comparisons. A tree is built containing the 
results of comparing the heads of each array. The top most node is always the smallest entry. Then, its 
corresponding leaf in the tree is refilled and the tree is processed again. It's easier to see in the 
following example:

Imagine 4 sorted arrays:
{5, 10, 15, 20}
{10, 13, 16, 19}
{2, 19, 26, 40}
{18, 22, 23, 24}

The initial tree looks like this:


           2
       /       \
     2           5
   /   \       /   \
 18     2    10     5
 
The '/' and '\' represent links. The bottom row has the leaves and they contain the heads of the arrays. 
The rows above the leaves represent the smaller of the two child nodes. Thus, the top node is the smallest. 
To process the next iteration, the top node gets popped and its leaf gets refilled. Then, the new leaf's 
associated nodes are processed. So, after the 2 is taken, it is filled with 19 (the next head of its array). 
After processing, the tree looks like this:


           5
       /       \
     18          5
   /   \       /   \
 18    19    10     5
 
So, you can see how the number of comparisons is reduced. 

A good use of this is when you have a very large array that needs to be sorted. Break it up into n small 
arrays and sort those. Then use this merge sort for the final sort. This can also be done with files. If 
you have n sorted files, you can merge them into one sorted file. K Way Merging works best when comparing 
is somewhat expensive.

*/

#include <math.h>

#include <functional>

template <class T, class owner_t, class iterator_t, class comparitor = std::less<T> > 
class kmerge_tree_c
{

public:
	// create the tree with the given number of buckets. Call add() for each of the buckets
	// and then call execute() to build things. Call get_top() then next() until get_top returns
	// false.
	kmerge_tree_c(long bucket_qty);
	~kmerge_tree_c();
	
	// add a sorted collection to the tree
	//
	// begin/end - start end of a collection
	void			add(owner_t *owner, iterator_t begin, iterator_t end);
	
	// process the first sort
	void			execute(void);
	
	// advance to the next entry
	void			next(void);
	
	// return the next entry without re-processing the tree
	// if false is returned, the merge is complete
	bool			get_top(owner_t *&owner, iterator_t& iterator)
	{
        if (top_node_ptr_mbr->has_iterator) {
            owner = top_node_ptr_mbr->owner_ptr;
            iterator = top_node_ptr_mbr->current_iterator;
            return iterator != top_node_ptr_mbr->end_iterator;
        }
        else {
            return false;
        }
	}
	
private:
	class node_rec
	{
	
	public:
		node_rec(void) :
			left_child_ptr(NULL),
			right_child_ptr(NULL),
			parent_ptr(NULL),
			next_ptr(NULL),
			previous_ptr(NULL),
			next_leaf_ptr(NULL),
			source_node_ptr(NULL),
            owner_ptr(NULL),
			has_iterator(false)
		{
		}
		~node_rec()
		{
			delete left_child_ptr;
			delete right_child_ptr;
		}
		
		node_rec*				left_child_ptr;		// owned	the left child of this node
		node_rec*				right_child_ptr;	// owned	the right child of this node
		node_rec*				parent_ptr;			// copy		the parent of this node
		node_rec*				next_ptr;			// copy		the next sibling
		node_rec*				previous_ptr;		// copy		the previous sibling
		node_rec*				next_leaf_ptr;		// copy		only for the bottom rows, a linked list of the buckets
		node_rec*				source_node_ptr;	// copy		ptr back to the node from whence this iterator came
        owner_t                 *owner_ptr;
		int						has_iterator;
		iterator_t				current_iterator;
		iterator_t				end_iterator;

	private:
		node_rec(const node_rec&);
		node_rec&	operator=(const node_rec&);

	};
	
	void				build_tree(void);	
	node_rec*			build_levels(long number_of_levels);
	void				build_left_siblings(kmerge_tree_c::node_rec* node_ptr);
	void				build_right_siblings(kmerge_tree_c::node_rec* node_ptr);
	void				compare_nodes(kmerge_tree_c::node_rec* node_ptr);

	comparitor	comparitor_mbr;
	long		bucket_qty_mbr;
	long		number_of_levels_mbr;
	node_rec*	top_node_ptr_mbr;	// owned
	node_rec*	first_leaf_ptr;		// copy
	node_rec*	last_leaf_ptr;		// copy

};

inline long kmerge_tree_brute_log2(long value)
{

        long    square = 2;
        long    count = 1;
        while ( square < value )
        {
                square *= 2;
                ++count;
        }
        
        return count;

} // kmerge_tree_brute_log2

//~~~~~~~~~~class kmerge_tree_c

template <class T, class owner_t, class iterator_t, class comparitor> 
kmerge_tree_c<T, owner_t, iterator_t, comparitor>::kmerge_tree_c(long bucket_qty) :
	bucket_qty_mbr(bucket_qty),
    number_of_levels_mbr(bucket_qty ? ::kmerge_tree_brute_log2(bucket_qty_mbr) : 0),    // don't add one - build_levels is zero based
	top_node_ptr_mbr(NULL),
	first_leaf_ptr(NULL),
	last_leaf_ptr(NULL)
{

	build_tree();

}

template <class T, class owner_t, class iterator_t, class comparitor> 
kmerge_tree_c<T, owner_t, iterator_t, comparitor>::~kmerge_tree_c()
{

	delete top_node_ptr_mbr;

}

/*
	Unlike the book, I'm going to make my life easy
	by maintaining every possible link to each node that I might need
*/
template <class T, class owner_t, class iterator_t, class comparitor> 
void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_tree(void)
{

	// the book says that the number of levels is (log2 * k) + 1
	top_node_ptr_mbr = build_levels(number_of_levels_mbr);
	if ( top_node_ptr_mbr )
	{
		build_left_siblings(top_node_ptr_mbr);
		build_right_siblings(top_node_ptr_mbr);
	}

}

/*
	highly recursive tree builder
	as long as number_of_levels isn't zero, each node builds its own children
	and updates the parent link for them. 
	
	If no children are to be built (i.e. number_of_levels is 0), then the leaf linked list is created
*/
template <class T, class owner_t, class iterator_t, class comparitor> 
typename kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec *
kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_levels(long number_of_levels)
{

	node_rec*		node_ptr = new node_rec;

	if ( number_of_levels )
	{
		node_ptr->left_child_ptr = build_levels(number_of_levels - 1);
		if ( node_ptr->left_child_ptr )
		{
			node_ptr->left_child_ptr->parent_ptr = node_ptr;
			node_ptr->right_child_ptr = build_levels(number_of_levels - 1);
			if ( node_ptr->right_child_ptr )
			{
				node_ptr->right_child_ptr->parent_ptr = node_ptr;
			}
		}
	}
	else
	{
		if ( last_leaf_ptr )
		{
			last_leaf_ptr->next_leaf_ptr = node_ptr;
			last_leaf_ptr = node_ptr;
		}
		else
		{
			first_leaf_ptr = last_leaf_ptr = node_ptr;
		}
	}
	
	return node_ptr;

}

/*
	when we process a comparison, we'll have to compare two siblings
	this code matches each link with its sibling
	
	To get every node correct, I had to write two routines: one which works
	with left nodes and one which works with right nodes. build_tree() starts it
	off with the top node's children and then these two swing back and forth
	between each other.
*/

template <class T, class owner_t, class iterator_t, class comparitor> 
void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_left_siblings(kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec* node_ptr)
{

	if ( node_ptr->parent_ptr )
	{
		if ( node_ptr->parent_ptr->right_child_ptr )
		{
			node_ptr->next_ptr = node_ptr->parent_ptr->right_child_ptr;
			node_ptr->parent_ptr->right_child_ptr->previous_ptr = node_ptr;
		}
	}
	
	if ( node_ptr->left_child_ptr )
	{
		build_left_siblings(node_ptr->left_child_ptr);
	}
	
	if ( node_ptr->right_child_ptr )
	{
		build_right_siblings(node_ptr->right_child_ptr);
	}

}

template <class T, class owner_t, class iterator_t, class comparitor> 
void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_right_siblings(kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec* node_ptr)
{

	if ( node_ptr->parent_ptr )
	{
		if ( node_ptr->parent_ptr->left_child_ptr )
		{
			node_ptr->previous_ptr = node_ptr->parent_ptr->left_child_ptr;
			node_ptr->parent_ptr->left_child_ptr->next_ptr = node_ptr;
		}
	}
	
	if ( node_ptr->right_child_ptr )
	{
		build_right_siblings(node_ptr->right_child_ptr);
	}

	if ( node_ptr->left_child_ptr )
	{
		build_left_siblings(node_ptr->left_child_ptr);
	}

}

// fill the leaf linked list
template <class T, class owner_t, class iterator_t, class comparitor> 
void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::add(owner_t *owner, iterator_t begin, iterator_t end)
{

	if ( begin == end )
	{
		return;
	}

	node_rec*		node_ptr = first_leaf_ptr;
	while ( node_ptr )
	{
		if ( node_ptr->has_iterator )
		{
			node_ptr = node_ptr->next_leaf_ptr;
		}
		else
		{
			node_ptr->has_iterator = true;
            node_ptr->owner_ptr = owner;
			node_ptr->current_iterator = begin;
			node_ptr->end_iterator = end;
			break;
		}
	}
}

/*
	fill the initial tree by comparing each sibling in the
	leaf linked list and then factoring that up to the parents
	This is only done once so it doesn't have to be that efficient
*/
template <class T, class owner_t, class iterator_t, class comparitor> 
void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::execute(void)
{

	for ( long parent_level = 0; parent_level < number_of_levels_mbr; ++parent_level )
	{
		// the number of nodes to skip is (parent level + 1) ^ 2 - 1
		long			skip_nodes = 2;
		for ( long skip = 0; skip < parent_level; ++skip )
		{
			skip_nodes *= 2;
		}
		--skip_nodes;

		node_rec*		node_ptr = first_leaf_ptr;
		while ( node_ptr )
		{
			node_rec*	parent_ptr = node_ptr;
			long		i;
			for ( i = 0; i < parent_level; ++i )
			{
				parent_ptr = parent_ptr->parent_ptr;
			}
		
			compare_nodes(parent_ptr);
			
			for ( i = 0; i < skip_nodes; ++i )
			{
				node_ptr = node_ptr->next_leaf_ptr;
			}
			
			node_ptr = node_ptr->next_leaf_ptr;
		}
	}

}

// compare the given node and its sibling and bubble the result up to the parent
template <class T, class owner_t, class iterator_t, class comparitor> 
void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::compare_nodes(kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec* node_ptr)
{

//	assert(node_ptr->parent_ptr);

	node_rec*	node1_ptr = NULL;
	node_rec*	node2_ptr = NULL;
	if ( node_ptr->next_ptr )
	{
		node1_ptr = node_ptr;
		node2_ptr = node_ptr->next_ptr;
	}
	else
	{
		node1_ptr = node_ptr->previous_ptr;
		node2_ptr = node_ptr;
	}
	
	long				result;
    if ( !node2_ptr->has_iterator ) {
        result = -1;
    }
    else if ( !node1_ptr->has_iterator) {
        result = 1;
    }
	else if ( (node1_ptr->current_iterator != node1_ptr->end_iterator) && (node2_ptr->current_iterator != node2_ptr->end_iterator) )
	{
		// no need to check for exact equality - we just want the lesser of the two
		result = comparitor_mbr(*node1_ptr->current_iterator, *node2_ptr->current_iterator) ? -1 : 1;
	}
	else if ( node1_ptr->current_iterator != node1_ptr->end_iterator )
	{
		result = -1;
	}
	else // if ( node2_ptr->current_iterator != node2_ptr->end_iterator )
	{
		result = 1;
	}

	node_rec*	winner_ptr = (result <= 0) ? node1_ptr : node2_ptr; 
	node_rec*	parent_ptr = node_ptr->parent_ptr;
	
    parent_ptr->owner_ptr = winner_ptr->owner_ptr;
    parent_ptr->has_iterator = winner_ptr->has_iterator;
	parent_ptr->current_iterator = winner_ptr->current_iterator;
	parent_ptr->end_iterator = winner_ptr->end_iterator;
	parent_ptr->source_node_ptr = winner_ptr;

}

/*
	pop the top node, factor it down the list to find its
	leaf, replace its leaf and then factor that back up
*/
template <class T, class owner_t, class iterator_t, class comparitor> 
void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::next(void)
{

	if ( top_node_ptr_mbr->current_iterator == top_node_ptr_mbr->end_iterator )
	{
		return;
	}

	// find the source leaf ptr
	node_rec*	node_ptr = top_node_ptr_mbr;
	while ( node_ptr->source_node_ptr )
	{
		node_ptr = node_ptr->source_node_ptr;
	}
	
//	assert(!node_ptr->left_child_ptr && !node_ptr->right_child_ptr);
//	assert(top_node_ptr_mbr->current_iterator == node_ptr->current_iterator);
	
	++node_ptr->current_iterator;
	
	while ( node_ptr->parent_ptr )
	{
		compare_nodes(node_ptr);
		node_ptr = node_ptr->parent_ptr;
	}

}

#endif	// KMERGE_TREE_H