summaryrefslogtreecommitdiffstats
path: root/src/backend/lib/integerset.c
blob: 069a35fb23cd5005b8308f66ef45e5455d62072f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
/*-------------------------------------------------------------------------
 *
 * integerset.c
 *	  Data structure to hold a large set of 64-bit integers efficiently
 *
 * IntegerSet provides an in-memory data structure to hold a set of
 * arbitrary 64-bit integers.  Internally, the values are stored in a
 * B-tree, with a special packed representation at the leaf level using
 * the Simple-8b algorithm, which can pack clusters of nearby values
 * very tightly.
 *
 * Memory consumption depends on the number of values stored, but also
 * on how far the values are from each other.  In the best case, with
 * long runs of consecutive integers, memory consumption can be as low as
 * 0.1 bytes per integer.  In the worst case, if integers are more than
 * 2^32 apart, it uses about 8 bytes per integer.  In typical use, the
 * consumption per integer is somewhere between those extremes, depending
 * on the range of integers stored, and how "clustered" they are.
 *
 *
 * Interface
 * ---------
 *
 *	intset_create			- Create a new, empty set
 *	intset_add_member		- Add an integer to the set
 *	intset_is_member		- Test if an integer is in the set
 *	intset_begin_iterate	- Begin iterating through all integers in set
 *	intset_iterate_next		- Return next set member, if any
 *
 * intset_create() creates the set in the current memory context.  Subsequent
 * operations that add to the data structure will continue to allocate from
 * that same context, even if it's not current anymore.
 *
 * Note that there is no function to free an integer set.  If you need to do
 * that, create a dedicated memory context to hold it, and destroy the memory
 * context instead.
 *
 *
 * Limitations
 * -----------
 *
 * - Values must be added in order.  (Random insertions would require
 *   splitting nodes, which hasn't been implemented.)
 *
 * - Values cannot be added while iteration is in progress.
 *
 * - No support for removing values.
 *
 * None of these limitations are fundamental to the data structure, so they
 * could be lifted if needed, by writing some new code.  But the current
 * users of this facility don't need them.
 *
 *
 * References
 * ----------
 *
 * Simple-8b encoding is based on:
 *
 * Vo Ngoc Anh, Alistair Moffat, Index compression using 64-bit words,
 *   Software - Practice & Experience, v.40 n.2, p.131-147, February 2010
 *   (https://doi.org/10.1002/spe.948)
 *
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/lib/integerset.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/htup_details.h"
#include "lib/integerset.h"
#include "port/pg_bitutils.h"
#include "utils/memutils.h"


/*
 * Maximum number of integers that can be encoded in a single Simple-8b
 * codeword. (Defined here before anything else, so that we can size arrays
 * using this.)
 */
#define SIMPLE8B_MAX_VALUES_PER_CODEWORD 240

/*
 * Parameters for shape of the in-memory B-tree.
 *
 * These set the size of each internal and leaf node.  They don't necessarily
 * need to be the same, because the tree is just an in-memory structure.
 * With the default 64, each node is about 1 kb.
 *
 * If you change these, you must recalculate MAX_TREE_LEVELS, too!
 */
#define MAX_INTERNAL_ITEMS	64
#define MAX_LEAF_ITEMS	64

/*
 * Maximum height of the tree.
 *
 * MAX_TREE_ITEMS is calculated from the "fan-out" of the B-tree.  The
 * theoretical maximum number of items that we can store in a set is 2^64,
 * so MAX_TREE_LEVELS should be set so that:
 *
 *   MAX_LEAF_ITEMS * MAX_INTERNAL_ITEMS ^ (MAX_TREE_LEVELS - 1) >= 2^64.
 *
 * In practice, we'll need far fewer levels, because you will run out of
 * memory long before reaching that number, but let's be conservative.
 */
#define MAX_TREE_LEVELS		11

/*
 * Node structures, for the in-memory B-tree.
 *
 * An internal node holds a number of downlink pointers to leaf nodes, or
 * to internal nodes on a lower level.  For each downlink, the key value
 * corresponding to the lower level node is stored in a sorted array.  The
 * stored key values are low keys.  In other words, if the downlink has value
 * X, then all items stored on that child are >= X.
 *
 * Each leaf node holds a number of "items", with a varying number of
 * integers packed into each item.  Each item consists of two 64-bit words:
 * The first word holds the first integer stored in the item, in plain format.
 * The second word contains between 0 and 240 more integers, packed using
 * Simple-8b encoding.  By storing the first integer in plain, unpacked,
 * format, we can use binary search to quickly find an item that holds (or
 * would hold) a particular integer.  And by storing the rest in packed form,
 * we still get pretty good memory density, if there are clusters of integers
 * with similar values.
 *
 * Each leaf node also has a pointer to the next leaf node, so that the leaf
 * nodes can be easily walked from beginning to end when iterating.
 */
typedef struct intset_node intset_node;
typedef struct intset_leaf_node intset_leaf_node;
typedef struct intset_internal_node intset_internal_node;

/* Common structure of both leaf and internal nodes. */
struct intset_node
{
	uint16		level;			/* tree level of this node */
	uint16		num_items;		/* number of items in this node */
};

/* Internal node */
struct intset_internal_node
{
	/* common header, must match intset_node */
	uint16		level;			/* >= 1 on internal nodes */
	uint16		num_items;

	/*
	 * 'values' is an array of key values, and 'downlinks' are pointers to
	 * lower-level nodes, corresponding to the key values.
	 */
	uint64		values[MAX_INTERNAL_ITEMS];
	intset_node *downlinks[MAX_INTERNAL_ITEMS];
};

/* Leaf node */
typedef struct
{
	uint64		first;			/* first integer in this item */
	uint64		codeword;		/* simple8b encoded differences from 'first' */
} leaf_item;

#define MAX_VALUES_PER_LEAF_ITEM	(1 + SIMPLE8B_MAX_VALUES_PER_CODEWORD)

struct intset_leaf_node
{
	/* common header, must match intset_node */
	uint16		level;			/* 0 on leafs */
	uint16		num_items;

	intset_leaf_node *next;		/* right sibling, if any */

	leaf_item	items[MAX_LEAF_ITEMS];
};

/*
 * We buffer insertions in a simple array, before packing and inserting them
 * into the B-tree.  MAX_BUFFERED_VALUES sets the size of the buffer.  The
 * encoder assumes that it is large enough that we can always fill a leaf
 * item with buffered new items.  In other words, MAX_BUFFERED_VALUES must be
 * larger than MAX_VALUES_PER_LEAF_ITEM.  For efficiency, make it much larger.
 */
#define MAX_BUFFERED_VALUES			(MAX_VALUES_PER_LEAF_ITEM * 2)

/*
 * IntegerSet is the top-level object representing the set.
 *
 * The integers are stored in an in-memory B-tree structure, plus an array
 * for newly-added integers.  IntegerSet also tracks information about memory
 * usage, as well as the current position when iterating the set with
 * intset_begin_iterate / intset_iterate_next.
 */
struct IntegerSet
{
	/*
	 * 'context' is the memory context holding this integer set and all its
	 * tree nodes.
	 *
	 * 'mem_used' tracks the amount of memory used.  We don't do anything with
	 * it in integerset.c itself, but the callers can ask for it with
	 * intset_memory_usage().
	 */
	MemoryContext context;
	uint64		mem_used;

	uint64		num_entries;	/* total # of values in the set */
	uint64		highest_value;	/* highest value stored in this set */

	/*
	 * B-tree to hold the packed values.
	 *
	 * 'rightmost_nodes' hold pointers to the rightmost node on each level.
	 * rightmost_parent[0] is rightmost leaf, rightmost_parent[1] is its
	 * parent, and so forth, all the way up to the root. These are needed when
	 * adding new values. (Currently, we require that new values are added at
	 * the end.)
	 */
	int			num_levels;		/* height of the tree */
	intset_node *root;			/* root node */
	intset_node *rightmost_nodes[MAX_TREE_LEVELS];
	intset_leaf_node *leftmost_leaf;	/* leftmost leaf node */

	/*
	 * Holding area for new items that haven't been inserted to the tree yet.
	 */
	uint64		buffered_values[MAX_BUFFERED_VALUES];
	int			num_buffered_values;

	/*
	 * Iterator support.
	 *
	 * 'iter_values' is an array of integers ready to be returned to the
	 * caller; 'iter_num_values' is the length of that array, and
	 * 'iter_valueno' is the next index.  'iter_node' and 'iter_itemno' point
	 * to the leaf node, and item within the leaf node, to get the next batch
	 * of values from.
	 *
	 * Normally, 'iter_values' points to 'iter_values_buf', which holds items
	 * decoded from a leaf item.  But after we have scanned the whole B-tree,
	 * we iterate through all the unbuffered values, too, by pointing
	 * iter_values to 'buffered_values'.
	 */
	bool		iter_active;	/* is iteration in progress? */

	const uint64 *iter_values;
	int			iter_num_values;	/* number of elements in 'iter_values' */
	int			iter_valueno;	/* next index into 'iter_values' */

	intset_leaf_node *iter_node;	/* current leaf node */
	int			iter_itemno;	/* next item in 'iter_node' to decode */

	uint64		iter_values_buf[MAX_VALUES_PER_LEAF_ITEM];
};

/*
 * Prototypes for internal functions.
 */
static void intset_update_upper(IntegerSet *intset, int level,
								intset_node *child, uint64 child_key);
static void intset_flush_buffered_values(IntegerSet *intset);

static int	intset_binsrch_uint64(uint64 value, uint64 *arr, int arr_elems,
								  bool nextkey);
static int	intset_binsrch_leaf(uint64 value, leaf_item *arr, int arr_elems,
								bool nextkey);

static uint64 simple8b_encode(const uint64 *ints, int *num_encoded, uint64 base);
static int	simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base);
static bool simple8b_contains(uint64 codeword, uint64 key, uint64 base);


/*
 * Create a new, initially empty, integer set.
 *
 * The integer set is created in the current memory context.
 * We will do all subsequent allocations in the same context, too, regardless
 * of which memory context is current when new integers are added to the set.
 */
IntegerSet *
intset_create(void)
{
	IntegerSet *intset;

	intset = (IntegerSet *) palloc(sizeof(IntegerSet));
	intset->context = CurrentMemoryContext;
	intset->mem_used = GetMemoryChunkSpace(intset);

	intset->num_entries = 0;
	intset->highest_value = 0;

	intset->num_levels = 0;
	intset->root = NULL;
	memset(intset->rightmost_nodes, 0, sizeof(intset->rightmost_nodes));
	intset->leftmost_leaf = NULL;

	intset->num_buffered_values = 0;

	intset->iter_active = false;
	intset->iter_node = NULL;
	intset->iter_itemno = 0;
	intset->iter_valueno = 0;
	intset->iter_num_values = 0;
	intset->iter_values = NULL;

	return intset;
}

/*
 * Allocate a new node.
 */
static intset_internal_node *
intset_new_internal_node(IntegerSet *intset)
{
	intset_internal_node *n;

	n = (intset_internal_node *) MemoryContextAlloc(intset->context,
													sizeof(intset_internal_node));
	intset->mem_used += GetMemoryChunkSpace(n);

	n->level = 0;				/* caller must set */
	n->num_items = 0;

	return n;
}

static intset_leaf_node *
intset_new_leaf_node(IntegerSet *intset)
{
	intset_leaf_node *n;

	n = (intset_leaf_node *) MemoryContextAlloc(intset->context,
												sizeof(intset_leaf_node));
	intset->mem_used += GetMemoryChunkSpace(n);

	n->level = 0;
	n->num_items = 0;
	n->next = NULL;

	return n;
}

/*
 * Return the number of entries in the integer set.
 */
uint64
intset_num_entries(IntegerSet *intset)
{
	return intset->num_entries;
}

/*
 * Return the amount of memory used by the integer set.
 */
uint64
intset_memory_usage(IntegerSet *intset)
{
	return intset->mem_used;
}

/*
 * Add a value to the set.
 *
 * Values must be added in order.
 */
void
intset_add_member(IntegerSet *intset, uint64 x)
{
	if (intset->iter_active)
		elog(ERROR, "cannot add new values to integer set while iteration is in progress");

	if (x <= intset->highest_value && intset->num_entries > 0)
		elog(ERROR, "cannot add value to integer set out of order");

	if (intset->num_buffered_values >= MAX_BUFFERED_VALUES)
	{
		/* Time to flush our buffer */
		intset_flush_buffered_values(intset);
		Assert(intset->num_buffered_values < MAX_BUFFERED_VALUES);
	}

	/* Add it to the buffer of newly-added values */
	intset->buffered_values[intset->num_buffered_values] = x;
	intset->num_buffered_values++;
	intset->num_entries++;
	intset->highest_value = x;
}

/*
 * Take a batch of buffered values, and pack them into the B-tree.
 */
static void
intset_flush_buffered_values(IntegerSet *intset)
{
	uint64	   *values = intset->buffered_values;
	uint64		num_values = intset->num_buffered_values;
	int			num_packed = 0;
	intset_leaf_node *leaf;

	leaf = (intset_leaf_node *) intset->rightmost_nodes[0];

	/*
	 * If the tree is completely empty, create the first leaf page, which is
	 * also the root.
	 */
	if (leaf == NULL)
	{
		/*
		 * This is the very first item in the set.
		 *
		 * Allocate root node. It's also a leaf.
		 */
		leaf = intset_new_leaf_node(intset);

		intset->root = (intset_node *) leaf;
		intset->leftmost_leaf = leaf;
		intset->rightmost_nodes[0] = (intset_node *) leaf;
		intset->num_levels = 1;
	}

	/*
	 * If there are less than MAX_VALUES_PER_LEAF_ITEM values in the buffer,
	 * stop.  In most cases, we cannot encode that many values in a single
	 * value, but this way, the encoder doesn't have to worry about running
	 * out of input.
	 */
	while (num_values - num_packed >= MAX_VALUES_PER_LEAF_ITEM)
	{
		leaf_item	item;
		int			num_encoded;

		/*
		 * Construct the next leaf item, packing as many buffered values as
		 * possible.
		 */
		item.first = values[num_packed];
		item.codeword = simple8b_encode(&values[num_packed + 1],
										&num_encoded,
										item.first);

		/*
		 * Add the item to the node, allocating a new node if the old one is
		 * full.
		 */
		if (leaf->num_items >= MAX_LEAF_ITEMS)
		{
			/* Allocate new leaf and link it to the tree */
			intset_leaf_node *old_leaf = leaf;

			leaf = intset_new_leaf_node(intset);
			old_leaf->next = leaf;
			intset->rightmost_nodes[0] = (intset_node *) leaf;
			intset_update_upper(intset, 1, (intset_node *) leaf, item.first);
		}
		leaf->items[leaf->num_items++] = item;

		num_packed += 1 + num_encoded;
	}

	/*
	 * Move any remaining buffered values to the beginning of the array.
	 */
	if (num_packed < intset->num_buffered_values)
	{
		memmove(&intset->buffered_values[0],
				&intset->buffered_values[num_packed],
				(intset->num_buffered_values - num_packed) * sizeof(uint64));
	}
	intset->num_buffered_values -= num_packed;
}

/*
 * Insert a downlink into parent node, after creating a new node.
 *
 * Recurses if the parent node is full, too.
 */
static void
intset_update_upper(IntegerSet *intset, int level, intset_node *child,
					uint64 child_key)
{
	intset_internal_node *parent;

	Assert(level > 0);

	/*
	 * Create a new root node, if necessary.
	 */
	if (level >= intset->num_levels)
	{
		intset_node *oldroot = intset->root;
		uint64		downlink_key;

		/* MAX_TREE_LEVELS should be more than enough, this shouldn't happen */
		if (intset->num_levels == MAX_TREE_LEVELS)
			elog(ERROR, "could not expand integer set, maximum number of levels reached");
		intset->num_levels++;

		/*
		 * Get the first value on the old root page, to be used as the
		 * downlink.
		 */
		if (intset->root->level == 0)
			downlink_key = ((intset_leaf_node *) oldroot)->items[0].first;
		else
			downlink_key = ((intset_internal_node *) oldroot)->values[0];

		parent = intset_new_internal_node(intset);
		parent->level = level;
		parent->values[0] = downlink_key;
		parent->downlinks[0] = oldroot;
		parent->num_items = 1;

		intset->root = (intset_node *) parent;
		intset->rightmost_nodes[level] = (intset_node *) parent;
	}

	/*
	 * Place the downlink on the parent page.
	 */
	parent = (intset_internal_node *) intset->rightmost_nodes[level];

	if (parent->num_items < MAX_INTERNAL_ITEMS)
	{
		parent->values[parent->num_items] = child_key;
		parent->downlinks[parent->num_items] = child;
		parent->num_items++;
	}
	else
	{
		/*
		 * Doesn't fit.  Allocate new parent, with the downlink as the first
		 * item on it, and recursively insert the downlink to the new parent
		 * to the grandparent.
		 */
		parent = intset_new_internal_node(intset);
		parent->level = level;
		parent->values[0] = child_key;
		parent->downlinks[0] = child;
		parent->num_items = 1;

		intset->rightmost_nodes[level] = (intset_node *) parent;

		intset_update_upper(intset, level + 1, (intset_node *) parent, child_key);
	}
}

/*
 * Does the set contain the given value?
 */
bool
intset_is_member(IntegerSet *intset, uint64 x)
{
	intset_node *node;
	intset_leaf_node *leaf;
	int			level;
	int			itemno;
	leaf_item  *item;

	/*
	 * The value might be in the buffer of newly-added values.
	 */
	if (intset->num_buffered_values > 0 && x >= intset->buffered_values[0])
	{
		int			itemno;

		itemno = intset_binsrch_uint64(x,
									   intset->buffered_values,
									   intset->num_buffered_values,
									   false);
		if (itemno >= intset->num_buffered_values)
			return false;
		else
			return (intset->buffered_values[itemno] == x);
	}

	/*
	 * Start from the root, and walk down the B-tree to find the right leaf
	 * node.
	 */
	if (!intset->root)
		return false;
	node = intset->root;
	for (level = intset->num_levels - 1; level > 0; level--)
	{
		intset_internal_node *n = (intset_internal_node *) node;

		Assert(node->level == level);

		itemno = intset_binsrch_uint64(x, n->values, n->num_items, true);
		if (itemno == 0)
			return false;
		node = n->downlinks[itemno - 1];
	}
	Assert(node->level == 0);
	leaf = (intset_leaf_node *) node;

	/*
	 * Binary search to find the right item on the leaf page
	 */
	itemno = intset_binsrch_leaf(x, leaf->items, leaf->num_items, true);
	if (itemno == 0)
		return false;
	item = &leaf->items[itemno - 1];

	/* Is this a match to the first value on the item? */
	if (item->first == x)
		return true;
	Assert(x > item->first);

	/* Is it in the packed codeword? */
	if (simple8b_contains(item->codeword, x, item->first))
		return true;

	return false;
}

/*
 * Begin in-order scan through all the values.
 *
 * While the iteration is in-progress, you cannot add new values to the set.
 */
void
intset_begin_iterate(IntegerSet *intset)
{
	/* Note that we allow an iteration to be abandoned midway */
	intset->iter_active = true;
	intset->iter_node = intset->leftmost_leaf;
	intset->iter_itemno = 0;
	intset->iter_valueno = 0;
	intset->iter_num_values = 0;
	intset->iter_values = intset->iter_values_buf;
}

/*
 * Returns the next integer, when iterating.
 *
 * intset_begin_iterate() must be called first.  intset_iterate_next() returns
 * the next value in the set.  Returns true, if there was another value, and
 * stores the value in *next.  Otherwise, returns false.
 */
bool
intset_iterate_next(IntegerSet *intset, uint64 *next)
{
	Assert(intset->iter_active);
	for (;;)
	{
		/* Return next iter_values[] entry if any */
		if (intset->iter_valueno < intset->iter_num_values)
		{
			*next = intset->iter_values[intset->iter_valueno++];
			return true;
		}

		/* Decode next item in current leaf node, if any */
		if (intset->iter_node &&
			intset->iter_itemno < intset->iter_node->num_items)
		{
			leaf_item  *item;
			int			num_decoded;

			item = &intset->iter_node->items[intset->iter_itemno++];

			intset->iter_values_buf[0] = item->first;
			num_decoded = simple8b_decode(item->codeword,
										  &intset->iter_values_buf[1],
										  item->first);
			intset->iter_num_values = num_decoded + 1;
			intset->iter_valueno = 0;
			continue;
		}

		/* No more items on this leaf, step to next node */
		if (intset->iter_node)
		{
			intset->iter_node = intset->iter_node->next;
			intset->iter_itemno = 0;
			continue;
		}

		/*
		 * We have reached the end of the B-tree.  But we might still have
		 * some integers in the buffer of newly-added values.
		 */
		if (intset->iter_values == (const uint64 *) intset->iter_values_buf)
		{
			intset->iter_values = intset->buffered_values;
			intset->iter_num_values = intset->num_buffered_values;
			intset->iter_valueno = 0;
			continue;
		}

		break;
	}

	/* No more results. */
	intset->iter_active = false;
	*next = 0;					/* prevent uninitialized-variable warnings */
	return false;
}

/*
 * intset_binsrch_uint64() -- search a sorted array of uint64s
 *
 * Returns the first position with key equal or less than the given key.
 * The returned position would be the "insert" location for the given key,
 * that is, the position where the new key should be inserted to.
 *
 * 'nextkey' affects the behavior on equal keys.  If true, and there is an
 * equal key in the array, this returns the position immediately after the
 * equal key.  If false, this returns the position of the equal key itself.
 */
static int
intset_binsrch_uint64(uint64 item, uint64 *arr, int arr_elems, bool nextkey)
{
	int			low,
				high,
				mid;

	low = 0;
	high = arr_elems;
	while (high > low)
	{
		mid = low + (high - low) / 2;

		if (nextkey)
		{
			if (item >= arr[mid])
				low = mid + 1;
			else
				high = mid;
		}
		else
		{
			if (item > arr[mid])
				low = mid + 1;
			else
				high = mid;
		}
	}

	return low;
}

/* same, but for an array of leaf items */
static int
intset_binsrch_leaf(uint64 item, leaf_item *arr, int arr_elems, bool nextkey)
{
	int			low,
				high,
				mid;

	low = 0;
	high = arr_elems;
	while (high > low)
	{
		mid = low + (high - low) / 2;

		if (nextkey)
		{
			if (item >= arr[mid].first)
				low = mid + 1;
			else
				high = mid;
		}
		else
		{
			if (item > arr[mid].first)
				low = mid + 1;
			else
				high = mid;
		}
	}

	return low;
}

/*
 * Simple-8b encoding.
 *
 * The simple-8b algorithm packs between 1 and 240 integers into 64-bit words,
 * called "codewords".  The number of integers packed into a single codeword
 * depends on the integers being packed; small integers are encoded using
 * fewer bits than large integers.  A single codeword can store a single
 * 60-bit integer, or two 30-bit integers, for example.
 *
 * Since we're storing a unique, sorted, set of integers, we actually encode
 * the *differences* between consecutive integers.  That way, clusters of
 * integers that are close to each other are packed efficiently, regardless
 * of their absolute values.
 *
 * In Simple-8b, each codeword consists of a 4-bit selector, which indicates
 * how many integers are encoded in the codeword, and the encoded integers are
 * packed into the remaining 60 bits.  The selector allows for 16 different
 * ways of using the remaining 60 bits, called "modes".  The number of integers
 * packed into a single codeword in each mode is listed in the simple8b_modes
 * table below.  For example, consider the following codeword:
 *
 *      20-bit integer       20-bit integer       20-bit integer
 * 1101 00000000000000010010 01111010000100100000 00000000000000010100
 * ^
 * selector
 *
 * The selector 1101 is 13 in decimal.  From the modes table below, we see
 * that it means that the codeword encodes three 20-bit integers.  In decimal,
 * those integers are 18, 500000 and 20.  Because we encode deltas rather than
 * absolute values, the actual values that they represent are 18, 500018 and
 * 500038.
 *
 * Modes 0 and 1 are a bit special; they encode a run of 240 or 120 zeroes
 * (which means 240 or 120 consecutive integers, since we're encoding the
 * deltas between integers), without using the rest of the codeword bits
 * for anything.
 *
 * Simple-8b cannot encode integers larger than 60 bits.  Values larger than
 * that are always stored in the 'first' field of a leaf item, never in the
 * packed codeword.  If there is a sequence of integers that are more than
 * 2^60 apart, the codeword will go unused on those items.  To represent that,
 * we use a magic EMPTY_CODEWORD codeword value.
 */
static const struct simple8b_mode
{
	uint8		bits_per_int;
	uint8		num_ints;
}			simple8b_modes[17] =

{
	{0, 240},					/* mode  0: 240 zeroes */
	{0, 120},					/* mode  1: 120 zeroes */
	{1, 60},					/* mode  2: sixty 1-bit integers */
	{2, 30},					/* mode  3: thirty 2-bit integers */
	{3, 20},					/* mode  4: twenty 3-bit integers */
	{4, 15},					/* mode  5: fifteen 4-bit integers */
	{5, 12},					/* mode  6: twelve 5-bit integers */
	{6, 10},					/* mode  7: ten 6-bit integers */
	{7, 8},						/* mode  8: eight 7-bit integers (four bits
								 * are wasted) */
	{8, 7},						/* mode  9: seven 8-bit integers (four bits
								 * are wasted) */
	{10, 6},					/* mode 10: six 10-bit integers */
	{12, 5},					/* mode 11: five 12-bit integers */
	{15, 4},					/* mode 12: four 15-bit integers */
	{20, 3},					/* mode 13: three 20-bit integers */
	{30, 2},					/* mode 14: two 30-bit integers */
	{60, 1},					/* mode 15: one 60-bit integer */

	{0, 0}						/* sentinel value */
};

/*
 * EMPTY_CODEWORD is a special value, used to indicate "no values".
 * It is used if the next value is too large to be encoded with Simple-8b.
 *
 * This value looks like a mode-0 codeword, but we can distinguish it
 * because a regular mode-0 codeword would have zeroes in the unused bits.
 */
#define EMPTY_CODEWORD		UINT64CONST(0x0FFFFFFFFFFFFFFF)

/*
 * Encode a number of integers into a Simple-8b codeword.
 *
 * (What we actually encode are deltas between successive integers.
 * "base" is the value before ints[0].)
 *
 * The input array must contain at least SIMPLE8B_MAX_VALUES_PER_CODEWORD
 * elements, ensuring that we can produce a full codeword.
 *
 * Returns the encoded codeword, and sets *num_encoded to the number of
 * input integers that were encoded.  That can be zero, if the first delta
 * is too large to be encoded.
 */
static uint64
simple8b_encode(const uint64 *ints, int *num_encoded, uint64 base)
{
	int			selector;
	int			nints;
	int			bits;
	uint64		diff;
	uint64		last_val;
	uint64		codeword;
	int			i;

	Assert(ints[0] > base);

	/*
	 * Select the "mode" to use for this codeword.
	 *
	 * In each iteration, check if the next value can be represented in the
	 * current mode we're considering.  If it's too large, then step up the
	 * mode to a wider one, and repeat.  If it fits, move on to the next
	 * integer.  Repeat until the codeword is full, given the current mode.
	 *
	 * Note that we don't have any way to represent unused slots in the
	 * codeword, so we require each codeword to be "full".  It is always
	 * possible to produce a full codeword unless the very first delta is too
	 * large to be encoded.  For example, if the first delta is small but the
	 * second is too large to be encoded, we'll end up using the last "mode",
	 * which has nints == 1.
	 */
	selector = 0;
	nints = simple8b_modes[0].num_ints;
	bits = simple8b_modes[0].bits_per_int;
	diff = ints[0] - base - 1;
	last_val = ints[0];
	i = 0;						/* number of deltas we have accepted */
	for (;;)
	{
		if (diff >= (UINT64CONST(1) << bits))
		{
			/* too large, step up to next mode */
			selector++;
			nints = simple8b_modes[selector].num_ints;
			bits = simple8b_modes[selector].bits_per_int;
			/* we might already have accepted enough deltas for this mode */
			if (i >= nints)
				break;
		}
		else
		{
			/* accept this delta; then done if codeword is full */
			i++;
			if (i >= nints)
				break;
			/* examine next delta */
			Assert(ints[i] > last_val);
			diff = ints[i] - last_val - 1;
			last_val = ints[i];
		}
	}

	if (nints == 0)
	{
		/*
		 * The first delta is too large to be encoded with Simple-8b.
		 *
		 * If there is at least one not-too-large integer in the input, we
		 * will encode it using mode 15 (or a more compact mode).  Hence, we
		 * can only get here if the *first* delta is >= 2^60.
		 */
		Assert(i == 0);
		*num_encoded = 0;
		return EMPTY_CODEWORD;
	}

	/*
	 * Encode the integers using the selected mode.  Note that we shift them
	 * into the codeword in reverse order, so that they will come out in the
	 * correct order in the decoder.
	 */
	codeword = 0;
	if (bits > 0)
	{
		for (i = nints - 1; i > 0; i--)
		{
			diff = ints[i] - ints[i - 1] - 1;
			codeword |= diff;
			codeword <<= bits;
		}
		diff = ints[0] - base - 1;
		codeword |= diff;
	}

	/* add selector to the codeword, and return */
	codeword |= (uint64) selector << 60;

	*num_encoded = nints;
	return codeword;
}

/*
 * Decode a codeword into an array of integers.
 * Returns the number of integers decoded.
 */
static int
simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base)
{
	int			selector = (codeword >> 60);
	int			nints = simple8b_modes[selector].num_ints;
	int			bits = simple8b_modes[selector].bits_per_int;
	uint64		mask = (UINT64CONST(1) << bits) - 1;
	uint64		curr_value;

	if (codeword == EMPTY_CODEWORD)
		return 0;

	curr_value = base;
	for (int i = 0; i < nints; i++)
	{
		uint64		diff = codeword & mask;

		curr_value += 1 + diff;
		decoded[i] = curr_value;
		codeword >>= bits;
	}
	return nints;
}

/*
 * This is very similar to simple8b_decode(), but instead of decoding all
 * the values to an array, it just checks if the given "key" is part of
 * the codeword.
 */
static bool
simple8b_contains(uint64 codeword, uint64 key, uint64 base)
{
	int			selector = (codeword >> 60);
	int			nints = simple8b_modes[selector].num_ints;
	int			bits = simple8b_modes[selector].bits_per_int;

	if (codeword == EMPTY_CODEWORD)
		return false;

	if (bits == 0)
	{
		/* Special handling for 0-bit cases. */
		return (key - base) <= nints;
	}
	else
	{
		uint64		mask = (UINT64CONST(1) << bits) - 1;
		uint64		curr_value;

		curr_value = base;
		for (int i = 0; i < nints; i++)
		{
			uint64		diff = codeword & mask;

			curr_value += 1 + diff;

			if (curr_value >= key)
			{
				if (curr_value == key)
					return true;
				else
					return false;
			}

			codeword >>= bits;
		}
	}
	return false;
}