Diffstat (limited to 'src/backend/lib')
-rw-r--r--   src/backend/lib/Makefile              27
-rw-r--r--   src/backend/lib/README                35
-rw-r--r--   src/backend/lib/binaryheap.c         307
-rw-r--r--   src/backend/lib/bipartite_match.c    180
-rw-r--r--   src/backend/lib/bloomfilter.c        295
-rw-r--r--   src/backend/lib/dshash.c             882
-rw-r--r--   src/backend/lib/hyperloglog.c        255
-rw-r--r--   src/backend/lib/ilist.c              111
-rw-r--r--   src/backend/lib/integerset.c        1045
-rw-r--r--   src/backend/lib/knapsack.c           111
-rw-r--r--   src/backend/lib/pairingheap.c        333
-rw-r--r--   src/backend/lib/rbtree.c             770
12 files changed, 4351 insertions, 0 deletions
diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile
new file mode 100644
index 0000000..9dad313
--- /dev/null
+++ b/src/backend/lib/Makefile
@@ -0,0 +1,27 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for lib (miscellaneous stuff)
+#
+# IDENTIFICATION
+# src/backend/lib/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/lib
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ binaryheap.o \
+ bipartite_match.o \
+ bloomfilter.o \
+ dshash.o \
+ hyperloglog.o \
+ ilist.o \
+ integerset.o \
+ knapsack.o \
+ pairingheap.o \
+	rbtree.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/lib/README b/src/backend/lib/README
new file mode 100644
index 0000000..f2fb591
--- /dev/null
+++ b/src/backend/lib/README
@@ -0,0 +1,35 @@
+This directory contains general-purpose data structures, for use anywhere
+in the backend:
+
+binaryheap.c - a binary heap
+
+bipartite_match.c - Hopcroft-Karp maximum cardinality algorithm for bipartite graphs
+
+bloomfilter.c - probabilistic, space-efficient set membership testing
+
+dshash.c - concurrent hash tables backed by dynamic shared memory areas
+
+hyperloglog.c - a streaming cardinality estimator
+
+ilist.c - singly- and doubly-linked lists
+
+integerset.c - a data structure for holding a large set of integers
+
+knapsack.c - knapsack problem solver
+
+pairingheap.c - a pairing heap
+
+rbtree.c - a red-black tree
+
+stringinfo.c - an extensible string type
+
+
+Aside from the inherent characteristics of the data structures, there are a
+few practical differences between the binary heap and the pairing heap. The
+binary heap is fully allocated at creation, and cannot be expanded beyond the
+allocated size. The pairing heap, on the other hand, has no inherent maximum
+size, but the caller needs to allocate each element being stored in the heap,
+whereas the binary heap works with plain Datums or pointers.
+
+The linked lists in ilist.c can be embedded directly into other structs, as
+opposed to the List interface in nodes/pg_list.h.
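
For illustration only (not part of the patch): a minimal sketch of how such an embedded list is typically used, assuming the dlist API declared in lib/ilist.h. The struct and function names are hypothetical.

    /* Hypothetical struct embedding a doubly-linked list node. */
    #include "postgres.h"
    #include "lib/ilist.h"

    typedef struct MyTask
    {
        int         id;
        dlist_node  node;       /* embedded link; no separate list cell */
    } MyTask;

    static dlist_head tasks = DLIST_STATIC_INIT(tasks);

    static void
    remember_task(MyTask *task)
    {
        dlist_push_tail(&tasks, &task->node);
    }

    static void
    walk_tasks(void)
    {
        dlist_iter  iter;

        dlist_foreach(iter, &tasks)
        {
            /* recover the containing struct from the embedded node */
            MyTask *task = dlist_container(MyTask, node, iter.cur);

            elog(LOG, "task %d", task->id);
        }
    }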
diff --git a/src/backend/lib/binaryheap.c b/src/backend/lib/binaryheap.c
new file mode 100644
index 0000000..d54e245
--- /dev/null
+++ b/src/backend/lib/binaryheap.c
@@ -0,0 +1,307 @@
+/*-------------------------------------------------------------------------
+ *
+ * binaryheap.c
+ * A simple binary heap implementation
+ *
+ * Portions Copyright (c) 2012-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/lib/binaryheap.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+
+#include "lib/binaryheap.h"
+
+static void sift_down(binaryheap *heap, int node_off);
+static void sift_up(binaryheap *heap, int node_off);
+static inline void swap_nodes(binaryheap *heap, int a, int b);
+
+/*
+ * binaryheap_allocate
+ *
+ * Returns a pointer to a newly-allocated heap that has the capacity to
+ * store the given number of nodes, with the heap property defined by
+ * the given comparator function, which will be invoked with the additional
+ * argument specified by 'arg'.
+ */
+binaryheap *
+binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
+{
+ int sz;
+ binaryheap *heap;
+
+ sz = offsetof(binaryheap, bh_nodes) + sizeof(Datum) * capacity;
+ heap = (binaryheap *) palloc(sz);
+ heap->bh_space = capacity;
+ heap->bh_compare = compare;
+ heap->bh_arg = arg;
+
+ heap->bh_size = 0;
+ heap->bh_has_heap_property = true;
+
+ return heap;
+}
+
+/*
+ * binaryheap_reset
+ *
+ * Resets the heap to an empty state, losing its data content but not the
+ * parameters passed at allocation.
+ */
+void
+binaryheap_reset(binaryheap *heap)
+{
+ heap->bh_size = 0;
+ heap->bh_has_heap_property = true;
+}
+
+/*
+ * binaryheap_free
+ *
+ * Releases memory used by the given binaryheap.
+ */
+void
+binaryheap_free(binaryheap *heap)
+{
+ pfree(heap);
+}
+
+/*
+ * These utility functions return the offset of the left child, right
+ * child, and parent of the node at the given index, respectively.
+ *
+ * The heap is represented as an array of nodes, with the root node
+ * stored at index 0. The left child of node i is at index 2*i+1, and
+ * the right child at 2*i+2. The parent of node i is at index (i-1)/2.
+ */
+
+static inline int
+left_offset(int i)
+{
+ return 2 * i + 1;
+}
+
+static inline int
+right_offset(int i)
+{
+ return 2 * i + 2;
+}
+
+static inline int
+parent_offset(int i)
+{
+ return (i - 1) / 2;
+}
+
+/*
+ * binaryheap_add_unordered
+ *
+ * Adds the given datum to the end of the heap's list of nodes in O(1) without
+ * preserving the heap property. This is a convenience to add elements quickly
+ * to a new heap. To obtain a valid heap, one must call binaryheap_build()
+ * afterwards.
+ */
+void
+binaryheap_add_unordered(binaryheap *heap, Datum d)
+{
+ if (heap->bh_size >= heap->bh_space)
+ elog(ERROR, "out of binary heap slots");
+ heap->bh_has_heap_property = false;
+ heap->bh_nodes[heap->bh_size] = d;
+ heap->bh_size++;
+}
+
+/*
+ * binaryheap_build
+ *
+ * Assembles a valid heap in O(n) from the nodes added by
+ * binaryheap_add_unordered(). Not needed otherwise.
+ */
+void
+binaryheap_build(binaryheap *heap)
+{
+ int i;
+
+ for (i = parent_offset(heap->bh_size - 1); i >= 0; i--)
+ sift_down(heap, i);
+ heap->bh_has_heap_property = true;
+}
+
+/*
+ * binaryheap_add
+ *
+ * Adds the given datum to the heap in O(log n) time, while preserving
+ * the heap property.
+ */
+void
+binaryheap_add(binaryheap *heap, Datum d)
+{
+ if (heap->bh_size >= heap->bh_space)
+ elog(ERROR, "out of binary heap slots");
+ heap->bh_nodes[heap->bh_size] = d;
+ heap->bh_size++;
+ sift_up(heap, heap->bh_size - 1);
+}
+
+/*
+ * binaryheap_first
+ *
+ * Returns the first (root, topmost) node in the heap
+ * without modifying the heap. The caller must ensure that this
+ * routine is not used on an empty heap. Always O(1).
+ */
+Datum
+binaryheap_first(binaryheap *heap)
+{
+ Assert(!binaryheap_empty(heap) && heap->bh_has_heap_property);
+ return heap->bh_nodes[0];
+}
+
+/*
+ * binaryheap_remove_first
+ *
+ * Removes the first (root, topmost) node in the heap and returns it
+ * after rebalancing the heap. The caller must ensure
+ * that this routine is not used on an empty heap. O(log n) worst
+ * case.
+ */
+Datum
+binaryheap_remove_first(binaryheap *heap)
+{
+ Assert(!binaryheap_empty(heap) && heap->bh_has_heap_property);
+
+ if (heap->bh_size == 1)
+ {
+ heap->bh_size--;
+ return heap->bh_nodes[0];
+ }
+
+ /*
+ * Swap the root and last nodes, decrease the size of the heap (i.e.
+ * remove the former root node) and sift the new root node down to its
+ * correct position.
+ */
+ swap_nodes(heap, 0, heap->bh_size - 1);
+ heap->bh_size--;
+ sift_down(heap, 0);
+
+ return heap->bh_nodes[heap->bh_size];
+}
+
+/*
+ * binaryheap_replace_first
+ *
+ * Replace the topmost element of a non-empty heap, preserving the heap
+ * property. O(1) in the best case, or O(log n) if it must fall back to
+ * sifting the new node down.
+ */
+void
+binaryheap_replace_first(binaryheap *heap, Datum d)
+{
+ Assert(!binaryheap_empty(heap) && heap->bh_has_heap_property);
+
+ heap->bh_nodes[0] = d;
+
+ if (heap->bh_size > 1)
+ sift_down(heap, 0);
+}
+
+/*
+ * Swap the contents of two nodes.
+ */
+static inline void
+swap_nodes(binaryheap *heap, int a, int b)
+{
+ Datum swap;
+
+ swap = heap->bh_nodes[a];
+ heap->bh_nodes[a] = heap->bh_nodes[b];
+ heap->bh_nodes[b] = swap;
+}
+
+/*
+ * Sift a node up to the highest position it can hold according to the
+ * comparator.
+ */
+static void
+sift_up(binaryheap *heap, int node_off)
+{
+ while (node_off != 0)
+ {
+ int cmp;
+ int parent_off;
+
+ /*
+ * If this node is no larger than its parent, the heap condition is
+ * satisfied, and we're done.
+ */
+ parent_off = parent_offset(node_off);
+ cmp = heap->bh_compare(heap->bh_nodes[node_off],
+ heap->bh_nodes[parent_off],
+ heap->bh_arg);
+ if (cmp <= 0)
+ break;
+
+ /*
+ * Otherwise, swap the node and its parent and go on to check the
+ * node's new parent.
+ */
+ swap_nodes(heap, node_off, parent_off);
+ node_off = parent_off;
+ }
+}
+
+/*
+ * Sift a node down from its current position to satisfy the heap
+ * property.
+ */
+static void
+sift_down(binaryheap *heap, int node_off)
+{
+ while (true)
+ {
+ int left_off = left_offset(node_off);
+ int right_off = right_offset(node_off);
+ int swap_off = 0;
+
+ /* Is the left child larger than the parent? */
+ if (left_off < heap->bh_size &&
+ heap->bh_compare(heap->bh_nodes[node_off],
+ heap->bh_nodes[left_off],
+ heap->bh_arg) < 0)
+ swap_off = left_off;
+
+ /* Is the right child larger than the parent? */
+ if (right_off < heap->bh_size &&
+ heap->bh_compare(heap->bh_nodes[node_off],
+ heap->bh_nodes[right_off],
+ heap->bh_arg) < 0)
+ {
+ /* swap with the larger child */
+ if (!swap_off ||
+ heap->bh_compare(heap->bh_nodes[left_off],
+ heap->bh_nodes[right_off],
+ heap->bh_arg) < 0)
+ swap_off = right_off;
+ }
+
+ /*
+ * If we didn't find anything to swap, the heap condition is
+ * satisfied, and we're done.
+ */
+ if (!swap_off)
+ break;
+
+ /*
+ * Otherwise, swap the node with the child that violates the heap
+ * property; then go on to check its children.
+ */
+ swap_nodes(heap, swap_off, node_off);
+ node_off = swap_off;
+ }
+}
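
For illustration, a minimal caller sketch for the binaryheap API above: load values with binaryheap_add_unordered(), heapify once with binaryheap_build(), then drain in priority order with binaryheap_remove_first(). The integer comparator and demo function are hypothetical, not part of the patch.

    /* Hypothetical max-heap of int32 values built on the API above. */
    static int
    cmp_int32_max(Datum a, Datum b, void *arg)
    {
        int32   ia = DatumGetInt32(a);
        int32   ib = DatumGetInt32(b);

        /* return >0 when 'a' should be nearer the root => max-heap */
        return (ia > ib) - (ia < ib);
    }

    static void
    heap_demo(void)
    {
        binaryheap *heap = binaryheap_allocate(3, cmp_int32_max, NULL);

        binaryheap_add_unordered(heap, Int32GetDatum(2));
        binaryheap_add_unordered(heap, Int32GetDatum(7));
        binaryheap_add_unordered(heap, Int32GetDatum(5));
        binaryheap_build(heap);     /* O(n) heapify of the unordered input */

        while (!binaryheap_empty(heap))
            elog(LOG, "next: %d",
                 DatumGetInt32(binaryheap_remove_first(heap)));  /* 7, 5, 2 */

        binaryheap_free(heap);
    }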
diff --git a/src/backend/lib/bipartite_match.c b/src/backend/lib/bipartite_match.c
new file mode 100644
index 0000000..baa1c13
--- /dev/null
+++ b/src/backend/lib/bipartite_match.c
@@ -0,0 +1,180 @@
+/*-------------------------------------------------------------------------
+ *
+ * bipartite_match.c
+ * Hopcroft-Karp maximum cardinality algorithm for bipartite graphs
+ *
+ * This implementation is based on pseudocode found at:
+ *
+ * https://en.wikipedia.org/w/index.php?title=Hopcroft%E2%80%93Karp_algorithm&oldid=593898016
+ *
+ * Copyright (c) 2015-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/lib/bipartite_match.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "lib/bipartite_match.h"
+#include "miscadmin.h"
+
+/*
+ * The distances computed in hk_breadth_search can easily be seen to never
+ * exceed u_size. Since we restrict u_size to be less than SHRT_MAX, we
+ * can therefore use SHRT_MAX as the "infinity" distance needed as a marker.
+ */
+#define HK_INFINITY SHRT_MAX
+
+static bool hk_breadth_search(BipartiteMatchState *state);
+static bool hk_depth_search(BipartiteMatchState *state, int u);
+
+/*
+ * Given the size of U and V, where each is indexed 1..size, and an adjacency
+ * list, perform the matching and return the resulting state.
+ */
+BipartiteMatchState *
+BipartiteMatch(int u_size, int v_size, short **adjacency)
+{
+ BipartiteMatchState *state = palloc(sizeof(BipartiteMatchState));
+
+ if (u_size < 0 || u_size >= SHRT_MAX ||
+ v_size < 0 || v_size >= SHRT_MAX)
+ elog(ERROR, "invalid set size for BipartiteMatch");
+
+ state->u_size = u_size;
+ state->v_size = v_size;
+ state->adjacency = adjacency;
+ state->matching = 0;
+ state->pair_uv = (short *) palloc0((u_size + 1) * sizeof(short));
+ state->pair_vu = (short *) palloc0((v_size + 1) * sizeof(short));
+ state->distance = (short *) palloc((u_size + 1) * sizeof(short));
+ state->queue = (short *) palloc((u_size + 2) * sizeof(short));
+
+ while (hk_breadth_search(state))
+ {
+ int u;
+
+ for (u = 1; u <= u_size; u++)
+ {
+ if (state->pair_uv[u] == 0)
+ if (hk_depth_search(state, u))
+ state->matching++;
+ }
+
+ CHECK_FOR_INTERRUPTS(); /* just in case */
+ }
+
+ return state;
+}
+
+/*
+ * Free a state returned by BipartiteMatch, except for the original adjacency
+ * list, which is owned by the caller. This only frees memory, so it's optional.
+ */
+void
+BipartiteMatchFree(BipartiteMatchState *state)
+{
+ /* adjacency matrix is treated as owned by the caller */
+ pfree(state->pair_uv);
+ pfree(state->pair_vu);
+ pfree(state->distance);
+ pfree(state->queue);
+ pfree(state);
+}
+
+/*
+ * Perform the breadth-first search step of H-K matching.
+ * Returns true if successful.
+ */
+static bool
+hk_breadth_search(BipartiteMatchState *state)
+{
+ int usize = state->u_size;
+ short *queue = state->queue;
+ short *distance = state->distance;
+ int qhead = 0; /* we never enqueue any node more than once */
+ int qtail = 0; /* so don't have to worry about wrapping */
+ int u;
+
+ distance[0] = HK_INFINITY;
+
+ for (u = 1; u <= usize; u++)
+ {
+ if (state->pair_uv[u] == 0)
+ {
+ distance[u] = 0;
+ queue[qhead++] = u;
+ }
+ else
+ distance[u] = HK_INFINITY;
+ }
+
+ while (qtail < qhead)
+ {
+ u = queue[qtail++];
+
+ if (distance[u] < distance[0])
+ {
+ short *u_adj = state->adjacency[u];
+ int i = u_adj ? u_adj[0] : 0;
+
+ for (; i > 0; i--)
+ {
+ int u_next = state->pair_vu[u_adj[i]];
+
+ if (distance[u_next] == HK_INFINITY)
+ {
+ distance[u_next] = 1 + distance[u];
+ Assert(qhead < usize + 2);
+ queue[qhead++] = u_next;
+ }
+ }
+ }
+ }
+
+ return (distance[0] != HK_INFINITY);
+}
+
+/*
+ * Perform the depth-first search step of H-K matching.
+ * Returns true if successful.
+ */
+static bool
+hk_depth_search(BipartiteMatchState *state, int u)
+{
+ short *distance = state->distance;
+ short *pair_uv = state->pair_uv;
+ short *pair_vu = state->pair_vu;
+ short *u_adj = state->adjacency[u];
+ int i = u_adj ? u_adj[0] : 0;
+ short nextdist;
+
+ if (u == 0)
+ return true;
+ if (distance[u] == HK_INFINITY)
+ return false;
+ nextdist = distance[u] + 1;
+
+ check_stack_depth();
+
+ for (; i > 0; i--)
+ {
+ int v = u_adj[i];
+
+ if (distance[pair_vu[v]] == nextdist)
+ {
+ if (hk_depth_search(state, pair_vu[v]))
+ {
+ pair_vu[v] = u;
+ pair_uv[u] = v;
+ return true;
+ }
+ }
+ }
+
+ distance[u] = HK_INFINITY;
+ return false;
+}
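
For illustration, a minimal caller sketch using the conventions visible in the code above: vertices in U and V are numbered from 1, and adjacency[u] points to a short array whose element 0 is the neighbor count and whose elements 1..count are the neighbors in V. The concrete graph below is hypothetical.

    /* Hypothetical graph: u1 - {v1, v2}, u2 - {v1}; maximum matching is 2. */
    static void
    match_demo(void)
    {
        static short u1_adj[] = {2, 1, 2};  /* element 0 is the count */
        static short u2_adj[] = {1, 1};
        short      *adjacency[3] = {NULL, u1_adj, u2_adj};  /* index 0 unused */
        BipartiteMatchState *state;

        state = BipartiteMatch(2, 2, adjacency);

        /* pair_uv[u] is the V vertex matched to u, or 0 if unmatched */
        elog(LOG, "matched %d pairs; u1 -> v%d, u2 -> v%d",
             state->matching, state->pair_uv[1], state->pair_uv[2]);

        BipartiteMatchFree(state);
    }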
diff --git a/src/backend/lib/bloomfilter.c b/src/backend/lib/bloomfilter.c
new file mode 100644
index 0000000..daf2c40
--- /dev/null
+++ b/src/backend/lib/bloomfilter.c
@@ -0,0 +1,295 @@
+/*-------------------------------------------------------------------------
+ *
+ * bloomfilter.c
+ * Space-efficient set membership testing
+ *
+ * A Bloom filter is a probabilistic data structure that is used to test an
+ * element's membership of a set. False positives are possible, but false
+ * negatives are not; a test of membership of the set returns either "possibly
+ * in set" or "definitely not in set". This is typically very space efficient,
+ * which can be a decisive advantage.
+ *
+ * Elements can be added to the set, but not removed. The more elements that
+ * are added, the larger the probability of false positives. Caller must hint
+ * an estimated total size of the set when the Bloom filter is initialized.
+ * This is used to balance the use of memory against the final false positive
+ * rate.
+ *
+ * The implementation is well suited to data synchronization problems between
+ * unordered sets, especially where predictable performance is important and
+ * some false positives are acceptable. It's also well suited to cache
+ * filtering problems where a relatively small and/or low cardinality set is
+ * fingerprinted, especially when many subsequent membership tests end up
+ * indicating that values of interest are not present. That should save the
+ * caller many authoritative lookups, such as expensive probes of a much larger
+ * on-disk structure.
+ *
+ * Copyright (c) 2018-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/lib/bloomfilter.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "common/hashfn.h"
+#include "lib/bloomfilter.h"
+#include "port/pg_bitutils.h"
+
+#define MAX_HASH_FUNCS 10
+
+struct bloom_filter
+{
+ /* K hash functions are used, seeded by caller's seed */
+ int k_hash_funcs;
+ uint64 seed;
+ /* m is bitset size, in bits. Must be a power of two <= 2^32. */
+ uint64 m;
+ unsigned char bitset[FLEXIBLE_ARRAY_MEMBER];
+};
+
+static int my_bloom_power(uint64 target_bitset_bits);
+static int optimal_k(uint64 bitset_bits, int64 total_elems);
+static void k_hashes(bloom_filter *filter, uint32 *hashes, unsigned char *elem,
+ size_t len);
+static inline uint32 mod_m(uint32 a, uint64 m);
+
+/*
+ * Create Bloom filter in caller's memory context. We aim for a false positive
+ * rate of between 1% and 2% when bitset size is not constrained by memory
+ * availability.
+ *
+ * total_elems is an estimate of the final size of the set. It should be
+ * approximately correct, but the implementation can cope well with it being
+ * off by perhaps a factor of five or more. See "Bloom Filters in
+ * Probabilistic Verification" (Dillinger & Manolios, 2004) for details of why
+ * this is the case.
+ *
+ * bloom_work_mem is sized in KB, in line with the general work_mem convention.
+ * This determines the size of the underlying bitset (trivial bookkeeping space
+ * isn't counted). The bitset is always sized as a power of two number of
+ * bits, and the largest possible bitset is 512MB (2^32 bits). The
+ * implementation allocates only enough memory to target its standard false
+ * positive rate, using a simple formula with caller's total_elems estimate as
+ * an input. The bitset might be as small as 1MB, even when bloom_work_mem is
+ * much higher.
+ *
+ * The Bloom filter is seeded using a value provided by the caller. Using a
+ * distinct seed value on every call makes it unlikely that the same false
+ * positives will reoccur when the same set is fingerprinted a second time.
+ * Callers that don't care about this pass a constant as their seed, typically
+ * 0. Callers can use a pseudo-random seed in the range of 0 - INT_MAX by
+ * calling random().
+ */
+bloom_filter *
+bloom_create(int64 total_elems, int bloom_work_mem, uint64 seed)
+{
+ bloom_filter *filter;
+ int bloom_power;
+ uint64 bitset_bytes;
+ uint64 bitset_bits;
+
+ /*
+ * Aim for two bytes per element; this is sufficient to get a false
+ * positive rate below 1%, independent of the size of the bitset or total
+ * number of elements. Also, if rounding down the size of the bitset to
+ * the next lowest power of two turns out to be a significant drop, the
+ * false positive rate still won't exceed 2% in almost all cases.
+ */
+ bitset_bytes = Min(bloom_work_mem * UINT64CONST(1024), total_elems * 2);
+ bitset_bytes = Max(1024 * 1024, bitset_bytes);
+
+ /*
+ * Size in bits should be the highest power of two <= target. bitset_bits
+ * is uint64 because PG_UINT32_MAX is 2^32 - 1, not 2^32
+ */
+ bloom_power = my_bloom_power(bitset_bytes * BITS_PER_BYTE);
+ bitset_bits = UINT64CONST(1) << bloom_power;
+ bitset_bytes = bitset_bits / BITS_PER_BYTE;
+
+ /* Allocate bloom filter with unset bitset */
+ filter = palloc0(offsetof(bloom_filter, bitset) +
+ sizeof(unsigned char) * bitset_bytes);
+ filter->k_hash_funcs = optimal_k(bitset_bits, total_elems);
+ filter->seed = seed;
+ filter->m = bitset_bits;
+
+ return filter;
+}
+
+/*
+ * Free Bloom filter
+ */
+void
+bloom_free(bloom_filter *filter)
+{
+ pfree(filter);
+}
+
+/*
+ * Add element to Bloom filter
+ */
+void
+bloom_add_element(bloom_filter *filter, unsigned char *elem, size_t len)
+{
+ uint32 hashes[MAX_HASH_FUNCS];
+ int i;
+
+ k_hashes(filter, hashes, elem, len);
+
+ /* Map a bit-wise address to a byte-wise address + bit offset */
+ for (i = 0; i < filter->k_hash_funcs; i++)
+ {
+ filter->bitset[hashes[i] >> 3] |= 1 << (hashes[i] & 7);
+ }
+}
+
+/*
+ * Test if Bloom filter definitely lacks element.
+ *
+ * Returns true if the element is definitely not in the set of elements
+ * observed by bloom_add_element(). Otherwise, returns false, indicating that
+ * element is probably present in set.
+ */
+bool
+bloom_lacks_element(bloom_filter *filter, unsigned char *elem, size_t len)
+{
+ uint32 hashes[MAX_HASH_FUNCS];
+ int i;
+
+ k_hashes(filter, hashes, elem, len);
+
+ /* Map a bit-wise address to a byte-wise address + bit offset */
+ for (i = 0; i < filter->k_hash_funcs; i++)
+ {
+ if (!(filter->bitset[hashes[i] >> 3] & (1 << (hashes[i] & 7))))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * What proportion of bits are currently set?
+ *
+ * Returns proportion, expressed as a multiplier of filter size. That should
+ * generally be close to 0.5, even when we have more than enough memory to
+ * ensure a false positive rate within target 1% to 2% band, since more hash
+ * functions are used as more memory is available per element.
+ *
+ * This is the only instrumentation that is low overhead enough to appear in
+ * debug traces. When debugging Bloom filter code, it's likely to be far more
+ * interesting to directly test the false positive rate.
+ */
+double
+bloom_prop_bits_set(bloom_filter *filter)
+{
+ int bitset_bytes = filter->m / BITS_PER_BYTE;
+ uint64 bits_set = pg_popcount((char *) filter->bitset, bitset_bytes);
+
+ return bits_set / (double) filter->m;
+}
+
+/*
+ * Which element in the sequence of powers of two is less than or equal to
+ * target_bitset_bits?
+ *
+ * Value returned here must be generally safe as the basis for actual bitset
+ * size.
+ *
+ * Bitset is never allowed to exceed 2 ^ 32 bits (512MB). This is sufficient
+ * for the needs of all current callers, and allows us to use 32-bit hash
+ * functions. It also makes it easy to stay under the MaxAllocSize restriction
+ * (caller needs to leave room for non-bitset fields that appear before
+ * flexible array member, so a 1GB bitset would use an allocation that just
+ * exceeds MaxAllocSize).
+ */
+static int
+my_bloom_power(uint64 target_bitset_bits)
+{
+ int bloom_power = -1;
+
+ while (target_bitset_bits > 0 && bloom_power < 32)
+ {
+ bloom_power++;
+ target_bitset_bits >>= 1;
+ }
+
+ return bloom_power;
+}
+
+/*
+ * Determine optimal number of hash functions based on size of filter in bits,
+ * and projected total number of elements. The optimal number is the number
+ * that minimizes the false positive rate.
+ */
+static int
+optimal_k(uint64 bitset_bits, int64 total_elems)
+{
+ int k = rint(log(2.0) * bitset_bits / total_elems);
+
+ return Max(1, Min(k, MAX_HASH_FUNCS));
+}
+
+/*
+ * Generate k hash values for element.
+ *
+ * Caller passes array, which is filled-in with k values determined by hashing
+ * caller's element.
+ *
+ * Only 2 real independent hash functions are actually used to support an
+ * interface of up to MAX_HASH_FUNCS hash functions; enhanced double hashing is
+ * used to make this work. The main reason we prefer enhanced double hashing
+ * to classic double hashing is that the latter has an issue with collisions
+ * when using power of two sized bitsets. See Dillinger & Manolios for full
+ * details.
+ */
+static void
+k_hashes(bloom_filter *filter, uint32 *hashes, unsigned char *elem, size_t len)
+{
+ uint64 hash;
+ uint32 x,
+ y;
+ uint64 m;
+ int i;
+
+ /* Use 64-bit hashing to get two independent 32-bit hashes */
+ hash = DatumGetUInt64(hash_any_extended(elem, len, filter->seed));
+ x = (uint32) hash;
+ y = (uint32) (hash >> 32);
+ m = filter->m;
+
+ x = mod_m(x, m);
+ y = mod_m(y, m);
+
+ /* Accumulate hashes */
+ hashes[0] = x;
+ for (i = 1; i < filter->k_hash_funcs; i++)
+ {
+ x = mod_m(x + y, m);
+ y = mod_m(y + i, m);
+
+ hashes[i] = x;
+ }
+}
+
+/*
+ * Calculate "val MOD m" inexpensively.
+ *
+ * Assumes that m (which is bitset size) is a power of two.
+ *
+ * Using a power of two number of bits for bitset size allows us to use bitwise
+ * AND operations to calculate the modulo of a hash value. It's also a simple
+ * way of avoiding the modulo bias effect.
+ */
+static inline uint32
+mod_m(uint32 val, uint64 m)
+{
+ Assert(m <= PG_UINT32_MAX + UINT64CONST(1));
+ Assert(((m - 1) & m) == 0);
+
+ return val & (m - 1);
+}
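
For illustration, a minimal caller sketch for the Bloom filter above: fingerprint a handful of keys, then use bloom_lacks_element() as a cheap pre-filter before an authoritative lookup. The keys and the 4096kB work_mem figure are hypothetical.

    /* Hypothetical pre-filtering of lookups with a Bloom filter. */
    static void
    bloom_demo(void)
    {
        const char *keys[] = {"alpha", "beta", "gamma"};
        bloom_filter *filter;
        int         i;

        /* expect ~3 elements; allow up to 4096kB (4MB) for the bitset */
        filter = bloom_create(3, 4096, 0);

        for (i = 0; i < lengthof(keys); i++)
            bloom_add_element(filter, (unsigned char *) keys[i],
                              strlen(keys[i]));

        if (bloom_lacks_element(filter, (unsigned char *) "delta", 5))
            elog(LOG, "\"delta\" is definitely not in the set");
        else
            elog(LOG, "\"delta\" may be in the set; do the real lookup");

        bloom_free(filter);
    }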
diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c
new file mode 100644
index 0000000..99128b2
--- /dev/null
+++ b/src/backend/lib/dshash.c
@@ -0,0 +1,882 @@
+/*-------------------------------------------------------------------------
+ *
+ * dshash.c
+ * Concurrent hash tables backed by dynamic shared memory areas.
+ *
+ * This is an open hashing hash table, with a linked list at each table
+ * entry. It supports dynamic resizing, as required to prevent the linked
+ * lists from growing too long on average. Currently, only growing is
+ * supported: the hash table never becomes smaller.
+ *
+ * To deal with concurrency, it has a fixed size set of partitions, each of
+ * which is independently locked. Each bucket maps to a partition; so insert,
+ * find and iterate operations normally only acquire one lock. Therefore,
+ * good concurrency is achieved whenever such operations don't collide at the
+ * lock partition level. However, when a resize operation begins, all
+ * partition locks must be acquired simultaneously for a brief period. This
+ * is only expected to happen a small number of times until a stable size is
+ * found, since growth is geometric.
+ *
+ * Future versions may support iterators and incremental resizing; for now
+ * the implementation is minimalist.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/lib/dshash.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "common/hashfn.h"
+#include "lib/dshash.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "utils/dsa.h"
+#include "utils/memutils.h"
+
+/*
+ * An item in the hash table. This wraps the user's entry object in an
+ * envelope that holds the entry's hash value and a pointer to the next
+ * item in the bucket.
+ */
+struct dshash_table_item
+{
+ /* The next item in the same bucket. */
+ dsa_pointer next;
+ /* The hashed key, to avoid having to recompute it. */
+ dshash_hash hash;
+ /* The user's entry object follows here. See ENTRY_FROM_ITEM(item). */
+};
+
+/*
+ * The number of partitions for locking purposes. This is set to match
+ * NUM_BUFFER_PARTITIONS for now, on the basis that whatever's good enough for
+ * the buffer pool must be good enough for any other purpose. This could
+ * become a runtime parameter in future.
+ */
+#define DSHASH_NUM_PARTITIONS_LOG2 7
+#define DSHASH_NUM_PARTITIONS (1 << DSHASH_NUM_PARTITIONS_LOG2)
+
+/* A magic value used to identify our hash tables. */
+#define DSHASH_MAGIC 0x75ff6a20
+
+/*
+ * Tracking information for each lock partition. Initially, each partition
+ * corresponds to one bucket, but each time the hash table grows, the buckets
+ * covered by each partition split so the number of buckets covered doubles.
+ *
+ * We might want to add padding here so that each partition is on a different
+ * cache line, but doing so would bloat this structure considerably.
+ */
+typedef struct dshash_partition
+{
+ LWLock lock; /* Protects all buckets in this partition. */
+ size_t count; /* # of items in this partition's buckets */
+} dshash_partition;
+
+/*
+ * The head object for a hash table. This will be stored in dynamic shared
+ * memory.
+ */
+typedef struct dshash_table_control
+{
+ dshash_table_handle handle;
+ uint32 magic;
+ dshash_partition partitions[DSHASH_NUM_PARTITIONS];
+ int lwlock_tranche_id;
+
+ /*
+ * The following members are written to only when ALL partition locks are
+ * held. They can be read when any one partition lock is held.
+ */
+
+ /* Number of buckets expressed as power of 2 (8 = 256 buckets). */
+ size_t size_log2; /* log2(number of buckets) */
+ dsa_pointer buckets; /* current bucket array */
+} dshash_table_control;
+
+/*
+ * Per-backend state for a dynamic hash table.
+ */
+struct dshash_table
+{
+ dsa_area *area; /* Backing dynamic shared memory area. */
+ dshash_parameters params; /* Parameters. */
+ void *arg; /* User-supplied data pointer. */
+ dshash_table_control *control; /* Control object in DSM. */
+ dsa_pointer *buckets; /* Current bucket pointers in DSM. */
+ size_t size_log2; /* log2(number of buckets) */
+};
+
+/* Given a pointer to an item, find the entry (user data) it holds. */
+#define ENTRY_FROM_ITEM(item) \
+ ((char *)(item) + MAXALIGN(sizeof(dshash_table_item)))
+
+/* Given a pointer to an entry, find the item that holds it. */
+#define ITEM_FROM_ENTRY(entry) \
+ ((dshash_table_item *)((char *)(entry) - \
+ MAXALIGN(sizeof(dshash_table_item))))
+
+/* How many resize operations (bucket splits) have there been? */
+#define NUM_SPLITS(size_log2) \
+ (size_log2 - DSHASH_NUM_PARTITIONS_LOG2)
+
+/* How many buckets are there in each partition at a given size? */
+#define BUCKETS_PER_PARTITION(size_log2) \
+ (((size_t) 1) << NUM_SPLITS(size_log2))
+
+/* Max entries before we need to grow. Half + quarter = 75% load factor. */
+#define MAX_COUNT_PER_PARTITION(hash_table) \
+ (BUCKETS_PER_PARTITION(hash_table->size_log2) / 2 + \
+ BUCKETS_PER_PARTITION(hash_table->size_log2) / 4)
+
+/* Choose partition based on the highest order bits of the hash. */
+#define PARTITION_FOR_HASH(hash) \
+ (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - DSHASH_NUM_PARTITIONS_LOG2))
+
+/*
+ * Find the bucket index for a given hash and table size. Each time the table
+ * doubles in size, the appropriate bucket for a given hash value doubles and
+ * possibly adds one, depending on the newly revealed bit, so that all buckets
+ * are split.
+ */
+#define BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, size_log2) \
+ (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - (size_log2)))
+
+/* The index of the first bucket in a given partition. */
+#define BUCKET_INDEX_FOR_PARTITION(partition, size_log2) \
+ ((partition) << NUM_SPLITS(size_log2))
+
+/* The head of the active bucket for a given hash value (lvalue). */
+#define BUCKET_FOR_HASH(hash_table, hash) \
+ (hash_table->buckets[ \
+ BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, \
+ hash_table->size_log2)])
+
+static void delete_item(dshash_table *hash_table,
+ dshash_table_item *item);
+static void resize(dshash_table *hash_table, size_t new_size);
+static inline void ensure_valid_bucket_pointers(dshash_table *hash_table);
+static inline dshash_table_item *find_in_bucket(dshash_table *hash_table,
+ const void *key,
+ dsa_pointer item_pointer);
+static void insert_item_into_bucket(dshash_table *hash_table,
+ dsa_pointer item_pointer,
+ dshash_table_item *item,
+ dsa_pointer *bucket);
+static dshash_table_item *insert_into_bucket(dshash_table *hash_table,
+ const void *key,
+ dsa_pointer *bucket);
+static bool delete_key_from_bucket(dshash_table *hash_table,
+ const void *key,
+ dsa_pointer *bucket_head);
+static bool delete_item_from_bucket(dshash_table *hash_table,
+ dshash_table_item *item,
+ dsa_pointer *bucket_head);
+static inline dshash_hash hash_key(dshash_table *hash_table, const void *key);
+static inline bool equal_keys(dshash_table *hash_table,
+ const void *a, const void *b);
+
+#define PARTITION_LOCK(hash_table, i) \
+ (&(hash_table)->control->partitions[(i)].lock)
+
+#define ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table) \
+ Assert(!LWLockAnyHeldByMe(&(hash_table)->control->partitions[0].lock, \
+ DSHASH_NUM_PARTITIONS, sizeof(dshash_partition)))
+
+/*
+ * Create a new hash table backed by the given dynamic shared area, with the
+ * given parameters. The returned object is allocated in backend-local memory
+ * using the current MemoryContext. 'arg' will be passed through to the
+ * compare and hash functions.
+ */
+dshash_table *
+dshash_create(dsa_area *area, const dshash_parameters *params, void *arg)
+{
+ dshash_table *hash_table;
+ dsa_pointer control;
+
+ /* Allocate the backend-local object representing the hash table. */
+ hash_table = palloc(sizeof(dshash_table));
+
+ /* Allocate the control object in shared memory. */
+ control = dsa_allocate(area, sizeof(dshash_table_control));
+
+ /* Set up the local and shared hash table structs. */
+ hash_table->area = area;
+ hash_table->params = *params;
+ hash_table->arg = arg;
+ hash_table->control = dsa_get_address(area, control);
+ hash_table->control->handle = control;
+ hash_table->control->magic = DSHASH_MAGIC;
+ hash_table->control->lwlock_tranche_id = params->tranche_id;
+
+ /* Set up the array of lock partitions. */
+ {
+ dshash_partition *partitions = hash_table->control->partitions;
+ int tranche_id = hash_table->control->lwlock_tranche_id;
+ int i;
+
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ {
+ LWLockInitialize(&partitions[i].lock, tranche_id);
+ partitions[i].count = 0;
+ }
+ }
+
+ /*
+ * Set up the initial array of buckets. Our initial size is the same as
+ * the number of partitions.
+ */
+ hash_table->control->size_log2 = DSHASH_NUM_PARTITIONS_LOG2;
+ hash_table->control->buckets =
+ dsa_allocate_extended(area,
+ sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS,
+ DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);
+ if (!DsaPointerIsValid(hash_table->control->buckets))
+ {
+ dsa_free(area, control);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on DSA request of size %zu.",
+ sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS)));
+ }
+ hash_table->buckets = dsa_get_address(area,
+ hash_table->control->buckets);
+ hash_table->size_log2 = hash_table->control->size_log2;
+
+ return hash_table;
+}
+
+/*
+ * Attach to an existing hash table using a handle. The returned object is
+ * allocated in backend-local memory using the current MemoryContext. 'arg'
+ * will be passed through to the compare and hash functions.
+ */
+dshash_table *
+dshash_attach(dsa_area *area, const dshash_parameters *params,
+ dshash_table_handle handle, void *arg)
+{
+ dshash_table *hash_table;
+ dsa_pointer control;
+
+ /* Allocate the backend-local object representing the hash table. */
+ hash_table = palloc(sizeof(dshash_table));
+
+ /* Find the control object in shared memory. */
+ control = handle;
+
+ /* Set up the local hash table struct. */
+ hash_table->area = area;
+ hash_table->params = *params;
+ hash_table->arg = arg;
+ hash_table->control = dsa_get_address(area, control);
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+
+ /*
+ * These will later be set to the correct values by
+ * ensure_valid_bucket_pointers(), at which time we'll be holding a
+ * partition lock for interlocking against concurrent resizing.
+ */
+ hash_table->buckets = NULL;
+ hash_table->size_log2 = 0;
+
+ return hash_table;
+}
+
+/*
+ * Detach from a hash table. This frees backend-local resources associated
+ * with the hash table, but the hash table will continue to exist until it is
+ * either explicitly destroyed (by a backend that is still attached to it), or
+ * the area that backs it is returned to the operating system.
+ */
+void
+dshash_detach(dshash_table *hash_table)
+{
+ ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
+
+ /* The hash table may have been destroyed. Just free local memory. */
+ pfree(hash_table);
+}
+
+/*
+ * Destroy a hash table, returning all memory to the area. The caller must be
+ * certain that no other backend will attempt to access the hash table before
+ * calling this function. Other backends must explicitly call dshash_detach to
+ * free up backend-local memory associated with the hash table. The backend
+ * that calls dshash_destroy must not call dshash_detach.
+ */
+void
+dshash_destroy(dshash_table *hash_table)
+{
+ size_t size;
+ size_t i;
+
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+ ensure_valid_bucket_pointers(hash_table);
+
+ /* Free all the entries. */
+ size = ((size_t) 1) << hash_table->size_log2;
+ for (i = 0; i < size; ++i)
+ {
+ dsa_pointer item_pointer = hash_table->buckets[i];
+
+ while (DsaPointerIsValid(item_pointer))
+ {
+ dshash_table_item *item;
+ dsa_pointer next_item_pointer;
+
+ item = dsa_get_address(hash_table->area, item_pointer);
+ next_item_pointer = item->next;
+ dsa_free(hash_table->area, item_pointer);
+ item_pointer = next_item_pointer;
+ }
+ }
+
+ /*
+ * Vandalize the control block to help catch programming errors where
+ * other backends access the memory formerly occupied by this hash table.
+ */
+ hash_table->control->magic = 0;
+
+ /* Free the active table and control object. */
+ dsa_free(hash_table->area, hash_table->control->buckets);
+ dsa_free(hash_table->area, hash_table->control->handle);
+
+ pfree(hash_table);
+}
+
+/*
+ * Get a handle that can be used by other processes to attach to this hash
+ * table.
+ */
+dshash_table_handle
+dshash_get_hash_table_handle(dshash_table *hash_table)
+{
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+
+ return hash_table->control->handle;
+}
+
+/*
+ * Look up an entry, given a key. Returns a pointer to an entry if one can be
+ * found with the given key. Returns NULL if the key is not found. If a
+ * non-NULL value is returned, the entry is locked and must be released by
+ * calling dshash_release_lock. If an error is raised before
+ * dshash_release_lock is called, the lock will be released automatically, but
+ * the caller must take care to ensure that the entry is not left corrupted.
+ * The lock mode is either shared or exclusive depending on 'exclusive'.
+ *
+ * The caller must not hold a lock already.
+ *
+ * Note that the lock held is in fact an LWLock, so interrupts will be held on
+ * return from this function, and not resumed until dshash_release_lock is
+ * called. It is a very good idea for the caller to release the lock quickly.
+ */
+void *
+dshash_find(dshash_table *hash_table, const void *key, bool exclusive)
+{
+ dshash_hash hash;
+ size_t partition;
+ dshash_table_item *item;
+
+ hash = hash_key(hash_table, key);
+ partition = PARTITION_FOR_HASH(hash);
+
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+ ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
+
+ LWLockAcquire(PARTITION_LOCK(hash_table, partition),
+ exclusive ? LW_EXCLUSIVE : LW_SHARED);
+ ensure_valid_bucket_pointers(hash_table);
+
+ /* Search the active bucket. */
+ item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));
+
+ if (!item)
+ {
+ /* Not found. */
+ LWLockRelease(PARTITION_LOCK(hash_table, partition));
+ return NULL;
+ }
+ else
+ {
+ /* The caller will free the lock by calling dshash_release_lock. */
+ return ENTRY_FROM_ITEM(item);
+ }
+}
+
+/*
+ * Returns a pointer to an exclusively locked item which must be released with
+ * dshash_release_lock. If the key is found in the hash table, 'found' is set
+ * to true and a pointer to the existing entry is returned. If the key is not
+ * found, 'found' is set to false, and a pointer to a newly created entry is
+ * returned.
+ *
+ * The notes above dshash_find() regarding locking and error handling apply
+ * equally here.
+ */
+void *
+dshash_find_or_insert(dshash_table *hash_table,
+ const void *key,
+ bool *found)
+{
+ dshash_hash hash;
+ size_t partition_index;
+ dshash_partition *partition;
+ dshash_table_item *item;
+
+ hash = hash_key(hash_table, key);
+ partition_index = PARTITION_FOR_HASH(hash);
+ partition = &hash_table->control->partitions[partition_index];
+
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+ ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
+
+restart:
+ LWLockAcquire(PARTITION_LOCK(hash_table, partition_index),
+ LW_EXCLUSIVE);
+ ensure_valid_bucket_pointers(hash_table);
+
+ /* Search the active bucket. */
+ item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));
+
+ if (item)
+ *found = true;
+ else
+ {
+ *found = false;
+
+ /* Check if we are getting too full. */
+ if (partition->count > MAX_COUNT_PER_PARTITION(hash_table))
+ {
+ /*
+ * The load factor (= keys / buckets) for all buckets protected by
+ * this partition is > 0.75. Presumably the same applies
+ * generally across the whole hash table (though we don't attempt
+ * to track that directly to avoid contention on some kind of
+ * central counter; we just assume that this partition is
+ * representative). This is a good time to resize.
+ *
+ * Give up our existing lock first, because resizing needs to
+ * reacquire all the locks in the right order to avoid deadlocks.
+ */
+ LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
+ resize(hash_table, hash_table->size_log2 + 1);
+
+ goto restart;
+ }
+
+ /* Finally we can try to insert the new item. */
+ item = insert_into_bucket(hash_table, key,
+ &BUCKET_FOR_HASH(hash_table, hash));
+ item->hash = hash;
+ /* Adjust per-lock-partition counter for load factor knowledge. */
+ ++partition->count;
+ }
+
+ /* The caller must release the lock with dshash_release_lock. */
+ return ENTRY_FROM_ITEM(item);
+}
+
+/*
+ * Remove an entry by key. Returns true if the key was found and the
+ * corresponding entry was removed.
+ *
+ * To delete an entry that you already have a pointer to, see
+ * dshash_delete_entry.
+ */
+bool
+dshash_delete_key(dshash_table *hash_table, const void *key)
+{
+ dshash_hash hash;
+ size_t partition;
+ bool found;
+
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+ ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
+
+ hash = hash_key(hash_table, key);
+ partition = PARTITION_FOR_HASH(hash);
+
+ LWLockAcquire(PARTITION_LOCK(hash_table, partition), LW_EXCLUSIVE);
+ ensure_valid_bucket_pointers(hash_table);
+
+ if (delete_key_from_bucket(hash_table, key,
+ &BUCKET_FOR_HASH(hash_table, hash)))
+ {
+ Assert(hash_table->control->partitions[partition].count > 0);
+ found = true;
+ --hash_table->control->partitions[partition].count;
+ }
+ else
+ found = false;
+
+ LWLockRelease(PARTITION_LOCK(hash_table, partition));
+
+ return found;
+}
+
+/*
+ * Remove an entry. The entry must already be exclusively locked, and must
+ * have been obtained by dshash_find or dshash_find_or_insert. Note that this
+ * function releases the lock just like dshash_release_lock.
+ *
+ * To delete an entry by key, see dshash_delete_key.
+ */
+void
+dshash_delete_entry(dshash_table *hash_table, void *entry)
+{
+ dshash_table_item *item = ITEM_FROM_ENTRY(entry);
+ size_t partition = PARTITION_FOR_HASH(item->hash);
+
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+ Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition),
+ LW_EXCLUSIVE));
+
+ delete_item(hash_table, item);
+ LWLockRelease(PARTITION_LOCK(hash_table, partition));
+}
+
+/*
+ * Unlock an entry which was locked by dshash_find or dshash_find_or_insert.
+ */
+void
+dshash_release_lock(dshash_table *hash_table, void *entry)
+{
+ dshash_table_item *item = ITEM_FROM_ENTRY(entry);
+ size_t partition_index = PARTITION_FOR_HASH(item->hash);
+
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+
+ LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
+}
+
+/*
+ * A compare function that forwards to memcmp.
+ */
+int
+dshash_memcmp(const void *a, const void *b, size_t size, void *arg)
+{
+ return memcmp(a, b, size);
+}
+
+/*
+ * A hash function that forwards to tag_hash.
+ */
+dshash_hash
+dshash_memhash(const void *v, size_t size, void *arg)
+{
+ return tag_hash(v, size);
+}
+
+/*
+ * Print debugging information about the internal state of the hash table to
+ * stderr. The caller must hold no partition locks.
+ */
+void
+dshash_dump(dshash_table *hash_table)
+{
+ size_t i;
+ size_t j;
+
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+ ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
+
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ {
+ Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
+ LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_SHARED);
+ }
+
+ ensure_valid_bucket_pointers(hash_table);
+
+ fprintf(stderr,
+ "hash table size = %zu\n", (size_t) 1 << hash_table->size_log2);
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ {
+ dshash_partition *partition = &hash_table->control->partitions[i];
+ size_t begin = BUCKET_INDEX_FOR_PARTITION(i, hash_table->size_log2);
+ size_t end = BUCKET_INDEX_FOR_PARTITION(i + 1, hash_table->size_log2);
+
+ fprintf(stderr, " partition %zu\n", i);
+ fprintf(stderr,
+ " active buckets (key count = %zu)\n", partition->count);
+
+ for (j = begin; j < end; ++j)
+ {
+ size_t count = 0;
+ dsa_pointer bucket = hash_table->buckets[j];
+
+ while (DsaPointerIsValid(bucket))
+ {
+ dshash_table_item *item;
+
+ item = dsa_get_address(hash_table->area, bucket);
+
+ bucket = item->next;
+ ++count;
+ }
+ fprintf(stderr, " bucket %zu (key count = %zu)\n", j, count);
+ }
+ }
+
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ LWLockRelease(PARTITION_LOCK(hash_table, i));
+}
+
+/*
+ * Delete a locked item to which we have a pointer.
+ */
+static void
+delete_item(dshash_table *hash_table, dshash_table_item *item)
+{
+ size_t hash = item->hash;
+ size_t partition = PARTITION_FOR_HASH(hash);
+
+ Assert(LWLockHeldByMe(PARTITION_LOCK(hash_table, partition)));
+
+ if (delete_item_from_bucket(hash_table, item,
+ &BUCKET_FOR_HASH(hash_table, hash)))
+ {
+ Assert(hash_table->control->partitions[partition].count > 0);
+ --hash_table->control->partitions[partition].count;
+ }
+ else
+ {
+ Assert(false);
+ }
+}
+
+/*
+ * Grow the hash table if necessary to the requested number of buckets. The
+ * requested size must be double some previously observed size.
+ *
+ * Must be called without any partition lock held.
+ */
+static void
+resize(dshash_table *hash_table, size_t new_size_log2)
+{
+ dsa_pointer old_buckets;
+ dsa_pointer new_buckets_shared;
+ dsa_pointer *new_buckets;
+ size_t size;
+ size_t new_size = ((size_t) 1) << new_size_log2;
+ size_t i;
+
+ /*
+ * Acquire the locks for all lock partitions. This is expensive, but we
+ * shouldn't have to do it many times.
+ */
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ {
+ Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
+
+ LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_EXCLUSIVE);
+ if (i == 0 && hash_table->control->size_log2 >= new_size_log2)
+ {
+ /*
+ * Another backend has already increased the size; we can avoid
+ * obtaining all the locks and return early.
+ */
+ LWLockRelease(PARTITION_LOCK(hash_table, 0));
+ return;
+ }
+ }
+
+ Assert(new_size_log2 == hash_table->control->size_log2 + 1);
+
+ /* Allocate the space for the new table. */
+ new_buckets_shared = dsa_allocate0(hash_table->area,
+ sizeof(dsa_pointer) * new_size);
+ new_buckets = dsa_get_address(hash_table->area, new_buckets_shared);
+
+ /*
+ * We've allocated the new bucket array; all that remains to do now is to
+ * reinsert all items, which amounts to adjusting all the pointers.
+ */
+ size = ((size_t) 1) << hash_table->control->size_log2;
+ for (i = 0; i < size; ++i)
+ {
+ dsa_pointer item_pointer = hash_table->buckets[i];
+
+ while (DsaPointerIsValid(item_pointer))
+ {
+ dshash_table_item *item;
+ dsa_pointer next_item_pointer;
+
+ item = dsa_get_address(hash_table->area, item_pointer);
+ next_item_pointer = item->next;
+ insert_item_into_bucket(hash_table, item_pointer, item,
+ &new_buckets[BUCKET_INDEX_FOR_HASH_AND_SIZE(item->hash,
+ new_size_log2)]);
+ item_pointer = next_item_pointer;
+ }
+ }
+
+ /* Swap the hash table into place and free the old one. */
+ old_buckets = hash_table->control->buckets;
+ hash_table->control->buckets = new_buckets_shared;
+ hash_table->control->size_log2 = new_size_log2;
+ hash_table->buckets = new_buckets;
+ dsa_free(hash_table->area, old_buckets);
+
+ /* Release all the locks. */
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ LWLockRelease(PARTITION_LOCK(hash_table, i));
+}
+
+/*
+ * Make sure that our backend-local bucket pointers are up to date. The
+ * caller must have locked one lock partition, which prevents resize() from
+ * running concurrently.
+ */
+static inline void
+ensure_valid_bucket_pointers(dshash_table *hash_table)
+{
+ if (hash_table->size_log2 != hash_table->control->size_log2)
+ {
+ hash_table->buckets = dsa_get_address(hash_table->area,
+ hash_table->control->buckets);
+ hash_table->size_log2 = hash_table->control->size_log2;
+ }
+}
+
+/*
+ * Scan a locked bucket for a match, using the provided compare function.
+ */
+static inline dshash_table_item *
+find_in_bucket(dshash_table *hash_table, const void *key,
+ dsa_pointer item_pointer)
+{
+ while (DsaPointerIsValid(item_pointer))
+ {
+ dshash_table_item *item;
+
+ item = dsa_get_address(hash_table->area, item_pointer);
+ if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
+ return item;
+ item_pointer = item->next;
+ }
+ return NULL;
+}
+
+/*
+ * Insert an already-allocated item into a bucket.
+ */
+static void
+insert_item_into_bucket(dshash_table *hash_table,
+ dsa_pointer item_pointer,
+ dshash_table_item *item,
+ dsa_pointer *bucket)
+{
+ Assert(item == dsa_get_address(hash_table->area, item_pointer));
+
+ item->next = *bucket;
+ *bucket = item_pointer;
+}
+
+/*
+ * Allocate space for an entry with the given key and insert it into the
+ * provided bucket.
+ */
+static dshash_table_item *
+insert_into_bucket(dshash_table *hash_table,
+ const void *key,
+ dsa_pointer *bucket)
+{
+ dsa_pointer item_pointer;
+ dshash_table_item *item;
+
+ item_pointer = dsa_allocate(hash_table->area,
+ hash_table->params.entry_size +
+ MAXALIGN(sizeof(dshash_table_item)));
+ item = dsa_get_address(hash_table->area, item_pointer);
+ memcpy(ENTRY_FROM_ITEM(item), key, hash_table->params.key_size);
+ insert_item_into_bucket(hash_table, item_pointer, item, bucket);
+ return item;
+}
+
+/*
+ * Search a bucket for a matching key and delete it.
+ */
+static bool
+delete_key_from_bucket(dshash_table *hash_table,
+ const void *key,
+ dsa_pointer *bucket_head)
+{
+ while (DsaPointerIsValid(*bucket_head))
+ {
+ dshash_table_item *item;
+
+ item = dsa_get_address(hash_table->area, *bucket_head);
+
+ if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
+ {
+ dsa_pointer next;
+
+ next = item->next;
+ dsa_free(hash_table->area, *bucket_head);
+ *bucket_head = next;
+
+ return true;
+ }
+ bucket_head = &item->next;
+ }
+ return false;
+}
+
+/*
+ * Delete the specified item from the bucket.
+ */
+static bool
+delete_item_from_bucket(dshash_table *hash_table,
+ dshash_table_item *item,
+ dsa_pointer *bucket_head)
+{
+ while (DsaPointerIsValid(*bucket_head))
+ {
+ dshash_table_item *bucket_item;
+
+ bucket_item = dsa_get_address(hash_table->area, *bucket_head);
+
+ if (bucket_item == item)
+ {
+ dsa_pointer next;
+
+ next = item->next;
+ dsa_free(hash_table->area, *bucket_head);
+ *bucket_head = next;
+ return true;
+ }
+ bucket_head = &bucket_item->next;
+ }
+ return false;
+}
+
+/*
+ * Compute the hash value for a key.
+ */
+static inline dshash_hash
+hash_key(dshash_table *hash_table, const void *key)
+{
+ return hash_table->params.hash_function(key,
+ hash_table->params.key_size,
+ hash_table->arg);
+}
+
+/*
+ * Check whether two keys compare equal.
+ */
+static inline bool
+equal_keys(dshash_table *hash_table, const void *a, const void *b)
+{
+ return hash_table->params.compare_function(a, b,
+ hash_table->params.key_size,
+ hash_table->arg) == 0;
+}
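
For illustration, a minimal caller sketch for dshash: create a table in a dsa_area, then upsert a counter entry under an exclusive entry lock. The entry layout and tranche id are hypothetical, and dshash_parameters is assumed to carry key_size, entry_size, the two callbacks and tranche_id, matching how dshash_create() uses it above.

    /* Hypothetical entry type; dshash expects the key at the start of it. */
    typedef struct DemoEntry
    {
        uint32      key;        /* hashed/compared via the callbacks below */
        int64       counter;    /* payload */
    } DemoEntry;

    static const dshash_parameters demo_params = {
        sizeof(uint32),         /* key_size */
        sizeof(DemoEntry),      /* entry_size */
        dshash_memcmp,          /* compare_function */
        dshash_memhash,         /* hash_function */
        LWTRANCHE_FIRST_USER_DEFINED    /* tranche_id (hypothetical choice) */
    };

    static void
    dshash_demo(dsa_area *area)
    {
        dshash_table *table = dshash_create(area, &demo_params, NULL);
        uint32      key = 42;
        bool        found;
        DemoEntry  *entry;

        entry = dshash_find_or_insert(table, &key, &found);
        if (!found)
            entry->counter = 0;     /* initialize a freshly inserted entry */
        entry->counter++;
        dshash_release_lock(table, entry);  /* drop the partition lock */

        dshash_detach(table);
    }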
diff --git a/src/backend/lib/hyperloglog.c b/src/backend/lib/hyperloglog.c
new file mode 100644
index 0000000..f4e0241
--- /dev/null
+++ b/src/backend/lib/hyperloglog.c
@@ -0,0 +1,255 @@
+/*-------------------------------------------------------------------------
+ *
+ * hyperloglog.c
+ * HyperLogLog cardinality estimator
+ *
+ * Portions Copyright (c) 2014-2021, PostgreSQL Global Development Group
+ *
+ * Based on Hideaki Ohno's C++ implementation. This is probably not ideally
+ * suited to estimating the cardinality of very large sets; in particular, we
+ * have not attempted to further optimize the implementation as described in
+ * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
+ * Engineering of a State of The Art Cardinality Estimation Algorithm".
+ *
+ * A sparse representation of HyperLogLog state is used, with fixed space
+ * overhead.
+ *
+ * The copyright terms of Ohno's original version (the MIT license) follow.
+ *
+ * IDENTIFICATION
+ * src/backend/lib/hyperloglog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the 'Software'), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+
+#include "lib/hyperloglog.h"
+#include "port/pg_bitutils.h"
+
+#define POW_2_32 (4294967296.0)
+#define NEG_POW_2_32 (-4294967296.0)
+
+static inline uint8 rho(uint32 x, uint8 b);
+
+/*
+ * Initialize HyperLogLog track state, by bit width
+ *
+ * bwidth is bit width (so register size will be 2 to the power of bwidth).
+ * Must be between 4 and 16 inclusive.
+ */
+void
+initHyperLogLog(hyperLogLogState *cState, uint8 bwidth)
+{
+ double alpha;
+
+ if (bwidth < 4 || bwidth > 16)
+ elog(ERROR, "bit width must be between 4 and 16 inclusive");
+
+ cState->registerWidth = bwidth;
+ cState->nRegisters = (Size) 1 << bwidth;
+ cState->arrSize = sizeof(uint8) * cState->nRegisters + 1;
+
+ /*
+ * Initialize hashes array to zero, not negative infinity, per discussion
+ * of the coupon collector problem in the HyperLogLog paper
+ */
+ cState->hashesArr = palloc0(cState->arrSize);
+
+ /*
+ * "alpha" is a value that for each possible number of registers (m) is
+ * used to correct a systematic multiplicative bias present in m ^ 2 Z (Z
+ * is "the indicator function" through which we finally compute E,
+ * estimated cardinality).
+ */
+ switch (cState->nRegisters)
+ {
+ case 16:
+ alpha = 0.673;
+ break;
+ case 32:
+ alpha = 0.697;
+ break;
+ case 64:
+ alpha = 0.709;
+ break;
+ default:
+ alpha = 0.7213 / (1.0 + 1.079 / cState->nRegisters);
+ }
+
+ /*
+ * Precalculate alpha m ^ 2, later used to generate "raw" HyperLogLog
+ * estimate E
+ */
+ cState->alphaMM = alpha * cState->nRegisters * cState->nRegisters;
+}
+
+/*
+ * Initialize HyperLogLog track state, by error rate
+ *
+ * Instead of specifying bwidth (number of bits used for addressing the
+ * register), this method allows sizing the counter for a particular error
+ * rate using a simple formula from the paper:
+ *
+ * e = 1.04 / sqrt(m)
+ *
+ * where 'm' is the number of registers, i.e. (2^bwidth). The method
+ * finds the lowest bwidth with 'e' below the requested error rate, and
+ * then uses it to initialize the counter.
+ *
+ * As bwidth has to be between 4 and 16, the worst possible error rate
+ * is between ~25% (bwidth=4) and 0.4% (bwidth=16).
+ */
+void
+initHyperLogLogError(hyperLogLogState *cState, double error)
+{
+ uint8 bwidth = 4;
+
+ while (bwidth < 16)
+ {
+ double m = (Size) 1 << bwidth;
+
+ if (1.04 / sqrt(m) < error)
+ break;
+ bwidth++;
+ }
+
+ initHyperLogLog(cState, bwidth);
+}
+
+/*
+ * Free HyperLogLog track state
+ *
+ * Releases allocated resources, but not the state itself (in case it's not
+ * allocated by palloc).
+ */
+void
+freeHyperLogLog(hyperLogLogState *cState)
+{
+ Assert(cState->hashesArr != NULL);
+ pfree(cState->hashesArr);
+}
+
+/*
+ * Adds element to the estimator, from caller-supplied hash.
+ *
+ * It is critical that the hash value passed be an actual hash value, typically
+ * generated using hash_any(). The algorithm relies on a specific bit-pattern
+ * observable in conjunction with stochastic averaging. There must be a
+ * uniform distribution of bits in hash values for each distinct original value
+ * observed.
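+ *
+ * As a concrete example, with registerWidth = 4 and hash = 0xA3000000, the
+ * top 4 bits give register index 0xA (10), and the rank is taken from the
+ * remaining 28 bits: 0xA3000000 << 4 = 0x30000000, whose leftmost set bit
+ * is at position 3, so register 10 is raised to at least 3.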
+ */
+void
+addHyperLogLog(hyperLogLogState *cState, uint32 hash)
+{
+ uint8 count;
+ uint32 index;
+
+ /* Use the first "k" (registerWidth) bits as a zero based index */
+ index = hash >> (BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
+
+ /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
+ count = rho(hash << cState->registerWidth,
+ BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
+
+ cState->hashesArr[index] = Max(count, cState->hashesArr[index]);
+}
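+
+/*
+ * Putting the pieces together, typical use looks roughly like this sketch.
+ * The input loop (get_next_key) and variable names are illustrative only;
+ * hash_bytes() from common/hashfn.h is one suitable 32-bit hash, but any
+ * uniformly-distributed hash will do.  bwidth = 10 gives 1024 registers.
+ *
+ *     hyperLogLogState hll;
+ *     uint32           key;
+ *
+ *     initHyperLogLog(&hll, 10);
+ *     while (get_next_key(&key))
+ *         addHyperLogLog(&hll, hash_bytes((unsigned char *) &key, sizeof(key)));
+ *     elog(LOG, "distinct keys: ~%.0f", estimateHyperLogLog(&hll));
+ *     freeHyperLogLog(&hll);
+ */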
+
+/*
+ * Estimates cardinality, based on elements added so far
+ */
+double
+estimateHyperLogLog(hyperLogLogState *cState)
+{
+ double result;
+ double sum = 0.0;
+ int i;
+
+ for (i = 0; i < cState->nRegisters; i++)
+ {
+ sum += 1.0 / pow(2.0, cState->hashesArr[i]);
+ }
+
+ /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
+ result = cState->alphaMM / sum;
+
+ if (result <= (5.0 / 2.0) * cState->nRegisters)
+ {
+ /* Small range correction */
+ int zero_count = 0;
+
+ for (i = 0; i < cState->nRegisters; i++)
+ {
+ if (cState->hashesArr[i] == 0)
+ zero_count++;
+ }
+
+ if (zero_count != 0)
+ result = cState->nRegisters * log((double) cState->nRegisters /
+ zero_count);
+ }
+ else if (result > (1.0 / 30.0) * POW_2_32)
+ {
+ /* Large range correction */
+ result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32));
+ }
+
+ return result;
+}
+
+/*
+ * Worker for addHyperLogLog().
+ *
+ * Calculates the position of the first set bit within the first b bits of
+ * the x argument, counting from 1 at the most significant bit and reading
+ * towards the least significant bits.
+ *
+ * Example (when considering the first 10 bits of x):
+ *
+ * rho(x = 0b1000000000) returns 1
+ * rho(x = 0b0010000000) returns 3
+ * rho(x = 0b0000000000) returns b + 1
+ *
+ * "The binary address determined by the first b bits of x"
+ *
+ * Return value "j" used to index bit pattern to watch.
+ */
+static inline uint8
+rho(uint32 x, uint8 b)
+{
+ uint8 j = 1;
+
+ if (x == 0)
+ return b + 1;
+
+ j = 32 - pg_leftmost_one_pos32(x);
+
+ if (j > b)
+ return b + 1;
+
+ return j;
+}
diff --git a/src/backend/lib/ilist.c b/src/backend/lib/ilist.c
new file mode 100644
index 0000000..e9a07c1
--- /dev/null
+++ b/src/backend/lib/ilist.c
@@ -0,0 +1,111 @@
+/*-------------------------------------------------------------------------
+ *
+ * ilist.c
+ * support for integrated/inline doubly- and singly- linked lists
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/lib/ilist.c
+ *
+ * NOTES
+ * This file only contains functions that are too big to be considered
+ * for inlining. See ilist.h for most of the goodies.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "lib/ilist.h"
+
+/*
+ * Delete 'node' from list.
+ *
+ * It is not allowed to delete a 'node' which is not in the list 'head'
+ *
+ * Caution: this is O(n); consider using slist_delete_current() instead.
+ */
+void
+slist_delete(slist_head *head, slist_node *node)
+{
+ slist_node *last = &head->head;
+ slist_node *cur;
+ bool found PG_USED_FOR_ASSERTS_ONLY = false;
+
+ while ((cur = last->next) != NULL)
+ {
+ if (cur == node)
+ {
+ last->next = cur->next;
+#ifdef USE_ASSERT_CHECKING
+ found = true;
+#endif
+ break;
+ }
+ last = cur;
+ }
+ Assert(found);
+
+ slist_check(head);
+}
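+
+/*
+ * For the common delete-while-scanning case, the O(1) alternative mentioned
+ * above looks roughly like this (sketch; MyItem, its 'node' member and
+ * should_remove() are caller-defined, not part of this file):
+ *
+ *     slist_mutable_iter iter;
+ *
+ *     slist_foreach_modify(iter, &head)
+ *     {
+ *         MyItem *item = slist_container(MyItem, node, iter.cur);
+ *
+ *         if (should_remove(item))
+ *             slist_delete_current(&iter);
+ *     }
+ */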
+
+#ifdef ILIST_DEBUG
+/*
+ * Verify integrity of a doubly linked list
+ */
+void
+dlist_check(dlist_head *head)
+{
+ dlist_node *cur;
+
+ if (head == NULL)
+ elog(ERROR, "doubly linked list head address is NULL");
+
+ if (head->head.next == NULL && head->head.prev == NULL)
+ return; /* OK, initialized as zeroes */
+
+ /* iterate in forward direction */
+ for (cur = head->head.next; cur != &head->head; cur = cur->next)
+ {
+ if (cur == NULL ||
+ cur->next == NULL ||
+ cur->prev == NULL ||
+ cur->prev->next != cur ||
+ cur->next->prev != cur)
+ elog(ERROR, "doubly linked list is corrupted");
+ }
+
+ /* iterate in backward direction */
+ for (cur = head->head.prev; cur != &head->head; cur = cur->prev)
+ {
+ if (cur == NULL ||
+ cur->next == NULL ||
+ cur->prev == NULL ||
+ cur->prev->next != cur ||
+ cur->next->prev != cur)
+ elog(ERROR, "doubly linked list is corrupted");
+ }
+}
+
+/*
+ * Verify integrity of a singly linked list
+ */
+void
+slist_check(slist_head *head)
+{
+ slist_node *cur;
+
+ if (head == NULL)
+ elog(ERROR, "singly linked list head address is NULL");
+
+ /*
+ * there isn't much we can test in a singly linked list except that it
+ * actually ends sometime, i.e. hasn't introduced a cycle or similar
+ */
+ for (cur = head->head.next; cur != NULL; cur = cur->next)
+ ;
+}
+
+#endif /* ILIST_DEBUG */
diff --git a/src/backend/lib/integerset.c b/src/backend/lib/integerset.c
new file mode 100644
index 0000000..278a91b
--- /dev/null
+++ b/src/backend/lib/integerset.c
@@ -0,0 +1,1045 @@
+/*-------------------------------------------------------------------------
+ *
+ * integerset.c
+ * Data structure to hold a large set of 64-bit integers efficiently
+ *
+ * IntegerSet provides an in-memory data structure to hold a set of
+ * arbitrary 64-bit integers. Internally, the values are stored in a
+ * B-tree, with a special packed representation at the leaf level using
+ * the Simple-8b algorithm, which can pack clusters of nearby values
+ * very tightly.
+ *
+ * Memory consumption depends on the number of values stored, but also
+ * on how far the values are from each other. In the best case, with
+ * long runs of consecutive integers, memory consumption can be as low as
+ * 0.1 bytes per integer. In the worst case, if integers are more than
+ * 2^32 apart, it uses about 8 bytes per integer. In typical use, the
+ * consumption per integer is somewhere between those extremes, depending
+ * on the range of integers stored, and how "clustered" they are.
+ *
+ *
+ * Interface
+ * ---------
+ *
+ * intset_create - Create a new, empty set
+ * intset_add_member - Add an integer to the set
+ * intset_is_member - Test if an integer is in the set
+ * intset_begin_iterate - Begin iterating through all integers in set
+ * intset_iterate_next - Return next set member, if any
+ *
+ * intset_create() creates the set in the current memory context. Subsequent
+ * operations that add to the data structure will continue to allocate from
+ * that same context, even if it's not current anymore.
+ *
+ * Note that there is no function to free an integer set. If you need to do
+ * that, create a dedicated memory context to hold it, and destroy the memory
+ * context instead.
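+ *
+ * As a sketch of a typical calling sequence (intset_ctx stands for a
+ * caller-created memory context; it is not provided by this file):
+ *
+ *     MemoryContext oldcxt = MemoryContextSwitchTo(intset_ctx);
+ *     IntegerSet *intset = intset_create();
+ *     MemoryContextSwitchTo(oldcxt);
+ *
+ *     for (uint64 i = 1; i <= 1000; i++)
+ *         intset_add_member(intset, i * 10);
+ *
+ *     if (intset_is_member(intset, 420))
+ *         ...
+ *
+ *     uint64 val;
+ *     intset_begin_iterate(intset);
+ *     while (intset_iterate_next(intset, &val))
+ *         ...
+ *
+ *     MemoryContextDelete(intset_ctx);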
+ *
+ *
+ * Limitations
+ * -----------
+ *
+ * - Values must be added in order. (Random insertions would require
+ * splitting nodes, which hasn't been implemented.)
+ *
+ * - Values cannot be added while iteration is in progress.
+ *
+ * - No support for removing values.
+ *
+ * None of these limitations are fundamental to the data structure, so they
+ * could be lifted if needed, by writing some new code. But the current
+ * users of this facility don't need them.
+ *
+ *
+ * References
+ * ----------
+ *
+ * Simple-8b encoding is based on:
+ *
+ * Vo Ngoc Anh, Alistair Moffat, Index compression using 64-bit words,
+ * Software - Practice & Experience, v.40 n.2, p.131-147, February 2010
+ * (https://doi.org/10.1002/spe.948)
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/lib/integerset.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "lib/integerset.h"
+#include "port/pg_bitutils.h"
+#include "utils/memutils.h"
+
+
+/*
+ * Maximum number of integers that can be encoded in a single Simple-8b
+ * codeword. (Defined here before anything else, so that we can size arrays
+ * using this.)
+ */
+#define SIMPLE8B_MAX_VALUES_PER_CODEWORD 240
+
+/*
+ * Parameters for shape of the in-memory B-tree.
+ *
+ * These set the size of each internal and leaf node. They don't necessarily
+ * need to be the same, because the tree is just an in-memory structure.
+ * With the default 64, each node is about 1 kb.
+ *
+ * If you change these, you must recalculate MAX_TREE_LEVELS, too!
+ */
+#define MAX_INTERNAL_ITEMS 64
+#define MAX_LEAF_ITEMS 64
+
+/*
+ * Maximum height of the tree.
+ *
+ * MAX_TREE_LEVELS is calculated from the "fan-out" of the B-tree. The
+ * theoretical maximum number of items that we can store in a set is 2^64,
+ * so MAX_TREE_LEVELS should be set so that:
+ *
+ * MAX_LEAF_ITEMS * MAX_INTERNAL_ITEMS ^ (MAX_TREE_LEVELS - 1) >= 2^64.
+ *
+ * In practice, we'll need far fewer levels, because you will run out of
+ * memory long before reaching that number, but let's be conservative.
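+ *
+ * As a quick check with the current settings: 64 * 64^10 = 2^6 * 2^60 = 2^66,
+ * which is >= 2^64, so 11 levels suffice.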
+ */
+#define MAX_TREE_LEVELS 11
+
+/*
+ * Node structures, for the in-memory B-tree.
+ *
+ * An internal node holds a number of downlink pointers to leaf nodes, or
+ * to internal nodes on a lower level. For each downlink, the key value
+ * corresponding to the lower level node is stored in a sorted array. The
+ * stored key values are low keys. In other words, if the downlink has value
+ * X, then all items stored on that child are >= X.
+ *
+ * Each leaf node holds a number of "items", with a varying number of
+ * integers packed into each item. Each item consists of two 64-bit words:
+ * The first word holds the first integer stored in the item, in plain format.
+ * The second word contains between 0 and 240 more integers, packed using
+ * Simple-8b encoding. By storing the first integer in plain, unpacked,
+ * format, we can use binary search to quickly find an item that holds (or
+ * would hold) a particular integer. And by storing the rest in packed form,
+ * we still get pretty good memory density, if there are clusters of integers
+ * with similar values.
+ *
+ * Each leaf node also has a pointer to the next leaf node, so that the leaf
+ * nodes can be easily walked from beginning to end when iterating.
+ */
+typedef struct intset_node intset_node;
+typedef struct intset_leaf_node intset_leaf_node;
+typedef struct intset_internal_node intset_internal_node;
+
+/* Common structure of both leaf and internal nodes. */
+struct intset_node
+{
+ uint16 level; /* tree level of this node */
+ uint16 num_items; /* number of items in this node */
+};
+
+/* Internal node */
+struct intset_internal_node
+{
+ /* common header, must match intset_node */
+ uint16 level; /* >= 1 on internal nodes */
+ uint16 num_items;
+
+ /*
+ * 'values' is an array of key values, and 'downlinks' are pointers to
+ * lower-level nodes, corresponding to the key values.
+ */
+ uint64 values[MAX_INTERNAL_ITEMS];
+ intset_node *downlinks[MAX_INTERNAL_ITEMS];
+};
+
+/* Leaf node */
+typedef struct
+{
+ uint64 first; /* first integer in this item */
+ uint64 codeword; /* simple8b encoded differences from 'first' */
+} leaf_item;
+
+#define MAX_VALUES_PER_LEAF_ITEM (1 + SIMPLE8B_MAX_VALUES_PER_CODEWORD)
+
+struct intset_leaf_node
+{
+ /* common header, must match intset_node */
+ uint16 level; /* 0 on leafs */
+ uint16 num_items;
+
+ intset_leaf_node *next; /* right sibling, if any */
+
+ leaf_item items[MAX_LEAF_ITEMS];
+};
+
+/*
+ * We buffer insertions in a simple array, before packing and inserting them
+ * into the B-tree. MAX_BUFFERED_VALUES sets the size of the buffer. The
+ * encoder assumes that it is large enough that we can always fill a leaf
+ * item with buffered new items. In other words, MAX_BUFFERED_VALUES must be
+ * larger than MAX_VALUES_PER_LEAF_ITEM. For efficiency, make it much larger.
+ */
+#define MAX_BUFFERED_VALUES (MAX_VALUES_PER_LEAF_ITEM * 2)
+
+/*
+ * IntegerSet is the top-level object representing the set.
+ *
+ * The integers are stored in an in-memory B-tree structure, plus an array
+ * for newly-added integers. IntegerSet also tracks information about memory
+ * usage, as well as the current position when iterating the set with
+ * intset_begin_iterate / intset_iterate_next.
+ */
+struct IntegerSet
+{
+ /*
+ * 'context' is the memory context holding this integer set and all its
+ * tree nodes.
+ *
+ * 'mem_used' tracks the amount of memory used. We don't do anything with
+ * it in integerset.c itself, but the callers can ask for it with
+ * intset_memory_usage().
+ */
+ MemoryContext context;
+ uint64 mem_used;
+
+ uint64 num_entries; /* total # of values in the set */
+ uint64 highest_value; /* highest value stored in this set */
+
+ /*
+ * B-tree to hold the packed values.
+ *
+ * 'rightmost_nodes' hold pointers to the rightmost node on each level.
+	 * rightmost_nodes[0] is the rightmost leaf, rightmost_nodes[1] is its
+	 * parent, and so forth, all the way up to the root. These are needed when
+ * adding new values. (Currently, we require that new values are added at
+ * the end.)
+ */
+ int num_levels; /* height of the tree */
+ intset_node *root; /* root node */
+ intset_node *rightmost_nodes[MAX_TREE_LEVELS];
+ intset_leaf_node *leftmost_leaf; /* leftmost leaf node */
+
+ /*
+ * Holding area for new items that haven't been inserted to the tree yet.
+ */
+ uint64 buffered_values[MAX_BUFFERED_VALUES];
+ int num_buffered_values;
+
+ /*
+ * Iterator support.
+ *
+ * 'iter_values' is an array of integers ready to be returned to the
+ * caller; 'iter_num_values' is the length of that array, and
+ * 'iter_valueno' is the next index. 'iter_node' and 'iter_itemno' point
+ * to the leaf node, and item within the leaf node, to get the next batch
+ * of values from.
+ *
+ * Normally, 'iter_values' points to 'iter_values_buf', which holds items
+ * decoded from a leaf item. But after we have scanned the whole B-tree,
+	 * we iterate through the still-buffered values, too, by pointing
+ * iter_values to 'buffered_values'.
+ */
+ bool iter_active; /* is iteration in progress? */
+
+ const uint64 *iter_values;
+ int iter_num_values; /* number of elements in 'iter_values' */
+ int iter_valueno; /* next index into 'iter_values' */
+
+ intset_leaf_node *iter_node; /* current leaf node */
+ int iter_itemno; /* next item in 'iter_node' to decode */
+
+ uint64 iter_values_buf[MAX_VALUES_PER_LEAF_ITEM];
+};
+
+/*
+ * Prototypes for internal functions.
+ */
+static void intset_update_upper(IntegerSet *intset, int level,
+ intset_node *child, uint64 child_key);
+static void intset_flush_buffered_values(IntegerSet *intset);
+
+static int intset_binsrch_uint64(uint64 value, uint64 *arr, int arr_elems,
+ bool nextkey);
+static int intset_binsrch_leaf(uint64 value, leaf_item *arr, int arr_elems,
+ bool nextkey);
+
+static uint64 simple8b_encode(const uint64 *ints, int *num_encoded, uint64 base);
+static int simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base);
+static bool simple8b_contains(uint64 codeword, uint64 key, uint64 base);
+
+
+/*
+ * Create a new, initially empty, integer set.
+ *
+ * The integer set is created in the current memory context.
+ * We will do all subsequent allocations in the same context, too, regardless
+ * of which memory context is current when new integers are added to the set.
+ */
+IntegerSet *
+intset_create(void)
+{
+ IntegerSet *intset;
+
+ intset = (IntegerSet *) palloc(sizeof(IntegerSet));
+ intset->context = CurrentMemoryContext;
+ intset->mem_used = GetMemoryChunkSpace(intset);
+
+ intset->num_entries = 0;
+ intset->highest_value = 0;
+
+ intset->num_levels = 0;
+ intset->root = NULL;
+ memset(intset->rightmost_nodes, 0, sizeof(intset->rightmost_nodes));
+ intset->leftmost_leaf = NULL;
+
+ intset->num_buffered_values = 0;
+
+ intset->iter_active = false;
+ intset->iter_node = NULL;
+ intset->iter_itemno = 0;
+ intset->iter_valueno = 0;
+ intset->iter_num_values = 0;
+ intset->iter_values = NULL;
+
+ return intset;
+}
+
+/*
+ * Allocate a new node.
+ */
+static intset_internal_node *
+intset_new_internal_node(IntegerSet *intset)
+{
+ intset_internal_node *n;
+
+ n = (intset_internal_node *) MemoryContextAlloc(intset->context,
+ sizeof(intset_internal_node));
+ intset->mem_used += GetMemoryChunkSpace(n);
+
+ n->level = 0; /* caller must set */
+ n->num_items = 0;
+
+ return n;
+}
+
+static intset_leaf_node *
+intset_new_leaf_node(IntegerSet *intset)
+{
+ intset_leaf_node *n;
+
+ n = (intset_leaf_node *) MemoryContextAlloc(intset->context,
+ sizeof(intset_leaf_node));
+ intset->mem_used += GetMemoryChunkSpace(n);
+
+ n->level = 0;
+ n->num_items = 0;
+ n->next = NULL;
+
+ return n;
+}
+
+/*
+ * Return the number of entries in the integer set.
+ */
+uint64
+intset_num_entries(IntegerSet *intset)
+{
+ return intset->num_entries;
+}
+
+/*
+ * Return the amount of memory used by the integer set.
+ */
+uint64
+intset_memory_usage(IntegerSet *intset)
+{
+ return intset->mem_used;
+}
+
+/*
+ * Add a value to the set.
+ *
+ * Values must be added in order.
+ */
+void
+intset_add_member(IntegerSet *intset, uint64 x)
+{
+ if (intset->iter_active)
+ elog(ERROR, "cannot add new values to integer set while iteration is in progress");
+
+ if (x <= intset->highest_value && intset->num_entries > 0)
+ elog(ERROR, "cannot add value to integer set out of order");
+
+ if (intset->num_buffered_values >= MAX_BUFFERED_VALUES)
+ {
+ /* Time to flush our buffer */
+ intset_flush_buffered_values(intset);
+ Assert(intset->num_buffered_values < MAX_BUFFERED_VALUES);
+ }
+
+ /* Add it to the buffer of newly-added values */
+ intset->buffered_values[intset->num_buffered_values] = x;
+ intset->num_buffered_values++;
+ intset->num_entries++;
+ intset->highest_value = x;
+}
+
+/*
+ * Take a batch of buffered values, and pack them into the B-tree.
+ */
+static void
+intset_flush_buffered_values(IntegerSet *intset)
+{
+ uint64 *values = intset->buffered_values;
+ uint64 num_values = intset->num_buffered_values;
+ int num_packed = 0;
+ intset_leaf_node *leaf;
+
+ leaf = (intset_leaf_node *) intset->rightmost_nodes[0];
+
+ /*
+ * If the tree is completely empty, create the first leaf page, which is
+ * also the root.
+ */
+ if (leaf == NULL)
+ {
+ /*
+ * This is the very first item in the set.
+ *
+ * Allocate root node. It's also a leaf.
+ */
+ leaf = intset_new_leaf_node(intset);
+
+ intset->root = (intset_node *) leaf;
+ intset->leftmost_leaf = leaf;
+ intset->rightmost_nodes[0] = (intset_node *) leaf;
+ intset->num_levels = 1;
+ }
+
+ /*
+	 * If there are fewer than MAX_VALUES_PER_LEAF_ITEM values in the buffer,
+	 * stop. In most cases, we cannot encode that many values in a single
+	 * leaf item, but this way, the encoder doesn't have to worry about running
+ * out of input.
+ */
+ while (num_values - num_packed >= MAX_VALUES_PER_LEAF_ITEM)
+ {
+ leaf_item item;
+ int num_encoded;
+
+ /*
+ * Construct the next leaf item, packing as many buffered values as
+ * possible.
+ */
+ item.first = values[num_packed];
+ item.codeword = simple8b_encode(&values[num_packed + 1],
+ &num_encoded,
+ item.first);
+
+ /*
+ * Add the item to the node, allocating a new node if the old one is
+ * full.
+ */
+ if (leaf->num_items >= MAX_LEAF_ITEMS)
+ {
+ /* Allocate new leaf and link it to the tree */
+ intset_leaf_node *old_leaf = leaf;
+
+ leaf = intset_new_leaf_node(intset);
+ old_leaf->next = leaf;
+ intset->rightmost_nodes[0] = (intset_node *) leaf;
+ intset_update_upper(intset, 1, (intset_node *) leaf, item.first);
+ }
+ leaf->items[leaf->num_items++] = item;
+
+ num_packed += 1 + num_encoded;
+ }
+
+ /*
+ * Move any remaining buffered values to the beginning of the array.
+ */
+ if (num_packed < intset->num_buffered_values)
+ {
+ memmove(&intset->buffered_values[0],
+ &intset->buffered_values[num_packed],
+ (intset->num_buffered_values - num_packed) * sizeof(uint64));
+ }
+ intset->num_buffered_values -= num_packed;
+}
+
+/*
+ * Insert a downlink into parent node, after creating a new node.
+ *
+ * Recurses if the parent node is full, too.
+ */
+static void
+intset_update_upper(IntegerSet *intset, int level, intset_node *child,
+ uint64 child_key)
+{
+ intset_internal_node *parent;
+
+ Assert(level > 0);
+
+ /*
+ * Create a new root node, if necessary.
+ */
+ if (level >= intset->num_levels)
+ {
+ intset_node *oldroot = intset->root;
+ uint64 downlink_key;
+
+ /* MAX_TREE_LEVELS should be more than enough, this shouldn't happen */
+ if (intset->num_levels == MAX_TREE_LEVELS)
+ elog(ERROR, "could not expand integer set, maximum number of levels reached");
+ intset->num_levels++;
+
+ /*
+ * Get the first value on the old root page, to be used as the
+ * downlink.
+ */
+ if (intset->root->level == 0)
+ downlink_key = ((intset_leaf_node *) oldroot)->items[0].first;
+ else
+ downlink_key = ((intset_internal_node *) oldroot)->values[0];
+
+ parent = intset_new_internal_node(intset);
+ parent->level = level;
+ parent->values[0] = downlink_key;
+ parent->downlinks[0] = oldroot;
+ parent->num_items = 1;
+
+ intset->root = (intset_node *) parent;
+ intset->rightmost_nodes[level] = (intset_node *) parent;
+ }
+
+ /*
+ * Place the downlink on the parent page.
+ */
+ parent = (intset_internal_node *) intset->rightmost_nodes[level];
+
+ if (parent->num_items < MAX_INTERNAL_ITEMS)
+ {
+ parent->values[parent->num_items] = child_key;
+ parent->downlinks[parent->num_items] = child;
+ parent->num_items++;
+ }
+ else
+ {
+ /*
+ * Doesn't fit. Allocate new parent, with the downlink as the first
+ * item on it, and recursively insert the downlink to the new parent
+ * to the grandparent.
+ */
+ parent = intset_new_internal_node(intset);
+ parent->level = level;
+ parent->values[0] = child_key;
+ parent->downlinks[0] = child;
+ parent->num_items = 1;
+
+ intset->rightmost_nodes[level] = (intset_node *) parent;
+
+ intset_update_upper(intset, level + 1, (intset_node *) parent, child_key);
+ }
+}
+
+/*
+ * Does the set contain the given value?
+ */
+bool
+intset_is_member(IntegerSet *intset, uint64 x)
+{
+ intset_node *node;
+ intset_leaf_node *leaf;
+ int level;
+ int itemno;
+ leaf_item *item;
+
+ /*
+ * The value might be in the buffer of newly-added values.
+ */
+ if (intset->num_buffered_values > 0 && x >= intset->buffered_values[0])
+ {
+ int itemno;
+
+ itemno = intset_binsrch_uint64(x,
+ intset->buffered_values,
+ intset->num_buffered_values,
+ false);
+ if (itemno >= intset->num_buffered_values)
+ return false;
+ else
+ return (intset->buffered_values[itemno] == x);
+ }
+
+ /*
+ * Start from the root, and walk down the B-tree to find the right leaf
+ * node.
+ */
+ if (!intset->root)
+ return false;
+ node = intset->root;
+ for (level = intset->num_levels - 1; level > 0; level--)
+ {
+ intset_internal_node *n = (intset_internal_node *) node;
+
+ Assert(node->level == level);
+
+ itemno = intset_binsrch_uint64(x, n->values, n->num_items, true);
+ if (itemno == 0)
+ return false;
+ node = n->downlinks[itemno - 1];
+ }
+ Assert(node->level == 0);
+ leaf = (intset_leaf_node *) node;
+
+ /*
+ * Binary search to find the right item on the leaf page
+ */
+ itemno = intset_binsrch_leaf(x, leaf->items, leaf->num_items, true);
+ if (itemno == 0)
+ return false;
+ item = &leaf->items[itemno - 1];
+
+ /* Is this a match to the first value on the item? */
+ if (item->first == x)
+ return true;
+ Assert(x > item->first);
+
+ /* Is it in the packed codeword? */
+ if (simple8b_contains(item->codeword, x, item->first))
+ return true;
+
+ return false;
+}
+
+/*
+ * Begin in-order scan through all the values.
+ *
+ * While the iteration is in-progress, you cannot add new values to the set.
+ */
+void
+intset_begin_iterate(IntegerSet *intset)
+{
+ /* Note that we allow an iteration to be abandoned midway */
+ intset->iter_active = true;
+ intset->iter_node = intset->leftmost_leaf;
+ intset->iter_itemno = 0;
+ intset->iter_valueno = 0;
+ intset->iter_num_values = 0;
+ intset->iter_values = intset->iter_values_buf;
+}
+
+/*
+ * Returns the next integer, when iterating.
+ *
+ * intset_begin_iterate() must be called first. intset_iterate_next() returns
+ * the next value in the set. Returns true, if there was another value, and
+ * stores the value in *next. Otherwise, returns false.
+ */
+bool
+intset_iterate_next(IntegerSet *intset, uint64 *next)
+{
+ Assert(intset->iter_active);
+ for (;;)
+ {
+ /* Return next iter_values[] entry if any */
+ if (intset->iter_valueno < intset->iter_num_values)
+ {
+ *next = intset->iter_values[intset->iter_valueno++];
+ return true;
+ }
+
+ /* Decode next item in current leaf node, if any */
+ if (intset->iter_node &&
+ intset->iter_itemno < intset->iter_node->num_items)
+ {
+ leaf_item *item;
+ int num_decoded;
+
+ item = &intset->iter_node->items[intset->iter_itemno++];
+
+ intset->iter_values_buf[0] = item->first;
+ num_decoded = simple8b_decode(item->codeword,
+ &intset->iter_values_buf[1],
+ item->first);
+ intset->iter_num_values = num_decoded + 1;
+ intset->iter_valueno = 0;
+ continue;
+ }
+
+ /* No more items on this leaf, step to next node */
+ if (intset->iter_node)
+ {
+ intset->iter_node = intset->iter_node->next;
+ intset->iter_itemno = 0;
+ continue;
+ }
+
+ /*
+ * We have reached the end of the B-tree. But we might still have
+ * some integers in the buffer of newly-added values.
+ */
+ if (intset->iter_values == (const uint64 *) intset->iter_values_buf)
+ {
+ intset->iter_values = intset->buffered_values;
+ intset->iter_num_values = intset->num_buffered_values;
+ intset->iter_valueno = 0;
+ continue;
+ }
+
+ break;
+ }
+
+ /* No more results. */
+ intset->iter_active = false;
+ *next = 0; /* prevent uninitialized-variable warnings */
+ return false;
+}
+
+/*
+ * intset_binsrch_uint64() -- search a sorted array of uint64s
+ *
+ * Returns the first position with key equal to or greater than the given key.
+ * The returned position is the "insert" location for the given key, that
+ * is, the position where the new key should be inserted.
+ *
+ * 'nextkey' affects the behavior on equal keys. If true, and there is an
+ * equal key in the array, this returns the position immediately after the
+ * equal key. If false, this returns the position of the equal key itself.
+ */
+static int
+intset_binsrch_uint64(uint64 item, uint64 *arr, int arr_elems, bool nextkey)
+{
+ int low,
+ high,
+ mid;
+
+ low = 0;
+ high = arr_elems;
+ while (high > low)
+ {
+ mid = low + (high - low) / 2;
+
+ if (nextkey)
+ {
+ if (item >= arr[mid])
+ low = mid + 1;
+ else
+ high = mid;
+ }
+ else
+ {
+ if (item > arr[mid])
+ low = mid + 1;
+ else
+ high = mid;
+ }
+ }
+
+ return low;
+}
+
+/* same, but for an array of leaf items */
+static int
+intset_binsrch_leaf(uint64 item, leaf_item *arr, int arr_elems, bool nextkey)
+{
+ int low,
+ high,
+ mid;
+
+ low = 0;
+ high = arr_elems;
+ while (high > low)
+ {
+ mid = low + (high - low) / 2;
+
+ if (nextkey)
+ {
+ if (item >= arr[mid].first)
+ low = mid + 1;
+ else
+ high = mid;
+ }
+ else
+ {
+ if (item > arr[mid].first)
+ low = mid + 1;
+ else
+ high = mid;
+ }
+ }
+
+ return low;
+}
+
+/*
+ * Simple-8b encoding.
+ *
+ * The simple-8b algorithm packs between 1 and 240 integers into 64-bit words,
+ * called "codewords". The number of integers packed into a single codeword
+ * depends on the integers being packed; small integers are encoded using
+ * fewer bits than large integers. A single codeword can store a single
+ * 60-bit integer, or two 30-bit integers, for example.
+ *
+ * Since we're storing a unique, sorted, set of integers, we actually encode
+ * the *differences* between consecutive integers. That way, clusters of
+ * integers that are close to each other are packed efficiently, regardless
+ * of their absolute values.
+ *
+ * In Simple-8b, each codeword consists of a 4-bit selector, which indicates
+ * how many integers are encoded in the codeword, and the encoded integers are
+ * packed into the remaining 60 bits. The selector allows for 16 different
+ * ways of using the remaining 60 bits, called "modes". The number of integers
+ * packed into a single codeword in each mode is listed in the simple8b_modes
+ * table below. For example, consider the following codeword:
+ *
+ * 20-bit integer 20-bit integer 20-bit integer
+ * 1101 00000000000000010010 01111010000100100000 00000000000000010100
+ * ^
+ * selector
+ *
+ * The selector 1101 is 13 in decimal. From the modes table below, we see
+ * that it means that the codeword encodes three 20-bit integers. In decimal,
+ * those integers are 18, 500000 and 20. Because we encode deltas rather than
+ * absolute values, the actual values that they represent are 18, 500018 and
+ * 500038.
+ *
+ * Modes 0 and 1 are a bit special; they encode a run of 240 or 120 zeroes
+ * (which means 240 or 120 consecutive integers, since we're encoding the
+ * deltas between integers), without using the rest of the codeword bits
+ * for anything.
+ *
+ * Simple-8b cannot encode integers larger than 60 bits. Values larger than
+ * that are always stored in the 'first' field of a leaf item, never in the
+ * packed codeword. If there is a sequence of integers that are more than
+ * 2^60 apart, the codeword will go unused on those items. To represent that,
+ * we use a magic EMPTY_CODEWORD codeword value.
+ */
+static const struct simple8b_mode
+{
+ uint8 bits_per_int;
+ uint8 num_ints;
+} simple8b_modes[17] =
+{
+ {0, 240}, /* mode 0: 240 zeroes */
+ {0, 120}, /* mode 1: 120 zeroes */
+ {1, 60}, /* mode 2: sixty 1-bit integers */
+ {2, 30}, /* mode 3: thirty 2-bit integers */
+ {3, 20}, /* mode 4: twenty 3-bit integers */
+ {4, 15}, /* mode 5: fifteen 4-bit integers */
+ {5, 12}, /* mode 6: twelve 5-bit integers */
+ {6, 10}, /* mode 7: ten 6-bit integers */
+ {7, 8}, /* mode 8: eight 7-bit integers (four bits
+ * are wasted) */
+ {8, 7}, /* mode 9: seven 8-bit integers (four bits
+ * are wasted) */
+ {10, 6}, /* mode 10: six 10-bit integers */
+ {12, 5}, /* mode 11: five 12-bit integers */
+ {15, 4}, /* mode 12: four 15-bit integers */
+ {20, 3}, /* mode 13: three 20-bit integers */
+ {30, 2}, /* mode 14: two 30-bit integers */
+ {60, 1}, /* mode 15: one 60-bit integer */
+
+ {0, 0} /* sentinel value */
+};
+
+/*
+ * EMPTY_CODEWORD is a special value, used to indicate "no values".
+ * It is used if the next value is too large to be encoded with Simple-8b.
+ *
+ * This value looks like a mode-0 codeword, but we can distinguish it
+ * because a regular mode-0 codeword would have zeroes in the unused bits.
+ */
+#define EMPTY_CODEWORD UINT64CONST(0x0FFFFFFFFFFFFFFF)
+
+/*
+ * Encode a number of integers into a Simple-8b codeword.
+ *
+ * (What we actually encode are deltas between successive integers.
+ * "base" is the value before ints[0].)
+ *
+ * The input array must contain at least SIMPLE8B_MAX_VALUES_PER_CODEWORD
+ * elements, ensuring that we can produce a full codeword.
+ *
+ * Returns the encoded codeword, and sets *num_encoded to the number of
+ * input integers that were encoded. That can be zero, if the first delta
+ * is too large to be encoded.
+ */
+static uint64
+simple8b_encode(const uint64 *ints, int *num_encoded, uint64 base)
+{
+ int selector;
+ int nints;
+ int bits;
+ uint64 diff;
+ uint64 last_val;
+ uint64 codeword;
+ int i;
+
+ Assert(ints[0] > base);
+
+ /*
+ * Select the "mode" to use for this codeword.
+ *
+ * In each iteration, check if the next value can be represented in the
+ * current mode we're considering. If it's too large, then step up the
+ * mode to a wider one, and repeat. If it fits, move on to the next
+ * integer. Repeat until the codeword is full, given the current mode.
+ *
+ * Note that we don't have any way to represent unused slots in the
+ * codeword, so we require each codeword to be "full". It is always
+ * possible to produce a full codeword unless the very first delta is too
+ * large to be encoded. For example, if the first delta is small but the
+ * second is too large to be encoded, we'll end up using the last "mode",
+ * which has nints == 1.
+ */
+ selector = 0;
+ nints = simple8b_modes[0].num_ints;
+ bits = simple8b_modes[0].bits_per_int;
+ diff = ints[0] - base - 1;
+ last_val = ints[0];
+ i = 0; /* number of deltas we have accepted */
+ for (;;)
+ {
+ if (diff >= (UINT64CONST(1) << bits))
+ {
+ /* too large, step up to next mode */
+ selector++;
+ nints = simple8b_modes[selector].num_ints;
+ bits = simple8b_modes[selector].bits_per_int;
+ /* we might already have accepted enough deltas for this mode */
+ if (i >= nints)
+ break;
+ }
+ else
+ {
+ /* accept this delta; then done if codeword is full */
+ i++;
+ if (i >= nints)
+ break;
+ /* examine next delta */
+ Assert(ints[i] > last_val);
+ diff = ints[i] - last_val - 1;
+ last_val = ints[i];
+ }
+ }
+
+ if (nints == 0)
+ {
+ /*
+ * The first delta is too large to be encoded with Simple-8b.
+ *
+ * If there is at least one not-too-large integer in the input, we
+ * will encode it using mode 15 (or a more compact mode). Hence, we
+ * can only get here if the *first* delta is >= 2^60.
+ */
+ Assert(i == 0);
+ *num_encoded = 0;
+ return EMPTY_CODEWORD;
+ }
+
+ /*
+ * Encode the integers using the selected mode. Note that we shift them
+ * into the codeword in reverse order, so that they will come out in the
+ * correct order in the decoder.
+ */
+ codeword = 0;
+ if (bits > 0)
+ {
+ for (i = nints - 1; i > 0; i--)
+ {
+ diff = ints[i] - ints[i - 1] - 1;
+ codeword |= diff;
+ codeword <<= bits;
+ }
+ diff = ints[0] - base - 1;
+ codeword |= diff;
+ }
+
+ /* add selector to the codeword, and return */
+ codeword |= (uint64) selector << 60;
+
+ *num_encoded = nints;
+ return codeword;
+}
+
+/*
+ * Decode a codeword into an array of integers.
+ * Returns the number of integers decoded.
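+ *
+ * For example, a codeword with selector 14 carries two 30-bit deltas, d1 in
+ * the low bits and d2 above it; with base B it decodes to the two values
+ * B + 1 + d1 and B + 2 + d1 + d2.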
+ */
+static int
+simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base)
+{
+ int selector = (codeword >> 60);
+ int nints = simple8b_modes[selector].num_ints;
+ int bits = simple8b_modes[selector].bits_per_int;
+ uint64 mask = (UINT64CONST(1) << bits) - 1;
+ uint64 curr_value;
+
+ if (codeword == EMPTY_CODEWORD)
+ return 0;
+
+ curr_value = base;
+ for (int i = 0; i < nints; i++)
+ {
+ uint64 diff = codeword & mask;
+
+ curr_value += 1 + diff;
+ decoded[i] = curr_value;
+ codeword >>= bits;
+ }
+ return nints;
+}
+
+/*
+ * This is very similar to simple8b_decode(), but instead of decoding all
+ * the values to an array, it just checks if the given "key" is part of
+ * the codeword.
+ */
+static bool
+simple8b_contains(uint64 codeword, uint64 key, uint64 base)
+{
+ int selector = (codeword >> 60);
+ int nints = simple8b_modes[selector].num_ints;
+ int bits = simple8b_modes[selector].bits_per_int;
+
+ if (codeword == EMPTY_CODEWORD)
+ return false;
+
+ if (bits == 0)
+ {
+		/*
+		 * Special handling for 0-bit cases (modes 0 and 1): every delta is
+		 * zero, so the codeword represents the consecutive values
+		 * base + 1 .. base + nints, and membership reduces to a range check.
+		 */
+ return (key - base) <= nints;
+ }
+ else
+ {
+ uint64 mask = (UINT64CONST(1) << bits) - 1;
+ uint64 curr_value;
+
+ curr_value = base;
+ for (int i = 0; i < nints; i++)
+ {
+ uint64 diff = codeword & mask;
+
+ curr_value += 1 + diff;
+
+ if (curr_value >= key)
+ {
+ if (curr_value == key)
+ return true;
+ else
+ return false;
+ }
+
+ codeword >>= bits;
+ }
+ }
+ return false;
+}
diff --git a/src/backend/lib/knapsack.c b/src/backend/lib/knapsack.c
new file mode 100644
index 0000000..50c84b4
--- /dev/null
+++ b/src/backend/lib/knapsack.c
@@ -0,0 +1,111 @@
+/*-------------------------------------------------------------------------
+ *
+ * knapsack.c
+ * Knapsack problem solver
+ *
+ * Given input vectors of integral item weights (must be >= 0) and values
+ * (double >= 0), compute the set of items which produces the greatest total
+ * value without exceeding a specified total weight; each item is included at
+ * most once (this is the 0/1 knapsack problem). Weight 0 items will always be
+ * included.
+ *
+ * The performance of this algorithm is pseudo-polynomial, O(nW) where W is the
+ * weight limit. To use with non-integral weights or approximate solutions,
+ * the caller should pre-scale the input weights to a suitable range. This
+ * allows approximate solutions in polynomial time (the general case of the
+ * exact problem is NP-hard).
+ *
+ * Copyright (c) 2017-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/lib/knapsack.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+#include <limits.h>
+
+#include "lib/knapsack.h"
+#include "miscadmin.h"
+#include "nodes/bitmapset.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+
+/*
+ * DiscreteKnapsack
+ *
+ * The item_values input is optional; if omitted, all the items are assumed to
+ * have value 1.
+ *
+ * Returns a Bitmapset of the 0..(n-1) indexes of the items chosen for
+ * inclusion in the solution.
+ *
+ * This uses the usual dynamic-programming algorithm, adapted to reuse the
+ * memory on each pass (by working from larger weights to smaller). At the
+ * start of pass number i, the values[w] array contains the largest value
+ * computed with total weight <= w, using only items with indices < i; and
+ * sets[w] contains the bitmap of items actually used for that value. (The
+ * bitmapsets are all pre-initialized with an unused high bit so that memory
+ * allocation is done only once.)
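+ *
+ * For example, with max_weight = 5, item_weights = {4, 3, 2} and item_values
+ * = {10.0, 7.0, 8.0}, the returned set contains items 1 and 2 (total weight
+ * 5, total value 15.0), which beats taking item 0 alone.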
+ */
+Bitmapset *
+DiscreteKnapsack(int max_weight, int num_items,
+ int *item_weights, double *item_values)
+{
+ MemoryContext local_ctx = AllocSetContextCreate(CurrentMemoryContext,
+ "Knapsack",
+ ALLOCSET_SMALL_SIZES);
+ MemoryContext oldctx = MemoryContextSwitchTo(local_ctx);
+ double *values;
+ Bitmapset **sets;
+ Bitmapset *result;
+ int i,
+ j;
+
+ Assert(max_weight >= 0);
+ Assert(num_items > 0 && item_weights);
+
+ values = palloc((1 + max_weight) * sizeof(double));
+ sets = palloc((1 + max_weight) * sizeof(Bitmapset *));
+
+ for (i = 0; i <= max_weight; ++i)
+ {
+ values[i] = 0;
+ sets[i] = bms_make_singleton(num_items);
+ }
+
+ for (i = 0; i < num_items; ++i)
+ {
+ int iw = item_weights[i];
+ double iv = item_values ? item_values[i] : 1;
+
+ for (j = max_weight; j >= iw; --j)
+ {
+ int ow = j - iw;
+
+ if (values[j] <= values[ow] + iv)
+ {
+ /* copy sets[ow] to sets[j] without realloc */
+ if (j != ow)
+ {
+ sets[j] = bms_del_members(sets[j], sets[j]);
+ sets[j] = bms_add_members(sets[j], sets[ow]);
+ }
+
+ sets[j] = bms_add_member(sets[j], i);
+
+ values[j] = values[ow] + iv;
+ }
+ }
+ }
+
+ MemoryContextSwitchTo(oldctx);
+
+ result = bms_del_member(bms_copy(sets[max_weight]), num_items);
+
+ MemoryContextDelete(local_ctx);
+
+ return result;
+}
diff --git a/src/backend/lib/pairingheap.c b/src/backend/lib/pairingheap.c
new file mode 100644
index 0000000..bed3d2e
--- /dev/null
+++ b/src/backend/lib/pairingheap.c
@@ -0,0 +1,333 @@
+/*-------------------------------------------------------------------------
+ *
+ * pairingheap.c
+ * A Pairing Heap implementation
+ *
+ * A pairing heap is a data structure that's useful for implementing
+ * priority queues. It is simple to implement, and provides amortized O(1)
+ * insert and find-min operations, and amortized O(log n) delete-min.
+ *
+ * The pairing heap was first described in this paper:
+ *
+ * Michael L. Fredman, Robert Sedgewick, Daniel D. Sleator, and Robert E.
+ * Tarjan. 1986.
+ * The pairing heap: a new form of self-adjusting heap.
+ * Algorithmica 1, 1 (January 1986), pages 111-129. DOI: 10.1007/BF01840439
+ *
+ * Portions Copyright (c) 2012-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/lib/pairingheap.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "lib/pairingheap.h"
+
+static pairingheap_node *merge(pairingheap *heap, pairingheap_node *a,
+ pairingheap_node *b);
+static pairingheap_node *merge_children(pairingheap *heap,
+ pairingheap_node *children);
+
+/*
+ * pairingheap_allocate
+ *
+ * Returns a pointer to a newly-allocated heap, with the heap property defined
+ * by the given comparator function, which will be invoked with the additional
+ * argument specified by 'arg'.
+ */
+pairingheap *
+pairingheap_allocate(pairingheap_comparator compare, void *arg)
+{
+ pairingheap *heap;
+
+ heap = (pairingheap *) palloc(sizeof(pairingheap));
+ heap->ph_compare = compare;
+ heap->ph_arg = arg;
+
+ heap->ph_root = NULL;
+
+ return heap;
+}
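+
+/*
+ * A sketch of typical use (MyItem, its ph_node member and the comparator are
+ * caller-defined; pairingheap_container()/pairingheap_const_container() come
+ * from lib/pairingheap.h).  Because the heap keeps the "largest" element
+ * according to the comparator at the top, a min-heap simply inverts the
+ * comparison:
+ *
+ *     typedef struct MyItem
+ *     {
+ *         pairingheap_node ph_node;
+ *         int              key;
+ *     } MyItem;
+ *
+ *     static int
+ *     my_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg)
+ *     {
+ *         const MyItem *ia = pairingheap_const_container(MyItem, ph_node, a);
+ *         const MyItem *ib = pairingheap_const_container(MyItem, ph_node, b);
+ *
+ *         if (ia->key < ib->key)
+ *             return 1;
+ *         if (ia->key > ib->key)
+ *             return -1;
+ *         return 0;
+ *     }
+ *
+ *     pairingheap *heap = pairingheap_allocate(my_cmp, NULL);
+ *     pairingheap_add(heap, &item->ph_node);
+ *     MyItem *smallest = pairingheap_container(MyItem, ph_node,
+ *                                              pairingheap_remove_first(heap));
+ */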
+
+/*
+ * pairingheap_free
+ *
+ * Releases memory used by the given pairingheap.
+ *
+ * Note: The nodes in the heap are not freed!
+ */
+void
+pairingheap_free(pairingheap *heap)
+{
+ pfree(heap);
+}
+
+/*
+ * A helper function to merge two subheaps into one.
+ *
+ * The subheap with smaller value is put as a child of the other one (assuming
+ * a max-heap).
+ *
+ * The next_sibling and prev_or_parent pointers of the input nodes are
+ * ignored. On return, the returned node's next_sibling and prev_or_parent
+ * pointers are garbage.
+ */
+static pairingheap_node *
+merge(pairingheap *heap, pairingheap_node *a, pairingheap_node *b)
+{
+ if (a == NULL)
+ return b;
+ if (b == NULL)
+ return a;
+
+ /* swap 'a' and 'b' so that 'a' is the one with larger value */
+ if (heap->ph_compare(a, b, heap->ph_arg) < 0)
+ {
+ pairingheap_node *tmp;
+
+ tmp = a;
+ a = b;
+ b = tmp;
+ }
+
+ /* and put 'b' as a child of 'a' */
+ if (a->first_child)
+ a->first_child->prev_or_parent = b;
+ b->prev_or_parent = a;
+ b->next_sibling = a->first_child;
+ a->first_child = b;
+
+ return a;
+}
+
+/*
+ * pairingheap_add
+ *
+ * Adds the given node to the heap in O(1) time.
+ */
+void
+pairingheap_add(pairingheap *heap, pairingheap_node *node)
+{
+ node->first_child = NULL;
+
+ /* Link the new node as a new tree */
+ heap->ph_root = merge(heap, heap->ph_root, node);
+ heap->ph_root->prev_or_parent = NULL;
+ heap->ph_root->next_sibling = NULL;
+}
+
+/*
+ * pairingheap_first
+ *
+ * Returns a pointer to the first (root, topmost) node in the heap without
+ * modifying the heap. The caller must ensure that this routine is not used on
+ * an empty heap. Always O(1).
+ */
+pairingheap_node *
+pairingheap_first(pairingheap *heap)
+{
+ Assert(!pairingheap_is_empty(heap));
+
+ return heap->ph_root;
+}
+
+/*
+ * pairingheap_remove_first
+ *
+ * Removes the first (root, topmost) node in the heap and returns a pointer to
+ * it after rebalancing the heap. The caller must ensure that this routine is
+ * not used on an empty heap. O(log n) amortized.
+ */
+pairingheap_node *
+pairingheap_remove_first(pairingheap *heap)
+{
+ pairingheap_node *result;
+ pairingheap_node *children;
+
+ Assert(!pairingheap_is_empty(heap));
+
+ /* Remove the root, and form a new heap of its children. */
+ result = heap->ph_root;
+ children = result->first_child;
+
+ heap->ph_root = merge_children(heap, children);
+ if (heap->ph_root)
+ {
+ heap->ph_root->prev_or_parent = NULL;
+ heap->ph_root->next_sibling = NULL;
+ }
+
+ return result;
+}
+
+/*
+ * Remove 'node' from the heap. O(log n) amortized.
+ */
+void
+pairingheap_remove(pairingheap *heap, pairingheap_node *node)
+{
+ pairingheap_node *children;
+ pairingheap_node *replacement;
+ pairingheap_node *next_sibling;
+ pairingheap_node **prev_ptr;
+
+ /*
+ * If the removed node happens to be the root node, do it with
+ * pairingheap_remove_first().
+ */
+ if (node == heap->ph_root)
+ {
+ (void) pairingheap_remove_first(heap);
+ return;
+ }
+
+ /*
+ * Before we modify anything, remember the removed node's first_child and
+ * next_sibling pointers.
+ */
+ children = node->first_child;
+ next_sibling = node->next_sibling;
+
+ /*
+ * Also find the pointer to the removed node in its previous sibling, or
+ * if this is the first child of its parent, in its parent.
+ */
+ if (node->prev_or_parent->first_child == node)
+ prev_ptr = &node->prev_or_parent->first_child;
+ else
+ prev_ptr = &node->prev_or_parent->next_sibling;
+ Assert(*prev_ptr == node);
+
+ /*
+ * If this node has children, make a new subheap of the children and link
+ * the subheap in place of the removed node. Otherwise just unlink this
+ * node.
+ */
+ if (children)
+ {
+ replacement = merge_children(heap, children);
+
+ replacement->prev_or_parent = node->prev_or_parent;
+ replacement->next_sibling = node->next_sibling;
+ *prev_ptr = replacement;
+ if (next_sibling)
+ next_sibling->prev_or_parent = replacement;
+ }
+ else
+ {
+ *prev_ptr = next_sibling;
+ if (next_sibling)
+ next_sibling->prev_or_parent = node->prev_or_parent;
+ }
+}
+
+/*
+ * Merge a list of subheaps into a single heap.
+ *
+ * This implements the basic two-pass merging strategy, first forming pairs
+ * from left to right, and then merging the pairs.
+ */
+static pairingheap_node *
+merge_children(pairingheap *heap, pairingheap_node *children)
+{
+ pairingheap_node *curr,
+ *next;
+ pairingheap_node *pairs;
+ pairingheap_node *newroot;
+
+ if (children == NULL || children->next_sibling == NULL)
+ return children;
+
+ /* Walk the subheaps from left to right, merging in pairs */
+ next = children;
+ pairs = NULL;
+ for (;;)
+ {
+ curr = next;
+
+ if (curr == NULL)
+ break;
+
+ if (curr->next_sibling == NULL)
+ {
+ /* last odd node at the end of list */
+ curr->next_sibling = pairs;
+ pairs = curr;
+ break;
+ }
+
+ next = curr->next_sibling->next_sibling;
+
+ /* merge this and the next subheap, and add to 'pairs' list. */
+
+ curr = merge(heap, curr, curr->next_sibling);
+ curr->next_sibling = pairs;
+ pairs = curr;
+ }
+
+ /*
+ * Merge all the pairs together to form a single heap.
+ */
+ newroot = pairs;
+ next = pairs->next_sibling;
+ while (next)
+ {
+ curr = next;
+ next = curr->next_sibling;
+
+ newroot = merge(heap, newroot, curr);
+ }
+
+ return newroot;
+}
+
+/*
+ * A debug function to dump the contents of the heap as a string.
+ *
+ * The 'dumpfunc' callback appends a string representation of a single node
+ * to the StringInfo. 'opaque' can be used to pass more information to the
+ * callback.
+ */
+#ifdef PAIRINGHEAP_DEBUG
+static void
+pairingheap_dump_recurse(StringInfo buf,
+ pairingheap_node *node,
+ void (*dumpfunc) (pairingheap_node *node, StringInfo buf, void *opaque),
+ void *opaque,
+ int depth,
+ pairingheap_node *prev_or_parent)
+{
+ while (node)
+ {
+ Assert(node->prev_or_parent == prev_or_parent);
+
+ appendStringInfoSpaces(buf, depth * 4);
+ dumpfunc(node, buf, opaque);
+ appendStringInfoChar(buf, '\n');
+ if (node->first_child)
+ pairingheap_dump_recurse(buf, node->first_child, dumpfunc, opaque, depth + 1, node);
+ prev_or_parent = node;
+ node = node->next_sibling;
+ }
+}
+
+char *
+pairingheap_dump(pairingheap *heap,
+ void (*dumpfunc) (pairingheap_node *node, StringInfo buf, void *opaque),
+ void *opaque)
+{
+ StringInfoData buf;
+
+ if (!heap->ph_root)
+ return pstrdup("(empty)");
+
+ initStringInfo(&buf);
+
+ pairingheap_dump_recurse(&buf, heap->ph_root, dumpfunc, opaque, 0, NULL);
+
+ return buf.data;
+}
+#endif
diff --git a/src/backend/lib/rbtree.c b/src/backend/lib/rbtree.c
new file mode 100644
index 0000000..536df1f
--- /dev/null
+++ b/src/backend/lib/rbtree.c
@@ -0,0 +1,770 @@
+/*-------------------------------------------------------------------------
+ *
+ * rbtree.c
+ * implementation for PostgreSQL generic Red-Black binary tree package
+ *	  Adapted from http://algolist.manual.ru/ds/rbtree.php
+ *
+ * This code comes from Thomas Niemann's "Sorting and Searching Algorithms:
+ * a Cookbook".
+ *
+ * See http://www.cs.auckland.ac.nz/software/AlgAnim/niemann/s_man.htm for
+ * license terms: "Source code, when part of a software project, may be used
+ * freely without reference to the author."
+ *
+ * Red-black trees are a type of balanced binary tree wherein (1) any child of
+ * a red node is always black, and (2) every path from root to leaf traverses
+ * an equal number of black nodes. From these properties, it follows that the
+ * longest path from root to leaf is only about twice as long as the shortest,
+ * so lookups are guaranteed to run in O(lg n) time.
+ *
+ * Copyright (c) 2009-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/lib/rbtree.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "lib/rbtree.h"
+
+
+/*
+ * Colors of nodes (values of RBTNode.color)
+ */
+#define RBTBLACK (0)
+#define RBTRED (1)
+
+/*
+ * RBTree control structure
+ */
+struct RBTree
+{
+ RBTNode *root; /* root node, or RBTNIL if tree is empty */
+
+ /* Remaining fields are constant after rbt_create */
+
+ Size node_size; /* actual size of tree nodes */
+ /* The caller-supplied manipulation functions */
+ rbt_comparator comparator;
+ rbt_combiner combiner;
+ rbt_allocfunc allocfunc;
+ rbt_freefunc freefunc;
+ /* Passthrough arg passed to all manipulation functions */
+ void *arg;
+};
+
+/*
+ * all leafs are sentinels, use customized NIL name to prevent
+ * collision with system-wide constant NIL which is actually NULL
+ */
+#define RBTNIL (&sentinel)
+
+static RBTNode sentinel =
+{
+ RBTBLACK, RBTNIL, RBTNIL, NULL
+};
+
+
+/*
+ * rbt_create: create an empty RBTree
+ *
+ * Arguments are:
+ * node_size: actual size of tree nodes (> sizeof(RBTNode))
+ * The manipulation functions:
+ * comparator: compare two RBTNodes for less/equal/greater
+ * combiner: merge an existing tree entry with a new one
+ * allocfunc: allocate a new RBTNode
+ * freefunc: free an old RBTNode
+ * arg: passthrough pointer that will be passed to the manipulation functions
+ *
+ * Note that the combiner's righthand argument will be a "proposed" tree node,
+ * ie the input to rbt_insert, in which the RBTNode fields themselves aren't
+ * valid. Similarly, either input to the comparator may be a "proposed" node.
+ * This shouldn't matter since the functions aren't supposed to look at the
+ * RBTNode fields, only the extra fields of the struct the RBTNode is embedded
+ * in.
+ *
+ * The freefunc should just be pfree or equivalent; it should NOT attempt
+ * to free any subsidiary data, because the node passed to it may not contain
+ * valid data! freefunc can be NULL if caller doesn't require retail
+ * space reclamation.
+ *
+ * The RBTree node is palloc'd in the caller's memory context. Note that
+ * all contents of the tree are actually allocated by the caller, not here.
+ *
+ * Since tree contents are managed by the caller, there is currently not
+ * an explicit "destroy" operation; typically a tree would be freed by
+ * resetting or deleting the memory context it's stored in. You can pfree
+ * the RBTree node if you feel the urge.
+ */
+RBTree *
+rbt_create(Size node_size,
+ rbt_comparator comparator,
+ rbt_combiner combiner,
+ rbt_allocfunc allocfunc,
+ rbt_freefunc freefunc,
+ void *arg)
+{
+ RBTree *tree = (RBTree *) palloc(sizeof(RBTree));
+
+ Assert(node_size > sizeof(RBTNode));
+
+ tree->root = RBTNIL;
+ tree->node_size = node_size;
+ tree->comparator = comparator;
+ tree->combiner = combiner;
+ tree->allocfunc = allocfunc;
+ tree->freefunc = freefunc;
+
+ tree->arg = arg;
+
+ return tree;
+}
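+
+/*
+ * A sketch of typical use (the MyNode type and the callbacks are
+ * caller-defined; only the embedded RBTNode, which must come first in the
+ * struct, is prescribed by this module):
+ *
+ *     typedef struct MyNode
+ *     {
+ *         RBTNode rbtnode;
+ *         int     key;
+ *         int     count;
+ *     } MyNode;
+ *
+ *     static int
+ *     my_cmp(const RBTNode *a, const RBTNode *b, void *arg)
+ *     {
+ *         return ((const MyNode *) a)->key - ((const MyNode *) b)->key;
+ *     }
+ *
+ *     static void
+ *     my_combine(RBTNode *existing, const RBTNode *newdata, void *arg)
+ *     {
+ *         ((MyNode *) existing)->count += ((const MyNode *) newdata)->count;
+ *     }
+ *
+ *     static RBTNode *
+ *     my_alloc(void *arg)
+ *     {
+ *         return (RBTNode *) palloc(sizeof(MyNode));
+ *     }
+ *
+ *     RBTree *tree = rbt_create(sizeof(MyNode), my_cmp, my_combine,
+ *                               my_alloc, NULL, NULL);
+ */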
+
+/* Copy the additional data fields from one RBTNode to another */
+static inline void
+rbt_copy_data(RBTree *rbt, RBTNode *dest, const RBTNode *src)
+{
+ memcpy(dest + 1, src + 1, rbt->node_size - sizeof(RBTNode));
+}
+
+/**********************************************************************
+ * Search *
+ **********************************************************************/
+
+/*
+ * rbt_find: search for a value in an RBTree
+ *
+ * data represents the value to try to find. Its RBTNode fields need not
+ * be valid, it's the extra data in the larger struct that is of interest.
+ *
+ * Returns the matching tree entry, or NULL if no match is found.
+ */
+RBTNode *
+rbt_find(RBTree *rbt, const RBTNode *data)
+{
+ RBTNode *node = rbt->root;
+
+ while (node != RBTNIL)
+ {
+ int cmp = rbt->comparator(data, node, rbt->arg);
+
+ if (cmp == 0)
+ return node;
+ else if (cmp < 0)
+ node = node->left;
+ else
+ node = node->right;
+ }
+
+ return NULL;
+}
+
+/*
+ * rbt_leftmost: fetch the leftmost (smallest-valued) tree node.
+ * Returns NULL if tree is empty.
+ *
+ * Note: in the original implementation this included an unlink step, but
+ * that's a bit awkward. Just call rbt_delete on the result if that's what
+ * you want.
+ */
+RBTNode *
+rbt_leftmost(RBTree *rbt)
+{
+ RBTNode *node = rbt->root;
+ RBTNode *leftmost = rbt->root;
+
+ while (node != RBTNIL)
+ {
+ leftmost = node;
+ node = node->left;
+ }
+
+ if (leftmost != RBTNIL)
+ return leftmost;
+
+ return NULL;
+}
+
+/**********************************************************************
+ * Insertion *
+ **********************************************************************/
+
+/*
+ * Rotate node x to left.
+ *
+ * x's right child takes its place in the tree, and x becomes the left
+ * child of that node.
+ */
+static void
+rbt_rotate_left(RBTree *rbt, RBTNode *x)
+{
+ RBTNode *y = x->right;
+
+ /* establish x->right link */
+ x->right = y->left;
+ if (y->left != RBTNIL)
+ y->left->parent = x;
+
+ /* establish y->parent link */
+ if (y != RBTNIL)
+ y->parent = x->parent;
+ if (x->parent)
+ {
+ if (x == x->parent->left)
+ x->parent->left = y;
+ else
+ x->parent->right = y;
+ }
+ else
+ {
+ rbt->root = y;
+ }
+
+ /* link x and y */
+ y->left = x;
+ if (x != RBTNIL)
+ x->parent = y;
+}
+
+/*
+ * Rotate node x to right.
+ *
+ * x's left child takes its place in the tree, and x becomes the right
+ * child of that node.
+ */
+static void
+rbt_rotate_right(RBTree *rbt, RBTNode *x)
+{
+ RBTNode *y = x->left;
+
+ /* establish x->left link */
+ x->left = y->right;
+ if (y->right != RBTNIL)
+ y->right->parent = x;
+
+ /* establish y->parent link */
+ if (y != RBTNIL)
+ y->parent = x->parent;
+ if (x->parent)
+ {
+ if (x == x->parent->right)
+ x->parent->right = y;
+ else
+ x->parent->left = y;
+ }
+ else
+ {
+ rbt->root = y;
+ }
+
+ /* link x and y */
+ y->right = x;
+ if (x != RBTNIL)
+ x->parent = y;
+}
+
+/*
+ * Maintain Red-Black tree balance after inserting node x.
+ *
+ * The newly inserted node is always initially marked red. That may lead to
+ * a situation where a red node has a red child, which is prohibited. We can
+ * always fix the problem by a series of color changes and/or "rotations",
+ * which move the problem progressively higher up in the tree. If one of the
+ * two red nodes is the root, we can always fix the problem by changing the
+ * root from red to black.
+ *
+ * (This does not work lower down in the tree because we must also maintain
+ * the invariant that every leaf has equal black-height.)
+ */
+static void
+rbt_insert_fixup(RBTree *rbt, RBTNode *x)
+{
+ /*
+ * x is always a red node. Initially, it is the newly inserted node. Each
+ * iteration of this loop moves it higher up in the tree.
+ */
+ while (x != rbt->root && x->parent->color == RBTRED)
+ {
+ /*
+ * x and x->parent are both red. Fix depends on whether x->parent is
+ * a left or right child. In either case, we define y to be the
+ * "uncle" of x, that is, the other child of x's grandparent.
+ *
+ * If the uncle is red, we flip the grandparent to red and its two
+ * children to black. Then we loop around again to check whether the
+ * grandparent still has a problem.
+ *
+ * If the uncle is black, we will perform one or two "rotations" to
+ * balance the tree. Either x or x->parent will take the
+		 * grandparent's position in the tree and be recolored black, and the
+ * original grandparent will be recolored red and become a child of
+ * that node. This always leaves us with a valid red-black tree, so
+ * the loop will terminate.
+ */
+ if (x->parent == x->parent->parent->left)
+ {
+ RBTNode *y = x->parent->parent->right;
+
+ if (y->color == RBTRED)
+ {
+ /* uncle is RBTRED */
+ x->parent->color = RBTBLACK;
+ y->color = RBTBLACK;
+ x->parent->parent->color = RBTRED;
+
+ x = x->parent->parent;
+ }
+ else
+ {
+ /* uncle is RBTBLACK */
+ if (x == x->parent->right)
+ {
+ /* make x a left child */
+ x = x->parent;
+ rbt_rotate_left(rbt, x);
+ }
+
+ /* recolor and rotate */
+ x->parent->color = RBTBLACK;
+ x->parent->parent->color = RBTRED;
+
+ rbt_rotate_right(rbt, x->parent->parent);
+ }
+ }
+ else
+ {
+ /* mirror image of above code */
+ RBTNode *y = x->parent->parent->left;
+
+ if (y->color == RBTRED)
+ {
+ /* uncle is RBTRED */
+ x->parent->color = RBTBLACK;
+ y->color = RBTBLACK;
+ x->parent->parent->color = RBTRED;
+
+ x = x->parent->parent;
+ }
+ else
+ {
+ /* uncle is RBTBLACK */
+ if (x == x->parent->left)
+ {
+ x = x->parent;
+ rbt_rotate_right(rbt, x);
+ }
+ x->parent->color = RBTBLACK;
+ x->parent->parent->color = RBTRED;
+
+ rbt_rotate_left(rbt, x->parent->parent);
+ }
+ }
+ }
+
+ /*
+ * The root may already have been black; if not, the black-height of every
+ * node in the tree increases by one.
+ */
+ rbt->root->color = RBTBLACK;
+}
+
+/*
+ * rbt_insert: insert a new value into the tree.
+ *
+ * data represents the value to insert. Its RBTNode fields need not
+ * be valid, it's the extra data in the larger struct that is of interest.
+ *
+ * If the value represented by "data" is not present in the tree, then
+ * we copy "data" into a new tree entry and return that node, setting *isNew
+ * to true.
+ *
+ * If the value represented by "data" is already present, then we call the
+ * combiner function to merge data into the existing node, and return the
+ * existing node, setting *isNew to false.
+ *
+ * "data" is unmodified in either case; it's typically just a local
+ * variable in the caller.
+ */
+RBTNode *
+rbt_insert(RBTree *rbt, const RBTNode *data, bool *isNew)
+{
+ RBTNode *current,
+ *parent,
+ *x;
+ int cmp;
+
+ /* find where node belongs */
+ current = rbt->root;
+ parent = NULL;
+ cmp = 0; /* just to prevent compiler warning */
+
+ while (current != RBTNIL)
+ {
+ cmp = rbt->comparator(data, current, rbt->arg);
+ if (cmp == 0)
+ {
+ /*
+ * Found node with given key. Apply combiner.
+ */
+ rbt->combiner(current, data, rbt->arg);
+ *isNew = false;
+ return current;
+ }
+ parent = current;
+ current = (cmp < 0) ? current->left : current->right;
+ }
+
+ /*
+ * Value is not present, so create a new node containing data.
+ */
+ *isNew = true;
+
+ x = rbt->allocfunc(rbt->arg);
+
+ x->color = RBTRED;
+
+ x->left = RBTNIL;
+ x->right = RBTNIL;
+ x->parent = parent;
+ rbt_copy_data(rbt, x, data);
+
+ /* insert node in tree */
+ if (parent)
+ {
+ if (cmp < 0)
+ parent->left = x;
+ else
+ parent->right = x;
+ }
+ else
+ {
+ rbt->root = x;
+ }
+
+ rbt_insert_fixup(rbt, x);
+
+ return x;
+}
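+
+/*
+ * Illustrative usage sketch (not compiled here): one way a caller could
+ * define a node type and the callbacks given to rbt_create(), and then
+ * insert values.  The names IntRBTNode, irbt_cmp, irbt_combine,
+ * irbt_alloc, irbt_free and irbt_demo are hypothetical; the rbt_* calls
+ * and callback signatures are assumed to match the declarations in
+ * lib/rbtree.h.  The embedded RBTNode must be the first field, because
+ * rbt_copy_data() copies only the bytes that follow it.
+ *
+ *    typedef struct IntRBTNode
+ *    {
+ *        RBTNode    rbtnode;
+ *        int        key;
+ *        int        count;
+ *    } IntRBTNode;
+ *
+ *    static int
+ *    irbt_cmp(const RBTNode *a, const RBTNode *b, void *arg)
+ *    {
+ *        const IntRBTNode *ea = (const IntRBTNode *) a;
+ *        const IntRBTNode *eb = (const IntRBTNode *) b;
+ *
+ *        return (ea->key > eb->key) - (ea->key < eb->key);
+ *    }
+ *
+ *    static void
+ *    irbt_combine(RBTNode *existing, const RBTNode *newdata, void *arg)
+ *    {
+ *        ((IntRBTNode *) existing)->count +=
+ *            ((const IntRBTNode *) newdata)->count;
+ *    }
+ *
+ *    static RBTNode *
+ *    irbt_alloc(void *arg)
+ *    {
+ *        return (RBTNode *) palloc(sizeof(IntRBTNode));
+ *    }
+ *
+ *    static void
+ *    irbt_free(RBTNode *node, void *arg)
+ *    {
+ *        pfree(node);
+ *    }
+ *
+ *    static void
+ *    irbt_demo(void)
+ *    {
+ *        RBTree     *tree;
+ *        IntRBTNode  probe;
+ *        bool        isNew;
+ *
+ *        tree = rbt_create(sizeof(IntRBTNode),
+ *                          irbt_cmp, irbt_combine,
+ *                          irbt_alloc, irbt_free,
+ *                          NULL);
+ *
+ *        probe.key = 42;
+ *        probe.count = 1;
+ *        (void) rbt_insert(tree, &probe.rbtnode, &isNew);
+ *        (void) rbt_insert(tree, &probe.rbtnode, &isNew);
+ *    }
+ *
+ * The first rbt_insert() sets isNew to true and copies "probe" into a
+ * freshly allocated node; the second finds the existing key 42, calls
+ * irbt_combine (bumping count to 2), and sets isNew to false.
+ */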
+
+/**********************************************************************
+ * Deletion *
+ **********************************************************************/
+
+/*
+ * Maintain Red-Black tree balance after deleting a black node.
+ */
+static void
+rbt_delete_fixup(RBTree *rbt, RBTNode *x)
+{
+ /*
+ * x is always a black node. Initially, it is the former child of the
+ * deleted node. Each iteration of this loop moves it higher up in the
+ * tree.
+ */
+ while (x != rbt->root && x->color == RBTBLACK)
+ {
+ /*
+ * Left and right cases are symmetric. Any nodes that are children of
+ * x have a black-height one less than the remainder of the nodes in
+ * the tree. We rotate and recolor nodes to move the problem up the
+ * tree: at some stage we'll either fix the problem, or reach the root
+ * (where the black-height is allowed to decrease).
+ */
+ if (x == x->parent->left)
+ {
+ RBTNode *w = x->parent->right;
+
+ if (w->color == RBTRED)
+ {
+ w->color = RBTBLACK;
+ x->parent->color = RBTRED;
+
+ rbt_rotate_left(rbt, x->parent);
+ w = x->parent->right;
+ }
+
+ if (w->left->color == RBTBLACK && w->right->color == RBTBLACK)
+ {
+ w->color = RBTRED;
+
+ x = x->parent;
+ }
+ else
+ {
+ if (w->right->color == RBTBLACK)
+ {
+ w->left->color = RBTBLACK;
+ w->color = RBTRED;
+
+ rbt_rotate_right(rbt, w);
+ w = x->parent->right;
+ }
+ w->color = x->parent->color;
+ x->parent->color = RBTBLACK;
+ w->right->color = RBTBLACK;
+
+ rbt_rotate_left(rbt, x->parent);
+ x = rbt->root; /* Arrange for loop to terminate. */
+ }
+ }
+ else
+ {
+ RBTNode *w = x->parent->left;
+
+ if (w->color == RBTRED)
+ {
+ w->color = RBTBLACK;
+ x->parent->color = RBTRED;
+
+ rbt_rotate_right(rbt, x->parent);
+ w = x->parent->left;
+ }
+
+ if (w->right->color == RBTBLACK && w->left->color == RBTBLACK)
+ {
+ w->color = RBTRED;
+
+ x = x->parent;
+ }
+ else
+ {
+ if (w->left->color == RBTBLACK)
+ {
+ w->right->color = RBTBLACK;
+ w->color = RBTRED;
+
+ rbt_rotate_left(rbt, w);
+ w = x->parent->left;
+ }
+ w->color = x->parent->color;
+ x->parent->color = RBTBLACK;
+ w->left->color = RBTBLACK;
+
+ rbt_rotate_right(rbt, x->parent);
+ x = rbt->root; /* Arrange for loop to terminate. */
+ }
+ }
+ }
+ x->color = RBTBLACK;
+}
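+
+/*
+ * Worked example (illustrative only): take the tree produced by inserting
+ * 1, 2, 3, 4, i.e. a black root 2 with black children 1 and 3, where 3
+ * has a red right child 4.  Deleting 1 removes a black node whose only
+ * "child" is RBTNIL, so this function is entered with x = RBTNIL and
+ * x->parent = 2.  The sibling w = 3 is black and its right child 4 is
+ * red, so the last else-branch applies: 3 takes 2's color (black), 4 is
+ * recolored black, and we rotate left about 2.  The result is a black
+ * root 3 with black children 2 and 4, and every root-to-leaf path again
+ * has the same black-height.
+ */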
+
+/*
+ * Delete node z from tree.
+ */
+static void
+rbt_delete_node(RBTree *rbt, RBTNode *z)
+{
+ RBTNode *x,
+ *y;
+
+ /* This is just paranoia: we should only get called on a valid node */
+ if (!z || z == RBTNIL)
+ return;
+
+ /*
+ * y is the node that will actually be removed from the tree. This will
+ * be z if z has fewer than two children, or the tree successor of z
+ * otherwise.
+ */
+ if (z->left == RBTNIL || z->right == RBTNIL)
+ {
+ /* y has a RBTNIL node as a child */
+ y = z;
+ }
+ else
+ {
+ /* find tree successor */
+ y = z->right;
+ while (y->left != RBTNIL)
+ y = y->left;
+ }
+
+ /* x is y's only child */
+ if (y->left != RBTNIL)
+ x = y->left;
+ else
+ x = y->right;
+
+ /* Remove y from the tree. */
+ x->parent = y->parent;
+ if (y->parent)
+ {
+ if (y == y->parent->left)
+ y->parent->left = x;
+ else
+ y->parent->right = x;
+ }
+ else
+ {
+ rbt->root = x;
+ }
+
+ /*
+ * If we removed the tree successor of z rather than z itself, then move
+ * the data for the removed node to the one we were supposed to remove.
+ */
+ if (y != z)
+ rbt_copy_data(rbt, z, y);
+
+ /*
+ * Removing a black node might make some paths from root to leaf contain
+ * fewer black nodes than others, or it might make two red nodes adjacent.
+ */
+ if (y->color == RBTBLACK)
+ rbt_delete_fixup(rbt, x);
+
+ /* Now we can recycle the y node */
+ if (rbt->freefunc)
+ rbt->freefunc(y, rbt->arg);
+}
+
+/*
+ * rbt_delete: remove the given tree entry
+ *
+ * "node" must have previously been found via rbt_find or rbt_leftmost.
+ * It is caller's responsibility to free any subsidiary data attached
+ * to the node before calling rbt_delete. (Do *not* try to push that
+ * responsibility off to the freefunc, as some other physical node
+ * may be the one actually freed!)
+ */
+void
+rbt_delete(RBTree *rbt, RBTNode *node)
+{
+ rbt_delete_node(rbt, node);
+}
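+
+/*
+ * Illustrative deletion sketch (hypothetical names, continuing the
+ * IntRBTNode example above; rbt_find() is assumed to be declared in
+ * lib/rbtree.h).  Any subsidiary data hanging off the node must be freed
+ * by the caller before rbt_delete(), precisely because the node handed
+ * to freefunc may be a different physical node whose payload has been
+ * overwritten.
+ *
+ *    IntRBTNode  probe;
+ *    IntRBTNode *found;
+ *
+ *    probe.key = 42;
+ *    found = (IntRBTNode *) rbt_find(tree, &probe.rbtnode);
+ *    if (found != NULL)
+ *    {
+ *        ... free any data pointed to from "found" here ...
+ *        rbt_delete(tree, &found->rbtnode);
+ *    }
+ */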
+
+/**********************************************************************
+ * Traverse *
+ **********************************************************************/
+
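+/*
+ * Return the next node of an in-order (left, self, right) traversal.
+ *
+ * On the first call, descend to the leftmost node of the tree.  On later
+ * calls, find the in-order successor of last_visited: the leftmost node
+ * of its right subtree if there is one, otherwise the nearest ancestor
+ * reached from its left child.  Because nodes carry parent links, no
+ * per-iterator stack is needed.
+ */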
+static RBTNode *
+rbt_left_right_iterator(RBTreeIterator *iter)
+{
+ if (iter->last_visited == NULL)
+ {
+ iter->last_visited = iter->rbt->root;
+ while (iter->last_visited->left != RBTNIL)
+ iter->last_visited = iter->last_visited->left;
+
+ return iter->last_visited;
+ }
+
+ if (iter->last_visited->right != RBTNIL)
+ {
+ iter->last_visited = iter->last_visited->right;
+ while (iter->last_visited->left != RBTNIL)
+ iter->last_visited = iter->last_visited->left;
+
+ return iter->last_visited;
+ }
+
+ for (;;)
+ {
+ RBTNode *came_from = iter->last_visited;
+
+ iter->last_visited = iter->last_visited->parent;
+ if (iter->last_visited == NULL)
+ {
+ iter->is_over = true;
+ break;
+ }
+
+ if (iter->last_visited->left == came_from)
+ break; /* came from left sub-tree, return current
+ * node */
+
+ /* else - came from right sub-tree, continue to move up */
+ }
+
+ return iter->last_visited;
+}
+
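+/*
+ * Same as rbt_left_right_iterator, but visiting nodes in the reverse
+ * (right, self, left) order.
+ */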
+static RBTNode *
+rbt_right_left_iterator(RBTreeIterator *iter)
+{
+ if (iter->last_visited == NULL)
+ {
+ iter->last_visited = iter->rbt->root;
+ while (iter->last_visited->right != RBTNIL)
+ iter->last_visited = iter->last_visited->right;
+
+ return iter->last_visited;
+ }
+
+ if (iter->last_visited->left != RBTNIL)
+ {
+ iter->last_visited = iter->last_visited->left;
+ while (iter->last_visited->right != RBTNIL)
+ iter->last_visited = iter->last_visited->right;
+
+ return iter->last_visited;
+ }
+
+ for (;;)
+ {
+ RBTNode *came_from = iter->last_visited;
+
+ iter->last_visited = iter->last_visited->parent;
+ if (iter->last_visited == NULL)
+ {
+ iter->is_over = true;
+ break;
+ }
+
+ if (iter->last_visited->right == came_from)
+ break; /* came from right sub-tree, return current
+ * node */
+
+ /* else - came from left sub-tree, continue to move up */
+ }
+
+ return iter->last_visited;
+}
+
+/*
+ * rbt_begin_iterate: prepare to traverse the tree in any of several orders
+ *
+ * After calling rbt_begin_iterate, call rbt_iterate repeatedly until it
+ * returns NULL or the traversal stops being of interest.
+ *
+ * If the tree is changed during traversal, results of further calls to
+ * rbt_iterate are unspecified. Multiple concurrent iterators on the same
+ * tree are allowed.
+ *
+ * The iterator state is stored in the 'iter' struct. The caller should
+ * treat it as an opaque struct.
+ */
+void
+rbt_begin_iterate(RBTree *rbt, RBTOrderControl ctrl, RBTreeIterator *iter)
+{
+ /* Common initialization for all traversal orders */
+ iter->rbt = rbt;
+ iter->last_visited = NULL;
+ iter->is_over = (rbt->root == RBTNIL);
+
+ switch (ctrl)
+ {
+ case LeftRightWalk: /* visit left, then self, then right */
+ iter->iterate = rbt_left_right_iterator;
+ break;
+ case RightLeftWalk: /* visit right, then self, then left */
+ iter->iterate = rbt_right_left_iterator;
+ break;
+ default:
+ elog(ERROR, "unrecognized rbtree iteration order: %d", ctrl);
+ }
+}
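+
+/*
+ * Illustrative iteration sketch (hypothetical names, continuing the
+ * IntRBTNode example above).  The iterator lives on the caller's stack;
+ * rbt_iterate() returns NULL once every node has been visited.
+ *
+ *    RBTreeIterator iter;
+ *    IntRBTNode *node;
+ *
+ *    rbt_begin_iterate(tree, LeftRightWalk, &iter);
+ *    while ((node = (IntRBTNode *) rbt_iterate(&iter)) != NULL)
+ *        elog(DEBUG1, "key %d seen %d times", node->key, node->count);
+ */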
+
+/*
+ * rbt_iterate: return the next node in traversal order, or NULL if no more
+ */
+RBTNode *
+rbt_iterate(RBTreeIterator *iter)
+{
+ if (iter->is_over)
+ return NULL;
+
+ return iter->iterate(iter);
+}