author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:17:33 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:17:33 +0000
commit     5e45211a64149b3c659b90ff2de6fa982a5a93ed (patch)
tree       739caf8c461053357daa9f162bef34516c7bf452 /src/backend/lib
parent     Initial commit. (diff)
Adding upstream version 15.5. (upstream/15.5)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/lib')
-rw-r--r--  src/backend/lib/Makefile              27
-rw-r--r--  src/backend/lib/README                35
-rw-r--r--  src/backend/lib/binaryheap.c         317
-rw-r--r--  src/backend/lib/bipartite_match.c    180
-rw-r--r--  src/backend/lib/bloomfilter.c        294
-rw-r--r--  src/backend/lib/dshash.c            1034
-rw-r--r--  src/backend/lib/hyperloglog.c        255
-rw-r--r--  src/backend/lib/ilist.c              111
-rw-r--r--  src/backend/lib/integerset.c        1045
-rw-r--r--  src/backend/lib/knapsack.c           111
-rw-r--r--  src/backend/lib/pairingheap.c        333
-rw-r--r--  src/backend/lib/rbtree.c             770
12 files changed, 4512 insertions, 0 deletions
diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile
new file mode 100644
index 0000000..9dad313
--- /dev/null
+++ b/src/backend/lib/Makefile
@@ -0,0 +1,27 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for lib (miscellaneous stuff)
+#
+# IDENTIFICATION
+# src/backend/lib/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/lib
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ binaryheap.o \
+ bipartite_match.o \
+ bloomfilter.o \
+ dshash.o \
+ hyperloglog.o \
+ ilist.o \
+ integerset.o \
+ knapsack.o \
+ pairingheap.o \
+ rbtree.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/lib/README b/src/backend/lib/README
new file mode 100644
index 0000000..f2fb591
--- /dev/null
+++ b/src/backend/lib/README
@@ -0,0 +1,35 @@
+This directory contains general-purpose data structures, for use anywhere
+in the backend:
+
+binaryheap.c - a binary heap
+
+bipartite_match.c - Hopcroft-Karp maximum cardinality algorithm for bipartite graphs
+
+bloomfilter.c - probabilistic, space-efficient set membership testing
+
+dshash.c - concurrent hash tables backed by dynamic shared memory areas
+
+hyperloglog.c - a streaming cardinality estimator
+
+ilist.c - singly- and doubly-linked lists
+
+integerset.c - a data structure for holding a large set of integers
+
+knapsack.c - knapsack problem solver
+
+pairingheap.c - a pairing heap
+
+rbtree.c - a red-black tree
+
+stringinfo.c - an extensible string type
+
+
+Aside from the inherent characteristics of the data structures, there are a
+few practical differences between the binary heap and the pairing heap. The
+binary heap is fully allocated at creation, and cannot be expanded beyond the
+allocated size. The pairing heap on the other hand has no inherent maximum
+size, but the caller needs to allocate each element being stored in the heap,
+while the binary heap works with plain Datums or pointers.
+
+The linked-lists in ilist.c can be embedded directly into other structs, as
+opposed to the List interface in nodes/pg_list.h.
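
As a rough, hedged illustration (not part of this commit), the sketch below shows how a caller might embed a dlist_node from lib/ilist.h directly in its own struct, avoiding the separate cell allocations that nodes/pg_list.h requires. The struct and function names (MyItem, ilist_example) are hypothetical; the dlist_* calls follow the API declared in lib/ilist.h.

#include "postgres.h"
#include "lib/ilist.h"

typedef struct MyItem
{
    int         value;
    dlist_node  node;       /* embedded link: no separate list cell needed */
} MyItem;

static void
ilist_example(void)
{
    dlist_head  head;
    dlist_iter  iter;
    MyItem     *item;

    dlist_init(&head);

    item = palloc(sizeof(MyItem));
    item->value = 42;
    dlist_push_tail(&head, &item->node);

    dlist_foreach(iter, &head)
    {
        MyItem     *cur = dlist_container(MyItem, node, iter.cur);

        elog(DEBUG1, "value = %d", cur->value);
    }
}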
diff --git a/src/backend/lib/binaryheap.c b/src/backend/lib/binaryheap.c
new file mode 100644
index 0000000..999c23e
--- /dev/null
+++ b/src/backend/lib/binaryheap.c
@@ -0,0 +1,317 @@
+/*-------------------------------------------------------------------------
+ *
+ * binaryheap.c
+ * A simple binary heap implementation
+ *
+ * Portions Copyright (c) 2012-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/lib/binaryheap.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+
+#include "lib/binaryheap.h"
+
+static void sift_down(binaryheap *heap, int node_off);
+static void sift_up(binaryheap *heap, int node_off);
+
+/*
+ * binaryheap_allocate
+ *
+ * Returns a pointer to a newly-allocated heap that has the capacity to
+ * store the given number of nodes, with the heap property defined by
+ * the given comparator function, which will be invoked with the additional
+ * argument specified by 'arg'.
+ */
+binaryheap *
+binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
+{
+ int sz;
+ binaryheap *heap;
+
+ sz = offsetof(binaryheap, bh_nodes) + sizeof(Datum) * capacity;
+ heap = (binaryheap *) palloc(sz);
+ heap->bh_space = capacity;
+ heap->bh_compare = compare;
+ heap->bh_arg = arg;
+
+ heap->bh_size = 0;
+ heap->bh_has_heap_property = true;
+
+ return heap;
+}
+
+/*
+ * binaryheap_reset
+ *
+ * Resets the heap to an empty state, losing its data content but not the
+ * parameters passed at allocation.
+ */
+void
+binaryheap_reset(binaryheap *heap)
+{
+ heap->bh_size = 0;
+ heap->bh_has_heap_property = true;
+}
+
+/*
+ * binaryheap_free
+ *
+ * Releases memory used by the given binaryheap.
+ */
+void
+binaryheap_free(binaryheap *heap)
+{
+ pfree(heap);
+}
+
+/*
+ * These utility functions return the offset of the left child, right
+ * child, and parent of the node at the given index, respectively.
+ *
+ * The heap is represented as an array of nodes, with the root node
+ * stored at index 0. The left child of node i is at index 2*i+1, and
+ * the right child at 2*i+2. The parent of node i is at index (i-1)/2.
+ */
+
+static inline int
+left_offset(int i)
+{
+ return 2 * i + 1;
+}
+
+static inline int
+right_offset(int i)
+{
+ return 2 * i + 2;
+}
+
+static inline int
+parent_offset(int i)
+{
+ return (i - 1) / 2;
+}
+
+/*
+ * binaryheap_add_unordered
+ *
+ * Adds the given datum to the end of the heap's list of nodes in O(1) without
+ * preserving the heap property. This is a convenience to add elements quickly
+ * to a new heap. To obtain a valid heap, one must call binaryheap_build()
+ * afterwards.
+ */
+void
+binaryheap_add_unordered(binaryheap *heap, Datum d)
+{
+ if (heap->bh_size >= heap->bh_space)
+ elog(ERROR, "out of binary heap slots");
+ heap->bh_has_heap_property = false;
+ heap->bh_nodes[heap->bh_size] = d;
+ heap->bh_size++;
+}
+
+/*
+ * binaryheap_build
+ *
+ * Assembles a valid heap in O(n) from the nodes added by
+ * binaryheap_add_unordered(). Not needed otherwise.
+ */
+void
+binaryheap_build(binaryheap *heap)
+{
+ int i;
+
+ for (i = parent_offset(heap->bh_size - 1); i >= 0; i--)
+ sift_down(heap, i);
+ heap->bh_has_heap_property = true;
+}
+
+/*
+ * binaryheap_add
+ *
+ * Adds the given datum to the heap in O(log n) time, while preserving
+ * the heap property.
+ */
+void
+binaryheap_add(binaryheap *heap, Datum d)
+{
+ if (heap->bh_size >= heap->bh_space)
+ elog(ERROR, "out of binary heap slots");
+ heap->bh_nodes[heap->bh_size] = d;
+ heap->bh_size++;
+ sift_up(heap, heap->bh_size - 1);
+}
+
+/*
+ * binaryheap_first
+ *
+ * Returns the first (root, topmost) node in the heap without modifying
+ * the heap. The caller must ensure that this routine is not used on an
+ * empty heap. Always O(1).
+ */
+Datum
+binaryheap_first(binaryheap *heap)
+{
+ Assert(!binaryheap_empty(heap) && heap->bh_has_heap_property);
+ return heap->bh_nodes[0];
+}
+
+/*
+ * binaryheap_remove_first
+ *
+ * Removes the first (root, topmost) node in the heap and returns it
+ * after rebalancing the heap. The caller must ensure that this routine
+ * is not used on an empty heap. O(log n) worst case.
+ */
+Datum
+binaryheap_remove_first(binaryheap *heap)
+{
+ Datum result;
+
+ Assert(!binaryheap_empty(heap) && heap->bh_has_heap_property);
+
+ /* extract the root node, which will be the result */
+ result = heap->bh_nodes[0];
+
+ /* easy if heap contains one element */
+ if (heap->bh_size == 1)
+ {
+ heap->bh_size--;
+ return result;
+ }
+
+ /*
+ * Remove the last node, placing it in the vacated root entry, and sift
+ * the new root node down to its correct position.
+ */
+ heap->bh_nodes[0] = heap->bh_nodes[--heap->bh_size];
+ sift_down(heap, 0);
+
+ return result;
+}
+
+/*
+ * binaryheap_replace_first
+ *
+ * Replace the topmost element of a non-empty heap, preserving the heap
+ * property. O(1) in the best case, or O(log n) if it must fall back to
+ * sifting the new node down.
+ */
+void
+binaryheap_replace_first(binaryheap *heap, Datum d)
+{
+ Assert(!binaryheap_empty(heap) && heap->bh_has_heap_property);
+
+ heap->bh_nodes[0] = d;
+
+ if (heap->bh_size > 1)
+ sift_down(heap, 0);
+}
+
+/*
+ * Sift a node up to the highest position it can hold according to the
+ * comparator.
+ */
+static void
+sift_up(binaryheap *heap, int node_off)
+{
+ Datum node_val = heap->bh_nodes[node_off];
+
+ /*
+ * Within the loop, the node_off'th array entry is a "hole" that
+ * notionally holds node_val, but we don't actually store node_val there
+ * till the end, saving some unnecessary data copying steps.
+ */
+ while (node_off != 0)
+ {
+ int cmp;
+ int parent_off;
+ Datum parent_val;
+
+ /*
+ * If this node is smaller than its parent, the heap condition is
+ * satisfied, and we're done.
+ */
+ parent_off = parent_offset(node_off);
+ parent_val = heap->bh_nodes[parent_off];
+ cmp = heap->bh_compare(node_val,
+ parent_val,
+ heap->bh_arg);
+ if (cmp <= 0)
+ break;
+
+ /*
+ * Otherwise, swap the parent value with the hole, and go on to check
+ * the node's new parent.
+ */
+ heap->bh_nodes[node_off] = parent_val;
+ node_off = parent_off;
+ }
+ /* Re-fill the hole */
+ heap->bh_nodes[node_off] = node_val;
+}
+
+/*
+ * Sift a node down from its current position to satisfy the heap
+ * property.
+ */
+static void
+sift_down(binaryheap *heap, int node_off)
+{
+ Datum node_val = heap->bh_nodes[node_off];
+
+ /*
+ * Within the loop, the node_off'th array entry is a "hole" that
+ * notionally holds node_val, but we don't actually store node_val there
+ * till the end, saving some unnecessary data copying steps.
+ */
+ while (true)
+ {
+ int left_off = left_offset(node_off);
+ int right_off = right_offset(node_off);
+ int swap_off = 0;
+
+ /* Is the left child larger than the parent? */
+ if (left_off < heap->bh_size &&
+ heap->bh_compare(node_val,
+ heap->bh_nodes[left_off],
+ heap->bh_arg) < 0)
+ swap_off = left_off;
+
+ /* Is the right child larger than the parent? */
+ if (right_off < heap->bh_size &&
+ heap->bh_compare(node_val,
+ heap->bh_nodes[right_off],
+ heap->bh_arg) < 0)
+ {
+ /* swap with the larger child */
+ if (!swap_off ||
+ heap->bh_compare(heap->bh_nodes[left_off],
+ heap->bh_nodes[right_off],
+ heap->bh_arg) < 0)
+ swap_off = right_off;
+ }
+
+ /*
+ * If we didn't find anything to swap, the heap condition is
+ * satisfied, and we're done.
+ */
+ if (!swap_off)
+ break;
+
+ /*
+ * Otherwise, swap the hole with the child that violates the heap
+ * property; then go on to check its children.
+ */
+ heap->bh_nodes[node_off] = heap->bh_nodes[swap_off];
+ node_off = swap_off;
+ }
+ /* Re-fill the hole */
+ heap->bh_nodes[node_off] = node_val;
+}
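
A rough usage sketch of the binaryheap API established above (not part of the commit). Because sift_up()/sift_down() move elements that compare greater toward the root, an ascending comparator yields a max-heap; the names cmp_int32 and heap_example are illustrative only.

#include "postgres.h"
#include "lib/binaryheap.h"

/* Order int32 Datums ascending, so the root is the numerically largest. */
static int
cmp_int32(Datum a, Datum b, void *arg)
{
    int32       av = DatumGetInt32(a);
    int32       bv = DatumGetInt32(b);

    if (av < bv)
        return -1;
    if (av > bv)
        return 1;
    return 0;
}

static void
heap_example(void)
{
    binaryheap *h = binaryheap_allocate(16, cmp_int32, NULL);

    /* Bulk-load in O(1) per element, then heapify once in O(n). */
    binaryheap_add_unordered(h, Int32GetDatum(5));
    binaryheap_add_unordered(h, Int32GetDatum(9));
    binaryheap_add_unordered(h, Int32GetDatum(2));
    binaryheap_build(h);

    Assert(DatumGetInt32(binaryheap_first(h)) == 9);

    /* Pop in descending order: 9, 5, 2. */
    while (!binaryheap_empty(h))
        (void) binaryheap_remove_first(h);

    binaryheap_free(h);
}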
diff --git a/src/backend/lib/bipartite_match.c b/src/backend/lib/bipartite_match.c
new file mode 100644
index 0000000..8e1637f
--- /dev/null
+++ b/src/backend/lib/bipartite_match.c
@@ -0,0 +1,180 @@
+/*-------------------------------------------------------------------------
+ *
+ * bipartite_match.c
+ * Hopcroft-Karp maximum cardinality algorithm for bipartite graphs
+ *
+ * This implementation is based on pseudocode found at:
+ *
+ * https://en.wikipedia.org/w/index.php?title=Hopcroft%E2%80%93Karp_algorithm&oldid=593898016
+ *
+ * Copyright (c) 2015-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/lib/bipartite_match.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "lib/bipartite_match.h"
+#include "miscadmin.h"
+
+/*
+ * The distances computed in hk_breadth_search can easily be seen to never
+ * exceed u_size. Since we restrict u_size to be less than SHRT_MAX, we
+ * can therefore use SHRT_MAX as the "infinity" distance needed as a marker.
+ */
+#define HK_INFINITY SHRT_MAX
+
+static bool hk_breadth_search(BipartiteMatchState *state);
+static bool hk_depth_search(BipartiteMatchState *state, int u);
+
+/*
+ * Given the size of U and V, where each is indexed 1..size, and an adjacency
+ * list, perform the matching and return the resulting state.
+ */
+BipartiteMatchState *
+BipartiteMatch(int u_size, int v_size, short **adjacency)
+{
+ BipartiteMatchState *state = palloc(sizeof(BipartiteMatchState));
+
+ if (u_size < 0 || u_size >= SHRT_MAX ||
+ v_size < 0 || v_size >= SHRT_MAX)
+ elog(ERROR, "invalid set size for BipartiteMatch");
+
+ state->u_size = u_size;
+ state->v_size = v_size;
+ state->adjacency = adjacency;
+ state->matching = 0;
+ state->pair_uv = (short *) palloc0((u_size + 1) * sizeof(short));
+ state->pair_vu = (short *) palloc0((v_size + 1) * sizeof(short));
+ state->distance = (short *) palloc((u_size + 1) * sizeof(short));
+ state->queue = (short *) palloc((u_size + 2) * sizeof(short));
+
+ while (hk_breadth_search(state))
+ {
+ int u;
+
+ for (u = 1; u <= u_size; u++)
+ {
+ if (state->pair_uv[u] == 0)
+ if (hk_depth_search(state, u))
+ state->matching++;
+ }
+
+ CHECK_FOR_INTERRUPTS(); /* just in case */
+ }
+
+ return state;
+}
+
+/*
+ * Free a state returned by BipartiteMatch, except for the original adjacency
+ * list, which is owned by the caller. This only frees memory, so it's optional.
+ */
+void
+BipartiteMatchFree(BipartiteMatchState *state)
+{
+ /* adjacency matrix is treated as owned by the caller */
+ pfree(state->pair_uv);
+ pfree(state->pair_vu);
+ pfree(state->distance);
+ pfree(state->queue);
+ pfree(state);
+}
+
+/*
+ * Perform the breadth-first search step of H-K matching.
+ * Returns true if an augmenting path was found, i.e. the search reached an
+ * unmatched vertex in V.
+ */
+static bool
+hk_breadth_search(BipartiteMatchState *state)
+{
+ int usize = state->u_size;
+ short *queue = state->queue;
+ short *distance = state->distance;
+ int qhead = 0; /* we never enqueue any node more than once */
+ int qtail = 0; /* so don't have to worry about wrapping */
+ int u;
+
+ distance[0] = HK_INFINITY;
+
+ for (u = 1; u <= usize; u++)
+ {
+ if (state->pair_uv[u] == 0)
+ {
+ distance[u] = 0;
+ queue[qhead++] = u;
+ }
+ else
+ distance[u] = HK_INFINITY;
+ }
+
+ while (qtail < qhead)
+ {
+ u = queue[qtail++];
+
+ if (distance[u] < distance[0])
+ {
+ short *u_adj = state->adjacency[u];
+ int i = u_adj ? u_adj[0] : 0;
+
+ for (; i > 0; i--)
+ {
+ int u_next = state->pair_vu[u_adj[i]];
+
+ if (distance[u_next] == HK_INFINITY)
+ {
+ distance[u_next] = 1 + distance[u];
+ Assert(qhead < usize + 2);
+ queue[qhead++] = u_next;
+ }
+ }
+ }
+ }
+
+ return (distance[0] != HK_INFINITY);
+}
+
+/*
+ * Perform the depth-first search step of H-K matching.
+ * Returns true if an augmenting path starting at u was found and the
+ * matching was extended along it.
+ */
+static bool
+hk_depth_search(BipartiteMatchState *state, int u)
+{
+ short *distance = state->distance;
+ short *pair_uv = state->pair_uv;
+ short *pair_vu = state->pair_vu;
+ short *u_adj = state->adjacency[u];
+ int i = u_adj ? u_adj[0] : 0;
+ short nextdist;
+
+ if (u == 0)
+ return true;
+ if (distance[u] == HK_INFINITY)
+ return false;
+ nextdist = distance[u] + 1;
+
+ check_stack_depth();
+
+ for (; i > 0; i--)
+ {
+ int v = u_adj[i];
+
+ if (distance[pair_vu[v]] == nextdist)
+ {
+ if (hk_depth_search(state, pair_vu[v]))
+ {
+ pair_vu[v] = u;
+ pair_uv[u] = v;
+ return true;
+ }
+ }
+ }
+
+ distance[u] = HK_INFINITY;
+ return false;
+}
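
A rough sketch of driving the matcher above (not part of the commit). The adjacency convention follows what the code reads: adjacency[u] points to a short array whose element [0] is the edge count, followed by that many V indices, with both U and V numbered from 1. The variable names are illustrative.

#include "postgres.h"
#include "lib/bipartite_match.h"

static void
match_example(void)
{
    /* U = {1, 2}, V = {1, 2}; edges u1-v1, u1-v2, u2-v1 */
    short       u1_adj[] = {2, 1, 2};   /* [0] = edge count, then V indices */
    short       u2_adj[] = {1, 1};
    short      *adjacency[3];           /* indexed 1..u_size; slot 0 unused */
    BipartiteMatchState *state;

    adjacency[0] = NULL;
    adjacency[1] = u1_adj;
    adjacency[2] = u2_adj;

    state = BipartiteMatch(2, 2, adjacency);

    /* A perfect matching exists: u1-v2 and u2-v1. */
    Assert(state->matching == 2);
    /* state->pair_uv[u] is the V matched to u, or 0 if unmatched. */

    BipartiteMatchFree(state);
}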
diff --git a/src/backend/lib/bloomfilter.c b/src/backend/lib/bloomfilter.c
new file mode 100644
index 0000000..465ca7c
--- /dev/null
+++ b/src/backend/lib/bloomfilter.c
@@ -0,0 +1,294 @@
+/*-------------------------------------------------------------------------
+ *
+ * bloomfilter.c
+ * Space-efficient set membership testing
+ *
+ * A Bloom filter is a probabilistic data structure that is used to test an
+ * element's membership of a set. False positives are possible, but false
+ * negatives are not; a test of membership of the set returns either "possibly
+ * in set" or "definitely not in set". This is typically very space efficient,
+ * which can be a decisive advantage.
+ *
+ * Elements can be added to the set, but not removed. The more elements that
+ * are added, the larger the probability of false positives. Caller must hint
+ * an estimated total size of the set when the Bloom filter is initialized.
+ * This is used to balance the use of memory against the final false positive
+ * rate.
+ *
+ * The implementation is well suited to data synchronization problems between
+ * unordered sets, especially where predictable performance is important and
+ * some false positives are acceptable. It's also well suited to cache
+ * filtering problems where a relatively small and/or low cardinality set is
+ * fingerprinted, especially when many subsequent membership tests end up
+ * indicating that values of interest are not present. That should save the
+ * caller many authoritative lookups, such as expensive probes of a much larger
+ * on-disk structure.
+ *
+ * Copyright (c) 2018-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/lib/bloomfilter.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "common/hashfn.h"
+#include "lib/bloomfilter.h"
+#include "port/pg_bitutils.h"
+
+#define MAX_HASH_FUNCS 10
+
+struct bloom_filter
+{
+ /* K hash functions are used, seeded by caller's seed */
+ int k_hash_funcs;
+ uint64 seed;
+ /* m is bitset size, in bits. Must be a power of two <= 2^32. */
+ uint64 m;
+ unsigned char bitset[FLEXIBLE_ARRAY_MEMBER];
+};
+
+static int my_bloom_power(uint64 target_bitset_bits);
+static int optimal_k(uint64 bitset_bits, int64 total_elems);
+static void k_hashes(bloom_filter *filter, uint32 *hashes, unsigned char *elem,
+ size_t len);
+static inline uint32 mod_m(uint32 a, uint64 m);
+
+/*
+ * Create Bloom filter in caller's memory context. We aim for a false positive
+ * rate of between 1% and 2% when bitset size is not constrained by memory
+ * availability.
+ *
+ * total_elems is an estimate of the final size of the set. It should be
+ * approximately correct, but the implementation can cope well with it being
+ * off by perhaps a factor of five or more. See "Bloom Filters in
+ * Probabilistic Verification" (Dillinger & Manolios, 2004) for details of why
+ * this is the case.
+ *
+ * bloom_work_mem is sized in KB, in line with the general work_mem convention.
+ * This determines the size of the underlying bitset (trivial bookkeeping space
+ * isn't counted). The bitset is always sized as a power of two number of
+ * bits, and the largest possible bitset is 512MB (2^32 bits). The
+ * implementation allocates only enough memory to target its standard false
+ * positive rate, using a simple formula with caller's total_elems estimate as
+ * an input. The bitset might be as small as 1MB, even when bloom_work_mem is
+ * much higher.
+ *
+ * The Bloom filter is seeded using a value provided by the caller. Using a
+ * distinct seed value on every call makes it unlikely that the same false
+ * positives will reoccur when the same set is fingerprinted a second time.
+ * Callers that don't care about this pass a constant as their seed, typically
+ * 0. Callers can also use a pseudo-random seed, eg from pg_prng_uint64().
+ */
+bloom_filter *
+bloom_create(int64 total_elems, int bloom_work_mem, uint64 seed)
+{
+ bloom_filter *filter;
+ int bloom_power;
+ uint64 bitset_bytes;
+ uint64 bitset_bits;
+
+ /*
+ * Aim for two bytes per element; this is sufficient to get a false
+ * positive rate below 1%, independent of the size of the bitset or total
+ * number of elements. Also, if rounding down the size of the bitset to
+ * the next lowest power of two turns out to be a significant drop, the
+ * false positive rate still won't exceed 2% in almost all cases.
+ */
+ bitset_bytes = Min(bloom_work_mem * UINT64CONST(1024), total_elems * 2);
+ bitset_bytes = Max(1024 * 1024, bitset_bytes);
+
+ /*
+ * Size in bits should be the highest power of two <= target. bitset_bits
+ * is uint64 because PG_UINT32_MAX is 2^32 - 1, not 2^32
+ */
+ bloom_power = my_bloom_power(bitset_bytes * BITS_PER_BYTE);
+ bitset_bits = UINT64CONST(1) << bloom_power;
+ bitset_bytes = bitset_bits / BITS_PER_BYTE;
+
+ /* Allocate bloom filter with unset bitset */
+ filter = palloc0(offsetof(bloom_filter, bitset) +
+ sizeof(unsigned char) * bitset_bytes);
+ filter->k_hash_funcs = optimal_k(bitset_bits, total_elems);
+ filter->seed = seed;
+ filter->m = bitset_bits;
+
+ return filter;
+}
+
+/*
+ * Free Bloom filter
+ */
+void
+bloom_free(bloom_filter *filter)
+{
+ pfree(filter);
+}
+
+/*
+ * Add element to Bloom filter
+ */
+void
+bloom_add_element(bloom_filter *filter, unsigned char *elem, size_t len)
+{
+ uint32 hashes[MAX_HASH_FUNCS];
+ int i;
+
+ k_hashes(filter, hashes, elem, len);
+
+ /* Map a bit-wise address to a byte-wise address + bit offset */
+ for (i = 0; i < filter->k_hash_funcs; i++)
+ {
+ filter->bitset[hashes[i] >> 3] |= 1 << (hashes[i] & 7);
+ }
+}
+
+/*
+ * Test if Bloom filter definitely lacks element.
+ *
+ * Returns true if the element is definitely not in the set of elements
+ * observed by bloom_add_element(). Otherwise, returns false, indicating that
+ * element is probably present in set.
+ */
+bool
+bloom_lacks_element(bloom_filter *filter, unsigned char *elem, size_t len)
+{
+ uint32 hashes[MAX_HASH_FUNCS];
+ int i;
+
+ k_hashes(filter, hashes, elem, len);
+
+ /* Map a bit-wise address to a byte-wise address + bit offset */
+ for (i = 0; i < filter->k_hash_funcs; i++)
+ {
+ if (!(filter->bitset[hashes[i] >> 3] & (1 << (hashes[i] & 7))))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * What proportion of bits are currently set?
+ *
+ * Returns proportion, expressed as a multiplier of filter size. That should
+ * generally be close to 0.5, even when we have more than enough memory to
+ * ensure a false positive rate within target 1% to 2% band, since more hash
+ * functions are used as more memory is available per element.
+ *
+ * This is the only instrumentation that is low overhead enough to appear in
+ * debug traces. When debugging Bloom filter code, it's likely to be far more
+ * interesting to directly test the false positive rate.
+ */
+double
+bloom_prop_bits_set(bloom_filter *filter)
+{
+ int bitset_bytes = filter->m / BITS_PER_BYTE;
+ uint64 bits_set = pg_popcount((char *) filter->bitset, bitset_bytes);
+
+ return bits_set / (double) filter->m;
+}
+
+/*
+ * Which element in the sequence of powers of two is less than or equal to
+ * target_bitset_bits?
+ *
+ * Value returned here must be generally safe as the basis for actual bitset
+ * size.
+ *
+ * Bitset is never allowed to exceed 2 ^ 32 bits (512MB). This is sufficient
+ * for the needs of all current callers, and allows us to use 32-bit hash
+ * functions. It also makes it easy to stay under the MaxAllocSize restriction
+ * (caller needs to leave room for non-bitset fields that appear before
+ * flexible array member, so a 1GB bitset would use an allocation that just
+ * exceeds MaxAllocSize).
+ */
+static int
+my_bloom_power(uint64 target_bitset_bits)
+{
+ int bloom_power = -1;
+
+ while (target_bitset_bits > 0 && bloom_power < 32)
+ {
+ bloom_power++;
+ target_bitset_bits >>= 1;
+ }
+
+ return bloom_power;
+}
+
+/*
+ * Determine optimal number of hash functions based on size of filter in bits,
+ * and projected total number of elements. The optimal number is the number
+ * that minimizes the false positive rate.
+ */
+static int
+optimal_k(uint64 bitset_bits, int64 total_elems)
+{
+ int k = rint(log(2.0) * bitset_bits / total_elems);
+
+ return Max(1, Min(k, MAX_HASH_FUNCS));
+}
+
+/*
+ * Generate k hash values for element.
+ *
+ * Caller passes array, which is filled-in with k values determined by hashing
+ * caller's element.
+ *
+ * Only 2 real independent hash functions are actually used to support an
+ * interface of up to MAX_HASH_FUNCS hash functions; enhanced double hashing is
+ * used to make this work. The main reason we prefer enhanced double hashing
+ * to classic double hashing is that the latter has an issue with collisions
+ * when using power of two sized bitsets. See Dillinger & Manolios for full
+ * details.
+ */
+static void
+k_hashes(bloom_filter *filter, uint32 *hashes, unsigned char *elem, size_t len)
+{
+ uint64 hash;
+ uint32 x,
+ y;
+ uint64 m;
+ int i;
+
+ /* Use 64-bit hashing to get two independent 32-bit hashes */
+ hash = DatumGetUInt64(hash_any_extended(elem, len, filter->seed));
+ x = (uint32) hash;
+ y = (uint32) (hash >> 32);
+ m = filter->m;
+
+ x = mod_m(x, m);
+ y = mod_m(y, m);
+
+ /* Accumulate hashes */
+ hashes[0] = x;
+ for (i = 1; i < filter->k_hash_funcs; i++)
+ {
+ x = mod_m(x + y, m);
+ y = mod_m(y + i, m);
+
+ hashes[i] = x;
+ }
+}
+
+/*
+ * Calculate "val MOD m" inexpensively.
+ *
+ * Assumes that m (which is bitset size) is a power of two.
+ *
+ * Using a power of two number of bits for bitset size allows us to use bitwise
+ * AND operations to calculate the modulo of a hash value. It's also a simple
+ * way of avoiding the modulo bias effect.
+ */
+static inline uint32
+mod_m(uint32 val, uint64 m)
+{
+ Assert(m <= PG_UINT32_MAX + UINT64CONST(1));
+ Assert(((m - 1) & m) == 0);
+
+ return val & (m - 1);
+}
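
A rough usage sketch of the Bloom filter API above (not part of the commit); the element count, memory budget, and key string are illustrative. Recall that bloom_lacks_element() can return false for an element that was never added (a false positive), but never returns true for one that was.

#include "postgres.h"

#include <string.h>

#include "lib/bloomfilter.h"

static void
bloom_example(void)
{
    bloom_filter *filter;
    char        present[] = "example-key";

    /* Expect ~1 million elements; cap the bitset at a 4MB (KB-unit) budget. */
    filter = bloom_create(1000000, 4096, 0);

    bloom_add_element(filter, (unsigned char *) present, strlen(present));

    /* No false negatives: an added element is never reported as lacking. */
    Assert(!bloom_lacks_element(filter, (unsigned char *) present,
                                strlen(present)));

    elog(DEBUG1, "proportion of bits set: %f", bloom_prop_bits_set(filter));

    bloom_free(filter);
}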
diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c
new file mode 100644
index 0000000..c5c032a
--- /dev/null
+++ b/src/backend/lib/dshash.c
@@ -0,0 +1,1034 @@
+/*-------------------------------------------------------------------------
+ *
+ * dshash.c
+ * Concurrent hash tables backed by dynamic shared memory areas.
+ *
+ * This is an open hashing hash table, with a linked list at each table
+ * entry. It supports dynamic resizing, as required to prevent the linked
+ * lists from growing too long on average. Currently, only growing is
+ * supported: the hash table never becomes smaller.
+ *
+ * To deal with concurrency, it has a fixed size set of partitions, each of
+ * which is independently locked. Each bucket maps to a partition; so insert,
+ * find and iterate operations normally only acquire one lock. Therefore,
+ * good concurrency is achieved whenever such operations don't collide at the
+ * lock partition level. However, when a resize operation begins, all
+ * partition locks must be acquired simultaneously for a brief period. This
+ * is only expected to happen a small number of times until a stable size is
+ * found, since growth is geometric.
+ *
+ * Future versions may support iterators and incremental resizing; for now
+ * the implementation is minimalist.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/lib/dshash.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "common/hashfn.h"
+#include "lib/dshash.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "utils/dsa.h"
+#include "utils/memutils.h"
+
+/*
+ * An item in the hash table. This wraps the user's entry object in an
+ * envelope that holds the entry's hash value and a pointer to the next
+ * item in the same bucket.
+ */
+struct dshash_table_item
+{
+ /* The next item in the same bucket. */
+ dsa_pointer next;
+ /* The hashed key, to avoid having to recompute it. */
+ dshash_hash hash;
+ /* The user's entry object follows here. See ENTRY_FROM_ITEM(item). */
+};
+
+/*
+ * The number of partitions for locking purposes. This is set to match
+ * NUM_BUFFER_PARTITIONS for now, on the basis that whatever's good enough for
+ * the buffer pool must be good enough for any other purpose. This could
+ * become a runtime parameter in future.
+ */
+#define DSHASH_NUM_PARTITIONS_LOG2 7
+#define DSHASH_NUM_PARTITIONS (1 << DSHASH_NUM_PARTITIONS_LOG2)
+
+/* A magic value used to identify our hash tables. */
+#define DSHASH_MAGIC 0x75ff6a20
+
+/*
+ * Tracking information for each lock partition. Initially, each partition
+ * corresponds to one bucket, but each time the hash table grows, the buckets
+ * covered by each partition split so the number of buckets covered doubles.
+ *
+ * We might want to add padding here so that each partition is on a different
+ * cache line, but doing so would bloat this structure considerably.
+ */
+typedef struct dshash_partition
+{
+ LWLock lock; /* Protects all buckets in this partition. */
+ size_t count; /* # of items in this partition's buckets */
+} dshash_partition;
+
+/*
+ * The head object for a hash table. This will be stored in dynamic shared
+ * memory.
+ */
+typedef struct dshash_table_control
+{
+ dshash_table_handle handle;
+ uint32 magic;
+ dshash_partition partitions[DSHASH_NUM_PARTITIONS];
+ int lwlock_tranche_id;
+
+ /*
+ * The following members are written to only when ALL partition locks are
+ * held. They can be read when any one partition lock is held.
+ */
+
+ /* Number of buckets expressed as power of 2 (8 = 256 buckets). */
+ size_t size_log2; /* log2(number of buckets) */
+ dsa_pointer buckets; /* current bucket array */
+} dshash_table_control;
+
+/*
+ * Per-backend state for a dynamic hash table.
+ */
+struct dshash_table
+{
+ dsa_area *area; /* Backing dynamic shared memory area. */
+ dshash_parameters params; /* Parameters. */
+ void *arg; /* User-supplied data pointer. */
+ dshash_table_control *control; /* Control object in DSM. */
+ dsa_pointer *buckets; /* Current bucket pointers in DSM. */
+ size_t size_log2; /* log2(number of buckets) */
+};
+
+/* Given a pointer to an item, find the entry (user data) it holds. */
+#define ENTRY_FROM_ITEM(item) \
+ ((char *)(item) + MAXALIGN(sizeof(dshash_table_item)))
+
+/* Given a pointer to an entry, find the item that holds it. */
+#define ITEM_FROM_ENTRY(entry) \
+ ((dshash_table_item *)((char *)(entry) - \
+ MAXALIGN(sizeof(dshash_table_item))))
+
+/* How many resize operations (bucket splits) have there been? */
+#define NUM_SPLITS(size_log2) \
+ (size_log2 - DSHASH_NUM_PARTITIONS_LOG2)
+
+/* How many buckets are there in a given size? */
+#define NUM_BUCKETS(size_log2) \
+ (((size_t) 1) << (size_log2))
+
+/* How many buckets are there in each partition at a given size? */
+#define BUCKETS_PER_PARTITION(size_log2) \
+ (((size_t) 1) << NUM_SPLITS(size_log2))
+
+/* Max entries before we need to grow. Half + quarter = 75% load factor. */
+#define MAX_COUNT_PER_PARTITION(hash_table) \
+ (BUCKETS_PER_PARTITION(hash_table->size_log2) / 2 + \
+ BUCKETS_PER_PARTITION(hash_table->size_log2) / 4)
+
+/* Choose partition based on the highest order bits of the hash. */
+#define PARTITION_FOR_HASH(hash) \
+ (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - DSHASH_NUM_PARTITIONS_LOG2))
+
+/*
+ * Find the bucket index for a given hash and table size. Each time the table
+ * doubles in size, the appropriate bucket for a given hash value doubles and
+ * possibly adds one, depending on the newly revealed bit, so that all buckets
+ * are split.
+ */
+#define BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, size_log2) \
+ (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - (size_log2)))
+
+/* The index of the first bucket in a given partition. */
+#define BUCKET_INDEX_FOR_PARTITION(partition, size_log2) \
+ ((partition) << NUM_SPLITS(size_log2))
+
+/* Choose partition based on bucket index. */
+#define PARTITION_FOR_BUCKET_INDEX(bucket_idx, size_log2) \
+ ((bucket_idx) >> NUM_SPLITS(size_log2))
+
+/* The head of the active bucket for a given hash value (lvalue). */
+#define BUCKET_FOR_HASH(hash_table, hash) \
+ (hash_table->buckets[ \
+ BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, \
+ hash_table->size_log2)])
+
+static void delete_item(dshash_table *hash_table,
+ dshash_table_item *item);
+static void resize(dshash_table *hash_table, size_t new_size);
+static inline void ensure_valid_bucket_pointers(dshash_table *hash_table);
+static inline dshash_table_item *find_in_bucket(dshash_table *hash_table,
+ const void *key,
+ dsa_pointer item_pointer);
+static void insert_item_into_bucket(dshash_table *hash_table,
+ dsa_pointer item_pointer,
+ dshash_table_item *item,
+ dsa_pointer *bucket);
+static dshash_table_item *insert_into_bucket(dshash_table *hash_table,
+ const void *key,
+ dsa_pointer *bucket);
+static bool delete_key_from_bucket(dshash_table *hash_table,
+ const void *key,
+ dsa_pointer *bucket_head);
+static bool delete_item_from_bucket(dshash_table *hash_table,
+ dshash_table_item *item,
+ dsa_pointer *bucket_head);
+static inline dshash_hash hash_key(dshash_table *hash_table, const void *key);
+static inline bool equal_keys(dshash_table *hash_table,
+ const void *a, const void *b);
+
+#define PARTITION_LOCK(hash_table, i) \
+ (&(hash_table)->control->partitions[(i)].lock)
+
+#define ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table) \
+ Assert(!LWLockAnyHeldByMe(&(hash_table)->control->partitions[0].lock, \
+ DSHASH_NUM_PARTITIONS, sizeof(dshash_partition)))
+
+/*
+ * Create a new hash table backed by the given dynamic shared area, with the
+ * given parameters. The returned object is allocated in backend-local memory
+ * using the current MemoryContext. 'arg' will be passed through to the
+ * compare and hash functions.
+ */
+dshash_table *
+dshash_create(dsa_area *area, const dshash_parameters *params, void *arg)
+{
+ dshash_table *hash_table;
+ dsa_pointer control;
+
+ /* Allocate the backend-local object representing the hash table. */
+ hash_table = palloc(sizeof(dshash_table));
+
+ /* Allocate the control object in shared memory. */
+ control = dsa_allocate(area, sizeof(dshash_table_control));
+
+ /* Set up the local and shared hash table structs. */
+ hash_table->area = area;
+ hash_table->params = *params;
+ hash_table->arg = arg;
+ hash_table->control = dsa_get_address(area, control);
+ hash_table->control->handle = control;
+ hash_table->control->magic = DSHASH_MAGIC;
+ hash_table->control->lwlock_tranche_id = params->tranche_id;
+
+ /* Set up the array of lock partitions. */
+ {
+ dshash_partition *partitions = hash_table->control->partitions;
+ int tranche_id = hash_table->control->lwlock_tranche_id;
+ int i;
+
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ {
+ LWLockInitialize(&partitions[i].lock, tranche_id);
+ partitions[i].count = 0;
+ }
+ }
+
+ /*
+ * Set up the initial array of buckets. Our initial size is the same as
+ * the number of partitions.
+ */
+ hash_table->control->size_log2 = DSHASH_NUM_PARTITIONS_LOG2;
+ hash_table->control->buckets =
+ dsa_allocate_extended(area,
+ sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS,
+ DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);
+ if (!DsaPointerIsValid(hash_table->control->buckets))
+ {
+ dsa_free(area, control);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on DSA request of size %zu.",
+ sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS)));
+ }
+ hash_table->buckets = dsa_get_address(area,
+ hash_table->control->buckets);
+ hash_table->size_log2 = hash_table->control->size_log2;
+
+ return hash_table;
+}
+
+/*
+ * Attach to an existing hash table using a handle. The returned object is
+ * allocated in backend-local memory using the current MemoryContext. 'arg'
+ * will be passed through to the compare and hash functions.
+ */
+dshash_table *
+dshash_attach(dsa_area *area, const dshash_parameters *params,
+ dshash_table_handle handle, void *arg)
+{
+ dshash_table *hash_table;
+ dsa_pointer control;
+
+ /* Allocate the backend-local object representing the hash table. */
+ hash_table = palloc(sizeof(dshash_table));
+
+ /* Find the control object in shared memory. */
+ control = handle;
+
+ /* Set up the local hash table struct. */
+ hash_table->area = area;
+ hash_table->params = *params;
+ hash_table->arg = arg;
+ hash_table->control = dsa_get_address(area, control);
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+
+ /*
+ * These will later be set to the correct values by
+ * ensure_valid_bucket_pointers(), at which time we'll be holding a
+ * partition lock for interlocking against concurrent resizing.
+ */
+ hash_table->buckets = NULL;
+ hash_table->size_log2 = 0;
+
+ return hash_table;
+}
+
+/*
+ * Detach from a hash table. This frees backend-local resources associated
+ * with the hash table, but the hash table will continue to exist until it is
+ * either explicitly destroyed (by a backend that is still attached to it), or
+ * the area that backs it is returned to the operating system.
+ */
+void
+dshash_detach(dshash_table *hash_table)
+{
+ ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
+
+ /* The hash table may have been destroyed. Just free local memory. */
+ pfree(hash_table);
+}
+
+/*
+ * Destroy a hash table, returning all memory to the area. The caller must be
+ * certain that no other backend will attempt to access the hash table before
+ * calling this function. Other backend must explicitly call dshash_detach to
+ * free up backend-local memory associated with the hash table. The backend
+ * that calls dshash_destroy must not call dshash_detach.
+ */
+void
+dshash_destroy(dshash_table *hash_table)
+{
+ size_t size;
+ size_t i;
+
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+ ensure_valid_bucket_pointers(hash_table);
+
+ /* Free all the entries. */
+ size = NUM_BUCKETS(hash_table->size_log2);
+ for (i = 0; i < size; ++i)
+ {
+ dsa_pointer item_pointer = hash_table->buckets[i];
+
+ while (DsaPointerIsValid(item_pointer))
+ {
+ dshash_table_item *item;
+ dsa_pointer next_item_pointer;
+
+ item = dsa_get_address(hash_table->area, item_pointer);
+ next_item_pointer = item->next;
+ dsa_free(hash_table->area, item_pointer);
+ item_pointer = next_item_pointer;
+ }
+ }
+
+ /*
+ * Vandalize the control block to help catch programming errors where
+ * other backends access the memory formerly occupied by this hash table.
+ */
+ hash_table->control->magic = 0;
+
+ /* Free the active table and control object. */
+ dsa_free(hash_table->area, hash_table->control->buckets);
+ dsa_free(hash_table->area, hash_table->control->handle);
+
+ pfree(hash_table);
+}
+
+/*
+ * Get a handle that can be used by other processes to attach to this hash
+ * table.
+ */
+dshash_table_handle
+dshash_get_hash_table_handle(dshash_table *hash_table)
+{
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+
+ return hash_table->control->handle;
+}
+
+/*
+ * Look up an entry, given a key. Returns a pointer to an entry if one can be
+ * found with the given key. Returns NULL if the key is not found. If a
+ * non-NULL value is returned, the entry is locked and must be released by
+ * calling dshash_release_lock. If an error is raised before
+ * dshash_release_lock is called, the lock will be released automatically, but
+ * the caller must take care to ensure that the entry is not left corrupted.
+ * The lock mode is either shared or exclusive depending on 'exclusive'.
+ *
+ * The caller must not hold a lock already.
+ *
+ * Note that the lock held is in fact an LWLock, so interrupts will be held on
+ * return from this function, and not resumed until dshash_release_lock is
+ * called. It is a very good idea for the caller to release the lock quickly.
+ */
+void *
+dshash_find(dshash_table *hash_table, const void *key, bool exclusive)
+{
+ dshash_hash hash;
+ size_t partition;
+ dshash_table_item *item;
+
+ hash = hash_key(hash_table, key);
+ partition = PARTITION_FOR_HASH(hash);
+
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+ ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
+
+ LWLockAcquire(PARTITION_LOCK(hash_table, partition),
+ exclusive ? LW_EXCLUSIVE : LW_SHARED);
+ ensure_valid_bucket_pointers(hash_table);
+
+ /* Search the active bucket. */
+ item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));
+
+ if (!item)
+ {
+ /* Not found. */
+ LWLockRelease(PARTITION_LOCK(hash_table, partition));
+ return NULL;
+ }
+ else
+ {
+ /* The caller will free the lock by calling dshash_release_lock. */
+ return ENTRY_FROM_ITEM(item);
+ }
+}
+
+/*
+ * Returns a pointer to an exclusively locked item which must be released with
+ * dshash_release_lock. If the key is found in the hash table, 'found' is set
+ * to true and a pointer to the existing entry is returned. If the key is not
+ * found, 'found' is set to false, and a pointer to a newly created entry is
+ * returned.
+ *
+ * Notes above dshash_find() regarding locking and error handling equally
+ * apply here.
+ */
+void *
+dshash_find_or_insert(dshash_table *hash_table,
+ const void *key,
+ bool *found)
+{
+ dshash_hash hash;
+ size_t partition_index;
+ dshash_partition *partition;
+ dshash_table_item *item;
+
+ hash = hash_key(hash_table, key);
+ partition_index = PARTITION_FOR_HASH(hash);
+ partition = &hash_table->control->partitions[partition_index];
+
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+ ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
+
+restart:
+ LWLockAcquire(PARTITION_LOCK(hash_table, partition_index),
+ LW_EXCLUSIVE);
+ ensure_valid_bucket_pointers(hash_table);
+
+ /* Search the active bucket. */
+ item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));
+
+ if (item)
+ *found = true;
+ else
+ {
+ *found = false;
+
+ /* Check if we are getting too full. */
+ if (partition->count > MAX_COUNT_PER_PARTITION(hash_table))
+ {
+ /*
+ * The load factor (= keys / buckets) for all buckets protected by
+ * this partition is > 0.75. Presumably the same applies
+ * generally across the whole hash table (though we don't attempt
+ * to track that directly to avoid contention on some kind of
+ * central counter; we just assume that this partition is
+ * representative). This is a good time to resize.
+ *
+ * Give up our existing lock first, because resizing needs to
+ * reacquire all the locks in the right order to avoid deadlocks.
+ */
+ LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
+ resize(hash_table, hash_table->size_log2 + 1);
+
+ goto restart;
+ }
+
+ /* Finally we can try to insert the new item. */
+ item = insert_into_bucket(hash_table, key,
+ &BUCKET_FOR_HASH(hash_table, hash));
+ item->hash = hash;
+ /* Adjust per-lock-partition counter for load factor knowledge. */
+ ++partition->count;
+ }
+
+ /* The caller must release the lock with dshash_release_lock. */
+ return ENTRY_FROM_ITEM(item);
+}
+
+/*
+ * Remove an entry by key. Returns true if the key was found and the
+ * corresponding entry was removed.
+ *
+ * To delete an entry that you already have a pointer to, see
+ * dshash_delete_entry.
+ */
+bool
+dshash_delete_key(dshash_table *hash_table, const void *key)
+{
+ dshash_hash hash;
+ size_t partition;
+ bool found;
+
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+ ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
+
+ hash = hash_key(hash_table, key);
+ partition = PARTITION_FOR_HASH(hash);
+
+ LWLockAcquire(PARTITION_LOCK(hash_table, partition), LW_EXCLUSIVE);
+ ensure_valid_bucket_pointers(hash_table);
+
+ if (delete_key_from_bucket(hash_table, key,
+ &BUCKET_FOR_HASH(hash_table, hash)))
+ {
+ Assert(hash_table->control->partitions[partition].count > 0);
+ found = true;
+ --hash_table->control->partitions[partition].count;
+ }
+ else
+ found = false;
+
+ LWLockRelease(PARTITION_LOCK(hash_table, partition));
+
+ return found;
+}
+
+/*
+ * Remove an entry. The entry must already be exclusively locked, and must
+ * have been obtained by dshash_find or dshash_find_or_insert. Note that this
+ * function releases the lock just like dshash_release_lock.
+ *
+ * To delete an entry by key, see dshash_delete_key.
+ */
+void
+dshash_delete_entry(dshash_table *hash_table, void *entry)
+{
+ dshash_table_item *item = ITEM_FROM_ENTRY(entry);
+ size_t partition = PARTITION_FOR_HASH(item->hash);
+
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+ Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition),
+ LW_EXCLUSIVE));
+
+ delete_item(hash_table, item);
+ LWLockRelease(PARTITION_LOCK(hash_table, partition));
+}
+
+/*
+ * Unlock an entry which was locked by dshash_find or dshash_find_or_insert.
+ */
+void
+dshash_release_lock(dshash_table *hash_table, void *entry)
+{
+ dshash_table_item *item = ITEM_FROM_ENTRY(entry);
+ size_t partition_index = PARTITION_FOR_HASH(item->hash);
+
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+
+ LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
+}
+
+/*
+ * A compare function that forwards to memcmp.
+ */
+int
+dshash_memcmp(const void *a, const void *b, size_t size, void *arg)
+{
+ return memcmp(a, b, size);
+}
+
+/*
+ * A hash function that forwards to tag_hash.
+ */
+dshash_hash
+dshash_memhash(const void *v, size_t size, void *arg)
+{
+ return tag_hash(v, size);
+}
+
+/*
+ * Initialize a sequential scan of the hash table. dshash_seq_next() then
+ * returns the elements one by one, and NULL once all elements have been
+ * returned.
+ *
+ * dshash_seq_term needs to be called when the scan is finished. The caller
+ * may delete returned elements in the middle of a scan by using
+ * dshash_delete_current() if exclusive = true.
+ */
+void
+dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
+ bool exclusive)
+{
+ status->hash_table = hash_table;
+ status->curbucket = 0;
+ status->nbuckets = 0;
+ status->curitem = NULL;
+ status->pnextitem = InvalidDsaPointer;
+ status->curpartition = -1;
+ status->exclusive = exclusive;
+}
+
+/*
+ * Returns the next element.
+ *
+ * Returned elements are locked and the caller may not release the lock. It is
+ * released by future calls to dshash_seq_next() or dshash_seq_term().
+ */
+void *
+dshash_seq_next(dshash_seq_status *status)
+{
+ dsa_pointer next_item_pointer;
+
+ /*
+ * Not yet holding any partition locks. We need to determine the size of
+ * the hash table, which could have been resized since we last looked.
+ * Since we iterate in partition order, we can start by unconditionally
+ * locking partition 0.
+ *
+ * Once we hold the lock, no resizing can happen until the scan ends. So
+ * we don't need to repeatedly call ensure_valid_bucket_pointers().
+ */
+ if (status->curpartition == -1)
+ {
+ Assert(status->curbucket == 0);
+ ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(status->hash_table);
+
+ status->curpartition = 0;
+
+ LWLockAcquire(PARTITION_LOCK(status->hash_table,
+ status->curpartition),
+ status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
+
+ ensure_valid_bucket_pointers(status->hash_table);
+
+ status->nbuckets =
+ NUM_BUCKETS(status->hash_table->control->size_log2);
+ next_item_pointer = status->hash_table->buckets[status->curbucket];
+ }
+ else
+ next_item_pointer = status->pnextitem;
+
+ Assert(LWLockHeldByMeInMode(PARTITION_LOCK(status->hash_table,
+ status->curpartition),
+ status->exclusive ? LW_EXCLUSIVE : LW_SHARED));
+
+ /* Move to the next bucket if we finished the current bucket */
+ while (!DsaPointerIsValid(next_item_pointer))
+ {
+ int next_partition;
+
+ if (++status->curbucket >= status->nbuckets)
+ {
+ /* all buckets have been scanned. finish. */
+ return NULL;
+ }
+
+ /* Check if we need to move to the next partition */
+ next_partition =
+ PARTITION_FOR_BUCKET_INDEX(status->curbucket,
+ status->hash_table->size_log2);
+
+ if (status->curpartition != next_partition)
+ {
+ /*
+ * Move to the next partition. Lock the next partition and only then
+ * release the current one, so that a concurrent resize cannot slip in
+ * between. Deadlock is avoided by taking the locks in the same order
+ * as resize() does.
+ */
+ LWLockAcquire(PARTITION_LOCK(status->hash_table,
+ next_partition),
+ status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
+ LWLockRelease(PARTITION_LOCK(status->hash_table,
+ status->curpartition));
+ status->curpartition = next_partition;
+ }
+
+ next_item_pointer = status->hash_table->buckets[status->curbucket];
+ }
+
+ status->curitem =
+ dsa_get_address(status->hash_table->area, next_item_pointer);
+
+ /*
+ * The caller may delete the item. Store the next item in case of
+ * deletion.
+ */
+ status->pnextitem = status->curitem->next;
+
+ return ENTRY_FROM_ITEM(status->curitem);
+}
+
+/*
+ * Terminates the seqscan and releases all locks.
+ *
+ * Needs to be called after finishing or when exiting a seqscan.
+ */
+void
+dshash_seq_term(dshash_seq_status *status)
+{
+ if (status->curpartition >= 0)
+ LWLockRelease(PARTITION_LOCK(status->hash_table, status->curpartition));
+}
+
+/*
+ * Remove the current entry of the seq scan.
+ */
+void
+dshash_delete_current(dshash_seq_status *status)
+{
+ dshash_table *hash_table = status->hash_table;
+ dshash_table_item *item = status->curitem;
+ size_t partition PG_USED_FOR_ASSERTS_ONLY;
+
+ partition = PARTITION_FOR_HASH(item->hash);
+
+ Assert(status->exclusive);
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+ Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition),
+ LW_EXCLUSIVE));
+
+ delete_item(hash_table, item);
+}
+
+/*
+ * Print debugging information about the internal state of the hash table to
+ * stderr. The caller must hold no partition locks.
+ */
+void
+dshash_dump(dshash_table *hash_table)
+{
+ size_t i;
+ size_t j;
+
+ Assert(hash_table->control->magic == DSHASH_MAGIC);
+ ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
+
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ {
+ Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
+ LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_SHARED);
+ }
+
+ ensure_valid_bucket_pointers(hash_table);
+
+ fprintf(stderr,
+ "hash table size = %zu\n", (size_t) 1 << hash_table->size_log2);
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ {
+ dshash_partition *partition = &hash_table->control->partitions[i];
+ size_t begin = BUCKET_INDEX_FOR_PARTITION(i, hash_table->size_log2);
+ size_t end = BUCKET_INDEX_FOR_PARTITION(i + 1, hash_table->size_log2);
+
+ fprintf(stderr, " partition %zu\n", i);
+ fprintf(stderr,
+ " active buckets (key count = %zu)\n", partition->count);
+
+ for (j = begin; j < end; ++j)
+ {
+ size_t count = 0;
+ dsa_pointer bucket = hash_table->buckets[j];
+
+ while (DsaPointerIsValid(bucket))
+ {
+ dshash_table_item *item;
+
+ item = dsa_get_address(hash_table->area, bucket);
+
+ bucket = item->next;
+ ++count;
+ }
+ fprintf(stderr, " bucket %zu (key count = %zu)\n", j, count);
+ }
+ }
+
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ LWLockRelease(PARTITION_LOCK(hash_table, i));
+}
+
+/*
+ * Delete a locked item to which we have a pointer.
+ */
+static void
+delete_item(dshash_table *hash_table, dshash_table_item *item)
+{
+ size_t hash = item->hash;
+ size_t partition = PARTITION_FOR_HASH(hash);
+
+ Assert(LWLockHeldByMe(PARTITION_LOCK(hash_table, partition)));
+
+ if (delete_item_from_bucket(hash_table, item,
+ &BUCKET_FOR_HASH(hash_table, hash)))
+ {
+ Assert(hash_table->control->partitions[partition].count > 0);
+ --hash_table->control->partitions[partition].count;
+ }
+ else
+ {
+ Assert(false);
+ }
+}
+
+/*
+ * Grow the hash table if necessary to the requested number of buckets. The
+ * requested size must be double some previously observed size.
+ *
+ * Must be called without any partition lock held.
+ */
+static void
+resize(dshash_table *hash_table, size_t new_size_log2)
+{
+ dsa_pointer old_buckets;
+ dsa_pointer new_buckets_shared;
+ dsa_pointer *new_buckets;
+ size_t size;
+ size_t new_size = ((size_t) 1) << new_size_log2;
+ size_t i;
+
+ /*
+ * Acquire the locks for all lock partitions. This is expensive, but we
+ * shouldn't have to do it many times.
+ */
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ {
+ Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
+
+ LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_EXCLUSIVE);
+ if (i == 0 && hash_table->control->size_log2 >= new_size_log2)
+ {
+ /*
+ * Another backend has already increased the size; we can avoid
+ * obtaining all the locks and return early.
+ */
+ LWLockRelease(PARTITION_LOCK(hash_table, 0));
+ return;
+ }
+ }
+
+ Assert(new_size_log2 == hash_table->control->size_log2 + 1);
+
+ /* Allocate the space for the new table. */
+ new_buckets_shared = dsa_allocate0(hash_table->area,
+ sizeof(dsa_pointer) * new_size);
+ new_buckets = dsa_get_address(hash_table->area, new_buckets_shared);
+
+ /*
+ * We've allocated the new bucket array; all that remains to do now is to
+ * reinsert all items, which amounts to adjusting all the pointers.
+ */
+ size = ((size_t) 1) << hash_table->control->size_log2;
+ for (i = 0; i < size; ++i)
+ {
+ dsa_pointer item_pointer = hash_table->buckets[i];
+
+ while (DsaPointerIsValid(item_pointer))
+ {
+ dshash_table_item *item;
+ dsa_pointer next_item_pointer;
+
+ item = dsa_get_address(hash_table->area, item_pointer);
+ next_item_pointer = item->next;
+ insert_item_into_bucket(hash_table, item_pointer, item,
+ &new_buckets[BUCKET_INDEX_FOR_HASH_AND_SIZE(item->hash,
+ new_size_log2)]);
+ item_pointer = next_item_pointer;
+ }
+ }
+
+ /* Swap the hash table into place and free the old one. */
+ old_buckets = hash_table->control->buckets;
+ hash_table->control->buckets = new_buckets_shared;
+ hash_table->control->size_log2 = new_size_log2;
+ hash_table->buckets = new_buckets;
+ dsa_free(hash_table->area, old_buckets);
+
+ /* Release all the locks. */
+ for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+ LWLockRelease(PARTITION_LOCK(hash_table, i));
+}
+
+/*
+ * Make sure that our backend-local bucket pointers are up to date. The
+ * caller must have locked one lock partition, which prevents resize() from
+ * running concurrently.
+ */
+static inline void
+ensure_valid_bucket_pointers(dshash_table *hash_table)
+{
+ if (hash_table->size_log2 != hash_table->control->size_log2)
+ {
+ hash_table->buckets = dsa_get_address(hash_table->area,
+ hash_table->control->buckets);
+ hash_table->size_log2 = hash_table->control->size_log2;
+ }
+}
+
+/*
+ * Scan a locked bucket for a match, using the provided compare function.
+ */
+static inline dshash_table_item *
+find_in_bucket(dshash_table *hash_table, const void *key,
+ dsa_pointer item_pointer)
+{
+ while (DsaPointerIsValid(item_pointer))
+ {
+ dshash_table_item *item;
+
+ item = dsa_get_address(hash_table->area, item_pointer);
+ if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
+ return item;
+ item_pointer = item->next;
+ }
+ return NULL;
+}
+
+/*
+ * Insert an already-allocated item into a bucket.
+ */
+static void
+insert_item_into_bucket(dshash_table *hash_table,
+ dsa_pointer item_pointer,
+ dshash_table_item *item,
+ dsa_pointer *bucket)
+{
+ Assert(item == dsa_get_address(hash_table->area, item_pointer));
+
+ item->next = *bucket;
+ *bucket = item_pointer;
+}
+
+/*
+ * Allocate space for an entry with the given key and insert it into the
+ * provided bucket.
+ */
+static dshash_table_item *
+insert_into_bucket(dshash_table *hash_table,
+ const void *key,
+ dsa_pointer *bucket)
+{
+ dsa_pointer item_pointer;
+ dshash_table_item *item;
+
+ item_pointer = dsa_allocate(hash_table->area,
+ hash_table->params.entry_size +
+ MAXALIGN(sizeof(dshash_table_item)));
+ item = dsa_get_address(hash_table->area, item_pointer);
+ memcpy(ENTRY_FROM_ITEM(item), key, hash_table->params.key_size);
+ insert_item_into_bucket(hash_table, item_pointer, item, bucket);
+ return item;
+}
+
+/*
+ * Search a bucket for a matching key and delete it.
+ */
+static bool
+delete_key_from_bucket(dshash_table *hash_table,
+ const void *key,
+ dsa_pointer *bucket_head)
+{
+ while (DsaPointerIsValid(*bucket_head))
+ {
+ dshash_table_item *item;
+
+ item = dsa_get_address(hash_table->area, *bucket_head);
+
+ if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
+ {
+ dsa_pointer next;
+
+ next = item->next;
+ dsa_free(hash_table->area, *bucket_head);
+ *bucket_head = next;
+
+ return true;
+ }
+ bucket_head = &item->next;
+ }
+ return false;
+}
+
+/*
+ * Delete the specified item from the bucket.
+ */
+static bool
+delete_item_from_bucket(dshash_table *hash_table,
+ dshash_table_item *item,
+ dsa_pointer *bucket_head)
+{
+ while (DsaPointerIsValid(*bucket_head))
+ {
+ dshash_table_item *bucket_item;
+
+ bucket_item = dsa_get_address(hash_table->area, *bucket_head);
+
+ if (bucket_item == item)
+ {
+ dsa_pointer next;
+
+ next = item->next;
+ dsa_free(hash_table->area, *bucket_head);
+ *bucket_head = next;
+ return true;
+ }
+ bucket_head = &bucket_item->next;
+ }
+ return false;
+}
+
+/*
+ * Compute the hash value for a key.
+ */
+static inline dshash_hash
+hash_key(dshash_table *hash_table, const void *key)
+{
+ return hash_table->params.hash_function(key,
+ hash_table->params.key_size,
+ hash_table->arg);
+}
+
+/*
+ * Check whether two keys compare equal.
+ */
+static inline bool
+equal_keys(dshash_table *hash_table, const void *a, const void *b)
+{
+ return hash_table->params.compare_function(a, b,
+ hash_table->params.key_size,
+ hash_table->arg) == 0;
+}
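
A rough sketch of creating and using a dshash table (not part of the commit). It assumes a dsa_area ('area') that was created or attached elsewhere, uses the memcmp-based helpers defined above, and registers a new LWLock tranche on the fly via LWLockNewTrancheId(); in real backend code the tranche would normally be a registered, named tranche. MyEntry is a hypothetical entry type whose key must be its first member, since only key_size bytes are hashed and compared.

#include "postgres.h"
#include "lib/dshash.h"
#include "storage/lwlock.h"
#include "utils/dsa.h"

typedef struct MyEntry
{
    uint32      key;        /* must come first: only key_size bytes are examined */
    int64       count;
} MyEntry;

static void
dshash_example(dsa_area *area)
{
    dshash_parameters params;
    dshash_table *htab;
    uint32      key = 42;
    bool        found;
    MyEntry    *entry;

    params.key_size = sizeof(uint32);
    params.entry_size = sizeof(MyEntry);
    params.compare_function = dshash_memcmp;
    params.hash_function = dshash_memhash;
    params.tranche_id = LWLockNewTrancheId();

    htab = dshash_create(area, &params, NULL);

    /* Insert-or-find; the returned entry is exclusively locked. */
    entry = dshash_find_or_insert(htab, &key, &found);
    if (!found)
        entry->count = 0;
    entry->count++;
    dshash_release_lock(htab, entry);

    /* Other backends can attach using dshash_get_hash_table_handle(htab). */
}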
diff --git a/src/backend/lib/hyperloglog.c b/src/backend/lib/hyperloglog.c
new file mode 100644
index 0000000..06b1e28
--- /dev/null
+++ b/src/backend/lib/hyperloglog.c
@@ -0,0 +1,255 @@
+/*-------------------------------------------------------------------------
+ *
+ * hyperloglog.c
+ * HyperLogLog cardinality estimator
+ *
+ * Portions Copyright (c) 2014-2022, PostgreSQL Global Development Group
+ *
+ * Based on Hideaki Ohno's C++ implementation. This is probably not ideally
+ * suited to estimating the cardinality of very large sets; in particular, we
+ * have not attempted to further optimize the implementation as described in
+ * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
+ * Engineering of a State of The Art Cardinality Estimation Algorithm".
+ *
+ * A sparse representation of HyperLogLog state is used, with fixed space
+ * overhead.
+ *
+ * The copyright terms of Ohno's original version (the MIT license) follow.
+ *
+ * IDENTIFICATION
+ * src/backend/lib/hyperloglog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the 'Software'), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+
+#include "lib/hyperloglog.h"
+#include "port/pg_bitutils.h"
+
+#define POW_2_32 (4294967296.0)
+#define NEG_POW_2_32 (-4294967296.0)
+
+static inline uint8 rho(uint32 x, uint8 b);
+
+/*
+ * Initialize HyperLogLog track state, by bit width
+ *
+ * bwidth is the bit width (so there will be 2 to the power of bwidth registers).
+ * Must be between 4 and 16 inclusive.
+ */
+void
+initHyperLogLog(hyperLogLogState *cState, uint8 bwidth)
+{
+ double alpha;
+
+ if (bwidth < 4 || bwidth > 16)
+ elog(ERROR, "bit width must be between 4 and 16 inclusive");
+
+ cState->registerWidth = bwidth;
+ cState->nRegisters = (Size) 1 << bwidth;
+ cState->arrSize = sizeof(uint8) * cState->nRegisters + 1;
+
+ /*
+ * Initialize hashes array to zero, not negative infinity, per discussion
+ * of the coupon collector problem in the HyperLogLog paper
+ */
+ cState->hashesArr = palloc0(cState->arrSize);
+
+ /*
+ * "alpha" is a value that for each possible number of registers (m) is
+ * used to correct a systematic multiplicative bias present in m ^ 2 Z (Z
+ * is "the indicator function" through which we finally compute E,
+ * estimated cardinality).
+ */
+ switch (cState->nRegisters)
+ {
+ case 16:
+ alpha = 0.673;
+ break;
+ case 32:
+ alpha = 0.697;
+ break;
+ case 64:
+ alpha = 0.709;
+ break;
+ default:
+ alpha = 0.7213 / (1.0 + 1.079 / cState->nRegisters);
+ }
+
+ /*
+ * Precalculate alpha m ^ 2, later used to generate "raw" HyperLogLog
+ * estimate E
+ */
+ cState->alphaMM = alpha * cState->nRegisters * cState->nRegisters;
+}
+
+/*
+ * Initialize HyperLogLog track state, by error rate
+ *
+ * Instead of specifying bwidth (number of bits used for addressing the
+ * register), this method allows sizing the counter for particular error
+ * rate using a simple formula from the paper:
+ *
+ * e = 1.04 / sqrt(m)
+ *
+ * where 'm' is the number of registers, i.e. (2^bwidth). The method
+ * finds the lowest bwidth with 'e' below the requested error rate, and
+ * then uses it to initialize the counter.
+ *
+ * As bwidth has to be between 4 and 16, the worst possible error rate
+ * is between ~25% (bwidth=4) and 0.4% (bwidth=16).
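+ *
+ * For example (illustrative): a requested error rate of 0.01 needs
+ * m >= (1.04 / 0.01)^2 = 10816 registers, so initHyperLogLogError() settles
+ * on bwidth = 14 (m = 16384, e ~= 0.81%).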
+ */
+void
+initHyperLogLogError(hyperLogLogState *cState, double error)
+{
+ uint8 bwidth = 4;
+
+ while (bwidth < 16)
+ {
+ double m = (Size) 1 << bwidth;
+
+ if (1.04 / sqrt(m) < error)
+ break;
+ bwidth++;
+ }
+
+ initHyperLogLog(cState, bwidth);
+}
+
+/*
+ * Free HyperLogLog track state
+ *
+ * Releases allocated resources, but not the state itself (in case it's not
+ * allocated by palloc).
+ */
+void
+freeHyperLogLog(hyperLogLogState *cState)
+{
+ Assert(cState->hashesArr != NULL);
+ pfree(cState->hashesArr);
+}
+
+/*
+ * Adds element to the estimator, from caller-supplied hash.
+ *
+ * It is critical that the hash value passed be an actual hash value, typically
+ * generated using hash_any(). The algorithm relies on a specific bit-pattern
+ * observable in conjunction with stochastic averaging. There must be a
+ * uniform distribution of bits in hash values for each distinct original value
+ * observed.
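+ *
+ * Illustrative example: with registerWidth = 4 and hash = 0x30000001, the
+ * top 4 bits give index = 3; the remaining bits, shifted up, are 0x00000010,
+ * whose first set bit (counting from the most significant bit) is at
+ * position 28, so register 3 is set to 28 if that exceeds its current value.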
+ */
+void
+addHyperLogLog(hyperLogLogState *cState, uint32 hash)
+{
+ uint8 count;
+ uint32 index;
+
+ /* Use the first "k" (registerWidth) bits as a zero based index */
+ index = hash >> (BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
+
+ /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
+ count = rho(hash << cState->registerWidth,
+ BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
+
+ cState->hashesArr[index] = Max(count, cState->hashesArr[index]);
+}
+
+/*
+ * Estimates cardinality, based on elements added so far
+ */
+double
+estimateHyperLogLog(hyperLogLogState *cState)
+{
+ double result;
+ double sum = 0.0;
+ int i;
+
+ for (i = 0; i < cState->nRegisters; i++)
+ {
+ sum += 1.0 / pow(2.0, cState->hashesArr[i]);
+ }
+
+ /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
+ result = cState->alphaMM / sum;
+
+ if (result <= (5.0 / 2.0) * cState->nRegisters)
+ {
+ /* Small range correction */
+ int zero_count = 0;
+
+ for (i = 0; i < cState->nRegisters; i++)
+ {
+ if (cState->hashesArr[i] == 0)
+ zero_count++;
+ }
+
+ if (zero_count != 0)
+ result = cState->nRegisters * log((double) cState->nRegisters /
+ zero_count);
+ }
+ else if (result > (1.0 / 30.0) * POW_2_32)
+ {
+ /* Large range correction */
+ result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32));
+ }
+
+ return result;
+}
+
+/*
+ * Worker for addHyperLogLog().
+ *
+ * Calculates the position of the first set bit within the first b bits of
+ * x, counting positions from 1 starting at the most significant bit.
+ *
+ * Example (when considering first 10 bits of x):
+ *
+ * rho(x = 0b1000000000) returns 1
+ * rho(x = 0b0010000000) returns 3
+ * rho(x = 0b0000000000) returns b + 1
+ *
+ * "The binary address determined by the first b bits of x"
+ *
+ * Return value "j" used to index bit pattern to watch.
+ */
+static inline uint8
+rho(uint32 x, uint8 b)
+{
+ uint8 j = 1;
+
+ if (x == 0)
+ return b + 1;
+
+ j = 32 - pg_leftmost_one_pos32(x);
+
+ if (j > b)
+ return b + 1;
+
+ return j;
+}
diff --git a/src/backend/lib/ilist.c b/src/backend/lib/ilist.c
new file mode 100644
index 0000000..29ef216
--- /dev/null
+++ b/src/backend/lib/ilist.c
@@ -0,0 +1,111 @@
+/*-------------------------------------------------------------------------
+ *
+ * ilist.c
+ * support for integrated/inline doubly- and singly- linked lists
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/lib/ilist.c
+ *
+ * NOTES
+ * This file only contains functions that are too big to be considered
+ * for inlining. See ilist.h for most of the goodies.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "lib/ilist.h"
+
+/*
+ * Delete 'node' from list.
+ *
+ * It is not allowed to delete a 'node' which is not in the list 'head'
+ *
+ * Caution: this is O(n); consider using slist_delete_current() instead.
+ */
+void
+slist_delete(slist_head *head, slist_node *node)
+{
+ slist_node *last = &head->head;
+ slist_node *cur;
+ bool found PG_USED_FOR_ASSERTS_ONLY = false;
+
+ while ((cur = last->next) != NULL)
+ {
+ if (cur == node)
+ {
+ last->next = cur->next;
+#ifdef USE_ASSERT_CHECKING
+ found = true;
+#endif
+ break;
+ }
+ last = cur;
+ }
+ Assert(found);
+
+ slist_check(head);
+}
+
+#ifdef ILIST_DEBUG
+/*
+ * Verify integrity of a doubly linked list
+ */
+void
+dlist_check(dlist_head *head)
+{
+ dlist_node *cur;
+
+ if (head == NULL)
+ elog(ERROR, "doubly linked list head address is NULL");
+
+ if (head->head.next == NULL && head->head.prev == NULL)
+ return; /* OK, initialized as zeroes */
+
+ /* iterate in forward direction */
+ for (cur = head->head.next; cur != &head->head; cur = cur->next)
+ {
+ if (cur == NULL ||
+ cur->next == NULL ||
+ cur->prev == NULL ||
+ cur->prev->next != cur ||
+ cur->next->prev != cur)
+ elog(ERROR, "doubly linked list is corrupted");
+ }
+
+ /* iterate in backward direction */
+ for (cur = head->head.prev; cur != &head->head; cur = cur->prev)
+ {
+ if (cur == NULL ||
+ cur->next == NULL ||
+ cur->prev == NULL ||
+ cur->prev->next != cur ||
+ cur->next->prev != cur)
+ elog(ERROR, "doubly linked list is corrupted");
+ }
+}
+
+/*
+ * Verify integrity of a singly linked list
+ */
+void
+slist_check(slist_head *head)
+{
+ slist_node *cur;
+
+ if (head == NULL)
+ elog(ERROR, "singly linked list head address is NULL");
+
+ /*
+ * there isn't much we can test in a singly linked list except that it
+ * actually ends sometime, i.e. hasn't introduced a cycle or similar
+ */
+ for (cur = head->head.next; cur != NULL; cur = cur->next)
+ ;
+}
+
+#endif /* ILIST_DEBUG */
diff --git a/src/backend/lib/integerset.c b/src/backend/lib/integerset.c
new file mode 100644
index 0000000..5aff292
--- /dev/null
+++ b/src/backend/lib/integerset.c
@@ -0,0 +1,1045 @@
+/*-------------------------------------------------------------------------
+ *
+ * integerset.c
+ * Data structure to hold a large set of 64-bit integers efficiently
+ *
+ * IntegerSet provides an in-memory data structure to hold a set of
+ * arbitrary 64-bit integers. Internally, the values are stored in a
+ * B-tree, with a special packed representation at the leaf level using
+ * the Simple-8b algorithm, which can pack clusters of nearby values
+ * very tightly.
+ *
+ * Memory consumption depends on the number of values stored, but also
+ * on how far the values are from each other. In the best case, with
+ * long runs of consecutive integers, memory consumption can be as low as
+ * 0.1 bytes per integer. In the worst case, if integers are more than
+ * 2^32 apart, it uses about 8 bytes per integer. In typical use, the
+ * consumption per integer is somewhere between those extremes, depending
+ * on the range of integers stored, and how "clustered" they are.
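+ *
+ * (For instance, a long run of consecutive integers packs 241 values into
+ * each 16-byte leaf item, which works out to roughly 0.07 bytes per
+ * integer before node overhead.)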
+ *
+ *
+ * Interface
+ * ---------
+ *
+ * intset_create - Create a new, empty set
+ * intset_add_member - Add an integer to the set
+ * intset_is_member - Test if an integer is in the set
+ * intset_begin_iterate - Begin iterating through all integers in set
+ * intset_iterate_next - Return next set member, if any
+ *
+ * intset_create() creates the set in the current memory context. Subsequent
+ * operations that add to the data structure will continue to allocate from
+ * that same context, even if it's not current anymore.
+ *
+ * Note that there is no function to free an integer set. If you need to do
+ * that, create a dedicated memory context to hold it, and destroy the memory
+ * context instead.
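+ *
+ * A minimal usage sketch (illustrative; values must be added in ascending
+ * order, and 'val' is a caller-local uint64):
+ *
+ *		IntegerSet *set = intset_create();
+ *		uint64		val;
+ *
+ *		intset_add_member(set, 10);
+ *		intset_add_member(set, 20);
+ *
+ *		if (intset_is_member(set, 20))
+ *			...
+ *
+ *		intset_begin_iterate(set);
+ *		while (intset_iterate_next(set, &val))
+ *			...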
+ *
+ *
+ * Limitations
+ * -----------
+ *
+ * - Values must be added in order. (Random insertions would require
+ * splitting nodes, which hasn't been implemented.)
+ *
+ * - Values cannot be added while iteration is in progress.
+ *
+ * - No support for removing values.
+ *
+ * None of these limitations are fundamental to the data structure, so they
+ * could be lifted if needed, by writing some new code. But the current
+ * users of this facility don't need them.
+ *
+ *
+ * References
+ * ----------
+ *
+ * Simple-8b encoding is based on:
+ *
+ * Vo Ngoc Anh, Alistair Moffat, Index compression using 64-bit words,
+ * Software - Practice & Experience, v.40 n.2, p.131-147, February 2010
+ * (https://doi.org/10.1002/spe.948)
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/lib/integerset.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "lib/integerset.h"
+#include "port/pg_bitutils.h"
+#include "utils/memutils.h"
+
+
+/*
+ * Maximum number of integers that can be encoded in a single Simple-8b
+ * codeword. (Defined here before anything else, so that we can size arrays
+ * using this.)
+ */
+#define SIMPLE8B_MAX_VALUES_PER_CODEWORD 240
+
+/*
+ * Parameters for shape of the in-memory B-tree.
+ *
+ * These set the size of each internal and leaf node. They don't necessarily
+ * need to be the same, because the tree is just an in-memory structure.
+ * With the default 64, each node is about 1 kb.
+ *
+ * If you change these, you must recalculate MAX_TREE_LEVELS, too!
+ */
+#define MAX_INTERNAL_ITEMS 64
+#define MAX_LEAF_ITEMS 64
+
+/*
+ * Maximum height of the tree.
+ *
+ * MAX_TREE_LEVELS is calculated from the "fan-out" of the B-tree. The
+ * theoretical maximum number of items that we can store in a set is 2^64,
+ * so MAX_TREE_LEVELS should be set so that:
+ *
+ * MAX_LEAF_ITEMS * MAX_INTERNAL_ITEMS ^ (MAX_TREE_LEVELS - 1) >= 2^64.
+ *
+ * In practice, we'll need far fewer levels, because you will run out of
+ * memory long before reaching that number, but let's be conservative.
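+ *
+ * With the defaults above (64 items per node), MAX_TREE_LEVELS = 11 is
+ * sufficient, because 64 * 64^10 = 2^66 >= 2^64.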
+ */
+#define MAX_TREE_LEVELS 11
+
+/*
+ * Node structures, for the in-memory B-tree.
+ *
+ * An internal node holds a number of downlink pointers to leaf nodes, or
+ * to internal nodes on a lower level. For each downlink, the key value
+ * corresponding to the lower level node is stored in a sorted array. The
+ * stored key values are low keys. In other words, if the downlink has value
+ * X, then all items stored on that child are >= X.
+ *
+ * Each leaf node holds a number of "items", with a varying number of
+ * integers packed into each item. Each item consists of two 64-bit words:
+ * The first word holds the first integer stored in the item, in plain format.
+ * The second word contains between 0 and 240 more integers, packed using
+ * Simple-8b encoding. By storing the first integer in plain, unpacked,
+ * format, we can use binary search to quickly find an item that holds (or
+ * would hold) a particular integer. And by storing the rest in packed form,
+ * we still get pretty good memory density, if there are clusters of integers
+ * with similar values.
+ *
+ * Each leaf node also has a pointer to the next leaf node, so that the leaf
+ * nodes can be easily walked from beginning to end when iterating.
+ */
+typedef struct intset_node intset_node;
+typedef struct intset_leaf_node intset_leaf_node;
+typedef struct intset_internal_node intset_internal_node;
+
+/* Common structure of both leaf and internal nodes. */
+struct intset_node
+{
+ uint16 level; /* tree level of this node */
+ uint16 num_items; /* number of items in this node */
+};
+
+/* Internal node */
+struct intset_internal_node
+{
+ /* common header, must match intset_node */
+ uint16 level; /* >= 1 on internal nodes */
+ uint16 num_items;
+
+ /*
+ * 'values' is an array of key values, and 'downlinks' are pointers to
+ * lower-level nodes, corresponding to the key values.
+ */
+ uint64 values[MAX_INTERNAL_ITEMS];
+ intset_node *downlinks[MAX_INTERNAL_ITEMS];
+};
+
+/* Leaf node */
+typedef struct
+{
+ uint64 first; /* first integer in this item */
+ uint64 codeword; /* simple8b encoded differences from 'first' */
+} leaf_item;
+
+#define MAX_VALUES_PER_LEAF_ITEM (1 + SIMPLE8B_MAX_VALUES_PER_CODEWORD)
+
+struct intset_leaf_node
+{
+ /* common header, must match intset_node */
+ uint16 level; /* 0 on leafs */
+ uint16 num_items;
+
+ intset_leaf_node *next; /* right sibling, if any */
+
+ leaf_item items[MAX_LEAF_ITEMS];
+};
+
+/*
+ * We buffer insertions in a simple array, before packing and inserting them
+ * into the B-tree. MAX_BUFFERED_VALUES sets the size of the buffer. The
+ * encoder assumes that it is large enough that we can always fill a leaf
+ * item with buffered new items. In other words, MAX_BUFFERED_VALUES must be
+ * larger than MAX_VALUES_PER_LEAF_ITEM. For efficiency, make it much larger.
+ */
+#define MAX_BUFFERED_VALUES (MAX_VALUES_PER_LEAF_ITEM * 2)
+
+/*
+ * IntegerSet is the top-level object representing the set.
+ *
+ * The integers are stored in an in-memory B-tree structure, plus an array
+ * for newly-added integers. IntegerSet also tracks information about memory
+ * usage, as well as the current position when iterating the set with
+ * intset_begin_iterate / intset_iterate_next.
+ */
+struct IntegerSet
+{
+ /*
+ * 'context' is the memory context holding this integer set and all its
+ * tree nodes.
+ *
+ * 'mem_used' tracks the amount of memory used. We don't do anything with
+ * it in integerset.c itself, but the callers can ask for it with
+ * intset_memory_usage().
+ */
+ MemoryContext context;
+ uint64 mem_used;
+
+ uint64 num_entries; /* total # of values in the set */
+ uint64 highest_value; /* highest value stored in this set */
+
+ /*
+ * B-tree to hold the packed values.
+ *
+ * 'rightmost_nodes' holds pointers to the rightmost node on each level:
+ * rightmost_nodes[0] is the rightmost leaf, rightmost_nodes[1] is its
+ * parent, and so forth, all the way up to the root. These are needed when
+ * adding new values. (Currently, we require that new values are added at
+ * the end.)
+ */
+ int num_levels; /* height of the tree */
+ intset_node *root; /* root node */
+ intset_node *rightmost_nodes[MAX_TREE_LEVELS];
+ intset_leaf_node *leftmost_leaf; /* leftmost leaf node */
+
+ /*
+ * Holding area for new items that haven't been inserted to the tree yet.
+ */
+ uint64 buffered_values[MAX_BUFFERED_VALUES];
+ int num_buffered_values;
+
+ /*
+ * Iterator support.
+ *
+ * 'iter_values' is an array of integers ready to be returned to the
+ * caller; 'iter_num_values' is the length of that array, and
+ * 'iter_valueno' is the next index. 'iter_node' and 'iter_itemno' point
+ * to the leaf node, and item within the leaf node, to get the next batch
+ * of values from.
+ *
+ * Normally, 'iter_values' points to 'iter_values_buf', which holds items
+ * decoded from a leaf item. But after we have scanned the whole B-tree,
+ * we iterate through all the buffered values, too, by pointing
+ * iter_values to 'buffered_values'.
+ */
+ bool iter_active; /* is iteration in progress? */
+
+ const uint64 *iter_values;
+ int iter_num_values; /* number of elements in 'iter_values' */
+ int iter_valueno; /* next index into 'iter_values' */
+
+ intset_leaf_node *iter_node; /* current leaf node */
+ int iter_itemno; /* next item in 'iter_node' to decode */
+
+ uint64 iter_values_buf[MAX_VALUES_PER_LEAF_ITEM];
+};
+
+/*
+ * Prototypes for internal functions.
+ */
+static void intset_update_upper(IntegerSet *intset, int level,
+ intset_node *child, uint64 child_key);
+static void intset_flush_buffered_values(IntegerSet *intset);
+
+static int intset_binsrch_uint64(uint64 value, uint64 *arr, int arr_elems,
+ bool nextkey);
+static int intset_binsrch_leaf(uint64 value, leaf_item *arr, int arr_elems,
+ bool nextkey);
+
+static uint64 simple8b_encode(const uint64 *ints, int *num_encoded, uint64 base);
+static int simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base);
+static bool simple8b_contains(uint64 codeword, uint64 key, uint64 base);
+
+
+/*
+ * Create a new, initially empty, integer set.
+ *
+ * The integer set is created in the current memory context.
+ * We will do all subsequent allocations in the same context, too, regardless
+ * of which memory context is current when new integers are added to the set.
+ */
+IntegerSet *
+intset_create(void)
+{
+ IntegerSet *intset;
+
+ intset = (IntegerSet *) palloc(sizeof(IntegerSet));
+ intset->context = CurrentMemoryContext;
+ intset->mem_used = GetMemoryChunkSpace(intset);
+
+ intset->num_entries = 0;
+ intset->highest_value = 0;
+
+ intset->num_levels = 0;
+ intset->root = NULL;
+ memset(intset->rightmost_nodes, 0, sizeof(intset->rightmost_nodes));
+ intset->leftmost_leaf = NULL;
+
+ intset->num_buffered_values = 0;
+
+ intset->iter_active = false;
+ intset->iter_node = NULL;
+ intset->iter_itemno = 0;
+ intset->iter_valueno = 0;
+ intset->iter_num_values = 0;
+ intset->iter_values = NULL;
+
+ return intset;
+}
+
+/*
+ * Allocate a new node.
+ */
+static intset_internal_node *
+intset_new_internal_node(IntegerSet *intset)
+{
+ intset_internal_node *n;
+
+ n = (intset_internal_node *) MemoryContextAlloc(intset->context,
+ sizeof(intset_internal_node));
+ intset->mem_used += GetMemoryChunkSpace(n);
+
+ n->level = 0; /* caller must set */
+ n->num_items = 0;
+
+ return n;
+}
+
+static intset_leaf_node *
+intset_new_leaf_node(IntegerSet *intset)
+{
+ intset_leaf_node *n;
+
+ n = (intset_leaf_node *) MemoryContextAlloc(intset->context,
+ sizeof(intset_leaf_node));
+ intset->mem_used += GetMemoryChunkSpace(n);
+
+ n->level = 0;
+ n->num_items = 0;
+ n->next = NULL;
+
+ return n;
+}
+
+/*
+ * Return the number of entries in the integer set.
+ */
+uint64
+intset_num_entries(IntegerSet *intset)
+{
+ return intset->num_entries;
+}
+
+/*
+ * Return the amount of memory used by the integer set.
+ */
+uint64
+intset_memory_usage(IntegerSet *intset)
+{
+ return intset->mem_used;
+}
+
+/*
+ * Add a value to the set.
+ *
+ * Values must be added in order.
+ */
+void
+intset_add_member(IntegerSet *intset, uint64 x)
+{
+ if (intset->iter_active)
+ elog(ERROR, "cannot add new values to integer set while iteration is in progress");
+
+ if (x <= intset->highest_value && intset->num_entries > 0)
+ elog(ERROR, "cannot add value to integer set out of order");
+
+ if (intset->num_buffered_values >= MAX_BUFFERED_VALUES)
+ {
+ /* Time to flush our buffer */
+ intset_flush_buffered_values(intset);
+ Assert(intset->num_buffered_values < MAX_BUFFERED_VALUES);
+ }
+
+ /* Add it to the buffer of newly-added values */
+ intset->buffered_values[intset->num_buffered_values] = x;
+ intset->num_buffered_values++;
+ intset->num_entries++;
+ intset->highest_value = x;
+}
+
+/*
+ * Take a batch of buffered values, and pack them into the B-tree.
+ */
+static void
+intset_flush_buffered_values(IntegerSet *intset)
+{
+ uint64 *values = intset->buffered_values;
+ uint64 num_values = intset->num_buffered_values;
+ int num_packed = 0;
+ intset_leaf_node *leaf;
+
+ leaf = (intset_leaf_node *) intset->rightmost_nodes[0];
+
+ /*
+ * If the tree is completely empty, create the first leaf page, which is
+ * also the root.
+ */
+ if (leaf == NULL)
+ {
+ /*
+ * This is the very first item in the set.
+ *
+ * Allocate root node. It's also a leaf.
+ */
+ leaf = intset_new_leaf_node(intset);
+
+ intset->root = (intset_node *) leaf;
+ intset->leftmost_leaf = leaf;
+ intset->rightmost_nodes[0] = (intset_node *) leaf;
+ intset->num_levels = 1;
+ }
+
+ /*
+ * If there are fewer than MAX_VALUES_PER_LEAF_ITEM values in the buffer,
+ * stop. In most cases, we cannot encode that many values in a single
+ * leaf item, but this way, the encoder doesn't have to worry about running
+ * out of input.
+ */
+ while (num_values - num_packed >= MAX_VALUES_PER_LEAF_ITEM)
+ {
+ leaf_item item;
+ int num_encoded;
+
+ /*
+ * Construct the next leaf item, packing as many buffered values as
+ * possible.
+ */
+ item.first = values[num_packed];
+ item.codeword = simple8b_encode(&values[num_packed + 1],
+ &num_encoded,
+ item.first);
+
+ /*
+ * Add the item to the node, allocating a new node if the old one is
+ * full.
+ */
+ if (leaf->num_items >= MAX_LEAF_ITEMS)
+ {
+ /* Allocate new leaf and link it to the tree */
+ intset_leaf_node *old_leaf = leaf;
+
+ leaf = intset_new_leaf_node(intset);
+ old_leaf->next = leaf;
+ intset->rightmost_nodes[0] = (intset_node *) leaf;
+ intset_update_upper(intset, 1, (intset_node *) leaf, item.first);
+ }
+ leaf->items[leaf->num_items++] = item;
+
+ num_packed += 1 + num_encoded;
+ }
+
+ /*
+ * Move any remaining buffered values to the beginning of the array.
+ */
+ if (num_packed < intset->num_buffered_values)
+ {
+ memmove(&intset->buffered_values[0],
+ &intset->buffered_values[num_packed],
+ (intset->num_buffered_values - num_packed) * sizeof(uint64));
+ }
+ intset->num_buffered_values -= num_packed;
+}
+
+/*
+ * Insert a downlink into parent node, after creating a new node.
+ *
+ * Recurses if the parent node is full, too.
+ */
+static void
+intset_update_upper(IntegerSet *intset, int level, intset_node *child,
+ uint64 child_key)
+{
+ intset_internal_node *parent;
+
+ Assert(level > 0);
+
+ /*
+ * Create a new root node, if necessary.
+ */
+ if (level >= intset->num_levels)
+ {
+ intset_node *oldroot = intset->root;
+ uint64 downlink_key;
+
+ /* MAX_TREE_LEVELS should be more than enough, this shouldn't happen */
+ if (intset->num_levels == MAX_TREE_LEVELS)
+ elog(ERROR, "could not expand integer set, maximum number of levels reached");
+ intset->num_levels++;
+
+ /*
+ * Get the first value on the old root page, to be used as the
+ * downlink.
+ */
+ if (intset->root->level == 0)
+ downlink_key = ((intset_leaf_node *) oldroot)->items[0].first;
+ else
+ downlink_key = ((intset_internal_node *) oldroot)->values[0];
+
+ parent = intset_new_internal_node(intset);
+ parent->level = level;
+ parent->values[0] = downlink_key;
+ parent->downlinks[0] = oldroot;
+ parent->num_items = 1;
+
+ intset->root = (intset_node *) parent;
+ intset->rightmost_nodes[level] = (intset_node *) parent;
+ }
+
+ /*
+ * Place the downlink on the parent page.
+ */
+ parent = (intset_internal_node *) intset->rightmost_nodes[level];
+
+ if (parent->num_items < MAX_INTERNAL_ITEMS)
+ {
+ parent->values[parent->num_items] = child_key;
+ parent->downlinks[parent->num_items] = child;
+ parent->num_items++;
+ }
+ else
+ {
+ /*
+ * Doesn't fit. Allocate a new parent, with the downlink as its first
+ * item, and recursively insert a downlink to the new parent into the
+ * grandparent.
+ */
+ parent = intset_new_internal_node(intset);
+ parent->level = level;
+ parent->values[0] = child_key;
+ parent->downlinks[0] = child;
+ parent->num_items = 1;
+
+ intset->rightmost_nodes[level] = (intset_node *) parent;
+
+ intset_update_upper(intset, level + 1, (intset_node *) parent, child_key);
+ }
+}
+
+/*
+ * Does the set contain the given value?
+ */
+bool
+intset_is_member(IntegerSet *intset, uint64 x)
+{
+ intset_node *node;
+ intset_leaf_node *leaf;
+ int level;
+ int itemno;
+ leaf_item *item;
+
+ /*
+ * The value might be in the buffer of newly-added values.
+ */
+ if (intset->num_buffered_values > 0 && x >= intset->buffered_values[0])
+ {
+ int itemno;
+
+ itemno = intset_binsrch_uint64(x,
+ intset->buffered_values,
+ intset->num_buffered_values,
+ false);
+ if (itemno >= intset->num_buffered_values)
+ return false;
+ else
+ return (intset->buffered_values[itemno] == x);
+ }
+
+ /*
+ * Start from the root, and walk down the B-tree to find the right leaf
+ * node.
+ */
+ if (!intset->root)
+ return false;
+ node = intset->root;
+ for (level = intset->num_levels - 1; level > 0; level--)
+ {
+ intset_internal_node *n = (intset_internal_node *) node;
+
+ Assert(node->level == level);
+
+ itemno = intset_binsrch_uint64(x, n->values, n->num_items, true);
+ if (itemno == 0)
+ return false;
+ node = n->downlinks[itemno - 1];
+ }
+ Assert(node->level == 0);
+ leaf = (intset_leaf_node *) node;
+
+ /*
+ * Binary search to find the right item on the leaf page
+ */
+ itemno = intset_binsrch_leaf(x, leaf->items, leaf->num_items, true);
+ if (itemno == 0)
+ return false;
+ item = &leaf->items[itemno - 1];
+
+ /* Is this a match to the first value on the item? */
+ if (item->first == x)
+ return true;
+ Assert(x > item->first);
+
+ /* Is it in the packed codeword? */
+ if (simple8b_contains(item->codeword, x, item->first))
+ return true;
+
+ return false;
+}
+
+/*
+ * Begin in-order scan through all the values.
+ *
+ * While the iteration is in-progress, you cannot add new values to the set.
+ */
+void
+intset_begin_iterate(IntegerSet *intset)
+{
+ /* Note that we allow an iteration to be abandoned midway */
+ intset->iter_active = true;
+ intset->iter_node = intset->leftmost_leaf;
+ intset->iter_itemno = 0;
+ intset->iter_valueno = 0;
+ intset->iter_num_values = 0;
+ intset->iter_values = intset->iter_values_buf;
+}
+
+/*
+ * Returns the next integer, when iterating.
+ *
+ * intset_begin_iterate() must be called first. intset_iterate_next() returns
+ * the next value in the set. Returns true, if there was another value, and
+ * stores the value in *next. Otherwise, returns false.
+ */
+bool
+intset_iterate_next(IntegerSet *intset, uint64 *next)
+{
+ Assert(intset->iter_active);
+ for (;;)
+ {
+ /* Return next iter_values[] entry if any */
+ if (intset->iter_valueno < intset->iter_num_values)
+ {
+ *next = intset->iter_values[intset->iter_valueno++];
+ return true;
+ }
+
+ /* Decode next item in current leaf node, if any */
+ if (intset->iter_node &&
+ intset->iter_itemno < intset->iter_node->num_items)
+ {
+ leaf_item *item;
+ int num_decoded;
+
+ item = &intset->iter_node->items[intset->iter_itemno++];
+
+ intset->iter_values_buf[0] = item->first;
+ num_decoded = simple8b_decode(item->codeword,
+ &intset->iter_values_buf[1],
+ item->first);
+ intset->iter_num_values = num_decoded + 1;
+ intset->iter_valueno = 0;
+ continue;
+ }
+
+ /* No more items on this leaf, step to next node */
+ if (intset->iter_node)
+ {
+ intset->iter_node = intset->iter_node->next;
+ intset->iter_itemno = 0;
+ continue;
+ }
+
+ /*
+ * We have reached the end of the B-tree. But we might still have
+ * some integers in the buffer of newly-added values.
+ */
+ if (intset->iter_values == (const uint64 *) intset->iter_values_buf)
+ {
+ intset->iter_values = intset->buffered_values;
+ intset->iter_num_values = intset->num_buffered_values;
+ intset->iter_valueno = 0;
+ continue;
+ }
+
+ break;
+ }
+
+ /* No more results. */
+ intset->iter_active = false;
+ *next = 0; /* prevent uninitialized-variable warnings */
+ return false;
+}
+
+/*
+ * intset_binsrch_uint64() -- search a sorted array of uint64s
+ *
+ * Returns the first position with key equal to or greater than the given key.
+ * The returned position would be the "insert" location for the given key,
+ * that is, the position where the new key should be inserted to.
+ *
+ * 'nextkey' affects the behavior on equal keys. If true, and there is an
+ * equal key in the array, this returns the position immediately after the
+ * equal key. If false, this returns the position of the equal key itself.
+ */
+static int
+intset_binsrch_uint64(uint64 item, uint64 *arr, int arr_elems, bool nextkey)
+{
+ int low,
+ high,
+ mid;
+
+ low = 0;
+ high = arr_elems;
+ while (high > low)
+ {
+ mid = low + (high - low) / 2;
+
+ if (nextkey)
+ {
+ if (item >= arr[mid])
+ low = mid + 1;
+ else
+ high = mid;
+ }
+ else
+ {
+ if (item > arr[mid])
+ low = mid + 1;
+ else
+ high = mid;
+ }
+ }
+
+ return low;
+}
+
+/* same, but for an array of leaf items */
+static int
+intset_binsrch_leaf(uint64 item, leaf_item *arr, int arr_elems, bool nextkey)
+{
+ int low,
+ high,
+ mid;
+
+ low = 0;
+ high = arr_elems;
+ while (high > low)
+ {
+ mid = low + (high - low) / 2;
+
+ if (nextkey)
+ {
+ if (item >= arr[mid].first)
+ low = mid + 1;
+ else
+ high = mid;
+ }
+ else
+ {
+ if (item > arr[mid].first)
+ low = mid + 1;
+ else
+ high = mid;
+ }
+ }
+
+ return low;
+}
+
+/*
+ * Simple-8b encoding.
+ *
+ * The simple-8b algorithm packs between 1 and 240 integers into 64-bit words,
+ * called "codewords". The number of integers packed into a single codeword
+ * depends on the integers being packed; small integers are encoded using
+ * fewer bits than large integers. A single codeword can store a single
+ * 60-bit integer, or two 30-bit integers, for example.
+ *
+ * Since we're storing a unique, sorted, set of integers, we actually encode
+ * the *differences* between consecutive integers. That way, clusters of
+ * integers that are close to each other are packed efficiently, regardless
+ * of their absolute values.
+ *
+ * In Simple-8b, each codeword consists of a 4-bit selector, which indicates
+ * how many integers are encoded in the codeword, and the encoded integers are
+ * packed into the remaining 60 bits. The selector allows for 16 different
+ * ways of using the remaining 60 bits, called "modes". The number of integers
+ * packed into a single codeword in each mode is listed in the simple8b_modes
+ * table below. For example, consider the following codeword:
+ *
+ * 20-bit integer 20-bit integer 20-bit integer
+ * 1101 00000000000000010010 01111010000100100000 00000000000000010100
+ * ^
+ * selector
+ *
+ * The selector 1101 is 13 in decimal. From the modes table below, we see
+ * that it means that the codeword encodes three 20-bit integers. In decimal,
+ * those integers are 18, 500000 and 20. Because we encode deltas rather than
+ * absolute values, the actual values that they represent are 18, 500018 and
+ * 500038.
+ *
+ * Modes 0 and 1 are a bit special; they encode a run of 240 or 120 zeroes
+ * (which means 240 or 120 consecutive integers, since we're encoding the
+ * deltas between integers), without using the rest of the codeword bits
+ * for anything.
+ *
+ * Simple-8b cannot encode integers larger than 60 bits. Values larger than
+ * that are always stored in the 'first' field of a leaf item, never in the
+ * packed codeword. If there is a sequence of integers that are more than
+ * 2^60 apart, the codeword will go unused on those items. To represent that,
+ * we use a magic EMPTY_CODEWORD codeword value.
+ */
+static const struct simple8b_mode
+{
+ uint8 bits_per_int;
+ uint8 num_ints;
+} simple8b_modes[17] =
+
+{
+ {0, 240}, /* mode 0: 240 zeroes */
+ {0, 120}, /* mode 1: 120 zeroes */
+ {1, 60}, /* mode 2: sixty 1-bit integers */
+ {2, 30}, /* mode 3: thirty 2-bit integers */
+ {3, 20}, /* mode 4: twenty 3-bit integers */
+ {4, 15}, /* mode 5: fifteen 4-bit integers */
+ {5, 12}, /* mode 6: twelve 5-bit integers */
+ {6, 10}, /* mode 7: ten 6-bit integers */
+ {7, 8}, /* mode 8: eight 7-bit integers (four bits
+ * are wasted) */
+ {8, 7}, /* mode 9: seven 8-bit integers (four bits
+ * are wasted) */
+ {10, 6}, /* mode 10: six 10-bit integers */
+ {12, 5}, /* mode 11: five 12-bit integers */
+ {15, 4}, /* mode 12: four 15-bit integers */
+ {20, 3}, /* mode 13: three 20-bit integers */
+ {30, 2}, /* mode 14: two 30-bit integers */
+ {60, 1}, /* mode 15: one 60-bit integer */
+
+ {0, 0} /* sentinel value */
+};
+
+/*
+ * EMPTY_CODEWORD is a special value, used to indicate "no values".
+ * It is used if the next value is too large to be encoded with Simple-8b.
+ *
+ * This value looks like a mode-0 codeword, but we can distinguish it
+ * because a regular mode-0 codeword would have zeroes in the unused bits.
+ */
+#define EMPTY_CODEWORD UINT64CONST(0x0FFFFFFFFFFFFFFF)
+
+/*
+ * Encode a number of integers into a Simple-8b codeword.
+ *
+ * (What we actually encode are deltas between successive integers.
+ * "base" is the value before ints[0].)
+ *
+ * The input array must contain at least SIMPLE8B_MAX_VALUES_PER_CODEWORD
+ * elements, ensuring that we can produce a full codeword.
+ *
+ * Returns the encoded codeword, and sets *num_encoded to the number of
+ * input integers that were encoded. That can be zero, if the first delta
+ * is too large to be encoded.
+ */
+static uint64
+simple8b_encode(const uint64 *ints, int *num_encoded, uint64 base)
+{
+ int selector;
+ int nints;
+ int bits;
+ uint64 diff;
+ uint64 last_val;
+ uint64 codeword;
+ int i;
+
+ Assert(ints[0] > base);
+
+ /*
+ * Select the "mode" to use for this codeword.
+ *
+ * In each iteration, check if the next value can be represented in the
+ * current mode we're considering. If it's too large, then step up the
+ * mode to a wider one, and repeat. If it fits, move on to the next
+ * integer. Repeat until the codeword is full, given the current mode.
+ *
+ * Note that we don't have any way to represent unused slots in the
+ * codeword, so we require each codeword to be "full". It is always
+ * possible to produce a full codeword unless the very first delta is too
+ * large to be encoded. For example, if the first delta is small but the
+ * second is too large to be encoded, we'll end up using the last "mode",
+ * which has nints == 1.
+ */
+ selector = 0;
+ nints = simple8b_modes[0].num_ints;
+ bits = simple8b_modes[0].bits_per_int;
+ diff = ints[0] - base - 1;
+ last_val = ints[0];
+ i = 0; /* number of deltas we have accepted */
+ for (;;)
+ {
+ if (diff >= (UINT64CONST(1) << bits))
+ {
+ /* too large, step up to next mode */
+ selector++;
+ nints = simple8b_modes[selector].num_ints;
+ bits = simple8b_modes[selector].bits_per_int;
+ /* we might already have accepted enough deltas for this mode */
+ if (i >= nints)
+ break;
+ }
+ else
+ {
+ /* accept this delta; then done if codeword is full */
+ i++;
+ if (i >= nints)
+ break;
+ /* examine next delta */
+ Assert(ints[i] > last_val);
+ diff = ints[i] - last_val - 1;
+ last_val = ints[i];
+ }
+ }
+
+ if (nints == 0)
+ {
+ /*
+ * The first delta is too large to be encoded with Simple-8b.
+ *
+ * If there is at least one not-too-large integer in the input, we
+ * will encode it using mode 15 (or a more compact mode). Hence, we
+ * can only get here if the *first* delta is >= 2^60.
+ */
+ Assert(i == 0);
+ *num_encoded = 0;
+ return EMPTY_CODEWORD;
+ }
+
+ /*
+ * Encode the integers using the selected mode. Note that we shift them
+ * into the codeword in reverse order, so that they will come out in the
+ * correct order in the decoder.
+ */
+ codeword = 0;
+ if (bits > 0)
+ {
+ for (i = nints - 1; i > 0; i--)
+ {
+ diff = ints[i] - ints[i - 1] - 1;
+ codeword |= diff;
+ codeword <<= bits;
+ }
+ diff = ints[0] - base - 1;
+ codeword |= diff;
+ }
+
+ /* add selector to the codeword, and return */
+ codeword |= (uint64) selector << 60;
+
+ *num_encoded = nints;
+ return codeword;
+}
+
+/*
+ * Decode a codeword into an array of integers.
+ * Returns the number of integers decoded.
+ */
+static int
+simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base)
+{
+ int selector = (codeword >> 60);
+ int nints = simple8b_modes[selector].num_ints;
+ int bits = simple8b_modes[selector].bits_per_int;
+ uint64 mask = (UINT64CONST(1) << bits) - 1;
+ uint64 curr_value;
+
+ if (codeword == EMPTY_CODEWORD)
+ return 0;
+
+ curr_value = base;
+ for (int i = 0; i < nints; i++)
+ {
+ uint64 diff = codeword & mask;
+
+ curr_value += 1 + diff;
+ decoded[i] = curr_value;
+ codeword >>= bits;
+ }
+ return nints;
+}
+
+/*
+ * This is very similar to simple8b_decode(), but instead of decoding all
+ * the values to an array, it just checks if the given "key" is part of
+ * the codeword.
+ */
+static bool
+simple8b_contains(uint64 codeword, uint64 key, uint64 base)
+{
+ int selector = (codeword >> 60);
+ int nints = simple8b_modes[selector].num_ints;
+ int bits = simple8b_modes[selector].bits_per_int;
+
+ if (codeword == EMPTY_CODEWORD)
+ return false;
+
+ if (bits == 0)
+ {
+ /* Special handling for 0-bit cases. */
+ return (key - base) <= nints;
+ }
+ else
+ {
+ uint64 mask = (UINT64CONST(1) << bits) - 1;
+ uint64 curr_value;
+
+ curr_value = base;
+ for (int i = 0; i < nints; i++)
+ {
+ uint64 diff = codeword & mask;
+
+ curr_value += 1 + diff;
+
+ if (curr_value >= key)
+ {
+ if (curr_value == key)
+ return true;
+ else
+ return false;
+ }
+
+ codeword >>= bits;
+ }
+ }
+ return false;
+}
diff --git a/src/backend/lib/knapsack.c b/src/backend/lib/knapsack.c
new file mode 100644
index 0000000..f4c5166
--- /dev/null
+++ b/src/backend/lib/knapsack.c
@@ -0,0 +1,111 @@
+/*-------------------------------------------------------------------------
+ *
+ * knapsack.c
+ * Knapsack problem solver
+ *
+ * Given input vectors of integral item weights (must be >= 0) and values
+ * (double >= 0), compute the set of items which produces the greatest total
+ * value without exceeding a specified total weight; each item is included at
+ * most once (this is the 0/1 knapsack problem). Weight 0 items will always be
+ * included.
+ *
+ * The performance of this algorithm is pseudo-polynomial, O(nW) where W is the
+ * weight limit. To use with non-integral weights or approximate solutions,
+ * the caller should pre-scale the input weights to a suitable range. This
+ * allows approximate solutions in polynomial time (the general case of the
+ * exact problem is NP-hard).
+ *
+ * Copyright (c) 2017-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/lib/knapsack.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+#include <limits.h>
+
+#include "lib/knapsack.h"
+#include "miscadmin.h"
+#include "nodes/bitmapset.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+
+/*
+ * DiscreteKnapsack
+ *
+ * The item_values input is optional; if omitted, all the items are assumed to
+ * have value 1.
+ *
+ * Returns a Bitmapset of the 0..(n-1) indexes of the items chosen for
+ * inclusion in the solution.
+ *
+ * This uses the usual dynamic-programming algorithm, adapted to reuse the
+ * memory on each pass (by working from larger weights to smaller). At the
+ * start of pass number i, the values[w] array contains the largest value
+ * computed with total weight <= w, using only items with indices < i; and
+ * sets[w] contains the bitmap of items actually used for that value. (The
+ * bitmapsets are all pre-initialized with an unused high bit so that memory
+ * allocation is done only once.)
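+ *
+ * For example (illustrative): with max_weight = 7, item_weights = {3, 4, 5}
+ * and item_values = {4.0, 5.0, 6.0}, the returned set contains items 0 and
+ * 1 (total weight 7, total value 9.0).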
+ */
+Bitmapset *
+DiscreteKnapsack(int max_weight, int num_items,
+ int *item_weights, double *item_values)
+{
+ MemoryContext local_ctx = AllocSetContextCreate(CurrentMemoryContext,
+ "Knapsack",
+ ALLOCSET_SMALL_SIZES);
+ MemoryContext oldctx = MemoryContextSwitchTo(local_ctx);
+ double *values;
+ Bitmapset **sets;
+ Bitmapset *result;
+ int i,
+ j;
+
+ Assert(max_weight >= 0);
+ Assert(num_items > 0 && item_weights);
+
+ values = palloc((1 + max_weight) * sizeof(double));
+ sets = palloc((1 + max_weight) * sizeof(Bitmapset *));
+
+ for (i = 0; i <= max_weight; ++i)
+ {
+ values[i] = 0;
+ sets[i] = bms_make_singleton(num_items);
+ }
+
+ for (i = 0; i < num_items; ++i)
+ {
+ int iw = item_weights[i];
+ double iv = item_values ? item_values[i] : 1;
+
+ for (j = max_weight; j >= iw; --j)
+ {
+ int ow = j - iw;
+
+ if (values[j] <= values[ow] + iv)
+ {
+ /* copy sets[ow] to sets[j] without realloc */
+ if (j != ow)
+ {
+ sets[j] = bms_del_members(sets[j], sets[j]);
+ sets[j] = bms_add_members(sets[j], sets[ow]);
+ }
+
+ sets[j] = bms_add_member(sets[j], i);
+
+ values[j] = values[ow] + iv;
+ }
+ }
+ }
+
+ MemoryContextSwitchTo(oldctx);
+
+ result = bms_del_member(bms_copy(sets[max_weight]), num_items);
+
+ MemoryContextDelete(local_ctx);
+
+ return result;
+}
diff --git a/src/backend/lib/pairingheap.c b/src/backend/lib/pairingheap.c
new file mode 100644
index 0000000..d561df0
--- /dev/null
+++ b/src/backend/lib/pairingheap.c
@@ -0,0 +1,333 @@
+/*-------------------------------------------------------------------------
+ *
+ * pairingheap.c
+ * A Pairing Heap implementation
+ *
+ * A pairing heap is a data structure that's useful for implementing
+ * priority queues. It is simple to implement, and provides amortized O(1)
+ * insert and find-min operations, and amortized O(log n) delete-min.
+ *
+ * The pairing heap was first described in this paper:
+ *
+ * Michael L. Fredman, Robert Sedgewick, Daniel D. Sleator, and Robert E.
+ * Tarjan. 1986.
+ * The pairing heap: a new form of self-adjusting heap.
+ * Algorithmica 1, 1 (January 1986), pages 111-129. DOI: 10.1007/BF01840439
+ *
+ * Portions Copyright (c) 2012-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/lib/pairingheap.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "lib/pairingheap.h"
+
+static pairingheap_node *merge(pairingheap *heap, pairingheap_node *a,
+ pairingheap_node *b);
+static pairingheap_node *merge_children(pairingheap *heap,
+ pairingheap_node *children);
+
+/*
+ * pairingheap_allocate
+ *
+ * Returns a pointer to a newly-allocated heap, with the heap property defined
+ * by the given comparator function, which will be invoked with the additional
+ * argument specified by 'arg'.
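+ *
+ * A usage sketch (illustrative; 'my_compare' and the 'ph_node' field
+ * embedded in the caller's struct are the caller's own, not part of this
+ * API):
+ *
+ *		pairingheap *heap = pairingheap_allocate(my_compare, NULL);
+ *
+ *		pairingheap_add(heap, &elem->ph_node);
+ *		...
+ *		node = pairingheap_remove_first(heap);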
+ */
+pairingheap *
+pairingheap_allocate(pairingheap_comparator compare, void *arg)
+{
+ pairingheap *heap;
+
+ heap = (pairingheap *) palloc(sizeof(pairingheap));
+ heap->ph_compare = compare;
+ heap->ph_arg = arg;
+
+ heap->ph_root = NULL;
+
+ return heap;
+}
+
+/*
+ * pairingheap_free
+ *
+ * Releases memory used by the given pairingheap.
+ *
+ * Note: The nodes in the heap are not freed!
+ */
+void
+pairingheap_free(pairingheap *heap)
+{
+ pfree(heap);
+}
+
+/*
+ * A helper function to merge two subheaps into one.
+ *
+ * The subheap with smaller value is put as a child of the other one (assuming
+ * a max-heap).
+ *
+ * The next_sibling and prev_or_parent pointers of the input nodes are
+ * ignored. On return, the returned node's next_sibling and prev_or_parent
+ * pointers are garbage.
+ */
+static pairingheap_node *
+merge(pairingheap *heap, pairingheap_node *a, pairingheap_node *b)
+{
+ if (a == NULL)
+ return b;
+ if (b == NULL)
+ return a;
+
+ /* swap 'a' and 'b' so that 'a' is the one with larger value */
+ if (heap->ph_compare(a, b, heap->ph_arg) < 0)
+ {
+ pairingheap_node *tmp;
+
+ tmp = a;
+ a = b;
+ b = tmp;
+ }
+
+ /* and put 'b' as a child of 'a' */
+ if (a->first_child)
+ a->first_child->prev_or_parent = b;
+ b->prev_or_parent = a;
+ b->next_sibling = a->first_child;
+ a->first_child = b;
+
+ return a;
+}
+
+/*
+ * pairingheap_add
+ *
+ * Adds the given node to the heap in O(1) time.
+ */
+void
+pairingheap_add(pairingheap *heap, pairingheap_node *node)
+{
+ node->first_child = NULL;
+
+ /* Link the new node as a new tree */
+ heap->ph_root = merge(heap, heap->ph_root, node);
+ heap->ph_root->prev_or_parent = NULL;
+ heap->ph_root->next_sibling = NULL;
+}
+
+/*
+ * pairingheap_first
+ *
+ * Returns a pointer to the first (root, topmost) node in the heap without
+ * modifying the heap. The caller must ensure that this routine is not used on
+ * an empty heap. Always O(1).
+ */
+pairingheap_node *
+pairingheap_first(pairingheap *heap)
+{
+ Assert(!pairingheap_is_empty(heap));
+
+ return heap->ph_root;
+}
+
+/*
+ * pairingheap_remove_first
+ *
+ * Removes the first (root, topmost) node in the heap and returns a pointer to
+ * it after rebalancing the heap. The caller must ensure that this routine is
+ * not used on an empty heap. O(log n) amortized.
+ */
+pairingheap_node *
+pairingheap_remove_first(pairingheap *heap)
+{
+ pairingheap_node *result;
+ pairingheap_node *children;
+
+ Assert(!pairingheap_is_empty(heap));
+
+ /* Remove the root, and form a new heap of its children. */
+ result = heap->ph_root;
+ children = result->first_child;
+
+ heap->ph_root = merge_children(heap, children);
+ if (heap->ph_root)
+ {
+ heap->ph_root->prev_or_parent = NULL;
+ heap->ph_root->next_sibling = NULL;
+ }
+
+ return result;
+}
+
+/*
+ * Remove 'node' from the heap. O(log n) amortized.
+ */
+void
+pairingheap_remove(pairingheap *heap, pairingheap_node *node)
+{
+ pairingheap_node *children;
+ pairingheap_node *replacement;
+ pairingheap_node *next_sibling;
+ pairingheap_node **prev_ptr;
+
+ /*
+ * If the removed node happens to be the root node, do it with
+ * pairingheap_remove_first().
+ */
+ if (node == heap->ph_root)
+ {
+ (void) pairingheap_remove_first(heap);
+ return;
+ }
+
+ /*
+ * Before we modify anything, remember the removed node's first_child and
+ * next_sibling pointers.
+ */
+ children = node->first_child;
+ next_sibling = node->next_sibling;
+
+ /*
+ * Also find the pointer to the removed node in its previous sibling, or
+ * if this is the first child of its parent, in its parent.
+ */
+ if (node->prev_or_parent->first_child == node)
+ prev_ptr = &node->prev_or_parent->first_child;
+ else
+ prev_ptr = &node->prev_or_parent->next_sibling;
+ Assert(*prev_ptr == node);
+
+ /*
+ * If this node has children, make a new subheap of the children and link
+ * the subheap in place of the removed node. Otherwise just unlink this
+ * node.
+ */
+ if (children)
+ {
+ replacement = merge_children(heap, children);
+
+ replacement->prev_or_parent = node->prev_or_parent;
+ replacement->next_sibling = node->next_sibling;
+ *prev_ptr = replacement;
+ if (next_sibling)
+ next_sibling->prev_or_parent = replacement;
+ }
+ else
+ {
+ *prev_ptr = next_sibling;
+ if (next_sibling)
+ next_sibling->prev_or_parent = node->prev_or_parent;
+ }
+}
+
+/*
+ * Merge a list of subheaps into a single heap.
+ *
+ * This implements the basic two-pass merging strategy, first forming pairs
+ * from left to right, and then merging the pairs.
+ */
+static pairingheap_node *
+merge_children(pairingheap *heap, pairingheap_node *children)
+{
+ pairingheap_node *curr,
+ *next;
+ pairingheap_node *pairs;
+ pairingheap_node *newroot;
+
+ if (children == NULL || children->next_sibling == NULL)
+ return children;
+
+ /* Walk the subheaps from left to right, merging in pairs */
+ next = children;
+ pairs = NULL;
+ for (;;)
+ {
+ curr = next;
+
+ if (curr == NULL)
+ break;
+
+ if (curr->next_sibling == NULL)
+ {
+ /* last odd node at the end of list */
+ curr->next_sibling = pairs;
+ pairs = curr;
+ break;
+ }
+
+ next = curr->next_sibling->next_sibling;
+
+ /* merge this and the next subheap, and add to 'pairs' list. */
+
+ curr = merge(heap, curr, curr->next_sibling);
+ curr->next_sibling = pairs;
+ pairs = curr;
+ }
+
+ /*
+ * Merge all the pairs together to form a single heap.
+ */
+ newroot = pairs;
+ next = pairs->next_sibling;
+ while (next)
+ {
+ curr = next;
+ next = curr->next_sibling;
+
+ newroot = merge(heap, newroot, curr);
+ }
+
+ return newroot;
+}
+
+/*
+ * A debug function to dump the contents of the heap as a string.
+ *
+ * The 'dumpfunc' callback appends a string representation of a single node
+ * to the StringInfo. 'opaque' can be used to pass more information to the
+ * callback.
+ */
+#ifdef PAIRINGHEAP_DEBUG
+static void
+pairingheap_dump_recurse(StringInfo buf,
+ pairingheap_node *node,
+ void (*dumpfunc) (pairingheap_node *node, StringInfo buf, void *opaque),
+ void *opaque,
+ int depth,
+ pairingheap_node *prev_or_parent)
+{
+ while (node)
+ {
+ Assert(node->prev_or_parent == prev_or_parent);
+
+ appendStringInfoSpaces(buf, depth * 4);
+ dumpfunc(node, buf, opaque);
+ appendStringInfoChar(buf, '\n');
+ if (node->first_child)
+ pairingheap_dump_recurse(buf, node->first_child, dumpfunc, opaque, depth + 1, node);
+ prev_or_parent = node;
+ node = node->next_sibling;
+ }
+}
+
+char *
+pairingheap_dump(pairingheap *heap,
+ void (*dumpfunc) (pairingheap_node *node, StringInfo buf, void *opaque),
+ void *opaque)
+{
+ StringInfoData buf;
+
+ if (!heap->ph_root)
+ return pstrdup("(empty)");
+
+ initStringInfo(&buf);
+
+ pairingheap_dump_recurse(&buf, heap->ph_root, dumpfunc, opaque, 0, NULL);
+
+ return buf.data;
+}
+#endif
diff --git a/src/backend/lib/rbtree.c b/src/backend/lib/rbtree.c
new file mode 100644
index 0000000..a9981db
--- /dev/null
+++ b/src/backend/lib/rbtree.c
@@ -0,0 +1,770 @@
+/*-------------------------------------------------------------------------
+ *
+ * rbtree.c
+ * implementation for PostgreSQL generic Red-Black binary tree package
+ * Adopted from http://algolist.manual.ru/ds/rbtree.php
+ *
+ * This code comes from Thomas Niemann's "Sorting and Searching Algorithms:
+ * a Cookbook".
+ *
+ * See http://www.cs.auckland.ac.nz/software/AlgAnim/niemann/s_man.htm for
+ * license terms: "Source code, when part of a software project, may be used
+ * freely without reference to the author."
+ *
+ * Red-black trees are a type of balanced binary tree wherein (1) any child of
+ * a red node is always black, and (2) every path from root to leaf traverses
+ * an equal number of black nodes. From these properties, it follows that the
+ * longest path from root to leaf is only about twice as long as the shortest,
+ * so lookups are guaranteed to run in O(lg n) time.
+ *
+ * Copyright (c) 2009-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/lib/rbtree.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "lib/rbtree.h"
+
+
+/*
+ * Colors of nodes (values of RBTNode.color)
+ */
+#define RBTBLACK (0)
+#define RBTRED (1)
+
+/*
+ * RBTree control structure
+ */
+struct RBTree
+{
+ RBTNode *root; /* root node, or RBTNIL if tree is empty */
+
+ /* Remaining fields are constant after rbt_create */
+
+ Size node_size; /* actual size of tree nodes */
+ /* The caller-supplied manipulation functions */
+ rbt_comparator comparator;
+ rbt_combiner combiner;
+ rbt_allocfunc allocfunc;
+ rbt_freefunc freefunc;
+ /* Passthrough arg passed to all manipulation functions */
+ void *arg;
+};
+
+/*
+ * all leaves are sentinels; we use a customized NIL name to prevent
+ * collision with system-wide constant NIL which is actually NULL
+ */
+#define RBTNIL (&sentinel)
+
+static RBTNode sentinel =
+{
+ RBTBLACK, RBTNIL, RBTNIL, NULL
+};
+
+
+/*
+ * rbt_create: create an empty RBTree
+ *
+ * Arguments are:
+ * node_size: actual size of tree nodes (> sizeof(RBTNode))
+ * The manipulation functions:
+ * comparator: compare two RBTNodes for less/equal/greater
+ * combiner: merge an existing tree entry with a new one
+ * allocfunc: allocate a new RBTNode
+ * freefunc: free an old RBTNode
+ * arg: passthrough pointer that will be passed to the manipulation functions
+ *
+ * Note that the combiner's righthand argument will be a "proposed" tree node,
+ * ie the input to rbt_insert, in which the RBTNode fields themselves aren't
+ * valid. Similarly, either input to the comparator may be a "proposed" node.
+ * This shouldn't matter since the functions aren't supposed to look at the
+ * RBTNode fields, only the extra fields of the struct the RBTNode is embedded
+ * in.
+ *
+ * The freefunc should just be pfree or equivalent; it should NOT attempt
+ * to free any subsidiary data, because the node passed to it may not contain
+ * valid data! freefunc can be NULL if caller doesn't require retail
+ * space reclamation.
+ *
+ * The RBTree node is palloc'd in the caller's memory context. Note that
+ * all contents of the tree are actually allocated by the caller, not here.
+ *
+ * Since tree contents are managed by the caller, there is currently not
+ * an explicit "destroy" operation; typically a tree would be freed by
+ * resetting or deleting the memory context it's stored in. You can pfree
+ * the RBTree node if you feel the urge.
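+ *
+ * A typical caller (illustrative sketch) embeds RBTNode as the first field
+ * of its own entry type and passes sizeof() of that type as node_size:
+ *
+ *		typedef struct MyEntry
+ *		{
+ *			RBTNode rbtnode;	(must be the first field)
+ *			char   *key;
+ *			int		count;
+ *		} MyEntry;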
+ */
+RBTree *
+rbt_create(Size node_size,
+ rbt_comparator comparator,
+ rbt_combiner combiner,
+ rbt_allocfunc allocfunc,
+ rbt_freefunc freefunc,
+ void *arg)
+{
+ RBTree *tree = (RBTree *) palloc(sizeof(RBTree));
+
+ Assert(node_size > sizeof(RBTNode));
+
+ tree->root = RBTNIL;
+ tree->node_size = node_size;
+ tree->comparator = comparator;
+ tree->combiner = combiner;
+ tree->allocfunc = allocfunc;
+ tree->freefunc = freefunc;
+
+ tree->arg = arg;
+
+ return tree;
+}
+
+/* Copy the additional data fields from one RBTNode to another */
+static inline void
+rbt_copy_data(RBTree *rbt, RBTNode *dest, const RBTNode *src)
+{
+ memcpy(dest + 1, src + 1, rbt->node_size - sizeof(RBTNode));
+}
+
+/**********************************************************************
+ * Search *
+ **********************************************************************/
+
+/*
+ * rbt_find: search for a value in an RBTree
+ *
+ * data represents the value to try to find. Its RBTNode fields need not
+ * be valid, it's the extra data in the larger struct that is of interest.
+ *
+ * Returns the matching tree entry, or NULL if no match is found.
+ */
+RBTNode *
+rbt_find(RBTree *rbt, const RBTNode *data)
+{
+ RBTNode *node = rbt->root;
+
+ while (node != RBTNIL)
+ {
+ int cmp = rbt->comparator(data, node, rbt->arg);
+
+ if (cmp == 0)
+ return node;
+ else if (cmp < 0)
+ node = node->left;
+ else
+ node = node->right;
+ }
+
+ return NULL;
+}
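+
+/*
+ * For example, with the hypothetical IntTreeNode sketched above, a lookup
+ * can use a stack variable as the probe; only the caller's key fields need
+ * to be filled in:
+ *
+ *		IntTreeNode probe;
+ *		IntTreeNode *found;
+ *
+ *		probe.key = 42;
+ *		found = (IntTreeNode *) rbt_find(tree, &probe.rbtnode);
+ */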
+
+/*
+ * rbt_leftmost: fetch the leftmost (smallest-valued) tree node.
+ * Returns NULL if tree is empty.
+ *
+ * Note: in the original implementation this included an unlink step, but
+ * that's a bit awkward. Just call rbt_delete on the result if that's what
+ * you want.
+ */
+RBTNode *
+rbt_leftmost(RBTree *rbt)
+{
+ RBTNode *node = rbt->root;
+ RBTNode *leftmost = rbt->root;
+
+ while (node != RBTNIL)
+ {
+ leftmost = node;
+ node = node->left;
+ }
+
+ if (leftmost != RBTNIL)
+ return leftmost;
+
+ return NULL;
+}
+
+/**********************************************************************
+ * Insertion *
+ **********************************************************************/
+
+/*
+ * Rotate node x to left.
+ *
+ * x's right child takes its place in the tree, and x becomes the left
+ * child of that node.
+ */
+static void
+rbt_rotate_left(RBTree *rbt, RBTNode *x)
+{
+ RBTNode *y = x->right;
+
+ /* establish x->right link */
+ x->right = y->left;
+ if (y->left != RBTNIL)
+ y->left->parent = x;
+
+ /* establish y->parent link */
+ if (y != RBTNIL)
+ y->parent = x->parent;
+ if (x->parent)
+ {
+ if (x == x->parent->left)
+ x->parent->left = y;
+ else
+ x->parent->right = y;
+ }
+ else
+ {
+ rbt->root = y;
+ }
+
+ /* link x and y */
+ y->left = x;
+ if (x != RBTNIL)
+ x->parent = y;
+}
+
+/*
+ * Rotate node x to right.
+ *
+ * x's left child takes its place in the tree, and x becomes the right
+ * child of that node.
+ */
+static void
+rbt_rotate_right(RBTree *rbt, RBTNode *x)
+{
+ RBTNode *y = x->left;
+
+ /* establish x->left link */
+ x->left = y->right;
+ if (y->right != RBTNIL)
+ y->right->parent = x;
+
+ /* establish y->parent link */
+ if (y != RBTNIL)
+ y->parent = x->parent;
+ if (x->parent)
+ {
+ if (x == x->parent->right)
+ x->parent->right = y;
+ else
+ x->parent->left = y;
+ }
+ else
+ {
+ rbt->root = y;
+ }
+
+ /* link x and y */
+ y->right = x;
+ if (x != RBTNIL)
+ x->parent = y;
+}
+
+/*
+ * Maintain Red-Black tree balance after inserting node x.
+ *
+ * The newly inserted node is always initially marked red. That may lead to
+ * a situation where a red node has a red child, which is prohibited. We can
+ * always fix the problem by a series of color changes and/or "rotations",
+ * which move the problem progressively higher up in the tree. If one of the
+ * two red nodes is the root, we can always fix the problem by changing the
+ * root from red to black.
+ *
+ * (This does not work lower down in the tree because we must also maintain
+ * the invariant that every leaf has equal black-height.)
+ */
+static void
+rbt_insert_fixup(RBTree *rbt, RBTNode *x)
+{
+ /*
+ * x is always a red node. Initially, it is the newly inserted node. Each
+ * iteration of this loop moves it higher up in the tree.
+ */
+ while (x != rbt->root && x->parent->color == RBTRED)
+ {
+ /*
+ * x and x->parent are both red. Fix depends on whether x->parent is
+ * a left or right child. In either case, we define y to be the
+ * "uncle" of x, that is, the other child of x's grandparent.
+ *
+ * If the uncle is red, we flip the grandparent to red and its two
+ * children to black. Then we loop around again to check whether the
+ * grandparent still has a problem.
+ *
+ * If the uncle is black, we will perform one or two "rotations" to
+ * balance the tree. Either x or x->parent will take the
+ * grandparent's position in the tree and be recolored black, and the
+ * original grandparent will be recolored red and become a child of
+ * that node. This always leaves us with a valid red-black tree, so
+ * the loop will terminate.
+ */
+ if (x->parent == x->parent->parent->left)
+ {
+ RBTNode *y = x->parent->parent->right;
+
+ if (y->color == RBTRED)
+ {
+ /* uncle is RBTRED */
+ x->parent->color = RBTBLACK;
+ y->color = RBTBLACK;
+ x->parent->parent->color = RBTRED;
+
+ x = x->parent->parent;
+ }
+ else
+ {
+ /* uncle is RBTBLACK */
+ if (x == x->parent->right)
+ {
+ /* make x a left child */
+ x = x->parent;
+ rbt_rotate_left(rbt, x);
+ }
+
+ /* recolor and rotate */
+ x->parent->color = RBTBLACK;
+ x->parent->parent->color = RBTRED;
+
+ rbt_rotate_right(rbt, x->parent->parent);
+ }
+ }
+ else
+ {
+ /* mirror image of above code */
+ RBTNode *y = x->parent->parent->left;
+
+ if (y->color == RBTRED)
+ {
+ /* uncle is RBTRED */
+ x->parent->color = RBTBLACK;
+ y->color = RBTBLACK;
+ x->parent->parent->color = RBTRED;
+
+ x = x->parent->parent;
+ }
+ else
+ {
+ /* uncle is RBTBLACK */
+ if (x == x->parent->left)
+ {
+ x = x->parent;
+ rbt_rotate_right(rbt, x);
+ }
+ x->parent->color = RBTBLACK;
+ x->parent->parent->color = RBTRED;
+
+ rbt_rotate_left(rbt, x->parent->parent);
+ }
+ }
+ }
+
+ /*
+ * The root may already have been black; if not, the black-height of every
+ * node in the tree increases by one.
+ */
+ rbt->root->color = RBTBLACK;
+}
+
+/*
+ * rbt_insert: insert a new value into the tree.
+ *
+ * data represents the value to insert. Its RBTNode fields need not
+ * be valid, it's the extra data in the larger struct that is of interest.
+ *
+ * If the value represented by "data" is not present in the tree, then
+ * we copy "data" into a new tree entry and return that node, setting *isNew
+ * to true.
+ *
+ * If the value represented by "data" is already present, then we call the
+ * combiner function to merge data into the existing node, and return the
+ * existing node, setting *isNew to false.
+ *
+ * "data" is unmodified in either case; it's typically just a local
+ * variable in the caller.
+ */
+RBTNode *
+rbt_insert(RBTree *rbt, const RBTNode *data, bool *isNew)
+{
+ RBTNode *current,
+ *parent,
+ *x;
+ int cmp;
+
+ /* find where node belongs */
+ current = rbt->root;
+ parent = NULL;
+ cmp = 0; /* just to prevent compiler warning */
+
+ while (current != RBTNIL)
+ {
+ cmp = rbt->comparator(data, current, rbt->arg);
+ if (cmp == 0)
+ {
+ /*
+ * Found node with given key. Apply combiner.
+ */
+ rbt->combiner(current, data, rbt->arg);
+ *isNew = false;
+ return current;
+ }
+ parent = current;
+ current = (cmp < 0) ? current->left : current->right;
+ }
+
+ /*
+ * Value is not present, so create a new node containing data.
+ */
+ *isNew = true;
+
+ x = rbt->allocfunc(rbt->arg);
+
+ x->color = RBTRED;
+
+ x->left = RBTNIL;
+ x->right = RBTNIL;
+ x->parent = parent;
+ rbt_copy_data(rbt, x, data);
+
+ /* insert node in tree */
+ if (parent)
+ {
+ if (cmp < 0)
+ parent->left = x;
+ else
+ parent->right = x;
+ }
+ else
+ {
+ rbt->root = x;
+ }
+
+ rbt_insert_fixup(rbt, x);
+
+ return x;
+}
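+
+/*
+ * Continuing the hypothetical IntTreeNode sketch, a typical insertion
+ * passes a stack variable and checks *isNew to see whether the combiner
+ * ran instead of a new node being created:
+ *
+ *		IntTreeNode data;
+ *		bool		isNew;
+ *		IntTreeNode *node;
+ *
+ *		data.key = 42;
+ *		data.count = 1;
+ *		node = (IntTreeNode *) rbt_insert(tree, &data.rbtnode, &isNew);
+ *		(if isNew is false, int_combine has already folded data into *node)
+ */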
+
+/**********************************************************************
+ * Deletion *
+ **********************************************************************/
+
+/*
+ * Maintain Red-Black tree balance after deleting a black node.
+ */
+static void
+rbt_delete_fixup(RBTree *rbt, RBTNode *x)
+{
+ /*
+ * x is always a black node. Initially, it is the former child of the
+ * deleted node. Each iteration of this loop moves it higher up in the
+ * tree.
+ */
+ while (x != rbt->root && x->color == RBTBLACK)
+ {
+ /*
+ * Left and right cases are symmetric. Any nodes that are children of
+ * x have a black-height one less than the remainder of the nodes in
+ * the tree. We rotate and recolor nodes to move the problem up the
+ * tree: at some stage we'll either fix the problem, or reach the root
+ * (where the black-height is allowed to decrease).
+ */
+ if (x == x->parent->left)
+ {
+ RBTNode *w = x->parent->right;
+
+ if (w->color == RBTRED)
+ {
+ w->color = RBTBLACK;
+ x->parent->color = RBTRED;
+
+ rbt_rotate_left(rbt, x->parent);
+ w = x->parent->right;
+ }
+
+ if (w->left->color == RBTBLACK && w->right->color == RBTBLACK)
+ {
+ w->color = RBTRED;
+
+ x = x->parent;
+ }
+ else
+ {
+ if (w->right->color == RBTBLACK)
+ {
+ w->left->color = RBTBLACK;
+ w->color = RBTRED;
+
+ rbt_rotate_right(rbt, w);
+ w = x->parent->right;
+ }
+ w->color = x->parent->color;
+ x->parent->color = RBTBLACK;
+ w->right->color = RBTBLACK;
+
+ rbt_rotate_left(rbt, x->parent);
+ x = rbt->root; /* Arrange for loop to terminate. */
+ }
+ }
+ else
+ {
+ RBTNode *w = x->parent->left;
+
+ if (w->color == RBTRED)
+ {
+ w->color = RBTBLACK;
+ x->parent->color = RBTRED;
+
+ rbt_rotate_right(rbt, x->parent);
+ w = x->parent->left;
+ }
+
+ if (w->right->color == RBTBLACK && w->left->color == RBTBLACK)
+ {
+ w->color = RBTRED;
+
+ x = x->parent;
+ }
+ else
+ {
+ if (w->left->color == RBTBLACK)
+ {
+ w->right->color = RBTBLACK;
+ w->color = RBTRED;
+
+ rbt_rotate_left(rbt, w);
+ w = x->parent->left;
+ }
+ w->color = x->parent->color;
+ x->parent->color = RBTBLACK;
+ w->left->color = RBTBLACK;
+
+ rbt_rotate_right(rbt, x->parent);
+ x = rbt->root; /* Arrange for loop to terminate. */
+ }
+ }
+ }
+ x->color = RBTBLACK;
+}
+
+/*
+ * Delete node z from tree.
+ */
+static void
+rbt_delete_node(RBTree *rbt, RBTNode *z)
+{
+ RBTNode *x,
+ *y;
+
+ /* This is just paranoia: we should only get called on a valid node */
+ if (!z || z == RBTNIL)
+ return;
+
+ /*
+ * y is the node that will actually be removed from the tree. This will
+ * be z if z has fewer than two children, or the tree successor of z
+ * otherwise.
+ */
+ if (z->left == RBTNIL || z->right == RBTNIL)
+ {
+ /* y has a RBTNIL node as a child */
+ y = z;
+ }
+ else
+ {
+ /* find tree successor */
+ y = z->right;
+ while (y->left != RBTNIL)
+ y = y->left;
+ }
+
+ /* x is y's only child */
+ if (y->left != RBTNIL)
+ x = y->left;
+ else
+ x = y->right;
+
+ /* Remove y from the tree. */
+ x->parent = y->parent;
+ if (y->parent)
+ {
+ if (y == y->parent->left)
+ y->parent->left = x;
+ else
+ y->parent->right = x;
+ }
+ else
+ {
+ rbt->root = x;
+ }
+
+ /*
+ * If we removed the tree successor of z rather than z itself, then move
+ * the data for the removed node to the one we were supposed to remove.
+ */
+ if (y != z)
+ rbt_copy_data(rbt, z, y);
+
+ /*
+ * Removing a black node might make some paths from root to leaf contain
+ * fewer black nodes than others, or it might make two red nodes adjacent.
+ */
+ if (y->color == RBTBLACK)
+ rbt_delete_fixup(rbt, x);
+
+ /* Now we can recycle the y node */
+ if (rbt->freefunc)
+ rbt->freefunc(y, rbt->arg);
+}
+
+/*
+ * rbt_delete: remove the given tree entry
+ *
+ * "node" must have previously been found via rbt_find or rbt_leftmost.
+ * It is caller's responsibility to free any subsidiary data attached
+ * to the node before calling rbt_delete. (Do *not* try to push that
+ * responsibility off to the freefunc, as some other physical node
+ * may be the one actually freed!)
+ */
+void
+rbt_delete(RBTree *rbt, RBTNode *node)
+{
+ rbt_delete_node(rbt, node);
+}
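+
+/*
+ * For example (hypothetical IntTreeNode again): look the entry up, release
+ * any caller-owned subsidiary data it points to, then delete the node:
+ *
+ *		IntTreeNode probe;
+ *		IntTreeNode *found;
+ *
+ *		probe.key = 42;
+ *		found = (IntTreeNode *) rbt_find(tree, &probe.rbtnode);
+ *		if (found != NULL)
+ *			rbt_delete(tree, &found->rbtnode);
+ */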
+
+/**********************************************************************
+ * Traverse *
+ **********************************************************************/
+
+static RBTNode *
+rbt_left_right_iterator(RBTreeIterator *iter)
+{
+ if (iter->last_visited == NULL)
+ {
+ iter->last_visited = iter->rbt->root;
+ while (iter->last_visited->left != RBTNIL)
+ iter->last_visited = iter->last_visited->left;
+
+ return iter->last_visited;
+ }
+
+ if (iter->last_visited->right != RBTNIL)
+ {
+ iter->last_visited = iter->last_visited->right;
+ while (iter->last_visited->left != RBTNIL)
+ iter->last_visited = iter->last_visited->left;
+
+ return iter->last_visited;
+ }
+
+ for (;;)
+ {
+ RBTNode *came_from = iter->last_visited;
+
+ iter->last_visited = iter->last_visited->parent;
+ if (iter->last_visited == NULL)
+ {
+ iter->is_over = true;
+ break;
+ }
+
+ if (iter->last_visited->left == came_from)
+ break; /* came from left sub-tree, return current
+ * node */
+
+ /* else - came from right sub-tree, continue to move up */
+ }
+
+ return iter->last_visited;
+}
+
+static RBTNode *
+rbt_right_left_iterator(RBTreeIterator *iter)
+{
+ if (iter->last_visited == NULL)
+ {
+ iter->last_visited = iter->rbt->root;
+ while (iter->last_visited->right != RBTNIL)
+ iter->last_visited = iter->last_visited->right;
+
+ return iter->last_visited;
+ }
+
+ if (iter->last_visited->left != RBTNIL)
+ {
+ iter->last_visited = iter->last_visited->left;
+ while (iter->last_visited->right != RBTNIL)
+ iter->last_visited = iter->last_visited->right;
+
+ return iter->last_visited;
+ }
+
+ for (;;)
+ {
+ RBTNode *came_from = iter->last_visited;
+
+ iter->last_visited = iter->last_visited->parent;
+ if (iter->last_visited == NULL)
+ {
+ iter->is_over = true;
+ break;
+ }
+
+ if (iter->last_visited->right == came_from)
+ break; /* came from right sub-tree, return current
+ * node */
+
+ /* else - came from left sub-tree, continue to move up */
+ }
+
+ return iter->last_visited;
+}
+
+/*
+ * rbt_begin_iterate: prepare to traverse the tree in any of several orders
+ *
+ * After calling rbt_begin_iterate, call rbt_iterate repeatedly until it
+ * returns NULL or the traversal stops being of interest.
+ *
+ * If the tree is changed during traversal, results of further calls to
+ * rbt_iterate are unspecified. Multiple concurrent iterators on the same
+ * tree are allowed.
+ *
+ * The iterator state is stored in the 'iter' struct. The caller should
+ * treat it as an opaque struct.
+ */
+void
+rbt_begin_iterate(RBTree *rbt, RBTOrderControl ctrl, RBTreeIterator *iter)
+{
+ /* Common initialization for all traversal orders */
+ iter->rbt = rbt;
+ iter->last_visited = NULL;
+ iter->is_over = (rbt->root == RBTNIL);
+
+ switch (ctrl)
+ {
+ case LeftRightWalk: /* visit left, then self, then right */
+ iter->iterate = rbt_left_right_iterator;
+ break;
+ case RightLeftWalk: /* visit right, then self, then left */
+ iter->iterate = rbt_right_left_iterator;
+ break;
+ default:
+ elog(ERROR, "unrecognized rbtree iteration order: %d", ctrl);
+ }
+}
+
+/*
+ * rbt_iterate: return the next node in traversal order, or NULL if no more
+ */
+RBTNode *
+rbt_iterate(RBTreeIterator *iter)
+{
+ if (iter->is_over)
+ return NULL;
+
+ return iter->iterate(iter);
+}
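+
+/*
+ * A minimal traversal sketch (hypothetical IntTreeNode as above, with
+ * process_entry standing in for whatever the caller does with each node);
+ * the iterator lives on the caller's stack:
+ *
+ *		RBTreeIterator iter;
+ *		IntTreeNode *node;
+ *
+ *		rbt_begin_iterate(tree, LeftRightWalk, &iter);
+ *		while ((node = (IntTreeNode *) rbt_iterate(&iter)) != NULL)
+ *			process_entry(node->key, node->count);
+ */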