author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:15:05 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:15:05 +0000
commit     46651ce6fe013220ed397add242004d764fc0153 (patch)
tree       6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/storage/ipc
parent     Initial commit. (diff)
Adding upstream version 14.5.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/storage/ipc')
-rw-r--r--  src/backend/storage/ipc/Makefile          30
-rw-r--r--  src/backend/storage/ipc/barrier.c        333
-rw-r--r--  src/backend/storage/ipc/dsm.c           1248
-rw-r--r--  src/backend/storage/ipc/dsm_impl.c      1058
-rw-r--r--  src/backend/storage/ipc/ipc.c            435
-rw-r--r--  src/backend/storage/ipc/ipci.c           291
-rw-r--r--  src/backend/storage/ipc/latch.c         2158
-rw-r--r--  src/backend/storage/ipc/pmsignal.c       430
-rw-r--r--  src/backend/storage/ipc/procarray.c     5220
-rw-r--r--  src/backend/storage/ipc/procsignal.c     685
-rw-r--r--  src/backend/storage/ipc/shm_mq.c        1288
-rw-r--r--  src/backend/storage/ipc/shm_toc.c        272
-rw-r--r--  src/backend/storage/ipc/shmem.c          611
-rw-r--r--  src/backend/storage/ipc/shmqueue.c       190
-rw-r--r--  src/backend/storage/ipc/signalfuncs.c    300
-rw-r--r--  src/backend/storage/ipc/sinval.c         205
-rw-r--r--  src/backend/storage/ipc/sinvaladt.c      777
-rw-r--r--  src/backend/storage/ipc/standby.c       1450
18 files changed, 16981 insertions(+), 0 deletions(-)
diff --git a/src/backend/storage/ipc/Makefile b/src/backend/storage/ipc/Makefile new file mode 100644 index 0000000..df90c6b --- /dev/null +++ b/src/backend/storage/ipc/Makefile @@ -0,0 +1,30 @@ +# +# Makefile for storage/ipc +# +# src/backend/storage/ipc/Makefile +# + +subdir = src/backend/storage/ipc +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + barrier.o \ + dsm.o \ + dsm_impl.o \ + ipc.o \ + ipci.o \ + latch.o \ + pmsignal.o \ + procarray.o \ + procsignal.o \ + shm_mq.o \ + shm_toc.o \ + shmem.o \ + shmqueue.o \ + signalfuncs.o \ + sinval.o \ + sinvaladt.o \ + standby.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/ipc/barrier.c b/src/backend/storage/ipc/barrier.c new file mode 100644 index 0000000..5c05297 --- /dev/null +++ b/src/backend/storage/ipc/barrier.c @@ -0,0 +1,333 @@ +/*------------------------------------------------------------------------- + * + * barrier.c + * Barriers for synchronizing cooperating processes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * From Wikipedia[1]: "In parallel computing, a barrier is a type of + * synchronization method. A barrier for a group of threads or processes in + * the source code means any thread/process must stop at this point and cannot + * proceed until all other threads/processes reach this barrier." + * + * This implementation of barriers allows for static sets of participants + * known up front, or dynamic sets of participants which processes can join or + * leave at any time. In the dynamic case, a phase number can be used to + * track progress through a parallel algorithm, and may be necessary to + * synchronize with the current phase of a multi-phase algorithm when a new + * participant joins. In the static case, the phase number is used + * internally, but it isn't strictly necessary for client code to access it + * because the phase can only advance when the declared number of participants + * reaches the barrier, so client code should be in no doubt about the current + * phase of computation at all times. + * + * Consider a parallel algorithm that involves separate phases of computation + * A, B and C where the output of each phase is needed before the next phase + * can begin. + * + * In the case of a static barrier initialized with 4 participants, each + * participant works on phase A, then calls BarrierArriveAndWait to wait until + * all 4 participants have reached that point. When BarrierArriveAndWait + * returns control, each participant can work on B, and so on. Because the + * barrier knows how many participants to expect, the phases of computation + * don't need labels or numbers, since each process's program counter implies + * the current phase. Even if some of the processes are slow to start up and + * begin running phase A, the other participants are expecting them and will + * patiently wait at the barrier. The code could be written as follows: + * + * perform_a(); + * BarrierArriveAndWait(&barrier, ...); + * perform_b(); + * BarrierArriveAndWait(&barrier, ...); + * perform_c(); + * BarrierArriveAndWait(&barrier, ...); + * + * If the number of participants is not known up front, then a dynamic barrier + * is needed and the number should be set to zero at initialization. 
New + * complications arise because the number necessarily changes over time as + * participants attach and detach, and therefore phases B, C or even the end + * of processing may be reached before any given participant has started + * running and attached. Therefore the client code must perform an initial + * test of the phase number after attaching, because it needs to find out + * which phase of the algorithm has been reached by any participants that are + * already attached in order to synchronize with that work. Once the program + * counter or some other representation of current progress is synchronized + * with the barrier's phase, normal control flow can be used just as in the + * static case. Our example could be written using a switch statement with + * cases that fall-through, as follows: + * + * phase = BarrierAttach(&barrier); + * switch (phase) + * { + * case PHASE_A: + * perform_a(); + * BarrierArriveAndWait(&barrier, ...); + * case PHASE_B: + * perform_b(); + * BarrierArriveAndWait(&barrier, ...); + * case PHASE_C: + * perform_c(); + * BarrierArriveAndWait(&barrier, ...); + * } + * BarrierDetach(&barrier); + * + * Static barriers behave similarly to POSIX's pthread_barrier_t. Dynamic + * barriers behave similarly to Java's java.util.concurrent.Phaser. + * + * [1] https://en.wikipedia.org/wiki/Barrier_(computer_science) + * + * IDENTIFICATION + * src/backend/storage/ipc/barrier.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "storage/barrier.h" + +static inline bool BarrierDetachImpl(Barrier *barrier, bool arrive); + +/* + * Initialize this barrier. To use a static party size, provide the number of + * participants to wait for at each phase indicating that that number of + * backends is implicitly attached. To use a dynamic party size, specify zero + * here and then use BarrierAttach() and + * BarrierDetach()/BarrierArriveAndDetach() to register and deregister + * participants explicitly. + */ +void +BarrierInit(Barrier *barrier, int participants) +{ + SpinLockInit(&barrier->mutex); + barrier->participants = participants; + barrier->arrived = 0; + barrier->phase = 0; + barrier->elected = 0; + barrier->static_party = participants > 0; + ConditionVariableInit(&barrier->condition_variable); +} + +/* + * Arrive at this barrier, wait for all other attached participants to arrive + * too and then return. Increments the current phase. The caller must be + * attached. + * + * While waiting, pg_stat_activity shows a wait_event_type and wait_event + * controlled by the wait_event_info passed in, which should be a value from + * one of the WaitEventXXX enums defined in pgstat.h. + * + * Return true in one arbitrarily chosen participant. Return false in all + * others. The return code can be used to elect one participant to execute a + * phase of work that must be done serially while other participants wait. 
+ */ +bool +BarrierArriveAndWait(Barrier *barrier, uint32 wait_event_info) +{ + bool release = false; + bool elected; + int start_phase; + int next_phase; + + SpinLockAcquire(&barrier->mutex); + start_phase = barrier->phase; + next_phase = start_phase + 1; + ++barrier->arrived; + if (barrier->arrived == barrier->participants) + { + release = true; + barrier->arrived = 0; + barrier->phase = next_phase; + barrier->elected = next_phase; + } + SpinLockRelease(&barrier->mutex); + + /* + * If we were the last expected participant to arrive, we can release our + * peers and return true to indicate that this backend has been elected to + * perform any serial work. + */ + if (release) + { + ConditionVariableBroadcast(&barrier->condition_variable); + + return true; + } + + /* + * Otherwise we have to wait for the last participant to arrive and + * advance the phase. + */ + elected = false; + ConditionVariablePrepareToSleep(&barrier->condition_variable); + for (;;) + { + /* + * We know that phase must either be start_phase, indicating that we + * need to keep waiting, or next_phase, indicating that the last + * participant that we were waiting for has either arrived or detached + * so that the next phase has begun. The phase cannot advance any + * further than that without this backend's participation, because + * this backend is attached. + */ + SpinLockAcquire(&barrier->mutex); + Assert(barrier->phase == start_phase || barrier->phase == next_phase); + release = barrier->phase == next_phase; + if (release && barrier->elected != next_phase) + { + /* + * Usually the backend that arrives last and releases the other + * backends is elected to return true (see above), so that it can + * begin processing serial work while it has a CPU timeslice. + * However, if the barrier advanced because someone detached, then + * one of the backends that is awoken will need to be elected. + */ + barrier->elected = barrier->phase; + elected = true; + } + SpinLockRelease(&barrier->mutex); + if (release) + break; + ConditionVariableSleep(&barrier->condition_variable, wait_event_info); + } + ConditionVariableCancelSleep(); + + return elected; +} + +/* + * Arrive at this barrier, but detach rather than waiting. Returns true if + * the caller was the last to detach. + */ +bool +BarrierArriveAndDetach(Barrier *barrier) +{ + return BarrierDetachImpl(barrier, true); +} + +/* + * Arrive at a barrier, and detach all but the last to arrive. Returns true if + * the caller was the last to arrive, and is therefore still attached. + */ +bool +BarrierArriveAndDetachExceptLast(Barrier *barrier) +{ + SpinLockAcquire(&barrier->mutex); + if (barrier->participants > 1) + { + --barrier->participants; + SpinLockRelease(&barrier->mutex); + + return false; + } + Assert(barrier->participants == 1); + ++barrier->phase; + SpinLockRelease(&barrier->mutex); + + return true; +} + +/* + * Attach to a barrier. All waiting participants will now wait for this + * participant to call BarrierArriveAndWait(), BarrierDetach() or + * BarrierArriveAndDetach(). Return the current phase. + */ +int +BarrierAttach(Barrier *barrier) +{ + int phase; + + Assert(!barrier->static_party); + + SpinLockAcquire(&barrier->mutex); + ++barrier->participants; + phase = barrier->phase; + SpinLockRelease(&barrier->mutex); + + return phase; +} + +/* + * Detach from a barrier. This may release other waiters from + * BarrierArriveAndWait() and advance the phase if they were only waiting for + * this backend. Return true if this participant was the last to detach. 
+ */ +bool +BarrierDetach(Barrier *barrier) +{ + return BarrierDetachImpl(barrier, false); +} + +/* + * Return the current phase of a barrier. The caller must be attached. + */ +int +BarrierPhase(Barrier *barrier) +{ + /* + * It is OK to read barrier->phase without locking, because it can't + * change without us (we are attached to it), and we executed a memory + * barrier when we either attached or participated in changing it last + * time. + */ + return barrier->phase; +} + +/* + * Return an instantaneous snapshot of the number of participants currently + * attached to this barrier. For debugging purposes only. + */ +int +BarrierParticipants(Barrier *barrier) +{ + int participants; + + SpinLockAcquire(&barrier->mutex); + participants = barrier->participants; + SpinLockRelease(&barrier->mutex); + + return participants; +} + +/* + * Detach from a barrier. If 'arrive' is true then also increment the phase + * if there are no other participants. If there are other participants + * waiting, then the phase will be advanced and they'll be released if they + * were only waiting for the caller. Return true if this participant was the + * last to detach. + */ +static inline bool +BarrierDetachImpl(Barrier *barrier, bool arrive) +{ + bool release; + bool last; + + Assert(!barrier->static_party); + + SpinLockAcquire(&barrier->mutex); + Assert(barrier->participants > 0); + --barrier->participants; + + /* + * If any other participants are waiting and we were the last participant + * waited for, release them. If no other participants are waiting, but + * this is a BarrierArriveAndDetach() call, then advance the phase too. + */ + if ((arrive || barrier->participants > 0) && + barrier->arrived == barrier->participants) + { + release = true; + barrier->arrived = 0; + ++barrier->phase; + } + else + release = false; + + last = barrier->participants == 0; + SpinLockRelease(&barrier->mutex); + + if (release) + ConditionVariableBroadcast(&barrier->condition_variable); + + return last; +} diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c new file mode 100644 index 0000000..b461a5f --- /dev/null +++ b/src/backend/storage/ipc/dsm.c @@ -0,0 +1,1248 @@ +/*------------------------------------------------------------------------- + * + * dsm.c + * manage dynamic shared memory segments + * + * This file provides a set of services to make programming with dynamic + * shared memory segments more convenient. Unlike the low-level + * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments + * created using this module will be cleaned up automatically. Mappings + * will be removed when the resource owner under which they were created + * is cleaned up, unless dsm_pin_mapping() is used, in which case they + * have session lifespan. Segments will be removed when there are no + * remaining mappings, or at postmaster shutdown in any case. After a + * hard postmaster crash, remaining segments will be removed, if they + * still exist, at the next postmaster startup. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/dsm.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <fcntl.h> +#include <unistd.h> +#ifndef WIN32 +#include <sys/mman.h> +#endif +#include <sys/stat.h> + +#include "lib/ilist.h" +#include "miscadmin.h" +#include "port/pg_bitutils.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/pg_shmem.h" +#include "utils/freepage.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/resowner_private.h" + +#define PG_DYNSHMEM_CONTROL_MAGIC 0x9a503d32 + +#define PG_DYNSHMEM_FIXED_SLOTS 64 +#define PG_DYNSHMEM_SLOTS_PER_BACKEND 5 + +#define INVALID_CONTROL_SLOT ((uint32) -1) + +/* Backend-local tracking for on-detach callbacks. */ +typedef struct dsm_segment_detach_callback +{ + on_dsm_detach_callback function; + Datum arg; + slist_node node; +} dsm_segment_detach_callback; + +/* Backend-local state for a dynamic shared memory segment. */ +struct dsm_segment +{ + dlist_node node; /* List link in dsm_segment_list. */ + ResourceOwner resowner; /* Resource owner. */ + dsm_handle handle; /* Segment name. */ + uint32 control_slot; /* Slot in control segment. */ + void *impl_private; /* Implementation-specific private data. */ + void *mapped_address; /* Mapping address, or NULL if unmapped. */ + Size mapped_size; /* Size of our mapping. */ + slist_head on_detach; /* On-detach callbacks. */ +}; + +/* Shared-memory state for a dynamic shared memory segment. */ +typedef struct dsm_control_item +{ + dsm_handle handle; + uint32 refcnt; /* 2+ = active, 1 = moribund, 0 = gone */ + size_t first_page; + size_t npages; + void *impl_private_pm_handle; /* only needed on Windows */ + bool pinned; +} dsm_control_item; + +/* Layout of the dynamic shared memory control segment. */ +typedef struct dsm_control_header +{ + uint32 magic; + uint32 nitems; + uint32 maxitems; + dsm_control_item item[FLEXIBLE_ARRAY_MEMBER]; +} dsm_control_header; + +static void dsm_cleanup_for_mmap(void); +static void dsm_postmaster_shutdown(int code, Datum arg); +static dsm_segment *dsm_create_descriptor(void); +static bool dsm_control_segment_sane(dsm_control_header *control, + Size mapped_size); +static uint64 dsm_control_bytes_needed(uint32 nitems); +static inline dsm_handle make_main_region_dsm_handle(int slot); +static inline bool is_main_region_dsm_handle(dsm_handle handle); + +/* Has this backend initialized the dynamic shared memory system yet? */ +static bool dsm_init_done = false; + +/* Preallocated DSM space in the main shared memory region. */ +static void *dsm_main_space_begin = NULL; + +/* + * List of dynamic shared memory segments used by this backend. + * + * At process exit time, we must decrement the reference count of each + * segment we have attached; this list makes it possible to find all such + * segments. + * + * This list should always be empty in the postmaster. We could probably + * allow the postmaster to map dynamic shared memory segments before it + * begins to start child processes, provided that each process adjusted + * the reference counts for those segments in the control segment at + * startup time, but there's no obvious need for such a facility, which + * would also be complex to handle in the EXEC_BACKEND case. 
Once the + * postmaster has begun spawning children, there's an additional problem: + * each new mapping would require an update to the control segment, + * which requires locking, in which the postmaster must not be involved. + */ +static dlist_head dsm_segment_list = DLIST_STATIC_INIT(dsm_segment_list); + +/* + * Control segment information. + * + * Unlike ordinary shared memory segments, the control segment is not + * reference counted; instead, it lasts for the postmaster's entire + * life cycle. For simplicity, it doesn't have a dsm_segment object either. + */ +static dsm_handle dsm_control_handle; +static dsm_control_header *dsm_control; +static Size dsm_control_mapped_size = 0; +static void *dsm_control_impl_private = NULL; + +/* + * Start up the dynamic shared memory system. + * + * This is called just once during each cluster lifetime, at postmaster + * startup time. + */ +void +dsm_postmaster_startup(PGShmemHeader *shim) +{ + void *dsm_control_address = NULL; + uint32 maxitems; + Size segsize; + + Assert(!IsUnderPostmaster); + + /* + * If we're using the mmap implementations, clean up any leftovers. + * Cleanup isn't needed on Windows, and happens earlier in startup for + * POSIX and System V shared memory, via a direct call to + * dsm_cleanup_using_control_segment. + */ + if (dynamic_shared_memory_type == DSM_IMPL_MMAP) + dsm_cleanup_for_mmap(); + + /* Determine size for new control segment. */ + maxitems = PG_DYNSHMEM_FIXED_SLOTS + + PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends; + elog(DEBUG2, "dynamic shared memory system will support %u segments", + maxitems); + segsize = dsm_control_bytes_needed(maxitems); + + /* + * Loop until we find an unused identifier for the new control segment. We + * sometimes use 0 as a sentinel value indicating that no control segment + * is known to exist, so avoid using that value for a real control + * segment. + */ + for (;;) + { + Assert(dsm_control_address == NULL); + Assert(dsm_control_mapped_size == 0); + dsm_control_handle = random() << 1; /* Even numbers only */ + if (dsm_control_handle == DSM_HANDLE_INVALID) + continue; + if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize, + &dsm_control_impl_private, &dsm_control_address, + &dsm_control_mapped_size, ERROR)) + break; + } + dsm_control = dsm_control_address; + on_shmem_exit(dsm_postmaster_shutdown, PointerGetDatum(shim)); + elog(DEBUG2, + "created dynamic shared memory control segment %u (%zu bytes)", + dsm_control_handle, segsize); + shim->dsm_control = dsm_control_handle; + + /* Initialize control segment. */ + dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC; + dsm_control->nitems = 0; + dsm_control->maxitems = maxitems; +} + +/* + * Determine whether the control segment from the previous postmaster + * invocation still exists. If so, remove the dynamic shared memory + * segments to which it refers, and then the control segment itself. + */ +void +dsm_cleanup_using_control_segment(dsm_handle old_control_handle) +{ + void *mapped_address = NULL; + void *junk_mapped_address = NULL; + void *impl_private = NULL; + void *junk_impl_private = NULL; + Size mapped_size = 0; + Size junk_mapped_size = 0; + uint32 nitems; + uint32 i; + dsm_control_header *old_control; + + /* + * Try to attach the segment. If this fails, it probably just means that + * the operating system has been rebooted and the segment no longer + * exists, or an unrelated process has used the same shm ID. So just fall + * out quietly. 
+ */ + if (!dsm_impl_op(DSM_OP_ATTACH, old_control_handle, 0, &impl_private, + &mapped_address, &mapped_size, DEBUG1)) + return; + + /* + * We've managed to reattach it, but the contents might not be sane. If + * they aren't, we disregard the segment after all. + */ + old_control = (dsm_control_header *) mapped_address; + if (!dsm_control_segment_sane(old_control, mapped_size)) + { + dsm_impl_op(DSM_OP_DETACH, old_control_handle, 0, &impl_private, + &mapped_address, &mapped_size, LOG); + return; + } + + /* + * OK, the control segment looks basically valid, so we can use it to get + * a list of segments that need to be removed. + */ + nitems = old_control->nitems; + for (i = 0; i < nitems; ++i) + { + dsm_handle handle; + uint32 refcnt; + + /* If the reference count is 0, the slot is actually unused. */ + refcnt = old_control->item[i].refcnt; + if (refcnt == 0) + continue; + + /* If it was using the main shmem area, there is nothing to do. */ + handle = old_control->item[i].handle; + if (is_main_region_dsm_handle(handle)) + continue; + + /* Log debugging information. */ + elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)", + handle, refcnt); + + /* Destroy the referenced segment. */ + dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private, + &junk_mapped_address, &junk_mapped_size, LOG); + } + + /* Destroy the old control segment, too. */ + elog(DEBUG2, + "cleaning up dynamic shared memory control segment with ID %u", + old_control_handle); + dsm_impl_op(DSM_OP_DESTROY, old_control_handle, 0, &impl_private, + &mapped_address, &mapped_size, LOG); +} + +/* + * When we're using the mmap shared memory implementation, "shared memory" + * segments might even manage to survive an operating system reboot. + * But there's no guarantee as to exactly what will survive: some segments + * may survive, and others may not, and the contents of some may be out + * of date. In particular, the control segment may be out of date, so we + * can't rely on it to figure out what to remove. However, since we know + * what directory contains the files we used as shared memory, we can simply + * scan the directory and blow everything away that shouldn't be there. + */ +static void +dsm_cleanup_for_mmap(void) +{ + DIR *dir; + struct dirent *dent; + + /* Scan the directory for something with a name of the correct format. */ + dir = AllocateDir(PG_DYNSHMEM_DIR); + + while ((dent = ReadDir(dir, PG_DYNSHMEM_DIR)) != NULL) + { + if (strncmp(dent->d_name, PG_DYNSHMEM_MMAP_FILE_PREFIX, + strlen(PG_DYNSHMEM_MMAP_FILE_PREFIX)) == 0) + { + char buf[MAXPGPATH + sizeof(PG_DYNSHMEM_DIR)]; + + snprintf(buf, sizeof(buf), PG_DYNSHMEM_DIR "/%s", dent->d_name); + + elog(DEBUG2, "removing file \"%s\"", buf); + + /* We found a matching file; so remove it. */ + if (unlink(buf) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", buf))); + } + } + + /* Cleanup complete. */ + FreeDir(dir); +} + +/* + * At shutdown time, we iterate over the control segment and remove all + * remaining dynamic shared memory segments. We avoid throwing errors here; + * the postmaster is shutting down either way, and this is just non-critical + * resource cleanup. 
+ */ +static void +dsm_postmaster_shutdown(int code, Datum arg) +{ + uint32 nitems; + uint32 i; + void *dsm_control_address; + void *junk_mapped_address = NULL; + void *junk_impl_private = NULL; + Size junk_mapped_size = 0; + PGShmemHeader *shim = (PGShmemHeader *) DatumGetPointer(arg); + + /* + * If some other backend exited uncleanly, it might have corrupted the + * control segment while it was dying. In that case, we warn and ignore + * the contents of the control segment. This may end up leaving behind + * stray shared memory segments, but there's not much we can do about that + * if the metadata is gone. + */ + nitems = dsm_control->nitems; + if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size)) + { + ereport(LOG, + (errmsg("dynamic shared memory control segment is corrupt"))); + return; + } + + /* Remove any remaining segments. */ + for (i = 0; i < nitems; ++i) + { + dsm_handle handle; + + /* If the reference count is 0, the slot is actually unused. */ + if (dsm_control->item[i].refcnt == 0) + continue; + + handle = dsm_control->item[i].handle; + if (is_main_region_dsm_handle(handle)) + continue; + + /* Log debugging information. */ + elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u", + handle); + + /* Destroy the segment. */ + dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private, + &junk_mapped_address, &junk_mapped_size, LOG); + } + + /* Remove the control segment itself. */ + elog(DEBUG2, + "cleaning up dynamic shared memory control segment with ID %u", + dsm_control_handle); + dsm_control_address = dsm_control; + dsm_impl_op(DSM_OP_DESTROY, dsm_control_handle, 0, + &dsm_control_impl_private, &dsm_control_address, + &dsm_control_mapped_size, LOG); + dsm_control = dsm_control_address; + shim->dsm_control = 0; +} + +/* + * Prepare this backend for dynamic shared memory usage. Under EXEC_BACKEND, + * we must reread the state file and map the control segment; in other cases, + * we'll have inherited the postmaster's mapping and global variables. + */ +static void +dsm_backend_startup(void) +{ +#ifdef EXEC_BACKEND + { + void *control_address = NULL; + + /* Attach control segment. */ + Assert(dsm_control_handle != 0); + dsm_impl_op(DSM_OP_ATTACH, dsm_control_handle, 0, + &dsm_control_impl_private, &control_address, + &dsm_control_mapped_size, ERROR); + dsm_control = control_address; + /* If control segment doesn't look sane, something is badly wrong. */ + if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size)) + { + dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0, + &dsm_control_impl_private, &control_address, + &dsm_control_mapped_size, WARNING); + ereport(FATAL, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("dynamic shared memory control segment is not valid"))); + } + } +#endif + + dsm_init_done = true; +} + +#ifdef EXEC_BACKEND +/* + * When running under EXEC_BACKEND, we get a callback here when the main + * shared memory segment is re-attached, so that we can record the control + * handle retrieved from it. + */ +void +dsm_set_control_handle(dsm_handle h) +{ + Assert(dsm_control_handle == 0 && h != 0); + dsm_control_handle = h; +} +#endif + +/* + * Reserve some space in the main shared memory segment for DSM segments. + */ +size_t +dsm_estimate_size(void) +{ + return 1024 * 1024 * (size_t) min_dynamic_shared_memory; +} + +/* + * Initialize space in the main shared memory segment for DSM segments. 
+ */ +void +dsm_shmem_init(void) +{ + size_t size = dsm_estimate_size(); + bool found; + + if (size == 0) + return; + + dsm_main_space_begin = ShmemInitStruct("Preallocated DSM", size, &found); + if (!found) + { + FreePageManager *fpm = (FreePageManager *) dsm_main_space_begin; + size_t first_page = 0; + size_t pages; + + /* Reserve space for the FreePageManager. */ + while (first_page * FPM_PAGE_SIZE < sizeof(FreePageManager)) + ++first_page; + + /* Initialize it and give it all the rest of the space. */ + FreePageManagerInitialize(fpm, dsm_main_space_begin); + pages = (size / FPM_PAGE_SIZE) - first_page; + FreePageManagerPut(fpm, first_page, pages); + } +} + +/* + * Create a new dynamic shared memory segment. + * + * If there is a non-NULL CurrentResourceOwner, the new segment is associated + * with it and must be detached before the resource owner releases, or a + * warning will be logged. If CurrentResourceOwner is NULL, the segment + * remains attached until explicitly detached or the session ends. + * Creating with a NULL CurrentResourceOwner is equivalent to creating + * with a non-NULL CurrentResourceOwner and then calling dsm_pin_mapping. + */ +dsm_segment * +dsm_create(Size size, int flags) +{ + dsm_segment *seg; + uint32 i; + uint32 nitems; + size_t npages = 0; + size_t first_page = 0; + FreePageManager *dsm_main_space_fpm = dsm_main_space_begin; + bool using_main_dsm_region = false; + + /* Unsafe in postmaster (and pointless in a stand-alone backend). */ + Assert(IsUnderPostmaster); + + if (!dsm_init_done) + dsm_backend_startup(); + + /* Create a new segment descriptor. */ + seg = dsm_create_descriptor(); + + /* + * Lock the control segment while we try to allocate from the main shared + * memory area, if configured. + */ + if (dsm_main_space_fpm) + { + npages = size / FPM_PAGE_SIZE; + if (size % FPM_PAGE_SIZE > 0) + ++npages; + + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + if (FreePageManagerGet(dsm_main_space_fpm, npages, &first_page)) + { + /* We can carve out a piece of the main shared memory segment. */ + seg->mapped_address = (char *) dsm_main_space_begin + + first_page * FPM_PAGE_SIZE; + seg->mapped_size = npages * FPM_PAGE_SIZE; + using_main_dsm_region = true; + /* We'll choose a handle below. */ + } + } + + if (!using_main_dsm_region) + { + /* + * We need to create a new memory segment. Loop until we find an + * unused segment identifier. + */ + if (dsm_main_space_fpm) + LWLockRelease(DynamicSharedMemoryControlLock); + for (;;) + { + Assert(seg->mapped_address == NULL && seg->mapped_size == 0); + seg->handle = random() << 1; /* Even numbers only */ + if (seg->handle == DSM_HANDLE_INVALID) /* Reserve sentinel */ + continue; + if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, ERROR)) + break; + } + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + } + + /* Search the control segment for an unused slot. 
*/ + nitems = dsm_control->nitems; + for (i = 0; i < nitems; ++i) + { + if (dsm_control->item[i].refcnt == 0) + { + if (using_main_dsm_region) + { + seg->handle = make_main_region_dsm_handle(i); + dsm_control->item[i].first_page = first_page; + dsm_control->item[i].npages = npages; + } + else + Assert(!is_main_region_dsm_handle(seg->handle)); + dsm_control->item[i].handle = seg->handle; + /* refcnt of 1 triggers destruction, so start at 2 */ + dsm_control->item[i].refcnt = 2; + dsm_control->item[i].impl_private_pm_handle = NULL; + dsm_control->item[i].pinned = false; + seg->control_slot = i; + LWLockRelease(DynamicSharedMemoryControlLock); + return seg; + } + } + + /* Verify that we can support an additional mapping. */ + if (nitems >= dsm_control->maxitems) + { + if (using_main_dsm_region) + FreePageManagerPut(dsm_main_space_fpm, first_page, npages); + LWLockRelease(DynamicSharedMemoryControlLock); + if (!using_main_dsm_region) + dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, WARNING); + if (seg->resowner != NULL) + ResourceOwnerForgetDSM(seg->resowner, seg); + dlist_delete(&seg->node); + pfree(seg); + + if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0) + return NULL; + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("too many dynamic shared memory segments"))); + } + + /* Enter the handle into a new array slot. */ + if (using_main_dsm_region) + { + seg->handle = make_main_region_dsm_handle(nitems); + dsm_control->item[i].first_page = first_page; + dsm_control->item[i].npages = npages; + } + dsm_control->item[nitems].handle = seg->handle; + /* refcnt of 1 triggers destruction, so start at 2 */ + dsm_control->item[nitems].refcnt = 2; + dsm_control->item[nitems].impl_private_pm_handle = NULL; + dsm_control->item[nitems].pinned = false; + seg->control_slot = nitems; + dsm_control->nitems++; + LWLockRelease(DynamicSharedMemoryControlLock); + + return seg; +} + +/* + * Attach a dynamic shared memory segment. + * + * See comments for dsm_segment_handle() for an explanation of how this + * is intended to be used. + * + * This function will return NULL if the segment isn't known to the system. + * This can happen if we're asked to attach the segment, but then everyone + * else detaches it (causing it to be destroyed) before we get around to + * attaching it. + * + * If there is a non-NULL CurrentResourceOwner, the attached segment is + * associated with it and must be detached before the resource owner releases, + * or a warning will be logged. Otherwise the segment remains attached until + * explicitly detached or the session ends. See the note atop dsm_create(). + */ +dsm_segment * +dsm_attach(dsm_handle h) +{ + dsm_segment *seg; + dlist_iter iter; + uint32 i; + uint32 nitems; + + /* Unsafe in postmaster (and pointless in a stand-alone backend). */ + Assert(IsUnderPostmaster); + + if (!dsm_init_done) + dsm_backend_startup(); + + /* + * Since this is just a debugging cross-check, we could leave it out + * altogether, or include it only in assert-enabled builds. But since the + * list of attached segments should normally be very short, let's include + * it always for right now. + * + * If you're hitting this error, you probably want to attempt to find an + * existing mapping via dsm_find_mapping() before calling dsm_attach() to + * create a new one. 
+ */ + dlist_foreach(iter, &dsm_segment_list) + { + seg = dlist_container(dsm_segment, node, iter.cur); + if (seg->handle == h) + elog(ERROR, "can't attach the same segment more than once"); + } + + /* Create a new segment descriptor. */ + seg = dsm_create_descriptor(); + seg->handle = h; + + /* Bump reference count for this segment in shared memory. */ + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + nitems = dsm_control->nitems; + for (i = 0; i < nitems; ++i) + { + /* + * If the reference count is 0, the slot is actually unused. If the + * reference count is 1, the slot is still in use, but the segment is + * in the process of going away; even if the handle matches, another + * slot may already have started using the same handle value by + * coincidence so we have to keep searching. + */ + if (dsm_control->item[i].refcnt <= 1) + continue; + + /* If the handle doesn't match, it's not the slot we want. */ + if (dsm_control->item[i].handle != seg->handle) + continue; + + /* Otherwise we've found a match. */ + dsm_control->item[i].refcnt++; + seg->control_slot = i; + if (is_main_region_dsm_handle(seg->handle)) + { + seg->mapped_address = (char *) dsm_main_space_begin + + dsm_control->item[i].first_page * FPM_PAGE_SIZE; + seg->mapped_size = dsm_control->item[i].npages * FPM_PAGE_SIZE; + } + break; + } + LWLockRelease(DynamicSharedMemoryControlLock); + + /* + * If we didn't find the handle we're looking for in the control segment, + * it probably means that everyone else who had it mapped, including the + * original creator, died before we got to this point. It's up to the + * caller to decide what to do about that. + */ + if (seg->control_slot == INVALID_CONTROL_SLOT) + { + dsm_detach(seg); + return NULL; + } + + /* Here's where we actually try to map the segment. */ + if (!is_main_region_dsm_handle(seg->handle)) + dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, ERROR); + + return seg; +} + +/* + * At backend shutdown time, detach any segments that are still attached. + * (This is similar to dsm_detach_all, except that there's no reason to + * unmap the control segment before exiting, so we don't bother.) + */ +void +dsm_backend_shutdown(void) +{ + while (!dlist_is_empty(&dsm_segment_list)) + { + dsm_segment *seg; + + seg = dlist_head_element(dsm_segment, node, &dsm_segment_list); + dsm_detach(seg); + } +} + +/* + * Detach all shared memory segments, including the control segments. This + * should be called, along with PGSharedMemoryDetach, in processes that + * might inherit mappings but are not intended to be connected to dynamic + * shared memory. + */ +void +dsm_detach_all(void) +{ + void *control_address = dsm_control; + + while (!dlist_is_empty(&dsm_segment_list)) + { + dsm_segment *seg; + + seg = dlist_head_element(dsm_segment, node, &dsm_segment_list); + dsm_detach(seg); + } + + if (control_address != NULL) + dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0, + &dsm_control_impl_private, &control_address, + &dsm_control_mapped_size, ERROR); +} + +/* + * Detach from a shared memory segment, destroying the segment if we + * remove the last reference. + * + * This function should never fail. It will often be invoked when aborting + * a transaction, and a further error won't serve any purpose. It's not a + * complete disaster if we fail to unmap or destroy the segment; it means a + * resource leak, but that doesn't necessarily preclude further operations. 
+ */ +void +dsm_detach(dsm_segment *seg) +{ + /* + * Invoke registered callbacks. Just in case one of those callbacks + * throws a further error that brings us back here, pop the callback + * before invoking it, to avoid infinite error recursion. Don't allow + * interrupts while running the individual callbacks in non-error code + * paths, to avoid leaving cleanup work unfinished if we're interrupted by + * a statement timeout or similar. + */ + HOLD_INTERRUPTS(); + while (!slist_is_empty(&seg->on_detach)) + { + slist_node *node; + dsm_segment_detach_callback *cb; + on_dsm_detach_callback function; + Datum arg; + + node = slist_pop_head_node(&seg->on_detach); + cb = slist_container(dsm_segment_detach_callback, node, node); + function = cb->function; + arg = cb->arg; + pfree(cb); + + function(seg, arg); + } + RESUME_INTERRUPTS(); + + /* + * Try to remove the mapping, if one exists. Normally, there will be, but + * maybe not, if we failed partway through a create or attach operation. + * We remove the mapping before decrementing the reference count so that + * the process that sees a zero reference count can be certain that no + * remaining mappings exist. Even if this fails, we pretend that it + * works, because retrying is likely to fail in the same way. + */ + if (seg->mapped_address != NULL) + { + if (!is_main_region_dsm_handle(seg->handle)) + dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, WARNING); + seg->impl_private = NULL; + seg->mapped_address = NULL; + seg->mapped_size = 0; + } + + /* Reduce reference count, if we previously increased it. */ + if (seg->control_slot != INVALID_CONTROL_SLOT) + { + uint32 refcnt; + uint32 control_slot = seg->control_slot; + + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + Assert(dsm_control->item[control_slot].handle == seg->handle); + Assert(dsm_control->item[control_slot].refcnt > 1); + refcnt = --dsm_control->item[control_slot].refcnt; + seg->control_slot = INVALID_CONTROL_SLOT; + LWLockRelease(DynamicSharedMemoryControlLock); + + /* If new reference count is 1, try to destroy the segment. */ + if (refcnt == 1) + { + /* A pinned segment should never reach 1. */ + Assert(!dsm_control->item[control_slot].pinned); + + /* + * If we fail to destroy the segment here, or are killed before we + * finish doing so, the reference count will remain at 1, which + * will mean that nobody else can attach to the segment. At + * postmaster shutdown time, or when a new postmaster is started + * after a hard kill, another attempt will be made to remove the + * segment. + * + * The main case we're worried about here is being killed by a + * signal before we can finish removing the segment. In that + * case, it's important to be sure that the segment still gets + * removed. If we actually fail to remove the segment for some + * other reason, the postmaster may not have any better luck than + * we did. There's not much we can do about that, though. 
+ */ + if (is_main_region_dsm_handle(seg->handle) || + dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, WARNING)) + { + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + if (is_main_region_dsm_handle(seg->handle)) + FreePageManagerPut((FreePageManager *) dsm_main_space_begin, + dsm_control->item[control_slot].first_page, + dsm_control->item[control_slot].npages); + Assert(dsm_control->item[control_slot].handle == seg->handle); + Assert(dsm_control->item[control_slot].refcnt == 1); + dsm_control->item[control_slot].refcnt = 0; + LWLockRelease(DynamicSharedMemoryControlLock); + } + } + } + + /* Clean up our remaining backend-private data structures. */ + if (seg->resowner != NULL) + ResourceOwnerForgetDSM(seg->resowner, seg); + dlist_delete(&seg->node); + pfree(seg); +} + +/* + * Keep a dynamic shared memory mapping until end of session. + * + * By default, mappings are owned by the current resource owner, which + * typically means they stick around for the duration of the current query + * only. + */ +void +dsm_pin_mapping(dsm_segment *seg) +{ + if (seg->resowner != NULL) + { + ResourceOwnerForgetDSM(seg->resowner, seg); + seg->resowner = NULL; + } +} + +/* + * Arrange to remove a dynamic shared memory mapping at cleanup time. + * + * dsm_pin_mapping() can be used to preserve a mapping for the entire + * lifetime of a process; this function reverses that decision, making + * the segment owned by the current resource owner. This may be useful + * just before performing some operation that will invalidate the segment + * for future use by this backend. + */ +void +dsm_unpin_mapping(dsm_segment *seg) +{ + Assert(seg->resowner == NULL); + ResourceOwnerEnlargeDSMs(CurrentResourceOwner); + seg->resowner = CurrentResourceOwner; + ResourceOwnerRememberDSM(seg->resowner, seg); +} + +/* + * Keep a dynamic shared memory segment until postmaster shutdown, or until + * dsm_unpin_segment is called. + * + * This function should not be called more than once per segment, unless the + * segment is explicitly unpinned with dsm_unpin_segment in between calls. + * + * Note that this function does not arrange for the current process to + * keep the segment mapped indefinitely; if that behavior is desired, + * dsm_pin_mapping() should be used from each process that needs to + * retain the mapping. + */ +void +dsm_pin_segment(dsm_segment *seg) +{ + void *handle; + + /* + * Bump reference count for this segment in shared memory. This will + * ensure that even if there is no session which is attached to this + * segment, it will remain until postmaster shutdown or an explicit call + * to unpin. + */ + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + if (dsm_control->item[seg->control_slot].pinned) + elog(ERROR, "cannot pin a segment that is already pinned"); + dsm_impl_pin_segment(seg->handle, seg->impl_private, &handle); + dsm_control->item[seg->control_slot].pinned = true; + dsm_control->item[seg->control_slot].refcnt++; + dsm_control->item[seg->control_slot].impl_private_pm_handle = handle; + LWLockRelease(DynamicSharedMemoryControlLock); +} + +/* + * Unpin a dynamic shared memory segment that was previously pinned with + * dsm_pin_segment. This function should not be called unless dsm_pin_segment + * was previously called for this segment. + * + * The argument is a dsm_handle rather than a dsm_segment in case you want + * to unpin a segment to which you haven't attached. 
This turns out to be + * useful if, for example, a reference to one shared memory segment is stored + * within another shared memory segment. You might want to unpin the + * referenced segment before destroying the referencing segment. + */ +void +dsm_unpin_segment(dsm_handle handle) +{ + uint32 control_slot = INVALID_CONTROL_SLOT; + bool destroy = false; + uint32 i; + + /* Find the control slot for the given handle. */ + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + for (i = 0; i < dsm_control->nitems; ++i) + { + /* Skip unused slots and segments that are concurrently going away. */ + if (dsm_control->item[i].refcnt <= 1) + continue; + + /* If we've found our handle, we can stop searching. */ + if (dsm_control->item[i].handle == handle) + { + control_slot = i; + break; + } + } + + /* + * We should definitely have found the slot, and it should not already be + * in the process of going away, because this function should only be + * called on a segment which is pinned. + */ + if (control_slot == INVALID_CONTROL_SLOT) + elog(ERROR, "cannot unpin unknown segment handle"); + if (!dsm_control->item[control_slot].pinned) + elog(ERROR, "cannot unpin a segment that is not pinned"); + Assert(dsm_control->item[control_slot].refcnt > 1); + + /* + * Allow implementation-specific code to run. We have to do this before + * releasing the lock, because impl_private_pm_handle may get modified by + * dsm_impl_unpin_segment. + */ + dsm_impl_unpin_segment(handle, + &dsm_control->item[control_slot].impl_private_pm_handle); + + /* Note that 1 means no references (0 means unused slot). */ + if (--dsm_control->item[control_slot].refcnt == 1) + destroy = true; + dsm_control->item[control_slot].pinned = false; + + /* Now we can release the lock. */ + LWLockRelease(DynamicSharedMemoryControlLock); + + /* Clean up resources if that was the last reference. */ + if (destroy) + { + void *junk_impl_private = NULL; + void *junk_mapped_address = NULL; + Size junk_mapped_size = 0; + + /* + * For an explanation of how error handling works in this case, see + * comments in dsm_detach. Note that if we reach this point, the + * current process certainly does not have the segment mapped, because + * if it did, the reference count would have still been greater than 1 + * even after releasing the reference count held by the pin. The fact + * that there can't be a dsm_segment for this handle makes it OK to + * pass the mapped size, mapped address, and private data as NULL + * here. + */ + if (is_main_region_dsm_handle(handle) || + dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private, + &junk_mapped_address, &junk_mapped_size, WARNING)) + { + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + if (is_main_region_dsm_handle(handle)) + FreePageManagerPut((FreePageManager *) dsm_main_space_begin, + dsm_control->item[control_slot].first_page, + dsm_control->item[control_slot].npages); + Assert(dsm_control->item[control_slot].handle == handle); + Assert(dsm_control->item[control_slot].refcnt == 1); + dsm_control->item[control_slot].refcnt = 0; + LWLockRelease(DynamicSharedMemoryControlLock); + } + } +} + +/* + * Find an existing mapping for a shared memory segment, if there is one. 
+ */ +dsm_segment * +dsm_find_mapping(dsm_handle h) +{ + dlist_iter iter; + dsm_segment *seg; + + dlist_foreach(iter, &dsm_segment_list) + { + seg = dlist_container(dsm_segment, node, iter.cur); + if (seg->handle == h) + return seg; + } + + return NULL; +} + +/* + * Get the address at which a dynamic shared memory segment is mapped. + */ +void * +dsm_segment_address(dsm_segment *seg) +{ + Assert(seg->mapped_address != NULL); + return seg->mapped_address; +} + +/* + * Get the size of a mapping. + */ +Size +dsm_segment_map_length(dsm_segment *seg) +{ + Assert(seg->mapped_address != NULL); + return seg->mapped_size; +} + +/* + * Get a handle for a mapping. + * + * To establish communication via dynamic shared memory between two backends, + * one of them should first call dsm_create() to establish a new shared + * memory mapping. That process should then call dsm_segment_handle() to + * obtain a handle for the mapping, and pass that handle to the + * coordinating backend via some means (e.g. bgw_main_arg, or via the + * main shared memory segment). The recipient, once in possession of the + * handle, should call dsm_attach(). + */ +dsm_handle +dsm_segment_handle(dsm_segment *seg) +{ + return seg->handle; +} + +/* + * Register an on-detach callback for a dynamic shared memory segment. + */ +void +on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg) +{ + dsm_segment_detach_callback *cb; + + cb = MemoryContextAlloc(TopMemoryContext, + sizeof(dsm_segment_detach_callback)); + cb->function = function; + cb->arg = arg; + slist_push_head(&seg->on_detach, &cb->node); +} + +/* + * Unregister an on-detach callback for a dynamic shared memory segment. + */ +void +cancel_on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, + Datum arg) +{ + slist_mutable_iter iter; + + slist_foreach_modify(iter, &seg->on_detach) + { + dsm_segment_detach_callback *cb; + + cb = slist_container(dsm_segment_detach_callback, node, iter.cur); + if (cb->function == function && cb->arg == arg) + { + slist_delete_current(&iter); + pfree(cb); + break; + } + } +} + +/* + * Discard all registered on-detach callbacks without executing them. + */ +void +reset_on_dsm_detach(void) +{ + dlist_iter iter; + + dlist_foreach(iter, &dsm_segment_list) + { + dsm_segment *seg = dlist_container(dsm_segment, node, iter.cur); + + /* Throw away explicit on-detach actions one by one. */ + while (!slist_is_empty(&seg->on_detach)) + { + slist_node *node; + dsm_segment_detach_callback *cb; + + node = slist_pop_head_node(&seg->on_detach); + cb = slist_container(dsm_segment_detach_callback, node, node); + pfree(cb); + } + + /* + * Decrementing the reference count is a sort of implicit on-detach + * action; make sure we don't do that, either. + */ + seg->control_slot = INVALID_CONTROL_SLOT; + } +} + +/* + * Create a segment descriptor. + */ +static dsm_segment * +dsm_create_descriptor(void) +{ + dsm_segment *seg; + + if (CurrentResourceOwner) + ResourceOwnerEnlargeDSMs(CurrentResourceOwner); + + seg = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment)); + dlist_push_head(&dsm_segment_list, &seg->node); + + /* seg->handle must be initialized by the caller */ + seg->control_slot = INVALID_CONTROL_SLOT; + seg->impl_private = NULL; + seg->mapped_address = NULL; + seg->mapped_size = 0; + + seg->resowner = CurrentResourceOwner; + if (CurrentResourceOwner) + ResourceOwnerRememberDSM(CurrentResourceOwner, seg); + + slist_init(&seg->on_detach); + + return seg; +} + +/* + * Sanity check a control segment. 
+ * + * The goal here isn't to detect everything that could possibly be wrong with + * the control segment; there's not enough information for that. Rather, the + * goal is to make sure that someone can iterate over the items in the segment + * without overrunning the end of the mapping and crashing. We also check + * the magic number since, if that's messed up, this may not even be one of + * our segments at all. + */ +static bool +dsm_control_segment_sane(dsm_control_header *control, Size mapped_size) +{ + if (mapped_size < offsetof(dsm_control_header, item)) + return false; /* Mapped size too short to read header. */ + if (control->magic != PG_DYNSHMEM_CONTROL_MAGIC) + return false; /* Magic number doesn't match. */ + if (dsm_control_bytes_needed(control->maxitems) > mapped_size) + return false; /* Max item count won't fit in map. */ + if (control->nitems > control->maxitems) + return false; /* Overfull. */ + return true; +} + +/* + * Compute the number of control-segment bytes needed to store a given + * number of items. + */ +static uint64 +dsm_control_bytes_needed(uint32 nitems) +{ + return offsetof(dsm_control_header, item) + + sizeof(dsm_control_item) * (uint64) nitems; +} + +static inline dsm_handle +make_main_region_dsm_handle(int slot) +{ + dsm_handle handle; + + /* + * We need to create a handle that doesn't collide with any existing extra + * segment created by dsm_impl_op(), so we'll make it odd. It also + * mustn't collide with any other main area pseudo-segment, so we'll + * include the slot number in some of the bits. We also want to make an + * effort to avoid newly created and recently destroyed handles from being + * confused, so we'll make the rest of the bits random. + */ + handle = 1; + handle |= slot << 1; + handle |= random() << (pg_leftmost_one_pos32(dsm_control->maxitems) + 1); + return handle; +} + +static inline bool +is_main_region_dsm_handle(dsm_handle handle) +{ + return handle & 1; +} diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c new file mode 100644 index 0000000..c51e3e6 --- /dev/null +++ b/src/backend/storage/ipc/dsm_impl.c @@ -0,0 +1,1058 @@ +/*------------------------------------------------------------------------- + * + * dsm_impl.c + * manage dynamic shared memory segments + * + * This file provides low-level APIs for creating and destroying shared + * memory segments using several different possible techniques. We refer + * to these segments as dynamic because they can be created, altered, and + * destroyed at any point during the server life cycle. This is unlike + * the main shared memory segment, of which there is always exactly one + * and which is always mapped at a fixed address in every PostgreSQL + * background process. + * + * Because not all systems provide the same primitives in this area, nor + * do all primitives behave the same way on all systems, we provide + * several implementations of this facility. Many systems implement + * POSIX shared memory (shm_open etc.), which is well-suited to our needs + * in this area, with the exception that shared memory identifiers live + * in a flat system-wide namespace, raising the uncomfortable prospect of + * name collisions with other processes (including other copies of + * PostgreSQL) running on the same system. Some systems only support + * the older System V shared memory interface (shmget etc.) which is + * also usable; however, the default allocation limits are often quite + * small, and the namespace is even more restricted. 
+ * + * We also provide an mmap-based shared memory implementation. This may + * be useful on systems that provide shared memory via a special-purpose + * filesystem; by opting for this implementation, the user can even + * control precisely where their shared memory segments are placed. It + * can also be used as a fallback for systems where shm_open and shmget + * are not available or can't be used for some reason. Of course, + * mapping a file residing on an actual spinning disk is a fairly poor + * approximation for shared memory because writeback may hurt performance + * substantially, but there should be few systems where we must make do + * with such poor tools. + * + * As ever, Windows requires its own implementation. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/dsm_impl.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <fcntl.h> +#include <signal.h> +#include <unistd.h> +#ifndef WIN32 +#include <sys/mman.h> +#endif +#include <sys/stat.h> +#ifdef HAVE_SYS_IPC_H +#include <sys/ipc.h> +#endif +#ifdef HAVE_SYS_SHM_H +#include <sys/shm.h> +#endif + +#include "common/file_perm.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "portability/mem.h" +#include "postmaster/postmaster.h" +#include "storage/dsm_impl.h" +#include "storage/fd.h" +#include "utils/guc.h" +#include "utils/memutils.h" + +#ifdef USE_DSM_POSIX +static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +static int dsm_impl_posix_resize(int fd, off_t size); +#endif +#ifdef USE_DSM_SYSV +static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +#endif +#ifdef USE_DSM_WINDOWS +static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +#endif +#ifdef USE_DSM_MMAP +static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +#endif +static int errcode_for_dynamic_shared_memory(void); + +const struct config_enum_entry dynamic_shared_memory_options[] = { +#ifdef USE_DSM_POSIX + {"posix", DSM_IMPL_POSIX, false}, +#endif +#ifdef USE_DSM_SYSV + {"sysv", DSM_IMPL_SYSV, false}, +#endif +#ifdef USE_DSM_WINDOWS + {"windows", DSM_IMPL_WINDOWS, false}, +#endif +#ifdef USE_DSM_MMAP + {"mmap", DSM_IMPL_MMAP, false}, +#endif + {NULL, 0, false} +}; + +/* Implementation selector. */ +int dynamic_shared_memory_type; + +/* Amount of space reserved for DSM segments in the main area. */ +int min_dynamic_shared_memory; + +/* Size of buffer to be used for zero-filling. */ +#define ZBUFFER_SIZE 8192 + +#define SEGMENT_NAME_PREFIX "Global/PostgreSQL" + +/*------ + * Perform a low-level shared memory operation in a platform-specific way, + * as dictated by the selected implementation. Each implementation is + * required to implement the following primitives. + * + * DSM_OP_CREATE. Create a segment whose size is the request_size and + * map it. + * + * DSM_OP_ATTACH. Map the segment, whose size must be the request_size. + * + * DSM_OP_DETACH. Unmap the segment. + * + * DSM_OP_DESTROY. 
Unmap the segment, if it is mapped. Destroy the + * segment. + * + * Arguments: + * op: The operation to be performed. + * handle: The handle of an existing object, or for DSM_OP_CREATE, the + * a new handle the caller wants created. + * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0. + * impl_private: Private, implementation-specific data. Will be a pointer + * to NULL for the first operation on a shared memory segment within this + * backend; thereafter, it will point to the value to which it was set + * on the previous call. + * mapped_address: Pointer to start of current mapping; pointer to NULL + * if none. Updated with new mapping address. + * mapped_size: Pointer to size of current mapping; pointer to 0 if none. + * Updated with new mapped size. + * elevel: Level at which to log errors. + * + * Return value: true on success, false on failure. When false is returned, + * a message should first be logged at the specified elevel, except in the + * case where DSM_OP_CREATE experiences a name collision, which should + * silently return false. + *----- + */ +bool +dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + Assert(op == DSM_OP_CREATE || request_size == 0); + Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) || + (*mapped_address == NULL && *mapped_size == 0)); + + switch (dynamic_shared_memory_type) + { +#ifdef USE_DSM_POSIX + case DSM_IMPL_POSIX: + return dsm_impl_posix(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif +#ifdef USE_DSM_SYSV + case DSM_IMPL_SYSV: + return dsm_impl_sysv(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif +#ifdef USE_DSM_WINDOWS + case DSM_IMPL_WINDOWS: + return dsm_impl_windows(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif +#ifdef USE_DSM_MMAP + case DSM_IMPL_MMAP: + return dsm_impl_mmap(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif + default: + elog(ERROR, "unexpected dynamic shared memory type: %d", + dynamic_shared_memory_type); + return false; + } +} + +#ifdef USE_DSM_POSIX +/* + * Operating system primitives to support POSIX shared memory. + * + * POSIX shared memory segments are created and attached using shm_open() + * and shm_unlink(); other operations, such as sizing or mapping the + * segment, are performed as if the shared memory segments were files. + * + * Indeed, on some platforms, they may be implemented that way. While + * POSIX shared memory segments seem intended to exist in a flat namespace, + * some operating systems may implement them as files, even going so far + * to treat a request for /xyz as a request to create a file by that name + * in the root directory. Users of such broken platforms should select + * a different shared memory implementation. + */ +static bool +dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + char name[64]; + int flags; + int fd; + char *address; + + snprintf(name, 64, "/PostgreSQL.%u", handle); + + /* Handle teardown cases. 
*/ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + if (*mapped_address != NULL + && munmap(*mapped_address, *mapped_size) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = NULL; + *mapped_size = 0; + if (op == DSM_OP_DESTROY && shm_unlink(name) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + return true; + } + + /* + * Create new segment or open an existing one for attach. + * + * Even though we will close the FD before returning, it seems desirable + * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE + * failure. The fact that we won't hold the FD open long justifies using + * ReserveExternalFD rather than AcquireExternalFD, though. + */ + ReserveExternalFD(); + + flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0); + if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1) + { + ReleaseExternalFD(); + if (op == DSM_OP_ATTACH || errno != EEXIST) + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not open shared memory segment \"%s\": %m", + name))); + return false; + } + + /* + * If we're attaching the segment, determine the current size; if we are + * creating the segment, set the size to the requested value. + */ + if (op == DSM_OP_ATTACH) + { + struct stat st; + + if (fstat(fd, &st) != 0) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + close(fd); + ReleaseExternalFD(); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + request_size = st.st_size; + } + else if (dsm_impl_posix_resize(fd, request_size) != 0) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + close(fd); + ReleaseExternalFD(); + shm_unlink(name); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m", + name, request_size))); + return false; + } + + /* Map it. */ + address = mmap(NULL, request_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0); + if (address == MAP_FAILED) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + close(fd); + ReleaseExternalFD(); + if (op == DSM_OP_CREATE) + shm_unlink(name); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = address; + *mapped_size = request_size; + close(fd); + ReleaseExternalFD(); + + return true; +} + +/* + * Set the size of a virtual memory region associated with a file descriptor. + * If necessary, also ensure that virtual memory is actually allocated by the + * operating system, to avoid nasty surprises later. + * + * Returns non-zero if either truncation or allocation fails, and sets errno. + */ +static int +dsm_impl_posix_resize(int fd, off_t size) +{ + int rc; + int save_errno; + sigset_t save_sigmask; + + /* + * Block all blockable signals, except SIGQUIT. posix_fallocate() can run + * for quite a long time, and is an all-or-nothing operation. 
If we + * allowed SIGUSR1 to interrupt us repeatedly (for example, due to recovery + * conflicts), the retry loop might never succeed. + */ + if (IsUnderPostmaster) + sigprocmask(SIG_SETMASK, &BlockSig, &save_sigmask); + + /* Truncate (or extend) the file to the requested size. */ + do + { + rc = ftruncate(fd, size); + } while (rc < 0 && errno == EINTR); + + /* + * On Linux, a shm_open fd is backed by a tmpfs file. After resizing with + * ftruncate, the file may contain a hole. Accessing memory backed by a + * hole causes tmpfs to allocate pages, which fails with SIGBUS if there + * is no more tmpfs space available. So we ask tmpfs to allocate pages + * here, so we can fail gracefully with ENOSPC now rather than risking + * SIGBUS later. + */ +#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__) + if (rc == 0) + { + /* + * We still use a traditional EINTR retry loop to handle SIGCONT. + * posix_fallocate() doesn't restart automatically, and we don't want + * this to fail if you attach a debugger. + */ + pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE); + do + { + rc = posix_fallocate(fd, 0, size); + } while (rc == EINTR); + pgstat_report_wait_end(); + + /* + * The caller expects errno to be set, but posix_fallocate() doesn't + * set it. Instead it returns error numbers directly. So set errno, + * even though we'll also return rc to indicate success or failure. + */ + errno = rc; + } +#endif /* HAVE_POSIX_FALLOCATE && __linux__ */ + + if (IsUnderPostmaster) + { + save_errno = errno; + sigprocmask(SIG_SETMASK, &save_sigmask, NULL); + errno = save_errno; + } + + return rc; +} + +#endif /* USE_DSM_POSIX */ + +#ifdef USE_DSM_SYSV +/* + * Operating system primitives to support System V shared memory. + * + * System V shared memory segments are manipulated using shmget(), shmat(), + * shmdt(), and shmctl(). As the default allocation limits for System V + * shared memory are usually quite low, the POSIX facilities may be + * preferable; but those are not supported everywhere. + */ +static bool +dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + key_t key; + int ident; + char *address; + char name[64]; + int *ident_cache; + + /* + * POSIX shared memory and mmap-based shared memory identify segments with + * names. To avoid needless error message variation, we use the handle as + * the name. + */ + snprintf(name, 64, "%u", handle); + + /* + * The System V shared memory namespace is very restricted; names are of + * type key_t, which is expected to be some sort of integer data type, but + * not necessarily the same one as dsm_handle. Since we use dsm_handle to + * identify shared memory segments across processes, this might seem like + * a problem, but it's really not. If dsm_handle is bigger than key_t, + * the cast below might truncate away some bits from the handle the + * user-provided, but it'll truncate exactly the same bits away in exactly + * the same fashion every time we use that handle, which is all that + * really matters. Conversely, if dsm_handle is smaller than key_t, we + * won't use the full range of available key space, but that's no big deal + * either. + * + * We do make sure that the key isn't negative, because that might not be + * portable. + */ + key = (key_t) handle; + if (key < 1) /* avoid compiler warning if type is unsigned */ + key = -key; + + /* + * There's one special key, IPC_PRIVATE, which can't be used. 
If we end + * up with that value by chance during a create operation, just pretend it + * already exists, so that caller will retry. If we run into it anywhere + * else, the caller has passed a handle that doesn't correspond to + * anything we ever created, which should not happen. + */ + if (key == IPC_PRIVATE) + { + if (op != DSM_OP_CREATE) + elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE"); + errno = EEXIST; + return false; + } + + /* + * Before we can do anything with a shared memory segment, we have to map + * the shared memory key to a shared memory identifier using shmget(). To + * avoid repeated lookups, we store the key using impl_private. + */ + if (*impl_private != NULL) + { + ident_cache = *impl_private; + ident = *ident_cache; + } + else + { + int flags = IPCProtection; + size_t segsize; + + /* + * Allocate the memory BEFORE acquiring the resource, so that we don't + * leak the resource if memory allocation fails. + */ + ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int)); + + /* + * When using shmget to find an existing segment, we must pass the + * size as 0. Passing a non-zero size which is greater than the + * actual size will result in EINVAL. + */ + segsize = 0; + + if (op == DSM_OP_CREATE) + { + flags |= IPC_CREAT | IPC_EXCL; + segsize = request_size; + } + + if ((ident = shmget(key, segsize, flags)) == -1) + { + if (op == DSM_OP_ATTACH || errno != EEXIST) + { + int save_errno = errno; + + pfree(ident_cache); + errno = save_errno; + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not get shared memory segment: %m"))); + } + return false; + } + + *ident_cache = ident; + *impl_private = ident_cache; + } + + /* Handle teardown cases. */ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + pfree(ident_cache); + *impl_private = NULL; + if (*mapped_address != NULL && shmdt(*mapped_address) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = NULL; + *mapped_size = 0; + if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + return true; + } + + /* If we're attaching it, we must use IPC_STAT to determine the size. */ + if (op == DSM_OP_ATTACH) + { + struct shmid_ds shm; + + if (shmctl(ident, IPC_STAT, &shm) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + request_size = shm.shm_segsz; + } + + /* Map it. */ + address = shmat(ident, NULL, PG_SHMAT_FLAGS); + if (address == (void *) -1) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + if (op == DSM_OP_CREATE) + shmctl(ident, IPC_RMID, NULL); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = address; + *mapped_size = request_size; + + return true; +} +#endif + +#ifdef USE_DSM_WINDOWS +/* + * Operating system primitives to support Windows shared memory. + * + * Windows shared memory implementation is done using file mapping + * which can be backed by either physical file or system paging file. 
+ * Current implementation uses system paging file as other effects + * like performance are not clear for physical file and it is used in similar + * way for main shared memory in windows. + * + * A memory mapping object is a kernel object - they always get deleted when + * the last reference to them goes away, either explicitly via a CloseHandle or + * when the process containing the reference exits. + */ +static bool +dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel) +{ + char *address; + HANDLE hmap; + char name[64]; + MEMORY_BASIC_INFORMATION info; + + /* + * Storing the shared memory segment in the Global\ namespace, can allow + * any process running in any session to access that file mapping object + * provided that the caller has the required access rights. But to avoid + * issues faced in main shared memory, we are using the naming convention + * similar to main shared memory. We can change here once issue mentioned + * in GetSharedMemName is resolved. + */ + snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); + + /* + * Handle teardown cases. Since Windows automatically destroys the object + * when no references remain, we can treat it the same as detach. + */ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + if (*mapped_address != NULL + && UnmapViewOfFile(*mapped_address) == 0) + { + _dosmaperr(GetLastError()); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + if (*impl_private != NULL + && CloseHandle(*impl_private) == 0) + { + _dosmaperr(GetLastError()); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + + *impl_private = NULL; + *mapped_address = NULL; + *mapped_size = 0; + return true; + } + + /* Create new segment or open an existing one for attach. */ + if (op == DSM_OP_CREATE) + { + DWORD size_high; + DWORD size_low; + DWORD errcode; + + /* Shifts >= the width of the type are undefined. */ +#ifdef _WIN64 + size_high = request_size >> 32; +#else + size_high = 0; +#endif + size_low = (DWORD) request_size; + + /* CreateFileMapping might not clear the error code on success */ + SetLastError(0); + + hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */ + NULL, /* Default security attrs */ + PAGE_READWRITE, /* Memory is read/write */ + size_high, /* Upper 32 bits of size */ + size_low, /* Lower 32 bits of size */ + name); + + errcode = GetLastError(); + if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED) + { + /* + * On Windows, when the segment already exists, a handle for the + * existing segment is returned. We must close it before + * returning. However, if the existing segment is created by a + * service, then it returns ERROR_ACCESS_DENIED. We don't do + * _dosmaperr here, so errno won't be modified. 
+ */ + if (hmap) + CloseHandle(hmap); + return false; + } + + if (!hmap) + { + _dosmaperr(errcode); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not create shared memory segment \"%s\": %m", + name))); + return false; + } + } + else + { + hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ, + FALSE, /* do not inherit the name */ + name); /* name of mapping object */ + if (!hmap) + { + _dosmaperr(GetLastError()); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not open shared memory segment \"%s\": %m", + name))); + return false; + } + } + + /* Map it. */ + address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ, + 0, 0, 0); + if (!address) + { + int save_errno; + + _dosmaperr(GetLastError()); + /* Back out what's already been done. */ + save_errno = errno; + CloseHandle(hmap); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + + /* + * VirtualQuery gives size in page_size units, which is 4K for Windows. We + * need size only when we are attaching, but it's better to get the size + * when creating new segment to keep size consistent both for + * DSM_OP_CREATE and DSM_OP_ATTACH. + */ + if (VirtualQuery(address, &info, sizeof(info)) == 0) + { + int save_errno; + + _dosmaperr(GetLastError()); + /* Back out what's already been done. */ + save_errno = errno; + UnmapViewOfFile(address); + CloseHandle(hmap); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + + *mapped_address = address; + *mapped_size = info.RegionSize; + *impl_private = hmap; + + return true; +} +#endif + +#ifdef USE_DSM_MMAP +/* + * Operating system primitives to support mmap-based shared memory. + * + * Calling this "shared memory" is somewhat of a misnomer, because what + * we're really doing is creating a bunch of files and mapping them into + * our address space. The operating system may feel obliged to + * synchronize the contents to disk even if nothing is being paged out, + * which will not serve us well. The user can relocate the pg_dynshmem + * directory to a ramdisk to avoid this problem, if available. + */ +static bool +dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + char name[64]; + int flags; + int fd; + char *address; + + snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u", + handle); + + /* Handle teardown cases. */ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + if (*mapped_address != NULL + && munmap(*mapped_address, *mapped_size) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = NULL; + *mapped_size = 0; + if (op == DSM_OP_DESTROY && unlink(name) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + return true; + } + + /* Create new segment or open an existing one for attach. */ + flags = O_RDWR | (op == DSM_OP_CREATE ? 
O_CREAT | O_EXCL : 0); + if ((fd = OpenTransientFile(name, flags)) == -1) + { + if (op == DSM_OP_ATTACH || errno != EEXIST) + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not open shared memory segment \"%s\": %m", + name))); + return false; + } + + /* + * If we're attaching the segment, determine the current size; if we are + * creating the segment, set the size to the requested value. + */ + if (op == DSM_OP_ATTACH) + { + struct stat st; + + if (fstat(fd, &st) != 0) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + CloseTransientFile(fd); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + request_size = st.st_size; + } + else + { + /* + * Allocate a buffer full of zeros. + * + * Note: palloc zbuffer, instead of just using a local char array, to + * ensure it is reasonably well-aligned; this may save a few cycles + * transferring data to the kernel. + */ + char *zbuffer = (char *) palloc0(ZBUFFER_SIZE); + uint32 remaining = request_size; + bool success = true; + + /* + * Zero-fill the file. We have to do this the hard way to ensure that + * all the file space has really been allocated, so that we don't + * later seg fault when accessing the memory mapping. This is pretty + * pessimal. + */ + while (success && remaining > 0) + { + Size goal = remaining; + + if (goal > ZBUFFER_SIZE) + goal = ZBUFFER_SIZE; + pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE); + if (write(fd, zbuffer, goal) == goal) + remaining -= goal; + else + success = false; + pgstat_report_wait_end(); + } + + if (!success) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + CloseTransientFile(fd); + unlink(name); + errno = save_errno ? save_errno : ENOSPC; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m", + name, request_size))); + return false; + } + } + + /* Map it. */ + address = mmap(NULL, request_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0); + if (address == MAP_FAILED) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + CloseTransientFile(fd); + if (op == DSM_OP_CREATE) + unlink(name); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = address; + *mapped_size = request_size; + + if (CloseTransientFile(fd) != 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not close shared memory segment \"%s\": %m", + name))); + return false; + } + + return true; +} +#endif + +/* + * Implementation-specific actions that must be performed when a segment is to + * be preserved even when no backend has it attached. + * + * Except on Windows, we don't need to do anything at all. But since Windows + * cleans up segments automatically when no references remain, we duplicate + * the segment handle into the postmaster process. The postmaster needn't + * do anything to receive the handle; Windows transfers it automatically. 
+ */ +void +dsm_impl_pin_segment(dsm_handle handle, void *impl_private, + void **impl_private_pm_handle) +{ + switch (dynamic_shared_memory_type) + { +#ifdef USE_DSM_WINDOWS + case DSM_IMPL_WINDOWS: + { + HANDLE hmap; + + if (!DuplicateHandle(GetCurrentProcess(), impl_private, + PostmasterHandle, &hmap, 0, FALSE, + DUPLICATE_SAME_ACCESS)) + { + char name[64]; + + snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); + _dosmaperr(GetLastError()); + ereport(ERROR, + (errcode_for_dynamic_shared_memory(), + errmsg("could not duplicate handle for \"%s\": %m", + name))); + } + + /* + * Here, we remember the handle that we created in the + * postmaster process. This handle isn't actually usable in + * any process other than the postmaster, but that doesn't + * matter. We're just holding onto it so that, if the segment + * is unpinned, dsm_impl_unpin_segment can close it. + */ + *impl_private_pm_handle = hmap; + break; + } +#endif + default: + break; + } +} + +/* + * Implementation-specific actions that must be performed when a segment is no + * longer to be preserved, so that it will be cleaned up when all backends + * have detached from it. + * + * Except on Windows, we don't need to do anything at all. For Windows, we + * close the extra handle that dsm_impl_pin_segment created in the + * postmaster's process space. + */ +void +dsm_impl_unpin_segment(dsm_handle handle, void **impl_private) +{ + switch (dynamic_shared_memory_type) + { +#ifdef USE_DSM_WINDOWS + case DSM_IMPL_WINDOWS: + { + if (*impl_private && + !DuplicateHandle(PostmasterHandle, *impl_private, + NULL, NULL, 0, FALSE, + DUPLICATE_CLOSE_SOURCE)) + { + char name[64]; + + snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); + _dosmaperr(GetLastError()); + ereport(ERROR, + (errcode_for_dynamic_shared_memory(), + errmsg("could not duplicate handle for \"%s\": %m", + name))); + } + + *impl_private = NULL; + break; + } +#endif + default: + break; + } +} + +static int +errcode_for_dynamic_shared_memory(void) +{ + if (errno == EFBIG || errno == ENOMEM) + return errcode(ERRCODE_OUT_OF_MEMORY); + else + return errcode_for_file_access(); +} diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c new file mode 100644 index 0000000..4045d7d --- /dev/null +++ b/src/backend/storage/ipc/ipc.c @@ -0,0 +1,435 @@ +/*------------------------------------------------------------------------- + * + * ipc.c + * POSTGRES inter-process communication definitions. + * + * This file is misnamed, as it no longer has much of anything directly + * to do with IPC. The functionality here is concerned with managing + * exit-time cleanup for either a postmaster or a backend. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/ipc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "miscadmin.h" +#ifdef PROFILE_PID_DIR +#include "postmaster/autovacuum.h" +#endif +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "tcop/tcopprot.h" + + +/* + * This flag is set during proc_exit() to change ereport()'s behavior, + * so that an ereport() from an on_proc_exit routine cannot get us out + * of the exit procedure. We do NOT want to go back to the idle loop... + */ +bool proc_exit_inprogress = false; + +/* + * Set when shmem_exit() is in progress. 
+ */ +bool shmem_exit_inprogress = false; + +/* + * This flag tracks whether we've called atexit() in the current process + * (or in the parent postmaster). + */ +static bool atexit_callback_setup = false; + +/* local functions */ +static void proc_exit_prepare(int code); + + +/* ---------------------------------------------------------------- + * exit() handling stuff + * + * These functions are in generally the same spirit as atexit(), + * but provide some additional features we need --- in particular, + * we want to register callbacks to invoke when we are disconnecting + * from a broken shared-memory context but not exiting the postmaster. + * + * Callback functions can take zero, one, or two args: the first passed + * arg is the integer exitcode, the second is the Datum supplied when + * the callback was registered. + * ---------------------------------------------------------------- + */ + +#define MAX_ON_EXITS 20 + +struct ONEXIT +{ + pg_on_exit_callback function; + Datum arg; +}; + +static struct ONEXIT on_proc_exit_list[MAX_ON_EXITS]; +static struct ONEXIT on_shmem_exit_list[MAX_ON_EXITS]; +static struct ONEXIT before_shmem_exit_list[MAX_ON_EXITS]; + +static int on_proc_exit_index, + on_shmem_exit_index, + before_shmem_exit_index; + + +/* ---------------------------------------------------------------- + * proc_exit + * + * this function calls all the callbacks registered + * for it (to free resources) and then calls exit. + * + * This should be the only function to call exit(). + * -cim 2/6/90 + * + * Unfortunately, we can't really guarantee that add-on code + * obeys the rule of not calling exit() directly. So, while + * this is the preferred way out of the system, we also register + * an atexit callback that will make sure cleanup happens. + * ---------------------------------------------------------------- + */ +void +proc_exit(int code) +{ + /* Clean up everything that must be cleaned up */ + proc_exit_prepare(code); + +#ifdef PROFILE_PID_DIR + { + /* + * If we are profiling ourself then gprof's mcleanup() is about to + * write out a profile to ./gmon.out. Since mcleanup() always uses a + * fixed file name, each backend will overwrite earlier profiles. To + * fix that, we create a separate subdirectory for each backend + * (./gprof/pid) and 'cd' to that subdirectory before we exit() - that + * forces mcleanup() to write each profile into its own directory. We + * end up with something like: $PGDATA/gprof/8829/gmon.out + * $PGDATA/gprof/8845/gmon.out ... + * + * To avoid undesirable disk space bloat, autovacuum workers are + * discriminated against: all their gmon.out files go into the same + * subdirectory. Without this, an installation that is "just sitting + * there" nonetheless eats megabytes of disk space every few seconds. + * + * Note that we do this here instead of in an on_proc_exit() callback + * because we want to ensure that this code executes last - we don't + * want to interfere with any other on_proc_exit() callback. For the + * same reason, we do not include it in proc_exit_prepare ... so if + * you are exiting in the "wrong way" you won't drop your profile in a + * nice place. + */ + char gprofDirName[32]; + + if (IsAutoVacuumWorkerProcess()) + snprintf(gprofDirName, 32, "gprof/avworker"); + else + snprintf(gprofDirName, 32, "gprof/%d", (int) getpid()); + + /* + * Use mkdir() instead of MakePGDirectory() since we aren't making a + * PG directory here. 
+ */ + mkdir("gprof", S_IRWXU | S_IRWXG | S_IRWXO); + mkdir(gprofDirName, S_IRWXU | S_IRWXG | S_IRWXO); + chdir(gprofDirName); + } +#endif + + elog(DEBUG3, "exit(%d)", code); + + exit(code); +} + +/* + * Code shared between proc_exit and the atexit handler. Note that in + * normal exit through proc_exit, this will actually be called twice ... + * but the second call will have nothing to do. + */ +static void +proc_exit_prepare(int code) +{ + /* + * Once we set this flag, we are committed to exit. Any ereport() will + * NOT send control back to the main loop, but right back here. + */ + proc_exit_inprogress = true; + + /* + * Forget any pending cancel or die requests; we're doing our best to + * close up shop already. Note that the signal handlers will not set + * these flags again, now that proc_exit_inprogress is set. + */ + InterruptPending = false; + ProcDiePending = false; + QueryCancelPending = false; + InterruptHoldoffCount = 1; + CritSectionCount = 0; + + /* + * Also clear the error context stack, to prevent error callbacks from + * being invoked by any elog/ereport calls made during proc_exit. Whatever + * context they might want to offer is probably not relevant, and in any + * case they are likely to fail outright after we've done things like + * aborting any open transaction. (In normal exit scenarios the context + * stack should be empty anyway, but it might not be in the case of + * elog(FATAL) for example.) + */ + error_context_stack = NULL; + /* For the same reason, reset debug_query_string before it's clobbered */ + debug_query_string = NULL; + + /* do our shared memory exits first */ + shmem_exit(code); + + elog(DEBUG3, "proc_exit(%d): %d callbacks to make", + code, on_proc_exit_index); + + /* + * call all the registered callbacks. + * + * Note that since we decrement on_proc_exit_index each time, if a + * callback calls ereport(ERROR) or ereport(FATAL) then it won't be + * invoked again when control comes back here (nor will the + * previously-completed callbacks). So, an infinite loop should not be + * possible. + */ + while (--on_proc_exit_index >= 0) + on_proc_exit_list[on_proc_exit_index].function(code, + on_proc_exit_list[on_proc_exit_index].arg); + + on_proc_exit_index = 0; +} + +/* ------------------ + * Run all of the on_shmem_exit routines --- but don't actually exit. + * This is used by the postmaster to re-initialize shared memory and + * semaphores after a backend dies horribly. As with proc_exit(), we + * remove each callback from the list before calling it, to avoid + * infinite loop in case of error. + * ------------------ + */ +void +shmem_exit(int code) +{ + shmem_exit_inprogress = true; + + /* + * Call before_shmem_exit callbacks. + * + * These should be things that need most of the system to still be up and + * working, such as cleanup of temp relations, which requires catalog + * access; or things that need to be completed because later cleanup steps + * depend on them, such as releasing lwlocks. + */ + elog(DEBUG3, "shmem_exit(%d): %d before_shmem_exit callbacks to make", + code, before_shmem_exit_index); + while (--before_shmem_exit_index >= 0) + before_shmem_exit_list[before_shmem_exit_index].function(code, + before_shmem_exit_list[before_shmem_exit_index].arg); + before_shmem_exit_index = 0; + + /* + * Call dynamic shared memory callbacks. + * + * These serve the same purpose as late callbacks, but for dynamic shared + * memory segments rather than the main shared memory segment. 
+ * dsm_backend_shutdown() has the same kind of progressive logic we use + * for the main shared memory segment; namely, it unregisters each + * callback before invoking it, so that we don't get stuck in an infinite + * loop if one of those callbacks itself throws an ERROR or FATAL. + * + * Note that explicitly calling this function here is quite different from + * registering it as an on_shmem_exit callback for precisely this reason: + * if one dynamic shared memory callback errors out, the remaining + * callbacks will still be invoked. Thus, hard-coding this call puts it + * equal footing with callbacks for the main shared memory segment. + */ + dsm_backend_shutdown(); + + /* + * Call on_shmem_exit callbacks. + * + * These are generally releasing low-level shared memory resources. In + * some cases, this is a backstop against the possibility that the early + * callbacks might themselves fail, leading to re-entry to this routine; + * in other cases, it's cleanup that only happens at process exit. + */ + elog(DEBUG3, "shmem_exit(%d): %d on_shmem_exit callbacks to make", + code, on_shmem_exit_index); + while (--on_shmem_exit_index >= 0) + on_shmem_exit_list[on_shmem_exit_index].function(code, + on_shmem_exit_list[on_shmem_exit_index].arg); + on_shmem_exit_index = 0; + + shmem_exit_inprogress = false; +} + +/* ---------------------------------------------------------------- + * atexit_callback + * + * Backstop to ensure that direct calls of exit() don't mess us up. + * + * Somebody who was being really uncooperative could call _exit(), + * but for that case we have a "dead man switch" that will make the + * postmaster treat it as a crash --- see pmsignal.c. + * ---------------------------------------------------------------- + */ +static void +atexit_callback(void) +{ + /* Clean up everything that must be cleaned up */ + /* ... too bad we don't know the real exit code ... */ + proc_exit_prepare(-1); +} + +/* ---------------------------------------------------------------- + * on_proc_exit + * + * this function adds a callback function to the list of + * functions invoked by proc_exit(). -cim 2/6/90 + * ---------------------------------------------------------------- + */ +void +on_proc_exit(pg_on_exit_callback function, Datum arg) +{ + if (on_proc_exit_index >= MAX_ON_EXITS) + ereport(FATAL, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_internal("out of on_proc_exit slots"))); + + on_proc_exit_list[on_proc_exit_index].function = function; + on_proc_exit_list[on_proc_exit_index].arg = arg; + + ++on_proc_exit_index; + + if (!atexit_callback_setup) + { + atexit(atexit_callback); + atexit_callback_setup = true; + } +} + +/* ---------------------------------------------------------------- + * before_shmem_exit + * + * Register early callback to perform user-level cleanup, + * e.g. transaction abort, before we begin shutting down + * low-level subsystems. 
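+ *
+ * For example (an illustrative pattern only; the callback and state
+ * names are hypothetical), a caller needing transient cleanup would
+ * typically register and later cancel a callback in LIFO fashion:
+ *
+ * before_shmem_exit(my_cleanup, PointerGetDatum(state));
+ * ...
+ * cancel_before_shmem_exit(my_cleanup, PointerGetDatum(state));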
+ * ---------------------------------------------------------------- + */ +void +before_shmem_exit(pg_on_exit_callback function, Datum arg) +{ + if (before_shmem_exit_index >= MAX_ON_EXITS) + ereport(FATAL, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_internal("out of before_shmem_exit slots"))); + + before_shmem_exit_list[before_shmem_exit_index].function = function; + before_shmem_exit_list[before_shmem_exit_index].arg = arg; + + ++before_shmem_exit_index; + + if (!atexit_callback_setup) + { + atexit(atexit_callback); + atexit_callback_setup = true; + } +} + +/* ---------------------------------------------------------------- + * on_shmem_exit + * + * Register ordinary callback to perform low-level shutdown + * (e.g. releasing our PGPROC); run after before_shmem_exit + * callbacks and before on_proc_exit callbacks. + * ---------------------------------------------------------------- + */ +void +on_shmem_exit(pg_on_exit_callback function, Datum arg) +{ + if (on_shmem_exit_index >= MAX_ON_EXITS) + ereport(FATAL, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_internal("out of on_shmem_exit slots"))); + + on_shmem_exit_list[on_shmem_exit_index].function = function; + on_shmem_exit_list[on_shmem_exit_index].arg = arg; + + ++on_shmem_exit_index; + + if (!atexit_callback_setup) + { + atexit(atexit_callback); + atexit_callback_setup = true; + } +} + +/* ---------------------------------------------------------------- + * cancel_before_shmem_exit + * + * this function removes a previously-registered before_shmem_exit + * callback. We only look at the latest entry for removal, as we + * expect callers to add and remove temporary before_shmem_exit + * callbacks in strict LIFO order. + * ---------------------------------------------------------------- + */ +void +cancel_before_shmem_exit(pg_on_exit_callback function, Datum arg) +{ + if (before_shmem_exit_index > 0 && + before_shmem_exit_list[before_shmem_exit_index - 1].function + == function && + before_shmem_exit_list[before_shmem_exit_index - 1].arg == arg) + --before_shmem_exit_index; + else + elog(ERROR, "before_shmem_exit callback (%p,0x%llx) is not the latest entry", + function, (long long) arg); +} + +/* ---------------------------------------------------------------- + * on_exit_reset + * + * this function clears all on_proc_exit() and on_shmem_exit() + * registered functions. This is used just after forking a backend, + * so that the backend doesn't believe it should call the postmaster's + * on-exit routines when it exits... + * ---------------------------------------------------------------- + */ +void +on_exit_reset(void) +{ + before_shmem_exit_index = 0; + on_shmem_exit_index = 0; + on_proc_exit_index = 0; + reset_on_dsm_detach(); +} + +/* ---------------------------------------------------------------- + * check_on_shmem_exit_lists_are_empty + * + * Debugging check that no shmem cleanup handlers have been registered + * prematurely in the current process. 
+ * ---------------------------------------------------------------- + */ +void +check_on_shmem_exit_lists_are_empty(void) +{ + if (before_shmem_exit_index) + elog(FATAL, "before_shmem_exit has been called prematurely"); + if (on_shmem_exit_index) + elog(FATAL, "on_shmem_exit has been called prematurely"); + /* Checking DSM detach state seems unnecessary given the above */ +} diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c new file mode 100644 index 0000000..3e4ec53 --- /dev/null +++ b/src/backend/storage/ipc/ipci.c @@ -0,0 +1,291 @@ +/*------------------------------------------------------------------------- + * + * ipci.c + * POSTGRES inter-process communication initialization code. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/ipci.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/clog.h" +#include "access/commit_ts.h" +#include "access/heapam.h" +#include "access/multixact.h" +#include "access/nbtree.h" +#include "access/subtrans.h" +#include "access/syncscan.h" +#include "access/twophase.h" +#include "commands/async.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/autovacuum.h" +#include "postmaster/bgworker_internals.h" +#include "postmaster/bgwriter.h" +#include "postmaster/postmaster.h" +#include "replication/logicallauncher.h" +#include "replication/origin.h" +#include "replication/slot.h" +#include "replication/walreceiver.h" +#include "replication/walsender.h" +#include "storage/bufmgr.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/pg_shmem.h" +#include "storage/pmsignal.h" +#include "storage/predicate.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/procsignal.h" +#include "storage/sinvaladt.h" +#include "storage/spin.h" +#include "utils/snapmgr.h" + +/* GUCs */ +int shared_memory_type = DEFAULT_SHARED_MEMORY_TYPE; + +shmem_startup_hook_type shmem_startup_hook = NULL; + +static Size total_addin_request = 0; +static bool addin_request_allowed = true; + + +/* + * RequestAddinShmemSpace + * Request that extra shmem space be allocated for use by + * a loadable module. + * + * This is only useful if called from the _PG_init hook of a library that + * is loaded into the postmaster via shared_preload_libraries. Once + * shared memory has been allocated, calls will be ignored. (We could + * raise an error, but it seems better to make it a no-op, so that + * libraries containing such calls can be reloaded if needed.) + */ +void +RequestAddinShmemSpace(Size size) +{ + if (IsUnderPostmaster || !addin_request_allowed) + return; /* too late */ + total_addin_request = add_size(total_addin_request, size); +} + + +/* + * CreateSharedMemoryAndSemaphores + * Creates and initializes shared memory and semaphores. + * + * This is called by the postmaster or by a standalone backend. + * It is also called by a backend forked from the postmaster in the + * EXEC_BACKEND case. In the latter case, the shared memory segment + * already exists and has been physically attached to, but we have to + * initialize pointers in local memory that reference the shared structures, + * because we didn't inherit the correct pointer values from the postmaster + * as we do in the fork() scenario. The easiest way to do that is to run + * through the same code as before. 
(Note that the called routines mostly + * check IsUnderPostmaster, rather than EXEC_BACKEND, to detect this case. + * This is a bit code-wasteful and could be cleaned up.) + */ +void +CreateSharedMemoryAndSemaphores(void) +{ + PGShmemHeader *shim = NULL; + + if (!IsUnderPostmaster) + { + PGShmemHeader *seghdr; + Size size; + int numSemas; + + /* Compute number of semaphores we'll need */ + numSemas = ProcGlobalSemas(); + numSemas += SpinlockSemas(); + + /* + * Size of the Postgres shared-memory block is estimated via + * moderately-accurate estimates for the big hogs, plus 100K for the + * stuff that's too small to bother with estimating. + * + * We take some care during this phase to ensure that the total size + * request doesn't overflow size_t. If this gets through, we don't + * need to be so careful during the actual allocation phase. + */ + size = 100000; + size = add_size(size, PGSemaphoreShmemSize(numSemas)); + size = add_size(size, SpinlockSemaSize()); + size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE, + sizeof(ShmemIndexEnt))); + size = add_size(size, dsm_estimate_size()); + size = add_size(size, BufferShmemSize()); + size = add_size(size, LockShmemSize()); + size = add_size(size, PredicateLockShmemSize()); + size = add_size(size, ProcGlobalShmemSize()); + size = add_size(size, XLOGShmemSize()); + size = add_size(size, CLOGShmemSize()); + size = add_size(size, CommitTsShmemSize()); + size = add_size(size, SUBTRANSShmemSize()); + size = add_size(size, TwoPhaseShmemSize()); + size = add_size(size, BackgroundWorkerShmemSize()); + size = add_size(size, MultiXactShmemSize()); + size = add_size(size, LWLockShmemSize()); + size = add_size(size, ProcArrayShmemSize()); + size = add_size(size, BackendStatusShmemSize()); + size = add_size(size, SInvalShmemSize()); + size = add_size(size, PMSignalShmemSize()); + size = add_size(size, ProcSignalShmemSize()); + size = add_size(size, CheckpointerShmemSize()); + size = add_size(size, AutoVacuumShmemSize()); + size = add_size(size, ReplicationSlotsShmemSize()); + size = add_size(size, ReplicationOriginShmemSize()); + size = add_size(size, WalSndShmemSize()); + size = add_size(size, WalRcvShmemSize()); + size = add_size(size, PgArchShmemSize()); + size = add_size(size, ApplyLauncherShmemSize()); + size = add_size(size, SnapMgrShmemSize()); + size = add_size(size, BTreeShmemSize()); + size = add_size(size, SyncScanShmemSize()); + size = add_size(size, AsyncShmemSize()); +#ifdef EXEC_BACKEND + size = add_size(size, ShmemBackendArraySize()); +#endif + + /* freeze the addin request size and include it */ + addin_request_allowed = false; + size = add_size(size, total_addin_request); + + /* might as well round it off to a multiple of a typical page size */ + size = add_size(size, 8192 - (size % 8192)); + + elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size); + + /* + * Create the shmem segment + */ + seghdr = PGSharedMemoryCreate(size, &shim); + + InitShmemAccess(seghdr); + + /* + * Create semaphores + */ + PGReserveSemaphores(numSemas); + + /* + * If spinlocks are disabled, initialize emulation layer (which + * depends on semaphores, so the order is important here). + */ +#ifndef HAVE_SPINLOCKS + SpinlockSemaInit(); +#endif + } + else + { + /* + * We are reattaching to an existing shared memory segment. This + * should only be reached in the EXEC_BACKEND case. 
+ */ +#ifndef EXEC_BACKEND + elog(PANIC, "should be attached to shared memory already"); +#endif + } + + /* + * Set up shared memory allocation mechanism + */ + if (!IsUnderPostmaster) + InitShmemAllocation(); + + /* + * Now initialize LWLocks, which do shared memory allocation and are + * needed for InitShmemIndex. + */ + CreateLWLocks(); + + /* + * Set up shmem.c index hashtable + */ + InitShmemIndex(); + + dsm_shmem_init(); + + /* + * Set up xlog, clog, and buffers + */ + XLOGShmemInit(); + CLOGShmemInit(); + CommitTsShmemInit(); + SUBTRANSShmemInit(); + MultiXactShmemInit(); + InitBufferPool(); + + /* + * Set up lock manager + */ + InitLocks(); + + /* + * Set up predicate lock manager + */ + InitPredicateLocks(); + + /* + * Set up process table + */ + if (!IsUnderPostmaster) + InitProcGlobal(); + CreateSharedProcArray(); + CreateSharedBackendStatus(); + TwoPhaseShmemInit(); + BackgroundWorkerShmemInit(); + + /* + * Set up shared-inval messaging + */ + CreateSharedInvalidationState(); + + /* + * Set up interprocess signaling mechanisms + */ + PMSignalShmemInit(); + ProcSignalShmemInit(); + CheckpointerShmemInit(); + AutoVacuumShmemInit(); + ReplicationSlotsShmemInit(); + ReplicationOriginShmemInit(); + WalSndShmemInit(); + WalRcvShmemInit(); + PgArchShmemInit(); + ApplyLauncherShmemInit(); + + /* + * Set up other modules that need some shared memory space + */ + SnapMgrInit(); + BTreeShmemInit(); + SyncScanShmemInit(); + AsyncShmemInit(); + +#ifdef EXEC_BACKEND + + /* + * Alloc the win32 shared backend array + */ + if (!IsUnderPostmaster) + ShmemBackendArrayAllocation(); +#endif + + /* Initialize dynamic shared memory facilities. */ + if (!IsUnderPostmaster) + dsm_postmaster_startup(shim); + + /* + * Now give loadable modules a chance to set up their shmem allocations + */ + if (shmem_startup_hook) + shmem_startup_hook(); +} diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c new file mode 100644 index 0000000..3427bcf --- /dev/null +++ b/src/backend/storage/ipc/latch.c @@ -0,0 +1,2158 @@ +/*------------------------------------------------------------------------- + * + * latch.c + * Routines for inter-process latches + * + * The poll() implementation uses the so-called self-pipe trick to overcome the + * race condition involved with poll() and setting a global flag in the signal + * handler. When a latch is set and the current process is waiting for it, the + * signal handler wakes up the poll() in WaitLatch by writing a byte to a pipe. + * A signal by itself doesn't interrupt poll() on all platforms, and even on + * platforms where it does, a signal that arrives just before the poll() call + * does not prevent poll() from entering sleep. An incoming byte on a pipe + * however reliably interrupts the sleep, and causes poll() to return + * immediately even if the signal arrives before poll() begins. + * + * The epoll() implementation overcomes the race with a different technique: it + * keeps SIGURG blocked and consumes from a signalfd() descriptor instead. We + * don't need to register a signal handler or create our own self-pipe. We + * assume that any system that has Linux epoll() also has Linux signalfd(). + * + * The kqueue() implementation waits for SIGURG with EVFILT_SIGNAL. + * + * The Windows implementation uses Windows events that are inherited by all + * postmaster child processes. There's no need for the self-pipe trick there. 
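+ *
+ * As a rough sketch of the self-pipe technique (illustrative only; the
+ * actual handler and drain logic below also preserve errno and cope with
+ * EINTR and a full pipe buffer), the SIGURG handler does little more than
+ *
+ * if (waiting)
+ * write(selfpipe_writefd, "", 1);
+ *
+ * while the wait loop includes selfpipe_readfd among the descriptors it
+ * polls and drains it whenever it becomes readable.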
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/ipc/latch.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <fcntl.h> +#include <limits.h> +#include <signal.h> +#include <unistd.h> +#ifdef HAVE_SYS_EPOLL_H +#include <sys/epoll.h> +#endif +#ifdef HAVE_SYS_EVENT_H +#include <sys/event.h> +#endif +#ifdef HAVE_SYS_SIGNALFD_H +#include <sys/signalfd.h> +#endif +#ifdef HAVE_POLL_H +#include <poll.h> +#endif + +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "portability/instr_time.h" +#include "postmaster/postmaster.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/pmsignal.h" +#include "storage/shmem.h" +#include "utils/memutils.h" + +/* + * Select the fd readiness primitive to use. Normally the "most modern" + * primitive supported by the OS will be used, but for testing it can be + * useful to manually specify the used primitive. If desired, just add a + * define somewhere before this block. + */ +#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \ + defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_WIN32) +/* don't overwrite manual choice */ +#elif defined(HAVE_SYS_EPOLL_H) +#define WAIT_USE_EPOLL +#elif defined(HAVE_KQUEUE) +#define WAIT_USE_KQUEUE +#elif defined(HAVE_POLL) +#define WAIT_USE_POLL +#elif WIN32 +#define WAIT_USE_WIN32 +#else +#error "no wait set implementation available" +#endif + +/* + * By default, we use a self-pipe with poll() and a signalfd with epoll(), if + * available. We avoid signalfd on illumos for now based on problem reports. + * For testing the choice can also be manually specified. + */ +#if defined(WAIT_USE_POLL) || defined(WAIT_USE_EPOLL) +#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD) +/* don't overwrite manual choice */ +#elif defined(WAIT_USE_EPOLL) && defined(HAVE_SYS_SIGNALFD_H) && \ + !defined(__illumos__) +#define WAIT_USE_SIGNALFD +#else +#define WAIT_USE_SELF_PIPE +#endif +#endif + +/* typedef in latch.h */ +struct WaitEventSet +{ + int nevents; /* number of registered events */ + int nevents_space; /* maximum number of events in this set */ + + /* + * Array, of nevents_space length, storing the definition of events this + * set is waiting for. + */ + WaitEvent *events; + + /* + * If WL_LATCH_SET is specified in any wait event, latch is a pointer to + * said latch, and latch_pos the offset in the ->events array. This is + * useful because we check the state of the latch before performing doing + * syscalls related to waiting. + */ + Latch *latch; + int latch_pos; + + /* + * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag + * is set so that we'll exit immediately if postmaster death is detected, + * instead of returning. 
+ */ + bool exit_on_postmaster_death; + +#if defined(WAIT_USE_EPOLL) + int epoll_fd; + /* epoll_wait returns events in a user provided arrays, allocate once */ + struct epoll_event *epoll_ret_events; +#elif defined(WAIT_USE_KQUEUE) + int kqueue_fd; + /* kevent returns events in a user provided arrays, allocate once */ + struct kevent *kqueue_ret_events; + bool report_postmaster_not_running; +#elif defined(WAIT_USE_POLL) + /* poll expects events to be waited on every poll() call, prepare once */ + struct pollfd *pollfds; +#elif defined(WAIT_USE_WIN32) + + /* + * Array of windows events. The first element always contains + * pgwin32_signal_event, so the remaining elements are offset by one (i.e. + * event->pos + 1). + */ + HANDLE *handles; +#endif +}; + +/* A common WaitEventSet used to implement WatchLatch() */ +static WaitEventSet *LatchWaitSet; + +/* The position of the latch in LatchWaitSet. */ +#define LatchWaitSetLatchPos 0 + +#ifndef WIN32 +/* Are we currently in WaitLatch? The signal handler would like to know. */ +static volatile sig_atomic_t waiting = false; +#endif + +#ifdef WAIT_USE_SIGNALFD +/* On Linux, we'll receive SIGURG via a signalfd file descriptor. */ +static int signal_fd = -1; +#endif + +#ifdef WAIT_USE_SELF_PIPE +/* Read and write ends of the self-pipe */ +static int selfpipe_readfd = -1; +static int selfpipe_writefd = -1; + +/* Process owning the self-pipe --- needed for checking purposes */ +static int selfpipe_owner_pid = 0; + +/* Private function prototypes */ +static void latch_sigurg_handler(SIGNAL_ARGS); +static void sendSelfPipeByte(void); +#endif + +#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD) +static void drain(void); +#endif + +#if defined(WAIT_USE_EPOLL) +static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action); +#elif defined(WAIT_USE_KQUEUE) +static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events); +#elif defined(WAIT_USE_POLL) +static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event); +#elif defined(WAIT_USE_WIN32) +static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event); +#endif + +static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents); + +/* + * Initialize the process-local latch infrastructure. + * + * This must be called once during startup of any process that can wait on + * latches, before it issues any InitLatch() or OwnLatch() calls. + */ +void +InitializeLatchSupport(void) +{ +#if defined(WAIT_USE_SELF_PIPE) + int pipefd[2]; + + if (IsUnderPostmaster) + { + /* + * We might have inherited connections to a self-pipe created by the + * postmaster. It's critical that child processes create their own + * self-pipes, of course, and we really want them to close the + * inherited FDs for safety's sake. + */ + if (selfpipe_owner_pid != 0) + { + /* Assert we go through here but once in a child process */ + Assert(selfpipe_owner_pid != MyProcPid); + /* Release postmaster's pipe FDs; ignore any error */ + (void) close(selfpipe_readfd); + (void) close(selfpipe_writefd); + /* Clean up, just for safety's sake; we'll set these below */ + selfpipe_readfd = selfpipe_writefd = -1; + selfpipe_owner_pid = 0; + /* Keep fd.c's accounting straight */ + ReleaseExternalFD(); + ReleaseExternalFD(); + } + else + { + /* + * Postmaster didn't create a self-pipe ... 
or else we're in an + * EXEC_BACKEND build, in which case it doesn't matter since the + * postmaster's pipe FDs were closed by the action of FD_CLOEXEC. + * fd.c won't have state to clean up, either. + */ + Assert(selfpipe_readfd == -1); + } + } + else + { + /* In postmaster or standalone backend, assert we do this but once */ + Assert(selfpipe_readfd == -1); + Assert(selfpipe_owner_pid == 0); + } + + /* + * Set up the self-pipe that allows a signal handler to wake up the + * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so + * that SetLatch won't block if the event has already been set many times + * filling the kernel buffer. Make the read-end non-blocking too, so that + * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK. + * Also, make both FDs close-on-exec, since we surely do not want any + * child processes messing with them. + */ + if (pipe(pipefd) < 0) + elog(FATAL, "pipe() failed: %m"); + if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1) + elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m"); + if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1) + elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m"); + if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1) + elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m"); + if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1) + elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m"); + + selfpipe_readfd = pipefd[0]; + selfpipe_writefd = pipefd[1]; + selfpipe_owner_pid = MyProcPid; + + /* Tell fd.c about these two long-lived FDs */ + ReserveExternalFD(); + ReserveExternalFD(); + + pqsignal(SIGURG, latch_sigurg_handler); +#endif + +#ifdef WAIT_USE_SIGNALFD + sigset_t signalfd_mask; + + /* Block SIGURG, because we'll receive it through a signalfd. */ + sigaddset(&UnBlockSig, SIGURG); + + /* Set up the signalfd to receive SIGURG notifications. */ + sigemptyset(&signalfd_mask); + sigaddset(&signalfd_mask, SIGURG); + signal_fd = signalfd(-1, &signalfd_mask, SFD_NONBLOCK | SFD_CLOEXEC); + if (signal_fd < 0) + elog(FATAL, "signalfd() failed"); + ReserveExternalFD(); +#endif + +#ifdef WAIT_USE_KQUEUE + /* Ignore SIGURG, because we'll receive it via kqueue. */ + pqsignal(SIGURG, SIG_IGN); +#endif +} + +void +InitializeLatchWaitSet(void) +{ + int latch_pos PG_USED_FOR_ASSERTS_ONLY; + + Assert(LatchWaitSet == NULL); + + /* Set up the WaitEventSet used by WaitLatch(). */ + LatchWaitSet = CreateWaitEventSet(TopMemoryContext, 2); + latch_pos = AddWaitEventToSet(LatchWaitSet, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + if (IsUnderPostmaster) + AddWaitEventToSet(LatchWaitSet, WL_EXIT_ON_PM_DEATH, + PGINVALID_SOCKET, NULL, NULL); + + Assert(latch_pos == LatchWaitSetLatchPos); +} + +void +ShutdownLatchSupport(void) +{ +#if defined(WAIT_USE_POLL) + pqsignal(SIGURG, SIG_IGN); +#endif + + if (LatchWaitSet) + { + FreeWaitEventSet(LatchWaitSet); + LatchWaitSet = NULL; + } + +#if defined(WAIT_USE_SELF_PIPE) + close(selfpipe_readfd); + close(selfpipe_writefd); + selfpipe_readfd = -1; + selfpipe_writefd = -1; + selfpipe_owner_pid = InvalidPid; +#endif + +#if defined(WAIT_USE_SIGNALFD) + close(signal_fd); + signal_fd = -1; +#endif +} + +/* + * Initialize a process-local latch. 
+ */ +void +InitLatch(Latch *latch) +{ + latch->is_set = false; + latch->maybe_sleeping = false; + latch->owner_pid = MyProcPid; + latch->is_shared = false; + +#if defined(WAIT_USE_SELF_PIPE) + /* Assert InitializeLatchSupport has been called in this process */ + Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid); +#elif defined(WAIT_USE_SIGNALFD) + /* Assert InitializeLatchSupport has been called in this process */ + Assert(signal_fd >= 0); +#elif defined(WAIT_USE_WIN32) + latch->event = CreateEvent(NULL, TRUE, FALSE, NULL); + if (latch->event == NULL) + elog(ERROR, "CreateEvent failed: error code %lu", GetLastError()); +#endif /* WIN32 */ +} + +/* + * Initialize a shared latch that can be set from other processes. The latch + * is initially owned by no-one; use OwnLatch to associate it with the + * current process. + * + * InitSharedLatch needs to be called in postmaster before forking child + * processes, usually right after allocating the shared memory block + * containing the latch with ShmemInitStruct. (The Unix implementation + * doesn't actually require that, but the Windows one does.) Because of + * this restriction, we have no concurrency issues to worry about here. + * + * Note that other handles created in this module are never marked as + * inheritable. Thus we do not need to worry about cleaning up child + * process references to postmaster-private latches or WaitEventSets. + */ +void +InitSharedLatch(Latch *latch) +{ +#ifdef WIN32 + SECURITY_ATTRIBUTES sa; + + /* + * Set up security attributes to specify that the events are inherited. + */ + ZeroMemory(&sa, sizeof(sa)); + sa.nLength = sizeof(sa); + sa.bInheritHandle = TRUE; + + latch->event = CreateEvent(&sa, TRUE, FALSE, NULL); + if (latch->event == NULL) + elog(ERROR, "CreateEvent failed: error code %lu", GetLastError()); +#endif + + latch->is_set = false; + latch->maybe_sleeping = false; + latch->owner_pid = 0; + latch->is_shared = true; +} + +/* + * Associate a shared latch with the current process, allowing it to + * wait on the latch. + * + * Although there is a sanity check for latch-already-owned, we don't do + * any sort of locking here, meaning that we could fail to detect the error + * if two processes try to own the same latch at about the same time. If + * there is any risk of that, caller must provide an interlock to prevent it. + */ +void +OwnLatch(Latch *latch) +{ + /* Sanity checks */ + Assert(latch->is_shared); + +#if defined(WAIT_USE_SELF_PIPE) + /* Assert InitializeLatchSupport has been called in this process */ + Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid); +#elif defined(WAIT_USE_SIGNALFD) + /* Assert InitializeLatchSupport has been called in this process */ + Assert(signal_fd >= 0); +#endif + + if (latch->owner_pid != 0) + elog(ERROR, "latch already owned"); + + latch->owner_pid = MyProcPid; +} + +/* + * Disown a shared latch currently owned by the current process. + */ +void +DisownLatch(Latch *latch) +{ + Assert(latch->is_shared); + Assert(latch->owner_pid == MyProcPid); + + latch->owner_pid = 0; +} + +/* + * Wait for a given latch to be set, or for postmaster death, or until timeout + * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events + * to wait for. If the latch is already set (and WL_LATCH_SET is given), the + * function returns immediately. + * + * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag + * is given. 
Although it is declared as "long", we don't actually support + * timeouts longer than INT_MAX milliseconds. Note that some extra overhead + * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible. + * + * The latch must be owned by the current process, ie. it must be a + * process-local latch initialized with InitLatch, or a shared latch + * associated with the current process by calling OwnLatch. + * + * Returns bit mask indicating which condition(s) caused the wake-up. Note + * that if multiple wake-up conditions are true, there is no guarantee that + * we return all of them in one call, but we will return at least one. + */ +int +WaitLatch(Latch *latch, int wakeEvents, long timeout, + uint32 wait_event_info) +{ + WaitEvent event; + + /* Postmaster-managed callers must handle postmaster death somehow. */ + Assert(!IsUnderPostmaster || + (wakeEvents & WL_EXIT_ON_PM_DEATH) || + (wakeEvents & WL_POSTMASTER_DEATH)); + + /* + * Some callers may have a latch other than MyLatch, or no latch at all, + * or want to handle postmaster death differently. It's cheap to assign + * those, so just do it every time. + */ + if (!(wakeEvents & WL_LATCH_SET)) + latch = NULL; + ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch); + LatchWaitSet->exit_on_postmaster_death = + ((wakeEvents & WL_EXIT_ON_PM_DEATH) != 0); + + if (WaitEventSetWait(LatchWaitSet, + (wakeEvents & WL_TIMEOUT) ? timeout : -1, + &event, 1, + wait_event_info) == 0) + return WL_TIMEOUT; + else + return event.events; +} + +/* + * Like WaitLatch, but with an extra socket argument for WL_SOCKET_* + * conditions. + * + * When waiting on a socket, EOF and error conditions always cause the socket + * to be reported as readable/writable/connected, so that the caller can deal + * with the condition. + * + * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit + * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the + * return value if the postmaster dies. The latter is useful for rare cases + * where some behavior other than immediate exit is needed. + * + * NB: These days this is just a wrapper around the WaitEventSet API. When + * using a latch very frequently, consider creating a longer living + * WaitEventSet instead; that's more efficient. + */ +int +WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock, + long timeout, uint32 wait_event_info) +{ + int ret = 0; + int rc; + WaitEvent event; + WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3); + + if (wakeEvents & WL_TIMEOUT) + Assert(timeout >= 0); + else + timeout = -1; + + if (wakeEvents & WL_LATCH_SET) + AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, + latch, NULL); + + /* Postmaster-managed callers must handle postmaster death somehow. 
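+	 * A typical call therefore passes something like
+	 * WL_LATCH_SET | WL_SOCKET_READABLE | WL_EXIT_ON_PM_DEATH in wakeEvents.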
*/ + Assert(!IsUnderPostmaster || + (wakeEvents & WL_EXIT_ON_PM_DEATH) || + (wakeEvents & WL_POSTMASTER_DEATH)); + + if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster) + AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, + NULL, NULL); + + if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster) + AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + + if (wakeEvents & WL_SOCKET_MASK) + { + int ev; + + ev = wakeEvents & WL_SOCKET_MASK; + AddWaitEventToSet(set, ev, sock, NULL, NULL); + } + + rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info); + + if (rc == 0) + ret |= WL_TIMEOUT; + else + { + ret |= event.events & (WL_LATCH_SET | + WL_POSTMASTER_DEATH | + WL_SOCKET_MASK); + } + + FreeWaitEventSet(set); + + return ret; +} + +/* + * Sets a latch and wakes up anyone waiting on it. + * + * This is cheap if the latch is already set, otherwise not so much. + * + * NB: when calling this in a signal handler, be sure to save and restore + * errno around it. (That's standard practice in most signal handlers, of + * course, but we used to omit it in handlers that only set a flag.) + * + * NB: this function is called from critical sections and signal handlers so + * throwing an error is not a good idea. + */ +void +SetLatch(Latch *latch) +{ +#ifndef WIN32 + pid_t owner_pid; +#else + HANDLE handle; +#endif + + /* + * The memory barrier has to be placed here to ensure that any flag + * variables possibly changed by this process have been flushed to main + * memory, before we check/set is_set. + */ + pg_memory_barrier(); + + /* Quick exit if already set */ + if (latch->is_set) + return; + + latch->is_set = true; + + pg_memory_barrier(); + if (!latch->maybe_sleeping) + return; + +#ifndef WIN32 + + /* + * See if anyone's waiting for the latch. It can be the current process if + * we're in a signal handler. We use the self-pipe or SIGURG to ourselves + * to wake up WaitEventSetWaitBlock() without races in that case. If it's + * another process, send a signal. + * + * Fetch owner_pid only once, in case the latch is concurrently getting + * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't + * guaranteed to be true! In practice, the effective range of pid_t fits + * in a 32 bit integer, and so should be atomic. In the worst case, we + * might end up signaling the wrong process. Even then, you're very + * unlucky if a process with that bogus pid exists and belongs to + * Postgres; and PG database processes should handle excess SIGUSR1 + * interrupts without a problem anyhow. + * + * Another sort of race condition that's possible here is for a new + * process to own the latch immediately after we look, so we don't signal + * it. This is okay so long as all callers of ResetLatch/WaitLatch follow + * the standard coding convention of waiting at the bottom of their loops, + * not the top, so that they'll correctly process latch-setting events + * that happen before they enter the loop. + */ + owner_pid = latch->owner_pid; + if (owner_pid == 0) + return; + else if (owner_pid == MyProcPid) + { +#if defined(WAIT_USE_SELF_PIPE) + if (waiting) + sendSelfPipeByte(); +#else + if (waiting) + kill(MyProcPid, SIGURG); +#endif + } + else + kill(owner_pid, SIGURG); + +#else + + /* + * See if anyone's waiting for the latch. It can be the current process if + * we're in a signal handler. + * + * Use a local variable here just in case somebody changes the event field + * concurrently (which really should not happen). 
+ */ + handle = latch->event; + if (handle) + { + SetEvent(handle); + + /* + * Note that we silently ignore any errors. We might be in a signal + * handler or other critical path where it's not safe to call elog(). + */ + } +#endif + +} + +/* + * Clear the latch. Calling WaitLatch after this will sleep, unless + * the latch is set again before the WaitLatch call. + */ +void +ResetLatch(Latch *latch) +{ + /* Only the owner should reset the latch */ + Assert(latch->owner_pid == MyProcPid); + Assert(latch->maybe_sleeping == false); + + latch->is_set = false; + + /* + * Ensure that the write to is_set gets flushed to main memory before we + * examine any flag variables. Otherwise a concurrent SetLatch might + * falsely conclude that it needn't signal us, even though we have missed + * seeing some flag updates that SetLatch was supposed to inform us of. + */ + pg_memory_barrier(); +} + +/* + * Create a WaitEventSet with space for nevents different events to wait for. + * + * These events can then be efficiently waited upon together, using + * WaitEventSetWait(). + */ +WaitEventSet * +CreateWaitEventSet(MemoryContext context, int nevents) +{ + WaitEventSet *set; + char *data; + Size sz = 0; + + /* + * Use MAXALIGN size/alignment to guarantee that later uses of memory are + * aligned correctly. E.g. epoll_event might need 8 byte alignment on some + * platforms, but earlier allocations like WaitEventSet and WaitEvent + * might not be sized to guarantee that when purely using sizeof(). + */ + sz += MAXALIGN(sizeof(WaitEventSet)); + sz += MAXALIGN(sizeof(WaitEvent) * nevents); + +#if defined(WAIT_USE_EPOLL) + sz += MAXALIGN(sizeof(struct epoll_event) * nevents); +#elif defined(WAIT_USE_KQUEUE) + sz += MAXALIGN(sizeof(struct kevent) * nevents); +#elif defined(WAIT_USE_POLL) + sz += MAXALIGN(sizeof(struct pollfd) * nevents); +#elif defined(WAIT_USE_WIN32) + /* need space for the pgwin32_signal_event */ + sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1)); +#endif + + data = (char *) MemoryContextAllocZero(context, sz); + + set = (WaitEventSet *) data; + data += MAXALIGN(sizeof(WaitEventSet)); + + set->events = (WaitEvent *) data; + data += MAXALIGN(sizeof(WaitEvent) * nevents); + +#if defined(WAIT_USE_EPOLL) + set->epoll_ret_events = (struct epoll_event *) data; + data += MAXALIGN(sizeof(struct epoll_event) * nevents); +#elif defined(WAIT_USE_KQUEUE) + set->kqueue_ret_events = (struct kevent *) data; + data += MAXALIGN(sizeof(struct kevent) * nevents); +#elif defined(WAIT_USE_POLL) + set->pollfds = (struct pollfd *) data; + data += MAXALIGN(sizeof(struct pollfd) * nevents); +#elif defined(WAIT_USE_WIN32) + set->handles = (HANDLE) data; + data += MAXALIGN(sizeof(HANDLE) * nevents); +#endif + + set->latch = NULL; + set->nevents_space = nevents; + set->exit_on_postmaster_death = false; + +#if defined(WAIT_USE_EPOLL) + if (!AcquireExternalFD()) + { + /* treat this as though epoll_create1 itself returned EMFILE */ + elog(ERROR, "epoll_create1 failed: %m"); + } + set->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (set->epoll_fd < 0) + { + ReleaseExternalFD(); + elog(ERROR, "epoll_create1 failed: %m"); + } +#elif defined(WAIT_USE_KQUEUE) + if (!AcquireExternalFD()) + { + /* treat this as though kqueue itself returned EMFILE */ + elog(ERROR, "kqueue failed: %m"); + } + set->kqueue_fd = kqueue(); + if (set->kqueue_fd < 0) + { + ReleaseExternalFD(); + elog(ERROR, "kqueue failed: %m"); + } + if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1) + { + int save_errno = errno; + + close(set->kqueue_fd); + 
ReleaseExternalFD(); + errno = save_errno; + elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m"); + } + set->report_postmaster_not_running = false; +#elif defined(WAIT_USE_WIN32) + + /* + * To handle signals while waiting, we need to add a win32 specific event. + * We accounted for the additional event at the top of this routine. See + * port/win32/signal.c for more details. + * + * Note: pgwin32_signal_event should be first to ensure that it will be + * reported when multiple events are set. We want to guarantee that + * pending signals are serviced. + */ + set->handles[0] = pgwin32_signal_event; + StaticAssertStmt(WSA_INVALID_EVENT == NULL, ""); +#endif + + return set; +} + +/* + * Free a previously created WaitEventSet. + * + * Note: preferably, this shouldn't have to free any resources that could be + * inherited across an exec(). If it did, we'd likely leak those resources in + * many scenarios. For the epoll case, we ensure that by setting EPOLL_CLOEXEC + * when the FD is created. For the Windows case, we assume that the handles + * involved are non-inheritable. + */ +void +FreeWaitEventSet(WaitEventSet *set) +{ +#if defined(WAIT_USE_EPOLL) + close(set->epoll_fd); + ReleaseExternalFD(); +#elif defined(WAIT_USE_KQUEUE) + close(set->kqueue_fd); + ReleaseExternalFD(); +#elif defined(WAIT_USE_WIN32) + WaitEvent *cur_event; + + for (cur_event = set->events; + cur_event < (set->events + set->nevents); + cur_event++) + { + if (cur_event->events & WL_LATCH_SET) + { + /* uses the latch's HANDLE */ + } + else if (cur_event->events & WL_POSTMASTER_DEATH) + { + /* uses PostmasterHandle */ + } + else + { + /* Clean up the event object we created for the socket */ + WSAEventSelect(cur_event->fd, NULL, 0); + WSACloseEvent(set->handles[cur_event->pos + 1]); + } + } +#endif + + pfree(set); +} + +/* --- + * Add an event to the set. Possible events are: + * - WL_LATCH_SET: Wait for the latch to be set + * - WL_POSTMASTER_DEATH: Wait for postmaster to die + * - WL_SOCKET_READABLE: Wait for socket to become readable, + * can be combined in one event with other WL_SOCKET_* events + * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable, + * can be combined with other WL_SOCKET_* events + * - WL_SOCKET_CONNECTED: Wait for socket connection to be established, + * can be combined with other WL_SOCKET_* events (on non-Windows + * platforms, this is the same as WL_SOCKET_WRITEABLE) + * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies + * + * Returns the offset in WaitEventSet->events (starting from 0), which can be + * used to modify previously added wait events using ModifyWaitEvent(). + * + * In the WL_LATCH_SET case the latch must be owned by the current process, + * i.e. it must be a process-local latch initialized with InitLatch, or a + * shared latch associated with the current process by calling OwnLatch. + * + * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED cases, EOF and error + * conditions cause the socket to be reported as readable/writable/connected, + * so that the caller can deal with the condition. + * + * The user_data pointer specified here will be set for the events returned + * by WaitEventSetWait(), allowing to easily associate additional data with + * events. 
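+ *
+ * As a rough usage sketch (client_sock, process_notifications(),
+ * handle_client_io() and the choice of PG_WAIT_EXTENSION are illustrative,
+ * not part of this API), a long-lived set watching the process latch,
+ * postmaster death and one socket might be built and used like this:
+ *
+ *		WaitEventSet *set = CreateWaitEventSet(TopMemoryContext, 3);
+ *		WaitEvent	event;
+ *
+ *		AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+ *		AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+ *						  NULL, NULL);
+ *		AddWaitEventToSet(set, WL_SOCKET_READABLE, client_sock, NULL, NULL);
+ *
+ *		for (;;)
+ *		{
+ *			(void) WaitEventSetWait(set, -1, &event, 1, PG_WAIT_EXTENSION);
+ *			if (event.events & WL_LATCH_SET)
+ *			{
+ *				ResetLatch(MyLatch);
+ *				process_notifications();
+ *			}
+ *			if (event.events & WL_SOCKET_READABLE)
+ *				handle_client_io(event.fd);
+ *		}
+ *
+ * Reusing one set this way avoids re-registering the descriptors with the
+ * kernel on every wait, which is what makes it cheaper than repeated
+ * WaitLatchOrSocket() calls.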
+ */ +int +AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch, + void *user_data) +{ + WaitEvent *event; + + /* not enough space */ + Assert(set->nevents < set->nevents_space); + + if (events == WL_EXIT_ON_PM_DEATH) + { + events = WL_POSTMASTER_DEATH; + set->exit_on_postmaster_death = true; + } + + if (latch) + { + if (latch->owner_pid != MyProcPid) + elog(ERROR, "cannot wait on a latch owned by another process"); + if (set->latch) + elog(ERROR, "cannot wait on more than one latch"); + if ((events & WL_LATCH_SET) != WL_LATCH_SET) + elog(ERROR, "latch events only support being set"); + } + else + { + if (events & WL_LATCH_SET) + elog(ERROR, "cannot wait on latch without a specified latch"); + } + + /* waiting for socket readiness without a socket indicates a bug */ + if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK)) + elog(ERROR, "cannot wait on socket event without a socket"); + + event = &set->events[set->nevents]; + event->pos = set->nevents++; + event->fd = fd; + event->events = events; + event->user_data = user_data; +#ifdef WIN32 + event->reset = false; +#endif + + if (events == WL_LATCH_SET) + { + set->latch = latch; + set->latch_pos = event->pos; +#if defined(WAIT_USE_SELF_PIPE) + event->fd = selfpipe_readfd; +#elif defined(WAIT_USE_SIGNALFD) + event->fd = signal_fd; +#else + event->fd = PGINVALID_SOCKET; +#ifdef WAIT_USE_EPOLL + return event->pos; +#endif +#endif + } + else if (events == WL_POSTMASTER_DEATH) + { +#ifndef WIN32 + event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH]; +#endif + } + + /* perform wait primitive specific initialization, if needed */ +#if defined(WAIT_USE_EPOLL) + WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD); +#elif defined(WAIT_USE_KQUEUE) + WaitEventAdjustKqueue(set, event, 0); +#elif defined(WAIT_USE_POLL) + WaitEventAdjustPoll(set, event); +#elif defined(WAIT_USE_WIN32) + WaitEventAdjustWin32(set, event); +#endif + + return event->pos; +} + +/* + * Change the event mask and, in the WL_LATCH_SET case, the latch associated + * with the WaitEvent. The latch may be changed to NULL to disable the latch + * temporarily, and then set back to a latch later. + * + * 'pos' is the id returned by AddWaitEventToSet. + */ +void +ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch) +{ + WaitEvent *event; +#if defined(WAIT_USE_KQUEUE) + int old_events; +#endif + + Assert(pos < set->nevents); + + event = &set->events[pos]; +#if defined(WAIT_USE_KQUEUE) + old_events = event->events; +#endif + + /* + * If neither the event mask nor the associated latch changes, return + * early. That's an important optimization for some sockets, where + * ModifyWaitEvent is frequently used to switch from waiting for reads to + * waiting on writes. + */ + if (events == event->events && + (!(event->events & WL_LATCH_SET) || set->latch == latch)) + return; + + if (event->events & WL_LATCH_SET && + events != event->events) + { + elog(ERROR, "cannot modify latch event"); + } + + if (event->events & WL_POSTMASTER_DEATH) + { + elog(ERROR, "cannot modify postmaster death event"); + } + + /* FIXME: validate event mask */ + event->events = events; + + if (events == WL_LATCH_SET) + { + if (latch && latch->owner_pid != MyProcPid) + elog(ERROR, "cannot wait on a latch owned by another process"); + set->latch = latch; + + /* + * On Unix, we don't need to modify the kernel object because the + * underlying pipe (if there is one) is the same for all latches so we + * can return immediately. 
On Windows, we need to update our array of + * handles, but we leave the old one in place and tolerate spurious + * wakeups if the latch is disabled. + */ +#if defined(WAIT_USE_WIN32) + if (!latch) + return; +#else + return; +#endif + } + +#if defined(WAIT_USE_EPOLL) + WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD); +#elif defined(WAIT_USE_KQUEUE) + WaitEventAdjustKqueue(set, event, old_events); +#elif defined(WAIT_USE_POLL) + WaitEventAdjustPoll(set, event); +#elif defined(WAIT_USE_WIN32) + WaitEventAdjustWin32(set, event); +#endif +} + +#if defined(WAIT_USE_EPOLL) +/* + * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL + */ +static void +WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action) +{ + struct epoll_event epoll_ev; + int rc; + + /* pointer to our event, returned by epoll_wait */ + epoll_ev.data.ptr = event; + /* always wait for errors */ + epoll_ev.events = EPOLLERR | EPOLLHUP; + + /* prepare pollfd entry once */ + if (event->events == WL_LATCH_SET) + { + Assert(set->latch != NULL); + epoll_ev.events |= EPOLLIN; + } + else if (event->events == WL_POSTMASTER_DEATH) + { + epoll_ev.events |= EPOLLIN; + } + else + { + Assert(event->fd != PGINVALID_SOCKET); + Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)); + + if (event->events & WL_SOCKET_READABLE) + epoll_ev.events |= EPOLLIN; + if (event->events & WL_SOCKET_WRITEABLE) + epoll_ev.events |= EPOLLOUT; + } + + /* + * Even though unused, we also pass epoll_ev as the data argument if + * EPOLL_CTL_DEL is passed as action. There used to be an epoll bug + * requiring that, and actually it makes the code simpler... + */ + rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev); + + if (rc < 0) + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("%s() failed: %m", + "epoll_ctl"))); +} +#endif + +#if defined(WAIT_USE_POLL) +static void +WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event) +{ + struct pollfd *pollfd = &set->pollfds[event->pos]; + + pollfd->revents = 0; + pollfd->fd = event->fd; + + /* prepare pollfd entry once */ + if (event->events == WL_LATCH_SET) + { + Assert(set->latch != NULL); + pollfd->events = POLLIN; + } + else if (event->events == WL_POSTMASTER_DEATH) + { + pollfd->events = POLLIN; + } + else + { + Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)); + pollfd->events = 0; + if (event->events & WL_SOCKET_READABLE) + pollfd->events |= POLLIN; + if (event->events & WL_SOCKET_WRITEABLE) + pollfd->events |= POLLOUT; + } + + Assert(event->fd != PGINVALID_SOCKET); +} +#endif + +#if defined(WAIT_USE_KQUEUE) + +/* + * On most BSD family systems, the udata member of struct kevent is of type + * void *, so we could directly convert to/from WaitEvent *. Unfortunately, + * NetBSD has it as intptr_t, so here we wallpaper over that difference with + * an lvalue cast. + */ +#define AccessWaitEvent(k_ev) (*((WaitEvent **)(&(k_ev)->udata))) + +static inline void +WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action, + WaitEvent *event) +{ + k_ev->ident = event->fd; + k_ev->filter = filter; + k_ev->flags = action; + k_ev->fflags = 0; + k_ev->data = 0; + AccessWaitEvent(k_ev) = event; +} + +static inline void +WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event) +{ + /* For now postmaster death can only be added, not removed. 
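+	 * We register an EVFILT_PROC filter on the postmaster's pid with
+	 * NOTE_EXIT set, so the kernel notifies us directly when that process
+	 * exits.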
*/ + k_ev->ident = PostmasterPid; + k_ev->filter = EVFILT_PROC; + k_ev->flags = EV_ADD; + k_ev->fflags = NOTE_EXIT; + k_ev->data = 0; + AccessWaitEvent(k_ev) = event; +} + +static inline void +WaitEventAdjustKqueueAddLatch(struct kevent *k_ev, WaitEvent *event) +{ + /* For now latch can only be added, not removed. */ + k_ev->ident = SIGURG; + k_ev->filter = EVFILT_SIGNAL; + k_ev->flags = EV_ADD; + k_ev->fflags = 0; + k_ev->data = 0; + AccessWaitEvent(k_ev) = event; +} + +/* + * old_events is the previous event mask, used to compute what has changed. + */ +static void +WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events) +{ + int rc; + struct kevent k_ev[2]; + int count = 0; + bool new_filt_read = false; + bool old_filt_read = false; + bool new_filt_write = false; + bool old_filt_write = false; + + if (old_events == event->events) + return; + + Assert(event->events != WL_LATCH_SET || set->latch != NULL); + Assert(event->events == WL_LATCH_SET || + event->events == WL_POSTMASTER_DEATH || + (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))); + + if (event->events == WL_POSTMASTER_DEATH) + { + /* + * Unlike all the other implementations, we detect postmaster death + * using process notification instead of waiting on the postmaster + * alive pipe. + */ + WaitEventAdjustKqueueAddPostmaster(&k_ev[count++], event); + } + else if (event->events == WL_LATCH_SET) + { + /* We detect latch wakeup using a signal event. */ + WaitEventAdjustKqueueAddLatch(&k_ev[count++], event); + } + else + { + /* + * We need to compute the adds and deletes required to get from the + * old event mask to the new event mask, since kevent treats readable + * and writable as separate events. + */ + if (old_events & WL_SOCKET_READABLE) + old_filt_read = true; + if (event->events & WL_SOCKET_READABLE) + new_filt_read = true; + if (old_events & WL_SOCKET_WRITEABLE) + old_filt_write = true; + if (event->events & WL_SOCKET_WRITEABLE) + new_filt_write = true; + if (old_filt_read && !new_filt_read) + WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_DELETE, + event); + else if (!old_filt_read && new_filt_read) + WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_ADD, + event); + if (old_filt_write && !new_filt_write) + WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_DELETE, + event); + else if (!old_filt_write && new_filt_write) + WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_ADD, + event); + } + + Assert(count > 0); + Assert(count <= 2); + + rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL); + + /* + * When adding the postmaster's pid, we have to consider that it might + * already have exited and perhaps even been replaced by another process + * with the same pid. If so, we have to defer reporting this as an event + * until the next call to WaitEventSetWaitBlock(). + */ + + if (rc < 0) + { + if (event->events == WL_POSTMASTER_DEATH && + (errno == ESRCH || errno == EACCES)) + set->report_postmaster_not_running = true; + else + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("%s() failed: %m", + "kevent"))); + } + else if (event->events == WL_POSTMASTER_DEATH && + PostmasterPid != getppid() && + !PostmasterIsAlive()) + { + /* + * The extra PostmasterIsAliveInternal() check prevents false alarms + * on systems that give a different value for getppid() while being + * traced by a debugger. 
+ */ + set->report_postmaster_not_running = true; + } +} + +#endif + +#if defined(WAIT_USE_WIN32) +static void +WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event) +{ + HANDLE *handle = &set->handles[event->pos + 1]; + + if (event->events == WL_LATCH_SET) + { + Assert(set->latch != NULL); + *handle = set->latch->event; + } + else if (event->events == WL_POSTMASTER_DEATH) + { + *handle = PostmasterHandle; + } + else + { + int flags = FD_CLOSE; /* always check for errors/EOF */ + + if (event->events & WL_SOCKET_READABLE) + flags |= FD_READ; + if (event->events & WL_SOCKET_WRITEABLE) + flags |= FD_WRITE; + if (event->events & WL_SOCKET_CONNECTED) + flags |= FD_CONNECT; + + if (*handle == WSA_INVALID_EVENT) + { + *handle = WSACreateEvent(); + if (*handle == WSA_INVALID_EVENT) + elog(ERROR, "failed to create event for socket: error code %d", + WSAGetLastError()); + } + if (WSAEventSelect(event->fd, *handle, flags) != 0) + elog(ERROR, "failed to set up event for socket: error code %d", + WSAGetLastError()); + + Assert(event->fd != PGINVALID_SOCKET); + } +} +#endif + +/* + * Wait for events added to the set to happen, or until the timeout is + * reached. At most nevents occurred events are returned. + * + * If timeout = -1, block until an event occurs; if 0, check sockets for + * readiness, but don't block; if > 0, block for at most timeout milliseconds. + * + * Returns the number of events occurred, or 0 if the timeout was reached. + * + * Returned events will have the fd, pos, user_data fields set to the + * values associated with the registered event. + */ +int +WaitEventSetWait(WaitEventSet *set, long timeout, + WaitEvent *occurred_events, int nevents, + uint32 wait_event_info) +{ + int returned_events = 0; + instr_time start_time; + instr_time cur_time; + long cur_timeout = -1; + + Assert(nevents > 0); + + /* + * Initialize timeout if requested. We must record the current time so + * that we can determine the remaining timeout if interrupted. + */ + if (timeout >= 0) + { + INSTR_TIME_SET_CURRENT(start_time); + Assert(timeout >= 0 && timeout <= INT_MAX); + cur_timeout = timeout; + } + + pgstat_report_wait_start(wait_event_info); + +#ifndef WIN32 + waiting = true; +#else + /* Ensure that signals are serviced even if latch is already set */ + pgwin32_dispatch_queued_signals(); +#endif + while (returned_events == 0) + { + int rc; + + /* + * Check if the latch is set already. If so, leave the loop + * immediately, avoid blocking again. We don't attempt to report any + * other events that might also be satisfied. + * + * If someone sets the latch between this and the + * WaitEventSetWaitBlock() below, the setter will write a byte to the + * pipe (or signal us and the signal handler will do that), and the + * readiness routine will return immediately. + * + * On unix, If there's a pending byte in the self pipe, we'll notice + * whenever blocking. Only clearing the pipe in that case avoids + * having to drain it every time WaitLatchOrSocket() is used. Should + * the pipe-buffer fill up we're still ok, because the pipe is in + * nonblocking mode. It's unlikely for that to happen, because the + * self pipe isn't filled unless we're blocking (waiting = true), or + * from inside a signal handler in latch_sigurg_handler(). + * + * On windows, we'll also notice if there's a pending event for the + * latch when blocking, but there's no danger of anything filling up, + * as "Setting an event that is already set has no effect.". 
+ * + * Note: we assume that the kernel calls involved in latch management + * will provide adequate synchronization on machines with weak memory + * ordering, so that we cannot miss seeing is_set if a notification + * has already been queued. + */ + if (set->latch && !set->latch->is_set) + { + /* about to sleep on a latch */ + set->latch->maybe_sleeping = true; + pg_memory_barrier(); + /* and recheck */ + } + + if (set->latch && set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->pos = set->latch_pos; + occurred_events->user_data = + set->events[set->latch_pos].user_data; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + + /* could have been set above */ + set->latch->maybe_sleeping = false; + + break; + } + + /* + * Wait for events using the readiness primitive chosen at the top of + * this file. If -1 is returned, a timeout has occurred, if 0 we have + * to retry, everything >= 1 is the number of returned events. + */ + rc = WaitEventSetWaitBlock(set, cur_timeout, + occurred_events, nevents); + + if (set->latch) + { + Assert(set->latch->maybe_sleeping); + set->latch->maybe_sleeping = false; + } + + if (rc == -1) + break; /* timeout occurred */ + else + returned_events = rc; + + /* If we're not done, update cur_timeout for next iteration */ + if (returned_events == 0 && timeout >= 0) + { + INSTR_TIME_SET_CURRENT(cur_time); + INSTR_TIME_SUBTRACT(cur_time, start_time); + cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time); + if (cur_timeout <= 0) + break; + } + } +#ifndef WIN32 + waiting = false; +#endif + + pgstat_report_wait_end(); + + return returned_events; +} + + +#if defined(WAIT_USE_EPOLL) + +/* + * Wait using linux's epoll_wait(2). + * + * This is the preferable wait method, as several readiness notifications are + * delivered, without having to iterate through all of set->events. The return + * epoll_event struct contain a pointer to our events, making association + * easy. + */ +static inline int +WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents) +{ + int returned_events = 0; + int rc; + WaitEvent *cur_event; + struct epoll_event *cur_epoll_event; + + /* Sleep */ + rc = epoll_wait(set->epoll_fd, set->epoll_ret_events, + nevents, cur_timeout); + + /* Check return code */ + if (rc < 0) + { + /* EINTR is okay, otherwise complain */ + if (errno != EINTR) + { + waiting = false; + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("%s() failed: %m", + "epoll_wait"))); + } + return 0; + } + else if (rc == 0) + { + /* timeout exceeded */ + return -1; + } + + /* + * At least one event occurred, iterate over the returned epoll events + * until they're either all processed, or we've returned all the events + * the caller desired. + */ + for (cur_epoll_event = set->epoll_ret_events; + cur_epoll_event < (set->epoll_ret_events + rc) && + returned_events < nevents; + cur_epoll_event++) + { + /* epoll's data pointer is set to the associated WaitEvent */ + cur_event = (WaitEvent *) cur_epoll_event->data.ptr; + + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = 0; + + if (cur_event->events == WL_LATCH_SET && + cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)) + { + /* Drain the signalfd. 
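+				 * Note that we still recheck latch->is_set afterwards; the
+				 * descriptor can be readable because of a wakeup meant for a
+				 * latch that has since been reset, or for one this set is
+				 * not watching.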
*/ + drain(); + + if (set->latch && set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events == WL_POSTMASTER_DEATH && + cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)) + { + /* + * We expect an EPOLLHUP when the remote end is closed, but + * because we don't expect the pipe to become readable or to have + * any errors either, treat those cases as postmaster death, too. + * + * Be paranoid about a spurious event signaling the postmaster as + * being dead. There have been reports about that happening with + * older primitives (select(2) to be specific), and a spurious + * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't + * cost much. + */ + if (!PostmasterIsAliveInternal()) + { + if (set->exit_on_postmaster_death) + proc_exit(1); + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) + { + Assert(cur_event->fd != PGINVALID_SOCKET); + + if ((cur_event->events & WL_SOCKET_READABLE) && + (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))) + { + /* data available in socket, or EOF */ + occurred_events->events |= WL_SOCKET_READABLE; + } + + if ((cur_event->events & WL_SOCKET_WRITEABLE) && + (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP))) + { + /* writable, or EOF */ + occurred_events->events |= WL_SOCKET_WRITEABLE; + } + + if (occurred_events->events != 0) + { + occurred_events->fd = cur_event->fd; + occurred_events++; + returned_events++; + } + } + } + + return returned_events; +} + +#elif defined(WAIT_USE_KQUEUE) + +/* + * Wait using kevent(2) on BSD-family systems and macOS. + * + * For now this mirrors the epoll code, but in future it could modify the fd + * set in the same call to kevent as it uses for waiting instead of doing that + * with separate system calls. + */ +static int +WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents) +{ + int returned_events = 0; + int rc; + WaitEvent *cur_event; + struct kevent *cur_kqueue_event; + struct timespec timeout; + struct timespec *timeout_p; + + if (cur_timeout < 0) + timeout_p = NULL; + else + { + timeout.tv_sec = cur_timeout / 1000; + timeout.tv_nsec = (cur_timeout % 1000) * 1000000; + timeout_p = &timeout; + } + + /* + * Report postmaster events discovered by WaitEventAdjustKqueue() or an + * earlier call to WaitEventSetWait(). + */ + if (unlikely(set->report_postmaster_not_running)) + { + if (set->exit_on_postmaster_death) + proc_exit(1); + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + return 1; + } + + /* Sleep */ + rc = kevent(set->kqueue_fd, NULL, 0, + set->kqueue_ret_events, nevents, + timeout_p); + + /* Check return code */ + if (rc < 0) + { + /* EINTR is okay, otherwise complain */ + if (errno != EINTR) + { + waiting = false; + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("%s() failed: %m", + "kevent"))); + } + return 0; + } + else if (rc == 0) + { + /* timeout exceeded */ + return -1; + } + + /* + * At least one event occurred, iterate over the returned kqueue events + * until they're either all processed, or we've returned all the events + * the caller desired. 
+ */ + for (cur_kqueue_event = set->kqueue_ret_events; + cur_kqueue_event < (set->kqueue_ret_events + rc) && + returned_events < nevents; + cur_kqueue_event++) + { + /* kevent's udata points to the associated WaitEvent */ + cur_event = AccessWaitEvent(cur_kqueue_event); + + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = 0; + + if (cur_event->events == WL_LATCH_SET && + cur_kqueue_event->filter == EVFILT_SIGNAL) + { + if (set->latch && set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events == WL_POSTMASTER_DEATH && + cur_kqueue_event->filter == EVFILT_PROC && + (cur_kqueue_event->fflags & NOTE_EXIT) != 0) + { + /* + * The kernel will tell this kqueue object only once about the + * exit of the postmaster, so let's remember that for next time so + * that we provide level-triggered semantics. + */ + set->report_postmaster_not_running = true; + + if (set->exit_on_postmaster_death) + proc_exit(1); + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + occurred_events++; + returned_events++; + } + else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) + { + Assert(cur_event->fd >= 0); + + if ((cur_event->events & WL_SOCKET_READABLE) && + (cur_kqueue_event->filter == EVFILT_READ)) + { + /* readable, or EOF */ + occurred_events->events |= WL_SOCKET_READABLE; + } + + if ((cur_event->events & WL_SOCKET_WRITEABLE) && + (cur_kqueue_event->filter == EVFILT_WRITE)) + { + /* writable, or EOF */ + occurred_events->events |= WL_SOCKET_WRITEABLE; + } + + if (occurred_events->events != 0) + { + occurred_events->fd = cur_event->fd; + occurred_events++; + returned_events++; + } + } + } + + return returned_events; +} + +#elif defined(WAIT_USE_POLL) + +/* + * Wait using poll(2). + * + * This allows to receive readiness notifications for several events at once, + * but requires iterating through all of set->pollfds. + */ +static inline int +WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents) +{ + int returned_events = 0; + int rc; + WaitEvent *cur_event; + struct pollfd *cur_pollfd; + + /* Sleep */ + rc = poll(set->pollfds, set->nevents, (int) cur_timeout); + + /* Check return code */ + if (rc < 0) + { + /* EINTR is okay, otherwise complain */ + if (errno != EINTR) + { + waiting = false; + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("%s() failed: %m", + "poll"))); + } + return 0; + } + else if (rc == 0) + { + /* timeout exceeded */ + return -1; + } + + for (cur_event = set->events, cur_pollfd = set->pollfds; + cur_event < (set->events + set->nevents) && + returned_events < nevents; + cur_event++, cur_pollfd++) + { + /* no activity on this FD, skip */ + if (cur_pollfd->revents == 0) + continue; + + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = 0; + + if (cur_event->events == WL_LATCH_SET && + (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL))) + { + /* There's data in the self-pipe, clear it. 
*/ + drain(); + + if (set->latch && set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events == WL_POSTMASTER_DEATH && + (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL))) + { + /* + * We expect an POLLHUP when the remote end is closed, but because + * we don't expect the pipe to become readable or to have any + * errors either, treat those cases as postmaster death, too. + * + * Be paranoid about a spurious event signaling the postmaster as + * being dead. There have been reports about that happening with + * older primitives (select(2) to be specific), and a spurious + * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't + * cost much. + */ + if (!PostmasterIsAliveInternal()) + { + if (set->exit_on_postmaster_death) + proc_exit(1); + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) + { + int errflags = POLLHUP | POLLERR | POLLNVAL; + + Assert(cur_event->fd >= PGINVALID_SOCKET); + + if ((cur_event->events & WL_SOCKET_READABLE) && + (cur_pollfd->revents & (POLLIN | errflags))) + { + /* data available in socket, or EOF */ + occurred_events->events |= WL_SOCKET_READABLE; + } + + if ((cur_event->events & WL_SOCKET_WRITEABLE) && + (cur_pollfd->revents & (POLLOUT | errflags))) + { + /* writeable, or EOF */ + occurred_events->events |= WL_SOCKET_WRITEABLE; + } + + if (occurred_events->events != 0) + { + occurred_events->fd = cur_event->fd; + occurred_events++; + returned_events++; + } + } + } + return returned_events; +} + +#elif defined(WAIT_USE_WIN32) + +/* + * Wait using Windows' WaitForMultipleObjects(). + * + * Unfortunately this will only ever return a single readiness notification at + * a time. Note that while the official documentation for + * WaitForMultipleObjects is ambiguous about multiple events being "consumed" + * with a single bWaitAll = FALSE call, + * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms + * that only one event is "consumed". + */ +static inline int +WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents) +{ + int returned_events = 0; + DWORD rc; + WaitEvent *cur_event; + + /* Reset any wait events that need it */ + for (cur_event = set->events; + cur_event < (set->events + set->nevents); + cur_event++) + { + if (cur_event->reset) + { + WaitEventAdjustWin32(set, cur_event); + cur_event->reset = false; + } + + /* + * Windows does not guarantee to log an FD_WRITE network event + * indicating that more data can be sent unless the previous send() + * failed with WSAEWOULDBLOCK. While our caller might well have made + * such a call, we cannot assume that here. Therefore, if waiting for + * write-ready, force the issue by doing a dummy send(). If the dummy + * send() succeeds, assume that the socket is in fact write-ready, and + * return immediately. Also, if it fails with something other than + * WSAEWOULDBLOCK, return a write-ready indication to let our caller + * deal with the error condition. 
+ */ + if (cur_event->events & WL_SOCKET_WRITEABLE) + { + char c; + WSABUF buf; + DWORD sent; + int r; + + buf.buf = &c; + buf.len = 0; + + r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL); + if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK) + { + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = WL_SOCKET_WRITEABLE; + occurred_events->fd = cur_event->fd; + return 1; + } + } + } + + /* + * Sleep. + * + * Need to wait for ->nevents + 1, because signal handle is in [0]. + */ + rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE, + cur_timeout); + + /* Check return code */ + if (rc == WAIT_FAILED) + elog(ERROR, "WaitForMultipleObjects() failed: error code %lu", + GetLastError()); + else if (rc == WAIT_TIMEOUT) + { + /* timeout exceeded */ + return -1; + } + + if (rc == WAIT_OBJECT_0) + { + /* Service newly-arrived signals */ + pgwin32_dispatch_queued_signals(); + return 0; /* retry */ + } + + /* + * With an offset of one, due to the always present pgwin32_signal_event, + * the handle offset directly corresponds to a wait event. + */ + cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1]; + + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = 0; + + if (cur_event->events == WL_LATCH_SET) + { + /* + * We cannot use set->latch->event to reset the fired event if we + * aren't waiting on this latch now. + */ + if (!ResetEvent(set->handles[cur_event->pos + 1])) + elog(ERROR, "ResetEvent failed: error code %lu", GetLastError()); + + if (set->latch && set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events == WL_POSTMASTER_DEATH) + { + /* + * Postmaster apparently died. Since the consequences of falsely + * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take + * the trouble to positively verify this with PostmasterIsAlive(), + * even though there is no known reason to think that the event could + * be falsely set on Windows. + */ + if (!PostmasterIsAliveInternal()) + { + if (set->exit_on_postmaster_death) + proc_exit(1); + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events & WL_SOCKET_MASK) + { + WSANETWORKEVENTS resEvents; + HANDLE handle = set->handles[cur_event->pos + 1]; + + Assert(cur_event->fd); + + occurred_events->fd = cur_event->fd; + + ZeroMemory(&resEvents, sizeof(resEvents)); + if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0) + elog(ERROR, "failed to enumerate network events: error code %d", + WSAGetLastError()); + if ((cur_event->events & WL_SOCKET_READABLE) && + (resEvents.lNetworkEvents & FD_READ)) + { + /* data available in socket */ + occurred_events->events |= WL_SOCKET_READABLE; + + /*------ + * WaitForMultipleObjects doesn't guarantee that a read event will + * be returned if the latch is set at the same time. Even if it + * did, the caller might drop that event expecting it to reoccur + * on next call. So, we must force the event to be reset if this + * WaitEventSet is used again in order to avoid an indefinite + * hang. Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx + * for the behavior of socket events. 
+ *------ + */ + cur_event->reset = true; + } + if ((cur_event->events & WL_SOCKET_WRITEABLE) && + (resEvents.lNetworkEvents & FD_WRITE)) + { + /* writeable */ + occurred_events->events |= WL_SOCKET_WRITEABLE; + } + if ((cur_event->events & WL_SOCKET_CONNECTED) && + (resEvents.lNetworkEvents & FD_CONNECT)) + { + /* connected */ + occurred_events->events |= WL_SOCKET_CONNECTED; + } + if (resEvents.lNetworkEvents & FD_CLOSE) + { + /* EOF/error, so signal all caller-requested socket flags */ + occurred_events->events |= (cur_event->events & WL_SOCKET_MASK); + } + + if (occurred_events->events != 0) + { + occurred_events++; + returned_events++; + } + } + + return returned_events; +} +#endif + +/* + * Get the number of wait events registered in a given WaitEventSet. + */ +int +GetNumRegisteredWaitEvents(WaitEventSet *set) +{ + return set->nevents; +} + +#if defined(WAIT_USE_SELF_PIPE) + +/* + * SetLatch uses SIGURG to wake up the process waiting on the latch. + * + * Wake up WaitLatch, if we're waiting. + */ +static void +latch_sigurg_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + if (waiting) + sendSelfPipeByte(); + + errno = save_errno; +} + +/* Send one byte to the self-pipe, to wake up WaitLatch */ +static void +sendSelfPipeByte(void) +{ + int rc; + char dummy = 0; + +retry: + rc = write(selfpipe_writefd, &dummy, 1); + if (rc < 0) + { + /* If interrupted by signal, just retry */ + if (errno == EINTR) + goto retry; + + /* + * If the pipe is full, we don't need to retry, the data that's there + * already is enough to wake up WaitLatch. + */ + if (errno == EAGAIN || errno == EWOULDBLOCK) + return; + + /* + * Oops, the write() failed for some other reason. We might be in a + * signal handler, so it's not safe to elog(). We have no choice but + * silently ignore the error. + */ + return; + } +} + +#endif + +#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD) + +/* + * Read all available data from self-pipe or signalfd. + * + * Note: this is only called when waiting = true. If it fails and doesn't + * return, it must reset that flag first (though ideally, this will never + * happen). 
+ */ +static void +drain(void) +{ + char buf[1024]; + int rc; + int fd; + +#ifdef WAIT_USE_SELF_PIPE + fd = selfpipe_readfd; +#else + fd = signal_fd; +#endif + + for (;;) + { + rc = read(fd, buf, sizeof(buf)); + if (rc < 0) + { + if (errno == EAGAIN || errno == EWOULDBLOCK) + break; /* the descriptor is empty */ + else if (errno == EINTR) + continue; /* retry */ + else + { + waiting = false; +#ifdef WAIT_USE_SELF_PIPE + elog(ERROR, "read() on self-pipe failed: %m"); +#else + elog(ERROR, "read() on signalfd failed: %m"); +#endif + } + } + else if (rc == 0) + { + waiting = false; +#ifdef WAIT_USE_SELF_PIPE + elog(ERROR, "unexpected EOF on self-pipe"); +#else + elog(ERROR, "unexpected EOF on signalfd"); +#endif + } + else if (rc < sizeof(buf)) + { + /* we successfully drained the pipe; no need to read() again */ + break; + } + /* else buffer wasn't big enough, so read again */ + } +} + +#endif diff --git a/src/backend/storage/ipc/pmsignal.c b/src/backend/storage/ipc/pmsignal.c new file mode 100644 index 0000000..280c239 --- /dev/null +++ b/src/backend/storage/ipc/pmsignal.c @@ -0,0 +1,430 @@ +/*------------------------------------------------------------------------- + * + * pmsignal.c + * routines for signaling between the postmaster and its child processes + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/ipc/pmsignal.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> + +#ifdef HAVE_SYS_PRCTL_H +#include <sys/prctl.h> +#endif + +#include "miscadmin.h" +#include "postmaster/postmaster.h" +#include "replication/walsender.h" +#include "storage/pmsignal.h" +#include "storage/shmem.h" + + +/* + * The postmaster is signaled by its children by sending SIGUSR1. The + * specific reason is communicated via flags in shared memory. We keep + * a boolean flag for each possible "reason", so that different reasons + * can be signaled by different backends at the same time. (However, + * if the same reason is signaled more than once simultaneously, the + * postmaster will observe it only once.) + * + * The flags are actually declared as "volatile sig_atomic_t" for maximum + * portability. This should ensure that loads and stores of the flag + * values are atomic, allowing us to dispense with any explicit locking. + * + * In addition to the per-reason flags, we store a set of per-child-process + * flags that are currently used only for detecting whether a backend has + * exited without performing proper shutdown. The per-child-process flags + * have three possible states: UNUSED, ASSIGNED, ACTIVE. An UNUSED slot is + * available for assignment. An ASSIGNED slot is associated with a postmaster + * child process, but either the process has not touched shared memory yet, + * or it has successfully cleaned up after itself. A ACTIVE slot means the + * process is actively using shared memory. The slots are assigned to + * child processes at random, and postmaster.c is responsible for tracking + * which one goes with which PID. + * + * Actually there is a fourth state, WALSENDER. This is just like ACTIVE, + * but carries the extra information that the child is a WAL sender. + * WAL senders too start in ACTIVE state, but switch to WALSENDER once they + * start streaming the WAL (and they never go back to ACTIVE after that). 
+ * + * We also have a shared-memory field that is used for communication in + * the opposite direction, from postmaster to children: it tells why the + * postmaster has broadcasted SIGQUIT signals, if indeed it has done so. + */ + +#define PM_CHILD_UNUSED 0 /* these values must fit in sig_atomic_t */ +#define PM_CHILD_ASSIGNED 1 +#define PM_CHILD_ACTIVE 2 +#define PM_CHILD_WALSENDER 3 + +/* "typedef struct PMSignalData PMSignalData" appears in pmsignal.h */ +struct PMSignalData +{ + /* per-reason flags for signaling the postmaster */ + sig_atomic_t PMSignalFlags[NUM_PMSIGNALS]; + /* global flags for signals from postmaster to children */ + QuitSignalReason sigquit_reason; /* why SIGQUIT was sent */ + /* per-child-process flags */ + int num_child_flags; /* # of entries in PMChildFlags[] */ + int next_child_flag; /* next slot to try to assign */ + sig_atomic_t PMChildFlags[FLEXIBLE_ARRAY_MEMBER]; +}; + +NON_EXEC_STATIC volatile PMSignalData *PMSignalState = NULL; + +/* + * Signal handler to be notified if postmaster dies. + */ +#ifdef USE_POSTMASTER_DEATH_SIGNAL +volatile sig_atomic_t postmaster_possibly_dead = false; + +static void +postmaster_death_handler(int signo) +{ + postmaster_possibly_dead = true; +} + +/* + * The available signals depend on the OS. SIGUSR1 and SIGUSR2 are already + * used for other things, so choose another one. + * + * Currently, we assume that we can always find a signal to use. That + * seems like a reasonable assumption for all platforms that are modern + * enough to have a parent-death signaling mechanism. + */ +#if defined(SIGINFO) +#define POSTMASTER_DEATH_SIGNAL SIGINFO +#elif defined(SIGPWR) +#define POSTMASTER_DEATH_SIGNAL SIGPWR +#else +#error "cannot find a signal to use for postmaster death" +#endif + +#endif /* USE_POSTMASTER_DEATH_SIGNAL */ + +/* + * PMSignalShmemSize + * Compute space needed for pmsignal.c's shared memory + */ +Size +PMSignalShmemSize(void) +{ + Size size; + + size = offsetof(PMSignalData, PMChildFlags); + size = add_size(size, mul_size(MaxLivePostmasterChildren(), + sizeof(sig_atomic_t))); + + return size; +} + +/* + * PMSignalShmemInit - initialize during shared-memory creation + */ +void +PMSignalShmemInit(void) +{ + bool found; + + PMSignalState = (PMSignalData *) + ShmemInitStruct("PMSignalState", PMSignalShmemSize(), &found); + + if (!found) + { + /* initialize all flags to zeroes */ + MemSet(unvolatize(PMSignalData *, PMSignalState), 0, PMSignalShmemSize()); + PMSignalState->num_child_flags = MaxLivePostmasterChildren(); + } +} + +/* + * SendPostmasterSignal - signal the postmaster from a child process + */ +void +SendPostmasterSignal(PMSignalReason reason) +{ + /* If called in a standalone backend, do nothing */ + if (!IsUnderPostmaster) + return; + /* Atomically set the proper flag */ + PMSignalState->PMSignalFlags[reason] = true; + /* Send signal to postmaster */ + kill(PostmasterPid, SIGUSR1); +} + +/* + * CheckPostmasterSignal - check to see if a particular reason has been + * signaled, and clear the signal flag. Should be called by postmaster + * after receiving SIGUSR1. + */ +bool +CheckPostmasterSignal(PMSignalReason reason) +{ + /* Careful here --- don't clear flag if we haven't seen it set */ + if (PMSignalState->PMSignalFlags[reason]) + { + PMSignalState->PMSignalFlags[reason] = false; + return true; + } + return false; +} + +/* + * SetQuitSignalReason - broadcast the reason for a system shutdown. + * Should be called by postmaster before sending SIGQUIT to children. 
+ * + * Note: in a crash-and-restart scenario, the "reason" field gets cleared + * as a part of rebuilding shared memory; the postmaster need not do it + * explicitly. + */ +void +SetQuitSignalReason(QuitSignalReason reason) +{ + PMSignalState->sigquit_reason = reason; +} + +/* + * GetQuitSignalReason - obtain the reason for a system shutdown. + * Called by child processes when they receive SIGQUIT. + * If the postmaster hasn't actually sent SIGQUIT, will return PMQUIT_NOT_SENT. + */ +QuitSignalReason +GetQuitSignalReason(void) +{ + /* This is called in signal handlers, so be extra paranoid. */ + if (!IsUnderPostmaster || PMSignalState == NULL) + return PMQUIT_NOT_SENT; + return PMSignalState->sigquit_reason; +} + + +/* + * AssignPostmasterChildSlot - select an unused slot for a new postmaster + * child process, and set its state to ASSIGNED. Returns a slot number + * (one to N). + * + * Only the postmaster is allowed to execute this routine, so we need no + * special locking. + */ +int +AssignPostmasterChildSlot(void) +{ + int slot = PMSignalState->next_child_flag; + int n; + + /* + * Scan for a free slot. We track the last slot assigned so as not to + * waste time repeatedly rescanning low-numbered slots. + */ + for (n = PMSignalState->num_child_flags; n > 0; n--) + { + if (--slot < 0) + slot = PMSignalState->num_child_flags - 1; + if (PMSignalState->PMChildFlags[slot] == PM_CHILD_UNUSED) + { + PMSignalState->PMChildFlags[slot] = PM_CHILD_ASSIGNED; + PMSignalState->next_child_flag = slot; + return slot + 1; + } + } + + /* Out of slots ... should never happen, else postmaster.c messed up */ + elog(FATAL, "no free slots in PMChildFlags array"); + return 0; /* keep compiler quiet */ +} + +/* + * ReleasePostmasterChildSlot - release a slot after death of a postmaster + * child process. This must be called in the postmaster process. + * + * Returns true if the slot had been in ASSIGNED state (the expected case), + * false otherwise (implying that the child failed to clean itself up). + */ +bool +ReleasePostmasterChildSlot(int slot) +{ + bool result; + + Assert(slot > 0 && slot <= PMSignalState->num_child_flags); + slot--; + + /* + * Note: the slot state might already be unused, because the logic in + * postmaster.c is such that this might get called twice when a child + * crashes. So we don't try to Assert anything about the state. + */ + result = (PMSignalState->PMChildFlags[slot] == PM_CHILD_ASSIGNED); + PMSignalState->PMChildFlags[slot] = PM_CHILD_UNUSED; + return result; +} + +/* + * IsPostmasterChildWalSender - check if given slot is in use by a + * walsender process. + */ +bool +IsPostmasterChildWalSender(int slot) +{ + Assert(slot > 0 && slot <= PMSignalState->num_child_flags); + slot--; + + if (PMSignalState->PMChildFlags[slot] == PM_CHILD_WALSENDER) + return true; + else + return false; +} + +/* + * MarkPostmasterChildActive - mark a postmaster child as about to begin + * actively using shared memory. This is called in the child process. + */ +void +MarkPostmasterChildActive(void) +{ + int slot = MyPMChildSlot; + + Assert(slot > 0 && slot <= PMSignalState->num_child_flags); + slot--; + Assert(PMSignalState->PMChildFlags[slot] == PM_CHILD_ASSIGNED); + PMSignalState->PMChildFlags[slot] = PM_CHILD_ACTIVE; +} + +/* + * MarkPostmasterChildWalSender - mark a postmaster child as a WAL sender + * process. This is called in the child process, sometime after marking the + * child as active. 
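+ * The postmaster uses this distinction to treat walsenders differently from
+ * regular backends, for example when deciding which children still need to
+ * be signaled during shutdown.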
+ */ +void +MarkPostmasterChildWalSender(void) +{ + int slot = MyPMChildSlot; + + Assert(am_walsender); + + Assert(slot > 0 && slot <= PMSignalState->num_child_flags); + slot--; + Assert(PMSignalState->PMChildFlags[slot] == PM_CHILD_ACTIVE); + PMSignalState->PMChildFlags[slot] = PM_CHILD_WALSENDER; +} + +/* + * MarkPostmasterChildInactive - mark a postmaster child as done using + * shared memory. This is called in the child process. + */ +void +MarkPostmasterChildInactive(void) +{ + int slot = MyPMChildSlot; + + Assert(slot > 0 && slot <= PMSignalState->num_child_flags); + slot--; + Assert(PMSignalState->PMChildFlags[slot] == PM_CHILD_ACTIVE || + PMSignalState->PMChildFlags[slot] == PM_CHILD_WALSENDER); + PMSignalState->PMChildFlags[slot] = PM_CHILD_ASSIGNED; +} + + +/* + * PostmasterIsAliveInternal - check whether postmaster process is still alive + * + * This is the slow path of PostmasterIsAlive(), where the caller has already + * checked 'postmaster_possibly_dead'. (On platforms that don't support + * a signal for parent death, PostmasterIsAlive() is just an alias for this.) + */ +bool +PostmasterIsAliveInternal(void) +{ +#ifdef USE_POSTMASTER_DEATH_SIGNAL + /* + * Reset the flag before checking, so that we don't miss a signal if + * postmaster dies right after the check. If postmaster was indeed dead, + * we'll re-arm it before returning to caller. + */ + postmaster_possibly_dead = false; +#endif + +#ifndef WIN32 + { + char c; + ssize_t rc; + + rc = read(postmaster_alive_fds[POSTMASTER_FD_WATCH], &c, 1); + + /* + * In the usual case, the postmaster is still alive, and there is no + * data in the pipe. + */ + if (rc < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) + return true; + else + { + /* + * Postmaster is dead, or something went wrong with the read() + * call. + */ + +#ifdef USE_POSTMASTER_DEATH_SIGNAL + postmaster_possibly_dead = true; +#endif + + if (rc < 0) + elog(FATAL, "read on postmaster death monitoring pipe failed: %m"); + else if (rc > 0) + elog(FATAL, "unexpected data in postmaster death monitoring pipe"); + + return false; + } + } + +#else /* WIN32 */ + if (WaitForSingleObject(PostmasterHandle, 0) == WAIT_TIMEOUT) + return true; + else + { +#ifdef USE_POSTMASTER_DEATH_SIGNAL + postmaster_possibly_dead = true; +#endif + return false; + } +#endif /* WIN32 */ +} + +/* + * PostmasterDeathSignalInit - request signal on postmaster death if possible + */ +void +PostmasterDeathSignalInit(void) +{ +#ifdef USE_POSTMASTER_DEATH_SIGNAL + int signum = POSTMASTER_DEATH_SIGNAL; + + /* Register our signal handler. */ + pqsignal(signum, postmaster_death_handler); + + /* Request a signal on parent exit. */ +#if defined(PR_SET_PDEATHSIG) + if (prctl(PR_SET_PDEATHSIG, signum) < 0) + elog(ERROR, "could not request parent death signal: %m"); +#elif defined(PROC_PDEATHSIG_CTL) + if (procctl(P_PID, 0, PROC_PDEATHSIG_CTL, &signum) < 0) + elog(ERROR, "could not request parent death signal: %m"); +#else +#error "USE_POSTMASTER_DEATH_SIGNAL set, but there is no mechanism to request the signal" +#endif + + /* + * Just in case the parent was gone already and we missed it, we'd better + * check the slow way on the first call. 
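+ *
+ * Setting the flag below forces exactly that: the next PostmasterIsAlive()
+ * call will go through PostmasterIsAliveInternal(). A caller's loop might
+ * look like this (a sketch only; real callers differ in how they bail out):
+ *
+ *    if (!PostmasterIsAlive())
+ *        exit(1);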
+ */ + postmaster_possibly_dead = true; +#endif /* USE_POSTMASTER_DEATH_SIGNAL */ +} diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c new file mode 100644 index 0000000..755f842 --- /dev/null +++ b/src/backend/storage/ipc/procarray.c @@ -0,0 +1,5220 @@ +/*------------------------------------------------------------------------- + * + * procarray.c + * POSTGRES process array code. + * + * + * This module maintains arrays of PGPROC substructures, as well as associated + * arrays in ProcGlobal, for all active backends. Although there are several + * uses for this, the principal one is as a means of determining the set of + * currently running transactions. + * + * Because of various subtle race conditions it is critical that a backend + * hold the correct locks while setting or clearing its xid (in + * ProcGlobal->xids[]/MyProc->xid). See notes in + * src/backend/access/transam/README. + * + * The process arrays now also include structures representing prepared + * transactions. The xid and subxids fields of these are valid, as are the + * myProcLocks lists. They can be distinguished from regular backend PGPROCs + * at need by checking for pid == 0. + * + * During hot standby, we also keep a list of XIDs representing transactions + * that are known to be running on the primary (or more precisely, were running + * as of the current point in the WAL stream). This list is kept in the + * KnownAssignedXids array, and is updated by watching the sequence of + * arriving XIDs. This is necessary because if we leave those XIDs out of + * snapshots taken for standby queries, then they will appear to be already + * complete, leading to MVCC failures. Note that in hot standby, the PGPROC + * array represents standby processes, which by definition are not running + * transactions that have XIDs. + * + * It is perhaps possible for a backend on the primary to terminate without + * writing an abort record for its transaction. While that shouldn't really + * happen, it would tie up KnownAssignedXids indefinitely, so we protect + * ourselves by pruning the array when a valid list of running XIDs arrives. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/procarray.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> + +#include "access/clog.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/catalog.h" +#include "catalog/pg_authid.h" +#include "commands/dbcommands.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/spin.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" + +#define UINT32_ACCESS_ONCE(var) ((uint32)(*((volatile uint32 *)&(var)))) + +/* Our shared memory area */ +typedef struct ProcArrayStruct +{ + int numProcs; /* number of valid procs entries */ + int maxProcs; /* allocated size of procs array */ + + /* + * Known assigned XIDs handling + */ + int maxKnownAssignedXids; /* allocated size of array */ + int numKnownAssignedXids; /* current # of valid entries */ + int tailKnownAssignedXids; /* index of oldest valid element */ + int headKnownAssignedXids; /* index of newest element, + 1 */ + slock_t known_assigned_xids_lck; /* protects head/tail pointers */ + + /* + * Highest subxid that has been removed from KnownAssignedXids array to + * prevent overflow; or InvalidTransactionId if none. We track this for + * similar reasons to tracking overflowing cached subxids in PGPROC + * entries. Must hold exclusive ProcArrayLock to change this, and shared + * lock to read it. + */ + TransactionId lastOverflowedXid; + + /* oldest xmin of any replication slot */ + TransactionId replication_slot_xmin; + /* oldest catalog xmin of any replication slot */ + TransactionId replication_slot_catalog_xmin; + + /* indexes into allProcs[], has PROCARRAY_MAXPROCS entries */ + int pgprocnos[FLEXIBLE_ARRAY_MEMBER]; +} ProcArrayStruct; + +/* + * State for the GlobalVisTest* family of functions. Those functions can + * e.g. be used to decide if a deleted row can be removed without violating + * MVCC semantics: If the deleted row's xmax is not considered to be running + * by anyone, the row can be removed. + * + * To avoid slowing down GetSnapshotData(), we don't calculate a precise + * cutoff XID while building a snapshot (looking at the frequently changing + * xmins scales badly). Instead we compute two boundaries while building the + * snapshot: + * + * 1) definitely_needed, indicating that rows deleted by XIDs >= + * definitely_needed are definitely still visible. + * + * 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can + * definitely be removed + * + * When testing an XID that falls in between the two (i.e. XID >= maybe_needed + * && XID < definitely_needed), the boundaries can be recomputed (using + * ComputeXidHorizons()) to get a more accurate answer. This is cheaper than + * maintaining an accurate value all the time. + * + * As it is not cheap to compute accurate boundaries, we limit the number of + * times that happens in short succession. See GlobalVisTestShouldUpdate(). + * + * + * There are three backend lifetime instances of this struct, optimized for + * different types of relations. As e.g. 
a normal user defined table in one + * database is inaccessible to backends connected to another database, a test + * specific to a relation can be more aggressive than a test for a shared + * relation. Currently we track four different states: + * + * 1) GlobalVisSharedRels, which only considers an XID's + * effects visible-to-everyone if neither snapshots in any database, nor a + * replication slot's xmin, nor a replication slot's catalog_xmin might + * still consider XID as running. + * + * 2) GlobalVisCatalogRels, which only considers an XID's + * effects visible-to-everyone if neither snapshots in the current + * database, nor a replication slot's xmin, nor a replication slot's + * catalog_xmin might still consider XID as running. + * + * I.e. the difference to GlobalVisSharedRels is that + * snapshot in other databases are ignored. + * + * 3) GlobalVisDataRels, which only considers an XID's + * effects visible-to-everyone if neither snapshots in the current + * database, nor a replication slot's xmin consider XID as running. + * + * I.e. the difference to GlobalVisCatalogRels is that + * replication slot's catalog_xmin is not taken into account. + * + * 4) GlobalVisTempRels, which only considers the current session, as temp + * tables are not visible to other sessions. + * + * GlobalVisTestFor(relation) returns the appropriate state + * for the relation. + * + * The boundaries are FullTransactionIds instead of TransactionIds to avoid + * wraparound dangers. There e.g. would otherwise exist no procarray state to + * prevent maybe_needed to become old enough after the GetSnapshotData() + * call. + * + * The typedef is in the header. + */ +struct GlobalVisState +{ + /* XIDs >= are considered running by some backend */ + FullTransactionId definitely_needed; + + /* XIDs < are not considered to be running by any backend */ + FullTransactionId maybe_needed; +}; + +/* + * Result of ComputeXidHorizons(). + */ +typedef struct ComputeXidHorizonsResult +{ + /* + * The value of ShmemVariableCache->latestCompletedXid when + * ComputeXidHorizons() held ProcArrayLock. + */ + FullTransactionId latest_completed; + + /* + * The same for procArray->replication_slot_xmin and. + * procArray->replication_slot_catalog_xmin. + */ + TransactionId slot_xmin; + TransactionId slot_catalog_xmin; + + /* + * Oldest xid that any backend might still consider running. This needs to + * include processes running VACUUM, in contrast to the normal visibility + * cutoffs, as vacuum needs to be able to perform pg_subtrans lookups when + * determining visibility, but doesn't care about rows above its xmin to + * be removed. + * + * This likely should only be needed to determine whether pg_subtrans can + * be truncated. It currently includes the effects of replication slots, + * for historical reasons. But that could likely be changed. + */ + TransactionId oldest_considered_running; + + /* + * Oldest xid for which deleted tuples need to be retained in shared + * tables. + * + * This includes the effects of replication slots. If that's not desired, + * look at shared_oldest_nonremovable_raw; + */ + TransactionId shared_oldest_nonremovable; + + /* + * Oldest xid that may be necessary to retain in shared tables. This is + * the same as shared_oldest_nonremovable, except that is not affected by + * replication slot's catalog_xmin. 
+ * + * This is mainly useful to be able to send the catalog_xmin to upstream + * streaming replication servers via hot_standby_feedback, so they can + * apply the limit only when accessing catalog tables. + */ + TransactionId shared_oldest_nonremovable_raw; + + /* + * Oldest xid for which deleted tuples need to be retained in non-shared + * catalog tables. + */ + TransactionId catalog_oldest_nonremovable; + + /* + * Oldest xid for which deleted tuples need to be retained in normal user + * defined tables. + */ + TransactionId data_oldest_nonremovable; + + /* + * Oldest xid for which deleted tuples need to be retained in this + * session's temporary tables. + */ + TransactionId temp_oldest_nonremovable; + +} ComputeXidHorizonsResult; + +/* + * Return value for GlobalVisHorizonKindForRel(). + */ +typedef enum GlobalVisHorizonKind +{ + VISHORIZON_SHARED, + VISHORIZON_CATALOG, + VISHORIZON_DATA, + VISHORIZON_TEMP +} GlobalVisHorizonKind; + + +static ProcArrayStruct *procArray; + +static PGPROC *allProcs; + +/* + * Cache to reduce overhead of repeated calls to TransactionIdIsInProgress() + */ +static TransactionId cachedXidIsNotInProgress = InvalidTransactionId; + +/* + * Bookkeeping for tracking emulated transactions in recovery + */ +static TransactionId *KnownAssignedXids; +static bool *KnownAssignedXidsValid; +static TransactionId latestObservedXid = InvalidTransactionId; + +/* + * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is + * the highest xid that might still be running that we don't have in + * KnownAssignedXids. + */ +static TransactionId standbySnapshotPendingXmin; + +/* + * State for visibility checks on different types of relations. See struct + * GlobalVisState for details. As shared, catalog, normal and temporary + * relations can have different horizons, one such state exists for each. + */ +static GlobalVisState GlobalVisSharedRels; +static GlobalVisState GlobalVisCatalogRels; +static GlobalVisState GlobalVisDataRels; +static GlobalVisState GlobalVisTempRels; + +/* + * This backend's RecentXmin at the last time the accurate xmin horizon was + * recomputed, or InvalidTransactionId if it has not. Used to limit how many + * times accurate horizons are recomputed. See GlobalVisTestShouldUpdate(). 
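+ *
+ * For context, a typical consumer of a GlobalVisState looks roughly like
+ * this (a sketch; the real pruning callers live in the heap AM code):
+ *
+ *    GlobalVisState *vistest = GlobalVisTestFor(relation);
+ *
+ *    if (GlobalVisTestIsRemovableXid(vistest, dead_xmax))
+ *        ... no one can still see the deleting xact, so prune ...
+ *
+ * GlobalVisTestIsRemovableXid() answers from the cached boundaries when it
+ * can, and only recomputes horizons, subject to GlobalVisTestShouldUpdate(),
+ * for XIDs that fall between maybe_needed and definitely_needed.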
+ */ +static TransactionId ComputeXidHorizonsResultLastXmin; + +#ifdef XIDCACHE_DEBUG + +/* counters for XidCache measurement */ +static long xc_by_recent_xmin = 0; +static long xc_by_known_xact = 0; +static long xc_by_my_xact = 0; +static long xc_by_latest_xid = 0; +static long xc_by_main_xid = 0; +static long xc_by_child_xid = 0; +static long xc_by_known_assigned = 0; +static long xc_no_overflow = 0; +static long xc_slow_answer = 0; + +#define xc_by_recent_xmin_inc() (xc_by_recent_xmin++) +#define xc_by_known_xact_inc() (xc_by_known_xact++) +#define xc_by_my_xact_inc() (xc_by_my_xact++) +#define xc_by_latest_xid_inc() (xc_by_latest_xid++) +#define xc_by_main_xid_inc() (xc_by_main_xid++) +#define xc_by_child_xid_inc() (xc_by_child_xid++) +#define xc_by_known_assigned_inc() (xc_by_known_assigned++) +#define xc_no_overflow_inc() (xc_no_overflow++) +#define xc_slow_answer_inc() (xc_slow_answer++) + +static void DisplayXidCache(void); +#else /* !XIDCACHE_DEBUG */ + +#define xc_by_recent_xmin_inc() ((void) 0) +#define xc_by_known_xact_inc() ((void) 0) +#define xc_by_my_xact_inc() ((void) 0) +#define xc_by_latest_xid_inc() ((void) 0) +#define xc_by_main_xid_inc() ((void) 0) +#define xc_by_child_xid_inc() ((void) 0) +#define xc_by_known_assigned_inc() ((void) 0) +#define xc_no_overflow_inc() ((void) 0) +#define xc_slow_answer_inc() ((void) 0) +#endif /* XIDCACHE_DEBUG */ + +static VirtualTransactionId *GetVirtualXIDsDelayingChkptGuts(int *nvxids, + int type); +static bool HaveVirtualXIDsDelayingChkptGuts(VirtualTransactionId *vxids, + int nvxids, int type); + +/* Primitives for KnownAssignedXids array handling for standby */ +static void KnownAssignedXidsCompress(bool force); +static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid, + bool exclusive_lock); +static bool KnownAssignedXidsSearch(TransactionId xid, bool remove); +static bool KnownAssignedXidExists(TransactionId xid); +static void KnownAssignedXidsRemove(TransactionId xid); +static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids, + TransactionId *subxids); +static void KnownAssignedXidsRemovePreceding(TransactionId xid); +static int KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax); +static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, + TransactionId *xmin, + TransactionId xmax); +static TransactionId KnownAssignedXidsGetOldestXmin(void); +static void KnownAssignedXidsDisplay(int trace_level); +static void KnownAssignedXidsReset(void); +static inline void ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid); +static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid); +static void MaintainLatestCompletedXid(TransactionId latestXid); +static void MaintainLatestCompletedXidRecovery(TransactionId latestXid); + +static inline FullTransactionId FullXidRelativeTo(FullTransactionId rel, + TransactionId xid); +static void GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons); + +/* + * Report shared-memory space needed by CreateSharedProcArray. + */ +Size +ProcArrayShmemSize(void) +{ + Size size; + + /* Size of the ProcArray structure itself */ +#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts) + + size = offsetof(ProcArrayStruct, pgprocnos); + size = add_size(size, mul_size(sizeof(int), PROCARRAY_MAXPROCS)); + + /* + * During Hot Standby processing we have a data structure called + * KnownAssignedXids, created in shared memory. 
Local data structures are + * also created in various backends during GetSnapshotData(), + * TransactionIdIsInProgress() and GetRunningTransactionData(). All of the + * main structures created in those functions must be identically sized, + * since we may at times copy the whole of the data structures around. We + * refer to this size as TOTAL_MAX_CACHED_SUBXIDS. + * + * Ideally we'd only create this structure if we were actually doing hot + * standby in the current run, but we don't know that yet at the time + * shared memory is being set up. + */ +#define TOTAL_MAX_CACHED_SUBXIDS \ + ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS) + + if (EnableHotStandby) + { + size = add_size(size, + mul_size(sizeof(TransactionId), + TOTAL_MAX_CACHED_SUBXIDS)); + size = add_size(size, + mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS)); + } + + return size; +} + +/* + * Initialize the shared PGPROC array during postmaster startup. + */ +void +CreateSharedProcArray(void) +{ + bool found; + + /* Create or attach to the ProcArray shared structure */ + procArray = (ProcArrayStruct *) + ShmemInitStruct("Proc Array", + add_size(offsetof(ProcArrayStruct, pgprocnos), + mul_size(sizeof(int), + PROCARRAY_MAXPROCS)), + &found); + + if (!found) + { + /* + * We're the first - initialize. + */ + procArray->numProcs = 0; + procArray->maxProcs = PROCARRAY_MAXPROCS; + procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS; + procArray->numKnownAssignedXids = 0; + procArray->tailKnownAssignedXids = 0; + procArray->headKnownAssignedXids = 0; + SpinLockInit(&procArray->known_assigned_xids_lck); + procArray->lastOverflowedXid = InvalidTransactionId; + procArray->replication_slot_xmin = InvalidTransactionId; + procArray->replication_slot_catalog_xmin = InvalidTransactionId; + ShmemVariableCache->xactCompletionCount = 1; + } + + allProcs = ProcGlobal->allProcs; + + /* Create or attach to the KnownAssignedXids arrays too, if needed */ + if (EnableHotStandby) + { + KnownAssignedXids = (TransactionId *) + ShmemInitStruct("KnownAssignedXids", + mul_size(sizeof(TransactionId), + TOTAL_MAX_CACHED_SUBXIDS), + &found); + KnownAssignedXidsValid = (bool *) + ShmemInitStruct("KnownAssignedXidsValid", + mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS), + &found); + } +} + +/* + * Add the specified PGPROC to the shared array. + */ +void +ProcArrayAdd(PGPROC *proc) +{ + ProcArrayStruct *arrayP = procArray; + int index; + int movecount; + + /* See ProcGlobal comment explaining why both locks are held */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + + if (arrayP->numProcs >= arrayP->maxProcs) + { + /* + * Oops, no room. (This really shouldn't happen, since there is a + * fixed supply of PGPROC structs too, and so we should have failed + * earlier.) + */ + ereport(FATAL, + (errcode(ERRCODE_TOO_MANY_CONNECTIONS), + errmsg("sorry, too many clients already"))); + } + + /* + * Keep the procs array sorted by (PGPROC *) so that we can utilize + * locality of references much better. This is useful while traversing the + * ProcArray because there is an increased likelihood of finding the next + * PGPROC structure in the cache. 
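+	 *
+	 * (What is physically stored is each proc's pgprocno; since all PGPROCs
+	 * live in the single allProcs array, ascending pgprocno order is the
+	 * same thing as ascending PGPROC pointer order.)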
+ * + * Since the occurrence of adding/removing a proc is much lower than the + * access to the ProcArray itself, the overhead should be marginal + */ + for (index = 0; index < arrayP->numProcs; index++) + { + int procno PG_USED_FOR_ASSERTS_ONLY = arrayP->pgprocnos[index]; + + Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); + Assert(allProcs[procno].pgxactoff == index); + + /* If we have found our right position in the array, break */ + if (arrayP->pgprocnos[index] > proc->pgprocno) + break; + } + + movecount = arrayP->numProcs - index; + memmove(&arrayP->pgprocnos[index + 1], + &arrayP->pgprocnos[index], + movecount * sizeof(*arrayP->pgprocnos)); + memmove(&ProcGlobal->xids[index + 1], + &ProcGlobal->xids[index], + movecount * sizeof(*ProcGlobal->xids)); + memmove(&ProcGlobal->subxidStates[index + 1], + &ProcGlobal->subxidStates[index], + movecount * sizeof(*ProcGlobal->subxidStates)); + memmove(&ProcGlobal->statusFlags[index + 1], + &ProcGlobal->statusFlags[index], + movecount * sizeof(*ProcGlobal->statusFlags)); + + arrayP->pgprocnos[index] = proc->pgprocno; + proc->pgxactoff = index; + ProcGlobal->xids[index] = proc->xid; + ProcGlobal->subxidStates[index] = proc->subxidStatus; + ProcGlobal->statusFlags[index] = proc->statusFlags; + + arrayP->numProcs++; + + /* adjust pgxactoff for all following PGPROCs */ + index++; + for (; index < arrayP->numProcs; index++) + { + int procno = arrayP->pgprocnos[index]; + + Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); + Assert(allProcs[procno].pgxactoff == index - 1); + + allProcs[procno].pgxactoff = index; + } + + /* + * Release in reversed acquisition order, to reduce frequency of having to + * wait for XidGenLock while holding ProcArrayLock. + */ + LWLockRelease(XidGenLock); + LWLockRelease(ProcArrayLock); +} + +/* + * Remove the specified PGPROC from the shared array. + * + * When latestXid is a valid XID, we are removing a live 2PC gxact from the + * array, and thus causing it to appear as "not running" anymore. In this + * case we must advance latestCompletedXid. (This is essentially the same + * as ProcArrayEndTransaction followed by removal of the PGPROC, but we take + * the ProcArrayLock only once, and don't damage the content of the PGPROC; + * twophase.c depends on the latter.) 
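+ *
+ * For example (a sketch, not a verbatim quote): proc.c detaches an ordinary
+ * backend with
+ *
+ *    ProcArrayRemove(MyProc, InvalidTransactionId);
+ *
+ * because the backend's XID has already been cleared by then, while
+ * twophase.c passes the gxact's latest XID when a prepared transaction
+ * finishes.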
+ */ +void +ProcArrayRemove(PGPROC *proc, TransactionId latestXid) +{ + ProcArrayStruct *arrayP = procArray; + int myoff; + int movecount; + +#ifdef XIDCACHE_DEBUG + /* dump stats at backend shutdown, but not prepared-xact end */ + if (proc->pid != 0) + DisplayXidCache(); +#endif + + /* See ProcGlobal comment explaining why both locks are held */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + + myoff = proc->pgxactoff; + + Assert(myoff >= 0 && myoff < arrayP->numProcs); + Assert(ProcGlobal->allProcs[arrayP->pgprocnos[myoff]].pgxactoff == myoff); + + if (TransactionIdIsValid(latestXid)) + { + Assert(TransactionIdIsValid(ProcGlobal->xids[myoff])); + + /* Advance global latestCompletedXid while holding the lock */ + MaintainLatestCompletedXid(latestXid); + + /* Same with xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; + + ProcGlobal->xids[myoff] = InvalidTransactionId; + ProcGlobal->subxidStates[myoff].overflowed = false; + ProcGlobal->subxidStates[myoff].count = 0; + } + else + { + /* Shouldn't be trying to remove a live transaction here */ + Assert(!TransactionIdIsValid(ProcGlobal->xids[myoff])); + } + + Assert(!TransactionIdIsValid(ProcGlobal->xids[myoff])); + Assert(ProcGlobal->subxidStates[myoff].count == 0); + Assert(ProcGlobal->subxidStates[myoff].overflowed == false); + + ProcGlobal->statusFlags[myoff] = 0; + + /* Keep the PGPROC array sorted. See notes above */ + movecount = arrayP->numProcs - myoff - 1; + memmove(&arrayP->pgprocnos[myoff], + &arrayP->pgprocnos[myoff + 1], + movecount * sizeof(*arrayP->pgprocnos)); + memmove(&ProcGlobal->xids[myoff], + &ProcGlobal->xids[myoff + 1], + movecount * sizeof(*ProcGlobal->xids)); + memmove(&ProcGlobal->subxidStates[myoff], + &ProcGlobal->subxidStates[myoff + 1], + movecount * sizeof(*ProcGlobal->subxidStates)); + memmove(&ProcGlobal->statusFlags[myoff], + &ProcGlobal->statusFlags[myoff + 1], + movecount * sizeof(*ProcGlobal->statusFlags)); + + arrayP->pgprocnos[arrayP->numProcs - 1] = -1; /* for debugging */ + arrayP->numProcs--; + + /* + * Adjust pgxactoff of following procs for removed PGPROC (note that + * numProcs already has been decremented). + */ + for (int index = myoff; index < arrayP->numProcs; index++) + { + int procno = arrayP->pgprocnos[index]; + + Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); + Assert(allProcs[procno].pgxactoff - 1 == index); + + allProcs[procno].pgxactoff = index; + } + + /* + * Release in reversed acquisition order, to reduce frequency of having to + * wait for XidGenLock while holding ProcArrayLock. + */ + LWLockRelease(XidGenLock); + LWLockRelease(ProcArrayLock); +} + + +/* + * ProcArrayEndTransaction -- mark a transaction as no longer running + * + * This is used interchangeably for commit and abort cases. The transaction + * commit/abort must already be reported to WAL and pg_xact. + * + * proc is currently always MyProc, but we pass it explicitly for flexibility. + * latestXid is the latest Xid among the transaction's main XID and + * subtransactions, or InvalidTransactionId if it has no XID. (We must ask + * the caller to pass latestXid, instead of computing it from the PGPROC's + * contents, because the subxid information in the PGPROC might be + * incomplete.) 
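+ *
+ * For reference, the commit path uses this roughly as follows (a sketch,
+ * simplified from xact.c):
+ *
+ *    latestXid = RecordTransactionCommit();
+ *    ...
+ *    ProcArrayEndTransaction(MyProc, latestXid);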
+ */ +void +ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) +{ + if (TransactionIdIsValid(latestXid)) + { + /* + * We must lock ProcArrayLock while clearing our advertised XID, so + * that we do not exit the set of "running" transactions while someone + * else is taking a snapshot. See discussion in + * src/backend/access/transam/README. + */ + Assert(TransactionIdIsValid(proc->xid)); + + /* + * If we can immediately acquire ProcArrayLock, we clear our own XID + * and release the lock. If not, use group XID clearing to improve + * efficiency. + */ + if (LWLockConditionalAcquire(ProcArrayLock, LW_EXCLUSIVE)) + { + ProcArrayEndTransactionInternal(proc, latestXid); + LWLockRelease(ProcArrayLock); + } + else + ProcArrayGroupClearXid(proc, latestXid); + } + else + { + /* + * If we have no XID, we don't need to lock, since we won't affect + * anyone else's calculation of a snapshot. We might change their + * estimate of global xmin, but that's OK. + */ + Assert(!TransactionIdIsValid(proc->xid)); + Assert(proc->subxidStatus.count == 0); + Assert(!proc->subxidStatus.overflowed); + + proc->lxid = InvalidLocalTransactionId; + proc->xmin = InvalidTransactionId; + + /* be sure these are cleared in abort */ + proc->delayChkpt = false; + proc->delayChkptEnd = false; + + proc->recoveryConflictPending = false; + + /* must be cleared with xid/xmin: */ + /* avoid unnecessarily dirtying shared cachelines */ + if (proc->statusFlags & PROC_VACUUM_STATE_MASK) + { + Assert(!LWLockHeldByMe(ProcArrayLock)); + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + Assert(proc->statusFlags == ProcGlobal->statusFlags[proc->pgxactoff]); + proc->statusFlags &= ~PROC_VACUUM_STATE_MASK; + ProcGlobal->statusFlags[proc->pgxactoff] = proc->statusFlags; + LWLockRelease(ProcArrayLock); + } + } +} + +/* + * Mark a write transaction as no longer running. + * + * We don't do any locking here; caller must handle that. + */ +static inline void +ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) +{ + int pgxactoff = proc->pgxactoff; + + /* + * Note: we need exclusive lock here because we're going to change other + * processes' PGPROC entries. 
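+	 *
+	 * (This matters because ProcArrayGroupClearXid() below has the group
+	 * leader call this function for other backends' PGPROCs, not only for
+	 * its own.)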
+ */ + Assert(LWLockHeldByMeInMode(ProcArrayLock, LW_EXCLUSIVE)); + Assert(TransactionIdIsValid(ProcGlobal->xids[pgxactoff])); + Assert(ProcGlobal->xids[pgxactoff] == proc->xid); + + ProcGlobal->xids[pgxactoff] = InvalidTransactionId; + proc->xid = InvalidTransactionId; + proc->lxid = InvalidLocalTransactionId; + proc->xmin = InvalidTransactionId; + + /* be sure these are cleared in abort */ + proc->delayChkpt = false; + proc->delayChkptEnd = false; + + proc->recoveryConflictPending = false; + + /* must be cleared with xid/xmin: */ + /* avoid unnecessarily dirtying shared cachelines */ + if (proc->statusFlags & PROC_VACUUM_STATE_MASK) + { + proc->statusFlags &= ~PROC_VACUUM_STATE_MASK; + ProcGlobal->statusFlags[proc->pgxactoff] = proc->statusFlags; + } + + /* Clear the subtransaction-XID cache too while holding the lock */ + Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count && + ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed); + if (proc->subxidStatus.count > 0 || proc->subxidStatus.overflowed) + { + ProcGlobal->subxidStates[pgxactoff].count = 0; + ProcGlobal->subxidStates[pgxactoff].overflowed = false; + proc->subxidStatus.count = 0; + proc->subxidStatus.overflowed = false; + } + + /* Also advance global latestCompletedXid while holding the lock */ + MaintainLatestCompletedXid(latestXid); + + /* Same with xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; +} + +/* + * ProcArrayGroupClearXid -- group XID clearing + * + * When we cannot immediately acquire ProcArrayLock in exclusive mode at + * commit time, add ourselves to a list of processes that need their XIDs + * cleared. The first process to add itself to the list will acquire + * ProcArrayLock in exclusive mode and perform ProcArrayEndTransactionInternal + * on behalf of all group members. This avoids a great deal of contention + * around ProcArrayLock when many processes are trying to commit at once, + * since the lock need not be repeatedly handed off from one committing + * process to the next. + */ +static void +ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) +{ + PROC_HDR *procglobal = ProcGlobal; + uint32 nextidx; + uint32 wakeidx; + + /* We should definitely have an XID to clear. */ + Assert(TransactionIdIsValid(proc->xid)); + + /* Add ourselves to the list of processes needing a group XID clear. */ + proc->procArrayGroupMember = true; + proc->procArrayGroupMemberXid = latestXid; + nextidx = pg_atomic_read_u32(&procglobal->procArrayGroupFirst); + while (true) + { + pg_atomic_write_u32(&proc->procArrayGroupNext, nextidx); + + if (pg_atomic_compare_exchange_u32(&procglobal->procArrayGroupFirst, + &nextidx, + (uint32) proc->pgprocno)) + break; + } + + /* + * If the list was not empty, the leader will clear our XID. It is + * impossible to have followers without a leader because the first process + * that has added itself to the list will always have nextidx as + * INVALID_PGPROCNO. + */ + if (nextidx != INVALID_PGPROCNO) + { + int extraWaits = 0; + + /* Sleep until the leader clears our XID. 
*/ + pgstat_report_wait_start(WAIT_EVENT_PROCARRAY_GROUP_UPDATE); + for (;;) + { + /* acts as a read barrier */ + PGSemaphoreLock(proc->sem); + if (!proc->procArrayGroupMember) + break; + extraWaits++; + } + pgstat_report_wait_end(); + + Assert(pg_atomic_read_u32(&proc->procArrayGroupNext) == INVALID_PGPROCNO); + + /* Fix semaphore count for any absorbed wakeups */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(proc->sem); + return; + } + + /* We are the leader. Acquire the lock on behalf of everyone. */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * Now that we've got the lock, clear the list of processes waiting for + * group XID clearing, saving a pointer to the head of the list. Trying + * to pop elements one at a time could lead to an ABA problem. + */ + nextidx = pg_atomic_exchange_u32(&procglobal->procArrayGroupFirst, + INVALID_PGPROCNO); + + /* Remember head of list so we can perform wakeups after dropping lock. */ + wakeidx = nextidx; + + /* Walk the list and clear all XIDs. */ + while (nextidx != INVALID_PGPROCNO) + { + PGPROC *nextproc = &allProcs[nextidx]; + + ProcArrayEndTransactionInternal(nextproc, nextproc->procArrayGroupMemberXid); + + /* Move to next proc in list. */ + nextidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext); + } + + /* We're done with the lock now. */ + LWLockRelease(ProcArrayLock); + + /* + * Now that we've released the lock, go back and wake everybody up. We + * don't do this under the lock so as to keep lock hold times to a + * minimum. The system calls we need to perform to wake other processes + * up are probably much slower than the simple memory writes we did while + * holding the lock. + */ + while (wakeidx != INVALID_PGPROCNO) + { + PGPROC *nextproc = &allProcs[wakeidx]; + + wakeidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext); + pg_atomic_write_u32(&nextproc->procArrayGroupNext, INVALID_PGPROCNO); + + /* ensure all previous writes are visible before follower continues. */ + pg_write_barrier(); + + nextproc->procArrayGroupMember = false; + + if (nextproc != MyProc) + PGSemaphoreUnlock(nextproc->sem); + } +} + +/* + * ProcArrayClearTransaction -- clear the transaction fields + * + * This is used after successfully preparing a 2-phase transaction. We are + * not actually reporting the transaction's XID as no longer running --- it + * will still appear as running because the 2PC's gxact is in the ProcArray + * too. We just have to clear out our own PGPROC. + */ +void +ProcArrayClearTransaction(PGPROC *proc) +{ + int pgxactoff; + + /* + * Currently we need to lock ProcArrayLock exclusively here, as we + * increment xactCompletionCount below. We also need it at least in shared + * mode for pgproc->pgxactoff to stay the same below. + * + * We could however, as this action does not actually change anyone's view + * of the set of running XIDs (our entry is duplicate with the gxact that + * has already been inserted into the ProcArray), lower the lock level to + * shared if we were to make xactCompletionCount an atomic variable. But + * that doesn't seem worth it currently, as a 2PC commit is heavyweight + * enough for this not to be the bottleneck. 
If it ever becomes a + * bottleneck it may also be worth considering to combine this with the + * subsequent ProcArrayRemove() + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + pgxactoff = proc->pgxactoff; + + ProcGlobal->xids[pgxactoff] = InvalidTransactionId; + proc->xid = InvalidTransactionId; + + proc->lxid = InvalidLocalTransactionId; + proc->xmin = InvalidTransactionId; + proc->recoveryConflictPending = false; + + Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK)); + Assert(!proc->delayChkpt); + + /* + * Need to increment completion count even though transaction hasn't + * really committed yet. The reason for that is that GetSnapshotData() + * omits the xid of the current transaction, thus without the increment we + * otherwise could end up reusing the snapshot later. Which would be bad, + * because it might not count the prepared transaction as running. + */ + ShmemVariableCache->xactCompletionCount++; + + /* Clear the subtransaction-XID cache too */ + Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count && + ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed); + if (proc->subxidStatus.count > 0 || proc->subxidStatus.overflowed) + { + ProcGlobal->subxidStates[pgxactoff].count = 0; + ProcGlobal->subxidStates[pgxactoff].overflowed = false; + proc->subxidStatus.count = 0; + proc->subxidStatus.overflowed = false; + } + + LWLockRelease(ProcArrayLock); +} + +/* + * Update ShmemVariableCache->latestCompletedXid to point to latestXid if + * currently older. + */ +static void +MaintainLatestCompletedXid(TransactionId latestXid) +{ + FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid; + + Assert(FullTransactionIdIsValid(cur_latest)); + Assert(!RecoveryInProgress()); + Assert(LWLockHeldByMe(ProcArrayLock)); + + if (TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid)) + { + ShmemVariableCache->latestCompletedXid = + FullXidRelativeTo(cur_latest, latestXid); + } + + Assert(IsBootstrapProcessingMode() || + FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)); +} + +/* + * Same as MaintainLatestCompletedXid, except for use during WAL replay. + */ +static void +MaintainLatestCompletedXidRecovery(TransactionId latestXid) +{ + FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid; + FullTransactionId rel; + + Assert(AmStartupProcess() || !IsUnderPostmaster); + Assert(LWLockHeldByMe(ProcArrayLock)); + + /* + * Need a FullTransactionId to compare latestXid with. Can't rely on + * latestCompletedXid to be initialized in recovery. But in recovery it's + * safe to access nextXid without a lock for the startup process. + */ + rel = ShmemVariableCache->nextXid; + Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid)); + + if (!FullTransactionIdIsValid(cur_latest) || + TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid)) + { + ShmemVariableCache->latestCompletedXid = + FullXidRelativeTo(rel, latestXid); + } + + Assert(FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)); +} + +/* + * ProcArrayInitRecovery -- initialize recovery xid mgmt environment + * + * Remember up to where the startup process initialized the CLOG and subtrans + * so we can ensure it's initialized gaplessly up to the point where necessary + * while in recovery. 
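+ *
+ * The startup process is expected to call this once, before any WAL is
+ * replayed, roughly like (a sketch, simplified from xlog.c):
+ *
+ *    ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid));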
+ */ +void +ProcArrayInitRecovery(TransactionId initializedUptoXID) +{ + Assert(standbyState == STANDBY_INITIALIZED); + Assert(TransactionIdIsNormal(initializedUptoXID)); + + /* + * we set latestObservedXid to the xid SUBTRANS has been initialized up + * to, so we can extend it from that point onwards in + * RecordKnownAssignedTransactionIds, and when we get consistent in + * ProcArrayApplyRecoveryInfo(). + */ + latestObservedXid = initializedUptoXID; + TransactionIdRetreat(latestObservedXid); +} + +/* + * ProcArrayApplyRecoveryInfo -- apply recovery info about xids + * + * Takes us through 3 states: Initialized, Pending and Ready. + * Normal case is to go all the way to Ready straight away, though there + * are atypical cases where we need to take it in steps. + * + * Use the data about running transactions on the primary to create the initial + * state of KnownAssignedXids. We also use these records to regularly prune + * KnownAssignedXids because we know it is possible that some transactions + * with FATAL errors fail to write abort records, which could cause eventual + * overflow. + * + * See comments for LogStandbySnapshot(). + */ +void +ProcArrayApplyRecoveryInfo(RunningTransactions running) +{ + TransactionId *xids; + int nxids; + int i; + + Assert(standbyState >= STANDBY_INITIALIZED); + Assert(TransactionIdIsValid(running->nextXid)); + Assert(TransactionIdIsValid(running->oldestRunningXid)); + Assert(TransactionIdIsNormal(running->latestCompletedXid)); + + /* + * Remove stale transactions, if any. + */ + ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid); + + /* + * Remove stale locks, if any. + */ + StandbyReleaseOldLocks(running->oldestRunningXid); + + /* + * If our snapshot is already valid, nothing else to do... + */ + if (standbyState == STANDBY_SNAPSHOT_READY) + return; + + /* + * If our initial RunningTransactionsData had an overflowed snapshot then + * we knew we were missing some subxids from our snapshot. If we continue + * to see overflowed snapshots then we might never be able to start up, so + * we make another test to see if our snapshot is now valid. We know that + * the missing subxids are equal to or earlier than nextXid. After we + * initialise we continue to apply changes during recovery, so once the + * oldestRunningXid is later than the nextXid from the initial snapshot we + * know that we no longer have missing information and can mark the + * snapshot as valid. + */ + if (standbyState == STANDBY_SNAPSHOT_PENDING) + { + /* + * If the snapshot isn't overflowed or if its empty we can reset our + * pending state and use this snapshot instead. + */ + if (!running->subxid_overflow || running->xcnt == 0) + { + /* + * If we have already collected known assigned xids, we need to + * throw them away before we apply the recovery snapshot. + */ + KnownAssignedXidsReset(); + standbyState = STANDBY_INITIALIZED; + } + else + { + if (TransactionIdPrecedes(standbySnapshotPendingXmin, + running->oldestRunningXid)) + { + standbyState = STANDBY_SNAPSHOT_READY; + elog(trace_recovery(DEBUG1), + "recovery snapshots are now enabled"); + } + else + elog(trace_recovery(DEBUG1), + "recovery snapshot waiting for non-overflowed snapshot or " + "until oldest active xid on standby is at least %u (now %u)", + standbySnapshotPendingXmin, + running->oldestRunningXid); + return; + } + } + + Assert(standbyState == STANDBY_INITIALIZED); + + /* + * NB: this can be reached at least twice, so make sure new code can deal + * with that. 
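+	 *
+	 * (For example, if an earlier overflowed snapshot put us into
+	 * STANDBY_SNAPSHOT_PENDING, a later non-overflowed running-xacts record
+	 * resets standbyState to STANDBY_INITIALIZED above, and this
+	 * initialization runs again.)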
+ */ + + /* + * Nobody else is running yet, but take locks anyhow + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * KnownAssignedXids is sorted so we cannot just add the xids, we have to + * sort them first. + * + * Some of the new xids are top-level xids and some are subtransactions. + * We don't call SubTransSetParent because it doesn't matter yet. If we + * aren't overflowed then all xids will fit in snapshot and so we don't + * need subtrans. If we later overflow, an xid assignment record will add + * xids to subtrans. If RunningTransactionsData is overflowed then we + * don't have enough information to correctly update subtrans anyway. + */ + + /* + * Allocate a temporary array to avoid modifying the array passed as + * argument. + */ + xids = palloc(sizeof(TransactionId) * (running->xcnt + running->subxcnt)); + + /* + * Add to the temp array any xids which have not already completed. + */ + nxids = 0; + for (i = 0; i < running->xcnt + running->subxcnt; i++) + { + TransactionId xid = running->xids[i]; + + /* + * The running-xacts snapshot can contain xids that were still visible + * in the procarray when the snapshot was taken, but were already + * WAL-logged as completed. They're not running anymore, so ignore + * them. + */ + if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) + continue; + + xids[nxids++] = xid; + } + + if (nxids > 0) + { + if (procArray->numKnownAssignedXids != 0) + { + LWLockRelease(ProcArrayLock); + elog(ERROR, "KnownAssignedXids is not empty"); + } + + /* + * Sort the array so that we can add them safely into + * KnownAssignedXids. + * + * We have to sort them logically, because in KnownAssignedXidsAdd we + * call TransactionIdFollowsOrEquals and so on. But we know these XIDs + * come from RUNNING_XACTS, which means there are only normal XIDs from + * the same epoch, so this is safe. + */ + qsort(xids, nxids, sizeof(TransactionId), xidLogicalComparator); + + /* + * Add the sorted snapshot into KnownAssignedXids. The running-xacts + * snapshot may include duplicated xids because of prepared + * transactions, so ignore them. + */ + for (i = 0; i < nxids; i++) + { + if (i > 0 && TransactionIdEquals(xids[i - 1], xids[i])) + { + elog(DEBUG1, + "found duplicated transaction %u for KnownAssignedXids insertion", + xids[i]); + continue; + } + KnownAssignedXidsAdd(xids[i], xids[i], true); + } + + KnownAssignedXidsDisplay(trace_recovery(DEBUG3)); + } + + pfree(xids); + + /* + * latestObservedXid is at least set to the point where SUBTRANS was + * started up to (cf. ProcArrayInitRecovery()) or to the biggest xid + * RecordKnownAssignedTransactionIds() was called for. Initialize + * subtrans from thereon, up to nextXid - 1. + * + * We need to duplicate parts of RecordKnownAssignedTransactionId() here, + * because we've just added xids to the known assigned xids machinery that + * haven't gone through RecordKnownAssignedTransactionId(). + */ + Assert(TransactionIdIsNormal(latestObservedXid)); + TransactionIdAdvance(latestObservedXid); + while (TransactionIdPrecedes(latestObservedXid, running->nextXid)) + { + ExtendSUBTRANS(latestObservedXid); + TransactionIdAdvance(latestObservedXid); + } + TransactionIdRetreat(latestObservedXid); /* = running->nextXid - 1 */ + + /* ---------- + * Now we've got the running xids we need to set the global values that + * are used to track snapshots as they evolve further. 
+ * + * - latestCompletedXid which will be the xmax for snapshots + * - lastOverflowedXid which shows whether snapshots overflow + * - nextXid + * + * If the snapshot overflowed, then we still initialise with what we know, + * but the recovery snapshot isn't fully valid yet because we know there + * are some subxids missing. We don't know the specific subxids that are + * missing, so conservatively assume the last one is latestObservedXid. + * ---------- + */ + if (running->subxid_overflow) + { + standbyState = STANDBY_SNAPSHOT_PENDING; + + standbySnapshotPendingXmin = latestObservedXid; + procArray->lastOverflowedXid = latestObservedXid; + } + else + { + standbyState = STANDBY_SNAPSHOT_READY; + + standbySnapshotPendingXmin = InvalidTransactionId; + } + + /* + * If a transaction wrote a commit record in the gap between taking and + * logging the snapshot then latestCompletedXid may already be higher than + * the value from the snapshot, so check before we use the incoming value. + * It also might not yet be set at all. + */ + MaintainLatestCompletedXidRecovery(running->latestCompletedXid); + + /* + * NB: No need to increment ShmemVariableCache->xactCompletionCount here, + * nobody can see it yet. + */ + + LWLockRelease(ProcArrayLock); + + /* ShmemVariableCache->nextXid must be beyond any observed xid. */ + AdvanceNextFullTransactionIdPastXid(latestObservedXid); + + Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid)); + + KnownAssignedXidsDisplay(trace_recovery(DEBUG3)); + if (standbyState == STANDBY_SNAPSHOT_READY) + elog(trace_recovery(DEBUG1), "recovery snapshots are now enabled"); + else + elog(trace_recovery(DEBUG1), + "recovery snapshot waiting for non-overflowed snapshot or " + "until oldest active xid on standby is at least %u (now %u)", + standbySnapshotPendingXmin, + running->oldestRunningXid); +} + +/* + * ProcArrayApplyXidAssignment + * Process an XLOG_XACT_ASSIGNMENT WAL record + */ +void +ProcArrayApplyXidAssignment(TransactionId topxid, + int nsubxids, TransactionId *subxids) +{ + TransactionId max_xid; + int i; + + Assert(standbyState >= STANDBY_INITIALIZED); + + max_xid = TransactionIdLatest(topxid, nsubxids, subxids); + + /* + * Mark all the subtransactions as observed. + * + * NOTE: This will fail if the subxid contains too many previously + * unobserved xids to fit into known-assigned-xids. That shouldn't happen + * as the code stands, because xid-assignment records should never contain + * more than PGPROC_MAX_CACHED_SUBXIDS entries. + */ + RecordKnownAssignedTransactionIds(max_xid); + + /* + * Notice that we update pg_subtrans with the top-level xid, rather than + * the parent xid. This is a difference between normal processing and + * recovery, yet is still correct in all cases. The reason is that + * subtransaction commit is not marked in clog until commit processing, so + * all aborted subtransactions have already been clearly marked in clog. + * As a result we are able to refer directly to the top-level + * transaction's state rather than skipping through all the intermediate + * states in the subtransaction tree. This should be the first time we + * have attempted to SubTransSetParent(). + */ + for (i = 0; i < nsubxids; i++) + SubTransSetParent(subxids[i], topxid); + + /* KnownAssignedXids isn't maintained yet, so we're done for now */ + if (standbyState == STANDBY_INITIALIZED) + return; + + /* + * Uses same locking as transaction commit + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * Remove subxids from known-assigned-xacts. 
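+	 * Only the subtransaction XIDs are removed here; the top-level XID
+	 * stays in KnownAssignedXids until its commit or abort record is
+	 * replayed.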
+ */ + KnownAssignedXidsRemoveTree(InvalidTransactionId, nsubxids, subxids); + + /* + * Advance lastOverflowedXid to be at least the last of these subxids. + */ + if (TransactionIdPrecedes(procArray->lastOverflowedXid, max_xid)) + procArray->lastOverflowedXid = max_xid; + + LWLockRelease(ProcArrayLock); +} + +/* + * TransactionIdIsInProgress -- is given transaction running in some backend + * + * Aside from some shortcuts such as checking RecentXmin and our own Xid, + * there are four possibilities for finding a running transaction: + * + * 1. The given Xid is a main transaction Id. We will find this out cheaply + * by looking at ProcGlobal->xids. + * + * 2. The given Xid is one of the cached subxact Xids in the PGPROC array. + * We can find this out cheaply too. + * + * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see + * if the Xid is running on the primary. + * + * 4. Search the SubTrans tree to find the Xid's topmost parent, and then see + * if that is running according to ProcGlobal->xids[] or KnownAssignedXids. + * This is the slowest way, but sadly it has to be done always if the others + * failed, unless we see that the cached subxact sets are complete (none have + * overflowed). + * + * ProcArrayLock has to be held while we do 1, 2, 3. If we save the top Xids + * while doing 1 and 3, we can release the ProcArrayLock while we do 4. + * This buys back some concurrency (and we can't retrieve the main Xids from + * ProcGlobal->xids[] again anyway; see GetNewTransactionId). + */ +bool +TransactionIdIsInProgress(TransactionId xid) +{ + static TransactionId *xids = NULL; + static TransactionId *other_xids; + XidCacheStatus *other_subxidstates; + int nxids = 0; + ProcArrayStruct *arrayP = procArray; + TransactionId topxid; + TransactionId latestCompletedXid; + int mypgxactoff; + int numProcs; + int j; + + /* + * Don't bother checking a transaction older than RecentXmin; it could not + * possibly still be running. (Note: in particular, this guarantees that + * we reject InvalidTransactionId, FrozenTransactionId, etc as not + * running.) + */ + if (TransactionIdPrecedes(xid, RecentXmin)) + { + xc_by_recent_xmin_inc(); + return false; + } + + /* + * We may have just checked the status of this transaction, so if it is + * already known to be completed, we can fall out without any access to + * shared memory. + */ + if (TransactionIdEquals(cachedXidIsNotInProgress, xid)) + { + xc_by_known_xact_inc(); + return false; + } + + /* + * Also, we can handle our own transaction (and subtransactions) without + * any access to shared memory. + */ + if (TransactionIdIsCurrentTransactionId(xid)) + { + xc_by_my_xact_inc(); + return true; + } + + /* + * If first time through, get workspace to remember main XIDs in. We + * malloc it permanently to avoid repeated palloc/pfree overhead. + */ + if (xids == NULL) + { + /* + * In hot standby mode, reserve enough space to hold all xids in the + * known-assigned list. If we later finish recovery, we no longer need + * the bigger array, but we don't bother to shrink it. + */ + int maxxids = RecoveryInProgress() ? 
TOTAL_MAX_CACHED_SUBXIDS : arrayP->maxProcs; + + xids = (TransactionId *) malloc(maxxids * sizeof(TransactionId)); + if (xids == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + other_xids = ProcGlobal->xids; + other_subxidstates = ProcGlobal->subxidStates; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + /* + * Now that we have the lock, we can check latestCompletedXid; if the + * target Xid is after that, it's surely still running. + */ + latestCompletedXid = + XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid); + if (TransactionIdPrecedes(latestCompletedXid, xid)) + { + LWLockRelease(ProcArrayLock); + xc_by_latest_xid_inc(); + return true; + } + + /* No shortcuts, gotta grovel through the array */ + mypgxactoff = MyProc->pgxactoff; + numProcs = arrayP->numProcs; + for (int pgxactoff = 0; pgxactoff < numProcs; pgxactoff++) + { + int pgprocno; + PGPROC *proc; + TransactionId pxid; + int pxids; + + /* Ignore ourselves --- dealt with it above */ + if (pgxactoff == mypgxactoff) + continue; + + /* Fetch xid just once - see GetNewTransactionId */ + pxid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); + + if (!TransactionIdIsValid(pxid)) + continue; + + /* + * Step 1: check the main Xid + */ + if (TransactionIdEquals(pxid, xid)) + { + LWLockRelease(ProcArrayLock); + xc_by_main_xid_inc(); + return true; + } + + /* + * We can ignore main Xids that are younger than the target Xid, since + * the target could not possibly be their child. + */ + if (TransactionIdPrecedes(xid, pxid)) + continue; + + /* + * Step 2: check the cached child-Xids arrays + */ + pxids = other_subxidstates[pgxactoff].count; + pg_read_barrier(); /* pairs with barrier in GetNewTransactionId() */ + pgprocno = arrayP->pgprocnos[pgxactoff]; + proc = &allProcs[pgprocno]; + for (j = pxids - 1; j >= 0; j--) + { + /* Fetch xid just once - see GetNewTransactionId */ + TransactionId cxid = UINT32_ACCESS_ONCE(proc->subxids.xids[j]); + + if (TransactionIdEquals(cxid, xid)) + { + LWLockRelease(ProcArrayLock); + xc_by_child_xid_inc(); + return true; + } + } + + /* + * Save the main Xid for step 4. We only need to remember main Xids + * that have uncached children. (Note: there is no race condition + * here because the overflowed flag cannot be cleared, only set, while + * we hold ProcArrayLock. So we can't miss an Xid that we need to + * worry about.) + */ + if (other_subxidstates[pgxactoff].overflowed) + xids[nxids++] = pxid; + } + + /* + * Step 3: in hot standby mode, check the known-assigned-xids list. XIDs + * in the list must be treated as running. + */ + if (RecoveryInProgress()) + { + /* none of the PGPROC entries should have XIDs in hot standby mode */ + Assert(nxids == 0); + + if (KnownAssignedXidExists(xid)) + { + LWLockRelease(ProcArrayLock); + xc_by_known_assigned_inc(); + return true; + } + + /* + * If the KnownAssignedXids overflowed, we have to check pg_subtrans + * too. Fetch all xids from KnownAssignedXids that are lower than + * xid, since if xid is a subtransaction its parent will always have a + * lower value. Note we will collect both main and subXIDs here, but + * there's no help for it. + */ + if (TransactionIdPrecedesOrEquals(xid, procArray->lastOverflowedXid)) + nxids = KnownAssignedXidsGet(xids, xid); + } + + LWLockRelease(ProcArrayLock); + + /* + * If none of the relevant caches overflowed, we know the Xid is not + * running without even looking at pg_subtrans. 
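+	 *
+	 * (nxids == 0 means we collected no candidate parent XIDs above: no
+	 * subxid cache had overflowed and, in hot standby, xid is newer than
+	 * lastOverflowedXid.)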
+ */ + if (nxids == 0) + { + xc_no_overflow_inc(); + cachedXidIsNotInProgress = xid; + return false; + } + + /* + * Step 4: have to check pg_subtrans. + * + * At this point, we know it's either a subtransaction of one of the Xids + * in xids[], or it's not running. If it's an already-failed + * subtransaction, we want to say "not running" even though its parent may + * still be running. So first, check pg_xact to see if it's been aborted. + */ + xc_slow_answer_inc(); + + if (TransactionIdDidAbort(xid)) + { + cachedXidIsNotInProgress = xid; + return false; + } + + /* + * It isn't aborted, so check whether the transaction tree it belongs to + * is still running (or, more precisely, whether it was running when we + * held ProcArrayLock). + */ + topxid = SubTransGetTopmostTransaction(xid); + Assert(TransactionIdIsValid(topxid)); + if (!TransactionIdEquals(topxid, xid)) + { + for (int i = 0; i < nxids; i++) + { + if (TransactionIdEquals(xids[i], topxid)) + return true; + } + } + + cachedXidIsNotInProgress = xid; + return false; +} + +/* + * TransactionIdIsActive -- is xid the top-level XID of an active backend? + * + * This differs from TransactionIdIsInProgress in that it ignores prepared + * transactions, as well as transactions running on the primary if we're in + * hot standby. Also, we ignore subtransactions since that's not needed + * for current uses. + */ +bool +TransactionIdIsActive(TransactionId xid) +{ + bool result = false; + ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; + int i; + + /* + * Don't bother checking a transaction older than RecentXmin; it could not + * possibly still be running. + */ + if (TransactionIdPrecedes(xid, RecentXmin)) + return false; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (i = 0; i < arrayP->numProcs; i++) + { + int pgprocno = arrayP->pgprocnos[i]; + PGPROC *proc = &allProcs[pgprocno]; + TransactionId pxid; + + /* Fetch xid just once - see GetNewTransactionId */ + pxid = UINT32_ACCESS_ONCE(other_xids[i]); + + if (!TransactionIdIsValid(pxid)) + continue; + + if (proc->pid == 0) + continue; /* ignore prepared transactions */ + + if (TransactionIdEquals(pxid, xid)) + { + result = true; + break; + } + } + + LWLockRelease(ProcArrayLock); + + return result; +} + + +/* + * Determine XID horizons. + * + * This is used by wrapper functions like GetOldestNonRemovableTransactionId() + * (for VACUUM), GetReplicationHorizons() (for hot_standby_feedback), etc as + * well as "internally" by GlobalVisUpdate() (see comment above struct + * GlobalVisState). + * + * See the definition of ComputeXidHorizonsResult for the various computed + * horizons. + * + * For VACUUM separate horizons (used to decide which deleted tuples must + * be preserved), for shared and non-shared tables are computed. For shared + * relations backends in all databases must be considered, but for non-shared + * relations that's not required, since only backends in my own database could + * ever see the tuples in them. Also, we can ignore concurrently running lazy + * VACUUMs because (a) they must be working on other tables, and (b) they + * don't need to do snapshot-based lookups. + * + * This also computes a horizon used to truncate pg_subtrans. For that + * backends in all databases have to be considered, and concurrently running + * lazy VACUUMs cannot be ignored, as they still may perform pg_subtrans + * accesses. + * + * Note: we include all currently running xids in the set of considered xids. 
+ * This ensures that if a just-started xact has not yet set its snapshot, + * when it does set the snapshot it cannot set xmin less than what we compute. + * See notes in src/backend/access/transam/README. + * + * Note: despite the above, it's possible for the calculated values to move + * backwards on repeated calls. The calculated values are conservative, so + * that anything older is definitely not considered as running by anyone + * anymore, but the exact values calculated depend on a number of things. For + * example, if there are no transactions running in the current database, the + * horizon for normal tables will be latestCompletedXid. If a transaction + * begins after that, its xmin will include in-progress transactions in other + * databases that started earlier, so another call will return a lower value. + * Nonetheless it is safe to vacuum a table in the current database with the + * first result. There are also replication-related effects: a walsender + * process can set its xmin based on transactions that are no longer running + * on the primary but are still being replayed on the standby, thus possibly + * making the values go backwards. In this case there is a possibility that + * we lose data that the standby would like to have, but unless the standby + * uses a replication slot to make its xmin persistent there is little we can + * do about that --- data is only protected if the walsender runs continuously + * while queries are executed on the standby. (The Hot Standby code deals + * with such cases by failing standby queries that needed to access + * already-removed data, so there's no integrity bug.) The computed values + * are also adjusted with vacuum_defer_cleanup_age, so increasing that setting + * on the fly is another easy way to make horizons move backwards, with no + * consequences for data integrity. + * + * Note: the approximate horizons (see definition of GlobalVisState) are + * updated by the computations done here. That's currently required for + * correctness and a small optimization. Without doing so it's possible that + * heap vacuum's call to heap_page_prune() uses a more conservative horizon + * than later when deciding which tuples can be removed - which the code + * doesn't expect (breaking HOT). + */ +static void +ComputeXidHorizons(ComputeXidHorizonsResult *h) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId kaxmin; + bool in_recovery = RecoveryInProgress(); + TransactionId *other_xids = ProcGlobal->xids; + + /* inferred after ProcArrayLock is released */ + h->catalog_oldest_nonremovable = InvalidTransactionId; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + h->latest_completed = ShmemVariableCache->latestCompletedXid; + + /* + * We initialize the MIN() calculation with latestCompletedXid + 1. This + * is a lower bound for the XIDs that might appear in the ProcArray later, + * and so protects us against overestimating the result due to future + * additions. + */ + { + TransactionId initial; + + initial = XidFromFullTransactionId(h->latest_completed); + Assert(TransactionIdIsValid(initial)); + TransactionIdAdvance(initial); + + h->oldest_considered_running = initial; + h->shared_oldest_nonremovable = initial; + h->data_oldest_nonremovable = initial; + + /* + * Only modifications made by this backend affect the horizon for + * temporary relations. Instead of a check in each iteration of the + * loop over all PGPROCs it is cheaper to just initialize to the + * current top-level xid any. 
+ * + * Without an assigned xid we could use a horizon as aggressive as + * ReadNewTransactionid(), but we can get away with the much cheaper + * latestCompletedXid + 1: If this backend has no xid there, by + * definition, can't be any newer changes in the temp table than + * latestCompletedXid. + */ + if (TransactionIdIsValid(MyProc->xid)) + h->temp_oldest_nonremovable = MyProc->xid; + else + h->temp_oldest_nonremovable = initial; + } + + /* + * Fetch slot horizons while ProcArrayLock is held - the + * LWLockAcquire/LWLockRelease are a barrier, ensuring this happens inside + * the lock. + */ + h->slot_xmin = procArray->replication_slot_xmin; + h->slot_catalog_xmin = procArray->replication_slot_catalog_xmin; + + for (int index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + int8 statusFlags = ProcGlobal->statusFlags[index]; + TransactionId xid; + TransactionId xmin; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = UINT32_ACCESS_ONCE(other_xids[index]); + xmin = UINT32_ACCESS_ONCE(proc->xmin); + + /* + * Consider both the transaction's Xmin, and its Xid. + * + * We must check both because a transaction might have an Xmin but not + * (yet) an Xid; conversely, if it has an Xid, that could determine + * some not-yet-set Xmin. + */ + xmin = TransactionIdOlder(xmin, xid); + + /* if neither is set, this proc doesn't influence the horizon */ + if (!TransactionIdIsValid(xmin)) + continue; + + /* + * Don't ignore any procs when determining which transactions might be + * considered running. While slots should ensure logical decoding + * backends are protected even without this check, it can't hurt to + * include them here as well.. + */ + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, xmin); + + /* + * Skip over backends either vacuuming (which is ok with rows being + * removed, as long as pg_subtrans is not truncated) or doing logical + * decoding (which manages xmin separately, check below). + */ + if (statusFlags & (PROC_IN_VACUUM | PROC_IN_LOGICAL_DECODING)) + continue; + + /* shared tables need to take backends in all databases into account */ + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, xmin); + + /* + * Normally queries in other databases are ignored for anything but + * the shared horizon. But in recovery we cannot compute an accurate + * per-database horizon as all xids are managed via the + * KnownAssignedXids machinery. + * + * Be careful to compute a pessimistic value when MyDatabaseId is not + * set. If this is a backend in the process of starting up, we may not + * use a "too aggressive" horizon (otherwise we could end up using it + * to prune still needed data away). If the current backend never + * connects to a database that is harmless, because + * data_oldest_nonremovable will never be utilized. + */ + if (in_recovery || + MyDatabaseId == InvalidOid || proc->databaseId == MyDatabaseId || + proc->databaseId == 0) /* always include WalSender */ + { + h->data_oldest_nonremovable = + TransactionIdOlder(h->data_oldest_nonremovable, xmin); + } + } + + /* + * If in recovery fetch oldest xid in KnownAssignedXids, will be applied + * after lock is released. + */ + if (in_recovery) + kaxmin = KnownAssignedXidsGetOldestXmin(); + + /* + * No other information from shared state is needed, release the lock + * immediately. The rest of the computations can be done without a lock. 
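+ *
+ * The adjustments below lean on two small helpers defined elsewhere in
+ * this file; as a rough sketch of their behavior:
+ *
+ *     TransactionIdOlder(a, b)         - return the older of a and b,
+ *                                        treating an invalid xid as
+ *                                        "no constraint"
+ *     TransactionIdRetreatedBy(xid, n) - return xid - n, stepping over
+ *                                        the special (non-normal) xids
+ *                                        if the subtraction wraps around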
+ */ + LWLockRelease(ProcArrayLock); + + if (in_recovery) + { + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, kaxmin); + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, kaxmin); + h->data_oldest_nonremovable = + TransactionIdOlder(h->data_oldest_nonremovable, kaxmin); + /* temp relations cannot be accessed in recovery */ + } + else + { + /* + * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age. + * + * vacuum_defer_cleanup_age provides some additional "slop" for the + * benefit of hot standby queries on standby servers. This is quick + * and dirty, and perhaps not all that useful unless the primary has a + * predictable transaction rate, but it offers some protection when + * there's no walsender connection. Note that we are assuming + * vacuum_defer_cleanup_age isn't large enough to cause wraparound --- + * so guc.c should limit it to no more than the xidStopLimit threshold + * in varsup.c. Also note that we intentionally don't apply + * vacuum_defer_cleanup_age on standby servers. + */ + h->oldest_considered_running = + TransactionIdRetreatedBy(h->oldest_considered_running, + vacuum_defer_cleanup_age); + h->shared_oldest_nonremovable = + TransactionIdRetreatedBy(h->shared_oldest_nonremovable, + vacuum_defer_cleanup_age); + h->data_oldest_nonremovable = + TransactionIdRetreatedBy(h->data_oldest_nonremovable, + vacuum_defer_cleanup_age); + /* defer doesn't apply to temp relations */ + } + + /* + * Check whether there are replication slots requiring an older xmin. + */ + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, h->slot_xmin); + h->data_oldest_nonremovable = + TransactionIdOlder(h->data_oldest_nonremovable, h->slot_xmin); + + /* + * The only difference between catalog / data horizons is that the slot's + * catalog xmin is applied to the catalog one (so catalogs can be accessed + * for logical decoding). Initialize with data horizon, and then back up + * further if necessary. Have to back up the shared horizon as well, since + * that also can contain catalogs. + */ + h->shared_oldest_nonremovable_raw = h->shared_oldest_nonremovable; + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, + h->slot_catalog_xmin); + h->catalog_oldest_nonremovable = h->data_oldest_nonremovable; + h->catalog_oldest_nonremovable = + TransactionIdOlder(h->catalog_oldest_nonremovable, + h->slot_catalog_xmin); + + /* + * It's possible that slots / vacuum_defer_cleanup_age backed up the + * horizons further than oldest_considered_running. Fix. + */ + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->shared_oldest_nonremovable); + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->catalog_oldest_nonremovable); + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->data_oldest_nonremovable); + + /* + * shared horizons have to be at least as old as the oldest visible in + * current db + */ + Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable, + h->data_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable, + h->catalog_oldest_nonremovable)); + + /* + * Horizons need to ensure that pg_subtrans access is still possible for + * the relevant backends. 
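+ *
+ * Concretely, oldest_considered_running may not be newer than any of
+ * the other horizons (nor than a valid slot xmin), which is what the
+ * assertions below verify:
+ *
+ *     oldest_considered_running <= shared_oldest_nonremovable
+ *     oldest_considered_running <= catalog_oldest_nonremovable
+ *     oldest_considered_running <= data_oldest_nonremovable
+ *     oldest_considered_running <= temp_oldest_nonremovable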
+ */ + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->shared_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->catalog_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->data_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->temp_oldest_nonremovable)); + Assert(!TransactionIdIsValid(h->slot_xmin) || + TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->slot_xmin)); + Assert(!TransactionIdIsValid(h->slot_catalog_xmin) || + TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->slot_catalog_xmin)); + + /* update approximate horizons with the computed horizons */ + GlobalVisUpdateApply(h); +} + +/* + * Determine what kind of visibility horizon needs to be used for a + * relation. If rel is NULL, the most conservative horizon is used. + */ +static inline GlobalVisHorizonKind +GlobalVisHorizonKindForRel(Relation rel) +{ + /* + * Other relkkinds currently don't contain xids, nor always the necessary + * logical decoding markers. + */ + Assert(!rel || + rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW || + rel->rd_rel->relkind == RELKIND_TOASTVALUE); + + if (rel == NULL || rel->rd_rel->relisshared || RecoveryInProgress()) + return VISHORIZON_SHARED; + else if (IsCatalogRelation(rel) || + RelationIsAccessibleInLogicalDecoding(rel)) + return VISHORIZON_CATALOG; + else if (!RELATION_IS_LOCAL(rel)) + return VISHORIZON_DATA; + else + return VISHORIZON_TEMP; +} + +/* + * Return the oldest XID for which deleted tuples must be preserved in the + * passed table. + * + * If rel is not NULL the horizon may be considerably more recent than + * otherwise (i.e. fewer tuples will be removable). In the NULL case a horizon + * that is correct (but not optimal) for all relations will be returned. + * + * This is used by VACUUM to decide which deleted tuples must be preserved in + * the passed in table. + */ +TransactionId +GetOldestNonRemovableTransactionId(Relation rel) +{ + ComputeXidHorizonsResult horizons; + + ComputeXidHorizons(&horizons); + + switch (GlobalVisHorizonKindForRel(rel)) + { + case VISHORIZON_SHARED: + return horizons.shared_oldest_nonremovable; + case VISHORIZON_CATALOG: + return horizons.catalog_oldest_nonremovable; + case VISHORIZON_DATA: + return horizons.data_oldest_nonremovable; + case VISHORIZON_TEMP: + return horizons.temp_oldest_nonremovable; + } + + return InvalidTransactionId; +} + +/* + * Return the oldest transaction id any currently running backend might still + * consider running. This should not be used for visibility / pruning + * determinations (see GetOldestNonRemovableTransactionId()), but for + * decisions like up to where pg_subtrans can be truncated. + */ +TransactionId +GetOldestTransactionIdConsideredRunning(void) +{ + ComputeXidHorizonsResult horizons; + + ComputeXidHorizons(&horizons); + + return horizons.oldest_considered_running; +} + +/* + * Return the visibility horizons for a hot standby feedback message. + */ +void +GetReplicationHorizons(TransactionId *xmin, TransactionId *catalog_xmin) +{ + ComputeXidHorizonsResult horizons; + + ComputeXidHorizons(&horizons); + + /* + * Don't want to use shared_oldest_nonremovable here, as that contains the + * effect of replication slot's catalog_xmin. We want to send a separate + * feedback for the catalog horizon, so the primary can remove data table + * contents more aggressively. 
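+ *
+ * A minimal sketch of the expected caller - the hot_standby_feedback
+ * sender in walreceiver.c, which also attaches the xid epochs before
+ * putting the values on the wire:
+ *
+ *     TransactionId xmin;
+ *     TransactionId catalog_xmin;
+ *
+ *     GetReplicationHorizons(&xmin, &catalog_xmin);
+ *     ... send xmin and catalog_xmin in the feedback message ...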
+ */ + *xmin = horizons.shared_oldest_nonremovable_raw; + *catalog_xmin = horizons.slot_catalog_xmin; +} + +/* + * GetMaxSnapshotXidCount -- get max size for snapshot XID array + * + * We have to export this for use by snapmgr.c. + */ +int +GetMaxSnapshotXidCount(void) +{ + return procArray->maxProcs; +} + +/* + * GetMaxSnapshotSubxidCount -- get max size for snapshot sub-XID array + * + * We have to export this for use by snapmgr.c. + */ +int +GetMaxSnapshotSubxidCount(void) +{ + return TOTAL_MAX_CACHED_SUBXIDS; +} + +/* + * Initialize old_snapshot_threshold specific parts of a newly build snapshot. + */ +static void +GetSnapshotDataInitOldSnapshot(Snapshot snapshot) +{ + if (!OldSnapshotThresholdActive()) + { + /* + * If not using "snapshot too old" feature, fill related fields with + * dummy values that don't require any locking. + */ + snapshot->lsn = InvalidXLogRecPtr; + snapshot->whenTaken = 0; + } + else + { + /* + * Capture the current time and WAL stream location in case this + * snapshot becomes old enough to need to fall back on the special + * "old snapshot" logic. + */ + snapshot->lsn = GetXLogInsertRecPtr(); + snapshot->whenTaken = GetSnapshotCurrentTimestamp(); + MaintainOldSnapshotTimeMapping(snapshot->whenTaken, snapshot->xmin); + } +} + +/* + * Helper function for GetSnapshotData() that checks if the bulk of the + * visibility information in the snapshot is still valid. If so, it updates + * the fields that need to change and returns true. Otherwise it returns + * false. + * + * This very likely can be evolved to not need ProcArrayLock held (at very + * least in the case we already hold a snapshot), but that's for another day. + */ +static bool +GetSnapshotDataReuse(Snapshot snapshot) +{ + uint64 curXactCompletionCount; + + Assert(LWLockHeldByMe(ProcArrayLock)); + + if (unlikely(snapshot->snapXactCompletionCount == 0)) + return false; + + curXactCompletionCount = ShmemVariableCache->xactCompletionCount; + if (curXactCompletionCount != snapshot->snapXactCompletionCount) + return false; + + /* + * If the current xactCompletionCount is still the same as it was at the + * time the snapshot was built, we can be sure that rebuilding the + * contents of the snapshot the hard way would result in the same snapshot + * contents: + * + * As explained in transam/README, the set of xids considered running by + * GetSnapshotData() cannot change while ProcArrayLock is held. Snapshot + * contents only depend on transactions with xids and xactCompletionCount + * is incremented whenever a transaction with an xid finishes (while + * holding ProcArrayLock) exclusively). Thus the xactCompletionCount check + * ensures we would detect if the snapshot would have changed. + * + * As the snapshot contents are the same as it was before, it is safe to + * re-enter the snapshot's xmin into the PGPROC array. None of the rows + * visible under the snapshot could already have been removed (that'd + * require the set of running transactions to change) and it fulfills the + * requirement that concurrent GetSnapshotData() calls yield the same + * xmin. 
+ */ + if (!TransactionIdIsValid(MyProc->xmin)) + MyProc->xmin = TransactionXmin = snapshot->xmin; + + RecentXmin = snapshot->xmin; + Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin)); + + snapshot->curcid = GetCurrentCommandId(false); + snapshot->active_count = 0; + snapshot->regd_count = 0; + snapshot->copied = false; + + GetSnapshotDataInitOldSnapshot(snapshot); + + return true; +} + +/* + * GetSnapshotData -- returns information about running transactions. + * + * The returned snapshot includes xmin (lowest still-running xact ID), + * xmax (highest completed xact ID + 1), and a list of running xact IDs + * in the range xmin <= xid < xmax. It is used as follows: + * All xact IDs < xmin are considered finished. + * All xact IDs >= xmax are considered still running. + * For an xact ID xmin <= xid < xmax, consult list to see whether + * it is considered running or not. + * This ensures that the set of transactions seen as "running" by the + * current xact will not change after it takes the snapshot. + * + * All running top-level XIDs are included in the snapshot, except for lazy + * VACUUM processes. We also try to include running subtransaction XIDs, + * but since PGPROC has only a limited cache area for subxact XIDs, full + * information may not be available. If we find any overflowed subxid arrays, + * we have to mark the snapshot's subxid data as overflowed, and extra work + * *may* need to be done to determine what's running (see XidInMVCCSnapshot() + * in heapam_visibility.c). + * + * We also update the following backend-global variables: + * TransactionXmin: the oldest xmin of any snapshot in use in the + * current transaction (this is the same as MyProc->xmin). + * RecentXmin: the xmin computed for the most recent snapshot. XIDs + * older than this are known not running any more. + * + * And try to advance the bounds of GlobalVis{Shared,Catalog,Data,Temp}Rels + * for the benefit of the GlobalVisTest* family of functions. + * + * Note: this function should probably not be called with an argument that's + * not statically allocated (see xip allocation below). + */ +Snapshot +GetSnapshotData(Snapshot snapshot) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; + TransactionId xmin; + TransactionId xmax; + int count = 0; + int subcount = 0; + bool suboverflowed = false; + FullTransactionId latest_completed; + TransactionId oldestxid; + int mypgxactoff; + TransactionId myxid; + uint64 curXactCompletionCount; + + TransactionId replication_slot_xmin = InvalidTransactionId; + TransactionId replication_slot_catalog_xmin = InvalidTransactionId; + + Assert(snapshot != NULL); + + /* + * Allocating space for maxProcs xids is usually overkill; numProcs would + * be sufficient. But it seems better to do the malloc while not holding + * the lock, so we can't look at numProcs. Likewise, we allocate much + * more subxip storage than is probably needed. + * + * This does open a possibility for avoiding repeated malloc/free: since + * maxProcs does not change at runtime, we can simply reuse the previous + * xip arrays if any. (This relies on the fact that all callers pass + * static SnapshotData structs.) + */ + if (snapshot->xip == NULL) + { + /* + * First call for this snapshot. Snapshot is same size whether or not + * we are in recovery, see later comments. 
+ */ + snapshot->xip = (TransactionId *) + malloc(GetMaxSnapshotXidCount() * sizeof(TransactionId)); + if (snapshot->xip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + Assert(snapshot->subxip == NULL); + snapshot->subxip = (TransactionId *) + malloc(GetMaxSnapshotSubxidCount() * sizeof(TransactionId)); + if (snapshot->subxip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + /* + * It is sufficient to get shared lock on ProcArrayLock, even if we are + * going to set MyProc->xmin. + */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + + if (GetSnapshotDataReuse(snapshot)) + { + LWLockRelease(ProcArrayLock); + return snapshot; + } + + latest_completed = ShmemVariableCache->latestCompletedXid; + mypgxactoff = MyProc->pgxactoff; + myxid = other_xids[mypgxactoff]; + Assert(myxid == MyProc->xid); + + oldestxid = ShmemVariableCache->oldestXid; + curXactCompletionCount = ShmemVariableCache->xactCompletionCount; + + /* xmax is always latestCompletedXid + 1 */ + xmax = XidFromFullTransactionId(latest_completed); + TransactionIdAdvance(xmax); + Assert(TransactionIdIsNormal(xmax)); + + /* initialize xmin calculation with xmax */ + xmin = xmax; + + /* take own xid into account, saves a check inside the loop */ + if (TransactionIdIsNormal(myxid) && NormalTransactionIdPrecedes(myxid, xmin)) + xmin = myxid; + + snapshot->takenDuringRecovery = RecoveryInProgress(); + + if (!snapshot->takenDuringRecovery) + { + int numProcs = arrayP->numProcs; + TransactionId *xip = snapshot->xip; + int *pgprocnos = arrayP->pgprocnos; + XidCacheStatus *subxidStates = ProcGlobal->subxidStates; + uint8 *allStatusFlags = ProcGlobal->statusFlags; + + /* + * First collect set of pgxactoff/xids that need to be included in the + * snapshot. + */ + for (int pgxactoff = 0; pgxactoff < numProcs; pgxactoff++) + { + /* Fetch xid just once - see GetNewTransactionId */ + TransactionId xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); + uint8 statusFlags; + + Assert(allProcs[arrayP->pgprocnos[pgxactoff]].pgxactoff == pgxactoff); + + /* + * If the transaction has no XID assigned, we can skip it; it + * won't have sub-XIDs either. + */ + if (likely(xid == InvalidTransactionId)) + continue; + + /* + * We don't include our own XIDs (if any) in the snapshot. It + * needs to be includeded in the xmin computation, but we did so + * outside the loop. + */ + if (pgxactoff == mypgxactoff) + continue; + + /* + * The only way we are able to get here with a non-normal xid is + * during bootstrap - with this backend using + * BootstrapTransactionId. But the above test should filter that + * out. + */ + Assert(TransactionIdIsNormal(xid)); + + /* + * If the XID is >= xmax, we can skip it; such transactions will + * be treated as running anyway (and any sub-XIDs will also be >= + * xmax). + */ + if (!NormalTransactionIdPrecedes(xid, xmax)) + continue; + + /* + * Skip over backends doing logical decoding which manages xmin + * separately (check below) and ones running LAZY VACUUM. + */ + statusFlags = allStatusFlags[pgxactoff]; + if (statusFlags & (PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM)) + continue; + + if (NormalTransactionIdPrecedes(xid, xmin)) + xmin = xid; + + /* Add XID to snapshot. */ + xip[count++] = xid; + + /* + * Save subtransaction XIDs if possible (if we've already + * overflowed, there's no point). Note that the subxact XIDs must + * be later than their parent, so no need to check them against + * xmin. 
We could filter against xmax, but it seems better not to + * do that much work while holding the ProcArrayLock. + * + * The other backend can add more subxids concurrently, but cannot + * remove any. Hence it's important to fetch nxids just once. + * Should be safe to use memcpy, though. (We needn't worry about + * missing any xids added concurrently, because they must postdate + * xmax.) + * + * Again, our own XIDs are not included in the snapshot. + */ + if (!suboverflowed) + { + + if (subxidStates[pgxactoff].overflowed) + suboverflowed = true; + else + { + int nsubxids = subxidStates[pgxactoff].count; + + if (nsubxids > 0) + { + int pgprocno = pgprocnos[pgxactoff]; + PGPROC *proc = &allProcs[pgprocno]; + + pg_read_barrier(); /* pairs with GetNewTransactionId */ + + memcpy(snapshot->subxip + subcount, + (void *) proc->subxids.xids, + nsubxids * sizeof(TransactionId)); + subcount += nsubxids; + } + } + } + } + } + else + { + /* + * We're in hot standby, so get XIDs from KnownAssignedXids. + * + * We store all xids directly into subxip[]. Here's why: + * + * In recovery we don't know which xids are top-level and which are + * subxacts, a design choice that greatly simplifies xid processing. + * + * It seems like we would want to try to put xids into xip[] only, but + * that is fairly small. We would either need to make that bigger or + * to increase the rate at which we WAL-log xid assignment; neither is + * an appealing choice. + * + * We could try to store xids into xip[] first and then into subxip[] + * if there are too many xids. That only works if the snapshot doesn't + * overflow because we do not search subxip[] in that case. A simpler + * way is to just store all xids in the subxact array because this is + * by far the bigger array. We just leave the xip array empty. + * + * Either way we need to change the way XidInMVCCSnapshot() works + * depending upon when the snapshot was taken, or change normal + * snapshot processing so it matches. + * + * Note: It is possible for recovery to end before we finish taking + * the snapshot, and for newly assigned transaction ids to be added to + * the ProcArray. xmax cannot change while we hold ProcArrayLock, so + * those newly added transaction ids would be filtered away, so we + * need not be concerned about them. + */ + subcount = KnownAssignedXidsGetAndSetXmin(snapshot->subxip, &xmin, + xmax); + + if (TransactionIdPrecedesOrEquals(xmin, procArray->lastOverflowedXid)) + suboverflowed = true; + } + + + /* + * Fetch into local variable while ProcArrayLock is held - the + * LWLockRelease below is a barrier, ensuring this happens inside the + * lock. + */ + replication_slot_xmin = procArray->replication_slot_xmin; + replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; + + if (!TransactionIdIsValid(MyProc->xmin)) + MyProc->xmin = TransactionXmin = xmin; + + LWLockRelease(ProcArrayLock); + + /* maintain state for GlobalVis* */ + { + TransactionId def_vis_xid; + TransactionId def_vis_xid_data; + FullTransactionId def_vis_fxid; + FullTransactionId def_vis_fxid_data; + FullTransactionId oldestfxid; + + /* + * Converting oldestXid is only safe when xid horizon cannot advance, + * i.e. holding locks. While we don't hold the lock anymore, all the + * necessary data has been gathered with lock held. 
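+ *
+ * FullXidRelativeTo(rel, xid) widens the 32-bit xid into a 64-bit
+ * FullTransactionId by anchoring it to rel's epoch, roughly
+ *
+ *     U64(rel) + (int32) (xid - XidFromFullTransactionId(rel))
+ *
+ * which is only meaningful while xid cannot be further than ~2^31
+ * transactions from rel - hence the requirement that the inputs were
+ * captured under the lock.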
+ */ + oldestfxid = FullXidRelativeTo(latest_completed, oldestxid); + + /* apply vacuum_defer_cleanup_age */ + def_vis_xid_data = + TransactionIdRetreatedBy(xmin, vacuum_defer_cleanup_age); + + /* Check whether there's a replication slot requiring an older xmin. */ + def_vis_xid_data = + TransactionIdOlder(def_vis_xid_data, replication_slot_xmin); + + /* + * Rows in non-shared, non-catalog tables possibly could be vacuumed + * if older than this xid. + */ + def_vis_xid = def_vis_xid_data; + + /* + * Check whether there's a replication slot requiring an older catalog + * xmin. + */ + def_vis_xid = + TransactionIdOlder(replication_slot_catalog_xmin, def_vis_xid); + + def_vis_fxid = FullXidRelativeTo(latest_completed, def_vis_xid); + def_vis_fxid_data = FullXidRelativeTo(latest_completed, def_vis_xid_data); + + /* + * Check if we can increase upper bound. As a previous + * GlobalVisUpdate() might have computed more aggressive values, don't + * overwrite them if so. + */ + GlobalVisSharedRels.definitely_needed = + FullTransactionIdNewer(def_vis_fxid, + GlobalVisSharedRels.definitely_needed); + GlobalVisCatalogRels.definitely_needed = + FullTransactionIdNewer(def_vis_fxid, + GlobalVisCatalogRels.definitely_needed); + GlobalVisDataRels.definitely_needed = + FullTransactionIdNewer(def_vis_fxid_data, + GlobalVisDataRels.definitely_needed); + /* See temp_oldest_nonremovable computation in ComputeXidHorizons() */ + if (TransactionIdIsNormal(myxid)) + GlobalVisTempRels.definitely_needed = + FullXidRelativeTo(latest_completed, myxid); + else + { + GlobalVisTempRels.definitely_needed = latest_completed; + FullTransactionIdAdvance(&GlobalVisTempRels.definitely_needed); + } + + /* + * Check if we know that we can initialize or increase the lower + * bound. Currently the only cheap way to do so is to use + * ShmemVariableCache->oldestXid as input. + * + * We should definitely be able to do better. We could e.g. put a + * global lower bound value into ShmemVariableCache. + */ + GlobalVisSharedRels.maybe_needed = + FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed, + oldestfxid); + GlobalVisCatalogRels.maybe_needed = + FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed, + oldestfxid); + GlobalVisDataRels.maybe_needed = + FullTransactionIdNewer(GlobalVisDataRels.maybe_needed, + oldestfxid); + /* accurate value known */ + GlobalVisTempRels.maybe_needed = GlobalVisTempRels.definitely_needed; + } + + RecentXmin = xmin; + Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin)); + + snapshot->xmin = xmin; + snapshot->xmax = xmax; + snapshot->xcnt = count; + snapshot->subxcnt = subcount; + snapshot->suboverflowed = suboverflowed; + snapshot->snapXactCompletionCount = curXactCompletionCount; + + snapshot->curcid = GetCurrentCommandId(false); + + /* + * This is a new snapshot, so set both refcounts are zero, and mark it as + * not copied in persistent memory. + */ + snapshot->active_count = 0; + snapshot->regd_count = 0; + snapshot->copied = false; + + GetSnapshotDataInitOldSnapshot(snapshot); + + return snapshot; +} + +/* + * ProcArrayInstallImportedXmin -- install imported xmin into MyProc->xmin + * + * This is called when installing a snapshot imported from another + * transaction. To ensure that OldestXmin doesn't go backwards, we must + * check that the source transaction is still running, and we'd better do + * that atomically with installing the new xmin. + * + * Returns true if successful, false if source xact is no longer running. 
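+ *
+ * A minimal sketch of the expected call pattern when importing a
+ * snapshot (the real checks live in snapmgr.c's import code):
+ *
+ *     if (!ProcArrayInstallImportedXmin(snapshot->xmin, &sourcevxid))
+ *         ereport(ERROR,
+ *                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ *                  errmsg("could not import the requested snapshot"),
+ *                  errdetail("The source process ... is not running anymore.")));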
+ */ +bool +ProcArrayInstallImportedXmin(TransactionId xmin, + VirtualTransactionId *sourcevxid) +{ + bool result = false; + ProcArrayStruct *arrayP = procArray; + int index; + + Assert(TransactionIdIsNormal(xmin)); + if (!sourcevxid) + return false; + + /* Get lock so source xact can't end while we're doing this */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + int statusFlags = ProcGlobal->statusFlags[index]; + TransactionId xid; + + /* Ignore procs running LAZY VACUUM */ + if (statusFlags & PROC_IN_VACUUM) + continue; + + /* We are only interested in the specific virtual transaction. */ + if (proc->backendId != sourcevxid->backendId) + continue; + if (proc->lxid != sourcevxid->localTransactionId) + continue; + + /* + * We check the transaction's database ID for paranoia's sake: if it's + * in another DB then its xmin does not cover us. Caller should have + * detected this already, so we just treat any funny cases as + * "transaction not found". + */ + if (proc->databaseId != MyDatabaseId) + continue; + + /* + * Likewise, let's just make real sure its xmin does cover us. + */ + xid = UINT32_ACCESS_ONCE(proc->xmin); + if (!TransactionIdIsNormal(xid) || + !TransactionIdPrecedesOrEquals(xid, xmin)) + continue; + + /* + * We're good. Install the new xmin. As in GetSnapshotData, set + * TransactionXmin too. (Note that because snapmgr.c called + * GetSnapshotData first, we'll be overwriting a valid xmin here, so + * we don't check that.) + */ + MyProc->xmin = TransactionXmin = xmin; + + result = true; + break; + } + + LWLockRelease(ProcArrayLock); + + return result; +} + +/* + * ProcArrayInstallRestoredXmin -- install restored xmin into MyProc->xmin + * + * This is like ProcArrayInstallImportedXmin, but we have a pointer to the + * PGPROC of the transaction from which we imported the snapshot, rather than + * an XID. + * + * Note that this function also copies statusFlags from the source `proc` in + * order to avoid the case where MyProc's xmin needs to be skipped for + * computing xid horizon. + * + * Returns true if successful, false if source xact is no longer running. + */ +bool +ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) +{ + bool result = false; + TransactionId xid; + + Assert(TransactionIdIsNormal(xmin)); + Assert(proc != NULL); + + /* + * Get an exclusive lock so that we can copy statusFlags from source proc. + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * Be certain that the referenced PGPROC has an advertised xmin which is + * no later than the one we're installing, so that the system-wide xmin + * can't go backwards. Also, make sure it's running in the same database, + * so that the per-database xmin cannot go backwards. + */ + xid = UINT32_ACCESS_ONCE(proc->xmin); + if (proc->databaseId == MyDatabaseId && + TransactionIdIsNormal(xid) && + TransactionIdPrecedesOrEquals(xid, xmin)) + { + /* + * Install xmin and propagate the statusFlags that affect how the + * value is interpreted by vacuum. + */ + MyProc->xmin = TransactionXmin = xmin; + MyProc->statusFlags = (MyProc->statusFlags & ~PROC_XMIN_FLAGS) | + (proc->statusFlags & PROC_XMIN_FLAGS); + ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags; + + result = true; + } + + LWLockRelease(ProcArrayLock); + + return result; +} + +/* + * GetRunningTransactionData -- returns information about running transactions. 
+ * + * Similar to GetSnapshotData but returns more information. We include + * all PGPROCs with an assigned TransactionId, even VACUUM processes and + * prepared transactions. + * + * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for + * releasing them. Acquiring XidGenLock ensures that no new XIDs enter the proc + * array until the caller has WAL-logged this snapshot, and releases the + * lock. Acquiring ProcArrayLock ensures that no transactions commit until the + * lock is released. + * + * The returned data structure is statically allocated; caller should not + * modify it, and must not assume it is valid past the next call. + * + * This is never executed during recovery so there is no need to look at + * KnownAssignedXids. + * + * Dummy PGPROCs from prepared transaction are included, meaning that this + * may return entries with duplicated TransactionId values coming from + * transaction finishing to prepare. Nothing is done about duplicated + * entries here to not hold on ProcArrayLock more than necessary. + * + * We don't worry about updating other counters, we want to keep this as + * simple as possible and leave GetSnapshotData() as the primary code for + * that bookkeeping. + * + * Note that if any transaction has overflowed its cached subtransactions + * then there is no real need include any subtransactions. + */ +RunningTransactions +GetRunningTransactionData(void) +{ + /* result workspace */ + static RunningTransactionsData CurrentRunningXactsData; + + ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; + RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData; + TransactionId latestCompletedXid; + TransactionId oldestRunningXid; + TransactionId *xids; + int index; + int count; + int subcount; + bool suboverflowed; + + Assert(!RecoveryInProgress()); + + /* + * Allocating space for maxProcs xids is usually overkill; numProcs would + * be sufficient. But it seems better to do the malloc while not holding + * the lock, so we can't look at numProcs. Likewise, we allocate much + * more subxip storage than is probably needed. + * + * Should only be allocated in bgwriter, since only ever executed during + * checkpoints. + */ + if (CurrentRunningXacts->xids == NULL) + { + /* + * First call + */ + CurrentRunningXacts->xids = (TransactionId *) + malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId)); + if (CurrentRunningXacts->xids == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + xids = CurrentRunningXacts->xids; + + count = subcount = 0; + suboverflowed = false; + + /* + * Ensure that no xids enter or leave the procarray while we obtain + * snapshot. + */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + LWLockAcquire(XidGenLock, LW_SHARED); + + latestCompletedXid = + XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid); + oldestRunningXid = + XidFromFullTransactionId(ShmemVariableCache->nextXid); + + /* + * Spin over procArray collecting all xids + */ + for (index = 0; index < arrayP->numProcs; index++) + { + TransactionId xid; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = UINT32_ACCESS_ONCE(other_xids[index]); + + /* + * We don't need to store transactions that don't have a TransactionId + * yet because they will not show as running on a standby server. 
+ */ + if (!TransactionIdIsValid(xid)) + continue; + + /* + * Be careful not to exclude any xids before calculating the values of + * oldestRunningXid and suboverflowed, since these are used to clean + * up transaction information held on standbys. + */ + if (TransactionIdPrecedes(xid, oldestRunningXid)) + oldestRunningXid = xid; + + if (ProcGlobal->subxidStates[index].overflowed) + suboverflowed = true; + + /* + * If we wished to exclude xids this would be the right place for it. + * Procs with the PROC_IN_VACUUM flag set don't usually assign xids, + * but they do during truncation at the end when they get the lock and + * truncate, so it is not much of a problem to include them if they + * are seen and it is cleaner to include them. + */ + + xids[count++] = xid; + } + + /* + * Spin over procArray collecting all subxids, but only if there hasn't + * been a suboverflow. + */ + if (!suboverflowed) + { + XidCacheStatus *other_subxidstates = ProcGlobal->subxidStates; + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + int nsubxids; + + /* + * Save subtransaction XIDs. Other backends can't add or remove + * entries while we're holding XidGenLock. + */ + nsubxids = other_subxidstates[index].count; + if (nsubxids > 0) + { + /* barrier not really required, as XidGenLock is held, but ... */ + pg_read_barrier(); /* pairs with GetNewTransactionId */ + + memcpy(&xids[count], (void *) proc->subxids.xids, + nsubxids * sizeof(TransactionId)); + count += nsubxids; + subcount += nsubxids; + + /* + * Top-level XID of a transaction is always less than any of + * its subxids, so we don't need to check if any of the + * subxids are smaller than oldestRunningXid + */ + } + } + } + + /* + * It's important *not* to include the limits set by slots here because + * snapbuild.c uses oldestRunningXid to manage its xmin horizon. If those + * were to be included here the initial value could never increase because + * of a circular dependency where slots only increase their limits when + * running xacts increases oldestRunningXid and running xacts only + * increases if slots do. + */ + + CurrentRunningXacts->xcnt = count - subcount; + CurrentRunningXacts->subxcnt = subcount; + CurrentRunningXacts->subxid_overflow = suboverflowed; + CurrentRunningXacts->nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + CurrentRunningXacts->oldestRunningXid = oldestRunningXid; + CurrentRunningXacts->latestCompletedXid = latestCompletedXid; + + Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid)); + Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid)); + Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid)); + + /* We don't release the locks here, the caller is responsible for that */ + + return CurrentRunningXacts; +} + +/* + * GetOldestActiveTransactionId() + * + * Similar to GetSnapshotData but returns just oldestActiveXid. We include + * all PGPROCs with an assigned TransactionId, even VACUUM processes. + * We look at all databases, though there is no need to include WALSender + * since this has no effect on hot standby conflicts. + * + * This is never executed during recovery so there is no need to look at + * KnownAssignedXids. + * + * We don't worry about updating other counters, we want to keep this as + * simple as possible and leave GetSnapshotData() as the primary code for + * that bookkeeping. 
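+ *
+ * Sketch of the typical caller, assuming the usual checkpoint code in
+ * xlog.c: the value is recorded in the checkpoint record so that a
+ * standby can initialize its transaction tracking, roughly
+ *
+ *     checkPoint.oldestActiveXid = GetOldestActiveTransactionId();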
+ */ +TransactionId +GetOldestActiveTransactionId(void) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; + TransactionId oldestRunningXid; + int index; + + Assert(!RecoveryInProgress()); + + /* + * Read nextXid, as the upper bound of what's still active. + * + * Reading a TransactionId is atomic, but we must grab the lock to make + * sure that all XIDs < nextXid are already present in the proc array (or + * have already completed), when we spin over it. + */ + LWLockAcquire(XidGenLock, LW_SHARED); + oldestRunningXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + LWLockRelease(XidGenLock); + + /* + * Spin over procArray collecting all xids and subxids. + */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + for (index = 0; index < arrayP->numProcs; index++) + { + TransactionId xid; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = UINT32_ACCESS_ONCE(other_xids[index]); + + if (!TransactionIdIsNormal(xid)) + continue; + + if (TransactionIdPrecedes(xid, oldestRunningXid)) + oldestRunningXid = xid; + + /* + * Top-level XID of a transaction is always less than any of its + * subxids, so we don't need to check if any of the subxids are + * smaller than oldestRunningXid + */ + } + LWLockRelease(ProcArrayLock); + + return oldestRunningXid; +} + +/* + * GetOldestSafeDecodingTransactionId -- lowest xid not affected by vacuum + * + * Returns the oldest xid that we can guarantee not to have been affected by + * vacuum, i.e. no rows >= that xid have been vacuumed away unless the + * transaction aborted. Note that the value can (and most of the time will) be + * much more conservative than what really has been affected by vacuum, but we + * currently don't have better data available. + * + * This is useful to initialize the cutoff xid after which a new changeset + * extraction replication slot can start decoding changes. + * + * Must be called with ProcArrayLock held either shared or exclusively, + * although most callers will want to use exclusive mode since it is expected + * that the caller will immediately use the xid to peg the xmin horizon. + */ +TransactionId +GetOldestSafeDecodingTransactionId(bool catalogOnly) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId oldestSafeXid; + int index; + bool recovery_in_progress = RecoveryInProgress(); + + Assert(LWLockHeldByMe(ProcArrayLock)); + + /* + * Acquire XidGenLock, so no transactions can acquire an xid while we're + * running. If no transaction with xid were running concurrently a new xid + * could influence the RecentXmin et al. + * + * We initialize the computation to nextXid since that's guaranteed to be + * a safe, albeit pessimal, value. + */ + LWLockAcquire(XidGenLock, LW_SHARED); + oldestSafeXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + + /* + * If there's already a slot pegging the xmin horizon, we can start with + * that value, it's guaranteed to be safe since it's computed by this + * routine initially and has been enforced since. We can always use the + * slot's general xmin horizon, but the catalog horizon is only usable + * when only catalog data is going to be looked at. 
+ */ + if (TransactionIdIsValid(procArray->replication_slot_xmin) && + TransactionIdPrecedes(procArray->replication_slot_xmin, + oldestSafeXid)) + oldestSafeXid = procArray->replication_slot_xmin; + + if (catalogOnly && + TransactionIdIsValid(procArray->replication_slot_catalog_xmin) && + TransactionIdPrecedes(procArray->replication_slot_catalog_xmin, + oldestSafeXid)) + oldestSafeXid = procArray->replication_slot_catalog_xmin; + + /* + * If we're not in recovery, we walk over the procarray and collect the + * lowest xid. Since we're called with ProcArrayLock held and have + * acquired XidGenLock, no entries can vanish concurrently, since + * ProcGlobal->xids[i] is only set with XidGenLock held and only cleared + * with ProcArrayLock held. + * + * In recovery we can't lower the safe value besides what we've computed + * above, so we'll have to wait a bit longer there. We unfortunately can + * *not* use KnownAssignedXidsGetOldestXmin() since the KnownAssignedXids + * machinery can miss values and return an older value than is safe. + */ + if (!recovery_in_progress) + { + TransactionId *other_xids = ProcGlobal->xids; + + /* + * Spin over procArray collecting min(ProcGlobal->xids[i]) + */ + for (index = 0; index < arrayP->numProcs; index++) + { + TransactionId xid; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = UINT32_ACCESS_ONCE(other_xids[index]); + + if (!TransactionIdIsNormal(xid)) + continue; + + if (TransactionIdPrecedes(xid, oldestSafeXid)) + oldestSafeXid = xid; + } + } + + LWLockRelease(XidGenLock); + + return oldestSafeXid; +} + +/* + * GetVirtualXIDsDelayingChkptGuts -- Get the VXIDs of transactions that are + * delaying the start or end of a checkpoint because they have critical + * actions in progress. + * + * Constructs an array of VXIDs of transactions that are currently in commit + * critical sections, as shown by having delayChkpt or delayChkptEnd set in + * their PGPROC. + * + * Returns a palloc'd array that should be freed by the caller. + * *nvxids is the number of valid entries. + * + * Note that because backends set or clear delayChkpt and delayChkptEnd + * without holding any lock, the result is somewhat indeterminate, but we + * don't really care. Even in a multiprocessor with delayed writes to + * shared memory, it should be certain that setting of delayChkpt will + * propagate to shared memory when the backend takes a lock, so we cannot + * fail to see a virtual xact as delayChkpt if it's already inserted its + * commit record. Whether it takes a little while for clearing of + * delayChkpt to propagate is unimportant for correctness. 
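+ *
+ * A rough sketch of how the checkpointer is expected to use the
+ * wrappers below (see the checkpoint code in xlog.c for the real
+ * thing):
+ *
+ *     vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
+ *     if (nvxids > 0)
+ *     {
+ *         do
+ *         {
+ *             pg_usleep(10000L);
+ *         } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
+ *     }
+ *     pfree(vxids);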
+ */ +static VirtualTransactionId * +GetVirtualXIDsDelayingChkptGuts(int *nvxids, int type) +{ + VirtualTransactionId *vxids; + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + Assert(type != 0); + + /* allocate what's certainly enough result space */ + vxids = (VirtualTransactionId *) + palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs); + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (((type & DELAY_CHKPT_START) && proc->delayChkpt) || + ((type & DELAY_CHKPT_COMPLETE) && proc->delayChkptEnd)) + { + VirtualTransactionId vxid; + + GET_VXID_FROM_PGPROC(vxid, *proc); + if (VirtualTransactionIdIsValid(vxid)) + vxids[count++] = vxid; + } + } + + LWLockRelease(ProcArrayLock); + + *nvxids = count; + return vxids; +} + +/* + * GetVirtualXIDsDelayingChkpt - Get the VXIDs of transactions that are + * delaying the start of a checkpoint. + */ +VirtualTransactionId * +GetVirtualXIDsDelayingChkpt(int *nvxids) +{ + return GetVirtualXIDsDelayingChkptGuts(nvxids, DELAY_CHKPT_START); +} + +/* + * GetVirtualXIDsDelayingChkptEnd - Get the VXIDs of transactions that are + * delaying the end of a checkpoint. + */ +VirtualTransactionId * +GetVirtualXIDsDelayingChkptEnd(int *nvxids) +{ + return GetVirtualXIDsDelayingChkptGuts(nvxids, DELAY_CHKPT_COMPLETE); +} + +/* + * HaveVirtualXIDsDelayingChkpt -- Are any of the specified VXIDs delaying? + * + * This is used with the results of GetVirtualXIDsDelayingChkpt to see if any + * of the specified VXIDs are still in critical sections of code. + * + * Note: this is O(N^2) in the number of vxacts that are/were delaying, but + * those numbers should be small enough for it not to be a problem. + */ +static bool +HaveVirtualXIDsDelayingChkptGuts(VirtualTransactionId *vxids, int nvxids, + int type) +{ + bool result = false; + ProcArrayStruct *arrayP = procArray; + int index; + + Assert(type != 0); + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + VirtualTransactionId vxid; + + GET_VXID_FROM_PGPROC(vxid, *proc); + + if ((((type & DELAY_CHKPT_START) && proc->delayChkpt) || + ((type & DELAY_CHKPT_COMPLETE) && proc->delayChkptEnd)) && + VirtualTransactionIdIsValid(vxid)) + { + int i; + + for (i = 0; i < nvxids; i++) + { + if (VirtualTransactionIdEquals(vxid, vxids[i])) + { + result = true; + break; + } + } + if (result) + break; + } + } + + LWLockRelease(ProcArrayLock); + + return result; +} + +/* + * HaveVirtualXIDsDelayingChkpt -- Are any of the specified VXIDs delaying + * the start of a checkpoint? + */ +bool +HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids) +{ + return HaveVirtualXIDsDelayingChkptGuts(vxids, nvxids, + DELAY_CHKPT_START); +} + +/* + * HaveVirtualXIDsDelayingChkptEnd -- Are any of the specified VXIDs delaying + * the end of a checkpoint? + */ +bool +HaveVirtualXIDsDelayingChkptEnd(VirtualTransactionId *vxids, int nvxids) +{ + return HaveVirtualXIDsDelayingChkptGuts(vxids, nvxids, + DELAY_CHKPT_COMPLETE); +} + +/* + * BackendPidGetProc -- get a backend's PGPROC given its PID + * + * Returns NULL if not found. Note that it is up to the caller to be + * sure that the question remains meaningful for long enough for the + * answer to be used ... 
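+ *
+ * For example (a sketch of the usual signal-sending caller, see
+ * signalfuncs.c), a pid gets validated before being signaled:
+ *
+ *     PGPROC *proc = BackendPidGetProc(pid);
+ *
+ *     if (proc == NULL)
+ *         ereport(WARNING,
+ *                 (errmsg("PID %d is not a PostgreSQL server process", pid)));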
+ */ +PGPROC * +BackendPidGetProc(int pid) +{ + PGPROC *result; + + if (pid == 0) /* never match dummy PGPROCs */ + return NULL; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + result = BackendPidGetProcWithLock(pid); + + LWLockRelease(ProcArrayLock); + + return result; +} + +/* + * BackendPidGetProcWithLock -- get a backend's PGPROC given its PID + * + * Same as above, except caller must be holding ProcArrayLock. The found + * entry, if any, can be assumed to be valid as long as the lock remains held. + */ +PGPROC * +BackendPidGetProcWithLock(int pid) +{ + PGPROC *result = NULL; + ProcArrayStruct *arrayP = procArray; + int index; + + if (pid == 0) /* never match dummy PGPROCs */ + return NULL; + + for (index = 0; index < arrayP->numProcs; index++) + { + PGPROC *proc = &allProcs[arrayP->pgprocnos[index]]; + + if (proc->pid == pid) + { + result = proc; + break; + } + } + + return result; +} + +/* + * BackendXidGetPid -- get a backend's pid given its XID + * + * Returns 0 if not found or it's a prepared transaction. Note that + * it is up to the caller to be sure that the question remains + * meaningful for long enough for the answer to be used ... + * + * Only main transaction Ids are considered. This function is mainly + * useful for determining what backend owns a lock. + * + * Beware that not every xact has an XID assigned. However, as long as you + * only call this using an XID found on disk, you're safe. + */ +int +BackendXidGetPid(TransactionId xid) +{ + int result = 0; + ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; + int index; + + if (xid == InvalidTransactionId) /* never match invalid xid */ + return 0; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (other_xids[index] == xid) + { + result = proc->pid; + break; + } + } + + LWLockRelease(ProcArrayLock); + + return result; +} + +/* + * IsBackendPid -- is a given pid a running backend + * + * This is not called by the backend, but is called by external modules. + */ +bool +IsBackendPid(int pid) +{ + return (BackendPidGetProc(pid) != NULL); +} + + +/* + * GetCurrentVirtualXIDs -- returns an array of currently active VXIDs. + * + * The array is palloc'd. The number of valid entries is returned into *nvxids. + * + * The arguments allow filtering the set of VXIDs returned. Our own process + * is always skipped. In addition: + * If limitXmin is not InvalidTransactionId, skip processes with + * xmin > limitXmin. + * If excludeXmin0 is true, skip processes with xmin = 0. + * If allDbs is false, skip processes attached to other databases. + * If excludeVacuum isn't zero, skip processes for which + * (statusFlags & excludeVacuum) is not zero. + * + * Note: the purpose of the limitXmin and excludeXmin0 parameters is to + * allow skipping backends whose oldest live snapshot is no older than + * some snapshot we have. Since we examine the procarray with only shared + * lock, there are race conditions: a backend could set its xmin just after + * we look. Indeed, on multiprocessors with weak memory ordering, the + * other backend could have set its xmin *before* we look. We know however + * that such a backend must have held shared ProcArrayLock overlapping our + * own hold of ProcArrayLock, else we would see its xmin update. 
Therefore, + * any snapshot the other backend is taking concurrently with our scan cannot + * consider any transactions as still running that we think are committed + * (since backends must hold ProcArrayLock exclusive to commit). + */ +VirtualTransactionId * +GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, + bool allDbs, int excludeVacuum, + int *nvxids) +{ + VirtualTransactionId *vxids; + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + /* allocate what's certainly enough result space */ + vxids = (VirtualTransactionId *) + palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs); + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + uint8 statusFlags = ProcGlobal->statusFlags[index]; + + if (proc == MyProc) + continue; + + if (excludeVacuum & statusFlags) + continue; + + if (allDbs || proc->databaseId == MyDatabaseId) + { + /* Fetch xmin just once - might change on us */ + TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin); + + if (excludeXmin0 && !TransactionIdIsValid(pxmin)) + continue; + + /* + * InvalidTransactionId precedes all other XIDs, so a proc that + * hasn't set xmin yet will not be rejected by this test. + */ + if (!TransactionIdIsValid(limitXmin) || + TransactionIdPrecedesOrEquals(pxmin, limitXmin)) + { + VirtualTransactionId vxid; + + GET_VXID_FROM_PGPROC(vxid, *proc); + if (VirtualTransactionIdIsValid(vxid)) + vxids[count++] = vxid; + } + } + } + + LWLockRelease(ProcArrayLock); + + *nvxids = count; + return vxids; +} + +/* + * GetConflictingVirtualXIDs -- returns an array of currently active VXIDs. + * + * Usage is limited to conflict resolution during recovery on standby servers. + * limitXmin is supplied as either latestRemovedXid, or InvalidTransactionId + * in cases where we cannot accurately determine a value for latestRemovedXid. + * + * If limitXmin is InvalidTransactionId then we want to kill everybody, + * so we're not worried if they have a snapshot or not, nor does it really + * matter what type of lock we hold. + * + * All callers that are checking xmins always now supply a valid and useful + * value for limitXmin. The limitXmin is always lower than the lowest + * numbered KnownAssignedXid that is not already a FATAL error. This is + * because we only care about cleanup records that are cleaning up tuple + * versions from committed transactions. In that case they will only occur + * at the point where the record is less than the lowest running xid. That + * allows us to say that if any backend takes a snapshot concurrently with + * us then the conflict assessment made here would never include the snapshot + * that is being derived. So we take LW_SHARED on the ProcArray and allow + * concurrent snapshots when limitXmin is valid. We might think about adding + * Assert(limitXmin < lowest(KnownAssignedXids)) + * but that would not be true in the case of FATAL errors lagging in array, + * but we already know those are bogus anyway, so we skip that test. + * + * If dbOid is valid we skip backends attached to other databases. + * + * Be careful to *not* pfree the result from this function. We reuse + * this array sufficiently often that we use malloc for the result. 
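+ *
+ * A minimal sketch of the recovery-conflict caller (the real code in
+ * standby.c then waits for, or cancels, each conflicting backend):
+ *
+ *     backends = GetConflictingVirtualXIDs(latestRemovedXid, node.dbNode);
+ *     ResolveRecoveryConflictWithVirtualXIDs(backends,
+ *                                            PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
+ *                                            ...);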
+ */ +VirtualTransactionId * +GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid) +{ + static VirtualTransactionId *vxids; + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + /* + * If first time through, get workspace to remember main XIDs in. We + * malloc it permanently to avoid repeated palloc/pfree overhead. Allow + * result space, remembering room for a terminator. + */ + if (vxids == NULL) + { + vxids = (VirtualTransactionId *) + malloc(sizeof(VirtualTransactionId) * (arrayP->maxProcs + 1)); + if (vxids == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + /* Exclude prepared transactions */ + if (proc->pid == 0) + continue; + + if (!OidIsValid(dbOid) || + proc->databaseId == dbOid) + { + /* Fetch xmin just once - can't change on us, but good coding */ + TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin); + + /* + * We ignore an invalid pxmin because this means that backend has + * no snapshot currently. We hold a Share lock to avoid contention + * with users taking snapshots. That is not a problem because the + * current xmin is always at least one higher than the latest + * removed xid, so any new snapshot would never conflict with the + * test here. + */ + if (!TransactionIdIsValid(limitXmin) || + (TransactionIdIsValid(pxmin) && !TransactionIdFollows(pxmin, limitXmin))) + { + VirtualTransactionId vxid; + + GET_VXID_FROM_PGPROC(vxid, *proc); + if (VirtualTransactionIdIsValid(vxid)) + vxids[count++] = vxid; + } + } + } + + LWLockRelease(ProcArrayLock); + + /* add the terminator */ + vxids[count].backendId = InvalidBackendId; + vxids[count].localTransactionId = InvalidLocalTransactionId; + + return vxids; +} + +/* + * CancelVirtualTransaction - used in recovery conflict processing + * + * Returns pid of the process signaled, or 0 if not found. + */ +pid_t +CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode) +{ + return SignalVirtualTransaction(vxid, sigmode, true); +} + +pid_t +SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode, + bool conflictPending) +{ + ProcArrayStruct *arrayP = procArray; + int index; + pid_t pid = 0; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + VirtualTransactionId procvxid; + + GET_VXID_FROM_PGPROC(procvxid, *proc); + + if (procvxid.backendId == vxid.backendId && + procvxid.localTransactionId == vxid.localTransactionId) + { + proc->recoveryConflictPending = conflictPending; + pid = proc->pid; + if (pid != 0) + { + /* + * Kill the pid if it's still here. If not, that's what we + * wanted so ignore any errors. + */ + (void) SendProcSignal(pid, sigmode, vxid.backendId); + } + break; + } + } + + LWLockRelease(ProcArrayLock); + + return pid; +} + +/* + * MinimumActiveBackends --- count backends (other than myself) that are + * in active transactions. Return true if the count exceeds the + * minimum threshold passed. This is used as a heuristic to decide if + * a pre-XLOG-flush delay is worthwhile during commit. + * + * Do not count backends that are blocked waiting for locks, since they are + * not going to get to run until someone else commits. 
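+ *
+ * For context, a hedged sketch of the sort of call site this heuristic
+ * serves: the WAL-flush path delays briefly before flushing when group
+ * commit looks worthwhile. Treat the condition below as an approximation
+ * of the real check rather than a quotation of it; CommitDelay and
+ * CommitSiblings correspond to the commit_delay and commit_siblings GUCs.
+ *
+ *     if (CommitDelay > 0 && enableFsync &&
+ *         MinimumActiveBackends(CommitSiblings))
+ *         pg_usleep(CommitDelay);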
+ */ +bool +MinimumActiveBackends(int min) +{ + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + /* Quick short-circuit if no minimum is specified */ + if (min == 0) + return true; + + /* + * Note: for speed, we don't acquire ProcArrayLock. This is a little bit + * bogus, but since we are only testing fields for zero or nonzero, it + * should be OK. The result is only used for heuristic purposes anyway... + */ + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + /* + * Since we're not holding a lock, need to be prepared to deal with + * garbage, as someone could have incremented numProcs but not yet + * filled the structure. + * + * If someone just decremented numProcs, 'proc' could also point to a + * PGPROC entry that's no longer in the array. It still points to a + * PGPROC struct, though, because freed PGPROC entries just go to the + * free list and are recycled. Its contents are nonsense in that case, + * but that's acceptable for this function. + */ + if (pgprocno == -1) + continue; /* do not count deleted entries */ + if (proc == MyProc) + continue; /* do not count myself */ + if (proc->xid == InvalidTransactionId) + continue; /* do not count if no XID assigned */ + if (proc->pid == 0) + continue; /* do not count prepared xacts */ + if (proc->waitLock != NULL) + continue; /* do not count if blocked on a lock */ + count++; + if (count >= min) + break; + } + + return count >= min; +} + +/* + * CountDBBackends --- count backends that are using specified database + */ +int +CountDBBackends(Oid databaseid) +{ + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (proc->pid == 0) + continue; /* do not count prepared xacts */ + if (!OidIsValid(databaseid) || + proc->databaseId == databaseid) + count++; + } + + LWLockRelease(ProcArrayLock); + + return count; +} + +/* + * CountDBConnections --- counts database backends ignoring any background + * worker processes + */ +int +CountDBConnections(Oid databaseid) +{ + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (proc->pid == 0) + continue; /* do not count prepared xacts */ + if (proc->isBackgroundWorker) + continue; /* do not count background workers */ + if (!OidIsValid(databaseid) || + proc->databaseId == databaseid) + count++; + } + + LWLockRelease(ProcArrayLock); + + return count; +} + +/* + * CancelDBBackends --- cancel backends that are using specified database + */ +void +CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending) +{ + ProcArrayStruct *arrayP = procArray; + int index; + + /* tell all backends to die */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (databaseid == InvalidOid || proc->databaseId == databaseid) + { + VirtualTransactionId procvxid; + pid_t pid; + + GET_VXID_FROM_PGPROC(procvxid, *proc); + + proc->recoveryConflictPending = conflictPending; + pid = proc->pid; + if (pid != 0) + { + /* + * Kill the pid if it's still here. 
If not, that's what we + * wanted so ignore any errors. + */ + (void) SendProcSignal(pid, sigmode, procvxid.backendId); + } + } + } + + LWLockRelease(ProcArrayLock); +} + +/* + * CountUserBackends --- count backends that are used by specified user + */ +int +CountUserBackends(Oid roleid) +{ + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (proc->pid == 0) + continue; /* do not count prepared xacts */ + if (proc->isBackgroundWorker) + continue; /* do not count background workers */ + if (proc->roleId == roleid) + count++; + } + + LWLockRelease(ProcArrayLock); + + return count; +} + +/* + * CountOtherDBBackends -- check for other backends running in the given DB + * + * If there are other backends in the DB, we will wait a maximum of 5 seconds + * for them to exit. Autovacuum backends are encouraged to exit early by + * sending them SIGTERM, but normal user backends are just waited for. + * + * The current backend is always ignored; it is caller's responsibility to + * check whether the current backend uses the given DB, if it's important. + * + * Returns true if there are (still) other backends in the DB, false if not. + * Also, *nbackends and *nprepared are set to the number of other backends + * and prepared transactions in the DB, respectively. + * + * This function is used to interlock DROP DATABASE and related commands + * against there being any active backends in the target DB --- dropping the + * DB while active backends remain would be a Bad Thing. Note that we cannot + * detect here the possibility of a newly-started backend that is trying to + * connect to the doomed database, so additional interlocking is needed during + * backend startup. The caller should normally hold an exclusive lock on the + * target DB before calling this, which is one reason we mustn't wait + * indefinitely. + */ +bool +CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) +{ + ProcArrayStruct *arrayP = procArray; + +#define MAXAUTOVACPIDS 10 /* max autovacs to SIGTERM per iteration */ + int autovac_pids[MAXAUTOVACPIDS]; + int tries; + + /* 50 tries with 100ms sleep between tries makes 5 sec total wait */ + for (tries = 0; tries < 50; tries++) + { + int nautovacs = 0; + bool found = false; + int index; + + CHECK_FOR_INTERRUPTS(); + + *nbackends = *nprepared = 0; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + uint8 statusFlags = ProcGlobal->statusFlags[index]; + + if (proc->databaseId != databaseId) + continue; + if (proc == MyProc) + continue; + + found = true; + + if (proc->pid == 0) + (*nprepared)++; + else + { + (*nbackends)++; + if ((statusFlags & PROC_IS_AUTOVACUUM) && + nautovacs < MAXAUTOVACPIDS) + autovac_pids[nautovacs++] = proc->pid; + } + } + + LWLockRelease(ProcArrayLock); + + if (!found) + return false; /* no conflicting backends, so done */ + + /* + * Send SIGTERM to any conflicting autovacuums before sleeping. We + * postpone this step until after the loop because we don't want to + * hold ProcArrayLock while issuing kill(). We have no idea what might + * block kill() inside the kernel... 
+ */ + for (index = 0; index < nautovacs; index++) + (void) kill(autovac_pids[index], SIGTERM); /* ignore any error */ + + /* sleep, then try again */ + pg_usleep(100 * 1000L); /* 100ms */ + } + + return true; /* timed out, still conflicts */ +} + +/* + * Terminate existing connections to the specified database. This routine + * is used by the DROP DATABASE command when user has asked to forcefully + * drop the database. + * + * The current backend is always ignored; it is caller's responsibility to + * check whether the current backend uses the given DB, if it's important. + * + * It doesn't allow to terminate the connections even if there is a one + * backend with the prepared transaction in the target database. + */ +void +TerminateOtherDBBackends(Oid databaseId) +{ + ProcArrayStruct *arrayP = procArray; + List *pids = NIL; + int nprepared = 0; + int i; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (i = 0; i < procArray->numProcs; i++) + { + int pgprocno = arrayP->pgprocnos[i]; + PGPROC *proc = &allProcs[pgprocno]; + + if (proc->databaseId != databaseId) + continue; + if (proc == MyProc) + continue; + + if (proc->pid != 0) + pids = lappend_int(pids, proc->pid); + else + nprepared++; + } + + LWLockRelease(ProcArrayLock); + + if (nprepared > 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("database \"%s\" is being used by prepared transactions", + get_database_name(databaseId)), + errdetail_plural("There is %d prepared transaction using the database.", + "There are %d prepared transactions using the database.", + nprepared, + nprepared))); + + if (pids) + { + ListCell *lc; + + /* + * Check whether we have the necessary rights to terminate other + * sessions. We don't terminate any session until we ensure that we + * have rights on all the sessions to be terminated. These checks are + * the same as we do in pg_terminate_backend. + * + * In this case we don't raise some warnings - like "PID %d is not a + * PostgreSQL server process", because for us already finished session + * is not a problem. + */ + foreach(lc, pids) + { + int pid = lfirst_int(lc); + PGPROC *proc = BackendPidGetProc(pid); + + if (proc != NULL) + { + /* Only allow superusers to signal superuser-owned backends. */ + if (superuser_arg(proc->roleId) && !superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be a superuser to terminate superuser process"))); + + /* Users can signal backends they have role membership in. */ + if (!has_privs_of_role(GetUserId(), proc->roleId) && + !has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_BACKEND)) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be a member of the role whose process is being terminated or member of pg_signal_backend"))); + } + } + + /* + * There's a race condition here: once we release the ProcArrayLock, + * it's possible for the session to exit before we issue kill. That + * race condition possibility seems too unlikely to worry about. See + * pg_signal_backend. 
+ */ + foreach(lc, pids) + { + int pid = lfirst_int(lc); + PGPROC *proc = BackendPidGetProc(pid); + + if (proc != NULL) + { + /* + * If we have setsid(), signal the backend's whole process + * group + */ +#ifdef HAVE_SETSID + (void) kill(-pid, SIGTERM); +#else + (void) kill(pid, SIGTERM); +#endif + } + } + } +} + +/* + * ProcArraySetReplicationSlotXmin + * + * Install limits to future computations of the xmin horizon to prevent vacuum + * and HOT pruning from removing affected rows still needed by clients with + * replication slots. + */ +void +ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin, + bool already_locked) +{ + Assert(!already_locked || LWLockHeldByMe(ProcArrayLock)); + + if (!already_locked) + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + procArray->replication_slot_xmin = xmin; + procArray->replication_slot_catalog_xmin = catalog_xmin; + + if (!already_locked) + LWLockRelease(ProcArrayLock); +} + +/* + * ProcArrayGetReplicationSlotXmin + * + * Return the current slot xmin limits. That's useful to be able to remove + * data that's older than those limits. + */ +void +ProcArrayGetReplicationSlotXmin(TransactionId *xmin, + TransactionId *catalog_xmin) +{ + LWLockAcquire(ProcArrayLock, LW_SHARED); + + if (xmin != NULL) + *xmin = procArray->replication_slot_xmin; + + if (catalog_xmin != NULL) + *catalog_xmin = procArray->replication_slot_catalog_xmin; + + LWLockRelease(ProcArrayLock); +} + +/* + * XidCacheRemoveRunningXids + * + * Remove a bunch of TransactionIds from the list of known-running + * subtransactions for my backend. Both the specified xid and those in + * the xids[] array (of length nxids) are removed from the subxids cache. + * latestXid must be the latest XID among the group. + */ +void +XidCacheRemoveRunningXids(TransactionId xid, + int nxids, const TransactionId *xids, + TransactionId latestXid) +{ + int i, + j; + XidCacheStatus *mysubxidstat; + + Assert(TransactionIdIsValid(xid)); + + /* + * We must hold ProcArrayLock exclusively in order to remove transactions + * from the PGPROC array. (See src/backend/access/transam/README.) It's + * possible this could be relaxed since we know this routine is only used + * to abort subtransactions, but pending closer analysis we'd best be + * conservative. + * + * Note that we do not have to be careful about memory ordering of our own + * reads wrt. GetNewTransactionId() here - only this process can modify + * relevant fields of MyProc/ProcGlobal->xids[]. But we do have to be + * careful about our own writes being well ordered. + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + mysubxidstat = &ProcGlobal->subxidStates[MyProc->pgxactoff]; + + /* + * Under normal circumstances xid and xids[] will be in increasing order, + * as will be the entries in subxids. Scan backwards to avoid O(N^2) + * behavior when removing a lot of xids. + */ + for (i = nxids - 1; i >= 0; i--) + { + TransactionId anxid = xids[i]; + + for (j = MyProc->subxidStatus.count - 1; j >= 0; j--) + { + if (TransactionIdEquals(MyProc->subxids.xids[j], anxid)) + { + MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1]; + pg_write_barrier(); + mysubxidstat->count--; + MyProc->subxidStatus.count--; + break; + } + } + + /* + * Ordinarily we should have found it, unless the cache has + * overflowed. However it's also possible for this routine to be + * invoked multiple times for the same subtransaction, in case of an + * error during AbortSubTransaction. So instead of Assert, emit a + * debug warning. 
+ */ + if (j < 0 && !MyProc->subxidStatus.overflowed) + elog(WARNING, "did not find subXID %u in MyProc", anxid); + } + + for (j = MyProc->subxidStatus.count - 1; j >= 0; j--) + { + if (TransactionIdEquals(MyProc->subxids.xids[j], xid)) + { + MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1]; + pg_write_barrier(); + mysubxidstat->count--; + MyProc->subxidStatus.count--; + break; + } + } + /* Ordinarily we should have found it, unless the cache has overflowed */ + if (j < 0 && !MyProc->subxidStatus.overflowed) + elog(WARNING, "did not find subXID %u in MyProc", xid); + + /* Also advance global latestCompletedXid while holding the lock */ + MaintainLatestCompletedXid(latestXid); + + /* ... and xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; + + LWLockRelease(ProcArrayLock); +} + +#ifdef XIDCACHE_DEBUG + +/* + * Print stats about effectiveness of XID cache + */ +static void +DisplayXidCache(void) +{ + fprintf(stderr, + "XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, nooflo: %ld, slow: %ld\n", + xc_by_recent_xmin, + xc_by_known_xact, + xc_by_my_xact, + xc_by_latest_xid, + xc_by_main_xid, + xc_by_child_xid, + xc_by_known_assigned, + xc_no_overflow, + xc_slow_answer); +} +#endif /* XIDCACHE_DEBUG */ + +/* + * If rel != NULL, return test state appropriate for relation, otherwise + * return state usable for all relations. The latter may consider XIDs as + * not-yet-visible-to-everyone that a state for a specific relation would + * already consider visible-to-everyone. + * + * This needs to be called while a snapshot is active or registered, otherwise + * there are wraparound and other dangers. + * + * See comment for GlobalVisState for details. + */ +GlobalVisState * +GlobalVisTestFor(Relation rel) +{ + GlobalVisState *state = NULL; + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(RecentXmin); + + switch (GlobalVisHorizonKindForRel(rel)) + { + case VISHORIZON_SHARED: + state = &GlobalVisSharedRels; + break; + case VISHORIZON_CATALOG: + state = &GlobalVisCatalogRels; + break; + case VISHORIZON_DATA: + state = &GlobalVisDataRels; + break; + case VISHORIZON_TEMP: + state = &GlobalVisTempRels; + break; + } + + Assert(FullTransactionIdIsValid(state->definitely_needed) && + FullTransactionIdIsValid(state->maybe_needed)); + + return state; +} + +/* + * Return true if it's worth updating the accurate maybe_needed boundary. + * + * As it is somewhat expensive to determine xmin horizons, we don't want to + * repeatedly do so when there is a low likelihood of it being beneficial. + * + * The current heuristic is that we update only if RecentXmin has changed + * since the last update. If the oldest currently running transaction has not + * finished, it is unlikely that recomputing the horizon would be useful. + */ +static bool +GlobalVisTestShouldUpdate(GlobalVisState *state) +{ + /* hasn't been updated yet */ + if (!TransactionIdIsValid(ComputeXidHorizonsResultLastXmin)) + return true; + + /* + * If the maybe_needed/definitely_needed boundaries are the same, it's + * unlikely to be beneficial to refresh boundaries. + */ + if (FullTransactionIdFollowsOrEquals(state->maybe_needed, + state->definitely_needed)) + return false; + + /* does the last snapshot built have a different xmin? 
*/ + return RecentXmin != ComputeXidHorizonsResultLastXmin; +} + +static void +GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons) +{ + GlobalVisSharedRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->shared_oldest_nonremovable); + GlobalVisCatalogRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->catalog_oldest_nonremovable); + GlobalVisDataRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->data_oldest_nonremovable); + GlobalVisTempRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->temp_oldest_nonremovable); + + /* + * In longer running transactions it's possible that transactions we + * previously needed to treat as running aren't around anymore. So update + * definitely_needed to not be earlier than maybe_needed. + */ + GlobalVisSharedRels.definitely_needed = + FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed, + GlobalVisSharedRels.definitely_needed); + GlobalVisCatalogRels.definitely_needed = + FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed, + GlobalVisCatalogRels.definitely_needed); + GlobalVisDataRels.definitely_needed = + FullTransactionIdNewer(GlobalVisDataRels.maybe_needed, + GlobalVisDataRels.definitely_needed); + GlobalVisTempRels.definitely_needed = GlobalVisTempRels.maybe_needed; + + ComputeXidHorizonsResultLastXmin = RecentXmin; +} + +/* + * Update boundaries in GlobalVis{Shared,Catalog, Data}Rels + * using ComputeXidHorizons(). + */ +static void +GlobalVisUpdate(void) +{ + ComputeXidHorizonsResult horizons; + + /* updates the horizons as a side-effect */ + ComputeXidHorizons(&horizons); +} + +/* + * Return true if no snapshot still considers fxid to be running. + * + * The state passed needs to have been initialized for the relation fxid is + * from (NULL is also OK), otherwise the result may not be correct. + * + * See comment for GlobalVisState for details. + */ +bool +GlobalVisTestIsRemovableFullXid(GlobalVisState *state, + FullTransactionId fxid) +{ + /* + * If fxid is older than maybe_needed bound, it definitely is visible to + * everyone. + */ + if (FullTransactionIdPrecedes(fxid, state->maybe_needed)) + return true; + + /* + * If fxid is >= definitely_needed bound, it is very likely to still be + * considered running. + */ + if (FullTransactionIdFollowsOrEquals(fxid, state->definitely_needed)) + return false; + + /* + * fxid is between maybe_needed and definitely_needed, i.e. there might or + * might not exist a snapshot considering fxid running. If it makes sense, + * update boundaries and recheck. + */ + if (GlobalVisTestShouldUpdate(state)) + { + GlobalVisUpdate(); + + Assert(FullTransactionIdPrecedes(fxid, state->definitely_needed)); + + return FullTransactionIdPrecedes(fxid, state->maybe_needed); + } + else + return false; +} + +/* + * Wrapper around GlobalVisTestIsRemovableFullXid() for 32bit xids. + * + * It is crucial that this only gets called for xids from a source that + * protects against xid wraparounds (e.g. from a table and thus protected by + * relfrozenxid). + */ +bool +GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid) +{ + FullTransactionId fxid; + + /* + * Convert 32 bit argument to FullTransactionId. We can do so safely + * because we know the xid has to, at the very least, be between + * [oldestXid, nextFullXid), i.e. within 2 billion of xid. 
To avoid taking + * a lock to determine either, we can just compare with + * state->definitely_needed, which was based on those value at the time + * the current snapshot was built. + */ + fxid = FullXidRelativeTo(state->definitely_needed, xid); + + return GlobalVisTestIsRemovableFullXid(state, fxid); +} + +/* + * Return FullTransactionId below which all transactions are not considered + * running anymore. + * + * Note: This is less efficient than testing with + * GlobalVisTestIsRemovableFullXid as it likely requires building an accurate + * cutoff, even in the case all the XIDs compared with the cutoff are outside + * [maybe_needed, definitely_needed). + */ +FullTransactionId +GlobalVisTestNonRemovableFullHorizon(GlobalVisState *state) +{ + /* acquire accurate horizon if not already done */ + if (GlobalVisTestShouldUpdate(state)) + GlobalVisUpdate(); + + return state->maybe_needed; +} + +/* Convenience wrapper around GlobalVisTestNonRemovableFullHorizon */ +TransactionId +GlobalVisTestNonRemovableHorizon(GlobalVisState *state) +{ + FullTransactionId cutoff; + + cutoff = GlobalVisTestNonRemovableFullHorizon(state); + + return XidFromFullTransactionId(cutoff); +} + +/* + * Convenience wrapper around GlobalVisTestFor() and + * GlobalVisTestIsRemovableFullXid(), see their comments. + */ +bool +GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid) +{ + GlobalVisState *state; + + state = GlobalVisTestFor(rel); + + return GlobalVisTestIsRemovableFullXid(state, fxid); +} + +/* + * Convenience wrapper around GlobalVisTestFor() and + * GlobalVisTestIsRemovableXid(), see their comments. + */ +bool +GlobalVisCheckRemovableXid(Relation rel, TransactionId xid) +{ + GlobalVisState *state; + + state = GlobalVisTestFor(rel); + + return GlobalVisTestIsRemovableXid(state, xid); +} + +/* + * Convert a 32 bit transaction id into 64 bit transaction id, by assuming it + * is within MaxTransactionId / 2 of XidFromFullTransactionId(rel). + * + * Be very careful about when to use this function. It can only safely be used + * when there is a guarantee that xid is within MaxTransactionId / 2 xids of + * rel. That e.g. can be guaranteed if the caller assures a snapshot is + * held by the backend and xid is from a table (where vacuum/freezing ensures + * the xid has to be within that range), or if xid is from the procarray and + * prevents xid wraparound that way. + */ +static inline FullTransactionId +FullXidRelativeTo(FullTransactionId rel, TransactionId xid) +{ + TransactionId rel_xid = XidFromFullTransactionId(rel); + + Assert(TransactionIdIsValid(xid)); + Assert(TransactionIdIsValid(rel_xid)); + + /* not guaranteed to find issues, but likely to catch mistakes */ + AssertTransactionIdInAllowableRange(xid); + + return FullTransactionIdFromU64(U64FromFullTransactionId(rel) + + (int32) (xid - rel_xid)); +} + + +/* ---------------------------------------------- + * KnownAssignedTransactionIds sub-module + * ---------------------------------------------- + */ + +/* + * In Hot Standby mode, we maintain a list of transactions that are (or were) + * running on the primary at the current point in WAL. These XIDs must be + * treated as running by standby transactions, even though they are not in + * the standby server's PGPROC array. + * + * We record all XIDs that we know have been assigned. That includes all the + * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have + * been assigned. 
We can deduce the existence of unobserved XIDs because we + * know XIDs are assigned in sequence, with no gaps. The KnownAssignedXids + * list expands as new XIDs are observed or inferred, and contracts when + * transaction completion records arrive. + * + * During hot standby we do not fret too much about the distinction between + * top-level XIDs and subtransaction XIDs. We store both together in the + * KnownAssignedXids list. In backends, this is copied into snapshots in + * GetSnapshotData(), taking advantage of the fact that XidInMVCCSnapshot() + * doesn't care about the distinction either. Subtransaction XIDs are + * effectively treated as top-level XIDs and in the typical case pg_subtrans + * links are *not* maintained (which does not affect visibility). + * + * We have room in KnownAssignedXids and in snapshots to hold maxProcs * + * (1 + PGPROC_MAX_CACHED_SUBXIDS) XIDs, so every primary transaction must + * report its subtransaction XIDs in a WAL XLOG_XACT_ASSIGNMENT record at + * least every PGPROC_MAX_CACHED_SUBXIDS. When we receive one of these + * records, we mark the subXIDs as children of the top XID in pg_subtrans, + * and then remove them from KnownAssignedXids. This prevents overflow of + * KnownAssignedXids and snapshots, at the cost that status checks for these + * subXIDs will take a slower path through TransactionIdIsInProgress(). + * This means that KnownAssignedXids is not necessarily complete for subXIDs, + * though it should be complete for top-level XIDs; this is the same situation + * that holds with respect to the PGPROC entries in normal running. + * + * When we throw away subXIDs from KnownAssignedXids, we need to keep track of + * that, similarly to tracking overflow of a PGPROC's subxids array. We do + * that by remembering the lastOverflowedXid, ie the last thrown-away subXID. + * As long as that is within the range of interesting XIDs, we have to assume + * that subXIDs are missing from snapshots. (Note that subXID overflow occurs + * on primary when 65th subXID arrives, whereas on standby it occurs when 64th + * subXID arrives - that is not an error.) + * + * Should a backend on primary somehow disappear before it can write an abort + * record, then we just leave those XIDs in KnownAssignedXids. They actually + * aborted but we think they were running; the distinction is irrelevant + * because either way any changes done by the transaction are not visible to + * backends in the standby. We prune KnownAssignedXids when + * XLOG_RUNNING_XACTS arrives, to forestall possible overflow of the + * array due to such dead XIDs. + */ + +/* + * RecordKnownAssignedTransactionIds + * Record the given XID in KnownAssignedXids, as well as any preceding + * unobserved XIDs. + * + * RecordKnownAssignedTransactionIds() should be run for *every* WAL record + * associated with a transaction. Must be called for each record after we + * have executed StartupCLOG() et al, since we must ExtendCLOG() etc.. + * + * Called during recovery in analogy with and in place of GetNewTransactionId() + */ +void +RecordKnownAssignedTransactionIds(TransactionId xid) +{ + Assert(standbyState >= STANDBY_INITIALIZED); + Assert(TransactionIdIsValid(xid)); + Assert(TransactionIdIsValid(latestObservedXid)); + + elog(trace_recovery(DEBUG4), "record known xact %u latestObservedXid %u", + xid, latestObservedXid); + + /* + * When a newly observed xid arrives, it is frequently the case that it is + * *not* the next xid in sequence. 
When this occurs, we must treat the + * intervening xids as running also. + */ + if (TransactionIdFollows(xid, latestObservedXid)) + { + TransactionId next_expected_xid; + + /* + * Extend subtrans like we do in GetNewTransactionId() during normal + * operation using individual extend steps. Note that we do not need + * to extend clog since its extensions are WAL logged. + * + * This part has to be done regardless of standbyState since we + * immediately start assigning subtransactions to their toplevel + * transactions. + */ + next_expected_xid = latestObservedXid; + while (TransactionIdPrecedes(next_expected_xid, xid)) + { + TransactionIdAdvance(next_expected_xid); + ExtendSUBTRANS(next_expected_xid); + } + Assert(next_expected_xid == xid); + + /* + * If the KnownAssignedXids machinery isn't up yet, there's nothing + * more to do since we don't track assigned xids yet. + */ + if (standbyState <= STANDBY_INITIALIZED) + { + latestObservedXid = xid; + return; + } + + /* + * Add (latestObservedXid, xid] onto the KnownAssignedXids array. + */ + next_expected_xid = latestObservedXid; + TransactionIdAdvance(next_expected_xid); + KnownAssignedXidsAdd(next_expected_xid, xid, false); + + /* + * Now we can advance latestObservedXid + */ + latestObservedXid = xid; + + /* ShmemVariableCache->nextXid must be beyond any observed xid */ + AdvanceNextFullTransactionIdPastXid(latestObservedXid); + next_expected_xid = latestObservedXid; + TransactionIdAdvance(next_expected_xid); + } +} + +/* + * ExpireTreeKnownAssignedTransactionIds + * Remove the given XIDs from KnownAssignedXids. + * + * Called during recovery in analogy with and in place of ProcArrayEndTransaction() + */ +void +ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids, + TransactionId *subxids, TransactionId max_xid) +{ + Assert(standbyState >= STANDBY_INITIALIZED); + + /* + * Uses same locking as transaction commit + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + KnownAssignedXidsRemoveTree(xid, nsubxids, subxids); + + /* As in ProcArrayEndTransaction, advance latestCompletedXid */ + MaintainLatestCompletedXidRecovery(max_xid); + + /* ... and xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; + + LWLockRelease(ProcArrayLock); +} + +/* + * ExpireAllKnownAssignedTransactionIds + * Remove all entries in KnownAssignedXids and reset lastOverflowedXid. + */ +void +ExpireAllKnownAssignedTransactionIds(void) +{ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + KnownAssignedXidsRemovePreceding(InvalidTransactionId); + + /* + * Reset lastOverflowedXid. Currently, lastOverflowedXid has no use after + * the call of this function. But do this for unification with what + * ExpireOldKnownAssignedTransactionIds() do. + */ + procArray->lastOverflowedXid = InvalidTransactionId; + LWLockRelease(ProcArrayLock); +} + +/* + * ExpireOldKnownAssignedTransactionIds + * Remove KnownAssignedXids entries preceding the given XID and + * potentially reset lastOverflowedXid. + */ +void +ExpireOldKnownAssignedTransactionIds(TransactionId xid) +{ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * Reset lastOverflowedXid if we know all transactions that have been + * possibly running are being gone. Not doing so could cause an incorrect + * lastOverflowedXid value, which makes extra snapshots be marked as + * suboverflowed. 
+ */ + if (TransactionIdPrecedes(procArray->lastOverflowedXid, xid)) + procArray->lastOverflowedXid = InvalidTransactionId; + KnownAssignedXidsRemovePreceding(xid); + LWLockRelease(ProcArrayLock); +} + + +/* + * Private module functions to manipulate KnownAssignedXids + * + * There are 5 main uses of the KnownAssignedXids data structure: + * + * * backends taking snapshots - all valid XIDs need to be copied out + * * backends seeking to determine presence of a specific XID + * * startup process adding new known-assigned XIDs + * * startup process removing specific XIDs as transactions end + * * startup process pruning array when special WAL records arrive + * + * This data structure is known to be a hot spot during Hot Standby, so we + * go to some lengths to make these operations as efficient and as concurrent + * as possible. + * + * The XIDs are stored in an array in sorted order --- TransactionIdPrecedes + * order, to be exact --- to allow binary search for specific XIDs. Note: + * in general TransactionIdPrecedes would not provide a total order, but + * we know that the entries present at any instant should not extend across + * a large enough fraction of XID space to wrap around (the primary would + * shut down for fear of XID wrap long before that happens). So it's OK to + * use TransactionIdPrecedes as a binary-search comparator. + * + * It's cheap to maintain the sortedness during insertions, since new known + * XIDs are always reported in XID order; we just append them at the right. + * + * To keep individual deletions cheap, we need to allow gaps in the array. + * This is implemented by marking array elements as valid or invalid using + * the parallel boolean array KnownAssignedXidsValid[]. A deletion is done + * by setting KnownAssignedXidsValid[i] to false, *without* clearing the + * XID entry itself. This preserves the property that the XID entries are + * sorted, so we can do binary searches easily. Periodically we compress + * out the unused entries; that's much cheaper than having to compress the + * array immediately on every deletion. + * + * The actually valid items in KnownAssignedXids[] and KnownAssignedXidsValid[] + * are those with indexes tail <= i < head; items outside this subscript range + * have unspecified contents. When head reaches the end of the array, we + * force compression of unused entries rather than wrapping around, since + * allowing wraparound would greatly complicate the search logic. We maintain + * an explicit tail pointer so that pruning of old XIDs can be done without + * immediately moving the array contents. In most cases only a small fraction + * of the array contains valid entries at any instant. + * + * Although only the startup process can ever change the KnownAssignedXids + * data structure, we still need interlocking so that standby backends will + * not observe invalid intermediate states. The convention is that backends + * must hold shared ProcArrayLock to examine the array. To remove XIDs from + * the array, the startup process must hold ProcArrayLock exclusively, for + * the usual transactional reasons (compare commit/abort of a transaction + * during normal running). Compressing unused entries out of the array + * likewise requires exclusive lock. To add XIDs to the array, we just insert + * them into slots to the right of the head pointer and then advance the head + * pointer. 
This wouldn't require any lock at all, except that on machines + * with weak memory ordering we need to be careful that other processors + * see the array element changes before they see the head pointer change. + * We handle this by using a spinlock to protect reads and writes of the + * head/tail pointers. (We could dispense with the spinlock if we were to + * create suitable memory access barrier primitives and use those instead.) + * The spinlock must be taken to read or write the head/tail pointers unless + * the caller holds ProcArrayLock exclusively. + * + * Algorithmic analysis: + * + * If we have a maximum of M slots, with N XIDs currently spread across + * S elements then we have N <= S <= M always. + * + * * Adding a new XID is O(1) and needs little locking (unless compression + * must happen) + * * Compressing the array is O(S) and requires exclusive lock + * * Removing an XID is O(logS) and requires exclusive lock + * * Taking a snapshot is O(S) and requires shared lock + * * Checking for an XID is O(logS) and requires shared lock + * + * In comparison, using a hash table for KnownAssignedXids would mean that + * taking snapshots would be O(M). If we can maintain S << M then the + * sorted array technique will deliver significantly faster snapshots. + * If we try to keep S too small then we will spend too much time compressing, + * so there is an optimal point for any workload mix. We use a heuristic to + * decide when to compress the array, though trimming also helps reduce + * frequency of compressing. The heuristic requires us to track the number of + * currently valid XIDs in the array. + */ + + +/* + * Compress KnownAssignedXids by shifting valid data down to the start of the + * array, removing any gaps. + * + * A compression step is forced if "force" is true, otherwise we do it + * only if a heuristic indicates it's a good time to do it. + * + * Caller must hold ProcArrayLock in exclusive mode. + */ +static void +KnownAssignedXidsCompress(bool force) +{ + ProcArrayStruct *pArray = procArray; + int head, + tail; + int compress_index; + int i; + + /* no spinlock required since we hold ProcArrayLock exclusively */ + head = pArray->headKnownAssignedXids; + tail = pArray->tailKnownAssignedXids; + + if (!force) + { + /* + * If we can choose how much to compress, use a heuristic to avoid + * compressing too often or not often enough. + * + * Heuristic is if we have a large enough current spread and less than + * 50% of the elements are currently in use, then compress. This + * should ensure we compress fairly infrequently. We could compress + * less often though the virtual array would spread out more and + * snapshots would become more expensive. + */ + int nelements = head - tail; + + if (nelements < 4 * PROCARRAY_MAXPROCS || + nelements < 2 * pArray->numKnownAssignedXids) + return; + } + + /* + * We compress the array by reading the valid values from tail to head, + * re-aligning data to 0th element. + */ + compress_index = 0; + for (i = tail; i < head; i++) + { + if (KnownAssignedXidsValid[i]) + { + KnownAssignedXids[compress_index] = KnownAssignedXids[i]; + KnownAssignedXidsValid[compress_index] = true; + compress_index++; + } + } + + pArray->tailKnownAssignedXids = 0; + pArray->headKnownAssignedXids = compress_index; +} + +/* + * Add xids into KnownAssignedXids at the head of the array. + * + * xids from from_xid to to_xid, inclusive, are added to the array. 
+ * + * If exclusive_lock is true then caller already holds ProcArrayLock in + * exclusive mode, so we need no extra locking here. Else caller holds no + * lock, so we need to be sure we maintain sufficient interlocks against + * concurrent readers. (Only the startup process ever calls this, so no need + * to worry about concurrent writers.) + */ +static void +KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid, + bool exclusive_lock) +{ + ProcArrayStruct *pArray = procArray; + TransactionId next_xid; + int head, + tail; + int nxids; + int i; + + Assert(TransactionIdPrecedesOrEquals(from_xid, to_xid)); + + /* + * Calculate how many array slots we'll need. Normally this is cheap; in + * the unusual case where the XIDs cross the wrap point, we do it the hard + * way. + */ + if (to_xid >= from_xid) + nxids = to_xid - from_xid + 1; + else + { + nxids = 1; + next_xid = from_xid; + while (TransactionIdPrecedes(next_xid, to_xid)) + { + nxids++; + TransactionIdAdvance(next_xid); + } + } + + /* + * Since only the startup process modifies the head/tail pointers, we + * don't need a lock to read them here. + */ + head = pArray->headKnownAssignedXids; + tail = pArray->tailKnownAssignedXids; + + Assert(head >= 0 && head <= pArray->maxKnownAssignedXids); + Assert(tail >= 0 && tail < pArray->maxKnownAssignedXids); + + /* + * Verify that insertions occur in TransactionId sequence. Note that even + * if the last existing element is marked invalid, it must still have a + * correctly sequenced XID value. + */ + if (head > tail && + TransactionIdFollowsOrEquals(KnownAssignedXids[head - 1], from_xid)) + { + KnownAssignedXidsDisplay(LOG); + elog(ERROR, "out-of-order XID insertion in KnownAssignedXids"); + } + + /* + * If our xids won't fit in the remaining space, compress out free space + */ + if (head + nxids > pArray->maxKnownAssignedXids) + { + /* must hold lock to compress */ + if (!exclusive_lock) + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + KnownAssignedXidsCompress(true); + + head = pArray->headKnownAssignedXids; + /* note: we no longer care about the tail pointer */ + + if (!exclusive_lock) + LWLockRelease(ProcArrayLock); + + /* + * If it still won't fit then we're out of memory + */ + if (head + nxids > pArray->maxKnownAssignedXids) + elog(ERROR, "too many KnownAssignedXids"); + } + + /* Now we can insert the xids into the space starting at head */ + next_xid = from_xid; + for (i = 0; i < nxids; i++) + { + KnownAssignedXids[head] = next_xid; + KnownAssignedXidsValid[head] = true; + TransactionIdAdvance(next_xid); + head++; + } + + /* Adjust count of number of valid entries */ + pArray->numKnownAssignedXids += nxids; + + /* + * Now update the head pointer. We use a spinlock to protect this + * pointer, not because the update is likely to be non-atomic, but to + * ensure that other processors see the above array updates before they + * see the head pointer change. + * + * If we're holding ProcArrayLock exclusively, there's no need to take the + * spinlock. + */ + if (exclusive_lock) + pArray->headKnownAssignedXids = head; + else + { + SpinLockAcquire(&pArray->known_assigned_xids_lck); + pArray->headKnownAssignedXids = head; + SpinLockRelease(&pArray->known_assigned_xids_lck); + } +} + +/* + * KnownAssignedXidsSearch + * + * Searches KnownAssignedXids for a specific xid and optionally removes it. + * Returns true if it was found, false if not. + * + * Caller must hold ProcArrayLock in shared or exclusive mode. + * Exclusive lock must be held for remove = true. 
+ */ +static bool +KnownAssignedXidsSearch(TransactionId xid, bool remove) +{ + ProcArrayStruct *pArray = procArray; + int first, + last; + int head; + int tail; + int result_index = -1; + + if (remove) + { + /* we hold ProcArrayLock exclusively, so no need for spinlock */ + tail = pArray->tailKnownAssignedXids; + head = pArray->headKnownAssignedXids; + } + else + { + /* take spinlock to ensure we see up-to-date array contents */ + SpinLockAcquire(&pArray->known_assigned_xids_lck); + tail = pArray->tailKnownAssignedXids; + head = pArray->headKnownAssignedXids; + SpinLockRelease(&pArray->known_assigned_xids_lck); + } + + /* + * Standard binary search. Note we can ignore the KnownAssignedXidsValid + * array here, since even invalid entries will contain sorted XIDs. + */ + first = tail; + last = head - 1; + while (first <= last) + { + int mid_index; + TransactionId mid_xid; + + mid_index = (first + last) / 2; + mid_xid = KnownAssignedXids[mid_index]; + + if (xid == mid_xid) + { + result_index = mid_index; + break; + } + else if (TransactionIdPrecedes(xid, mid_xid)) + last = mid_index - 1; + else + first = mid_index + 1; + } + + if (result_index < 0) + return false; /* not in array */ + + if (!KnownAssignedXidsValid[result_index]) + return false; /* in array, but invalid */ + + if (remove) + { + KnownAssignedXidsValid[result_index] = false; + + pArray->numKnownAssignedXids--; + Assert(pArray->numKnownAssignedXids >= 0); + + /* + * If we're removing the tail element then advance tail pointer over + * any invalid elements. This will speed future searches. + */ + if (result_index == tail) + { + tail++; + while (tail < head && !KnownAssignedXidsValid[tail]) + tail++; + if (tail >= head) + { + /* Array is empty, so we can reset both pointers */ + pArray->headKnownAssignedXids = 0; + pArray->tailKnownAssignedXids = 0; + } + else + { + pArray->tailKnownAssignedXids = tail; + } + } + } + + return true; +} + +/* + * Is the specified XID present in KnownAssignedXids[]? + * + * Caller must hold ProcArrayLock in shared or exclusive mode. + */ +static bool +KnownAssignedXidExists(TransactionId xid) +{ + Assert(TransactionIdIsValid(xid)); + + return KnownAssignedXidsSearch(xid, false); +} + +/* + * Remove the specified XID from KnownAssignedXids[]. + * + * Caller must hold ProcArrayLock in exclusive mode. + */ +static void +KnownAssignedXidsRemove(TransactionId xid) +{ + Assert(TransactionIdIsValid(xid)); + + elog(trace_recovery(DEBUG4), "remove KnownAssignedXid %u", xid); + + /* + * Note: we cannot consider it an error to remove an XID that's not + * present. We intentionally remove subxact IDs while processing + * XLOG_XACT_ASSIGNMENT, to avoid array overflow. Then those XIDs will be + * removed again when the top-level xact commits or aborts. + * + * It might be possible to track such XIDs to distinguish this case from + * actual errors, but it would be complicated and probably not worth it. + * So, just ignore the search result. + */ + (void) KnownAssignedXidsSearch(xid, true); +} + +/* + * KnownAssignedXidsRemoveTree + * Remove xid (if it's not InvalidTransactionId) and all the subxids. + * + * Caller must hold ProcArrayLock in exclusive mode. 
+ */ +static void +KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids, + TransactionId *subxids) +{ + int i; + + if (TransactionIdIsValid(xid)) + KnownAssignedXidsRemove(xid); + + for (i = 0; i < nsubxids; i++) + KnownAssignedXidsRemove(subxids[i]); + + /* Opportunistically compress the array */ + KnownAssignedXidsCompress(false); +} + +/* + * Prune KnownAssignedXids up to, but *not* including xid. If xid is invalid + * then clear the whole table. + * + * Caller must hold ProcArrayLock in exclusive mode. + */ +static void +KnownAssignedXidsRemovePreceding(TransactionId removeXid) +{ + ProcArrayStruct *pArray = procArray; + int count = 0; + int head, + tail, + i; + + if (!TransactionIdIsValid(removeXid)) + { + elog(trace_recovery(DEBUG4), "removing all KnownAssignedXids"); + pArray->numKnownAssignedXids = 0; + pArray->headKnownAssignedXids = pArray->tailKnownAssignedXids = 0; + return; + } + + elog(trace_recovery(DEBUG4), "prune KnownAssignedXids to %u", removeXid); + + /* + * Mark entries invalid starting at the tail. Since array is sorted, we + * can stop as soon as we reach an entry >= removeXid. + */ + tail = pArray->tailKnownAssignedXids; + head = pArray->headKnownAssignedXids; + + for (i = tail; i < head; i++) + { + if (KnownAssignedXidsValid[i]) + { + TransactionId knownXid = KnownAssignedXids[i]; + + if (TransactionIdFollowsOrEquals(knownXid, removeXid)) + break; + + if (!StandbyTransactionIdIsPrepared(knownXid)) + { + KnownAssignedXidsValid[i] = false; + count++; + } + } + } + + pArray->numKnownAssignedXids -= count; + Assert(pArray->numKnownAssignedXids >= 0); + + /* + * Advance the tail pointer if we've marked the tail item invalid. + */ + for (i = tail; i < head; i++) + { + if (KnownAssignedXidsValid[i]) + break; + } + if (i >= head) + { + /* Array is empty, so we can reset both pointers */ + pArray->headKnownAssignedXids = 0; + pArray->tailKnownAssignedXids = 0; + } + else + { + pArray->tailKnownAssignedXids = i; + } + + /* Opportunistically compress the array */ + KnownAssignedXidsCompress(false); +} + +/* + * KnownAssignedXidsGet - Get an array of xids by scanning KnownAssignedXids. + * We filter out anything >= xmax. + * + * Returns the number of XIDs stored into xarray[]. Caller is responsible + * that array is large enough. + * + * Caller must hold ProcArrayLock in (at least) shared mode. + */ +static int +KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax) +{ + TransactionId xtmp = InvalidTransactionId; + + return KnownAssignedXidsGetAndSetXmin(xarray, &xtmp, xmax); +} + +/* + * KnownAssignedXidsGetAndSetXmin - as KnownAssignedXidsGet, plus + * we reduce *xmin to the lowest xid value seen if not already lower. + * + * Caller must hold ProcArrayLock in (at least) shared mode. + */ +static int +KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin, + TransactionId xmax) +{ + int count = 0; + int head, + tail; + int i; + + /* + * Fetch head just once, since it may change while we loop. We can stop + * once we reach the initially seen head, since we are certain that an xid + * cannot enter and then leave the array while we hold ProcArrayLock. We + * might miss newly-added xids, but they should be >= xmax so irrelevant + * anyway. + * + * Must take spinlock to ensure we see up-to-date array contents. 
+ */ + SpinLockAcquire(&procArray->known_assigned_xids_lck); + tail = procArray->tailKnownAssignedXids; + head = procArray->headKnownAssignedXids; + SpinLockRelease(&procArray->known_assigned_xids_lck); + + for (i = tail; i < head; i++) + { + /* Skip any gaps in the array */ + if (KnownAssignedXidsValid[i]) + { + TransactionId knownXid = KnownAssignedXids[i]; + + /* + * Update xmin if required. Only the first XID need be checked, + * since the array is sorted. + */ + if (count == 0 && + TransactionIdPrecedes(knownXid, *xmin)) + *xmin = knownXid; + + /* + * Filter out anything >= xmax, again relying on sorted property + * of array. + */ + if (TransactionIdIsValid(xmax) && + TransactionIdFollowsOrEquals(knownXid, xmax)) + break; + + /* Add knownXid into output array */ + xarray[count++] = knownXid; + } + } + + return count; +} + +/* + * Get oldest XID in the KnownAssignedXids array, or InvalidTransactionId + * if nothing there. + */ +static TransactionId +KnownAssignedXidsGetOldestXmin(void) +{ + int head, + tail; + int i; + + /* + * Fetch head just once, since it may change while we loop. + */ + SpinLockAcquire(&procArray->known_assigned_xids_lck); + tail = procArray->tailKnownAssignedXids; + head = procArray->headKnownAssignedXids; + SpinLockRelease(&procArray->known_assigned_xids_lck); + + for (i = tail; i < head; i++) + { + /* Skip any gaps in the array */ + if (KnownAssignedXidsValid[i]) + return KnownAssignedXids[i]; + } + + return InvalidTransactionId; +} + +/* + * Display KnownAssignedXids to provide debug trail + * + * Currently this is only called within startup process, so we need no + * special locking. + * + * Note this is pretty expensive, and much of the expense will be incurred + * even if the elog message will get discarded. It's not currently called + * in any performance-critical places, however, so no need to be tenser. 
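+ *
+ * Purely as an illustration (the XID values are invented), a trace line
+ * for an array holding three valid entries spread across indexes 2..6
+ * would look like this; the skipped indexes are entries whose
+ * KnownAssignedXidsValid[] flag has been cleared but that have not been
+ * compressed away yet:
+ *
+ *     3 KnownAssignedXids (num=3 tail=2 head=7) [2]=1021 [4]=1022 [6]=1023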
+ */ +static void +KnownAssignedXidsDisplay(int trace_level) +{ + ProcArrayStruct *pArray = procArray; + StringInfoData buf; + int head, + tail, + i; + int nxids = 0; + + tail = pArray->tailKnownAssignedXids; + head = pArray->headKnownAssignedXids; + + initStringInfo(&buf); + + for (i = tail; i < head; i++) + { + if (KnownAssignedXidsValid[i]) + { + nxids++; + appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]); + } + } + + elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s", + nxids, + pArray->numKnownAssignedXids, + pArray->tailKnownAssignedXids, + pArray->headKnownAssignedXids, + buf.data); + + pfree(buf.data); +} + +/* + * KnownAssignedXidsReset + * Resets KnownAssignedXids to be empty + */ +static void +KnownAssignedXidsReset(void) +{ + ProcArrayStruct *pArray = procArray; + + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + pArray->numKnownAssignedXids = 0; + pArray->tailKnownAssignedXids = 0; + pArray->headKnownAssignedXids = 0; + + LWLockRelease(ProcArrayLock); +} diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c new file mode 100644 index 0000000..defb75a --- /dev/null +++ b/src/backend/storage/ipc/procsignal.c @@ -0,0 +1,685 @@ +/*------------------------------------------------------------------------- + * + * procsignal.c + * Routines for interprocess signaling + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/ipc/procsignal.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> + +#include "access/parallel.h" +#include "port/pg_bitutils.h" +#include "commands/async.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/walsender.h" +#include "storage/condition_variable.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/proc.h" +#include "storage/shmem.h" +#include "storage/sinval.h" +#include "tcop/tcopprot.h" +#include "utils/memutils.h" + +/* + * The SIGUSR1 signal is multiplexed to support signaling multiple event + * types. The specific reason is communicated via flags in shared memory. + * We keep a boolean flag for each possible "reason", so that different + * reasons can be signaled to a process concurrently. (However, if the same + * reason is signaled more than once nearly simultaneously, the process may + * observe it only once.) + * + * Each process that wants to receive signals registers its process ID + * in the ProcSignalSlots array. The array is indexed by backend ID to make + * slot allocation simple, and to avoid having to search the array when you + * know the backend ID of the process you're signaling. (We do support + * signaling without backend ID, but it's a bit less efficient.) + * + * The flags are actually declared as "volatile sig_atomic_t" for maximum + * portability. This should ensure that loads and stores of the flag + * values are atomic, allowing us to dispense with any explicit locking. + * + * pss_signalFlags are intended to be set in cases where we don't need to + * keep track of whether or not the target process has handled the signal, + * but sometimes we need confirmation, as when making a global state change + * that cannot be considered complete until all backends have taken notice + * of it. 
For such use cases, we set a bit in pss_barrierCheckMask and then + * increment the current "barrier generation"; when the new barrier generation + * (or greater) appears in the pss_barrierGeneration flag of every process, + * we know that the message has been received everywhere. + */ +typedef struct +{ + volatile pid_t pss_pid; + volatile sig_atomic_t pss_signalFlags[NUM_PROCSIGNALS]; + pg_atomic_uint64 pss_barrierGeneration; + pg_atomic_uint32 pss_barrierCheckMask; + ConditionVariable pss_barrierCV; +} ProcSignalSlot; + +/* + * Information that is global to the entire ProcSignal system can be stored + * here. + * + * psh_barrierGeneration is the highest barrier generation in existence. + */ +typedef struct +{ + pg_atomic_uint64 psh_barrierGeneration; + ProcSignalSlot psh_slot[FLEXIBLE_ARRAY_MEMBER]; +} ProcSignalHeader; + +/* + * We reserve a slot for each possible BackendId, plus one for each + * possible auxiliary process type. (This scheme assumes there is not + * more than one of any auxiliary process type at a time.) + */ +#define NumProcSignalSlots (MaxBackends + NUM_AUXPROCTYPES) + +/* Check whether the relevant type bit is set in the flags. */ +#define BARRIER_SHOULD_CHECK(flags, type) \ + (((flags) & (((uint32) 1) << (uint32) (type))) != 0) + +/* Clear the relevant type bit from the flags. */ +#define BARRIER_CLEAR_BIT(flags, type) \ + ((flags) &= ~(((uint32) 1) << (uint32) (type))) + +static ProcSignalHeader *ProcSignal = NULL; +static ProcSignalSlot *MyProcSignalSlot = NULL; + +static bool CheckProcSignal(ProcSignalReason reason); +static void CleanupProcSignalState(int status, Datum arg); +static void ResetProcSignalBarrierBits(uint32 flags); +static bool ProcessBarrierPlaceholder(void); + +/* + * ProcSignalShmemSize + * Compute space needed for procsignal's shared memory + */ +Size +ProcSignalShmemSize(void) +{ + Size size; + + size = mul_size(NumProcSignalSlots, sizeof(ProcSignalSlot)); + size = add_size(size, offsetof(ProcSignalHeader, psh_slot)); + return size; +} + +/* + * ProcSignalShmemInit + * Allocate and initialize procsignal's shared memory + */ +void +ProcSignalShmemInit(void) +{ + Size size = ProcSignalShmemSize(); + bool found; + + ProcSignal = (ProcSignalHeader *) + ShmemInitStruct("ProcSignal", size, &found); + + /* If we're first, initialize. */ + if (!found) + { + int i; + + pg_atomic_init_u64(&ProcSignal->psh_barrierGeneration, 0); + + for (i = 0; i < NumProcSignalSlots; ++i) + { + ProcSignalSlot *slot = &ProcSignal->psh_slot[i]; + + slot->pss_pid = 0; + MemSet(slot->pss_signalFlags, 0, sizeof(slot->pss_signalFlags)); + pg_atomic_init_u64(&slot->pss_barrierGeneration, PG_UINT64_MAX); + pg_atomic_init_u32(&slot->pss_barrierCheckMask, 0); + ConditionVariableInit(&slot->pss_barrierCV); + } + } +} + +/* + * ProcSignalInit + * Register the current process in the procsignal array + * + * The passed index should be my BackendId if the process has one, + * or MaxBackends + aux process type if not. + */ +void +ProcSignalInit(int pss_idx) +{ + ProcSignalSlot *slot; + uint64 barrier_generation; + + Assert(pss_idx >= 1 && pss_idx <= NumProcSignalSlots); + + slot = &ProcSignal->psh_slot[pss_idx - 1]; + + /* sanity check */ + if (slot->pss_pid != 0) + elog(LOG, "process %d taking over ProcSignal slot %d, but it's not empty", + MyProcPid, pss_idx); + + /* Clear out any leftover signal reasons */ + MemSet(slot->pss_signalFlags, 0, NUM_PROCSIGNALS * sizeof(sig_atomic_t)); + + /* + * Initialize barrier state. 
Since we're a brand-new process, there + * shouldn't be any leftover backend-private state that needs to be + * updated. Therefore, we can broadcast the latest barrier generation and + * disregard any previously-set check bits. + * + * NB: This only works if this initialization happens early enough in the + * startup sequence that we haven't yet cached any state that might need + * to be invalidated. That's also why we have a memory barrier here, to be + * sure that any later reads of memory happen strictly after this. + */ + pg_atomic_write_u32(&slot->pss_barrierCheckMask, 0); + barrier_generation = + pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration); + pg_atomic_write_u64(&slot->pss_barrierGeneration, barrier_generation); + pg_memory_barrier(); + + /* Mark slot with my PID */ + slot->pss_pid = MyProcPid; + + /* Remember slot location for CheckProcSignal */ + MyProcSignalSlot = slot; + + /* Set up to release the slot on process exit */ + on_shmem_exit(CleanupProcSignalState, Int32GetDatum(pss_idx)); +} + +/* + * CleanupProcSignalState + * Remove current process from ProcSignal mechanism + * + * This function is called via on_shmem_exit() during backend shutdown. + */ +static void +CleanupProcSignalState(int status, Datum arg) +{ + int pss_idx = DatumGetInt32(arg); + ProcSignalSlot *slot; + + slot = &ProcSignal->psh_slot[pss_idx - 1]; + Assert(slot == MyProcSignalSlot); + + /* + * Clear MyProcSignalSlot, so that a SIGUSR1 received after this point + * won't try to access it after it's no longer ours (and perhaps even + * after we've unmapped the shared memory segment). + */ + MyProcSignalSlot = NULL; + + /* sanity check */ + if (slot->pss_pid != MyProcPid) + { + /* + * don't ERROR here. We're exiting anyway, and don't want to get into + * infinite loop trying to exit + */ + elog(LOG, "process %d releasing ProcSignal slot %d, but it contains %d", + MyProcPid, pss_idx, (int) slot->pss_pid); + return; /* XXX better to zero the slot anyway? */ + } + + /* + * Make this slot look like it's absorbed all possible barriers, so that + * no barrier waits block on it. + */ + pg_atomic_write_u64(&slot->pss_barrierGeneration, PG_UINT64_MAX); + ConditionVariableBroadcast(&slot->pss_barrierCV); + + slot->pss_pid = 0; +} + +/* + * SendProcSignal + * Send a signal to a Postgres process + * + * Providing backendId is optional, but it will speed up the operation. + * + * On success (a signal was sent), zero is returned. + * On error, -1 is returned, and errno is set (typically to ESRCH or EPERM). + * + * Not to be confused with ProcSendSignal + */ +int +SendProcSignal(pid_t pid, ProcSignalReason reason, BackendId backendId) +{ + volatile ProcSignalSlot *slot; + + if (backendId != InvalidBackendId) + { + slot = &ProcSignal->psh_slot[backendId - 1]; + + /* + * Note: Since there's no locking, it's possible that the target + * process detaches from shared memory and exits right after this + * test, before we set the flag and send signal. And the signal slot + * might even be recycled by a new process, so it's remotely possible + * that we set a flag for a wrong process. That's OK, all the signals + * are such that no harm is done if they're mistakenly fired. + */ + if (slot->pss_pid == pid) + { + /* Atomically set the proper flag */ + slot->pss_signalFlags[reason] = true; + /* Send signal */ + return kill(pid, SIGUSR1); + } + } + else + { + /* + * BackendId not provided, so search the array using pid. We search + * the array back to front so as to reduce search overhead. 
Passing + * InvalidBackendId means that the target is most likely an auxiliary + * process, which will have a slot near the end of the array. + */ + int i; + + for (i = NumProcSignalSlots - 1; i >= 0; i--) + { + slot = &ProcSignal->psh_slot[i]; + + if (slot->pss_pid == pid) + { + /* the above note about race conditions applies here too */ + + /* Atomically set the proper flag */ + slot->pss_signalFlags[reason] = true; + /* Send signal */ + return kill(pid, SIGUSR1); + } + } + } + + errno = ESRCH; + return -1; +} + +/* + * EmitProcSignalBarrier + * Send a signal to every Postgres process + * + * The return value of this function is the barrier "generation" created + * by this operation. This value can be passed to WaitForProcSignalBarrier + * to wait until it is known that every participant in the ProcSignal + * mechanism has absorbed the signal (or started afterwards). + * + * Note that it would be a bad idea to use this for anything that happens + * frequently, as interrupting every backend could cause a noticeable + * performance hit. + * + * Callers are entitled to assume that this function will not throw ERROR + * or FATAL. + */ +uint64 +EmitProcSignalBarrier(ProcSignalBarrierType type) +{ + uint32 flagbit = 1 << (uint32) type; + uint64 generation; + + /* + * Set all the flags. + * + * Note that pg_atomic_fetch_or_u32 has full barrier semantics, so this is + * totally ordered with respect to anything the caller did before, and + * anything that we do afterwards. (This is also true of the later call to + * pg_atomic_add_fetch_u64.) + */ + for (int i = 0; i < NumProcSignalSlots; i++) + { + volatile ProcSignalSlot *slot = &ProcSignal->psh_slot[i]; + + pg_atomic_fetch_or_u32(&slot->pss_barrierCheckMask, flagbit); + } + + /* + * Increment the generation counter. + */ + generation = + pg_atomic_add_fetch_u64(&ProcSignal->psh_barrierGeneration, 1); + + /* + * Signal all the processes, so that they update their advertised barrier + * generation. + * + * Concurrency is not a problem here. Backends that have exited don't + * matter, and new backends that have joined since we entered this + * function must already have current state, since the caller is + * responsible for making sure that the relevant state is entirely visible + * before calling this function in the first place. We still have to wake + * them up - because we can't distinguish between such backends and older + * backends that need to update state - but they won't actually need to + * change any state. + */ + for (int i = NumProcSignalSlots - 1; i >= 0; i--) + { + volatile ProcSignalSlot *slot = &ProcSignal->psh_slot[i]; + pid_t pid = slot->pss_pid; + + if (pid != 0) + { + /* see SendProcSignal for details */ + slot->pss_signalFlags[PROCSIG_BARRIER] = true; + kill(pid, SIGUSR1); + } + } + + return generation; +} + +/* + * WaitForProcSignalBarrier - wait until it is guaranteed that all changes + * requested by a specific call to EmitProcSignalBarrier() have taken effect. + */ +void +WaitForProcSignalBarrier(uint64 generation) +{ + Assert(generation <= pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration)); + + for (int i = NumProcSignalSlots - 1; i >= 0; i--) + { + ProcSignalSlot *slot = &ProcSignal->psh_slot[i]; + uint64 oldval; + + /* + * It's important that we check only pss_barrierGeneration here and + * not pss_barrierCheckMask. Bits in pss_barrierCheckMask get cleared + * before the barrier is actually absorbed, but pss_barrierGeneration + * is updated only afterward. 
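+ *
+ * (Put differently: a cleared check bit does not yet prove that the
+ * barrier has been absorbed; the generation bump is what proves it, so
+ * the generation is the thing to wait on.)
+ *
+ * For context, an illustrative (not literal) calling pattern for the two
+ * public functions looks like this:
+ *
+ *		gen = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_PLACEHOLDER);
+ *		WaitForProcSignalBarrier(gen);
+ *		... every live backend has now absorbed the barrier ...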
+ */ + oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration); + while (oldval < generation) + { + ConditionVariableSleep(&slot->pss_barrierCV, + WAIT_EVENT_PROC_SIGNAL_BARRIER); + oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration); + } + ConditionVariableCancelSleep(); + } + + /* + * The caller is probably calling this function because it wants to read + * the shared state or perform further writes to shared state once all + * backends are known to have absorbed the barrier. However, the read of + * pss_barrierGeneration was performed unlocked; insert a memory barrier + * to separate it from whatever follows. + */ + pg_memory_barrier(); +} + +/* + * Handle receipt of an interrupt indicating a global barrier event. + * + * All the actual work is deferred to ProcessProcSignalBarrier(), because we + * cannot safely access the barrier generation inside the signal handler as + * 64bit atomics might use spinlock based emulation, even for reads. As this + * routine only gets called when PROCSIG_BARRIER is sent that won't cause a + * lot of unnecessary work. + */ +static void +HandleProcSignalBarrierInterrupt(void) +{ + InterruptPending = true; + ProcSignalBarrierPending = true; + /* latch will be set by procsignal_sigusr1_handler */ +} + +/* + * Perform global barrier related interrupt checking. + * + * Any backend that participates in ProcSignal signaling must arrange to + * call this function periodically. It is called from CHECK_FOR_INTERRUPTS(), + * which is enough for normal backends, but not necessarily for all types of + * background processes. + */ +void +ProcessProcSignalBarrier(void) +{ + uint64 local_gen; + uint64 shared_gen; + volatile uint32 flags; + + Assert(MyProcSignalSlot); + + /* Exit quickly if there's no work to do. */ + if (!ProcSignalBarrierPending) + return; + ProcSignalBarrierPending = false; + + /* + * It's not unlikely to process multiple barriers at once, before the + * signals for all the barriers have arrived. To avoid unnecessary work in + * response to subsequent signals, exit early if we already have processed + * all of them. + */ + local_gen = pg_atomic_read_u64(&MyProcSignalSlot->pss_barrierGeneration); + shared_gen = pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration); + + Assert(local_gen <= shared_gen); + + if (local_gen == shared_gen) + return; + + /* + * Get and clear the flags that are set for this backend. Note that + * pg_atomic_exchange_u32 is a full barrier, so we're guaranteed that the + * read of the barrier generation above happens before we atomically + * extract the flags, and that any subsequent state changes happen + * afterward. + * + * NB: In order to avoid race conditions, we must zero + * pss_barrierCheckMask first and only afterwards try to do barrier + * processing. If we did it in the other order, someone could send us + * another barrier of some type right after we called the + * barrier-processing function but before we cleared the bit. We would + * have no way of knowing that the bit needs to stay set in that case, so + * the need to call the barrier-processing function again would just get + * forgotten. So instead, we tentatively clear all the bits and then put + * back any for which we don't manage to successfully absorb the barrier. + */ + flags = pg_atomic_exchange_u32(&MyProcSignalSlot->pss_barrierCheckMask, 0); + + /* + * If there are no flags set, then we can skip doing any real work. 
+ * Otherwise, establish a PG_TRY block, so that we don't lose track of + * which types of barrier processing are needed if an ERROR occurs. + */ + if (flags != 0) + { + bool success = true; + + PG_TRY(); + { + /* + * Process each type of barrier. The barrier-processing functions + * should normally return true, but may return false if the + * barrier can't be absorbed at the current time. This should be + * rare, because it's pretty expensive. Every single + * CHECK_FOR_INTERRUPTS() will return here until we manage to + * absorb the barrier, and that cost will add up in a hurry. + * + * NB: It ought to be OK to call the barrier-processing functions + * unconditionally, but it's more efficient to call only the ones + * that might need us to do something based on the flags. + */ + while (flags != 0) + { + ProcSignalBarrierType type; + bool processed = true; + + type = (ProcSignalBarrierType) pg_rightmost_one_pos32(flags); + switch (type) + { + case PROCSIGNAL_BARRIER_PLACEHOLDER: + processed = ProcessBarrierPlaceholder(); + break; + } + + /* + * To avoid an infinite loop, we must always unset the bit in + * flags. + */ + BARRIER_CLEAR_BIT(flags, type); + + /* + * If we failed to process the barrier, reset the shared bit + * so we try again later, and set a flag so that we don't bump + * our generation. + */ + if (!processed) + { + ResetProcSignalBarrierBits(((uint32) 1) << type); + success = false; + } + } + } + PG_CATCH(); + { + /* + * If an ERROR occurred, we'll need to try again later to handle + * that barrier type and any others that haven't been handled yet + * or weren't successfully absorbed. + */ + ResetProcSignalBarrierBits(flags); + PG_RE_THROW(); + } + PG_END_TRY(); + + /* + * If some barrier types were not successfully absorbed, we will have + * to try again later. + */ + if (!success) + return; + } + + /* + * State changes related to all types of barriers that might have been + * emitted have now been handled, so we can update our notion of the + * generation to the one we observed before beginning the updates. If + * things have changed further, it'll get fixed up when this function is + * next called. + */ + pg_atomic_write_u64(&MyProcSignalSlot->pss_barrierGeneration, shared_gen); + ConditionVariableBroadcast(&MyProcSignalSlot->pss_barrierCV); +} + +/* + * If it turns out that we couldn't absorb one or more barrier types, either + * because the barrier-processing functions returned false or due to an error, + * arrange for processing to be retried later. + */ +static void +ResetProcSignalBarrierBits(uint32 flags) +{ + pg_atomic_fetch_or_u32(&MyProcSignalSlot->pss_barrierCheckMask, flags); + ProcSignalBarrierPending = true; + InterruptPending = true; +} + +static bool +ProcessBarrierPlaceholder(void) +{ + /* + * XXX. This is just a placeholder until the first real user of this + * machinery gets committed. Rename PROCSIGNAL_BARRIER_PLACEHOLDER to + * PROCSIGNAL_BARRIER_SOMETHING_ELSE where SOMETHING_ELSE is something + * appropriately descriptive. Get rid of this function and instead have + * ProcessBarrierSomethingElse. Most likely, that function should live in + * the file pertaining to that subsystem, rather than here. + * + * The return value should be 'true' if the barrier was successfully + * absorbed and 'false' if not. Note that returning 'false' can lead to + * very frequent retries, so try hard to make that an uncommon case. + */ + return true; +} + +/* + * CheckProcSignal - check to see if a particular reason has been + * signaled, and clear the signal flag. 
Should be called after receiving + * SIGUSR1. + */ +static bool +CheckProcSignal(ProcSignalReason reason) +{ + volatile ProcSignalSlot *slot = MyProcSignalSlot; + + if (slot != NULL) + { + /* Careful here --- don't clear flag if we haven't seen it set */ + if (slot->pss_signalFlags[reason]) + { + slot->pss_signalFlags[reason] = false; + return true; + } + } + + return false; +} + +/* + * procsignal_sigusr1_handler - handle SIGUSR1 signal. + */ +void +procsignal_sigusr1_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + if (CheckProcSignal(PROCSIG_CATCHUP_INTERRUPT)) + HandleCatchupInterrupt(); + + if (CheckProcSignal(PROCSIG_NOTIFY_INTERRUPT)) + HandleNotifyInterrupt(); + + if (CheckProcSignal(PROCSIG_PARALLEL_MESSAGE)) + HandleParallelMessageInterrupt(); + + if (CheckProcSignal(PROCSIG_WALSND_INIT_STOPPING)) + HandleWalSndInitStopping(); + + if (CheckProcSignal(PROCSIG_BARRIER)) + HandleProcSignalBarrierInterrupt(); + + if (CheckProcSignal(PROCSIG_LOG_MEMORY_CONTEXT)) + HandleLogMemoryContextInterrupt(); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_DATABASE)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_DATABASE); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_TABLESPACE)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_TABLESPACE); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_LOCK)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_LOCK); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); + + SetLatch(MyLatch); + + errno = save_errno; +} diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c new file mode 100644 index 0000000..3240af4 --- /dev/null +++ b/src/backend/storage/ipc/shm_mq.c @@ -0,0 +1,1288 @@ +/*------------------------------------------------------------------------- + * + * shm_mq.c + * single-reader, single-writer shared memory message queue + * + * Both the sender and the receiver must have a PGPROC; their respective + * process latches are used for synchronization. Only the sender may send, + * and only the receiver may receive. This is intended to allow a user + * backend to communicate with worker backends that it has registered. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/storage/ipc/shm_mq.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "storage/procsignal.h" +#include "storage/shm_mq.h" +#include "storage/spin.h" +#include "utils/memutils.h" + +/* + * This structure represents the actual queue, stored in shared memory. + * + * Some notes on synchronization: + * + * mq_receiver and mq_bytes_read can only be changed by the receiver; and + * mq_sender and mq_bytes_written can only be changed by the sender. + * mq_receiver and mq_sender are protected by mq_mutex, although, importantly, + * they cannot change once set, and thus may be read without a lock once this + * is known to be the case. + * + * mq_bytes_read and mq_bytes_written are not protected by the mutex. 
Instead, + * they are written atomically using 8 byte loads and stores. Memory barriers + * must be carefully used to synchronize reads and writes of these values with + * reads and writes of the actual data in mq_ring. + * + * mq_detached needs no locking. It can be set by either the sender or the + * receiver, but only ever from false to true, so redundant writes don't + * matter. It is important that if we set mq_detached and then set the + * counterparty's latch, the counterparty must be certain to see the change + * after waking up. Since SetLatch begins with a memory barrier and ResetLatch + * ends with one, this should be OK. + * + * mq_ring_size and mq_ring_offset never change after initialization, and + * can therefore be read without the lock. + * + * Importantly, mq_ring can be safely read and written without a lock. + * At any given time, the difference between mq_bytes_read and + * mq_bytes_written defines the number of bytes within mq_ring that contain + * unread data, and mq_bytes_read defines the position where those bytes + * begin. The sender can increase the number of unread bytes at any time, + * but only the receiver can give license to overwrite those bytes, by + * incrementing mq_bytes_read. Therefore, it's safe for the receiver to read + * the unread bytes it knows to be present without the lock. Conversely, + * the sender can write to the unused portion of the ring buffer without + * the lock, because nobody else can be reading or writing those bytes. The + * receiver could be making more bytes unused by incrementing mq_bytes_read, + * but that's OK. Note that it would be unsafe for the receiver to read any + * data it's already marked as read, or to write any data; and it would be + * unsafe for the sender to reread any data after incrementing + * mq_bytes_written, but fortunately there's no need for any of that. + */ +struct shm_mq +{ + slock_t mq_mutex; + PGPROC *mq_receiver; + PGPROC *mq_sender; + pg_atomic_uint64 mq_bytes_read; + pg_atomic_uint64 mq_bytes_written; + Size mq_ring_size; + bool mq_detached; + uint8 mq_ring_offset; + char mq_ring[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * This structure is a backend-private handle for access to a queue. + * + * mqh_queue is a pointer to the queue we've attached, and mqh_segment is + * an optional pointer to the dynamic shared memory segment that contains it. + * (If mqh_segment is provided, we register an on_dsm_detach callback to + * make sure we detach from the queue before detaching from DSM.) + * + * If this queue is intended to connect the current process with a background + * worker that started it, the user can pass a pointer to the worker handle + * to shm_mq_attach(), and we'll store it in mqh_handle. The point of this + * is to allow us to begin sending to or receiving from that queue before the + * process we'll be communicating with has even been started. If it fails + * to start, the handle will allow us to notice that and fail cleanly, rather + * than waiting forever; see shm_mq_wait_internal. This is mostly useful in + * simple cases - e.g. where there are just 2 processes communicating; in + * more complex scenarios, every process may not have a BackgroundWorkerHandle + * available, or may need to watch for the failure of more than one other + * process at a time. + * + * When a message exists as a contiguous chunk of bytes in the queue - that is, + * it is smaller than the size of the ring buffer and does not wrap around + * the end - we return the message to the caller as a pointer into the buffer. 
+ * For messages that are larger or happen to wrap, we reassemble the message + * locally by copying the chunks into a backend-local buffer. mqh_buffer is + * the buffer, and mqh_buflen is the number of bytes allocated for it. + * + * mqh_partial_bytes, mqh_expected_bytes, and mqh_length_word_complete + * are used to track the state of non-blocking operations. When the caller + * attempts a non-blocking operation that returns SHM_MQ_WOULD_BLOCK, they + * are expected to retry the call at a later time with the same argument; + * we need to retain enough state to pick up where we left off. + * mqh_length_word_complete tracks whether we are done sending or receiving + * (whichever we're doing) the entire length word. mqh_partial_bytes tracks + * the number of bytes read or written for either the length word or the + * message itself, and mqh_expected_bytes - which is used only for reads - + * tracks the expected total size of the payload. + * + * mqh_counterparty_attached tracks whether we know the counterparty to have + * attached to the queue at some previous point. This lets us avoid some + * mutex acquisitions. + * + * mqh_context is the memory context in effect at the time we attached to + * the shm_mq. The shm_mq_handle itself is allocated in this context, and + * we make sure any other allocations we do happen in this context as well, + * to avoid nasty surprises. + */ +struct shm_mq_handle +{ + shm_mq *mqh_queue; + dsm_segment *mqh_segment; + BackgroundWorkerHandle *mqh_handle; + char *mqh_buffer; + Size mqh_buflen; + Size mqh_consume_pending; + Size mqh_partial_bytes; + Size mqh_expected_bytes; + bool mqh_length_word_complete; + bool mqh_counterparty_attached; + MemoryContext mqh_context; +}; + +static void shm_mq_detach_internal(shm_mq *mq); +static shm_mq_result shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, + const void *data, bool nowait, Size *bytes_written); +static shm_mq_result shm_mq_receive_bytes(shm_mq_handle *mqh, + Size bytes_needed, bool nowait, Size *nbytesp, + void **datap); +static bool shm_mq_counterparty_gone(shm_mq *mq, + BackgroundWorkerHandle *handle); +static bool shm_mq_wait_internal(shm_mq *mq, PGPROC **ptr, + BackgroundWorkerHandle *handle); +static void shm_mq_inc_bytes_read(shm_mq *mq, Size n); +static void shm_mq_inc_bytes_written(shm_mq *mq, Size n); +static void shm_mq_detach_callback(dsm_segment *seg, Datum arg); + +/* Minimum queue size is enough for header and at least one chunk of data. */ +const Size shm_mq_minimum_size = +MAXALIGN(offsetof(shm_mq, mq_ring)) + MAXIMUM_ALIGNOF; + +#define MQH_INITIAL_BUFSIZE 8192 + +/* + * Initialize a new shared message queue. + */ +shm_mq * +shm_mq_create(void *address, Size size) +{ + shm_mq *mq = address; + Size data_offset = MAXALIGN(offsetof(shm_mq, mq_ring)); + + /* If the size isn't MAXALIGN'd, just discard the odd bytes. */ + size = MAXALIGN_DOWN(size); + + /* Queue size must be large enough to hold some data. */ + Assert(size > data_offset); + + /* Initialize queue header. */ + SpinLockInit(&mq->mq_mutex); + mq->mq_receiver = NULL; + mq->mq_sender = NULL; + pg_atomic_init_u64(&mq->mq_bytes_read, 0); + pg_atomic_init_u64(&mq->mq_bytes_written, 0); + mq->mq_ring_size = size - data_offset; + mq->mq_detached = false; + mq->mq_ring_offset = data_offset - offsetof(shm_mq, mq_ring); + + return mq; +} + +/* + * Set the identity of the process that will receive from a shared message + * queue. 
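+ *
+ * (As a purely illustrative sketch, a leader that creates the queue in a
+ * DSM segment and wants to be the reader might do roughly:
+ *
+ *		mq = shm_mq_create(addr, size);
+ *		shm_mq_set_receiver(mq, MyProc);
+ *		mqh = shm_mq_attach(mq, seg, handle);
+ *
+ * while the worker on the other end later calls shm_mq_set_sender() and
+ * shm_mq_attach() on the same queue; addr, size, seg and handle stand in
+ * for whatever the caller actually has.)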
+ */ +void +shm_mq_set_receiver(shm_mq *mq, PGPROC *proc) +{ + PGPROC *sender; + + SpinLockAcquire(&mq->mq_mutex); + Assert(mq->mq_receiver == NULL); + mq->mq_receiver = proc; + sender = mq->mq_sender; + SpinLockRelease(&mq->mq_mutex); + + if (sender != NULL) + SetLatch(&sender->procLatch); +} + +/* + * Set the identity of the process that will send to a shared message queue. + */ +void +shm_mq_set_sender(shm_mq *mq, PGPROC *proc) +{ + PGPROC *receiver; + + SpinLockAcquire(&mq->mq_mutex); + Assert(mq->mq_sender == NULL); + mq->mq_sender = proc; + receiver = mq->mq_receiver; + SpinLockRelease(&mq->mq_mutex); + + if (receiver != NULL) + SetLatch(&receiver->procLatch); +} + +/* + * Get the configured receiver. + */ +PGPROC * +shm_mq_get_receiver(shm_mq *mq) +{ + PGPROC *receiver; + + SpinLockAcquire(&mq->mq_mutex); + receiver = mq->mq_receiver; + SpinLockRelease(&mq->mq_mutex); + + return receiver; +} + +/* + * Get the configured sender. + */ +PGPROC * +shm_mq_get_sender(shm_mq *mq) +{ + PGPROC *sender; + + SpinLockAcquire(&mq->mq_mutex); + sender = mq->mq_sender; + SpinLockRelease(&mq->mq_mutex); + + return sender; +} + +/* + * Attach to a shared message queue so we can send or receive messages. + * + * The memory context in effect at the time this function is called should + * be one which will last for at least as long as the message queue itself. + * We'll allocate the handle in that context, and future allocations that + * are needed to buffer incoming data will happen in that context as well. + * + * If seg != NULL, the queue will be automatically detached when that dynamic + * shared memory segment is detached. + * + * If handle != NULL, the queue can be read or written even before the + * other process has attached. We'll wait for it to do so if needed. The + * handle must be for a background worker initialized with bgw_notify_pid + * equal to our PID. + * + * shm_mq_detach() should be called when done. This will free the + * shm_mq_handle and mark the queue itself as detached, so that our + * counterpart won't get stuck waiting for us to fill or drain the queue + * after we've already lost interest. + */ +shm_mq_handle * +shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle) +{ + shm_mq_handle *mqh = palloc(sizeof(shm_mq_handle)); + + Assert(mq->mq_receiver == MyProc || mq->mq_sender == MyProc); + mqh->mqh_queue = mq; + mqh->mqh_segment = seg; + mqh->mqh_handle = handle; + mqh->mqh_buffer = NULL; + mqh->mqh_buflen = 0; + mqh->mqh_consume_pending = 0; + mqh->mqh_partial_bytes = 0; + mqh->mqh_expected_bytes = 0; + mqh->mqh_length_word_complete = false; + mqh->mqh_counterparty_attached = false; + mqh->mqh_context = CurrentMemoryContext; + + if (seg != NULL) + on_dsm_detach(seg, shm_mq_detach_callback, PointerGetDatum(mq)); + + return mqh; +} + +/* + * Associate a BackgroundWorkerHandle with a shm_mq_handle just as if it had + * been passed to shm_mq_attach. + */ +void +shm_mq_set_handle(shm_mq_handle *mqh, BackgroundWorkerHandle *handle) +{ + Assert(mqh->mqh_handle == NULL); + mqh->mqh_handle = handle; +} + +/* + * Write a message into a shared message queue. + */ +shm_mq_result +shm_mq_send(shm_mq_handle *mqh, Size nbytes, const void *data, bool nowait) +{ + shm_mq_iovec iov; + + iov.data = data; + iov.len = nbytes; + + return shm_mq_sendv(mqh, &iov, 1, nowait); +} + +/* + * Write a message into a shared message queue, gathered from multiple + * addresses. 
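+ *
+ * (Illustrative sketch only: a caller whose message header and payload
+ * live at different addresses can describe both pieces in an iovec array
+ * and hand them over in a single call:
+ *
+ *		shm_mq_iovec	iov[2];
+ *
+ *		iov[0].data = (const char *) &hdr;
+ *		iov[0].len = sizeof(hdr);
+ *		iov[1].data = payload;
+ *		iov[1].len = payload_len;
+ *		res = shm_mq_sendv(mqh, iov, 2, false);
+ *
+ * hdr, payload and payload_len are placeholders for whatever the caller
+ * actually has.)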
+ * + * When nowait = false, we'll wait on our process latch when the ring buffer + * fills up, and then continue writing once the receiver has drained some data. + * The process latch is reset after each wait. + * + * When nowait = true, we do not manipulate the state of the process latch; + * instead, if the buffer becomes full, we return SHM_MQ_WOULD_BLOCK. In + * this case, the caller should call this function again, with the same + * arguments, each time the process latch is set. (Once begun, the sending + * of a message cannot be aborted except by detaching from the queue; changing + * the length or payload will corrupt the queue.) + */ +shm_mq_result +shm_mq_sendv(shm_mq_handle *mqh, shm_mq_iovec *iov, int iovcnt, bool nowait) +{ + shm_mq_result res; + shm_mq *mq = mqh->mqh_queue; + PGPROC *receiver; + Size nbytes = 0; + Size bytes_written; + int i; + int which_iov = 0; + Size offset; + + Assert(mq->mq_sender == MyProc); + + /* Compute total size of write. */ + for (i = 0; i < iovcnt; ++i) + nbytes += iov[i].len; + + /* Prevent writing messages overwhelming the receiver. */ + if (nbytes > MaxAllocSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot send a message of size %zu via shared memory queue", + nbytes))); + + /* Try to write, or finish writing, the length word into the buffer. */ + while (!mqh->mqh_length_word_complete) + { + Assert(mqh->mqh_partial_bytes < sizeof(Size)); + res = shm_mq_send_bytes(mqh, sizeof(Size) - mqh->mqh_partial_bytes, + ((char *) &nbytes) + mqh->mqh_partial_bytes, + nowait, &bytes_written); + + if (res == SHM_MQ_DETACHED) + { + /* Reset state in case caller tries to send another message. */ + mqh->mqh_partial_bytes = 0; + mqh->mqh_length_word_complete = false; + return res; + } + mqh->mqh_partial_bytes += bytes_written; + + if (mqh->mqh_partial_bytes >= sizeof(Size)) + { + Assert(mqh->mqh_partial_bytes == sizeof(Size)); + + mqh->mqh_partial_bytes = 0; + mqh->mqh_length_word_complete = true; + } + + if (res != SHM_MQ_SUCCESS) + return res; + + /* Length word can't be split unless bigger than required alignment. */ + Assert(mqh->mqh_length_word_complete || sizeof(Size) > MAXIMUM_ALIGNOF); + } + + /* Write the actual data bytes into the buffer. */ + Assert(mqh->mqh_partial_bytes <= nbytes); + offset = mqh->mqh_partial_bytes; + do + { + Size chunksize; + + /* Figure out which bytes need to be sent next. */ + if (offset >= iov[which_iov].len) + { + offset -= iov[which_iov].len; + ++which_iov; + if (which_iov >= iovcnt) + break; + continue; + } + + /* + * We want to avoid copying the data if at all possible, but every + * chunk of bytes we write into the queue has to be MAXALIGN'd, except + * the last. Thus, if a chunk other than the last one ends on a + * non-MAXALIGN'd boundary, we have to combine the tail end of its + * data with data from one or more following chunks until we either + * reach the last chunk or accumulate a number of bytes which is + * MAXALIGN'd. + */ + if (which_iov + 1 < iovcnt && + offset + MAXIMUM_ALIGNOF > iov[which_iov].len) + { + char tmpbuf[MAXIMUM_ALIGNOF]; + int j = 0; + + for (;;) + { + if (offset < iov[which_iov].len) + { + tmpbuf[j] = iov[which_iov].data[offset]; + j++; + offset++; + if (j == MAXIMUM_ALIGNOF) + break; + } + else + { + offset -= iov[which_iov].len; + which_iov++; + if (which_iov >= iovcnt) + break; + } + } + + res = shm_mq_send_bytes(mqh, j, tmpbuf, nowait, &bytes_written); + + if (res == SHM_MQ_DETACHED) + { + /* Reset state in case caller tries to send another message. 
*/ + mqh->mqh_partial_bytes = 0; + mqh->mqh_length_word_complete = false; + return res; + } + + mqh->mqh_partial_bytes += bytes_written; + if (res != SHM_MQ_SUCCESS) + return res; + continue; + } + + /* + * If this is the last chunk, we can write all the data, even if it + * isn't a multiple of MAXIMUM_ALIGNOF. Otherwise, we need to + * MAXALIGN_DOWN the write size. + */ + chunksize = iov[which_iov].len - offset; + if (which_iov + 1 < iovcnt) + chunksize = MAXALIGN_DOWN(chunksize); + res = shm_mq_send_bytes(mqh, chunksize, &iov[which_iov].data[offset], + nowait, &bytes_written); + + if (res == SHM_MQ_DETACHED) + { + /* Reset state in case caller tries to send another message. */ + mqh->mqh_length_word_complete = false; + mqh->mqh_partial_bytes = 0; + return res; + } + + mqh->mqh_partial_bytes += bytes_written; + offset += bytes_written; + if (res != SHM_MQ_SUCCESS) + return res; + } while (mqh->mqh_partial_bytes < nbytes); + + /* Reset for next message. */ + mqh->mqh_partial_bytes = 0; + mqh->mqh_length_word_complete = false; + + /* If queue has been detached, let caller know. */ + if (mq->mq_detached) + return SHM_MQ_DETACHED; + + /* + * If the counterparty is known to have attached, we can read mq_receiver + * without acquiring the spinlock and assume it isn't NULL. Otherwise, + * more caution is needed. + */ + if (mqh->mqh_counterparty_attached) + receiver = mq->mq_receiver; + else + { + SpinLockAcquire(&mq->mq_mutex); + receiver = mq->mq_receiver; + SpinLockRelease(&mq->mq_mutex); + if (receiver == NULL) + return SHM_MQ_SUCCESS; + mqh->mqh_counterparty_attached = true; + } + + /* Notify receiver of the newly-written data, and return. */ + SetLatch(&receiver->procLatch); + return SHM_MQ_SUCCESS; +} + +/* + * Receive a message from a shared message queue. + * + * We set *nbytes to the message length and *data to point to the message + * payload. If the entire message exists in the queue as a single, + * contiguous chunk, *data will point directly into shared memory; otherwise, + * it will point to a temporary buffer. This mostly avoids data copying in + * the hoped-for case where messages are short compared to the buffer size, + * while still allowing longer messages. In either case, the return value + * remains valid until the next receive operation is performed on the queue. + * + * When nowait = false, we'll wait on our process latch when the ring buffer + * is empty and we have not yet received a full message. The sender will + * set our process latch after more data has been written, and we'll resume + * processing. Each call will therefore return a complete message + * (unless the sender detaches the queue). + * + * When nowait = true, we do not manipulate the state of the process latch; + * instead, whenever the buffer is empty and we need to read from it, we + * return SHM_MQ_WOULD_BLOCK. In this case, the caller should call this + * function again after the process latch has been set. + */ +shm_mq_result +shm_mq_receive(shm_mq_handle *mqh, Size *nbytesp, void **datap, bool nowait) +{ + shm_mq *mq = mqh->mqh_queue; + shm_mq_result res; + Size rb = 0; + Size nbytes; + void *rawdata; + + Assert(mq->mq_receiver == MyProc); + + /* We can't receive data until the sender has attached. */ + if (!mqh->mqh_counterparty_attached) + { + if (nowait) + { + int counterparty_gone; + + /* + * We shouldn't return at this point at all unless the sender + * hasn't attached yet. However, the correct return value depends + * on whether the sender is still attached. 
If we first test + * whether the sender has ever attached and then test whether the + * sender has detached, there's a race condition: a sender that + * attaches and detaches very quickly might fool us into thinking + * the sender never attached at all. So, test whether our + * counterparty is definitively gone first, and only afterwards + * check whether the sender ever attached in the first place. + */ + counterparty_gone = shm_mq_counterparty_gone(mq, mqh->mqh_handle); + if (shm_mq_get_sender(mq) == NULL) + { + if (counterparty_gone) + return SHM_MQ_DETACHED; + else + return SHM_MQ_WOULD_BLOCK; + } + } + else if (!shm_mq_wait_internal(mq, &mq->mq_sender, mqh->mqh_handle) + && shm_mq_get_sender(mq) == NULL) + { + mq->mq_detached = true; + return SHM_MQ_DETACHED; + } + mqh->mqh_counterparty_attached = true; + } + + /* + * If we've consumed an amount of data greater than 1/4th of the ring + * size, mark it consumed in shared memory. We try to avoid doing this + * unnecessarily when only a small amount of data has been consumed, + * because SetLatch() is fairly expensive and we don't want to do it too + * often. + */ + if (mqh->mqh_consume_pending > mq->mq_ring_size / 4) + { + shm_mq_inc_bytes_read(mq, mqh->mqh_consume_pending); + mqh->mqh_consume_pending = 0; + } + + /* Try to read, or finish reading, the length word from the buffer. */ + while (!mqh->mqh_length_word_complete) + { + /* Try to receive the message length word. */ + Assert(mqh->mqh_partial_bytes < sizeof(Size)); + res = shm_mq_receive_bytes(mqh, sizeof(Size) - mqh->mqh_partial_bytes, + nowait, &rb, &rawdata); + if (res != SHM_MQ_SUCCESS) + return res; + + /* + * Hopefully, we'll receive the entire message length word at once. + * But if sizeof(Size) > MAXIMUM_ALIGNOF, then it might be split over + * multiple reads. + */ + if (mqh->mqh_partial_bytes == 0 && rb >= sizeof(Size)) + { + Size needed; + + nbytes = *(Size *) rawdata; + + /* If we've already got the whole message, we're done. */ + needed = MAXALIGN(sizeof(Size)) + MAXALIGN(nbytes); + if (rb >= needed) + { + mqh->mqh_consume_pending += needed; + *nbytesp = nbytes; + *datap = ((char *) rawdata) + MAXALIGN(sizeof(Size)); + return SHM_MQ_SUCCESS; + } + + /* + * We don't have the whole message, but we at least have the whole + * length word. + */ + mqh->mqh_expected_bytes = nbytes; + mqh->mqh_length_word_complete = true; + mqh->mqh_consume_pending += MAXALIGN(sizeof(Size)); + rb -= MAXALIGN(sizeof(Size)); + } + else + { + Size lengthbytes; + + /* Can't be split unless bigger than required alignment. */ + Assert(sizeof(Size) > MAXIMUM_ALIGNOF); + + /* Message word is split; need buffer to reassemble. */ + if (mqh->mqh_buffer == NULL) + { + mqh->mqh_buffer = MemoryContextAlloc(mqh->mqh_context, + MQH_INITIAL_BUFSIZE); + mqh->mqh_buflen = MQH_INITIAL_BUFSIZE; + } + Assert(mqh->mqh_buflen >= sizeof(Size)); + + /* Copy partial length word; remember to consume it. */ + if (mqh->mqh_partial_bytes + rb > sizeof(Size)) + lengthbytes = sizeof(Size) - mqh->mqh_partial_bytes; + else + lengthbytes = rb; + memcpy(&mqh->mqh_buffer[mqh->mqh_partial_bytes], rawdata, + lengthbytes); + mqh->mqh_partial_bytes += lengthbytes; + mqh->mqh_consume_pending += MAXALIGN(lengthbytes); + rb -= lengthbytes; + + /* If we now have the whole word, we're ready to read payload. 
*/ + if (mqh->mqh_partial_bytes >= sizeof(Size)) + { + Assert(mqh->mqh_partial_bytes == sizeof(Size)); + mqh->mqh_expected_bytes = *(Size *) mqh->mqh_buffer; + mqh->mqh_length_word_complete = true; + mqh->mqh_partial_bytes = 0; + } + } + } + nbytes = mqh->mqh_expected_bytes; + + /* + * Should be disallowed on the sending side already, but better check and + * error out on the receiver side as well rather than trying to read a + * prohibitively large message. + */ + if (nbytes > MaxAllocSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("invalid message size %zu in shared memory queue", + nbytes))); + + if (mqh->mqh_partial_bytes == 0) + { + /* + * Try to obtain the whole message in a single chunk. If this works, + * we need not copy the data and can return a pointer directly into + * shared memory. + */ + res = shm_mq_receive_bytes(mqh, nbytes, nowait, &rb, &rawdata); + if (res != SHM_MQ_SUCCESS) + return res; + if (rb >= nbytes) + { + mqh->mqh_length_word_complete = false; + mqh->mqh_consume_pending += MAXALIGN(nbytes); + *nbytesp = nbytes; + *datap = rawdata; + return SHM_MQ_SUCCESS; + } + + /* + * The message has wrapped the buffer. We'll need to copy it in order + * to return it to the client in one chunk. First, make sure we have + * a large enough buffer available. + */ + if (mqh->mqh_buflen < nbytes) + { + Size newbuflen = Max(mqh->mqh_buflen, MQH_INITIAL_BUFSIZE); + + /* + * Double the buffer size until the payload fits, but limit to + * MaxAllocSize. + */ + while (newbuflen < nbytes) + newbuflen *= 2; + newbuflen = Min(newbuflen, MaxAllocSize); + + if (mqh->mqh_buffer != NULL) + { + pfree(mqh->mqh_buffer); + mqh->mqh_buffer = NULL; + mqh->mqh_buflen = 0; + } + mqh->mqh_buffer = MemoryContextAlloc(mqh->mqh_context, newbuflen); + mqh->mqh_buflen = newbuflen; + } + } + + /* Loop until we've copied the entire message. */ + for (;;) + { + Size still_needed; + + /* Copy as much as we can. */ + Assert(mqh->mqh_partial_bytes + rb <= nbytes); + if (rb > 0) + { + memcpy(&mqh->mqh_buffer[mqh->mqh_partial_bytes], rawdata, rb); + mqh->mqh_partial_bytes += rb; + } + + /* + * Update count of bytes that can be consumed, accounting for + * alignment padding. Note that this will never actually insert any + * padding except at the end of a message, because the buffer size is + * a multiple of MAXIMUM_ALIGNOF, and each read and write is as well. + */ + Assert(mqh->mqh_partial_bytes == nbytes || rb == MAXALIGN(rb)); + mqh->mqh_consume_pending += MAXALIGN(rb); + + /* If we got all the data, exit the loop. */ + if (mqh->mqh_partial_bytes >= nbytes) + break; + + /* Wait for some more data. */ + still_needed = nbytes - mqh->mqh_partial_bytes; + res = shm_mq_receive_bytes(mqh, still_needed, nowait, &rb, &rawdata); + if (res != SHM_MQ_SUCCESS) + return res; + if (rb > still_needed) + rb = still_needed; + } + + /* Return the complete message, and reset for next message. */ + *nbytesp = nbytes; + *datap = mqh->mqh_buffer; + mqh->mqh_length_word_complete = false; + mqh->mqh_partial_bytes = 0; + return SHM_MQ_SUCCESS; +} + +/* + * Wait for the other process that's supposed to use this queue to attach + * to it. + * + * The return value is SHM_MQ_DETACHED if the worker has already detached or + * if it dies; it is SHM_MQ_SUCCESS if we detect that the worker has attached. + * Note that we will only be able to detect that the worker has died before + * attaching if a background worker handle was passed to shm_mq_attach(). 
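+ *
+ * (So, purely as an illustration, a leader that registered the worker
+ * itself and passed its BackgroundWorkerHandle can simply do
+ *
+ *		if (shm_mq_wait_for_attach(mqh) != SHM_MQ_SUCCESS)
+ *			... give up; the worker died or was never started ...
+ *
+ * instead of guessing how long to wait.)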
+ */ +shm_mq_result +shm_mq_wait_for_attach(shm_mq_handle *mqh) +{ + shm_mq *mq = mqh->mqh_queue; + PGPROC **victim; + + if (shm_mq_get_receiver(mq) == MyProc) + victim = &mq->mq_sender; + else + { + Assert(shm_mq_get_sender(mq) == MyProc); + victim = &mq->mq_receiver; + } + + if (shm_mq_wait_internal(mq, victim, mqh->mqh_handle)) + return SHM_MQ_SUCCESS; + else + return SHM_MQ_DETACHED; +} + +/* + * Detach from a shared message queue, and destroy the shm_mq_handle. + */ +void +shm_mq_detach(shm_mq_handle *mqh) +{ + /* Notify counterparty that we're outta here. */ + shm_mq_detach_internal(mqh->mqh_queue); + + /* Cancel on_dsm_detach callback, if any. */ + if (mqh->mqh_segment) + cancel_on_dsm_detach(mqh->mqh_segment, + shm_mq_detach_callback, + PointerGetDatum(mqh->mqh_queue)); + + /* Release local memory associated with handle. */ + if (mqh->mqh_buffer != NULL) + pfree(mqh->mqh_buffer); + pfree(mqh); +} + +/* + * Notify counterparty that we're detaching from shared message queue. + * + * The purpose of this function is to make sure that the process + * with which we're communicating doesn't block forever waiting for us to + * fill or drain the queue once we've lost interest. When the sender + * detaches, the receiver can read any messages remaining in the queue; + * further reads will return SHM_MQ_DETACHED. If the receiver detaches, + * further attempts to send messages will likewise return SHM_MQ_DETACHED. + * + * This is separated out from shm_mq_detach() because if the on_dsm_detach + * callback fires, we only want to do this much. We do not try to touch + * the local shm_mq_handle, as it may have been pfree'd already. + */ +static void +shm_mq_detach_internal(shm_mq *mq) +{ + PGPROC *victim; + + SpinLockAcquire(&mq->mq_mutex); + if (mq->mq_sender == MyProc) + victim = mq->mq_receiver; + else + { + Assert(mq->mq_receiver == MyProc); + victim = mq->mq_sender; + } + mq->mq_detached = true; + SpinLockRelease(&mq->mq_mutex); + + if (victim != NULL) + SetLatch(&victim->procLatch); +} + +/* + * Get the shm_mq from handle. + */ +shm_mq * +shm_mq_get_queue(shm_mq_handle *mqh) +{ + return mqh->mqh_queue; +} + +/* + * Write bytes into a shared message queue. + */ +static shm_mq_result +shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data, + bool nowait, Size *bytes_written) +{ + shm_mq *mq = mqh->mqh_queue; + Size sent = 0; + uint64 used; + Size ringsize = mq->mq_ring_size; + Size available; + + while (sent < nbytes) + { + uint64 rb; + uint64 wb; + + /* Compute number of ring buffer bytes used and available. */ + rb = pg_atomic_read_u64(&mq->mq_bytes_read); + wb = pg_atomic_read_u64(&mq->mq_bytes_written); + Assert(wb >= rb); + used = wb - rb; + Assert(used <= ringsize); + available = Min(ringsize - used, nbytes - sent); + + /* + * Bail out if the queue has been detached. Note that we would be in + * trouble if the compiler decided to cache the value of + * mq->mq_detached in a register or on the stack across loop + * iterations. It probably shouldn't do that anyway since we'll + * always return, call an external function that performs a system + * call, or reach a memory barrier at some point later in the loop, + * but just to be sure, insert a compiler barrier here. + */ + pg_compiler_barrier(); + if (mq->mq_detached) + { + *bytes_written = sent; + return SHM_MQ_DETACHED; + } + + if (available == 0 && !mqh->mqh_counterparty_attached) + { + /* + * The queue is full, so if the receiver isn't yet known to be + * attached, we must wait for that to happen. 
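+ * In nowait mode that just means checking: if the counterparty is known
+ * to be gone we return SHM_MQ_DETACHED, and if it simply hasn't attached
+ * yet we return SHM_MQ_WOULD_BLOCK. In blocking mode we wait in
+ * shm_mq_wait_internal() until the receiver shows up, failing only if the
+ * worker dies or the queue is detached in the meantime.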
+ */ + if (nowait) + { + if (shm_mq_counterparty_gone(mq, mqh->mqh_handle)) + { + *bytes_written = sent; + return SHM_MQ_DETACHED; + } + if (shm_mq_get_receiver(mq) == NULL) + { + *bytes_written = sent; + return SHM_MQ_WOULD_BLOCK; + } + } + else if (!shm_mq_wait_internal(mq, &mq->mq_receiver, + mqh->mqh_handle)) + { + mq->mq_detached = true; + *bytes_written = sent; + return SHM_MQ_DETACHED; + } + mqh->mqh_counterparty_attached = true; + + /* + * The receiver may have read some data after attaching, so we + * must not wait without rechecking the queue state. + */ + } + else if (available == 0) + { + /* + * Since mq->mqh_counterparty_attached is known to be true at this + * point, mq_receiver has been set, and it can't change once set. + * Therefore, we can read it without acquiring the spinlock. + */ + Assert(mqh->mqh_counterparty_attached); + SetLatch(&mq->mq_receiver->procLatch); + + /* Skip manipulation of our latch if nowait = true. */ + if (nowait) + { + *bytes_written = sent; + return SHM_MQ_WOULD_BLOCK; + } + + /* + * Wait for our latch to be set. It might already be set for some + * unrelated reason, but that'll just result in one extra trip + * through the loop. It's worth it to avoid resetting the latch + * at top of loop, because setting an already-set latch is much + * cheaper than setting one that has been reset. + */ + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + WAIT_EVENT_MQ_SEND); + + /* Reset the latch so we don't spin. */ + ResetLatch(MyLatch); + + /* An interrupt may have occurred while we were waiting. */ + CHECK_FOR_INTERRUPTS(); + } + else + { + Size offset; + Size sendnow; + + offset = wb % (uint64) ringsize; + sendnow = Min(available, ringsize - offset); + + /* + * Write as much data as we can via a single memcpy(). Make sure + * these writes happen after the read of mq_bytes_read, above. + * This barrier pairs with the one in shm_mq_inc_bytes_read. + * (Since we're separating the read of mq_bytes_read from a + * subsequent write to mq_ring, we need a full barrier here.) + */ + pg_memory_barrier(); + memcpy(&mq->mq_ring[mq->mq_ring_offset + offset], + (char *) data + sent, sendnow); + sent += sendnow; + + /* + * Update count of bytes written, with alignment padding. Note + * that this will never actually insert any padding except at the + * end of a run of bytes, because the buffer size is a multiple of + * MAXIMUM_ALIGNOF, and each read is as well. + */ + Assert(sent == nbytes || sendnow == MAXALIGN(sendnow)); + shm_mq_inc_bytes_written(mq, MAXALIGN(sendnow)); + + /* + * For efficiency, we don't set the reader's latch here. We'll do + * that only when the buffer fills up or after writing an entire + * message. + */ + } + } + + *bytes_written = sent; + return SHM_MQ_SUCCESS; +} + +/* + * Wait until at least *nbytesp bytes are available to be read from the + * shared message queue, or until the buffer wraps around. If the queue is + * detached, returns SHM_MQ_DETACHED. If nowait is specified and a wait + * would be required, returns SHM_MQ_WOULD_BLOCK. Otherwise, *datap is set + * to the location at which data bytes can be read, *nbytesp is set to the + * number of bytes which can be read at that address, and the return value + * is SHM_MQ_SUCCESS. 
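+ *
+ * (A concrete example with made-up numbers: with an 8192-byte ring,
+ * mq_bytes_read = 6144 and mq_bytes_written = 9216, there are 3072 unread
+ * bytes starting at ring offset 6144, but only 2048 of them are contiguous
+ * before the wraparound; *nbytesp would therefore be 2048 and the caller
+ * comes back for the remaining 1024 afterwards.)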
+ */ +static shm_mq_result +shm_mq_receive_bytes(shm_mq_handle *mqh, Size bytes_needed, bool nowait, + Size *nbytesp, void **datap) +{ + shm_mq *mq = mqh->mqh_queue; + Size ringsize = mq->mq_ring_size; + uint64 used; + uint64 written; + + for (;;) + { + Size offset; + uint64 read; + + /* Get bytes written, so we can compute what's available to read. */ + written = pg_atomic_read_u64(&mq->mq_bytes_written); + + /* + * Get bytes read. Include bytes we could consume but have not yet + * consumed. + */ + read = pg_atomic_read_u64(&mq->mq_bytes_read) + + mqh->mqh_consume_pending; + used = written - read; + Assert(used <= ringsize); + offset = read % (uint64) ringsize; + + /* If we have enough data or buffer has wrapped, we're done. */ + if (used >= bytes_needed || offset + used >= ringsize) + { + *nbytesp = Min(used, ringsize - offset); + *datap = &mq->mq_ring[mq->mq_ring_offset + offset]; + + /* + * Separate the read of mq_bytes_written, above, from caller's + * attempt to read the data itself. Pairs with the barrier in + * shm_mq_inc_bytes_written. + */ + pg_read_barrier(); + return SHM_MQ_SUCCESS; + } + + /* + * Fall out before waiting if the queue has been detached. + * + * Note that we don't check for this until *after* considering whether + * the data already available is enough, since the receiver can finish + * receiving a message stored in the buffer even after the sender has + * detached. + */ + if (mq->mq_detached) + { + /* + * If the writer advanced mq_bytes_written and then set + * mq_detached, we might not have read the final value of + * mq_bytes_written above. Insert a read barrier and then check + * again if mq_bytes_written has advanced. + */ + pg_read_barrier(); + if (written != pg_atomic_read_u64(&mq->mq_bytes_written)) + continue; + + return SHM_MQ_DETACHED; + } + + /* + * We didn't get enough data to satisfy the request, so mark any data + * previously-consumed as read to make more buffer space. + */ + if (mqh->mqh_consume_pending > 0) + { + shm_mq_inc_bytes_read(mq, mqh->mqh_consume_pending); + mqh->mqh_consume_pending = 0; + } + + /* Skip manipulation of our latch if nowait = true. */ + if (nowait) + return SHM_MQ_WOULD_BLOCK; + + /* + * Wait for our latch to be set. It might already be set for some + * unrelated reason, but that'll just result in one extra trip through + * the loop. It's worth it to avoid resetting the latch at top of + * loop, because setting an already-set latch is much cheaper than + * setting one that has been reset. + */ + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + WAIT_EVENT_MQ_RECEIVE); + + /* Reset the latch so we don't spin. */ + ResetLatch(MyLatch); + + /* An interrupt may have occurred while we were waiting. */ + CHECK_FOR_INTERRUPTS(); + } +} + +/* + * Test whether a counterparty who may not even be alive yet is definitely gone. + */ +static bool +shm_mq_counterparty_gone(shm_mq *mq, BackgroundWorkerHandle *handle) +{ + pid_t pid; + + /* If the queue has been detached, counterparty is definitely gone. */ + if (mq->mq_detached) + return true; + + /* If there's a handle, check worker status. */ + if (handle != NULL) + { + BgwHandleStatus status; + + /* Check for unexpected worker death. */ + status = GetBackgroundWorkerPid(handle, &pid); + if (status != BGWH_STARTED && status != BGWH_NOT_YET_STARTED) + { + /* Mark it detached, just to make it official. */ + mq->mq_detached = true; + return true; + } + } + + /* Counterparty is not definitively gone. 
*/ + return false; +} + +/* + * This is used when a process is waiting for its counterpart to attach to the + * queue. We exit when the other process attaches as expected, or, if + * handle != NULL, when the referenced background process or the postmaster + * dies. Note that if handle == NULL, and the process fails to attach, we'll + * potentially get stuck here forever waiting for a process that may never + * start. We do check for interrupts, though. + * + * ptr is a pointer to the memory address that we're expecting to become + * non-NULL when our counterpart attaches to the queue. + */ +static bool +shm_mq_wait_internal(shm_mq *mq, PGPROC **ptr, BackgroundWorkerHandle *handle) +{ + bool result = false; + + for (;;) + { + BgwHandleStatus status; + pid_t pid; + + /* Acquire the lock just long enough to check the pointer. */ + SpinLockAcquire(&mq->mq_mutex); + result = (*ptr != NULL); + SpinLockRelease(&mq->mq_mutex); + + /* Fail if detached; else succeed if initialized. */ + if (mq->mq_detached) + { + result = false; + break; + } + if (result) + break; + + if (handle != NULL) + { + /* Check for unexpected worker death. */ + status = GetBackgroundWorkerPid(handle, &pid); + if (status != BGWH_STARTED && status != BGWH_NOT_YET_STARTED) + { + result = false; + break; + } + } + + /* Wait to be signaled. */ + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + WAIT_EVENT_MQ_INTERNAL); + + /* Reset the latch so we don't spin. */ + ResetLatch(MyLatch); + + /* An interrupt may have occurred while we were waiting. */ + CHECK_FOR_INTERRUPTS(); + } + + return result; +} + +/* + * Increment the number of bytes read. + */ +static void +shm_mq_inc_bytes_read(shm_mq *mq, Size n) +{ + PGPROC *sender; + + /* + * Separate prior reads of mq_ring from the increment of mq_bytes_read + * which follows. This pairs with the full barrier in + * shm_mq_send_bytes(). We only need a read barrier here because the + * increment of mq_bytes_read is actually a read followed by a dependent + * write. + */ + pg_read_barrier(); + + /* + * There's no need to use pg_atomic_fetch_add_u64 here, because nobody + * else can be changing this value. This method should be cheaper. + */ + pg_atomic_write_u64(&mq->mq_bytes_read, + pg_atomic_read_u64(&mq->mq_bytes_read) + n); + + /* + * We shouldn't have any bytes to read without a sender, so we can read + * mq_sender here without a lock. Once it's initialized, it can't change. + */ + sender = mq->mq_sender; + Assert(sender != NULL); + SetLatch(&sender->procLatch); +} + +/* + * Increment the number of bytes written. + */ +static void +shm_mq_inc_bytes_written(shm_mq *mq, Size n) +{ + /* + * Separate prior reads of mq_ring from the write of mq_bytes_written + * which we're about to do. Pairs with the read barrier found in + * shm_mq_receive_bytes. + */ + pg_write_barrier(); + + /* + * There's no need to use pg_atomic_fetch_add_u64 here, because nobody + * else can be changing this value. This method avoids taking the bus + * lock unnecessarily. + */ + pg_atomic_write_u64(&mq->mq_bytes_written, + pg_atomic_read_u64(&mq->mq_bytes_written) + n); +} + +/* Shim for on_dsm_detach callback. 
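+ * If the DSM segment containing the queue is detached before we detached
+ * explicitly, this still notifies the counterparty that we are gone
+ * instead of leaving it waiting forever.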
*/ +static void +shm_mq_detach_callback(dsm_segment *seg, Datum arg) +{ + shm_mq *mq = (shm_mq *) DatumGetPointer(arg); + + shm_mq_detach_internal(mq); +} diff --git a/src/backend/storage/ipc/shm_toc.c b/src/backend/storage/ipc/shm_toc.c new file mode 100644 index 0000000..863b98b --- /dev/null +++ b/src/backend/storage/ipc/shm_toc.c @@ -0,0 +1,272 @@ +/*------------------------------------------------------------------------- + * + * shm_toc.c + * shared memory segment table of contents + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/storage/ipc/shm_toc.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "port/atomics.h" +#include "storage/shm_toc.h" +#include "storage/spin.h" + +typedef struct shm_toc_entry +{ + uint64 key; /* Arbitrary identifier */ + Size offset; /* Offset, in bytes, from TOC start */ +} shm_toc_entry; + +struct shm_toc +{ + uint64 toc_magic; /* Magic number identifying this TOC */ + slock_t toc_mutex; /* Spinlock for mutual exclusion */ + Size toc_total_bytes; /* Bytes managed by this TOC */ + Size toc_allocated_bytes; /* Bytes allocated of those managed */ + uint32 toc_nentry; /* Number of entries in TOC */ + shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * Initialize a region of shared memory with a table of contents. + */ +shm_toc * +shm_toc_create(uint64 magic, void *address, Size nbytes) +{ + shm_toc *toc = (shm_toc *) address; + + Assert(nbytes > offsetof(shm_toc, toc_entry)); + toc->toc_magic = magic; + SpinLockInit(&toc->toc_mutex); + + /* + * The alignment code in shm_toc_allocate() assumes that the starting + * value is buffer-aligned. + */ + toc->toc_total_bytes = BUFFERALIGN_DOWN(nbytes); + toc->toc_allocated_bytes = 0; + toc->toc_nentry = 0; + + return toc; +} + +/* + * Attach to an existing table of contents. If the magic number found at + * the target address doesn't match our expectations, return NULL. + */ +shm_toc * +shm_toc_attach(uint64 magic, void *address) +{ + shm_toc *toc = (shm_toc *) address; + + if (toc->toc_magic != magic) + return NULL; + + Assert(toc->toc_total_bytes >= toc->toc_allocated_bytes); + Assert(toc->toc_total_bytes > offsetof(shm_toc, toc_entry)); + + return toc; +} + +/* + * Allocate shared memory from a segment managed by a table of contents. + * + * This is not a full-blown allocator; there's no way to free memory. It's + * just a way of dividing a single physical shared memory segment into logical + * chunks that may be used for different purposes. + * + * We allocate backwards from the end of the segment, so that the TOC entries + * can grow forward from the start of the segment. + */ +void * +shm_toc_allocate(shm_toc *toc, Size nbytes) +{ + volatile shm_toc *vtoc = toc; + Size total_bytes; + Size allocated_bytes; + Size nentry; + Size toc_bytes; + + /* + * Make sure request is well-aligned. XXX: MAXALIGN is not enough, + * because atomic ops might need a wider alignment. We don't have a + * proper definition for the minimum to make atomic ops safe, but + * BUFFERALIGN ought to be enough. 
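+ *
+ * (The chunk itself is carved from the unused space at the end of the
+ * segment: with total_bytes = T and allocated_bytes = A on entry, and
+ * nbytes already rounded up as below, the new allocation starts at
+ * ((char *) toc) + T - A - nbytes, exactly as the return statement at the
+ * bottom computes.)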
+ */ + nbytes = BUFFERALIGN(nbytes); + + SpinLockAcquire(&toc->toc_mutex); + + total_bytes = vtoc->toc_total_bytes; + allocated_bytes = vtoc->toc_allocated_bytes; + nentry = vtoc->toc_nentry; + toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry) + + allocated_bytes; + + /* Check for memory exhaustion and overflow. */ + if (toc_bytes + nbytes > total_bytes || toc_bytes + nbytes < toc_bytes) + { + SpinLockRelease(&toc->toc_mutex); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"))); + } + vtoc->toc_allocated_bytes += nbytes; + + SpinLockRelease(&toc->toc_mutex); + + return ((char *) toc) + (total_bytes - allocated_bytes - nbytes); +} + +/* + * Return the number of bytes that can still be allocated. + */ +Size +shm_toc_freespace(shm_toc *toc) +{ + volatile shm_toc *vtoc = toc; + Size total_bytes; + Size allocated_bytes; + Size nentry; + Size toc_bytes; + + SpinLockAcquire(&toc->toc_mutex); + total_bytes = vtoc->toc_total_bytes; + allocated_bytes = vtoc->toc_allocated_bytes; + nentry = vtoc->toc_nentry; + SpinLockRelease(&toc->toc_mutex); + + toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry); + Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes); + return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes)); +} + +/* + * Insert a TOC entry. + * + * The idea here is that the process setting up the shared memory segment will + * register the addresses of data structures within the segment using this + * function. Each data structure will be identified using a 64-bit key, which + * is assumed to be a well-known or discoverable integer. Other processes + * accessing the shared memory segment can pass the same key to + * shm_toc_lookup() to discover the addresses of those data structures. + * + * Since the shared memory segment may be mapped at different addresses within + * different backends, we store relative rather than absolute pointers. + * + * This won't scale well to a large number of keys. Hopefully, that isn't + * necessary; if it proves to be, we might need to provide a more sophisticated + * data structure here. But the real idea here is just to give someone mapping + * a dynamic shared memory the ability to find the bare minimum number of + * pointers that they need to bootstrap. If you're storing a lot of stuff in + * the TOC, you're doing it wrong. + */ +void +shm_toc_insert(shm_toc *toc, uint64 key, void *address) +{ + volatile shm_toc *vtoc = toc; + Size total_bytes; + Size allocated_bytes; + Size nentry; + Size toc_bytes; + Size offset; + + /* Relativize pointer. */ + Assert(address > (void *) toc); + offset = ((char *) address) - (char *) toc; + + SpinLockAcquire(&toc->toc_mutex); + + total_bytes = vtoc->toc_total_bytes; + allocated_bytes = vtoc->toc_allocated_bytes; + nentry = vtoc->toc_nentry; + toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry) + + allocated_bytes; + + /* Check for memory exhaustion and overflow. */ + if (toc_bytes + sizeof(shm_toc_entry) > total_bytes || + toc_bytes + sizeof(shm_toc_entry) < toc_bytes || + nentry >= PG_UINT32_MAX) + { + SpinLockRelease(&toc->toc_mutex); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"))); + } + + Assert(offset < total_bytes); + vtoc->toc_entry[nentry].key = key; + vtoc->toc_entry[nentry].offset = offset; + + /* + * By placing a write barrier after filling in the entry and before + * updating the number of entries, we make it safe to read the TOC + * unlocked. 
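+ *
+ * This is what supports the usual pattern of one process publishing
+ * entries that other processes then read without taking any lock.  A rough
+ * sketch, with MY_MAGIC, KEY_MY_STATE and MyState standing in for
+ * caller-defined names:
+ *
+ * In the process that sets up the segment:
+ *		toc = shm_toc_create(MY_MAGIC, seg_base, seg_size);
+ *		state = shm_toc_allocate(toc, sizeof(MyState));
+ *		shm_toc_insert(toc, KEY_MY_STATE, state);
+ *
+ * In a process attaching to the segment:
+ *		toc = shm_toc_attach(MY_MAGIC, seg_base);
+ *		state = shm_toc_lookup(toc, KEY_MY_STATE, false);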
+ */ + pg_write_barrier(); + + vtoc->toc_nentry++; + + SpinLockRelease(&toc->toc_mutex); +} + +/* + * Look up a TOC entry. + * + * If the key is not found, returns NULL if noError is true, otherwise + * throws elog(ERROR). + * + * Unlike the other functions in this file, this operation acquires no lock; + * it uses only barriers. It probably wouldn't hurt concurrency very much even + * if it did get a lock, but since it's reasonably likely that a group of + * worker processes could each read a series of entries from the same TOC + * right around the same time, there seems to be some value in avoiding it. + */ +void * +shm_toc_lookup(shm_toc *toc, uint64 key, bool noError) +{ + uint32 nentry; + uint32 i; + + /* + * Read the number of entries before we examine any entry. We assume that + * reading a uint32 is atomic. + */ + nentry = toc->toc_nentry; + pg_read_barrier(); + + /* Now search for a matching entry. */ + for (i = 0; i < nentry; ++i) + { + if (toc->toc_entry[i].key == key) + return ((char *) toc) + toc->toc_entry[i].offset; + } + + /* No matching entry was found. */ + if (!noError) + elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p", + key, toc); + return NULL; +} + +/* + * Estimate how much shared memory will be required to store a TOC and its + * dependent data structures. + */ +Size +shm_toc_estimate(shm_toc_estimator *e) +{ + Size sz; + + sz = offsetof(shm_toc, toc_entry); + sz = add_size(sz, mul_size(e->number_of_keys, sizeof(shm_toc_entry))); + sz = add_size(sz, e->space_for_chunks); + + return BUFFERALIGN(sz); +} diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c new file mode 100644 index 0000000..4425e99 --- /dev/null +++ b/src/backend/storage/ipc/shmem.c @@ -0,0 +1,611 @@ +/*------------------------------------------------------------------------- + * + * shmem.c + * create shared memory and initialize shared memory data structures. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/shmem.c + * + *------------------------------------------------------------------------- + */ +/* + * POSTGRES processes share one or more regions of shared memory. + * The shared memory is created by a postmaster and is inherited + * by each backend via fork() (or, in some ports, via other OS-specific + * methods). The routines in this file are used for allocating and + * binding to shared memory data structures. + * + * NOTES: + * (a) There are three kinds of shared memory data structures + * available to POSTGRES: fixed-size structures, queues and hash + * tables. Fixed-size structures contain things like global variables + * for a module and should never be allocated after the shared memory + * initialization phase. Hash tables have a fixed maximum size, but + * their actual size can vary dynamically. When entries are added + * to the table, more space is allocated. Queues link data structures + * that have been allocated either within fixed-size structures or as hash + * buckets. Each shared data structure has a string name to identify + * it (assigned in the module that declares it). + * + * (b) During initialization, each module looks for its + * shared data structures in a hash table called the "Shmem Index". + * If the data structure is not present, the caller can allocate + * a new one and initialize it. 
If the data structure is present, + * the caller "attaches" to the structure by initializing a pointer + * in the local address space. + * The shmem index has two purposes: first, it gives us + * a simple model of how the world looks when a backend process + * initializes. If something is present in the shmem index, + * it is initialized. If it is not, it is uninitialized. Second, + * the shmem index allows us to allocate shared memory on demand + * instead of trying to preallocate structures and hard-wire the + * sizes and locations in header files. If you are using a lot + * of shared memory in a lot of different places (and changing + * things during development), this is important. + * + * (c) In standard Unix-ish environments, individual backends do not + * need to re-establish their local pointers into shared memory, because + * they inherit correct values of those variables via fork() from the + * postmaster. However, this does not work in the EXEC_BACKEND case. + * In ports using EXEC_BACKEND, new backends have to set up their local + * pointers using the method described in (b) above. + * + * (d) memory allocation model: shared memory can never be + * freed, once allocated. Each hash table has its own free list, + * so hash buckets can be reused when an item is deleted. However, + * if one hash table grows very large and then shrinks, its space + * cannot be redistributed to other tables. We could build a simple + * hash bucket garbage collector if need be. Right now, it seems + * unnecessary. + */ + +#include "postgres.h" + +#include "access/transam.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "storage/lwlock.h" +#include "storage/pg_shmem.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "utils/builtins.h" + +static void *ShmemAllocRaw(Size size, Size *allocated_size); + +/* shared memory global variables */ + +static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */ + +static void *ShmemBase; /* start address of shared memory */ + +static void *ShmemEnd; /* end+1 address of shared memory */ + +slock_t *ShmemLock; /* spinlock for shared memory and LWLock + * allocation */ + +static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */ + + +/* + * InitShmemAccess() --- set up basic pointers to shared memory. + * + * Note: the argument should be declared "PGShmemHeader *seghdr", + * but we use void to avoid having to include ipc.h in shmem.h. + */ +void +InitShmemAccess(void *seghdr) +{ + PGShmemHeader *shmhdr = (PGShmemHeader *) seghdr; + + ShmemSegHdr = shmhdr; + ShmemBase = (void *) shmhdr; + ShmemEnd = (char *) ShmemBase + shmhdr->totalsize; +} + +/* + * InitShmemAllocation() --- set up shared-memory space allocation. + * + * This should be called only in the postmaster or a standalone backend. + */ +void +InitShmemAllocation(void) +{ + PGShmemHeader *shmhdr = ShmemSegHdr; + char *aligned; + + Assert(shmhdr != NULL); + + /* + * Initialize the spinlock used by ShmemAlloc. We must use + * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet. + */ + ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t)); + + SpinLockInit(ShmemLock); + + /* + * Allocations after this point should go through ShmemAlloc, which + * expects to allocate everything on cache line boundaries. Make sure the + * first allocation begins on a cache line boundary. 
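+ * (CACHELINEALIGN rounds up to PG_CACHE_LINE_SIZE, normally 128 bytes, so a
+ * freeoffset of, say, 200 would become 256 at this point.)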
+ */ + aligned = (char *) + (CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset))); + shmhdr->freeoffset = aligned - (char *) shmhdr; + + /* ShmemIndex can't be set up yet (need LWLocks first) */ + shmhdr->index = NULL; + ShmemIndex = (HTAB *) NULL; + + /* + * Initialize ShmemVariableCache for transaction manager. (This doesn't + * really belong here, but not worth moving.) + */ + ShmemVariableCache = (VariableCache) + ShmemAlloc(sizeof(*ShmemVariableCache)); + memset(ShmemVariableCache, 0, sizeof(*ShmemVariableCache)); +} + +/* + * ShmemAlloc -- allocate max-aligned chunk from shared memory + * + * Throws error if request cannot be satisfied. + * + * Assumes ShmemLock and ShmemSegHdr are initialized. + */ +void * +ShmemAlloc(Size size) +{ + void *newSpace; + Size allocated_size; + + newSpace = ShmemAllocRaw(size, &allocated_size); + if (!newSpace) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory (%zu bytes requested)", + size))); + return newSpace; +} + +/* + * ShmemAllocNoError -- allocate max-aligned chunk from shared memory + * + * As ShmemAlloc, but returns NULL if out of space, rather than erroring. + */ +void * +ShmemAllocNoError(Size size) +{ + Size allocated_size; + + return ShmemAllocRaw(size, &allocated_size); +} + +/* + * ShmemAllocRaw -- allocate align chunk and return allocated size + * + * Also sets *allocated_size to the number of bytes allocated, which will + * be equal to the number requested plus any padding we choose to add. + */ +static void * +ShmemAllocRaw(Size size, Size *allocated_size) +{ + Size newStart; + Size newFree; + void *newSpace; + + /* + * Ensure all space is adequately aligned. We used to only MAXALIGN this + * space but experience has proved that on modern systems that is not good + * enough. Many parts of the system are very sensitive to critical data + * structures getting split across cache line boundaries. To avoid that, + * attempt to align the beginning of the allocation to a cache line + * boundary. The calling code will still need to be careful about how it + * uses the allocated space - e.g. by padding each element in an array of + * structures out to a power-of-two size - but without this, even that + * won't be sufficient. + */ + size = CACHELINEALIGN(size); + *allocated_size = size; + + Assert(ShmemSegHdr != NULL); + + SpinLockAcquire(ShmemLock); + + newStart = ShmemSegHdr->freeoffset; + + newFree = newStart + size; + if (newFree <= ShmemSegHdr->totalsize) + { + newSpace = (void *) ((char *) ShmemBase + newStart); + ShmemSegHdr->freeoffset = newFree; + } + else + newSpace = NULL; + + SpinLockRelease(ShmemLock); + + /* note this assert is okay with newSpace == NULL */ + Assert(newSpace == (void *) CACHELINEALIGN(newSpace)); + + return newSpace; +} + +/* + * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory + * + * Allocate space without locking ShmemLock. This should be used for, + * and only for, allocations that must happen before ShmemLock is ready. + * + * We consider maxalign, rather than cachealign, sufficient here. + */ +void * +ShmemAllocUnlocked(Size size) +{ + Size newStart; + Size newFree; + void *newSpace; + + /* + * Ensure allocated space is adequately aligned. 
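+ * (MAXALIGN rounds the request up to the platform's maximum basic data
+ * alignment, typically 8 bytes; for instance, the sizeof(slock_t) request
+ * made by InitShmemAllocation above is padded out to that boundary.)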
+ */ + size = MAXALIGN(size); + + Assert(ShmemSegHdr != NULL); + + newStart = ShmemSegHdr->freeoffset; + + newFree = newStart + size; + if (newFree > ShmemSegHdr->totalsize) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory (%zu bytes requested)", + size))); + ShmemSegHdr->freeoffset = newFree; + + newSpace = (void *) ((char *) ShmemBase + newStart); + + Assert(newSpace == (void *) MAXALIGN(newSpace)); + + return newSpace; +} + +/* + * ShmemAddrIsValid -- test if an address refers to shared memory + * + * Returns true if the pointer points within the shared memory segment. + */ +bool +ShmemAddrIsValid(const void *addr) +{ + return (addr >= ShmemBase) && (addr < ShmemEnd); +} + +/* + * InitShmemIndex() --- set up or attach to shmem index table. + */ +void +InitShmemIndex(void) +{ + HASHCTL info; + + /* + * Create the shared memory shmem index. + * + * Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex + * hashtable to exist already, we have a bit of a circularity problem in + * initializing the ShmemIndex itself. The special "ShmemIndex" hash + * table name will tell ShmemInitStruct to fake it. + */ + info.keysize = SHMEM_INDEX_KEYSIZE; + info.entrysize = sizeof(ShmemIndexEnt); + + ShmemIndex = ShmemInitHash("ShmemIndex", + SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE, + &info, + HASH_ELEM | HASH_STRINGS); +} + +/* + * ShmemInitHash -- Create and initialize, or attach to, a + * shared memory hash table. + * + * We assume caller is doing some kind of synchronization + * so that two processes don't try to create/initialize the same + * table at once. (In practice, all creations are done in the postmaster + * process; child processes should always be attaching to existing tables.) + * + * max_size is the estimated maximum number of hashtable entries. This is + * not a hard limit, but the access efficiency will degrade if it is + * exceeded substantially (since it's used to compute directory size and + * the hash table buckets will get overfull). + * + * init_size is the number of hashtable entries to preallocate. For a table + * whose maximum size is certain, this should be equal to max_size; that + * ensures that no run-time out-of-shared-memory failures can occur. + * + * *infoP and hash_flags must specify at least the entry sizes and key + * comparison semantics (see hash_create()). Flag bits and values specific + * to shared-memory hash tables are added here, except that callers may + * choose to specify HASH_PARTITION and/or HASH_FIXED_SIZE. + * + * Note: before Postgres 9.0, this function returned NULL for some failure + * cases. Now, it always throws error instead, so callers need not check + * for NULL. + */ +HTAB * +ShmemInitHash(const char *name, /* table string name for shmem index */ + long init_size, /* initial table size */ + long max_size, /* max size of the table */ + HASHCTL *infoP, /* info about key and bucket size */ + int hash_flags) /* info about infoP */ +{ + bool found; + void *location; + + /* + * Hash tables allocated in shared memory have a fixed directory; it can't + * grow or other backends wouldn't be able to find it. So, make sure we + * make it big enough to start with. + * + * The shared memory allocator must be specified too. 
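+ *
+ * A typical caller therefore only needs to fill in the key and entry sizes
+ * (plus any hash or compare functions) before calling here; roughly, with
+ * MyHashEntry standing in for whatever struct the module stores:
+ *
+ *		HASHCTL		info;
+ *
+ *		info.keysize = sizeof(Oid);
+ *		info.entrysize = sizeof(MyHashEntry);
+ *		htab = ShmemInitHash("My Shared Table", 128, 128,
+ *							 &info, HASH_ELEM | HASH_BLOBS);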
+ */ + infoP->dsize = infoP->max_dsize = hash_select_dirsize(max_size); + infoP->alloc = ShmemAllocNoError; + hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE; + + /* look it up in the shmem index */ + location = ShmemInitStruct(name, + hash_get_shared_size(infoP, hash_flags), + &found); + + /* + * if it already exists, attach to it rather than allocate and initialize + * new space + */ + if (found) + hash_flags |= HASH_ATTACH; + + /* Pass location of hashtable header to hash_create */ + infoP->hctl = (HASHHDR *) location; + + return hash_create(name, init_size, infoP, hash_flags); +} + +/* + * ShmemInitStruct -- Create/attach to a structure in shared memory. + * + * This is called during initialization to find or allocate + * a data structure in shared memory. If no other process + * has created the structure, this routine allocates space + * for it. If it exists already, a pointer to the existing + * structure is returned. + * + * Returns: pointer to the object. *foundPtr is set true if the object was + * already in the shmem index (hence, already initialized). + * + * Note: before Postgres 9.0, this function returned NULL for some failure + * cases. Now, it always throws error instead, so callers need not check + * for NULL. + */ +void * +ShmemInitStruct(const char *name, Size size, bool *foundPtr) +{ + ShmemIndexEnt *result; + void *structPtr; + + LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE); + + if (!ShmemIndex) + { + PGShmemHeader *shmemseghdr = ShmemSegHdr; + + /* Must be trying to create/attach to ShmemIndex itself */ + Assert(strcmp(name, "ShmemIndex") == 0); + + if (IsUnderPostmaster) + { + /* Must be initializing a (non-standalone) backend */ + Assert(shmemseghdr->index != NULL); + structPtr = shmemseghdr->index; + *foundPtr = true; + } + else + { + /* + * If the shmem index doesn't exist, we are bootstrapping: we must + * be trying to init the shmem index itself. + * + * Notice that the ShmemIndexLock is released before the shmem + * index has been initialized. This should be OK because no other + * process can be accessing shared memory yet. + */ + Assert(shmemseghdr->index == NULL); + structPtr = ShmemAlloc(size); + shmemseghdr->index = structPtr; + *foundPtr = false; + } + LWLockRelease(ShmemIndexLock); + return structPtr; + } + + /* look it up in the shmem index */ + result = (ShmemIndexEnt *) + hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr); + + if (!result) + { + LWLockRelease(ShmemIndexLock); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("could not create ShmemIndex entry for data structure \"%s\"", + name))); + } + + if (*foundPtr) + { + /* + * Structure is in the shmem index so someone else has allocated it + * already. The size better be the same as the size we are trying to + * initialize to, or there is a name conflict (or worse). + */ + if (result->size != size) + { + LWLockRelease(ShmemIndexLock); + ereport(ERROR, + (errmsg("ShmemIndex entry size is wrong for data structure" + " \"%s\": expected %zu, actual %zu", + name, size, result->size))); + } + structPtr = result->location; + } + else + { + Size allocated_size; + + /* It isn't in the table yet. 
allocate and initialize it */ + structPtr = ShmemAllocRaw(size, &allocated_size); + if (structPtr == NULL) + { + /* out of memory; remove the failed ShmemIndex entry */ + hash_search(ShmemIndex, name, HASH_REMOVE, NULL); + LWLockRelease(ShmemIndexLock); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("not enough shared memory for data structure" + " \"%s\" (%zu bytes requested)", + name, size))); + } + result->size = size; + result->allocated_size = allocated_size; + result->location = structPtr; + } + + LWLockRelease(ShmemIndexLock); + + Assert(ShmemAddrIsValid(structPtr)); + + Assert(structPtr == (void *) CACHELINEALIGN(structPtr)); + + return structPtr; +} + + +/* + * Add two Size values, checking for overflow + */ +Size +add_size(Size s1, Size s2) +{ + Size result; + + result = s1 + s2; + /* We are assuming Size is an unsigned type here... */ + if (result < s1 || result < s2) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("requested shared memory size overflows size_t"))); + return result; +} + +/* + * Multiply two Size values, checking for overflow + */ +Size +mul_size(Size s1, Size s2) +{ + Size result; + + if (s1 == 0 || s2 == 0) + return 0; + result = s1 * s2; + /* We are assuming Size is an unsigned type here... */ + if (result / s2 != s1) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("requested shared memory size overflows size_t"))); + return result; +} + +/* SQL SRF showing allocated shared memory */ +Datum +pg_get_shmem_allocations(PG_FUNCTION_ARGS) +{ +#define PG_GET_SHMEM_SIZES_COLS 4 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + HASH_SEQ_STATUS hstat; + ShmemIndexEnt *ent; + Size named_allocated = 0; + Datum values[PG_GET_SHMEM_SIZES_COLS]; + bool nulls[PG_GET_SHMEM_SIZES_COLS]; + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not allowed in this context"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + LWLockAcquire(ShmemIndexLock, LW_SHARED); + + hash_seq_init(&hstat, ShmemIndex); + + /* output all allocated entries */ + memset(nulls, 0, sizeof(nulls)); + while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL) + { + values[0] = CStringGetTextDatum(ent->key); + values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr); + values[2] = Int64GetDatum(ent->size); + values[3] = Int64GetDatum(ent->allocated_size); + named_allocated += ent->allocated_size; + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + + /* output shared memory allocated but not counted via the shmem index */ + values[0] = CStringGetTextDatum("<anonymous>"); + nulls[1] = true; + 
values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated); + values[3] = values[2]; + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + + /* output as-of-yet unused shared memory */ + nulls[0] = true; + values[1] = Int64GetDatum(ShmemSegHdr->freeoffset); + nulls[1] = false; + values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset); + values[3] = values[2]; + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + + LWLockRelease(ShmemIndexLock); + + tuplestore_donestoring(tupstore); + + return (Datum) 0; +} diff --git a/src/backend/storage/ipc/shmqueue.c b/src/backend/storage/ipc/shmqueue.c new file mode 100644 index 0000000..dc3238c --- /dev/null +++ b/src/backend/storage/ipc/shmqueue.c @@ -0,0 +1,190 @@ +/*------------------------------------------------------------------------- + * + * shmqueue.c + * shared memory linked lists + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/shmqueue.c + * + * NOTES + * + * Package for managing doubly-linked lists in shared memory. + * The only tricky thing is that SHM_QUEUE will usually be a field + * in a larger record. SHMQueueNext has to return a pointer + * to the record itself instead of a pointer to the SHMQueue field + * of the record. It takes an extra parameter and does some extra + * pointer arithmetic to do this correctly. + * + * NOTE: These are set up so they can be turned into macros some day. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/shmem.h" + + +/* + * ShmemQueueInit -- make the head of a new queue point + * to itself + */ +void +SHMQueueInit(SHM_QUEUE *queue) +{ + Assert(ShmemAddrIsValid(queue)); + queue->prev = queue->next = queue; +} + +/* + * SHMQueueIsDetached -- true if element is not currently + * in a queue. + */ +bool +SHMQueueIsDetached(const SHM_QUEUE *queue) +{ + Assert(ShmemAddrIsValid(queue)); + return (queue->prev == NULL); +} + +/* + * SHMQueueElemInit -- clear an element's links + */ +void +SHMQueueElemInit(SHM_QUEUE *queue) +{ + Assert(ShmemAddrIsValid(queue)); + queue->prev = queue->next = NULL; +} + +/* + * SHMQueueDelete -- remove an element from the queue and + * close the links + */ +void +SHMQueueDelete(SHM_QUEUE *queue) +{ + SHM_QUEUE *nextElem = queue->next; + SHM_QUEUE *prevElem = queue->prev; + + Assert(ShmemAddrIsValid(queue)); + Assert(ShmemAddrIsValid(nextElem)); + Assert(ShmemAddrIsValid(prevElem)); + + prevElem->next = queue->next; + nextElem->prev = queue->prev; + + queue->prev = queue->next = NULL; +} + +/* + * SHMQueueInsertBefore -- put elem in queue before the given queue + * element. Inserting "before" the queue head puts the elem + * at the tail of the queue. + */ +void +SHMQueueInsertBefore(SHM_QUEUE *queue, SHM_QUEUE *elem) +{ + SHM_QUEUE *prevPtr = queue->prev; + + Assert(ShmemAddrIsValid(queue)); + Assert(ShmemAddrIsValid(elem)); + + elem->next = prevPtr->next; + elem->prev = queue->prev; + queue->prev = elem; + prevPtr->next = elem; +} + +/* + * SHMQueueInsertAfter -- put elem in queue after the given queue + * element. Inserting "after" the queue head puts the elem + * at the head of the queue. 
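+ * Hence, repeatedly inserting "after" the head and scanning with
+ * SHMQueueNext visits elements in LIFO order, while inserting "before" the
+ * head (SHMQueueInsertBefore, above) and scanning the same way gives FIFO
+ * order.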
+ */ +void +SHMQueueInsertAfter(SHM_QUEUE *queue, SHM_QUEUE *elem) +{ + SHM_QUEUE *nextPtr = queue->next; + + Assert(ShmemAddrIsValid(queue)); + Assert(ShmemAddrIsValid(elem)); + + elem->prev = nextPtr->prev; + elem->next = queue->next; + queue->next = elem; + nextPtr->prev = elem; +} + +/*-------------------- + * SHMQueueNext -- Get the next element from a queue + * + * To start the iteration, pass the queue head as both queue and curElem. + * Returns NULL if no more elements. + * + * Next element is at curElem->next. If SHMQueue is part of + * a larger structure, we want to return a pointer to the + * whole structure rather than a pointer to its SHMQueue field. + * For example, + * struct { + * int stuff; + * SHMQueue elem; + * } ELEMType; + * When this element is in a queue, prevElem->next points at struct.elem. + * We subtract linkOffset to get the correct start address of the structure. + * + * calls to SHMQueueNext should take these parameters: + * &(queueHead), &(queueHead), offsetof(ELEMType, elem) + * or + * &(queueHead), &(curElem->elem), offsetof(ELEMType, elem) + *-------------------- + */ +Pointer +SHMQueueNext(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset) +{ + SHM_QUEUE *elemPtr = curElem->next; + + Assert(ShmemAddrIsValid(curElem)); + + if (elemPtr == queue) /* back to the queue head? */ + return NULL; + + return (Pointer) (((char *) elemPtr) - linkOffset); +} + +/*-------------------- + * SHMQueuePrev -- Get the previous element from a queue + * + * Same as SHMQueueNext, just starting at tail and moving towards head. + * All other comments and usage applies. + */ +Pointer +SHMQueuePrev(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset) +{ + SHM_QUEUE *elemPtr = curElem->prev; + + Assert(ShmemAddrIsValid(curElem)); + + if (elemPtr == queue) /* back to the queue head? */ + return NULL; + + return (Pointer) (((char *) elemPtr) - linkOffset); +} + +/* + * SHMQueueEmpty -- true if queue head is only element, false otherwise + */ +bool +SHMQueueEmpty(const SHM_QUEUE *queue) +{ + Assert(ShmemAddrIsValid(queue)); + + if (queue->prev == queue) + { + Assert(queue->next == queue); + return true; + } + return false; +} diff --git a/src/backend/storage/ipc/signalfuncs.c b/src/backend/storage/ipc/signalfuncs.c new file mode 100644 index 0000000..de69d60 --- /dev/null +++ b/src/backend/storage/ipc/signalfuncs.c @@ -0,0 +1,300 @@ +/*------------------------------------------------------------------------- + * + * signalfuncs.c + * Functions for signaling backends + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/signalfuncs.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> + +#include "catalog/pg_authid.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/syslogger.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "utils/acl.h" +#include "utils/builtins.h" + + +/* + * Send a signal to another backend. + * + * The signal is delivered if the user is either a superuser or the same + * role as the backend being signaled. For "dangerous" signals, an explicit + * check for superuser needs to be done prior to calling this function. 
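+ * So, for example, a role granted pg_signal_backend can cancel or terminate
+ * backends belonging to other non-superuser roles, but still gets the
+ * "needs superuser" result when the target backend is superuser-owned.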
+ * + * Returns 0 on success, 1 on general failure, 2 on normal permission error + * and 3 if the caller needs to be a superuser. + * + * In the event of a general failure (return code 1), a warning message will + * be emitted. For permission errors, doing that is the responsibility of + * the caller. + */ +#define SIGNAL_BACKEND_SUCCESS 0 +#define SIGNAL_BACKEND_ERROR 1 +#define SIGNAL_BACKEND_NOPERMISSION 2 +#define SIGNAL_BACKEND_NOSUPERUSER 3 +static int +pg_signal_backend(int pid, int sig) +{ + PGPROC *proc = BackendPidGetProc(pid); + + /* + * BackendPidGetProc returns NULL if the pid isn't valid; but by the time + * we reach kill(), a process for which we get a valid proc here might + * have terminated on its own. There's no way to acquire a lock on an + * arbitrary process to prevent that. But since so far all the callers of + * this mechanism involve some request for ending the process anyway, that + * it might end on its own first is not a problem. + */ + if (proc == NULL) + { + /* + * This is just a warning so a loop-through-resultset will not abort + * if one backend terminated on its own during the run. + */ + ereport(WARNING, + (errmsg("PID %d is not a PostgreSQL server process", pid))); + return SIGNAL_BACKEND_ERROR; + } + + /* Only allow superusers to signal superuser-owned backends. */ + if (superuser_arg(proc->roleId) && !superuser()) + return SIGNAL_BACKEND_NOSUPERUSER; + + /* Users can signal backends they have role membership in. */ + if (!has_privs_of_role(GetUserId(), proc->roleId) && + !has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_BACKEND)) + return SIGNAL_BACKEND_NOPERMISSION; + + /* + * Can the process we just validated above end, followed by the pid being + * recycled for a new process, before reaching here? Then we'd be trying + * to kill the wrong thing. Seems near impossible when sequential pid + * assignment and wraparound is used. Perhaps it could happen on a system + * where pid re-use is randomized. That race condition possibility seems + * too unlikely to worry about. + */ + + /* If we have setsid(), signal the backend's whole process group */ +#ifdef HAVE_SETSID + if (kill(-pid, sig)) +#else + if (kill(pid, sig)) +#endif + { + /* Again, just a warning to allow loops */ + ereport(WARNING, + (errmsg("could not send signal to process %d: %m", pid))); + return SIGNAL_BACKEND_ERROR; + } + return SIGNAL_BACKEND_SUCCESS; +} + +/* + * Signal to cancel a backend process. This is allowed if you are a member of + * the role whose process is being canceled. + * + * Note that only superusers can signal superuser-owned processes. + */ +Datum +pg_cancel_backend(PG_FUNCTION_ARGS) +{ + int r = pg_signal_backend(PG_GETARG_INT32(0), SIGINT); + + if (r == SIGNAL_BACKEND_NOSUPERUSER) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be a superuser to cancel superuser query"))); + + if (r == SIGNAL_BACKEND_NOPERMISSION) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be a member of the role whose query is being canceled or member of pg_signal_backend"))); + + PG_RETURN_BOOL(r == SIGNAL_BACKEND_SUCCESS); +} + +/* + * Wait until there is no backend process with the given PID and return true. + * On timeout, a warning is emitted and false is returned. + */ +static bool +pg_wait_until_termination(int pid, int64 timeout) +{ + /* + * Wait in steps of waittime milliseconds until this function exits or + * timeout. + */ + int64 waittime = 100; + + /* + * Initially remaining time is the entire timeout specified by the user. 
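+ * For instance, with a 250 ms timeout the loop below waits 100, 100 and
+ * then 50 ms before giving up (assuming the target backend never exits).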
+ */ + int64 remainingtime = timeout; + + /* + * Check existence of the backend. If the backend still exists, then wait + * for waittime milliseconds, again check for the existence. Repeat this + * until timeout or an error occurs or a pending interrupt such as query + * cancel gets processed. + */ + do + { + if (remainingtime < waittime) + waittime = remainingtime; + + if (kill(pid, 0) == -1) + { + if (errno == ESRCH) + return true; + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not check the existence of the backend with PID %d: %m", + pid))); + } + + /* Process interrupts, if any, before waiting */ + CHECK_FOR_INTERRUPTS(); + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + waittime, + WAIT_EVENT_BACKEND_TERMINATION); + + ResetLatch(MyLatch); + + remainingtime -= waittime; + } while (remainingtime > 0); + + ereport(WARNING, + (errmsg_plural("backend with PID %d did not terminate within %lld millisecond", + "backend with PID %d did not terminate within %lld milliseconds", + timeout, + pid, (long long int) timeout))); + + return false; +} + +/* + * Send a signal to terminate a backend process. This is allowed if you are a + * member of the role whose process is being terminated. If the timeout input + * argument is 0, then this function just signals the backend and returns + * true. If timeout is nonzero, then it waits until no process has the given + * PID; if the process ends within the timeout, true is returned, and if the + * timeout is exceeded, a warning is emitted and false is returned. + * + * Note that only superusers can signal superuser-owned processes. + */ +Datum +pg_terminate_backend(PG_FUNCTION_ARGS) +{ + int pid; + int r; + int timeout; /* milliseconds */ + + pid = PG_GETARG_INT32(0); + timeout = PG_GETARG_INT64(1); + + if (timeout < 0) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("\"timeout\" must not be negative"))); + + r = pg_signal_backend(pid, SIGTERM); + + if (r == SIGNAL_BACKEND_NOSUPERUSER) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be a superuser to terminate superuser process"))); + + if (r == SIGNAL_BACKEND_NOPERMISSION) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be a member of the role whose process is being terminated or member of pg_signal_backend"))); + + /* Wait only on success and if actually requested */ + if (r == SIGNAL_BACKEND_SUCCESS && timeout > 0) + PG_RETURN_BOOL(pg_wait_until_termination(pid, timeout)); + else + PG_RETURN_BOOL(r == SIGNAL_BACKEND_SUCCESS); +} + +/* + * Signal to reload the database configuration + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_reload_conf(PG_FUNCTION_ARGS) +{ + if (kill(PostmasterPid, SIGHUP)) + { + ereport(WARNING, + (errmsg("failed to send signal to postmaster: %m"))); + PG_RETURN_BOOL(false); + } + + PG_RETURN_BOOL(true); +} + + +/* + * Rotate log file + * + * This function is kept to support adminpack 1.0. 
+ */ +Datum +pg_rotate_logfile(PG_FUNCTION_ARGS) +{ + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to rotate log files with adminpack 1.0"), + /* translator: %s is a SQL function name */ + errhint("Consider using %s, which is part of core, instead.", + "pg_logfile_rotate()"))); + + if (!Logging_collector) + { + ereport(WARNING, + (errmsg("rotation not possible because log collection not active"))); + PG_RETURN_BOOL(false); + } + + SendPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE); + PG_RETURN_BOOL(true); +} + +/* + * Rotate log file + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_rotate_logfile_v2(PG_FUNCTION_ARGS) +{ + if (!Logging_collector) + { + ereport(WARNING, + (errmsg("rotation not possible because log collection not active"))); + PG_RETURN_BOOL(false); + } + + SendPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE); + PG_RETURN_BOOL(true); +} diff --git a/src/backend/storage/ipc/sinval.c b/src/backend/storage/ipc/sinval.c new file mode 100644 index 0000000..f585d63 --- /dev/null +++ b/src/backend/storage/ipc/sinval.c @@ -0,0 +1,205 @@ +/*------------------------------------------------------------------------- + * + * sinval.c + * POSTGRES shared cache invalidation communication code. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/sinval.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xact.h" +#include "commands/async.h" +#include "miscadmin.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "storage/sinvaladt.h" +#include "utils/inval.h" + + +uint64 SharedInvalidMessageCounter; + + +/* + * Because backends sitting idle will not be reading sinval events, we + * need a way to give an idle backend a swift kick in the rear and make + * it catch up before the sinval queue overflows and forces it to go + * through a cache reset exercise. This is done by sending + * PROCSIG_CATCHUP_INTERRUPT to any backend that gets too far behind. + * + * The signal handler will set an interrupt pending flag and will set the + * processes latch. Whenever starting to read from the client, or when + * interrupted while doing so, ProcessClientReadInterrupt() will call + * ProcessCatchupEvent(). + */ +volatile sig_atomic_t catchupInterruptPending = false; + + +/* + * SendSharedInvalidMessages + * Add shared-cache-invalidation message(s) to the global SI message queue. + */ +void +SendSharedInvalidMessages(const SharedInvalidationMessage *msgs, int n) +{ + SIInsertDataEntries(msgs, n); +} + +/* + * ReceiveSharedInvalidMessages + * Process shared-cache-invalidation messages waiting for this backend + * + * We guarantee to process all messages that had been queued before the + * routine was entered. It is of course possible for more messages to get + * queued right after our last SIGetDataEntries call. + * + * NOTE: it is entirely possible for this routine to be invoked recursively + * as a consequence of processing inside the invalFunction or resetFunction. + * Furthermore, such a recursive call must guarantee that all outstanding + * inval messages have been processed before it exits. 
This is the reason + * for the strange-looking choice to use a statically allocated buffer array + * and counters; it's so that a recursive call can process messages already + * sucked out of sinvaladt.c. + */ +void +ReceiveSharedInvalidMessages(void (*invalFunction) (SharedInvalidationMessage *msg), + void (*resetFunction) (void)) +{ +#define MAXINVALMSGS 32 + static SharedInvalidationMessage messages[MAXINVALMSGS]; + + /* + * We use volatile here to prevent bugs if a compiler doesn't realize that + * recursion is a possibility ... + */ + static volatile int nextmsg = 0; + static volatile int nummsgs = 0; + + /* Deal with any messages still pending from an outer recursion */ + while (nextmsg < nummsgs) + { + SharedInvalidationMessage msg = messages[nextmsg++]; + + SharedInvalidMessageCounter++; + invalFunction(&msg); + } + + do + { + int getResult; + + nextmsg = nummsgs = 0; + + /* Try to get some more messages */ + getResult = SIGetDataEntries(messages, MAXINVALMSGS); + + if (getResult < 0) + { + /* got a reset message */ + elog(DEBUG4, "cache state reset"); + SharedInvalidMessageCounter++; + resetFunction(); + break; /* nothing more to do */ + } + + /* Process them, being wary that a recursive call might eat some */ + nextmsg = 0; + nummsgs = getResult; + + while (nextmsg < nummsgs) + { + SharedInvalidationMessage msg = messages[nextmsg++]; + + SharedInvalidMessageCounter++; + invalFunction(&msg); + } + + /* + * We only need to loop if the last SIGetDataEntries call (which might + * have been within a recursive call) returned a full buffer. + */ + } while (nummsgs == MAXINVALMSGS); + + /* + * We are now caught up. If we received a catchup signal, reset that + * flag, and call SICleanupQueue(). This is not so much because we need + * to flush dead messages right now, as that we want to pass on the + * catchup signal to the next slowest backend. "Daisy chaining" the + * catchup signal this way avoids creating spikes in system load for what + * should be just a background maintenance activity. + */ + if (catchupInterruptPending) + { + catchupInterruptPending = false; + elog(DEBUG4, "sinval catchup complete, cleaning queue"); + SICleanupQueue(false, 0); + } +} + + +/* + * HandleCatchupInterrupt + * + * This is called when PROCSIG_CATCHUP_INTERRUPT is received. + * + * We used to directly call ProcessCatchupEvent directly when idle. These days + * we just set a flag to do it later and notify the process of that fact by + * setting the process's latch. + */ +void +HandleCatchupInterrupt(void) +{ + /* + * Note: this is called by a SIGNAL HANDLER. You must be very wary what + * you do here. + */ + + catchupInterruptPending = true; + + /* make sure the event is processed in due course */ + SetLatch(MyLatch); +} + +/* + * ProcessCatchupInterrupt + * + * The portion of catchup interrupt handling that runs outside of the signal + * handler, which allows it to actually process pending invalidations. + */ +void +ProcessCatchupInterrupt(void) +{ + while (catchupInterruptPending) + { + /* + * What we need to do here is cause ReceiveSharedInvalidMessages() to + * run, which will do the necessary work and also reset the + * catchupInterruptPending flag. If we are inside a transaction we + * can just call AcceptInvalidationMessages() to do this. If we + * aren't, we start and immediately end a transaction; the call to + * AcceptInvalidationMessages() happens down inside transaction start. 
+ * + * It is awfully tempting to just call AcceptInvalidationMessages() + * without the rest of the xact start/stop overhead, and I think that + * would actually work in the normal case; but I am not sure that + * things would clean up nicely if we got an error partway through. + */ + if (IsTransactionOrTransactionBlock()) + { + elog(DEBUG4, "ProcessCatchupEvent inside transaction"); + AcceptInvalidationMessages(); + } + else + { + elog(DEBUG4, "ProcessCatchupEvent outside transaction"); + StartTransactionCommand(); + CommitTransactionCommand(); + } + } +} diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c new file mode 100644 index 0000000..946bd8e --- /dev/null +++ b/src/backend/storage/ipc/sinvaladt.c @@ -0,0 +1,777 @@ +/*------------------------------------------------------------------------- + * + * sinvaladt.c + * POSTGRES shared cache invalidation data manager. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/sinvaladt.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> + +#include "access/transam.h" +#include "miscadmin.h" +#include "storage/backendid.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "storage/sinvaladt.h" +#include "storage/spin.h" + +/* + * Conceptually, the shared cache invalidation messages are stored in an + * infinite array, where maxMsgNum is the next array subscript to store a + * submitted message in, minMsgNum is the smallest array subscript containing + * a message not yet read by all backends, and we always have maxMsgNum >= + * minMsgNum. (They are equal when there are no messages pending.) For each + * active backend, there is a nextMsgNum pointer indicating the next message it + * needs to read; we have maxMsgNum >= nextMsgNum >= minMsgNum for every + * backend. + * + * (In the current implementation, minMsgNum is a lower bound for the + * per-process nextMsgNum values, but it isn't rigorously kept equal to the + * smallest nextMsgNum --- it may lag behind. We only update it when + * SICleanupQueue is called, and we try not to do that often.) + * + * In reality, the messages are stored in a circular buffer of MAXNUMMESSAGES + * entries. We translate MsgNum values into circular-buffer indexes by + * computing MsgNum % MAXNUMMESSAGES (this should be fast as long as + * MAXNUMMESSAGES is a constant and a power of 2). As long as maxMsgNum + * doesn't exceed minMsgNum by more than MAXNUMMESSAGES, we have enough space + * in the buffer. If the buffer does overflow, we recover by setting the + * "reset" flag for each backend that has fallen too far behind. A backend + * that is in "reset" state is ignored while determining minMsgNum. When + * it does finally attempt to receive inval messages, it must discard all + * its invalidatable state, since it won't know what it missed. + * + * To reduce the probability of needing resets, we send a "catchup" interrupt + * to any backend that seems to be falling unreasonably far behind. The + * normal behavior is that at most one such interrupt is in flight at a time; + * when a backend completes processing a catchup interrupt, it executes + * SICleanupQueue, which will signal the next-furthest-behind backend if + * needed. 
This avoids undue contention from multiple backends all trying + * to catch up at once. However, the furthest-back backend might be stuck + * in a state where it can't catch up. Eventually it will get reset, so it + * won't cause any more problems for anyone but itself. But we don't want + * to find that a bunch of other backends are now too close to the reset + * threshold to be saved. So SICleanupQueue is designed to occasionally + * send extra catchup interrupts as the queue gets fuller, to backends that + * are far behind and haven't gotten one yet. As long as there aren't a lot + * of "stuck" backends, we won't need a lot of extra interrupts, since ones + * that aren't stuck will propagate their interrupts to the next guy. + * + * We would have problems if the MsgNum values overflow an integer, so + * whenever minMsgNum exceeds MSGNUMWRAPAROUND, we subtract MSGNUMWRAPAROUND + * from all the MsgNum variables simultaneously. MSGNUMWRAPAROUND can be + * large so that we don't need to do this often. It must be a multiple of + * MAXNUMMESSAGES so that the existing circular-buffer entries don't need + * to be moved when we do it. + * + * Access to the shared sinval array is protected by two locks, SInvalReadLock + * and SInvalWriteLock. Readers take SInvalReadLock in shared mode; this + * authorizes them to modify their own ProcState but not to modify or even + * look at anyone else's. When we need to perform array-wide updates, + * such as in SICleanupQueue, we take SInvalReadLock in exclusive mode to + * lock out all readers. Writers take SInvalWriteLock (always in exclusive + * mode) to serialize adding messages to the queue. Note that a writer + * can operate in parallel with one or more readers, because the writer + * has no need to touch anyone's ProcState, except in the infrequent cases + * when SICleanupQueue is needed. The only point of overlap is that + * the writer wants to change maxMsgNum while readers need to read it. + * We deal with that by having a spinlock that readers must take for just + * long enough to read maxMsgNum, while writers take it for just long enough + * to write maxMsgNum. (The exact rule is that you need the spinlock to + * read maxMsgNum if you are not holding SInvalWriteLock, and you need the + * spinlock to write maxMsgNum unless you are holding both locks.) + * + * Note: since maxMsgNum is an int and hence presumably atomically readable/ + * writable, the spinlock might seem unnecessary. The reason it is needed + * is to provide a memory barrier: we need to be sure that messages written + * to the array are actually there before maxMsgNum is increased, and that + * readers will see that data after fetching maxMsgNum. Multiprocessors + * that have weak memory-ordering guarantees can fail without the memory + * barrier instructions that are included in the spinlock sequences. + */ + + +/* + * Configurable parameters. + * + * MAXNUMMESSAGES: max number of shared-inval messages we can buffer. + * Must be a power of 2 for speed. + * + * MSGNUMWRAPAROUND: how often to reduce MsgNum variables to avoid overflow. + * Must be a multiple of MAXNUMMESSAGES. Should be large. + * + * CLEANUP_MIN: the minimum number of messages that must be in the buffer + * before we bother to call SICleanupQueue. + * + * CLEANUP_QUANTUM: how often (in messages) to call SICleanupQueue once + * we exceed CLEANUP_MIN. Should be a power of 2 for speed. 
+ * + * SIG_THRESHOLD: the minimum number of messages a backend must have fallen + * behind before we'll send it PROCSIG_CATCHUP_INTERRUPT. + * + * WRITE_QUANTUM: the max number of messages to push into the buffer per + * iteration of SIInsertDataEntries. Noncritical but should be less than + * CLEANUP_QUANTUM, because we only consider calling SICleanupQueue once + * per iteration. + */ + +#define MAXNUMMESSAGES 4096 +#define MSGNUMWRAPAROUND (MAXNUMMESSAGES * 262144) +#define CLEANUP_MIN (MAXNUMMESSAGES / 2) +#define CLEANUP_QUANTUM (MAXNUMMESSAGES / 16) +#define SIG_THRESHOLD (MAXNUMMESSAGES / 2) +#define WRITE_QUANTUM 64 + +/* Per-backend state in shared invalidation structure */ +typedef struct ProcState +{ + /* procPid is zero in an inactive ProcState array entry. */ + pid_t procPid; /* PID of backend, for signaling */ + PGPROC *proc; /* PGPROC of backend */ + /* nextMsgNum is meaningless if procPid == 0 or resetState is true. */ + int nextMsgNum; /* next message number to read */ + bool resetState; /* backend needs to reset its state */ + bool signaled; /* backend has been sent catchup signal */ + bool hasMessages; /* backend has unread messages */ + + /* + * Backend only sends invalidations, never receives them. This only makes + * sense for Startup process during recovery because it doesn't maintain a + * relcache, yet it fires inval messages to allow query backends to see + * schema changes. + */ + bool sendOnly; /* backend only sends, never receives */ + + /* + * Next LocalTransactionId to use for each idle backend slot. We keep + * this here because it is indexed by BackendId and it is convenient to + * copy the value to and from local memory when MyBackendId is set. It's + * meaningless in an active ProcState entry. + */ + LocalTransactionId nextLXID; +} ProcState; + +/* Shared cache invalidation memory segment */ +typedef struct SISeg +{ + /* + * General state information + */ + int minMsgNum; /* oldest message still needed */ + int maxMsgNum; /* next message number to be assigned */ + int nextThreshold; /* # of messages to call SICleanupQueue */ + int lastBackend; /* index of last active procState entry, +1 */ + int maxBackends; /* size of procState array */ + + slock_t msgnumLock; /* spinlock protecting maxMsgNum */ + + /* + * Circular buffer holding shared-inval messages + */ + SharedInvalidationMessage buffer[MAXNUMMESSAGES]; + + /* + * Per-backend invalidation state info (has MaxBackends entries). 
+ */ + ProcState procState[FLEXIBLE_ARRAY_MEMBER]; +} SISeg; + +static SISeg *shmInvalBuffer; /* pointer to the shared inval buffer */ + + +static LocalTransactionId nextLocalTransactionId; + +static void CleanupInvalidationState(int status, Datum arg); + + +/* + * SInvalShmemSize --- return shared-memory space needed + */ +Size +SInvalShmemSize(void) +{ + Size size; + + size = offsetof(SISeg, procState); + size = add_size(size, mul_size(sizeof(ProcState), MaxBackends)); + + return size; +} + +/* + * CreateSharedInvalidationState + * Create and initialize the SI message buffer + */ +void +CreateSharedInvalidationState(void) +{ + int i; + bool found; + + /* Allocate space in shared memory */ + shmInvalBuffer = (SISeg *) + ShmemInitStruct("shmInvalBuffer", SInvalShmemSize(), &found); + if (found) + return; + + /* Clear message counters, save size of procState array, init spinlock */ + shmInvalBuffer->minMsgNum = 0; + shmInvalBuffer->maxMsgNum = 0; + shmInvalBuffer->nextThreshold = CLEANUP_MIN; + shmInvalBuffer->lastBackend = 0; + shmInvalBuffer->maxBackends = MaxBackends; + SpinLockInit(&shmInvalBuffer->msgnumLock); + + /* The buffer[] array is initially all unused, so we need not fill it */ + + /* Mark all backends inactive, and initialize nextLXID */ + for (i = 0; i < shmInvalBuffer->maxBackends; i++) + { + shmInvalBuffer->procState[i].procPid = 0; /* inactive */ + shmInvalBuffer->procState[i].proc = NULL; + shmInvalBuffer->procState[i].nextMsgNum = 0; /* meaningless */ + shmInvalBuffer->procState[i].resetState = false; + shmInvalBuffer->procState[i].signaled = false; + shmInvalBuffer->procState[i].hasMessages = false; + shmInvalBuffer->procState[i].nextLXID = InvalidLocalTransactionId; + } +} + +/* + * SharedInvalBackendInit + * Initialize a new backend to operate on the sinval buffer + */ +void +SharedInvalBackendInit(bool sendOnly) +{ + int index; + ProcState *stateP = NULL; + SISeg *segP = shmInvalBuffer; + + /* + * This can run in parallel with read operations, but not with write + * operations, since SIInsertDataEntries relies on lastBackend to set + * hasMessages appropriately. + */ + LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); + + /* Look for a free entry in the procState array */ + for (index = 0; index < segP->lastBackend; index++) + { + if (segP->procState[index].procPid == 0) /* inactive slot? 
*/ + { + stateP = &segP->procState[index]; + break; + } + } + + if (stateP == NULL) + { + if (segP->lastBackend < segP->maxBackends) + { + stateP = &segP->procState[segP->lastBackend]; + Assert(stateP->procPid == 0); + segP->lastBackend++; + } + else + { + /* + * out of procState slots: MaxBackends exceeded -- report normally + */ + MyBackendId = InvalidBackendId; + LWLockRelease(SInvalWriteLock); + ereport(FATAL, + (errcode(ERRCODE_TOO_MANY_CONNECTIONS), + errmsg("sorry, too many clients already"))); + } + } + + MyBackendId = (stateP - &segP->procState[0]) + 1; + + /* Advertise assigned backend ID in MyProc */ + MyProc->backendId = MyBackendId; + + /* Fetch next local transaction ID into local memory */ + nextLocalTransactionId = stateP->nextLXID; + + /* mark myself active, with all extant messages already read */ + stateP->procPid = MyProcPid; + stateP->proc = MyProc; + stateP->nextMsgNum = segP->maxMsgNum; + stateP->resetState = false; + stateP->signaled = false; + stateP->hasMessages = false; + stateP->sendOnly = sendOnly; + + LWLockRelease(SInvalWriteLock); + + /* register exit routine to mark my entry inactive at exit */ + on_shmem_exit(CleanupInvalidationState, PointerGetDatum(segP)); + + elog(DEBUG4, "my backend ID is %d", MyBackendId); +} + +/* + * CleanupInvalidationState + * Mark the current backend as no longer active. + * + * This function is called via on_shmem_exit() during backend shutdown. + * + * arg is really of type "SISeg*". + */ +static void +CleanupInvalidationState(int status, Datum arg) +{ + SISeg *segP = (SISeg *) DatumGetPointer(arg); + ProcState *stateP; + int i; + + Assert(PointerIsValid(segP)); + + LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); + + stateP = &segP->procState[MyBackendId - 1]; + + /* Update next local transaction ID for next holder of this backendID */ + stateP->nextLXID = nextLocalTransactionId; + + /* Mark myself inactive */ + stateP->procPid = 0; + stateP->proc = NULL; + stateP->nextMsgNum = 0; + stateP->resetState = false; + stateP->signaled = false; + + /* Recompute index of last active backend */ + for (i = segP->lastBackend; i > 0; i--) + { + if (segP->procState[i - 1].procPid != 0) + break; + } + segP->lastBackend = i; + + LWLockRelease(SInvalWriteLock); +} + +/* + * BackendIdGetProc + * Get the PGPROC structure for a backend, given the backend ID. + * The result may be out of date arbitrarily quickly, so the caller + * must be careful about how this information is used. NULL is + * returned if the backend is not active. + */ +PGPROC * +BackendIdGetProc(int backendID) +{ + PGPROC *result = NULL; + SISeg *segP = shmInvalBuffer; + + /* Need to lock out additions/removals of backends */ + LWLockAcquire(SInvalWriteLock, LW_SHARED); + + if (backendID > 0 && backendID <= segP->lastBackend) + { + ProcState *stateP = &segP->procState[backendID - 1]; + + result = stateP->proc; + } + + LWLockRelease(SInvalWriteLock); + + return result; +} + +/* + * BackendIdGetTransactionIds + * Get the xid and xmin of the backend. The result may be out of date + * arbitrarily quickly, so the caller must be careful about how this + * information is used. 
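+ * (The pg_stat_activity machinery is the main consumer, using these values
+ * for its backend_xid and backend_xmin columns.)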
+ */ +void +BackendIdGetTransactionIds(int backendID, TransactionId *xid, TransactionId *xmin) +{ + SISeg *segP = shmInvalBuffer; + + *xid = InvalidTransactionId; + *xmin = InvalidTransactionId; + + /* Need to lock out additions/removals of backends */ + LWLockAcquire(SInvalWriteLock, LW_SHARED); + + if (backendID > 0 && backendID <= segP->lastBackend) + { + ProcState *stateP = &segP->procState[backendID - 1]; + PGPROC *proc = stateP->proc; + + if (proc != NULL) + { + *xid = proc->xid; + *xmin = proc->xmin; + } + } + + LWLockRelease(SInvalWriteLock); +} + +/* + * SIInsertDataEntries + * Add new invalidation message(s) to the buffer. + */ +void +SIInsertDataEntries(const SharedInvalidationMessage *data, int n) +{ + SISeg *segP = shmInvalBuffer; + + /* + * N can be arbitrarily large. We divide the work into groups of no more + * than WRITE_QUANTUM messages, to be sure that we don't hold the lock for + * an unreasonably long time. (This is not so much because we care about + * letting in other writers, as that some just-caught-up backend might be + * trying to do SICleanupQueue to pass on its signal, and we don't want it + * to have to wait a long time.) Also, we need to consider calling + * SICleanupQueue every so often. + */ + while (n > 0) + { + int nthistime = Min(n, WRITE_QUANTUM); + int numMsgs; + int max; + int i; + + n -= nthistime; + + LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); + + /* + * If the buffer is full, we *must* acquire some space. Clean the + * queue and reset anyone who is preventing space from being freed. + * Otherwise, clean the queue only when it's exceeded the next + * fullness threshold. We have to loop and recheck the buffer state + * after any call of SICleanupQueue. + */ + for (;;) + { + numMsgs = segP->maxMsgNum - segP->minMsgNum; + if (numMsgs + nthistime > MAXNUMMESSAGES || + numMsgs >= segP->nextThreshold) + SICleanupQueue(true, nthistime); + else + break; + } + + /* + * Insert new message(s) into proper slot of circular buffer + */ + max = segP->maxMsgNum; + while (nthistime-- > 0) + { + segP->buffer[max % MAXNUMMESSAGES] = *data++; + max++; + } + + /* Update current value of maxMsgNum using spinlock */ + SpinLockAcquire(&segP->msgnumLock); + segP->maxMsgNum = max; + SpinLockRelease(&segP->msgnumLock); + + /* + * Now that the maxMsgNum change is globally visible, we give everyone + * a swift kick to make sure they read the newly added messages. + * Releasing SInvalWriteLock will enforce a full memory barrier, so + * these (unlocked) changes will be committed to memory before we exit + * the function. + */ + for (i = 0; i < segP->lastBackend; i++) + { + ProcState *stateP = &segP->procState[i]; + + stateP->hasMessages = true; + } + + LWLockRelease(SInvalWriteLock); + } +} + +/* + * SIGetDataEntries + * get next SI message(s) for current backend, if there are any + * + * Possible return values: + * 0: no SI message available + * n>0: next n SI messages have been extracted into data[] + * -1: SI reset message extracted + * + * If the return value is less than the array size "datasize", the caller + * can assume that there are no more SI messages after the one(s) returned. + * Otherwise, another call is needed to collect more messages. + * + * NB: this can run in parallel with other instances of SIGetDataEntries + * executing on behalf of other backends, since each instance will modify only + * fields of its own backend's ProcState, and no instance will look at fields + * of other backends' ProcStates. 
We express this by grabbing SInvalReadLock + * in shared mode. Note that this is not exactly the normal (read-only) + * interpretation of a shared lock! Look closely at the interactions before + * allowing SInvalReadLock to be grabbed in shared mode for any other reason! + * + * NB: this can also run in parallel with SIInsertDataEntries. It is not + * guaranteed that we will return any messages added after the routine is + * entered. + * + * Note: we assume that "datasize" is not so large that it might be important + * to break our hold on SInvalReadLock into segments. + */ +int +SIGetDataEntries(SharedInvalidationMessage *data, int datasize) +{ + SISeg *segP; + ProcState *stateP; + int max; + int n; + + segP = shmInvalBuffer; + stateP = &segP->procState[MyBackendId - 1]; + + /* + * Before starting to take locks, do a quick, unlocked test to see whether + * there can possibly be anything to read. On a multiprocessor system, + * it's possible that this load could migrate backwards and occur before + * we actually enter this function, so we might miss a sinval message that + * was just added by some other processor. But they can't migrate + * backwards over a preceding lock acquisition, so it should be OK. If we + * haven't acquired a lock preventing against further relevant + * invalidations, any such occurrence is not much different than if the + * invalidation had arrived slightly later in the first place. + */ + if (!stateP->hasMessages) + return 0; + + LWLockAcquire(SInvalReadLock, LW_SHARED); + + /* + * We must reset hasMessages before determining how many messages we're + * going to read. That way, if new messages arrive after we have + * determined how many we're reading, the flag will get reset and we'll + * notice those messages part-way through. + * + * Note that, if we don't end up reading all of the messages, we had + * better be certain to reset this flag before exiting! + */ + stateP->hasMessages = false; + + /* Fetch current value of maxMsgNum using spinlock */ + SpinLockAcquire(&segP->msgnumLock); + max = segP->maxMsgNum; + SpinLockRelease(&segP->msgnumLock); + + if (stateP->resetState) + { + /* + * Force reset. We can say we have dealt with any messages added + * since the reset, as well; and that means we should clear the + * signaled flag, too. + */ + stateP->nextMsgNum = max; + stateP->resetState = false; + stateP->signaled = false; + LWLockRelease(SInvalReadLock); + return -1; + } + + /* + * Retrieve messages and advance backend's counter, until data array is + * full or there are no more messages. + * + * There may be other backends that haven't read the message(s), so we + * cannot delete them here. SICleanupQueue() will eventually remove them + * from the queue. + */ + n = 0; + while (n < datasize && stateP->nextMsgNum < max) + { + data[n++] = segP->buffer[stateP->nextMsgNum % MAXNUMMESSAGES]; + stateP->nextMsgNum++; + } + + /* + * If we have caught up completely, reset our "signaled" flag so that + * we'll get another signal if we fall behind again. + * + * If we haven't caught up completely, reset the hasMessages flag so that + * we see the remaining messages next time. + */ + if (stateP->nextMsgNum >= max) + stateP->signaled = false; + else + stateP->hasMessages = true; + + LWLockRelease(SInvalReadLock); + return n; +} + +/* + * SICleanupQueue + * Remove messages that have been consumed by all active backends + * + * callerHasWriteLock is true if caller is holding SInvalWriteLock. + * minFree is the minimum number of message slots to make free. 
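+ *
+ * (Editor's illustration, not part of the original comment: on exit the
+ * routine leaves nextThreshold at the next multiple of CLEANUP_QUANTUM
+ * above the number of messages still queued, but never below CLEANUP_MIN;
+ * e.g. if the queue still holds slightly more than 10 * CLEANUP_QUANTUM
+ * messages, and that is at least CLEANUP_MIN, the threshold becomes
+ * 11 * CLEANUP_QUANTUM.)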
+ * + * Possible side effects of this routine include marking one or more + * backends as "reset" in the array, and sending PROCSIG_CATCHUP_INTERRUPT + * to some backend that seems to be getting too far behind. We signal at + * most one backend at a time, for reasons explained at the top of the file. + * + * Caution: because we transiently release write lock when we have to signal + * some other backend, it is NOT guaranteed that there are still minFree + * free message slots at exit. Caller must recheck and perhaps retry. + */ +void +SICleanupQueue(bool callerHasWriteLock, int minFree) +{ + SISeg *segP = shmInvalBuffer; + int min, + minsig, + lowbound, + numMsgs, + i; + ProcState *needSig = NULL; + + /* Lock out all writers and readers */ + if (!callerHasWriteLock) + LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); + LWLockAcquire(SInvalReadLock, LW_EXCLUSIVE); + + /* + * Recompute minMsgNum = minimum of all backends' nextMsgNum, identify the + * furthest-back backend that needs signaling (if any), and reset any + * backends that are too far back. Note that because we ignore sendOnly + * backends here it is possible for them to keep sending messages without + * a problem even when they are the only active backend. + */ + min = segP->maxMsgNum; + minsig = min - SIG_THRESHOLD; + lowbound = min - MAXNUMMESSAGES + minFree; + + for (i = 0; i < segP->lastBackend; i++) + { + ProcState *stateP = &segP->procState[i]; + int n = stateP->nextMsgNum; + + /* Ignore if inactive or already in reset state */ + if (stateP->procPid == 0 || stateP->resetState || stateP->sendOnly) + continue; + + /* + * If we must free some space and this backend is preventing it, force + * him into reset state and then ignore until he catches up. + */ + if (n < lowbound) + { + stateP->resetState = true; + /* no point in signaling him ... */ + continue; + } + + /* Track the global minimum nextMsgNum */ + if (n < min) + min = n; + + /* Also see who's furthest back of the unsignaled backends */ + if (n < minsig && !stateP->signaled) + { + minsig = n; + needSig = stateP; + } + } + segP->minMsgNum = min; + + /* + * When minMsgNum gets really large, decrement all message counters so as + * to forestall overflow of the counters. This happens seldom enough that + * folding it into the previous loop would be a loser. + */ + if (min >= MSGNUMWRAPAROUND) + { + segP->minMsgNum -= MSGNUMWRAPAROUND; + segP->maxMsgNum -= MSGNUMWRAPAROUND; + for (i = 0; i < segP->lastBackend; i++) + { + /* we don't bother skipping inactive entries here */ + segP->procState[i].nextMsgNum -= MSGNUMWRAPAROUND; + } + } + + /* + * Determine how many messages are still in the queue, and set the + * threshold at which we should repeat SICleanupQueue(). + */ + numMsgs = segP->maxMsgNum - segP->minMsgNum; + if (numMsgs < CLEANUP_MIN) + segP->nextThreshold = CLEANUP_MIN; + else + segP->nextThreshold = (numMsgs / CLEANUP_QUANTUM + 1) * CLEANUP_QUANTUM; + + /* + * Lastly, signal anyone who needs a catchup interrupt. Since + * SendProcSignal() might not be fast, we don't want to hold locks while + * executing it. 
+ */ + if (needSig) + { + pid_t his_pid = needSig->procPid; + BackendId his_backendId = (needSig - &segP->procState[0]) + 1; + + needSig->signaled = true; + LWLockRelease(SInvalReadLock); + LWLockRelease(SInvalWriteLock); + elog(DEBUG4, "sending sinval catchup signal to PID %d", (int) his_pid); + SendProcSignal(his_pid, PROCSIG_CATCHUP_INTERRUPT, his_backendId); + if (callerHasWriteLock) + LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); + } + else + { + LWLockRelease(SInvalReadLock); + if (!callerHasWriteLock) + LWLockRelease(SInvalWriteLock); + } +} + + +/* + * GetNextLocalTransactionId --- allocate a new LocalTransactionId + * + * We split VirtualTransactionIds into two parts so that it is possible + * to allocate a new one without any contention for shared memory, except + * for a bit of additional overhead during backend startup/shutdown. + * The high-order part of a VirtualTransactionId is a BackendId, and the + * low-order part is a LocalTransactionId, which we assign from a local + * counter. To avoid the risk of a VirtualTransactionId being reused + * within a short interval, successive procs occupying the same backend ID + * slot should use a consecutive sequence of local IDs, which is implemented + * by copying nextLocalTransactionId as seen above. + */ +LocalTransactionId +GetNextLocalTransactionId(void) +{ + LocalTransactionId result; + + /* loop to avoid returning InvalidLocalTransactionId at wraparound */ + do + { + result = nextLocalTransactionId++; + } while (!LocalTransactionIdIsValid(result)); + + return result; +} diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c new file mode 100644 index 0000000..687ce03 --- /dev/null +++ b/src/backend/storage/ipc/standby.c @@ -0,0 +1,1450 @@ +/*------------------------------------------------------------------------- + * + * standby.c + * Misc functions used in Hot Standby mode. + * + * All functions for handling RM_STANDBY_ID, which relate to + * AccessExclusiveLocks and starting snapshots for Hot Standby mode. + * Plus conflict recovery processing. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/ipc/standby.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/sinvaladt.h" +#include "storage/standby.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + +/* User-settable GUC parameters */ +int vacuum_defer_cleanup_age; +int max_standby_archive_delay = 30 * 1000; +int max_standby_streaming_delay = 30 * 1000; +bool log_recovery_conflict_waits = false; + +static HTAB *RecoveryLockLists; + +/* Flags set by timeout handlers */ +static volatile sig_atomic_t got_standby_deadlock_timeout = false; +static volatile sig_atomic_t got_standby_delay_timeout = false; +static volatile sig_atomic_t got_standby_lock_timeout = false; + +static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, + ProcSignalReason reason, + uint32 wait_event_info, + bool report_waiting); +static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason); +static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts); +static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks); +static const char *get_recovery_conflict_desc(ProcSignalReason reason); + +/* + * Keep track of all the locks owned by a given transaction. + */ +typedef struct RecoveryLockListsEntry +{ + TransactionId xid; + List *locks; +} RecoveryLockListsEntry; + +/* + * InitRecoveryTransactionEnvironment + * Initialize tracking of our primary's in-progress transactions. + * + * We need to issue shared invalidations and hold locks. Holding locks + * means others may want to wait on us, so we need to make a lock table + * vxact entry like a real transaction. We could create and delete + * lock table entries for each transaction but its simpler just to create + * one permanent entry and leave it there all the time. Locks are then + * acquired and released as needed. Yes, this means you can see the + * Startup process in pg_locks once we have run this. + */ +void +InitRecoveryTransactionEnvironment(void) +{ + VirtualTransactionId vxid; + HASHCTL hash_ctl; + + /* + * Initialize the hash table for tracking the list of locks held by each + * transaction. + */ + hash_ctl.keysize = sizeof(TransactionId); + hash_ctl.entrysize = sizeof(RecoveryLockListsEntry); + RecoveryLockLists = hash_create("RecoveryLockLists", + 64, + &hash_ctl, + HASH_ELEM | HASH_BLOBS); + + /* + * Initialize shared invalidation management for Startup process, being + * careful to register ourselves as a sendOnly process so we don't need to + * read messages, nor will we get signaled when the queue starts filling + * up. + */ + SharedInvalBackendInit(true); + + /* + * Lock a virtual transaction id for Startup process. + * + * We need to do GetNextLocalTransactionId() because + * SharedInvalBackendInit() leaves localTransactionId invalid and the lock + * manager doesn't like that at all. 
+ *
+ * Note that we don't need to run XactLockTableInsert() because nobody
+ * needs to wait on xids. That sounds a little strange, but table locks
+ * are held by vxids and row level locks are held by xids. All queries
+ * hold AccessShareLocks so never block while we write or lock new rows.
+ */
+ vxid.backendId = MyBackendId;
+ vxid.localTransactionId = GetNextLocalTransactionId();
+ VirtualXactLockTableInsert(vxid);
+
+ standbyState = STANDBY_INITIALIZED;
+}
+
+/*
+ * ShutdownRecoveryTransactionEnvironment
+ * Shut down transaction tracking
+ *
+ * Prepare to switch from hot standby mode to normal operation. Shut down
+ * recovery-time transaction tracking.
+ *
+ * This must be called even during shutdown of the startup process if
+ * transaction tracking has been initialized. Otherwise some locks the
+ * tracked transactions were holding will not be released and may interfere
+ * with processes that are still running (though they will exit soon) after
+ * the startup process exits.
+ */
+void
+ShutdownRecoveryTransactionEnvironment(void)
+{
+ /*
+ * Do nothing if RecoveryLockLists is NULL, which means that transaction
+ * tracking has not yet been initialized or has already been shut down.
+ * This prevents transaction tracking from being shut down unexpectedly
+ * more than once.
+ */
+ if (RecoveryLockLists == NULL)
+ return;
+
+ /* Mark all tracked in-progress transactions as finished. */
+ ExpireAllKnownAssignedTransactionIds();
+
+ /* Release all locks the tracked transactions were holding */
+ StandbyReleaseAllLocks();
+
+ /* Destroy the hash table of locks. */
+ hash_destroy(RecoveryLockLists);
+ RecoveryLockLists = NULL;
+
+ /* Cleanup our VirtualTransaction */
+ VirtualXactLockTableCleanup();
+}
+
+
+/*
+ * -----------------------------------------------------
+ * Standby wait timers and backend cancel logic
+ * -----------------------------------------------------
+ */
+
+/*
+ * Determine the cutoff time at which we want to start canceling conflicting
+ * transactions. Returns zero (a time safely in the past) if we are willing
+ * to wait forever.
+ */
+static TimestampTz
+GetStandbyLimitTime(void)
+{
+ TimestampTz rtime;
+ bool fromStream;
+
+ /*
+ * The cutoff time is the last WAL data receipt time plus the appropriate
+ * delay variable. Delay of -1 means wait forever.
+ */
+ GetXLogReceiptTime(&rtime, &fromStream);
+ if (fromStream)
+ {
+ if (max_standby_streaming_delay < 0)
+ return 0; /* wait forever */
+ return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
+ }
+ else
+ {
+ if (max_standby_archive_delay < 0)
+ return 0; /* wait forever */
+ return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
+ }
+}
+
+#define STANDBY_INITIAL_WAIT_US 1000
+static int standbyWait_us = STANDBY_INITIAL_WAIT_US;
+
+/*
+ * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
+ * We wait here for a while, then return. If we decide we can't wait any
+ * longer, we return true; if we can wait some more, we return false.
+ */
+static bool
+WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
+{
+ TimestampTz ltime;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* Are we past the limit time? */
+ ltime = GetStandbyLimitTime();
+ if (ltime && GetCurrentTimestamp() >= ltime)
+ return true;
+
+ /*
+ * Sleep a bit (this is essential to avoid busy-waiting).
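+ *
+ * (Editor's illustration, not part of the original comment: with
+ * STANDBY_INITIAL_WAIT_US of 1000, successive calls sleep roughly
+ * 1 ms, 2 ms, 4 ms, ... doubling each time until the 1 s cap applied
+ * below is reached.)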
+ */ + pgstat_report_wait_start(wait_event_info); + pg_usleep(standbyWait_us); + pgstat_report_wait_end(); + + /* + * Progressively increase the sleep times, but not to more than 1s, since + * pg_usleep isn't interruptible on some platforms. + */ + standbyWait_us *= 2; + if (standbyWait_us > 1000000) + standbyWait_us = 1000000; + + return false; +} + +/* + * Log the recovery conflict. + * + * wait_start is the timestamp when the caller started to wait. + * now is the timestamp when this function has been called. + * wait_list is the list of virtual transaction ids assigned to + * conflicting processes. still_waiting indicates whether + * the startup process is still waiting for the recovery conflict + * to be resolved or not. + */ +void +LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, + TimestampTz now, VirtualTransactionId *wait_list, + bool still_waiting) +{ + long secs; + int usecs; + long msecs; + StringInfoData buf; + int nprocs = 0; + + /* + * There must be no conflicting processes when the recovery conflict has + * already been resolved. + */ + Assert(still_waiting || wait_list == NULL); + + TimestampDifference(wait_start, now, &secs, &usecs); + msecs = secs * 1000 + usecs / 1000; + usecs = usecs % 1000; + + if (wait_list) + { + VirtualTransactionId *vxids; + + /* Construct a string of list of the conflicting processes */ + vxids = wait_list; + while (VirtualTransactionIdIsValid(*vxids)) + { + PGPROC *proc = BackendIdGetProc(vxids->backendId); + + /* proc can be NULL if the target backend is not active */ + if (proc) + { + if (nprocs == 0) + { + initStringInfo(&buf); + appendStringInfo(&buf, "%d", proc->pid); + } + else + appendStringInfo(&buf, ", %d", proc->pid); + + nprocs++; + } + + vxids++; + } + } + + /* + * If wait_list is specified, report the list of PIDs of active + * conflicting backends in a detail message. Note that if all the backends + * in the list are not active, no detail message is logged. + */ + if (still_waiting) + { + ereport(LOG, + errmsg("recovery still waiting after %ld.%03d ms: %s", + msecs, usecs, get_recovery_conflict_desc(reason)), + nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.", + "Conflicting processes: %s.", + nprocs, buf.data) : 0); + } + else + { + ereport(LOG, + errmsg("recovery finished waiting after %ld.%03d ms: %s", + msecs, usecs, get_recovery_conflict_desc(reason))); + } + + if (nprocs > 0) + pfree(buf.data); +} + +/* + * This is the main executioner for any query backend that conflicts with + * recovery processing. Judgement has already been passed on it within + * a specific rmgr. Here we just issue the orders to the procs. The procs + * then throw the required error as instructed. + * + * If report_waiting is true, "waiting" is reported in PS display and the + * wait for recovery conflict is reported in the log, if necessary. If + * the caller is responsible for reporting them, report_waiting should be + * false. Otherwise, both the caller and this function report the same + * thing unexpectedly. + */ +static void +ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, + ProcSignalReason reason, uint32 wait_event_info, + bool report_waiting) +{ + TimestampTz waitStart = 0; + char *new_status = NULL; + bool logged_recovery_conflict = false; + + /* Fast exit, to avoid a kernel call if there's no work to be done. 
*/ + if (!VirtualTransactionIdIsValid(*waitlist)) + return; + + /* Set the wait start timestamp for reporting */ + if (report_waiting && (log_recovery_conflict_waits || update_process_title)) + waitStart = GetCurrentTimestamp(); + + while (VirtualTransactionIdIsValid(*waitlist)) + { + /* reset standbyWait_us for each xact we wait for */ + standbyWait_us = STANDBY_INITIAL_WAIT_US; + + /* wait until the virtual xid is gone */ + while (!VirtualXactLock(*waitlist, false)) + { + /* Is it time to kill it? */ + if (WaitExceedsMaxStandbyDelay(wait_event_info)) + { + pid_t pid; + + /* + * Now find out who to throw out of the balloon. + */ + Assert(VirtualTransactionIdIsValid(*waitlist)); + pid = CancelVirtualTransaction(*waitlist, reason); + + /* + * Wait a little bit for it to die so that we avoid flooding + * an unresponsive backend when system is heavily loaded. + */ + if (pid != 0) + pg_usleep(5000L); + } + + if (waitStart != 0 && (!logged_recovery_conflict || new_status == NULL)) + { + TimestampTz now = 0; + bool maybe_log_conflict; + bool maybe_update_title; + + maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict); + maybe_update_title = (update_process_title && new_status == NULL); + + /* Get the current timestamp if not report yet */ + if (maybe_log_conflict || maybe_update_title) + now = GetCurrentTimestamp(); + + /* + * Report via ps if we have been waiting for more than 500 + * msec (should that be configurable?) + */ + if (maybe_update_title && + TimestampDifferenceExceeds(waitStart, now, 500)) + { + const char *old_status; + int len; + + old_status = get_ps_display(&len); + new_status = (char *) palloc(len + 8 + 1); + memcpy(new_status, old_status, len); + strcpy(new_status + len, " waiting"); + set_ps_display(new_status); + new_status[len] = '\0'; /* truncate off " waiting" */ + } + + /* + * Emit the log message if the startup process is waiting + * longer than deadlock_timeout for recovery conflict. + */ + if (maybe_log_conflict && + TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout)) + { + LogRecoveryConflict(reason, waitStart, now, waitlist, true); + logged_recovery_conflict = true; + } + } + } + + /* The virtual transaction is gone now, wait for the next one */ + waitlist++; + } + + /* + * Emit the log message if recovery conflict was resolved but the startup + * process waited longer than deadlock_timeout for it. + */ + if (logged_recovery_conflict) + LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(), + NULL, false); + + /* Reset ps display if we changed it */ + if (new_status) + { + set_ps_display(new_status); + pfree(new_status); + } +} + +void +ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode node) +{ + VirtualTransactionId *backends; + + /* + * If we get passed InvalidTransactionId then we do nothing (no conflict). + * + * This can happen when replaying already-applied WAL records after a + * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE + * record that marks as frozen a page which was already all-visible. It's + * also quite common with records generated during index deletion + * (original execution of the deletion can reason that a recovery conflict + * which is sufficient for the deletion operation must take place before + * replay of the deletion record itself). 
+ */ + if (!TransactionIdIsValid(latestRemovedXid)) + return; + + backends = GetConflictingVirtualXIDs(latestRemovedXid, + node.dbNode); + + ResolveRecoveryConflictWithVirtualXIDs(backends, + PROCSIG_RECOVERY_CONFLICT_SNAPSHOT, + WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT, + true); +} + +/* + * Variant of ResolveRecoveryConflictWithSnapshot that works with + * FullTransactionId values + */ +void +ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid, + RelFileNode node) +{ + /* + * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds, + * so truncate the logged FullTransactionId. If the logged value is very + * old, so that XID wrap-around already happened on it, there can't be any + * snapshots that still see it. + */ + FullTransactionId nextXid = ReadNextFullTransactionId(); + uint64 diff; + + diff = U64FromFullTransactionId(nextXid) - + U64FromFullTransactionId(latestRemovedFullXid); + if (diff < MaxTransactionId / 2) + { + TransactionId latestRemovedXid; + + latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid); + ResolveRecoveryConflictWithSnapshot(latestRemovedXid, node); + } +} + +void +ResolveRecoveryConflictWithTablespace(Oid tsid) +{ + VirtualTransactionId *temp_file_users; + + /* + * Standby users may be currently using this tablespace for their + * temporary files. We only care about current users because + * temp_tablespace parameter will just ignore tablespaces that no longer + * exist. + * + * Ask everybody to cancel their queries immediately so we can ensure no + * temp files remain and we can remove the tablespace. Nuke the entire + * site from orbit, it's the only way to be sure. + * + * XXX: We could work out the pids of active backends using this + * tablespace by examining the temp filenames in the directory. We would + * then convert the pids into VirtualXIDs before attempting to cancel + * them. + * + * We don't wait for commit because drop tablespace is non-transactional. + */ + temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId, + InvalidOid); + ResolveRecoveryConflictWithVirtualXIDs(temp_file_users, + PROCSIG_RECOVERY_CONFLICT_TABLESPACE, + WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE, + true); +} + +void +ResolveRecoveryConflictWithDatabase(Oid dbid) +{ + /* + * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that + * only waits for transactions and completely idle sessions would block + * us. This is rare enough that we do this as simply as possible: no wait, + * just force them off immediately. + * + * No locking is required here because we already acquired + * AccessExclusiveLock. Anybody trying to connect while we do this will + * block during InitPostgres() and then disconnect when they see the + * database has been removed. + */ + while (CountDBBackends(dbid) > 0) + { + CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true); + + /* + * Wait awhile for them to die so that we avoid flooding an + * unresponsive backend when system is heavily loaded. + */ + pg_usleep(10000); + } +} + +/* + * ResolveRecoveryConflictWithLock is called from ProcSleep() + * to resolve conflicts with other backends holding relation locks. + * + * The WaitLatch sleep normally done in ProcSleep() + * (when not InHotStandby) is performed here, for code clarity. + * + * We either resolve conflicts immediately or set a timeout to wake us at + * the limit of our patience. + * + * Resolve conflicts by canceling to all backends holding a conflicting + * lock. 
As we are already queued to be granted the lock, no new lock + * requests conflicting with ours will be granted in the meantime. + * + * We also must check for deadlocks involving the Startup process and + * hot-standby backend processes. If deadlock_timeout is reached in + * this function, all the backends holding the conflicting locks are + * requested to check themselves for deadlocks. + * + * logging_conflict should be true if the recovery conflict has not been + * logged yet even though logging is enabled. After deadlock_timeout is + * reached and the request for deadlock check is sent, we wait again to + * be signaled by the release of the lock if logging_conflict is false. + * Otherwise we return without waiting again so that the caller can report + * the recovery conflict. In this case, then, this function is called again + * with logging_conflict=false (because the recovery conflict has already + * been logged) and we will wait again for the lock to be released. + */ +void +ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict) +{ + TimestampTz ltime; + TimestampTz now; + + Assert(InHotStandby); + + ltime = GetStandbyLimitTime(); + now = GetCurrentTimestamp(); + + /* + * Update waitStart if first time through after the startup process + * started waiting for the lock. It should not be updated every time + * ResolveRecoveryConflictWithLock() is called during the wait. + * + * Use the current time obtained for comparison with ltime as waitStart + * (i.e., the time when this process started waiting for the lock). Since + * getting the current time newly can cause overhead, we reuse the + * already-obtained time to avoid that overhead. + * + * Note that waitStart is updated without holding the lock table's + * partition lock, to avoid the overhead by additional lock acquisition. + * This can cause "waitstart" in pg_locks to become NULL for a very short + * period of time after the wait started even though "granted" is false. + * This is OK in practice because we can assume that users are likely to + * look at "waitstart" when waiting for the lock for a long time. + */ + if (pg_atomic_read_u64(&MyProc->waitStart) == 0) + pg_atomic_write_u64(&MyProc->waitStart, now); + + if (now >= ltime && ltime != 0) + { + /* + * We're already behind, so clear a path as quickly as possible. + */ + VirtualTransactionId *backends; + + backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL); + + /* + * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting + * "waiting" in PS display by disabling its argument report_waiting + * because the caller, WaitOnLock(), has already reported that. 
+ */ + ResolveRecoveryConflictWithVirtualXIDs(backends, + PROCSIG_RECOVERY_CONFLICT_LOCK, + PG_WAIT_LOCK | locktag.locktag_type, + false); + } + else + { + /* + * Wait (or wait again) until ltime, and check for deadlocks as well + * if we will be waiting longer than deadlock_timeout + */ + EnableTimeoutParams timeouts[2]; + int cnt = 0; + + if (ltime != 0) + { + got_standby_lock_timeout = false; + timeouts[cnt].id = STANDBY_LOCK_TIMEOUT; + timeouts[cnt].type = TMPARAM_AT; + timeouts[cnt].fin_time = ltime; + cnt++; + } + + got_standby_deadlock_timeout = false; + timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT; + timeouts[cnt].type = TMPARAM_AFTER; + timeouts[cnt].delay_ms = DeadlockTimeout; + cnt++; + + enable_timeouts(timeouts, cnt); + } + + /* Wait to be signaled by the release of the Relation Lock */ + ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type); + + /* + * Exit if ltime is reached. Then all the backends holding conflicting + * locks will be canceled in the next ResolveRecoveryConflictWithLock() + * call. + */ + if (got_standby_lock_timeout) + goto cleanup; + + if (got_standby_deadlock_timeout) + { + VirtualTransactionId *backends; + + backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL); + + /* Quick exit if there's no work to be done */ + if (!VirtualTransactionIdIsValid(*backends)) + goto cleanup; + + /* + * Send signals to all the backends holding the conflicting locks, to + * ask them to check themselves for deadlocks. + */ + while (VirtualTransactionIdIsValid(*backends)) + { + SignalVirtualTransaction(*backends, + PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK, + false); + backends++; + } + + /* + * Exit if the recovery conflict has not been logged yet even though + * logging is enabled, so that the caller can log that. Then + * RecoveryConflictWithLock() is called again and we will wait again + * for the lock to be released. + */ + if (logging_conflict) + goto cleanup; + + /* + * Wait again here to be signaled by the release of the Relation Lock, + * to prevent the subsequent RecoveryConflictWithLock() from causing + * deadlock_timeout and sending a request for deadlocks check again. + * Otherwise the request continues to be sent every deadlock_timeout + * until the relation locks are released or ltime is reached. + */ + got_standby_deadlock_timeout = false; + ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type); + } + +cleanup: + + /* + * Clear any timeout requests established above. We assume here that the + * Startup process doesn't have any other outstanding timeouts than those + * used by this function. If that stops being true, we could cancel the + * timeouts individually, but that'd be slower. + */ + disable_all_timeouts(false); + got_standby_lock_timeout = false; + got_standby_deadlock_timeout = false; +} + +/* + * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup() + * to resolve conflicts with other backends holding buffer pins. + * + * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup() + * (when not InHotStandby) is performed here, for code clarity. + * + * We either resolve conflicts immediately or set a timeout to wake us at + * the limit of our patience. + * + * Resolve conflicts by sending a PROCSIG signal to all backends to check if + * they hold one of the buffer pins that is blocking Startup process. If so, + * those backends will take an appropriate error action, ERROR or FATAL. + * + * We also must check for deadlocks. 
Deadlocks occur because if queries
+ * wait on a lock, that must be behind an AccessExclusiveLock, which can only
+ * be cleared if the Startup process replays a transaction completion record.
+ * If the Startup process is also waiting, then that is a deadlock. The
+ * deadlock can occur if the query is waiting and then the Startup process
+ * sleeps, or if the Startup process is sleeping and the query waits on a
+ * lock. We protect against only the former sequence here; the latter
+ * sequence is checked before the query sleeps, in
+ * CheckRecoveryConflictDeadlock().
+ *
+ * Deadlocks are extremely rare, and relatively expensive to check for,
+ * so we don't do a deadlock check right away ... only if we have had to wait
+ * at least deadlock_timeout.
+ */
+void
+ResolveRecoveryConflictWithBufferPin(void)
+{
+ TimestampTz ltime;
+
+ Assert(InHotStandby);
+
+ ltime = GetStandbyLimitTime();
+
+ if (GetCurrentTimestamp() >= ltime && ltime != 0)
+ {
+ /*
+ * We're already behind, so clear a path as quickly as possible.
+ */
+ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+ }
+ else
+ {
+ /*
+ * Wake up at ltime, and check for deadlocks as well if we will be
+ * waiting longer than deadlock_timeout.
+ */
+ EnableTimeoutParams timeouts[2];
+ int cnt = 0;
+
+ if (ltime != 0)
+ {
+ timeouts[cnt].id = STANDBY_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AT;
+ timeouts[cnt].fin_time = ltime;
+ cnt++;
+ }
+
+ got_standby_deadlock_timeout = false;
+ timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AFTER;
+ timeouts[cnt].delay_ms = DeadlockTimeout;
+ cnt++;
+
+ enable_timeouts(timeouts, cnt);
+ }
+
+ /*
+ * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
+ * by one of the timeouts established above.
+ *
+ * We assume that only UnpinBuffer() and the timeout requests established
+ * above can wake us up here. WakeupRecovery(), called by the walreceiver,
+ * the SIGHUP signal handler, etc., cannot do that because it uses a
+ * different latch from the one ProcWaitForSignal() waits on.
+ */
+ ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
+
+ if (got_standby_delay_timeout)
+ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+ else if (got_standby_deadlock_timeout)
+ {
+ /*
+ * Send out a request for hot-standby backends to check themselves for
+ * deadlocks.
+ *
+ * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
+ * to be signaled by UnpinBuffer() again and send a request for a
+ * deadlock check if deadlock_timeout elapses. This causes the
+ * request to continue to be sent every deadlock_timeout until the
+ * buffer is unpinned or ltime is reached. This would increase the
+ * workload in the startup process and backends. In practice it may
+ * not be so harmful because the period during which the buffer stays
+ * pinned is usually not very long. But perhaps we should fix this?
+ */
+ SendRecoveryConflictWithBufferPin(
+ PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
+ }
+
+ /*
+ * Clear any timeout requests established above. We assume here that the
+ * Startup process doesn't have any other timeouts than what this function
+ * uses. If that stops being true, we could cancel the timeouts
+ * individually, but that'd be slower.
+ */ + disable_all_timeouts(false); + got_standby_delay_timeout = false; + got_standby_deadlock_timeout = false; +} + +static void +SendRecoveryConflictWithBufferPin(ProcSignalReason reason) +{ + Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN || + reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); + + /* + * We send signal to all backends to ask them if they are holding the + * buffer pin which is delaying the Startup process. We must not set the + * conflict flag yet, since most backends will be innocent. Let the + * SIGUSR1 handling in each backend decide their own fate. + */ + CancelDBBackends(InvalidOid, reason, false); +} + +/* + * In Hot Standby perform early deadlock detection. We abort the lock + * wait if we are about to sleep while holding the buffer pin that Startup + * process is waiting for. + * + * Note: this code is pessimistic, because there is no way for it to + * determine whether an actual deadlock condition is present: the lock we + * need to wait for might be unrelated to any held by the Startup process. + * Sooner or later, this mechanism should get ripped out in favor of somehow + * accounting for buffer locks in DeadLockCheck(). However, errors here + * seem to be very low-probability in practice, so for now it's not worth + * the trouble. + */ +void +CheckRecoveryConflictDeadlock(void) +{ + Assert(!InRecovery); /* do not call in Startup process */ + + if (!HoldingBufferPinThatDelaysRecovery()) + return; + + /* + * Error message should match ProcessInterrupts() but we avoid calling + * that because we aren't handling an interrupt at this point. Note that + * we only cancel the current transaction here, so if we are in a + * subtransaction and the pin is held by a parent, then the Startup + * process will continue to wait even though we have avoided deadlock. + */ + ereport(ERROR, + (errcode(ERRCODE_T_R_DEADLOCK_DETECTED), + errmsg("canceling statement due to conflict with recovery"), + errdetail("User transaction caused buffer deadlock with recovery."))); +} + + +/* -------------------------------- + * timeout handler routines + * -------------------------------- + */ + +/* + * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is + * exceeded. + */ +void +StandbyDeadLockHandler(void) +{ + got_standby_deadlock_timeout = true; +} + +/* + * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded. + */ +void +StandbyTimeoutHandler(void) +{ + got_standby_delay_timeout = true; +} + +/* + * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded. + */ +void +StandbyLockTimeoutHandler(void) +{ + got_standby_lock_timeout = true; +} + +/* + * ----------------------------------------------------- + * Locking in Recovery Mode + * ----------------------------------------------------- + * + * All locks are held by the Startup process using a single virtual + * transaction. This implementation is both simpler and in some senses, + * more correct. The locks held mean "some original transaction held + * this lock, so query access is not allowed at this time". So the Startup + * process is the proxy by which the original locks are implemented. + * + * We only keep track of AccessExclusiveLocks, which are only ever held by + * one transaction on one relation. + * + * We keep a hash table of lists of locks in local memory keyed by xid, + * RecoveryLockLists, so we can keep track of the various entries made by + * the Startup process's virtual xid in the shared lock table. 
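+ *
+ * (Editor's illustration, not part of the original comment: after replaying
+ * two AccessExclusiveLock records for transaction 1234 -- the OIDs here are
+ * made up -- the hash table conceptually holds one entry
+ *
+ *     xid = 1234, locks = [{1234, db 16384, rel 16385},
+ *                          {1234, db 16384, rel 16402}]
+ *
+ * i.e. a RecoveryLockListsEntry keyed by xid whose list contains one
+ * xl_standby_lock per lock taken on the primary.)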
+ * + * List elements use type xl_standby_lock, since the WAL record type exactly + * matches the information that we need to keep track of. + * + * We use session locks rather than normal locks so we don't need + * ResourceOwners. + */ + + +void +StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid) +{ + RecoveryLockListsEntry *entry; + xl_standby_lock *newlock; + LOCKTAG locktag; + bool found; + + /* Already processed? */ + if (!TransactionIdIsValid(xid) || + TransactionIdDidCommit(xid) || + TransactionIdDidAbort(xid)) + return; + + elog(trace_recovery(DEBUG4), + "adding recovery lock: db %u rel %u", dbOid, relOid); + + /* dbOid is InvalidOid when we are locking a shared relation. */ + Assert(OidIsValid(relOid)); + + /* Create a new list for this xid, if we don't have one already. */ + entry = hash_search(RecoveryLockLists, &xid, HASH_ENTER, &found); + if (!found) + { + entry->xid = xid; + entry->locks = NIL; + } + + newlock = palloc(sizeof(xl_standby_lock)); + newlock->xid = xid; + newlock->dbOid = dbOid; + newlock->relOid = relOid; + entry->locks = lappend(entry->locks, newlock); + + SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid); + + (void) LockAcquire(&locktag, AccessExclusiveLock, true, false); +} + +static void +StandbyReleaseLockList(List *locks) +{ + ListCell *lc; + + foreach(lc, locks) + { + xl_standby_lock *lock = (xl_standby_lock *) lfirst(lc); + LOCKTAG locktag; + + elog(trace_recovery(DEBUG4), + "releasing recovery lock: xid %u db %u rel %u", + lock->xid, lock->dbOid, lock->relOid); + SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid); + if (!LockRelease(&locktag, AccessExclusiveLock, true)) + { + elog(LOG, + "RecoveryLockLists contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u", + lock->xid, lock->dbOid, lock->relOid); + Assert(false); + } + } + + list_free_deep(locks); +} + +static void +StandbyReleaseLocks(TransactionId xid) +{ + RecoveryLockListsEntry *entry; + + if (TransactionIdIsValid(xid)) + { + if ((entry = hash_search(RecoveryLockLists, &xid, HASH_FIND, NULL))) + { + StandbyReleaseLockList(entry->locks); + hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL); + } + } + else + StandbyReleaseAllLocks(); +} + +/* + * Release locks for a transaction tree, starting at xid down, from + * RecoveryLockLists. + * + * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode, + * to remove any AccessExclusiveLocks requested by a transaction. + */ +void +StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids) +{ + int i; + + StandbyReleaseLocks(xid); + + for (i = 0; i < nsubxids; i++) + StandbyReleaseLocks(subxids[i]); +} + +/* + * Called at end of recovery and when we see a shutdown checkpoint. + */ +void +StandbyReleaseAllLocks(void) +{ + HASH_SEQ_STATUS status; + RecoveryLockListsEntry *entry; + + elog(trace_recovery(DEBUG2), "release all standby locks"); + + hash_seq_init(&status, RecoveryLockLists); + while ((entry = hash_seq_search(&status))) + { + StandbyReleaseLockList(entry->locks); + hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL); + } +} + +/* + * StandbyReleaseOldLocks + * Release standby locks held by top-level XIDs that aren't running, + * as long as they're not prepared transactions. 
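+ *
+ * (Editor's note, not part of the original comment: e.g. with oldxid = 1000,
+ * entries for xids 900 and 950 are released unless they belong to prepared
+ * transactions, while entries for xids 1000 and 1100 are kept.)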
+ */ +void +StandbyReleaseOldLocks(TransactionId oldxid) +{ + HASH_SEQ_STATUS status; + RecoveryLockListsEntry *entry; + + hash_seq_init(&status, RecoveryLockLists); + while ((entry = hash_seq_search(&status))) + { + Assert(TransactionIdIsValid(entry->xid)); + + /* Skip if prepared transaction. */ + if (StandbyTransactionIdIsPrepared(entry->xid)) + continue; + + /* Skip if >= oldxid. */ + if (!TransactionIdPrecedes(entry->xid, oldxid)) + continue; + + /* Remove all locks and hash table entry. */ + StandbyReleaseLockList(entry->locks); + hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL); + } +} + +/* + * -------------------------------------------------------------------- + * Recovery handling for Rmgr RM_STANDBY_ID + * + * These record types will only be created if XLogStandbyInfoActive() + * -------------------------------------------------------------------- + */ + +void +standby_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* Backup blocks are not used in standby records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + /* Do nothing if we're not in hot standby mode */ + if (standbyState == STANDBY_DISABLED) + return; + + if (info == XLOG_STANDBY_LOCK) + { + xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record); + int i; + + for (i = 0; i < xlrec->nlocks; i++) + StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid, + xlrec->locks[i].dbOid, + xlrec->locks[i].relOid); + } + else if (info == XLOG_RUNNING_XACTS) + { + xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record); + RunningTransactionsData running; + + running.xcnt = xlrec->xcnt; + running.subxcnt = xlrec->subxcnt; + running.subxid_overflow = xlrec->subxid_overflow; + running.nextXid = xlrec->nextXid; + running.latestCompletedXid = xlrec->latestCompletedXid; + running.oldestRunningXid = xlrec->oldestRunningXid; + running.xids = xlrec->xids; + + ProcArrayApplyRecoveryInfo(&running); + } + else if (info == XLOG_INVALIDATIONS) + { + xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record); + + ProcessCommittedInvalidationMessages(xlrec->msgs, + xlrec->nmsgs, + xlrec->relcacheInitFileInval, + xlrec->dbId, + xlrec->tsId); + } + else + elog(PANIC, "standby_redo: unknown op code %u", info); +} + +/* + * Log details of the current snapshot to WAL. This allows the snapshot state + * to be reconstructed on the standby and for logical decoding. + * + * This is used for Hot Standby as follows: + * + * We can move directly to STANDBY_SNAPSHOT_READY at startup if we + * start from a shutdown checkpoint because we know nothing was running + * at that time and our recovery snapshot is known empty. In the more + * typical case of an online checkpoint we need to jump through a few + * hoops to get a correct recovery snapshot and this requires a two or + * sometimes a three stage process. + * + * The initial snapshot must contain all running xids and all current + * AccessExclusiveLocks at a point in time on the standby. Assembling + * that information while the server is running requires many and + * various LWLocks, so we choose to derive that information piece by + * piece and then re-assemble that info on the standby. When that + * information is fully assembled we move to STANDBY_SNAPSHOT_READY. + * + * Since locking on the primary when we derive the information is not + * strict, we note that there is a time window between the derivation and + * writing to WAL of the derived information. 
That allows race conditions + * that we must resolve, since xids and locks may enter or leave the + * snapshot during that window. This creates the issue that an xid or + * lock may start *after* the snapshot has been derived yet *before* the + * snapshot is logged in the running xacts WAL record. We resolve this by + * starting to accumulate changes at a point just prior to when we derive + * the snapshot on the primary, then ignore duplicates when we later apply + * the snapshot from the running xacts record. This is implemented during + * CreateCheckpoint() where we use the logical checkpoint location as + * our starting point and then write the running xacts record immediately + * before writing the main checkpoint WAL record. Since we always start + * up from a checkpoint and are immediately at our starting point, we + * unconditionally move to STANDBY_INITIALIZED. After this point we + * must do 4 things: + * * move shared nextXid forwards as we see new xids + * * extend the clog and subtrans with each new xid + * * keep track of uncommitted known assigned xids + * * keep track of uncommitted AccessExclusiveLocks + * + * When we see a commit/abort we must remove known assigned xids and locks + * from the completing transaction. Attempted removals that cannot locate + * an entry are expected and must not cause an error when we are in state + * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and + * KnownAssignedXidsRemove(). + * + * Later, when we apply the running xact data we must be careful to ignore + * transactions already committed, since those commits raced ahead when + * making WAL entries. + * + * The loose timing also means that locks may be recorded that have a + * zero xid, since xids are removed from procs before locks are removed. + * So we must prune the lock list down to ensure we hold locks only for + * currently running xids, performed by StandbyReleaseOldLocks(). + * Zero xids should no longer be possible, but we may be replaying WAL + * from a time when they were possible. + * + * For logical decoding only the running xacts information is needed; + * there's no need to look at the locking information, but it's logged anyway, + * as there's no independent knob to just enable logical decoding. For + * details of how this is used, check snapbuild.c's introductory comment. + * + * + * Returns the RecPtr of the last inserted record. + */ +XLogRecPtr +LogStandbySnapshot(void) +{ + XLogRecPtr recptr; + RunningTransactions running; + xl_standby_lock *locks; + int nlocks; + + Assert(XLogStandbyInfoActive()); + + /* + * Get details of any AccessExclusiveLocks being held at the moment. + */ + locks = GetRunningTransactionLocks(&nlocks); + if (nlocks > 0) + LogAccessExclusiveLocks(nlocks, locks); + pfree(locks); + + /* + * Log details of all in-progress transactions. This should be the last + * record we write, because standby will open up when it sees this. + */ + running = GetRunningTransactionData(); + + /* + * GetRunningTransactionData() acquired ProcArrayLock, we must release it. + * For Hot Standby this can be done before inserting the WAL record + * because ProcArrayApplyRecoveryInfo() rechecks the commit status using + * the clog. For logical decoding, though, the lock can't be released + * early because the clog might be "in the future" from the POV of the + * historic snapshot. 
This would allow for situations where we're waiting + * for the end of a transaction listed in the xl_running_xacts record + * which, according to the WAL, has committed before the xl_running_xacts + * record. Fortunately this routine isn't executed frequently, and it's + * only a shared lock. + */ + if (wal_level < WAL_LEVEL_LOGICAL) + LWLockRelease(ProcArrayLock); + + recptr = LogCurrentRunningXacts(running); + + /* Release lock if we kept it longer ... */ + if (wal_level >= WAL_LEVEL_LOGICAL) + LWLockRelease(ProcArrayLock); + + /* GetRunningTransactionData() acquired XidGenLock, we must release it */ + LWLockRelease(XidGenLock); + + return recptr; +} + +/* + * Record an enhanced snapshot of running transactions into WAL. + * + * The definitions of RunningTransactionsData and xl_xact_running_xacts are + * similar. We keep them separate because xl_xact_running_xacts is a + * contiguous chunk of memory and never exists fully until it is assembled in + * WAL. The inserted records are marked as not being important for durability, + * to avoid triggering superfluous checkpoint / archiving activity. + */ +static XLogRecPtr +LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) +{ + xl_running_xacts xlrec; + XLogRecPtr recptr; + + xlrec.xcnt = CurrRunningXacts->xcnt; + xlrec.subxcnt = CurrRunningXacts->subxcnt; + xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow; + xlrec.nextXid = CurrRunningXacts->nextXid; + xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid; + xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid; + + /* Header */ + XLogBeginInsert(); + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); + XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts); + + /* array of TransactionIds */ + if (xlrec.xcnt > 0) + XLogRegisterData((char *) CurrRunningXacts->xids, + (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId)); + + recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS); + + if (CurrRunningXacts->subxid_overflow) + elog(trace_recovery(DEBUG2), + "snapshot of %u running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)", + CurrRunningXacts->xcnt, + LSN_FORMAT_ARGS(recptr), + CurrRunningXacts->oldestRunningXid, + CurrRunningXacts->latestCompletedXid, + CurrRunningXacts->nextXid); + else + elog(trace_recovery(DEBUG2), + "snapshot of %u+%u running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)", + CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt, + LSN_FORMAT_ARGS(recptr), + CurrRunningXacts->oldestRunningXid, + CurrRunningXacts->latestCompletedXid, + CurrRunningXacts->nextXid); + + /* + * Ensure running_xacts information is synced to disk not too far in the + * future. We don't want to stall anything though (i.e. use XLogFlush()), + * so we let the wal writer do it during normal operation. + * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced + * and nudge the WALWriter into action if sleeping. Check + * XLogBackgroundFlush() for details why a record might not be flushed + * without it. + */ + XLogSetAsyncXactLSN(recptr); + + return recptr; +} + +/* + * Wholesale logging of AccessExclusiveLocks. Other lock types need not be + * logged, as described in backend/storage/lmgr/README. 
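+ *
+ * (Editor's illustration, not part of the original comment: a record for two
+ * locks is laid out as an xl_standby_locks header with nlocks = 2 followed
+ * by two xl_standby_lock structs, which is exactly what the two
+ * XLogRegisterData() calls below register.)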
+ */ +static void +LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks) +{ + xl_standby_locks xlrec; + + xlrec.nlocks = nlocks; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks)); + XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock)); + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); + + (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK); +} + +/* + * Individual logging of AccessExclusiveLocks for use during LockAcquire() + */ +void +LogAccessExclusiveLock(Oid dbOid, Oid relOid) +{ + xl_standby_lock xlrec; + + xlrec.xid = GetCurrentTransactionId(); + + xlrec.dbOid = dbOid; + xlrec.relOid = relOid; + + LogAccessExclusiveLocks(1, &xlrec); + MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK; +} + +/* + * Prepare to log an AccessExclusiveLock, for use during LockAcquire() + */ +void +LogAccessExclusiveLockPrepare(void) +{ + /* + * Ensure that a TransactionId has been assigned to this transaction, for + * two reasons, both related to lock release on the standby. First, we + * must assign an xid so that RecordTransactionCommit() and + * RecordTransactionAbort() do not optimise away the transaction + * completion record which recovery relies upon to release locks. It's a + * hack, but for a corner case not worth adding code for into the main + * commit path. Second, we must assign an xid before the lock is recorded + * in shared memory, otherwise a concurrently executing + * GetRunningTransactionLocks() might see a lock associated with an + * InvalidTransactionId which we later assert cannot happen. + */ + (void) GetCurrentTransactionId(); +} + +/* + * Emit WAL for invalidations. This currently is only used for commits without + * an xid but which contain invalidations. + */ +void +LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs, + bool relcacheInitFileInval) +{ + xl_invalidations xlrec; + + /* prepare record */ + memset(&xlrec, 0, sizeof(xlrec)); + xlrec.dbId = MyDatabaseId; + xlrec.tsId = MyDatabaseTableSpace; + xlrec.relcacheInitFileInval = relcacheInitFileInval; + xlrec.nmsgs = nmsgs; + + /* perform insertion */ + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), MinSizeOfInvalidations); + XLogRegisterData((char *) msgs, + nmsgs * sizeof(SharedInvalidationMessage)); + XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS); +} + +/* Return the description of recovery conflict */ +static const char * +get_recovery_conflict_desc(ProcSignalReason reason) +{ + const char *reasonDesc = _("unknown reason"); + + switch (reason) + { + case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + reasonDesc = _("recovery conflict on buffer pin"); + break; + case PROCSIG_RECOVERY_CONFLICT_LOCK: + reasonDesc = _("recovery conflict on lock"); + break; + case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: + reasonDesc = _("recovery conflict on tablespace"); + break; + case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + reasonDesc = _("recovery conflict on snapshot"); + break; + case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + reasonDesc = _("recovery conflict on buffer deadlock"); + break; + case PROCSIG_RECOVERY_CONFLICT_DATABASE: + reasonDesc = _("recovery conflict on database"); + break; + default: + break; + } + + return reasonDesc; +} |