author      Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:15:05 +0000
committer   Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:15:05 +0000
commit      46651ce6fe013220ed397add242004d764fc0153 (patch)
tree        6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/storage
parent      Initial commit. (diff)
Adding upstream version 14.5. (upstream/14.5, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/storage')
-rw-r--r--  src/backend/storage/Makefile | 13
-rw-r--r--  src/backend/storage/buffer/Makefile | 22
-rw-r--r--  src/backend/storage/buffer/README | 276
-rw-r--r--  src/backend/storage/buffer/buf_init.c | 181
-rw-r--r--  src/backend/storage/buffer/buf_table.c | 162
-rw-r--r--  src/backend/storage/buffer/bufmgr.c | 4892
-rw-r--r--  src/backend/storage/buffer/freelist.c | 704
-rw-r--r--  src/backend/storage/buffer/localbuf.c | 596
-rw-r--r--  src/backend/storage/file/Makefile | 22
-rw-r--r--  src/backend/storage/file/buffile.c | 949
-rw-r--r--  src/backend/storage/file/copydir.c | 226
-rw-r--r--  src/backend/storage/file/fd.c | 3789
-rw-r--r--  src/backend/storage/file/reinit.c | 410
-rw-r--r--  src/backend/storage/file/sharedfileset.c | 354
-rw-r--r--  src/backend/storage/freespace/Makefile | 20
-rw-r--r--  src/backend/storage/freespace/README | 196
-rw-r--r--  src/backend/storage/freespace/freespace.c | 893
-rw-r--r--  src/backend/storage/freespace/fsmpage.c | 374
-rw-r--r--  src/backend/storage/freespace/indexfsm.c | 74
-rw-r--r--  src/backend/storage/ipc/Makefile | 30
-rw-r--r--  src/backend/storage/ipc/barrier.c | 333
-rw-r--r--  src/backend/storage/ipc/dsm.c | 1248
-rw-r--r--  src/backend/storage/ipc/dsm_impl.c | 1058
-rw-r--r--  src/backend/storage/ipc/ipc.c | 435
-rw-r--r--  src/backend/storage/ipc/ipci.c | 291
-rw-r--r--  src/backend/storage/ipc/latch.c | 2158
-rw-r--r--  src/backend/storage/ipc/pmsignal.c | 430
-rw-r--r--  src/backend/storage/ipc/procarray.c | 5220
-rw-r--r--  src/backend/storage/ipc/procsignal.c | 685
-rw-r--r--  src/backend/storage/ipc/shm_mq.c | 1288
-rw-r--r--  src/backend/storage/ipc/shm_toc.c | 272
-rw-r--r--  src/backend/storage/ipc/shmem.c | 611
-rw-r--r--  src/backend/storage/ipc/shmqueue.c | 190
-rw-r--r--  src/backend/storage/ipc/signalfuncs.c | 300
-rw-r--r--  src/backend/storage/ipc/sinval.c | 205
-rw-r--r--  src/backend/storage/ipc/sinvaladt.c | 777
-rw-r--r--  src/backend/storage/ipc/standby.c | 1450
-rw-r--r--  src/backend/storage/large_object/Makefile | 18
-rw-r--r--  src/backend/storage/large_object/inv_api.c | 955
-rw-r--r--  src/backend/storage/lmgr/.gitignore | 2
-rw-r--r--  src/backend/storage/lmgr/Makefile | 51
-rw-r--r--  src/backend/storage/lmgr/README | 739
-rw-r--r--  src/backend/storage/lmgr/README-SSI | 646
-rw-r--r--  src/backend/storage/lmgr/README.barrier | 197
-rw-r--r--  src/backend/storage/lmgr/condition_variable.c | 364
-rw-r--r--  src/backend/storage/lmgr/deadlock.c | 1177
-rw-r--r--  src/backend/storage/lmgr/generate-lwlocknames.pl | 71
-rw-r--r--  src/backend/storage/lmgr/lmgr.c | 1196
-rw-r--r--  src/backend/storage/lmgr/lock.c | 4738
-rw-r--r--  src/backend/storage/lmgr/lwlock.c | 1977
-rw-r--r--  src/backend/storage/lmgr/lwlocknames.c | 52
-rw-r--r--  src/backend/storage/lmgr/lwlocknames.h | 50
-rw-r--r--  src/backend/storage/lmgr/lwlocknames.txt | 55
-rw-r--r--  src/backend/storage/lmgr/predicate.c | 5203
-rw-r--r--  src/backend/storage/lmgr/proc.c | 2012
-rw-r--r--  src/backend/storage/lmgr/s_lock.c | 377
-rw-r--r--  src/backend/storage/lmgr/spin.c | 180
-rw-r--r--  src/backend/storage/page/Makefile | 23
-rw-r--r--  src/backend/storage/page/README | 64
-rw-r--r--  src/backend/storage/page/bufpage.c | 1539
-rw-r--r--  src/backend/storage/page/checksum.c | 22
-rw-r--r--  src/backend/storage/page/itemptr.c | 132
-rw-r--r--  src/backend/storage/smgr/Makefile | 19
-rw-r--r--  src/backend/storage/smgr/README | 52
-rw-r--r--  src/backend/storage/smgr/md.c | 1409
-rw-r--r--  src/backend/storage/smgr/smgr.c | 695
-rw-r--r--  src/backend/storage/sync/Makefile | 18
-rw-r--r--  src/backend/storage/sync/sync.c | 651
68 files changed, 55818 insertions, 0 deletions
diff --git a/src/backend/storage/Makefile b/src/backend/storage/Makefile
new file mode 100644
index 0000000..8376cdf
--- /dev/null
+++ b/src/backend/storage/Makefile
@@ -0,0 +1,13 @@
+#
+# Makefile for the storage manager subsystem
+#
+# src/backend/storage/Makefile
+#
+
+subdir = src/backend/storage
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+SUBDIRS = buffer file freespace ipc large_object lmgr page smgr sync
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile
new file mode 100644
index 0000000..fd7c40d
--- /dev/null
+++ b/src/backend/storage/buffer/Makefile
@@ -0,0 +1,22 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/buffer
+#
+# IDENTIFICATION
+# src/backend/storage/buffer/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/buffer
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ buf_init.o \
+ buf_table.o \
+ bufmgr.o \
+ freelist.o \
+ localbuf.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
new file mode 100644
index 0000000..a775276
--- /dev/null
+++ b/src/backend/storage/buffer/README
@@ -0,0 +1,276 @@
+src/backend/storage/buffer/README
+
+Notes About Shared Buffer Access Rules
+======================================
+
+There are two separate access control mechanisms for shared disk buffers:
+reference counts (a/k/a pin counts) and buffer content locks. (Actually,
+there's a third level of access control: one must hold the appropriate kind
+of lock on a relation before one can legally access any page belonging to
+the relation. Relation-level locks are not discussed here.)
+
+Pins: one must "hold a pin on" a buffer (increment its reference count)
+before being allowed to do anything at all with it. An unpinned buffer is
+subject to being reclaimed and reused for a different page at any instant,
+so touching it is unsafe. Normally a pin is acquired via ReadBuffer and
+released via ReleaseBuffer. It is OK and indeed common for a single
+backend to pin a page more than once concurrently; the buffer manager
+handles this efficiently. It is considered OK to hold a pin for long
+intervals --- for example, sequential scans hold a pin on the current page
+until done processing all the tuples on the page, which could be quite a
+while if the scan is the outer scan of a join. Similarly, a btree index
+scan may hold a pin on the current index page. This is OK because normal
+operations never wait for a page's pin count to drop to zero. (Anything
+that might need to do such a wait is instead handled by waiting to obtain
+the relation-level lock, which is why you'd better hold one first.) Pins
+may not be held across transaction boundaries, however.
+
+Buffer content locks: there are two kinds of buffer lock, shared and exclusive,
+which act just as you'd expect: multiple backends can hold shared locks on
+the same buffer, but an exclusive lock prevents anyone else from holding
+either shared or exclusive lock. (These can alternatively be called READ
+and WRITE locks.) These locks are intended to be short-term: they should not
+be held for long. Buffer locks are acquired and released by LockBuffer().
+It will *not* work for a single backend to try to acquire multiple locks on
+the same buffer. One must pin a buffer before trying to lock it.
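
Illustrative sketch (not from the patch; assumes the usual bufmgr.h and
rel.h includes, and that the caller already holds a relation-level lock on
rel) of the pin-then-lock discipline described above:

    Buffer      buf = ReadBuffer(rel, blkno);   /* take the pin */

    LockBuffer(buf, BUFFER_LOCK_SHARE);         /* short-term content lock */
    /* ... examine tuples via BufferGetPage(buf) ... */
    UnlockReleaseBuffer(buf);                   /* drop the lock, then the pin */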
+
+Buffer access rules:
+
+1. To scan a page for tuples, one must hold a pin and either shared or
+exclusive content lock. To examine the commit status (XIDs and status bits)
+of a tuple in a shared buffer, one must likewise hold a pin and either shared
+or exclusive lock.
+
+2. Once one has determined that a tuple is interesting (visible to the
+current transaction) one may drop the content lock, yet continue to access
+the tuple's data for as long as one holds the buffer pin. This is what is
+typically done by heap scans, since the tuple returned by heap_fetch
+contains a pointer to tuple data in the shared buffer. Therefore the
+tuple cannot go away while the pin is held (see rule #5). Its state could
+change, but that is assumed not to matter after the initial determination
+of visibility is made.
+
+3. To add a tuple or change the xmin/xmax fields of an existing tuple,
+one must hold a pin and an exclusive content lock on the containing buffer.
+This ensures that no one else might see a partially-updated state of the
+tuple while they are doing visibility checks.
+
+4. It is considered OK to update tuple commit status bits (ie, OR the
+values HEAP_XMIN_COMMITTED, HEAP_XMIN_INVALID, HEAP_XMAX_COMMITTED, or
+HEAP_XMAX_INVALID into t_infomask) while holding only a shared lock and
+pin on a buffer. This is OK because another backend looking at the tuple
+at about the same time would OR the same bits into the field, so there
+is little or no risk of conflicting update; what's more, if there did
+manage to be a conflict it would merely mean that one bit-update would
+be lost and need to be done again later. These four bits are only hints
+(they cache the results of transaction status lookups in pg_xact), so no
+great harm is done if they get reset to zero by conflicting updates.
+Note, however, that a tuple is frozen by setting both HEAP_XMIN_INVALID
+and HEAP_XMIN_COMMITTED; this is a critical update and accordingly requires
+an exclusive buffer lock (and it must also be WAL-logged).
+
+5. To physically remove a tuple or compact free space on a page, one
+must hold a pin and an exclusive lock, *and* observe while holding the
+exclusive lock that the buffer's shared reference count is one (ie,
+no other backend holds a pin). If these conditions are met then no other
+backend can perform a page scan until the exclusive lock is dropped, and
+no other backend can be holding a reference to an existing tuple that it
+might expect to examine again. Note that another backend might pin the
+buffer (increment the refcount) while one is performing the cleanup, but
+it won't be able to actually examine the page until it acquires shared
+or exclusive content lock.
+
+
+Obtaining the lock needed under rule #5 is done by the bufmgr routines
+LockBufferForCleanup() or ConditionalLockBufferForCleanup(). They first get
+an exclusive lock and then check to see if the shared pin count is currently
+1. If not, ConditionalLockBufferForCleanup() releases the exclusive lock and
+then returns false, while LockBufferForCleanup() releases the exclusive lock
+(but not the caller's pin) and waits until signaled by another backend,
+whereupon it tries again. The signal will occur when UnpinBuffer decrements
+the shared pin count to 1. As indicated above, this operation might have to
+wait a good while before it acquires the lock, but that shouldn't matter much
+for concurrent VACUUM. The current implementation only supports a single
+waiter for pin-count-1 on any particular shared buffer. This is enough for
+VACUUM's use, since we don't allow multiple VACUUMs concurrently on a single
+relation anyway. Anyone wishing to obtain a cleanup lock outside of recovery
+or a VACUUM must use the conditional variant of the function.
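
Illustrative sketch (not from the patch) of the opportunistic cleanup-lock
pattern described above, as a VACUUM-like caller might use it; vac_strategy
is a hypothetical BufferAccessStrategy supplied by the caller:

    Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                         RBM_NORMAL, vac_strategy);

    if (ConditionalLockBufferForCleanup(buf))
    {
        /* pin count is 1 and we hold exclusive lock: safe to prune/compact */
        UnlockReleaseBuffer(buf);
    }
    else
        ReleaseBuffer(buf);             /* someone else holds a pin; skip page */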
+
+
+Buffer Manager's Internal Locking
+---------------------------------
+
+Before PostgreSQL 8.1, all operations of the shared buffer manager itself
+were protected by a single system-wide lock, the BufMgrLock, which
+unsurprisingly proved to be a source of contention. The new locking scheme
+avoids grabbing system-wide exclusive locks in common code paths. It works
+like this:
+
+* There is a system-wide LWLock, the BufMappingLock, that notionally
+protects the mapping from buffer tags (page identifiers) to buffers.
+(Physically, it can be thought of as protecting the hash table maintained
+by buf_table.c.) To look up whether a buffer exists for a tag, it is
+sufficient to obtain share lock on the BufMappingLock. Note that one
+must pin the found buffer, if any, before releasing the BufMappingLock.
+To alter the page assignment of any buffer, one must hold exclusive lock
+on the BufMappingLock. This lock must be held across adjusting the buffer's
+header fields and changing the buf_table hash table. The only common
+operation that needs exclusive lock is reading in a page that was not
+in shared buffers already, which will require at least a kernel call
+and usually a wait for I/O, so it will be slow anyway.
+
+* As of PG 8.2, the BufMappingLock has been split into NUM_BUFFER_PARTITIONS
+separate locks, each guarding a portion of the buffer tag space. This allows
+further reduction of contention in the normal code paths. The partition
+that a particular buffer tag belongs to is determined from the low-order
+bits of the tag's hash value. The rules stated above apply to each partition
+independently. If it is necessary to lock more than one partition at a time,
+they must be locked in partition-number order to avoid risk of deadlock.
+
+* A separate system-wide spinlock, buffer_strategy_lock, provides mutual
+exclusion for operations that access the buffer free list or select
+buffers for replacement. A spinlock is used here rather than a lightweight
+lock for efficiency; no other locks of any sort should be acquired while
+buffer_strategy_lock is held. This is essential to allow buffer replacement
+to happen in multiple backends with reasonable concurrency.
+
+* Each buffer header contains a spinlock that must be taken when examining
+or changing fields of that buffer header. This allows operations such as
+ReleaseBuffer to make local state changes without taking any system-wide
+lock. We use a spinlock, not an LWLock, since there are no cases where
+the lock needs to be held for more than a few instructions.
+
+Note that a buffer header's spinlock does not control access to the data
+held within the buffer. Each buffer header also contains an LWLock, the
+"buffer content lock", that *does* represent the right to access the data
+in the buffer. It is used per the rules above.
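
Illustrative sketch (not from the patch) of the header-spinlock pattern used
throughout bufmgr.c (see e.g. ReadRecentBuffer() later in this patch):

    uint32      buf_state = LockBufHdr(bufHdr);     /* spinlocks the header */

    if (buf_state & BM_DIRTY)
    {
        /* examine or adjust header fields only; never touch page data here */
    }
    UnlockBufHdr(bufHdr, buf_state);    /* writes back the state, drops spinlock */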
+
+* The BM_IO_IN_PROGRESS flag acts as a kind of lock, used to wait for I/O on a
+buffer to complete (and in releases before 14, it was accompanied by a
+per-buffer LWLock). The process doing a read or write sets the flag for the
+duration, and processes that need to wait for it to be cleared sleep on a
+condition variable.
+
+
+Normal Buffer Replacement Strategy
+----------------------------------
+
+There is a "free list" of buffers that are prime candidates for replacement.
+In particular, buffers that are completely free (contain no valid page) are
+always in this list. We could also throw buffers into this list if we
+consider their pages unlikely to be needed soon; however, the current
+algorithm never does that. The list is singly-linked using fields in the
+buffer headers; we maintain head and tail pointers in global variables.
+(Note: although the list links are in the buffer headers, they are
+considered to be protected by the buffer_strategy_lock, not the buffer-header
+spinlocks.) To choose a victim buffer to recycle when there are no free
+buffers available, we use a simple clock-sweep algorithm, which avoids the
+need to take system-wide locks during common operations. It works like
+this:
+
+Each buffer header contains a usage counter, which is incremented (up to a
+small limit value) whenever the buffer is pinned. (This requires only the
+buffer header spinlock, which would have to be taken anyway to increment the
+buffer reference count, so it's nearly free.)
+
+The "clock hand" is a buffer index, nextVictimBuffer, that moves circularly
+through all the available buffers. nextVictimBuffer is protected by the
+buffer_strategy_lock.
+
+The algorithm for a process that needs to obtain a victim buffer is:
+
+1. Obtain buffer_strategy_lock.
+
+2. If buffer free list is nonempty, remove its head buffer. Release
+buffer_strategy_lock. If the buffer is pinned or has a nonzero usage count,
+it cannot be used; ignore it and go back to step 1. Otherwise, pin the buffer,
+and return it.
+
+3. Otherwise, the buffer free list is empty. Select the buffer pointed to by
+nextVictimBuffer, and circularly advance nextVictimBuffer for next time.
+Release buffer_strategy_lock.
+
+4. If the selected buffer is pinned or has a nonzero usage count, it cannot
+be used. Decrement its usage count (if nonzero), reacquire
+buffer_strategy_lock, and return to step 3 to examine the next buffer.
+
+5. Pin the selected buffer, and return.
+
+(Note that if the selected buffer is dirty, we will have to write it out
+before we can recycle it; if someone else pins the buffer meanwhile we will
+have to give up and try another buffer. This however is not a concern
+of the basic select-a-victim-buffer algorithm.)
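
A simplified sketch of the victim-selection loop above (not from the patch;
the real logic, including free-list handling, the buffer_strategy_lock dance
and bgwriter wakeup, is StrategyGetBuffer() in freelist.c, and
next_victim_index() here is a hypothetical stand-in for advancing
nextVictimBuffer):

    static BufferDesc *
    clock_sweep_sketch(void)
    {
        for (;;)
        {
            BufferDesc *buf = GetBufferDescriptor(next_victim_index());
            uint32      buf_state = LockBufHdr(buf);

            if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
                BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
                return buf;             /* header still locked; caller pins it */

            if (BUF_STATE_GET_REFCOUNT(buf_state) == 0)
                buf_state -= BUF_USAGECOUNT_ONE;    /* step 4: age it */
            UnlockBufHdr(buf, buf_state);
        }
    }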
+
+
+Buffer Ring Replacement Strategy
+---------------------------------
+
+When running a query that needs to access a large number of pages just once,
+such as VACUUM or a large sequential scan, a different strategy is used.
+A page that has been touched only by such a scan is unlikely to be needed
+again soon, so instead of running the normal clock sweep algorithm and
+blowing out the entire buffer cache, a small ring of buffers is allocated
+using the normal clock sweep algorithm and those buffers are reused for the
+whole scan. This also implies that much of the write traffic caused by such
+a statement will be done by the backend itself and not pushed off onto other
+processes.
+
+For sequential scans, a 256KB ring is used. That's small enough to fit in L2
+cache, which makes transferring pages from OS cache to shared buffer cache
+efficient. Even less would often be enough, but the ring must be big enough
+to accommodate all pages in the scan that are pinned concurrently. 256KB
+should also be enough to leave a small cache trail for other backends to
+join in a synchronized seq scan. If a ring buffer is dirtied and its LSN
+updated, we would normally have to write and flush WAL before we could
+re-use the buffer; in this case we instead discard the buffer from the ring
+and (later) choose a replacement using the normal clock-sweep algorithm.
+Hence this strategy works best for scans that are read-only (or at worst
+update hint bits). In a scan that modifies every page in the scan, like a
+bulk UPDATE or DELETE, the buffers in the ring will always be dirtied and
+the ring strategy effectively degrades to the normal strategy.
+
+VACUUM uses a 256KB ring like sequential scans, but dirty pages are not
+removed from the ring. Instead, WAL is flushed if needed to allow reuse of
+the buffers. Before introducing the buffer ring strategy in 8.3, VACUUM's
+buffers were sent to the freelist, which was effectively a buffer ring of 1
+buffer, resulting in excessive WAL flushing. Allowing VACUUM to update
+256KB between WAL flushes should be more efficient.
+
+Bulk writes work similarly to VACUUM. Currently this applies only to
+COPY IN and CREATE TABLE AS SELECT. (Might it be interesting to make
+seqscan UPDATE and DELETE use the bulkwrite strategy?) For bulk writes
+we use a ring size of 16MB (but not more than 1/8th of shared_buffers).
+Smaller sizes have been shown to result in the COPY blocking too often
+for WAL flushes. While it's okay for a background vacuum to be slowed by
+doing its own WAL flushing, we'd prefer that COPY not be subject to that,
+so we let it use up a bit more of the buffer arena.
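
Illustrative sketch (not from the patch) of how a caller opts into one of
these rings by passing a strategy object to ReadBufferExtended(); rel and
nblocks are assumed to come from the caller:

    BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
    BlockNumber blkno;

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                             RBM_NORMAL, strategy);

        /* ... process the page ... */
        ReleaseBuffer(buf);
    }
    FreeAccessStrategy(strategy);

(BAS_VACUUM and BAS_BULKWRITE select the other ring sizes described above.)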
+
+
+Background Writer's Processing
+------------------------------
+
+The background writer is designed to write out pages that are likely to be
+recycled soon, thereby offloading the writing work from active backends.
+To do this, it scans forward circularly from the current position of
+nextVictimBuffer (which it does not change!), looking for buffers that are
+dirty and not pinned nor marked with a positive usage count. It pins,
+writes, and releases any such buffer.
+
+If we can assume that reading nextVictimBuffer is an atomic action, then
+the writer doesn't even need to take buffer_strategy_lock in order to look
+for buffers to write; it needs only to spinlock each buffer header for long
+enough to check the dirtybit. Even without that assumption, the writer
+only needs to take the lock long enough to read the variable value, not
+while scanning the buffers. (This is a very substantial improvement in
+the contention cost of the writer compared to PG 8.0.)
+
+The background writer takes shared content lock on a buffer while writing it
+out (and anyone else who flushes buffer contents to disk must do so too).
+This ensures that the page image transferred to disk is reasonably consistent.
+We might miss a hint-bit update or two but that isn't a problem, for the same
+reasons mentioned under buffer access rules.
+
+As of 8.4, background writer starts during recovery mode when there is
+some form of potentially extended recovery to perform. It performs an
+identical service to normal processing, except that checkpoints it
+writes are technically restartpoints.
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
new file mode 100644
index 0000000..a299be1
--- /dev/null
+++ b/src/backend/storage/buffer/buf_init.c
@@ -0,0 +1,181 @@
+/*-------------------------------------------------------------------------
+ *
+ * buf_init.c
+ * buffer manager initialization routines
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/buffer/buf_init.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+
+BufferDescPadded *BufferDescriptors;
+char *BufferBlocks;
+ConditionVariableMinimallyPadded *BufferIOCVArray;
+WritebackContext BackendWritebackContext;
+CkptSortItem *CkptBufferIds;
+
+
+/*
+ * Data Structures:
+ * buffers live in a freelist and a lookup data structure.
+ *
+ *
+ * Buffer Lookup:
+ * Two important notes. First, the buffer has to be
+ * available for lookup BEFORE an IO begins. Otherwise
+ * a second process trying to read the buffer will
+ * allocate its own copy and the buffer pool will
+ * become inconsistent.
+ *
+ * Buffer Replacement:
+ * see freelist.c. A buffer cannot be replaced while in
+ * use either by data manager or during IO.
+ *
+ *
+ * Synchronization/Locking:
+ *
+ * IO_IN_PROGRESS -- this is a flag in the buffer descriptor.
+ * It must be set when an IO is initiated and cleared at
+ * the end of the IO. It is there to make sure that one
+ * process doesn't start to use a buffer while another is
+ * faulting it in. see WaitIO and related routines.
+ *
+ * refcount -- Counts the number of processes holding pins on a buffer.
+ * A buffer is pinned during IO and immediately after a BufferAlloc().
+ * Pins must be released before end of transaction. For efficiency the
+ * shared refcount isn't increased if an individual backend pins a buffer
+ * multiple times. Check the PrivateRefCount infrastructure in bufmgr.c.
+ */
+
+
+/*
+ * Initialize shared buffer pool
+ *
+ * This is called once during shared-memory initialization (either in the
+ * postmaster, or in a standalone backend).
+ */
+void
+InitBufferPool(void)
+{
+ bool foundBufs,
+ foundDescs,
+ foundIOCV,
+ foundBufCkpt;
+
+ /* Align descriptors to a cacheline boundary. */
+ BufferDescriptors = (BufferDescPadded *)
+ ShmemInitStruct("Buffer Descriptors",
+ NBuffers * sizeof(BufferDescPadded),
+ &foundDescs);
+
+ BufferBlocks = (char *)
+ ShmemInitStruct("Buffer Blocks",
+ NBuffers * (Size) BLCKSZ, &foundBufs);
+
+ /* Align condition variables to cacheline boundary. */
+ BufferIOCVArray = (ConditionVariableMinimallyPadded *)
+ ShmemInitStruct("Buffer IO Condition Variables",
+ NBuffers * sizeof(ConditionVariableMinimallyPadded),
+ &foundIOCV);
+
+ /*
+ * The array used to sort to-be-checkpointed buffer ids is located in
+ * shared memory, to avoid having to allocate significant amounts of
+ * memory at runtime. As that'd be in the middle of a checkpoint, or when
+ * the checkpointer is restarted, memory allocation failures would be
+ * painful.
+ */
+ CkptBufferIds = (CkptSortItem *)
+ ShmemInitStruct("Checkpoint BufferIds",
+ NBuffers * sizeof(CkptSortItem), &foundBufCkpt);
+
+ if (foundDescs || foundBufs || foundIOCV || foundBufCkpt)
+ {
+ /* should find all of these, or none of them */
+ Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt);
+ /* note: this path is only taken in EXEC_BACKEND case */
+ }
+ else
+ {
+ int i;
+
+ /*
+ * Initialize all the buffer headers.
+ */
+ for (i = 0; i < NBuffers; i++)
+ {
+ BufferDesc *buf = GetBufferDescriptor(i);
+
+ CLEAR_BUFFERTAG(buf->tag);
+
+ pg_atomic_init_u32(&buf->state, 0);
+ buf->wait_backend_pid = 0;
+
+ buf->buf_id = i;
+
+ /*
+ * Initially link all the buffers together as unused. Subsequent
+ * management of this list is done by freelist.c.
+ */
+ buf->freeNext = i + 1;
+
+ LWLockInitialize(BufferDescriptorGetContentLock(buf),
+ LWTRANCHE_BUFFER_CONTENT);
+
+ ConditionVariableInit(BufferDescriptorGetIOCV(buf));
+ }
+
+ /* Correct last entry of linked list */
+ GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;
+ }
+
+ /* Init other shared buffer-management stuff */
+ StrategyInitialize(!foundDescs);
+
+ /* Initialize per-backend file flush context */
+ WritebackContextInit(&BackendWritebackContext,
+ &backend_flush_after);
+}
+
+/*
+ * BufferShmemSize
+ *
+ * compute the size of shared memory for the buffer pool including
+ * data pages, buffer descriptors, hash tables, etc.
+ */
+Size
+BufferShmemSize(void)
+{
+ Size size = 0;
+
+ /* size of buffer descriptors */
+ size = add_size(size, mul_size(NBuffers, sizeof(BufferDescPadded)));
+ /* to allow aligning buffer descriptors */
+ size = add_size(size, PG_CACHE_LINE_SIZE);
+
+ /* size of data pages */
+ size = add_size(size, mul_size(NBuffers, BLCKSZ));
+
+ /* size of stuff controlled by freelist.c */
+ size = add_size(size, StrategyShmemSize());
+
+ /* size of I/O condition variables */
+ size = add_size(size, mul_size(NBuffers,
+ sizeof(ConditionVariableMinimallyPadded)));
+ /* to allow aligning the above */
+ size = add_size(size, PG_CACHE_LINE_SIZE);
+
+ /* size of checkpoint sort array in bufmgr.c */
+ size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem)));
+
+ return size;
+}
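
Rough sizing arithmetic (not from the patch) for the BufferShmemSize()
components above, assuming the default BLCKSZ of 8192 and shared_buffers set
to 128MB:

    NBuffers    = 128MB / 8kB            = 16,384 buffers
    data pages  = 16,384 * 8,192 bytes   = 128MB
    descriptors, I/O condition variables and checkpoint sort entries add on
    the order of a hundred bytes per buffer combined, i.e. only a few MB more.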
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
new file mode 100644
index 0000000..caa03ae
--- /dev/null
+++ b/src/backend/storage/buffer/buf_table.c
@@ -0,0 +1,162 @@
+/*-------------------------------------------------------------------------
+ *
+ * buf_table.c
+ * routines for mapping BufferTags to buffer indexes.
+ *
+ * Note: the routines in this file do no locking of their own. The caller
+ * must hold a suitable lock on the appropriate BufMappingLock, as specified
+ * in the comments. We can't do the locking inside these functions because
+ * in most cases the caller needs to adjust the buffer header contents
+ * before the lock is released (see notes in README).
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/buffer/buf_table.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+
+/* entry for buffer lookup hashtable */
+typedef struct
+{
+ BufferTag key; /* Tag of a disk page */
+ int id; /* Associated buffer ID */
+} BufferLookupEnt;
+
+static HTAB *SharedBufHash;
+
+
+/*
+ * Estimate space needed for mapping hashtable
+ * size is the desired hash table size (possibly more than NBuffers)
+ */
+Size
+BufTableShmemSize(int size)
+{
+ return hash_estimate_size(size, sizeof(BufferLookupEnt));
+}
+
+/*
+ * Initialize shmem hash table for mapping buffers
+ * size is the desired hash table size (possibly more than NBuffers)
+ */
+void
+InitBufTable(int size)
+{
+ HASHCTL info;
+
+ /* assume no locking is needed yet */
+
+ /* BufferTag maps to Buffer */
+ info.keysize = sizeof(BufferTag);
+ info.entrysize = sizeof(BufferLookupEnt);
+ info.num_partitions = NUM_BUFFER_PARTITIONS;
+
+ SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
+ size, size,
+ &info,
+ HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
+}
+
+/*
+ * BufTableHashCode
+ * Compute the hash code associated with a BufferTag
+ *
+ * This must be passed to the lookup/insert/delete routines along with the
+ * tag. We do it like this because the callers need to know the hash code
+ * in order to determine which buffer partition to lock, and we don't want
+ * to do the hash computation twice (hash_any is a bit slow).
+ */
+uint32
+BufTableHashCode(BufferTag *tagPtr)
+{
+ return get_hash_value(SharedBufHash, (void *) tagPtr);
+}
+
+/*
+ * BufTableLookup
+ * Lookup the given BufferTag; return buffer ID, or -1 if not found
+ *
+ * Caller must hold at least share lock on BufMappingLock for tag's partition
+ */
+int
+BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
+{
+ BufferLookupEnt *result;
+
+ result = (BufferLookupEnt *)
+ hash_search_with_hash_value(SharedBufHash,
+ (void *) tagPtr,
+ hashcode,
+ HASH_FIND,
+ NULL);
+
+ if (!result)
+ return -1;
+
+ return result->id;
+}
+
+/*
+ * BufTableInsert
+ * Insert a hashtable entry for given tag and buffer ID,
+ * unless an entry already exists for that tag
+ *
+ * Returns -1 on successful insertion. If a conflicting entry exists
+ * already, returns the buffer ID in that entry.
+ *
+ * Caller must hold exclusive lock on BufMappingLock for tag's partition
+ */
+int
+BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
+{
+ BufferLookupEnt *result;
+ bool found;
+
+ Assert(buf_id >= 0); /* -1 is reserved for not-in-table */
+ Assert(tagPtr->blockNum != P_NEW); /* invalid tag */
+
+ result = (BufferLookupEnt *)
+ hash_search_with_hash_value(SharedBufHash,
+ (void *) tagPtr,
+ hashcode,
+ HASH_ENTER,
+ &found);
+
+ if (found) /* found something already in the table */
+ return result->id;
+
+ result->id = buf_id;
+
+ return -1;
+}
+
+/*
+ * BufTableDelete
+ * Delete the hashtable entry for given tag (which must exist)
+ *
+ * Caller must hold exclusive lock on BufMappingLock for tag's partition
+ */
+void
+BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
+{
+ BufferLookupEnt *result;
+
+ result = (BufferLookupEnt *)
+ hash_search_with_hash_value(SharedBufHash,
+ (void *) tagPtr,
+ hashcode,
+ HASH_REMOVE,
+ NULL);
+
+ if (!result) /* shouldn't happen */
+ elog(ERROR, "shared buffer hash table corrupted");
+}
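
Illustrative sketch (not from the patch) of the caller-side locking protocol
the header comment above requires; bufmgr.c's PrefetchSharedBuffer(), later
in this patch, is the real thing. rnode, forkNum and blockNum are assumed to
identify the page of interest:

    BufferTag   tag;
    uint32      hash;
    LWLock     *partitionLock;
    int         buf_id;

    INIT_BUFFERTAG(tag, rnode, forkNum, blockNum);
    hash = BufTableHashCode(&tag);
    partitionLock = BufMappingPartitionLock(hash);

    LWLockAcquire(partitionLock, LW_SHARED);    /* share lock suffices for lookup */
    buf_id = BufTableLookup(&tag, hash);
    /* pin the found buffer (if any) before releasing the mapping lock */
    LWLockRelease(partitionLock);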
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 0000000..d2eb69b
--- /dev/null
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -0,0 +1,4892 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufmgr.c
+ * buffer manager interface routines
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/buffer/bufmgr.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * Principal entry points:
+ *
+ * ReadBuffer() -- find or create a buffer holding the requested page,
+ * and pin it so that no one can destroy it while this process
+ * is using it.
+ *
+ * ReleaseBuffer() -- unpin a buffer
+ *
+ * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
+ * The disk write is delayed until buffer replacement or checkpoint.
+ *
+ * See also these files:
+ * freelist.c -- chooses victim for buffer replacement
+ * buf_table.c -- manages the buffer lookup table
+ */
+#include "postgres.h"
+
+#include <sys/file.h>
+#include <unistd.h>
+
+#include "access/tableam.h"
+#include "access/xlog.h"
+#include "catalog/catalog.h"
+#include "catalog/storage.h"
+#include "executor/instrument.h"
+#include "lib/binaryheap.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "postmaster/bgwriter.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "storage/smgr.h"
+#include "storage/standby.h"
+#include "utils/memdebug.h"
+#include "utils/ps_status.h"
+#include "utils/rel.h"
+#include "utils/resowner_private.h"
+#include "utils/timestamp.h"
+
+
+/* Note: these two macros only work on shared buffers, not local ones! */
+#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
+#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
+
+/* Note: this macro only works on local buffers, not shared ones! */
+#define LocalBufHdrGetBlock(bufHdr) \
+ LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
+
+/* Bits in SyncOneBuffer's return value */
+#define BUF_WRITTEN 0x01
+#define BUF_REUSABLE 0x02
+
+#define RELS_BSEARCH_THRESHOLD 20
+
+/*
+ * This is the size (in the number of blocks) above which we scan the
+ * entire buffer pool to remove the buffers for all the pages of relation
+ * being dropped. For the relations with size below this threshold, we find
+ * the buffers by doing lookups in BufMapping table.
+ */
+#define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
+
+typedef struct PrivateRefCountEntry
+{
+ Buffer buffer;
+ int32 refcount;
+} PrivateRefCountEntry;
+
+/* 64 bytes, about the size of a cache line on common systems */
+#define REFCOUNT_ARRAY_ENTRIES 8
+
+/*
+ * Status of buffers to checkpoint for a particular tablespace, used
+ * internally in BufferSync.
+ */
+typedef struct CkptTsStatus
+{
+ /* oid of the tablespace */
+ Oid tsId;
+
+ /*
+ * Checkpoint progress for this tablespace. To make progress comparable
+ * between tablespaces the progress is, for each tablespace, measured as a
+ * number between 0 and the total number of to-be-checkpointed pages. Each
+ * page checkpointed in this tablespace increments this space's progress
+ * by progress_slice.
+ */
+ float8 progress;
+ float8 progress_slice;
+
+ /* number of to-be checkpointed pages in this tablespace */
+ int num_to_scan;
+ /* already processed pages in this tablespace */
+ int num_scanned;
+
+ /* current offset in CkptBufferIds for this tablespace */
+ int index;
+} CkptTsStatus;
+
+/*
+ * Type for array used to sort SMgrRelations
+ *
+ * FlushRelationsAllBuffers shares the same comparator function with
+ * DropRelFileNodesAllBuffers. Pointer to this struct and RelFileNode must be
+ * compatible.
+ */
+typedef struct SMgrSortArray
+{
+ RelFileNode rnode; /* This must be the first member */
+ SMgrRelation srel;
+} SMgrSortArray;
+
+/* GUC variables */
+bool zero_damaged_pages = false;
+int bgwriter_lru_maxpages = 100;
+double bgwriter_lru_multiplier = 2.0;
+bool track_io_timing = false;
+
+/*
+ * How many buffers PrefetchBuffer callers should try to stay ahead of their
+ * ReadBuffer calls by. Zero means "never prefetch". This value is only used
+ * for buffers not belonging to tablespaces that have their
+ * effective_io_concurrency parameter set.
+ */
+int effective_io_concurrency = 0;
+
+/*
+ * Like effective_io_concurrency, but used by maintenance code paths that might
+ * benefit from a higher setting because they work on behalf of many sessions.
+ * Overridden by the tablespace setting of the same name.
+ */
+int maintenance_io_concurrency = 0;
+
+/*
+ * GUC variables about triggering kernel writeback for buffers written; OS
+ * dependent defaults are set via the GUC mechanism.
+ */
+int checkpoint_flush_after = 0;
+int bgwriter_flush_after = 0;
+int backend_flush_after = 0;
+
+/* local state for StartBufferIO and related functions */
+static BufferDesc *InProgressBuf = NULL;
+static bool IsForInput;
+
+/* local state for LockBufferForCleanup */
+static BufferDesc *PinCountWaitBuf = NULL;
+
+/*
+ * Backend-Private refcount management:
+ *
+ * Each buffer also has a private refcount that keeps track of the number of
+ * times the buffer is pinned in the current process. This is so that the
+ * shared refcount needs to be modified only once if a buffer is pinned more
+ * than once by an individual backend. It's also used to check that no buffers
+ * are still pinned at the end of transactions and when exiting.
+ *
+ *
+ * To avoid - as we used to - requiring an array with NBuffers entries to keep
+ * track of local buffers, we use a small sequentially searched array
+ * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
+ * keep track of backend local pins.
+ *
+ * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
+ * refcounts are kept track of in the array; after that, new array entries
+ * displace old ones into the hash table. That way a frequently used entry
+ * can't get "stuck" in the hashtable while infrequent ones clog the array.
+ *
+ * Note that in most scenarios the number of pinned buffers will not exceed
+ * REFCOUNT_ARRAY_ENTRIES.
+ *
+ *
+ * To enter a buffer into the refcount tracking mechanism first reserve a free
+ * entry using ReservePrivateRefCountEntry() and then later, if necessary,
+ * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
+ * memory allocations in NewPrivateRefCountEntry() which can be important
+ * because in some scenarios it's called with a spinlock held...
+ */
+static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
+static HTAB *PrivateRefCountHash = NULL;
+static int32 PrivateRefCountOverflowed = 0;
+static uint32 PrivateRefCountClock = 0;
+static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
+
+static void ReservePrivateRefCountEntry(void);
+static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
+static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
+static inline int32 GetPrivateRefCount(Buffer buffer);
+static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
+
+/*
+ * Ensure that the PrivateRefCountArray has sufficient space to store one more
+ * entry. This has to be called before using NewPrivateRefCountEntry() to fill
+ * a new entry - but it's perfectly fine to not use a reserved entry.
+ */
+static void
+ReservePrivateRefCountEntry(void)
+{
+ /* Already reserved (or freed), nothing to do */
+ if (ReservedRefCountEntry != NULL)
+ return;
+
+ /*
+	 * First search for a free entry in the array; that'll be sufficient in the
+ * majority of cases.
+ */
+ {
+ int i;
+
+ for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+ {
+ PrivateRefCountEntry *res;
+
+ res = &PrivateRefCountArray[i];
+
+ if (res->buffer == InvalidBuffer)
+ {
+ ReservedRefCountEntry = res;
+ return;
+ }
+ }
+ }
+
+ /*
+ * No luck. All array entries are full. Move one array entry into the hash
+ * table.
+ */
+ {
+ /*
+ * Move entry from the current clock position in the array into the
+ * hashtable. Use that slot.
+ */
+ PrivateRefCountEntry *hashent;
+ bool found;
+
+ /* select victim slot */
+ ReservedRefCountEntry =
+ &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
+
+ /* Better be used, otherwise we shouldn't get here. */
+ Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
+
+ /* enter victim array entry into hashtable */
+ hashent = hash_search(PrivateRefCountHash,
+ (void *) &(ReservedRefCountEntry->buffer),
+ HASH_ENTER,
+ &found);
+ Assert(!found);
+ hashent->refcount = ReservedRefCountEntry->refcount;
+
+ /* clear the now free array slot */
+ ReservedRefCountEntry->buffer = InvalidBuffer;
+ ReservedRefCountEntry->refcount = 0;
+
+ PrivateRefCountOverflowed++;
+ }
+}
+
+/*
+ * Fill a previously reserved refcount entry.
+ */
+static PrivateRefCountEntry *
+NewPrivateRefCountEntry(Buffer buffer)
+{
+ PrivateRefCountEntry *res;
+
+ /* only allowed to be called when a reservation has been made */
+ Assert(ReservedRefCountEntry != NULL);
+
+ /* use up the reserved entry */
+ res = ReservedRefCountEntry;
+ ReservedRefCountEntry = NULL;
+
+ /* and fill it */
+ res->buffer = buffer;
+ res->refcount = 0;
+
+ return res;
+}
+
+/*
+ * Return the PrivateRefCount entry for the passed buffer.
+ *
+ * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
+ * do_move is true, and the entry resides in the hashtable the entry is
+ * optimized for frequent access by moving it to the array.
+ */
+static PrivateRefCountEntry *
+GetPrivateRefCountEntry(Buffer buffer, bool do_move)
+{
+ PrivateRefCountEntry *res;
+ int i;
+
+ Assert(BufferIsValid(buffer));
+ Assert(!BufferIsLocal(buffer));
+
+ /*
+ * First search for references in the array, that'll be sufficient in the
+ * majority of cases.
+ */
+ for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+ {
+ res = &PrivateRefCountArray[i];
+
+ if (res->buffer == buffer)
+ return res;
+ }
+
+ /*
+ * By here we know that the buffer, if already pinned, isn't residing in
+ * the array.
+ *
+ * Only look up the buffer in the hashtable if we've previously overflowed
+ * into it.
+ */
+ if (PrivateRefCountOverflowed == 0)
+ return NULL;
+
+ res = hash_search(PrivateRefCountHash,
+ (void *) &buffer,
+ HASH_FIND,
+ NULL);
+
+ if (res == NULL)
+ return NULL;
+ else if (!do_move)
+ {
+ /* caller doesn't want us to move the hash entry into the array */
+ return res;
+ }
+ else
+ {
+ /* move buffer from hashtable into the free array slot */
+ bool found;
+ PrivateRefCountEntry *free;
+
+ /* Ensure there's a free array slot */
+ ReservePrivateRefCountEntry();
+
+ /* Use up the reserved slot */
+ Assert(ReservedRefCountEntry != NULL);
+ free = ReservedRefCountEntry;
+ ReservedRefCountEntry = NULL;
+ Assert(free->buffer == InvalidBuffer);
+
+ /* and fill it */
+ free->buffer = buffer;
+ free->refcount = res->refcount;
+
+ /* delete from hashtable */
+ hash_search(PrivateRefCountHash,
+ (void *) &buffer,
+ HASH_REMOVE,
+ &found);
+ Assert(found);
+ Assert(PrivateRefCountOverflowed > 0);
+ PrivateRefCountOverflowed--;
+
+ return free;
+ }
+}
+
+/*
+ * Returns how many times the passed buffer is pinned by this backend.
+ *
+ * Only works for shared memory buffers!
+ */
+static inline int32
+GetPrivateRefCount(Buffer buffer)
+{
+ PrivateRefCountEntry *ref;
+
+ Assert(BufferIsValid(buffer));
+ Assert(!BufferIsLocal(buffer));
+
+ /*
+ * Not moving the entry - that's ok for the current users, but we might
+ * want to change this one day.
+ */
+ ref = GetPrivateRefCountEntry(buffer, false);
+
+ if (ref == NULL)
+ return 0;
+ return ref->refcount;
+}
+
+/*
+ * Release resources used to track the reference count of a buffer which we no
+ * longer have pinned and don't want to pin again immediately.
+ */
+static void
+ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
+{
+ Assert(ref->refcount == 0);
+
+ if (ref >= &PrivateRefCountArray[0] &&
+ ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
+ {
+ ref->buffer = InvalidBuffer;
+
+ /*
+ * Mark the just used entry as reserved - in many scenarios that
+ * allows us to avoid ever having to search the array/hash for free
+ * entries.
+ */
+ ReservedRefCountEntry = ref;
+ }
+ else
+ {
+ bool found;
+ Buffer buffer = ref->buffer;
+
+ hash_search(PrivateRefCountHash,
+ (void *) &buffer,
+ HASH_REMOVE,
+ &found);
+ Assert(found);
+ Assert(PrivateRefCountOverflowed > 0);
+ PrivateRefCountOverflowed--;
+ }
+}
+
+/*
+ * BufferIsPinned
+ * True iff the buffer is pinned (also checks for valid buffer number).
+ *
+ * NOTE: what we check here is that *this* backend holds a pin on
+ * the buffer. We do not care whether some other backend does.
+ */
+#define BufferIsPinned(bufnum) \
+( \
+ !BufferIsValid(bufnum) ? \
+ false \
+ : \
+ BufferIsLocal(bufnum) ? \
+ (LocalRefCount[-(bufnum) - 1] > 0) \
+ : \
+ (GetPrivateRefCount(bufnum) > 0) \
+)
+
+
+static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
+ ForkNumber forkNum, BlockNumber blockNum,
+ ReadBufferMode mode, BufferAccessStrategy strategy,
+ bool *hit);
+static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
+static void PinBuffer_Locked(BufferDesc *buf);
+static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
+static void BufferSync(int flags);
+static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
+static int SyncOneBuffer(int buf_id, bool skip_recently_used,
+ WritebackContext *wb_context);
+static void WaitIO(BufferDesc *buf);
+static bool StartBufferIO(BufferDesc *buf, bool forInput);
+static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
+ uint32 set_flag_bits);
+static void shared_buffer_write_error_callback(void *arg);
+static void local_buffer_write_error_callback(void *arg);
+static BufferDesc *BufferAlloc(SMgrRelation smgr,
+ char relpersistence,
+ ForkNumber forkNum,
+ BlockNumber blockNum,
+ BufferAccessStrategy strategy,
+ bool *foundPtr);
+static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
+static void FindAndDropRelFileNodeBuffers(RelFileNode rnode,
+ ForkNumber forkNum,
+ BlockNumber nForkBlock,
+ BlockNumber firstDelBlock);
+static void AtProcExit_Buffers(int code, Datum arg);
+static void CheckForBufferLeaks(void);
+static int rnode_comparator(const void *p1, const void *p2);
+static inline int buffertag_comparator(const BufferTag *a, const BufferTag *b);
+static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
+static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
+
+
+/*
+ * Implementation of PrefetchBuffer() for shared buffers.
+ */
+PrefetchBufferResult
+PrefetchSharedBuffer(SMgrRelation smgr_reln,
+ ForkNumber forkNum,
+ BlockNumber blockNum)
+{
+ PrefetchBufferResult result = {InvalidBuffer, false};
+ BufferTag newTag; /* identity of requested block */
+ uint32 newHash; /* hash value for newTag */
+ LWLock *newPartitionLock; /* buffer partition lock for it */
+ int buf_id;
+
+ Assert(BlockNumberIsValid(blockNum));
+
+ /* create a tag so we can lookup the buffer */
+ INIT_BUFFERTAG(newTag, smgr_reln->smgr_rnode.node,
+ forkNum, blockNum);
+
+ /* determine its hash code and partition lock ID */
+ newHash = BufTableHashCode(&newTag);
+ newPartitionLock = BufMappingPartitionLock(newHash);
+
+ /* see if the block is in the buffer pool already */
+ LWLockAcquire(newPartitionLock, LW_SHARED);
+ buf_id = BufTableLookup(&newTag, newHash);
+ LWLockRelease(newPartitionLock);
+
+ /* If not in buffers, initiate prefetch */
+ if (buf_id < 0)
+ {
+#ifdef USE_PREFETCH
+ /*
+ * Try to initiate an asynchronous read. This returns false in
+ * recovery if the relation file doesn't exist.
+ */
+ if (smgrprefetch(smgr_reln, forkNum, blockNum))
+ result.initiated_io = true;
+#endif /* USE_PREFETCH */
+ }
+ else
+ {
+ /*
+ * Report the buffer it was in at that time. The caller may be able
+ * to avoid a buffer table lookup, but it's not pinned and it must be
+ * rechecked!
+ */
+ result.recent_buffer = buf_id + 1;
+ }
+
+ /*
+ * If the block *is* in buffers, we do nothing. This is not really ideal:
+ * the block might be just about to be evicted, which would be stupid
+ * since we know we are going to need it soon. But the only easy answer
+ * is to bump the usage_count, which does not seem like a great solution:
+ * when the caller does ultimately touch the block, usage_count would get
+ * bumped again, resulting in too much favoritism for blocks that are
+ * involved in a prefetch sequence. A real fix would involve some
+ * additional per-buffer state, and it's not clear that there's enough of
+ * a problem to justify that.
+ */
+
+ return result;
+}
+
+/*
+ * PrefetchBuffer -- initiate asynchronous read of a block of a relation
+ *
+ * This is named by analogy to ReadBuffer but doesn't actually allocate a
+ * buffer. Instead it tries to ensure that a future ReadBuffer for the given
+ * block will not be delayed by the I/O. Prefetching is optional.
+ *
+ * There are three possible outcomes:
+ *
+ * 1. If the block is already cached, the result includes a valid buffer that
+ * could be used by the caller to avoid the need for a later buffer lookup, but
+ * it's not pinned, so the caller must recheck it.
+ *
+ * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
+ * true. Currently there is no way to know if the data was already cached by
+ * the kernel and therefore didn't really initiate I/O, and no way to know when
+ * the I/O completes other than using synchronous ReadBuffer().
+ *
+ * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either
+ * USE_PREFETCH is not defined (this build doesn't support prefetching due to
+ * lack of a kernel facility), or the underlying relation file wasn't found and
+ * we are in recovery. (If the relation file wasn't found and we are not in
+ * recovery, an error is raised).
+ */
+PrefetchBufferResult
+PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
+{
+ Assert(RelationIsValid(reln));
+ Assert(BlockNumberIsValid(blockNum));
+
+ /* Open it at the smgr level if not already done */
+ RelationOpenSmgr(reln);
+
+ if (RelationUsesLocalBuffers(reln))
+ {
+ /* see comments in ReadBufferExtended */
+ if (RELATION_IS_OTHER_TEMP(reln))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot access temporary tables of other sessions")));
+
+ /* pass it off to localbuf.c */
+ return PrefetchLocalBuffer(reln->rd_smgr, forkNum, blockNum);
+ }
+ else
+ {
+ /* pass it to the shared buffer version */
+ return PrefetchSharedBuffer(reln->rd_smgr, forkNum, blockNum);
+ }
+}
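
/*
 * Illustrative sketch, not part of the patch: a caller can pair
 * PrefetchBuffer() with ReadRecentBuffer() (below) to skip the mapping-table
 * lookup when the prefetched block is still in the buffer it was seen in.
 * rel and blkno are assumed to come from the caller.
 */
    PrefetchBufferResult pre = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);
    Buffer      buf;

    /* ... do other useful work while the kernel (maybe) reads the block ... */

    if (pre.recent_buffer != InvalidBuffer &&
        ReadRecentBuffer(rel->rd_node, MAIN_FORKNUM, blkno, pre.recent_buffer))
        buf = pre.recent_buffer;        /* pinned; tag re-verified */
    else
        buf = ReadBuffer(rel, blkno);   /* fall back to the normal path */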
+
+/*
+ * ReadRecentBuffer -- try to pin a block in a recently observed buffer
+ *
+ * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
+ * successful. Return true if the buffer is valid and still has the expected
+ * tag. In that case, the buffer is pinned and the usage count is bumped.
+ */
+bool
+ReadRecentBuffer(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum,
+ Buffer recent_buffer)
+{
+ BufferDesc *bufHdr;
+ BufferTag tag;
+ uint32 buf_state;
+ bool have_private_ref;
+
+ Assert(BufferIsValid(recent_buffer));
+
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+ ReservePrivateRefCountEntry();
+ INIT_BUFFERTAG(tag, rnode, forkNum, blockNum);
+
+ if (BufferIsLocal(recent_buffer))
+ {
+ int b = -recent_buffer - 1;
+
+ bufHdr = GetLocalBufferDescriptor(b);
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ /* Is it still valid and holding the right tag? */
+ if ((buf_state & BM_VALID) && BUFFERTAGS_EQUAL(tag, bufHdr->tag))
+ {
+ /*
+ * Bump buffer's ref and usage counts. This is equivalent of
+ * PinBuffer for a shared buffer.
+ */
+ if (LocalRefCount[b] == 0)
+ {
+ if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
+ {
+ buf_state += BUF_USAGECOUNT_ONE;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+ }
+ }
+ LocalRefCount[b]++;
+ ResourceOwnerRememberBuffer(CurrentResourceOwner, recent_buffer);
+
+ return true;
+ }
+ }
+ else
+ {
+ bufHdr = GetBufferDescriptor(recent_buffer - 1);
+ have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
+
+ /*
+ * Do we already have this buffer pinned with a private reference? If
+ * so, it must be valid and it is safe to check the tag without
+ * locking. If not, we have to lock the header first and then check.
+ */
+ if (have_private_ref)
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+ else
+ buf_state = LockBufHdr(bufHdr);
+
+ if ((buf_state & BM_VALID) && BUFFERTAGS_EQUAL(tag, bufHdr->tag))
+ {
+ /*
+ * It's now safe to pin the buffer. We can't pin first and ask
+			 * questions later, because it might confuse code paths
+ * like InvalidateBuffer() if we pinned a random non-matching
+ * buffer.
+ */
+ if (have_private_ref)
+ PinBuffer(bufHdr, NULL); /* bump pin count */
+ else
+ PinBuffer_Locked(bufHdr); /* pin for first time */
+
+ return true;
+ }
+
+ /* If we locked the header above, now unlock. */
+ if (!have_private_ref)
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+
+ return false;
+}
+
+/*
+ * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
+ * fork with RBM_NORMAL mode and default strategy.
+ */
+Buffer
+ReadBuffer(Relation reln, BlockNumber blockNum)
+{
+ return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
+}
+
+/*
+ * ReadBufferExtended -- returns a buffer containing the requested
+ * block of the requested relation. If the blknum
+ * requested is P_NEW, extend the relation file and
+ * allocate a new block. (Caller is responsible for
+ * ensuring that only one backend tries to extend a
+ * relation at the same time!)
+ *
+ * Returns: the buffer number for the buffer containing
+ * the block read. The returned buffer has been pinned.
+ * Does not return on error --- elog's instead.
+ *
+ * Assume when this function is called, that reln has been opened already.
+ *
+ * In RBM_NORMAL mode, the page is read from disk, and the page header is
+ * validated. An error is thrown if the page header is not valid. (But
+ * note that an all-zero page is considered "valid"; see
+ * PageIsVerifiedExtended().)
+ *
+ * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
+ * valid, the page is zeroed instead of throwing an error. This is intended
+ * for non-critical data, where the caller is prepared to repair errors.
+ *
+ * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
+ * filled with zeros instead of reading it from disk. Useful when the caller
+ * is going to fill the page from scratch, since this saves I/O and avoids
+ * unnecessary failure if the page-on-disk has corrupt page headers.
+ * The page is returned locked to ensure that the caller has a chance to
+ * initialize the page before it's made visible to others.
+ * Caution: do not use this mode to read a page that is beyond the relation's
+ * current physical EOF; that is likely to cause problems in md.c when
+ * the page is modified and written out. P_NEW is OK, though.
+ *
+ * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
+ * a cleanup-strength lock on the page.
+ *
+ * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
+ *
+ * If strategy is not NULL, a nondefault buffer access strategy is used.
+ * See buffer/README for details.
+ */
+Buffer
+ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
+ ReadBufferMode mode, BufferAccessStrategy strategy)
+{
+ bool hit;
+ Buffer buf;
+
+ /* Open it at the smgr level if not already done */
+ RelationOpenSmgr(reln);
+
+ /*
+ * Reject attempts to read non-local temporary relations; we would be
+ * likely to get wrong data since we have no visibility into the owning
+ * session's local buffers.
+ */
+ if (RELATION_IS_OTHER_TEMP(reln))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot access temporary tables of other sessions")));
+
+ /*
+ * Read the buffer, and update pgstat counters to reflect a cache hit or
+ * miss.
+ */
+ pgstat_count_buffer_read(reln);
+ buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,
+ forkNum, blockNum, mode, strategy, &hit);
+ if (hit)
+ pgstat_count_buffer_hit(reln);
+ return buf;
+}
+
+
+/*
+ * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
+ * a relcache entry for the relation.
+ *
+ * NB: At present, this function may only be used on permanent relations, which
+ * is OK, because we only use it during XLOG replay. If in the future we
+ * want to use it on temporary or unlogged relations, we could pass additional
+ * parameters.
+ */
+Buffer
+ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+ BlockNumber blockNum, ReadBufferMode mode,
+ BufferAccessStrategy strategy)
+{
+ bool hit;
+
+ SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
+
+ Assert(InRecovery);
+
+ return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum,
+ mode, strategy, &hit);
+}
+
+
+/*
+ * ReadBuffer_common -- common logic for all ReadBuffer variants
+ *
+ * *hit is set to true if the request was satisfied from shared buffer cache.
+ */
+static Buffer
+ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+ BlockNumber blockNum, ReadBufferMode mode,
+ BufferAccessStrategy strategy, bool *hit)
+{
+ BufferDesc *bufHdr;
+ Block bufBlock;
+ bool found;
+ bool isExtend;
+ bool isLocalBuf = SmgrIsTemp(smgr);
+
+ *hit = false;
+
+ /* Make sure we will have room to remember the buffer pin */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ isExtend = (blockNum == P_NEW);
+
+ TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
+ smgr->smgr_rnode.node.spcNode,
+ smgr->smgr_rnode.node.dbNode,
+ smgr->smgr_rnode.node.relNode,
+ smgr->smgr_rnode.backend,
+ isExtend);
+
+ /* Substitute proper block number if caller asked for P_NEW */
+ if (isExtend)
+ {
+ blockNum = smgrnblocks(smgr, forkNum);
+ /* Fail if relation is already at maximum possible length */
+ if (blockNum == P_NEW)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot extend relation %s beyond %u blocks",
+ relpath(smgr->smgr_rnode, forkNum),
+ P_NEW)));
+ }
+
+ if (isLocalBuf)
+ {
+ bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
+ if (found)
+ pgBufferUsage.local_blks_hit++;
+ else if (isExtend)
+ pgBufferUsage.local_blks_written++;
+ else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
+ mode == RBM_ZERO_ON_ERROR)
+ pgBufferUsage.local_blks_read++;
+ }
+ else
+ {
+ /*
+ * Look up the buffer. IO_IN_PROGRESS is set if the requested block is
+ * not currently in memory.
+ */
+ bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
+ strategy, &found);
+ if (found)
+ pgBufferUsage.shared_blks_hit++;
+ else if (isExtend)
+ pgBufferUsage.shared_blks_written++;
+ else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
+ mode == RBM_ZERO_ON_ERROR)
+ pgBufferUsage.shared_blks_read++;
+ }
+
+ /* At this point we do NOT hold any locks. */
+
+ /* if it was already in the buffer pool, we're done */
+ if (found)
+ {
+ if (!isExtend)
+ {
+ /* Just need to update stats before we exit */
+ *hit = true;
+ VacuumPageHit++;
+
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageHit;
+
+ TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
+ smgr->smgr_rnode.node.spcNode,
+ smgr->smgr_rnode.node.dbNode,
+ smgr->smgr_rnode.node.relNode,
+ smgr->smgr_rnode.backend,
+ isExtend,
+ found);
+
+ /*
+ * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
+ * locked on return.
+ */
+ if (!isLocalBuf)
+ {
+ if (mode == RBM_ZERO_AND_LOCK)
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+ LW_EXCLUSIVE);
+ else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
+ LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
+ }
+
+ return BufferDescriptorGetBuffer(bufHdr);
+ }
+
+ /*
+ * We get here only in the corner case where we are trying to extend
+ * the relation but we found a pre-existing buffer marked BM_VALID.
+ * This can happen because mdread doesn't complain about reads beyond
+ * EOF (when zero_damaged_pages is ON) and so a previous attempt to
+ * read a block beyond EOF could have left a "valid" zero-filled
+ * buffer. Unfortunately, we have also seen this case occurring
+ * because of buggy Linux kernels that sometimes return an
+ * lseek(SEEK_END) result that doesn't account for a recent write. In
+ * that situation, the pre-existing buffer would contain valid data
+ * that we don't want to overwrite. Since the legitimate case should
+ * always have left a zero-filled buffer, complain if not PageIsNew.
+ */
+ bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
+ if (!PageIsNew((Page) bufBlock))
+ ereport(ERROR,
+ (errmsg("unexpected data beyond EOF in block %u of relation %s",
+ blockNum, relpath(smgr->smgr_rnode, forkNum)),
+ errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
+
+ /*
+ * We *must* do smgrextend before succeeding, else the page will not
+ * be reserved by the kernel, and the next P_NEW call will decide to
+ * return the same page. Clear the BM_VALID bit, do the StartBufferIO
+ * call that BufferAlloc didn't, and proceed.
+ */
+ if (isLocalBuf)
+ {
+ /* Only need to adjust flags */
+ uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ Assert(buf_state & BM_VALID);
+ buf_state &= ~BM_VALID;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+ }
+ else
+ {
+ /*
+ * Loop to handle the very small possibility that someone re-sets
+ * BM_VALID between our clearing it and StartBufferIO inspecting
+ * it.
+ */
+ do
+ {
+ uint32 buf_state = LockBufHdr(bufHdr);
+
+ Assert(buf_state & BM_VALID);
+ buf_state &= ~BM_VALID;
+ UnlockBufHdr(bufHdr, buf_state);
+ } while (!StartBufferIO(bufHdr, true));
+ }
+ }
+
+ /*
+ * if we have gotten to this point, we have allocated a buffer for the
+ * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
+ * if it's a shared buffer.
+ *
+ * Note: if smgrextend fails, we will end up with a buffer that is
+ * allocated but not marked BM_VALID. P_NEW will still select the same
+ * block number (because the relation didn't get any longer on disk) and
+ * so future attempts to extend the relation will find the same buffer (if
+ * it's not been recycled) but come right back here to try smgrextend
+ * again.
+ */
+ Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
+
+ bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
+
+ if (isExtend)
+ {
+ /* new buffers are zero-filled */
+ MemSet((char *) bufBlock, 0, BLCKSZ);
+ /* don't set checksum for all-zero page */
+ smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
+
+ /*
+ * NB: we're *not* doing a ScheduleBufferTagForWriteback here, even
+ * though we're essentially performing a write. At least on Linux,
+ * doing so defeats the 'delayed allocation' mechanism, leading to
+ * increased file fragmentation.
+ */
+ }
+ else
+ {
+ /*
+ * Read in the page, unless the caller intends to overwrite it and
+ * just wants us to allocate a buffer.
+ */
+ if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+ MemSet((char *) bufBlock, 0, BLCKSZ);
+ else
+ {
+ instr_time io_start,
+ io_time;
+
+ if (track_io_timing)
+ INSTR_TIME_SET_CURRENT(io_start);
+
+ smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
+
+ if (track_io_timing)
+ {
+ INSTR_TIME_SET_CURRENT(io_time);
+ INSTR_TIME_SUBTRACT(io_time, io_start);
+ pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
+ INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
+ }
+
+ /* check for garbage data */
+ if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
+ PIV_LOG_WARNING | PIV_REPORT_STAT))
+ {
+ if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("invalid page in block %u of relation %s; zeroing out page",
+ blockNum,
+ relpath(smgr->smgr_rnode, forkNum))));
+ MemSet((char *) bufBlock, 0, BLCKSZ);
+ }
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("invalid page in block %u of relation %s",
+ blockNum,
+ relpath(smgr->smgr_rnode, forkNum))));
+ }
+ }
+ }
+
+ /*
+ * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
+ * the page as valid, to make sure that no other backend sees the zeroed
+ * page before the caller has had a chance to initialize it.
+ *
+ * Since no-one else can be looking at the page contents yet, there is no
+ * difference between an exclusive lock and a cleanup-strength lock. (Note
+ * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
+ * they assert that the buffer is already valid.)
+ */
+ if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
+ !isLocalBuf)
+ {
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
+ }
+
+ if (isLocalBuf)
+ {
+ /* Only need to adjust flags */
+ uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ buf_state |= BM_VALID;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+ }
+ else
+ {
+ /* Set BM_VALID, terminate IO, and wake up any waiters */
+ TerminateBufferIO(bufHdr, false, BM_VALID);
+ }
+
+ VacuumPageMiss++;
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageMiss;
+
+ TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
+ smgr->smgr_rnode.node.spcNode,
+ smgr->smgr_rnode.node.dbNode,
+ smgr->smgr_rnode.node.relNode,
+ smgr->smgr_rnode.backend,
+ isExtend,
+ found);
+
+ return BufferDescriptorGetBuffer(bufHdr);
+}
+
+/*
+ * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
+ * buffer. If no buffer exists already, selects a replacement
+ * victim and evicts the old page, but does NOT read in new page.
+ *
+ * "strategy" can be a buffer replacement strategy object, or NULL for
+ * the default strategy. The selected buffer's usage_count is advanced when
+ * using the default strategy, but otherwise possibly not (see PinBuffer).
+ *
+ * The returned buffer is pinned and is already marked as holding the
+ * desired page. If it already did have the desired page, *foundPtr is
+ * set true. Otherwise, *foundPtr is set false and the buffer is marked
+ * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
+ *
+ * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
+ * we keep it for simplicity in ReadBuffer.
+ *
+ * No locks are held either at entry or exit.
+ */
+static BufferDesc *
+BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+ BlockNumber blockNum,
+ BufferAccessStrategy strategy,
+ bool *foundPtr)
+{
+ BufferTag newTag; /* identity of requested block */
+ uint32 newHash; /* hash value for newTag */
+ LWLock *newPartitionLock; /* buffer partition lock for it */
+ BufferTag oldTag; /* previous identity of selected buffer */
+ uint32 oldHash; /* hash value for oldTag */
+ LWLock *oldPartitionLock; /* buffer partition lock for it */
+ uint32 oldFlags;
+ int buf_id;
+ BufferDesc *buf;
+ bool valid;
+ uint32 buf_state;
+
+ /* create a tag so we can lookup the buffer */
+ INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
+
+ /* determine its hash code and partition lock ID */
+ newHash = BufTableHashCode(&newTag);
+ newPartitionLock = BufMappingPartitionLock(newHash);
+
+ /* see if the block is in the buffer pool already */
+ LWLockAcquire(newPartitionLock, LW_SHARED);
+ buf_id = BufTableLookup(&newTag, newHash);
+ if (buf_id >= 0)
+ {
+ /*
+ * Found it. Now, pin the buffer so no one can steal it from the
+ * buffer pool, and check to see if the correct data has been loaded
+ * into the buffer.
+ */
+ buf = GetBufferDescriptor(buf_id);
+
+ valid = PinBuffer(buf, strategy);
+
+ /* Can release the mapping lock as soon as we've pinned it */
+ LWLockRelease(newPartitionLock);
+
+ *foundPtr = true;
+
+ if (!valid)
+ {
+ /*
+ * We can only get here if (a) someone else is still reading in
+ * the page, or (b) a previous read attempt failed. We have to
+ * wait for any active read attempt to finish, and then set up our
+ * own read attempt if the page is still not BM_VALID.
+ * StartBufferIO does it all.
+ */
+ if (StartBufferIO(buf, true))
+ {
+ /*
+ * If we get here, previous attempts to read the buffer must
+ * have failed ... but we shall bravely try again.
+ */
+ *foundPtr = false;
+ }
+ }
+
+ return buf;
+ }
+
+ /*
+ * Didn't find it in the buffer pool. We'll have to initialize a new
+ * buffer. Remember to unlock the mapping lock while doing the work.
+ */
+ LWLockRelease(newPartitionLock);
+
+ /* Loop here in case we have to try another victim buffer */
+ for (;;)
+ {
+ /*
+ * Ensure, while the spinlock's not yet held, that there's a free
+ * refcount entry.
+ */
+ ReservePrivateRefCountEntry();
+
+ /*
+ * Select a victim buffer. The buffer is returned with its header
+ * spinlock still held!
+ */
+ buf = StrategyGetBuffer(strategy, &buf_state);
+
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
+
+ /* Must copy buffer flags while we still hold the spinlock */
+ oldFlags = buf_state & BUF_FLAG_MASK;
+
+ /* Pin the buffer and then release the buffer spinlock */
+ PinBuffer_Locked(buf);
+
+ /*
+ * If the buffer was dirty, try to write it out. There is a race
+ * condition here, in that someone might dirty it after we released the
+ * buffer header spinlock above, or even while we are writing it out
+ * (since our share-lock won't prevent hint-bit updates). We will recheck
+ * the dirty bit after re-locking the buffer header.
+ */
+ if (oldFlags & BM_DIRTY)
+ {
+ /*
+ * We need a share-lock on the buffer contents to write it out
+ * (else we might write invalid data, eg because someone else is
+ * compacting the page contents while we write). We must use a
+ * conditional lock acquisition here to avoid deadlock. Even
+ * though the buffer was not pinned (and therefore surely not
+ * locked) when StrategyGetBuffer returned it, someone else could
+ * have pinned and exclusive-locked it by the time we get here. If
+ * we try to get the lock unconditionally, we'd block waiting for
+ * them; if they later block waiting for us, deadlock ensues.
+ * (This has been observed to happen when two backends are both
+ * trying to split btree index pages, and the second one just
+ * happens to be trying to split the page the first one got from
+ * StrategyGetBuffer.)
+ */
+ if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+ LW_SHARED))
+ {
+ /*
+ * If using a nondefault strategy, and writing the buffer
+ * would require a WAL flush, let the strategy decide whether
+ * to go ahead and write/reuse the buffer or to choose another
+ * victim. We need the lock to inspect the page LSN, so this
+ * can't be done inside StrategyGetBuffer.
+ */
+ if (strategy != NULL)
+ {
+ XLogRecPtr lsn;
+
+ /* Read the LSN while holding buffer header lock */
+ buf_state = LockBufHdr(buf);
+ lsn = BufferGetLSN(buf);
+ UnlockBufHdr(buf, buf_state);
+
+ if (XLogNeedsFlush(lsn) &&
+ StrategyRejectBuffer(strategy, buf))
+ {
+ /* Drop lock/pin and loop around for another buffer */
+ LWLockRelease(BufferDescriptorGetContentLock(buf));
+ UnpinBuffer(buf, true);
+ continue;
+ }
+ }
+
+ /* OK, do the I/O */
+ TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
+ smgr->smgr_rnode.node.spcNode,
+ smgr->smgr_rnode.node.dbNode,
+ smgr->smgr_rnode.node.relNode);
+
+ FlushBuffer(buf, NULL);
+ LWLockRelease(BufferDescriptorGetContentLock(buf));
+
+ ScheduleBufferTagForWriteback(&BackendWritebackContext,
+ &buf->tag);
+
+ TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
+ smgr->smgr_rnode.node.spcNode,
+ smgr->smgr_rnode.node.dbNode,
+ smgr->smgr_rnode.node.relNode);
+ }
+ else
+ {
+ /*
+ * Someone else has locked the buffer, so give it up and loop
+ * back to get another one.
+ */
+ UnpinBuffer(buf, true);
+ continue;
+ }
+ }
+
+ /*
+ * To change the association of a valid buffer, we'll need to have
+ * exclusive lock on both the old and new mapping partitions.
+ */
+ if (oldFlags & BM_TAG_VALID)
+ {
+ /*
+ * Need to compute the old tag's hashcode and partition lock ID.
+ * XXX is it worth storing the hashcode in BufferDesc so we need
+ * not recompute it here? Probably not.
+ */
+ oldTag = buf->tag;
+ oldHash = BufTableHashCode(&oldTag);
+ oldPartitionLock = BufMappingPartitionLock(oldHash);
+
+ /*
+ * Must lock the lower-numbered partition first to avoid
+ * deadlocks.
+ */
+ if (oldPartitionLock < newPartitionLock)
+ {
+ LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
+ }
+ else if (oldPartitionLock > newPartitionLock)
+ {
+ LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
+ }
+ else
+ {
+ /* only one partition, only one lock */
+ LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
+ }
+ }
+ else
+ {
+ /* if it wasn't valid, we need only the new partition */
+ LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
+ /* remember we have no old-partition lock or tag */
+ oldPartitionLock = NULL;
+ /* keep the compiler quiet about uninitialized variables */
+ oldHash = 0;
+ }
+
+ /*
+ * Try to make a hashtable entry for the buffer under its new tag.
+ * This could fail because while we were writing someone else
+ * allocated another buffer for the same block we want to read in.
+ * Note that we have not yet removed the hashtable entry for the old
+ * tag.
+ */
+ buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
+
+ if (buf_id >= 0)
+ {
+ /*
+ * Got a collision. Someone has already done what we were about to
+ * do. We'll just handle this as if it were found in the buffer
+ * pool in the first place. First, give up the buffer we were
+ * planning to use.
+ */
+ UnpinBuffer(buf, true);
+
+ /* Can give up that buffer's mapping partition lock now */
+ if (oldPartitionLock != NULL &&
+ oldPartitionLock != newPartitionLock)
+ LWLockRelease(oldPartitionLock);
+
+ /* remaining code should match code at top of routine */
+
+ buf = GetBufferDescriptor(buf_id);
+
+ valid = PinBuffer(buf, strategy);
+
+ /* Can release the mapping lock as soon as we've pinned it */
+ LWLockRelease(newPartitionLock);
+
+ *foundPtr = true;
+
+ if (!valid)
+ {
+ /*
+ * We can only get here if (a) someone else is still reading
+ * in the page, or (b) a previous read attempt failed. We
+ * have to wait for any active read attempt to finish, and
+ * then set up our own read attempt if the page is still not
+ * BM_VALID. StartBufferIO does it all.
+ */
+ if (StartBufferIO(buf, true))
+ {
+ /*
+ * If we get here, previous attempts to read the buffer
+ * must have failed ... but we shall bravely try again.
+ */
+ *foundPtr = false;
+ }
+ }
+
+ return buf;
+ }
+
+ /*
+ * Need to lock the buffer header too in order to change its tag.
+ */
+ buf_state = LockBufHdr(buf);
+
+ /*
+ * Somebody could have pinned or re-dirtied the buffer while we were
+ * doing the I/O and making the new hashtable entry. If so, we can't
+ * recycle this buffer; we must undo everything we've done and start
+ * over with a new victim buffer.
+ */
+ oldFlags = buf_state & BUF_FLAG_MASK;
+ if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
+ break;
+
+ UnlockBufHdr(buf, buf_state);
+ BufTableDelete(&newTag, newHash);
+ if (oldPartitionLock != NULL &&
+ oldPartitionLock != newPartitionLock)
+ LWLockRelease(oldPartitionLock);
+ LWLockRelease(newPartitionLock);
+ UnpinBuffer(buf, true);
+ }
+
+ /*
+ * Okay, it's finally safe to rename the buffer.
+ *
+ * Clearing BM_VALID here is necessary; clearing the dirty bits is just
+ * paranoia. We also reset the usage_count since any recency of use of
+ * the old content is no longer relevant. (The usage_count starts out at
+ * 1 so that the buffer can survive one clock-sweep pass.)
+ *
+ * Make sure BM_PERMANENT is set for buffers that must be written at every
+ * checkpoint. Unlogged buffers only need to be written at shutdown
+ * checkpoints, except for their "init" forks, which need to be treated
+ * just like permanent relations.
+ */
+ buf->tag = newTag;
+ buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
+ BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
+ BUF_USAGECOUNT_MASK);
+ if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
+ buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
+ else
+ buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
+
+ UnlockBufHdr(buf, buf_state);
+
+ if (oldPartitionLock != NULL)
+ {
+ BufTableDelete(&oldTag, oldHash);
+ if (oldPartitionLock != newPartitionLock)
+ LWLockRelease(oldPartitionLock);
+ }
+
+ LWLockRelease(newPartitionLock);
+
+ /*
+ * Buffer contents are currently invalid. Try to obtain the right to
+ * start I/O. If StartBufferIO returns false, then someone else managed
+ * to read it before we did, so there's nothing left for BufferAlloc() to
+ * do.
+ */
+ if (StartBufferIO(buf, true))
+ *foundPtr = false;
+ else
+ *foundPtr = true;
+
+ return buf;
+}
+
+/*
+ * InvalidateBuffer -- mark a shared buffer invalid and return it to the
+ * freelist.
+ *
+ * The buffer header spinlock must be held at entry. We drop it before
+ * returning. (This is sane because the caller must have locked the
+ * buffer in order to be sure it should be dropped.)
+ *
+ * This is used only in contexts such as dropping a relation. We assume
+ * that no other backend could possibly be interested in using the page,
+ * so the only reason the buffer might be pinned is if someone else is
+ * trying to write it out. We have to let them finish before we can
+ * reclaim the buffer.
+ *
+ * The buffer could get reclaimed by someone else while we are waiting
+ * to acquire the necessary locks; if so, don't mess it up.
+ */
+static void
+InvalidateBuffer(BufferDesc *buf)
+{
+ BufferTag oldTag;
+ uint32 oldHash; /* hash value for oldTag */
+ LWLock *oldPartitionLock; /* buffer partition lock for it */
+ uint32 oldFlags;
+ uint32 buf_state;
+
+ /* Save the original buffer tag before dropping the spinlock */
+ oldTag = buf->tag;
+
+ buf_state = pg_atomic_read_u32(&buf->state);
+ Assert(buf_state & BM_LOCKED);
+ UnlockBufHdr(buf, buf_state);
+
+ /*
+ * Need to compute the old tag's hashcode and partition lock ID. XXX is it
+ * worth storing the hashcode in BufferDesc so we need not recompute it
+ * here? Probably not.
+ */
+ oldHash = BufTableHashCode(&oldTag);
+ oldPartitionLock = BufMappingPartitionLock(oldHash);
+
+retry:
+
+ /*
+ * Acquire exclusive mapping lock in preparation for changing the buffer's
+ * association.
+ */
+ LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
+
+ /* Re-lock the buffer header */
+ buf_state = LockBufHdr(buf);
+
+ /* If it's changed while we were waiting for lock, do nothing */
+ if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
+ {
+ UnlockBufHdr(buf, buf_state);
+ LWLockRelease(oldPartitionLock);
+ return;
+ }
+
+ /*
+ * We assume the only reason for it to be pinned is that someone else is
+ * flushing the page out. Wait for them to finish. (This could be an
+ * infinite loop if the refcount is messed up... it would be nice to time
+ * out after a while, but there seems no way to be sure how many loops may
+ * be needed. Note that if the other guy has pinned the buffer but not
+ * yet done StartBufferIO, WaitIO will fall through and we'll effectively
+ * be busy-looping here.)
+ */
+ if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
+ {
+ UnlockBufHdr(buf, buf_state);
+ LWLockRelease(oldPartitionLock);
+ /* safety check: should definitely not be our *own* pin */
+ if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
+ elog(ERROR, "buffer is pinned in InvalidateBuffer");
+ WaitIO(buf);
+ goto retry;
+ }
+
+ /*
+ * Clear out the buffer's tag and flags. We must do this to ensure that
+ * linear scans of the buffer array don't think the buffer is valid.
+ */
+ oldFlags = buf_state & BUF_FLAG_MASK;
+ CLEAR_BUFFERTAG(buf->tag);
+ buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+ UnlockBufHdr(buf, buf_state);
+
+ /*
+ * Remove the buffer from the lookup hashtable, if it was in there.
+ */
+ if (oldFlags & BM_TAG_VALID)
+ BufTableDelete(&oldTag, oldHash);
+
+ /*
+ * Done with mapping lock.
+ */
+ LWLockRelease(oldPartitionLock);
+
+ /*
+ * Insert the buffer at the head of the list of free buffers.
+ */
+ StrategyFreeBuffer(buf);
+}
+
+/*
+ * MarkBufferDirty
+ *
+ * Marks buffer contents as dirty (actual write happens later).
+ *
+ * Buffer must be pinned and exclusive-locked. (If caller does not hold
+ * exclusive lock, then somebody could be in process of writing the buffer,
+ * leading to risk of bad data written to disk.)
+ */
+void
+MarkBufferDirty(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+ uint32 buf_state;
+ uint32 old_buf_state;
+
+ if (!BufferIsValid(buffer))
+ elog(ERROR, "bad buffer ID: %d", buffer);
+
+ if (BufferIsLocal(buffer))
+ {
+ MarkLocalBufferDirty(buffer);
+ return;
+ }
+
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ Assert(BufferIsPinned(buffer));
+ Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
+ LW_EXCLUSIVE));
+
+ old_buf_state = pg_atomic_read_u32(&bufHdr->state);
+ for (;;)
+ {
+ if (old_buf_state & BM_LOCKED)
+ old_buf_state = WaitBufHdrUnlocked(bufHdr);
+
+ buf_state = old_buf_state;
+
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+ buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
+
+ if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
+ buf_state))
+ break;
+ }
+
+ /*
+ * If the buffer was not dirty already, do vacuum accounting.
+ */
+ if (!(old_buf_state & BM_DIRTY))
+ {
+ VacuumPageDirty++;
+ pgBufferUsage.shared_blks_dirtied++;
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageDirty;
+ }
+}
+
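+/*
+ * Illustrative sketch (not part of upstream bufmgr.c): the ordering a
+ * typical caller follows around MarkBufferDirty(). The buffer is pinned by
+ * ReadBuffer() and exclusively locked before its contents are changed, and
+ * it is marked dirty while the lock is still held. WAL logging and the
+ * critical section a real caller would need are omitted for brevity; the
+ * function name below is hypothetical.
+ */
+#ifdef NOT_USED
+static void
+example_modify_block(Relation rel, BlockNumber blkno)
+{
+ Buffer buf = ReadBuffer(rel, blkno);
+
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ /* ... modify the page returned by BufferGetPage(buf) here ... */
+ MarkBufferDirty(buf);
+ UnlockReleaseBuffer(buf);
+}
+#endif
+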
+/*
+ * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
+ *
+ * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
+ * compared to calling the two routines separately. Now it's mainly just
+ * a convenience function. However, if the passed buffer is valid and
+ * already contains the desired block, we just return it as-is; and that
+ * does save considerable work compared to a full release and reacquire.
+ *
+ * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
+ * buffer actually needs to be released. This case is the same as ReadBuffer,
+ * but can save some tests in the caller.
+ */
+Buffer
+ReleaseAndReadBuffer(Buffer buffer,
+ Relation relation,
+ BlockNumber blockNum)
+{
+ ForkNumber forkNum = MAIN_FORKNUM;
+ BufferDesc *bufHdr;
+
+ if (BufferIsValid(buffer))
+ {
+ Assert(BufferIsPinned(buffer));
+ if (BufferIsLocal(buffer))
+ {
+ bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+ if (bufHdr->tag.blockNum == blockNum &&
+ RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
+ bufHdr->tag.forkNum == forkNum)
+ return buffer;
+ ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
+ LocalRefCount[-buffer - 1]--;
+ }
+ else
+ {
+ bufHdr = GetBufferDescriptor(buffer - 1);
+ /* we have pin, so it's ok to examine tag without spinlock */
+ if (bufHdr->tag.blockNum == blockNum &&
+ RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
+ bufHdr->tag.forkNum == forkNum)
+ return buffer;
+ UnpinBuffer(bufHdr, true);
+ }
+ }
+
+ return ReadBuffer(relation, blockNum);
+}
+
+/*
+ * PinBuffer -- make buffer unavailable for replacement.
+ *
+ * For the default access strategy, the buffer's usage_count is incremented
+ * when we first pin it; for other strategies we just make sure the usage_count
+ * isn't zero. (The idea of the latter is that we don't want synchronized
+ * heap scans to inflate the count, but we need it to not be zero to discourage
+ * other backends from stealing buffers from our ring. As long as we cycle
+ * through the ring faster than the global clock-sweep cycles, buffers in
+ * our ring won't be chosen as victims for replacement by other backends.)
+ *
+ * This should be applied only to shared buffers, never local ones.
+ *
+ * Since buffers are pinned/unpinned very frequently, pin buffers without
+ * taking the buffer header lock; instead update the state variable in a
+ * loop of CAS operations. Hopefully it's just a single CAS.
+ *
+ * Note that ResourceOwnerEnlargeBuffers must have been done already.
+ *
+ * Returns true if buffer is BM_VALID, else false. This provision allows
+ * some callers to avoid an extra spinlock cycle.
+ */
+static bool
+PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
+{
+ Buffer b = BufferDescriptorGetBuffer(buf);
+ bool result;
+ PrivateRefCountEntry *ref;
+
+ ref = GetPrivateRefCountEntry(b, true);
+
+ if (ref == NULL)
+ {
+ uint32 buf_state;
+ uint32 old_buf_state;
+
+ ReservePrivateRefCountEntry();
+ ref = NewPrivateRefCountEntry(b);
+
+ old_buf_state = pg_atomic_read_u32(&buf->state);
+ for (;;)
+ {
+ if (old_buf_state & BM_LOCKED)
+ old_buf_state = WaitBufHdrUnlocked(buf);
+
+ buf_state = old_buf_state;
+
+ /* increase refcount */
+ buf_state += BUF_REFCOUNT_ONE;
+
+ if (strategy == NULL)
+ {
+ /* Default case: increase usagecount unless already max. */
+ if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
+ buf_state += BUF_USAGECOUNT_ONE;
+ }
+ else
+ {
+ /*
+ * Ring buffers shouldn't evict others from the pool. Thus we
+ * don't make the usage count more than 1.
+ */
+ if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
+ buf_state += BUF_USAGECOUNT_ONE;
+ }
+
+ if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
+ buf_state))
+ {
+ result = (buf_state & BM_VALID) != 0;
+
+ /*
+ * Assume that we acquired a buffer pin for the purposes of
+ * Valgrind buffer client checks (even in !result case) to
+ * keep things simple. Buffers that are unsafe to access are
+ * not generally guaranteed to be marked undefined or
+ * non-accessible in any case.
+ */
+ VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
+ break;
+ }
+ }
+ }
+ else
+ {
+ /*
+ * If we previously pinned the buffer, it must surely be valid.
+ *
+ * Note: We deliberately avoid a Valgrind client request here.
+ * Individual access methods can optionally superimpose buffer page
+ * client requests on top of our client requests to enforce that
+ * buffers are only accessed while locked (and pinned). It's possible
+ * that the buffer page is legitimately non-accessible here. We
+ * cannot meddle with that.
+ */
+ result = true;
+ }
+
+ ref->refcount++;
+ Assert(ref->refcount > 0);
+ ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
+ return result;
+}
+
+/*
+ * PinBuffer_Locked -- as above, but caller already locked the buffer header.
+ * The spinlock is released before return.
+ *
+ * Because this function is called with the spinlock held, the caller must
+ * have previously called ReservePrivateRefCountEntry().
+ *
+ * Currently, no callers of this function want to modify the buffer's
+ * usage_count at all, so there's no need for a strategy parameter.
+ * Also we don't bother with a BM_VALID test (the caller could check that for
+ * itself).
+ *
+ * Also all callers only ever use this function when it's known that the
+ * buffer can't have a preexisting pin by this backend. That allows us to skip
+ * searching the private refcount array & hash, which is a boon, because the
+ * spinlock is still held.
+ *
+ * Note: use of this routine is frequently mandatory, not just an optimization
+ * to save a spin lock/unlock cycle, because we need to pin a buffer before
+ * its state can change under us.
+ */
+static void
+PinBuffer_Locked(BufferDesc *buf)
+{
+ Buffer b;
+ PrivateRefCountEntry *ref;
+ uint32 buf_state;
+
+ /*
+ * As explained above, we don't expect any preexisting pins. That allows us
+ * to manipulate the PrivateRefCount after releasing the spinlock.
+ */
+ Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
+
+ /*
+ * Buffer can't have a preexisting pin, so mark its page as defined to
+ * Valgrind (this is similar to the PinBuffer() case where the backend
+ * doesn't already have a buffer pin)
+ */
+ VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
+
+ /*
+ * Since we hold the buffer spinlock, we can update the buffer state and
+ * release the lock in one operation.
+ */
+ buf_state = pg_atomic_read_u32(&buf->state);
+ Assert(buf_state & BM_LOCKED);
+ buf_state += BUF_REFCOUNT_ONE;
+ UnlockBufHdr(buf, buf_state);
+
+ b = BufferDescriptorGetBuffer(buf);
+
+ ref = NewPrivateRefCountEntry(b);
+ ref->refcount++;
+
+ ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
+}
+
+/*
+ * UnpinBuffer -- make buffer available for replacement.
+ *
+ * This should be applied only to shared buffers, never local ones.
+ *
+ * Most but not all callers want CurrentResourceOwner to be adjusted.
+ * Those that don't should pass fixOwner = false.
+ */
+static void
+UnpinBuffer(BufferDesc *buf, bool fixOwner)
+{
+ PrivateRefCountEntry *ref;
+ Buffer b = BufferDescriptorGetBuffer(buf);
+
+ /* not moving as we're likely deleting it soon anyway */
+ ref = GetPrivateRefCountEntry(b, false);
+ Assert(ref != NULL);
+
+ if (fixOwner)
+ ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
+
+ Assert(ref->refcount > 0);
+ ref->refcount--;
+ if (ref->refcount == 0)
+ {
+ uint32 buf_state;
+ uint32 old_buf_state;
+
+ /*
+ * Mark buffer non-accessible to Valgrind.
+ *
+ * Note that the buffer may have already been marked non-accessible
+ * within access method code that enforces that buffers are only
+ * accessed while a buffer lock is held.
+ */
+ VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
+
+ /* I'd better not still hold the buffer content lock */
+ Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
+
+ /*
+ * Decrement the shared reference count.
+ *
+ * Since the buffer spinlock holder can update the state using just a
+ * plain write, it's not safe to use an atomic decrement here; thus use
+ * a CAS loop.
+ */
+ old_buf_state = pg_atomic_read_u32(&buf->state);
+ for (;;)
+ {
+ if (old_buf_state & BM_LOCKED)
+ old_buf_state = WaitBufHdrUnlocked(buf);
+
+ buf_state = old_buf_state;
+
+ buf_state -= BUF_REFCOUNT_ONE;
+
+ if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
+ buf_state))
+ break;
+ }
+
+ /* Support LockBufferForCleanup() */
+ if (buf_state & BM_PIN_COUNT_WAITER)
+ {
+ /*
+ * Acquire the buffer header lock, re-check that there's a waiter.
+ * Another backend could have unpinned this buffer, and already
+ * woken up the waiter. There's no danger of the buffer being
+ * replaced after we unpinned it above, as it's pinned by the
+ * waiter.
+ */
+ buf_state = LockBufHdr(buf);
+
+ if ((buf_state & BM_PIN_COUNT_WAITER) &&
+ BUF_STATE_GET_REFCOUNT(buf_state) == 1)
+ {
+ /* we just released the last pin other than the waiter's */
+ int wait_backend_pid = buf->wait_backend_pid;
+
+ buf_state &= ~BM_PIN_COUNT_WAITER;
+ UnlockBufHdr(buf, buf_state);
+ ProcSendSignal(wait_backend_pid);
+ }
+ else
+ UnlockBufHdr(buf, buf_state);
+ }
+ ForgetPrivateRefCountEntry(ref);
+ }
+}
+
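+/*
+ * The definitions below instantiate lib/sort_template.h, producing a
+ * type-specific sort function named sort_checkpoint_bufferids() that orders
+ * an array of CkptSortItem entries using ckpt_buforder_comparator(). It is
+ * used by BufferSync() to sort the checkpoint's to-be-written buffers.
+ */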
+#define ST_SORT sort_checkpoint_bufferids
+#define ST_ELEMENT_TYPE CkptSortItem
+#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
+#define ST_SCOPE static
+#define ST_DEFINE
+#include "lib/sort_template.h"
+
+/*
+ * BufferSync -- Write out all dirty buffers in the pool.
+ *
+ * This is called at checkpoint time to write out all dirty shared buffers.
+ * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
+ * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
+ * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
+ * unlogged buffers, which are otherwise skipped. The remaining flags
+ * currently have no effect here.
+ */
+static void
+BufferSync(int flags)
+{
+ uint32 buf_state;
+ int buf_id;
+ int num_to_scan;
+ int num_spaces;
+ int num_processed;
+ int num_written;
+ CkptTsStatus *per_ts_stat = NULL;
+ Oid last_tsid;
+ binaryheap *ts_heap;
+ int i;
+ int mask = BM_DIRTY;
+ WritebackContext wb_context;
+
+ /* Make sure we can handle the pin inside SyncOneBuffer */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ /*
+ * Unless this is a shutdown checkpoint or we have been explicitly told to
+ * flush everything (CHECKPOINT_FLUSH_ALL), we write only permanent, dirty
+ * buffers. But at shutdown or end of recovery, we write all dirty buffers.
+ */
+ if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
+ CHECKPOINT_FLUSH_ALL))))
+ mask |= BM_PERMANENT;
+
+ /*
+ * Loop over all buffers, and mark the ones that need to be written with
+ * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
+ * can estimate how much work needs to be done.
+ *
+ * This allows us to write only those pages that were dirty when the
+ * checkpoint began, and not those that get dirtied while it proceeds.
+ * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
+ * later in this function, or by normal backends or the bgwriter cleaning
+ * scan, the flag is cleared. Any buffer dirtied after this point won't
+ * have the flag set.
+ *
+ * Note that if we fail to write some buffer, we may leave buffers with
+ * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
+ * certainly need to be written for the next checkpoint attempt, too.
+ */
+ num_to_scan = 0;
+ for (buf_id = 0; buf_id < NBuffers; buf_id++)
+ {
+ BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
+
+ /*
+ * Header spinlock is enough to examine BM_DIRTY, see comment in
+ * SyncOneBuffer.
+ */
+ buf_state = LockBufHdr(bufHdr);
+
+ if ((buf_state & mask) == mask)
+ {
+ CkptSortItem *item;
+
+ buf_state |= BM_CHECKPOINT_NEEDED;
+
+ item = &CkptBufferIds[num_to_scan++];
+ item->buf_id = buf_id;
+ item->tsId = bufHdr->tag.rnode.spcNode;
+ item->relNode = bufHdr->tag.rnode.relNode;
+ item->forkNum = bufHdr->tag.forkNum;
+ item->blockNum = bufHdr->tag.blockNum;
+ }
+
+ UnlockBufHdr(bufHdr, buf_state);
+
+ /* Check for barrier events in case NBuffers is large. */
+ if (ProcSignalBarrierPending)
+ ProcessProcSignalBarrier();
+ }
+
+ if (num_to_scan == 0)
+ return; /* nothing to do */
+
+ WritebackContextInit(&wb_context, &checkpoint_flush_after);
+
+ TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
+
+ /*
+ * Sort buffers that need to be written to reduce the likelihood of random
+ * IO. The sorting is also important for the implementation of balancing
+ * writes between tablespaces. Without balancing writes we'd potentially
+ * end up writing to the tablespaces one-by-one; possibly overloading the
+ * underlying system.
+ */
+ sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
+
+ num_spaces = 0;
+
+ /*
+ * Allocate progress status for each tablespace with buffers that need to
+ * be flushed. This requires the to-be-flushed array to be sorted.
+ */
+ last_tsid = InvalidOid;
+ for (i = 0; i < num_to_scan; i++)
+ {
+ CkptTsStatus *s;
+ Oid cur_tsid;
+
+ cur_tsid = CkptBufferIds[i].tsId;
+
+ /*
+ * Grow the array of per-tablespace status structs every time a new
+ * tablespace is found.
+ */
+ if (last_tsid == InvalidOid || last_tsid != cur_tsid)
+ {
+ Size sz;
+
+ num_spaces++;
+
+ /*
+ * Not worth adding grow-by-power-of-2 logic here - even with a
+ * few hundred tablespaces this should be fine.
+ */
+ sz = sizeof(CkptTsStatus) * num_spaces;
+
+ if (per_ts_stat == NULL)
+ per_ts_stat = (CkptTsStatus *) palloc(sz);
+ else
+ per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
+
+ s = &per_ts_stat[num_spaces - 1];
+ memset(s, 0, sizeof(*s));
+ s->tsId = cur_tsid;
+
+ /*
+ * The first buffer in this tablespace. As CkptBufferIds is sorted
+ * by tablespace all (s->num_to_scan) buffers in this tablespace
+ * will follow afterwards.
+ */
+ s->index = i;
+
+ /*
+ * progress_slice will be determined once we know how many buffers
+ * are in each tablespace, i.e. after this loop.
+ */
+
+ last_tsid = cur_tsid;
+ }
+ else
+ {
+ s = &per_ts_stat[num_spaces - 1];
+ }
+
+ s->num_to_scan++;
+
+ /* Check for barrier events. */
+ if (ProcSignalBarrierPending)
+ ProcessProcSignalBarrier();
+ }
+
+ Assert(num_spaces > 0);
+
+ /*
+ * Build a min-heap over the write-progress in the individual tablespaces,
+ * and compute how large a portion of the total progress a single
+ * processed buffer is.
+ */
+ ts_heap = binaryheap_allocate(num_spaces,
+ ts_ckpt_progress_comparator,
+ NULL);
+
+ for (i = 0; i < num_spaces; i++)
+ {
+ CkptTsStatus *ts_stat = &per_ts_stat[i];
+
+ ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
+
+ binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
+ }
+
+ binaryheap_build(ts_heap);
+
+ /*
+ * Iterate through to-be-checkpointed buffers and write the ones (still)
+ * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
+ * tablespaces; otherwise the sorting would lead to only one tablespace
+ * receiving writes at a time, making inefficient use of the hardware.
+ */
+ num_processed = 0;
+ num_written = 0;
+ while (!binaryheap_empty(ts_heap))
+ {
+ BufferDesc *bufHdr = NULL;
+ CkptTsStatus *ts_stat = (CkptTsStatus *)
+ DatumGetPointer(binaryheap_first(ts_heap));
+
+ buf_id = CkptBufferIds[ts_stat->index].buf_id;
+ Assert(buf_id != -1);
+
+ bufHdr = GetBufferDescriptor(buf_id);
+
+ num_processed++;
+
+ /*
+ * We don't need to acquire the lock here, because we're only looking
+ * at a single bit. It's possible that someone else writes the buffer
+ * and clears the flag right after we check, but that doesn't matter
+ * since SyncOneBuffer will then do nothing. However, there is a
+ * further race condition: it's conceivable that between the time we
+ * examine the bit here and the time SyncOneBuffer acquires the lock,
+ * someone else not only wrote the buffer but replaced it with another
+ * page and dirtied it. In that improbable case, SyncOneBuffer will
+ * write the buffer though we didn't need to. It doesn't seem worth
+ * guarding against this, though.
+ */
+ if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
+ {
+ if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
+ {
+ TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
+ BgWriterStats.m_buf_written_checkpoints++;
+ num_written++;
+ }
+ }
+
+ /*
+ * Measure progress independently of actually having to flush the buffer -
+ * otherwise writing becomes unbalanced.
+ */
+ ts_stat->progress += ts_stat->progress_slice;
+ ts_stat->num_scanned++;
+ ts_stat->index++;
+
+ /* Have all the buffers from the tablespace been processed? */
+ if (ts_stat->num_scanned == ts_stat->num_to_scan)
+ {
+ binaryheap_remove_first(ts_heap);
+ }
+ else
+ {
+ /* update heap with the new progress */
+ binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
+ }
+
+ /*
+ * Sleep to throttle our I/O rate.
+ *
+ * (This will check for barrier events even if it doesn't sleep.)
+ */
+ CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
+ }
+
+ /* issue all pending flushes */
+ IssuePendingWritebacks(&wb_context);
+
+ pfree(per_ts_stat);
+ per_ts_stat = NULL;
+ binaryheap_free(ts_heap);
+
+ /*
+ * Update checkpoint statistics. As noted above, this doesn't include
+ * buffers written by other backends or bgwriter scan.
+ */
+ CheckpointStats.ckpt_bufs_written += num_written;
+
+ TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
+}
+
+/*
+ * BgBufferSync -- Write out some dirty buffers in the pool.
+ *
+ * This is called periodically by the background writer process.
+ *
+ * Returns true if it's appropriate for the bgwriter process to go into
+ * low-power hibernation mode. (This happens if the strategy clock sweep
+ * has been "lapped" and no buffer allocations have occurred recently,
+ * or if the bgwriter has been effectively disabled by setting
+ * bgwriter_lru_maxpages to 0.)
+ */
+bool
+BgBufferSync(WritebackContext *wb_context)
+{
+ /* info obtained from freelist.c */
+ int strategy_buf_id;
+ uint32 strategy_passes;
+ uint32 recent_alloc;
+
+ /*
+ * Information saved between calls so we can determine the strategy
+ * point's advance rate and avoid scanning already-cleaned buffers.
+ */
+ static bool saved_info_valid = false;
+ static int prev_strategy_buf_id;
+ static uint32 prev_strategy_passes;
+ static int next_to_clean;
+ static uint32 next_passes;
+
+ /* Moving averages of allocation rate and clean-buffer density */
+ static float smoothed_alloc = 0;
+ static float smoothed_density = 10.0;
+
+ /* Potentially these could be tunables, but for now, not */
+ float smoothing_samples = 16;
+ float scan_whole_pool_milliseconds = 120000.0;
+
+ /* Used to compute how far we scan ahead */
+ long strategy_delta;
+ int bufs_to_lap;
+ int bufs_ahead;
+ float scans_per_alloc;
+ int reusable_buffers_est;
+ int upcoming_alloc_est;
+ int min_scan_buffers;
+
+ /* Variables for the scanning loop proper */
+ int num_to_scan;
+ int num_written;
+ int reusable_buffers;
+
+ /* Variables for final smoothed_density update */
+ long new_strategy_delta;
+ uint32 new_recent_alloc;
+
+ /*
+ * Find out where the freelist clock sweep currently is, and how many
+ * buffer allocations have happened since our last call.
+ */
+ strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
+
+ /* Report buffer alloc counts to pgstat */
+ BgWriterStats.m_buf_alloc += recent_alloc;
+
+ /*
+ * If we're not running the LRU scan, just stop after doing the stats
+ * stuff. We mark the saved state invalid so that we can recover sanely
+ * if LRU scan is turned back on later.
+ */
+ if (bgwriter_lru_maxpages <= 0)
+ {
+ saved_info_valid = false;
+ return true;
+ }
+
+ /*
+ * Compute strategy_delta = how many buffers have been scanned by the
+ * clock sweep since last time. If first time through, assume none. Then
+ * see if we are still ahead of the clock sweep, and if so, how many
+ * buffers we could scan before we'd catch up with it and "lap" it. Note:
+ * the weird-looking coding of the xxx_passes comparisons is to avoid bogus
+ * behavior when the passes counts wrap around.
+ */
+ if (saved_info_valid)
+ {
+ int32 passes_delta = strategy_passes - prev_strategy_passes;
+
+ strategy_delta = strategy_buf_id - prev_strategy_buf_id;
+ strategy_delta += (long) passes_delta * NBuffers;
+
+ Assert(strategy_delta >= 0);
+
+ if ((int32) (next_passes - strategy_passes) > 0)
+ {
+ /* we're one pass ahead of the strategy point */
+ bufs_to_lap = strategy_buf_id - next_to_clean;
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
+ next_passes, next_to_clean,
+ strategy_passes, strategy_buf_id,
+ strategy_delta, bufs_to_lap);
+#endif
+ }
+ else if (next_passes == strategy_passes &&
+ next_to_clean >= strategy_buf_id)
+ {
+ /* on same pass, but ahead or at least not behind */
+ bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
+ next_passes, next_to_clean,
+ strategy_passes, strategy_buf_id,
+ strategy_delta, bufs_to_lap);
+#endif
+ }
+ else
+ {
+ /*
+ * We're behind, so skip forward to the strategy point and start
+ * cleaning from there.
+ */
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
+ next_passes, next_to_clean,
+ strategy_passes, strategy_buf_id,
+ strategy_delta);
+#endif
+ next_to_clean = strategy_buf_id;
+ next_passes = strategy_passes;
+ bufs_to_lap = NBuffers;
+ }
+ }
+ else
+ {
+ /*
+ * Initializing at startup or after LRU scanning had been off. Always
+ * start at the strategy point.
+ */
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
+ strategy_passes, strategy_buf_id);
+#endif
+ strategy_delta = 0;
+ next_to_clean = strategy_buf_id;
+ next_passes = strategy_passes;
+ bufs_to_lap = NBuffers;
+ }
+
+ /* Update saved info for next time */
+ prev_strategy_buf_id = strategy_buf_id;
+ prev_strategy_passes = strategy_passes;
+ saved_info_valid = true;
+
+ /*
+ * Compute how many buffers had to be scanned for each new allocation, ie,
+ * 1/density of reusable buffers, and track a moving average of that.
+ *
+ * If the strategy point didn't move, we don't update the density estimate.
+ */
+ if (strategy_delta > 0 && recent_alloc > 0)
+ {
+ scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
+ smoothed_density += (scans_per_alloc - smoothed_density) /
+ smoothing_samples;
+ }
+
+ /*
+ * Estimate how many reusable buffers there are between the current
+ * strategy point and where we've scanned ahead to, based on the smoothed
+ * density estimate.
+ */
+ bufs_ahead = NBuffers - bufs_to_lap;
+ reusable_buffers_est = (float) bufs_ahead / smoothed_density;
+
+ /*
+ * Track a moving average of recent buffer allocations. Here, rather than
+ * a true average we want a fast-attack, slow-decline behavior: we
+ * immediately follow any increase.
+ */
+ if (smoothed_alloc <= (float) recent_alloc)
+ smoothed_alloc = recent_alloc;
+ else
+ smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
+ smoothing_samples;
+
+ /* Scale the estimate by a GUC to allow more aggressive tuning. */
+ upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
+
+ /*
+ * If recent_alloc remains at zero for many cycles, smoothed_alloc will
+ * eventually underflow to zero, and the underflows produce annoying
+ * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
+ * zero, there's no point in tracking smaller and smaller values of
+ * smoothed_alloc, so just reset it to exactly zero to avoid this
+ * syndrome. It will pop back up as soon as recent_alloc increases.
+ */
+ if (upcoming_alloc_est == 0)
+ smoothed_alloc = 0;
+
+ /*
+ * Even in cases where there's been little or no buffer allocation
+ * activity, we want to make a small amount of progress through the buffer
+ * cache so that as many reusable buffers as possible are clean after an
+ * idle period.
+ *
+ * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
+ * the BGW will be called during the scan_whole_pool time; slice the
+ * buffer pool into that many sections.
+ */
+ min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
+
+ if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
+ {
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
+ upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
+#endif
+ upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
+ }
+
+ /*
+ * Now write out dirty reusable buffers, working forward from the
+ * next_to_clean point, until we have lapped the strategy scan, or cleaned
+ * enough buffers to match our estimate of the next cycle's allocation
+ * requirements, or hit the bgwriter_lru_maxpages limit.
+ */
+
+ /* Make sure we can handle the pin inside SyncOneBuffer */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ num_to_scan = bufs_to_lap;
+ num_written = 0;
+ reusable_buffers = reusable_buffers_est;
+
+ /* Execute the LRU scan */
+ while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
+ {
+ int sync_state = SyncOneBuffer(next_to_clean, true,
+ wb_context);
+
+ if (++next_to_clean >= NBuffers)
+ {
+ next_to_clean = 0;
+ next_passes++;
+ }
+ num_to_scan--;
+
+ if (sync_state & BUF_WRITTEN)
+ {
+ reusable_buffers++;
+ if (++num_written >= bgwriter_lru_maxpages)
+ {
+ BgWriterStats.m_maxwritten_clean++;
+ break;
+ }
+ }
+ else if (sync_state & BUF_REUSABLE)
+ reusable_buffers++;
+ }
+
+ BgWriterStats.m_buf_written_clean += num_written;
+
+#ifdef BGW_DEBUG
+ elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
+ recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
+ smoothed_density, reusable_buffers_est, upcoming_alloc_est,
+ bufs_to_lap - num_to_scan,
+ num_written,
+ reusable_buffers - reusable_buffers_est);
+#endif
+
+ /*
+ * Consider the above scan as being like a new allocation scan.
+ * Characterize its density and update the smoothed one based on it. This
+ * effectively halves the moving average period in cases where both the
+ * strategy and the background writer are doing some useful scanning,
+ * which is helpful because a long memory isn't as desirable on the
+ * density estimates.
+ */
+ new_strategy_delta = bufs_to_lap - num_to_scan;
+ new_recent_alloc = reusable_buffers - reusable_buffers_est;
+ if (new_strategy_delta > 0 && new_recent_alloc > 0)
+ {
+ scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
+ smoothed_density += (scans_per_alloc - smoothed_density) /
+ smoothing_samples;
+
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
+ new_recent_alloc, new_strategy_delta,
+ scans_per_alloc, smoothed_density);
+#endif
+ }
+
+ /* Return true if OK to hibernate */
+ return (bufs_to_lap == 0 && recent_alloc == 0);
+}
+
+/*
+ * SyncOneBuffer -- process a single buffer during syncing.
+ *
+ * If skip_recently_used is true, we don't write currently-pinned buffers, nor
+ * buffers marked recently used, as these are not replacement candidates.
+ *
+ * Returns a bitmask containing the following flag bits:
+ * BUF_WRITTEN: we wrote the buffer.
+ * BUF_REUSABLE: buffer is available for replacement, ie, it has
+ * pin count 0 and usage count 0.
+ *
+ * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
+ * after locking it, but we don't care all that much.)
+ *
+ * Note: caller must have done ResourceOwnerEnlargeBuffers.
+ */
+static int
+SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
+{
+ BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
+ int result = 0;
+ uint32 buf_state;
+ BufferTag tag;
+
+ ReservePrivateRefCountEntry();
+
+ /*
+ * Check whether buffer needs writing.
+ *
+ * We can make this check without taking the buffer content lock so long
+ * as we mark pages dirty in access methods *before* logging changes with
+ * XLogInsert(): if someone marks the buffer dirty just after our check, we
+ * don't worry, because our checkpoint.redo points before the log record for
+ * the upcoming changes, and so we are not required to write such a dirty
+ * buffer.
+ */
+ buf_state = LockBufHdr(bufHdr);
+
+ if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
+ BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
+ {
+ result |= BUF_REUSABLE;
+ }
+ else if (skip_recently_used)
+ {
+ /* Caller told us not to write recently-used buffers */
+ UnlockBufHdr(bufHdr, buf_state);
+ return result;
+ }
+
+ if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
+ {
+ /* It's clean, so nothing to do */
+ UnlockBufHdr(bufHdr, buf_state);
+ return result;
+ }
+
+ /*
+ * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
+ * buffer is clean by the time we've locked it.)
+ */
+ PinBuffer_Locked(bufHdr);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+
+ FlushBuffer(bufHdr, NULL);
+
+ LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+
+ tag = bufHdr->tag;
+
+ UnpinBuffer(bufHdr, true);
+
+ ScheduleBufferTagForWriteback(wb_context, &tag);
+
+ return result | BUF_WRITTEN;
+}
+
+/*
+ * AtEOXact_Buffers - clean up at end of transaction.
+ *
+ * As of PostgreSQL 8.0, buffer pins should get released by the
+ * ResourceOwner mechanism. This routine is just a debugging
+ * cross-check that no pins remain.
+ */
+void
+AtEOXact_Buffers(bool isCommit)
+{
+ CheckForBufferLeaks();
+
+ AtEOXact_LocalBuffers(isCommit);
+
+ Assert(PrivateRefCountOverflowed == 0);
+}
+
+/*
+ * Initialize access to shared buffer pool
+ *
+ * This is called during backend startup (whether standalone or under the
+ * postmaster). It sets up for this backend's access to the already-existing
+ * buffer pool.
+ *
+ * NB: this is called before InitProcess(), so we do not have a PGPROC and
+ * cannot do LWLockAcquire; hence we can't actually access stuff in
+ * shared memory yet. We are only initializing local data here.
+ * (See also InitBufferPoolBackend)
+ */
+void
+InitBufferPoolAccess(void)
+{
+ HASHCTL hash_ctl;
+
+ memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
+
+ hash_ctl.keysize = sizeof(int32);
+ hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
+
+ PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
+ HASH_ELEM | HASH_BLOBS);
+}
+
+/*
+ * InitBufferPoolBackend --- second-stage initialization of a new backend
+ *
+ * This is called after we have acquired a PGPROC and so can safely get
+ * LWLocks. We don't currently need to do anything at this stage ...
+ * except register a shmem-exit callback. AtProcExit_Buffers needs LWLock
+ * access, and therefore has to be called at the corresponding phase of
+ * backend shutdown.
+ */
+void
+InitBufferPoolBackend(void)
+{
+ on_shmem_exit(AtProcExit_Buffers, 0);
+}
+
+/*
+ * During backend exit, ensure that we released all shared-buffer locks and
+ * assert that we have no remaining pins.
+ */
+static void
+AtProcExit_Buffers(int code, Datum arg)
+{
+ AbortBufferIO();
+ UnlockBuffers();
+
+ CheckForBufferLeaks();
+
+ /* localbuf.c needs a chance too */
+ AtProcExit_LocalBuffers();
+}
+
+/*
+ * CheckForBufferLeaks - ensure this backend holds no buffer pins
+ *
+ * As of PostgreSQL 8.0, buffer pins should get released by the
+ * ResourceOwner mechanism. This routine is just a debugging
+ * cross-check that no pins remain.
+ */
+static void
+CheckForBufferLeaks(void)
+{
+#ifdef USE_ASSERT_CHECKING
+ int RefCountErrors = 0;
+ PrivateRefCountEntry *res;
+ int i;
+
+ /* check the array */
+ for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+ {
+ res = &PrivateRefCountArray[i];
+
+ if (res->buffer != InvalidBuffer)
+ {
+ PrintBufferLeakWarning(res->buffer);
+ RefCountErrors++;
+ }
+ }
+
+ /* if necessary search the hash */
+ if (PrivateRefCountOverflowed)
+ {
+ HASH_SEQ_STATUS hstat;
+
+ hash_seq_init(&hstat, PrivateRefCountHash);
+ while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ PrintBufferLeakWarning(res->buffer);
+ RefCountErrors++;
+ }
+
+ }
+
+ Assert(RefCountErrors == 0);
+#endif
+}
+
+/*
+ * Helper routine to issue warnings when a buffer is unexpectedly pinned
+ */
+void
+PrintBufferLeakWarning(Buffer buffer)
+{
+ BufferDesc *buf;
+ int32 loccount;
+ char *path;
+ BackendId backend;
+ uint32 buf_state;
+
+ Assert(BufferIsValid(buffer));
+ if (BufferIsLocal(buffer))
+ {
+ buf = GetLocalBufferDescriptor(-buffer - 1);
+ loccount = LocalRefCount[-buffer - 1];
+ backend = MyBackendId;
+ }
+ else
+ {
+ buf = GetBufferDescriptor(buffer - 1);
+ loccount = GetPrivateRefCount(buffer);
+ backend = InvalidBackendId;
+ }
+
+ /* theoretically we should lock the bufhdr here */
+ path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+ buf_state = pg_atomic_read_u32(&buf->state);
+ elog(WARNING,
+ "buffer refcount leak: [%03d] "
+ "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
+ buffer, path,
+ buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
+ BUF_STATE_GET_REFCOUNT(buf_state), loccount);
+ pfree(path);
+}
+
+/*
+ * CheckPointBuffers
+ *
+ * Flush all dirty blocks in buffer pool to disk at checkpoint time.
+ *
+ * Note: temporary relations do not participate in checkpoints, so they don't
+ * need to be flushed.
+ */
+void
+CheckPointBuffers(int flags)
+{
+ BufferSync(flags);
+}
+
+
+/*
+ * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
+ */
+void
+BufmgrCommit(void)
+{
+ /* Nothing to do in bufmgr anymore... */
+}
+
+/*
+ * BufferGetBlockNumber
+ * Returns the block number associated with a buffer.
+ *
+ * Note:
+ * Assumes that the buffer is valid and pinned, else the
+ * value may be obsolete immediately...
+ */
+BlockNumber
+BufferGetBlockNumber(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+
+ Assert(BufferIsPinned(buffer));
+
+ if (BufferIsLocal(buffer))
+ bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+ else
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ /* pinned, so OK to read tag without spinlock */
+ return bufHdr->tag.blockNum;
+}
+
+/*
+ * BufferGetTag
+ * Returns the relfilenode, fork number and block number associated with
+ * a buffer.
+ */
+void
+BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum,
+ BlockNumber *blknum)
+{
+ BufferDesc *bufHdr;
+
+ /* Do the same checks as BufferGetBlockNumber. */
+ Assert(BufferIsPinned(buffer));
+
+ if (BufferIsLocal(buffer))
+ bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+ else
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ /* pinned, so OK to read tag without spinlock */
+ *rnode = bufHdr->tag.rnode;
+ *forknum = bufHdr->tag.forkNum;
+ *blknum = bufHdr->tag.blockNum;
+}
+
+/*
+ * FlushBuffer
+ * Physically write out a shared buffer.
+ *
+ * NOTE: this actually just passes the buffer contents to the kernel; the
+ * real write to disk won't happen until the kernel feels like it. This
+ * is okay from our point of view since we can redo the changes from WAL.
+ * However, we will need to force the changes to disk via fsync before
+ * we can checkpoint WAL.
+ *
+ * The caller must hold a pin on the buffer and have share-locked the
+ * buffer contents. (Note: a share-lock does not prevent updates of
+ * hint bits in the buffer, so the page could change while the write
+ * is in progress, but we assume that that will not invalidate the data
+ * written.)
+ *
+ * If the caller has an smgr reference for the buffer's relation, pass it
+ * as the second parameter. If not, pass NULL.
+ */
+static void
+FlushBuffer(BufferDesc *buf, SMgrRelation reln)
+{
+ XLogRecPtr recptr;
+ ErrorContextCallback errcallback;
+ instr_time io_start,
+ io_time;
+ Block bufBlock;
+ char *bufToWrite;
+ uint32 buf_state;
+
+ /*
+ * Try to start an I/O operation. If StartBufferIO returns false, then
+ * someone else flushed the buffer before we could, so we need not do
+ * anything.
+ */
+ if (!StartBufferIO(buf, false))
+ return;
+
+ /* Setup error traceback support for ereport() */
+ errcallback.callback = shared_buffer_write_error_callback;
+ errcallback.arg = (void *) buf;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ /* Find smgr relation for buffer */
+ if (reln == NULL)
+ reln = smgropen(buf->tag.rnode, InvalidBackendId);
+
+ TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
+ buf->tag.blockNum,
+ reln->smgr_rnode.node.spcNode,
+ reln->smgr_rnode.node.dbNode,
+ reln->smgr_rnode.node.relNode);
+
+ buf_state = LockBufHdr(buf);
+
+ /*
+ * Run PageGetLSN while holding header lock, since we don't have the
+ * buffer locked exclusively in all cases.
+ */
+ recptr = BufferGetLSN(buf);
+
+ /* To check if block content changes while flushing. - vadim 01/17/97 */
+ buf_state &= ~BM_JUST_DIRTIED;
+ UnlockBufHdr(buf, buf_state);
+
+ /*
+ * Force XLOG flush up to buffer's LSN. This implements the basic WAL
+ * rule that log updates must hit disk before any of the data-file changes
+ * they describe do.
+ *
+ * However, this rule does not apply to unlogged relations, which will be
+ * lost after a crash anyway. Most unlogged relation pages do not bear
+ * LSNs since we never emit WAL records for them, and therefore flushing
+ * up through the buffer LSN would be useless, but harmless. However,
+ * GiST indexes use LSNs internally to track page-splits, and therefore
+ * unlogged GiST pages bear "fake" LSNs generated by
+ * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
+ * LSN counter could advance past the WAL insertion point; and if it did
+ * happen, attempting to flush WAL through that location would fail, with
+ * disastrous system-wide consequences. To make sure that can't happen,
+ * skip the flush if the buffer isn't permanent.
+ */
+ if (buf_state & BM_PERMANENT)
+ XLogFlush(recptr);
+
+ /*
+ * Now it's safe to write buffer to disk. Note that no one else should
+ * have been able to write it while we were busy with log flushing because
+ * only one process at a time can set the BM_IO_IN_PROGRESS bit.
+ */
+ bufBlock = BufHdrGetBlock(buf);
+
+ /*
+ * Update page checksum if desired. Since we have only shared lock on the
+ * buffer, other processes might be updating hint bits in it, so we must
+ * copy the page to private storage if we do checksumming.
+ */
+ bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
+
+ if (track_io_timing)
+ INSTR_TIME_SET_CURRENT(io_start);
+
+ /*
+ * bufToWrite is either the shared buffer or a copy, as appropriate.
+ */
+ smgrwrite(reln,
+ buf->tag.forkNum,
+ buf->tag.blockNum,
+ bufToWrite,
+ false);
+
+ if (track_io_timing)
+ {
+ INSTR_TIME_SET_CURRENT(io_time);
+ INSTR_TIME_SUBTRACT(io_time, io_start);
+ pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+ INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+ }
+
+ pgBufferUsage.shared_blks_written++;
+
+ /*
+ * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
+ * end the BM_IO_IN_PROGRESS state.
+ */
+ TerminateBufferIO(buf, true, 0);
+
+ TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
+ buf->tag.blockNum,
+ reln->smgr_rnode.node.spcNode,
+ reln->smgr_rnode.node.dbNode,
+ reln->smgr_rnode.node.relNode);
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+}
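+
+/*
+ * The ordering in FlushBuffer() is what enforces the WAL-before-data rule
+ * described above: the page LSN is read while holding the buffer header
+ * lock, XLogFlush() is called (for permanent buffers) before smgrwrite(),
+ * and only afterwards does TerminateBufferIO() clear BM_IO_IN_PROGRESS and,
+ * if the page was not re-dirtied, BM_DIRTY.
+ */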
+
+/*
+ * RelationGetNumberOfBlocksInFork
+ * Determines the current number of pages in the specified relation fork.
+ *
+ * Note that the accuracy of the result will depend on the details of the
+ * relation's storage. For builtin AMs it'll be accurate, but for external AMs
+ * it might not be.
+ */
+BlockNumber
+RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
+{
+ switch (relation->rd_rel->relkind)
+ {
+ case RELKIND_SEQUENCE:
+ case RELKIND_INDEX:
+ case RELKIND_PARTITIONED_INDEX:
+ /* Open it at the smgr level if not already done */
+ RelationOpenSmgr(relation);
+
+ return smgrnblocks(relation->rd_smgr, forkNum);
+
+ case RELKIND_RELATION:
+ case RELKIND_TOASTVALUE:
+ case RELKIND_MATVIEW:
+ {
+				 * Not every table AM uses fixed-size blocks of BLCKSZ bytes,
+				 * so the tableam API returns the size in bytes.  For the
+				 * purpose of this routine we want the number of blocks, so
+				 * divide, rounding up.
+ * Therefore divide, rounding up.
+ */
+ uint64 szbytes;
+
+ szbytes = table_relation_size(relation, forkNum);
+
+ return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
+ }
+ case RELKIND_VIEW:
+ case RELKIND_COMPOSITE_TYPE:
+ case RELKIND_FOREIGN_TABLE:
+ case RELKIND_PARTITIONED_TABLE:
+ default:
+ Assert(false);
+ break;
+ }
+
+ return 0; /* keep compiler quiet */
+}
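+
+/*
+ * To illustrate the rounding in RelationGetNumberOfBlocksInFork(): with the
+ * default BLCKSZ of 8192, a table AM reporting szbytes = 8193 yields
+ * (8193 + 8191) / 8192 = 2 blocks, while szbytes = 0 yields 0 blocks.
+ */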
+
+/*
+ * BufferIsPermanent
+ * Determines whether a buffer will potentially still be around after
+ * a crash. Caller must hold a buffer pin.
+ */
+bool
+BufferIsPermanent(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+
+ /* Local buffers are used only for temp relations. */
+ if (BufferIsLocal(buffer))
+ return false;
+
+ /* Make sure we've got a real buffer, and that we hold a pin on it. */
+ Assert(BufferIsValid(buffer));
+ Assert(BufferIsPinned(buffer));
+
+ /*
+ * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
+ * need not bother with the buffer header spinlock. Even if someone else
+ * changes the buffer header state while we're doing this, the state is
+ * changed atomically, so we'll read the old value or the new value, but
+ * not random garbage.
+ */
+ bufHdr = GetBufferDescriptor(buffer - 1);
+ return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
+}
+
+/*
+ * BufferGetLSNAtomic
+ * Retrieves the LSN of the buffer atomically using a buffer header lock.
+ * This is necessary for some callers who may not have an exclusive lock
+ * on the buffer.
+ */
+XLogRecPtr
+BufferGetLSNAtomic(Buffer buffer)
+{
+ BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
+ char *page = BufferGetPage(buffer);
+ XLogRecPtr lsn;
+ uint32 buf_state;
+
+ /*
+ * If we don't need locking for correctness, fastpath out.
+ */
+ if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
+ return PageGetLSN(page);
+
+ /* Make sure we've got a real buffer, and that we hold a pin on it. */
+ Assert(BufferIsValid(buffer));
+ Assert(BufferIsPinned(buffer));
+
+ buf_state = LockBufHdr(bufHdr);
+ lsn = PageGetLSN(page);
+ UnlockBufHdr(bufHdr, buf_state);
+
+ return lsn;
+}
+
+/* ---------------------------------------------------------------------
+ * DropRelFileNodeBuffers
+ *
+ * This function removes from the buffer pool all the pages of the
+ * specified relation forks that have block numbers >= firstDelBlock.
+ * (In particular, with firstDelBlock = 0, all pages are removed.)
+ * Dirty pages are simply dropped, without bothering to write them
+ * out first. Therefore, this is NOT rollback-able, and so should be
+ * used only with extreme caution!
+ *
+ * Currently, this is called only from smgr.c when the underlying file
+ * is about to be deleted or truncated (firstDelBlock is needed for
+ * the truncation case). The data in the affected pages would therefore
+ * be deleted momentarily anyway, and there is no point in writing it.
+ * It is the responsibility of higher-level code to ensure that the
+ * deletion or truncation does not lose any data that could be needed
+ * later. It is also the responsibility of higher-level code to ensure
+ * that no other process could be trying to load more pages of the
+ * relation into buffers.
+ * --------------------------------------------------------------------
+ */
+void
+DropRelFileNodeBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
+ int nforks, BlockNumber *firstDelBlock)
+{
+ int i;
+ int j;
+ RelFileNodeBackend rnode;
+ BlockNumber nForkBlock[MAX_FORKNUM];
+ uint64 nBlocksToInvalidate = 0;
+
+ rnode = smgr_reln->smgr_rnode;
+
+ /* If it's a local relation, it's localbuf.c's problem. */
+ if (RelFileNodeBackendIsTemp(rnode))
+ {
+ if (rnode.backend == MyBackendId)
+ {
+ for (j = 0; j < nforks; j++)
+ DropRelFileNodeLocalBuffers(rnode.node, forkNum[j],
+ firstDelBlock[j]);
+ }
+ return;
+ }
+
+ /*
+	 * To remove all the pages of the specified relation forks from the buffer
+	 * pool, we would need to scan the entire buffer pool, but we can optimize
+	 * this by looking the buffers up in the BufMapping table instead, provided
+	 * we know the exact size of each fork of the relation.  The exact size is
+	 * required to ensure that we don't leave any buffer for the relation being
+	 * dropped, as otherwise the background writer or checkpointer can end up
+	 * with a PANIC while flushing buffers corresponding to files that don't
+	 * exist.
+	 *
+	 * To know the exact size, we rely on the size we cached for each fork
+	 * during recovery, which limits the optimization to recovery and to
+	 * standbys, but we can easily extend it once we have a shared cache for
+	 * relation sizes.
+	 *
+	 * In recovery, we cache the value returned by the first lseek(SEEK_END),
+	 * and future writes keep the cached value up-to-date; see smgrextend.  It
+	 * is possible that the value of the first lseek is smaller than the actual
+	 * number of existing blocks in the file due to buggy Linux kernels that
+	 * might not have accounted for the recent write.  But that should be fine
+	 * because there must not be any buffers after that file size.
+ */
+ for (i = 0; i < nforks; i++)
+ {
+ /* Get the number of blocks for a relation's fork */
+ nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
+
+ if (nForkBlock[i] == InvalidBlockNumber)
+ {
+ nBlocksToInvalidate = InvalidBlockNumber;
+ break;
+ }
+
+ /* calculate the number of blocks to be invalidated */
+ nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
+ }
+
+ /*
+ * We apply the optimization iff the total number of blocks to invalidate
+ * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
+ */
+ if (BlockNumberIsValid(nBlocksToInvalidate) &&
+ nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
+ {
+ for (j = 0; j < nforks; j++)
+ FindAndDropRelFileNodeBuffers(rnode.node, forkNum[j],
+ nForkBlock[j], firstDelBlock[j]);
+ return;
+ }
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ BufferDesc *bufHdr = GetBufferDescriptor(i);
+ uint32 buf_state;
+
+ /*
+ * We can make this a tad faster by prechecking the buffer tag before
+ * we attempt to lock the buffer; this saves a lot of lock
+ * acquisitions in typical cases. It should be safe because the
+ * caller must have AccessExclusiveLock on the relation, or some other
+ * reason to be certain that no one is loading new pages of the rel
+ * into the buffer pool. (Otherwise we might well miss such pages
+ * entirely.) Therefore, while the tag might be changing while we
+ * look at it, it can't be changing *to* a value we care about, only
+ * *away* from such a value. So false negatives are impossible, and
+ * false positives are safe because we'll recheck after getting the
+ * buffer lock.
+ *
+ * We could check forkNum and blockNum as well as the rnode, but the
+ * incremental win from doing so seems small.
+ */
+ if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+ continue;
+
+ buf_state = LockBufHdr(bufHdr);
+
+ for (j = 0; j < nforks; j++)
+ {
+ if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
+ bufHdr->tag.forkNum == forkNum[j] &&
+ bufHdr->tag.blockNum >= firstDelBlock[j])
+ {
+ InvalidateBuffer(bufHdr); /* releases spinlock */
+ break;
+ }
+ }
+ if (j >= nforks)
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+}
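+
+/*
+ * In other words, the fast path above applies only when every requested fork
+ * has a cached size (which, per the comment above, currently happens during
+ * recovery) and the total number of blocks to invalidate stays below
+ * BUF_DROP_FULL_SCAN_THRESHOLD; otherwise the function falls back to the
+ * full scan over all NBuffers buffer headers.
+ */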
+
+/* ---------------------------------------------------------------------
+ * DropRelFileNodesAllBuffers
+ *
+ * This function removes from the buffer pool all the pages of all
+ * forks of the specified relations. It's equivalent to calling
+ * DropRelFileNodeBuffers once per fork per relation with
+ * firstDelBlock = 0.
+ * --------------------------------------------------------------------
+ */
+void
+DropRelFileNodesAllBuffers(SMgrRelation *smgr_reln, int nnodes)
+{
+ int i;
+ int j;
+ int n = 0;
+ SMgrRelation *rels;
+ BlockNumber (*block)[MAX_FORKNUM + 1];
+ uint64 nBlocksToInvalidate = 0;
+ RelFileNode *nodes;
+ bool cached = true;
+ bool use_bsearch;
+
+ if (nnodes == 0)
+ return;
+
+ rels = palloc(sizeof(SMgrRelation) * nnodes); /* non-local relations */
+
+ /* If it's a local relation, it's localbuf.c's problem. */
+ for (i = 0; i < nnodes; i++)
+ {
+ if (RelFileNodeBackendIsTemp(smgr_reln[i]->smgr_rnode))
+ {
+ if (smgr_reln[i]->smgr_rnode.backend == MyBackendId)
+ DropRelFileNodeAllLocalBuffers(smgr_reln[i]->smgr_rnode.node);
+ }
+ else
+ rels[n++] = smgr_reln[i];
+ }
+
+ /*
+ * If there are no non-local relations, then we're done. Release the
+ * memory and return.
+ */
+ if (n == 0)
+ {
+ pfree(rels);
+ return;
+ }
+
+ /*
+	 * This array is used to remember the number of blocks for each fork of
+	 * every relation.
+ */
+ block = (BlockNumber (*)[MAX_FORKNUM + 1])
+ palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
+
+ /*
+ * We can avoid scanning the entire buffer pool if we know the exact size
+ * of each of the given relation forks. See DropRelFileNodeBuffers.
+ */
+ for (i = 0; i < n && cached; i++)
+ {
+ for (j = 0; j <= MAX_FORKNUM; j++)
+ {
+ /* Get the number of blocks for a relation's fork. */
+ block[i][j] = smgrnblocks_cached(rels[i], j);
+
+			/* We need to consider only the relation forks that exist. */
+ if (block[i][j] == InvalidBlockNumber)
+ {
+ if (!smgrexists(rels[i], j))
+ continue;
+ cached = false;
+ break;
+ }
+
+ /* calculate the total number of blocks to be invalidated */
+ nBlocksToInvalidate += block[i][j];
+ }
+ }
+
+ /*
+ * We apply the optimization iff the total number of blocks to invalidate
+ * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
+ */
+ if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
+ {
+ for (i = 0; i < n; i++)
+ {
+ for (j = 0; j <= MAX_FORKNUM; j++)
+ {
+				/* ignore relation forks that don't exist */
+ if (!BlockNumberIsValid(block[i][j]))
+ continue;
+
+ /* drop all the buffers for a particular relation fork */
+ FindAndDropRelFileNodeBuffers(rels[i]->smgr_rnode.node,
+ j, block[i][j], 0);
+ }
+ }
+
+ pfree(block);
+ pfree(rels);
+ return;
+ }
+
+ pfree(block);
+ nodes = palloc(sizeof(RelFileNode) * n); /* non-local relations */
+ for (i = 0; i < n; i++)
+ nodes[i] = rels[i]->smgr_rnode.node;
+
+ /*
+	 * For a low number of relations to drop, just use a simple walk-through
+	 * to save the bsearch overhead.  The threshold is more of a guess than an
+	 * exactly determined value, as it depends on many factors (CPU and RAM
+	 * speeds, amount of shared buffers, etc.).
+ */
+ use_bsearch = n > RELS_BSEARCH_THRESHOLD;
+
+ /* sort the list of rnodes if necessary */
+ if (use_bsearch)
+ pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ RelFileNode *rnode = NULL;
+ BufferDesc *bufHdr = GetBufferDescriptor(i);
+ uint32 buf_state;
+
+ /*
+ * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
+ * and saves some cycles.
+ */
+
+ if (!use_bsearch)
+ {
+ int j;
+
+ for (j = 0; j < n; j++)
+ {
+ if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
+ {
+ rnode = &nodes[j];
+ break;
+ }
+ }
+ }
+ else
+ {
+ rnode = bsearch((const void *) &(bufHdr->tag.rnode),
+ nodes, n, sizeof(RelFileNode),
+ rnode_comparator);
+ }
+
+ /* buffer doesn't belong to any of the given relfilenodes; skip it */
+ if (rnode == NULL)
+ continue;
+
+ buf_state = LockBufHdr(bufHdr);
+ if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
+ InvalidateBuffer(bufHdr); /* releases spinlock */
+ else
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+
+ pfree(nodes);
+ pfree(rels);
+}
+
+/* ---------------------------------------------------------------------
+ * FindAndDropRelFileNodeBuffers
+ *
+ *		This function performs a lookup in the BufMapping table and removes
+ *		from the buffer pool all the pages of the specified relation fork that
+ *		have block numbers >= firstDelBlock.  (In particular, with
+ *		firstDelBlock = 0, all pages are removed.)
+ * --------------------------------------------------------------------
+ */
+static void
+FindAndDropRelFileNodeBuffers(RelFileNode rnode, ForkNumber forkNum,
+ BlockNumber nForkBlock,
+ BlockNumber firstDelBlock)
+{
+ BlockNumber curBlock;
+
+ for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
+ {
+ uint32 bufHash; /* hash value for tag */
+ BufferTag bufTag; /* identity of requested block */
+ LWLock *bufPartitionLock; /* buffer partition lock for it */
+ int buf_id;
+ BufferDesc *bufHdr;
+ uint32 buf_state;
+
+ /* create a tag so we can lookup the buffer */
+ INIT_BUFFERTAG(bufTag, rnode, forkNum, curBlock);
+
+ /* determine its hash code and partition lock ID */
+ bufHash = BufTableHashCode(&bufTag);
+ bufPartitionLock = BufMappingPartitionLock(bufHash);
+
+ /* Check that it is in the buffer pool. If not, do nothing. */
+ LWLockAcquire(bufPartitionLock, LW_SHARED);
+ buf_id = BufTableLookup(&bufTag, bufHash);
+ LWLockRelease(bufPartitionLock);
+
+ if (buf_id < 0)
+ continue;
+
+ bufHdr = GetBufferDescriptor(buf_id);
+
+ /*
+ * We need to lock the buffer header and recheck if the buffer is
+ * still associated with the same block because the buffer could be
+ * evicted by some other backend loading blocks for a different
+		 * relation after we release the lock on the BufMapping table.
+ */
+ buf_state = LockBufHdr(bufHdr);
+
+ if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+ bufHdr->tag.forkNum == forkNum &&
+ bufHdr->tag.blockNum >= firstDelBlock)
+ InvalidateBuffer(bufHdr); /* releases spinlock */
+ else
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+}
+
+/* ---------------------------------------------------------------------
+ * DropDatabaseBuffers
+ *
+ * This function removes all the buffers in the buffer cache for a
+ * particular database. Dirty pages are simply dropped, without
+ * bothering to write them out first. This is used when we destroy a
+ * database, to avoid trying to flush data to disk when the directory
+ * tree no longer exists. Implementation is pretty similar to
+ * DropRelFileNodeBuffers() which is for destroying just one relation.
+ * --------------------------------------------------------------------
+ */
+void
+DropDatabaseBuffers(Oid dbid)
+{
+ int i;
+
+ /*
+ * We needn't consider local buffers, since by assumption the target
+ * database isn't our own.
+ */
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ BufferDesc *bufHdr = GetBufferDescriptor(i);
+ uint32 buf_state;
+
+ /*
+ * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
+ * and saves some cycles.
+ */
+ if (bufHdr->tag.rnode.dbNode != dbid)
+ continue;
+
+ buf_state = LockBufHdr(bufHdr);
+ if (bufHdr->tag.rnode.dbNode == dbid)
+ InvalidateBuffer(bufHdr); /* releases spinlock */
+ else
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+}
+
+/* -----------------------------------------------------------------
+ * PrintBufferDescs
+ *
+ * this function prints all the buffer descriptors, for debugging
+ * use only.
+ * -----------------------------------------------------------------
+ */
+#ifdef NOT_USED
+void
+PrintBufferDescs(void)
+{
+ int i;
+
+ for (i = 0; i < NBuffers; ++i)
+ {
+ BufferDesc *buf = GetBufferDescriptor(i);
+		Buffer		b = BufferDescriptorGetBuffer(buf);
+		uint32		buf_state = pg_atomic_read_u32(&buf->state);
+
+		/* theoretically we should lock the bufhdr here */
+		elog(LOG,
+			 "[%02d] (freeNext=%d, rel=%s, "
+			 "blockNum=%u, flags=0x%x, refcount=%u %d)",
+			 i, buf->freeNext,
+			 relpathbackend(buf->tag.rnode, InvalidBackendId, buf->tag.forkNum),
+			 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
+			 BUF_STATE_GET_REFCOUNT(buf_state), GetPrivateRefCount(b));
+ }
+}
+#endif
+
+#ifdef NOT_USED
+void
+PrintPinnedBufs(void)
+{
+ int i;
+
+ for (i = 0; i < NBuffers; ++i)
+ {
+ BufferDesc *buf = GetBufferDescriptor(i);
+		Buffer		b = BufferDescriptorGetBuffer(buf);
+		uint32		buf_state = pg_atomic_read_u32(&buf->state);
+
+		if (GetPrivateRefCount(b) > 0)
+		{
+			/* theoretically we should lock the bufhdr here */
+			elog(LOG,
+				 "[%02d] (freeNext=%d, rel=%s, "
+				 "blockNum=%u, flags=0x%x, refcount=%u %d)",
+				 i, buf->freeNext,
+				 relpathperm(buf->tag.rnode, buf->tag.forkNum),
+				 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
+				 BUF_STATE_GET_REFCOUNT(buf_state), GetPrivateRefCount(b));
+ }
+ }
+}
+#endif
+
+/* ---------------------------------------------------------------------
+ * FlushRelationBuffers
+ *
+ * This function writes all dirty pages of a relation out to disk
+ * (or more accurately, out to kernel disk buffers), ensuring that the
+ * kernel has an up-to-date view of the relation.
+ *
+ * Generally, the caller should be holding AccessExclusiveLock on the
+ * target relation to ensure that no other backend is busy dirtying
+ * more blocks of the relation; the effects can't be expected to last
+ * after the lock is released.
+ *
+ *		XXX currently it sequentially searches the buffer pool; this should be
+ *		changed to a more clever way of searching.  This routine is not
+ * used in any performance-critical code paths, so it's not worth
+ * adding additional overhead to normal paths to make it go faster.
+ * --------------------------------------------------------------------
+ */
+void
+FlushRelationBuffers(Relation rel)
+{
+ int i;
+ BufferDesc *bufHdr;
+
+ /* Open rel at the smgr level if not already done */
+ RelationOpenSmgr(rel);
+
+ if (RelationUsesLocalBuffers(rel))
+ {
+ for (i = 0; i < NLocBuffer; i++)
+ {
+ uint32 buf_state;
+
+ bufHdr = GetLocalBufferDescriptor(i);
+ if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
+ ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
+ (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
+ {
+ ErrorContextCallback errcallback;
+ Page localpage;
+
+ localpage = (char *) LocalBufHdrGetBlock(bufHdr);
+
+ /* Setup error traceback support for ereport() */
+ errcallback.callback = local_buffer_write_error_callback;
+ errcallback.arg = (void *) bufHdr;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
+
+ smgrwrite(rel->rd_smgr,
+ bufHdr->tag.forkNum,
+ bufHdr->tag.blockNum,
+ localpage,
+ false);
+
+ buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+ }
+ }
+
+ return;
+ }
+
+ /* Make sure we can handle the pin inside the loop */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ uint32 buf_state;
+
+ bufHdr = GetBufferDescriptor(i);
+
+ /*
+ * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
+ * and saves some cycles.
+ */
+ if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
+ continue;
+
+ ReservePrivateRefCountEntry();
+
+ buf_state = LockBufHdr(bufHdr);
+ if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
+ (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
+ {
+ PinBuffer_Locked(bufHdr);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+ FlushBuffer(bufHdr, rel->rd_smgr);
+ LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+ UnpinBuffer(bufHdr, true);
+ }
+ else
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+}
+
+/* ---------------------------------------------------------------------
+ * FlushRelationsAllBuffers
+ *
+ * This function flushes out of the buffer pool all the pages of all
+ * forks of the specified smgr relations. It's equivalent to calling
+ * FlushRelationBuffers once per fork per relation. The relations are
+ * assumed not to use local buffers.
+ * --------------------------------------------------------------------
+ */
+void
+FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
+{
+ int i;
+ SMgrSortArray *srels;
+ bool use_bsearch;
+
+ if (nrels == 0)
+ return;
+
+ /* fill-in array for qsort */
+ srels = palloc(sizeof(SMgrSortArray) * nrels);
+
+ for (i = 0; i < nrels; i++)
+ {
+ Assert(!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode));
+
+ srels[i].rnode = smgrs[i]->smgr_rnode.node;
+ srels[i].srel = smgrs[i];
+ }
+
+ /*
+	 * Save the bsearch overhead for a low number of relations to sync. See
+ * DropRelFileNodesAllBuffers for details.
+ */
+ use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
+
+ /* sort the list of SMgrRelations if necessary */
+ if (use_bsearch)
+ pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator);
+
+ /* Make sure we can handle the pin inside the loop */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ SMgrSortArray *srelent = NULL;
+ BufferDesc *bufHdr = GetBufferDescriptor(i);
+ uint32 buf_state;
+
+ /*
+ * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
+ * and saves some cycles.
+ */
+
+ if (!use_bsearch)
+ {
+ int j;
+
+ for (j = 0; j < nrels; j++)
+ {
+ if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode))
+ {
+ srelent = &srels[j];
+ break;
+ }
+ }
+
+ }
+ else
+ {
+ srelent = bsearch((const void *) &(bufHdr->tag.rnode),
+ srels, nrels, sizeof(SMgrSortArray),
+ rnode_comparator);
+ }
+
+ /* buffer doesn't belong to any of the given relfilenodes; skip it */
+ if (srelent == NULL)
+ continue;
+
+ ReservePrivateRefCountEntry();
+
+ buf_state = LockBufHdr(bufHdr);
+ if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) &&
+ (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
+ {
+ PinBuffer_Locked(bufHdr);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+ FlushBuffer(bufHdr, srelent->srel);
+ LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+ UnpinBuffer(bufHdr, true);
+ }
+ else
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+
+ pfree(srels);
+}
+
+/* ---------------------------------------------------------------------
+ * FlushDatabaseBuffers
+ *
+ * This function writes all dirty pages of a database out to disk
+ * (or more accurately, out to kernel disk buffers), ensuring that the
+ * kernel has an up-to-date view of the database.
+ *
+ * Generally, the caller should be holding an appropriate lock to ensure
+ * no other backend is active in the target database; otherwise more
+ * pages could get dirtied.
+ *
+ * Note we don't worry about flushing any pages of temporary relations.
+ * It's assumed these wouldn't be interesting.
+ * --------------------------------------------------------------------
+ */
+void
+FlushDatabaseBuffers(Oid dbid)
+{
+ int i;
+ BufferDesc *bufHdr;
+
+ /* Make sure we can handle the pin inside the loop */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ uint32 buf_state;
+
+ bufHdr = GetBufferDescriptor(i);
+
+ /*
+ * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
+ * and saves some cycles.
+ */
+ if (bufHdr->tag.rnode.dbNode != dbid)
+ continue;
+
+ ReservePrivateRefCountEntry();
+
+ buf_state = LockBufHdr(bufHdr);
+ if (bufHdr->tag.rnode.dbNode == dbid &&
+ (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
+ {
+ PinBuffer_Locked(bufHdr);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+ FlushBuffer(bufHdr, NULL);
+ LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+ UnpinBuffer(bufHdr, true);
+ }
+ else
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+}
+
+/*
+ * Flush a buffer that the caller has previously pinned and locked (in either
+ * shared or exclusive mode) out to the OS.
+ */
+void
+FlushOneBuffer(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+
+ /* currently not needed, but no fundamental reason not to support */
+ Assert(!BufferIsLocal(buffer));
+
+ Assert(BufferIsPinned(buffer));
+
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
+
+ FlushBuffer(bufHdr, NULL);
+}
+
+/*
+ * ReleaseBuffer -- release the pin on a buffer
+ */
+void
+ReleaseBuffer(Buffer buffer)
+{
+ if (!BufferIsValid(buffer))
+ elog(ERROR, "bad buffer ID: %d", buffer);
+
+ if (BufferIsLocal(buffer))
+ {
+ ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
+
+ Assert(LocalRefCount[-buffer - 1] > 0);
+ LocalRefCount[-buffer - 1]--;
+ return;
+ }
+
+ UnpinBuffer(GetBufferDescriptor(buffer - 1), true);
+}
+
+/*
+ * UnlockReleaseBuffer -- release the content lock and pin on a buffer
+ *
+ * This is just a shorthand for a common combination.
+ */
+void
+UnlockReleaseBuffer(Buffer buffer)
+{
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ ReleaseBuffer(buffer);
+}
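+
+/*
+ * A typical read-only caller pairs these routines roughly as follows (an
+ * illustrative sketch only, using ReadBuffer() from earlier in this file):
+ *
+ *		buf = ReadBuffer(relation, blocknum);
+ *		LockBuffer(buf, BUFFER_LOCK_SHARE);
+ *		... inspect the page via BufferGetPage(buf) ...
+ *		UnlockReleaseBuffer(buf);
+ */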
+
+/*
+ * IncrBufferRefCount
+ * Increment the pin count on a buffer that we have *already* pinned
+ * at least once.
+ *
+ * This function cannot be used on a buffer we do not have pinned,
+ * because it doesn't change the shared buffer state.
+ */
+void
+IncrBufferRefCount(Buffer buffer)
+{
+ Assert(BufferIsPinned(buffer));
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+ if (BufferIsLocal(buffer))
+ LocalRefCount[-buffer - 1]++;
+ else
+ {
+ PrivateRefCountEntry *ref;
+
+ ref = GetPrivateRefCountEntry(buffer, true);
+ Assert(ref != NULL);
+ ref->refcount++;
+ }
+ ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
+}
+
+/*
+ * MarkBufferDirtyHint
+ *
+ * Mark a buffer dirty for non-critical changes.
+ *
+ * This is essentially the same as MarkBufferDirty, except:
+ *
+ * 1. The caller does not write WAL; so if checksums are enabled, we may need
+ * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
+ * 2. The caller might have only share-lock instead of exclusive-lock on the
+ * buffer's content lock.
+ * 3. This function does not guarantee that the buffer is always marked dirty
+ * (due to a race condition), so it cannot be used for important changes.
+ */
+void
+MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
+{
+ BufferDesc *bufHdr;
+ Page page = BufferGetPage(buffer);
+
+ if (!BufferIsValid(buffer))
+ elog(ERROR, "bad buffer ID: %d", buffer);
+
+ if (BufferIsLocal(buffer))
+ {
+ MarkLocalBufferDirty(buffer);
+ return;
+ }
+
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ Assert(GetPrivateRefCount(buffer) > 0);
+ /* here, either share or exclusive lock is OK */
+ Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
+
+ /*
+ * This routine might get called many times on the same page, if we are
+ * making the first scan after commit of an xact that added/deleted many
+ * tuples. So, be as quick as we can if the buffer is already dirty. We
+ * do this by not acquiring spinlock if it looks like the status bits are
+ * already set. Since we make this test unlocked, there's a chance we
+ * might fail to notice that the flags have just been cleared, and failed
+ * to reset them, due to memory-ordering issues. But since this function
+ * is only intended to be used in cases where failing to write out the
+ * data would be harmless anyway, it doesn't really matter.
+ */
+ if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
+ (BM_DIRTY | BM_JUST_DIRTIED))
+ {
+ XLogRecPtr lsn = InvalidXLogRecPtr;
+ bool dirtied = false;
+ bool delayChkpt = false;
+ uint32 buf_state;
+
+ /*
+ * If we need to protect hint bit updates from torn writes, WAL-log a
+ * full page image of the page. This full page image is only necessary
+ * if the hint bit update is the first change to the page since the
+ * last checkpoint.
+ *
+ * We don't check full_page_writes here because that logic is included
+ * when we call XLogInsert() since the value changes dynamically.
+ */
+ if (XLogHintBitIsNeeded() &&
+ (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
+ {
+ /*
+ * If we must not write WAL, due to a relfilenode-specific
+			 * condition or because we are in recovery, don't dirty the page.
+			 * We can still set the hint bit; we just don't dirty the page as
+			 * a result, so the hint is lost when we evict the page or shut
+			 * down.
+ *
+ * See src/backend/storage/page/README for longer discussion.
+ */
+ if (RecoveryInProgress() ||
+ RelFileNodeSkippingWAL(bufHdr->tag.rnode))
+ return;
+
+ /*
+ * If the block is already dirty because we either made a change
+ * or set a hint already, then we don't need to write a full page
+ * image. Note that aggressive cleaning of blocks dirtied by hint
+ * bit setting would increase the call rate. Bulk setting of hint
+ * bits would reduce the call rate...
+ *
+ * We must issue the WAL record before we mark the buffer dirty.
+ * Otherwise we might write the page before we write the WAL. That
+ * causes a race condition, since a checkpoint might occur between
+ * writing the WAL record and marking the buffer dirty. We solve
+ * that with a kluge, but one that is already in use during
+ * transaction commit to prevent race conditions. Basically, we
+ * simply prevent the checkpoint WAL record from being written
+ * until we have marked the buffer dirty. We don't start the
+			 * checkpoint flush until we have marked dirty, so our checkpoint
+			 * must flush the change to disk successfully, or else the
+			 * checkpoint never gets written and crash recovery will fix
+			 * things.
+ *
+ * It's possible we may enter here without an xid, so it is
+ * essential that CreateCheckpoint waits for virtual transactions
+ * rather than full transactionids.
+ */
+ Assert(!MyProc->delayChkpt);
+ MyProc->delayChkpt = true;
+ delayChkpt = true;
+ lsn = XLogSaveBufferForHint(buffer, buffer_std);
+ }
+
+ buf_state = LockBufHdr(bufHdr);
+
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+
+ if (!(buf_state & BM_DIRTY))
+ {
+ dirtied = true; /* Means "will be dirtied by this action" */
+
+ /*
+ * Set the page LSN if we wrote a backup block. We aren't supposed
+ * to set this when only holding a share lock but as long as we
+ * serialise it somehow we're OK. We choose to set LSN while
+ * holding the buffer header lock, which causes any reader of an
+ * LSN who holds only a share lock to also obtain a buffer header
+ * lock before using PageGetLSN(), which is enforced in
+ * BufferGetLSNAtomic().
+ *
+ * If checksums are enabled, you might think we should reset the
+ * checksum here. That will happen when the page is written
+ * sometime later in this checkpoint cycle.
+ */
+ if (!XLogRecPtrIsInvalid(lsn))
+ PageSetLSN(page, lsn);
+ }
+
+ buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
+ UnlockBufHdr(bufHdr, buf_state);
+
+ if (delayChkpt)
+ MyProc->delayChkpt = false;
+
+ if (dirtied)
+ {
+ VacuumPageDirty++;
+ pgBufferUsage.shared_blks_dirtied++;
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageDirty;
+ }
+ }
+}
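+
+/*
+ * Callers of MarkBufferDirtyHint() -- for example, the hint-bit-setting code
+ * used by tuple visibility checks -- accept that the hint may occasionally
+ * be lost; as described above, the function deliberately declines to dirty
+ * the page during recovery or when WAL must be skipped for the relfilenode.
+ */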
+
+/*
+ * Release buffer content locks for shared buffers.
+ *
+ * Used to clean up after errors.
+ *
+ * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
+ * of releasing buffer content locks per se; the only thing we need to deal
+ * with here is clearing any PIN_COUNT request that was in progress.
+ */
+void
+UnlockBuffers(void)
+{
+ BufferDesc *buf = PinCountWaitBuf;
+
+ if (buf)
+ {
+ uint32 buf_state;
+
+ buf_state = LockBufHdr(buf);
+
+ /*
+		 * Don't complain if the flag bit is not set; it could have been reset
+		 * but we got a cancel/die interrupt before getting the signal.
+ */
+ if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
+ buf->wait_backend_pid == MyProcPid)
+ buf_state &= ~BM_PIN_COUNT_WAITER;
+
+ UnlockBufHdr(buf, buf_state);
+
+ PinCountWaitBuf = NULL;
+ }
+}
+
+/*
+ * Acquire or release the content_lock for the buffer.
+ */
+void
+LockBuffer(Buffer buffer, int mode)
+{
+ BufferDesc *buf;
+
+ Assert(BufferIsPinned(buffer));
+ if (BufferIsLocal(buffer))
+ return; /* local buffers need no lock */
+
+ buf = GetBufferDescriptor(buffer - 1);
+
+ if (mode == BUFFER_LOCK_UNLOCK)
+ LWLockRelease(BufferDescriptorGetContentLock(buf));
+ else if (mode == BUFFER_LOCK_SHARE)
+ LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
+ else if (mode == BUFFER_LOCK_EXCLUSIVE)
+ LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
+ else
+ elog(ERROR, "unrecognized buffer lock mode: %d", mode);
+}
+
+/*
+ * Acquire the content_lock for the buffer, but only if we don't have to wait.
+ *
+ * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
+ */
+bool
+ConditionalLockBuffer(Buffer buffer)
+{
+ BufferDesc *buf;
+
+ Assert(BufferIsPinned(buffer));
+ if (BufferIsLocal(buffer))
+ return true; /* act as though we got it */
+
+ buf = GetBufferDescriptor(buffer - 1);
+
+ return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+ LW_EXCLUSIVE);
+}
+
+/*
+ * LockBufferForCleanup - lock a buffer in preparation for deleting items
+ *
+ * Items may be deleted from a disk page only when the caller (a) holds an
+ * exclusive lock on the buffer and (b) has observed that no other backend
+ * holds a pin on the buffer. If there is a pin, then the other backend
+ * might have a pointer into the buffer (for example, a heapscan reference
+ * to an item --- see README for more details). It's OK if a pin is added
+ * after the cleanup starts, however; the newly-arrived backend will be
+ * unable to look at the page until we release the exclusive lock.
+ *
+ * To implement this protocol, a would-be deleter must pin the buffer and
+ * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
+ * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
+ * it has successfully observed pin count = 1.
+ */
+void
+LockBufferForCleanup(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+ char *new_status = NULL;
+ TimestampTz waitStart = 0;
+ bool logged_recovery_conflict = false;
+
+ Assert(BufferIsPinned(buffer));
+ Assert(PinCountWaitBuf == NULL);
+
+ if (BufferIsLocal(buffer))
+ {
+ /* There should be exactly one pin */
+ if (LocalRefCount[-buffer - 1] != 1)
+ elog(ERROR, "incorrect local pin count: %d",
+ LocalRefCount[-buffer - 1]);
+ /* Nobody else to wait for */
+ return;
+ }
+
+ /* There should be exactly one local pin */
+ if (GetPrivateRefCount(buffer) != 1)
+ elog(ERROR, "incorrect local pin count: %d",
+ GetPrivateRefCount(buffer));
+
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ for (;;)
+ {
+ uint32 buf_state;
+
+ /* Try to acquire lock */
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ buf_state = LockBufHdr(bufHdr);
+
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+ if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
+ {
+ /* Successfully acquired exclusive lock with pincount 1 */
+ UnlockBufHdr(bufHdr, buf_state);
+
+ /*
+ * Emit the log message if recovery conflict on buffer pin was
+ * resolved but the startup process waited longer than
+ * deadlock_timeout for it.
+ */
+ if (logged_recovery_conflict)
+ LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
+ waitStart, GetCurrentTimestamp(),
+ NULL, false);
+
+ /* Report change to non-waiting status */
+ if (new_status)
+ {
+ set_ps_display(new_status);
+ pfree(new_status);
+ }
+ return;
+ }
+ /* Failed, so mark myself as waiting for pincount 1 */
+ if (buf_state & BM_PIN_COUNT_WAITER)
+ {
+ UnlockBufHdr(bufHdr, buf_state);
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ elog(ERROR, "multiple backends attempting to wait for pincount 1");
+ }
+ bufHdr->wait_backend_pid = MyProcPid;
+ PinCountWaitBuf = bufHdr;
+ buf_state |= BM_PIN_COUNT_WAITER;
+ UnlockBufHdr(bufHdr, buf_state);
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+
+ /* Wait to be signaled by UnpinBuffer() */
+ if (InHotStandby)
+ {
+ /* Report change to waiting status */
+ if (update_process_title && new_status == NULL)
+ {
+ const char *old_status;
+ int len;
+
+ old_status = get_ps_display(&len);
+ new_status = (char *) palloc(len + 8 + 1);
+ memcpy(new_status, old_status, len);
+ strcpy(new_status + len, " waiting");
+ set_ps_display(new_status);
+ new_status[len] = '\0'; /* truncate off " waiting" */
+ }
+
+ /*
+ * Emit the log message if the startup process is waiting longer
+ * than deadlock_timeout for recovery conflict on buffer pin.
+ *
+			 * Skip this the first time through, because the startup process
+			 * has not started waiting yet in that case. So, the wait start
+ * timestamp is set after this logic.
+ */
+ if (waitStart != 0 && !logged_recovery_conflict)
+ {
+ TimestampTz now = GetCurrentTimestamp();
+
+ if (TimestampDifferenceExceeds(waitStart, now,
+ DeadlockTimeout))
+ {
+ LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
+ waitStart, now, NULL, true);
+ logged_recovery_conflict = true;
+ }
+ }
+
+ /*
+ * Set the wait start timestamp if logging is enabled and first
+ * time through.
+ */
+ if (log_recovery_conflict_waits && waitStart == 0)
+ waitStart = GetCurrentTimestamp();
+
+ /* Publish the bufid that Startup process waits on */
+ SetStartupBufferPinWaitBufId(buffer - 1);
+ /* Set alarm and then wait to be signaled by UnpinBuffer() */
+ ResolveRecoveryConflictWithBufferPin();
+ /* Reset the published bufid */
+ SetStartupBufferPinWaitBufId(-1);
+ }
+ else
+ ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
+
+ /*
+ * Remove flag marking us as waiter. Normally this will not be set
+ * anymore, but ProcWaitForSignal() can return for other signals as
+ * well. We take care to only reset the flag if we're the waiter, as
+ * theoretically another backend could have started waiting. That's
+ * impossible with the current usages due to table level locking, but
+ * better be safe.
+ */
+ buf_state = LockBufHdr(bufHdr);
+ if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
+ bufHdr->wait_backend_pid == MyProcPid)
+ buf_state &= ~BM_PIN_COUNT_WAITER;
+ UnlockBufHdr(bufHdr, buf_state);
+
+ PinCountWaitBuf = NULL;
+ /* Loop back and try again */
+ }
+}
+
+/*
+ * Check called from RecoveryConflictInterrupt handler when Startup
+ * process requests cancellation of all pin holders that are blocking it.
+ */
+bool
+HoldingBufferPinThatDelaysRecovery(void)
+{
+ int bufid = GetStartupBufferPinWaitBufId();
+
+ /*
+	 * already woken by other backends before we got here. It is also possible
+	 * that we get here via multiple interrupts or interrupts at inappropriate
+	 * times, so make sure we do nothing if the bufid is not set.
+ * times, so make sure we do nothing if the bufid is not set.
+ */
+ if (bufid < 0)
+ return false;
+
+ if (GetPrivateRefCount(bufid + 1) > 0)
+ return true;
+
+ return false;
+}
+
+/*
+ * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
+ *
+ * We won't loop, but just check once to see if the pin count is OK. If
+ * not, return false with no lock held.
+ */
+bool
+ConditionalLockBufferForCleanup(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+ uint32 buf_state,
+ refcount;
+
+ Assert(BufferIsValid(buffer));
+
+ if (BufferIsLocal(buffer))
+ {
+ refcount = LocalRefCount[-buffer - 1];
+ /* There should be exactly one pin */
+ Assert(refcount > 0);
+ if (refcount != 1)
+ return false;
+ /* Nobody else to wait for */
+ return true;
+ }
+
+ /* There should be exactly one local pin */
+ refcount = GetPrivateRefCount(buffer);
+ Assert(refcount);
+ if (refcount != 1)
+ return false;
+
+ /* Try to acquire lock */
+ if (!ConditionalLockBuffer(buffer))
+ return false;
+
+ bufHdr = GetBufferDescriptor(buffer - 1);
+ buf_state = LockBufHdr(bufHdr);
+ refcount = BUF_STATE_GET_REFCOUNT(buf_state);
+
+ Assert(refcount > 0);
+ if (refcount == 1)
+ {
+ /* Successfully acquired exclusive lock with pincount 1 */
+ UnlockBufHdr(bufHdr, buf_state);
+ return true;
+ }
+
+ /* Failed, so release the lock */
+ UnlockBufHdr(bufHdr, buf_state);
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ return false;
+}
+
+/*
+ * IsBufferCleanupOK - as above, but we already have the lock
+ *
+ * Check whether it's OK to perform cleanup on a buffer we've already
+ * locked. If we observe that the pin count is 1, our exclusive lock
+ * happens to be a cleanup lock, and we can proceed with anything that
+ * would have been allowable had we sought a cleanup lock originally.
+ */
+bool
+IsBufferCleanupOK(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+ uint32 buf_state;
+
+ Assert(BufferIsValid(buffer));
+
+ if (BufferIsLocal(buffer))
+ {
+ /* There should be exactly one pin */
+ if (LocalRefCount[-buffer - 1] != 1)
+ return false;
+ /* Nobody else to wait for */
+ return true;
+ }
+
+ /* There should be exactly one local pin */
+ if (GetPrivateRefCount(buffer) != 1)
+ return false;
+
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ /* caller must hold exclusive lock on buffer */
+ Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
+ LW_EXCLUSIVE));
+
+ buf_state = LockBufHdr(bufHdr);
+
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+ if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
+ {
+ /* pincount is OK. */
+ UnlockBufHdr(bufHdr, buf_state);
+ return true;
+ }
+
+ UnlockBufHdr(bufHdr, buf_state);
+ return false;
+}
+
+
+/*
+ * Functions for buffer I/O handling
+ *
+ * Note: We assume that nested buffer I/O never occurs.
+ * i.e., at most one BM_IO_IN_PROGRESS bit is set per proc.
+ *
+ * Also note that these are used only for shared buffers, not local ones.
+ */
+
+/*
+ * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
+ */
+static void
+WaitIO(BufferDesc *buf)
+{
+ ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
+
+ ConditionVariablePrepareToSleep(cv);
+ for (;;)
+ {
+ uint32 buf_state;
+
+ /*
+ * It may not be necessary to acquire the spinlock to check the flag
+ * here, but since this test is essential for correctness, we'd better
+ * play it safe.
+ */
+ buf_state = LockBufHdr(buf);
+ UnlockBufHdr(buf, buf_state);
+
+ if (!(buf_state & BM_IO_IN_PROGRESS))
+ break;
+ ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
+ }
+ ConditionVariableCancelSleep();
+}
+
+/*
+ * StartBufferIO: begin I/O on this buffer
+ * (Assumptions)
+ * My process is executing no IO
+ * The buffer is Pinned
+ *
+ * In some scenarios there are race conditions in which multiple backends
+ * could attempt the same I/O operation concurrently. If someone else
+ * has already started I/O on this buffer then we will block on the
+ * I/O condition variable until it's done.
+ *
+ * Input operations are only attempted on buffers that are not BM_VALID,
+ * and output operations only on buffers that are BM_VALID and BM_DIRTY,
+ * so we can always tell if the work is already done.
+ *
+ * Returns true if we successfully marked the buffer as I/O busy,
+ * false if someone else already did the work.
+ */
+static bool
+StartBufferIO(BufferDesc *buf, bool forInput)
+{
+ uint32 buf_state;
+
+ Assert(!InProgressBuf);
+
+ for (;;)
+ {
+ buf_state = LockBufHdr(buf);
+
+ if (!(buf_state & BM_IO_IN_PROGRESS))
+ break;
+ UnlockBufHdr(buf, buf_state);
+ WaitIO(buf);
+ }
+
+ /* Once we get here, there is definitely no I/O active on this buffer */
+
+ if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
+ {
+ /* someone else already did the I/O */
+ UnlockBufHdr(buf, buf_state);
+ return false;
+ }
+
+ buf_state |= BM_IO_IN_PROGRESS;
+ UnlockBufHdr(buf, buf_state);
+
+ InProgressBuf = buf;
+ IsForInput = forInput;
+
+ return true;
+}
+
+/*
+ * TerminateBufferIO: release a buffer we were doing I/O on
+ * (Assumptions)
+ * My process is executing IO for the buffer
+ * BM_IO_IN_PROGRESS bit is set for the buffer
+ * The buffer is Pinned
+ *
+ * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
+ * buffer's BM_DIRTY flag. This is appropriate when terminating a
+ * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
+ * marking the buffer clean if it was re-dirtied while we were writing.
+ *
+ * set_flag_bits gets ORed into the buffer's flags. It must include
+ * BM_IO_ERROR in a failure case. For successful completion it could
+ * be 0, or BM_VALID if we just finished reading in the page.
+ */
+static void
+TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
+{
+ uint32 buf_state;
+
+ Assert(buf == InProgressBuf);
+
+ buf_state = LockBufHdr(buf);
+
+ Assert(buf_state & BM_IO_IN_PROGRESS);
+
+ buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
+ if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
+ buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
+
+ buf_state |= set_flag_bits;
+ UnlockBufHdr(buf, buf_state);
+
+ InProgressBuf = NULL;
+
+ ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
+}
+
+/*
+ * AbortBufferIO: Clean up any active buffer I/O after an error.
+ *
+ * All LWLocks we might have held have been released,
+ * but we haven't yet released buffer pins, so the buffer is still pinned.
+ *
+ * If I/O was in progress, we always set BM_IO_ERROR, even though it's
+ * possible the error condition wasn't related to the I/O.
+ */
+void
+AbortBufferIO(void)
+{
+ BufferDesc *buf = InProgressBuf;
+
+ if (buf)
+ {
+ uint32 buf_state;
+
+ buf_state = LockBufHdr(buf);
+ Assert(buf_state & BM_IO_IN_PROGRESS);
+ if (IsForInput)
+ {
+ Assert(!(buf_state & BM_DIRTY));
+
+ /* We'd better not think buffer is valid yet */
+ Assert(!(buf_state & BM_VALID));
+ UnlockBufHdr(buf, buf_state);
+ }
+ else
+ {
+ Assert(buf_state & BM_DIRTY);
+ UnlockBufHdr(buf, buf_state);
+ /* Issue notice if this is not the first failure... */
+ if (buf_state & BM_IO_ERROR)
+ {
+ /* Buffer is pinned, so we can read tag without spinlock */
+ char *path;
+
+ path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
+ ereport(WARNING,
+ (errcode(ERRCODE_IO_ERROR),
+ errmsg("could not write block %u of %s",
+ buf->tag.blockNum, path),
+ errdetail("Multiple failures --- write error might be permanent.")));
+ pfree(path);
+ }
+ }
+ TerminateBufferIO(buf, false, BM_IO_ERROR);
+ }
+}
+
+/*
+ * Error context callback for errors occurring during shared buffer writes.
+ */
+static void
+shared_buffer_write_error_callback(void *arg)
+{
+ BufferDesc *bufHdr = (BufferDesc *) arg;
+
+ /* Buffer is pinned, so we can read the tag without locking the spinlock */
+ if (bufHdr != NULL)
+ {
+ char *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
+
+ errcontext("writing block %u of relation %s",
+ bufHdr->tag.blockNum, path);
+ pfree(path);
+ }
+}
+
+/*
+ * Error context callback for errors occurring during local buffer writes.
+ */
+static void
+local_buffer_write_error_callback(void *arg)
+{
+ BufferDesc *bufHdr = (BufferDesc *) arg;
+
+ if (bufHdr != NULL)
+ {
+ char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
+ bufHdr->tag.forkNum);
+
+ errcontext("writing block %u of relation %s",
+ bufHdr->tag.blockNum, path);
+ pfree(path);
+ }
+}
+
+/*
+ * RelFileNode qsort/bsearch comparator; see RelFileNodeEquals.
+ */
+static int
+rnode_comparator(const void *p1, const void *p2)
+{
+ RelFileNode n1 = *(const RelFileNode *) p1;
+ RelFileNode n2 = *(const RelFileNode *) p2;
+
+ if (n1.relNode < n2.relNode)
+ return -1;
+ else if (n1.relNode > n2.relNode)
+ return 1;
+
+ if (n1.dbNode < n2.dbNode)
+ return -1;
+ else if (n1.dbNode > n2.dbNode)
+ return 1;
+
+ if (n1.spcNode < n2.spcNode)
+ return -1;
+ else if (n1.spcNode > n2.spcNode)
+ return 1;
+ else
+ return 0;
+}
+
+/*
+ * Lock buffer header - set BM_LOCKED in buffer state.
+ */
+uint32
+LockBufHdr(BufferDesc *desc)
+{
+ SpinDelayStatus delayStatus;
+ uint32 old_buf_state;
+
+ init_local_spin_delay(&delayStatus);
+
+ while (true)
+ {
+ /* set BM_LOCKED flag */
+ old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
+ /* if it wasn't set before we're OK */
+ if (!(old_buf_state & BM_LOCKED))
+ break;
+ perform_spin_delay(&delayStatus);
+ }
+ finish_spin_delay(&delayStatus);
+ return old_buf_state | BM_LOCKED;
+}
+
+/*
+ * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
+ * state at that point.
+ *
+ * Obviously the buffer could be locked by the time the value is returned, so
+ * this is primarily useful in CAS style loops.
+ */
+static uint32
+WaitBufHdrUnlocked(BufferDesc *buf)
+{
+ SpinDelayStatus delayStatus;
+ uint32 buf_state;
+
+ init_local_spin_delay(&delayStatus);
+
+ buf_state = pg_atomic_read_u32(&buf->state);
+
+ while (buf_state & BM_LOCKED)
+ {
+ perform_spin_delay(&delayStatus);
+ buf_state = pg_atomic_read_u32(&buf->state);
+ }
+
+ finish_spin_delay(&delayStatus);
+
+ return buf_state;
+}
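+
+/*
+ * A caller using WaitBufHdrUnlocked() in a CAS-style loop would look roughly
+ * like this (an illustrative sketch, not code taken from this file):
+ *
+ *		old_buf_state = pg_atomic_read_u32(&buf->state);
+ *		for (;;)
+ *		{
+ *			if (old_buf_state & BM_LOCKED)
+ *				old_buf_state = WaitBufHdrUnlocked(buf);
+ *			buf_state = old_buf_state;
+ *			... adjust buf_state as needed ...
+ *			if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
+ *											   buf_state))
+ *				break;
+ *		}
+ */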
+
+/*
+ * BufferTag comparator.
+ */
+static inline int
+buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
+{
+ int ret;
+
+ ret = rnode_comparator(&ba->rnode, &bb->rnode);
+
+ if (ret != 0)
+ return ret;
+
+ if (ba->forkNum < bb->forkNum)
+ return -1;
+ if (ba->forkNum > bb->forkNum)
+ return 1;
+
+ if (ba->blockNum < bb->blockNum)
+ return -1;
+ if (ba->blockNum > bb->blockNum)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Comparator determining the writeout order in a checkpoint.
+ *
+ * It is important that tablespaces are compared first; the logic that
+ * balances writes between tablespaces relies on it.
+ */
+static inline int
+ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
+{
+ /* compare tablespace */
+ if (a->tsId < b->tsId)
+ return -1;
+ else if (a->tsId > b->tsId)
+ return 1;
+ /* compare relation */
+ if (a->relNode < b->relNode)
+ return -1;
+ else if (a->relNode > b->relNode)
+ return 1;
+ /* compare fork */
+ else if (a->forkNum < b->forkNum)
+ return -1;
+ else if (a->forkNum > b->forkNum)
+ return 1;
+ /* compare block number */
+ else if (a->blockNum < b->blockNum)
+ return -1;
+ else if (a->blockNum > b->blockNum)
+ return 1;
+ /* equal page IDs are unlikely, but not impossible */
+ return 0;
+}
+
+/*
+ * Comparator for a Min-Heap over the per-tablespace checkpoint completion
+ * progress.
+ */
+static int
+ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
+{
+ CkptTsStatus *sa = (CkptTsStatus *) a;
+ CkptTsStatus *sb = (CkptTsStatus *) b;
+
+	/* we want a min-heap, so return 1 when a < b */
+ if (sa->progress < sb->progress)
+ return 1;
+ else if (sa->progress == sb->progress)
+ return 0;
+ else
+ return -1;
+}
+
+/*
+ * Initialize a writeback context, discarding potential previous state.
+ *
+ * *max_pending is a pointer instead of an immediate value, so the coalesce
+ * limit can easily be changed by the GUC mechanism, and calling code does
+ * not have to check the current configuration. A value of 0 means that no
+ * writeback control will be performed.
+ */
+void
+WritebackContextInit(WritebackContext *context, int *max_pending)
+{
+ Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
+
+ context->max_pending = max_pending;
+ context->nr_pending = 0;
+}
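+
+/*
+ * The max_pending pointer handed to WritebackContextInit() typically refers
+ * to one of the *_flush_after GUC variables (for example
+ * checkpoint_flush_after or bgwriter_flush_after), so changing the GUC takes
+ * effect without re-initializing the context; a value of 0 disables
+ * writeback control, as noted above.
+ */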
+
+/*
+ * Add buffer to list of pending writeback requests.
+ */
+void
+ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
+{
+ PendingWriteback *pending;
+
+ /*
+ * Add buffer to the pending writeback array, unless writeback control is
+ * disabled.
+ */
+ if (*context->max_pending > 0)
+ {
+ Assert(*context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
+
+ pending = &context->pending_writebacks[context->nr_pending++];
+
+ pending->tag = *tag;
+ }
+
+ /*
+ * Perform pending flushes if the writeback limit is exceeded. This
+ * includes the case where previously an item has been added, but control
+ * is now disabled.
+ */
+ if (context->nr_pending >= *context->max_pending)
+ IssuePendingWritebacks(context);
+}
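/*
 * Illustrative sketch, not part of the upstream file, of the intended
 * calling pattern.  The real callers are the checkpointer/bgwriter/backend
 * write paths; "checkpoint_flush_after" is assumed here to be the
 * GUC-backed variable that the checkpointer passes in.
 */
static void
example_write_loop(BufferDesc **dirty_bufs, int nbufs)
{
	WritebackContext wb_context;
	int			i;

	WritebackContextInit(&wb_context, &checkpoint_flush_after);

	for (i = 0; i < nbufs; i++)
	{
		/* ... the buffer itself would be written out here ... */
		ScheduleBufferTagForWriteback(&wb_context, &dirty_bufs[i]->tag);
	}

	/* hand any remaining queued requests to the kernel */
	IssuePendingWritebacks(&wb_context);
}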
+
+#define ST_SORT sort_pending_writebacks
+#define ST_ELEMENT_TYPE PendingWriteback
+#define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
+#define ST_SCOPE static
+#define ST_DEFINE
+#include <lib/sort_template.h>
+
+/*
+ * Issue all pending writeback requests, previously scheduled with
+ * ScheduleBufferTagForWriteback, to the OS.
+ *
+ * Because this is only used to improve the OS's I/O scheduling we try to never
+ * error out - it's just a hint.
+ */
+void
+IssuePendingWritebacks(WritebackContext *context)
+{
+ int i;
+
+ if (context->nr_pending == 0)
+ return;
+
+ /*
+ * Executing the writes in-order can make them a lot faster, and allows us to
+ * merge writeback requests for consecutive blocks into larger writebacks.
+ */
+ sort_pending_writebacks(context->pending_writebacks, context->nr_pending);
+
+ /*
+ * Coalesce neighbouring writes, but nothing else. For that we iterate
+ * through the now-sorted array of pending flushes, and look ahead to
+ * find all neighbouring (or identical) writes.
+ */
+ for (i = 0; i < context->nr_pending; i++)
+ {
+ PendingWriteback *cur;
+ PendingWriteback *next;
+ SMgrRelation reln;
+ int ahead;
+ BufferTag tag;
+ Size nblocks = 1;
+
+ cur = &context->pending_writebacks[i];
+ tag = cur->tag;
+
+ /*
+ * Peek ahead, into following writeback requests, to see if they can
+ * be combined with the current one.
+ */
+ for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
+ {
+ next = &context->pending_writebacks[i + ahead + 1];
+
+ /* different file, stop */
+ if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
+ cur->tag.forkNum != next->tag.forkNum)
+ break;
+
+ /* ok, block queued twice, skip */
+ if (cur->tag.blockNum == next->tag.blockNum)
+ continue;
+
+ /* only merge consecutive writes */
+ if (cur->tag.blockNum + 1 != next->tag.blockNum)
+ break;
+
+ nblocks++;
+ cur = next;
+ }
+
+ i += ahead;
+
+ /* and finally tell the kernel to write the data to storage */
+ reln = smgropen(tag.rnode, InvalidBackendId);
+ smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
+ }
+
+ context->nr_pending = 0;
+}
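/*
 * Worked example (illustrative): if the sorted array holds blocks 10, 11,
 * 11, 12 and 40 of the same relation fork, the loop above issues two
 * smgrwriteback() calls: one starting at block 10 with nblocks = 3 (the
 * duplicate 11 is skipped without widening the range) and one for block 40
 * with nblocks = 1.
 */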
+
+
+/*
+ * Implement slower/larger portions of TestForOldSnapshot
+ *
+ * Smaller/faster portions are put inline, but the entire set of logic is too
+ * big for that.
+ */
+void
+TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
+{
+ if (RelationAllowsEarlyPruning(relation)
+ && (snapshot)->whenTaken < GetOldSnapshotThresholdTimestamp())
+ ereport(ERROR,
+ (errcode(ERRCODE_SNAPSHOT_TOO_OLD),
+ errmsg("snapshot too old")));
+}
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index 0000000..6be8047
--- /dev/null
+++ b/src/backend/storage/buffer/freelist.c
@@ -0,0 +1,704 @@
+/*-------------------------------------------------------------------------
+ *
+ * freelist.c
+ * routines for managing the buffer pool's replacement strategy.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/buffer/freelist.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "port/atomics.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+#include "storage/proc.h"
+
+#define INT_ACCESS_ONCE(var) ((int)(*((volatile int *)&(var))))
+
+
+/*
+ * The shared freelist control information.
+ */
+typedef struct
+{
+ /* Spinlock: protects the values below */
+ slock_t buffer_strategy_lock;
+
+ /*
+ * Clock sweep hand: index of next buffer to consider grabbing. Note that
+ * this isn't a concrete buffer - we only ever increase the value. So, to
+ * get an actual buffer, it needs to be used modulo NBuffers.
+ */
+ pg_atomic_uint32 nextVictimBuffer;
+
+ int firstFreeBuffer; /* Head of list of unused buffers */
+ int lastFreeBuffer; /* Tail of list of unused buffers */
+
+ /*
+ * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is,
+ * when the list is empty)
+ */
+
+ /*
+ * Statistics. These counters should be wide enough that they can't
+ * overflow during a single bgwriter cycle.
+ */
+ uint32 completePasses; /* Complete cycles of the clock sweep */
+ pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */
+
+ /*
+ * Bgworker process to be notified upon activity or -1 if none. See
+ * StrategyNotifyBgWriter.
+ */
+ int bgwprocno;
+} BufferStrategyControl;
+
+/* Pointers to shared state */
+static BufferStrategyControl *StrategyControl = NULL;
+
+/*
+ * Private (non-shared) state for managing a ring of shared buffers to re-use.
+ * This is currently the only kind of BufferAccessStrategy object, but someday
+ * we might have more kinds.
+ */
+typedef struct BufferAccessStrategyData
+{
+ /* Overall strategy type */
+ BufferAccessStrategyType btype;
+ /* Number of elements in buffers[] array */
+ int ring_size;
+
+ /*
+ * Index of the "current" slot in the ring, ie, the one most recently
+ * returned by GetBufferFromRing.
+ */
+ int current;
+
+ /*
+ * True if the buffer just returned by StrategyGetBuffer had been in the
+ * ring already.
+ */
+ bool current_was_in_ring;
+
+ /*
+ * Array of buffer numbers. InvalidBuffer (that is, zero) indicates we
+ * have not yet selected a buffer for this ring slot. For allocation
+ * simplicity this is palloc'd together with the fixed fields of the
+ * struct.
+ */
+ Buffer buffers[FLEXIBLE_ARRAY_MEMBER];
+} BufferAccessStrategyData;
+
+
+/* Prototypes for internal functions */
+static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
+ uint32 *buf_state);
+static void AddBufferToRing(BufferAccessStrategy strategy,
+ BufferDesc *buf);
+
+/*
+ * ClockSweepTick - Helper routine for StrategyGetBuffer()
+ *
+ * Move the clock hand one buffer ahead of its current position and return the
+ * id of the buffer now under the hand.
+ */
+static inline uint32
+ClockSweepTick(void)
+{
+ uint32 victim;
+
+ /*
+ * Atomically move hand ahead one buffer - if there are several processes
+ * doing this, this can lead to buffers being returned slightly out of
+ * apparent order.
+ */
+ victim =
+ pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
+
+ if (victim >= NBuffers)
+ {
+ uint32 originalVictim = victim;
+
+ /* always wrap what we look up in BufferDescriptors */
+ victim = victim % NBuffers;
+
+ /*
+ * If we're the one that just caused a wraparound, force
+ * completePasses to be incremented while holding the spinlock. We
+ * need the spinlock so StrategySyncStart() can return a consistent
+ * value consisting of nextVictimBuffer and completePasses.
+ */
+ if (victim == 0)
+ {
+ uint32 expected;
+ uint32 wrapped;
+ bool success = false;
+
+ expected = originalVictim + 1;
+
+ while (!success)
+ {
+ /*
+ * Acquire the spinlock while increasing completePasses. That
+ * allows other readers to read nextVictimBuffer and
+ * completePasses in a consistent manner which is required for
+ * StrategySyncStart(). In theory delaying the increment
+ * could lead to an overflow of nextVictimBuffer, but that's
+ * highly unlikely and wouldn't be particularly harmful.
+ */
+ SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+ wrapped = expected % NBuffers;
+
+ success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
+ &expected, wrapped);
+ if (success)
+ StrategyControl->completePasses++;
+ SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+ }
+ }
+ }
+ return victim;
+}
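/*
 * Worked example (illustrative): with NBuffers = 4 and nextVictimBuffer at
 * 3, one tick returns victim 3 and leaves the counter at 4.  The next tick
 * reads 4, which maps to buffer 4 % 4 = 0, so that caller takes the
 * wraparound path: expected = 5 (originalVictim + 1), wrapped = 5 % 4 = 1,
 * and the CAS swings the counter from 5 to 1 while completePasses is
 * incremented under the spinlock.  If some other backend ticked in the
 * meantime the CAS fails and is retried with the freshly read value.
 */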
+
+/*
+ * have_free_buffer -- a lockless check to see if there is a free buffer in
+ * buffer pool.
+ *
+ * A true result can become stale as soon as free buffers are taken off the
+ * list by other operations, so callers that strictly need a free buffer
+ * should not rely on this.
+ */
+bool
+have_free_buffer(void)
+{
+ if (StrategyControl->firstFreeBuffer >= 0)
+ return true;
+ else
+ return false;
+}
+
+/*
+ * StrategyGetBuffer
+ *
+ * Called by the bufmgr to get the next candidate buffer to use in
+ * BufferAlloc(). The only hard requirement BufferAlloc() has is that
+ * the selected buffer must not currently be pinned by anyone.
+ *
+ * strategy is a BufferAccessStrategy object, or NULL for default strategy.
+ *
+ * To ensure that no one else can pin the buffer before we do, we must
+ * return the buffer with the buffer header spinlock still held.
+ */
+BufferDesc *
+StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
+{
+ BufferDesc *buf;
+ int bgwprocno;
+ int trycounter;
+ uint32 local_buf_state; /* to avoid repeated (de-)referencing */
+
+ /*
+ * If given a strategy object, see whether it can select a buffer. We
+ * assume strategy objects don't need buffer_strategy_lock.
+ */
+ if (strategy != NULL)
+ {
+ buf = GetBufferFromRing(strategy, buf_state);
+ if (buf != NULL)
+ return buf;
+ }
+
+ /*
+ * If asked, we need to wake the bgwriter. Since we don't want to rely on
+ * a spinlock for this we force a read from shared memory once, and then
+ * set the latch based on that value. We need to go to this length because
+ * otherwise bgwprocno might be reset while/after we check, as the compiler
+ * might just reread the value from memory.
+ *
+ * This can possibly set the latch of the wrong process if the bgwriter
+ * dies at the wrong moment. But since PGPROC->procLatch is never
+ * deallocated the worst consequence of that is that we set the latch of
+ * some arbitrary process.
+ */
+ bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
+ if (bgwprocno != -1)
+ {
+ /* reset bgwprocno first, before setting the latch */
+ StrategyControl->bgwprocno = -1;
+
+ /*
+ * We don't acquire ProcArrayLock here, which is slightly icky. It's
+ * actually fine because procLatch isn't ever freed, so at worst we
+ * set the wrong process' (or no process') latch.
+ */
+ SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
+ }
+
+ /*
+ * We count buffer allocation requests so that the bgwriter can estimate
+ * the rate of buffer consumption. Note that buffers recycled by a
+ * strategy object are intentionally not counted here.
+ */
+ pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
+
+ /*
+ * First check, without acquiring the lock, whether there are buffers in the
+ * freelist. Since we otherwise don't require the spinlock in every
+ * StrategyGetBuffer() invocation, it'd be sad to acquire it here -
+ * uselessly in most cases. That obviously leaves a race where a buffer is
+ * put on the freelist but we don't see the store yet - but that's pretty
+ * harmless, it'll just get used during the next buffer acquisition.
+ *
+ * If there are buffers on the freelist, acquire the spinlock to pop one
+ * buffer off the freelist. Then check whether that buffer is usable and
+ * repeat if not.
+ *
+ * Note that the freeNext fields are considered to be protected by the
+ * buffer_strategy_lock, not the individual buffer spinlocks, so it's OK to
+ * manipulate them without holding the buffer spinlock.
+ */
+ if (StrategyControl->firstFreeBuffer >= 0)
+ {
+ while (true)
+ {
+ /* Acquire the spinlock to remove element from the freelist */
+ SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+ if (StrategyControl->firstFreeBuffer < 0)
+ {
+ SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+ break;
+ }
+
+ buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);
+ Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
+
+ /* Unconditionally remove buffer from freelist */
+ StrategyControl->firstFreeBuffer = buf->freeNext;
+ buf->freeNext = FREENEXT_NOT_IN_LIST;
+
+ /*
+ * Release the lock so someone else can access the freelist while
+ * we check out this buffer.
+ */
+ SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+
+ /*
+ * If the buffer is pinned or has a nonzero usage_count, we cannot
+ * use it; discard it and retry. (This can only happen if VACUUM
+ * put a valid buffer in the freelist and then someone else used
+ * it before we got to it. It's probably impossible altogether as
+ * of 8.3, but we'd better check anyway.)
+ */
+ local_buf_state = LockBufHdr(buf);
+ if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
+ && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
+ {
+ if (strategy != NULL)
+ AddBufferToRing(strategy, buf);
+ *buf_state = local_buf_state;
+ return buf;
+ }
+ UnlockBufHdr(buf, local_buf_state);
+
+ }
+ }
+
+ /* Nothing on the freelist, so run the "clock sweep" algorithm */
+ trycounter = NBuffers;
+ for (;;)
+ {
+ buf = GetBufferDescriptor(ClockSweepTick());
+
+ /*
+ * If the buffer is pinned or has a nonzero usage_count, we cannot use
+ * it; decrement the usage_count (unless pinned) and keep scanning.
+ */
+ local_buf_state = LockBufHdr(buf);
+
+ if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
+ {
+ if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
+ {
+ local_buf_state -= BUF_USAGECOUNT_ONE;
+
+ trycounter = NBuffers;
+ }
+ else
+ {
+ /* Found a usable buffer */
+ if (strategy != NULL)
+ AddBufferToRing(strategy, buf);
+ *buf_state = local_buf_state;
+ return buf;
+ }
+ }
+ else if (--trycounter == 0)
+ {
+ /*
+ * We've scanned all the buffers without making any state changes,
+ * so all the buffers are pinned (or were when we looked at them).
+ * We could hope that someone will free one eventually, but it's
+ * probably better to fail than to risk getting stuck in an
+ * infinite loop.
+ */
+ UnlockBufHdr(buf, local_buf_state);
+ elog(ERROR, "no unpinned buffers available");
+ }
+ UnlockBufHdr(buf, local_buf_state);
+ }
+}
+
+/*
+ * StrategyFreeBuffer: put a buffer on the freelist
+ */
+void
+StrategyFreeBuffer(BufferDesc *buf)
+{
+ SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+ /*
+ * It is possible that we are told to put something in the freelist that
+ * is already in it; don't screw up the list if so.
+ */
+ if (buf->freeNext == FREENEXT_NOT_IN_LIST)
+ {
+ buf->freeNext = StrategyControl->firstFreeBuffer;
+ if (buf->freeNext < 0)
+ StrategyControl->lastFreeBuffer = buf->buf_id;
+ StrategyControl->firstFreeBuffer = buf->buf_id;
+ }
+
+ SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+}
+
+/*
+ * StrategySyncStart -- tell BufferSync where to start syncing
+ *
+ * The result is the buffer index of the best buffer to sync first.
+ * BufferSync() will proceed circularly around the buffer array from there.
+ *
+ * In addition, we return the completed-pass count (which is effectively
+ * the higher-order bits of nextVictimBuffer) and the count of recent buffer
+ * allocs if non-NULL pointers are passed. The alloc count is reset after
+ * being read.
+ */
+int
+StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
+{
+ uint32 nextVictimBuffer;
+ int result;
+
+ SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+ nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
+ result = nextVictimBuffer % NBuffers;
+
+ if (complete_passes)
+ {
+ *complete_passes = StrategyControl->completePasses;
+
+ /*
+ * Additionally add the number of wraparounds that happened before
+ * completePasses could be incremented. C.f. ClockSweepTick().
+ */
+ *complete_passes += nextVictimBuffer / NBuffers;
+ }
+
+ if (num_buf_alloc)
+ {
+ *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
+ }
+ SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+ return result;
+}
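/*
 * Worked example (illustrative): with NBuffers = 4, completePasses = 7 and
 * nextVictimBuffer = 6 (a wraparound that ClockSweepTick() has not folded
 * back yet), the caller gets result = 6 % 4 = 2 and *complete_passes =
 * 7 + 6 / 4 = 8.
 */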
+
+/*
+ * StrategyNotifyBgWriter -- set or clear allocation notification latch
+ *
+ * If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will
+ * set that latch. Pass -1 to clear the pending notification before it
+ * happens. This feature is used by the bgwriter process to wake itself up
+ * from hibernation, and is not meant for anybody else to use.
+ */
+void
+StrategyNotifyBgWriter(int bgwprocno)
+{
+ /*
+ * We acquire buffer_strategy_lock just to ensure that the store appears
+ * atomic to StrategyGetBuffer. The bgwriter should call this rather
+ * infrequently, so there's no performance penalty from being safe.
+ */
+ SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+ StrategyControl->bgwprocno = bgwprocno;
+ SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+}
+
+
+/*
+ * StrategyShmemSize
+ *
+ * estimate the size of shared memory used by the freelist-related structures.
+ *
+ * Note: for somewhat historical reasons, the buffer lookup hashtable size
+ * is also determined here.
+ */
+Size
+StrategyShmemSize(void)
+{
+ Size size = 0;
+
+ /* size of lookup hash table ... see comment in StrategyInitialize */
+ size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
+
+ /* size of the shared replacement strategy control block */
+ size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
+
+ return size;
+}
+
+/*
+ * StrategyInitialize -- initialize the buffer cache replacement
+ * strategy.
+ *
+ * Assumes: All of the buffers are already built into a linked list.
+ * Only called by postmaster and only during initialization.
+ */
+void
+StrategyInitialize(bool init)
+{
+ bool found;
+
+ /*
+ * Initialize the shared buffer lookup hashtable.
+ *
+ * Since we can't tolerate running out of lookup table entries, we must be
+ * sure to specify an adequate table size here. The maximum steady-state
+ * usage is of course NBuffers entries, but BufferAlloc() tries to insert
+ * a new entry before deleting the old. In principle this could be
+ * happening in each partition concurrently, so we could need as many as
+ * NBuffers + NUM_BUFFER_PARTITIONS entries.
+ */
+ InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
+
+ /*
+ * Get or create the shared strategy control block
+ */
+ StrategyControl = (BufferStrategyControl *)
+ ShmemInitStruct("Buffer Strategy Status",
+ sizeof(BufferStrategyControl),
+ &found);
+
+ if (!found)
+ {
+ /*
+ * Only done once, usually in postmaster
+ */
+ Assert(init);
+
+ SpinLockInit(&StrategyControl->buffer_strategy_lock);
+
+ /*
+ * Grab the whole linked list of free buffers for our strategy. We
+ * assume it was previously set up by InitBufferPool().
+ */
+ StrategyControl->firstFreeBuffer = 0;
+ StrategyControl->lastFreeBuffer = NBuffers - 1;
+
+ /* Initialize the clock sweep pointer */
+ pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
+
+ /* Clear statistics */
+ StrategyControl->completePasses = 0;
+ pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
+
+ /* No pending notification */
+ StrategyControl->bgwprocno = -1;
+ }
+ else
+ Assert(!init);
+}
+
+
+/* ----------------------------------------------------------------
+ * Backend-private buffer ring management
+ * ----------------------------------------------------------------
+ */
+
+
+/*
+ * GetAccessStrategy -- create a BufferAccessStrategy object
+ *
+ * The object is allocated in the current memory context.
+ */
+BufferAccessStrategy
+GetAccessStrategy(BufferAccessStrategyType btype)
+{
+ BufferAccessStrategy strategy;
+ int ring_size;
+
+ /*
+ * Select ring size to use. See buffer/README for rationales.
+ *
+ * Note: if you change the ring size for BAS_BULKREAD, see also
+ * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
+ */
+ switch (btype)
+ {
+ case BAS_NORMAL:
+ /* if someone asks for NORMAL, just give 'em a "default" object */
+ return NULL;
+
+ case BAS_BULKREAD:
+ ring_size = 256 * 1024 / BLCKSZ;
+ break;
+ case BAS_BULKWRITE:
+ ring_size = 16 * 1024 * 1024 / BLCKSZ;
+ break;
+ case BAS_VACUUM:
+ ring_size = 256 * 1024 / BLCKSZ;
+ break;
+
+ default:
+ elog(ERROR, "unrecognized buffer access strategy: %d",
+ (int) btype);
+ return NULL; /* keep compiler quiet */
+ }
+
+ /* Make sure ring isn't an undue fraction of shared buffers */
+ ring_size = Min(NBuffers / 8, ring_size);
+
+ /* Allocate the object and initialize all elements to zeroes */
+ strategy = (BufferAccessStrategy)
+ palloc0(offsetof(BufferAccessStrategyData, buffers) +
+ ring_size * sizeof(Buffer));
+
+ /* Set fields that don't start out zero */
+ strategy->btype = btype;
+ strategy->ring_size = ring_size;
+
+ return strategy;
+}
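/*
 * Illustrative usage sketch, not part of the upstream file.  With the
 * default BLCKSZ of 8192 the ring sizes above come to 32 buffers (256kB)
 * for BAS_BULKREAD and BAS_VACUUM and 2048 buffers (16MB) for
 * BAS_BULKWRITE, further capped to NBuffers / 8.  The function name is
 * hypothetical; the calls themselves are the ordinary bufmgr API.
 */
static void
example_bulk_read(Relation rel, BlockNumber nblocks)
{
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
											 RBM_NORMAL, strategy);

		/* ... examine the page ... */
		ReleaseBuffer(buf);
	}

	FreeAccessStrategy(strategy);
}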
+
+/*
+ * FreeAccessStrategy -- release a BufferAccessStrategy object
+ *
+ * A simple pfree would do at the moment, but we would prefer that callers
+ * don't assume that much about the representation of BufferAccessStrategy.
+ */
+void
+FreeAccessStrategy(BufferAccessStrategy strategy)
+{
+ /* don't crash if called on a "default" strategy */
+ if (strategy != NULL)
+ pfree(strategy);
+}
+
+/*
+ * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
+ * ring is empty.
+ *
+ * The bufhdr spin lock is held on the returned buffer.
+ */
+static BufferDesc *
+GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
+{
+ BufferDesc *buf;
+ Buffer bufnum;
+ uint32 local_buf_state; /* to avoid repeated (de-)referencing */
+
+
+ /* Advance to next ring slot */
+ if (++strategy->current >= strategy->ring_size)
+ strategy->current = 0;
+
+ /*
+ * If the slot hasn't been filled yet, tell the caller to allocate a new
+ * buffer with the normal allocation strategy. The caller will then fill this
+ * slot by calling AddBufferToRing with the new buffer.
+ */
+ bufnum = strategy->buffers[strategy->current];
+ if (bufnum == InvalidBuffer)
+ {
+ strategy->current_was_in_ring = false;
+ return NULL;
+ }
+
+ /*
+ * If the buffer is pinned we cannot use it under any circumstances.
+ *
+ * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
+ * since our own previous usage of the ring element would have left it
+ * there, but it might've been decremented by clock sweep since then). A
+ * higher usage_count indicates someone else has touched the buffer, so we
+ * shouldn't re-use it.
+ */
+ buf = GetBufferDescriptor(bufnum - 1);
+ local_buf_state = LockBufHdr(buf);
+ if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
+ && BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
+ {
+ strategy->current_was_in_ring = true;
+ *buf_state = local_buf_state;
+ return buf;
+ }
+ UnlockBufHdr(buf, local_buf_state);
+
+ /*
+ * Tell caller to allocate a new buffer with the normal allocation
+ * strategy. The caller will then replace this ring element via AddBufferToRing.
+ */
+ strategy->current_was_in_ring = false;
+ return NULL;
+}
+
+/*
+ * AddBufferToRing -- add a buffer to the buffer ring
+ *
+ * Caller must hold the buffer header spinlock on the buffer. Since this
+ * is called with the spinlock held, it had better be quite cheap.
+ */
+static void
+AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
+{
+ strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
+}
+
+/*
+ * StrategyRejectBuffer -- consider rejecting a dirty buffer
+ *
+ * When a nondefault strategy is used, the buffer manager calls this function
+ * when it turns out that the buffer selected by StrategyGetBuffer needs to
+ * be written out and doing so would require flushing WAL too. This gives us
+ * a chance to choose a different victim.
+ *
+ * Returns true if buffer manager should ask for a new victim, and false
+ * if this buffer should be written and re-used.
+ */
+bool
+StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf)
+{
+ /* We only do this in bulkread mode */
+ if (strategy->btype != BAS_BULKREAD)
+ return false;
+
+ /* Don't muck with behavior of normal buffer-replacement strategy */
+ if (!strategy->current_was_in_ring ||
+ strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
+ return false;
+
+ /*
+ * Remove the dirty buffer from the ring; necessary to prevent infinite
+ * loop if all ring members are dirty.
+ */
+ strategy->buffers[strategy->current] = InvalidBuffer;
+
+ return true;
+}
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
new file mode 100644
index 0000000..04b3558
--- /dev/null
+++ b/src/backend/storage/buffer/localbuf.c
@@ -0,0 +1,596 @@
+/*-------------------------------------------------------------------------
+ *
+ * localbuf.c
+ * local buffer manager. Fast buffer manager for temporary tables,
+ * which never need to be WAL-logged or checkpointed, etc.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994-5, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/buffer/localbuf.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/parallel.h"
+#include "catalog/catalog.h"
+#include "executor/instrument.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/resowner_private.h"
+
+
+/*#define LBDEBUG*/
+
+/* entry for buffer lookup hashtable */
+typedef struct
+{
+ BufferTag key; /* Tag of a disk page */
+ int id; /* Associated local buffer's index */
+} LocalBufferLookupEnt;
+
+/* Note: this macro only works on local buffers, not shared ones! */
+#define LocalBufHdrGetBlock(bufHdr) \
+ LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
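/*
 * Worked example of the numbering scheme (illustrative): local slot 0 gets
 * buf_id = -2 (see InitLocalBuffers), so BufferDescriptorGetBuffer() yields
 * Buffer -1; callers recover the slot index as -(-1 + 1) = 0 (see
 * MarkLocalBufferDirty), and LocalBufHdrGetBlock computes -(-2 + 2) = 0 as
 * the index into LocalBufferBlockPointers.
 */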
+
+int NLocBuffer = 0; /* until buffers are initialized */
+
+BufferDesc *LocalBufferDescriptors = NULL;
+Block *LocalBufferBlockPointers = NULL;
+int32 *LocalRefCount = NULL;
+
+static int nextFreeLocalBuf = 0;
+
+static HTAB *LocalBufHash = NULL;
+
+
+static void InitLocalBuffers(void);
+static Block GetLocalBufferStorage(void);
+
+
+/*
+ * PrefetchLocalBuffer -
+ * initiate asynchronous read of a block of a relation
+ *
+ * Do PrefetchBuffer's work for temporary relations.
+ * No-op if prefetching isn't compiled in.
+ */
+PrefetchBufferResult
+PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum,
+ BlockNumber blockNum)
+{
+ PrefetchBufferResult result = {InvalidBuffer, false};
+ BufferTag newTag; /* identity of requested block */
+ LocalBufferLookupEnt *hresult;
+
+ INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
+
+ /* Initialize local buffers if first request in this session */
+ if (LocalBufHash == NULL)
+ InitLocalBuffers();
+
+ /* See if the desired buffer already exists */
+ hresult = (LocalBufferLookupEnt *)
+ hash_search(LocalBufHash, (void *) &newTag, HASH_FIND, NULL);
+
+ if (hresult)
+ {
+ /* Yes, so nothing to do */
+ result.recent_buffer = -hresult->id - 1;
+ }
+ else
+ {
+#ifdef USE_PREFETCH
+ /* Not in buffers, so initiate prefetch */
+ smgrprefetch(smgr, forkNum, blockNum);
+ result.initiated_io = true;
+#endif /* USE_PREFETCH */
+ }
+
+ return result;
+}
+
+
+/*
+ * LocalBufferAlloc -
+ * Find or create a local buffer for the given page of the given relation.
+ *
+ * API is similar to bufmgr.c's BufferAlloc, except that we do not need
+ * to do any locking since this is all local. Also, IO_IN_PROGRESS
+ * does not get set. Lastly, we support only default access strategy
+ * (hence, usage_count is always advanced).
+ */
+BufferDesc *
+LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
+ bool *foundPtr)
+{
+ BufferTag newTag; /* identity of requested block */
+ LocalBufferLookupEnt *hresult;
+ BufferDesc *bufHdr;
+ int b;
+ int trycounter;
+ bool found;
+ uint32 buf_state;
+
+ INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
+
+ /* Initialize local buffers if first request in this session */
+ if (LocalBufHash == NULL)
+ InitLocalBuffers();
+
+ /* See if the desired buffer already exists */
+ hresult = (LocalBufferLookupEnt *)
+ hash_search(LocalBufHash, (void *) &newTag, HASH_FIND, NULL);
+
+ if (hresult)
+ {
+ b = hresult->id;
+ bufHdr = GetLocalBufferDescriptor(b);
+ Assert(BUFFERTAGS_EQUAL(bufHdr->tag, newTag));
+#ifdef LBDEBUG
+ fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
+ smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
+#endif
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ /* this part is equivalent to PinBuffer for a shared buffer */
+ if (LocalRefCount[b] == 0)
+ {
+ if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
+ {
+ buf_state += BUF_USAGECOUNT_ONE;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+ }
+ }
+ LocalRefCount[b]++;
+ ResourceOwnerRememberBuffer(CurrentResourceOwner,
+ BufferDescriptorGetBuffer(bufHdr));
+ if (buf_state & BM_VALID)
+ *foundPtr = true;
+ else
+ {
+ /* Previous read attempt must have failed; try again */
+ *foundPtr = false;
+ }
+ return bufHdr;
+ }
+
+#ifdef LBDEBUG
+ fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
+ smgr->smgr_rnode.node.relNode, forkNum, blockNum,
+ -nextFreeLocalBuf - 1);
+#endif
+
+ /*
+ * Need to get a new buffer. We use a clock sweep algorithm (essentially
+ * the same as what freelist.c does now...)
+ */
+ trycounter = NLocBuffer;
+ for (;;)
+ {
+ b = nextFreeLocalBuf;
+
+ if (++nextFreeLocalBuf >= NLocBuffer)
+ nextFreeLocalBuf = 0;
+
+ bufHdr = GetLocalBufferDescriptor(b);
+
+ if (LocalRefCount[b] == 0)
+ {
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0)
+ {
+ buf_state -= BUF_USAGECOUNT_ONE;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+ trycounter = NLocBuffer;
+ }
+ else
+ {
+ /* Found a usable buffer */
+ LocalRefCount[b]++;
+ ResourceOwnerRememberBuffer(CurrentResourceOwner,
+ BufferDescriptorGetBuffer(bufHdr));
+ break;
+ }
+ }
+ else if (--trycounter == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("no empty local buffer available")));
+ }
+
+ /*
+ * This buffer is not referenced but it might still be dirty. If that's
+ * the case, write it out before reusing it!
+ */
+ if (buf_state & BM_DIRTY)
+ {
+ SMgrRelation oreln;
+ Page localpage = (char *) LocalBufHdrGetBlock(bufHdr);
+
+ /* Find smgr relation for buffer */
+ oreln = smgropen(bufHdr->tag.rnode, MyBackendId);
+
+ PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
+
+ /* And write... */
+ smgrwrite(oreln,
+ bufHdr->tag.forkNum,
+ bufHdr->tag.blockNum,
+ localpage,
+ false);
+
+ /* Mark not-dirty now in case we error out below */
+ buf_state &= ~BM_DIRTY;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+
+ pgBufferUsage.local_blks_written++;
+ }
+
+ /*
+ * lazy memory allocation: allocate space on first use of a buffer.
+ */
+ if (LocalBufHdrGetBlock(bufHdr) == NULL)
+ {
+ /* Set pointer for use by BufferGetBlock() macro */
+ LocalBufHdrGetBlock(bufHdr) = GetLocalBufferStorage();
+ }
+
+ /*
+ * Update the hash table: remove old entry, if any, and make new one.
+ */
+ if (buf_state & BM_TAG_VALID)
+ {
+ hresult = (LocalBufferLookupEnt *)
+ hash_search(LocalBufHash, (void *) &bufHdr->tag,
+ HASH_REMOVE, NULL);
+ if (!hresult) /* shouldn't happen */
+ elog(ERROR, "local buffer hash table corrupted");
+ /* mark buffer invalid just in case hash insert fails */
+ CLEAR_BUFFERTAG(bufHdr->tag);
+ buf_state &= ~(BM_VALID | BM_TAG_VALID);
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+ }
+
+ hresult = (LocalBufferLookupEnt *)
+ hash_search(LocalBufHash, (void *) &newTag, HASH_ENTER, &found);
+ if (found) /* shouldn't happen */
+ elog(ERROR, "local buffer hash table corrupted");
+ hresult->id = b;
+
+ /*
+ * it's all ours now.
+ */
+ bufHdr->tag = newTag;
+ buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
+ buf_state |= BM_TAG_VALID;
+ buf_state &= ~BUF_USAGECOUNT_MASK;
+ buf_state += BUF_USAGECOUNT_ONE;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+
+ *foundPtr = false;
+ return bufHdr;
+}
+
+/*
+ * MarkLocalBufferDirty -
+ * mark a local buffer dirty
+ */
+void
+MarkLocalBufferDirty(Buffer buffer)
+{
+ int bufid;
+ BufferDesc *bufHdr;
+ uint32 buf_state;
+
+ Assert(BufferIsLocal(buffer));
+
+#ifdef LBDEBUG
+ fprintf(stderr, "LB DIRTY %d\n", buffer);
+#endif
+
+ bufid = -(buffer + 1);
+
+ Assert(LocalRefCount[bufid] > 0);
+
+ bufHdr = GetLocalBufferDescriptor(bufid);
+
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ if (!(buf_state & BM_DIRTY))
+ pgBufferUsage.local_blks_dirtied++;
+
+ buf_state |= BM_DIRTY;
+
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+}
+
+/*
+ * DropRelFileNodeLocalBuffers
+ * This function removes from the buffer pool all the pages of the
+ * specified relation that have block numbers >= firstDelBlock.
+ * (In particular, with firstDelBlock = 0, all pages are removed.)
+ * Dirty pages are simply dropped, without bothering to write them
+ * out first. Therefore, this is NOT rollback-able, and so should be
+ * used only with extreme caution!
+ *
+ * See DropRelFileNodeBuffers in bufmgr.c for more notes.
+ */
+void
+DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
+ BlockNumber firstDelBlock)
+{
+ int i;
+
+ for (i = 0; i < NLocBuffer; i++)
+ {
+ BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
+ LocalBufferLookupEnt *hresult;
+ uint32 buf_state;
+
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ if ((buf_state & BM_TAG_VALID) &&
+ RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+ bufHdr->tag.forkNum == forkNum &&
+ bufHdr->tag.blockNum >= firstDelBlock)
+ {
+ if (LocalRefCount[i] != 0)
+ elog(ERROR, "block %u of %s is still referenced (local %u)",
+ bufHdr->tag.blockNum,
+ relpathbackend(bufHdr->tag.rnode, MyBackendId,
+ bufHdr->tag.forkNum),
+ LocalRefCount[i]);
+ /* Remove entry from hashtable */
+ hresult = (LocalBufferLookupEnt *)
+ hash_search(LocalBufHash, (void *) &bufHdr->tag,
+ HASH_REMOVE, NULL);
+ if (!hresult) /* shouldn't happen */
+ elog(ERROR, "local buffer hash table corrupted");
+ /* Mark buffer invalid */
+ CLEAR_BUFFERTAG(bufHdr->tag);
+ buf_state &= ~BUF_FLAG_MASK;
+ buf_state &= ~BUF_USAGECOUNT_MASK;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+ }
+ }
+}
+
+/*
+ * DropRelFileNodeAllLocalBuffers
+ * This function removes from the buffer pool all pages of all forks
+ * of the specified relation.
+ *
+ * See DropRelFileNodesAllBuffers in bufmgr.c for more notes.
+ */
+void
+DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
+{
+ int i;
+
+ for (i = 0; i < NLocBuffer; i++)
+ {
+ BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
+ LocalBufferLookupEnt *hresult;
+ uint32 buf_state;
+
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ if ((buf_state & BM_TAG_VALID) &&
+ RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+ {
+ if (LocalRefCount[i] != 0)
+ elog(ERROR, "block %u of %s is still referenced (local %u)",
+ bufHdr->tag.blockNum,
+ relpathbackend(bufHdr->tag.rnode, MyBackendId,
+ bufHdr->tag.forkNum),
+ LocalRefCount[i]);
+ /* Remove entry from hashtable */
+ hresult = (LocalBufferLookupEnt *)
+ hash_search(LocalBufHash, (void *) &bufHdr->tag,
+ HASH_REMOVE, NULL);
+ if (!hresult) /* shouldn't happen */
+ elog(ERROR, "local buffer hash table corrupted");
+ /* Mark buffer invalid */
+ CLEAR_BUFFERTAG(bufHdr->tag);
+ buf_state &= ~BUF_FLAG_MASK;
+ buf_state &= ~BUF_USAGECOUNT_MASK;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+ }
+ }
+}
+
+/*
+ * InitLocalBuffers -
+ * init the local buffer cache. Since most queries (esp. multi-user ones)
+ * don't involve local buffers, we delay allocating actual memory for the
+ * buffers until we need them; just make the buffer headers here.
+ */
+static void
+InitLocalBuffers(void)
+{
+ int nbufs = num_temp_buffers;
+ HASHCTL info;
+ int i;
+
+ /*
+ * Parallel workers can't access data in temporary tables, because they
+ * have no visibility into the local buffers of their leader. This is a
+ * convenient, low-cost place to provide a backstop check for that. Note
+ * that we don't wish to prevent a parallel worker from accessing catalog
+ * metadata about a temp table, so checks at higher levels would be
+ * inappropriate.
+ */
+ if (IsParallelWorker())
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+ errmsg("cannot access temporary tables during a parallel operation")));
+
+ /* Allocate and zero buffer headers and auxiliary arrays */
+ LocalBufferDescriptors = (BufferDesc *) calloc(nbufs, sizeof(BufferDesc));
+ LocalBufferBlockPointers = (Block *) calloc(nbufs, sizeof(Block));
+ LocalRefCount = (int32 *) calloc(nbufs, sizeof(int32));
+ if (!LocalBufferDescriptors || !LocalBufferBlockPointers || !LocalRefCount)
+ ereport(FATAL,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+
+ nextFreeLocalBuf = 0;
+
+ /* initialize fields that need to start off nonzero */
+ for (i = 0; i < nbufs; i++)
+ {
+ BufferDesc *buf = GetLocalBufferDescriptor(i);
+
+ /*
+ * negative to indicate local buffer. This is tricky: shared buffers
+ * start with 0. We have to start with -2. (Note that the routine
+ * BufferDescriptorGetBuffer adds 1 to buf_id so our first buffer id
+ * is -1.)
+ */
+ buf->buf_id = -i - 2;
+
+ /*
+ * Intentionally do not initialize the buffer's atomic variable
+ * (besides zeroing the underlying memory above). That way we get
+ * errors on platforms without atomics, if somebody (re-)introduces
+ * atomic operations for local buffers.
+ */
+ }
+
+ /* Create the lookup hash table */
+ info.keysize = sizeof(BufferTag);
+ info.entrysize = sizeof(LocalBufferLookupEnt);
+
+ LocalBufHash = hash_create("Local Buffer Lookup Table",
+ nbufs,
+ &info,
+ HASH_ELEM | HASH_BLOBS);
+
+ if (!LocalBufHash)
+ elog(ERROR, "could not initialize local buffer hash table");
+
+ /* Initialization done, mark buffers allocated */
+ NLocBuffer = nbufs;
+}
+
+/*
+ * GetLocalBufferStorage - allocate memory for a local buffer
+ *
+ * The idea of this function is to aggregate our requests for storage
+ * so that the memory manager doesn't see a whole lot of relatively small
+ * requests. Since we'll never give back a local buffer once it's created
+ * within a particular process, no point in burdening memmgr with separately
+ * managed chunks.
+ */
+static Block
+GetLocalBufferStorage(void)
+{
+ static char *cur_block = NULL;
+ static int next_buf_in_block = 0;
+ static int num_bufs_in_block = 0;
+ static int total_bufs_allocated = 0;
+ static MemoryContext LocalBufferContext = NULL;
+
+ char *this_buf;
+
+ Assert(total_bufs_allocated < NLocBuffer);
+
+ if (next_buf_in_block >= num_bufs_in_block)
+ {
+ /* Need to make a new request to memmgr */
+ int num_bufs;
+
+ /*
+ * We allocate local buffers in a context of their own, so that the
+ * space eaten for them is easily recognizable in MemoryContextStats
+ * output. Create the context on first use.
+ */
+ if (LocalBufferContext == NULL)
+ LocalBufferContext =
+ AllocSetContextCreate(TopMemoryContext,
+ "LocalBufferContext",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /* Start with a 16-buffer request; subsequent ones double each time */
+ num_bufs = Max(num_bufs_in_block * 2, 16);
+ /* But not more than what we need for all remaining local bufs */
+ num_bufs = Min(num_bufs, NLocBuffer - total_bufs_allocated);
+ /* And don't overflow MaxAllocSize, either */
+ num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ);
+
+ cur_block = (char *) MemoryContextAlloc(LocalBufferContext,
+ num_bufs * BLCKSZ);
+ next_buf_in_block = 0;
+ num_bufs_in_block = num_bufs;
+ }
+
+ /* Allocate next buffer in current memory block */
+ this_buf = cur_block + next_buf_in_block * BLCKSZ;
+ next_buf_in_block++;
+ total_bufs_allocated++;
+
+ return (Block) this_buf;
+}
+
+/*
+ * CheckForLocalBufferLeaks - ensure this backend holds no local buffer pins
+ *
+ * This is just like CheckForBufferLeaks(), but for local buffers.
+ */
+static void
+CheckForLocalBufferLeaks(void)
+{
+#ifdef USE_ASSERT_CHECKING
+ if (LocalRefCount)
+ {
+ int RefCountErrors = 0;
+ int i;
+
+ for (i = 0; i < NLocBuffer; i++)
+ {
+ if (LocalRefCount[i] != 0)
+ {
+ Buffer b = -i - 1;
+
+ PrintBufferLeakWarning(b);
+ RefCountErrors++;
+ }
+ }
+ Assert(RefCountErrors == 0);
+ }
+#endif
+}
+
+/*
+ * AtEOXact_LocalBuffers - clean up at end of transaction.
+ *
+ * This is just like AtEOXact_Buffers, but for local buffers.
+ */
+void
+AtEOXact_LocalBuffers(bool isCommit)
+{
+ CheckForLocalBufferLeaks();
+}
+
+/*
+ * AtProcExit_LocalBuffers - ensure we have dropped pins during backend exit.
+ *
+ * This is just like AtProcExit_Buffers, but for local buffers.
+ */
+void
+AtProcExit_LocalBuffers(void)
+{
+ /*
+ * We shouldn't be holding any remaining pins; if we are, and assertions
+ * aren't enabled, we'll fail later in DropRelFileNodeBuffers while trying
+ * to drop the temp rels.
+ */
+ CheckForLocalBufferLeaks();
+}
diff --git a/src/backend/storage/file/Makefile b/src/backend/storage/file/Makefile
new file mode 100644
index 0000000..5e1291b
--- /dev/null
+++ b/src/backend/storage/file/Makefile
@@ -0,0 +1,22 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/file
+#
+# IDENTIFICATION
+# src/backend/storage/file/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/file
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ buffile.o \
+ copydir.o \
+ fd.o \
+ reinit.o \
+ sharedfileset.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
new file mode 100644
index 0000000..a4be5fe
--- /dev/null
+++ b/src/backend/storage/file/buffile.c
@@ -0,0 +1,949 @@
+/*-------------------------------------------------------------------------
+ *
+ * buffile.c
+ * Management of large buffered temporary files.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/file/buffile.c
+ *
+ * NOTES:
+ *
+ * BufFiles provide a very incomplete emulation of stdio atop virtual Files
+ * (as managed by fd.c). Currently, we only support the buffered-I/O
+ * aspect of stdio: a read or write of the low-level File occurs only
+ * when the buffer is filled or emptied. This is an even bigger win
+ * for virtual Files than for ordinary kernel files, since reducing the
+ * frequency with which a virtual File is touched reduces "thrashing"
+ * of opening/closing file descriptors.
+ *
+ * Note that BufFile structs are allocated with palloc(), and therefore
+ * will go away automatically at query/transaction end. Since the underlying
+ * virtual Files are made with OpenTemporaryFile, all resources for
+ * the file are certain to be cleaned up even if processing is aborted
+ * by ereport(ERROR). The data structures required are made in the
+ * palloc context that was current when the BufFile was created, and
+ * any external resources such as temp files are owned by the ResourceOwner
+ * that was current at that time.
+ *
+ * BufFile also supports temporary files that exceed the OS file size limit
+ * (by opening multiple fd.c temporary files). This is an essential feature
+ * for sorts and hashjoins on large amounts of data.
+ *
+ * BufFile supports temporary files that can be shared with other backends, as
+ * infrastructure for parallel execution. Such files need to be created as a
+ * member of a SharedFileSet that all participants are attached to.
+ *
+ * BufFile also supports temporary files that can be used by a single backend
+ * when the corresponding files need to survive across transactions and
+ * need to be opened and closed multiple times. Such files need to be created
+ * as a member of a SharedFileSet.
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "commands/tablespace.h"
+#include "executor/instrument.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/buf_internals.h"
+#include "storage/buffile.h"
+#include "storage/fd.h"
+#include "utils/resowner.h"
+
+/*
+ * We break BufFiles into gigabyte-sized segments, regardless of RELSEG_SIZE.
+ * The reason is that we'd like large BufFiles to be spread across multiple
+ * tablespaces when available.
+ */
+#define MAX_PHYSICAL_FILESIZE 0x40000000
+#define BUFFILE_SEG_SIZE (MAX_PHYSICAL_FILESIZE / BLCKSZ)
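/*
 * Illustrative numbers: MAX_PHYSICAL_FILESIZE is 0x40000000 bytes (1GB), so
 * with the default BLCKSZ of 8192 each segment holds BUFFILE_SEG_SIZE =
 * 131072 blocks; a logical BufFile of, say, 2.5GB is spread over three
 * underlying fd.c files.
 */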
+
+/*
+ * This data structure represents a buffered file that consists of one or
+ * more physical files (each accessed through a virtual file descriptor
+ * managed by fd.c).
+ */
+struct BufFile
+{
+ int numFiles; /* number of physical files in set */
+ /* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
+ File *files; /* palloc'd array with numFiles entries */
+
+ bool isInterXact; /* keep open over transactions? */
+ bool dirty; /* does buffer need to be written? */
+ bool readOnly; /* has the file been set to read only? */
+
+ SharedFileSet *fileset; /* space for segment files if shared */
+ const char *name; /* name of this BufFile if shared */
+
+ /*
+ * resowner is the ResourceOwner to use for underlying temp files. (We
+ * don't need to remember the memory context we're using explicitly,
+ * because after creation we only repalloc our arrays larger.)
+ */
+ ResourceOwner resowner;
+
+ /*
+ * "current pos" is position of start of buffer within the logical file.
+ * Position as seen by user of BufFile is (curFile, curOffset + pos).
+ */
+ int curFile; /* file index (0..n) part of current pos */
+ off_t curOffset; /* offset part of current pos */
+ int pos; /* next read/write position in buffer */
+ int nbytes; /* total # of valid bytes in buffer */
+ PGAlignedBlock buffer;
+};
+
+static BufFile *makeBufFileCommon(int nfiles);
+static BufFile *makeBufFile(File firstfile);
+static void extendBufFile(BufFile *file);
+static void BufFileLoadBuffer(BufFile *file);
+static void BufFileDumpBuffer(BufFile *file);
+static void BufFileFlush(BufFile *file);
+static File MakeNewSharedSegment(BufFile *file, int segment);
+
+/*
+ * Create BufFile and perform the common initialization.
+ */
+static BufFile *
+makeBufFileCommon(int nfiles)
+{
+ BufFile *file = (BufFile *) palloc(sizeof(BufFile));
+
+ file->numFiles = nfiles;
+ file->isInterXact = false;
+ file->dirty = false;
+ file->resowner = CurrentResourceOwner;
+ file->curFile = 0;
+ file->curOffset = 0L;
+ file->pos = 0;
+ file->nbytes = 0;
+
+ return file;
+}
+
+/*
+ * Create a BufFile given the first underlying physical file.
+ * NOTE: caller must set isInterXact if appropriate.
+ */
+static BufFile *
+makeBufFile(File firstfile)
+{
+ BufFile *file = makeBufFileCommon(1);
+
+ file->files = (File *) palloc(sizeof(File));
+ file->files[0] = firstfile;
+ file->readOnly = false;
+ file->fileset = NULL;
+ file->name = NULL;
+
+ return file;
+}
+
+/*
+ * Add another component temp file.
+ */
+static void
+extendBufFile(BufFile *file)
+{
+ File pfile;
+ ResourceOwner oldowner;
+
+ /* Be sure to associate the file with the BufFile's resource owner */
+ oldowner = CurrentResourceOwner;
+ CurrentResourceOwner = file->resowner;
+
+ if (file->fileset == NULL)
+ pfile = OpenTemporaryFile(file->isInterXact);
+ else
+ pfile = MakeNewSharedSegment(file, file->numFiles);
+
+ Assert(pfile >= 0);
+
+ CurrentResourceOwner = oldowner;
+
+ file->files = (File *) repalloc(file->files,
+ (file->numFiles + 1) * sizeof(File));
+ file->files[file->numFiles] = pfile;
+ file->numFiles++;
+}
+
+/*
+ * Create a BufFile for a new temporary file (which will expand to become
+ * multiple temporary files if more than MAX_PHYSICAL_FILESIZE bytes are
+ * written to it).
+ *
+ * If interXact is true, the temp file will not be automatically deleted
+ * at end of transaction.
+ *
+ * Note: if interXact is true, the caller had better be calling us in a
+ * memory context, and with a resource owner, that will survive across
+ * transaction boundaries.
+ */
+BufFile *
+BufFileCreateTemp(bool interXact)
+{
+ BufFile *file;
+ File pfile;
+
+ /*
+ * Ensure that temp tablespaces are set up for OpenTemporaryFile to use.
+ * Possibly the caller will have done this already, but it seems useful to
+ * double-check here. Failure to do this at all would result in the temp
+ * files always getting placed in the default tablespace, which is a
+ * pretty hard-to-detect bug. Callers may prefer to do it earlier if they
+ * want to be sure that any required catalog access is done in some other
+ * resource context.
+ */
+ PrepareTempTablespaces();
+
+ pfile = OpenTemporaryFile(interXact);
+ Assert(pfile >= 0);
+
+ file = makeBufFile(pfile);
+ file->isInterXact = interXact;
+
+ return file;
+}
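/*
 * Illustrative usage sketch, not part of the upstream file: write a few
 * bytes to a transaction-local temp file, rewind, and read them back.  The
 * function name is hypothetical; the calls are the public BufFile API.
 */
static void
example_temp_roundtrip(void)
{
	BufFile    *file = BufFileCreateTemp(false);
	char		out[] = "hello";
	char		in[sizeof(out)];

	BufFileWrite(file, out, sizeof(out));

	/* rewind to the logical start: segment 0, offset 0 */
	if (BufFileSeek(file, 0, 0L, SEEK_SET) != 0)
		elog(ERROR, "could not rewind temporary file");

	if (BufFileRead(file, in, sizeof(in)) != sizeof(in))
		elog(ERROR, "unexpected EOF in temporary file");

	BufFileClose(file);
}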
+
+/*
+ * Build the name for a given segment of a given BufFile.
+ */
+static void
+SharedSegmentName(char *name, const char *buffile_name, int segment)
+{
+ snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment);
+}
+
+/*
+ * Create a new segment file backing a shared BufFile.
+ */
+static File
+MakeNewSharedSegment(BufFile *buffile, int segment)
+{
+ char name[MAXPGPATH];
+ File file;
+
+ /*
+ * It is possible that files with the same name are left over from before a
+ * crash restart. In order for BufFileOpenShared() not to
+ * get confused about how many segments there are, we'll unlink the next
+ * segment number if it already exists.
+ */
+ SharedSegmentName(name, buffile->name, segment + 1);
+ SharedFileSetDelete(buffile->fileset, name, true);
+
+ /* Create the new segment. */
+ SharedSegmentName(name, buffile->name, segment);
+ file = SharedFileSetCreate(buffile->fileset, name);
+
+ /* SharedFileSetCreate would've errored out */
+ Assert(file > 0);
+
+ return file;
+}
+
+/*
+ * Create a BufFile that can be discovered and opened read-only by other
+ * backends that are attached to the same SharedFileSet using the same name.
+ *
+ * The naming scheme for shared BufFiles is left up to the calling code. The
+ * name will appear as part of one or more filenames on disk, and might
+ * provide clues to administrators about which subsystem is generating
+ * temporary file data. Since each SharedFileSet object is backed by one or
+ * more uniquely named temporary directories, names don't conflict with
+ * unrelated SharedFileSet objects.
+ */
+BufFile *
+BufFileCreateShared(SharedFileSet *fileset, const char *name)
+{
+ BufFile *file;
+
+ file = makeBufFileCommon(1);
+ file->fileset = fileset;
+ file->name = pstrdup(name);
+ file->files = (File *) palloc(sizeof(File));
+ file->files[0] = MakeNewSharedSegment(file, 0);
+ file->readOnly = false;
+
+ return file;
+}
+
+/*
+ * Open a file that was previously created in another backend (or this one)
+ * with BufFileCreateShared in the same SharedFileSet using the same name.
+ * The backend that created the file must have called BufFileClose() or
+ * BufFileExportShared() to make sure that it is ready to be opened by other
+ * backends and render it read-only.
+ */
+BufFile *
+BufFileOpenShared(SharedFileSet *fileset, const char *name, int mode)
+{
+ BufFile *file;
+ char segment_name[MAXPGPATH];
+ Size capacity = 16;
+ File *files;
+ int nfiles = 0;
+
+ files = palloc(sizeof(File) * capacity);
+
+ /*
+ * We don't know how many segments there are, so we'll probe the
+ * filesystem to find out.
+ */
+ for (;;)
+ {
+ /* See if we need to expand our file segment array. */
+ if (nfiles + 1 > capacity)
+ {
+ capacity *= 2;
+ files = repalloc(files, sizeof(File) * capacity);
+ }
+ /* Try to load a segment. */
+ SharedSegmentName(segment_name, name, nfiles);
+ files[nfiles] = SharedFileSetOpen(fileset, segment_name, mode);
+ if (files[nfiles] <= 0)
+ break;
+ ++nfiles;
+
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * If we didn't find any files at all, then no BufFile exists with this
+ * name.
+ */
+ if (nfiles == 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open temporary file \"%s\" from BufFile \"%s\": %m",
+ segment_name, name)));
+
+ file = makeBufFileCommon(nfiles);
+ file->files = files;
+ file->readOnly = (mode == O_RDONLY) ? true : false;
+ file->fileset = fileset;
+ file->name = pstrdup(name);
+
+ return file;
+}
+
+/*
+ * Delete a BufFile that was created by BufFileCreateShared in the given
+ * SharedFileSet using the given name.
+ *
+ * It is not necessary to delete files explicitly with this function. It is
+ * provided only as a way to delete files proactively, rather than waiting for
+ * the SharedFileSet to be cleaned up.
+ *
+ * Only one backend should attempt to delete a given name, and should know
+ * that it exists and has been exported or closed.
+ */
+void
+BufFileDeleteShared(SharedFileSet *fileset, const char *name)
+{
+ char segment_name[MAXPGPATH];
+ int segment = 0;
+ bool found = false;
+
+ /*
+ * We don't know how many segments the file has. We'll keep deleting
+ * until we run out. If we don't manage to find even an initial segment,
+ * raise an error.
+ */
+ for (;;)
+ {
+ SharedSegmentName(segment_name, name, segment);
+ if (!SharedFileSetDelete(fileset, segment_name, true))
+ break;
+ found = true;
+ ++segment;
+
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ if (!found)
+ elog(ERROR, "could not delete unknown shared BufFile \"%s\"", name);
+}
+
+/*
+ * BufFileExportShared --- flush and make read-only, in preparation for sharing.
+ */
+void
+BufFileExportShared(BufFile *file)
+{
+ /* Must be a file belonging to a SharedFileSet. */
+ Assert(file->fileset != NULL);
+
+ /* It's probably a bug if someone calls this twice. */
+ Assert(!file->readOnly);
+
+ BufFileFlush(file);
+ file->readOnly = true;
+}
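/*
 * Illustrative sketch, not part of the upstream file, of the sharing
 * protocol.  The SharedFileSet is assumed to live in shared memory and to
 * have been set up with the SharedFileSetInit()/SharedFileSetAttach()
 * machinery from sharedfileset.c; the name "example" and the function
 * names are hypothetical.
 */
static void
example_export(SharedFileSet *fileset, const char *payload, size_t len)
{
	/* producer: create, fill, then freeze the file for sharing */
	BufFile    *file = BufFileCreateShared(fileset, "example");

	BufFileWrite(file, (void *) payload, len);
	BufFileExportShared(file);
	BufFileClose(file);
}

static size_t
example_import(SharedFileSet *fileset, char *dst, size_t len)
{
	/* consumer, possibly in another backend: open read-only and read */
	BufFile    *file = BufFileOpenShared(fileset, "example", O_RDONLY);
	size_t		nread = BufFileRead(file, dst, len);

	BufFileClose(file);
	return nread;
}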
+
+/*
+ * Close a BufFile
+ *
+ * Like fclose(), this also implicitly FileCloses the underlying File.
+ */
+void
+BufFileClose(BufFile *file)
+{
+ int i;
+
+ /* flush any unwritten data */
+ BufFileFlush(file);
+ /* close and delete the underlying file(s) */
+ for (i = 0; i < file->numFiles; i++)
+ FileClose(file->files[i]);
+ /* release the buffer space */
+ pfree(file->files);
+ pfree(file);
+}
+
+/*
+ * BufFileLoadBuffer
+ *
+ * Load some data into buffer, if possible, starting from curOffset.
+ * At call, must have dirty = false, pos and nbytes = 0.
+ * On exit, nbytes is number of bytes loaded.
+ */
+static void
+BufFileLoadBuffer(BufFile *file)
+{
+ File thisfile;
+
+ /*
+ * Advance to next component file if necessary and possible.
+ */
+ if (file->curOffset >= MAX_PHYSICAL_FILESIZE &&
+ file->curFile + 1 < file->numFiles)
+ {
+ file->curFile++;
+ file->curOffset = 0L;
+ }
+
+ /*
+ * Read whatever we can get, up to a full bufferload.
+ */
+ thisfile = file->files[file->curFile];
+ file->nbytes = FileRead(thisfile,
+ file->buffer.data,
+ sizeof(file->buffer),
+ file->curOffset,
+ WAIT_EVENT_BUFFILE_READ);
+ if (file->nbytes < 0)
+ {
+ file->nbytes = 0;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ FilePathName(thisfile))));
+ }
+
+ /* we choose not to advance curOffset here */
+
+ if (file->nbytes > 0)
+ pgBufferUsage.temp_blks_read++;
+}
+
+/*
+ * BufFileDumpBuffer
+ *
+ * Dump buffer contents starting at curOffset.
+ * At call, should have dirty = true, nbytes > 0.
+ * On exit, dirty is cleared if successful write, and curOffset is advanced.
+ */
+static void
+BufFileDumpBuffer(BufFile *file)
+{
+ int wpos = 0;
+ int bytestowrite;
+ File thisfile;
+
+ /*
+ * Unlike BufFileLoadBuffer, we must dump the whole buffer even if it
+ * crosses a component-file boundary; so we need a loop.
+ */
+ while (wpos < file->nbytes)
+ {
+ off_t availbytes;
+
+ /*
+ * Advance to next component file if necessary and possible.
+ */
+ if (file->curOffset >= MAX_PHYSICAL_FILESIZE)
+ {
+ while (file->curFile + 1 >= file->numFiles)
+ extendBufFile(file);
+ file->curFile++;
+ file->curOffset = 0L;
+ }
+
+ /*
+ * Determine how much we need to write into this file.
+ */
+ bytestowrite = file->nbytes - wpos;
+ availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
+
+ if ((off_t) bytestowrite > availbytes)
+ bytestowrite = (int) availbytes;
+
+ thisfile = file->files[file->curFile];
+ bytestowrite = FileWrite(thisfile,
+ file->buffer.data + wpos,
+ bytestowrite,
+ file->curOffset,
+ WAIT_EVENT_BUFFILE_WRITE);
+ if (bytestowrite <= 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m",
+ FilePathName(thisfile))));
+ file->curOffset += bytestowrite;
+ wpos += bytestowrite;
+
+ pgBufferUsage.temp_blks_written++;
+ }
+ file->dirty = false;
+
+ /*
+ * At this point, curOffset has been advanced to the end of the buffer,
+ * ie, its original value + nbytes. We need to make it point to the
+ * logical file position, ie, original value + pos, in case that is less
+ * (as could happen due to a small backwards seek in a dirty buffer!)
+ */
+ file->curOffset -= (file->nbytes - file->pos);
+ if (file->curOffset < 0) /* handle possible segment crossing */
+ {
+ file->curFile--;
+ Assert(file->curFile >= 0);
+ file->curOffset += MAX_PHYSICAL_FILESIZE;
+ }
+
+ /*
+ * Now we can set the buffer empty without changing the logical position
+ */
+ file->pos = 0;
+ file->nbytes = 0;
+}
+
+/*
+ * BufFileRead
+ *
+ * Like fread() except we assume 1-byte element size and report I/O errors via
+ * ereport().
+ */
+size_t
+BufFileRead(BufFile *file, void *ptr, size_t size)
+{
+ size_t nread = 0;
+ size_t nthistime;
+
+ BufFileFlush(file);
+
+ while (size > 0)
+ {
+ if (file->pos >= file->nbytes)
+ {
+ /* Try to load more data into buffer. */
+ file->curOffset += file->pos;
+ file->pos = 0;
+ file->nbytes = 0;
+ BufFileLoadBuffer(file);
+ if (file->nbytes <= 0)
+ break; /* no more data available */
+ }
+
+ nthistime = file->nbytes - file->pos;
+ if (nthistime > size)
+ nthistime = size;
+ Assert(nthistime > 0);
+
+ memcpy(ptr, file->buffer.data + file->pos, nthistime);
+
+ file->pos += nthistime;
+ ptr = (void *) ((char *) ptr + nthistime);
+ size -= nthistime;
+ nread += nthistime;
+ }
+
+ return nread;
+}
+
+/*
+ * BufFileWrite
+ *
+ * Like fwrite() except we assume 1-byte element size and report errors via
+ * ereport().
+ */
+void
+BufFileWrite(BufFile *file, void *ptr, size_t size)
+{
+ size_t nthistime;
+
+ Assert(!file->readOnly);
+
+ while (size > 0)
+ {
+ if (file->pos >= BLCKSZ)
+ {
+ /* Buffer full, dump it out */
+ if (file->dirty)
+ BufFileDumpBuffer(file);
+ else
+ {
+ /* Hmm, went directly from reading to writing? */
+ file->curOffset += file->pos;
+ file->pos = 0;
+ file->nbytes = 0;
+ }
+ }
+
+ nthistime = BLCKSZ - file->pos;
+ if (nthistime > size)
+ nthistime = size;
+ Assert(nthistime > 0);
+
+ memcpy(file->buffer.data + file->pos, ptr, nthistime);
+
+ file->dirty = true;
+ file->pos += nthistime;
+ if (file->nbytes < file->pos)
+ file->nbytes = file->pos;
+ ptr = (void *) ((char *) ptr + nthistime);
+ size -= nthistime;
+ }
+}
+
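+/*
+ * Usage sketch (illustrative only; it uses just the BufFile routines
+ * declared in buffile.h, with error handling reduced to elog): write a few
+ * bytes to a temporary BufFile, rewind, and read them back.
+ *
+ *		BufFile    *bf = BufFileCreateTemp(false);
+ *		char		in[4] = {'a', 'b', 'c', 'd'};
+ *		char		out[4];
+ *
+ *		BufFileWrite(bf, in, sizeof(in));
+ *		if (BufFileSeek(bf, 0, 0L, SEEK_SET) != 0)
+ *			elog(ERROR, "could not rewind temporary file");
+ *		if (BufFileRead(bf, out, sizeof(out)) != sizeof(out))
+ *			elog(ERROR, "unexpected end of temporary file");
+ *		BufFileClose(bf);
+ */
+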
+/*
+ * BufFileFlush
+ *
+ * Like fflush(), except that I/O errors are reported with ereport().
+ */
+static void
+BufFileFlush(BufFile *file)
+{
+ if (file->dirty)
+ BufFileDumpBuffer(file);
+
+ Assert(!file->dirty);
+}
+
+/*
+ * BufFileSeek
+ *
+ * Like fseek(), except that target position needs two values in order to
+ * work when logical filesize exceeds maximum value representable by off_t.
+ * We do not support relative seeks across more than that, however.
+ * I/O errors are reported by ereport().
+ *
+ * Result is 0 if OK, EOF if not. Logical position is not moved if an
+ * impossible seek is attempted.
+ */
+int
+BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
+{
+ int newFile;
+ off_t newOffset;
+
+ switch (whence)
+ {
+ case SEEK_SET:
+ if (fileno < 0)
+ return EOF;
+ newFile = fileno;
+ newOffset = offset;
+ break;
+ case SEEK_CUR:
+
+ /*
+ * Relative seek considers only the signed offset, ignoring
+ * fileno. Note that large offsets (> 1 GB) risk overflow in this
+ * add, unless we have 64-bit off_t.
+ */
+ newFile = file->curFile;
+ newOffset = (file->curOffset + file->pos) + offset;
+ break;
+ case SEEK_END:
+
+ /*
+ * The file size of the last file gives us the end offset of that
+ * file.
+ */
+ newFile = file->numFiles - 1;
+ newOffset = FileSize(file->files[file->numFiles - 1]);
+ if (newOffset < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
+ FilePathName(file->files[file->numFiles - 1]),
+ file->name)));
+ break;
+ default:
+ elog(ERROR, "invalid whence: %d", whence);
+ return EOF;
+ }
+ while (newOffset < 0)
+ {
+ if (--newFile < 0)
+ return EOF;
+ newOffset += MAX_PHYSICAL_FILESIZE;
+ }
+ if (newFile == file->curFile &&
+ newOffset >= file->curOffset &&
+ newOffset <= file->curOffset + file->nbytes)
+ {
+ /*
+ * Seek is to a point within existing buffer; we can just adjust
+ * pos-within-buffer, without flushing buffer. Note this is OK
+ * whether reading or writing, but buffer remains dirty if we were
+ * writing.
+ */
+ file->pos = (int) (newOffset - file->curOffset);
+ return 0;
+ }
+ /* Otherwise, must reposition buffer, so flush any dirty data */
+ BufFileFlush(file);
+
+ /*
+ * At this point and no sooner, check for seek past last segment. The
+ * above flush could have created a new segment, so checking sooner would
+ * not work (at least not with this code).
+ */
+
+ /* convert seek to "start of next seg" to "end of last seg" */
+ if (newFile == file->numFiles && newOffset == 0)
+ {
+ newFile--;
+ newOffset = MAX_PHYSICAL_FILESIZE;
+ }
+ while (newOffset > MAX_PHYSICAL_FILESIZE)
+ {
+ if (++newFile >= file->numFiles)
+ return EOF;
+ newOffset -= MAX_PHYSICAL_FILESIZE;
+ }
+ if (newFile >= file->numFiles)
+ return EOF;
+ /* Seek is OK! */
+ file->curFile = newFile;
+ file->curOffset = newOffset;
+ file->pos = 0;
+ file->nbytes = 0;
+ return 0;
+}
+
+void
+BufFileTell(BufFile *file, int *fileno, off_t *offset)
+{
+ *fileno = file->curFile;
+ *offset = file->curOffset + file->pos;
+}
+
+/*
+ * BufFileSeekBlock --- block-oriented seek
+ *
+ * Performs absolute seek to the start of the n'th BLCKSZ-sized block of
+ * the file. Note that users of this interface will fail if their files
+ * exceed BLCKSZ * LONG_MAX bytes, but that is quite a lot; we don't work
+ * with tables bigger than that, either...
+ *
+ * Result is 0 if OK, EOF if not. Logical position is not moved if an
+ * impossible seek is attempted.
+ */
+int
+BufFileSeekBlock(BufFile *file, long blknum)
+{
+ return BufFileSeek(file,
+ (int) (blknum / BUFFILE_SEG_SIZE),
+ (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ,
+ SEEK_SET);
+}
+
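+/*
+ * Worked example (illustrative only, assuming the default BLCKSZ of 8192,
+ * so that BUFFILE_SEG_SIZE is 131072 blocks per 1GB segment): a request for
+ * block 200000 is decomposed into fileno = 200000 / 131072 = 1 and
+ * offset = (200000 % 131072) * 8192 = 68928 * 8192 = 564658176 bytes within
+ * that segment.  "bf" stands for any previously opened BufFile.
+ *
+ *		if (BufFileSeekBlock(bf, 200000L) != 0)
+ *			elog(ERROR, "block is beyond end of temporary file");
+ */
+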
+#ifdef NOT_USED
+/*
+ * BufFileTellBlock --- block-oriented tell
+ *
+ * Any fractional part of a block in the current seek position is ignored.
+ */
+long
+BufFileTellBlock(BufFile *file)
+{
+ long blknum;
+
+ blknum = (file->curOffset + file->pos) / BLCKSZ;
+ blknum += file->curFile * BUFFILE_SEG_SIZE;
+ return blknum;
+}
+
+#endif
+
+/*
+ * Return the current shared BufFile size.
+ *
+ * Counts any holes left behind by BufFileAppend as part of the size.
+ * ereport()s on failure.
+ */
+int64
+BufFileSize(BufFile *file)
+{
+ int64 lastFileSize;
+
+ Assert(file->fileset != NULL);
+
+ /* Get the size of the last physical file. */
+ lastFileSize = FileSize(file->files[file->numFiles - 1]);
+ if (lastFileSize < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
+ FilePathName(file->files[file->numFiles - 1]),
+ file->name)));
+
+ return ((file->numFiles - 1) * (int64) MAX_PHYSICAL_FILESIZE) +
+ lastFileSize;
+}
+
+/*
+ * Append the contents of source file (managed within shared fileset) to
+ * end of target file (managed within same shared fileset).
+ *
+ * Note that this operation subsumes ownership of the underlying resources
+ * from "source".  The caller must not call BufFileClose on source after
+ * having called this function.  Resource owners for source and target must
+ * match, too.
+ *
+ * This operation works by manipulating lists of segment files, so the
+ * file content is always appended at a MAX_PHYSICAL_FILESIZE-aligned
+ * boundary, typically creating empty holes before the boundary. These
+ * areas do not contain any interesting data, and cannot be read from by
+ * caller.
+ *
+ * Returns the block number within target where the contents of source
+ * begin.  Caller should apply this as an offset when working off block
+ * positions that are in terms of the original BufFile space.
+ */
+long
+BufFileAppend(BufFile *target, BufFile *source)
+{
+ long startBlock = target->numFiles * BUFFILE_SEG_SIZE;
+ int newNumFiles = target->numFiles + source->numFiles;
+ int i;
+
+ Assert(target->fileset != NULL);
+ Assert(source->readOnly);
+ Assert(!source->dirty);
+ Assert(source->fileset != NULL);
+
+ if (target->resowner != source->resowner)
+ elog(ERROR, "could not append BufFile with non-matching resource owner");
+
+ target->files = (File *)
+ repalloc(target->files, sizeof(File) * newNumFiles);
+ for (i = target->numFiles; i < newNumFiles; i++)
+ target->files[i] = source->files[i - target->numFiles];
+ target->numFiles = newNumFiles;
+
+ return startBlock;
+}
+
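+/*
+ * Usage sketch (illustrative only; "target" and "source" stand for shared
+ * BufFiles belonging to the same fileset): after appending, block N of the
+ * old source is addressed as block startBlock + N within target.
+ *
+ *		long		startBlock = BufFileAppend(target, source);
+ *
+ *		if (BufFileSeekBlock(target, startBlock) != 0)
+ *			elog(ERROR, "could not seek to appended data");
+ */
+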
+/*
+ * Truncate a BufFile created by BufFileCreateShared up to the given fileno and
+ * the offset.
+ */
+void
+BufFileTruncateShared(BufFile *file, int fileno, off_t offset)
+{
+ int numFiles = file->numFiles;
+ int newFile = fileno;
+ off_t newOffset = file->curOffset;
+ char segment_name[MAXPGPATH];
+ int i;
+
+	/*
+	 * Loop backwards from the last file down to the given fileno: remove the
+	 * files beyond the fileno, and truncate the file at fileno to the given
+	 * offset.  Note that we also remove the file at fileno itself if the
+	 * offset is 0, unless it is the first file, in which case we only
+	 * truncate it.
+	 */
+ for (i = file->numFiles - 1; i >= fileno; i--)
+ {
+ if ((i != fileno || offset == 0) && i != 0)
+ {
+ SharedSegmentName(segment_name, file->name, i);
+ FileClose(file->files[i]);
+ if (!SharedFileSetDelete(file->fileset, segment_name, true))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not delete shared fileset \"%s\": %m",
+ segment_name)));
+ numFiles--;
+ newOffset = MAX_PHYSICAL_FILESIZE;
+
+ /*
+ * This is required to indicate that we have deleted the given
+ * fileno.
+ */
+ if (i == fileno)
+ newFile--;
+ }
+ else
+ {
+ if (FileTruncate(file->files[i], offset,
+ WAIT_EVENT_BUFFILE_TRUNCATE) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\": %m",
+ FilePathName(file->files[i]))));
+ newOffset = offset;
+ }
+ }
+
+ file->numFiles = numFiles;
+
+ /*
+ * If the truncate point is within existing buffer then we can just adjust
+ * pos within buffer.
+ */
+ if (newFile == file->curFile &&
+ newOffset >= file->curOffset &&
+ newOffset <= file->curOffset + file->nbytes)
+ {
+ /* No need to reset the current pos if the new pos is greater. */
+ if (newOffset <= file->curOffset + file->pos)
+ file->pos = (int) (newOffset - file->curOffset);
+
+ /* Adjust the nbytes for the current buffer. */
+ file->nbytes = (int) (newOffset - file->curOffset);
+ }
+ else if (newFile == file->curFile &&
+ newOffset < file->curOffset)
+ {
+ /*
+ * The truncate point is within the existing file but prior to the
+ * current position, so we can forget the current buffer and reset the
+ * current position.
+ */
+ file->curOffset = newOffset;
+ file->pos = 0;
+ file->nbytes = 0;
+ }
+ else if (newFile < file->curFile)
+ {
+ /*
+ * The truncate point is prior to the current file, so need to reset
+ * the current position accordingly.
+ */
+ file->curFile = newFile;
+ file->curOffset = newOffset;
+ file->pos = 0;
+ file->nbytes = 0;
+ }
+	/* Nothing to do if the truncate point is beyond the current file. */
+}
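+
+/*
+ * Usage sketch (illustrative only; "file" stands for a shared BufFile the
+ * caller is writing): remember a position with BufFileTell, and later throw
+ * away everything written after it.
+ *
+ *		int			fileno;
+ *		off_t		offset;
+ *
+ *		BufFileTell(file, &fileno, &offset);
+ *		... write data that later turns out to be unneeded ...
+ *		BufFileTruncateShared(file, fileno, offset);
+ */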
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
new file mode 100644
index 0000000..da8b7cb
--- /dev/null
+++ b/src/backend/storage/file/copydir.c
@@ -0,0 +1,226 @@
+/*-------------------------------------------------------------------------
+ *
+ * copydir.c
+ * copies a directory
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * While "xcopy /e /i /q" works fine for copying directories, on Windows XP
+ * it requires a Window handle which prevents it from working when invoked
+ * as a service.
+ *
+ * IDENTIFICATION
+ * src/backend/storage/file/copydir.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/copydir.h"
+#include "storage/fd.h"
+
+/*
+ * copydir: copy a directory
+ *
+ * If recurse is false, subdirectories are ignored. Anything that's not
+ * a directory or a regular file is ignored.
+ */
+void
+copydir(char *fromdir, char *todir, bool recurse)
+{
+ DIR *xldir;
+ struct dirent *xlde;
+ char fromfile[MAXPGPATH * 2];
+ char tofile[MAXPGPATH * 2];
+
+ if (MakePGDirectory(todir) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create directory \"%s\": %m", todir)));
+
+ xldir = AllocateDir(fromdir);
+
+ while ((xlde = ReadDir(xldir, fromdir)) != NULL)
+ {
+ struct stat fst;
+
+ /* If we got a cancel signal during the copy of the directory, quit */
+ CHECK_FOR_INTERRUPTS();
+
+ if (strcmp(xlde->d_name, ".") == 0 ||
+ strcmp(xlde->d_name, "..") == 0)
+ continue;
+
+ snprintf(fromfile, sizeof(fromfile), "%s/%s", fromdir, xlde->d_name);
+ snprintf(tofile, sizeof(tofile), "%s/%s", todir, xlde->d_name);
+
+ if (lstat(fromfile, &fst) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m", fromfile)));
+
+ if (S_ISDIR(fst.st_mode))
+ {
+ /* recurse to handle subdirectories */
+ if (recurse)
+ copydir(fromfile, tofile, true);
+ }
+ else if (S_ISREG(fst.st_mode))
+ copy_file(fromfile, tofile);
+ }
+ FreeDir(xldir);
+
+ /*
+ * Be paranoid here and fsync all files to ensure the copy is really done.
+ * But if fsync is disabled, we're done.
+ */
+ if (!enableFsync)
+ return;
+
+ xldir = AllocateDir(todir);
+
+ while ((xlde = ReadDir(xldir, todir)) != NULL)
+ {
+ struct stat fst;
+
+ if (strcmp(xlde->d_name, ".") == 0 ||
+ strcmp(xlde->d_name, "..") == 0)
+ continue;
+
+ snprintf(tofile, sizeof(tofile), "%s/%s", todir, xlde->d_name);
+
+ /*
+ * We don't need to sync subdirectories here since the recursive
+ * copydir will do it before it returns
+ */
+ if (lstat(tofile, &fst) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m", tofile)));
+
+ if (S_ISREG(fst.st_mode))
+ fsync_fname(tofile, false);
+ }
+ FreeDir(xldir);
+
+ /*
+ * It's important to fsync the destination directory itself as individual
+ * file fsyncs don't guarantee that the directory entry for the file is
+ * synced. Recent versions of ext4 have made the window much wider but
+ * it's been true for ext3 and other filesystems in the past.
+ */
+ fsync_fname(todir, true);
+}
+
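+/*
+ * Usage sketch (illustrative only; the directory names are hypothetical and
+ * are interpreted relative to $PGDATA): recursively copy one database
+ * directory to another, fsyncing the result unless fsync is disabled.
+ *
+ *		copydir("base/1", "base/16384", true);
+ */
+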
+/*
+ * copy one file
+ */
+void
+copy_file(char *fromfile, char *tofile)
+{
+ char *buffer;
+ int srcfd;
+ int dstfd;
+ int nbytes;
+ off_t offset;
+ off_t flush_offset;
+
+ /* Size of copy buffer (read and write requests) */
+#define COPY_BUF_SIZE (8 * BLCKSZ)
+
+ /*
+ * Size of data flush requests. It seems beneficial on most platforms to
+ * do this every 1MB or so. But macOS, at least with early releases of
+	 * APFS, is really unfriendly to small mmap/msync requests, so on that
+	 * platform do it only every 32MB.
+ */
+#if defined(__darwin__)
+#define FLUSH_DISTANCE (32 * 1024 * 1024)
+#else
+#define FLUSH_DISTANCE (1024 * 1024)
+#endif
+
+ /* Use palloc to ensure we get a maxaligned buffer */
+ buffer = palloc(COPY_BUF_SIZE);
+
+ /*
+ * Open the files
+ */
+ srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY);
+ if (srcfd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", fromfile)));
+
+ dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+ if (dstfd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m", tofile)));
+
+ /*
+ * Do the data copying.
+ */
+ flush_offset = 0;
+ for (offset = 0;; offset += nbytes)
+ {
+ /* If we got a cancel signal during the copy of the file, quit */
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * We fsync the files later, but during the copy, flush them every so
+ * often to avoid spamming the cache and hopefully get the kernel to
+ * start writing them out before the fsync comes.
+ */
+ if (offset - flush_offset >= FLUSH_DISTANCE)
+ {
+ pg_flush_data(dstfd, flush_offset, offset - flush_offset);
+ flush_offset = offset;
+ }
+
+ pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_READ);
+ nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
+ pgstat_report_wait_end();
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m", fromfile)));
+ if (nbytes == 0)
+ break;
+ errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_WRITE);
+ if ((int) write(dstfd, buffer, nbytes) != nbytes)
+ {
+ /* if write didn't set errno, assume problem is no disk space */
+ if (errno == 0)
+ errno = ENOSPC;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m", tofile)));
+ }
+ pgstat_report_wait_end();
+ }
+
+ if (offset > flush_offset)
+ pg_flush_data(dstfd, flush_offset, offset - flush_offset);
+
+ if (CloseTransientFile(dstfd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", tofile)));
+
+ if (CloseTransientFile(srcfd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", fromfile)));
+
+ pfree(buffer);
+}
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
new file mode 100644
index 0000000..e76daff
--- /dev/null
+++ b/src/backend/storage/file/fd.c
@@ -0,0 +1,3789 @@
+/*-------------------------------------------------------------------------
+ *
+ * fd.c
+ * Virtual file descriptor code.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/file/fd.c
+ *
+ * NOTES:
+ *
+ * This code manages a cache of 'virtual' file descriptors (VFDs).
+ * The server opens many file descriptors for a variety of reasons,
+ * including base tables, scratch files (e.g., sort and hash spool
+ * files), and random calls to C library routines like system(3); it
+ * is quite easy to exceed system limits on the number of open files a
+ * single process can have. (This is around 1024 on many modern
+ * operating systems, but may be lower on others.)
+ *
+ * VFDs are managed as an LRU pool, with actual OS file descriptors
+ * being opened and closed as needed.  Obviously, if a file is
+ * opened using these interfaces, all subsequent operations on it must also
+ * be through these interfaces (the File type is not a real file
+ * descriptor).
+ *
+ * For this scheme to work, most (if not all) routines throughout the
+ * server should use these interfaces instead of calling the C library
+ * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
+ * may find ourselves short of real file descriptors anyway.
+ *
+ * INTERFACE ROUTINES
+ *
+ * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
+ * A File opened with OpenTemporaryFile is automatically deleted when the
+ * File is closed, either explicitly or implicitly at end of transaction or
+ * process exit. PathNameOpenFile is intended for files that are held open
+ * for a long time, like relation files. It is the caller's responsibility
+ * to close them; there is no automatic mechanism in fd.c for that.
+ *
+ * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
+ * temporary files that have names so that they can be shared between
+ * backends. Such files are automatically closed and count against the
+ * temporary file limit of the backend that creates them, but unlike anonymous
+ * files they are not automatically deleted. See sharedfileset.c for a shared
+ * ownership mechanism that provides automatic cleanup for shared files when
+ * the last of a group of backends detaches.
+ *
+ * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
+ * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
+ * They behave like the corresponding native functions, except that the handle
+ * is registered with the current subtransaction, and will be automatically
+ * closed at abort. These are intended mainly for short operations like
+ * reading a configuration file; there is a limit on the number of files that
+ * can be opened using these functions at any one time.
+ *
+ * Finally, BasicOpenFile is just a thin wrapper around open() that can
+ * release file descriptors in use by the virtual file descriptors if
+ * necessary. There is no automatic cleanup of file descriptors returned by
+ * BasicOpenFile, it is solely the caller's responsibility to close the file
+ * descriptor by calling close(2).
+ *
+ * If a non-virtual file descriptor needs to be held open for any length of
+ * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
+ * (and eventually ReleaseExternalFD), so that we can take it into account
+ * while deciding how many VFDs can be open. This applies to FDs obtained
+ * with BasicOpenFile as well as those obtained without use of any fd.c API.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <dirent.h>
+#include <sys/file.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#ifndef WIN32
+#include <sys/mman.h>
+#endif
+#include <limits.h>
+#include <unistd.h>
+#include <fcntl.h>
+#ifdef HAVE_SYS_RESOURCE_H
+#include <sys/resource.h> /* for getrlimit */
+#endif
+
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "catalog/pg_tablespace.h"
+#include "common/file_perm.h"
+#include "common/file_utils.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "port/pg_iovec.h"
+#include "portability/mem.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "utils/guc.h"
+#include "utils/resowner_private.h"
+
+/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
+#if defined(HAVE_SYNC_FILE_RANGE)
+#define PG_FLUSH_DATA_WORKS 1
+#elif !defined(WIN32) && defined(MS_ASYNC)
+#define PG_FLUSH_DATA_WORKS 1
+#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+#define PG_FLUSH_DATA_WORKS 1
+#endif
+
+/*
+ * We must leave some file descriptors free for system(), the dynamic loader,
+ * and other code that tries to open files without consulting fd.c. This
+ * is the number left free. (While we try fairly hard to prevent EMFILE
+ * errors, there's never any guarantee that we won't get ENFILE due to
+ * other processes chewing up FDs. So it's a bad idea to try to open files
+ * without consulting fd.c. Nonetheless we cannot control all code.)
+ *
+ * Because this is just a fixed setting, we are effectively assuming that
+ * no such code will leave FDs open over the long term; otherwise the slop
+ * is likely to be insufficient. Note in particular that we expect that
+ * loading a shared library does not result in any permanent increase in
+ * the number of open files. (This appears to be true on most if not
+ * all platforms as of Feb 2004.)
+ */
+#define NUM_RESERVED_FDS 10
+
+/*
+ * If we have fewer than this many usable FDs after allowing for the reserved
+ * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
+ * much less than that. Note that this value ensures numExternalFDs can be
+ * at least 16; as of this writing, the contrib/postgres_fdw regression tests
+ * will not pass unless that can grow to at least 14.)
+ */
+#define FD_MINFREE 48
+
+/*
+ * A number of platforms allow individual processes to open many more files
+ * than they can really support when *many* processes do the same thing.
+ * This GUC parameter lets the DBA limit max_safe_fds to something less than
+ * what the postmaster's initial probe suggests will work.
+ */
+int max_files_per_process = 1000;
+
+/*
+ * Maximum number of file descriptors to open for operations that fd.c knows
+ * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
+ * to a conservative value, and remains that way indefinitely in bootstrap or
+ * standalone-backend cases. In normal postmaster operation, the postmaster
+ * calls set_max_safe_fds() late in initialization to update the value, and
+ * that value is then inherited by forked subprocesses.
+ *
+ * Note: the value of max_files_per_process is taken into account while
+ * setting this variable, and so need not be tested separately.
+ */
+int max_safe_fds = FD_MINFREE; /* default if not changed */
+
+/* Whether it is safe to continue running after fsync() fails. */
+bool data_sync_retry = false;
+
+/* How SyncDataDirectory() should do its job. */
+int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
+
+/* Debugging.... */
+
+#ifdef FDDEBUG
+#define DO_DB(A) \
+ do { \
+ int _do_db_save_errno = errno; \
+ A; \
+ errno = _do_db_save_errno; \
+ } while (0)
+#else
+#define DO_DB(A) \
+ ((void) 0)
+#endif
+
+#define VFD_CLOSED (-1)
+
+#define FileIsValid(file) \
+ ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
+
+#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
+
+/* these are the assigned bits in fdstate below: */
+#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
+#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
+#define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
+
+typedef struct vfd
+{
+ int fd; /* current FD, or VFD_CLOSED if none */
+ unsigned short fdstate; /* bitflags for VFD's state */
+ ResourceOwner resowner; /* owner, for automatic cleanup */
+ File nextFree; /* link to next free VFD, if in freelist */
+ File lruMoreRecently; /* doubly linked recency-of-use list */
+ File lruLessRecently;
+ off_t fileSize; /* current size of file (0 if not temporary) */
+ char *fileName; /* name of file, or NULL for unused VFD */
+ /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
+ int fileFlags; /* open(2) flags for (re)opening the file */
+ mode_t fileMode; /* mode to pass to open(2) */
+} Vfd;
+
+/*
+ * Virtual File Descriptor array pointer and size. This grows as
+ * needed. 'File' values are indexes into this array.
+ * Note that VfdCache[0] is not a usable VFD, just a list header.
+ */
+static Vfd *VfdCache;
+static Size SizeVfdCache = 0;
+
+/*
+ * Number of file descriptors known to be in use by VFD entries.
+ */
+static int nfile = 0;
+
+/*
+ * Flag to tell whether it's worth scanning VfdCache looking for temp files
+ * to close
+ */
+static bool have_xact_temporary_files = false;
+
+/*
+ * Tracks the total size of all temporary files. Note: when temp_file_limit
+ * is being enforced, this cannot overflow since the limit cannot be more
+ * than INT_MAX kilobytes. When not enforcing, it could theoretically
+ * overflow, but we don't care.
+ */
+static uint64 temporary_files_size = 0;
+
+/*
+ * List of OS handles opened with AllocateFile, AllocateDir and
+ * OpenTransientFile.
+ */
+typedef enum
+{
+ AllocateDescFile,
+ AllocateDescPipe,
+ AllocateDescDir,
+ AllocateDescRawFD
+} AllocateDescKind;
+
+typedef struct
+{
+ AllocateDescKind kind;
+ SubTransactionId create_subid;
+ union
+ {
+ FILE *file;
+ DIR *dir;
+ int fd;
+ } desc;
+} AllocateDesc;
+
+static int numAllocatedDescs = 0;
+static int maxAllocatedDescs = 0;
+static AllocateDesc *allocatedDescs = NULL;
+
+/*
+ * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
+ */
+static int numExternalFDs = 0;
+
+/*
+ * Number of temporary files opened during the current session;
+ * this is used in generation of tempfile names.
+ */
+static long tempFileCounter = 0;
+
+/*
+ * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
+ * indicating that the current database's default tablespace should be used.)
+ * When numTempTableSpaces is -1, this has not been set in the current
+ * transaction.
+ */
+static Oid *tempTableSpaces = NULL;
+static int numTempTableSpaces = -1;
+static int nextTempTableSpace = 0;
+
+
+/*--------------------
+ *
+ * Private Routines
+ *
+ * Delete - delete a file from the Lru ring
+ * LruDelete - remove a file from the Lru ring and close its FD
+ * Insert - put a file at the front of the Lru ring
+ * LruInsert - put a file at the front of the Lru ring and open it
+ * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
+ * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
+ * AllocateVfd - grab a free (or new) file record (from VfdCache)
+ * FreeVfd - free a file record
+ *
+ * The Least Recently Used ring is a doubly linked list that begins and
+ * ends on element zero. Element zero is special -- it doesn't represent
+ * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
+ * anchor that shows us the beginning/end of the ring.
+ * Only VFD elements that are currently really open (have an FD assigned) are
+ * in the Lru ring. Elements that are "virtually" open can be recognized
+ * by having a non-null fileName field.
+ *
+ * example:
+ *
+ * /--less----\ /---------\
+ * v \ v \
+ * #0 --more---> LeastRecentlyUsed --more-\ \
+ * ^\ | |
+ * \\less--> MostRecentlyUsedFile <---/ |
+ * \more---/ \--less--/
+ *
+ *--------------------
+ */
+static void Delete(File file);
+static void LruDelete(File file);
+static void Insert(File file);
+static int LruInsert(File file);
+static bool ReleaseLruFile(void);
+static void ReleaseLruFiles(void);
+static File AllocateVfd(void);
+static void FreeVfd(File file);
+
+static int FileAccess(File file);
+static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
+static bool reserveAllocatedDesc(void);
+static int FreeDesc(AllocateDesc *desc);
+
+static void AtProcExit_Files(int code, Datum arg);
+static void CleanupTempFiles(bool isCommit, bool isProcExit);
+static void RemovePgTempRelationFiles(const char *tsdirname);
+static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
+
+static void walkdir(const char *path,
+ void (*action) (const char *fname, bool isdir, int elevel),
+ bool process_symlinks,
+ int elevel);
+#ifdef PG_FLUSH_DATA_WORKS
+static void pre_sync_fname(const char *fname, bool isdir, int elevel);
+#endif
+static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
+static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
+
+static int fsync_parent_path(const char *fname, int elevel);
+
+
+/*
+ * pg_fsync --- do fsync with or without writethrough
+ */
+int
+pg_fsync(int fd)
+{
+#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
+ struct stat st;
+
+ /*
+ * Some operating system implementations of fsync() have requirements
+ * about the file access modes that were used when their file descriptor
+ * argument was opened, and these requirements differ depending on whether
+ * the file descriptor is for a directory.
+ *
+ * For any file descriptor that may eventually be handed to fsync(), we
+ * should have opened it with access modes that are compatible with
+ * fsync() on all supported systems, otherwise the code may not be
+ * portable, even if it runs ok on the current system.
+ *
+ * We assert here that a descriptor for a file was opened with write
+ * permissions (either O_RDWR or O_WRONLY) and for a directory without
+ * write permissions (O_RDONLY).
+ *
+ * Ignore any fstat errors and let the follow-up fsync() do its work.
+ * Doing this sanity check here counts for the case where fsync() is
+ * disabled.
+ */
+ if (fstat(fd, &st) == 0)
+ {
+ int desc_flags = fcntl(fd, F_GETFL);
+
+ /*
+ * O_RDONLY is historically 0, so just make sure that for directories
+ * no write flags are used.
+ */
+ if (S_ISDIR(st.st_mode))
+ Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
+ else
+ Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
+ }
+ errno = 0;
+#endif
+
+ /* #if is to skip the sync_method test if there's no need for it */
+#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
+ if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
+ return pg_fsync_writethrough(fd);
+ else
+#endif
+ return pg_fsync_no_writethrough(fd);
+}
+
+
+/*
+ * pg_fsync_no_writethrough --- same as fsync except does nothing if
+ * enableFsync is off
+ */
+int
+pg_fsync_no_writethrough(int fd)
+{
+ if (enableFsync)
+ return fsync(fd);
+ else
+ return 0;
+}
+
+/*
+ * pg_fsync_writethrough
+ */
+int
+pg_fsync_writethrough(int fd)
+{
+ if (enableFsync)
+ {
+#ifdef WIN32
+ return _commit(fd);
+#elif defined(F_FULLFSYNC)
+ return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+ }
+ else
+ return 0;
+}
+
+/*
+ * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
+ *
+ * Not all platforms have fdatasync; treat as fsync if not available.
+ */
+int
+pg_fdatasync(int fd)
+{
+ if (enableFsync)
+ {
+#ifdef HAVE_FDATASYNC
+ return fdatasync(fd);
+#else
+ return fsync(fd);
+#endif
+ }
+ else
+ return 0;
+}
+
+/*
+ * pg_flush_data --- advise OS that the described dirty data should be flushed
+ *
+ * offset of 0 with nbytes 0 means that the entire file should be flushed
+ */
+void
+pg_flush_data(int fd, off_t offset, off_t nbytes)
+{
+ /*
+ * Right now file flushing is primarily used to avoid making later
+ * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
+ * if fsyncs are disabled - that's a decision we might want to make
+ * configurable at some point.
+ */
+ if (!enableFsync)
+ return;
+
+ /*
+ * We compile all alternatives that are supported on the current platform,
+ * to find portability problems more easily.
+ */
+#if defined(HAVE_SYNC_FILE_RANGE)
+ {
+ int rc;
+ static bool not_implemented_by_kernel = false;
+
+ if (not_implemented_by_kernel)
+ return;
+
+ /*
+ * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
+ * tells the OS that writeback for the specified blocks should be
+ * started, but that we don't want to wait for completion. Note that
+ * this call might block if too much dirty data exists in the range.
+ * This is the preferable method on OSs supporting it, as it works
+ * reliably when available (contrast to msync()) and doesn't flush out
+ * clean data (like FADV_DONTNEED).
+ */
+ rc = sync_file_range(fd, offset, nbytes,
+ SYNC_FILE_RANGE_WRITE);
+ if (rc != 0)
+ {
+ int elevel;
+
+ /*
+ * For systems that don't have an implementation of
+ * sync_file_range() such as Windows WSL, generate only one
+ * warning and then suppress all further attempts by this process.
+ */
+ if (errno == ENOSYS)
+ {
+ elevel = WARNING;
+ not_implemented_by_kernel = true;
+ }
+ else
+ elevel = data_sync_elevel(WARNING);
+
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not flush dirty data: %m")));
+ }
+
+ return;
+ }
+#endif
+#if !defined(WIN32) && defined(MS_ASYNC)
+ {
+ void *p;
+ static int pagesize = 0;
+
+ /*
+ * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
+ * writeback. On linux it only does so if MS_SYNC is specified, but
+ * then it does the writeback synchronously. Luckily all common linux
+ * systems have sync_file_range(). This is preferable over
+ * FADV_DONTNEED because it doesn't flush out clean data.
+ *
+ * We map the file (mmap()), tell the kernel to sync back the contents
+ * (msync()), and then remove the mapping again (munmap()).
+ */
+
+ /* mmap() needs actual length if we want to map whole file */
+ if (offset == 0 && nbytes == 0)
+ {
+ nbytes = lseek(fd, 0, SEEK_END);
+ if (nbytes < 0)
+ {
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not determine dirty data size: %m")));
+ return;
+ }
+ }
+
+ /*
+ * Some platforms reject partial-page mmap() attempts. To deal with
+ * that, just truncate the request to a page boundary. If any extra
+ * bytes don't get flushed, well, it's only a hint anyway.
+ */
+
+ /* fetch pagesize only once */
+ if (pagesize == 0)
+ pagesize = sysconf(_SC_PAGESIZE);
+
+ /* align length to pagesize, dropping any fractional page */
+ if (pagesize > 0)
+ nbytes = (nbytes / pagesize) * pagesize;
+
+ /* fractional-page request is a no-op */
+ if (nbytes <= 0)
+ return;
+
+ /*
+ * mmap could well fail, particularly on 32-bit platforms where there
+ * may simply not be enough address space. If so, silently fall
+ * through to the next implementation.
+ */
+ if (nbytes <= (off_t) SSIZE_MAX)
+ p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
+ else
+ p = MAP_FAILED;
+
+ if (p != MAP_FAILED)
+ {
+ int rc;
+
+ rc = msync(p, (size_t) nbytes, MS_ASYNC);
+ if (rc != 0)
+ {
+ ereport(data_sync_elevel(WARNING),
+ (errcode_for_file_access(),
+ errmsg("could not flush dirty data: %m")));
+ /* NB: need to fall through to munmap()! */
+ }
+
+ rc = munmap(p, (size_t) nbytes);
+ if (rc != 0)
+ {
+ /* FATAL error because mapping would remain */
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not munmap() while flushing data: %m")));
+ }
+
+ return;
+ }
+ }
+#endif
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+ {
+ int rc;
+
+ /*
+ * Signal the kernel that the passed in range should not be cached
+ * anymore. This has the, desired, side effect of writing out dirty
+ * data, and the, undesired, side effect of likely discarding useful
+ * clean cached blocks. For the latter reason this is the least
+ * preferable method.
+ */
+
+ rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
+
+ if (rc != 0)
+ {
+ /* don't error out, this is just a performance optimization */
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not flush dirty data: %m")));
+ }
+
+ return;
+ }
+#endif
+}
+
+/*
+ * Truncate a file to a given length by name.
+ */
+int
+pg_truncate(const char *path, off_t length)
+{
+#ifdef WIN32
+ int save_errno;
+ int ret;
+ int fd;
+
+ fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
+ if (fd >= 0)
+ {
+		ret = ftruncate(fd, length);
+ save_errno = errno;
+ CloseTransientFile(fd);
+ errno = save_errno;
+ }
+ else
+ ret = -1;
+
+ return ret;
+#else
+ return truncate(path, length);
+#endif
+}
+
+/*
+ * fsync_fname -- fsync a file or directory, handling errors properly
+ *
+ * Try to fsync a file or directory. When doing the latter, ignore errors that
+ * indicate the OS just doesn't allow/require fsyncing directories.
+ */
+void
+fsync_fname(const char *fname, bool isdir)
+{
+ fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
+}
+
+/*
+ * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
+ *
+ * This routine ensures that, after returning, the effect of renaming the file
+ * persists in case of a crash. A crash while this routine is running will
+ * leave you with either the pre-existing or the moved file in place of the
+ * new file; no mixed state or truncated files are possible.
+ *
+ * It does so by using fsync on the old filename and the possibly existing
+ * target filename before the rename, and the target file and directory after.
+ *
+ * Note that rename() cannot be used across arbitrary directories, as they
+ * might not be on the same filesystem. Therefore this routine does not
+ * support renaming across directories.
+ *
+ * Log errors with the caller specified severity.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
+ * valid upon return.
+ */
+int
+durable_rename(const char *oldfile, const char *newfile, int elevel)
+{
+ int fd;
+
+ /*
+ * First fsync the old and target path (if it exists), to ensure that they
+ * are properly persistent on disk. Syncing the target file is not
+	 * strictly necessary, but it makes it easier to reason about crashes,
+ * because it's then guaranteed that either source or target file exists
+ * after a crash.
+ */
+ if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
+ return -1;
+
+ fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
+ if (fd < 0)
+ {
+ if (errno != ENOENT)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", newfile)));
+ return -1;
+ }
+ }
+ else
+ {
+ if (pg_fsync(fd) != 0)
+ {
+ int save_errno;
+
+ /* close file upon error, might not be in transaction context */
+ save_errno = errno;
+ CloseTransientFile(fd);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", newfile)));
+ return -1;
+ }
+
+ if (CloseTransientFile(fd) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", newfile)));
+ return -1;
+ }
+ }
+
+ /* Time to do the real deal... */
+ if (rename(oldfile, newfile) < 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not rename file \"%s\" to \"%s\": %m",
+ oldfile, newfile)));
+ return -1;
+ }
+
+ /*
+ * To guarantee renaming the file is persistent, fsync the file with its
+ * new name, and its containing directory.
+ */
+ if (fsync_fname_ext(newfile, false, false, elevel) != 0)
+ return -1;
+
+ if (fsync_parent_path(newfile, elevel) != 0)
+ return -1;
+
+ return 0;
+}
+
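+/*
+ * Usage sketch (illustrative only; the file names, "buf" and "len" are
+ * placeholders, and error handling is compressed): write a new version of a
+ * file under a temporary name, then move it into place so that a crash
+ * leaves either the old or the new contents, never a mix.
+ *
+ *		int			fd = OpenTransientFile("myfile.tmp",
+ *										   O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
+ *
+ *		if (fd < 0 ||
+ *			write(fd, buf, len) != len ||
+ *			pg_fsync(fd) != 0 ||
+ *			CloseTransientFile(fd) != 0)
+ *			elog(ERROR, "could not write \"myfile.tmp\": %m");
+ *		(void) durable_rename("myfile.tmp", "myfile", ERROR);
+ */
+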
+/*
+ * durable_unlink -- remove a file in a durable manner
+ *
+ * This routine ensures that, after returning, the effect of removing the file
+ * persists in case of a crash. A crash while this routine is running will
+ * leave the system in no mixed state.
+ *
+ * It does so by using fsync on the parent directory of the file after the
+ * actual removal is done.
+ *
+ * Log errors with the severity specified by caller.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
+ * valid upon return.
+ */
+int
+durable_unlink(const char *fname, int elevel)
+{
+ if (unlink(fname) < 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m",
+ fname)));
+ return -1;
+ }
+
+ /*
+ * To guarantee that the removal of the file is persistent, fsync its
+ * parent directory.
+ */
+ if (fsync_parent_path(fname, elevel) != 0)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * durable_rename_excl -- rename a file in a durable manner.
+ *
+ * Similar to durable_rename(), except that this routine tries (but does not
+ * guarantee) not to overwrite the target file.
+ *
+ * Note that a crash in an unfortunate moment can leave you with two links to
+ * the target file.
+ *
+ * Log errors with the caller specified severity.
+ *
+ * On Windows, using a hard link followed by unlink() causes concurrency
+ * issues, while a simple rename() does not cause that, so be careful when
+ * changing the logic of this routine.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
+ * valid upon return.
+ */
+int
+durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
+{
+ /*
+ * Ensure that, if we crash directly after the rename/link, a file with
+ * valid contents is moved into place.
+ */
+ if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
+ return -1;
+
+#ifdef HAVE_WORKING_LINK
+ if (link(oldfile, newfile) < 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not link file \"%s\" to \"%s\": %m",
+ oldfile, newfile),
+ (AmCheckpointerProcess() ?
+ errhint("This is known to fail occasionally during archive recovery, where it is harmless.") :
+ 0)));
+ return -1;
+ }
+ unlink(oldfile);
+#else
+ if (rename(oldfile, newfile) < 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not rename file \"%s\" to \"%s\": %m",
+ oldfile, newfile),
+ (AmCheckpointerProcess() ?
+ errhint("This is known to fail occasionally during archive recovery, where it is harmless.") :
+ 0)));
+ return -1;
+ }
+#endif
+
+ /*
+	 * To make the change persistent in case of an OS crash, both the new
+	 * entry and its parent directory need to be flushed.
+ */
+ if (fsync_fname_ext(newfile, false, false, elevel) != 0)
+ return -1;
+
+ /* Same for parent directory */
+ if (fsync_parent_path(newfile, elevel) != 0)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * InitFileAccess --- initialize this module during backend startup
+ *
+ * This is called during either normal or standalone backend start.
+ * It is *not* called in the postmaster.
+ */
+void
+InitFileAccess(void)
+{
+ Assert(SizeVfdCache == 0); /* call me only once */
+
+ /* initialize cache header entry */
+ VfdCache = (Vfd *) malloc(sizeof(Vfd));
+ if (VfdCache == NULL)
+ ereport(FATAL,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+
+ MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
+ VfdCache->fd = VFD_CLOSED;
+
+ SizeVfdCache = 1;
+
+ /* register proc-exit hook to ensure temp files are dropped at exit */
+ on_proc_exit(AtProcExit_Files, 0);
+}
+
+/*
+ * count_usable_fds --- count how many FDs the system will let us open,
+ * and estimate how many are already open.
+ *
+ * We stop counting if usable_fds reaches max_to_probe. Note: a small
+ * value of max_to_probe might result in an underestimate of already_open;
+ * we must fill in any "gaps" in the set of used FDs before the calculation
+ * of already_open will give the right answer. In practice, max_to_probe
+ * of a couple of dozen should be enough to ensure good results.
+ *
+ * We assume stderr (FD 2) is available for dup'ing. While the calling
+ * script could theoretically close that, it would be a really bad idea,
+ * since then one risks loss of error messages from, e.g., libc.
+ */
+static void
+count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
+{
+ int *fd;
+ int size;
+ int used = 0;
+ int highestfd = 0;
+ int j;
+
+#ifdef HAVE_GETRLIMIT
+ struct rlimit rlim;
+ int getrlimit_status;
+#endif
+
+ size = 1024;
+ fd = (int *) palloc(size * sizeof(int));
+
+#ifdef HAVE_GETRLIMIT
+#ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
+ getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
+#else /* but BSD doesn't ... */
+ getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
+#endif /* RLIMIT_NOFILE */
+ if (getrlimit_status != 0)
+ ereport(WARNING, (errmsg("getrlimit failed: %m")));
+#endif /* HAVE_GETRLIMIT */
+
+ /* dup until failure or probe limit reached */
+ for (;;)
+ {
+ int thisfd;
+
+#ifdef HAVE_GETRLIMIT
+
+ /*
+ * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
+ * some platforms
+ */
+ if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
+ break;
+#endif
+
+ thisfd = dup(2);
+ if (thisfd < 0)
+ {
+ /* Expect EMFILE or ENFILE, else it's fishy */
+ if (errno != EMFILE && errno != ENFILE)
+ elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
+ break;
+ }
+
+ if (used >= size)
+ {
+ size *= 2;
+ fd = (int *) repalloc(fd, size * sizeof(int));
+ }
+ fd[used++] = thisfd;
+
+ if (highestfd < thisfd)
+ highestfd = thisfd;
+
+ if (used >= max_to_probe)
+ break;
+ }
+
+ /* release the files we opened */
+ for (j = 0; j < used; j++)
+ close(fd[j]);
+
+ pfree(fd);
+
+ /*
+ * Return results. usable_fds is just the number of successful dups. We
+ * assume that the system limit is highestfd+1 (remember 0 is a legal FD
+ * number) and so already_open is highestfd+1 - usable_fds.
+ */
+ *usable_fds = used;
+ *already_open = highestfd + 1 - used;
+}
+
+/*
+ * set_max_safe_fds
+ * Determine number of file descriptors that fd.c is allowed to use
+ */
+void
+set_max_safe_fds(void)
+{
+ int usable_fds;
+ int already_open;
+
+ /*----------
+ * We want to set max_safe_fds to
+ * MIN(usable_fds, max_files_per_process - already_open)
+ * less the slop factor for files that are opened without consulting
+ * fd.c. This ensures that we won't exceed either max_files_per_process
+ * or the experimentally-determined EMFILE limit.
+ *----------
+ */
+ count_usable_fds(max_files_per_process,
+ &usable_fds, &already_open);
+
+ max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
+
+ /*
+ * Take off the FDs reserved for system() etc.
+ */
+ max_safe_fds -= NUM_RESERVED_FDS;
+
+ /*
+ * Make sure we still have enough to get by.
+ */
+ if (max_safe_fds < FD_MINFREE)
+ ereport(FATAL,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("insufficient file descriptors available to start server process"),
+ errdetail("System allows %d, we need at least %d.",
+ max_safe_fds + NUM_RESERVED_FDS,
+ FD_MINFREE + NUM_RESERVED_FDS)));
+
+ elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
+ max_safe_fds, usable_fds, already_open);
+}
+
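+/*
+ * Worked example (hypothetical numbers): with max_files_per_process = 1000,
+ * if the probe reports usable_fds = 965 and already_open = 35, then
+ * max_safe_fds = Min(965, 1000 - 35) - NUM_RESERVED_FDS = 965 - 10 = 955.
+ */
+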
+/*
+ * Open a file with BasicOpenFilePerm() and pass default file mode for the
+ * fileMode parameter.
+ */
+int
+BasicOpenFile(const char *fileName, int fileFlags)
+{
+ return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
+}
+
+/*
+ * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
+ *
+ * This is exported for use by places that really want a plain kernel FD,
+ * but need to be proof against running out of FDs. Once an FD has been
+ * successfully returned, it is the caller's responsibility to ensure that
+ * it will not be leaked on ereport()! Most users should *not* call this
+ * routine directly, but instead use the VFD abstraction level, which
+ * provides protection against descriptor leaks as well as management of
+ * files that need to be open for more than a short period of time.
+ *
+ * Ideally this should be the *only* direct call of open() in the backend.
+ * In practice, the postmaster calls open() directly, and there are some
+ * direct open() calls done early in backend startup. Those are OK since
+ * this module wouldn't have any open files to close at that point anyway.
+ */
+int
+BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
+{
+ int fd;
+
+tryAgain:
+ fd = open(fileName, fileFlags, fileMode);
+
+ if (fd >= 0)
+ return fd; /* success! */
+
+ if (errno == EMFILE || errno == ENFILE)
+ {
+ int save_errno = errno;
+
+ ereport(LOG,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("out of file descriptors: %m; release and retry")));
+ errno = 0;
+ if (ReleaseLruFile())
+ goto tryAgain;
+ errno = save_errno;
+ }
+
+ return -1; /* failure */
+}
+
+/*
+ * AcquireExternalFD - attempt to reserve an external file descriptor
+ *
+ * This should be used by callers that need to hold a file descriptor open
+ * over more than a short interval, but cannot use any of the other facilities
+ * provided by this module.
+ *
+ * The difference between this and the underlying ReserveExternalFD function
+ * is that this will report failure (by setting errno and returning false)
+ * if "too many" external FDs are already reserved. This should be used in
+ * any code where the total number of FDs to be reserved is not predictable
+ * and small.
+ */
+bool
+AcquireExternalFD(void)
+{
+ /*
+ * We don't want more than max_safe_fds / 3 FDs to be consumed for
+ * "external" FDs.
+ */
+ if (numExternalFDs < max_safe_fds / 3)
+ {
+ ReserveExternalFD();
+ return true;
+ }
+ errno = EMFILE;
+ return false;
+}
+
+/*
+ * ReserveExternalFD - report external consumption of a file descriptor
+ *
+ * This should be used by callers that need to hold a file descriptor open
+ * over more than a short interval, but cannot use any of the other facilities
+ * provided by this module. This just tracks the use of the FD and closes
+ * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
+ *
+ * Call this directly only in code where failure to reserve the FD would be
+ * fatal; for example, the WAL-writing code does so, since the alternative is
+ * session failure. Also, it's very unwise to do so in code that could
+ * consume more than one FD per process.
+ *
+ * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
+ * available, it doesn't matter too much whether this is called before or
+ * after actually opening the FD; but doing so beforehand reduces the risk of
+ * an EMFILE failure if not everybody played nice. In any case, it's solely
+ * caller's responsibility to keep the external-FD count in sync with reality.
+ */
+void
+ReserveExternalFD(void)
+{
+ /*
+ * Release VFDs if needed to stay safe. Because we do this before
+ * incrementing numExternalFDs, the final state will be as desired, i.e.,
+ * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
+ */
+ ReleaseLruFiles();
+
+ numExternalFDs++;
+}
+
+/*
+ * ReleaseExternalFD - report release of an external file descriptor
+ *
+ * This is guaranteed not to change errno, so it can be used in failure paths.
+ */
+void
+ReleaseExternalFD(void)
+{
+ Assert(numExternalFDs > 0);
+ numExternalFDs--;
+}
+
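+/*
+ * Usage sketch (illustrative only; the error message is made up and the
+ * socket() call stands in for any FD-consuming system call): code that must
+ * hold a kernel FD open for a while, outside fd.c's VFD machinery, should
+ * bracket it like this.
+ *
+ *		int			extfd;
+ *
+ *		if (!AcquireExternalFD())
+ *			ereport(ERROR,
+ *					(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ *					 errmsg("could not reserve file descriptor: %m")));
+ *		extfd = socket(AF_INET, SOCK_STREAM, 0);
+ *		... use extfd ...
+ *		close(extfd);
+ *		ReleaseExternalFD();
+ */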
+
+#if defined(FDDEBUG)
+
+static void
+_dump_lru(void)
+{
+ int mru = VfdCache[0].lruLessRecently;
+ Vfd *vfdP = &VfdCache[mru];
+ char buf[2048];
+
+ snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
+ while (mru != 0)
+ {
+ mru = vfdP->lruLessRecently;
+ vfdP = &VfdCache[mru];
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
+ }
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
+ elog(LOG, "%s", buf);
+}
+#endif /* FDDEBUG */
+
+static void
+Delete(File file)
+{
+ Vfd *vfdP;
+
+ Assert(file != 0);
+
+ DO_DB(elog(LOG, "Delete %d (%s)",
+ file, VfdCache[file].fileName));
+ DO_DB(_dump_lru());
+
+ vfdP = &VfdCache[file];
+
+ VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
+ VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
+
+ DO_DB(_dump_lru());
+}
+
+static void
+LruDelete(File file)
+{
+ Vfd *vfdP;
+
+ Assert(file != 0);
+
+ DO_DB(elog(LOG, "LruDelete %d (%s)",
+ file, VfdCache[file].fileName));
+
+ vfdP = &VfdCache[file];
+
+ /*
+ * Close the file. We aren't expecting this to fail; if it does, better
+ * to leak the FD than to mess up our internal state.
+ */
+ if (close(vfdP->fd) != 0)
+ elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
+ "could not close file \"%s\": %m", vfdP->fileName);
+ vfdP->fd = VFD_CLOSED;
+ --nfile;
+
+ /* delete the vfd record from the LRU ring */
+ Delete(file);
+}
+
+static void
+Insert(File file)
+{
+ Vfd *vfdP;
+
+ Assert(file != 0);
+
+ DO_DB(elog(LOG, "Insert %d (%s)",
+ file, VfdCache[file].fileName));
+ DO_DB(_dump_lru());
+
+ vfdP = &VfdCache[file];
+
+ vfdP->lruMoreRecently = 0;
+ vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
+ VfdCache[0].lruLessRecently = file;
+ VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
+
+ DO_DB(_dump_lru());
+}
+
+/* returns 0 on success, -1 on re-open failure (with errno set) */
+static int
+LruInsert(File file)
+{
+ Vfd *vfdP;
+
+ Assert(file != 0);
+
+ DO_DB(elog(LOG, "LruInsert %d (%s)",
+ file, VfdCache[file].fileName));
+
+ vfdP = &VfdCache[file];
+
+ if (FileIsNotOpen(file))
+ {
+ /* Close excess kernel FDs. */
+ ReleaseLruFiles();
+
+ /*
+ * The open could still fail for lack of file descriptors, eg due to
+ * overall system file table being full. So, be prepared to release
+ * another FD if necessary...
+ */
+ vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
+ vfdP->fileMode);
+ if (vfdP->fd < 0)
+ {
+ DO_DB(elog(LOG, "re-open failed: %m"));
+ return -1;
+ }
+ else
+ {
+ ++nfile;
+ }
+ }
+
+ /*
+ * put it at the head of the Lru ring
+ */
+
+ Insert(file);
+
+ return 0;
+}
+
+/*
+ * Release one kernel FD by closing the least-recently-used VFD.
+ */
+static bool
+ReleaseLruFile(void)
+{
+ DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
+
+ if (nfile > 0)
+ {
+ /*
+ * There are opened files and so there should be at least one used vfd
+ * in the ring.
+ */
+ Assert(VfdCache[0].lruMoreRecently != 0);
+ LruDelete(VfdCache[0].lruMoreRecently);
+ return true; /* freed a file */
+ }
+ return false; /* no files available to free */
+}
+
+/*
+ * Release kernel FDs as needed to get under the max_safe_fds limit.
+ * After calling this, it's OK to try to open another file.
+ */
+static void
+ReleaseLruFiles(void)
+{
+ while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
+ {
+ if (!ReleaseLruFile())
+ break;
+ }
+}
+
+static File
+AllocateVfd(void)
+{
+ Index i;
+ File file;
+
+ DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
+
+ Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
+
+ if (VfdCache[0].nextFree == 0)
+ {
+ /*
+ * The free list is empty so it is time to increase the size of the
+ * array. We choose to double it each time this happens. However,
+ * there's not much point in starting *real* small.
+ */
+ Size newCacheSize = SizeVfdCache * 2;
+ Vfd *newVfdCache;
+
+ if (newCacheSize < 32)
+ newCacheSize = 32;
+
+ /*
+ * Be careful not to clobber VfdCache ptr if realloc fails.
+ */
+ newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
+ if (newVfdCache == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ VfdCache = newVfdCache;
+
+ /*
+ * Initialize the new entries and link them into the free list.
+ */
+ for (i = SizeVfdCache; i < newCacheSize; i++)
+ {
+ MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
+ VfdCache[i].nextFree = i + 1;
+ VfdCache[i].fd = VFD_CLOSED;
+ }
+ VfdCache[newCacheSize - 1].nextFree = 0;
+ VfdCache[0].nextFree = SizeVfdCache;
+
+ /*
+ * Record the new size
+ */
+ SizeVfdCache = newCacheSize;
+ }
+
+ file = VfdCache[0].nextFree;
+
+ VfdCache[0].nextFree = VfdCache[file].nextFree;
+
+ return file;
+}
+
+static void
+FreeVfd(File file)
+{
+ Vfd *vfdP = &VfdCache[file];
+
+ DO_DB(elog(LOG, "FreeVfd: %d (%s)",
+ file, vfdP->fileName ? vfdP->fileName : ""));
+
+ if (vfdP->fileName != NULL)
+ {
+ free(vfdP->fileName);
+ vfdP->fileName = NULL;
+ }
+ vfdP->fdstate = 0x0;
+
+ vfdP->nextFree = VfdCache[0].nextFree;
+ VfdCache[0].nextFree = file;
+}
+
+/* returns 0 on success, -1 on re-open failure (with errno set) */
+static int
+FileAccess(File file)
+{
+ int returnValue;
+
+ DO_DB(elog(LOG, "FileAccess %d (%s)",
+ file, VfdCache[file].fileName));
+
+ /*
+ * Is the file open? If not, open it and put it at the head of the LRU
+ * ring (possibly closing the least recently used file to get an FD).
+ */
+
+ if (FileIsNotOpen(file))
+ {
+ returnValue = LruInsert(file);
+ if (returnValue != 0)
+ return returnValue;
+ }
+ else if (VfdCache[0].lruLessRecently != file)
+ {
+ /*
+ * We now know that the file is open and that it is not the last one
+ * accessed, so we need to move it to the head of the Lru ring.
+ */
+
+ Delete(file);
+ Insert(file);
+ }
+
+ return 0;
+}
+
+/*
+ * Called whenever a temporary file is deleted to report its size.
+ */
+static void
+ReportTemporaryFileUsage(const char *path, off_t size)
+{
+ pgstat_report_tempfile(size);
+
+ if (log_temp_files >= 0)
+ {
+ if ((size / 1024) >= log_temp_files)
+ ereport(LOG,
+ (errmsg("temporary file: path \"%s\", size %lu",
+ path, (unsigned long) size)));
+ }
+}
+
+/*
+ * Called to register a temporary file for automatic close.
+ * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
+ * before the file was opened.
+ */
+static void
+RegisterTemporaryFile(File file)
+{
+ ResourceOwnerRememberFile(CurrentResourceOwner, file);
+ VfdCache[file].resowner = CurrentResourceOwner;
+
+ /* Backup mechanism for closing at end of xact. */
+ VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
+ have_xact_temporary_files = true;
+}
+
+/*
+ * Called when we get a shared invalidation message on some relation.
+ */
+#ifdef NOT_USED
+void
+FileInvalidate(File file)
+{
+ Assert(FileIsValid(file));
+ if (!FileIsNotOpen(file))
+ LruDelete(file);
+}
+#endif
+
+/*
+ * Open a file with PathNameOpenFilePerm() and pass default file mode for the
+ * fileMode parameter.
+ */
+File
+PathNameOpenFile(const char *fileName, int fileFlags)
+{
+ return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
+}
+
+/*
+ * open a file in an arbitrary directory
+ *
+ * NB: if the passed pathname is relative (which it usually is),
+ * it will be interpreted relative to the process' working directory
+ * (which should always be $PGDATA when this code is running).
+ */
+File
+PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
+{
+ char *fnamecopy;
+ File file;
+ Vfd *vfdP;
+
+ DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
+ fileName, fileFlags, fileMode));
+
+ /*
+ * We need a malloc'd copy of the file name; fail cleanly if no room.
+ */
+ fnamecopy = strdup(fileName);
+ if (fnamecopy == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+
+ file = AllocateVfd();
+ vfdP = &VfdCache[file];
+
+ /* Close excess kernel FDs. */
+ ReleaseLruFiles();
+
+ vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
+
+ if (vfdP->fd < 0)
+ {
+ int save_errno = errno;
+
+ FreeVfd(file);
+ free(fnamecopy);
+ errno = save_errno;
+ return -1;
+ }
+ ++nfile;
+ DO_DB(elog(LOG, "PathNameOpenFile: success %d",
+ vfdP->fd));
+
+ vfdP->fileName = fnamecopy;
+ /* Saved flags are adjusted to be OK for re-opening file */
+ vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
+ vfdP->fileMode = fileMode;
+ vfdP->fileSize = 0;
+ vfdP->fdstate = 0x0;
+ vfdP->resowner = NULL;
+
+ Insert(file);
+
+ return file;
+}
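+
+/*
+ * A minimal usage sketch for the VFD API above (the path, flags, buffer and
+ * my_wait_event are placeholders chosen for illustration only):
+ *
+ *    File f = PathNameOpenFile("some_dir/some_file", O_RDWR | PG_BINARY);
+ *
+ *    if (f <= 0)
+ *        elog(ERROR, "could not open file: %m");
+ *    if (FileWrite(f, buf, len, 0, my_wait_event) != len)
+ *        elog(ERROR, "could not write file: %m");
+ *    FileClose(f);
+ *
+ * Note that a File is a VFD index, not a kernel fd; fd.c may transparently
+ * close and re-open the underlying descriptor between calls.
+ */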
+
+/*
+ * Create directory 'directory'. If necessary, create 'basedir', which must
+ * be the directory above it. This is designed for creating the top-level
+ * temporary directory on demand before creating a directory underneath it.
+ * Do nothing if the directory already exists.
+ *
+ * Directories created within the top-level temporary directory should begin
+ * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
+ * deleted at startup by RemovePgTempFiles(). Further subdirectories below
+ * that do not need any particular prefix.
+ */
+void
+PathNameCreateTemporaryDir(const char *basedir, const char *directory)
+{
+ if (MakePGDirectory(directory) < 0)
+ {
+ if (errno == EEXIST)
+ return;
+
+ /*
+ * Failed. Try to create basedir first in case it's missing. Tolerate
+ * EEXIST to close a race against another process following the same
+ * algorithm.
+ */
+ if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("cannot create temporary directory \"%s\": %m",
+ basedir)));
+
+ /* Try again. */
+ if (MakePGDirectory(directory) < 0 && errno != EEXIST)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("cannot create temporary subdirectory \"%s\": %m",
+ directory)));
+ }
+}
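+
+/*
+ * Illustrative call sequence (the directory names are hypothetical; real
+ * callers derive them from TempTablespacePath() and PG_TEMP_FILE_PREFIX):
+ *
+ *    char basedir[MAXPGPATH];
+ *    char dirpath[MAXPGPATH];
+ *
+ *    TempTablespacePath(basedir, tblspcOid);
+ *    snprintf(dirpath, sizeof(dirpath), "%s/%sfileset", basedir,
+ *             PG_TEMP_FILE_PREFIX);
+ *    PathNameCreateTemporaryDir(basedir, dirpath);
+ */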
+
+/*
+ * Delete a directory and everything in it, if it exists.
+ */
+void
+PathNameDeleteTemporaryDir(const char *dirname)
+{
+ struct stat statbuf;
+
+ /* Silently ignore missing directory. */
+ if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
+ return;
+
+ /*
+ * Currently, walkdir doesn't offer a way for our passed-in function to
+ * maintain state. Perhaps it should, so that we could tell the caller
+ * whether this operation succeeded or failed. Since this operation is
+ * used in a cleanup path, we wouldn't actually behave differently: we'll
+ * just log failures.
+ */
+ walkdir(dirname, unlink_if_exists_fname, false, LOG);
+}
+
+/*
+ * Open a temporary file that will disappear when we close it.
+ *
+ * This routine takes care of generating an appropriate tempfile name.
+ * There's no need to pass in fileFlags or fileMode either, since only
+ * one setting makes any sense for a temp file.
+ *
+ * Unless interXact is true, the file is remembered by CurrentResourceOwner
+ * to ensure it's closed and deleted when it's no longer needed, typically at
+ * the end-of-transaction. In most cases, you don't want temporary files to
+ * outlive the transaction that created them, so this should be false -- but
+ * if you need "somewhat" temporary storage, this might be useful. In either
+ * case, the file is removed when the File is explicitly closed.
+ */
+File
+OpenTemporaryFile(bool interXact)
+{
+ File file = 0;
+
+ /*
+ * Make sure the current resource owner has space for this File before we
+ * open it, if we'll be registering it below.
+ */
+ if (!interXact)
+ ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+
+ /*
+ * If some temp tablespace(s) have been given to us, try to use the next
+ * one. If a given tablespace can't be found, we silently fall back to
+ * the database's default tablespace.
+ *
+ * BUT: if the temp file is slated to outlive the current transaction,
+ * force it into the database's default tablespace, so that it will not
+ * pose a threat to possible tablespace drop attempts.
+ */
+ if (numTempTableSpaces > 0 && !interXact)
+ {
+ Oid tblspcOid = GetNextTempTableSpace();
+
+ if (OidIsValid(tblspcOid))
+ file = OpenTemporaryFileInTablespace(tblspcOid, false);
+ }
+
+ /*
+ * If not, or if tablespace is bad, create in database's default
+ * tablespace. MyDatabaseTableSpace should normally be set before we get
+ * here, but just in case it isn't, fall back to pg_default tablespace.
+ */
+ if (file <= 0)
+ file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
+ MyDatabaseTableSpace :
+ DEFAULTTABLESPACE_OID,
+ true);
+
+ /* Mark it for deletion at close and temporary file size limit */
+ VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
+
+ /* Register it with the current resource owner */
+ if (!interXact)
+ RegisterTemporaryFile(file);
+
+ return file;
+}
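+
+/*
+ * Typical use, sketched (buf, len and my_wait_event are placeholders):
+ *
+ *    File f = OpenTemporaryFile(false);
+ *
+ *    (void) FileWrite(f, buf, len, 0, my_wait_event);
+ *    ...
+ *    (void) FileRead(f, buf, len, 0, my_wait_event);
+ *    FileClose(f);
+ *
+ * Because FD_DELETE_AT_CLOSE is set, FileClose() also removes the
+ * underlying file.
+ */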
+
+/*
+ * Return the path of the temp directory in a given tablespace.
+ */
+void
+TempTablespacePath(char *path, Oid tablespace)
+{
+ /*
+ * Identify the tempfile directory for this tablespace.
+ *
+ * If someone tries to specify pg_global, use pg_default instead.
+ */
+ if (tablespace == InvalidOid ||
+ tablespace == DEFAULTTABLESPACE_OID ||
+ tablespace == GLOBALTABLESPACE_OID)
+ snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
+ else
+ {
+ /* All other tablespaces are accessed via symlinks */
+ snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
+ tablespace, TABLESPACE_VERSION_DIRECTORY,
+ PG_TEMP_FILES_DIR);
+ }
+}
+
+/*
+ * Open a temporary file in a specific tablespace.
+ * Subroutine for OpenTemporaryFile, which see for details.
+ */
+static File
+OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
+{
+ char tempdirpath[MAXPGPATH];
+ char tempfilepath[MAXPGPATH];
+ File file;
+
+ TempTablespacePath(tempdirpath, tblspcOid);
+
+ /*
+ * Generate a tempfile name that should be unique within the current
+ * database instance.
+ */
+ snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
+ tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
+
+ /*
+ * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
+ * temp file that can be reused.
+ */
+ file = PathNameOpenFile(tempfilepath,
+ O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
+ if (file <= 0)
+ {
+ /*
+ * We might need to create the tablespace's tempfile directory, if no
+ * one has yet done so.
+ *
+ * Don't check for an error from MakePGDirectory; it could fail if
+ * someone else just did the same thing. If it doesn't work then
+ * we'll bomb out on the second create attempt, instead.
+ */
+ (void) MakePGDirectory(tempdirpath);
+
+ file = PathNameOpenFile(tempfilepath,
+ O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
+ if (file <= 0 && rejectError)
+ elog(ERROR, "could not create temporary file \"%s\": %m",
+ tempfilepath);
+ }
+
+ return file;
+}
+
+
+/*
+ * Create a new file. The directory containing it must already exist. Files
+ * created this way are subject to temp_file_limit and are automatically
+ * closed at end of transaction, but are not automatically deleted on close
+ * because they are intended to be shared between cooperating backends.
+ *
+ * If the file is inside the top-level temporary directory, its name should
+ * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
+ * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
+ * inside a directory created with PathNameCreateTemporaryDir(), in which case
+ * the prefix isn't needed.
+ */
+File
+PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
+{
+ File file;
+
+ ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+
+ /*
+ * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
+ * temp file that can be reused.
+ */
+ file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
+ if (file <= 0)
+ {
+ if (error_on_failure)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create temporary file \"%s\": %m",
+ path)));
+ else
+ return file;
+ }
+
+ /* Mark it for temp_file_limit accounting. */
+ VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
+
+ /* Register it for automatic close. */
+ RegisterTemporaryFile(file);
+
+ return file;
+}
+
+/*
+ * Open a file that was created with PathNameCreateTemporaryFile, possibly in
+ * another backend. Files opened this way don't count against the
+ * temp_file_limit of the caller, are automatically closed at the end of the
+ * transaction, but are not deleted on close.
+ */
+File
+PathNameOpenTemporaryFile(const char *path, int mode)
+{
+ File file;
+
+ ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+
+ file = PathNameOpenFile(path, mode | PG_BINARY);
+
+ /* If no such file, then we don't raise an error. */
+ if (file <= 0 && errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open temporary file \"%s\": %m",
+ path)));
+
+ if (file > 0)
+ {
+ /* Register it for automatic close. */
+ RegisterTemporaryFile(file);
+ }
+
+ return file;
+}
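+
+/*
+ * Sketch of the intended cross-backend pattern (the path is hypothetical;
+ * real callers such as the BufFile/SharedFileSet code build names on top of
+ * these primitives):
+ *
+ *    In the creating backend:
+ *        File f = PathNameCreateTemporaryFile("pgsql_tmp/shared.0", true);
+ *
+ *    In a cooperating backend:
+ *        File f = PathNameOpenTemporaryFile("pgsql_tmp/shared.0", O_RDONLY);
+ *
+ *    When the last user is done with it:
+ *        (void) PathNameDeleteTemporaryFile("pgsql_tmp/shared.0", false);
+ */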
+
+/*
+ * Delete a file by pathname. Return true if the file existed, false if it
+ * didn't.
+ */
+bool
+PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
+{
+ struct stat filestats;
+ int stat_errno;
+
+ /* Get the final size for pgstat reporting. */
+ if (stat(path, &filestats) != 0)
+ stat_errno = errno;
+ else
+ stat_errno = 0;
+
+ /*
+ * Unlike FileClose's automatic file deletion code, we tolerate
+ * non-existence to support BufFileDeleteShared, which doesn't know how
+ * many segments it has to delete until it runs out.
+ */
+ if (stat_errno == ENOENT)
+ return false;
+
+ if (unlink(path) < 0)
+ {
+ if (errno != ENOENT)
+ ereport(error_on_failure ? ERROR : LOG,
+ (errcode_for_file_access(),
+ errmsg("could not unlink temporary file \"%s\": %m",
+ path)));
+ return false;
+ }
+
+ if (stat_errno == 0)
+ ReportTemporaryFileUsage(path, filestats.st_size);
+ else
+ {
+ errno = stat_errno;
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m", path)));
+ }
+
+ return true;
+}
+
+/*
+ * close a file when done with it
+ */
+void
+FileClose(File file)
+{
+ Vfd *vfdP;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileClose: %d (%s)",
+ file, VfdCache[file].fileName));
+
+ vfdP = &VfdCache[file];
+
+ if (!FileIsNotOpen(file))
+ {
+ /* close the file */
+ if (close(vfdP->fd) != 0)
+ {
+ /*
+ * We may need to panic on failure to close non-temporary files;
+ * see LruDelete.
+ */
+ elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
+ "could not close file \"%s\": %m", vfdP->fileName);
+ }
+
+ --nfile;
+ vfdP->fd = VFD_CLOSED;
+
+ /* remove the file from the lru ring */
+ Delete(file);
+ }
+
+ if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
+ {
+ /* Subtract its size from current usage (do first in case of error) */
+ temporary_files_size -= vfdP->fileSize;
+ vfdP->fileSize = 0;
+ }
+
+ /*
+ * Delete the file if it was temporary, and make a log entry if wanted
+ */
+ if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
+ {
+ struct stat filestats;
+ int stat_errno;
+
+ /*
+ * If we get an error, as could happen within the ereport/elog calls,
+ * we'll come right back here during transaction abort. Reset the
+ * flag to ensure that we can't get into an infinite loop. This code
+ * is arranged to ensure that the worst-case consequence is failing to
+ * emit log message(s), not failing to attempt the unlink.
+ */
+ vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
+
+
+ /* first try the stat() */
+ if (stat(vfdP->fileName, &filestats))
+ stat_errno = errno;
+ else
+ stat_errno = 0;
+
+ /* in any case do the unlink */
+ if (unlink(vfdP->fileName))
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
+
+ /* and last report the stat results */
+ if (stat_errno == 0)
+ ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
+ else
+ {
+ errno = stat_errno;
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
+ }
+ }
+
+ /* Unregister it from the resource owner */
+ if (vfdP->resowner)
+ ResourceOwnerForgetFile(vfdP->resowner, file);
+
+ /*
+ * Return the Vfd slot to the free list
+ */
+ FreeVfd(file);
+}
+
+/*
+ * FilePrefetch - initiate asynchronous read of a given range of the file.
+ *
+ * Currently the only implementation of this function uses posix_fadvise,
+ * which is the simplest standardized interface that accomplishes this.
+ * We could add an implementation using libaio in the future; but note that
+ * this API is inappropriate for libaio, which wants to have a buffer provided
+ * to read into.
+ */
+int
+FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
+{
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
+ int returnCode;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
+ file, VfdCache[file].fileName,
+ (int64) offset, amount));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+ pgstat_report_wait_start(wait_event_info);
+ returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
+ POSIX_FADV_WILLNEED);
+ pgstat_report_wait_end();
+
+ return returnCode;
+#else
+ Assert(FileIsValid(file));
+ return 0;
+#endif
+}
+
+void
+FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
+{
+ int returnCode;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
+ file, VfdCache[file].fileName,
+ (int64) offset, (int64) nbytes));
+
+ if (nbytes <= 0)
+ return;
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return;
+
+ pgstat_report_wait_start(wait_event_info);
+ pg_flush_data(VfdCache[file].fd, offset, nbytes);
+ pgstat_report_wait_end();
+}
+
+int
+FileRead(File file, char *buffer, int amount, off_t offset,
+ uint32 wait_event_info)
+{
+ int returnCode;
+ Vfd *vfdP;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
+ file, VfdCache[file].fileName,
+ (int64) offset,
+ amount, buffer));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+ vfdP = &VfdCache[file];
+
+retry:
+ pgstat_report_wait_start(wait_event_info);
+ returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
+ pgstat_report_wait_end();
+
+ if (returnCode < 0)
+ {
+ /*
+ * Windows may run out of kernel buffers and return an "Insufficient
+ * system resources" error. Wait a bit and retry.
+ *
+ * It is rumored that EINTR is also possible on some Unix filesystems,
+ * in which case immediate retry is indicated.
+ */
+#ifdef WIN32
+ DWORD error = GetLastError();
+
+ switch (error)
+ {
+ case ERROR_NO_SYSTEM_RESOURCES:
+ pg_usleep(1000L);
+ errno = EINTR;
+ break;
+ default:
+ _dosmaperr(error);
+ break;
+ }
+#endif
+ /* OK to retry if interrupted */
+ if (errno == EINTR)
+ goto retry;
+ }
+
+ return returnCode;
+}
+
+int
+FileWrite(File file, char *buffer, int amount, off_t offset,
+ uint32 wait_event_info)
+{
+ int returnCode;
+ Vfd *vfdP;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
+ file, VfdCache[file].fileName,
+ (int64) offset,
+ amount, buffer));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+ vfdP = &VfdCache[file];
+
+ /*
+ * If enforcing temp_file_limit and it's a temp file, check to see if the
+ * write would overrun temp_file_limit, and throw error if so. Note: it's
+ * really a modularity violation to throw error here; we should set errno
+ * and return -1. However, there's no way to report a suitable error
+ * message if we do that. All current callers would just throw error
+ * immediately anyway, so this is safe at present.
+ */
+ if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
+ {
+ off_t past_write = offset + amount;
+
+ if (past_write > vfdP->fileSize)
+ {
+ uint64 newTotal = temporary_files_size;
+
+ newTotal += past_write - vfdP->fileSize;
+ if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
+ errmsg("temporary file size exceeds temp_file_limit (%dkB)",
+ temp_file_limit)));
+ }
+ }
+
+retry:
+ errno = 0;
+ pgstat_report_wait_start(wait_event_info);
+ returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
+ pgstat_report_wait_end();
+
+ /* if write didn't set errno, assume problem is no disk space */
+ if (returnCode != amount && errno == 0)
+ errno = ENOSPC;
+
+ if (returnCode >= 0)
+ {
+ /*
+ * Maintain fileSize and temporary_files_size if it's a temp file.
+ */
+ if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
+ {
+ off_t past_write = offset + amount;
+
+ if (past_write > vfdP->fileSize)
+ {
+ temporary_files_size += past_write - vfdP->fileSize;
+ vfdP->fileSize = past_write;
+ }
+ }
+ }
+ else
+ {
+ /*
+ * See comments in FileRead()
+ */
+#ifdef WIN32
+ DWORD error = GetLastError();
+
+ switch (error)
+ {
+ case ERROR_NO_SYSTEM_RESOURCES:
+ pg_usleep(1000L);
+ errno = EINTR;
+ break;
+ default:
+ _dosmaperr(error);
+ break;
+ }
+#endif
+ /* OK to retry if interrupted */
+ if (errno == EINTR)
+ goto retry;
+ }
+
+ return returnCode;
+}
+
+int
+FileSync(File file, uint32 wait_event_info)
+{
+ int returnCode;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileSync: %d (%s)",
+ file, VfdCache[file].fileName));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+ pgstat_report_wait_start(wait_event_info);
+ returnCode = pg_fsync(VfdCache[file].fd);
+ pgstat_report_wait_end();
+
+ return returnCode;
+}
+
+off_t
+FileSize(File file)
+{
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileSize %d (%s)",
+ file, VfdCache[file].fileName));
+
+ if (FileIsNotOpen(file))
+ {
+ if (FileAccess(file) < 0)
+ return (off_t) -1;
+ }
+
+ return lseek(VfdCache[file].fd, 0, SEEK_END);
+}
+
+int
+FileTruncate(File file, off_t offset, uint32 wait_event_info)
+{
+ int returnCode;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileTruncate %d (%s)",
+ file, VfdCache[file].fileName));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+ pgstat_report_wait_start(wait_event_info);
+ returnCode = ftruncate(VfdCache[file].fd, offset);
+ pgstat_report_wait_end();
+
+ if (returnCode == 0 && VfdCache[file].fileSize > offset)
+ {
+ /* adjust our state for truncation of a temp file */
+ Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
+ temporary_files_size -= VfdCache[file].fileSize - offset;
+ VfdCache[file].fileSize = offset;
+ }
+
+ return returnCode;
+}
+
+/*
+ * Return the pathname associated with an open file.
+ *
+ * The returned string points to an internal buffer, which is valid until
+ * the file is closed.
+ */
+char *
+FilePathName(File file)
+{
+ Assert(FileIsValid(file));
+
+ return VfdCache[file].fileName;
+}
+
+/*
+ * Return the raw file descriptor of an opened file.
+ *
+ * The returned file descriptor will be valid until the file is closed, but
+ * many other fd.c operations can close it again. So the caller should
+ * be careful not to do much of anything else before it finishes using the
+ * returned file descriptor.
+ */
+int
+FileGetRawDesc(File file)
+{
+ Assert(FileIsValid(file));
+ return VfdCache[file].fd;
+}
+
+/*
+ * FileGetRawFlags - returns the file flags on open(2)
+ */
+int
+FileGetRawFlags(File file)
+{
+ Assert(FileIsValid(file));
+ return VfdCache[file].fileFlags;
+}
+
+/*
+ * FileGetRawMode - returns the mode bitmask passed to open(2)
+ */
+mode_t
+FileGetRawMode(File file)
+{
+ Assert(FileIsValid(file));
+ return VfdCache[file].fileMode;
+}
+
+/*
+ * Make room for another allocatedDescs[] array entry if needed and possible.
+ * Returns true if an array element is available.
+ */
+static bool
+reserveAllocatedDesc(void)
+{
+ AllocateDesc *newDescs;
+ int newMax;
+
+ /* Quick out if array already has a free slot. */
+ if (numAllocatedDescs < maxAllocatedDescs)
+ return true;
+
+ /*
+ * If the array hasn't yet been created in the current process, initialize
+ * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
+ * we will ever need, anyway. We don't want to look at max_safe_fds
+ * immediately because set_max_safe_fds() may not have run yet.
+ */
+ if (allocatedDescs == NULL)
+ {
+ newMax = FD_MINFREE / 3;
+ newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
+ /* Out of memory already? Treat as fatal error. */
+ if (newDescs == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ allocatedDescs = newDescs;
+ maxAllocatedDescs = newMax;
+ return true;
+ }
+
+ /*
+ * Consider enlarging the array beyond the initial allocation used above.
+ * By the time this happens, max_safe_fds should be known accurately.
+ *
+ * We mustn't let allocated descriptors hog all the available FDs, and in
+ * practice we'd better leave a reasonable number of FDs for VFD use. So
+ * set the maximum to max_safe_fds / 3. (This should certainly be at
+ * least as large as the initial size, FD_MINFREE / 3, so we aren't
+ * tightening the restriction here.) Recall that "external" FDs are
+ * allowed to consume another third of max_safe_fds.
+ */
+ newMax = max_safe_fds / 3;
+ if (newMax > maxAllocatedDescs)
+ {
+ newDescs = (AllocateDesc *) realloc(allocatedDescs,
+ newMax * sizeof(AllocateDesc));
+ /* Treat out-of-memory as a non-fatal error. */
+ if (newDescs == NULL)
+ return false;
+ allocatedDescs = newDescs;
+ maxAllocatedDescs = newMax;
+ return true;
+ }
+
+ /* Can't enlarge allocatedDescs[] any more. */
+ return false;
+}
+
+/*
+ * Routines that want to use stdio (ie, FILE*) should use AllocateFile
+ * rather than plain fopen(). This lets fd.c deal with freeing FDs if
+ * necessary to open the file. When done, call FreeFile rather than fclose.
+ *
+ * Note that files that will be open for any significant length of time
+ * should NOT be handled this way, since they cannot share kernel file
+ * descriptors with other files; there is grave risk of running out of FDs
+ * if anyone locks down too many FDs. Most callers of this routine are
+ * simply reading a config file that they will read and close immediately.
+ *
+ * fd.c will automatically close all files opened with AllocateFile at
+ * transaction commit or abort; this prevents FD leakage if a routine
+ * that calls AllocateFile is terminated prematurely by ereport(ERROR).
+ *
+ * Ideally this should be the *only* direct call of fopen() in the backend.
+ */
+FILE *
+AllocateFile(const char *name, const char *mode)
+{
+ FILE *file;
+
+ DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
+ numAllocatedDescs, name));
+
+ /* Can we allocate another non-virtual FD? */
+ if (!reserveAllocatedDesc())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
+ maxAllocatedDescs, name)));
+
+ /* Close excess kernel FDs. */
+ ReleaseLruFiles();
+
+TryAgain:
+ if ((file = fopen(name, mode)) != NULL)
+ {
+ AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+ desc->kind = AllocateDescFile;
+ desc->desc.file = file;
+ desc->create_subid = GetCurrentSubTransactionId();
+ numAllocatedDescs++;
+ return desc->desc.file;
+ }
+
+ if (errno == EMFILE || errno == ENFILE)
+ {
+ int save_errno = errno;
+
+ ereport(LOG,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("out of file descriptors: %m; release and retry")));
+ errno = 0;
+ if (ReleaseLruFile())
+ goto TryAgain;
+ errno = save_errno;
+ }
+
+ return NULL;
+}
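+
+/*
+ * Typical short-lived use, sketched (the file name is hypothetical):
+ *
+ *    FILE *fp = AllocateFile("some_config_file", "r");
+ *
+ *    if (fp == NULL)
+ *        ereport(ERROR,
+ *                (errcode_for_file_access(),
+ *                 errmsg("could not open file: %m")));
+ *    ... read the contents with fgets()/fread() ...
+ *    FreeFile(fp);
+ */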
+
+/*
+ * Open a file with OpenTransientFilePerm() and pass default file mode for
+ * the fileMode parameter.
+ */
+int
+OpenTransientFile(const char *fileName, int fileFlags)
+{
+ return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
+}
+
+/*
+ * Like AllocateFile, but returns an unbuffered fd like open(2)
+ */
+int
+OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
+{
+ int fd;
+
+ DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
+ numAllocatedDescs, fileName));
+
+ /* Can we allocate another non-virtual FD? */
+ if (!reserveAllocatedDesc())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
+ maxAllocatedDescs, fileName)));
+
+ /* Close excess kernel FDs. */
+ ReleaseLruFiles();
+
+ fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
+
+ if (fd >= 0)
+ {
+ AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+ desc->kind = AllocateDescRawFD;
+ desc->desc.fd = fd;
+ desc->create_subid = GetCurrentSubTransactionId();
+ numAllocatedDescs++;
+
+ return fd;
+ }
+
+ return -1; /* failure */
+}
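+
+/*
+ * Sketch of the usual pattern (path and flags are illustrative):
+ *
+ *    int fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
+ *
+ *    if (fd < 0)
+ *        ereport(ERROR,
+ *                (errcode_for_file_access(),
+ *                 errmsg("could not open file \"%s\": %m", path)));
+ *    ... read() / write() on fd ...
+ *    if (CloseTransientFile(fd) != 0)
+ *        ereport(ERROR,
+ *                (errcode_for_file_access(),
+ *                 errmsg("could not close file \"%s\": %m", path)));
+ */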
+
+/*
+ * Routines that want to initiate a pipe stream should use OpenPipeStream
+ * rather than plain popen(). This lets fd.c deal with freeing FDs if
+ * necessary. When done, call ClosePipeStream rather than pclose.
+ *
+ * This function also ensures that the popen'd program is run with default
+ * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
+ * uses. This ensures a desirable response to, e.g., closing a read pipe early.
+ */
+FILE *
+OpenPipeStream(const char *command, const char *mode)
+{
+ FILE *file;
+ int save_errno;
+
+ DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
+ numAllocatedDescs, command));
+
+ /* Can we allocate another non-virtual FD? */
+ if (!reserveAllocatedDesc())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
+ maxAllocatedDescs, command)));
+
+ /* Close excess kernel FDs. */
+ ReleaseLruFiles();
+
+TryAgain:
+ fflush(stdout);
+ fflush(stderr);
+ pqsignal(SIGPIPE, SIG_DFL);
+ errno = 0;
+ file = popen(command, mode);
+ save_errno = errno;
+ pqsignal(SIGPIPE, SIG_IGN);
+ errno = save_errno;
+ if (file != NULL)
+ {
+ AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+ desc->kind = AllocateDescPipe;
+ desc->desc.file = file;
+ desc->create_subid = GetCurrentSubTransactionId();
+ numAllocatedDescs++;
+ return desc->desc.file;
+ }
+
+ if (errno == EMFILE || errno == ENFILE)
+ {
+ ereport(LOG,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("out of file descriptors: %m; release and retry")));
+ if (ReleaseLruFile())
+ goto TryAgain;
+ errno = save_errno;
+ }
+
+ return NULL;
+}
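+
+/*
+ * Sketch of a caller (the command is hypothetical):
+ *
+ *    FILE *pipe = OpenPipeStream("some_command", "r");
+ *
+ *    if (pipe == NULL)
+ *        ereport(ERROR,
+ *                (errcode_for_file_access(),
+ *                 errmsg("could not execute command: %m")));
+ *    ... read the command's output with fgets() ...
+ *    ClosePipeStream(pipe);
+ */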
+
+/*
+ * Free an AllocateDesc of any type.
+ *
+ * The argument *must* point into the allocatedDescs[] array.
+ */
+static int
+FreeDesc(AllocateDesc *desc)
+{
+ int result;
+
+ /* Close the underlying object */
+ switch (desc->kind)
+ {
+ case AllocateDescFile:
+ result = fclose(desc->desc.file);
+ break;
+ case AllocateDescPipe:
+ result = pclose(desc->desc.file);
+ break;
+ case AllocateDescDir:
+ result = closedir(desc->desc.dir);
+ break;
+ case AllocateDescRawFD:
+ result = close(desc->desc.fd);
+ break;
+ default:
+ elog(ERROR, "AllocateDesc kind not recognized");
+ result = 0; /* keep compiler quiet */
+ break;
+ }
+
+ /* Compact storage in the allocatedDescs array */
+ numAllocatedDescs--;
+ *desc = allocatedDescs[numAllocatedDescs];
+
+ return result;
+}
+
+/*
+ * Close a file returned by AllocateFile.
+ *
+ * Note we do not check fclose's return value --- it is up to the caller
+ * to handle close errors.
+ */
+int
+FreeFile(FILE *file)
+{
+ int i;
+
+ DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
+
+ /* Remove file from list of allocated files, if it's present */
+ for (i = numAllocatedDescs; --i >= 0;)
+ {
+ AllocateDesc *desc = &allocatedDescs[i];
+
+ if (desc->kind == AllocateDescFile && desc->desc.file == file)
+ return FreeDesc(desc);
+ }
+
+ /* Only get here if someone passes us a file not in allocatedDescs */
+ elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
+
+ return fclose(file);
+}
+
+/*
+ * Close a file returned by OpenTransientFile.
+ *
+ * Note we do not check close's return value --- it is up to the caller
+ * to handle close errors.
+ */
+int
+CloseTransientFile(int fd)
+{
+ int i;
+
+ DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
+
+ /* Remove fd from list of allocated files, if it's present */
+ for (i = numAllocatedDescs; --i >= 0;)
+ {
+ AllocateDesc *desc = &allocatedDescs[i];
+
+ if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
+ return FreeDesc(desc);
+ }
+
+ /* Only get here if someone passes us a file not in allocatedDescs */
+ elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
+
+ return close(fd);
+}
+
+/*
+ * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
+ * rather than plain opendir(). This lets fd.c deal with freeing FDs if
+ * necessary to open the directory, and with closing it after an elog.
+ * When done, call FreeDir rather than closedir.
+ *
+ * Returns NULL, with errno set, on failure. Note that failure detection
+ * is commonly left to the following call of ReadDir or ReadDirExtended;
+ * see the comments for ReadDir.
+ *
+ * Ideally this should be the *only* direct call of opendir() in the backend.
+ */
+DIR *
+AllocateDir(const char *dirname)
+{
+ DIR *dir;
+
+ DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
+ numAllocatedDescs, dirname));
+
+ /* Can we allocate another non-virtual FD? */
+ if (!reserveAllocatedDesc())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
+ maxAllocatedDescs, dirname)));
+
+ /* Close excess kernel FDs. */
+ ReleaseLruFiles();
+
+TryAgain:
+ if ((dir = opendir(dirname)) != NULL)
+ {
+ AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+ desc->kind = AllocateDescDir;
+ desc->desc.dir = dir;
+ desc->create_subid = GetCurrentSubTransactionId();
+ numAllocatedDescs++;
+ return desc->desc.dir;
+ }
+
+ if (errno == EMFILE || errno == ENFILE)
+ {
+ int save_errno = errno;
+
+ ereport(LOG,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("out of file descriptors: %m; release and retry")));
+ errno = 0;
+ if (ReleaseLruFile())
+ goto TryAgain;
+ errno = save_errno;
+ }
+
+ return NULL;
+}
+
+/*
+ * Read a directory opened with AllocateDir, ereport'ing any error.
+ *
+ * This is easier to use than raw readdir() since it takes care of some
+ * otherwise rather tedious and error-prone manipulation of errno. Also,
+ * if you are happy with a generic error message for AllocateDir failure,
+ * you can just do
+ *
+ * dir = AllocateDir(path);
+ * while ((dirent = ReadDir(dir, path)) != NULL)
+ * process dirent;
+ * FreeDir(dir);
+ *
+ * since a NULL dir parameter is taken as indicating AllocateDir failed.
+ * (Make sure errno isn't changed between AllocateDir and ReadDir if you
+ * use this shortcut.)
+ *
+ * The pathname passed to AllocateDir must be passed to this routine too,
+ * but it is only used for error reporting.
+ */
+struct dirent *
+ReadDir(DIR *dir, const char *dirname)
+{
+ return ReadDirExtended(dir, dirname, ERROR);
+}
+
+/*
+ * Alternate version of ReadDir that allows caller to specify the elevel
+ * for any error report (whether it's reporting an initial failure of
+ * AllocateDir or a subsequent directory read failure).
+ *
+ * If elevel < ERROR, returns NULL after any error. With the normal coding
+ * pattern, this will result in falling out of the loop immediately as
+ * though the directory contained no (more) entries.
+ */
+struct dirent *
+ReadDirExtended(DIR *dir, const char *dirname, int elevel)
+{
+ struct dirent *dent;
+
+ /* Give a generic message for AllocateDir failure, if caller didn't */
+ if (dir == NULL)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not open directory \"%s\": %m",
+ dirname)));
+ return NULL;
+ }
+
+ errno = 0;
+ if ((dent = readdir(dir)) != NULL)
+ return dent;
+
+ if (errno)
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not read directory \"%s\": %m",
+ dirname)));
+ return NULL;
+}
+
+/*
+ * Close a directory opened with AllocateDir.
+ *
+ * Returns closedir's return value (with errno set if it's not 0).
+ * Note we do not check the return value --- it is up to the caller
+ * to handle close errors if wanted.
+ *
+ * Does nothing if dir == NULL; we assume that directory open failure was
+ * already reported if desired.
+ */
+int
+FreeDir(DIR *dir)
+{
+ int i;
+
+ /* Nothing to do if AllocateDir failed */
+ if (dir == NULL)
+ return 0;
+
+ DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
+
+ /* Remove dir from list of allocated dirs, if it's present */
+ for (i = numAllocatedDescs; --i >= 0;)
+ {
+ AllocateDesc *desc = &allocatedDescs[i];
+
+ if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
+ return FreeDesc(desc);
+ }
+
+ /* Only get here if someone passes us a dir not in allocatedDescs */
+ elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
+
+ return closedir(dir);
+}
+
+
+/*
+ * Close a pipe stream returned by OpenPipeStream.
+ */
+int
+ClosePipeStream(FILE *file)
+{
+ int i;
+
+ DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
+
+ /* Remove file from list of allocated files, if it's present */
+ for (i = numAllocatedDescs; --i >= 0;)
+ {
+ AllocateDesc *desc = &allocatedDescs[i];
+
+ if (desc->kind == AllocateDescPipe && desc->desc.file == file)
+ return FreeDesc(desc);
+ }
+
+ /* Only get here if someone passes us a file not in allocatedDescs */
+ elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
+
+ return pclose(file);
+}
+
+/*
+ * closeAllVfds
+ *
+ * Force all VFDs into the physically-closed state, so that the minimum
+ * possible number of kernel file descriptors is in use. There is no
+ * change in the logical state of the VFDs.
+ */
+void
+closeAllVfds(void)
+{
+ Index i;
+
+ if (SizeVfdCache > 0)
+ {
+ Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
+ for (i = 1; i < SizeVfdCache; i++)
+ {
+ if (!FileIsNotOpen(i))
+ LruDelete(i);
+ }
+ }
+}
+
+
+/*
+ * SetTempTablespaces
+ *
+ * Define a list (actually an array) of OIDs of tablespaces to use for
+ * temporary files. This list will be used until end of transaction,
+ * unless this function is called again before then. It is the caller's
+ * responsibility that the passed-in array has adequate lifespan (typically
+ * it'd be allocated in TopTransactionContext).
+ *
+ * Some entries of the array may be InvalidOid, indicating that the current
+ * database's default tablespace should be used.
+ */
+void
+SetTempTablespaces(Oid *tableSpaces, int numSpaces)
+{
+ Assert(numSpaces >= 0);
+ tempTableSpaces = tableSpaces;
+ numTempTableSpaces = numSpaces;
+
+ /*
+ * Select a random starting point in the list. This is to minimize
+ * conflicts between backends that are most likely sharing the same list
+ * of temp tablespaces. Note that if we create multiple temp files in the
+ * same transaction, we'll advance circularly through the list --- this
+ * ensures that large temporary sort files are nicely spread across all
+ * available tablespaces.
+ */
+ if (numSpaces > 1)
+ nextTempTableSpace = random() % numSpaces;
+ else
+ nextTempTableSpace = 0;
+}
+
+/*
+ * TempTablespacesAreSet
+ *
+ * Returns true if SetTempTablespaces has been called in current transaction.
+ * (This is just so that tablespaces.c doesn't need its own per-transaction
+ * state.)
+ */
+bool
+TempTablespacesAreSet(void)
+{
+ return (numTempTableSpaces >= 0);
+}
+
+/*
+ * GetTempTablespaces
+ *
+ * Populate an array with the OIDs of the tablespaces that should be used for
+ * temporary files. (Some entries may be InvalidOid, indicating that the
+ * current database's default tablespace should be used.) At most numSpaces
+ * entries will be filled.
+ * Returns the number of OIDs that were copied into the output array.
+ */
+int
+GetTempTablespaces(Oid *tableSpaces, int numSpaces)
+{
+ int i;
+
+ Assert(TempTablespacesAreSet());
+ for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
+ tableSpaces[i] = tempTableSpaces[i];
+
+ return i;
+}
+
+/*
+ * GetNextTempTableSpace
+ *
+ * Select the next temp tablespace to use. A result of InvalidOid means
+ * to use the current database's default tablespace.
+ */
+Oid
+GetNextTempTableSpace(void)
+{
+ if (numTempTableSpaces > 0)
+ {
+ /* Advance nextTempTableSpace counter with wraparound */
+ if (++nextTempTableSpace >= numTempTableSpaces)
+ nextTempTableSpace = 0;
+ return tempTableSpaces[nextTempTableSpace];
+ }
+ return InvalidOid;
+}
+
+
+/*
+ * AtEOSubXact_Files
+ *
+ * Take care of subtransaction commit/abort. At abort, we close temp files
+ * that the subtransaction may have opened. At commit, we reassign the
+ * files that were opened to the parent subtransaction.
+ */
+void
+AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
+ SubTransactionId parentSubid)
+{
+ Index i;
+
+ for (i = 0; i < numAllocatedDescs; i++)
+ {
+ if (allocatedDescs[i].create_subid == mySubid)
+ {
+ if (isCommit)
+ allocatedDescs[i].create_subid = parentSubid;
+ else
+ {
+ /* have to recheck the item after FreeDesc (ugly) */
+ FreeDesc(&allocatedDescs[i--]);
+ }
+ }
+ }
+}
+
+/*
+ * AtEOXact_Files
+ *
+ * This routine is called during transaction commit or abort. All still-open
+ * per-transaction temporary file VFDs are closed, which also causes the
+ * underlying files to be deleted (although they should've been closed already
+ * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
+ * closed. We also forget any transaction-local temp tablespace list.
+ *
+ * The isCommit flag is used only to decide whether to emit warnings about
+ * unclosed files.
+ */
+void
+AtEOXact_Files(bool isCommit)
+{
+ CleanupTempFiles(isCommit, false);
+ tempTableSpaces = NULL;
+ numTempTableSpaces = -1;
+}
+
+/*
+ * AtProcExit_Files
+ *
+ * on_proc_exit hook to clean up temp files during backend shutdown.
+ * Here, we want to clean up *all* temp files including interXact ones.
+ */
+static void
+AtProcExit_Files(int code, Datum arg)
+{
+ CleanupTempFiles(false, true);
+}
+
+/*
+ * Close temporary files and delete their underlying files.
+ *
+ * isCommit: if true, this is normal transaction commit, and we don't
+ * expect any remaining files; warn if there are some.
+ *
+ * isProcExit: if true, this is being called as the backend process is
+ * exiting. If that's the case, we should remove all temporary files; if
+ * that's not the case, we are being called for transaction commit/abort
+ * and should only remove transaction-local temp files. In either case,
+ * also clean up "allocated" stdio files, dirs and fds.
+ */
+static void
+CleanupTempFiles(bool isCommit, bool isProcExit)
+{
+ Index i;
+
+ /*
+ * Careful here: at proc_exit we need extra cleanup, not just
+ * xact_temporary files.
+ */
+ if (isProcExit || have_xact_temporary_files)
+ {
+ Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
+ for (i = 1; i < SizeVfdCache; i++)
+ {
+ unsigned short fdstate = VfdCache[i].fdstate;
+
+ if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
+ VfdCache[i].fileName != NULL)
+ {
+ /*
+ * If we're in the process of exiting a backend process, close
+ * all temporary files. Otherwise, only close temporary files
+ * local to the current transaction. They should be closed by
+ * the ResourceOwner mechanism already, so this is just a
+ * debugging cross-check.
+ */
+ if (isProcExit)
+ FileClose(i);
+ else if (fdstate & FD_CLOSE_AT_EOXACT)
+ {
+ elog(WARNING,
+ "temporary file %s not closed at end-of-transaction",
+ VfdCache[i].fileName);
+ FileClose(i);
+ }
+ }
+ }
+
+ have_xact_temporary_files = false;
+ }
+
+ /* Complain if any allocated files remain open at commit. */
+ if (isCommit && numAllocatedDescs > 0)
+ elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
+ numAllocatedDescs);
+
+ /* Clean up "allocated" stdio files, dirs and fds. */
+ while (numAllocatedDescs > 0)
+ FreeDesc(&allocatedDescs[0]);
+}
+
+
+/*
+ * Remove temporary and temporary relation files left over from a prior
+ * postmaster session
+ *
+ * This should be called during postmaster startup. It will forcibly
+ * remove any leftover files created by OpenTemporaryFile and any leftover
+ * temporary relation files created by mdcreate.
+ *
+ * During a post-backend-crash restart cycle, this routine is called when
+ * the remove_temp_files_after_crash GUC is enabled. Multiple crashes while
+ * queries are using temp files could result in useless storage usage that can
+ * only be reclaimed by a service restart. The argument against enabling it is
+ * that someone might want to examine the temporary files for debugging
+ * purposes. This does however mean that OpenTemporaryFile had better allow for
+ * collision with an existing temp file name.
+ *
+ * NOTE: this function and its subroutines generally report syscall failures
+ * with ereport(LOG) and keep going. Removing temp files is not so critical
+ * that we should fail to start the database when we can't do it.
+ */
+void
+RemovePgTempFiles(void)
+{
+ char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
+ DIR *spc_dir;
+ struct dirent *spc_de;
+
+ /*
+ * First process temp files in pg_default ($PGDATA/base)
+ */
+ snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
+ RemovePgTempFilesInDir(temp_path, true, false);
+ RemovePgTempRelationFiles("base");
+
+ /*
+ * Cycle through temp directories for all non-default tablespaces.
+ */
+ spc_dir = AllocateDir("pg_tblspc");
+
+ while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
+ {
+ if (strcmp(spc_de->d_name, ".") == 0 ||
+ strcmp(spc_de->d_name, "..") == 0)
+ continue;
+
+ snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
+ spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
+ RemovePgTempFilesInDir(temp_path, true, false);
+
+ snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
+ spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
+ RemovePgTempRelationFiles(temp_path);
+ }
+
+ FreeDir(spc_dir);
+
+ /*
+ * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
+ * DataDir as well. However, that is *not* cleaned here because doing so
+ * would create a race condition. It's done separately, earlier in
+ * postmaster startup.
+ */
+}
+
+/*
+ * Process one pgsql_tmp directory for RemovePgTempFiles.
+ *
+ * If missing_ok is true, it's all right for the named directory to not exist.
+ * Any other problem results in a LOG message. (missing_ok should be true at
+ * the top level, since pgsql_tmp directories are not created until needed.)
+ *
+ * At the top level, this should be called with unlink_all = false, so that
+ * only files matching the temporary name prefix will be unlinked. When
+ * recursing it will be called with unlink_all = true to unlink everything
+ * under a top-level temporary directory.
+ *
+ * (These two flags could be replaced by one, but it seems clearer to keep
+ * them separate.)
+ */
+void
+RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
+{
+ DIR *temp_dir;
+ struct dirent *temp_de;
+ char rm_path[MAXPGPATH * 2];
+
+ temp_dir = AllocateDir(tmpdirname);
+
+ if (temp_dir == NULL && errno == ENOENT && missing_ok)
+ return;
+
+ while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
+ {
+ if (strcmp(temp_de->d_name, ".") == 0 ||
+ strcmp(temp_de->d_name, "..") == 0)
+ continue;
+
+ snprintf(rm_path, sizeof(rm_path), "%s/%s",
+ tmpdirname, temp_de->d_name);
+
+ if (unlink_all ||
+ strncmp(temp_de->d_name,
+ PG_TEMP_FILE_PREFIX,
+ strlen(PG_TEMP_FILE_PREFIX)) == 0)
+ {
+ struct stat statbuf;
+
+ if (lstat(rm_path, &statbuf) < 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m", rm_path)));
+ continue;
+ }
+
+ if (S_ISDIR(statbuf.st_mode))
+ {
+ /* recursively remove contents, then directory itself */
+ RemovePgTempFilesInDir(rm_path, false, true);
+
+ if (rmdir(rm_path) < 0)
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not remove directory \"%s\": %m",
+ rm_path)));
+ }
+ else
+ {
+ if (unlink(rm_path) < 0)
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m",
+ rm_path)));
+ }
+ }
+ else
+ ereport(LOG,
+ (errmsg("unexpected file found in temporary-files directory: \"%s\"",
+ rm_path)));
+ }
+
+ FreeDir(temp_dir);
+}
+
+/* Process one tablespace directory, look for per-DB subdirectories */
+static void
+RemovePgTempRelationFiles(const char *tsdirname)
+{
+ DIR *ts_dir;
+ struct dirent *de;
+ char dbspace_path[MAXPGPATH * 2];
+
+ ts_dir = AllocateDir(tsdirname);
+
+ while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
+ {
+ /*
+ * We're only interested in the per-database directories, which have
+ * numeric names. Note that this code will also (properly) ignore "."
+ * and "..".
+ */
+ if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
+ continue;
+
+ snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
+ tsdirname, de->d_name);
+ RemovePgTempRelationFilesInDbspace(dbspace_path);
+ }
+
+ FreeDir(ts_dir);
+}
+
+/* Process one per-dbspace directory for RemovePgTempRelationFiles */
+static void
+RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
+{
+ DIR *dbspace_dir;
+ struct dirent *de;
+ char rm_path[MAXPGPATH * 2];
+
+ dbspace_dir = AllocateDir(dbspacedirname);
+
+ while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
+ {
+ if (!looks_like_temp_rel_name(de->d_name))
+ continue;
+
+ snprintf(rm_path, sizeof(rm_path), "%s/%s",
+ dbspacedirname, de->d_name);
+
+ if (unlink(rm_path) < 0)
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m",
+ rm_path)));
+ }
+
+ FreeDir(dbspace_dir);
+}
+
+/* t<digits>_<digits>, or t<digits>_<digits>_<forkname> */
+bool
+looks_like_temp_rel_name(const char *name)
+{
+ int pos;
+ int savepos;
+
+ /* Must start with "t". */
+ if (name[0] != 't')
+ return false;
+
+ /* Followed by a non-empty string of digits and then an underscore. */
+ for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
+ ;
+ if (pos == 1 || name[pos] != '_')
+ return false;
+
+ /* Followed by another nonempty string of digits. */
+ for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
+ ;
+ if (savepos == pos)
+ return false;
+
+ /* We might have _forkname or .segment or both. */
+ if (name[pos] == '_')
+ {
+ int forkchar = forkname_chars(&name[pos + 1], NULL);
+
+ if (forkchar <= 0)
+ return false;
+ pos += forkchar + 1;
+ }
+ if (name[pos] == '.')
+ {
+ int segchar;
+
+ for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
+ ;
+ if (segchar <= 1)
+ return false;
+ pos += segchar;
+ }
+
+ /* Now we should be at the end. */
+ if (name[pos] != '\0')
+ return false;
+ return true;
+}
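+
+/*
+ * For reference, some illustrative names this function accepts (not taken
+ * from any real cluster): "t3_16384", "t3_16384_fsm", "t3_16384.1" and
+ * "t3_16384_vm.2".
+ */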
+
+#ifdef HAVE_SYNCFS
+static void
+do_syncfs(const char *path)
+{
+ int fd;
+
+ fd = OpenTransientFile(path, O_RDONLY);
+ if (fd < 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ return;
+ }
+ if (syncfs(fd) < 0)
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not synchronize file system for file \"%s\": %m", path)));
+ CloseTransientFile(fd);
+}
+#endif
+
+/*
+ * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
+ * all potential filesystems, depending on the recovery_init_sync_method setting.
+ *
+ * We fsync regular files and directories wherever they are, but we
+ * follow symlinks only for pg_wal and immediately under pg_tblspc.
+ * Other symlinks are presumed to point at files we're not responsible
+ * for fsyncing, and that we might not have privileges to write at all.
+ *
+ * Errors are logged but not considered fatal; that's because this is used
+ * only during database startup, to deal with the possibility that there are
+ * issued-but-unsynced writes pending against the data directory. We want to
+ * ensure that such writes reach disk before anything that's done in the new
+ * run. However, aborting on error would result in failure to start for
+ * harmless cases such as read-only files in the data directory, and that's
+ * not good either.
+ *
+ * Note that if we previously crashed due to a PANIC on fsync(), we'll be
+ * rewriting all changes again during recovery.
+ *
+ * Note we assume we're chdir'd into PGDATA to begin with.
+ */
+void
+SyncDataDirectory(void)
+{
+ bool xlog_is_symlink;
+
+ /* We can skip this whole thing if fsync is disabled. */
+ if (!enableFsync)
+ return;
+
+ /*
+ * If pg_wal is a symlink, we'll need to recurse into it separately,
+ * because the first walkdir below will ignore it.
+ */
+ xlog_is_symlink = false;
+
+#ifndef WIN32
+ {
+ struct stat st;
+
+ if (lstat("pg_wal", &st) < 0)
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m",
+ "pg_wal")));
+ else if (S_ISLNK(st.st_mode))
+ xlog_is_symlink = true;
+ }
+#else
+ if (pgwin32_is_junction("pg_wal"))
+ xlog_is_symlink = true;
+#endif
+
+#ifdef HAVE_SYNCFS
+ if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS)
+ {
+ DIR *dir;
+ struct dirent *de;
+
+ /*
+ * On Linux, we don't have to open every single file one by one. We
+ * can use syncfs() to sync whole filesystems. We only expect
+ * filesystem boundaries to exist where we tolerate symlinks, namely
+ * pg_wal and the tablespaces, so we call syncfs() for each of those
+ * directories.
+ */
+
+ /* Sync the top level pgdata directory. */
+ do_syncfs(".");
+ /* If any tablespaces are configured, sync each of those. */
+ dir = AllocateDir("pg_tblspc");
+ while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
+ {
+ char path[MAXPGPATH];
+
+ if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+ continue;
+
+ snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
+ do_syncfs(path);
+ }
+ FreeDir(dir);
+ /* If pg_wal is a symlink, process that too. */
+ if (xlog_is_symlink)
+ do_syncfs("pg_wal");
+ return;
+ }
+#endif /* HAVE_SYNCFS */
+
+ /*
+ * If possible, hint to the kernel that we're soon going to fsync the data
+ * directory and its contents. Errors in this step are even less
+ * interesting than normal, so log them only at DEBUG1.
+ */
+#ifdef PG_FLUSH_DATA_WORKS
+ walkdir(".", pre_sync_fname, false, DEBUG1);
+ if (xlog_is_symlink)
+ walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
+ walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
+#endif
+
+ /*
+ * Now we do the fsync()s in the same order.
+ *
+ * The main call ignores symlinks, so in addition to specially processing
+ * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
+ * process_symlinks = true. Note that if there are any plain directories
+ * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
+ * so we don't worry about optimizing it.
+ */
+ walkdir(".", datadir_fsync_fname, false, LOG);
+ if (xlog_is_symlink)
+ walkdir("pg_wal", datadir_fsync_fname, false, LOG);
+ walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
+}
+
+/*
+ * walkdir: recursively walk a directory, applying the action to each
+ * regular file and directory (including the named directory itself).
+ *
+ * If process_symlinks is true, the action and recursion are also applied
+ * to regular files and directories that are pointed to by symlinks in the
+ * given directory; otherwise symlinks are ignored. Symlinks are always
+ * ignored in subdirectories, ie we intentionally don't pass down the
+ * process_symlinks flag to recursive calls.
+ *
+ * Errors are reported at level elevel, which might be ERROR or less.
+ *
+ * See also walkdir in file_utils.c, which is a frontend version of this
+ * logic.
+ */
+static void
+walkdir(const char *path,
+ void (*action) (const char *fname, bool isdir, int elevel),
+ bool process_symlinks,
+ int elevel)
+{
+ DIR *dir;
+ struct dirent *de;
+
+ dir = AllocateDir(path);
+
+ while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
+ {
+ char subpath[MAXPGPATH * 2];
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (strcmp(de->d_name, ".") == 0 ||
+ strcmp(de->d_name, "..") == 0)
+ continue;
+
+ snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
+
+ switch (get_dirent_type(subpath, de, process_symlinks, elevel))
+ {
+ case PGFILETYPE_REG:
+ (*action) (subpath, false, elevel);
+ break;
+ case PGFILETYPE_DIR:
+ walkdir(subpath, action, false, elevel);
+ break;
+ default:
+
+ /*
+ * Errors are already reported directly by get_dirent_type(),
+ * and any remaining symlinks and unknown file types are
+ * ignored.
+ */
+ break;
+ }
+ }
+
+ FreeDir(dir); /* we ignore any error here */
+
+ /*
+ * It's important to fsync the destination directory itself as individual
+ * file fsyncs don't guarantee that the directory entry for the file is
+ * synced. However, skip this if AllocateDir failed; the action function
+ * might not be robust against that.
+ */
+ if (dir)
+ (*action) (path, true, elevel);
+}
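+
+/*
+ * Sketch of an action callback matching walkdir's expected signature
+ * (log_fname is hypothetical; the real callbacks in this file are
+ * pre_sync_fname, datadir_fsync_fname and unlink_if_exists_fname):
+ *
+ *    static void
+ *    log_fname(const char *fname, bool isdir, int elevel)
+ *    {
+ *        ereport(elevel,
+ *                (errmsg("visited %s \"%s\"",
+ *                        isdir ? "directory" : "file", fname)));
+ *    }
+ *
+ *    walkdir(".", log_fname, false, LOG);
+ */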
+
+
+/*
+ * Hint to the OS that it should get ready to fsync() this file.
+ *
+ * Ignores errors trying to open unreadable files, and logs other errors at a
+ * caller-specified level.
+ */
+#ifdef PG_FLUSH_DATA_WORKS
+
+static void
+pre_sync_fname(const char *fname, bool isdir, int elevel)
+{
+ int fd;
+
+ /* Don't try to flush directories; it'll likely just fail */
+ if (isdir)
+ return;
+
+ fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
+
+ if (fd < 0)
+ {
+ if (errno == EACCES)
+ return;
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", fname)));
+ return;
+ }
+
+ /*
+ * pg_flush_data() ignores errors, which is ok because this is only a
+ * hint.
+ */
+ pg_flush_data(fd, 0, 0);
+
+ if (CloseTransientFile(fd) != 0)
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", fname)));
+}
+
+#endif /* PG_FLUSH_DATA_WORKS */
+
+static void
+datadir_fsync_fname(const char *fname, bool isdir, int elevel)
+{
+ /*
+ * We want to silently ignore errors about unreadable files. Pass that
+ * desire on to fsync_fname_ext().
+ */
+ fsync_fname_ext(fname, isdir, true, elevel);
+}
+
+static void
+unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
+{
+ if (isdir)
+ {
+ if (rmdir(fname) != 0 && errno != ENOENT)
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not remove directory \"%s\": %m", fname)));
+ }
+ else
+ {
+ /* Use PathNameDeleteTemporaryFile to report filesize */
+ PathNameDeleteTemporaryFile(fname, false);
+ }
+}
+
+/*
+ * fsync_fname_ext -- Try to fsync a file or directory
+ *
+ * If ignore_perm is true, ignore errors upon trying to open unreadable
+ * files. Logs other errors at a caller-specified level.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise.
+ */
+int
+fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
+{
+ int fd;
+ int flags;
+ int returncode;
+
+ /*
+ * Some OSs require directories to be opened read-only whereas other
+ * systems don't allow us to fsync files opened read-only; so we need both
+ * cases here. Using O_RDWR will cause us to fail to fsync files that are
+ * not writable by our userid, but we assume that's OK.
+ */
+ flags = PG_BINARY;
+ if (!isdir)
+ flags |= O_RDWR;
+ else
+ flags |= O_RDONLY;
+
+ fd = OpenTransientFile(fname, flags);
+
+ /*
+ * Some OSs don't allow us to open directories at all (Windows returns
+	 * EACCES); just ignore the error in that case. If desired, also silently
+	 * ignore errors about unreadable files. Log others.
+ */
+ if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
+ return 0;
+ else if (fd < 0 && ignore_perm && errno == EACCES)
+ return 0;
+ else if (fd < 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", fname)));
+ return -1;
+ }
+
+ returncode = pg_fsync(fd);
+
+ /*
+ * Some OSes don't allow us to fsync directories at all, so we can ignore
+ * those errors. Anything else needs to be logged.
+ */
+ if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
+ {
+ int save_errno;
+
+ /* close file upon error, might not be in transaction context */
+ save_errno = errno;
+ (void) CloseTransientFile(fd);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", fname)));
+ return -1;
+ }
+
+ if (CloseTransientFile(fd) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", fname)));
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * fsync_parent_path -- fsync the parent path of a file or directory
+ *
+ * This is aimed at making file operations persistent on disk in case of
+ * an OS crash or power failure.
+ */
+static int
+fsync_parent_path(const char *fname, int elevel)
+{
+ char parentpath[MAXPGPATH];
+
+ strlcpy(parentpath, fname, MAXPGPATH);
+ get_parent_directory(parentpath);
+
+ /*
+ * get_parent_directory() returns an empty string if the input argument is
+ * just a file name (see comments in path.c), so handle that as being the
+ * current directory.
+ */
+ if (strlen(parentpath) == 0)
+ strlcpy(parentpath, ".", MAXPGPATH);
+
+ if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Create a PostgreSQL data sub-directory
+ *
+ * The data directory itself, and most of its sub-directories, are created at
+ * initdb time, but we do have some occasions when we create directories in
+ * the backend (CREATE TABLESPACE, for example). In those cases, we want to
+ * make sure that those directories are created consistently. Today, that means
+ * making sure that the created directory has the correct permissions, which is
+ * what pg_dir_create_mode tracks for us.
+ *
+ * Note that we also set the umask() based on what we understand the correct
+ * permissions to be (see file_perm.c).
+ *
+ * For permissions other than the default, mkdir() can be used directly, but
+ * be sure to consider carefully such cases -- a sub-directory with incorrect
+ * permissions in a PostgreSQL data directory could cause backups and other
+ * processes to fail.
+ */
+int
+MakePGDirectory(const char *directoryName)
+{
+ return mkdir(directoryName, pg_dir_create_mode);
+}
+
+/*
+ * Return the passed-in error level, or PANIC if data_sync_retry is off.
+ *
+ * Failure to fsync any data file is cause for immediate panic, unless
+ * data_sync_retry is enabled. Data may have been written to the operating
+ * system and removed from our buffer pool already, and if we are running on
+ * an operating system that forgets dirty data on write-back failure, there
+ * may be only one copy of the data remaining: in the WAL. A later attempt to
+ * fsync again might falsely report success. Therefore we must not allow any
+ * further checkpoints to be attempted. data_sync_retry can in theory be
+ * enabled on systems known not to drop dirty buffered data on write-back
+ * failure (with the likely outcome that checkpoints will continue to fail
+ * until the underlying problem is fixed).
+ *
+ * Any code that reports a failure from fsync() or related functions should
+ * filter the error level with this function.
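+ *
+ * For example, a typical fsync failure report might be wrapped like this
+ * (an illustrative sketch only; 'path' stands for the caller's file name):
+ *
+ *     ereport(data_sync_elevel(ERROR),
+ *             (errcode_for_file_access(),
+ *              errmsg("could not fsync file \"%s\": %m", path)));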
+ */
+int
+data_sync_elevel(int elevel)
+{
+ return data_sync_retry ? elevel : PANIC;
+}
+
+/*
+ * A convenience wrapper for pg_pwritev() that retries on partial write. If an
+ * error is returned, it is unspecified how much has been written.
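+ *
+ * A minimal usage sketch (the buffers, lengths, 'fd' and 'offset' here are
+ * hypothetical, and error handling is reduced to a bare check):
+ *
+ *     struct iovec iov[2];
+ *
+ *     iov[0].iov_base = hdr;
+ *     iov[0].iov_len = hdr_len;
+ *     iov[1].iov_base = payload;
+ *     iov[1].iov_len = payload_len;
+ *     if (pg_pwritev_with_retry(fd, iov, 2, offset) < 0)
+ *         elog(ERROR, "could not write file: %m");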
+ */
+ssize_t
+pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+ struct iovec iov_copy[PG_IOV_MAX];
+ ssize_t sum = 0;
+ ssize_t part;
+
+ /* We'd better have space to make a copy, in case we need to retry. */
+ if (iovcnt > PG_IOV_MAX)
+ {
+ errno = EINVAL;
+ return -1;
+ }
+
+ for (;;)
+ {
+ /* Write as much as we can. */
+ part = pg_pwritev(fd, iov, iovcnt, offset);
+ if (part < 0)
+ return -1;
+
+#ifdef SIMULATE_SHORT_WRITE
+ part = Min(part, 4096);
+#endif
+
+ /* Count our progress. */
+ sum += part;
+ offset += part;
+
+ /* Step over iovecs that are done. */
+ while (iovcnt > 0 && iov->iov_len <= part)
+ {
+ part -= iov->iov_len;
+ ++iov;
+ --iovcnt;
+ }
+
+ /* Are they all done? */
+ if (iovcnt == 0)
+ {
+ /* We don't expect the kernel to write more than requested. */
+ Assert(part == 0);
+ break;
+ }
+
+ /*
+ * Move whatever's left to the front of our mutable copy and adjust
+ * the leading iovec.
+ */
+ Assert(iovcnt > 0);
+ memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
+ Assert(iov->iov_len > part);
+ iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
+ iov_copy[0].iov_len -= part;
+ iov = iov_copy;
+ }
+
+ return sum;
+}
diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c
new file mode 100644
index 0000000..40c758d
--- /dev/null
+++ b/src/backend/storage/file/reinit.c
@@ -0,0 +1,410 @@
+/*-------------------------------------------------------------------------
+ *
+ * reinit.c
+ * Reinitialization of unlogged relations
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/file/reinit.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "common/relpath.h"
+#include "storage/copydir.h"
+#include "storage/fd.h"
+#include "storage/reinit.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+
+static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname,
+ int op);
+static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname,
+ int op);
+
+typedef struct
+{
+ Oid reloid; /* hash key */
+} unlogged_relation_entry;
+
+/*
+ * Reset unlogged relations from before the last restart.
+ *
+ * If op includes UNLOGGED_RELATION_CLEANUP, we remove all forks of any
+ * relation with an "init" fork, except for the "init" fork itself.
+ *
+ * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main
+ * fork.
+ */
+void
+ResetUnloggedRelations(int op)
+{
+ char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)];
+ DIR *spc_dir;
+ struct dirent *spc_de;
+ MemoryContext tmpctx,
+ oldctx;
+
+ /* Log it. */
+ elog(DEBUG1, "resetting unlogged relations: cleanup %d init %d",
+ (op & UNLOGGED_RELATION_CLEANUP) != 0,
+ (op & UNLOGGED_RELATION_INIT) != 0);
+
+ /*
+ * Just to be sure we don't leak any memory, let's create a temporary
+ * memory context for this operation.
+ */
+ tmpctx = AllocSetContextCreate(CurrentMemoryContext,
+ "ResetUnloggedRelations",
+ ALLOCSET_DEFAULT_SIZES);
+ oldctx = MemoryContextSwitchTo(tmpctx);
+
+ /*
+ * First process unlogged files in pg_default ($PGDATA/base)
+ */
+ ResetUnloggedRelationsInTablespaceDir("base", op);
+
+ /*
+ * Cycle through directories for all non-default tablespaces.
+ */
+ spc_dir = AllocateDir("pg_tblspc");
+
+ while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
+ {
+ if (strcmp(spc_de->d_name, ".") == 0 ||
+ strcmp(spc_de->d_name, "..") == 0)
+ continue;
+
+ snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
+ spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
+ ResetUnloggedRelationsInTablespaceDir(temp_path, op);
+ }
+
+ FreeDir(spc_dir);
+
+ /*
+ * Restore memory context.
+ */
+ MemoryContextSwitchTo(oldctx);
+ MemoryContextDelete(tmpctx);
+}
+
+/*
+ * Process one tablespace directory for ResetUnloggedRelations
+ */
+static void
+ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op)
+{
+ DIR *ts_dir;
+ struct dirent *de;
+ char dbspace_path[MAXPGPATH * 2];
+
+ ts_dir = AllocateDir(tsdirname);
+
+ /*
+ * If we get ENOENT on a tablespace directory, log it and return. This
+ * can happen if a previous DROP TABLESPACE crashed between removing the
+ * tablespace directory and removing the symlink in pg_tblspc. We don't
+ * really want to prevent database startup in that scenario, so let it
+ * pass instead. Any other type of error will be reported by ReadDir
+ * (causing a startup failure).
+ */
+ if (ts_dir == NULL && errno == ENOENT)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open directory \"%s\": %m",
+ tsdirname)));
+ return;
+ }
+
+ while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
+ {
+ /*
+ * We're only interested in the per-database directories, which have
+ * numeric names. Note that this code will also (properly) ignore "."
+ * and "..".
+ */
+ if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
+ continue;
+
+ snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
+ tsdirname, de->d_name);
+ ResetUnloggedRelationsInDbspaceDir(dbspace_path, op);
+ }
+
+ FreeDir(ts_dir);
+}
+
+/*
+ * Process one per-dbspace directory for ResetUnloggedRelations
+ */
+static void
+ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
+{
+ DIR *dbspace_dir;
+ struct dirent *de;
+ char rm_path[MAXPGPATH * 2];
+
+ /* Caller must specify at least one operation. */
+ Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0);
+
+ /*
+ * Cleanup is a two-pass operation. First, we go through and identify all
+ * the files with init forks. Then, we go through again and nuke
+ * everything with the same OID except the init fork.
+ */
+ if ((op & UNLOGGED_RELATION_CLEANUP) != 0)
+ {
+ HTAB *hash;
+ HASHCTL ctl;
+
+ /*
+ * It's possible that someone could create a ton of unlogged relations
+ * in the same database & tablespace, so we'd better use a hash table
+ * rather than an array or linked list to keep track of which files
+ * need to be reset. Otherwise, this cleanup operation would be
+ * O(n^2).
+ */
+ ctl.keysize = sizeof(Oid);
+ ctl.entrysize = sizeof(unlogged_relation_entry);
+ ctl.hcxt = CurrentMemoryContext;
+ hash = hash_create("unlogged relation OIDs", 32, &ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+ /* Scan the directory. */
+ dbspace_dir = AllocateDir(dbspacedirname);
+ while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
+ {
+ ForkNumber forkNum;
+ int oidchars;
+ unlogged_relation_entry ent;
+
+ /* Skip anything that doesn't look like a relation data file. */
+ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
+ &forkNum))
+ continue;
+
+ /* Also skip it unless this is the init fork. */
+ if (forkNum != INIT_FORKNUM)
+ continue;
+
+ /*
+ * Put the OID portion of the name into the hash table, if it
+ * isn't already.
+ */
+ ent.reloid = atooid(de->d_name);
+ (void) hash_search(hash, &ent, HASH_ENTER, NULL);
+ }
+
+ /* Done with the first pass. */
+ FreeDir(dbspace_dir);
+
+ /*
+ * If we didn't find any init forks, there's no point in continuing;
+ * we can bail out now.
+ */
+ if (hash_get_num_entries(hash) == 0)
+ {
+ hash_destroy(hash);
+ return;
+ }
+
+ /*
+ * Now, make a second pass and remove anything that matches.
+ */
+ dbspace_dir = AllocateDir(dbspacedirname);
+ while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
+ {
+ ForkNumber forkNum;
+ int oidchars;
+ unlogged_relation_entry ent;
+
+ /* Skip anything that doesn't look like a relation data file. */
+ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
+ &forkNum))
+ continue;
+
+ /* We never remove the init fork. */
+ if (forkNum == INIT_FORKNUM)
+ continue;
+
+ /*
+ * See whether the OID portion of the name shows up in the hash
+ * table. If so, nuke it!
+ */
+ ent.reloid = atooid(de->d_name);
+ if (hash_search(hash, &ent, HASH_FIND, NULL))
+ {
+ snprintf(rm_path, sizeof(rm_path), "%s/%s",
+ dbspacedirname, de->d_name);
+ if (unlink(rm_path) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m",
+ rm_path)));
+ else
+ elog(DEBUG2, "unlinked file \"%s\"", rm_path);
+ }
+ }
+
+ /* Cleanup is complete. */
+ FreeDir(dbspace_dir);
+ hash_destroy(hash);
+ }
+
+ /*
+ * Initialization happens after cleanup is complete: we copy each init
+ * fork file to the corresponding main fork file. Note that if we are
+ * asked to do both cleanup and init, we may never get here: if the
+ * cleanup code determines that there are no init forks in this dbspace,
+ * it will return before we get to this point.
+ */
+ if ((op & UNLOGGED_RELATION_INIT) != 0)
+ {
+ /* Scan the directory. */
+ dbspace_dir = AllocateDir(dbspacedirname);
+ while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
+ {
+ ForkNumber forkNum;
+ int oidchars;
+ char oidbuf[OIDCHARS + 1];
+ char srcpath[MAXPGPATH * 2];
+ char dstpath[MAXPGPATH];
+
+ /* Skip anything that doesn't look like a relation data file. */
+ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
+ &forkNum))
+ continue;
+
+ /* Also skip it unless this is the init fork. */
+ if (forkNum != INIT_FORKNUM)
+ continue;
+
+ /* Construct source pathname. */
+ snprintf(srcpath, sizeof(srcpath), "%s/%s",
+ dbspacedirname, de->d_name);
+
+ /* Construct destination pathname. */
+ memcpy(oidbuf, de->d_name, oidchars);
+ oidbuf[oidchars] = '\0';
+ snprintf(dstpath, sizeof(dstpath), "%s/%s%s",
+ dbspacedirname, oidbuf, de->d_name + oidchars + 1 +
+ strlen(forkNames[INIT_FORKNUM]));
+
+ /* OK, we're ready to perform the actual copy. */
+ elog(DEBUG2, "copying %s to %s", srcpath, dstpath);
+ copy_file(srcpath, dstpath);
+ }
+
+ FreeDir(dbspace_dir);
+
+ /*
+ * copy_file() above has already called pg_flush_data() on the files
+ * it created. Now we need to fsync those files, because a checkpoint
+ * won't do it for us while we're in recovery. We do this in a
+ * separate pass to allow the kernel to perform all the flushes
+ * (especially the metadata ones) at once.
+ */
+ dbspace_dir = AllocateDir(dbspacedirname);
+ while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
+ {
+ ForkNumber forkNum;
+ int oidchars;
+ char oidbuf[OIDCHARS + 1];
+ char mainpath[MAXPGPATH];
+
+ /* Skip anything that doesn't look like a relation data file. */
+ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
+ &forkNum))
+ continue;
+
+ /* Also skip it unless this is the init fork. */
+ if (forkNum != INIT_FORKNUM)
+ continue;
+
+ /* Construct main fork pathname. */
+ memcpy(oidbuf, de->d_name, oidchars);
+ oidbuf[oidchars] = '\0';
+ snprintf(mainpath, sizeof(mainpath), "%s/%s%s",
+ dbspacedirname, oidbuf, de->d_name + oidchars + 1 +
+ strlen(forkNames[INIT_FORKNUM]));
+
+ fsync_fname(mainpath, false);
+ }
+
+ FreeDir(dbspace_dir);
+
+ /*
+ * Lastly, fsync the database directory itself, ensuring the
+ * filesystem remembers the file creations and deletions we've done.
+ * We don't bother with this during a call that does only
+ * UNLOGGED_RELATION_CLEANUP, because if recovery crashes before we
+ * get to doing UNLOGGED_RELATION_INIT, we'll redo the cleanup step
+ * too at the next startup attempt.
+ */
+ fsync_fname(dbspacedirname, true);
+ }
+}
+
+/*
+ * Basic parsing of putative relation filenames.
+ *
+ * This function returns true if the file appears to be in the correct format
+ * for a non-temporary relation and false otherwise.
+ *
+ * NB: If this function returns true, the caller is entitled to assume that
+ * *oidchars has been set to a value no more than OIDCHARS, and thus
+ * that a buffer of OIDCHARS+1 characters is sufficient to hold the OID
+ * portion of the filename. This is critical to protect against a possible
+ * buffer overrun.
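+ *
+ * For illustration, names such as "12345", "12345_fsm", "12345_vm.2" and
+ * "12345.1" are accepted, while temporary-relation names such as "t3_12345"
+ * are rejected because they do not start with digits.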
+ */
+bool
+parse_filename_for_nontemp_relation(const char *name, int *oidchars,
+ ForkNumber *fork)
+{
+ int pos;
+
+ /* Look for a non-empty string of digits (that isn't too long). */
+ for (pos = 0; isdigit((unsigned char) name[pos]); ++pos)
+ ;
+ if (pos == 0 || pos > OIDCHARS)
+ return false;
+ *oidchars = pos;
+
+ /* Check for a fork name. */
+ if (name[pos] != '_')
+ *fork = MAIN_FORKNUM;
+ else
+ {
+ int forkchar;
+
+ forkchar = forkname_chars(&name[pos + 1], fork);
+ if (forkchar <= 0)
+ return false;
+ pos += forkchar + 1;
+ }
+
+ /* Check for a segment number. */
+ if (name[pos] == '.')
+ {
+ int segchar;
+
+ for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
+ ;
+ if (segchar <= 1)
+ return false;
+ pos += segchar;
+ }
+
+ /* Now we should be at the end. */
+ if (name[pos] != '\0')
+ return false;
+ return true;
+}
diff --git a/src/backend/storage/file/sharedfileset.c b/src/backend/storage/file/sharedfileset.c
new file mode 100644
index 0000000..ed37c94
--- /dev/null
+++ b/src/backend/storage/file/sharedfileset.c
@@ -0,0 +1,354 @@
+/*-------------------------------------------------------------------------
+ *
+ * sharedfileset.c
+ * Shared temporary file management.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/file/sharedfileset.c
+ *
+ * SharedFileSets provide a temporary namespace (think directory) so that
+ * files can be discovered by name, and shared ownership semantics so that
+ * shared files survive until the last user detaches.
+ *
+ * SharedFileSets can be used by backends when the temporary files need to be
+ * opened/closed multiple times and the underlying files need to survive across
+ * transactions.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "catalog/pg_tablespace.h"
+#include "commands/tablespace.h"
+#include "common/hashfn.h"
+#include "miscadmin.h"
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "storage/sharedfileset.h"
+#include "utils/builtins.h"
+
+static List *filesetlist = NIL;
+
+static void SharedFileSetOnDetach(dsm_segment *segment, Datum datum);
+static void SharedFileSetDeleteOnProcExit(int status, Datum arg);
+static void SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace);
+static void SharedFilePath(char *path, SharedFileSet *fileset, const char *name);
+static Oid ChooseTablespace(const SharedFileSet *fileset, const char *name);
+
+/*
+ * Initialize a space for temporary files that can be opened by other backends.
+ * Other backends must attach to it before accessing it. Associate this
+ * SharedFileSet with 'seg'. Any contained files will be deleted when the
+ * last backend detaches.
+ *
+ * We can also use this interface if the temporary files are used only by a
+ * single backend but need to be opened and closed multiple times and must
+ * survive across transactions. In such cases, pass NULL for the dsm segment
+ * 'seg'. Callers are then expected to remove such files explicitly, using
+ * SharedFileSetDelete/SharedFileSetDeleteAll; otherwise they are removed on
+ * process exit.
+ *
+ * Files will be distributed over the tablespaces configured in
+ * temp_tablespaces.
+ *
+ * Under the covers the set is one or more directories which will eventually
+ * be deleted.
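+ *
+ * As a rough usage sketch for the non-dsm case (illustrative only; the
+ * fileset and the file name are made up, and the fileset must live somewhere
+ * that survives as long as its files, e.g. a long-lived memory context):
+ *
+ *     SharedFileSetInit(fileset, NULL);
+ *     file = SharedFileSetCreate(fileset, "somename");
+ *     ... read/write 'file' with the fd.c File routines ...
+ *     SharedFileSetDeleteAll(fileset);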
+ */
+void
+SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg)
+{
+ static uint32 counter = 0;
+
+ SpinLockInit(&fileset->mutex);
+ fileset->refcnt = 1;
+ fileset->creator_pid = MyProcPid;
+ fileset->number = counter;
+ counter = (counter + 1) % INT_MAX;
+
+ /* Capture the tablespace OIDs so that all backends agree on them. */
+ PrepareTempTablespaces();
+ fileset->ntablespaces =
+ GetTempTablespaces(&fileset->tablespaces[0],
+ lengthof(fileset->tablespaces));
+ if (fileset->ntablespaces == 0)
+ {
+ /* If the GUC is empty, use current database's default tablespace */
+ fileset->tablespaces[0] = MyDatabaseTableSpace;
+ fileset->ntablespaces = 1;
+ }
+ else
+ {
+ int i;
+
+ /*
+ * An entry of InvalidOid means use the default tablespace for the
+ * current database. Replace that now, to be sure that all users of
+ * the SharedFileSet agree on what to do.
+ */
+ for (i = 0; i < fileset->ntablespaces; i++)
+ {
+ if (fileset->tablespaces[i] == InvalidOid)
+ fileset->tablespaces[i] = MyDatabaseTableSpace;
+ }
+ }
+
+ /* Register our cleanup callback. */
+ if (seg)
+ on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
+ else
+ {
+ static bool registered_cleanup = false;
+
+ if (!registered_cleanup)
+ {
+ /*
+ * We must not have registered any fileset before registering the
+			 * fileset cleanup callback.
+ */
+ Assert(filesetlist == NIL);
+ on_proc_exit(SharedFileSetDeleteOnProcExit, 0);
+ registered_cleanup = true;
+ }
+
+ filesetlist = lcons((void *) fileset, filesetlist);
+ }
+}
+
+/*
+ * Attach to a set of directories that was created with SharedFileSetInit.
+ */
+void
+SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg)
+{
+ bool success;
+
+ SpinLockAcquire(&fileset->mutex);
+ if (fileset->refcnt == 0)
+ success = false;
+ else
+ {
+ ++fileset->refcnt;
+ success = true;
+ }
+ SpinLockRelease(&fileset->mutex);
+
+ if (!success)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not attach to a SharedFileSet that is already destroyed")));
+
+ /* Register our cleanup callback. */
+ on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
+}
+
+/*
+ * Create a new file in the given set.
+ */
+File
+SharedFileSetCreate(SharedFileSet *fileset, const char *name)
+{
+ char path[MAXPGPATH];
+ File file;
+
+ SharedFilePath(path, fileset, name);
+ file = PathNameCreateTemporaryFile(path, false);
+
+ /* If we failed, see if we need to create the directory on demand. */
+ if (file <= 0)
+ {
+ char tempdirpath[MAXPGPATH];
+ char filesetpath[MAXPGPATH];
+ Oid tablespace = ChooseTablespace(fileset, name);
+
+ TempTablespacePath(tempdirpath, tablespace);
+ SharedFileSetPath(filesetpath, fileset, tablespace);
+ PathNameCreateTemporaryDir(tempdirpath, filesetpath);
+ file = PathNameCreateTemporaryFile(path, true);
+ }
+
+ return file;
+}
+
+/*
+ * Open a file that was created with SharedFileSetCreate(), possibly in
+ * another backend.
+ */
+File
+SharedFileSetOpen(SharedFileSet *fileset, const char *name, int mode)
+{
+ char path[MAXPGPATH];
+ File file;
+
+ SharedFilePath(path, fileset, name);
+ file = PathNameOpenTemporaryFile(path, mode);
+
+ return file;
+}
+
+/*
+ * Delete a file that was created with SharedFileSetCreate().
+ * Return true if the file existed, false if it didn't.
+ */
+bool
+SharedFileSetDelete(SharedFileSet *fileset, const char *name,
+ bool error_on_failure)
+{
+ char path[MAXPGPATH];
+
+ SharedFilePath(path, fileset, name);
+
+ return PathNameDeleteTemporaryFile(path, error_on_failure);
+}
+
+/*
+ * Delete all files in the set.
+ */
+void
+SharedFileSetDeleteAll(SharedFileSet *fileset)
+{
+ char dirpath[MAXPGPATH];
+ int i;
+
+ /*
+	 * Delete the directory we created in each tablespace. This doesn't fail,
+	 * because we use it in error cleanup paths, but it can generate a LOG
+	 * message on I/O error.
+ */
+ for (i = 0; i < fileset->ntablespaces; ++i)
+ {
+ SharedFileSetPath(dirpath, fileset, fileset->tablespaces[i]);
+ PathNameDeleteTemporaryDir(dirpath);
+ }
+
+ /* Unregister the shared fileset */
+ SharedFileSetUnregister(fileset);
+}
+
+/*
+ * Callback function that will be invoked when this backend detaches from a
+ * DSM segment holding a SharedFileSet that it has created or attached to. If
+ * we are the last to detach, then try to remove the directories and
+ * everything in them. We can't raise an error on failures, because this runs
+ * in error cleanup paths.
+ */
+static void
+SharedFileSetOnDetach(dsm_segment *segment, Datum datum)
+{
+ bool unlink_all = false;
+ SharedFileSet *fileset = (SharedFileSet *) DatumGetPointer(datum);
+
+ SpinLockAcquire(&fileset->mutex);
+ Assert(fileset->refcnt > 0);
+ if (--fileset->refcnt == 0)
+ unlink_all = true;
+ SpinLockRelease(&fileset->mutex);
+
+ /*
+ * If we are the last to detach, we delete the directory in all
+ * tablespaces. Note that we are still actually attached for the rest of
+ * this function so we can safely access its data.
+ */
+ if (unlink_all)
+ SharedFileSetDeleteAll(fileset);
+}
+
+/*
+ * Callback function that will be invoked on process exit. This will
+ * process the list of all the registered sharedfilesets and delete the
+ * underlying files.
+ */
+static void
+SharedFileSetDeleteOnProcExit(int status, Datum arg)
+{
+ /*
+	 * Remove all the pending shared fileset entries. We don't use foreach()
+	 * here, because SharedFileSetDeleteAll removes the current element from
+	 * filesetlist (via SharedFileSetUnregister). foreach_delete_current()
+	 * would not help either: it could only fix up the state of this loop,
+	 * while the actual list deletion happens inside SharedFileSetUnregister.
+ */
+ while (list_length(filesetlist) > 0)
+ {
+ SharedFileSet *fileset = (SharedFileSet *) linitial(filesetlist);
+
+ SharedFileSetDeleteAll(fileset);
+ }
+
+ filesetlist = NIL;
+}
+
+/*
+ * Unregister the shared fileset entry registered for cleanup on proc exit.
+ */
+void
+SharedFileSetUnregister(SharedFileSet *input_fileset)
+{
+ ListCell *l;
+
+ /*
+	 * If the caller is using dsm-based cleanup, we don't maintain the
+	 * filesetlist, so there is nothing to unregister here.
+ */
+ if (filesetlist == NIL)
+ return;
+
+ foreach(l, filesetlist)
+ {
+ SharedFileSet *fileset = (SharedFileSet *) lfirst(l);
+
+ /* Remove the entry from the list */
+ if (input_fileset == fileset)
+ {
+ filesetlist = foreach_delete_current(filesetlist, l);
+ return;
+ }
+ }
+
+ /* Should have found a match */
+ Assert(false);
+}
+
+/*
+ * Build the path for the directory holding the files backing a SharedFileSet
+ * in a given tablespace.
+ */
+static void
+SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace)
+{
+ char tempdirpath[MAXPGPATH];
+
+ TempTablespacePath(tempdirpath, tablespace);
+ snprintf(path, MAXPGPATH, "%s/%s%lu.%u.sharedfileset",
+ tempdirpath, PG_TEMP_FILE_PREFIX,
+ (unsigned long) fileset->creator_pid, fileset->number);
+}
+
+/*
+ * Sorting hat to determine which tablespace a given shared temporary file
+ * belongs in.
+ */
+static Oid
+ChooseTablespace(const SharedFileSet *fileset, const char *name)
+{
+ uint32 hash = hash_any((const unsigned char *) name, strlen(name));
+
+ return fileset->tablespaces[hash % fileset->ntablespaces];
+}
+
+/*
+ * Compute the full path of a file in a SharedFileSet.
+ */
+static void
+SharedFilePath(char *path, SharedFileSet *fileset, const char *name)
+{
+ char dirpath[MAXPGPATH];
+
+ SharedFileSetPath(dirpath, fileset, ChooseTablespace(fileset, name));
+ snprintf(path, MAXPGPATH, "%s/%s", dirpath, name);
+}
diff --git a/src/backend/storage/freespace/Makefile b/src/backend/storage/freespace/Makefile
new file mode 100644
index 0000000..ac0fa8b
--- /dev/null
+++ b/src/backend/storage/freespace/Makefile
@@ -0,0 +1,20 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/freespace
+#
+# IDENTIFICATION
+# src/backend/storage/freespace/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/freespace
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ freespace.o \
+ fsmpage.o \
+ indexfsm.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README
new file mode 100644
index 0000000..e7ff23b
--- /dev/null
+++ b/src/backend/storage/freespace/README
@@ -0,0 +1,196 @@
+src/backend/storage/freespace/README
+
+Free Space Map
+--------------
+
+The purpose of the free space map is to quickly locate a page with enough
+free space to hold a tuple to be stored; or to determine that no such page
+exists and the relation must be extended by one page. As of PostgreSQL 8.4
+each relation has its own, extensible free space map stored in a separate
+"fork" of its relation. This eliminates the disadvantages of the former
+fixed-size FSM.
+
+It is important to keep the map small so that it can be searched rapidly.
+Therefore, we don't attempt to record the exact free space on a page.
+We allocate one map byte to each page, allowing us to record free space
+at a granularity of 1/256th of a page. Another way to say it is that
+the stored value is the free space divided by BLCKSZ/256 (rounding down).
+We assume that the free space must always be less than BLCKSZ, since
+all pages have some overhead; so the maximum map value is 255.
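+For example, with the default BLCKSZ of 8192, each map byte counts free space
+in units of 32 bytes, so a page with 1000 bytes free is recorded as 31.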
+
+To assist in fast searching, the map isn't simply an array of per-page
+entries, but has a tree structure above those entries. There is a tree
+structure of pages, and a tree structure within each page, as described
+below.
+
+FSM page structure
+------------------
+
+Within each FSM page, we use a binary tree structure where leaf nodes store
+the amount of free space on heap pages (or lower level FSM pages, see
+"Higher-level structure" below), with one leaf node per heap page. A non-leaf
+node stores the max amount of free space on any of its children.
+
+For example:
+
+ 4
+ 4 2
+3 4 0 2 <- This level represents heap pages
+
+We need two basic operations: search and update.
+
+To search for a page with X amount of free space, traverse down the tree
+along a path where each node's value n >= X, until you hit the bottom. If
+both children of a node satisfy the condition, you can pick either one
+arbitrarily.
+
+To update the amount of free space on a page to X, first update the leaf node
+corresponding to the heap page, then "bubble up" the change to upper nodes,
+by walking up to each parent and recomputing its value as the max of its
+two children. Repeat until reaching the root or a parent whose value
+doesn't change.
+
+This data structure has a couple of nice properties:
+- to discover that there is no page with X bytes of free space, you only
+ need to look at the root node
+- by varying which child we traverse to when the search has a choice, we can
+  implement various strategies, like preferring pages closer to a given page,
+  or spreading the load across the table.
+
+Higher-level routines that use FSM pages access them through the fsm_set_avail()
+and fsm_search_avail() functions. The interface to those functions hides the
+page's internal tree structure, treating the FSM page as a black box that has
+a certain number of "slots" for storing free space information. (However,
+the higher routines have to be aware of the tree structure of the whole map.)
+
+The binary tree is stored on each FSM page as an array. Because the page
+header takes some space on a page, the binary tree isn't perfect. That is,
+a few right-most leaf nodes are missing, and there are some useless non-leaf
+nodes at the right. So the tree looks something like this:
+
+ 0
+ 1 2
+ 3 4 5 6
+7 8 9 A B
+
+where the numbers denote each node's position in the array. Note that the
+tree is guaranteed complete above the leaf level; only some leaf nodes are
+missing. This is reflected in the number of usable "slots" per page not
+being an exact power of 2.
+
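+As an illustration only (nodes[], nodeno and value are made-up names, and the
+real code in fsmpage.c also has to cope with the page header and the missing
+right-most leaves), the "bubble up" update on this implicit binary tree boils
+down to:
+
+    nodes[nodeno] = value;
+    while (nodeno > 0)
+    {
+        int     parent = (nodeno - 1) / 2;
+        uint8   lchild = nodes[2 * parent + 1];
+        uint8   rchild = nodes[2 * parent + 2];
+        uint8   newmax = (lchild > rchild) ? lchild : rchild;
+
+        if (nodes[parent] == newmax)
+            break;              /* no further change to propagate */
+        nodes[parent] = newmax;
+        nodeno = parent;
+    }
+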
+A FSM page also has a next slot pointer, fp_next_slot, that determines where
+to start the next search for free space within that page. The reason for that
+is to spread out the pages that are returned by FSM searches. When several
+backends are concurrently inserting into a relation, contention can be avoided
+by having them insert into different pages. But it is also desirable to fill
+up pages in sequential order, to get the benefit of OS prefetching and batched
+writes. The FSM is responsible for making that happen, and the next slot
+pointer helps provide the desired behavior.
+
+Higher-level structure
+----------------------
+
+To scale up the data structure described above beyond a single page, we
+maintain a similar tree-structure across pages. Leaf nodes in higher level
+pages correspond to lower level FSM pages. The root node within each page
+has the same value as the corresponding leaf node on its parent page.
+
+The root page is always stored at physical block 0.
+
+For example, assuming each FSM page can hold information about 4 pages (in
+reality, it holds (BLCKSZ - headers) / 2, or ~4000 with default BLCKSZ),
+we get a disk layout like this:
+
+ 0 <-- page 0 at level 2 (root page)
+ 0 <-- page 0 at level 1
+ 0 <-- page 0 at level 0
+ 1 <-- page 1 at level 0
+ 2 <-- ...
+ 3
+ 1 <-- page 1 at level 1
+ 4
+ 5
+ 6
+ 7
+ 2
+ 8
+ 9
+ 10
+ 11
+ 3
+ 12
+ 13
+ 14
+ 15
+
+where the numbers are page numbers *at that level*, starting from 0.
+
+To find the physical block # corresponding to leaf page n, we need to
+count the number of leaf and upper-level pages preceding page n.
+This turns out to be
+
+y = n + (n / F + 1) + (n / F^2 + 1) + ... + 1
+
+where F is the fanout (4 in the above example). The first term n is the number
+of preceding leaf pages, the second term is the number of pages at level 1,
+and so forth.
+
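+As a worked example with the toy fanout F = 4 from the layout above, leaf
+page n = 5 lives at physical block
+
+    y = 5 + (5/4 + 1) + (5/16 + 1) = 5 + 2 + 1 = 8    (integer division)
+
+which is indeed where "page 5 at level 0" appears in the layout.
+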
+To keep things simple, the tree is always constant height. To cover the
+maximum relation size of 2^32-1 blocks, three levels is enough with the default
+BLCKSZ (4000^3 > 2^32).
+
+Addressing
+----------
+
+The higher-level routines operate on "logical" addresses, consisting of
+- level,
+- logical page number, and
+- slot (if applicable)
+
+Bottom level FSM pages have level of 0, the level above that 1, and root 2.
+As in the diagram above, logical page number is the page number at that level,
+starting from 0.
+
+Locking
+-------
+
+When traversing down to search for free space, only one page is locked at a
+time: the parent page is released before locking the child. If the child page
+is concurrently modified, and there no longer is free space on the child page
+when you land on it, you need to start from scratch (after correcting the
+parent page, so that you don't get into an infinite loop).
+
+We use shared buffer locks when searching, but an exclusive buffer lock when
+updating a page. However, the next slot search pointer is updated during
+searches even though we have only a shared lock. fp_next_slot is just a hint
+and we can easily reset it if it gets corrupted; so it seems better to accept
+some risk of that type than to pay the overhead of exclusive locking.
+
+Recovery
+--------
+
+The FSM is not explicitly WAL-logged. Instead, we rely on a bunch of
+self-correcting measures to repair possible corruption. As a result, when
+we write to the FSM we treat that as a hint and thus use MarkBufferDirtyHint()
+rather than MarkBufferDirty().
+
+First of all, whenever a value is set on an FSM page, the root node of the
+page is compared against the new value after the change has been bubbled up.
+It should be greater than or equal to the value just set; otherwise we have a
+corrupted page, with a parent somewhere holding too small a value.
+Secondly, we can also detect corrupted pages while searching, as we traverse
+down the tree: that check will notice if a parent node is set to too high a
+value. In both cases, the upper nodes on the page are immediately rebuilt,
+fixing the corruption so far as that page is concerned.
+
+VACUUM updates all the bottom-level FSM pages with the correct amount of free
+space on corresponding heap pages, as it proceeds through the heap. This
+goes through fsm_set_avail(), so that the upper nodes on those pages are
+immediately updated. Periodically, VACUUM calls FreeSpaceMapVacuum[Range]
+to propagate the new free-space info into the upper pages of the FSM tree.
+
+TODO
+----
+
+- fastroot to avoid traversing upper nodes with just 1 child
+- use a different system for tables that fit into one FSM page, with a
+ mechanism to switch to the real thing as it grows.
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
new file mode 100644
index 0000000..8c12dda
--- /dev/null
+++ b/src/backend/storage/freespace/freespace.c
@@ -0,0 +1,893 @@
+/*-------------------------------------------------------------------------
+ *
+ * freespace.c
+ * POSTGRES free space map for quickly finding free space in relations
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/freespace/freespace.c
+ *
+ *
+ * NOTES:
+ *
+ * Free Space Map keeps track of the amount of free space on pages, and
+ * allows quickly searching for a page with enough free space. The FSM is
+ * stored in a dedicated relation fork of all heap relations, and those
+ * index access methods that need it (see also indexfsm.c). See README for
+ * more information.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "storage/freespace.h"
+#include "storage/fsm_internals.h"
+#include "storage/lmgr.h"
+#include "storage/smgr.h"
+
+
+/*
+ * We use just one byte to store the amount of free space on a page, so we
+ * divide the amount of free space a page can have into 256 different
+ * categories. The highest category, 255, represents a page with at least
+ * MaxFSMRequestSize bytes of free space, and the second highest category
+ * represents the range from 254 * FSM_CAT_STEP, inclusive, to
+ * MaxFSMRequestSize, exclusive.
+ *
+ * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
+ * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
+ * categories look like this:
+ *
+ *
+ * Range Category
+ * 0 - 31 0
+ * 32 - 63 1
+ * ... ... ...
+ * 8096 - 8127 253
+ * 8128 - 8163 254
+ * 8164 - 8192 255
+ *
+ * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
+ * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
+ * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
+ * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
+ * completely empty page, that would mean that we could never satisfy a
+ * request of exactly MaxFSMRequestSize bytes.
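+ *
+ * As a small worked example with the default BLCKSZ (FSM_CAT_STEP = 32): a
+ * request for 100 bytes rounds up to category 4 (see
+ * fsm_space_needed_to_cat), so the search only returns pages recorded as
+ * category 4 or higher, i.e. pages with at least 128 bytes free.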
+ */
+#define FSM_CATEGORIES 256
+#define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES)
+#define MaxFSMRequestSize MaxHeapTupleSize
+
+/*
+ * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
+ * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
+ * 216 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
+ * this means that 4096 bytes is the smallest BLCKSZ that we can get away
+ * this means that 4096 bytes is the smallest BLCKSZ for which a 3-level
+ * tree suffices, and 512 is the smallest BLCKSZ we support at all.
+#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
+
+#define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1)
+#define FSM_BOTTOM_LEVEL 0
+
+/*
+ * The internal FSM routines work on a logical addressing scheme. Each
+ * level of the tree can be thought of as a separately addressable file.
+ */
+typedef struct
+{
+ int level; /* level */
+ int logpageno; /* page number within the level */
+} FSMAddress;
+
+/* Address of the root page. */
+static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
+
+/* functions to navigate the tree */
+static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
+static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
+static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
+static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
+static BlockNumber fsm_logical_to_physical(FSMAddress addr);
+
+static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
+static void fsm_extend(Relation rel, BlockNumber fsm_nblocks);
+
+/* functions to convert amount of free space to a FSM category */
+static uint8 fsm_space_avail_to_cat(Size avail);
+static uint8 fsm_space_needed_to_cat(Size needed);
+static Size fsm_space_cat_to_avail(uint8 cat);
+
+/* workhorse functions for various operations */
+static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
+ uint8 newValue, uint8 minValue);
+static BlockNumber fsm_search(Relation rel, uint8 min_cat);
+static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
+ BlockNumber start, BlockNumber end,
+ bool *eof);
+
+
+/******** Public API ********/
+
+/*
+ * GetPageWithFreeSpace - try to find a page in the given relation with
+ * at least the specified amount of free space.
+ *
+ * If successful, return the block number; if not, return InvalidBlockNumber.
+ *
+ * The caller must be prepared for the possibility that the returned page
+ * will turn out to have too little space available by the time the caller
+ * gets a lock on it. In that case, the caller should report the actual
+ * amount of free space available on that page and then try again (see
+ * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
+ * extend the relation.
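+ *
+ * A caller's loop might therefore look roughly like this (an illustrative
+ * sketch only; locking and the actual free-space measurement are elided):
+ *
+ *     blkno = GetPageWithFreeSpace(rel, needed);
+ *     while (blkno != InvalidBlockNumber)
+ *     {
+ *         ... lock blkno and measure its actual free space 'avail' ...
+ *         if (avail >= needed)
+ *             break;          /* use this page */
+ *         blkno = RecordAndGetPageWithFreeSpace(rel, blkno, avail, needed);
+ *     }
+ *     if (blkno == InvalidBlockNumber)
+ *         ... extend the relation ...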
+ */
+BlockNumber
+GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
+{
+ uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
+
+ return fsm_search(rel, min_cat);
+}
+
+/*
+ * RecordAndGetPageWithFreeSpace - update info about a page and try again.
+ *
+ * We provide this combo form to save some locking overhead, compared to
+ * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
+ * also some effort to return a page close to the old page; if there's a
+ * page with enough free space on the same FSM page where the old one page
+ * page with enough free space on the same FSM page where the old page
+ */
+BlockNumber
+RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
+ Size oldSpaceAvail, Size spaceNeeded)
+{
+ int old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
+ int search_cat = fsm_space_needed_to_cat(spaceNeeded);
+ FSMAddress addr;
+ uint16 slot;
+ int search_slot;
+
+ /* Get the location of the FSM byte representing the heap block */
+ addr = fsm_get_location(oldPage, &slot);
+
+ search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
+
+ /*
+ * If fsm_set_and_search found a suitable new block, return that.
+ * Otherwise, search as usual.
+ */
+ if (search_slot != -1)
+ return fsm_get_heap_blk(addr, search_slot);
+ else
+ return fsm_search(rel, search_cat);
+}
+
+/*
+ * RecordPageWithFreeSpace - update info about a page.
+ *
+ * Note that if the new spaceAvail value is higher than the old value stored
+ * in the FSM, the space might not become visible to searchers until the next
+ * FreeSpaceMapVacuum call, which updates the upper level pages.
+ */
+void
+RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
+{
+ int new_cat = fsm_space_avail_to_cat(spaceAvail);
+ FSMAddress addr;
+ uint16 slot;
+
+ /* Get the location of the FSM byte representing the heap block */
+ addr = fsm_get_location(heapBlk, &slot);
+
+ fsm_set_and_search(rel, addr, slot, new_cat, 0);
+}
+
+/*
+ * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
+ * WAL replay
+ */
+void
+XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
+ Size spaceAvail)
+{
+ int new_cat = fsm_space_avail_to_cat(spaceAvail);
+ FSMAddress addr;
+ uint16 slot;
+ BlockNumber blkno;
+ Buffer buf;
+ Page page;
+
+ /* Get the location of the FSM byte representing the heap block */
+ addr = fsm_get_location(heapBlk, &slot);
+ blkno = fsm_logical_to_physical(addr);
+
+ /* If the page doesn't exist already, extend */
+ buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR);
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+ page = BufferGetPage(buf);
+ if (PageIsNew(page))
+ PageInit(page, BLCKSZ, 0);
+
+ if (fsm_set_avail(page, slot, new_cat))
+ MarkBufferDirtyHint(buf, false);
+ UnlockReleaseBuffer(buf);
+}
+
+/*
+ * GetRecordedFreeSpace - return the amount of free space on a particular page,
+ * according to the FSM.
+ */
+Size
+GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
+{
+ FSMAddress addr;
+ uint16 slot;
+ Buffer buf;
+ uint8 cat;
+
+ /* Get the location of the FSM byte representing the heap block */
+ addr = fsm_get_location(heapBlk, &slot);
+
+ buf = fsm_readbuf(rel, addr, false);
+ if (!BufferIsValid(buf))
+ return 0;
+ cat = fsm_get_avail(BufferGetPage(buf), slot);
+ ReleaseBuffer(buf);
+
+ return fsm_space_cat_to_avail(cat);
+}
+
+/*
+ * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation.
+ *
+ * nblocks is the new size of the heap.
+ *
+ * Return the number of blocks of new FSM.
+ * If it's InvalidBlockNumber, there is nothing to truncate;
+ * otherwise the caller is responsible for calling smgrtruncate()
+ * to truncate the FSM pages, and FreeSpaceMapVacuumRange()
+ * to update upper-level pages in the FSM.
+ */
+BlockNumber
+FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
+{
+ BlockNumber new_nfsmblocks;
+ FSMAddress first_removed_address;
+ uint16 first_removed_slot;
+ Buffer buf;
+
+ RelationOpenSmgr(rel);
+
+ /*
+ * If no FSM has been created yet for this relation, there's nothing to
+ * truncate.
+ */
+ if (!smgrexists(rel->rd_smgr, FSM_FORKNUM))
+ return InvalidBlockNumber;
+
+ /* Get the location in the FSM of the first removed heap block */
+ first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
+
+ /*
+ * Zero out the tail of the last remaining FSM page. If the slot
+	 * representing the first removed heap block is at a page boundary (i.e.,
+	 * it is the first slot on the FSM page that first_removed_address points
+	 * to), we can
+ * just truncate that page altogether.
+ */
+ if (first_removed_slot > 0)
+ {
+ buf = fsm_readbuf(rel, first_removed_address, false);
+ if (!BufferIsValid(buf))
+ return InvalidBlockNumber; /* nothing to do; the FSM was already
+ * smaller */
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+ /* NO EREPORT(ERROR) from here till changes are logged */
+ START_CRIT_SECTION();
+
+ fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
+
+ /*
+		 * Truncation of a relation is WAL-logged at a higher level, and we
+		 * will be called during WAL replay. But if checksums are enabled, we need
+ * to still write a WAL record to protect against a torn page, if the
+ * page is flushed to disk before the truncation WAL record. We cannot
+ * use MarkBufferDirtyHint here, because that will not dirty the page
+ * during recovery.
+ */
+ MarkBufferDirty(buf);
+ if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
+ log_newpage_buffer(buf, false);
+
+ END_CRIT_SECTION();
+
+ UnlockReleaseBuffer(buf);
+
+ new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
+ }
+ else
+ {
+ new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
+ if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
+ return InvalidBlockNumber; /* nothing to do; the FSM was already
+ * smaller */
+ }
+
+ return new_nfsmblocks;
+}
+
+/*
+ * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM
+ *
+ * We assume that the bottom-level pages have already been updated with
+ * new free-space information.
+ */
+void
+FreeSpaceMapVacuum(Relation rel)
+{
+ bool dummy;
+
+ /* Recursively scan the tree, starting at the root */
+ (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS,
+ (BlockNumber) 0, InvalidBlockNumber,
+ &dummy);
+}
+
+/*
+ * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM
+ *
+ * As above, but assume that only heap pages between start and end-1 inclusive
+ * have new free-space information, so update only the upper-level slots
+ * covering that block range. end == InvalidBlockNumber is equivalent to
+ * "all the rest of the relation".
+ */
+void
+FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
+{
+ bool dummy;
+
+ /* Recursively scan the tree, starting at the root */
+ if (end > start)
+ (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
+}
+
+/******** Internal routines ********/
+
+/*
+ * Return the category corresponding to x bytes of free space
+ */
+static uint8
+fsm_space_avail_to_cat(Size avail)
+{
+ int cat;
+
+ Assert(avail < BLCKSZ);
+
+ if (avail >= MaxFSMRequestSize)
+ return 255;
+
+ cat = avail / FSM_CAT_STEP;
+
+ /*
+ * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
+ * more.
+ */
+ if (cat > 254)
+ cat = 254;
+
+ return (uint8) cat;
+}
+
+/*
+ * Return the lower bound of the range of free space represented by a given
+ * category.
+ */
+static Size
+fsm_space_cat_to_avail(uint8 cat)
+{
+ /* The highest category represents exactly MaxFSMRequestSize bytes. */
+ if (cat == 255)
+ return MaxFSMRequestSize;
+ else
+ return cat * FSM_CAT_STEP;
+}
+
+/*
+ * Which category does a page need to have, to accommodate x bytes of data?
+ * While fsm_space_avail_to_cat() rounds down, this needs to round up.
+ */
+static uint8
+fsm_space_needed_to_cat(Size needed)
+{
+ int cat;
+
+ /* Can't ask for more space than the highest category represents */
+ if (needed > MaxFSMRequestSize)
+ elog(ERROR, "invalid FSM request size %zu", needed);
+
+ if (needed == 0)
+ return 1;
+
+ cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
+
+ if (cat > 255)
+ cat = 255;
+
+ return (uint8) cat;
+}
+
+/*
+ * Returns the physical block number of a FSM page
+ */
+static BlockNumber
+fsm_logical_to_physical(FSMAddress addr)
+{
+ BlockNumber pages;
+ int leafno;
+ int l;
+
+ /*
+ * Calculate the logical page number of the first leaf page below the
+ * given page.
+ */
+ leafno = addr.logpageno;
+ for (l = 0; l < addr.level; l++)
+ leafno *= SlotsPerFSMPage;
+
+ /* Count upper level nodes required to address the leaf page */
+ pages = 0;
+ for (l = 0; l < FSM_TREE_DEPTH; l++)
+ {
+ pages += leafno + 1;
+ leafno /= SlotsPerFSMPage;
+ }
+
+ /*
+ * If the page we were asked for wasn't at the bottom level, subtract the
+ * additional lower level pages we counted above.
+ */
+ pages -= addr.level;
+
+ /* Turn the page count into 0-based block number */
+ return pages - 1;
+}
+
+/*
+ * Return the FSM location corresponding to given heap block.
+ */
+static FSMAddress
+fsm_get_location(BlockNumber heapblk, uint16 *slot)
+{
+ FSMAddress addr;
+
+ addr.level = FSM_BOTTOM_LEVEL;
+ addr.logpageno = heapblk / SlotsPerFSMPage;
+ *slot = heapblk % SlotsPerFSMPage;
+
+ return addr;
+}
+
+/*
+ * Return the heap block number corresponding to given location in the FSM.
+ */
+static BlockNumber
+fsm_get_heap_blk(FSMAddress addr, uint16 slot)
+{
+ Assert(addr.level == FSM_BOTTOM_LEVEL);
+ return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
+}
+
+/*
+ * Given a logical address of a child page, get the logical page number of
+ * the parent, and the slot within the parent corresponding to the child.
+ */
+static FSMAddress
+fsm_get_parent(FSMAddress child, uint16 *slot)
+{
+ FSMAddress parent;
+
+ Assert(child.level < FSM_ROOT_LEVEL);
+
+ parent.level = child.level + 1;
+ parent.logpageno = child.logpageno / SlotsPerFSMPage;
+ *slot = child.logpageno % SlotsPerFSMPage;
+
+ return parent;
+}
+
+/*
+ * Given a logical address of a parent page and a slot number, get the
+ * logical address of the corresponding child page.
+ */
+static FSMAddress
+fsm_get_child(FSMAddress parent, uint16 slot)
+{
+ FSMAddress child;
+
+ Assert(parent.level > FSM_BOTTOM_LEVEL);
+
+ child.level = parent.level - 1;
+ child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
+
+ return child;
+}
+
+/*
+ * Read a FSM page.
+ *
+ * If the page doesn't exist, InvalidBuffer is returned; but if 'extend' is
+ * true, the FSM file is instead extended to include the requested page.
+ */
+static Buffer
+fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
+{
+ BlockNumber blkno = fsm_logical_to_physical(addr);
+ Buffer buf;
+
+ RelationOpenSmgr(rel);
+
+ /*
+ * If we haven't cached the size of the FSM yet, check it first. Also
+ * recheck if the requested block seems to be past end, since our cached
+ * value might be stale. (We send smgr inval messages on truncation, but
+ * not on extension.)
+ */
+ if (rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber ||
+ blkno >= rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM])
+ {
+ /* Invalidate the cache so smgrnblocks asks the kernel. */
+ rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
+ if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
+ smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
+ else
+ rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = 0;
+ }
+
+ /* Handle requests beyond EOF */
+ if (blkno >= rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM])
+ {
+ if (extend)
+ fsm_extend(rel, blkno + 1);
+ else
+ return InvalidBuffer;
+ }
+
+ /*
+ * Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM
+ * information is not accurate anyway, so it's better to clear corrupt
+ * pages than error out. Since the FSM changes are not WAL-logged, the
+ * so-called torn page problem on crash can lead to pages with corrupt
+ * headers, for example.
+ *
+ * The initialize-the-page part is trickier than it looks, because of the
+ * possibility of multiple backends doing this concurrently, and our
+ * desire to not uselessly take the buffer lock in the normal path where
+ * the page is OK. We must take the lock to initialize the page, so
+ * recheck page newness after we have the lock, in case someone else
+ * already did it. Also, because we initially check PageIsNew with no
+ * lock, it's possible to fall through and return the buffer while someone
+ * else is still initializing the page (i.e., we might see pd_upper as set
+ * but other page header fields are still zeroes). This is harmless for
+ * callers that will take a buffer lock themselves, but some callers
+ * inspect the page without any lock at all. The latter is OK only so
+ * long as it doesn't depend on the page header having correct contents.
+ * Current usage is safe because PageGetContents() does not require that.
+ */
+ buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
+ if (PageIsNew(BufferGetPage(buf)))
+ {
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ if (PageIsNew(BufferGetPage(buf)))
+ PageInit(BufferGetPage(buf), BLCKSZ, 0);
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ }
+ return buf;
+}
+
+/*
+ * Ensure that the FSM fork is at least fsm_nblocks long, extending
+ * it if necessary with empty pages. And by empty, I mean pages filled
+ * with zeros, meaning there's no free space.
+ */
+static void
+fsm_extend(Relation rel, BlockNumber fsm_nblocks)
+{
+ BlockNumber fsm_nblocks_now;
+ PGAlignedBlock pg;
+
+ PageInit((Page) pg.data, BLCKSZ, 0);
+
+ /*
+ * We use the relation extension lock to lock out other backends trying to
+ * extend the FSM at the same time. It also locks out extension of the
+ * main fork, unnecessarily, but extending the FSM happens seldom enough
+ * that it doesn't seem worthwhile to have a separate lock tag type for
+ * it.
+ *
+ * Note that another backend might have extended or created the relation
+ * by the time we get the lock.
+ */
+ LockRelationForExtension(rel, ExclusiveLock);
+
+ /* Might have to re-open if a cache flush happened */
+ RelationOpenSmgr(rel);
+
+ /*
+ * Create the FSM file first if it doesn't exist. If
+ * smgr_cached_nblocks[FSM_FORKNUM] is positive then it must exist, no
+ * need for an smgrexists call.
+ */
+ if ((rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] == 0 ||
+ rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber) &&
+ !smgrexists(rel->rd_smgr, FSM_FORKNUM))
+ smgrcreate(rel->rd_smgr, FSM_FORKNUM, false);
+
+ /* Invalidate cache so that smgrnblocks() asks the kernel. */
+ rel->rd_smgr->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
+ fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
+
+ while (fsm_nblocks_now < fsm_nblocks)
+ {
+ PageSetChecksumInplace((Page) pg.data, fsm_nblocks_now);
+
+ smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now,
+ pg.data, false);
+ fsm_nblocks_now++;
+ }
+
+ UnlockRelationForExtension(rel, ExclusiveLock);
+}
+
+/*
+ * Set value in given FSM page and slot.
+ *
+ * If minValue > 0, the updated page is also searched for a page with at
+ * least minValue of free space. If one is found, its slot number is
+ * returned, -1 otherwise.
+ */
+static int
+fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
+ uint8 newValue, uint8 minValue)
+{
+ Buffer buf;
+ Page page;
+ int newslot = -1;
+
+ buf = fsm_readbuf(rel, addr, true);
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+ page = BufferGetPage(buf);
+
+ if (fsm_set_avail(page, slot, newValue))
+ MarkBufferDirtyHint(buf, false);
+
+ if (minValue != 0)
+ {
+ /* Search while we still hold the lock */
+ newslot = fsm_search_avail(buf, minValue,
+ addr.level == FSM_BOTTOM_LEVEL,
+ true);
+ }
+
+ UnlockReleaseBuffer(buf);
+
+ return newslot;
+}
+
+/*
+ * Search the tree for a heap page with at least min_cat of free space
+ */
+static BlockNumber
+fsm_search(Relation rel, uint8 min_cat)
+{
+ int restarts = 0;
+ FSMAddress addr = FSM_ROOT_ADDRESS;
+
+ for (;;)
+ {
+ int slot;
+ Buffer buf;
+ uint8 max_avail = 0;
+
+ /* Read the FSM page. */
+ buf = fsm_readbuf(rel, addr, false);
+
+ /* Search within the page */
+ if (BufferIsValid(buf))
+ {
+ LockBuffer(buf, BUFFER_LOCK_SHARE);
+ slot = fsm_search_avail(buf, min_cat,
+ (addr.level == FSM_BOTTOM_LEVEL),
+ false);
+ if (slot == -1)
+ max_avail = fsm_get_max_avail(BufferGetPage(buf));
+ UnlockReleaseBuffer(buf);
+ }
+ else
+ slot = -1;
+
+ if (slot != -1)
+ {
+ /*
+ * Descend the tree, or return the found block if we're at the
+ * bottom.
+ */
+ if (addr.level == FSM_BOTTOM_LEVEL)
+ return fsm_get_heap_blk(addr, slot);
+
+ addr = fsm_get_child(addr, slot);
+ }
+ else if (addr.level == FSM_ROOT_LEVEL)
+ {
+ /*
+ * At the root, failure means there's no page with enough free
+ * space in the FSM. Give up.
+ */
+ return InvalidBlockNumber;
+ }
+ else
+ {
+ uint16 parentslot;
+ FSMAddress parent;
+
+ /*
+ * At a lower level, failure can happen if the value in the upper-
+ * level node didn't reflect the value on the lower page. Update
+ * the upper node, to avoid falling into the same trap again, and
+ * start over.
+ *
+ * There's a race condition here: if another backend updates this
+ * page right after we release it and gets the lock on the parent
+ * page before us, we'll then update the parent page with the
+ * now-stale information we had. That's OK, because it should
+ * happen rarely and will be fixed by the next vacuum.
+ */
+ parent = fsm_get_parent(addr, &parentslot);
+ fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
+
+ /*
+ * If the upper pages are badly out of date, we might need to loop
+ * quite a few times, updating them as we go. Any inconsistencies
+ * should eventually be corrected and the loop should end. Looping
+ * indefinitely is nevertheless scary, so provide an emergency
+ * valve.
+ */
+ if (restarts++ > 10000)
+ return InvalidBlockNumber;
+
+ /* Start search all over from the root */
+ addr = FSM_ROOT_ADDRESS;
+ }
+ }
+}
+
+
+/*
+ * Recursive guts of FreeSpaceMapVacuum
+ *
+ * Examine the FSM page indicated by addr, as well as its children, updating
+ * upper-level nodes that cover the heap block range from start to end-1.
+ * (It's okay if end is beyond the actual end of the map.)
+ * Return the maximum freespace value on this page.
+ *
+ * If addr is past the end of the FSM, set *eof_p to true and return 0.
+ *
+ * This traverses the tree in depth-first order. The tree is stored
+ * physically in depth-first order, so this should be pretty I/O efficient.
+ */
+static uint8
+fsm_vacuum_page(Relation rel, FSMAddress addr,
+ BlockNumber start, BlockNumber end,
+ bool *eof_p)
+{
+ Buffer buf;
+ Page page;
+ uint8 max_avail;
+
+ /* Read the page if it exists, or return EOF */
+ buf = fsm_readbuf(rel, addr, false);
+ if (!BufferIsValid(buf))
+ {
+ *eof_p = true;
+ return 0;
+ }
+ else
+ *eof_p = false;
+
+ page = BufferGetPage(buf);
+
+ /*
+ * If we're above the bottom level, recurse into children, and fix the
+ * information stored about them at this level.
+ */
+ if (addr.level > FSM_BOTTOM_LEVEL)
+ {
+ FSMAddress fsm_start,
+ fsm_end;
+ uint16 fsm_start_slot,
+ fsm_end_slot;
+ int slot,
+ start_slot,
+ end_slot;
+ bool eof = false;
+
+ /*
+ * Compute the range of slots we need to update on this page, given
+ * the requested range of heap blocks to consider. The first slot to
+ * update is the one covering the "start" block, and the last slot is
+ * the one covering "end - 1". (Some of this work will be duplicated
+ * in each recursive call, but it's cheap enough to not worry about.)
+ */
+ fsm_start = fsm_get_location(start, &fsm_start_slot);
+ fsm_end = fsm_get_location(end - 1, &fsm_end_slot);
+
+ while (fsm_start.level < addr.level)
+ {
+ fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot);
+ fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot);
+ }
+ Assert(fsm_start.level == addr.level);
+
+ if (fsm_start.logpageno == addr.logpageno)
+ start_slot = fsm_start_slot;
+ else if (fsm_start.logpageno > addr.logpageno)
+ start_slot = SlotsPerFSMPage; /* shouldn't get here... */
+ else
+ start_slot = 0;
+
+ if (fsm_end.logpageno == addr.logpageno)
+ end_slot = fsm_end_slot;
+ else if (fsm_end.logpageno > addr.logpageno)
+ end_slot = SlotsPerFSMPage - 1;
+ else
+ end_slot = -1; /* shouldn't get here... */
+
+ for (slot = start_slot; slot <= end_slot; slot++)
+ {
+ int child_avail;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* After we hit end-of-file, just clear the rest of the slots */
+ if (!eof)
+ child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
+ start, end,
+ &eof);
+ else
+ child_avail = 0;
+
+ /* Update information about the child */
+ if (fsm_get_avail(page, slot) != child_avail)
+ {
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ fsm_set_avail(page, slot, child_avail);
+ MarkBufferDirtyHint(buf, false);
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ }
+ }
+ }
+
+ /* Now get the maximum value on the page, to return to caller */
+ max_avail = fsm_get_max_avail(page);
+
+ /*
+ * Reset the next slot pointer. This encourages the use of low-numbered
+ * pages, increasing the chances that a later vacuum can truncate the
+ * relation. We don't bother with a lock here, nor with marking the page
+ * dirty if it wasn't already, since this is just a hint.
+ */
+ ((FSMPage) PageGetContents(page))->fp_next_slot = 0;
+
+ ReleaseBuffer(buf);
+
+ return max_avail;
+}
diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c
new file mode 100644
index 0000000..88ae51e
--- /dev/null
+++ b/src/backend/storage/freespace/fsmpage.c
@@ -0,0 +1,374 @@
+/*-------------------------------------------------------------------------
+ *
+ * fsmpage.c
+ * routines to search and manipulate one FSM page.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/freespace/fsmpage.c
+ *
+ * NOTES:
+ *
+ * The public functions in this file form an API that hides the internal
+ * structure of a FSM page. This allows freespace.c to treat each FSM page
+ * as a black box with SlotsPerFSMPage "slots". fsm_set_avail() and
+ * fsm_get_avail() let you get/set the value of a slot, and
+ * fsm_search_avail() lets you search for a slot with value >= X.
+ *
+ *-------------------------------------------------------------------------
+ */
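+
+/*
+ * As a rough usage sketch (the caller, buffer handling and category values
+ * here are hypothetical; freespace.c wraps calls like these in the
+ * appropriate buffer locking):
+ *
+ *		fsm_set_avail(BufferGetPage(buf), slot, new_cat);
+ *		cat = fsm_get_avail(BufferGetPage(buf), slot);
+ *		slot = fsm_search_avail(buf, min_cat, true, false);
+ */
+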
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/fsm_internals.h"
+
+/* Macros to navigate the tree within a page. Root has index zero. */
+#define leftchild(x) (2 * (x) + 1)
+#define rightchild(x) (2 * (x) + 2)
+#define parentof(x) (((x) - 1) / 2)
+
+/*
+ * Find right neighbor of x, wrapping around within the level
+ */
+static int
+rightneighbor(int x)
+{
+ /*
+ * Move right. This might wrap around, stepping to the leftmost node at
+ * the next level.
+ */
+ x++;
+
+ /*
+ * Check if we stepped to the leftmost node at the next level, and correct
+ * if so. The leftmost nodes at each level are numbered x = 2^level - 1, so
+ * check if (x + 1) is a power of two, using a standard
+ * twos-complement-arithmetic trick.
+ */
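+
+ /*
+ * For example, with the root at 0, level one is nodes 1-2 and level two
+ * is nodes 3-6: starting from x = 2 (the rightmost node of level one),
+ * x++ gives 3, (3 + 1) & 3 == 0, and we step back to parentof(3) = 1,
+ * the leftmost node of that same level.
+ */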
+ if (((x + 1) & x) == 0)
+ x = parentof(x);
+
+ return x;
+}
+
+/*
+ * Sets the value of a slot on page. Returns true if the page was modified.
+ *
+ * The caller must hold an exclusive lock on the page.
+ */
+bool
+fsm_set_avail(Page page, int slot, uint8 value)
+{
+ int nodeno = NonLeafNodesPerPage + slot;
+ FSMPage fsmpage = (FSMPage) PageGetContents(page);
+ uint8 oldvalue;
+
+ Assert(slot < LeafNodesPerPage);
+
+ oldvalue = fsmpage->fp_nodes[nodeno];
+
+ /* If the value hasn't changed, we don't need to do anything */
+ if (oldvalue == value && value <= fsmpage->fp_nodes[0])
+ return false;
+
+ fsmpage->fp_nodes[nodeno] = value;
+
+ /*
+ * Propagate up, until we hit the root or a node that doesn't need to be
+ * updated.
+ */
+ do
+ {
+ uint8 newvalue = 0;
+ int lchild;
+ int rchild;
+
+ nodeno = parentof(nodeno);
+ lchild = leftchild(nodeno);
+ rchild = lchild + 1;
+
+ newvalue = fsmpage->fp_nodes[lchild];
+ if (rchild < NodesPerPage)
+ newvalue = Max(newvalue,
+ fsmpage->fp_nodes[rchild]);
+
+ oldvalue = fsmpage->fp_nodes[nodeno];
+ if (oldvalue == newvalue)
+ break;
+
+ fsmpage->fp_nodes[nodeno] = newvalue;
+ } while (nodeno > 0);
+
+ /*
+ * sanity check: if the new value is (still) higher than the value at the
+ * top, the tree is corrupt. If so, rebuild.
+ */
+ if (value > fsmpage->fp_nodes[0])
+ fsm_rebuild_page(page);
+
+ return true;
+}
+
+/*
+ * Returns the value of given slot on page.
+ *
+ * Since this is just a read-only access of a single byte, the page doesn't
+ * need to be locked.
+ */
+uint8
+fsm_get_avail(Page page, int slot)
+{
+ FSMPage fsmpage = (FSMPage) PageGetContents(page);
+
+ Assert(slot < LeafNodesPerPage);
+
+ return fsmpage->fp_nodes[NonLeafNodesPerPage + slot];
+}
+
+/*
+ * Returns the value at the root of a page.
+ *
+ * Since this is just a read-only access of a single byte, the page doesn't
+ * need to be locked.
+ */
+uint8
+fsm_get_max_avail(Page page)
+{
+ FSMPage fsmpage = (FSMPage) PageGetContents(page);
+
+ return fsmpage->fp_nodes[0];
+}
+
+/*
+ * Searches for a slot with category at least minvalue.
+ * Returns slot number, or -1 if none found.
+ *
+ * The caller must hold at least a shared lock on the page, and this
+ * function can unlock and lock the page again in exclusive mode if it
+ * needs to be updated. exclusive_lock_held should be set to true if the
+ * caller is already holding an exclusive lock, to avoid extra work.
+ *
+ * If advancenext is false, fp_next_slot is set to point to the returned
+ * slot, and if it's true, to the slot after the returned slot.
+ */
+int
+fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext,
+ bool exclusive_lock_held)
+{
+ Page page = BufferGetPage(buf);
+ FSMPage fsmpage = (FSMPage) PageGetContents(page);
+ int nodeno;
+ int target;
+ uint16 slot;
+
+restart:
+
+ /*
+ * Check the root first, and exit quickly if there's no leaf with enough
+ * free space
+ */
+ if (fsmpage->fp_nodes[0] < minvalue)
+ return -1;
+
+ /*
+ * Start search using fp_next_slot. It's just a hint, so check that it's
+ * sane. (This also handles wrapping around when the prior call returned
+ * the last slot on the page.)
+ */
+ target = fsmpage->fp_next_slot;
+ if (target < 0 || target >= LeafNodesPerPage)
+ target = 0;
+ target += NonLeafNodesPerPage;
+
+ /*----------
+ * Start the search from the target slot. At every step, move one
+ * node to the right, then climb up to the parent. Stop when we reach
+ * a node with enough free space (as we must, since the root has enough
+ * space).
+ *
+ * The idea is to gradually expand our "search triangle", that is, all
+ * nodes covered by the current node, and to be sure we search to the
+ * right from the start point. At the first step, only the target slot
+ * is examined. When we move up from a left child to its parent, we are
+ * adding the right-hand subtree of that parent to the search triangle.
+ * When we move right then up from a right child, we are dropping the
+ * current search triangle (which we know doesn't contain any suitable
+ * page) and instead looking at the next-larger-size triangle to its
+ * right. So we never look left from our original start point, and at
+ * each step the size of the search triangle doubles, ensuring it takes
+ * only log2(N) work to search N pages.
+ *
+ * The "move right" operation will wrap around if it hits the right edge
+ * of the tree, so the behavior is still good if we start near the right.
+ * Note also that the move-and-climb behavior ensures that we can't end
+ * up on one of the missing nodes at the right of the leaf level.
+ *
+ * For example, consider this tree:
+ *
+ * 7
+ * 7 6
+ * 5 7 6 5
+ * 4 5 5 7 2 6 5 2
+ * T
+ *
+ * Assume that the target node is the node indicated by the letter T,
+ * and we're searching for a node with value of 6 or higher. The search
+ * begins at T. At the first iteration, we move to the right, then to the
+ * parent, arriving at the rightmost 5. At the second iteration, we move
+ * to the right, wrapping around, then climb up, arriving at the 7 on the
+ * third level. 7 satisfies our search, so we descend down to the bottom,
+ * following the path of sevens. This is in fact the first suitable page
+ * to the right of (allowing for wraparound) our start point.
+ *----------
+ */
+ nodeno = target;
+ while (nodeno > 0)
+ {
+ if (fsmpage->fp_nodes[nodeno] >= minvalue)
+ break;
+
+ /*
+ * Move to the right, wrapping around on same level if necessary, then
+ * climb up.
+ */
+ nodeno = parentof(rightneighbor(nodeno));
+ }
+
+ /*
+ * We're now at a node with enough free space, somewhere in the middle of
+ * the tree. Descend to the bottom, following a path with enough free
+ * space, preferring to move left if there's a choice.
+ */
+ while (nodeno < NonLeafNodesPerPage)
+ {
+ int childnodeno = leftchild(nodeno);
+
+ if (childnodeno < NodesPerPage &&
+ fsmpage->fp_nodes[childnodeno] >= minvalue)
+ {
+ nodeno = childnodeno;
+ continue;
+ }
+ childnodeno++; /* point to right child */
+ if (childnodeno < NodesPerPage &&
+ fsmpage->fp_nodes[childnodeno] >= minvalue)
+ {
+ nodeno = childnodeno;
+ }
+ else
+ {
+ /*
+ * Oops. The parent node promised that either left or right child
+ * has enough space, but neither actually did. This can happen in
+ * case of a "torn page", IOW if we crashed earlier while writing
+ * the page to disk, and only part of the page made it to disk.
+ *
+ * Fix the corruption and restart.
+ */
+ RelFileNode rnode;
+ ForkNumber forknum;
+ BlockNumber blknum;
+
+ BufferGetTag(buf, &rnode, &forknum, &blknum);
+ elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u",
+ blknum, rnode.spcNode, rnode.dbNode, rnode.relNode);
+
+ /* make sure we hold an exclusive lock */
+ if (!exclusive_lock_held)
+ {
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ exclusive_lock_held = true;
+ }
+ fsm_rebuild_page(page);
+ MarkBufferDirtyHint(buf, false);
+ goto restart;
+ }
+ }
+
+ /* We're now at the bottom level, at a node with enough space. */
+ slot = nodeno - NonLeafNodesPerPage;
+
+ /*
+ * Update the next-target pointer. Note that we do this even if we're only
+ * holding a shared lock, on the grounds that it's better to use a shared
+ * lock and get a garbled next pointer every now and then, than take the
+ * concurrency hit of an exclusive lock.
+ *
+ * Wrap-around is handled at the beginning of this function.
+ */
+ fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0);
+
+ return slot;
+}
+
+/*
+ * Sets the available space to zero for all slots numbered >= nslots.
+ * Returns true if the page was modified.
+ */
+bool
+fsm_truncate_avail(Page page, int nslots)
+{
+ FSMPage fsmpage = (FSMPage) PageGetContents(page);
+ uint8 *ptr;
+ bool changed = false;
+
+ Assert(nslots >= 0 && nslots < LeafNodesPerPage);
+
+ /* Clear all truncated leaf nodes */
+ ptr = &fsmpage->fp_nodes[NonLeafNodesPerPage + nslots];
+ for (; ptr < &fsmpage->fp_nodes[NodesPerPage]; ptr++)
+ {
+ if (*ptr != 0)
+ changed = true;
+ *ptr = 0;
+ }
+
+ /* Fix upper nodes. */
+ if (changed)
+ fsm_rebuild_page(page);
+
+ return changed;
+}
+
+/*
+ * Reconstructs the upper levels of a page. Returns true if the page
+ * was modified.
+ */
+bool
+fsm_rebuild_page(Page page)
+{
+ FSMPage fsmpage = (FSMPage) PageGetContents(page);
+ bool changed = false;
+ int nodeno;
+
+ /*
+ * Start from the lowest non-leaf level, at last node, working our way
+ * backwards, through all non-leaf nodes at all levels, up to the root.
+ */
+ for (nodeno = NonLeafNodesPerPage - 1; nodeno >= 0; nodeno--)
+ {
+ int lchild = leftchild(nodeno);
+ int rchild = lchild + 1;
+ uint8 newvalue = 0;
+
+ /* The first few nodes we examine might have zero or one child. */
+ if (lchild < NodesPerPage)
+ newvalue = fsmpage->fp_nodes[lchild];
+
+ if (rchild < NodesPerPage)
+ newvalue = Max(newvalue,
+ fsmpage->fp_nodes[rchild]);
+
+ if (fsmpage->fp_nodes[nodeno] != newvalue)
+ {
+ fsmpage->fp_nodes[nodeno] = newvalue;
+ changed = true;
+ }
+ }
+
+ return changed;
+}
diff --git a/src/backend/storage/freespace/indexfsm.c b/src/backend/storage/freespace/indexfsm.c
new file mode 100644
index 0000000..d66e10b
--- /dev/null
+++ b/src/backend/storage/freespace/indexfsm.c
@@ -0,0 +1,74 @@
+/*-------------------------------------------------------------------------
+ *
+ * indexfsm.c
+ * POSTGRES free space map for quickly finding free pages in relations
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/freespace/indexfsm.c
+ *
+ *
+ * NOTES:
+ *
+ * This is similar to the FSM used for heap, in freespace.c, but instead
+ * of tracking the amount of free space on pages, we only track whether
+ * pages are completely free or in-use. We use the same FSM implementation
+ * as for heaps, using BLCKSZ - 1 to denote free pages, and 0 for pages
+ * that are in use.
+ *
+ *-------------------------------------------------------------------------
+ */
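+
+/*
+ * A hypothetical caller sketch (extend_the_relation() is a placeholder, not
+ * a real function): an index AM records a freed page with
+ * RecordFreeIndexPage() and later tries to reuse one before extending:
+ *
+ *		RecordFreeIndexPage(rel, blkno);
+ *		...
+ *		blkno = GetFreeIndexPage(rel);
+ *		if (blkno == InvalidBlockNumber)
+ *			blkno = extend_the_relation();
+ */
+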
+#include "postgres.h"
+
+#include "storage/freespace.h"
+#include "storage/indexfsm.h"
+
+/*
+ * Exported routines
+ */
+
+/*
+ * GetFreeIndexPage - return a free page from the FSM
+ *
+ * As a side effect, the page is marked as used in the FSM.
+ */
+BlockNumber
+GetFreeIndexPage(Relation rel)
+{
+ BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ / 2);
+
+ if (blkno != InvalidBlockNumber)
+ RecordUsedIndexPage(rel, blkno);
+
+ return blkno;
+}
+
+/*
+ * RecordFreeIndexPage - mark a page as free in the FSM
+ */
+void
+RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
+{
+ RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1);
+}
+
+
+/*
+ * RecordUsedIndexPage - mark a page as used in the FSM
+ */
+void
+RecordUsedIndexPage(Relation rel, BlockNumber usedBlock)
+{
+ RecordPageWithFreeSpace(rel, usedBlock, 0);
+}
+
+/*
+ * IndexFreeSpaceMapVacuum - scan and fix any inconsistencies in the FSM
+ */
+void
+IndexFreeSpaceMapVacuum(Relation rel)
+{
+ FreeSpaceMapVacuum(rel);
+}
diff --git a/src/backend/storage/ipc/Makefile b/src/backend/storage/ipc/Makefile
new file mode 100644
index 0000000..df90c6b
--- /dev/null
+++ b/src/backend/storage/ipc/Makefile
@@ -0,0 +1,30 @@
+#
+# Makefile for storage/ipc
+#
+# src/backend/storage/ipc/Makefile
+#
+
+subdir = src/backend/storage/ipc
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ barrier.o \
+ dsm.o \
+ dsm_impl.o \
+ ipc.o \
+ ipci.o \
+ latch.o \
+ pmsignal.o \
+ procarray.o \
+ procsignal.o \
+ shm_mq.o \
+ shm_toc.o \
+ shmem.o \
+ shmqueue.o \
+ signalfuncs.o \
+ sinval.o \
+ sinvaladt.o \
+ standby.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/ipc/barrier.c b/src/backend/storage/ipc/barrier.c
new file mode 100644
index 0000000..5c05297
--- /dev/null
+++ b/src/backend/storage/ipc/barrier.c
@@ -0,0 +1,333 @@
+/*-------------------------------------------------------------------------
+ *
+ * barrier.c
+ * Barriers for synchronizing cooperating processes.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * From Wikipedia[1]: "In parallel computing, a barrier is a type of
+ * synchronization method. A barrier for a group of threads or processes in
+ * the source code means any thread/process must stop at this point and cannot
+ * proceed until all other threads/processes reach this barrier."
+ *
+ * This implementation of barriers allows for static sets of participants
+ * known up front, or dynamic sets of participants which processes can join or
+ * leave at any time. In the dynamic case, a phase number can be used to
+ * track progress through a parallel algorithm, and may be necessary to
+ * synchronize with the current phase of a multi-phase algorithm when a new
+ * participant joins. In the static case, the phase number is used
+ * internally, but it isn't strictly necessary for client code to access it
+ * because the phase can only advance when the declared number of participants
+ * reaches the barrier, so client code should be in no doubt about the current
+ * phase of computation at all times.
+ *
+ * Consider a parallel algorithm that involves separate phases of computation
+ * A, B and C where the output of each phase is needed before the next phase
+ * can begin.
+ *
+ * In the case of a static barrier initialized with 4 participants, each
+ * participant works on phase A, then calls BarrierArriveAndWait to wait until
+ * all 4 participants have reached that point. When BarrierArriveAndWait
+ * returns control, each participant can work on B, and so on. Because the
+ * barrier knows how many participants to expect, the phases of computation
+ * don't need labels or numbers, since each process's program counter implies
+ * the current phase. Even if some of the processes are slow to start up and
+ * begin running phase A, the other participants are expecting them and will
+ * patiently wait at the barrier. The code could be written as follows:
+ *
+ * perform_a();
+ * BarrierArriveAndWait(&barrier, ...);
+ * perform_b();
+ * BarrierArriveAndWait(&barrier, ...);
+ * perform_c();
+ * BarrierArriveAndWait(&barrier, ...);
+ *
+ * If the number of participants is not known up front, then a dynamic barrier
+ * is needed and the number should be set to zero at initialization. New
+ * complications arise because the number necessarily changes over time as
+ * participants attach and detach, and therefore phases B, C or even the end
+ * of processing may be reached before any given participant has started
+ * running and attached. The client code must therefore perform an initial
+ * test of the phase number after attaching, because it needs to find out
+ * which phase of the algorithm has been reached by any participants that are
+ * already attached in order to synchronize with that work. Once the program
+ * counter or some other representation of current progress is synchronized
+ * with the barrier's phase, normal control flow can be used just as in the
+ * static case. Our example could be written using a switch statement with
+ * cases that fall through, as follows:
+ *
+ * phase = BarrierAttach(&barrier);
+ * switch (phase)
+ * {
+ * case PHASE_A:
+ * perform_a();
+ * BarrierArriveAndWait(&barrier, ...);
+ * case PHASE_B:
+ * perform_b();
+ * BarrierArriveAndWait(&barrier, ...);
+ * case PHASE_C:
+ * perform_c();
+ * BarrierArriveAndWait(&barrier, ...);
+ * }
+ * BarrierDetach(&barrier);
+ *
+ * Static barriers behave similarly to POSIX's pthread_barrier_t. Dynamic
+ * barriers behave similarly to Java's java.util.concurrent.Phaser.
+ *
+ * [1] https://en.wikipedia.org/wiki/Barrier_(computer_science)
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/barrier.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "storage/barrier.h"
+
+static inline bool BarrierDetachImpl(Barrier *barrier, bool arrive);
+
+/*
+ * Initialize this barrier. To use a static party size, provide the number of
+ * participants to wait for at each phase, indicating that that number of
+ * backends is implicitly attached. To use a dynamic party size, specify zero
+ * here and then use BarrierAttach() and
+ * BarrierDetach()/BarrierArriveAndDetach() to register and deregister
+ * participants explicitly.
+ */
+void
+BarrierInit(Barrier *barrier, int participants)
+{
+ SpinLockInit(&barrier->mutex);
+ barrier->participants = participants;
+ barrier->arrived = 0;
+ barrier->phase = 0;
+ barrier->elected = 0;
+ barrier->static_party = participants > 0;
+ ConditionVariableInit(&barrier->condition_variable);
+}
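+
+/*
+ * For example, a hypothetical caller might write
+ *
+ *		BarrierInit(&barrier, nworkers);
+ *
+ * for a static party of nworkers backends, or
+ *
+ *		BarrierInit(&barrier, 0);
+ *
+ * for a dynamic party that backends later join with BarrierAttach().
+ */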
+
+/*
+ * Arrive at this barrier, wait for all other attached participants to arrive
+ * too and then return. Increments the current phase. The caller must be
+ * attached.
+ *
+ * While waiting, pg_stat_activity shows a wait_event_type and wait_event
+ * controlled by the wait_event_info passed in, which should be a value from
+ * one of the WaitEventXXX enums defined in pgstat.h.
+ *
+ * Return true in one arbitrarily chosen participant. Return false in all
+ * others. The return code can be used to elect one participant to execute a
+ * phase of work that must be done serially while other participants wait.
+ */
+bool
+BarrierArriveAndWait(Barrier *barrier, uint32 wait_event_info)
+{
+ bool release = false;
+ bool elected;
+ int start_phase;
+ int next_phase;
+
+ SpinLockAcquire(&barrier->mutex);
+ start_phase = barrier->phase;
+ next_phase = start_phase + 1;
+ ++barrier->arrived;
+ if (barrier->arrived == barrier->participants)
+ {
+ release = true;
+ barrier->arrived = 0;
+ barrier->phase = next_phase;
+ barrier->elected = next_phase;
+ }
+ SpinLockRelease(&barrier->mutex);
+
+ /*
+ * If we were the last expected participant to arrive, we can release our
+ * peers and return true to indicate that this backend has been elected to
+ * perform any serial work.
+ */
+ if (release)
+ {
+ ConditionVariableBroadcast(&barrier->condition_variable);
+
+ return true;
+ }
+
+ /*
+ * Otherwise we have to wait for the last participant to arrive and
+ * advance the phase.
+ */
+ elected = false;
+ ConditionVariablePrepareToSleep(&barrier->condition_variable);
+ for (;;)
+ {
+ /*
+ * We know that phase must either be start_phase, indicating that we
+ * need to keep waiting, or next_phase, indicating that the last
+ * participant that we were waiting for has either arrived or detached
+ * so that the next phase has begun. The phase cannot advance any
+ * further than that without this backend's participation, because
+ * this backend is attached.
+ */
+ SpinLockAcquire(&barrier->mutex);
+ Assert(barrier->phase == start_phase || barrier->phase == next_phase);
+ release = barrier->phase == next_phase;
+ if (release && barrier->elected != next_phase)
+ {
+ /*
+ * Usually the backend that arrives last and releases the other
+ * backends is elected to return true (see above), so that it can
+ * begin processing serial work while it has a CPU timeslice.
+ * However, if the barrier advanced because someone detached, then
+ * one of the backends that is awoken will need to be elected.
+ */
+ barrier->elected = barrier->phase;
+ elected = true;
+ }
+ SpinLockRelease(&barrier->mutex);
+ if (release)
+ break;
+ ConditionVariableSleep(&barrier->condition_variable, wait_event_info);
+ }
+ ConditionVariableCancelSleep();
+
+ return elected;
+}
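+
+/*
+ * For example, the return value can be used to run a step that must be done
+ * by exactly one backend, with the others catching up at the next barrier
+ * (hypothetical caller; the wait event chosen here is only illustrative):
+ *
+ *		if (BarrierArriveAndWait(&barrier, WAIT_EVENT_PARALLEL_FINISH))
+ *			perform_serial_step();
+ *		BarrierArriveAndWait(&barrier, WAIT_EVENT_PARALLEL_FINISH);
+ */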
+
+/*
+ * Arrive at this barrier, but detach rather than waiting. Returns true if
+ * the caller was the last to detach.
+ */
+bool
+BarrierArriveAndDetach(Barrier *barrier)
+{
+ return BarrierDetachImpl(barrier, true);
+}
+
+/*
+ * Arrive at a barrier, and detach all but the last to arrive. Returns true if
+ * the caller was the last to arrive, and is therefore still attached.
+ */
+bool
+BarrierArriveAndDetachExceptLast(Barrier *barrier)
+{
+ SpinLockAcquire(&barrier->mutex);
+ if (barrier->participants > 1)
+ {
+ --barrier->participants;
+ SpinLockRelease(&barrier->mutex);
+
+ return false;
+ }
+ Assert(barrier->participants == 1);
+ ++barrier->phase;
+ SpinLockRelease(&barrier->mutex);
+
+ return true;
+}
+
+/*
+ * Attach to a barrier. All waiting participants will now wait for this
+ * participant to call BarrierArriveAndWait(), BarrierDetach() or
+ * BarrierArriveAndDetach(). Return the current phase.
+ */
+int
+BarrierAttach(Barrier *barrier)
+{
+ int phase;
+
+ Assert(!barrier->static_party);
+
+ SpinLockAcquire(&barrier->mutex);
+ ++barrier->participants;
+ phase = barrier->phase;
+ SpinLockRelease(&barrier->mutex);
+
+ return phase;
+}
+
+/*
+ * Detach from a barrier. This may release other waiters from
+ * BarrierArriveAndWait() and advance the phase if they were only waiting for
+ * this backend. Return true if this participant was the last to detach.
+ */
+bool
+BarrierDetach(Barrier *barrier)
+{
+ return BarrierDetachImpl(barrier, false);
+}
+
+/*
+ * Return the current phase of a barrier. The caller must be attached.
+ */
+int
+BarrierPhase(Barrier *barrier)
+{
+ /*
+ * It is OK to read barrier->phase without locking, because it can't
+ * change without us (we are attached to it), and we executed a memory
+ * barrier when we either attached or participated in changing it last
+ * time.
+ */
+ return barrier->phase;
+}
+
+/*
+ * Return an instantaneous snapshot of the number of participants currently
+ * attached to this barrier. For debugging purposes only.
+ */
+int
+BarrierParticipants(Barrier *barrier)
+{
+ int participants;
+
+ SpinLockAcquire(&barrier->mutex);
+ participants = barrier->participants;
+ SpinLockRelease(&barrier->mutex);
+
+ return participants;
+}
+
+/*
+ * Detach from a barrier. If 'arrive' is true then also increment the phase
+ * if there are no other participants. If there are other participants
+ * waiting, then the phase will be advanced and they'll be released if they
+ * were only waiting for the caller. Return true if this participant was the
+ * last to detach.
+ */
+static inline bool
+BarrierDetachImpl(Barrier *barrier, bool arrive)
+{
+ bool release;
+ bool last;
+
+ Assert(!barrier->static_party);
+
+ SpinLockAcquire(&barrier->mutex);
+ Assert(barrier->participants > 0);
+ --barrier->participants;
+
+ /*
+ * If any other participants are waiting and we were the last participant
+ * waited for, release them. If no other participants are waiting, but
+ * this is a BarrierArriveAndDetach() call, then advance the phase too.
+ */
+ if ((arrive || barrier->participants > 0) &&
+ barrier->arrived == barrier->participants)
+ {
+ release = true;
+ barrier->arrived = 0;
+ ++barrier->phase;
+ }
+ else
+ release = false;
+
+ last = barrier->participants == 0;
+ SpinLockRelease(&barrier->mutex);
+
+ if (release)
+ ConditionVariableBroadcast(&barrier->condition_variable);
+
+ return last;
+}
diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c
new file mode 100644
index 0000000..b461a5f
--- /dev/null
+++ b/src/backend/storage/ipc/dsm.c
@@ -0,0 +1,1248 @@
+/*-------------------------------------------------------------------------
+ *
+ * dsm.c
+ * manage dynamic shared memory segments
+ *
+ * This file provides a set of services to make programming with dynamic
+ * shared memory segments more convenient. Unlike the low-level
+ * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments
+ * created using this module will be cleaned up automatically. Mappings
+ * will be removed when the resource owner under which they were created
+ * is cleaned up, unless dsm_pin_mapping() is used, in which case they
+ * have session lifespan. Segments will be removed when there are no
+ * remaining mappings, or at postmaster shutdown in any case. After a
+ * hard postmaster crash, remaining segments will be removed, if they
+ * still exist, at the next postmaster startup.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/dsm.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+#ifndef WIN32
+#include <sys/mman.h>
+#endif
+#include <sys/stat.h>
+
+#include "lib/ilist.h"
+#include "miscadmin.h"
+#include "port/pg_bitutils.h"
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/pg_shmem.h"
+#include "utils/freepage.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/resowner_private.h"
+
+#define PG_DYNSHMEM_CONTROL_MAGIC 0x9a503d32
+
+#define PG_DYNSHMEM_FIXED_SLOTS 64
+#define PG_DYNSHMEM_SLOTS_PER_BACKEND 5
+
+#define INVALID_CONTROL_SLOT ((uint32) -1)
+
+/* Backend-local tracking for on-detach callbacks. */
+typedef struct dsm_segment_detach_callback
+{
+ on_dsm_detach_callback function;
+ Datum arg;
+ slist_node node;
+} dsm_segment_detach_callback;
+
+/* Backend-local state for a dynamic shared memory segment. */
+struct dsm_segment
+{
+ dlist_node node; /* List link in dsm_segment_list. */
+ ResourceOwner resowner; /* Resource owner. */
+ dsm_handle handle; /* Segment name. */
+ uint32 control_slot; /* Slot in control segment. */
+ void *impl_private; /* Implementation-specific private data. */
+ void *mapped_address; /* Mapping address, or NULL if unmapped. */
+ Size mapped_size; /* Size of our mapping. */
+ slist_head on_detach; /* On-detach callbacks. */
+};
+
+/* Shared-memory state for a dynamic shared memory segment. */
+typedef struct dsm_control_item
+{
+ dsm_handle handle;
+ uint32 refcnt; /* 2+ = active, 1 = moribund, 0 = gone */
+ size_t first_page;
+ size_t npages;
+ void *impl_private_pm_handle; /* only needed on Windows */
+ bool pinned;
+} dsm_control_item;
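+
+/*
+ * Reference counting, by way of example: dsm_create() starts a slot's refcnt
+ * at 2, because a count of 1 is reserved to mean "moribund" and triggers
+ * destruction.  Each dsm_attach() adds one and each dsm_detach() subtracts
+ * one; the backend whose detach drops the count to 1 destroys the segment
+ * and then sets the count to 0, marking the slot unused.
+ */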
+
+/* Layout of the dynamic shared memory control segment. */
+typedef struct dsm_control_header
+{
+ uint32 magic;
+ uint32 nitems;
+ uint32 maxitems;
+ dsm_control_item item[FLEXIBLE_ARRAY_MEMBER];
+} dsm_control_header;
+
+static void dsm_cleanup_for_mmap(void);
+static void dsm_postmaster_shutdown(int code, Datum arg);
+static dsm_segment *dsm_create_descriptor(void);
+static bool dsm_control_segment_sane(dsm_control_header *control,
+ Size mapped_size);
+static uint64 dsm_control_bytes_needed(uint32 nitems);
+static inline dsm_handle make_main_region_dsm_handle(int slot);
+static inline bool is_main_region_dsm_handle(dsm_handle handle);
+
+/* Has this backend initialized the dynamic shared memory system yet? */
+static bool dsm_init_done = false;
+
+/* Preallocated DSM space in the main shared memory region. */
+static void *dsm_main_space_begin = NULL;
+
+/*
+ * List of dynamic shared memory segments used by this backend.
+ *
+ * At process exit time, we must decrement the reference count of each
+ * segment we have attached; this list makes it possible to find all such
+ * segments.
+ *
+ * This list should always be empty in the postmaster. We could probably
+ * allow the postmaster to map dynamic shared memory segments before it
+ * begins to start child processes, provided that each process adjusted
+ * the reference counts for those segments in the control segment at
+ * startup time, but there's no obvious need for such a facility, which
+ * would also be complex to handle in the EXEC_BACKEND case. Once the
+ * postmaster has begun spawning children, there's an additional problem:
+ * each new mapping would require an update to the control segment,
+ * which requires locking, in which the postmaster must not be involved.
+ */
+static dlist_head dsm_segment_list = DLIST_STATIC_INIT(dsm_segment_list);
+
+/*
+ * Control segment information.
+ *
+ * Unlike ordinary shared memory segments, the control segment is not
+ * reference counted; instead, it lasts for the postmaster's entire
+ * life cycle. For simplicity, it doesn't have a dsm_segment object either.
+ */
+static dsm_handle dsm_control_handle;
+static dsm_control_header *dsm_control;
+static Size dsm_control_mapped_size = 0;
+static void *dsm_control_impl_private = NULL;
+
+/*
+ * Start up the dynamic shared memory system.
+ *
+ * This is called just once during each cluster lifetime, at postmaster
+ * startup time.
+ */
+void
+dsm_postmaster_startup(PGShmemHeader *shim)
+{
+ void *dsm_control_address = NULL;
+ uint32 maxitems;
+ Size segsize;
+
+ Assert(!IsUnderPostmaster);
+
+ /*
+ * If we're using the mmap implementations, clean up any leftovers.
+ * Cleanup isn't needed on Windows, and happens earlier in startup for
+ * POSIX and System V shared memory, via a direct call to
+ * dsm_cleanup_using_control_segment.
+ */
+ if (dynamic_shared_memory_type == DSM_IMPL_MMAP)
+ dsm_cleanup_for_mmap();
+
+ /* Determine size for new control segment. */
+ maxitems = PG_DYNSHMEM_FIXED_SLOTS
+ + PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends;
+ elog(DEBUG2, "dynamic shared memory system will support %u segments",
+ maxitems);
+ segsize = dsm_control_bytes_needed(maxitems);
+
+ /*
+ * Loop until we find an unused identifier for the new control segment. We
+ * sometimes use 0 as a sentinel value indicating that no control segment
+ * is known to exist, so avoid using that value for a real control
+ * segment.
+ */
+ for (;;)
+ {
+ Assert(dsm_control_address == NULL);
+ Assert(dsm_control_mapped_size == 0);
+ dsm_control_handle = random() << 1; /* Even numbers only */
+ if (dsm_control_handle == DSM_HANDLE_INVALID)
+ continue;
+ if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize,
+ &dsm_control_impl_private, &dsm_control_address,
+ &dsm_control_mapped_size, ERROR))
+ break;
+ }
+ dsm_control = dsm_control_address;
+ on_shmem_exit(dsm_postmaster_shutdown, PointerGetDatum(shim));
+ elog(DEBUG2,
+ "created dynamic shared memory control segment %u (%zu bytes)",
+ dsm_control_handle, segsize);
+ shim->dsm_control = dsm_control_handle;
+
+ /* Initialize control segment. */
+ dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC;
+ dsm_control->nitems = 0;
+ dsm_control->maxitems = maxitems;
+}
+
+/*
+ * Determine whether the control segment from the previous postmaster
+ * invocation still exists. If so, remove the dynamic shared memory
+ * segments to which it refers, and then the control segment itself.
+ */
+void
+dsm_cleanup_using_control_segment(dsm_handle old_control_handle)
+{
+ void *mapped_address = NULL;
+ void *junk_mapped_address = NULL;
+ void *impl_private = NULL;
+ void *junk_impl_private = NULL;
+ Size mapped_size = 0;
+ Size junk_mapped_size = 0;
+ uint32 nitems;
+ uint32 i;
+ dsm_control_header *old_control;
+
+ /*
+ * Try to attach the segment. If this fails, it probably just means that
+ * the operating system has been rebooted and the segment no longer
+ * exists, or an unrelated process has used the same shm ID. So just fall
+ * out quietly.
+ */
+ if (!dsm_impl_op(DSM_OP_ATTACH, old_control_handle, 0, &impl_private,
+ &mapped_address, &mapped_size, DEBUG1))
+ return;
+
+ /*
+ * We've managed to reattach it, but the contents might not be sane. If
+ * they aren't, we disregard the segment after all.
+ */
+ old_control = (dsm_control_header *) mapped_address;
+ if (!dsm_control_segment_sane(old_control, mapped_size))
+ {
+ dsm_impl_op(DSM_OP_DETACH, old_control_handle, 0, &impl_private,
+ &mapped_address, &mapped_size, LOG);
+ return;
+ }
+
+ /*
+ * OK, the control segment looks basically valid, so we can use it to get
+ * a list of segments that need to be removed.
+ */
+ nitems = old_control->nitems;
+ for (i = 0; i < nitems; ++i)
+ {
+ dsm_handle handle;
+ uint32 refcnt;
+
+ /* If the reference count is 0, the slot is actually unused. */
+ refcnt = old_control->item[i].refcnt;
+ if (refcnt == 0)
+ continue;
+
+ /* If it was using the main shmem area, there is nothing to do. */
+ handle = old_control->item[i].handle;
+ if (is_main_region_dsm_handle(handle))
+ continue;
+
+ /* Log debugging information. */
+ elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)",
+ handle, refcnt);
+
+ /* Destroy the referenced segment. */
+ dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
+ &junk_mapped_address, &junk_mapped_size, LOG);
+ }
+
+ /* Destroy the old control segment, too. */
+ elog(DEBUG2,
+ "cleaning up dynamic shared memory control segment with ID %u",
+ old_control_handle);
+ dsm_impl_op(DSM_OP_DESTROY, old_control_handle, 0, &impl_private,
+ &mapped_address, &mapped_size, LOG);
+}
+
+/*
+ * When we're using the mmap shared memory implementation, "shared memory"
+ * segments might even manage to survive an operating system reboot.
+ * But there's no guarantee as to exactly what will survive: some segments
+ * may survive, and others may not, and the contents of some may be out
+ * of date. In particular, the control segment may be out of date, so we
+ * can't rely on it to figure out what to remove. However, since we know
+ * what directory contains the files we used as shared memory, we can simply
+ * scan the directory and blow everything away that shouldn't be there.
+ */
+static void
+dsm_cleanup_for_mmap(void)
+{
+ DIR *dir;
+ struct dirent *dent;
+
+ /* Scan the directory for something with a name of the correct format. */
+ dir = AllocateDir(PG_DYNSHMEM_DIR);
+
+ while ((dent = ReadDir(dir, PG_DYNSHMEM_DIR)) != NULL)
+ {
+ if (strncmp(dent->d_name, PG_DYNSHMEM_MMAP_FILE_PREFIX,
+ strlen(PG_DYNSHMEM_MMAP_FILE_PREFIX)) == 0)
+ {
+ char buf[MAXPGPATH + sizeof(PG_DYNSHMEM_DIR)];
+
+ snprintf(buf, sizeof(buf), PG_DYNSHMEM_DIR "/%s", dent->d_name);
+
+ elog(DEBUG2, "removing file \"%s\"", buf);
+
+ /* We found a matching file; so remove it. */
+ if (unlink(buf) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m", buf)));
+ }
+ }
+
+ /* Cleanup complete. */
+ FreeDir(dir);
+}
+
+/*
+ * At shutdown time, we iterate over the control segment and remove all
+ * remaining dynamic shared memory segments. We avoid throwing errors here;
+ * the postmaster is shutting down either way, and this is just non-critical
+ * resource cleanup.
+ */
+static void
+dsm_postmaster_shutdown(int code, Datum arg)
+{
+ uint32 nitems;
+ uint32 i;
+ void *dsm_control_address;
+ void *junk_mapped_address = NULL;
+ void *junk_impl_private = NULL;
+ Size junk_mapped_size = 0;
+ PGShmemHeader *shim = (PGShmemHeader *) DatumGetPointer(arg);
+
+ /*
+ * If some other backend exited uncleanly, it might have corrupted the
+ * control segment while it was dying. In that case, we warn and ignore
+ * the contents of the control segment. This may end up leaving behind
+ * stray shared memory segments, but there's not much we can do about that
+ * if the metadata is gone.
+ */
+ nitems = dsm_control->nitems;
+ if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
+ {
+ ereport(LOG,
+ (errmsg("dynamic shared memory control segment is corrupt")));
+ return;
+ }
+
+ /* Remove any remaining segments. */
+ for (i = 0; i < nitems; ++i)
+ {
+ dsm_handle handle;
+
+ /* If the reference count is 0, the slot is actually unused. */
+ if (dsm_control->item[i].refcnt == 0)
+ continue;
+
+ handle = dsm_control->item[i].handle;
+ if (is_main_region_dsm_handle(handle))
+ continue;
+
+ /* Log debugging information. */
+ elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u",
+ handle);
+
+ /* Destroy the segment. */
+ dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
+ &junk_mapped_address, &junk_mapped_size, LOG);
+ }
+
+ /* Remove the control segment itself. */
+ elog(DEBUG2,
+ "cleaning up dynamic shared memory control segment with ID %u",
+ dsm_control_handle);
+ dsm_control_address = dsm_control;
+ dsm_impl_op(DSM_OP_DESTROY, dsm_control_handle, 0,
+ &dsm_control_impl_private, &dsm_control_address,
+ &dsm_control_mapped_size, LOG);
+ dsm_control = dsm_control_address;
+ shim->dsm_control = 0;
+}
+
+/*
+ * Prepare this backend for dynamic shared memory usage. Under EXEC_BACKEND,
+ * we must reread the state file and map the control segment; in other cases,
+ * we'll have inherited the postmaster's mapping and global variables.
+ */
+static void
+dsm_backend_startup(void)
+{
+#ifdef EXEC_BACKEND
+ {
+ void *control_address = NULL;
+
+ /* Attach control segment. */
+ Assert(dsm_control_handle != 0);
+ dsm_impl_op(DSM_OP_ATTACH, dsm_control_handle, 0,
+ &dsm_control_impl_private, &control_address,
+ &dsm_control_mapped_size, ERROR);
+ dsm_control = control_address;
+ /* If control segment doesn't look sane, something is badly wrong. */
+ if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
+ {
+ dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
+ &dsm_control_impl_private, &control_address,
+ &dsm_control_mapped_size, WARNING);
+ ereport(FATAL,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("dynamic shared memory control segment is not valid")));
+ }
+ }
+#endif
+
+ dsm_init_done = true;
+}
+
+#ifdef EXEC_BACKEND
+/*
+ * When running under EXEC_BACKEND, we get a callback here when the main
+ * shared memory segment is re-attached, so that we can record the control
+ * handle retrieved from it.
+ */
+void
+dsm_set_control_handle(dsm_handle h)
+{
+ Assert(dsm_control_handle == 0 && h != 0);
+ dsm_control_handle = h;
+}
+#endif
+
+/*
+ * Reserve some space in the main shared memory segment for DSM segments.
+ */
+size_t
+dsm_estimate_size(void)
+{
+ return 1024 * 1024 * (size_t) min_dynamic_shared_memory;
+}
+
+/*
+ * Initialize space in the main shared memory segment for DSM segments.
+ */
+void
+dsm_shmem_init(void)
+{
+ size_t size = dsm_estimate_size();
+ bool found;
+
+ if (size == 0)
+ return;
+
+ dsm_main_space_begin = ShmemInitStruct("Preallocated DSM", size, &found);
+ if (!found)
+ {
+ FreePageManager *fpm = (FreePageManager *) dsm_main_space_begin;
+ size_t first_page = 0;
+ size_t pages;
+
+ /* Reserve space for the FreePageManager. */
+ while (first_page * FPM_PAGE_SIZE < sizeof(FreePageManager))
+ ++first_page;
+
+ /* Initialize it and give it all the rest of the space. */
+ FreePageManagerInitialize(fpm, dsm_main_space_begin);
+ pages = (size / FPM_PAGE_SIZE) - first_page;
+ FreePageManagerPut(fpm, first_page, pages);
+ }
+}
+
+/*
+ * Create a new dynamic shared memory segment.
+ *
+ * If there is a non-NULL CurrentResourceOwner, the new segment is associated
+ * with it and must be detached before the resource owner releases, or a
+ * warning will be logged. If CurrentResourceOwner is NULL, the segment
+ * remains attached until explicitly detached or the session ends.
+ * Creating with a NULL CurrentResourceOwner is equivalent to creating
+ * with a non-NULL CurrentResourceOwner and then calling dsm_pin_mapping.
+ */
+dsm_segment *
+dsm_create(Size size, int flags)
+{
+ dsm_segment *seg;
+ uint32 i;
+ uint32 nitems;
+ size_t npages = 0;
+ size_t first_page = 0;
+ FreePageManager *dsm_main_space_fpm = dsm_main_space_begin;
+ bool using_main_dsm_region = false;
+
+ /* Unsafe in postmaster (and pointless in a stand-alone backend). */
+ Assert(IsUnderPostmaster);
+
+ if (!dsm_init_done)
+ dsm_backend_startup();
+
+ /* Create a new segment descriptor. */
+ seg = dsm_create_descriptor();
+
+ /*
+ * Lock the control segment while we try to allocate from the main shared
+ * memory area, if configured.
+ */
+ if (dsm_main_space_fpm)
+ {
+ npages = size / FPM_PAGE_SIZE;
+ if (size % FPM_PAGE_SIZE > 0)
+ ++npages;
+
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ if (FreePageManagerGet(dsm_main_space_fpm, npages, &first_page))
+ {
+ /* We can carve out a piece of the main shared memory segment. */
+ seg->mapped_address = (char *) dsm_main_space_begin +
+ first_page * FPM_PAGE_SIZE;
+ seg->mapped_size = npages * FPM_PAGE_SIZE;
+ using_main_dsm_region = true;
+ /* We'll choose a handle below. */
+ }
+ }
+
+ if (!using_main_dsm_region)
+ {
+ /*
+ * We need to create a new memory segment. Loop until we find an
+ * unused segment identifier.
+ */
+ if (dsm_main_space_fpm)
+ LWLockRelease(DynamicSharedMemoryControlLock);
+ for (;;)
+ {
+ Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
+ seg->handle = random() << 1; /* Even numbers only */
+ if (seg->handle == DSM_HANDLE_INVALID) /* Reserve sentinel */
+ continue;
+ if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
+ &seg->mapped_address, &seg->mapped_size, ERROR))
+ break;
+ }
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ }
+
+ /* Search the control segment for an unused slot. */
+ nitems = dsm_control->nitems;
+ for (i = 0; i < nitems; ++i)
+ {
+ if (dsm_control->item[i].refcnt == 0)
+ {
+ if (using_main_dsm_region)
+ {
+ seg->handle = make_main_region_dsm_handle(i);
+ dsm_control->item[i].first_page = first_page;
+ dsm_control->item[i].npages = npages;
+ }
+ else
+ Assert(!is_main_region_dsm_handle(seg->handle));
+ dsm_control->item[i].handle = seg->handle;
+ /* refcnt of 1 triggers destruction, so start at 2 */
+ dsm_control->item[i].refcnt = 2;
+ dsm_control->item[i].impl_private_pm_handle = NULL;
+ dsm_control->item[i].pinned = false;
+ seg->control_slot = i;
+ LWLockRelease(DynamicSharedMemoryControlLock);
+ return seg;
+ }
+ }
+
+ /* Verify that we can support an additional mapping. */
+ if (nitems >= dsm_control->maxitems)
+ {
+ if (using_main_dsm_region)
+ FreePageManagerPut(dsm_main_space_fpm, first_page, npages);
+ LWLockRelease(DynamicSharedMemoryControlLock);
+ if (!using_main_dsm_region)
+ dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
+ &seg->mapped_address, &seg->mapped_size, WARNING);
+ if (seg->resowner != NULL)
+ ResourceOwnerForgetDSM(seg->resowner, seg);
+ dlist_delete(&seg->node);
+ pfree(seg);
+
+ if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0)
+ return NULL;
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("too many dynamic shared memory segments")));
+ }
+
+ /* Enter the handle into a new array slot. */
+ if (using_main_dsm_region)
+ {
+ seg->handle = make_main_region_dsm_handle(nitems);
+ dsm_control->item[i].first_page = first_page;
+ dsm_control->item[i].npages = npages;
+ }
+ dsm_control->item[nitems].handle = seg->handle;
+ /* refcnt of 1 triggers destruction, so start at 2 */
+ dsm_control->item[nitems].refcnt = 2;
+ dsm_control->item[nitems].impl_private_pm_handle = NULL;
+ dsm_control->item[nitems].pinned = false;
+ seg->control_slot = nitems;
+ dsm_control->nitems++;
+ LWLockRelease(DynamicSharedMemoryControlLock);
+
+ return seg;
+}
+
+/*
+ * Attach a dynamic shared memory segment.
+ *
+ * See comments for dsm_segment_handle() for an explanation of how this
+ * is intended to be used.
+ *
+ * This function will return NULL if the segment isn't known to the system.
+ * This can happen if we're asked to attach the segment, but then everyone
+ * else detaches it (causing it to be destroyed) before we get around to
+ * attaching it.
+ *
+ * If there is a non-NULL CurrentResourceOwner, the attached segment is
+ * associated with it and must be detached before the resource owner releases,
+ * or a warning will be logged. Otherwise the segment remains attached until
+ * explicitly detached or the session ends. See the note atop dsm_create().
+ */
+dsm_segment *
+dsm_attach(dsm_handle h)
+{
+ dsm_segment *seg;
+ dlist_iter iter;
+ uint32 i;
+ uint32 nitems;
+
+ /* Unsafe in postmaster (and pointless in a stand-alone backend). */
+ Assert(IsUnderPostmaster);
+
+ if (!dsm_init_done)
+ dsm_backend_startup();
+
+ /*
+ * Since this is just a debugging cross-check, we could leave it out
+ * altogether, or include it only in assert-enabled builds. But since the
+ * list of attached segments should normally be very short, let's always
+ * include it for now.
+ *
+ * If you're hitting this error, you probably want to attempt to find an
+ * existing mapping via dsm_find_mapping() before calling dsm_attach() to
+ * create a new one.
+ */
+ dlist_foreach(iter, &dsm_segment_list)
+ {
+ seg = dlist_container(dsm_segment, node, iter.cur);
+ if (seg->handle == h)
+ elog(ERROR, "can't attach the same segment more than once");
+ }
+
+ /* Create a new segment descriptor. */
+ seg = dsm_create_descriptor();
+ seg->handle = h;
+
+ /* Bump reference count for this segment in shared memory. */
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ nitems = dsm_control->nitems;
+ for (i = 0; i < nitems; ++i)
+ {
+ /*
+ * If the reference count is 0, the slot is actually unused. If the
+ * reference count is 1, the slot is still in use, but the segment is
+ * in the process of going away; even if the handle matches, another
+ * slot may already have started using the same handle value by
+ * coincidence so we have to keep searching.
+ */
+ if (dsm_control->item[i].refcnt <= 1)
+ continue;
+
+ /* If the handle doesn't match, it's not the slot we want. */
+ if (dsm_control->item[i].handle != seg->handle)
+ continue;
+
+ /* Otherwise we've found a match. */
+ dsm_control->item[i].refcnt++;
+ seg->control_slot = i;
+ if (is_main_region_dsm_handle(seg->handle))
+ {
+ seg->mapped_address = (char *) dsm_main_space_begin +
+ dsm_control->item[i].first_page * FPM_PAGE_SIZE;
+ seg->mapped_size = dsm_control->item[i].npages * FPM_PAGE_SIZE;
+ }
+ break;
+ }
+ LWLockRelease(DynamicSharedMemoryControlLock);
+
+ /*
+ * If we didn't find the handle we're looking for in the control segment,
+ * it probably means that everyone else who had it mapped, including the
+ * original creator, died before we got to this point. It's up to the
+ * caller to decide what to do about that.
+ */
+ if (seg->control_slot == INVALID_CONTROL_SLOT)
+ {
+ dsm_detach(seg);
+ return NULL;
+ }
+
+ /* Here's where we actually try to map the segment. */
+ if (!is_main_region_dsm_handle(seg->handle))
+ dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
+ &seg->mapped_address, &seg->mapped_size, ERROR);
+
+ return seg;
+}
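+
+/*
+ * A typical create/attach cycle looks roughly like this (hypothetical
+ * callers; the handle must be passed to the attaching backend out of band,
+ * for example through the main shared memory area):
+ *
+ *	In the creating backend:
+ *		seg = dsm_create(size, 0);
+ *		handle = dsm_segment_handle(seg);
+ *		addr = dsm_segment_address(seg);
+ *
+ *	In an attaching backend:
+ *		seg = dsm_attach(handle);
+ *		if (seg == NULL)
+ *			elog(ERROR, "segment is already gone");
+ *		addr = dsm_segment_address(seg);
+ *		...
+ *		dsm_detach(seg);
+ */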
+
+/*
+ * At backend shutdown time, detach any segments that are still attached.
+ * (This is similar to dsm_detach_all, except that there's no reason to
+ * unmap the control segment before exiting, so we don't bother.)
+ */
+void
+dsm_backend_shutdown(void)
+{
+ while (!dlist_is_empty(&dsm_segment_list))
+ {
+ dsm_segment *seg;
+
+ seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
+ dsm_detach(seg);
+ }
+}
+
+/*
+ * Detach all shared memory segments, including the control segments. This
+ * should be called, along with PGSharedMemoryDetach, in processes that
+ * might inherit mappings but are not intended to be connected to dynamic
+ * shared memory.
+ */
+void
+dsm_detach_all(void)
+{
+ void *control_address = dsm_control;
+
+ while (!dlist_is_empty(&dsm_segment_list))
+ {
+ dsm_segment *seg;
+
+ seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
+ dsm_detach(seg);
+ }
+
+ if (control_address != NULL)
+ dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
+ &dsm_control_impl_private, &control_address,
+ &dsm_control_mapped_size, ERROR);
+}
+
+/*
+ * Detach from a shared memory segment, destroying the segment if we
+ * remove the last reference.
+ *
+ * This function should never fail. It will often be invoked when aborting
+ * a transaction, and a further error won't serve any purpose. It's not a
+ * complete disaster if we fail to unmap or destroy the segment; it means a
+ * resource leak, but that doesn't necessarily preclude further operations.
+ */
+void
+dsm_detach(dsm_segment *seg)
+{
+ /*
+ * Invoke registered callbacks. Just in case one of those callbacks
+ * throws a further error that brings us back here, pop the callback
+ * before invoking it, to avoid infinite error recursion. Don't allow
+ * interrupts while running the individual callbacks in non-error code
+ * paths, to avoid leaving cleanup work unfinished if we're interrupted by
+ * a statement timeout or similar.
+ */
+ HOLD_INTERRUPTS();
+ while (!slist_is_empty(&seg->on_detach))
+ {
+ slist_node *node;
+ dsm_segment_detach_callback *cb;
+ on_dsm_detach_callback function;
+ Datum arg;
+
+ node = slist_pop_head_node(&seg->on_detach);
+ cb = slist_container(dsm_segment_detach_callback, node, node);
+ function = cb->function;
+ arg = cb->arg;
+ pfree(cb);
+
+ function(seg, arg);
+ }
+ RESUME_INTERRUPTS();
+
+ /*
+ * Try to remove the mapping, if one exists. Normally, there will be, but
+ * maybe not, if we failed partway through a create or attach operation.
+ * We remove the mapping before decrementing the reference count so that
+ * the process that sees a zero reference count can be certain that no
+ * remaining mappings exist. Even if this fails, we pretend that it
+ * works, because retrying is likely to fail in the same way.
+ */
+ if (seg->mapped_address != NULL)
+ {
+ if (!is_main_region_dsm_handle(seg->handle))
+ dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
+ &seg->mapped_address, &seg->mapped_size, WARNING);
+ seg->impl_private = NULL;
+ seg->mapped_address = NULL;
+ seg->mapped_size = 0;
+ }
+
+ /* Reduce reference count, if we previously increased it. */
+ if (seg->control_slot != INVALID_CONTROL_SLOT)
+ {
+ uint32 refcnt;
+ uint32 control_slot = seg->control_slot;
+
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ Assert(dsm_control->item[control_slot].handle == seg->handle);
+ Assert(dsm_control->item[control_slot].refcnt > 1);
+ refcnt = --dsm_control->item[control_slot].refcnt;
+ seg->control_slot = INVALID_CONTROL_SLOT;
+ LWLockRelease(DynamicSharedMemoryControlLock);
+
+ /* If new reference count is 1, try to destroy the segment. */
+ if (refcnt == 1)
+ {
+ /* A pinned segment should never reach 1. */
+ Assert(!dsm_control->item[control_slot].pinned);
+
+ /*
+ * If we fail to destroy the segment here, or are killed before we
+ * finish doing so, the reference count will remain at 1, which
+ * will mean that nobody else can attach to the segment. At
+ * postmaster shutdown time, or when a new postmaster is started
+ * after a hard kill, another attempt will be made to remove the
+ * segment.
+ *
+ * The main case we're worried about here is being killed by a
+ * signal before we can finish removing the segment. In that
+ * case, it's important to be sure that the segment still gets
+ * removed. If we actually fail to remove the segment for some
+ * other reason, the postmaster may not have any better luck than
+ * we did. There's not much we can do about that, though.
+ */
+ if (is_main_region_dsm_handle(seg->handle) ||
+ dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
+ &seg->mapped_address, &seg->mapped_size, WARNING))
+ {
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ if (is_main_region_dsm_handle(seg->handle))
+ FreePageManagerPut((FreePageManager *) dsm_main_space_begin,
+ dsm_control->item[control_slot].first_page,
+ dsm_control->item[control_slot].npages);
+ Assert(dsm_control->item[control_slot].handle == seg->handle);
+ Assert(dsm_control->item[control_slot].refcnt == 1);
+ dsm_control->item[control_slot].refcnt = 0;
+ LWLockRelease(DynamicSharedMemoryControlLock);
+ }
+ }
+ }
+
+ /* Clean up our remaining backend-private data structures. */
+ if (seg->resowner != NULL)
+ ResourceOwnerForgetDSM(seg->resowner, seg);
+ dlist_delete(&seg->node);
+ pfree(seg);
+}
+
+/*
+ * Keep a dynamic shared memory mapping until end of session.
+ *
+ * By default, mappings are owned by the current resource owner, which
+ * typically means they stick around for the duration of the current query
+ * only.
+ */
+void
+dsm_pin_mapping(dsm_segment *seg)
+{
+ if (seg->resowner != NULL)
+ {
+ ResourceOwnerForgetDSM(seg->resowner, seg);
+ seg->resowner = NULL;
+ }
+}
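+
+/*
+ * Illustrative sketch (hypothetical caller, not part of this file): code
+ * that wants a mapping to survive past the current query might do, roughly,
+ *
+ *     dsm_segment *seg = dsm_create(size, 0);
+ *     dsm_pin_mapping(seg);
+ *
+ * after which resource-owner cleanup no longer releases the mapping; "size"
+ * here stands for whatever the caller needs.
+ */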
+
+/*
+ * Arrange to remove a dynamic shared memory mapping at cleanup time.
+ *
+ * dsm_pin_mapping() can be used to preserve a mapping for the entire
+ * lifetime of a process; this function reverses that decision, making
+ * the segment owned by the current resource owner. This may be useful
+ * just before performing some operation that will invalidate the segment
+ * for future use by this backend.
+ */
+void
+dsm_unpin_mapping(dsm_segment *seg)
+{
+ Assert(seg->resowner == NULL);
+ ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
+ seg->resowner = CurrentResourceOwner;
+ ResourceOwnerRememberDSM(seg->resowner, seg);
+}
+
+/*
+ * Keep a dynamic shared memory segment until postmaster shutdown, or until
+ * dsm_unpin_segment is called.
+ *
+ * This function should not be called more than once per segment, unless the
+ * segment is explicitly unpinned with dsm_unpin_segment in between calls.
+ *
+ * Note that this function does not arrange for the current process to
+ * keep the segment mapped indefinitely; if that behavior is desired,
+ * dsm_pin_mapping() should be used from each process that needs to
+ * retain the mapping.
+ */
+void
+dsm_pin_segment(dsm_segment *seg)
+{
+ void *handle;
+
+ /*
+ * Bump reference count for this segment in shared memory. This will
+	 * ensure that the segment remains until postmaster shutdown or an
+	 * explicit call to unpin, even if no session is attached to it.
+ */
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ if (dsm_control->item[seg->control_slot].pinned)
+ elog(ERROR, "cannot pin a segment that is already pinned");
+ dsm_impl_pin_segment(seg->handle, seg->impl_private, &handle);
+ dsm_control->item[seg->control_slot].pinned = true;
+ dsm_control->item[seg->control_slot].refcnt++;
+ dsm_control->item[seg->control_slot].impl_private_pm_handle = handle;
+ LWLockRelease(DynamicSharedMemoryControlLock);
+}
+
+/*
+ * Unpin a dynamic shared memory segment that was previously pinned with
+ * dsm_pin_segment. This function should not be called unless dsm_pin_segment
+ * was previously called for this segment.
+ *
+ * The argument is a dsm_handle rather than a dsm_segment in case you want
+ * to unpin a segment to which you haven't attached. This turns out to be
+ * useful if, for example, a reference to one shared memory segment is stored
+ * within another shared memory segment. You might want to unpin the
+ * referenced segment before destroying the referencing segment.
+ */
+void
+dsm_unpin_segment(dsm_handle handle)
+{
+ uint32 control_slot = INVALID_CONTROL_SLOT;
+ bool destroy = false;
+ uint32 i;
+
+ /* Find the control slot for the given handle. */
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ for (i = 0; i < dsm_control->nitems; ++i)
+ {
+ /* Skip unused slots and segments that are concurrently going away. */
+ if (dsm_control->item[i].refcnt <= 1)
+ continue;
+
+ /* If we've found our handle, we can stop searching. */
+ if (dsm_control->item[i].handle == handle)
+ {
+ control_slot = i;
+ break;
+ }
+ }
+
+ /*
+ * We should definitely have found the slot, and it should not already be
+ * in the process of going away, because this function should only be
+ * called on a segment which is pinned.
+ */
+ if (control_slot == INVALID_CONTROL_SLOT)
+ elog(ERROR, "cannot unpin unknown segment handle");
+ if (!dsm_control->item[control_slot].pinned)
+ elog(ERROR, "cannot unpin a segment that is not pinned");
+ Assert(dsm_control->item[control_slot].refcnt > 1);
+
+ /*
+ * Allow implementation-specific code to run. We have to do this before
+ * releasing the lock, because impl_private_pm_handle may get modified by
+ * dsm_impl_unpin_segment.
+ */
+ dsm_impl_unpin_segment(handle,
+ &dsm_control->item[control_slot].impl_private_pm_handle);
+
+ /* Note that 1 means no references (0 means unused slot). */
+ if (--dsm_control->item[control_slot].refcnt == 1)
+ destroy = true;
+ dsm_control->item[control_slot].pinned = false;
+
+ /* Now we can release the lock. */
+ LWLockRelease(DynamicSharedMemoryControlLock);
+
+ /* Clean up resources if that was the last reference. */
+ if (destroy)
+ {
+ void *junk_impl_private = NULL;
+ void *junk_mapped_address = NULL;
+ Size junk_mapped_size = 0;
+
+ /*
+ * For an explanation of how error handling works in this case, see
+ * comments in dsm_detach. Note that if we reach this point, the
+ * current process certainly does not have the segment mapped, because
+		 * if it did, the reference count would still have been greater than 1
+		 * even after releasing the reference held by the pin.  The fact
+ * that there can't be a dsm_segment for this handle makes it OK to
+ * pass the mapped size, mapped address, and private data as NULL
+ * here.
+ */
+ if (is_main_region_dsm_handle(handle) ||
+ dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
+ &junk_mapped_address, &junk_mapped_size, WARNING))
+ {
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ if (is_main_region_dsm_handle(handle))
+ FreePageManagerPut((FreePageManager *) dsm_main_space_begin,
+ dsm_control->item[control_slot].first_page,
+ dsm_control->item[control_slot].npages);
+ Assert(dsm_control->item[control_slot].handle == handle);
+ Assert(dsm_control->item[control_slot].refcnt == 1);
+ dsm_control->item[control_slot].refcnt = 0;
+ LWLockRelease(DynamicSharedMemoryControlLock);
+ }
+ }
+}
+
+/*
+ * Find an existing mapping for a shared memory segment, if there is one.
+ */
+dsm_segment *
+dsm_find_mapping(dsm_handle h)
+{
+ dlist_iter iter;
+ dsm_segment *seg;
+
+ dlist_foreach(iter, &dsm_segment_list)
+ {
+ seg = dlist_container(dsm_segment, node, iter.cur);
+ if (seg->handle == h)
+ return seg;
+ }
+
+ return NULL;
+}
+
+/*
+ * Get the address at which a dynamic shared memory segment is mapped.
+ */
+void *
+dsm_segment_address(dsm_segment *seg)
+{
+ Assert(seg->mapped_address != NULL);
+ return seg->mapped_address;
+}
+
+/*
+ * Get the size of a mapping.
+ */
+Size
+dsm_segment_map_length(dsm_segment *seg)
+{
+ Assert(seg->mapped_address != NULL);
+ return seg->mapped_size;
+}
+
+/*
+ * Get a handle for a mapping.
+ *
+ * To establish communication via dynamic shared memory between two backends,
+ * one of them should first call dsm_create() to establish a new shared
+ * memory mapping. That process should then call dsm_segment_handle() to
+ * obtain a handle for the mapping, and pass that handle to the
+ * coordinating backend via some means (e.g. bgw_main_arg, or via the
+ * main shared memory segment). The recipient, once in possession of the
+ * handle, should call dsm_attach().
+ */
+dsm_handle
+dsm_segment_handle(dsm_segment *seg)
+{
+ return seg->handle;
+}
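+
+/*
+ * Illustrative sketch (hypothetical caller, not part of this file): the
+ * creating backend might do, roughly,
+ *
+ *     dsm_segment *seg = dsm_create(size, 0);
+ *     dsm_handle h = dsm_segment_handle(seg);
+ *     ... pass h to the other backend, e.g. via bgw_main_arg ...
+ *
+ * and the recipient then calls dsm_attach(h) to obtain its own mapping.
+ */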
+
+/*
+ * Register an on-detach callback for a dynamic shared memory segment.
+ */
+void
+on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg)
+{
+ dsm_segment_detach_callback *cb;
+
+ cb = MemoryContextAlloc(TopMemoryContext,
+ sizeof(dsm_segment_detach_callback));
+ cb->function = function;
+ cb->arg = arg;
+ slist_push_head(&seg->on_detach, &cb->node);
+}
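+
+/*
+ * Illustrative sketch (hypothetical callback, not part of this file): a
+ * module holding per-segment state might register cleanup like
+ *
+ *     static void
+ *     my_cleanup(dsm_segment *seg, Datum arg)
+ *     {
+ *         ... release whatever arg points to ...
+ *     }
+ *
+ *     on_dsm_detach(seg, my_cleanup, PointerGetDatum(my_state));
+ *
+ * Callbacks run in reverse order of registration when the segment is
+ * detached; see dsm_detach above.
+ */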
+
+/*
+ * Unregister an on-detach callback for a dynamic shared memory segment.
+ */
+void
+cancel_on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function,
+ Datum arg)
+{
+ slist_mutable_iter iter;
+
+ slist_foreach_modify(iter, &seg->on_detach)
+ {
+ dsm_segment_detach_callback *cb;
+
+ cb = slist_container(dsm_segment_detach_callback, node, iter.cur);
+ if (cb->function == function && cb->arg == arg)
+ {
+ slist_delete_current(&iter);
+ pfree(cb);
+ break;
+ }
+ }
+}
+
+/*
+ * Discard all registered on-detach callbacks without executing them.
+ */
+void
+reset_on_dsm_detach(void)
+{
+ dlist_iter iter;
+
+ dlist_foreach(iter, &dsm_segment_list)
+ {
+ dsm_segment *seg = dlist_container(dsm_segment, node, iter.cur);
+
+ /* Throw away explicit on-detach actions one by one. */
+ while (!slist_is_empty(&seg->on_detach))
+ {
+ slist_node *node;
+ dsm_segment_detach_callback *cb;
+
+ node = slist_pop_head_node(&seg->on_detach);
+ cb = slist_container(dsm_segment_detach_callback, node, node);
+ pfree(cb);
+ }
+
+ /*
+ * Decrementing the reference count is a sort of implicit on-detach
+ * action; make sure we don't do that, either.
+ */
+ seg->control_slot = INVALID_CONTROL_SLOT;
+ }
+}
+
+/*
+ * Create a segment descriptor.
+ */
+static dsm_segment *
+dsm_create_descriptor(void)
+{
+ dsm_segment *seg;
+
+ if (CurrentResourceOwner)
+ ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
+
+ seg = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment));
+ dlist_push_head(&dsm_segment_list, &seg->node);
+
+ /* seg->handle must be initialized by the caller */
+ seg->control_slot = INVALID_CONTROL_SLOT;
+ seg->impl_private = NULL;
+ seg->mapped_address = NULL;
+ seg->mapped_size = 0;
+
+ seg->resowner = CurrentResourceOwner;
+ if (CurrentResourceOwner)
+ ResourceOwnerRememberDSM(CurrentResourceOwner, seg);
+
+ slist_init(&seg->on_detach);
+
+ return seg;
+}
+
+/*
+ * Sanity check a control segment.
+ *
+ * The goal here isn't to detect everything that could possibly be wrong with
+ * the control segment; there's not enough information for that. Rather, the
+ * goal is to make sure that someone can iterate over the items in the segment
+ * without overrunning the end of the mapping and crashing. We also check
+ * the magic number since, if that's messed up, this may not even be one of
+ * our segments at all.
+ */
+static bool
+dsm_control_segment_sane(dsm_control_header *control, Size mapped_size)
+{
+ if (mapped_size < offsetof(dsm_control_header, item))
+ return false; /* Mapped size too short to read header. */
+ if (control->magic != PG_DYNSHMEM_CONTROL_MAGIC)
+ return false; /* Magic number doesn't match. */
+ if (dsm_control_bytes_needed(control->maxitems) > mapped_size)
+ return false; /* Max item count won't fit in map. */
+ if (control->nitems > control->maxitems)
+ return false; /* Overfull. */
+ return true;
+}
+
+/*
+ * Compute the number of control-segment bytes needed to store a given
+ * number of items.
+ */
+static uint64
+dsm_control_bytes_needed(uint32 nitems)
+{
+ return offsetof(dsm_control_header, item)
+ + sizeof(dsm_control_item) * (uint64) nitems;
+}
+
+static inline dsm_handle
+make_main_region_dsm_handle(int slot)
+{
+ dsm_handle handle;
+
+ /*
+ * We need to create a handle that doesn't collide with any existing extra
+ * segment created by dsm_impl_op(), so we'll make it odd. It also
+ * mustn't collide with any other main area pseudo-segment, so we'll
+ * include the slot number in some of the bits. We also want to make an
+	 * effort to keep newly created and recently destroyed handles from being
+	 * confused, so we'll make the rest of the bits random.
+ */
+ handle = 1;
+ handle |= slot << 1;
+ handle |= random() << (pg_leftmost_one_pos32(dsm_control->maxitems) + 1);
+ return handle;
+}
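+
+/*
+ * For illustration, a handle produced above therefore looks roughly like
+ * this:
+ *
+ *     bit 0          : always 1 (marks a main-region pseudo-segment)
+ *     next bits      : the control-slot number
+ *     remaining bits : random, to make confusion of old and new handles
+ *                      unlikely
+ *
+ * so is_main_region_dsm_handle() below only needs to test the low bit.
+ */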
+
+static inline bool
+is_main_region_dsm_handle(dsm_handle handle)
+{
+ return handle & 1;
+}
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
new file mode 100644
index 0000000..c51e3e6
--- /dev/null
+++ b/src/backend/storage/ipc/dsm_impl.c
@@ -0,0 +1,1058 @@
+/*-------------------------------------------------------------------------
+ *
+ * dsm_impl.c
+ * manage dynamic shared memory segments
+ *
+ * This file provides low-level APIs for creating and destroying shared
+ * memory segments using several different possible techniques. We refer
+ * to these segments as dynamic because they can be created, altered, and
+ * destroyed at any point during the server life cycle. This is unlike
+ * the main shared memory segment, of which there is always exactly one
+ * and which is always mapped at a fixed address in every PostgreSQL
+ * background process.
+ *
+ * Because not all systems provide the same primitives in this area, nor
+ * do all primitives behave the same way on all systems, we provide
+ * several implementations of this facility. Many systems implement
+ * POSIX shared memory (shm_open etc.), which is well-suited to our needs
+ * in this area, with the exception that shared memory identifiers live
+ * in a flat system-wide namespace, raising the uncomfortable prospect of
+ * name collisions with other processes (including other copies of
+ * PostgreSQL) running on the same system. Some systems only support
+ * the older System V shared memory interface (shmget etc.) which is
+ * also usable; however, the default allocation limits are often quite
+ * small, and the namespace is even more restricted.
+ *
+ * We also provide an mmap-based shared memory implementation. This may
+ * be useful on systems that provide shared memory via a special-purpose
+ * filesystem; by opting for this implementation, the user can even
+ * control precisely where their shared memory segments are placed. It
+ * can also be used as a fallback for systems where shm_open and shmget
+ * are not available or can't be used for some reason. Of course,
+ * mapping a file residing on an actual spinning disk is a fairly poor
+ * approximation for shared memory because writeback may hurt performance
+ * substantially, but there should be few systems where we must make do
+ * with such poor tools.
+ *
+ * As ever, Windows requires its own implementation.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/dsm_impl.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <signal.h>
+#include <unistd.h>
+#ifndef WIN32
+#include <sys/mman.h>
+#endif
+#include <sys/stat.h>
+#ifdef HAVE_SYS_IPC_H
+#include <sys/ipc.h>
+#endif
+#ifdef HAVE_SYS_SHM_H
+#include <sys/shm.h>
+#endif
+
+#include "common/file_perm.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "portability/mem.h"
+#include "postmaster/postmaster.h"
+#include "storage/dsm_impl.h"
+#include "storage/fd.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+
+#ifdef USE_DSM_POSIX
+static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address,
+ Size *mapped_size, int elevel);
+static int dsm_impl_posix_resize(int fd, off_t size);
+#endif
+#ifdef USE_DSM_SYSV
+static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address,
+ Size *mapped_size, int elevel);
+#endif
+#ifdef USE_DSM_WINDOWS
+static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address,
+ Size *mapped_size, int elevel);
+#endif
+#ifdef USE_DSM_MMAP
+static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address,
+ Size *mapped_size, int elevel);
+#endif
+static int errcode_for_dynamic_shared_memory(void);
+
+const struct config_enum_entry dynamic_shared_memory_options[] = {
+#ifdef USE_DSM_POSIX
+ {"posix", DSM_IMPL_POSIX, false},
+#endif
+#ifdef USE_DSM_SYSV
+ {"sysv", DSM_IMPL_SYSV, false},
+#endif
+#ifdef USE_DSM_WINDOWS
+ {"windows", DSM_IMPL_WINDOWS, false},
+#endif
+#ifdef USE_DSM_MMAP
+ {"mmap", DSM_IMPL_MMAP, false},
+#endif
+ {NULL, 0, false}
+};
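+
+/*
+ * For illustration: the implementation is selected with the
+ * dynamic_shared_memory_type GUC, e.g. in postgresql.conf:
+ *
+ *     dynamic_shared_memory_type = posix
+ *
+ * Only the values compiled into the table above are accepted on a given
+ * platform.
+ */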
+
+/* Implementation selector. */
+int dynamic_shared_memory_type;
+
+/* Amount of space reserved for DSM segments in the main area. */
+int min_dynamic_shared_memory;
+
+/* Size of buffer to be used for zero-filling. */
+#define ZBUFFER_SIZE 8192
+
+#define SEGMENT_NAME_PREFIX "Global/PostgreSQL"
+
+/*------
+ * Perform a low-level shared memory operation in a platform-specific way,
+ * as dictated by the selected implementation. Each implementation is
+ * required to implement the following primitives.
+ *
+ * DSM_OP_CREATE. Create a segment whose size is the request_size and
+ * map it.
+ *
+ * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
+ *
+ * DSM_OP_DETACH. Unmap the segment.
+ *
+ * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
+ * segment.
+ *
+ * Arguments:
+ * op: The operation to be performed.
+ * handle: The handle of an existing object, or for DSM_OP_CREATE, the
+ *     new handle the caller wants created.
+ * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0.
+ * impl_private: Private, implementation-specific data. Will be a pointer
+ * to NULL for the first operation on a shared memory segment within this
+ * backend; thereafter, it will point to the value to which it was set
+ * on the previous call.
+ * mapped_address: Pointer to start of current mapping; pointer to NULL
+ * if none. Updated with new mapping address.
+ * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
+ * Updated with new mapped size.
+ * elevel: Level at which to log errors.
+ *
+ * Return value: true on success, false on failure. When false is returned,
+ * a message should first be logged at the specified elevel, except in the
+ * case where DSM_OP_CREATE experiences a name collision, which should
+ * silently return false.
+ *------
+ */
+bool
+dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address, Size *mapped_size,
+ int elevel)
+{
+ Assert(op == DSM_OP_CREATE || request_size == 0);
+ Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
+ (*mapped_address == NULL && *mapped_size == 0));
+
+ switch (dynamic_shared_memory_type)
+ {
+#ifdef USE_DSM_POSIX
+ case DSM_IMPL_POSIX:
+ return dsm_impl_posix(op, handle, request_size, impl_private,
+ mapped_address, mapped_size, elevel);
+#endif
+#ifdef USE_DSM_SYSV
+ case DSM_IMPL_SYSV:
+ return dsm_impl_sysv(op, handle, request_size, impl_private,
+ mapped_address, mapped_size, elevel);
+#endif
+#ifdef USE_DSM_WINDOWS
+ case DSM_IMPL_WINDOWS:
+ return dsm_impl_windows(op, handle, request_size, impl_private,
+ mapped_address, mapped_size, elevel);
+#endif
+#ifdef USE_DSM_MMAP
+ case DSM_IMPL_MMAP:
+ return dsm_impl_mmap(op, handle, request_size, impl_private,
+ mapped_address, mapped_size, elevel);
+#endif
+ default:
+ elog(ERROR, "unexpected dynamic shared memory type: %d",
+ dynamic_shared_memory_type);
+ return false;
+ }
+}
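+
+/*
+ * Illustrative sketch (mirrors how dsm.c drives this function; not a new
+ * API): a create-and-map call looks roughly like
+ *
+ *     void *impl_private = NULL;
+ *     void *mapped_address = NULL;
+ *     Size  mapped_size = 0;
+ *
+ *     if (!dsm_impl_op(DSM_OP_CREATE, handle, request_size,
+ *                      &impl_private, &mapped_address, &mapped_size, ERROR))
+ *         ... name collision: pick a different handle and retry ...
+ *
+ * with later DSM_OP_DETACH/DSM_OP_DESTROY calls passing back the same three
+ * pointers so the implementation can find its state.
+ */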
+
+#ifdef USE_DSM_POSIX
+/*
+ * Operating system primitives to support POSIX shared memory.
+ *
+ * POSIX shared memory segments are created and attached using shm_open()
+ * and shm_unlink(); other operations, such as sizing or mapping the
+ * segment, are performed as if the shared memory segments were files.
+ *
+ * Indeed, on some platforms, they may be implemented that way. While
+ * POSIX shared memory segments seem intended to exist in a flat namespace,
+ * some operating systems may implement them as files, even going so far
+ * to treat a request for /xyz as a request to create a file by that name
+ * in the root directory. Users of such broken platforms should select
+ * a different shared memory implementation.
+ */
+static bool
+dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address, Size *mapped_size,
+ int elevel)
+{
+ char name[64];
+ int flags;
+ int fd;
+ char *address;
+
+ snprintf(name, 64, "/PostgreSQL.%u", handle);
+
+ /* Handle teardown cases. */
+ if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
+ {
+ if (*mapped_address != NULL
+ && munmap(*mapped_address, *mapped_size) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not unmap shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ *mapped_address = NULL;
+ *mapped_size = 0;
+ if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not remove shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ return true;
+ }
+
+ /*
+ * Create new segment or open an existing one for attach.
+ *
+ * Even though we will close the FD before returning, it seems desirable
+ * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
+ * failure. The fact that we won't hold the FD open long justifies using
+ * ReserveExternalFD rather than AcquireExternalFD, though.
+ */
+ ReserveExternalFD();
+
+ flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
+ if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
+ {
+ ReleaseExternalFD();
+ if (op == DSM_OP_ATTACH || errno != EEXIST)
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not open shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+
+ /*
+ * If we're attaching the segment, determine the current size; if we are
+ * creating the segment, set the size to the requested value.
+ */
+ if (op == DSM_OP_ATTACH)
+ {
+ struct stat st;
+
+ if (fstat(fd, &st) != 0)
+ {
+ int save_errno;
+
+ /* Back out what's already been done. */
+ save_errno = errno;
+ close(fd);
+ ReleaseExternalFD();
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not stat shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ request_size = st.st_size;
+ }
+ else if (dsm_impl_posix_resize(fd, request_size) != 0)
+ {
+ int save_errno;
+
+ /* Back out what's already been done. */
+ save_errno = errno;
+ close(fd);
+ ReleaseExternalFD();
+ shm_unlink(name);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
+ name, request_size)));
+ return false;
+ }
+
+ /* Map it. */
+ address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
+ if (address == MAP_FAILED)
+ {
+ int save_errno;
+
+ /* Back out what's already been done. */
+ save_errno = errno;
+ close(fd);
+ ReleaseExternalFD();
+ if (op == DSM_OP_CREATE)
+ shm_unlink(name);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not map shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ *mapped_address = address;
+ *mapped_size = request_size;
+ close(fd);
+ ReleaseExternalFD();
+
+ return true;
+}
+
+/*
+ * Set the size of a virtual memory region associated with a file descriptor.
+ * If necessary, also ensure that virtual memory is actually allocated by the
+ * operating system, to avoid nasty surprises later.
+ *
+ * Returns non-zero if either truncation or allocation fails, and sets errno.
+ */
+static int
+dsm_impl_posix_resize(int fd, off_t size)
+{
+ int rc;
+ int save_errno;
+ sigset_t save_sigmask;
+
+ /*
+ * Block all blockable signals, except SIGQUIT. posix_fallocate() can run
+ * for quite a long time, and is an all-or-nothing operation. If we
+ * allowed SIGUSR1 to interrupt us repeatedly (for example, due to recovery
+ * conflicts), the retry loop might never succeed.
+ */
+ if (IsUnderPostmaster)
+ sigprocmask(SIG_SETMASK, &BlockSig, &save_sigmask);
+
+ /* Truncate (or extend) the file to the requested size. */
+ do
+ {
+ rc = ftruncate(fd, size);
+ } while (rc < 0 && errno == EINTR);
+
+ /*
+ * On Linux, a shm_open fd is backed by a tmpfs file. After resizing with
+ * ftruncate, the file may contain a hole. Accessing memory backed by a
+ * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
+ * is no more tmpfs space available. So we ask tmpfs to allocate pages
+ * here, so we can fail gracefully with ENOSPC now rather than risking
+ * SIGBUS later.
+ */
+#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
+ if (rc == 0)
+ {
+ /*
+ * We still use a traditional EINTR retry loop to handle SIGCONT.
+ * posix_fallocate() doesn't restart automatically, and we don't want
+ * this to fail if you attach a debugger.
+ */
+ pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
+ do
+ {
+ rc = posix_fallocate(fd, 0, size);
+ } while (rc == EINTR);
+ pgstat_report_wait_end();
+
+ /*
+ * The caller expects errno to be set, but posix_fallocate() doesn't
+ * set it. Instead it returns error numbers directly. So set errno,
+ * even though we'll also return rc to indicate success or failure.
+ */
+ errno = rc;
+ }
+#endif /* HAVE_POSIX_FALLOCATE && __linux__ */
+
+ if (IsUnderPostmaster)
+ {
+ save_errno = errno;
+ sigprocmask(SIG_SETMASK, &save_sigmask, NULL);
+ errno = save_errno;
+ }
+
+ return rc;
+}
+
+#endif /* USE_DSM_POSIX */
+
+#ifdef USE_DSM_SYSV
+/*
+ * Operating system primitives to support System V shared memory.
+ *
+ * System V shared memory segments are manipulated using shmget(), shmat(),
+ * shmdt(), and shmctl(). As the default allocation limits for System V
+ * shared memory are usually quite low, the POSIX facilities may be
+ * preferable; but those are not supported everywhere.
+ */
+static bool
+dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address, Size *mapped_size,
+ int elevel)
+{
+ key_t key;
+ int ident;
+ char *address;
+ char name[64];
+ int *ident_cache;
+
+ /*
+ * POSIX shared memory and mmap-based shared memory identify segments with
+ * names. To avoid needless error message variation, we use the handle as
+ * the name.
+ */
+ snprintf(name, 64, "%u", handle);
+
+ /*
+ * The System V shared memory namespace is very restricted; names are of
+ * type key_t, which is expected to be some sort of integer data type, but
+ * not necessarily the same one as dsm_handle. Since we use dsm_handle to
+ * identify shared memory segments across processes, this might seem like
+ * a problem, but it's really not. If dsm_handle is bigger than key_t,
+ * the cast below might truncate away some bits from the handle the
+ * user-provided, but it'll truncate exactly the same bits away in exactly
+ * the same fashion every time we use that handle, which is all that
+ * really matters. Conversely, if dsm_handle is smaller than key_t, we
+ * won't use the full range of available key space, but that's no big deal
+ * either.
+ *
+ * We do make sure that the key isn't negative, because that might not be
+ * portable.
+ */
+ key = (key_t) handle;
+ if (key < 1) /* avoid compiler warning if type is unsigned */
+ key = -key;
+
+ /*
+ * There's one special key, IPC_PRIVATE, which can't be used. If we end
+ * up with that value by chance during a create operation, just pretend it
+	 * already exists, so that the caller will retry.  If we run into it anywhere
+ * else, the caller has passed a handle that doesn't correspond to
+ * anything we ever created, which should not happen.
+ */
+ if (key == IPC_PRIVATE)
+ {
+ if (op != DSM_OP_CREATE)
+ elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
+ errno = EEXIST;
+ return false;
+ }
+
+ /*
+ * Before we can do anything with a shared memory segment, we have to map
+ * the shared memory key to a shared memory identifier using shmget(). To
+ * avoid repeated lookups, we store the key using impl_private.
+ */
+ if (*impl_private != NULL)
+ {
+ ident_cache = *impl_private;
+ ident = *ident_cache;
+ }
+ else
+ {
+ int flags = IPCProtection;
+ size_t segsize;
+
+ /*
+ * Allocate the memory BEFORE acquiring the resource, so that we don't
+ * leak the resource if memory allocation fails.
+ */
+ ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));
+
+ /*
+ * When using shmget to find an existing segment, we must pass the
+ * size as 0. Passing a non-zero size which is greater than the
+ * actual size will result in EINVAL.
+ */
+ segsize = 0;
+
+ if (op == DSM_OP_CREATE)
+ {
+ flags |= IPC_CREAT | IPC_EXCL;
+ segsize = request_size;
+ }
+
+ if ((ident = shmget(key, segsize, flags)) == -1)
+ {
+ if (op == DSM_OP_ATTACH || errno != EEXIST)
+ {
+ int save_errno = errno;
+
+ pfree(ident_cache);
+ errno = save_errno;
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not get shared memory segment: %m")));
+ }
+ return false;
+ }
+
+ *ident_cache = ident;
+ *impl_private = ident_cache;
+ }
+
+ /* Handle teardown cases. */
+ if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
+ {
+ pfree(ident_cache);
+ *impl_private = NULL;
+ if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not unmap shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ *mapped_address = NULL;
+ *mapped_size = 0;
+ if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
+ {
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not remove shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ return true;
+ }
+
+ /* If we're attaching it, we must use IPC_STAT to determine the size. */
+ if (op == DSM_OP_ATTACH)
+ {
+ struct shmid_ds shm;
+
+ if (shmctl(ident, IPC_STAT, &shm) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not stat shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ request_size = shm.shm_segsz;
+ }
+
+ /* Map it. */
+ address = shmat(ident, NULL, PG_SHMAT_FLAGS);
+ if (address == (void *) -1)
+ {
+ int save_errno;
+
+ /* Back out what's already been done. */
+ save_errno = errno;
+ if (op == DSM_OP_CREATE)
+ shmctl(ident, IPC_RMID, NULL);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not map shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ *mapped_address = address;
+ *mapped_size = request_size;
+
+ return true;
+}
+#endif
+
+#ifdef USE_DSM_WINDOWS
+/*
+ * Operating system primitives to support Windows shared memory.
+ *
+ * The Windows shared memory implementation is based on a file mapping,
+ * which can be backed either by a physical file or by the system paging
+ * file.  The current implementation uses the system paging file, because
+ * the performance implications of backing it with a physical file are
+ * unclear and because the main shared memory segment on Windows is
+ * handled the same way.
+ *
+ * A memory mapping object is a kernel object - they always get deleted when
+ * the last reference to them goes away, either explicitly via a CloseHandle or
+ * when the process containing the reference exits.
+ */
+static bool
+dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address,
+ Size *mapped_size, int elevel)
+{
+ char *address;
+ HANDLE hmap;
+ char name[64];
+ MEMORY_BASIC_INFORMATION info;
+
+ /*
+	 * Storing the shared memory segment in the Global\ namespace allows any
+	 * process running in any session to access the file mapping object,
+	 * provided that the caller has the required access rights.  But to avoid
+	 * the issues encountered with the main shared memory segment, we use a
+	 * naming convention similar to the one used there.  This can be revisited
+	 * once the issue mentioned in GetSharedMemName is resolved.
+ */
+ snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
+
+ /*
+ * Handle teardown cases. Since Windows automatically destroys the object
+ * when no references remain, we can treat it the same as detach.
+ */
+ if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
+ {
+ if (*mapped_address != NULL
+ && UnmapViewOfFile(*mapped_address) == 0)
+ {
+ _dosmaperr(GetLastError());
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not unmap shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ if (*impl_private != NULL
+ && CloseHandle(*impl_private) == 0)
+ {
+ _dosmaperr(GetLastError());
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not remove shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+
+ *impl_private = NULL;
+ *mapped_address = NULL;
+ *mapped_size = 0;
+ return true;
+ }
+
+ /* Create new segment or open an existing one for attach. */
+ if (op == DSM_OP_CREATE)
+ {
+ DWORD size_high;
+ DWORD size_low;
+ DWORD errcode;
+
+ /* Shifts >= the width of the type are undefined. */
+#ifdef _WIN64
+ size_high = request_size >> 32;
+#else
+ size_high = 0;
+#endif
+ size_low = (DWORD) request_size;
+
+ /* CreateFileMapping might not clear the error code on success */
+ SetLastError(0);
+
+ hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */
+ NULL, /* Default security attrs */
+ PAGE_READWRITE, /* Memory is read/write */
+ size_high, /* Upper 32 bits of size */
+ size_low, /* Lower 32 bits of size */
+ name);
+
+ errcode = GetLastError();
+ if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
+ {
+ /*
+ * On Windows, when the segment already exists, a handle for the
+ * existing segment is returned. We must close it before
+			 * returning.  However, if the existing segment was created by a
+			 * service, CreateFileMapping returns ERROR_ACCESS_DENIED.  We don't
+			 * call _dosmaperr here, so errno won't be modified.
+ */
+ if (hmap)
+ CloseHandle(hmap);
+ return false;
+ }
+
+ if (!hmap)
+ {
+ _dosmaperr(errcode);
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not create shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ }
+ else
+ {
+ hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
+ FALSE, /* do not inherit the name */
+ name); /* name of mapping object */
+ if (!hmap)
+ {
+ _dosmaperr(GetLastError());
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not open shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ }
+
+ /* Map it. */
+ address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
+ 0, 0, 0);
+ if (!address)
+ {
+ int save_errno;
+
+ _dosmaperr(GetLastError());
+ /* Back out what's already been done. */
+ save_errno = errno;
+ CloseHandle(hmap);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not map shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+
+ /*
+	 * VirtualQuery reports the size in page_size units, which is 4K for
+	 * Windows.  We need the size only when attaching, but it's better to also
+	 * fetch it when creating a new segment, so that the size is reported
+	 * consistently for both DSM_OP_CREATE and DSM_OP_ATTACH.
+ */
+ if (VirtualQuery(address, &info, sizeof(info)) == 0)
+ {
+ int save_errno;
+
+ _dosmaperr(GetLastError());
+ /* Back out what's already been done. */
+ save_errno = errno;
+ UnmapViewOfFile(address);
+ CloseHandle(hmap);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not stat shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+
+ *mapped_address = address;
+ *mapped_size = info.RegionSize;
+ *impl_private = hmap;
+
+ return true;
+}
+#endif
+
+#ifdef USE_DSM_MMAP
+/*
+ * Operating system primitives to support mmap-based shared memory.
+ *
+ * Calling this "shared memory" is somewhat of a misnomer, because what
+ * we're really doing is creating a bunch of files and mapping them into
+ * our address space. The operating system may feel obliged to
+ * synchronize the contents to disk even if nothing is being paged out,
+ * which will not serve us well. The user can relocate the pg_dynshmem
+ * directory to a ramdisk to avoid this problem, if available.
+ */
+static bool
+dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address, Size *mapped_size,
+ int elevel)
+{
+ char name[64];
+ int flags;
+ int fd;
+ char *address;
+
+ snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
+ handle);
+
+ /* Handle teardown cases. */
+ if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
+ {
+ if (*mapped_address != NULL
+ && munmap(*mapped_address, *mapped_size) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not unmap shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ *mapped_address = NULL;
+ *mapped_size = 0;
+ if (op == DSM_OP_DESTROY && unlink(name) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not remove shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ return true;
+ }
+
+ /* Create new segment or open an existing one for attach. */
+ flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
+ if ((fd = OpenTransientFile(name, flags)) == -1)
+ {
+ if (op == DSM_OP_ATTACH || errno != EEXIST)
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not open shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+
+ /*
+ * If we're attaching the segment, determine the current size; if we are
+ * creating the segment, set the size to the requested value.
+ */
+ if (op == DSM_OP_ATTACH)
+ {
+ struct stat st;
+
+ if (fstat(fd, &st) != 0)
+ {
+ int save_errno;
+
+ /* Back out what's already been done. */
+ save_errno = errno;
+ CloseTransientFile(fd);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not stat shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ request_size = st.st_size;
+ }
+ else
+ {
+ /*
+ * Allocate a buffer full of zeros.
+ *
+ * Note: palloc zbuffer, instead of just using a local char array, to
+ * ensure it is reasonably well-aligned; this may save a few cycles
+ * transferring data to the kernel.
+ */
+ char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
+ uint32 remaining = request_size;
+ bool success = true;
+
+ /*
+ * Zero-fill the file. We have to do this the hard way to ensure that
+ * all the file space has really been allocated, so that we don't
+ * later seg fault when accessing the memory mapping. This is pretty
+ * pessimal.
+ */
+ while (success && remaining > 0)
+ {
+ Size goal = remaining;
+
+ if (goal > ZBUFFER_SIZE)
+ goal = ZBUFFER_SIZE;
+ pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
+ if (write(fd, zbuffer, goal) == goal)
+ remaining -= goal;
+ else
+ success = false;
+ pgstat_report_wait_end();
+ }
+
+ if (!success)
+ {
+ int save_errno;
+
+ /* Back out what's already been done. */
+ save_errno = errno;
+ CloseTransientFile(fd);
+ unlink(name);
+ errno = save_errno ? save_errno : ENOSPC;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
+ name, request_size)));
+ return false;
+ }
+ }
+
+ /* Map it. */
+ address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
+ if (address == MAP_FAILED)
+ {
+ int save_errno;
+
+ /* Back out what's already been done. */
+ save_errno = errno;
+ CloseTransientFile(fd);
+ if (op == DSM_OP_CREATE)
+ unlink(name);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not map shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ *mapped_address = address;
+ *mapped_size = request_size;
+
+ if (CloseTransientFile(fd) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not close shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+
+ return true;
+}
+#endif
+
+/*
+ * Implementation-specific actions that must be performed when a segment is to
+ * be preserved even when no backend has it attached.
+ *
+ * Except on Windows, we don't need to do anything at all. But since Windows
+ * cleans up segments automatically when no references remain, we duplicate
+ * the segment handle into the postmaster process. The postmaster needn't
+ * do anything to receive the handle; Windows transfers it automatically.
+ */
+void
+dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
+ void **impl_private_pm_handle)
+{
+ switch (dynamic_shared_memory_type)
+ {
+#ifdef USE_DSM_WINDOWS
+ case DSM_IMPL_WINDOWS:
+ {
+ HANDLE hmap;
+
+ if (!DuplicateHandle(GetCurrentProcess(), impl_private,
+ PostmasterHandle, &hmap, 0, FALSE,
+ DUPLICATE_SAME_ACCESS))
+ {
+ char name[64];
+
+ snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
+ _dosmaperr(GetLastError());
+ ereport(ERROR,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not duplicate handle for \"%s\": %m",
+ name)));
+ }
+
+ /*
+ * Here, we remember the handle that we created in the
+ * postmaster process. This handle isn't actually usable in
+ * any process other than the postmaster, but that doesn't
+ * matter. We're just holding onto it so that, if the segment
+ * is unpinned, dsm_impl_unpin_segment can close it.
+ */
+ *impl_private_pm_handle = hmap;
+ break;
+ }
+#endif
+ default:
+ break;
+ }
+}
+
+/*
+ * Implementation-specific actions that must be performed when a segment is no
+ * longer to be preserved, so that it will be cleaned up when all backends
+ * have detached from it.
+ *
+ * Except on Windows, we don't need to do anything at all. For Windows, we
+ * close the extra handle that dsm_impl_pin_segment created in the
+ * postmaster's process space.
+ */
+void
+dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
+{
+ switch (dynamic_shared_memory_type)
+ {
+#ifdef USE_DSM_WINDOWS
+ case DSM_IMPL_WINDOWS:
+ {
+ if (*impl_private &&
+ !DuplicateHandle(PostmasterHandle, *impl_private,
+ NULL, NULL, 0, FALSE,
+ DUPLICATE_CLOSE_SOURCE))
+ {
+ char name[64];
+
+ snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
+ _dosmaperr(GetLastError());
+ ereport(ERROR,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not duplicate handle for \"%s\": %m",
+ name)));
+ }
+
+ *impl_private = NULL;
+ break;
+ }
+#endif
+ default:
+ break;
+ }
+}
+
+static int
+errcode_for_dynamic_shared_memory(void)
+{
+ if (errno == EFBIG || errno == ENOMEM)
+ return errcode(ERRCODE_OUT_OF_MEMORY);
+ else
+ return errcode_for_file_access();
+}
diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c
new file mode 100644
index 0000000..4045d7d
--- /dev/null
+++ b/src/backend/storage/ipc/ipc.c
@@ -0,0 +1,435 @@
+/*-------------------------------------------------------------------------
+ *
+ * ipc.c
+ * POSTGRES inter-process communication definitions.
+ *
+ * This file is misnamed, as it no longer has much of anything directly
+ * to do with IPC. The functionality here is concerned with managing
+ * exit-time cleanup for either a postmaster or a backend.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/ipc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "miscadmin.h"
+#ifdef PROFILE_PID_DIR
+#include "postmaster/autovacuum.h"
+#endif
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "tcop/tcopprot.h"
+
+
+/*
+ * This flag is set during proc_exit() to change ereport()'s behavior,
+ * so that an ereport() from an on_proc_exit routine cannot get us out
+ * of the exit procedure. We do NOT want to go back to the idle loop...
+ */
+bool proc_exit_inprogress = false;
+
+/*
+ * Set when shmem_exit() is in progress.
+ */
+bool shmem_exit_inprogress = false;
+
+/*
+ * This flag tracks whether we've called atexit() in the current process
+ * (or in the parent postmaster).
+ */
+static bool atexit_callback_setup = false;
+
+/* local functions */
+static void proc_exit_prepare(int code);
+
+
+/* ----------------------------------------------------------------
+ * exit() handling stuff
+ *
+ * These functions are generally in the same spirit as atexit(),
+ * but provide some additional features we need --- in particular,
+ * we want to register callbacks to invoke when we are disconnecting
+ * from a broken shared-memory context but not exiting the postmaster.
+ *
+ * Callback functions can take zero, one, or two args: the first passed
+ * arg is the integer exitcode, the second is the Datum supplied when
+ * the callback was registered.
+ * ----------------------------------------------------------------
+ */
+
+#define MAX_ON_EXITS 20
+
+struct ONEXIT
+{
+ pg_on_exit_callback function;
+ Datum arg;
+};
+
+static struct ONEXIT on_proc_exit_list[MAX_ON_EXITS];
+static struct ONEXIT on_shmem_exit_list[MAX_ON_EXITS];
+static struct ONEXIT before_shmem_exit_list[MAX_ON_EXITS];
+
+static int on_proc_exit_index,
+ on_shmem_exit_index,
+ before_shmem_exit_index;
+
+
+/* ----------------------------------------------------------------
+ * proc_exit
+ *
+ * this function calls all the callbacks registered
+ * for it (to free resources) and then calls exit.
+ *
+ * This should be the only function to call exit().
+ * -cim 2/6/90
+ *
+ * Unfortunately, we can't really guarantee that add-on code
+ * obeys the rule of not calling exit() directly. So, while
+ * this is the preferred way out of the system, we also register
+ * an atexit callback that will make sure cleanup happens.
+ * ----------------------------------------------------------------
+ */
+void
+proc_exit(int code)
+{
+ /* Clean up everything that must be cleaned up */
+ proc_exit_prepare(code);
+
+#ifdef PROFILE_PID_DIR
+ {
+ /*
+ * If we are profiling ourself then gprof's mcleanup() is about to
+ * write out a profile to ./gmon.out. Since mcleanup() always uses a
+ * fixed file name, each backend will overwrite earlier profiles. To
+ * fix that, we create a separate subdirectory for each backend
+ * (./gprof/pid) and 'cd' to that subdirectory before we exit() - that
+ * forces mcleanup() to write each profile into its own directory. We
+ * end up with something like: $PGDATA/gprof/8829/gmon.out
+ * $PGDATA/gprof/8845/gmon.out ...
+ *
+ * To avoid undesirable disk space bloat, autovacuum workers are
+ * discriminated against: all their gmon.out files go into the same
+ * subdirectory. Without this, an installation that is "just sitting
+ * there" nonetheless eats megabytes of disk space every few seconds.
+ *
+ * Note that we do this here instead of in an on_proc_exit() callback
+ * because we want to ensure that this code executes last - we don't
+ * want to interfere with any other on_proc_exit() callback. For the
+ * same reason, we do not include it in proc_exit_prepare ... so if
+ * you are exiting in the "wrong way" you won't drop your profile in a
+ * nice place.
+ */
+ char gprofDirName[32];
+
+ if (IsAutoVacuumWorkerProcess())
+ snprintf(gprofDirName, 32, "gprof/avworker");
+ else
+ snprintf(gprofDirName, 32, "gprof/%d", (int) getpid());
+
+ /*
+ * Use mkdir() instead of MakePGDirectory() since we aren't making a
+ * PG directory here.
+ */
+ mkdir("gprof", S_IRWXU | S_IRWXG | S_IRWXO);
+ mkdir(gprofDirName, S_IRWXU | S_IRWXG | S_IRWXO);
+ chdir(gprofDirName);
+ }
+#endif
+
+ elog(DEBUG3, "exit(%d)", code);
+
+ exit(code);
+}
+
+/*
+ * Code shared between proc_exit and the atexit handler. Note that in
+ * normal exit through proc_exit, this will actually be called twice ...
+ * but the second call will have nothing to do.
+ */
+static void
+proc_exit_prepare(int code)
+{
+ /*
+ * Once we set this flag, we are committed to exit. Any ereport() will
+ * NOT send control back to the main loop, but right back here.
+ */
+ proc_exit_inprogress = true;
+
+ /*
+ * Forget any pending cancel or die requests; we're doing our best to
+ * close up shop already. Note that the signal handlers will not set
+ * these flags again, now that proc_exit_inprogress is set.
+ */
+ InterruptPending = false;
+ ProcDiePending = false;
+ QueryCancelPending = false;
+ InterruptHoldoffCount = 1;
+ CritSectionCount = 0;
+
+ /*
+ * Also clear the error context stack, to prevent error callbacks from
+ * being invoked by any elog/ereport calls made during proc_exit. Whatever
+ * context they might want to offer is probably not relevant, and in any
+ * case they are likely to fail outright after we've done things like
+ * aborting any open transaction. (In normal exit scenarios the context
+ * stack should be empty anyway, but it might not be in the case of
+ * elog(FATAL) for example.)
+ */
+ error_context_stack = NULL;
+ /* For the same reason, reset debug_query_string before it's clobbered */
+ debug_query_string = NULL;
+
+ /* do our shared memory exits first */
+ shmem_exit(code);
+
+ elog(DEBUG3, "proc_exit(%d): %d callbacks to make",
+ code, on_proc_exit_index);
+
+ /*
+ * call all the registered callbacks.
+ *
+ * Note that since we decrement on_proc_exit_index each time, if a
+ * callback calls ereport(ERROR) or ereport(FATAL) then it won't be
+ * invoked again when control comes back here (nor will the
+ * previously-completed callbacks). So, an infinite loop should not be
+ * possible.
+ */
+ while (--on_proc_exit_index >= 0)
+ on_proc_exit_list[on_proc_exit_index].function(code,
+ on_proc_exit_list[on_proc_exit_index].arg);
+
+ on_proc_exit_index = 0;
+}
+
+/* ------------------
+ * Run all of the on_shmem_exit routines --- but don't actually exit.
+ * This is used by the postmaster to re-initialize shared memory and
+ * semaphores after a backend dies horribly. As with proc_exit(), we
+ * remove each callback from the list before calling it, to avoid
+ * infinite loop in case of error.
+ * ------------------
+ */
+void
+shmem_exit(int code)
+{
+ shmem_exit_inprogress = true;
+
+ /*
+ * Call before_shmem_exit callbacks.
+ *
+ * These should be things that need most of the system to still be up and
+ * working, such as cleanup of temp relations, which requires catalog
+ * access; or things that need to be completed because later cleanup steps
+ * depend on them, such as releasing lwlocks.
+ */
+ elog(DEBUG3, "shmem_exit(%d): %d before_shmem_exit callbacks to make",
+ code, before_shmem_exit_index);
+ while (--before_shmem_exit_index >= 0)
+ before_shmem_exit_list[before_shmem_exit_index].function(code,
+ before_shmem_exit_list[before_shmem_exit_index].arg);
+ before_shmem_exit_index = 0;
+
+ /*
+ * Call dynamic shared memory callbacks.
+ *
+ * These serve the same purpose as late callbacks, but for dynamic shared
+ * memory segments rather than the main shared memory segment.
+ * dsm_backend_shutdown() has the same kind of progressive logic we use
+ * for the main shared memory segment; namely, it unregisters each
+ * callback before invoking it, so that we don't get stuck in an infinite
+ * loop if one of those callbacks itself throws an ERROR or FATAL.
+ *
+ * Note that explicitly calling this function here is quite different from
+ * registering it as an on_shmem_exit callback for precisely this reason:
+ * if one dynamic shared memory callback errors out, the remaining
+ * callbacks will still be invoked. Thus, hard-coding this call puts it
+	 * on equal footing with callbacks for the main shared memory segment.
+ */
+ dsm_backend_shutdown();
+
+ /*
+ * Call on_shmem_exit callbacks.
+ *
+ * These are generally releasing low-level shared memory resources. In
+ * some cases, this is a backstop against the possibility that the early
+ * callbacks might themselves fail, leading to re-entry to this routine;
+ * in other cases, it's cleanup that only happens at process exit.
+ */
+ elog(DEBUG3, "shmem_exit(%d): %d on_shmem_exit callbacks to make",
+ code, on_shmem_exit_index);
+ while (--on_shmem_exit_index >= 0)
+ on_shmem_exit_list[on_shmem_exit_index].function(code,
+ on_shmem_exit_list[on_shmem_exit_index].arg);
+ on_shmem_exit_index = 0;
+
+ shmem_exit_inprogress = false;
+}
+
+/* ----------------------------------------------------------------
+ * atexit_callback
+ *
+ * Backstop to ensure that direct calls of exit() don't mess us up.
+ *
+ * Somebody who was being really uncooperative could call _exit(),
+ * but for that case we have a "dead man switch" that will make the
+ * postmaster treat it as a crash --- see pmsignal.c.
+ * ----------------------------------------------------------------
+ */
+static void
+atexit_callback(void)
+{
+ /* Clean up everything that must be cleaned up */
+ /* ... too bad we don't know the real exit code ... */
+ proc_exit_prepare(-1);
+}
+
+/* ----------------------------------------------------------------
+ * on_proc_exit
+ *
+ * this function adds a callback function to the list of
+ * functions invoked by proc_exit(). -cim 2/6/90
+ * ----------------------------------------------------------------
+ */
+void
+on_proc_exit(pg_on_exit_callback function, Datum arg)
+{
+ if (on_proc_exit_index >= MAX_ON_EXITS)
+ ereport(FATAL,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg_internal("out of on_proc_exit slots")));
+
+ on_proc_exit_list[on_proc_exit_index].function = function;
+ on_proc_exit_list[on_proc_exit_index].arg = arg;
+
+ ++on_proc_exit_index;
+
+ if (!atexit_callback_setup)
+ {
+ atexit(atexit_callback);
+ atexit_callback_setup = true;
+ }
+}
+
+/* ----------------------------------------------------------------
+ * before_shmem_exit
+ *
+ * Register early callback to perform user-level cleanup,
+ * e.g. transaction abort, before we begin shutting down
+ * low-level subsystems.
+ * ----------------------------------------------------------------
+ */
+void
+before_shmem_exit(pg_on_exit_callback function, Datum arg)
+{
+ if (before_shmem_exit_index >= MAX_ON_EXITS)
+ ereport(FATAL,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg_internal("out of before_shmem_exit slots")));
+
+ before_shmem_exit_list[before_shmem_exit_index].function = function;
+ before_shmem_exit_list[before_shmem_exit_index].arg = arg;
+
+ ++before_shmem_exit_index;
+
+ if (!atexit_callback_setup)
+ {
+ atexit(atexit_callback);
+ atexit_callback_setup = true;
+ }
+}
+
+/* ----------------------------------------------------------------
+ * on_shmem_exit
+ *
+ * Register ordinary callback to perform low-level shutdown
+ * (e.g. releasing our PGPROC); run after before_shmem_exit
+ * callbacks and before on_proc_exit callbacks.
+ * ----------------------------------------------------------------
+ */
+void
+on_shmem_exit(pg_on_exit_callback function, Datum arg)
+{
+ if (on_shmem_exit_index >= MAX_ON_EXITS)
+ ereport(FATAL,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg_internal("out of on_shmem_exit slots")));
+
+ on_shmem_exit_list[on_shmem_exit_index].function = function;
+ on_shmem_exit_list[on_shmem_exit_index].arg = arg;
+
+ ++on_shmem_exit_index;
+
+ if (!atexit_callback_setup)
+ {
+ atexit(atexit_callback);
+ atexit_callback_setup = true;
+ }
+}
+
+/* ----------------------------------------------------------------
+ * cancel_before_shmem_exit
+ *
+ * this function removes a previously-registered before_shmem_exit
+ * callback. We only look at the latest entry for removal, as we
+ * expect callers to add and remove temporary before_shmem_exit
+ * callbacks in strict LIFO order.
+ * ----------------------------------------------------------------
+ */
+void
+cancel_before_shmem_exit(pg_on_exit_callback function, Datum arg)
+{
+ if (before_shmem_exit_index > 0 &&
+ before_shmem_exit_list[before_shmem_exit_index - 1].function
+ == function &&
+ before_shmem_exit_list[before_shmem_exit_index - 1].arg == arg)
+ --before_shmem_exit_index;
+ else
+ elog(ERROR, "before_shmem_exit callback (%p,0x%llx) is not the latest entry",
+ function, (long long) arg);
+}
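+
+/*
+ * Illustrative usage sketch ("cleanup_fn" and "state" are hypothetical
+ * names, not defined in this file): the strict LIFO expectation above
+ * leads to the usual pattern for a temporary cleanup callback:
+ *
+ *		before_shmem_exit(cleanup_fn, PointerGetDatum(state));
+ *		... do the work that needs cleanup on abnormal exit ...
+ *		cancel_before_shmem_exit(cleanup_fn, PointerGetDatum(state));
+ *
+ * If another callback were registered in between and not removed first,
+ * cancel_before_shmem_exit() would raise the error above.
+ */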
+
+/* ----------------------------------------------------------------
+ * on_exit_reset
+ *
+ * this function clears all on_proc_exit() and on_shmem_exit()
+ * registered functions. This is used just after forking a backend,
+ * so that the backend doesn't believe it should call the postmaster's
+ * on-exit routines when it exits...
+ * ----------------------------------------------------------------
+ */
+void
+on_exit_reset(void)
+{
+ before_shmem_exit_index = 0;
+ on_shmem_exit_index = 0;
+ on_proc_exit_index = 0;
+ reset_on_dsm_detach();
+}
+
+/* ----------------------------------------------------------------
+ * check_on_shmem_exit_lists_are_empty
+ *
+ * Debugging check that no shmem cleanup handlers have been registered
+ * prematurely in the current process.
+ * ----------------------------------------------------------------
+ */
+void
+check_on_shmem_exit_lists_are_empty(void)
+{
+ if (before_shmem_exit_index)
+ elog(FATAL, "before_shmem_exit has been called prematurely");
+ if (on_shmem_exit_index)
+ elog(FATAL, "on_shmem_exit has been called prematurely");
+ /* Checking DSM detach state seems unnecessary given the above */
+}
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
new file mode 100644
index 0000000..3e4ec53
--- /dev/null
+++ b/src/backend/storage/ipc/ipci.c
@@ -0,0 +1,291 @@
+/*-------------------------------------------------------------------------
+ *
+ * ipci.c
+ * POSTGRES inter-process communication initialization code.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/ipci.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/clog.h"
+#include "access/commit_ts.h"
+#include "access/heapam.h"
+#include "access/multixact.h"
+#include "access/nbtree.h"
+#include "access/subtrans.h"
+#include "access/syncscan.h"
+#include "access/twophase.h"
+#include "commands/async.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/autovacuum.h"
+#include "postmaster/bgworker_internals.h"
+#include "postmaster/bgwriter.h"
+#include "postmaster/postmaster.h"
+#include "replication/logicallauncher.h"
+#include "replication/origin.h"
+#include "replication/slot.h"
+#include "replication/walreceiver.h"
+#include "replication/walsender.h"
+#include "storage/bufmgr.h"
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "storage/pg_shmem.h"
+#include "storage/pmsignal.h"
+#include "storage/predicate.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/procsignal.h"
+#include "storage/sinvaladt.h"
+#include "storage/spin.h"
+#include "utils/snapmgr.h"
+
+/* GUCs */
+int shared_memory_type = DEFAULT_SHARED_MEMORY_TYPE;
+
+shmem_startup_hook_type shmem_startup_hook = NULL;
+
+static Size total_addin_request = 0;
+static bool addin_request_allowed = true;
+
+
+/*
+ * RequestAddinShmemSpace
+ * Request that extra shmem space be allocated for use by
+ * a loadable module.
+ *
+ * This is only useful if called from the _PG_init hook of a library that
+ * is loaded into the postmaster via shared_preload_libraries. Once
+ * shared memory has been allocated, calls will be ignored. (We could
+ * raise an error, but it seems better to make it a no-op, so that
+ * libraries containing such calls can be reloaded if needed.)
+ */
+void
+RequestAddinShmemSpace(Size size)
+{
+ if (IsUnderPostmaster || !addin_request_allowed)
+ return; /* too late */
+ total_addin_request = add_size(total_addin_request, size);
+}
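+
+/*
+ * Illustrative sketch of the pattern described above, as it might appear
+ * in a hypothetical extension ("MyExtState", "my_ext_state" and
+ * "my_ext_shmem_startup" are placeholder names, not defined in this tree):
+ *
+ *		static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+ *		static MyExtState *my_ext_state = NULL;
+ *
+ *		void
+ *		_PG_init(void)
+ *		{
+ *			if (!process_shared_preload_libraries_in_progress)
+ *				return;
+ *			RequestAddinShmemSpace(MAXALIGN(sizeof(MyExtState)));
+ *			prev_shmem_startup_hook = shmem_startup_hook;
+ *			shmem_startup_hook = my_ext_shmem_startup;
+ *		}
+ *
+ *		static void
+ *		my_ext_shmem_startup(void)
+ *		{
+ *			bool		found;
+ *
+ *			if (prev_shmem_startup_hook)
+ *				prev_shmem_startup_hook();
+ *			my_ext_state = ShmemInitStruct("my_ext state",
+ *										   sizeof(MyExtState), &found);
+ *		}
+ */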
+
+
+/*
+ * CreateSharedMemoryAndSemaphores
+ * Creates and initializes shared memory and semaphores.
+ *
+ * This is called by the postmaster or by a standalone backend.
+ * It is also called by a backend forked from the postmaster in the
+ * EXEC_BACKEND case. In the latter case, the shared memory segment
+ * already exists and has been physically attached to, but we have to
+ * initialize pointers in local memory that reference the shared structures,
+ * because we didn't inherit the correct pointer values from the postmaster
+ * as we do in the fork() scenario. The easiest way to do that is to run
+ * through the same code as before. (Note that the called routines mostly
+ * check IsUnderPostmaster, rather than EXEC_BACKEND, to detect this case.
+ * This is a bit code-wasteful and could be cleaned up.)
+ */
+void
+CreateSharedMemoryAndSemaphores(void)
+{
+ PGShmemHeader *shim = NULL;
+
+ if (!IsUnderPostmaster)
+ {
+ PGShmemHeader *seghdr;
+ Size size;
+ int numSemas;
+
+ /* Compute number of semaphores we'll need */
+ numSemas = ProcGlobalSemas();
+ numSemas += SpinlockSemas();
+
+ /*
+ * Size of the Postgres shared-memory block is estimated via
+ * moderately-accurate estimates for the big hogs, plus 100K for the
+ * stuff that's too small to bother with estimating.
+ *
+ * We take some care during this phase to ensure that the total size
+ * request doesn't overflow size_t. If this gets through, we don't
+ * need to be so careful during the actual allocation phase.
+ */
+ size = 100000;
+ size = add_size(size, PGSemaphoreShmemSize(numSemas));
+ size = add_size(size, SpinlockSemaSize());
+ size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE,
+ sizeof(ShmemIndexEnt)));
+ size = add_size(size, dsm_estimate_size());
+ size = add_size(size, BufferShmemSize());
+ size = add_size(size, LockShmemSize());
+ size = add_size(size, PredicateLockShmemSize());
+ size = add_size(size, ProcGlobalShmemSize());
+ size = add_size(size, XLOGShmemSize());
+ size = add_size(size, CLOGShmemSize());
+ size = add_size(size, CommitTsShmemSize());
+ size = add_size(size, SUBTRANSShmemSize());
+ size = add_size(size, TwoPhaseShmemSize());
+ size = add_size(size, BackgroundWorkerShmemSize());
+ size = add_size(size, MultiXactShmemSize());
+ size = add_size(size, LWLockShmemSize());
+ size = add_size(size, ProcArrayShmemSize());
+ size = add_size(size, BackendStatusShmemSize());
+ size = add_size(size, SInvalShmemSize());
+ size = add_size(size, PMSignalShmemSize());
+ size = add_size(size, ProcSignalShmemSize());
+ size = add_size(size, CheckpointerShmemSize());
+ size = add_size(size, AutoVacuumShmemSize());
+ size = add_size(size, ReplicationSlotsShmemSize());
+ size = add_size(size, ReplicationOriginShmemSize());
+ size = add_size(size, WalSndShmemSize());
+ size = add_size(size, WalRcvShmemSize());
+ size = add_size(size, PgArchShmemSize());
+ size = add_size(size, ApplyLauncherShmemSize());
+ size = add_size(size, SnapMgrShmemSize());
+ size = add_size(size, BTreeShmemSize());
+ size = add_size(size, SyncScanShmemSize());
+ size = add_size(size, AsyncShmemSize());
+#ifdef EXEC_BACKEND
+ size = add_size(size, ShmemBackendArraySize());
+#endif
+
+ /* freeze the addin request size and include it */
+ addin_request_allowed = false;
+ size = add_size(size, total_addin_request);
+
+ /* might as well round it off to a multiple of a typical page size */
+ size = add_size(size, 8192 - (size % 8192));
+
+ elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size);
+
+ /*
+ * Create the shmem segment
+ */
+ seghdr = PGSharedMemoryCreate(size, &shim);
+
+ InitShmemAccess(seghdr);
+
+ /*
+ * Create semaphores
+ */
+ PGReserveSemaphores(numSemas);
+
+ /*
+ * If spinlocks are disabled, initialize emulation layer (which
+ * depends on semaphores, so the order is important here).
+ */
+#ifndef HAVE_SPINLOCKS
+ SpinlockSemaInit();
+#endif
+ }
+ else
+ {
+ /*
+ * We are reattaching to an existing shared memory segment. This
+ * should only be reached in the EXEC_BACKEND case.
+ */
+#ifndef EXEC_BACKEND
+ elog(PANIC, "should be attached to shared memory already");
+#endif
+ }
+
+ /*
+ * Set up shared memory allocation mechanism
+ */
+ if (!IsUnderPostmaster)
+ InitShmemAllocation();
+
+ /*
+ * Now initialize LWLocks, which do shared memory allocation and are
+ * needed for InitShmemIndex.
+ */
+ CreateLWLocks();
+
+ /*
+ * Set up shmem.c index hashtable
+ */
+ InitShmemIndex();
+
+ dsm_shmem_init();
+
+ /*
+ * Set up xlog, clog, and buffers
+ */
+ XLOGShmemInit();
+ CLOGShmemInit();
+ CommitTsShmemInit();
+ SUBTRANSShmemInit();
+ MultiXactShmemInit();
+ InitBufferPool();
+
+ /*
+ * Set up lock manager
+ */
+ InitLocks();
+
+ /*
+ * Set up predicate lock manager
+ */
+ InitPredicateLocks();
+
+ /*
+ * Set up process table
+ */
+ if (!IsUnderPostmaster)
+ InitProcGlobal();
+ CreateSharedProcArray();
+ CreateSharedBackendStatus();
+ TwoPhaseShmemInit();
+ BackgroundWorkerShmemInit();
+
+ /*
+ * Set up shared-inval messaging
+ */
+ CreateSharedInvalidationState();
+
+ /*
+ * Set up interprocess signaling mechanisms
+ */
+ PMSignalShmemInit();
+ ProcSignalShmemInit();
+ CheckpointerShmemInit();
+ AutoVacuumShmemInit();
+ ReplicationSlotsShmemInit();
+ ReplicationOriginShmemInit();
+ WalSndShmemInit();
+ WalRcvShmemInit();
+ PgArchShmemInit();
+ ApplyLauncherShmemInit();
+
+ /*
+ * Set up other modules that need some shared memory space
+ */
+ SnapMgrInit();
+ BTreeShmemInit();
+ SyncScanShmemInit();
+ AsyncShmemInit();
+
+#ifdef EXEC_BACKEND
+
+ /*
+ * Alloc the win32 shared backend array
+ */
+ if (!IsUnderPostmaster)
+ ShmemBackendArrayAllocation();
+#endif
+
+ /* Initialize dynamic shared memory facilities. */
+ if (!IsUnderPostmaster)
+ dsm_postmaster_startup(shim);
+
+ /*
+ * Now give loadable modules a chance to set up their shmem allocations
+ */
+ if (shmem_startup_hook)
+ shmem_startup_hook();
+}
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
new file mode 100644
index 0000000..3427bcf
--- /dev/null
+++ b/src/backend/storage/ipc/latch.c
@@ -0,0 +1,2158 @@
+/*-------------------------------------------------------------------------
+ *
+ * latch.c
+ * Routines for inter-process latches
+ *
+ * The poll() implementation uses the so-called self-pipe trick to overcome the
+ * race condition involved with poll() and setting a global flag in the signal
+ * handler. When a latch is set and the current process is waiting for it, the
+ * signal handler wakes up the poll() in WaitLatch by writing a byte to a pipe.
+ * A signal by itself doesn't interrupt poll() on all platforms, and even on
+ * platforms where it does, a signal that arrives just before the poll() call
+ * does not prevent poll() from entering sleep. An incoming byte on a pipe
+ * however reliably interrupts the sleep, and causes poll() to return
+ * immediately even if the signal arrives before poll() begins.
+ *
+ * The epoll() implementation overcomes the race with a different technique: it
+ * keeps SIGURG blocked and consumes from a signalfd() descriptor instead. We
+ * don't need to register a signal handler or create our own self-pipe. We
+ * assume that any system that has Linux epoll() also has Linux signalfd().
+ *
+ * The kqueue() implementation waits for SIGURG with EVFILT_SIGNAL.
+ *
+ * The Windows implementation uses Windows events that are inherited by all
+ * postmaster child processes. There's no need for the self-pipe trick there.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/latch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <limits.h>
+#include <signal.h>
+#include <unistd.h>
+#ifdef HAVE_SYS_EPOLL_H
+#include <sys/epoll.h>
+#endif
+#ifdef HAVE_SYS_EVENT_H
+#include <sys/event.h>
+#endif
+#ifdef HAVE_SYS_SIGNALFD_H
+#include <sys/signalfd.h>
+#endif
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#endif
+
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "port/atomics.h"
+#include "portability/instr_time.h"
+#include "postmaster/postmaster.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/pmsignal.h"
+#include "storage/shmem.h"
+#include "utils/memutils.h"
+
+/*
+ * Select the fd readiness primitive to use. Normally the "most modern"
+ * primitive supported by the OS will be used, but for testing it can be
+ * useful to manually specify the used primitive. If desired, just add a
+ * define somewhere before this block.
+ */
+#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
+ defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_WIN32)
+/* don't overwrite manual choice */
+#elif defined(HAVE_SYS_EPOLL_H)
+#define WAIT_USE_EPOLL
+#elif defined(HAVE_KQUEUE)
+#define WAIT_USE_KQUEUE
+#elif defined(HAVE_POLL)
+#define WAIT_USE_POLL
+#elif WIN32
+#define WAIT_USE_WIN32
+#else
+#error "no wait set implementation available"
+#endif
+
+/*
+ * By default, we use a self-pipe with poll() and a signalfd with epoll(), if
+ * available. We avoid signalfd on illumos for now based on problem reports.
+ * For testing the choice can also be manually specified.
+ */
+#if defined(WAIT_USE_POLL) || defined(WAIT_USE_EPOLL)
+#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
+/* don't overwrite manual choice */
+#elif defined(WAIT_USE_EPOLL) && defined(HAVE_SYS_SIGNALFD_H) && \
+ !defined(__illumos__)
+#define WAIT_USE_SIGNALFD
+#else
+#define WAIT_USE_SELF_PIPE
+#endif
+#endif
+
+/* typedef in latch.h */
+struct WaitEventSet
+{
+ int nevents; /* number of registered events */
+ int nevents_space; /* maximum number of events in this set */
+
+ /*
+ * Array, of nevents_space length, storing the definition of events this
+ * set is waiting for.
+ */
+ WaitEvent *events;
+
+ /*
+ * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
+ * said latch, and latch_pos the offset in the ->events array. This is
+	 * useful because we check the state of the latch before performing
+ * syscalls related to waiting.
+ */
+ Latch *latch;
+ int latch_pos;
+
+ /*
+ * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
+ * is set so that we'll exit immediately if postmaster death is detected,
+ * instead of returning.
+ */
+ bool exit_on_postmaster_death;
+
+#if defined(WAIT_USE_EPOLL)
+ int epoll_fd;
+	/* epoll_wait returns events in a user-provided array; allocate once */
+ struct epoll_event *epoll_ret_events;
+#elif defined(WAIT_USE_KQUEUE)
+ int kqueue_fd;
+	/* kevent returns events in a user-provided array; allocate once */
+ struct kevent *kqueue_ret_events;
+ bool report_postmaster_not_running;
+#elif defined(WAIT_USE_POLL)
+ /* poll expects events to be waited on every poll() call, prepare once */
+ struct pollfd *pollfds;
+#elif defined(WAIT_USE_WIN32)
+
+ /*
+ * Array of windows events. The first element always contains
+ * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
+ * event->pos + 1).
+ */
+ HANDLE *handles;
+#endif
+};
+
+/* A common WaitEventSet used to implement WaitLatch() */
+static WaitEventSet *LatchWaitSet;
+
+/* The position of the latch in LatchWaitSet. */
+#define LatchWaitSetLatchPos 0
+
+#ifndef WIN32
+/* Are we currently in WaitLatch? The signal handler would like to know. */
+static volatile sig_atomic_t waiting = false;
+#endif
+
+#ifdef WAIT_USE_SIGNALFD
+/* On Linux, we'll receive SIGURG via a signalfd file descriptor. */
+static int signal_fd = -1;
+#endif
+
+#ifdef WAIT_USE_SELF_PIPE
+/* Read and write ends of the self-pipe */
+static int selfpipe_readfd = -1;
+static int selfpipe_writefd = -1;
+
+/* Process owning the self-pipe --- needed for checking purposes */
+static int selfpipe_owner_pid = 0;
+
+/* Private function prototypes */
+static void latch_sigurg_handler(SIGNAL_ARGS);
+static void sendSelfPipeByte(void);
+#endif
+
+#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
+static void drain(void);
+#endif
+
+#if defined(WAIT_USE_EPOLL)
+static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
+#elif defined(WAIT_USE_KQUEUE)
+static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events);
+#elif defined(WAIT_USE_POLL)
+static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
+#elif defined(WAIT_USE_WIN32)
+static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
+#endif
+
+static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+ WaitEvent *occurred_events, int nevents);
+
+/*
+ * Initialize the process-local latch infrastructure.
+ *
+ * This must be called once during startup of any process that can wait on
+ * latches, before it issues any InitLatch() or OwnLatch() calls.
+ */
+void
+InitializeLatchSupport(void)
+{
+#if defined(WAIT_USE_SELF_PIPE)
+ int pipefd[2];
+
+ if (IsUnderPostmaster)
+ {
+ /*
+ * We might have inherited connections to a self-pipe created by the
+ * postmaster. It's critical that child processes create their own
+ * self-pipes, of course, and we really want them to close the
+ * inherited FDs for safety's sake.
+ */
+ if (selfpipe_owner_pid != 0)
+ {
+ /* Assert we go through here but once in a child process */
+ Assert(selfpipe_owner_pid != MyProcPid);
+ /* Release postmaster's pipe FDs; ignore any error */
+ (void) close(selfpipe_readfd);
+ (void) close(selfpipe_writefd);
+ /* Clean up, just for safety's sake; we'll set these below */
+ selfpipe_readfd = selfpipe_writefd = -1;
+ selfpipe_owner_pid = 0;
+ /* Keep fd.c's accounting straight */
+ ReleaseExternalFD();
+ ReleaseExternalFD();
+ }
+ else
+ {
+ /*
+ * Postmaster didn't create a self-pipe ... or else we're in an
+ * EXEC_BACKEND build, in which case it doesn't matter since the
+ * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
+ * fd.c won't have state to clean up, either.
+ */
+ Assert(selfpipe_readfd == -1);
+ }
+ }
+ else
+ {
+ /* In postmaster or standalone backend, assert we do this but once */
+ Assert(selfpipe_readfd == -1);
+ Assert(selfpipe_owner_pid == 0);
+ }
+
+ /*
+ * Set up the self-pipe that allows a signal handler to wake up the
+ * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
+ * that SetLatch won't block if the event has already been set many times
+ * filling the kernel buffer. Make the read-end non-blocking too, so that
+ * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
+ * Also, make both FDs close-on-exec, since we surely do not want any
+ * child processes messing with them.
+ */
+ if (pipe(pipefd) < 0)
+ elog(FATAL, "pipe() failed: %m");
+ if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
+ elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
+ if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
+ elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
+ if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
+ elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
+ if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
+ elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");
+
+ selfpipe_readfd = pipefd[0];
+ selfpipe_writefd = pipefd[1];
+ selfpipe_owner_pid = MyProcPid;
+
+ /* Tell fd.c about these two long-lived FDs */
+ ReserveExternalFD();
+ ReserveExternalFD();
+
+ pqsignal(SIGURG, latch_sigurg_handler);
+#endif
+
+#ifdef WAIT_USE_SIGNALFD
+ sigset_t signalfd_mask;
+
+ /* Block SIGURG, because we'll receive it through a signalfd. */
+ sigaddset(&UnBlockSig, SIGURG);
+
+ /* Set up the signalfd to receive SIGURG notifications. */
+ sigemptyset(&signalfd_mask);
+ sigaddset(&signalfd_mask, SIGURG);
+ signal_fd = signalfd(-1, &signalfd_mask, SFD_NONBLOCK | SFD_CLOEXEC);
+ if (signal_fd < 0)
+ elog(FATAL, "signalfd() failed");
+ ReserveExternalFD();
+#endif
+
+#ifdef WAIT_USE_KQUEUE
+ /* Ignore SIGURG, because we'll receive it via kqueue. */
+ pqsignal(SIGURG, SIG_IGN);
+#endif
+}
+
+void
+InitializeLatchWaitSet(void)
+{
+ int latch_pos PG_USED_FOR_ASSERTS_ONLY;
+
+ Assert(LatchWaitSet == NULL);
+
+ /* Set up the WaitEventSet used by WaitLatch(). */
+ LatchWaitSet = CreateWaitEventSet(TopMemoryContext, 2);
+ latch_pos = AddWaitEventToSet(LatchWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
+ MyLatch, NULL);
+ if (IsUnderPostmaster)
+ AddWaitEventToSet(LatchWaitSet, WL_EXIT_ON_PM_DEATH,
+ PGINVALID_SOCKET, NULL, NULL);
+
+ Assert(latch_pos == LatchWaitSetLatchPos);
+}
+
+void
+ShutdownLatchSupport(void)
+{
+#if defined(WAIT_USE_POLL)
+ pqsignal(SIGURG, SIG_IGN);
+#endif
+
+ if (LatchWaitSet)
+ {
+ FreeWaitEventSet(LatchWaitSet);
+ LatchWaitSet = NULL;
+ }
+
+#if defined(WAIT_USE_SELF_PIPE)
+ close(selfpipe_readfd);
+ close(selfpipe_writefd);
+ selfpipe_readfd = -1;
+ selfpipe_writefd = -1;
+ selfpipe_owner_pid = InvalidPid;
+#endif
+
+#if defined(WAIT_USE_SIGNALFD)
+ close(signal_fd);
+ signal_fd = -1;
+#endif
+}
+
+/*
+ * Initialize a process-local latch.
+ */
+void
+InitLatch(Latch *latch)
+{
+ latch->is_set = false;
+ latch->maybe_sleeping = false;
+ latch->owner_pid = MyProcPid;
+ latch->is_shared = false;
+
+#if defined(WAIT_USE_SELF_PIPE)
+ /* Assert InitializeLatchSupport has been called in this process */
+ Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
+#elif defined(WAIT_USE_SIGNALFD)
+ /* Assert InitializeLatchSupport has been called in this process */
+ Assert(signal_fd >= 0);
+#elif defined(WAIT_USE_WIN32)
+ latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
+ if (latch->event == NULL)
+ elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
+#endif /* WIN32 */
+}
+
+/*
+ * Initialize a shared latch that can be set from other processes. The latch
+ * is initially owned by no-one; use OwnLatch to associate it with the
+ * current process.
+ *
+ * InitSharedLatch needs to be called in postmaster before forking child
+ * processes, usually right after allocating the shared memory block
+ * containing the latch with ShmemInitStruct. (The Unix implementation
+ * doesn't actually require that, but the Windows one does.) Because of
+ * this restriction, we have no concurrency issues to worry about here.
+ *
+ * Note that other handles created in this module are never marked as
+ * inheritable. Thus we do not need to worry about cleaning up child
+ * process references to postmaster-private latches or WaitEventSets.
+ */
+void
+InitSharedLatch(Latch *latch)
+{
+#ifdef WIN32
+ SECURITY_ATTRIBUTES sa;
+
+ /*
+ * Set up security attributes to specify that the events are inherited.
+ */
+ ZeroMemory(&sa, sizeof(sa));
+ sa.nLength = sizeof(sa);
+ sa.bInheritHandle = TRUE;
+
+ latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
+ if (latch->event == NULL)
+ elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
+#endif
+
+ latch->is_set = false;
+ latch->maybe_sleeping = false;
+ latch->owner_pid = 0;
+ latch->is_shared = true;
+}
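+
+/*
+ * Illustrative sketch ("MySharedThing" and its "latch" member are
+ * hypothetical): a shared latch is typically initialized by the postmaster
+ * right after carving out the shared memory that holds it, and ownership
+ * is taken later by the child process that will wait on it:
+ *
+ *		(in the postmaster, before forking children)
+ *		thing = ShmemInitStruct("MySharedThing", sizeof(MySharedThing), &found);
+ *		if (!found)
+ *			InitSharedLatch(&thing->latch);
+ *
+ *		(later, in the child process that waits on it)
+ *		OwnLatch(&thing->latch);
+ */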
+
+/*
+ * Associate a shared latch with the current process, allowing it to
+ * wait on the latch.
+ *
+ * Although there is a sanity check for latch-already-owned, we don't do
+ * any sort of locking here, meaning that we could fail to detect the error
+ * if two processes try to own the same latch at about the same time. If
+ * there is any risk of that, caller must provide an interlock to prevent it.
+ */
+void
+OwnLatch(Latch *latch)
+{
+ /* Sanity checks */
+ Assert(latch->is_shared);
+
+#if defined(WAIT_USE_SELF_PIPE)
+ /* Assert InitializeLatchSupport has been called in this process */
+ Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
+#elif defined(WAIT_USE_SIGNALFD)
+ /* Assert InitializeLatchSupport has been called in this process */
+ Assert(signal_fd >= 0);
+#endif
+
+ if (latch->owner_pid != 0)
+ elog(ERROR, "latch already owned");
+
+ latch->owner_pid = MyProcPid;
+}
+
+/*
+ * Disown a shared latch currently owned by the current process.
+ */
+void
+DisownLatch(Latch *latch)
+{
+ Assert(latch->is_shared);
+ Assert(latch->owner_pid == MyProcPid);
+
+ latch->owner_pid = 0;
+}
+
+/*
+ * Wait for a given latch to be set, or for postmaster death, or until timeout
+ * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
+ * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
+ * function returns immediately.
+ *
+ * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
+ * is given. Although it is declared as "long", we don't actually support
+ * timeouts longer than INT_MAX milliseconds. Note that some extra overhead
+ * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
+ *
+ * The latch must be owned by the current process, ie. it must be a
+ * process-local latch initialized with InitLatch, or a shared latch
+ * associated with the current process by calling OwnLatch.
+ *
+ * Returns bit mask indicating which condition(s) caused the wake-up. Note
+ * that if multiple wake-up conditions are true, there is no guarantee that
+ * we return all of them in one call, but we will return at least one.
+ */
+int
+WaitLatch(Latch *latch, int wakeEvents, long timeout,
+ uint32 wait_event_info)
+{
+ WaitEvent event;
+
+ /* Postmaster-managed callers must handle postmaster death somehow. */
+ Assert(!IsUnderPostmaster ||
+ (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
+ (wakeEvents & WL_POSTMASTER_DEATH));
+
+ /*
+ * Some callers may have a latch other than MyLatch, or no latch at all,
+ * or want to handle postmaster death differently. It's cheap to assign
+ * those, so just do it every time.
+ */
+ if (!(wakeEvents & WL_LATCH_SET))
+ latch = NULL;
+ ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch);
+ LatchWaitSet->exit_on_postmaster_death =
+ ((wakeEvents & WL_EXIT_ON_PM_DEATH) != 0);
+
+ if (WaitEventSetWait(LatchWaitSet,
+ (wakeEvents & WL_TIMEOUT) ? timeout : -1,
+ &event, 1,
+ wait_event_info) == 0)
+ return WL_TIMEOUT;
+ else
+ return event.events;
+}
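+
+/*
+ * Illustrative wait-loop sketch ("have_work", "do_work" and MY_WAIT_EVENT
+ * are hypothetical): callers conventionally reset the latch at the top of
+ * the loop and wait at the bottom, so that a SetLatch() arriving between
+ * the work check and the wait is never lost:
+ *
+ *		for (;;)
+ *		{
+ *			ResetLatch(MyLatch);
+ *			CHECK_FOR_INTERRUPTS();
+ *			if (have_work())
+ *				do_work();
+ *			(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
+ *							 -1L, MY_WAIT_EVENT);
+ *		}
+ */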
+
+/*
+ * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
+ * conditions.
+ *
+ * When waiting on a socket, EOF and error conditions always cause the socket
+ * to be reported as readable/writable/connected, so that the caller can deal
+ * with the condition.
+ *
+ * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit
+ * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the
+ * return value if the postmaster dies. The latter is useful for rare cases
+ * where some behavior other than immediate exit is needed.
+ *
+ * NB: These days this is just a wrapper around the WaitEventSet API. When
+ * using a latch very frequently, consider creating a longer-lived
+ * WaitEventSet instead; that's more efficient.
+ */
+int
+WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
+ long timeout, uint32 wait_event_info)
+{
+ int ret = 0;
+ int rc;
+ WaitEvent event;
+ WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);
+
+ if (wakeEvents & WL_TIMEOUT)
+ Assert(timeout >= 0);
+ else
+ timeout = -1;
+
+ if (wakeEvents & WL_LATCH_SET)
+ AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
+ latch, NULL);
+
+ /* Postmaster-managed callers must handle postmaster death somehow. */
+ Assert(!IsUnderPostmaster ||
+ (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
+ (wakeEvents & WL_POSTMASTER_DEATH));
+
+ if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
+ AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
+ NULL, NULL);
+
+ if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
+ AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+ NULL, NULL);
+
+ if (wakeEvents & WL_SOCKET_MASK)
+ {
+ int ev;
+
+ ev = wakeEvents & WL_SOCKET_MASK;
+ AddWaitEventToSet(set, ev, sock, NULL, NULL);
+ }
+
+ rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
+
+ if (rc == 0)
+ ret |= WL_TIMEOUT;
+ else
+ {
+ ret |= event.events & (WL_LATCH_SET |
+ WL_POSTMASTER_DEATH |
+ WL_SOCKET_MASK);
+ }
+
+ FreeWaitEventSet(set);
+
+ return ret;
+}
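+
+/*
+ * Illustrative sketch of the longer-lived alternative recommended above
+ * ("sock" and MY_WAIT_EVENT are hypothetical, and the caller is assumed to
+ * be a backend running under the postmaster): build the set once and reuse
+ * it, instead of paying the setup cost on every wait:
+ *
+ *		WaitEventSet *set = CreateWaitEventSet(TopMemoryContext, 3);
+ *		WaitEvent	event;
+ *
+ *		AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+ *		AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+ *						  NULL, NULL);
+ *		AddWaitEventToSet(set, WL_SOCKET_READABLE, sock, NULL, NULL);
+ *
+ *		for (;;)
+ *		{
+ *			(void) WaitEventSetWait(set, -1, &event, 1, MY_WAIT_EVENT);
+ *			if (event.events & WL_LATCH_SET)
+ *				ResetLatch(MyLatch);
+ *			if (event.events & WL_SOCKET_READABLE)
+ *				... read from event.fd ...
+ *		}
+ */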
+
+/*
+ * Sets a latch and wakes up anyone waiting on it.
+ *
+ * This is cheap if the latch is already set, otherwise not so much.
+ *
+ * NB: when calling this in a signal handler, be sure to save and restore
+ * errno around it. (That's standard practice in most signal handlers, of
+ * course, but we used to omit it in handlers that only set a flag.)
+ *
+ * NB: this function is called from critical sections and signal handlers so
+ * throwing an error is not a good idea.
+ */
+void
+SetLatch(Latch *latch)
+{
+#ifndef WIN32
+ pid_t owner_pid;
+#else
+ HANDLE handle;
+#endif
+
+ /*
+ * The memory barrier has to be placed here to ensure that any flag
+ * variables possibly changed by this process have been flushed to main
+ * memory, before we check/set is_set.
+ */
+ pg_memory_barrier();
+
+ /* Quick exit if already set */
+ if (latch->is_set)
+ return;
+
+ latch->is_set = true;
+
+ pg_memory_barrier();
+ if (!latch->maybe_sleeping)
+ return;
+
+#ifndef WIN32
+
+ /*
+ * See if anyone's waiting for the latch. It can be the current process if
+	 * we're in a signal handler. We write to the self-pipe or send SIGURG to
+	 * ourselves to wake up WaitEventSetWaitBlock() without races in that case.
+	 * If it's another process, send a signal.
+ *
+ * Fetch owner_pid only once, in case the latch is concurrently getting
+ * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
+ * guaranteed to be true! In practice, the effective range of pid_t fits
+ * in a 32 bit integer, and so should be atomic. In the worst case, we
+ * might end up signaling the wrong process. Even then, you're very
+ * unlucky if a process with that bogus pid exists and belongs to
+ * Postgres; and PG database processes should handle excess SIGUSR1
+ * interrupts without a problem anyhow.
+ *
+ * Another sort of race condition that's possible here is for a new
+ * process to own the latch immediately after we look, so we don't signal
+ * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
+ * the standard coding convention of waiting at the bottom of their loops,
+ * not the top, so that they'll correctly process latch-setting events
+ * that happen before they enter the loop.
+ */
+ owner_pid = latch->owner_pid;
+ if (owner_pid == 0)
+ return;
+ else if (owner_pid == MyProcPid)
+ {
+#if defined(WAIT_USE_SELF_PIPE)
+ if (waiting)
+ sendSelfPipeByte();
+#else
+ if (waiting)
+ kill(MyProcPid, SIGURG);
+#endif
+ }
+ else
+ kill(owner_pid, SIGURG);
+
+#else
+
+ /*
+ * See if anyone's waiting for the latch. It can be the current process if
+ * we're in a signal handler.
+ *
+ * Use a local variable here just in case somebody changes the event field
+ * concurrently (which really should not happen).
+ */
+ handle = latch->event;
+ if (handle)
+ {
+ SetEvent(handle);
+
+ /*
+ * Note that we silently ignore any errors. We might be in a signal
+ * handler or other critical path where it's not safe to call elog().
+ */
+ }
+#endif
+
+}
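+
+/*
+ * Illustrative sketch of the errno-saving convention noted above (the
+ * handler name and the "got_wakeup" flag are hypothetical):
+ *
+ *		static void
+ *		handle_wakeup(SIGNAL_ARGS)
+ *		{
+ *			int			save_errno = errno;
+ *
+ *			got_wakeup = true;
+ *			SetLatch(MyLatch);
+ *
+ *			errno = save_errno;
+ *		}
+ */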
+
+/*
+ * Clear the latch. Calling WaitLatch after this will sleep, unless
+ * the latch is set again before the WaitLatch call.
+ */
+void
+ResetLatch(Latch *latch)
+{
+ /* Only the owner should reset the latch */
+ Assert(latch->owner_pid == MyProcPid);
+ Assert(latch->maybe_sleeping == false);
+
+ latch->is_set = false;
+
+ /*
+ * Ensure that the write to is_set gets flushed to main memory before we
+ * examine any flag variables. Otherwise a concurrent SetLatch might
+ * falsely conclude that it needn't signal us, even though we have missed
+ * seeing some flag updates that SetLatch was supposed to inform us of.
+ */
+ pg_memory_barrier();
+}
+
+/*
+ * Create a WaitEventSet with space for nevents different events to wait for.
+ *
+ * These events can then be efficiently waited upon together, using
+ * WaitEventSetWait().
+ */
+WaitEventSet *
+CreateWaitEventSet(MemoryContext context, int nevents)
+{
+ WaitEventSet *set;
+ char *data;
+ Size sz = 0;
+
+ /*
+ * Use MAXALIGN size/alignment to guarantee that later uses of memory are
+ * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
+ * platforms, but earlier allocations like WaitEventSet and WaitEvent
+ * might not be sized to guarantee that when purely using sizeof().
+ */
+ sz += MAXALIGN(sizeof(WaitEventSet));
+ sz += MAXALIGN(sizeof(WaitEvent) * nevents);
+
+#if defined(WAIT_USE_EPOLL)
+ sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
+#elif defined(WAIT_USE_KQUEUE)
+ sz += MAXALIGN(sizeof(struct kevent) * nevents);
+#elif defined(WAIT_USE_POLL)
+ sz += MAXALIGN(sizeof(struct pollfd) * nevents);
+#elif defined(WAIT_USE_WIN32)
+ /* need space for the pgwin32_signal_event */
+ sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
+#endif
+
+ data = (char *) MemoryContextAllocZero(context, sz);
+
+ set = (WaitEventSet *) data;
+ data += MAXALIGN(sizeof(WaitEventSet));
+
+ set->events = (WaitEvent *) data;
+ data += MAXALIGN(sizeof(WaitEvent) * nevents);
+
+#if defined(WAIT_USE_EPOLL)
+ set->epoll_ret_events = (struct epoll_event *) data;
+ data += MAXALIGN(sizeof(struct epoll_event) * nevents);
+#elif defined(WAIT_USE_KQUEUE)
+ set->kqueue_ret_events = (struct kevent *) data;
+ data += MAXALIGN(sizeof(struct kevent) * nevents);
+#elif defined(WAIT_USE_POLL)
+ set->pollfds = (struct pollfd *) data;
+ data += MAXALIGN(sizeof(struct pollfd) * nevents);
+#elif defined(WAIT_USE_WIN32)
+ set->handles = (HANDLE) data;
+ data += MAXALIGN(sizeof(HANDLE) * nevents);
+#endif
+
+ set->latch = NULL;
+ set->nevents_space = nevents;
+ set->exit_on_postmaster_death = false;
+
+#if defined(WAIT_USE_EPOLL)
+ if (!AcquireExternalFD())
+ {
+ /* treat this as though epoll_create1 itself returned EMFILE */
+ elog(ERROR, "epoll_create1 failed: %m");
+ }
+ set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+ if (set->epoll_fd < 0)
+ {
+ ReleaseExternalFD();
+ elog(ERROR, "epoll_create1 failed: %m");
+ }
+#elif defined(WAIT_USE_KQUEUE)
+ if (!AcquireExternalFD())
+ {
+ /* treat this as though kqueue itself returned EMFILE */
+ elog(ERROR, "kqueue failed: %m");
+ }
+ set->kqueue_fd = kqueue();
+ if (set->kqueue_fd < 0)
+ {
+ ReleaseExternalFD();
+ elog(ERROR, "kqueue failed: %m");
+ }
+ if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1)
+ {
+ int save_errno = errno;
+
+ close(set->kqueue_fd);
+ ReleaseExternalFD();
+ errno = save_errno;
+ elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m");
+ }
+ set->report_postmaster_not_running = false;
+#elif defined(WAIT_USE_WIN32)
+
+ /*
+ * To handle signals while waiting, we need to add a win32 specific event.
+ * We accounted for the additional event at the top of this routine. See
+ * port/win32/signal.c for more details.
+ *
+ * Note: pgwin32_signal_event should be first to ensure that it will be
+ * reported when multiple events are set. We want to guarantee that
+ * pending signals are serviced.
+ */
+ set->handles[0] = pgwin32_signal_event;
+ StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
+#endif
+
+ return set;
+}
+
+/*
+ * Free a previously created WaitEventSet.
+ *
+ * Note: preferably, this shouldn't have to free any resources that could be
+ * inherited across an exec(). If it did, we'd likely leak those resources in
+ * many scenarios. For the epoll case, we ensure that by setting EPOLL_CLOEXEC
+ * when the FD is created. For the Windows case, we assume that the handles
+ * involved are non-inheritable.
+ */
+void
+FreeWaitEventSet(WaitEventSet *set)
+{
+#if defined(WAIT_USE_EPOLL)
+ close(set->epoll_fd);
+ ReleaseExternalFD();
+#elif defined(WAIT_USE_KQUEUE)
+ close(set->kqueue_fd);
+ ReleaseExternalFD();
+#elif defined(WAIT_USE_WIN32)
+ WaitEvent *cur_event;
+
+ for (cur_event = set->events;
+ cur_event < (set->events + set->nevents);
+ cur_event++)
+ {
+ if (cur_event->events & WL_LATCH_SET)
+ {
+ /* uses the latch's HANDLE */
+ }
+ else if (cur_event->events & WL_POSTMASTER_DEATH)
+ {
+ /* uses PostmasterHandle */
+ }
+ else
+ {
+ /* Clean up the event object we created for the socket */
+ WSAEventSelect(cur_event->fd, NULL, 0);
+ WSACloseEvent(set->handles[cur_event->pos + 1]);
+ }
+ }
+#endif
+
+ pfree(set);
+}
+
+/* ---
+ * Add an event to the set. Possible events are:
+ * - WL_LATCH_SET: Wait for the latch to be set
+ * - WL_POSTMASTER_DEATH: Wait for postmaster to die
+ * - WL_SOCKET_READABLE: Wait for socket to become readable,
+ * can be combined in one event with other WL_SOCKET_* events
+ * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
+ * can be combined with other WL_SOCKET_* events
+ * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
+ * can be combined with other WL_SOCKET_* events (on non-Windows
+ * platforms, this is the same as WL_SOCKET_WRITEABLE)
+ * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
+ *
+ * Returns the offset in WaitEventSet->events (starting from 0), which can be
+ * used to modify previously added wait events using ModifyWaitEvent().
+ *
+ * In the WL_LATCH_SET case the latch must be owned by the current process,
+ * i.e. it must be a process-local latch initialized with InitLatch, or a
+ * shared latch associated with the current process by calling OwnLatch.
+ *
+ * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED cases, EOF and error
+ * conditions cause the socket to be reported as readable/writable/connected,
+ * so that the caller can deal with the condition.
+ *
+ * The user_data pointer specified here will be set for the events returned
+ * by WaitEventSetWait(), allowing the caller to easily associate additional
+ * data with events.
+ */
+int
+AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
+ void *user_data)
+{
+ WaitEvent *event;
+
+ /* not enough space */
+ Assert(set->nevents < set->nevents_space);
+
+ if (events == WL_EXIT_ON_PM_DEATH)
+ {
+ events = WL_POSTMASTER_DEATH;
+ set->exit_on_postmaster_death = true;
+ }
+
+ if (latch)
+ {
+ if (latch->owner_pid != MyProcPid)
+ elog(ERROR, "cannot wait on a latch owned by another process");
+ if (set->latch)
+ elog(ERROR, "cannot wait on more than one latch");
+ if ((events & WL_LATCH_SET) != WL_LATCH_SET)
+ elog(ERROR, "latch events only support being set");
+ }
+ else
+ {
+ if (events & WL_LATCH_SET)
+ elog(ERROR, "cannot wait on latch without a specified latch");
+ }
+
+ /* waiting for socket readiness without a socket indicates a bug */
+ if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
+ elog(ERROR, "cannot wait on socket event without a socket");
+
+ event = &set->events[set->nevents];
+ event->pos = set->nevents++;
+ event->fd = fd;
+ event->events = events;
+ event->user_data = user_data;
+#ifdef WIN32
+ event->reset = false;
+#endif
+
+ if (events == WL_LATCH_SET)
+ {
+ set->latch = latch;
+ set->latch_pos = event->pos;
+#if defined(WAIT_USE_SELF_PIPE)
+ event->fd = selfpipe_readfd;
+#elif defined(WAIT_USE_SIGNALFD)
+ event->fd = signal_fd;
+#else
+ event->fd = PGINVALID_SOCKET;
+#ifdef WAIT_USE_EPOLL
+ return event->pos;
+#endif
+#endif
+ }
+ else if (events == WL_POSTMASTER_DEATH)
+ {
+#ifndef WIN32
+ event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
+#endif
+ }
+
+ /* perform wait primitive specific initialization, if needed */
+#if defined(WAIT_USE_EPOLL)
+ WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
+#elif defined(WAIT_USE_KQUEUE)
+ WaitEventAdjustKqueue(set, event, 0);
+#elif defined(WAIT_USE_POLL)
+ WaitEventAdjustPoll(set, event);
+#elif defined(WAIT_USE_WIN32)
+ WaitEventAdjustWin32(set, event);
+#endif
+
+ return event->pos;
+}
+
+/*
+ * Change the event mask and, in the WL_LATCH_SET case, the latch associated
+ * with the WaitEvent. The latch may be changed to NULL to disable the latch
+ * temporarily, and then set back to a latch later.
+ *
+ * 'pos' is the id returned by AddWaitEventToSet.
+ */
+void
+ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
+{
+ WaitEvent *event;
+#if defined(WAIT_USE_KQUEUE)
+ int old_events;
+#endif
+
+ Assert(pos < set->nevents);
+
+ event = &set->events[pos];
+#if defined(WAIT_USE_KQUEUE)
+ old_events = event->events;
+#endif
+
+ /*
+ * If neither the event mask nor the associated latch changes, return
+ * early. That's an important optimization for some sockets, where
+ * ModifyWaitEvent is frequently used to switch from waiting for reads to
+ * waiting on writes.
+ */
+ if (events == event->events &&
+ (!(event->events & WL_LATCH_SET) || set->latch == latch))
+ return;
+
+ if (event->events & WL_LATCH_SET &&
+ events != event->events)
+ {
+ elog(ERROR, "cannot modify latch event");
+ }
+
+ if (event->events & WL_POSTMASTER_DEATH)
+ {
+ elog(ERROR, "cannot modify postmaster death event");
+ }
+
+ /* FIXME: validate event mask */
+ event->events = events;
+
+ if (events == WL_LATCH_SET)
+ {
+ if (latch && latch->owner_pid != MyProcPid)
+ elog(ERROR, "cannot wait on a latch owned by another process");
+ set->latch = latch;
+
+ /*
+ * On Unix, we don't need to modify the kernel object because the
+ * underlying pipe (if there is one) is the same for all latches so we
+ * can return immediately. On Windows, we need to update our array of
+ * handles, but we leave the old one in place and tolerate spurious
+ * wakeups if the latch is disabled.
+ */
+#if defined(WAIT_USE_WIN32)
+ if (!latch)
+ return;
+#else
+ return;
+#endif
+ }
+
+#if defined(WAIT_USE_EPOLL)
+ WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
+#elif defined(WAIT_USE_KQUEUE)
+ WaitEventAdjustKqueue(set, event, old_events);
+#elif defined(WAIT_USE_POLL)
+ WaitEventAdjustPoll(set, event);
+#elif defined(WAIT_USE_WIN32)
+ WaitEventAdjustWin32(set, event);
+#endif
+}
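+
+/*
+ * Illustrative sketch ("sock_pos" is the position previously returned by
+ * AddWaitEventToSet for a socket event): a caller alternating between
+ * waiting for input and waiting to flush output can simply flip the mask
+ * in place; calls that request the already-registered mask return early,
+ * as noted above:
+ *
+ *		ModifyWaitEvent(set, sock_pos, WL_SOCKET_WRITEABLE, NULL);
+ *		... send the pending data ...
+ *		ModifyWaitEvent(set, sock_pos, WL_SOCKET_READABLE, NULL);
+ */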
+
+#if defined(WAIT_USE_EPOLL)
+/*
+ * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
+ */
+static void
+WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
+{
+ struct epoll_event epoll_ev;
+ int rc;
+
+ /* pointer to our event, returned by epoll_wait */
+ epoll_ev.data.ptr = event;
+ /* always wait for errors */
+ epoll_ev.events = EPOLLERR | EPOLLHUP;
+
+ /* prepare pollfd entry once */
+ if (event->events == WL_LATCH_SET)
+ {
+ Assert(set->latch != NULL);
+ epoll_ev.events |= EPOLLIN;
+ }
+ else if (event->events == WL_POSTMASTER_DEATH)
+ {
+ epoll_ev.events |= EPOLLIN;
+ }
+ else
+ {
+ Assert(event->fd != PGINVALID_SOCKET);
+ Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+
+ if (event->events & WL_SOCKET_READABLE)
+ epoll_ev.events |= EPOLLIN;
+ if (event->events & WL_SOCKET_WRITEABLE)
+ epoll_ev.events |= EPOLLOUT;
+ }
+
+ /*
+ * Even though unused, we also pass epoll_ev as the data argument if
+ * EPOLL_CTL_DEL is passed as action. There used to be an epoll bug
+ * requiring that, and actually it makes the code simpler...
+ */
+ rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);
+
+ if (rc < 0)
+ ereport(ERROR,
+ (errcode_for_socket_access(),
+ errmsg("%s() failed: %m",
+ "epoll_ctl")));
+}
+#endif
+
+#if defined(WAIT_USE_POLL)
+static void
+WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
+{
+ struct pollfd *pollfd = &set->pollfds[event->pos];
+
+ pollfd->revents = 0;
+ pollfd->fd = event->fd;
+
+ /* prepare pollfd entry once */
+ if (event->events == WL_LATCH_SET)
+ {
+ Assert(set->latch != NULL);
+ pollfd->events = POLLIN;
+ }
+ else if (event->events == WL_POSTMASTER_DEATH)
+ {
+ pollfd->events = POLLIN;
+ }
+ else
+ {
+ Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+ pollfd->events = 0;
+ if (event->events & WL_SOCKET_READABLE)
+ pollfd->events |= POLLIN;
+ if (event->events & WL_SOCKET_WRITEABLE)
+ pollfd->events |= POLLOUT;
+ }
+
+ Assert(event->fd != PGINVALID_SOCKET);
+}
+#endif
+
+#if defined(WAIT_USE_KQUEUE)
+
+/*
+ * On most BSD family systems, the udata member of struct kevent is of type
+ * void *, so we could directly convert to/from WaitEvent *. Unfortunately,
+ * NetBSD has it as intptr_t, so here we wallpaper over that difference with
+ * an lvalue cast.
+ */
+#define AccessWaitEvent(k_ev) (*((WaitEvent **)(&(k_ev)->udata)))
+
+static inline void
+WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action,
+ WaitEvent *event)
+{
+ k_ev->ident = event->fd;
+ k_ev->filter = filter;
+ k_ev->flags = action;
+ k_ev->fflags = 0;
+ k_ev->data = 0;
+ AccessWaitEvent(k_ev) = event;
+}
+
+static inline void
+WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event)
+{
+ /* For now postmaster death can only be added, not removed. */
+ k_ev->ident = PostmasterPid;
+ k_ev->filter = EVFILT_PROC;
+ k_ev->flags = EV_ADD;
+ k_ev->fflags = NOTE_EXIT;
+ k_ev->data = 0;
+ AccessWaitEvent(k_ev) = event;
+}
+
+static inline void
+WaitEventAdjustKqueueAddLatch(struct kevent *k_ev, WaitEvent *event)
+{
+ /* For now latch can only be added, not removed. */
+ k_ev->ident = SIGURG;
+ k_ev->filter = EVFILT_SIGNAL;
+ k_ev->flags = EV_ADD;
+ k_ev->fflags = 0;
+ k_ev->data = 0;
+ AccessWaitEvent(k_ev) = event;
+}
+
+/*
+ * old_events is the previous event mask, used to compute what has changed.
+ */
+static void
+WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
+{
+ int rc;
+ struct kevent k_ev[2];
+ int count = 0;
+ bool new_filt_read = false;
+ bool old_filt_read = false;
+ bool new_filt_write = false;
+ bool old_filt_write = false;
+
+ if (old_events == event->events)
+ return;
+
+ Assert(event->events != WL_LATCH_SET || set->latch != NULL);
+ Assert(event->events == WL_LATCH_SET ||
+ event->events == WL_POSTMASTER_DEATH ||
+ (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)));
+
+ if (event->events == WL_POSTMASTER_DEATH)
+ {
+ /*
+ * Unlike all the other implementations, we detect postmaster death
+ * using process notification instead of waiting on the postmaster
+ * alive pipe.
+ */
+ WaitEventAdjustKqueueAddPostmaster(&k_ev[count++], event);
+ }
+ else if (event->events == WL_LATCH_SET)
+ {
+ /* We detect latch wakeup using a signal event. */
+ WaitEventAdjustKqueueAddLatch(&k_ev[count++], event);
+ }
+ else
+ {
+ /*
+ * We need to compute the adds and deletes required to get from the
+ * old event mask to the new event mask, since kevent treats readable
+ * and writable as separate events.
+ */
+ if (old_events & WL_SOCKET_READABLE)
+ old_filt_read = true;
+ if (event->events & WL_SOCKET_READABLE)
+ new_filt_read = true;
+ if (old_events & WL_SOCKET_WRITEABLE)
+ old_filt_write = true;
+ if (event->events & WL_SOCKET_WRITEABLE)
+ new_filt_write = true;
+ if (old_filt_read && !new_filt_read)
+ WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_DELETE,
+ event);
+ else if (!old_filt_read && new_filt_read)
+ WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_ADD,
+ event);
+ if (old_filt_write && !new_filt_write)
+ WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_DELETE,
+ event);
+ else if (!old_filt_write && new_filt_write)
+ WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_ADD,
+ event);
+ }
+
+ Assert(count > 0);
+ Assert(count <= 2);
+
+ rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);
+
+ /*
+ * When adding the postmaster's pid, we have to consider that it might
+ * already have exited and perhaps even been replaced by another process
+ * with the same pid. If so, we have to defer reporting this as an event
+ * until the next call to WaitEventSetWaitBlock().
+ */
+
+ if (rc < 0)
+ {
+ if (event->events == WL_POSTMASTER_DEATH &&
+ (errno == ESRCH || errno == EACCES))
+ set->report_postmaster_not_running = true;
+ else
+ ereport(ERROR,
+ (errcode_for_socket_access(),
+ errmsg("%s() failed: %m",
+ "kevent")));
+ }
+ else if (event->events == WL_POSTMASTER_DEATH &&
+ PostmasterPid != getppid() &&
+ !PostmasterIsAlive())
+ {
+ /*
+ * The extra PostmasterIsAliveInternal() check prevents false alarms
+ * on systems that give a different value for getppid() while being
+ * traced by a debugger.
+ */
+ set->report_postmaster_not_running = true;
+ }
+}
+
+#endif
+
+#if defined(WAIT_USE_WIN32)
+static void
+WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
+{
+ HANDLE *handle = &set->handles[event->pos + 1];
+
+ if (event->events == WL_LATCH_SET)
+ {
+ Assert(set->latch != NULL);
+ *handle = set->latch->event;
+ }
+ else if (event->events == WL_POSTMASTER_DEATH)
+ {
+ *handle = PostmasterHandle;
+ }
+ else
+ {
+ int flags = FD_CLOSE; /* always check for errors/EOF */
+
+ if (event->events & WL_SOCKET_READABLE)
+ flags |= FD_READ;
+ if (event->events & WL_SOCKET_WRITEABLE)
+ flags |= FD_WRITE;
+ if (event->events & WL_SOCKET_CONNECTED)
+ flags |= FD_CONNECT;
+
+ if (*handle == WSA_INVALID_EVENT)
+ {
+ *handle = WSACreateEvent();
+ if (*handle == WSA_INVALID_EVENT)
+ elog(ERROR, "failed to create event for socket: error code %d",
+ WSAGetLastError());
+ }
+ if (WSAEventSelect(event->fd, *handle, flags) != 0)
+ elog(ERROR, "failed to set up event for socket: error code %d",
+ WSAGetLastError());
+
+ Assert(event->fd != PGINVALID_SOCKET);
+ }
+}
+#endif
+
+/*
+ * Wait for events added to the set to happen, or until the timeout is
+ * reached. At most nevents of the occurred events are returned.
+ *
+ * If timeout = -1, block until an event occurs; if 0, check sockets for
+ * readiness, but don't block; if > 0, block for at most timeout milliseconds.
+ *
+ * Returns the number of events that occurred, or 0 if the timeout was reached.
+ *
+ * Returned events will have the fd, pos, user_data fields set to the
+ * values associated with the registered event.
+ */
+int
+WaitEventSetWait(WaitEventSet *set, long timeout,
+ WaitEvent *occurred_events, int nevents,
+ uint32 wait_event_info)
+{
+ int returned_events = 0;
+ instr_time start_time;
+ instr_time cur_time;
+ long cur_timeout = -1;
+
+ Assert(nevents > 0);
+
+ /*
+ * Initialize timeout if requested. We must record the current time so
+ * that we can determine the remaining timeout if interrupted.
+ */
+ if (timeout >= 0)
+ {
+ INSTR_TIME_SET_CURRENT(start_time);
+ Assert(timeout >= 0 && timeout <= INT_MAX);
+ cur_timeout = timeout;
+ }
+
+ pgstat_report_wait_start(wait_event_info);
+
+#ifndef WIN32
+ waiting = true;
+#else
+ /* Ensure that signals are serviced even if latch is already set */
+ pgwin32_dispatch_queued_signals();
+#endif
+ while (returned_events == 0)
+ {
+ int rc;
+
+ /*
+ * Check if the latch is set already. If so, leave the loop
+		 * immediately to avoid blocking again. We don't attempt to report any
+ * other events that might also be satisfied.
+ *
+ * If someone sets the latch between this and the
+ * WaitEventSetWaitBlock() below, the setter will write a byte to the
+ * pipe (or signal us and the signal handler will do that), and the
+ * readiness routine will return immediately.
+ *
+		 * On Unix, if there's a pending byte in the self-pipe, we'll notice
+ * whenever blocking. Only clearing the pipe in that case avoids
+ * having to drain it every time WaitLatchOrSocket() is used. Should
+ * the pipe-buffer fill up we're still ok, because the pipe is in
+ * nonblocking mode. It's unlikely for that to happen, because the
+		 * self-pipe isn't filled unless we're blocking (waiting = true), or
+ * from inside a signal handler in latch_sigurg_handler().
+ *
+ * On windows, we'll also notice if there's a pending event for the
+ * latch when blocking, but there's no danger of anything filling up,
+ * as "Setting an event that is already set has no effect.".
+ *
+ * Note: we assume that the kernel calls involved in latch management
+ * will provide adequate synchronization on machines with weak memory
+ * ordering, so that we cannot miss seeing is_set if a notification
+ * has already been queued.
+ */
+ if (set->latch && !set->latch->is_set)
+ {
+ /* about to sleep on a latch */
+ set->latch->maybe_sleeping = true;
+ pg_memory_barrier();
+ /* and recheck */
+ }
+
+ if (set->latch && set->latch->is_set)
+ {
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->pos = set->latch_pos;
+ occurred_events->user_data =
+ set->events[set->latch_pos].user_data;
+ occurred_events->events = WL_LATCH_SET;
+ occurred_events++;
+ returned_events++;
+
+ /* could have been set above */
+ set->latch->maybe_sleeping = false;
+
+ break;
+ }
+
+ /*
+ * Wait for events using the readiness primitive chosen at the top of
+		 * this file. If -1 is returned, a timeout has occurred; if 0, we have
+		 * to retry; everything >= 1 is the number of returned events.
+ */
+ rc = WaitEventSetWaitBlock(set, cur_timeout,
+ occurred_events, nevents);
+
+ if (set->latch)
+ {
+ Assert(set->latch->maybe_sleeping);
+ set->latch->maybe_sleeping = false;
+ }
+
+ if (rc == -1)
+ break; /* timeout occurred */
+ else
+ returned_events = rc;
+
+ /* If we're not done, update cur_timeout for next iteration */
+ if (returned_events == 0 && timeout >= 0)
+ {
+ INSTR_TIME_SET_CURRENT(cur_time);
+ INSTR_TIME_SUBTRACT(cur_time, start_time);
+ cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
+ if (cur_timeout <= 0)
+ break;
+ }
+ }
+#ifndef WIN32
+ waiting = false;
+#endif
+
+ pgstat_report_wait_end();
+
+ return returned_events;
+}
+
+
+#if defined(WAIT_USE_EPOLL)
+
+/*
+ * Wait using linux's epoll_wait(2).
+ *
+ * This is the preferable wait method, as several readiness notifications are
+ * delivered without having to iterate through all of set->events. The returned
+ * epoll_event structs contain a pointer to our events, making association
+ * easy.
+ */
+static inline int
+WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+ WaitEvent *occurred_events, int nevents)
+{
+ int returned_events = 0;
+ int rc;
+ WaitEvent *cur_event;
+ struct epoll_event *cur_epoll_event;
+
+ /* Sleep */
+ rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
+ nevents, cur_timeout);
+
+ /* Check return code */
+ if (rc < 0)
+ {
+ /* EINTR is okay, otherwise complain */
+ if (errno != EINTR)
+ {
+ waiting = false;
+ ereport(ERROR,
+ (errcode_for_socket_access(),
+ errmsg("%s() failed: %m",
+ "epoll_wait")));
+ }
+ return 0;
+ }
+ else if (rc == 0)
+ {
+ /* timeout exceeded */
+ return -1;
+ }
+
+ /*
+ * At least one event occurred, iterate over the returned epoll events
+ * until they're either all processed, or we've returned all the events
+ * the caller desired.
+ */
+ for (cur_epoll_event = set->epoll_ret_events;
+ cur_epoll_event < (set->epoll_ret_events + rc) &&
+ returned_events < nevents;
+ cur_epoll_event++)
+ {
+ /* epoll's data pointer is set to the associated WaitEvent */
+ cur_event = (WaitEvent *) cur_epoll_event->data.ptr;
+
+ occurred_events->pos = cur_event->pos;
+ occurred_events->user_data = cur_event->user_data;
+ occurred_events->events = 0;
+
+ if (cur_event->events == WL_LATCH_SET &&
+ cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
+ {
+ /* Drain the signalfd. */
+ drain();
+
+ if (set->latch && set->latch->is_set)
+ {
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_LATCH_SET;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events == WL_POSTMASTER_DEATH &&
+ cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
+ {
+ /*
+ * We expect an EPOLLHUP when the remote end is closed, but
+ * because we don't expect the pipe to become readable or to have
+ * any errors either, treat those cases as postmaster death, too.
+ *
+ * Be paranoid about a spurious event signaling the postmaster as
+ * being dead. There have been reports about that happening with
+ * older primitives (select(2) to be specific), and a spurious
+ * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
+ * cost much.
+ */
+ if (!PostmasterIsAliveInternal())
+ {
+ if (set->exit_on_postmaster_death)
+ proc_exit(1);
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_POSTMASTER_DEATH;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ {
+ Assert(cur_event->fd != PGINVALID_SOCKET);
+
+ if ((cur_event->events & WL_SOCKET_READABLE) &&
+ (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
+ {
+ /* data available in socket, or EOF */
+ occurred_events->events |= WL_SOCKET_READABLE;
+ }
+
+ if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
+ (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
+ {
+ /* writable, or EOF */
+ occurred_events->events |= WL_SOCKET_WRITEABLE;
+ }
+
+ if (occurred_events->events != 0)
+ {
+ occurred_events->fd = cur_event->fd;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ }
+
+ return returned_events;
+}
+
+#elif defined(WAIT_USE_KQUEUE)
+
+/*
+ * Wait using kevent(2) on BSD-family systems and macOS.
+ *
+ * For now this mirrors the epoll code, but in future it could modify the fd
+ * set in the same call to kevent as it uses for waiting instead of doing that
+ * with separate system calls.
+ */
+static int
+WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+ WaitEvent *occurred_events, int nevents)
+{
+ int returned_events = 0;
+ int rc;
+ WaitEvent *cur_event;
+ struct kevent *cur_kqueue_event;
+ struct timespec timeout;
+ struct timespec *timeout_p;
+
+ if (cur_timeout < 0)
+ timeout_p = NULL;
+ else
+ {
+ timeout.tv_sec = cur_timeout / 1000;
+ timeout.tv_nsec = (cur_timeout % 1000) * 1000000;
+ timeout_p = &timeout;
+ }
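+
+ /*
+ * A worked example of the conversion above, with an illustrative value:
+ * cur_timeout = 1500 ms yields tv_sec = 1500 / 1000 = 1 and
+ * tv_nsec = (1500 % 1000) * 1000000 = 500000000, i.e. 1.5 seconds.
+ */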
+
+ /*
+ * Report postmaster events discovered by WaitEventAdjustKqueue() or an
+ * earlier call to WaitEventSetWait().
+ */
+ if (unlikely(set->report_postmaster_not_running))
+ {
+ if (set->exit_on_postmaster_death)
+ proc_exit(1);
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_POSTMASTER_DEATH;
+ return 1;
+ }
+
+ /* Sleep */
+ rc = kevent(set->kqueue_fd, NULL, 0,
+ set->kqueue_ret_events, nevents,
+ timeout_p);
+
+ /* Check return code */
+ if (rc < 0)
+ {
+ /* EINTR is okay, otherwise complain */
+ if (errno != EINTR)
+ {
+ waiting = false;
+ ereport(ERROR,
+ (errcode_for_socket_access(),
+ errmsg("%s() failed: %m",
+ "kevent")));
+ }
+ return 0;
+ }
+ else if (rc == 0)
+ {
+ /* timeout exceeded */
+ return -1;
+ }
+
+ /*
+ * At least one event occurred, iterate over the returned kqueue events
+ * until they're either all processed, or we've returned all the events
+ * the caller desired.
+ */
+ for (cur_kqueue_event = set->kqueue_ret_events;
+ cur_kqueue_event < (set->kqueue_ret_events + rc) &&
+ returned_events < nevents;
+ cur_kqueue_event++)
+ {
+ /* kevent's udata points to the associated WaitEvent */
+ cur_event = AccessWaitEvent(cur_kqueue_event);
+
+ occurred_events->pos = cur_event->pos;
+ occurred_events->user_data = cur_event->user_data;
+ occurred_events->events = 0;
+
+ if (cur_event->events == WL_LATCH_SET &&
+ cur_kqueue_event->filter == EVFILT_SIGNAL)
+ {
+ if (set->latch && set->latch->is_set)
+ {
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_LATCH_SET;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events == WL_POSTMASTER_DEATH &&
+ cur_kqueue_event->filter == EVFILT_PROC &&
+ (cur_kqueue_event->fflags & NOTE_EXIT) != 0)
+ {
+ /*
+ * The kernel will tell this kqueue object only once about the
+ * exit of the postmaster, so let's remember that for next time so
+ * that we provide level-triggered semantics.
+ */
+ set->report_postmaster_not_running = true;
+
+ if (set->exit_on_postmaster_death)
+ proc_exit(1);
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_POSTMASTER_DEATH;
+ occurred_events++;
+ returned_events++;
+ }
+ else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ {
+ Assert(cur_event->fd >= 0);
+
+ if ((cur_event->events & WL_SOCKET_READABLE) &&
+ (cur_kqueue_event->filter == EVFILT_READ))
+ {
+ /* readable, or EOF */
+ occurred_events->events |= WL_SOCKET_READABLE;
+ }
+
+ if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
+ (cur_kqueue_event->filter == EVFILT_WRITE))
+ {
+ /* writable, or EOF */
+ occurred_events->events |= WL_SOCKET_WRITEABLE;
+ }
+
+ if (occurred_events->events != 0)
+ {
+ occurred_events->fd = cur_event->fd;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ }
+
+ return returned_events;
+}
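+
+/*
+ * Illustrative sketch (not part of this file's machinery): the kqueue
+ * analogue of the association trick, using kevent's udata field.  The names
+ * "kq", "fd" and "my_event" are hypothetical.
+ *
+ *     struct kevent kev;
+ *
+ *     EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, my_event);
+ *     kevent(kq, &kev, 1, NULL, 0, NULL);       // register only, don't wait
+ *
+ *     struct kevent ret[16];
+ *     int n = kevent(kq, NULL, 0, ret, 16, &timeout);
+ *
+ *     for (int i = 0; i < n; i++)
+ *     {
+ *         WaitEvent *we = (WaitEvent *) ret[i].udata;   // same pointer back
+ *         ...
+ *     }
+ */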
+
+#elif defined(WAIT_USE_POLL)
+
+/*
+ * Wait using poll(2).
+ *
+ * This allows receiving readiness notifications for several events at once,
+ * but requires iterating through all of set->pollfds.
+ */
+static inline int
+WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+ WaitEvent *occurred_events, int nevents)
+{
+ int returned_events = 0;
+ int rc;
+ WaitEvent *cur_event;
+ struct pollfd *cur_pollfd;
+
+ /* Sleep */
+ rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
+
+ /* Check return code */
+ if (rc < 0)
+ {
+ /* EINTR is okay, otherwise complain */
+ if (errno != EINTR)
+ {
+ waiting = false;
+ ereport(ERROR,
+ (errcode_for_socket_access(),
+ errmsg("%s() failed: %m",
+ "poll")));
+ }
+ return 0;
+ }
+ else if (rc == 0)
+ {
+ /* timeout exceeded */
+ return -1;
+ }
+
+ for (cur_event = set->events, cur_pollfd = set->pollfds;
+ cur_event < (set->events + set->nevents) &&
+ returned_events < nevents;
+ cur_event++, cur_pollfd++)
+ {
+ /* no activity on this FD, skip */
+ if (cur_pollfd->revents == 0)
+ continue;
+
+ occurred_events->pos = cur_event->pos;
+ occurred_events->user_data = cur_event->user_data;
+ occurred_events->events = 0;
+
+ if (cur_event->events == WL_LATCH_SET &&
+ (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
+ {
+ /* There's data in the self-pipe, clear it. */
+ drain();
+
+ if (set->latch && set->latch->is_set)
+ {
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_LATCH_SET;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events == WL_POSTMASTER_DEATH &&
+ (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
+ {
+ /*
+ * We expect a POLLHUP when the remote end is closed, but because
+ * we don't expect the pipe to become readable or to have any
+ * errors either, treat those cases as postmaster death, too.
+ *
+ * Be paranoid about a spurious event signaling the postmaster as
+ * being dead. There have been reports about that happening with
+ * older primitives (select(2) to be specific), and a spurious
+ * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
+ * cost much.
+ */
+ if (!PostmasterIsAliveInternal())
+ {
+ if (set->exit_on_postmaster_death)
+ proc_exit(1);
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_POSTMASTER_DEATH;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+ {
+ int errflags = POLLHUP | POLLERR | POLLNVAL;
+
+ Assert(cur_event->fd >= PGINVALID_SOCKET);
+
+ if ((cur_event->events & WL_SOCKET_READABLE) &&
+ (cur_pollfd->revents & (POLLIN | errflags)))
+ {
+ /* data available in socket, or EOF */
+ occurred_events->events |= WL_SOCKET_READABLE;
+ }
+
+ if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
+ (cur_pollfd->revents & (POLLOUT | errflags)))
+ {
+ /* writeable, or EOF */
+ occurred_events->events |= WL_SOCKET_WRITEABLE;
+ }
+
+ if (occurred_events->events != 0)
+ {
+ occurred_events->fd = cur_event->fd;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ }
+ return returned_events;
+}
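+
+/*
+ * Illustrative sketch (not part of this file's machinery): with poll(2)
+ * there is no per-event user pointer, so the wait set keeps set->pollfds
+ * parallel to set->events and walks both arrays together, as above.  In
+ * miniature, with hypothetical names:
+ *
+ *     struct pollfd pfds[2] = {
+ *         {.fd = sock_fd, .events = POLLIN},
+ *         {.fd = pipe_fd, .events = POLLIN},
+ *     };
+ *
+ *     if (poll(pfds, 2, timeout_ms) > 0)
+ *     {
+ *         for (int i = 0; i < 2; i++)
+ *             if (pfds[i].revents != 0)
+ *                 ...   // events[i] is the matching WaitEvent
+ *     }
+ */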
+
+#elif defined(WAIT_USE_WIN32)
+
+/*
+ * Wait using Windows' WaitForMultipleObjects().
+ *
+ * Unfortunately this will only ever return a single readiness notification at
+ * a time. Note that while the official documentation for
+ * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
+ * with a single bWaitAll = FALSE call,
+ * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
+ * that only one event is "consumed".
+ */
+static inline int
+WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+ WaitEvent *occurred_events, int nevents)
+{
+ int returned_events = 0;
+ DWORD rc;
+ WaitEvent *cur_event;
+
+ /* Reset any wait events that need it */
+ for (cur_event = set->events;
+ cur_event < (set->events + set->nevents);
+ cur_event++)
+ {
+ if (cur_event->reset)
+ {
+ WaitEventAdjustWin32(set, cur_event);
+ cur_event->reset = false;
+ }
+
+ /*
+ * Windows does not guarantee to log an FD_WRITE network event
+ * indicating that more data can be sent unless the previous send()
+ * failed with WSAEWOULDBLOCK. While our caller might well have made
+ * such a call, we cannot assume that here. Therefore, if waiting for
+ * write-ready, force the issue by doing a dummy send(). If the dummy
+ * send() succeeds, assume that the socket is in fact write-ready, and
+ * return immediately. Also, if it fails with something other than
+ * WSAEWOULDBLOCK, return a write-ready indication to let our caller
+ * deal with the error condition.
+ */
+ if (cur_event->events & WL_SOCKET_WRITEABLE)
+ {
+ char c;
+ WSABUF buf;
+ DWORD sent;
+ int r;
+
+ buf.buf = &c;
+ buf.len = 0;
+
+ r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
+ if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
+ {
+ occurred_events->pos = cur_event->pos;
+ occurred_events->user_data = cur_event->user_data;
+ occurred_events->events = WL_SOCKET_WRITEABLE;
+ occurred_events->fd = cur_event->fd;
+ return 1;
+ }
+ }
+ }
+
+ /*
+ * Sleep.
+ *
+ * Need to wait on ->nevents + 1 handles, because the signal handle is in [0].
+ */
+ rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
+ cur_timeout);
+
+ /* Check return code */
+ if (rc == WAIT_FAILED)
+ elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
+ GetLastError());
+ else if (rc == WAIT_TIMEOUT)
+ {
+ /* timeout exceeded */
+ return -1;
+ }
+
+ if (rc == WAIT_OBJECT_0)
+ {
+ /* Service newly-arrived signals */
+ pgwin32_dispatch_queued_signals();
+ return 0; /* retry */
+ }
+
+ /*
+ * With an offset of one, due to the always present pgwin32_signal_event,
+ * the handle offset directly corresponds to a wait event.
+ */
+ cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];
+
+ occurred_events->pos = cur_event->pos;
+ occurred_events->user_data = cur_event->user_data;
+ occurred_events->events = 0;
+
+ if (cur_event->events == WL_LATCH_SET)
+ {
+ /*
+ * We cannot use set->latch->event to reset the fired event if we
+ * aren't waiting on this latch now.
+ */
+ if (!ResetEvent(set->handles[cur_event->pos + 1]))
+ elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
+
+ if (set->latch && set->latch->is_set)
+ {
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_LATCH_SET;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events == WL_POSTMASTER_DEATH)
+ {
+ /*
+ * Postmaster apparently died. Since the consequences of falsely
+ * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
+ * the trouble to positively verify this with PostmasterIsAlive(),
+ * even though there is no known reason to think that the event could
+ * be falsely set on Windows.
+ */
+ if (!PostmasterIsAliveInternal())
+ {
+ if (set->exit_on_postmaster_death)
+ proc_exit(1);
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_POSTMASTER_DEATH;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events & WL_SOCKET_MASK)
+ {
+ WSANETWORKEVENTS resEvents;
+ HANDLE handle = set->handles[cur_event->pos + 1];
+
+ Assert(cur_event->fd);
+
+ occurred_events->fd = cur_event->fd;
+
+ ZeroMemory(&resEvents, sizeof(resEvents));
+ if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
+ elog(ERROR, "failed to enumerate network events: error code %d",
+ WSAGetLastError());
+ if ((cur_event->events & WL_SOCKET_READABLE) &&
+ (resEvents.lNetworkEvents & FD_READ))
+ {
+ /* data available in socket */
+ occurred_events->events |= WL_SOCKET_READABLE;
+
+ /*------
+ * WaitForMultipleObjects doesn't guarantee that a read event will
+ * be returned if the latch is set at the same time. Even if it
+ * did, the caller might drop that event expecting it to reoccur
+ * on next call. So, we must force the event to be reset if this
+ * WaitEventSet is used again in order to avoid an indefinite
+ * hang. Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
+ * for the behavior of socket events.
+ *------
+ */
+ cur_event->reset = true;
+ }
+ if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
+ (resEvents.lNetworkEvents & FD_WRITE))
+ {
+ /* writeable */
+ occurred_events->events |= WL_SOCKET_WRITEABLE;
+ }
+ if ((cur_event->events & WL_SOCKET_CONNECTED) &&
+ (resEvents.lNetworkEvents & FD_CONNECT))
+ {
+ /* connected */
+ occurred_events->events |= WL_SOCKET_CONNECTED;
+ }
+ if (resEvents.lNetworkEvents & FD_CLOSE)
+ {
+ /* EOF/error, so signal all caller-requested socket flags */
+ occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
+ }
+
+ if (occurred_events->events != 0)
+ {
+ occurred_events++;
+ returned_events++;
+ }
+ }
+
+ return returned_events;
+}
+#endif
+
+/*
+ * Get the number of wait events registered in a given WaitEventSet.
+ */
+int
+GetNumRegisteredWaitEvents(WaitEventSet *set)
+{
+ return set->nevents;
+}
+
+#if defined(WAIT_USE_SELF_PIPE)
+
+/*
+ * SetLatch uses SIGURG to wake up the process waiting on the latch.
+ *
+ * Wake up WaitLatch, if we're waiting.
+ */
+static void
+latch_sigurg_handler(SIGNAL_ARGS)
+{
+ int save_errno = errno;
+
+ if (waiting)
+ sendSelfPipeByte();
+
+ errno = save_errno;
+}
+
+/* Send one byte to the self-pipe, to wake up WaitLatch */
+static void
+sendSelfPipeByte(void)
+{
+ int rc;
+ char dummy = 0;
+
+retry:
+ rc = write(selfpipe_writefd, &dummy, 1);
+ if (rc < 0)
+ {
+ /* If interrupted by signal, just retry */
+ if (errno == EINTR)
+ goto retry;
+
+ /*
+ * If the pipe is full, we don't need to retry; the data that's already
+ * there is enough to wake up WaitLatch.
+ */
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ return;
+
+ /*
+ * Oops, the write() failed for some other reason. We might be in a
+ * signal handler, so it's not safe to elog(). We have no choice but to
+ * silently ignore the error.
+ */
+ return;
+ }
+}
+
+#endif
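+
+/*
+ * Illustrative sketch (not part of this file's machinery): the classic
+ * self-pipe pattern used above.  A nonblocking pipe is created once; the
+ * signal handler writes a byte, which makes the read end readable and so
+ * wakes any poll/epoll-style wait that includes it.  The names "pipefd" and
+ * "on_signal" are hypothetical; error checking and flag preservation are
+ * omitted.
+ *
+ *     static int pipefd[2];
+ *
+ *     // setup, once:
+ *     pipe(pipefd);
+ *     fcntl(pipefd[0], F_SETFL, O_NONBLOCK);
+ *     fcntl(pipefd[1], F_SETFL, O_NONBLOCK);
+ *
+ *     // in the signal handler:
+ *     static void on_signal(int signo)
+ *     {
+ *         char b = 0;
+ *
+ *         (void) write(pipefd[1], &b, 1);   // async-signal-safe
+ *     }
+ *
+ *     // the read end is then added to the wait set; after a wakeup the
+ *     // pipe is drained with read() until EAGAIN, as drain() does below.
+ */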
+
+#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
+
+/*
+ * Read all available data from self-pipe or signalfd.
+ *
+ * Note: this is only called when waiting = true. If it fails without
+ * returning (i.e. it raises an ERROR), it must reset that flag first
+ * (though ideally, this will never happen).
+ */
+static void
+drain(void)
+{
+ char buf[1024];
+ int rc;
+ int fd;
+
+#ifdef WAIT_USE_SELF_PIPE
+ fd = selfpipe_readfd;
+#else
+ fd = signal_fd;
+#endif
+
+ for (;;)
+ {
+ rc = read(fd, buf, sizeof(buf));
+ if (rc < 0)
+ {
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ break; /* the descriptor is empty */
+ else if (errno == EINTR)
+ continue; /* retry */
+ else
+ {
+ waiting = false;
+#ifdef WAIT_USE_SELF_PIPE
+ elog(ERROR, "read() on self-pipe failed: %m");
+#else
+ elog(ERROR, "read() on signalfd failed: %m");
+#endif
+ }
+ }
+ else if (rc == 0)
+ {
+ waiting = false;
+#ifdef WAIT_USE_SELF_PIPE
+ elog(ERROR, "unexpected EOF on self-pipe");
+#else
+ elog(ERROR, "unexpected EOF on signalfd");
+#endif
+ }
+ else if (rc < sizeof(buf))
+ {
+ /* we successfully drained the pipe; no need to read() again */
+ break;
+ }
+ /* else buffer wasn't big enough, so read again */
+ }
+}
+
+#endif
diff --git a/src/backend/storage/ipc/pmsignal.c b/src/backend/storage/ipc/pmsignal.c
new file mode 100644
index 0000000..280c239
--- /dev/null
+++ b/src/backend/storage/ipc/pmsignal.c
@@ -0,0 +1,430 @@
+/*-------------------------------------------------------------------------
+ *
+ * pmsignal.c
+ * routines for signaling between the postmaster and its child processes
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/pmsignal.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+
+#ifdef HAVE_SYS_PRCTL_H
+#include <sys/prctl.h>
+#endif
+
+#include "miscadmin.h"
+#include "postmaster/postmaster.h"
+#include "replication/walsender.h"
+#include "storage/pmsignal.h"
+#include "storage/shmem.h"
+
+
+/*
+ * The postmaster is signaled by its children by sending SIGUSR1. The
+ * specific reason is communicated via flags in shared memory. We keep
+ * a boolean flag for each possible "reason", so that different reasons
+ * can be signaled by different backends at the same time. (However,
+ * if the same reason is signaled more than once simultaneously, the
+ * postmaster will observe it only once.)
+ *
+ * The flags are actually declared as "volatile sig_atomic_t" for maximum
+ * portability. This should ensure that loads and stores of the flag
+ * values are atomic, allowing us to dispense with any explicit locking.
+ *
+ * In addition to the per-reason flags, we store a set of per-child-process
+ * flags that are currently used only for detecting whether a backend has
+ * exited without performing proper shutdown. The per-child-process flags
+ * have three possible states: UNUSED, ASSIGNED, ACTIVE. An UNUSED slot is
+ * available for assignment. An ASSIGNED slot is associated with a postmaster
+ * child process, but either the process has not touched shared memory yet,
+ * or it has successfully cleaned up after itself. An ACTIVE slot means the
+ * process is actively using shared memory. The slots are assigned to
+ * child processes at random, and postmaster.c is responsible for tracking
+ * which one goes with which PID.
+ *
+ * Actually there is a fourth state, WALSENDER. This is just like ACTIVE,
+ * but carries the extra information that the child is a WAL sender.
+ * WAL senders, too, start in ACTIVE state, but switch to WALSENDER once
+ * they start streaming the WAL (and they never go back to ACTIVE after that).
+ *
+ * We also have a shared-memory field that is used for communication in
+ * the opposite direction, from postmaster to children: it tells why the
+ * postmaster has broadcast SIGQUIT signals, if indeed it has done so.
+ */
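+
+/*
+ * A minimal sketch of the round trip described above (illustrative only;
+ * "PMSIGNAL_SOMETHING" stands in for one of the real PMSignalReason values):
+ *
+ *     // in a child process:
+ *     SendPostmasterSignal(PMSIGNAL_SOMETHING);   // set flag, SIGUSR1 postmaster
+ *
+ *     // in the postmaster's SIGUSR1 handler:
+ *     if (CheckPostmasterSignal(PMSIGNAL_SOMETHING))
+ *         ...   // flag was set at least once since the last check; now cleared
+ */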
+
+#define PM_CHILD_UNUSED 0 /* these values must fit in sig_atomic_t */
+#define PM_CHILD_ASSIGNED 1
+#define PM_CHILD_ACTIVE 2
+#define PM_CHILD_WALSENDER 3
+
+/* "typedef struct PMSignalData PMSignalData" appears in pmsignal.h */
+struct PMSignalData
+{
+ /* per-reason flags for signaling the postmaster */
+ sig_atomic_t PMSignalFlags[NUM_PMSIGNALS];
+ /* global flags for signals from postmaster to children */
+ QuitSignalReason sigquit_reason; /* why SIGQUIT was sent */
+ /* per-child-process flags */
+ int num_child_flags; /* # of entries in PMChildFlags[] */
+ int next_child_flag; /* next slot to try to assign */
+ sig_atomic_t PMChildFlags[FLEXIBLE_ARRAY_MEMBER];
+};
+
+NON_EXEC_STATIC volatile PMSignalData *PMSignalState = NULL;
+
+/*
+ * Signal handler to be notified if postmaster dies.
+ */
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+volatile sig_atomic_t postmaster_possibly_dead = false;
+
+static void
+postmaster_death_handler(int signo)
+{
+ postmaster_possibly_dead = true;
+}
+
+/*
+ * The available signals depend on the OS. SIGUSR1 and SIGUSR2 are already
+ * used for other things, so choose another one.
+ *
+ * Currently, we assume that we can always find a signal to use. That
+ * seems like a reasonable assumption for all platforms that are modern
+ * enough to have a parent-death signaling mechanism.
+ */
+#if defined(SIGINFO)
+#define POSTMASTER_DEATH_SIGNAL SIGINFO
+#elif defined(SIGPWR)
+#define POSTMASTER_DEATH_SIGNAL SIGPWR
+#else
+#error "cannot find a signal to use for postmaster death"
+#endif
+
+#endif /* USE_POSTMASTER_DEATH_SIGNAL */
+
+/*
+ * PMSignalShmemSize
+ * Compute space needed for pmsignal.c's shared memory
+ */
+Size
+PMSignalShmemSize(void)
+{
+ Size size;
+
+ size = offsetof(PMSignalData, PMChildFlags);
+ size = add_size(size, mul_size(MaxLivePostmasterChildren(),
+ sizeof(sig_atomic_t)));
+
+ return size;
+}
+
+/*
+ * PMSignalShmemInit - initialize during shared-memory creation
+ */
+void
+PMSignalShmemInit(void)
+{
+ bool found;
+
+ PMSignalState = (PMSignalData *)
+ ShmemInitStruct("PMSignalState", PMSignalShmemSize(), &found);
+
+ if (!found)
+ {
+ /* initialize all flags to zeroes */
+ MemSet(unvolatize(PMSignalData *, PMSignalState), 0, PMSignalShmemSize());
+ PMSignalState->num_child_flags = MaxLivePostmasterChildren();
+ }
+}
+
+/*
+ * SendPostmasterSignal - signal the postmaster from a child process
+ */
+void
+SendPostmasterSignal(PMSignalReason reason)
+{
+ /* If called in a standalone backend, do nothing */
+ if (!IsUnderPostmaster)
+ return;
+ /* Atomically set the proper flag */
+ PMSignalState->PMSignalFlags[reason] = true;
+ /* Send signal to postmaster */
+ kill(PostmasterPid, SIGUSR1);
+}
+
+/*
+ * CheckPostmasterSignal - check to see if a particular reason has been
+ * signaled, and clear the signal flag. Should be called by postmaster
+ * after receiving SIGUSR1.
+ */
+bool
+CheckPostmasterSignal(PMSignalReason reason)
+{
+ /* Careful here --- don't clear flag if we haven't seen it set */
+ if (PMSignalState->PMSignalFlags[reason])
+ {
+ PMSignalState->PMSignalFlags[reason] = false;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * SetQuitSignalReason - broadcast the reason for a system shutdown.
+ * Should be called by postmaster before sending SIGQUIT to children.
+ *
+ * Note: in a crash-and-restart scenario, the "reason" field gets cleared
+ * as a part of rebuilding shared memory; the postmaster need not do it
+ * explicitly.
+ */
+void
+SetQuitSignalReason(QuitSignalReason reason)
+{
+ PMSignalState->sigquit_reason = reason;
+}
+
+/*
+ * GetQuitSignalReason - obtain the reason for a system shutdown.
+ * Called by child processes when they receive SIGQUIT.
+ * If the postmaster hasn't actually sent SIGQUIT, will return PMQUIT_NOT_SENT.
+ */
+QuitSignalReason
+GetQuitSignalReason(void)
+{
+ /* This is called in signal handlers, so be extra paranoid. */
+ if (!IsUnderPostmaster || PMSignalState == NULL)
+ return PMQUIT_NOT_SENT;
+ return PMSignalState->sigquit_reason;
+}
+
+
+/*
+ * AssignPostmasterChildSlot - select an unused slot for a new postmaster
+ * child process, and set its state to ASSIGNED. Returns a slot number
+ * (one to N).
+ *
+ * Only the postmaster is allowed to execute this routine, so we need no
+ * special locking.
+ */
+int
+AssignPostmasterChildSlot(void)
+{
+ int slot = PMSignalState->next_child_flag;
+ int n;
+
+ /*
+ * Scan for a free slot. We track the last slot assigned so as not to
+ * waste time repeatedly rescanning low-numbered slots.
+ */
+ for (n = PMSignalState->num_child_flags; n > 0; n--)
+ {
+ if (--slot < 0)
+ slot = PMSignalState->num_child_flags - 1;
+ if (PMSignalState->PMChildFlags[slot] == PM_CHILD_UNUSED)
+ {
+ PMSignalState->PMChildFlags[slot] = PM_CHILD_ASSIGNED;
+ PMSignalState->next_child_flag = slot;
+ return slot + 1;
+ }
+ }
+
+ /* Out of slots ... should never happen, else postmaster.c messed up */
+ elog(FATAL, "no free slots in PMChildFlags array");
+ return 0; /* keep compiler quiet */
+}
+
+/*
+ * ReleasePostmasterChildSlot - release a slot after death of a postmaster
+ * child process. This must be called in the postmaster process.
+ *
+ * Returns true if the slot had been in ASSIGNED state (the expected case),
+ * false otherwise (implying that the child failed to clean itself up).
+ */
+bool
+ReleasePostmasterChildSlot(int slot)
+{
+ bool result;
+
+ Assert(slot > 0 && slot <= PMSignalState->num_child_flags);
+ slot--;
+
+ /*
+ * Note: the slot state might already be unused, because the logic in
+ * postmaster.c is such that this might get called twice when a child
+ * crashes. So we don't try to Assert anything about the state.
+ */
+ result = (PMSignalState->PMChildFlags[slot] == PM_CHILD_ASSIGNED);
+ PMSignalState->PMChildFlags[slot] = PM_CHILD_UNUSED;
+ return result;
+}
+
+/*
+ * IsPostmasterChildWalSender - check if given slot is in use by a
+ * walsender process.
+ */
+bool
+IsPostmasterChildWalSender(int slot)
+{
+ Assert(slot > 0 && slot <= PMSignalState->num_child_flags);
+ slot--;
+
+ if (PMSignalState->PMChildFlags[slot] == PM_CHILD_WALSENDER)
+ return true;
+ else
+ return false;
+}
+
+/*
+ * MarkPostmasterChildActive - mark a postmaster child as about to begin
+ * actively using shared memory. This is called in the child process.
+ */
+void
+MarkPostmasterChildActive(void)
+{
+ int slot = MyPMChildSlot;
+
+ Assert(slot > 0 && slot <= PMSignalState->num_child_flags);
+ slot--;
+ Assert(PMSignalState->PMChildFlags[slot] == PM_CHILD_ASSIGNED);
+ PMSignalState->PMChildFlags[slot] = PM_CHILD_ACTIVE;
+}
+
+/*
+ * MarkPostmasterChildWalSender - mark a postmaster child as a WAL sender
+ * process. This is called in the child process, sometime after marking the
+ * child as active.
+ */
+void
+MarkPostmasterChildWalSender(void)
+{
+ int slot = MyPMChildSlot;
+
+ Assert(am_walsender);
+
+ Assert(slot > 0 && slot <= PMSignalState->num_child_flags);
+ slot--;
+ Assert(PMSignalState->PMChildFlags[slot] == PM_CHILD_ACTIVE);
+ PMSignalState->PMChildFlags[slot] = PM_CHILD_WALSENDER;
+}
+
+/*
+ * MarkPostmasterChildInactive - mark a postmaster child as done using
+ * shared memory. This is called in the child process.
+ */
+void
+MarkPostmasterChildInactive(void)
+{
+ int slot = MyPMChildSlot;
+
+ Assert(slot > 0 && slot <= PMSignalState->num_child_flags);
+ slot--;
+ Assert(PMSignalState->PMChildFlags[slot] == PM_CHILD_ACTIVE ||
+ PMSignalState->PMChildFlags[slot] == PM_CHILD_WALSENDER);
+ PMSignalState->PMChildFlags[slot] = PM_CHILD_ASSIGNED;
+}
+
+
+/*
+ * PostmasterIsAliveInternal - check whether postmaster process is still alive
+ *
+ * This is the slow path of PostmasterIsAlive(), where the caller has already
+ * checked 'postmaster_possibly_dead'. (On platforms that don't support
+ * a signal for parent death, PostmasterIsAlive() is just an alias for this.)
+ */
+bool
+PostmasterIsAliveInternal(void)
+{
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+ /*
+ * Reset the flag before checking, so that we don't miss a signal if
+ * postmaster dies right after the check. If postmaster was indeed dead,
+ * we'll re-arm it before returning to caller.
+ */
+ postmaster_possibly_dead = false;
+#endif
+
+#ifndef WIN32
+ {
+ char c;
+ ssize_t rc;
+
+ rc = read(postmaster_alive_fds[POSTMASTER_FD_WATCH], &c, 1);
+
+ /*
+ * In the usual case, the postmaster is still alive, and there is no
+ * data in the pipe.
+ */
+ if (rc < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
+ return true;
+ else
+ {
+ /*
+ * Postmaster is dead, or something went wrong with the read()
+ * call.
+ */
+
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+ postmaster_possibly_dead = true;
+#endif
+
+ if (rc < 0)
+ elog(FATAL, "read on postmaster death monitoring pipe failed: %m");
+ else if (rc > 0)
+ elog(FATAL, "unexpected data in postmaster death monitoring pipe");
+
+ return false;
+ }
+ }
+
+#else /* WIN32 */
+ if (WaitForSingleObject(PostmasterHandle, 0) == WAIT_TIMEOUT)
+ return true;
+ else
+ {
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+ postmaster_possibly_dead = true;
+#endif
+ return false;
+ }
+#endif /* WIN32 */
+}
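+
+/*
+ * Illustrative sketch (not part of this file's machinery) of the pipe trick
+ * used above on non-Windows systems: the postmaster holds the write end of a
+ * pipe open and never writes to it, while children inherit a nonblocking
+ * read end.  The names "death_pipe_rd" and "parent_is_alive" are hypothetical.
+ *
+ *     static bool
+ *     parent_is_alive(int death_pipe_rd)
+ *     {
+ *         char    c;
+ *         ssize_t rc = read(death_pipe_rd, &c, 1);
+ *
+ *         if (rc < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
+ *             return true;     // no data and no EOF: the writer still exists
+ *         return false;        // EOF (rc == 0) once the postmaster exits
+ *     }
+ */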
+
+/*
+ * PostmasterDeathSignalInit - request signal on postmaster death if possible
+ */
+void
+PostmasterDeathSignalInit(void)
+{
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+ int signum = POSTMASTER_DEATH_SIGNAL;
+
+ /* Register our signal handler. */
+ pqsignal(signum, postmaster_death_handler);
+
+ /* Request a signal on parent exit. */
+#if defined(PR_SET_PDEATHSIG)
+ if (prctl(PR_SET_PDEATHSIG, signum) < 0)
+ elog(ERROR, "could not request parent death signal: %m");
+#elif defined(PROC_PDEATHSIG_CTL)
+ if (procctl(P_PID, 0, PROC_PDEATHSIG_CTL, &signum) < 0)
+ elog(ERROR, "could not request parent death signal: %m");
+#else
+#error "USE_POSTMASTER_DEATH_SIGNAL set, but there is no mechanism to request the signal"
+#endif
+
+ /*
+ * Just in case the parent was gone already and we missed it, we'd better
+ * check the slow way on the first call.
+ */
+ postmaster_possibly_dead = true;
+#endif /* USE_POSTMASTER_DEATH_SIGNAL */
+}
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
new file mode 100644
index 0000000..755f842
--- /dev/null
+++ b/src/backend/storage/ipc/procarray.c
@@ -0,0 +1,5220 @@
+/*-------------------------------------------------------------------------
+ *
+ * procarray.c
+ * POSTGRES process array code.
+ *
+ *
+ * This module maintains arrays of PGPROC substructures, as well as associated
+ * arrays in ProcGlobal, for all active backends. Although there are several
+ * uses for this, the principal one is as a means of determining the set of
+ * currently running transactions.
+ *
+ * Because of various subtle race conditions it is critical that a backend
+ * hold the correct locks while setting or clearing its xid (in
+ * ProcGlobal->xids[]/MyProc->xid). See notes in
+ * src/backend/access/transam/README.
+ *
+ * The process arrays now also include structures representing prepared
+ * transactions. The xid and subxids fields of these are valid, as are the
+ * myProcLocks lists. They can be distinguished from regular backend PGPROCs
+ * at need by checking for pid == 0.
+ *
+ * During hot standby, we also keep a list of XIDs representing transactions
+ * that are known to be running on the primary (or more precisely, were running
+ * as of the current point in the WAL stream). This list is kept in the
+ * KnownAssignedXids array, and is updated by watching the sequence of
+ * arriving XIDs. This is necessary because if we leave those XIDs out of
+ * snapshots taken for standby queries, then they will appear to be already
+ * complete, leading to MVCC failures. Note that in hot standby, the PGPROC
+ * array represents standby processes, which by definition are not running
+ * transactions that have XIDs.
+ *
+ * It is perhaps possible for a backend on the primary to terminate without
+ * writing an abort record for its transaction. While that shouldn't really
+ * happen, it would tie up KnownAssignedXids indefinitely, so we protect
+ * ourselves by pruning the array when a valid list of running XIDs arrives.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/procarray.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+
+#include "access/clog.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "catalog/catalog.h"
+#include "catalog/pg_authid.h"
+#include "commands/dbcommands.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/spin.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+
+#define UINT32_ACCESS_ONCE(var) ((uint32)(*((volatile uint32 *)&(var))))
+
+/* Our shared memory area */
+typedef struct ProcArrayStruct
+{
+ int numProcs; /* number of valid procs entries */
+ int maxProcs; /* allocated size of procs array */
+
+ /*
+ * Known assigned XIDs handling
+ */
+ int maxKnownAssignedXids; /* allocated size of array */
+ int numKnownAssignedXids; /* current # of valid entries */
+ int tailKnownAssignedXids; /* index of oldest valid element */
+ int headKnownAssignedXids; /* index of newest element, + 1 */
+ slock_t known_assigned_xids_lck; /* protects head/tail pointers */
+
+ /*
+ * Highest subxid that has been removed from KnownAssignedXids array to
+ * prevent overflow; or InvalidTransactionId if none. We track this for
+ * similar reasons to tracking overflowing cached subxids in PGPROC
+ * entries. Must hold exclusive ProcArrayLock to change this, and shared
+ * lock to read it.
+ */
+ TransactionId lastOverflowedXid;
+
+ /* oldest xmin of any replication slot */
+ TransactionId replication_slot_xmin;
+ /* oldest catalog xmin of any replication slot */
+ TransactionId replication_slot_catalog_xmin;
+
+ /* indexes into allProcs[], has PROCARRAY_MAXPROCS entries */
+ int pgprocnos[FLEXIBLE_ARRAY_MEMBER];
+} ProcArrayStruct;
+
+/*
+ * State for the GlobalVisTest* family of functions. Those functions can
+ * e.g. be used to decide if a deleted row can be removed without violating
+ * MVCC semantics: If the deleted row's xmax is not considered to be running
+ * by anyone, the row can be removed.
+ *
+ * To avoid slowing down GetSnapshotData(), we don't calculate a precise
+ * cutoff XID while building a snapshot (looking at the frequently changing
+ * xmins scales badly). Instead we compute two boundaries while building the
+ * snapshot:
+ *
+ * 1) definitely_needed, indicating that rows deleted by XIDs >=
+ * definitely_needed are definitely still visible.
+ *
+ * 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can
+ * definitely be removed.
+ *
+ * When testing an XID that falls in between the two (i.e. XID >= maybe_needed
+ * && XID < definitely_needed), the boundaries can be recomputed (using
+ * ComputeXidHorizons()) to get a more accurate answer. This is cheaper than
+ * maintaining an accurate value all the time.
+ *
+ * As it is not cheap to compute accurate boundaries, we limit the number of
+ * times that happens in short succession. See GlobalVisTestShouldUpdate().
+ *
+ *
+ * There are four backend lifetime instances of this struct, optimized for
+ * different types of relations. As e.g. a normal user defined table in one
+ * database is inaccessible to backends connected to another database, a test
+ * specific to a relation can be more aggressive than a test for a shared
+ * relation. Currently we track four different states:
+ *
+ * 1) GlobalVisSharedRels, which only considers an XID's
+ * effects visible-to-everyone if neither snapshots in any database, nor a
+ * replication slot's xmin, nor a replication slot's catalog_xmin might
+ * still consider XID as running.
+ *
+ * 2) GlobalVisCatalogRels, which only considers an XID's
+ * effects visible-to-everyone if neither snapshots in the current
+ * database, nor a replication slot's xmin, nor a replication slot's
+ * catalog_xmin might still consider XID as running.
+ *
+ * I.e. the difference from GlobalVisSharedRels is that
+ * snapshots in other databases are ignored.
+ *
+ * 3) GlobalVisDataRels, which only considers an XID's
+ * effects visible-to-everyone if neither snapshots in the current
+ * database, nor a replication slot's xmin consider XID as running.
+ *
+ * I.e. the difference from GlobalVisCatalogRels is that
+ * the replication slots' catalog_xmin is not taken into account.
+ *
+ * 4) GlobalVisTempRels, which only considers the current session, as temp
+ * tables are not visible to other sessions.
+ *
+ * GlobalVisTestFor(relation) returns the appropriate state
+ * for the relation.
+ *
+ * The boundaries are FullTransactionIds instead of TransactionIds to avoid
+ * wraparound dangers. For example, there would otherwise exist no procarray
+ * state to prevent maybe_needed from growing old enough to wrap around after
+ * the GetSnapshotData() call.
+ *
+ * The typedef is in the header.
+ */
+struct GlobalVisState
+{
+ /* XIDs >= are considered running by some backend */
+ FullTransactionId definitely_needed;
+
+ /* XIDs < are not considered to be running by any backend */
+ FullTransactionId maybe_needed;
+};
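+
+/*
+ * A minimal sketch of how the two boundaries are meant to be used, under the
+ * assumptions spelled out in the comment above (this mirrors the logic of the
+ * GlobalVisTest* functions defined later in this file; "fxid" and "removable"
+ * are hypothetical):
+ *
+ *     if (FullTransactionIdPrecedes(fxid, state->maybe_needed))
+ *         removable = true;    // below both boundaries: safe to remove
+ *     else if (FullTransactionIdFollowsOrEquals(fxid, state->definitely_needed))
+ *         removable = false;   // some backend may still need it
+ *     else
+ *         ...                  // in between: recompute accurate horizons
+ */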
+
+/*
+ * Result of ComputeXidHorizons().
+ */
+typedef struct ComputeXidHorizonsResult
+{
+ /*
+ * The value of ShmemVariableCache->latestCompletedXid when
+ * ComputeXidHorizons() held ProcArrayLock.
+ */
+ FullTransactionId latest_completed;
+
+ /*
+ * The same for procArray->replication_slot_xmin and
+ * procArray->replication_slot_catalog_xmin.
+ */
+ TransactionId slot_xmin;
+ TransactionId slot_catalog_xmin;
+
+ /*
+ * Oldest xid that any backend might still consider running. This needs to
+ * include processes running VACUUM, in contrast to the normal visibility
+ * cutoffs, as vacuum needs to be able to perform pg_subtrans lookups when
+ * determining visibility, but doesn't care whether rows above its xmin
+ * are removed.
+ *
+ * This likely should only be needed to determine whether pg_subtrans can
+ * be truncated. It currently includes the effects of replication slots,
+ * for historical reasons. But that could likely be changed.
+ */
+ TransactionId oldest_considered_running;
+
+ /*
+ * Oldest xid for which deleted tuples need to be retained in shared
+ * tables.
+ *
+ * This includes the effects of replication slots. If that's not desired,
+ * look at shared_oldest_nonremovable_raw.
+ */
+ TransactionId shared_oldest_nonremovable;
+
+ /*
+ * Oldest xid that may be necessary to retain in shared tables. This is
+ * the same as shared_oldest_nonremovable, except that it is not affected by
+ * the replication slots' catalog_xmin.
+ *
+ * This is mainly useful to be able to send the catalog_xmin to upstream
+ * streaming replication servers via hot_standby_feedback, so they can
+ * apply the limit only when accessing catalog tables.
+ */
+ TransactionId shared_oldest_nonremovable_raw;
+
+ /*
+ * Oldest xid for which deleted tuples need to be retained in non-shared
+ * catalog tables.
+ */
+ TransactionId catalog_oldest_nonremovable;
+
+ /*
+ * Oldest xid for which deleted tuples need to be retained in normal user
+ * defined tables.
+ */
+ TransactionId data_oldest_nonremovable;
+
+ /*
+ * Oldest xid for which deleted tuples need to be retained in this
+ * session's temporary tables.
+ */
+ TransactionId temp_oldest_nonremovable;
+
+} ComputeXidHorizonsResult;
+
+/*
+ * Return value for GlobalVisHorizonKindForRel().
+ */
+typedef enum GlobalVisHorizonKind
+{
+ VISHORIZON_SHARED,
+ VISHORIZON_CATALOG,
+ VISHORIZON_DATA,
+ VISHORIZON_TEMP
+} GlobalVisHorizonKind;
+
+
+static ProcArrayStruct *procArray;
+
+static PGPROC *allProcs;
+
+/*
+ * Cache to reduce overhead of repeated calls to TransactionIdIsInProgress()
+ */
+static TransactionId cachedXidIsNotInProgress = InvalidTransactionId;
+
+/*
+ * Bookkeeping for tracking emulated transactions in recovery
+ */
+static TransactionId *KnownAssignedXids;
+static bool *KnownAssignedXidsValid;
+static TransactionId latestObservedXid = InvalidTransactionId;
+
+/*
+ * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is
+ * the highest xid that might still be running that we don't have in
+ * KnownAssignedXids.
+ */
+static TransactionId standbySnapshotPendingXmin;
+
+/*
+ * State for visibility checks on different types of relations. See struct
+ * GlobalVisState for details. As shared, catalog, normal and temporary
+ * relations can have different horizons, one such state exists for each.
+ */
+static GlobalVisState GlobalVisSharedRels;
+static GlobalVisState GlobalVisCatalogRels;
+static GlobalVisState GlobalVisDataRels;
+static GlobalVisState GlobalVisTempRels;
+
+/*
+ * This backend's RecentXmin at the last time the accurate xmin horizon was
+ * recomputed, or InvalidTransactionId if it has not. Used to limit how many
+ * times accurate horizons are recomputed. See GlobalVisTestShouldUpdate().
+ */
+static TransactionId ComputeXidHorizonsResultLastXmin;
+
+#ifdef XIDCACHE_DEBUG
+
+/* counters for XidCache measurement */
+static long xc_by_recent_xmin = 0;
+static long xc_by_known_xact = 0;
+static long xc_by_my_xact = 0;
+static long xc_by_latest_xid = 0;
+static long xc_by_main_xid = 0;
+static long xc_by_child_xid = 0;
+static long xc_by_known_assigned = 0;
+static long xc_no_overflow = 0;
+static long xc_slow_answer = 0;
+
+#define xc_by_recent_xmin_inc() (xc_by_recent_xmin++)
+#define xc_by_known_xact_inc() (xc_by_known_xact++)
+#define xc_by_my_xact_inc() (xc_by_my_xact++)
+#define xc_by_latest_xid_inc() (xc_by_latest_xid++)
+#define xc_by_main_xid_inc() (xc_by_main_xid++)
+#define xc_by_child_xid_inc() (xc_by_child_xid++)
+#define xc_by_known_assigned_inc() (xc_by_known_assigned++)
+#define xc_no_overflow_inc() (xc_no_overflow++)
+#define xc_slow_answer_inc() (xc_slow_answer++)
+
+static void DisplayXidCache(void);
+#else /* !XIDCACHE_DEBUG */
+
+#define xc_by_recent_xmin_inc() ((void) 0)
+#define xc_by_known_xact_inc() ((void) 0)
+#define xc_by_my_xact_inc() ((void) 0)
+#define xc_by_latest_xid_inc() ((void) 0)
+#define xc_by_main_xid_inc() ((void) 0)
+#define xc_by_child_xid_inc() ((void) 0)
+#define xc_by_known_assigned_inc() ((void) 0)
+#define xc_no_overflow_inc() ((void) 0)
+#define xc_slow_answer_inc() ((void) 0)
+#endif /* XIDCACHE_DEBUG */
+
+static VirtualTransactionId *GetVirtualXIDsDelayingChkptGuts(int *nvxids,
+ int type);
+static bool HaveVirtualXIDsDelayingChkptGuts(VirtualTransactionId *vxids,
+ int nvxids, int type);
+
+/* Primitives for KnownAssignedXids array handling for standby */
+static void KnownAssignedXidsCompress(bool force);
+static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
+ bool exclusive_lock);
+static bool KnownAssignedXidsSearch(TransactionId xid, bool remove);
+static bool KnownAssignedXidExists(TransactionId xid);
+static void KnownAssignedXidsRemove(TransactionId xid);
+static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
+ TransactionId *subxids);
+static void KnownAssignedXidsRemovePreceding(TransactionId xid);
+static int KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax);
+static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray,
+ TransactionId *xmin,
+ TransactionId xmax);
+static TransactionId KnownAssignedXidsGetOldestXmin(void);
+static void KnownAssignedXidsDisplay(int trace_level);
+static void KnownAssignedXidsReset(void);
+static inline void ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid);
+static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid);
+static void MaintainLatestCompletedXid(TransactionId latestXid);
+static void MaintainLatestCompletedXidRecovery(TransactionId latestXid);
+
+static inline FullTransactionId FullXidRelativeTo(FullTransactionId rel,
+ TransactionId xid);
+static void GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons);
+
+/*
+ * Report shared-memory space needed by CreateSharedProcArray.
+ */
+Size
+ProcArrayShmemSize(void)
+{
+ Size size;
+
+ /* Size of the ProcArray structure itself */
+#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts)
+
+ size = offsetof(ProcArrayStruct, pgprocnos);
+ size = add_size(size, mul_size(sizeof(int), PROCARRAY_MAXPROCS));
+
+ /*
+ * During Hot Standby processing we have a data structure called
+ * KnownAssignedXids, created in shared memory. Local data structures are
+ * also created in various backends during GetSnapshotData(),
+ * TransactionIdIsInProgress() and GetRunningTransactionData(). All of the
+ * main structures created in those functions must be identically sized,
+ * since we may at times copy the whole of the data structures around. We
+ * refer to this size as TOTAL_MAX_CACHED_SUBXIDS.
+ *
+ * Ideally we'd only create this structure if we were actually doing hot
+ * standby in the current run, but we don't know that yet at the time
+ * shared memory is being set up.
+ */
+#define TOTAL_MAX_CACHED_SUBXIDS \
+ ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)
+
+ if (EnableHotStandby)
+ {
+ size = add_size(size,
+ mul_size(sizeof(TransactionId),
+ TOTAL_MAX_CACHED_SUBXIDS));
+ size = add_size(size,
+ mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS));
+ }
+
+ return size;
+}
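+
+/*
+ * A worked example of the sizing above, with illustrative settings: assuming
+ * MaxBackends = 100, max_prepared_xacts = 5 and PGPROC_MAX_CACHED_SUBXIDS =
+ * 64, PROCARRAY_MAXPROCS is 105 and TOTAL_MAX_CACHED_SUBXIDS is 65 * 105 =
+ * 6825.  With hot standby enabled, that adds 6825 * sizeof(TransactionId) =
+ * 27300 bytes for KnownAssignedXids plus 6825 bytes for
+ * KnownAssignedXidsValid, on top of the pgprocnos array.
+ */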
+
+/*
+ * Initialize the shared PGPROC array during postmaster startup.
+ */
+void
+CreateSharedProcArray(void)
+{
+ bool found;
+
+ /* Create or attach to the ProcArray shared structure */
+ procArray = (ProcArrayStruct *)
+ ShmemInitStruct("Proc Array",
+ add_size(offsetof(ProcArrayStruct, pgprocnos),
+ mul_size(sizeof(int),
+ PROCARRAY_MAXPROCS)),
+ &found);
+
+ if (!found)
+ {
+ /*
+ * We're the first - initialize.
+ */
+ procArray->numProcs = 0;
+ procArray->maxProcs = PROCARRAY_MAXPROCS;
+ procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS;
+ procArray->numKnownAssignedXids = 0;
+ procArray->tailKnownAssignedXids = 0;
+ procArray->headKnownAssignedXids = 0;
+ SpinLockInit(&procArray->known_assigned_xids_lck);
+ procArray->lastOverflowedXid = InvalidTransactionId;
+ procArray->replication_slot_xmin = InvalidTransactionId;
+ procArray->replication_slot_catalog_xmin = InvalidTransactionId;
+ ShmemVariableCache->xactCompletionCount = 1;
+ }
+
+ allProcs = ProcGlobal->allProcs;
+
+ /* Create or attach to the KnownAssignedXids arrays too, if needed */
+ if (EnableHotStandby)
+ {
+ KnownAssignedXids = (TransactionId *)
+ ShmemInitStruct("KnownAssignedXids",
+ mul_size(sizeof(TransactionId),
+ TOTAL_MAX_CACHED_SUBXIDS),
+ &found);
+ KnownAssignedXidsValid = (bool *)
+ ShmemInitStruct("KnownAssignedXidsValid",
+ mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS),
+ &found);
+ }
+}
+
+/*
+ * Add the specified PGPROC to the shared array.
+ */
+void
+ProcArrayAdd(PGPROC *proc)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+ int movecount;
+
+ /* See ProcGlobal comment explaining why both locks are held */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+
+ if (arrayP->numProcs >= arrayP->maxProcs)
+ {
+ /*
+ * Oops, no room. (This really shouldn't happen, since there is a
+ * fixed supply of PGPROC structs too, and so we should have failed
+ * earlier.)
+ */
+ ereport(FATAL,
+ (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+ errmsg("sorry, too many clients already")));
+ }
+
+ /*
+ * Keep the procs array sorted by (PGPROC *) so that we can utilize
+ * locality of reference much better. This is useful while traversing the
+ * ProcArray because there is an increased likelihood of finding the next
+ * PGPROC structure in the cache.
+ *
+ * Since the occurrence of adding/removing a proc is much lower than the
+ * access to the ProcArray itself, the overhead should be marginal.
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int procno PG_USED_FOR_ASSERTS_ONLY = arrayP->pgprocnos[index];
+
+ Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS));
+ Assert(allProcs[procno].pgxactoff == index);
+
+ /* If we have found our right position in the array, break */
+ if (arrayP->pgprocnos[index] > proc->pgprocno)
+ break;
+ }
+
+ movecount = arrayP->numProcs - index;
+ memmove(&arrayP->pgprocnos[index + 1],
+ &arrayP->pgprocnos[index],
+ movecount * sizeof(*arrayP->pgprocnos));
+ memmove(&ProcGlobal->xids[index + 1],
+ &ProcGlobal->xids[index],
+ movecount * sizeof(*ProcGlobal->xids));
+ memmove(&ProcGlobal->subxidStates[index + 1],
+ &ProcGlobal->subxidStates[index],
+ movecount * sizeof(*ProcGlobal->subxidStates));
+ memmove(&ProcGlobal->statusFlags[index + 1],
+ &ProcGlobal->statusFlags[index],
+ movecount * sizeof(*ProcGlobal->statusFlags));
+
+ arrayP->pgprocnos[index] = proc->pgprocno;
+ proc->pgxactoff = index;
+ ProcGlobal->xids[index] = proc->xid;
+ ProcGlobal->subxidStates[index] = proc->subxidStatus;
+ ProcGlobal->statusFlags[index] = proc->statusFlags;
+
+ arrayP->numProcs++;
+
+ /* adjust pgxactoff for all following PGPROCs */
+ index++;
+ for (; index < arrayP->numProcs; index++)
+ {
+ int procno = arrayP->pgprocnos[index];
+
+ Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS));
+ Assert(allProcs[procno].pgxactoff == index - 1);
+
+ allProcs[procno].pgxactoff = index;
+ }
+
+ /*
+ * Release in reversed acquisition order, to reduce frequency of having to
+ * wait for XidGenLock while holding ProcArrayLock.
+ */
+ LWLockRelease(XidGenLock);
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * Remove the specified PGPROC from the shared array.
+ *
+ * When latestXid is a valid XID, we are removing a live 2PC gxact from the
+ * array, and thus causing it to appear as "not running" anymore. In this
+ * case we must advance latestCompletedXid. (This is essentially the same
+ * as ProcArrayEndTransaction followed by removal of the PGPROC, but we take
+ * the ProcArrayLock only once, and don't damage the content of the PGPROC;
+ * twophase.c depends on the latter.)
+ */
+void
+ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int myoff;
+ int movecount;
+
+#ifdef XIDCACHE_DEBUG
+ /* dump stats at backend shutdown, but not prepared-xact end */
+ if (proc->pid != 0)
+ DisplayXidCache();
+#endif
+
+ /* See ProcGlobal comment explaining why both locks are held */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+
+ myoff = proc->pgxactoff;
+
+ Assert(myoff >= 0 && myoff < arrayP->numProcs);
+ Assert(ProcGlobal->allProcs[arrayP->pgprocnos[myoff]].pgxactoff == myoff);
+
+ if (TransactionIdIsValid(latestXid))
+ {
+ Assert(TransactionIdIsValid(ProcGlobal->xids[myoff]));
+
+ /* Advance global latestCompletedXid while holding the lock */
+ MaintainLatestCompletedXid(latestXid);
+
+ /* Same with xactCompletionCount */
+ ShmemVariableCache->xactCompletionCount++;
+
+ ProcGlobal->xids[myoff] = InvalidTransactionId;
+ ProcGlobal->subxidStates[myoff].overflowed = false;
+ ProcGlobal->subxidStates[myoff].count = 0;
+ }
+ else
+ {
+ /* Shouldn't be trying to remove a live transaction here */
+ Assert(!TransactionIdIsValid(ProcGlobal->xids[myoff]));
+ }
+
+ Assert(!TransactionIdIsValid(ProcGlobal->xids[myoff]));
+ Assert(ProcGlobal->subxidStates[myoff].count == 0);
+ Assert(ProcGlobal->subxidStates[myoff].overflowed == false);
+
+ ProcGlobal->statusFlags[myoff] = 0;
+
+ /* Keep the PGPROC array sorted. See notes above */
+ movecount = arrayP->numProcs - myoff - 1;
+ memmove(&arrayP->pgprocnos[myoff],
+ &arrayP->pgprocnos[myoff + 1],
+ movecount * sizeof(*arrayP->pgprocnos));
+ memmove(&ProcGlobal->xids[myoff],
+ &ProcGlobal->xids[myoff + 1],
+ movecount * sizeof(*ProcGlobal->xids));
+ memmove(&ProcGlobal->subxidStates[myoff],
+ &ProcGlobal->subxidStates[myoff + 1],
+ movecount * sizeof(*ProcGlobal->subxidStates));
+ memmove(&ProcGlobal->statusFlags[myoff],
+ &ProcGlobal->statusFlags[myoff + 1],
+ movecount * sizeof(*ProcGlobal->statusFlags));
+
+ arrayP->pgprocnos[arrayP->numProcs - 1] = -1; /* for debugging */
+ arrayP->numProcs--;
+
+ /*
+ * Adjust pgxactoff of following procs for removed PGPROC (note that
+ * numProcs already has been decremented).
+ */
+ for (int index = myoff; index < arrayP->numProcs; index++)
+ {
+ int procno = arrayP->pgprocnos[index];
+
+ Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS));
+ Assert(allProcs[procno].pgxactoff - 1 == index);
+
+ allProcs[procno].pgxactoff = index;
+ }
+
+ /*
+ * Release in reversed acquisition order, to reduce frequency of having to
+ * wait for XidGenLock while holding ProcArrayLock.
+ */
+ LWLockRelease(XidGenLock);
+ LWLockRelease(ProcArrayLock);
+}
+
+
+/*
+ * ProcArrayEndTransaction -- mark a transaction as no longer running
+ *
+ * This is used interchangeably for commit and abort cases. The transaction
+ * commit/abort must already be reported to WAL and pg_xact.
+ *
+ * proc is currently always MyProc, but we pass it explicitly for flexibility.
+ * latestXid is the latest Xid among the transaction's main XID and
+ * subtransactions, or InvalidTransactionId if it has no XID. (We must ask
+ * the caller to pass latestXid, instead of computing it from the PGPROC's
+ * contents, because the subxid information in the PGPROC might be
+ * incomplete.)
+ */
+void
+ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
+{
+ if (TransactionIdIsValid(latestXid))
+ {
+ /*
+ * We must lock ProcArrayLock while clearing our advertised XID, so
+ * that we do not exit the set of "running" transactions while someone
+ * else is taking a snapshot. See discussion in
+ * src/backend/access/transam/README.
+ */
+ Assert(TransactionIdIsValid(proc->xid));
+
+ /*
+ * If we can immediately acquire ProcArrayLock, we clear our own XID
+ * and release the lock. If not, use group XID clearing to improve
+ * efficiency.
+ */
+ if (LWLockConditionalAcquire(ProcArrayLock, LW_EXCLUSIVE))
+ {
+ ProcArrayEndTransactionInternal(proc, latestXid);
+ LWLockRelease(ProcArrayLock);
+ }
+ else
+ ProcArrayGroupClearXid(proc, latestXid);
+ }
+ else
+ {
+ /*
+ * If we have no XID, we don't need to lock, since we won't affect
+ * anyone else's calculation of a snapshot. We might change their
+ * estimate of global xmin, but that's OK.
+ */
+ Assert(!TransactionIdIsValid(proc->xid));
+ Assert(proc->subxidStatus.count == 0);
+ Assert(!proc->subxidStatus.overflowed);
+
+ proc->lxid = InvalidLocalTransactionId;
+ proc->xmin = InvalidTransactionId;
+
+ /* be sure these are cleared in abort */
+ proc->delayChkpt = false;
+ proc->delayChkptEnd = false;
+
+ proc->recoveryConflictPending = false;
+
+ /* must be cleared with xid/xmin: */
+ /* avoid unnecessarily dirtying shared cachelines */
+ if (proc->statusFlags & PROC_VACUUM_STATE_MASK)
+ {
+ Assert(!LWLockHeldByMe(ProcArrayLock));
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ Assert(proc->statusFlags == ProcGlobal->statusFlags[proc->pgxactoff]);
+ proc->statusFlags &= ~PROC_VACUUM_STATE_MASK;
+ ProcGlobal->statusFlags[proc->pgxactoff] = proc->statusFlags;
+ LWLockRelease(ProcArrayLock);
+ }
+ }
+}
+
+/*
+ * Mark a write transaction as no longer running.
+ *
+ * We don't do any locking here; caller must handle that.
+ */
+static inline void
+ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
+{
+ int pgxactoff = proc->pgxactoff;
+
+ /*
+ * Note: we need exclusive lock here because we're going to change other
+ * processes' PGPROC entries.
+ */
+ Assert(LWLockHeldByMeInMode(ProcArrayLock, LW_EXCLUSIVE));
+ Assert(TransactionIdIsValid(ProcGlobal->xids[pgxactoff]));
+ Assert(ProcGlobal->xids[pgxactoff] == proc->xid);
+
+ ProcGlobal->xids[pgxactoff] = InvalidTransactionId;
+ proc->xid = InvalidTransactionId;
+ proc->lxid = InvalidLocalTransactionId;
+ proc->xmin = InvalidTransactionId;
+
+ /* be sure these are cleared in abort */
+ proc->delayChkpt = false;
+ proc->delayChkptEnd = false;
+
+ proc->recoveryConflictPending = false;
+
+ /* must be cleared with xid/xmin: */
+ /* avoid unnecessarily dirtying shared cachelines */
+ if (proc->statusFlags & PROC_VACUUM_STATE_MASK)
+ {
+ proc->statusFlags &= ~PROC_VACUUM_STATE_MASK;
+ ProcGlobal->statusFlags[proc->pgxactoff] = proc->statusFlags;
+ }
+
+ /* Clear the subtransaction-XID cache too while holding the lock */
+ Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count &&
+ ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed);
+ if (proc->subxidStatus.count > 0 || proc->subxidStatus.overflowed)
+ {
+ ProcGlobal->subxidStates[pgxactoff].count = 0;
+ ProcGlobal->subxidStates[pgxactoff].overflowed = false;
+ proc->subxidStatus.count = 0;
+ proc->subxidStatus.overflowed = false;
+ }
+
+ /* Also advance global latestCompletedXid while holding the lock */
+ MaintainLatestCompletedXid(latestXid);
+
+ /* Same with xactCompletionCount */
+ ShmemVariableCache->xactCompletionCount++;
+}
+
+/*
+ * ProcArrayGroupClearXid -- group XID clearing
+ *
+ * When we cannot immediately acquire ProcArrayLock in exclusive mode at
+ * commit time, add ourselves to a list of processes that need their XIDs
+ * cleared. The first process to add itself to the list will acquire
+ * ProcArrayLock in exclusive mode and perform ProcArrayEndTransactionInternal
+ * on behalf of all group members. This avoids a great deal of contention
+ * around ProcArrayLock when many processes are trying to commit at once,
+ * since the lock need not be repeatedly handed off from one committing
+ * process to the next.
+ */
+static void
+ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid)
+{
+ PROC_HDR *procglobal = ProcGlobal;
+ uint32 nextidx;
+ uint32 wakeidx;
+
+ /* We should definitely have an XID to clear. */
+ Assert(TransactionIdIsValid(proc->xid));
+
+ /* Add ourselves to the list of processes needing a group XID clear. */
+ proc->procArrayGroupMember = true;
+ proc->procArrayGroupMemberXid = latestXid;
+ nextidx = pg_atomic_read_u32(&procglobal->procArrayGroupFirst);
+ while (true)
+ {
+ pg_atomic_write_u32(&proc->procArrayGroupNext, nextidx);
+
+ if (pg_atomic_compare_exchange_u32(&procglobal->procArrayGroupFirst,
+ &nextidx,
+ (uint32) proc->pgprocno))
+ break;
+ }
+
+ /*
+ * If the list was not empty, the leader will clear our XID. It is
+ * impossible to have followers without a leader because the first process
+ * that has added itself to the list will always have nextidx as
+ * INVALID_PGPROCNO.
+ */
+ if (nextidx != INVALID_PGPROCNO)
+ {
+ int extraWaits = 0;
+
+ /* Sleep until the leader clears our XID. */
+ pgstat_report_wait_start(WAIT_EVENT_PROCARRAY_GROUP_UPDATE);
+ for (;;)
+ {
+ /* acts as a read barrier */
+ PGSemaphoreLock(proc->sem);
+ if (!proc->procArrayGroupMember)
+ break;
+ extraWaits++;
+ }
+ pgstat_report_wait_end();
+
+ Assert(pg_atomic_read_u32(&proc->procArrayGroupNext) == INVALID_PGPROCNO);
+
+ /* Fix semaphore count for any absorbed wakeups */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+ return;
+ }
+
+ /* We are the leader. Acquire the lock on behalf of everyone. */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * Now that we've got the lock, clear the list of processes waiting for
+ * group XID clearing, saving a pointer to the head of the list. Trying
+ * to pop elements one at a time could lead to an ABA problem.
+ */
+ nextidx = pg_atomic_exchange_u32(&procglobal->procArrayGroupFirst,
+ INVALID_PGPROCNO);
+
+ /* Remember head of list so we can perform wakeups after dropping lock. */
+ wakeidx = nextidx;
+
+ /* Walk the list and clear all XIDs. */
+ while (nextidx != INVALID_PGPROCNO)
+ {
+ PGPROC *nextproc = &allProcs[nextidx];
+
+ ProcArrayEndTransactionInternal(nextproc, nextproc->procArrayGroupMemberXid);
+
+ /* Move to next proc in list. */
+ nextidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext);
+ }
+
+ /* We're done with the lock now. */
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * Now that we've released the lock, go back and wake everybody up. We
+ * don't do this under the lock so as to keep lock hold times to a
+ * minimum. The system calls we need to perform to wake other processes
+ * up are probably much slower than the simple memory writes we did while
+ * holding the lock.
+ */
+ while (wakeidx != INVALID_PGPROCNO)
+ {
+ PGPROC *nextproc = &allProcs[wakeidx];
+
+ wakeidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext);
+ pg_atomic_write_u32(&nextproc->procArrayGroupNext, INVALID_PGPROCNO);
+
+ /* ensure all previous writes are visible before follower continues. */
+ pg_write_barrier();
+
+ nextproc->procArrayGroupMember = false;
+
+ if (nextproc != MyProc)
+ PGSemaphoreUnlock(nextproc->sem);
+ }
+}
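+
+/*
+ * Editor's note: the list push above is the classic lock-free "push onto an
+ * atomic head" pattern.  The sketch below (guarded by NOT_USED so it is not
+ * compiled) restates it in isolation; "head" stands in for
+ * procArrayGroupFirst and "next" for the pushing process's
+ * procArrayGroupNext, and the function name is purely illustrative.
+ */
+#ifdef NOT_USED
+static void
+example_group_list_push(pg_atomic_uint32 *head, pg_atomic_uint32 *next,
+						uint32 myindex)
+{
+	uint32		oldhead = pg_atomic_read_u32(head);
+
+	for (;;)
+	{
+		/* publish the head we saw as our successor... */
+		pg_atomic_write_u32(next, oldhead);
+
+		/* ...then try to install ourselves as the new head */
+		if (pg_atomic_compare_exchange_u32(head, &oldhead, myindex))
+			break;
+
+		/* on failure, oldhead now holds the current head; simply retry */
+	}
+}
+#endif							/* NOT_USED */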
+
+/*
+ * ProcArrayClearTransaction -- clear the transaction fields
+ *
+ * This is used after successfully preparing a 2-phase transaction. We are
+ * not actually reporting the transaction's XID as no longer running --- it
+ * will still appear as running because the 2PC's gxact is in the ProcArray
+ * too. We just have to clear out our own PGPROC.
+ */
+void
+ProcArrayClearTransaction(PGPROC *proc)
+{
+ int pgxactoff;
+
+ /*
+ * Currently we need to lock ProcArrayLock exclusively here, as we
+ * increment xactCompletionCount below. We also need it at least in shared
+ * mode for pgproc->pgxactoff to stay the same below.
+ *
+ * We could, however, lower the lock level to shared if we made
+ * xactCompletionCount an atomic variable, since this action does not
+ * actually change anyone's view of the set of running XIDs (our entry
+ * duplicates the gxact that has already been inserted into the ProcArray).
+ * That doesn't seem worth it currently, as a 2PC commit is heavyweight
+ * enough for this not to be the bottleneck. If it ever becomes a
+ * bottleneck, it may also be worth considering combining this with the
+ * subsequent ProcArrayRemove().
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ pgxactoff = proc->pgxactoff;
+
+ ProcGlobal->xids[pgxactoff] = InvalidTransactionId;
+ proc->xid = InvalidTransactionId;
+
+ proc->lxid = InvalidLocalTransactionId;
+ proc->xmin = InvalidTransactionId;
+ proc->recoveryConflictPending = false;
+
+ Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK));
+ Assert(!proc->delayChkpt);
+
+ /*
+ * Need to increment the completion count even though the transaction hasn't
+ * really committed yet. The reason is that GetSnapshotData() omits the xid
+ * of the current transaction, so without the increment we could end up
+ * reusing the snapshot later, which would be bad because it might not count
+ * the prepared transaction as running.
+ */
+ ShmemVariableCache->xactCompletionCount++;
+
+ /* Clear the subtransaction-XID cache too */
+ Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count &&
+ ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed);
+ if (proc->subxidStatus.count > 0 || proc->subxidStatus.overflowed)
+ {
+ ProcGlobal->subxidStates[pgxactoff].count = 0;
+ ProcGlobal->subxidStates[pgxactoff].overflowed = false;
+ proc->subxidStatus.count = 0;
+ proc->subxidStatus.overflowed = false;
+ }
+
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * Update ShmemVariableCache->latestCompletedXid to point to latestXid if
+ * currently older.
+ */
+static void
+MaintainLatestCompletedXid(TransactionId latestXid)
+{
+ FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid;
+
+ Assert(FullTransactionIdIsValid(cur_latest));
+ Assert(!RecoveryInProgress());
+ Assert(LWLockHeldByMe(ProcArrayLock));
+
+ if (TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid))
+ {
+ ShmemVariableCache->latestCompletedXid =
+ FullXidRelativeTo(cur_latest, latestXid);
+ }
+
+ Assert(IsBootstrapProcessingMode() ||
+ FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid));
+}
+
+/*
+ * Same as MaintainLatestCompletedXid, except for use during WAL replay.
+ */
+static void
+MaintainLatestCompletedXidRecovery(TransactionId latestXid)
+{
+ FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid;
+ FullTransactionId rel;
+
+ Assert(AmStartupProcess() || !IsUnderPostmaster);
+ Assert(LWLockHeldByMe(ProcArrayLock));
+
+ /*
+ * Need a FullTransactionId to compare latestXid with. Can't rely on
+ * latestCompletedXid to be initialized in recovery. But in recovery it's
+ * safe to access nextXid without a lock for the startup process.
+ */
+ rel = ShmemVariableCache->nextXid;
+ Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid));
+
+ if (!FullTransactionIdIsValid(cur_latest) ||
+ TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid))
+ {
+ ShmemVariableCache->latestCompletedXid =
+ FullXidRelativeTo(rel, latestXid);
+ }
+
+ Assert(FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid));
+}
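+
+/*
+ * Editor's note: both routines above lean on FullXidRelativeTo() to widen a
+ * 32-bit xid into a FullTransactionId using a nearby 64-bit anchor.  The
+ * NOT_USED sketch below illustrates the one property the callers rely on:
+ * the low 32 bits must round-trip unchanged.  The function name is purely
+ * illustrative.
+ */
+#ifdef NOT_USED
+static void
+example_check_fullxid_widening(FullTransactionId anchor, TransactionId xid)
+{
+	FullTransactionId fxid = FullXidRelativeTo(anchor, xid);
+
+	/* the 32-bit part of the widened value must equal the original xid */
+	Assert(TransactionIdEquals(XidFromFullTransactionId(fxid), xid));
+}
+#endif							/* NOT_USED */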
+
+/*
+ * ProcArrayInitRecovery -- initialize recovery xid mgmt environment
+ *
+ * Remember up to where the startup process initialized the CLOG and subtrans
+ * so we can ensure it's initialized gaplessly up to the point necessary
+ * while in recovery.
+ */
+void
+ProcArrayInitRecovery(TransactionId initializedUptoXID)
+{
+ Assert(standbyState == STANDBY_INITIALIZED);
+ Assert(TransactionIdIsNormal(initializedUptoXID));
+
+ /*
+ * We set latestObservedXid to the xid SUBTRANS has been initialized up
+ * to, so we can extend it from that point onwards in
+ * RecordKnownAssignedTransactionIds, and when we get consistent in
+ * ProcArrayApplyRecoveryInfo().
+ */
+ latestObservedXid = initializedUptoXID;
+ TransactionIdRetreat(latestObservedXid);
+}
+
+/*
+ * ProcArrayApplyRecoveryInfo -- apply recovery info about xids
+ *
+ * Takes us through 3 states: Initialized, Pending and Ready.
+ * Normal case is to go all the way to Ready straight away, though there
+ * are atypical cases where we need to take it in steps.
+ *
+ * Use the data about running transactions on the primary to create the initial
+ * state of KnownAssignedXids. We also use these records to regularly prune
+ * KnownAssignedXids because we know it is possible that some transactions
+ * with FATAL errors fail to write abort records, which could cause eventual
+ * overflow.
+ *
+ * See comments for LogStandbySnapshot().
+ */
+void
+ProcArrayApplyRecoveryInfo(RunningTransactions running)
+{
+ TransactionId *xids;
+ int nxids;
+ int i;
+
+ Assert(standbyState >= STANDBY_INITIALIZED);
+ Assert(TransactionIdIsValid(running->nextXid));
+ Assert(TransactionIdIsValid(running->oldestRunningXid));
+ Assert(TransactionIdIsNormal(running->latestCompletedXid));
+
+ /*
+ * Remove stale transactions, if any.
+ */
+ ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid);
+
+ /*
+ * Remove stale locks, if any.
+ */
+ StandbyReleaseOldLocks(running->oldestRunningXid);
+
+ /*
+ * If our snapshot is already valid, nothing else to do...
+ */
+ if (standbyState == STANDBY_SNAPSHOT_READY)
+ return;
+
+ /*
+ * If our initial RunningTransactionsData had an overflowed snapshot then
+ * we knew we were missing some subxids from our snapshot. If we continue
+ * to see overflowed snapshots then we might never be able to start up, so
+ * we make another test to see if our snapshot is now valid. We know that
+ * the missing subxids are equal to or earlier than nextXid. After we
+ * initialise we continue to apply changes during recovery, so once the
+ * oldestRunningXid is later than the nextXid from the initial snapshot we
+ * know that we no longer have missing information and can mark the
+ * snapshot as valid.
+ */
+ if (standbyState == STANDBY_SNAPSHOT_PENDING)
+ {
+ /*
+ * If the snapshot isn't overflowed or if it's empty, we can reset our
+ * pending state and use this snapshot instead.
+ */
+ if (!running->subxid_overflow || running->xcnt == 0)
+ {
+ /*
+ * If we have already collected known assigned xids, we need to
+ * throw them away before we apply the recovery snapshot.
+ */
+ KnownAssignedXidsReset();
+ standbyState = STANDBY_INITIALIZED;
+ }
+ else
+ {
+ if (TransactionIdPrecedes(standbySnapshotPendingXmin,
+ running->oldestRunningXid))
+ {
+ standbyState = STANDBY_SNAPSHOT_READY;
+ elog(trace_recovery(DEBUG1),
+ "recovery snapshots are now enabled");
+ }
+ else
+ elog(trace_recovery(DEBUG1),
+ "recovery snapshot waiting for non-overflowed snapshot or "
+ "until oldest active xid on standby is at least %u (now %u)",
+ standbySnapshotPendingXmin,
+ running->oldestRunningXid);
+ return;
+ }
+ }
+
+ Assert(standbyState == STANDBY_INITIALIZED);
+
+ /*
+ * NB: this can be reached at least twice, so make sure new code can deal
+ * with that.
+ */
+
+ /*
+ * Nobody else is running yet, but take locks anyhow
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * KnownAssignedXids is sorted so we cannot just add the xids, we have to
+ * sort them first.
+ *
+ * Some of the new xids are top-level xids and some are subtransactions.
+ * We don't call SubTransSetParent because it doesn't matter yet. If we
+ * aren't overflowed then all xids will fit in snapshot and so we don't
+ * need subtrans. If we later overflow, an xid assignment record will add
+ * xids to subtrans. If RunningTransactionsData is overflowed then we
+ * don't have enough information to correctly update subtrans anyway.
+ */
+
+ /*
+ * Allocate a temporary array to avoid modifying the array passed as
+ * argument.
+ */
+ xids = palloc(sizeof(TransactionId) * (running->xcnt + running->subxcnt));
+
+ /*
+ * Add to the temp array any xids which have not already completed.
+ */
+ nxids = 0;
+ for (i = 0; i < running->xcnt + running->subxcnt; i++)
+ {
+ TransactionId xid = running->xids[i];
+
+ /*
+ * The running-xacts snapshot can contain xids that were still visible
+ * in the procarray when the snapshot was taken, but were already
+ * WAL-logged as completed. They're not running anymore, so ignore
+ * them.
+ */
+ if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
+ continue;
+
+ xids[nxids++] = xid;
+ }
+
+ if (nxids > 0)
+ {
+ if (procArray->numKnownAssignedXids != 0)
+ {
+ LWLockRelease(ProcArrayLock);
+ elog(ERROR, "KnownAssignedXids is not empty");
+ }
+
+ /*
+ * Sort the array so that we can add them safely into
+ * KnownAssignedXids.
+ *
+ * We have to sort them logically, because in KnownAssignedXidsAdd we
+ * call TransactionIdFollowsOrEquals and so on. But we know these XIDs
+ * come from RUNNING_XACTS, which means there are only normal XIDs from
+ * the same epoch, so this is safe.
+ */
+ qsort(xids, nxids, sizeof(TransactionId), xidLogicalComparator);
+
+ /*
+ * Add the sorted snapshot into KnownAssignedXids. The running-xacts
+ * snapshot may include duplicated xids because of prepared
+ * transactions, so ignore them.
+ */
+ for (i = 0; i < nxids; i++)
+ {
+ if (i > 0 && TransactionIdEquals(xids[i - 1], xids[i]))
+ {
+ elog(DEBUG1,
+ "found duplicated transaction %u for KnownAssignedXids insertion",
+ xids[i]);
+ continue;
+ }
+ KnownAssignedXidsAdd(xids[i], xids[i], true);
+ }
+
+ KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
+ }
+
+ pfree(xids);
+
+ /*
+ * latestObservedXid is at least set to the point where SUBTRANS was
+ * started up to (cf. ProcArrayInitRecovery()) or to the biggest xid
+ * RecordKnownAssignedTransactionIds() was called for. Initialize
+ * subtrans from thereon, up to nextXid - 1.
+ *
+ * We need to duplicate parts of RecordKnownAssignedTransactionId() here,
+ * because we've just added xids to the known assigned xids machinery that
+ * haven't gone through RecordKnownAssignedTransactionId().
+ */
+ Assert(TransactionIdIsNormal(latestObservedXid));
+ TransactionIdAdvance(latestObservedXid);
+ while (TransactionIdPrecedes(latestObservedXid, running->nextXid))
+ {
+ ExtendSUBTRANS(latestObservedXid);
+ TransactionIdAdvance(latestObservedXid);
+ }
+ TransactionIdRetreat(latestObservedXid); /* = running->nextXid - 1 */
+
+ /* ----------
+ * Now we've got the running xids we need to set the global values that
+ * are used to track snapshots as they evolve further.
+ *
+ * - latestCompletedXid which will be the xmax for snapshots
+ * - lastOverflowedXid which shows whether snapshots overflow
+ * - nextXid
+ *
+ * If the snapshot overflowed, then we still initialise with what we know,
+ * but the recovery snapshot isn't fully valid yet because we know there
+ * are some subxids missing. We don't know the specific subxids that are
+ * missing, so conservatively assume the last one is latestObservedXid.
+ * ----------
+ */
+ if (running->subxid_overflow)
+ {
+ standbyState = STANDBY_SNAPSHOT_PENDING;
+
+ standbySnapshotPendingXmin = latestObservedXid;
+ procArray->lastOverflowedXid = latestObservedXid;
+ }
+ else
+ {
+ standbyState = STANDBY_SNAPSHOT_READY;
+
+ standbySnapshotPendingXmin = InvalidTransactionId;
+ }
+
+ /*
+ * If a transaction wrote a commit record in the gap between taking and
+ * logging the snapshot then latestCompletedXid may already be higher than
+ * the value from the snapshot, so check before we use the incoming value.
+ * It also might not yet be set at all.
+ */
+ MaintainLatestCompletedXidRecovery(running->latestCompletedXid);
+
+ /*
+ * NB: No need to increment ShmemVariableCache->xactCompletionCount here,
+ * nobody can see it yet.
+ */
+
+ LWLockRelease(ProcArrayLock);
+
+ /* ShmemVariableCache->nextXid must be beyond any observed xid. */
+ AdvanceNextFullTransactionIdPastXid(latestObservedXid);
+
+ Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid));
+
+ KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
+ if (standbyState == STANDBY_SNAPSHOT_READY)
+ elog(trace_recovery(DEBUG1), "recovery snapshots are now enabled");
+ else
+ elog(trace_recovery(DEBUG1),
+ "recovery snapshot waiting for non-overflowed snapshot or "
+ "until oldest active xid on standby is at least %u (now %u)",
+ standbySnapshotPendingXmin,
+ running->oldestRunningXid);
+}
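+
+/*
+ * Editor's note: the qsort() above must order XIDs "logically", i.e. by
+ * TransactionIdPrecedes() rather than by raw uint32 value, so that
+ * KnownAssignedXidsAdd()'s ordering checks hold.  The NOT_USED sketch below
+ * spells out a comparator with that behaviour; it is illustrative only and
+ * assumes, as the code above does, that all inputs are normal XIDs.
+ */
+#ifdef NOT_USED
+static int
+example_xid_logical_cmp(const void *a, const void *b)
+{
+	TransactionId xid1 = *(const TransactionId *) a;
+	TransactionId xid2 = *(const TransactionId *) b;
+
+	if (TransactionIdEquals(xid1, xid2))
+		return 0;
+	return TransactionIdPrecedes(xid1, xid2) ? -1 : 1;
+}
+#endif							/* NOT_USED */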
+
+/*
+ * ProcArrayApplyXidAssignment
+ * Process an XLOG_XACT_ASSIGNMENT WAL record
+ */
+void
+ProcArrayApplyXidAssignment(TransactionId topxid,
+ int nsubxids, TransactionId *subxids)
+{
+ TransactionId max_xid;
+ int i;
+
+ Assert(standbyState >= STANDBY_INITIALIZED);
+
+ max_xid = TransactionIdLatest(topxid, nsubxids, subxids);
+
+ /*
+ * Mark all the subtransactions as observed.
+ *
+ * NOTE: This will fail if the subxids contain too many previously
+ * unobserved xids to fit into known-assigned-xids. That shouldn't happen
+ * as the code stands, because xid-assignment records should never contain
+ * more than PGPROC_MAX_CACHED_SUBXIDS entries.
+ */
+ RecordKnownAssignedTransactionIds(max_xid);
+
+ /*
+ * Notice that we update pg_subtrans with the top-level xid, rather than
+ * the parent xid. This is a difference between normal processing and
+ * recovery, yet is still correct in all cases. The reason is that
+ * subtransaction commit is not marked in clog until commit processing, so
+ * all aborted subtransactions have already been clearly marked in clog.
+ * As a result we are able to refer directly to the top-level
+ * transaction's state rather than skipping through all the intermediate
+ * states in the subtransaction tree. This should be the first time we
+ * have attempted to SubTransSetParent().
+ */
+ for (i = 0; i < nsubxids; i++)
+ SubTransSetParent(subxids[i], topxid);
+
+ /* KnownAssignedXids isn't maintained yet, so we're done for now */
+ if (standbyState == STANDBY_INITIALIZED)
+ return;
+
+ /*
+ * Uses same locking as transaction commit
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * Remove subxids from known-assigned-xacts.
+ */
+ KnownAssignedXidsRemoveTree(InvalidTransactionId, nsubxids, subxids);
+
+ /*
+ * Advance lastOverflowedXid to be at least the last of these subxids.
+ */
+ if (TransactionIdPrecedes(procArray->lastOverflowedXid, max_xid))
+ procArray->lastOverflowedXid = max_xid;
+
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * TransactionIdIsInProgress -- is given transaction running in some backend
+ *
+ * Aside from some shortcuts such as checking RecentXmin and our own Xid,
+ * there are four possibilities for finding a running transaction:
+ *
+ * 1. The given Xid is a main transaction Id. We will find this out cheaply
+ * by looking at ProcGlobal->xids.
+ *
+ * 2. The given Xid is one of the cached subxact Xids in the PGPROC array.
+ * We can find this out cheaply too.
+ *
+ * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see
+ * if the Xid is running on the primary.
+ *
+ * 4. Search the SubTrans tree to find the Xid's topmost parent, and then see
+ * if that is running according to ProcGlobal->xids[] or KnownAssignedXids.
+ * This is the slowest way, but sadly it always has to be done if the others
+ * failed, unless we see that the cached subxact sets are complete (none have
+ * overflowed).
+ *
+ * ProcArrayLock has to be held while we do 1, 2, 3. If we save the top Xids
+ * while doing 1 and 3, we can release the ProcArrayLock while we do 4.
+ * This buys back some concurrency (and we can't retrieve the main Xids from
+ * ProcGlobal->xids[] again anyway; see GetNewTransactionId).
+ */
+bool
+TransactionIdIsInProgress(TransactionId xid)
+{
+ static TransactionId *xids = NULL;
+ static TransactionId *other_xids;
+ XidCacheStatus *other_subxidstates;
+ int nxids = 0;
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId topxid;
+ TransactionId latestCompletedXid;
+ int mypgxactoff;
+ int numProcs;
+ int j;
+
+ /*
+ * Don't bother checking a transaction older than RecentXmin; it could not
+ * possibly still be running. (Note: in particular, this guarantees that
+ * we reject InvalidTransactionId, FrozenTransactionId, etc as not
+ * running.)
+ */
+ if (TransactionIdPrecedes(xid, RecentXmin))
+ {
+ xc_by_recent_xmin_inc();
+ return false;
+ }
+
+ /*
+ * We may have just checked the status of this transaction, so if it is
+ * already known to be completed, we can fall out without any access to
+ * shared memory.
+ */
+ if (TransactionIdEquals(cachedXidIsNotInProgress, xid))
+ {
+ xc_by_known_xact_inc();
+ return false;
+ }
+
+ /*
+ * Also, we can handle our own transaction (and subtransactions) without
+ * any access to shared memory.
+ */
+ if (TransactionIdIsCurrentTransactionId(xid))
+ {
+ xc_by_my_xact_inc();
+ return true;
+ }
+
+ /*
+ * If first time through, get workspace to remember main XIDs in. We
+ * malloc it permanently to avoid repeated palloc/pfree overhead.
+ */
+ if (xids == NULL)
+ {
+ /*
+ * In hot standby mode, reserve enough space to hold all xids in the
+ * known-assigned list. If we later finish recovery, we no longer need
+ * the bigger array, but we don't bother to shrink it.
+ */
+ int maxxids = RecoveryInProgress() ? TOTAL_MAX_CACHED_SUBXIDS : arrayP->maxProcs;
+
+ xids = (TransactionId *) malloc(maxxids * sizeof(TransactionId));
+ if (xids == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ other_xids = ProcGlobal->xids;
+ other_subxidstates = ProcGlobal->subxidStates;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ /*
+ * Now that we have the lock, we can check latestCompletedXid; if the
+ * target Xid is after that, it's surely still running.
+ */
+ latestCompletedXid =
+ XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid);
+ if (TransactionIdPrecedes(latestCompletedXid, xid))
+ {
+ LWLockRelease(ProcArrayLock);
+ xc_by_latest_xid_inc();
+ return true;
+ }
+
+ /* No shortcuts, gotta grovel through the array */
+ mypgxactoff = MyProc->pgxactoff;
+ numProcs = arrayP->numProcs;
+ for (int pgxactoff = 0; pgxactoff < numProcs; pgxactoff++)
+ {
+ int pgprocno;
+ PGPROC *proc;
+ TransactionId pxid;
+ int pxids;
+
+ /* Ignore ourselves --- dealt with above */
+ if (pgxactoff == mypgxactoff)
+ continue;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ pxid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]);
+
+ if (!TransactionIdIsValid(pxid))
+ continue;
+
+ /*
+ * Step 1: check the main Xid
+ */
+ if (TransactionIdEquals(pxid, xid))
+ {
+ LWLockRelease(ProcArrayLock);
+ xc_by_main_xid_inc();
+ return true;
+ }
+
+ /*
+ * We can ignore main Xids that are younger than the target Xid, since
+ * the target could not possibly be their child.
+ */
+ if (TransactionIdPrecedes(xid, pxid))
+ continue;
+
+ /*
+ * Step 2: check the cached child-Xids arrays
+ */
+ pxids = other_subxidstates[pgxactoff].count;
+ pg_read_barrier(); /* pairs with barrier in GetNewTransactionId() */
+ pgprocno = arrayP->pgprocnos[pgxactoff];
+ proc = &allProcs[pgprocno];
+ for (j = pxids - 1; j >= 0; j--)
+ {
+ /* Fetch xid just once - see GetNewTransactionId */
+ TransactionId cxid = UINT32_ACCESS_ONCE(proc->subxids.xids[j]);
+
+ if (TransactionIdEquals(cxid, xid))
+ {
+ LWLockRelease(ProcArrayLock);
+ xc_by_child_xid_inc();
+ return true;
+ }
+ }
+
+ /*
+ * Save the main Xid for step 4. We only need to remember main Xids
+ * that have uncached children. (Note: there is no race condition
+ * here because the overflowed flag cannot be cleared, only set, while
+ * we hold ProcArrayLock. So we can't miss an Xid that we need to
+ * worry about.)
+ */
+ if (other_subxidstates[pgxactoff].overflowed)
+ xids[nxids++] = pxid;
+ }
+
+ /*
+ * Step 3: in hot standby mode, check the known-assigned-xids list. XIDs
+ * in the list must be treated as running.
+ */
+ if (RecoveryInProgress())
+ {
+ /* none of the PGPROC entries should have XIDs in hot standby mode */
+ Assert(nxids == 0);
+
+ if (KnownAssignedXidExists(xid))
+ {
+ LWLockRelease(ProcArrayLock);
+ xc_by_known_assigned_inc();
+ return true;
+ }
+
+ /*
+ * If the KnownAssignedXids overflowed, we have to check pg_subtrans
+ * too. Fetch all xids from KnownAssignedXids that are lower than
+ * xid, since if xid is a subtransaction its parent will always have a
+ * lower value. Note we will collect both main and subXIDs here, but
+ * there's no help for it.
+ */
+ if (TransactionIdPrecedesOrEquals(xid, procArray->lastOverflowedXid))
+ nxids = KnownAssignedXidsGet(xids, xid);
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * If none of the relevant caches overflowed, we know the Xid is not
+ * running without even looking at pg_subtrans.
+ */
+ if (nxids == 0)
+ {
+ xc_no_overflow_inc();
+ cachedXidIsNotInProgress = xid;
+ return false;
+ }
+
+ /*
+ * Step 4: have to check pg_subtrans.
+ *
+ * At this point, we know it's either a subtransaction of one of the Xids
+ * in xids[], or it's not running. If it's an already-failed
+ * subtransaction, we want to say "not running" even though its parent may
+ * still be running. So first, check pg_xact to see if it's been aborted.
+ */
+ xc_slow_answer_inc();
+
+ if (TransactionIdDidAbort(xid))
+ {
+ cachedXidIsNotInProgress = xid;
+ return false;
+ }
+
+ /*
+ * It isn't aborted, so check whether the transaction tree it belongs to
+ * is still running (or, more precisely, whether it was running when we
+ * held ProcArrayLock).
+ */
+ topxid = SubTransGetTopmostTransaction(xid);
+ Assert(TransactionIdIsValid(topxid));
+ if (!TransactionIdEquals(topxid, xid))
+ {
+ for (int i = 0; i < nxids; i++)
+ {
+ if (TransactionIdEquals(xids[i], topxid))
+ return true;
+ }
+ }
+
+ cachedXidIsNotInProgress = xid;
+ return false;
+}
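+
+/*
+ * Editor's note: callers typically consult TransactionIdIsInProgress() before
+ * the commit log, so that a transaction caught in the middle of committing is
+ * never classified as committed too early.  The NOT_USED sketch below shows
+ * that ordering; the enum and function names are invented for illustration.
+ */
+#ifdef NOT_USED
+typedef enum ExampleXidState
+{
+	EXAMPLE_XID_IN_PROGRESS,
+	EXAMPLE_XID_COMMITTED,
+	EXAMPLE_XID_ABORTED
+} ExampleXidState;
+
+static ExampleXidState
+example_classify_xid(TransactionId xid)
+{
+	/* the in-progress check must come first */
+	if (TransactionIdIsInProgress(xid))
+		return EXAMPLE_XID_IN_PROGRESS;
+	if (TransactionIdDidCommit(xid))
+		return EXAMPLE_XID_COMMITTED;
+	return EXAMPLE_XID_ABORTED;
+}
+#endif							/* NOT_USED */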
+
+/*
+ * TransactionIdIsActive -- is xid the top-level XID of an active backend?
+ *
+ * This differs from TransactionIdIsInProgress in that it ignores prepared
+ * transactions, as well as transactions running on the primary if we're in
+ * hot standby. Also, we ignore subtransactions since that's not needed
+ * for current uses.
+ */
+bool
+TransactionIdIsActive(TransactionId xid)
+{
+ bool result = false;
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId *other_xids = ProcGlobal->xids;
+ int i;
+
+ /*
+ * Don't bother checking a transaction older than RecentXmin; it could not
+ * possibly still be running.
+ */
+ if (TransactionIdPrecedes(xid, RecentXmin))
+ return false;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (i = 0; i < arrayP->numProcs; i++)
+ {
+ int pgprocno = arrayP->pgprocnos[i];
+ PGPROC *proc = &allProcs[pgprocno];
+ TransactionId pxid;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ pxid = UINT32_ACCESS_ONCE(other_xids[i]);
+
+ if (!TransactionIdIsValid(pxid))
+ continue;
+
+ if (proc->pid == 0)
+ continue; /* ignore prepared transactions */
+
+ if (TransactionIdEquals(pxid, xid))
+ {
+ result = true;
+ break;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+}
+
+
+/*
+ * Determine XID horizons.
+ *
+ * This is used by wrapper functions like GetOldestNonRemovableTransactionId()
+ * (for VACUUM), GetReplicationHorizons() (for hot_standby_feedback), etc as
+ * well as "internally" by GlobalVisUpdate() (see comment above struct
+ * GlobalVisState).
+ *
+ * See the definition of ComputeXidHorizonsResult for the various computed
+ * horizons.
+ *
+ * For VACUUM separate horizons (used to decide which deleted tuples must
+ * be preserved), for shared and non-shared tables are computed. For shared
+ * relations backends in all databases must be considered, but for non-shared
+ * relations that's not required, since only backends in my own database could
+ * ever see the tuples in them. Also, we can ignore concurrently running lazy
+ * VACUUMs because (a) they must be working on other tables, and (b) they
+ * don't need to do snapshot-based lookups.
+ *
+ * This also computes a horizon used to truncate pg_subtrans. For that
+ * backends in all databases have to be considered, and concurrently running
+ * lazy VACUUMs cannot be ignored, as they still may perform pg_subtrans
+ * accesses.
+ *
+ * Note: we include all currently running xids in the set of considered xids.
+ * This ensures that if a just-started xact has not yet set its snapshot,
+ * when it does set the snapshot it cannot set xmin less than what we compute.
+ * See notes in src/backend/access/transam/README.
+ *
+ * Note: despite the above, it's possible for the calculated values to move
+ * backwards on repeated calls. The calculated values are conservative, so
+ * that anything older is definitely not considered as running by anyone
+ * anymore, but the exact values calculated depend on a number of things. For
+ * example, if there are no transactions running in the current database, the
+ * horizon for normal tables will be latestCompletedXid. If a transaction
+ * begins after that, its xmin will include in-progress transactions in other
+ * databases that started earlier, so another call will return a lower value.
+ * Nonetheless it is safe to vacuum a table in the current database with the
+ * first result. There are also replication-related effects: a walsender
+ * process can set its xmin based on transactions that are no longer running
+ * on the primary but are still being replayed on the standby, thus possibly
+ * making the values go backwards. In this case there is a possibility that
+ * we lose data that the standby would like to have, but unless the standby
+ * uses a replication slot to make its xmin persistent there is little we can
+ * do about that --- data is only protected if the walsender runs continuously
+ * while queries are executed on the standby. (The Hot Standby code deals
+ * with such cases by failing standby queries that needed to access
+ * already-removed data, so there's no integrity bug.) The computed values
+ * are also adjusted with vacuum_defer_cleanup_age, so increasing that setting
+ * on the fly is another easy way to make horizons move backwards, with no
+ * consequences for data integrity.
+ *
+ * Note: the approximate horizons (see definition of GlobalVisState) are
+ * updated by the computations done here. That's currently required for
+ * correctness and a small optimization. Without doing so it's possible that
+ * heap vacuum's call to heap_page_prune() uses a more conservative horizon
+ * than later when deciding which tuples can be removed - which the code
+ * doesn't expect (breaking HOT).
+ */
+static void
+ComputeXidHorizons(ComputeXidHorizonsResult *h)
+{
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId kaxmin;
+ bool in_recovery = RecoveryInProgress();
+ TransactionId *other_xids = ProcGlobal->xids;
+
+ /* inferred after ProcArrayLock is released */
+ h->catalog_oldest_nonremovable = InvalidTransactionId;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ h->latest_completed = ShmemVariableCache->latestCompletedXid;
+
+ /*
+ * We initialize the MIN() calculation with latestCompletedXid + 1. This
+ * is a lower bound for the XIDs that might appear in the ProcArray later,
+ * and so protects us against overestimating the result due to future
+ * additions.
+ */
+ {
+ TransactionId initial;
+
+ initial = XidFromFullTransactionId(h->latest_completed);
+ Assert(TransactionIdIsValid(initial));
+ TransactionIdAdvance(initial);
+
+ h->oldest_considered_running = initial;
+ h->shared_oldest_nonremovable = initial;
+ h->data_oldest_nonremovable = initial;
+
+ /*
+ * Only modifications made by this backend affect the horizon for
+ * temporary relations. Instead of a check in each iteration of the
+ * loop over all PGPROCs it is cheaper to just initialize to the
+ * current top-level xid anyway.
+ *
+ * Without an assigned xid we could use a horizon as aggressive as
+ * ReadNewTransactionId(), but we can get away with the much cheaper
+ * latestCompletedXid + 1: if this backend has no xid, there can't, by
+ * definition, be any newer changes in the temp table than
+ * latestCompletedXid.
+ */
+ if (TransactionIdIsValid(MyProc->xid))
+ h->temp_oldest_nonremovable = MyProc->xid;
+ else
+ h->temp_oldest_nonremovable = initial;
+ }
+
+ /*
+ * Fetch slot horizons while ProcArrayLock is held - the
+ * LWLockAcquire/LWLockRelease are a barrier, ensuring this happens inside
+ * the lock.
+ */
+ h->slot_xmin = procArray->replication_slot_xmin;
+ h->slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
+
+ for (int index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ int8 statusFlags = ProcGlobal->statusFlags[index];
+ TransactionId xid;
+ TransactionId xmin;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = UINT32_ACCESS_ONCE(other_xids[index]);
+ xmin = UINT32_ACCESS_ONCE(proc->xmin);
+
+ /*
+ * Consider both the transaction's Xmin, and its Xid.
+ *
+ * We must check both because a transaction might have an Xmin but not
+ * (yet) an Xid; conversely, if it has an Xid, that could determine
+ * some not-yet-set Xmin.
+ */
+ xmin = TransactionIdOlder(xmin, xid);
+
+ /* if neither is set, this proc doesn't influence the horizon */
+ if (!TransactionIdIsValid(xmin))
+ continue;
+
+ /*
+ * Don't ignore any procs when determining which transactions might be
+ * considered running. While slots should ensure logical decoding
+ * backends are protected even without this check, it can't hurt to
+ * include them here as well.
+ */
+ h->oldest_considered_running =
+ TransactionIdOlder(h->oldest_considered_running, xmin);
+
+ /*
+ * Skip over backends either vacuuming (which is ok with rows being
+ * removed, as long as pg_subtrans is not truncated) or doing logical
+ * decoding (which manages xmin separately, check below).
+ */
+ if (statusFlags & (PROC_IN_VACUUM | PROC_IN_LOGICAL_DECODING))
+ continue;
+
+ /* shared tables need to take backends in all databases into account */
+ h->shared_oldest_nonremovable =
+ TransactionIdOlder(h->shared_oldest_nonremovable, xmin);
+
+ /*
+ * Normally queries in other databases are ignored for anything but
+ * the shared horizon. But in recovery we cannot compute an accurate
+ * per-database horizon as all xids are managed via the
+ * KnownAssignedXids machinery.
+ *
+ * Be careful to compute a pessimistic value when MyDatabaseId is not
+ * set. If this is a backend in the process of starting up, we may not
+ * use a "too aggressive" horizon (otherwise we could end up using it
+ * to prune still-needed data away). If the current backend never
+ * connects to a database, that is harmless, because
+ * data_oldest_nonremovable will never be utilized.
+ */
+ if (in_recovery ||
+ MyDatabaseId == InvalidOid || proc->databaseId == MyDatabaseId ||
+ proc->databaseId == 0) /* always include WalSender */
+ {
+ h->data_oldest_nonremovable =
+ TransactionIdOlder(h->data_oldest_nonremovable, xmin);
+ }
+ }
+
+ /*
+ * If in recovery fetch oldest xid in KnownAssignedXids, will be applied
+ * after lock is released.
+ */
+ if (in_recovery)
+ kaxmin = KnownAssignedXidsGetOldestXmin();
+
+ /*
+ * No other information from shared state is needed, release the lock
+ * immediately. The rest of the computations can be done without a lock.
+ */
+ LWLockRelease(ProcArrayLock);
+
+ if (in_recovery)
+ {
+ h->oldest_considered_running =
+ TransactionIdOlder(h->oldest_considered_running, kaxmin);
+ h->shared_oldest_nonremovable =
+ TransactionIdOlder(h->shared_oldest_nonremovable, kaxmin);
+ h->data_oldest_nonremovable =
+ TransactionIdOlder(h->data_oldest_nonremovable, kaxmin);
+ /* temp relations cannot be accessed in recovery */
+ }
+ else
+ {
+ /*
+ * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age.
+ *
+ * vacuum_defer_cleanup_age provides some additional "slop" for the
+ * benefit of hot standby queries on standby servers. This is quick
+ * and dirty, and perhaps not all that useful unless the primary has a
+ * predictable transaction rate, but it offers some protection when
+ * there's no walsender connection. Note that we are assuming
+ * vacuum_defer_cleanup_age isn't large enough to cause wraparound ---
+ * so guc.c should limit it to no more than the xidStopLimit threshold
+ * in varsup.c. Also note that we intentionally don't apply
+ * vacuum_defer_cleanup_age on standby servers.
+ */
+ h->oldest_considered_running =
+ TransactionIdRetreatedBy(h->oldest_considered_running,
+ vacuum_defer_cleanup_age);
+ h->shared_oldest_nonremovable =
+ TransactionIdRetreatedBy(h->shared_oldest_nonremovable,
+ vacuum_defer_cleanup_age);
+ h->data_oldest_nonremovable =
+ TransactionIdRetreatedBy(h->data_oldest_nonremovable,
+ vacuum_defer_cleanup_age);
+ /* defer doesn't apply to temp relations */
+ }
+
+ /*
+ * Check whether there are replication slots requiring an older xmin.
+ */
+ h->shared_oldest_nonremovable =
+ TransactionIdOlder(h->shared_oldest_nonremovable, h->slot_xmin);
+ h->data_oldest_nonremovable =
+ TransactionIdOlder(h->data_oldest_nonremovable, h->slot_xmin);
+
+ /*
+ * The only difference between catalog / data horizons is that the slot's
+ * catalog xmin is applied to the catalog one (so catalogs can be accessed
+ * for logical decoding). Initialize with data horizon, and then back up
+ * further if necessary. Have to back up the shared horizon as well, since
+ * that also can contain catalogs.
+ */
+ h->shared_oldest_nonremovable_raw = h->shared_oldest_nonremovable;
+ h->shared_oldest_nonremovable =
+ TransactionIdOlder(h->shared_oldest_nonremovable,
+ h->slot_catalog_xmin);
+ h->catalog_oldest_nonremovable = h->data_oldest_nonremovable;
+ h->catalog_oldest_nonremovable =
+ TransactionIdOlder(h->catalog_oldest_nonremovable,
+ h->slot_catalog_xmin);
+
+ /*
+ * It's possible that slots / vacuum_defer_cleanup_age backed up the
+ * horizons further than oldest_considered_running. Fix.
+ */
+ h->oldest_considered_running =
+ TransactionIdOlder(h->oldest_considered_running,
+ h->shared_oldest_nonremovable);
+ h->oldest_considered_running =
+ TransactionIdOlder(h->oldest_considered_running,
+ h->catalog_oldest_nonremovable);
+ h->oldest_considered_running =
+ TransactionIdOlder(h->oldest_considered_running,
+ h->data_oldest_nonremovable);
+
+ /*
+ * shared horizons have to be at least as old as the oldest visible in
+ * current db
+ */
+ Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable,
+ h->data_oldest_nonremovable));
+ Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable,
+ h->catalog_oldest_nonremovable));
+
+ /*
+ * Horizons need to ensure that pg_subtrans access is still possible for
+ * the relevant backends.
+ */
+ Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->shared_oldest_nonremovable));
+ Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->catalog_oldest_nonremovable));
+ Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->data_oldest_nonremovable));
+ Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->temp_oldest_nonremovable));
+ Assert(!TransactionIdIsValid(h->slot_xmin) ||
+ TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->slot_xmin));
+ Assert(!TransactionIdIsValid(h->slot_catalog_xmin) ||
+ TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->slot_catalog_xmin));
+
+ /* update approximate horizons with the computed horizons */
+ GlobalVisUpdateApply(h);
+}
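+
+/*
+ * Editor's note: the computation above is essentially a chain of "take the
+ * older of two possibly-invalid XIDs" steps.  The NOT_USED sketch below
+ * restates, as the editor reads it, the semantics such a helper needs: an
+ * invalid input means "no constraint from this source".  The function name
+ * is illustrative.
+ */
+#ifdef NOT_USED
+static TransactionId
+example_older_xid(TransactionId a, TransactionId b)
+{
+	if (!TransactionIdIsValid(a))
+		return b;
+	if (!TransactionIdIsValid(b))
+		return a;
+	return TransactionIdPrecedes(a, b) ? a : b;
+}
+#endif							/* NOT_USED */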
+
+/*
+ * Determine what kind of visibility horizon needs to be used for a
+ * relation. If rel is NULL, the most conservative horizon is used.
+ */
+static inline GlobalVisHorizonKind
+GlobalVisHorizonKindForRel(Relation rel)
+{
+ /*
+ * Other relkinds currently don't contain xids, nor always the necessary
+ * logical decoding markers.
+ */
+ Assert(!rel ||
+ rel->rd_rel->relkind == RELKIND_RELATION ||
+ rel->rd_rel->relkind == RELKIND_MATVIEW ||
+ rel->rd_rel->relkind == RELKIND_TOASTVALUE);
+
+ if (rel == NULL || rel->rd_rel->relisshared || RecoveryInProgress())
+ return VISHORIZON_SHARED;
+ else if (IsCatalogRelation(rel) ||
+ RelationIsAccessibleInLogicalDecoding(rel))
+ return VISHORIZON_CATALOG;
+ else if (!RELATION_IS_LOCAL(rel))
+ return VISHORIZON_DATA;
+ else
+ return VISHORIZON_TEMP;
+}
+
+/*
+ * Return the oldest XID for which deleted tuples must be preserved in the
+ * passed table.
+ *
+ * If rel is not NULL the horizon may be considerably more recent than
+ * otherwise (i.e. fewer tuples will be removable). In the NULL case a horizon
+ * that is correct (but not optimal) for all relations will be returned.
+ *
+ * This is used by VACUUM to decide which deleted tuples must be preserved in
+ * the passed in table.
+ */
+TransactionId
+GetOldestNonRemovableTransactionId(Relation rel)
+{
+ ComputeXidHorizonsResult horizons;
+
+ ComputeXidHorizons(&horizons);
+
+ switch (GlobalVisHorizonKindForRel(rel))
+ {
+ case VISHORIZON_SHARED:
+ return horizons.shared_oldest_nonremovable;
+ case VISHORIZON_CATALOG:
+ return horizons.catalog_oldest_nonremovable;
+ case VISHORIZON_DATA:
+ return horizons.data_oldest_nonremovable;
+ case VISHORIZON_TEMP:
+ return horizons.temp_oldest_nonremovable;
+ }
+
+ return InvalidTransactionId;
+}
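+
+/*
+ * Editor's note: the NOT_USED sketch below shows how a hypothetical caller
+ * would apply the returned horizon.  The function name and the tuple_xmax
+ * parameter are assumptions made for the example; the real VACUUM code also
+ * has to verify that the deleting transaction actually committed.
+ */
+#ifdef NOT_USED
+static bool
+example_deleted_tuple_is_prunable(Relation rel, TransactionId tuple_xmax)
+{
+	TransactionId cutoff = GetOldestNonRemovableTransactionId(rel);
+
+	/* removable only if the deleter precedes every snapshot still in use */
+	return TransactionIdIsValid(tuple_xmax) &&
+		TransactionIdPrecedes(tuple_xmax, cutoff);
+}
+#endif							/* NOT_USED */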
+
+/*
+ * Return the oldest transaction id any currently running backend might still
+ * consider running. This should not be used for visibility / pruning
+ * determinations (see GetOldestNonRemovableTransactionId()), but for
+ * decisions like up to where pg_subtrans can be truncated.
+ */
+TransactionId
+GetOldestTransactionIdConsideredRunning(void)
+{
+ ComputeXidHorizonsResult horizons;
+
+ ComputeXidHorizons(&horizons);
+
+ return horizons.oldest_considered_running;
+}
+
+/*
+ * Return the visibility horizons for a hot standby feedback message.
+ */
+void
+GetReplicationHorizons(TransactionId *xmin, TransactionId *catalog_xmin)
+{
+ ComputeXidHorizonsResult horizons;
+
+ ComputeXidHorizons(&horizons);
+
+ /*
+ * Don't want to use shared_oldest_nonremovable here, as that contains the
+ * effect of replication slot's catalog_xmin. We want to send a separate
+ * feedback for the catalog horizon, so the primary can remove data table
+ * contents more aggressively.
+ */
+ *xmin = horizons.shared_oldest_nonremovable_raw;
+ *catalog_xmin = horizons.slot_catalog_xmin;
+}
+
+/*
+ * GetMaxSnapshotXidCount -- get max size for snapshot XID array
+ *
+ * We have to export this for use by snapmgr.c.
+ */
+int
+GetMaxSnapshotXidCount(void)
+{
+ return procArray->maxProcs;
+}
+
+/*
+ * GetMaxSnapshotSubxidCount -- get max size for snapshot sub-XID array
+ *
+ * We have to export this for use by snapmgr.c.
+ */
+int
+GetMaxSnapshotSubxidCount(void)
+{
+ return TOTAL_MAX_CACHED_SUBXIDS;
+}
+
+/*
+ * Initialize old_snapshot_threshold specific parts of a newly build snapshot.
+ */
+static void
+GetSnapshotDataInitOldSnapshot(Snapshot snapshot)
+{
+ if (!OldSnapshotThresholdActive())
+ {
+ /*
+ * If not using "snapshot too old" feature, fill related fields with
+ * dummy values that don't require any locking.
+ */
+ snapshot->lsn = InvalidXLogRecPtr;
+ snapshot->whenTaken = 0;
+ }
+ else
+ {
+ /*
+ * Capture the current time and WAL stream location in case this
+ * snapshot becomes old enough to need to fall back on the special
+ * "old snapshot" logic.
+ */
+ snapshot->lsn = GetXLogInsertRecPtr();
+ snapshot->whenTaken = GetSnapshotCurrentTimestamp();
+ MaintainOldSnapshotTimeMapping(snapshot->whenTaken, snapshot->xmin);
+ }
+}
+
+/*
+ * Helper function for GetSnapshotData() that checks if the bulk of the
+ * visibility information in the snapshot is still valid. If so, it updates
+ * the fields that need to change and returns true. Otherwise it returns
+ * false.
+ *
+ * This very likely can be evolved to not need ProcArrayLock held (at the
+ * very least in the case we already hold a snapshot), but that's for another
+ * day.
+ */
+static bool
+GetSnapshotDataReuse(Snapshot snapshot)
+{
+ uint64 curXactCompletionCount;
+
+ Assert(LWLockHeldByMe(ProcArrayLock));
+
+ if (unlikely(snapshot->snapXactCompletionCount == 0))
+ return false;
+
+ curXactCompletionCount = ShmemVariableCache->xactCompletionCount;
+ if (curXactCompletionCount != snapshot->snapXactCompletionCount)
+ return false;
+
+ /*
+ * If the current xactCompletionCount is still the same as it was at the
+ * time the snapshot was built, we can be sure that rebuilding the
+ * contents of the snapshot the hard way would result in the same snapshot
+ * contents:
+ *
+ * As explained in transam/README, the set of xids considered running by
+ * GetSnapshotData() cannot change while ProcArrayLock is held. Snapshot
+ * contents only depend on transactions with xids and xactCompletionCount
+ * is incremented whenever a transaction with an xid finishes (while
+ * holding ProcArrayLock exclusively). Thus the xactCompletionCount check
+ * ensures we would detect if the snapshot would have changed.
+ *
+ * As the snapshot contents are the same as before, it is safe to
+ * re-enter the snapshot's xmin into the PGPROC array. None of the rows
+ * visible under the snapshot could already have been removed (that'd
+ * require the set of running transactions to change) and it fulfills the
+ * requirement that concurrent GetSnapshotData() calls yield the same
+ * xmin.
+ */
+ if (!TransactionIdIsValid(MyProc->xmin))
+ MyProc->xmin = TransactionXmin = snapshot->xmin;
+
+ RecentXmin = snapshot->xmin;
+ Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin));
+
+ snapshot->curcid = GetCurrentCommandId(false);
+ snapshot->active_count = 0;
+ snapshot->regd_count = 0;
+ snapshot->copied = false;
+
+ GetSnapshotDataInitOldSnapshot(snapshot);
+
+ return true;
+}
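+
+/*
+ * Editor's note: stripped of the bookkeeping, the reuse test above reduces to
+ * the predicate in the NOT_USED sketch below, which is only meaningful while
+ * ProcArrayLock is held so that xactCompletionCount cannot advance underneath
+ * us.  The function name is illustrative.
+ */
+#ifdef NOT_USED
+static bool
+example_snapshot_still_current(Snapshot snapshot)
+{
+	return snapshot->snapXactCompletionCount != 0 &&
+		snapshot->snapXactCompletionCount ==
+		ShmemVariableCache->xactCompletionCount;
+}
+#endif							/* NOT_USED */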
+
+/*
+ * GetSnapshotData -- returns information about running transactions.
+ *
+ * The returned snapshot includes xmin (lowest still-running xact ID),
+ * xmax (highest completed xact ID + 1), and a list of running xact IDs
+ * in the range xmin <= xid < xmax. It is used as follows:
+ * All xact IDs < xmin are considered finished.
+ * All xact IDs >= xmax are considered still running.
+ * For an xact ID xmin <= xid < xmax, consult list to see whether
+ * it is considered running or not.
+ * This ensures that the set of transactions seen as "running" by the
+ * current xact will not change after it takes the snapshot.
+ *
+ * All running top-level XIDs are included in the snapshot, except for lazy
+ * VACUUM processes. We also try to include running subtransaction XIDs,
+ * but since PGPROC has only a limited cache area for subxact XIDs, full
+ * information may not be available. If we find any overflowed subxid arrays,
+ * we have to mark the snapshot's subxid data as overflowed, and extra work
+ * *may* need to be done to determine what's running (see XidInMVCCSnapshot()
+ * in heapam_visibility.c).
+ *
+ * We also update the following backend-global variables:
+ * TransactionXmin: the oldest xmin of any snapshot in use in the
+ * current transaction (this is the same as MyProc->xmin).
+ * RecentXmin: the xmin computed for the most recent snapshot. XIDs
+ * older than this are known not running any more.
+ *
+ * And try to advance the bounds of GlobalVis{Shared,Catalog,Data,Temp}Rels
+ * for the benefit of the GlobalVisTest* family of functions.
+ *
+ * Note: this function should probably not be called with an argument that's
+ * not statically allocated (see xip allocation below).
+ */
+Snapshot
+GetSnapshotData(Snapshot snapshot)
+{
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId *other_xids = ProcGlobal->xids;
+ TransactionId xmin;
+ TransactionId xmax;
+ int count = 0;
+ int subcount = 0;
+ bool suboverflowed = false;
+ FullTransactionId latest_completed;
+ TransactionId oldestxid;
+ int mypgxactoff;
+ TransactionId myxid;
+ uint64 curXactCompletionCount;
+
+ TransactionId replication_slot_xmin = InvalidTransactionId;
+ TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
+
+ Assert(snapshot != NULL);
+
+ /*
+ * Allocating space for maxProcs xids is usually overkill; numProcs would
+ * be sufficient. But it seems better to do the malloc while not holding
+ * the lock, so we can't look at numProcs. Likewise, we allocate much
+ * more subxip storage than is probably needed.
+ *
+ * This does open a possibility for avoiding repeated malloc/free: since
+ * maxProcs does not change at runtime, we can simply reuse the previous
+ * xip arrays if any. (This relies on the fact that all callers pass
+ * static SnapshotData structs.)
+ */
+ if (snapshot->xip == NULL)
+ {
+ /*
+ * First call for this snapshot. Snapshot is same size whether or not
+ * we are in recovery, see later comments.
+ */
+ snapshot->xip = (TransactionId *)
+ malloc(GetMaxSnapshotXidCount() * sizeof(TransactionId));
+ if (snapshot->xip == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ Assert(snapshot->subxip == NULL);
+ snapshot->subxip = (TransactionId *)
+ malloc(GetMaxSnapshotSubxidCount() * sizeof(TransactionId));
+ if (snapshot->subxip == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ /*
+ * It is sufficient to get shared lock on ProcArrayLock, even if we are
+ * going to set MyProc->xmin.
+ */
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ if (GetSnapshotDataReuse(snapshot))
+ {
+ LWLockRelease(ProcArrayLock);
+ return snapshot;
+ }
+
+ latest_completed = ShmemVariableCache->latestCompletedXid;
+ mypgxactoff = MyProc->pgxactoff;
+ myxid = other_xids[mypgxactoff];
+ Assert(myxid == MyProc->xid);
+
+ oldestxid = ShmemVariableCache->oldestXid;
+ curXactCompletionCount = ShmemVariableCache->xactCompletionCount;
+
+ /* xmax is always latestCompletedXid + 1 */
+ xmax = XidFromFullTransactionId(latest_completed);
+ TransactionIdAdvance(xmax);
+ Assert(TransactionIdIsNormal(xmax));
+
+ /* initialize xmin calculation with xmax */
+ xmin = xmax;
+
+ /* take own xid into account, saves a check inside the loop */
+ if (TransactionIdIsNormal(myxid) && NormalTransactionIdPrecedes(myxid, xmin))
+ xmin = myxid;
+
+ snapshot->takenDuringRecovery = RecoveryInProgress();
+
+ if (!snapshot->takenDuringRecovery)
+ {
+ int numProcs = arrayP->numProcs;
+ TransactionId *xip = snapshot->xip;
+ int *pgprocnos = arrayP->pgprocnos;
+ XidCacheStatus *subxidStates = ProcGlobal->subxidStates;
+ uint8 *allStatusFlags = ProcGlobal->statusFlags;
+
+ /*
+ * First collect set of pgxactoff/xids that need to be included in the
+ * snapshot.
+ */
+ for (int pgxactoff = 0; pgxactoff < numProcs; pgxactoff++)
+ {
+ /* Fetch xid just once - see GetNewTransactionId */
+ TransactionId xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]);
+ uint8 statusFlags;
+
+ Assert(allProcs[arrayP->pgprocnos[pgxactoff]].pgxactoff == pgxactoff);
+
+ /*
+ * If the transaction has no XID assigned, we can skip it; it
+ * won't have sub-XIDs either.
+ */
+ if (likely(xid == InvalidTransactionId))
+ continue;
+
+ /*
+ * We don't include our own XIDs (if any) in the snapshot. It
+ * needs to be included in the xmin computation, but we did so
+ * outside the loop.
+ */
+ if (pgxactoff == mypgxactoff)
+ continue;
+
+ /*
+ * The only way we are able to get here with a non-normal xid is
+ * during bootstrap - with this backend using
+ * BootstrapTransactionId. But the above test should filter that
+ * out.
+ */
+ Assert(TransactionIdIsNormal(xid));
+
+ /*
+ * If the XID is >= xmax, we can skip it; such transactions will
+ * be treated as running anyway (and any sub-XIDs will also be >=
+ * xmax).
+ */
+ if (!NormalTransactionIdPrecedes(xid, xmax))
+ continue;
+
+ /*
+ * Skip over backends doing logical decoding, which manages xmin
+ * separately (check below), and ones running LAZY VACUUM.
+ */
+ statusFlags = allStatusFlags[pgxactoff];
+ if (statusFlags & (PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM))
+ continue;
+
+ if (NormalTransactionIdPrecedes(xid, xmin))
+ xmin = xid;
+
+ /* Add XID to snapshot. */
+ xip[count++] = xid;
+
+ /*
+ * Save subtransaction XIDs if possible (if we've already
+ * overflowed, there's no point). Note that the subxact XIDs must
+ * be later than their parent, so no need to check them against
+ * xmin. We could filter against xmax, but it seems better not to
+ * do that much work while holding the ProcArrayLock.
+ *
+ * The other backend can add more subxids concurrently, but cannot
+ * remove any. Hence it's important to fetch nxids just once.
+ * Should be safe to use memcpy, though. (We needn't worry about
+ * missing any xids added concurrently, because they must postdate
+ * xmax.)
+ *
+ * Again, our own XIDs are not included in the snapshot.
+ */
+ if (!suboverflowed)
+ {
+ if (subxidStates[pgxactoff].overflowed)
+ suboverflowed = true;
+ else
+ {
+ int nsubxids = subxidStates[pgxactoff].count;
+
+ if (nsubxids > 0)
+ {
+ int pgprocno = pgprocnos[pgxactoff];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ pg_read_barrier(); /* pairs with GetNewTransactionId */
+
+ memcpy(snapshot->subxip + subcount,
+ (void *) proc->subxids.xids,
+ nsubxids * sizeof(TransactionId));
+ subcount += nsubxids;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ /*
+ * We're in hot standby, so get XIDs from KnownAssignedXids.
+ *
+ * We store all xids directly into subxip[]. Here's why:
+ *
+ * In recovery we don't know which xids are top-level and which are
+ * subxacts, a design choice that greatly simplifies xid processing.
+ *
+ * It seems like we would want to try to put xids into xip[] only, but
+ * that is fairly small. We would either need to make that bigger or
+ * to increase the rate at which we WAL-log xid assignment; neither is
+ * an appealing choice.
+ *
+ * We could try to store xids into xip[] first and then into subxip[]
+ * if there are too many xids. That only works if the snapshot doesn't
+ * overflow because we do not search subxip[] in that case. A simpler
+ * way is to just store all xids in the subxact array because this is
+ * by far the bigger array. We just leave the xip array empty.
+ *
+ * Either way we need to change the way XidInMVCCSnapshot() works
+ * depending upon when the snapshot was taken, or change normal
+ * snapshot processing so it matches.
+ *
+ * Note: It is possible for recovery to end before we finish taking
+ * the snapshot, and for newly assigned transaction ids to be added to
+ * the ProcArray. xmax cannot change while we hold ProcArrayLock, so
+ * those newly added transaction ids would be filtered away, so we
+ * need not be concerned about them.
+ */
+ subcount = KnownAssignedXidsGetAndSetXmin(snapshot->subxip, &xmin,
+ xmax);
+
+ if (TransactionIdPrecedesOrEquals(xmin, procArray->lastOverflowedXid))
+ suboverflowed = true;
+ }
+
+ /*
+ * Fetch into local variable while ProcArrayLock is held - the
+ * LWLockRelease below is a barrier, ensuring this happens inside the
+ * lock.
+ */
+ replication_slot_xmin = procArray->replication_slot_xmin;
+ replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
+
+ if (!TransactionIdIsValid(MyProc->xmin))
+ MyProc->xmin = TransactionXmin = xmin;
+
+ LWLockRelease(ProcArrayLock);
+
+ /* maintain state for GlobalVis* */
+ {
+ TransactionId def_vis_xid;
+ TransactionId def_vis_xid_data;
+ FullTransactionId def_vis_fxid;
+ FullTransactionId def_vis_fxid_data;
+ FullTransactionId oldestfxid;
+
+ /*
+ * Converting oldestXid is only safe when xid horizon cannot advance,
+ * i.e. holding locks. While we don't hold the lock anymore, all the
+ * necessary data has been gathered with lock held.
+ */
+ oldestfxid = FullXidRelativeTo(latest_completed, oldestxid);
+
+ /* apply vacuum_defer_cleanup_age */
+ def_vis_xid_data =
+ TransactionIdRetreatedBy(xmin, vacuum_defer_cleanup_age);
+
+ /* Check whether there's a replication slot requiring an older xmin. */
+ def_vis_xid_data =
+ TransactionIdOlder(def_vis_xid_data, replication_slot_xmin);
+
+ /*
+ * Rows in non-shared, non-catalog tables possibly could be vacuumed
+ * if older than this xid.
+ */
+ def_vis_xid = def_vis_xid_data;
+
+ /*
+ * Check whether there's a replication slot requiring an older catalog
+ * xmin.
+ */
+ def_vis_xid =
+ TransactionIdOlder(replication_slot_catalog_xmin, def_vis_xid);
+
+ def_vis_fxid = FullXidRelativeTo(latest_completed, def_vis_xid);
+ def_vis_fxid_data = FullXidRelativeTo(latest_completed, def_vis_xid_data);
+
+ /*
+ * Check if we can increase upper bound. As a previous
+ * GlobalVisUpdate() might have computed more aggressive values, don't
+ * overwrite them if so.
+ */
+ GlobalVisSharedRels.definitely_needed =
+ FullTransactionIdNewer(def_vis_fxid,
+ GlobalVisSharedRels.definitely_needed);
+ GlobalVisCatalogRels.definitely_needed =
+ FullTransactionIdNewer(def_vis_fxid,
+ GlobalVisCatalogRels.definitely_needed);
+ GlobalVisDataRels.definitely_needed =
+ FullTransactionIdNewer(def_vis_fxid_data,
+ GlobalVisDataRels.definitely_needed);
+ /* See temp_oldest_nonremovable computation in ComputeXidHorizons() */
+ if (TransactionIdIsNormal(myxid))
+ GlobalVisTempRels.definitely_needed =
+ FullXidRelativeTo(latest_completed, myxid);
+ else
+ {
+ GlobalVisTempRels.definitely_needed = latest_completed;
+ FullTransactionIdAdvance(&GlobalVisTempRels.definitely_needed);
+ }
+
+ /*
+ * Check if we know that we can initialize or increase the lower
+ * bound. Currently the only cheap way to do so is to use
+ * ShmemVariableCache->oldestXid as input.
+ *
+ * We should definitely be able to do better. We could e.g. put a
+ * global lower bound value into ShmemVariableCache.
+ */
+ GlobalVisSharedRels.maybe_needed =
+ FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed,
+ oldestfxid);
+ GlobalVisCatalogRels.maybe_needed =
+ FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed,
+ oldestfxid);
+ GlobalVisDataRels.maybe_needed =
+ FullTransactionIdNewer(GlobalVisDataRels.maybe_needed,
+ oldestfxid);
+ /* accurate value known */
+ GlobalVisTempRels.maybe_needed = GlobalVisTempRels.definitely_needed;
+ }
+
+ RecentXmin = xmin;
+ Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin));
+
+ snapshot->xmin = xmin;
+ snapshot->xmax = xmax;
+ snapshot->xcnt = count;
+ snapshot->subxcnt = subcount;
+ snapshot->suboverflowed = suboverflowed;
+ snapshot->snapXactCompletionCount = curXactCompletionCount;
+
+ snapshot->curcid = GetCurrentCommandId(false);
+
+ /*
+ * This is a new snapshot, so set both refcounts to zero, and mark it as
+ * not copied in persistent memory.
+ */
+ snapshot->active_count = 0;
+ snapshot->regd_count = 0;
+ snapshot->copied = false;
+
+ GetSnapshotDataInitOldSnapshot(snapshot);
+
+ return snapshot;
+}
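+
+/*
+ * Editor's note: the NOT_USED sketch below shows, in simplified form, how the
+ * snapshot built above is consulted, in the spirit of XidInMVCCSnapshot() in
+ * heapam_visibility.c.  It deliberately ignores the subxip array and the
+ * suboverflowed case; the function name is illustrative.
+ */
+#ifdef NOT_USED
+static bool
+example_xid_in_snapshot(Snapshot snapshot, TransactionId xid)
+{
+	/* anything below xmin has certainly completed */
+	if (TransactionIdPrecedes(xid, snapshot->xmin))
+		return false;
+
+	/* anything at or beyond xmax is treated as still running */
+	if (TransactionIdFollowsOrEquals(xid, snapshot->xmax))
+		return true;
+
+	/* otherwise it is running iff it was captured in xip[] */
+	for (uint32 i = 0; i < snapshot->xcnt; i++)
+	{
+		if (TransactionIdEquals(snapshot->xip[i], xid))
+			return true;
+	}
+	return false;
+}
+#endif							/* NOT_USED */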
+
+/*
+ * ProcArrayInstallImportedXmin -- install imported xmin into MyProc->xmin
+ *
+ * This is called when installing a snapshot imported from another
+ * transaction. To ensure that OldestXmin doesn't go backwards, we must
+ * check that the source transaction is still running, and we'd better do
+ * that atomically with installing the new xmin.
+ *
+ * Returns true if successful, false if source xact is no longer running.
+ */
+bool
+ProcArrayInstallImportedXmin(TransactionId xmin,
+ VirtualTransactionId *sourcevxid)
+{
+ bool result = false;
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ Assert(TransactionIdIsNormal(xmin));
+ if (!sourcevxid)
+ return false;
+
+ /* Get lock so source xact can't end while we're doing this */
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ int statusFlags = ProcGlobal->statusFlags[index];
+ TransactionId xid;
+
+ /* Ignore procs running LAZY VACUUM */
+ if (statusFlags & PROC_IN_VACUUM)
+ continue;
+
+ /* We are only interested in the specific virtual transaction. */
+ if (proc->backendId != sourcevxid->backendId)
+ continue;
+ if (proc->lxid != sourcevxid->localTransactionId)
+ continue;
+
+ /*
+ * We check the transaction's database ID for paranoia's sake: if it's
+ * in another DB then its xmin does not cover us. Caller should have
+ * detected this already, so we just treat any funny cases as
+ * "transaction not found".
+ */
+ if (proc->databaseId != MyDatabaseId)
+ continue;
+
+ /*
+ * Likewise, let's just make real sure its xmin does cover us.
+ */
+ xid = UINT32_ACCESS_ONCE(proc->xmin);
+ if (!TransactionIdIsNormal(xid) ||
+ !TransactionIdPrecedesOrEquals(xid, xmin))
+ continue;
+
+ /*
+ * We're good. Install the new xmin. As in GetSnapshotData, set
+ * TransactionXmin too. (Note that because snapmgr.c called
+ * GetSnapshotData first, we'll be overwriting a valid xmin here, so
+ * we don't check that.)
+ */
+ MyProc->xmin = TransactionXmin = xmin;
+
+ result = true;
+ break;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+}
+
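+/*
+ * Illustrative call pattern (a sketch; the assumed caller is snapshot import
+ * in snapmgr.c, i.e. SET TRANSACTION SNAPSHOT processing):
+ *
+ *		if (!ProcArrayInstallImportedXmin(snapshot->xmin, sourcevxid))
+ *			ereport(ERROR, ... "could not import the requested snapshot" ...);
+ *
+ * i.e. failure simply means the exporting transaction is already gone.
+ */
+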
+/*
+ * ProcArrayInstallRestoredXmin -- install restored xmin into MyProc->xmin
+ *
+ * This is like ProcArrayInstallImportedXmin, but we have a pointer to the
+ * PGPROC of the transaction from which we imported the snapshot, rather than
+ * an XID.
+ *
+ * Note that this function also copies statusFlags from the source `proc` in
+ * order to avoid the case where MyProc's xmin needs to be skipped for
+ * computing xid horizon.
+ *
+ * Returns true if successful, false if source xact is no longer running.
+ */
+bool
+ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc)
+{
+ bool result = false;
+ TransactionId xid;
+
+ Assert(TransactionIdIsNormal(xmin));
+ Assert(proc != NULL);
+
+ /*
+ * Get an exclusive lock so that we can copy statusFlags from source proc.
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * Be certain that the referenced PGPROC has an advertised xmin which is
+ * no later than the one we're installing, so that the system-wide xmin
+ * can't go backwards. Also, make sure it's running in the same database,
+ * so that the per-database xmin cannot go backwards.
+ */
+ xid = UINT32_ACCESS_ONCE(proc->xmin);
+ if (proc->databaseId == MyDatabaseId &&
+ TransactionIdIsNormal(xid) &&
+ TransactionIdPrecedesOrEquals(xid, xmin))
+ {
+ /*
+ * Install xmin and propagate the statusFlags that affect how the
+ * value is interpreted by vacuum.
+ */
+ MyProc->xmin = TransactionXmin = xmin;
+ MyProc->statusFlags = (MyProc->statusFlags & ~PROC_XMIN_FLAGS) |
+ (proc->statusFlags & PROC_XMIN_FLAGS);
+ ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags;
+
+ result = true;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+}
+
+/*
+ * GetRunningTransactionData -- returns information about running transactions.
+ *
+ * Similar to GetSnapshotData but returns more information. We include
+ * all PGPROCs with an assigned TransactionId, even VACUUM processes and
+ * prepared transactions.
+ *
+ * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for
+ * releasing them. Holding XidGenLock ensures that no new XIDs enter the proc
+ * array until the caller has WAL-logged this snapshot and released the lock.
+ * Holding ProcArrayLock ensures that no transactions commit until that lock
+ * is released.
+ *
+ * The returned data structure is statically allocated; caller should not
+ * modify it, and must not assume it is valid past the next call.
+ *
+ * This is never executed during recovery so there is no need to look at
+ * KnownAssignedXids.
+ *
+ * Dummy PGPROCs from prepared transactions are included, meaning that this
+ * may return entries with duplicated TransactionId values coming from
+ * transactions that are finishing their PREPARE. Nothing is done about
+ * duplicated entries here, so as not to hold ProcArrayLock longer than
+ * necessary.
+ *
+ * We don't worry about updating other counters, we want to keep this as
+ * simple as possible and leave GetSnapshotData() as the primary code for
+ * that bookkeeping.
+ *
+ * Note that if any transaction has overflowed its cached subtransactions
+ * then there is no real need to include any subtransactions.
+ */
+RunningTransactions
+GetRunningTransactionData(void)
+{
+ /* result workspace */
+ static RunningTransactionsData CurrentRunningXactsData;
+
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId *other_xids = ProcGlobal->xids;
+ RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData;
+ TransactionId latestCompletedXid;
+ TransactionId oldestRunningXid;
+ TransactionId *xids;
+ int index;
+ int count;
+ int subcount;
+ bool suboverflowed;
+
+ Assert(!RecoveryInProgress());
+
+ /*
+ * Allocating space for maxProcs xids is usually overkill; numProcs would
+ * be sufficient. But it seems better to do the malloc while not holding
+ * the lock, so we can't look at numProcs. Likewise, we allocate much
+ * more subxip storage than is probably needed.
+ *
+ * Should only be allocated in bgwriter, since only ever executed during
+ * checkpoints.
+ */
+ if (CurrentRunningXacts->xids == NULL)
+ {
+ /*
+ * First call
+ */
+ CurrentRunningXacts->xids = (TransactionId *)
+ malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
+ if (CurrentRunningXacts->xids == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ xids = CurrentRunningXacts->xids;
+
+ count = subcount = 0;
+ suboverflowed = false;
+
+ /*
+ * Ensure that no xids enter or leave the procarray while we obtain
+ * snapshot.
+ */
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+ LWLockAcquire(XidGenLock, LW_SHARED);
+
+ latestCompletedXid =
+ XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid);
+ oldestRunningXid =
+ XidFromFullTransactionId(ShmemVariableCache->nextXid);
+
+ /*
+ * Spin over procArray collecting all xids
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ TransactionId xid;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = UINT32_ACCESS_ONCE(other_xids[index]);
+
+ /*
+ * We don't need to store transactions that don't have a TransactionId
+ * yet because they will not show as running on a standby server.
+ */
+ if (!TransactionIdIsValid(xid))
+ continue;
+
+ /*
+ * Be careful not to exclude any xids before calculating the values of
+ * oldestRunningXid and suboverflowed, since these are used to clean
+ * up transaction information held on standbys.
+ */
+ if (TransactionIdPrecedes(xid, oldestRunningXid))
+ oldestRunningXid = xid;
+
+ if (ProcGlobal->subxidStates[index].overflowed)
+ suboverflowed = true;
+
+ /*
+ * If we wished to exclude xids this would be the right place for it.
+ * Procs with the PROC_IN_VACUUM flag set don't usually assign xids,
+ * but they do during truncation at the end, when they get the lock and
+ * truncate, so it is not much of a problem to include them if they are
+ * seen, and it is cleaner to include them.
+ */
+
+ xids[count++] = xid;
+ }
+
+ /*
+ * Spin over procArray collecting all subxids, but only if there hasn't
+ * been a suboverflow.
+ */
+ if (!suboverflowed)
+ {
+ XidCacheStatus *other_subxidstates = ProcGlobal->subxidStates;
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ int nsubxids;
+
+ /*
+ * Save subtransaction XIDs. Other backends can't add or remove
+ * entries while we're holding XidGenLock.
+ */
+ nsubxids = other_subxidstates[index].count;
+ if (nsubxids > 0)
+ {
+ /* barrier not really required, as XidGenLock is held, but ... */
+ pg_read_barrier(); /* pairs with GetNewTransactionId */
+
+ memcpy(&xids[count], (void *) proc->subxids.xids,
+ nsubxids * sizeof(TransactionId));
+ count += nsubxids;
+ subcount += nsubxids;
+
+ /*
+ * Top-level XID of a transaction is always less than any of
+ * its subxids, so we don't need to check if any of the
+ * subxids are smaller than oldestRunningXid
+ */
+ }
+ }
+ }
+
+ /*
+ * It's important *not* to include the limits set by slots here because
+ * snapbuild.c uses oldestRunningXid to manage its xmin horizon. If those
+ * were to be included here the initial value could never increase because
+ * of a circular dependency where slots only increase their limits when
+ * running xacts increases oldestRunningXid and running xacts only
+ * increases if slots do.
+ */
+
+ CurrentRunningXacts->xcnt = count - subcount;
+ CurrentRunningXacts->subxcnt = subcount;
+ CurrentRunningXacts->subxid_overflow = suboverflowed;
+ CurrentRunningXacts->nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+ CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
+ CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
+
+ Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid));
+ Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid));
+ Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid));
+
+ /* We don't release the locks here, the caller is responsible for that */
+
+ return CurrentRunningXacts;
+}
+
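+/*
+ * Expected call pattern, sketched (the assumed caller is LogStandbySnapshot()
+ * in standby.c): obtain the data, WAL-log it, then release the locks that
+ * this function left held, e.g.
+ *
+ *		RunningTransactions running = GetRunningTransactionData();
+ *
+ *		LWLockRelease(ProcArrayLock);
+ *
+ *		... WAL-log the running-xacts snapshot ...
+ *
+ *		LWLockRelease(XidGenLock);
+ *
+ * The exact release points are the caller's business; only the obligation to
+ * release both locks is fixed.
+ */
+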
+/*
+ * GetOldestActiveTransactionId()
+ *
+ * Similar to GetSnapshotData but returns just oldestActiveXid. We include
+ * all PGPROCs with an assigned TransactionId, even VACUUM processes.
+ * We look at all databases, though there is no need to include WALSender
+ * since this has no effect on hot standby conflicts.
+ *
+ * This is never executed during recovery so there is no need to look at
+ * KnownAssignedXids.
+ *
+ * We don't worry about updating other counters, we want to keep this as
+ * simple as possible and leave GetSnapshotData() as the primary code for
+ * that bookkeeping.
+ */
+TransactionId
+GetOldestActiveTransactionId(void)
+{
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId *other_xids = ProcGlobal->xids;
+ TransactionId oldestRunningXid;
+ int index;
+
+ Assert(!RecoveryInProgress());
+
+ /*
+ * Read nextXid, as the upper bound of what's still active.
+ *
+ * Reading a TransactionId is atomic, but we must grab the lock to make
+ * sure that all XIDs < nextXid are already present in the proc array (or
+ * have already completed), when we spin over it.
+ */
+ LWLockAcquire(XidGenLock, LW_SHARED);
+ oldestRunningXid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+ LWLockRelease(XidGenLock);
+
+ /*
+ * Spin over procArray collecting all xids and subxids.
+ */
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ TransactionId xid;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = UINT32_ACCESS_ONCE(other_xids[index]);
+
+ if (!TransactionIdIsNormal(xid))
+ continue;
+
+ if (TransactionIdPrecedes(xid, oldestRunningXid))
+ oldestRunningXid = xid;
+
+ /*
+ * Top-level XID of a transaction is always less than any of its
+ * subxids, so we don't need to check if any of the subxids are
+ * smaller than oldestRunningXid
+ */
+ }
+ LWLockRelease(ProcArrayLock);
+
+ return oldestRunningXid;
+}
+
+/*
+ * GetOldestSafeDecodingTransactionId -- lowest xid not affected by vacuum
+ *
+ * Returns the oldest xid that we can guarantee not to have been affected by
+ * vacuum, i.e. no rows >= that xid have been vacuumed away unless the
+ * transaction aborted. Note that the value can (and most of the time will) be
+ * much more conservative than what really has been affected by vacuum, but we
+ * currently don't have better data available.
+ *
+ * This is useful to initialize the cutoff xid after which a new changeset
+ * extraction replication slot can start decoding changes.
+ *
+ * Must be called with ProcArrayLock held either shared or exclusively,
+ * although most callers will want to use exclusive mode since it is expected
+ * that the caller will immediately use the xid to peg the xmin horizon.
+ */
+TransactionId
+GetOldestSafeDecodingTransactionId(bool catalogOnly)
+{
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId oldestSafeXid;
+ int index;
+ bool recovery_in_progress = RecoveryInProgress();
+
+ Assert(LWLockHeldByMe(ProcArrayLock));
+
+ /*
+ * Acquire XidGenLock, so no transactions can acquire an xid while we're
+ * running. Otherwise, if no transaction with an xid were running
+ * concurrently, a newly assigned xid could influence RecentXmin et al.
+ *
+ * We initialize the computation to nextXid since that's guaranteed to be
+ * a safe, albeit pessimal, value.
+ */
+ LWLockAcquire(XidGenLock, LW_SHARED);
+ oldestSafeXid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+
+ /*
+ * If there's already a slot pegging the xmin horizon, we can start with
+ * that value, it's guaranteed to be safe since it's computed by this
+ * routine initially and has been enforced since. We can always use the
+ * slot's general xmin horizon, but the catalog horizon is only usable
+ * when only catalog data is going to be looked at.
+ */
+ if (TransactionIdIsValid(procArray->replication_slot_xmin) &&
+ TransactionIdPrecedes(procArray->replication_slot_xmin,
+ oldestSafeXid))
+ oldestSafeXid = procArray->replication_slot_xmin;
+
+ if (catalogOnly &&
+ TransactionIdIsValid(procArray->replication_slot_catalog_xmin) &&
+ TransactionIdPrecedes(procArray->replication_slot_catalog_xmin,
+ oldestSafeXid))
+ oldestSafeXid = procArray->replication_slot_catalog_xmin;
+
+ /*
+ * If we're not in recovery, we walk over the procarray and collect the
+ * lowest xid. Since we're called with ProcArrayLock held and have
+ * acquired XidGenLock, no entries can vanish concurrently, since
+ * ProcGlobal->xids[i] is only set with XidGenLock held and only cleared
+ * with ProcArrayLock held.
+ *
+ * In recovery we can't lower the safe value besides what we've computed
+ * above, so we'll have to wait a bit longer there. We unfortunately can
+ * *not* use KnownAssignedXidsGetOldestXmin() since the KnownAssignedXids
+ * machinery can miss values and return an older value than is safe.
+ */
+ if (!recovery_in_progress)
+ {
+ TransactionId *other_xids = ProcGlobal->xids;
+
+ /*
+ * Spin over procArray collecting min(ProcGlobal->xids[i])
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ TransactionId xid;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = UINT32_ACCESS_ONCE(other_xids[index]);
+
+ if (!TransactionIdIsNormal(xid))
+ continue;
+
+ if (TransactionIdPrecedes(xid, oldestSafeXid))
+ oldestSafeXid = xid;
+ }
+ }
+
+ LWLockRelease(XidGenLock);
+
+ return oldestSafeXid;
+}
+
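+/*
+ * A hedged usage sketch (the assumed caller is logical decoding slot
+ * creation): with ProcArrayLock held exclusively, the result is used to peg
+ * the slot's catalog xmin before concurrent vacuum can advance past it:
+ *
+ *		LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ *		slot->effective_catalog_xmin = GetOldestSafeDecodingTransactionId(true);
+ *		slot->data.catalog_xmin = slot->effective_catalog_xmin;
+ *		ReplicationSlotsComputeRequiredXmin(true);
+ *		LWLockRelease(ProcArrayLock);
+ *
+ * ReplicationSlotsComputeRequiredXmin() is assumed from slot.c here.
+ */
+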
+/*
+ * GetVirtualXIDsDelayingChkptGuts -- Get the VXIDs of transactions that are
+ * delaying the start or end of a checkpoint because they have critical
+ * actions in progress.
+ *
+ * Constructs an array of VXIDs of transactions that are currently in commit
+ * critical sections, as shown by having delayChkpt or delayChkptEnd set in
+ * their PGPROC.
+ *
+ * Returns a palloc'd array that should be freed by the caller.
+ * *nvxids is the number of valid entries.
+ *
+ * Note that because backends set or clear delayChkpt and delayChkptEnd
+ * without holding any lock, the result is somewhat indeterminate, but we
+ * don't really care. Even in a multiprocessor with delayed writes to
+ * shared memory, it should be certain that setting of delayChkpt will
+ * propagate to shared memory when the backend takes a lock, so we cannot
+ * fail to see a virtual xact as delayChkpt if it's already inserted its
+ * commit record. Whether it takes a little while for clearing of
+ * delayChkpt to propagate is unimportant for correctness.
+ */
+static VirtualTransactionId *
+GetVirtualXIDsDelayingChkptGuts(int *nvxids, int type)
+{
+ VirtualTransactionId *vxids;
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ Assert(type != 0);
+
+ /* allocate what's certainly enough result space */
+ vxids = (VirtualTransactionId *)
+ palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if (((type & DELAY_CHKPT_START) && proc->delayChkpt) ||
+ ((type & DELAY_CHKPT_COMPLETE) && proc->delayChkptEnd))
+ {
+ VirtualTransactionId vxid;
+
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+ if (VirtualTransactionIdIsValid(vxid))
+ vxids[count++] = vxid;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ *nvxids = count;
+ return vxids;
+}
+
+/*
+ * GetVirtualXIDsDelayingChkpt - Get the VXIDs of transactions that are
+ * delaying the start of a checkpoint.
+ */
+VirtualTransactionId *
+GetVirtualXIDsDelayingChkpt(int *nvxids)
+{
+ return GetVirtualXIDsDelayingChkptGuts(nvxids, DELAY_CHKPT_START);
+}
+
+/*
+ * GetVirtualXIDsDelayingChkptEnd - Get the VXIDs of transactions that are
+ * delaying the end of a checkpoint.
+ */
+VirtualTransactionId *
+GetVirtualXIDsDelayingChkptEnd(int *nvxids)
+{
+ return GetVirtualXIDsDelayingChkptGuts(nvxids, DELAY_CHKPT_COMPLETE);
+}
+
+/*
+ * HaveVirtualXIDsDelayingChkptGuts -- Are any of the specified VXIDs delaying?
+ *
+ * This is used with the results of GetVirtualXIDsDelayingChkpt to see if any
+ * of the specified VXIDs are still in critical sections of code.
+ *
+ * Note: this is O(N^2) in the number of vxacts that are/were delaying, but
+ * those numbers should be small enough for it not to be a problem.
+ */
+static bool
+HaveVirtualXIDsDelayingChkptGuts(VirtualTransactionId *vxids, int nvxids,
+ int type)
+{
+ bool result = false;
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ Assert(type != 0);
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ VirtualTransactionId vxid;
+
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+
+ if ((((type & DELAY_CHKPT_START) && proc->delayChkpt) ||
+ ((type & DELAY_CHKPT_COMPLETE) && proc->delayChkptEnd)) &&
+ VirtualTransactionIdIsValid(vxid))
+ {
+ int i;
+
+ for (i = 0; i < nvxids; i++)
+ {
+ if (VirtualTransactionIdEquals(vxid, vxids[i]))
+ {
+ result = true;
+ break;
+ }
+ }
+ if (result)
+ break;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+}
+
+/*
+ * HaveVirtualXIDsDelayingChkpt -- Are any of the specified VXIDs delaying
+ * the start of a checkpoint?
+ */
+bool
+HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
+{
+ return HaveVirtualXIDsDelayingChkptGuts(vxids, nvxids,
+ DELAY_CHKPT_START);
+}
+
+/*
+ * HaveVirtualXIDsDelayingChkptEnd -- Are any of the specified VXIDs delaying
+ * the end of a checkpoint?
+ */
+bool
+HaveVirtualXIDsDelayingChkptEnd(VirtualTransactionId *vxids, int nvxids)
+{
+ return HaveVirtualXIDsDelayingChkptGuts(vxids, nvxids,
+ DELAY_CHKPT_COMPLETE);
+}
+
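+/*
+ * Sketch of how the checkpointer is expected to use the functions above
+ * (hedged; the real wait loop is assumed to live in CreateCheckPoint()):
+ *
+ *		int			nvxids;
+ *		VirtualTransactionId *vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
+ *
+ *		if (nvxids > 0)
+ *		{
+ *			do
+ *			{
+ *				pg_usleep(10000L);
+ *			} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
+ *		}
+ *		pfree(vxids);
+ */
+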
+/*
+ * BackendPidGetProc -- get a backend's PGPROC given its PID
+ *
+ * Returns NULL if not found. Note that it is up to the caller to be
+ * sure that the question remains meaningful for long enough for the
+ * answer to be used ...
+ */
+PGPROC *
+BackendPidGetProc(int pid)
+{
+ PGPROC *result;
+
+ if (pid == 0) /* never match dummy PGPROCs */
+ return NULL;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ result = BackendPidGetProcWithLock(pid);
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+}
+
+/*
+ * BackendPidGetProcWithLock -- get a backend's PGPROC given its PID
+ *
+ * Same as above, except caller must be holding ProcArrayLock. The found
+ * entry, if any, can be assumed to be valid as long as the lock remains held.
+ */
+PGPROC *
+BackendPidGetProcWithLock(int pid)
+{
+ PGPROC *result = NULL;
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ if (pid == 0) /* never match dummy PGPROCs */
+ return NULL;
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ PGPROC *proc = &allProcs[arrayP->pgprocnos[index]];
+
+ if (proc->pid == pid)
+ {
+ result = proc;
+ break;
+ }
+ }
+
+ return result;
+}
+
+/*
+ * BackendXidGetPid -- get a backend's pid given its XID
+ *
+ * Returns 0 if not found or it's a prepared transaction. Note that
+ * it is up to the caller to be sure that the question remains
+ * meaningful for long enough for the answer to be used ...
+ *
+ * Only main transaction Ids are considered. This function is mainly
+ * useful for determining what backend owns a lock.
+ *
+ * Beware that not every xact has an XID assigned. However, as long as you
+ * only call this using an XID found on disk, you're safe.
+ */
+int
+BackendXidGetPid(TransactionId xid)
+{
+ int result = 0;
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId *other_xids = ProcGlobal->xids;
+ int index;
+
+ if (xid == InvalidTransactionId) /* never match invalid xid */
+ return 0;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if (other_xids[index] == xid)
+ {
+ result = proc->pid;
+ break;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+}
+
+/*
+ * IsBackendPid -- is a given pid a running backend
+ *
+ * This is not called by the backend, but is called by external modules.
+ */
+bool
+IsBackendPid(int pid)
+{
+ return (BackendPidGetProc(pid) != NULL);
+}
+
+
+/*
+ * GetCurrentVirtualXIDs -- returns an array of currently active VXIDs.
+ *
+ * The array is palloc'd. The number of valid entries is returned into *nvxids.
+ *
+ * The arguments allow filtering the set of VXIDs returned. Our own process
+ * is always skipped. In addition:
+ * If limitXmin is not InvalidTransactionId, skip processes with
+ * xmin > limitXmin.
+ * If excludeXmin0 is true, skip processes with xmin = 0.
+ * If allDbs is false, skip processes attached to other databases.
+ * If excludeVacuum isn't zero, skip processes for which
+ * (statusFlags & excludeVacuum) is not zero.
+ *
+ * Note: the purpose of the limitXmin and excludeXmin0 parameters is to
+ * allow skipping backends whose oldest live snapshot is no older than
+ * some snapshot we have. Since we examine the procarray with only shared
+ * lock, there are race conditions: a backend could set its xmin just after
+ * we look. Indeed, on multiprocessors with weak memory ordering, the
+ * other backend could have set its xmin *before* we look. We know however
+ * that such a backend must have held shared ProcArrayLock overlapping our
+ * own hold of ProcArrayLock, else we would see its xmin update. Therefore,
+ * any snapshot the other backend is taking concurrently with our scan cannot
+ * consider any transactions as still running that we think are committed
+ * (since backends must hold ProcArrayLock exclusive to commit).
+ */
+VirtualTransactionId *
+GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0,
+ bool allDbs, int excludeVacuum,
+ int *nvxids)
+{
+ VirtualTransactionId *vxids;
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ /* allocate what's certainly enough result space */
+ vxids = (VirtualTransactionId *)
+ palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ uint8 statusFlags = ProcGlobal->statusFlags[index];
+
+ if (proc == MyProc)
+ continue;
+
+ if (excludeVacuum & statusFlags)
+ continue;
+
+ if (allDbs || proc->databaseId == MyDatabaseId)
+ {
+ /* Fetch xmin just once - might change on us */
+ TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin);
+
+ if (excludeXmin0 && !TransactionIdIsValid(pxmin))
+ continue;
+
+ /*
+ * InvalidTransactionId precedes all other XIDs, so a proc that
+ * hasn't set xmin yet will not be rejected by this test.
+ */
+ if (!TransactionIdIsValid(limitXmin) ||
+ TransactionIdPrecedesOrEquals(pxmin, limitXmin))
+ {
+ VirtualTransactionId vxid;
+
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+ if (VirtualTransactionIdIsValid(vxid))
+ vxids[count++] = vxid;
+ }
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ *nvxids = count;
+ return vxids;
+}
+
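+/*
+ * Illustrative use, sketched from the CREATE INDEX CONCURRENTLY wait phase
+ * (assumed, not quoted): collect the VXIDs whose snapshots might still see
+ * rows the new index would not contain, then wait for each of them:
+ *
+ *		old_snapshots = GetCurrentVirtualXIDs(limitXmin, true, false,
+ *											  PROC_IS_AUTOVACUUM | PROC_IN_VACUUM,
+ *											  &n_old_snapshots);
+ *
+ *		for (i = 0; i < n_old_snapshots; i++)
+ *			VirtualXactLock(old_snapshots[i], true);
+ */
+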
+/*
+ * GetConflictingVirtualXIDs -- returns an array of currently active VXIDs.
+ *
+ * Usage is limited to conflict resolution during recovery on standby servers.
+ * limitXmin is supplied as either latestRemovedXid, or InvalidTransactionId
+ * in cases where we cannot accurately determine a value for latestRemovedXid.
+ *
+ * If limitXmin is InvalidTransactionId then we want to kill everybody,
+ * so we're not worried if they have a snapshot or not, nor does it really
+ * matter what type of lock we hold.
+ *
+ * All callers that are checking xmins now always supply a valid and useful
+ * value for limitXmin. The limitXmin is always lower than the lowest
+ * numbered KnownAssignedXid that is not already a FATAL error. This is
+ * because we only care about cleanup records that are cleaning up tuple
+ * versions from committed transactions. In that case they will only occur
+ * at the point where the record is less than the lowest running xid. That
+ * allows us to say that if any backend takes a snapshot concurrently with
+ * us then the conflict assessment made here would never include the snapshot
+ * that is being derived. So we take LW_SHARED on the ProcArray and allow
+ * concurrent snapshots when limitXmin is valid. We might think about adding
+ * Assert(limitXmin < lowest(KnownAssignedXids))
+ * but that would not be true in the case of FATAL errors lagging in the
+ * array; since we already know those are bogus anyway, we skip that test.
+ *
+ * If dbOid is valid we skip backends attached to other databases.
+ *
+ * Be careful to *not* pfree the result from this function. We reuse
+ * this array sufficiently often that we use malloc for the result.
+ */
+VirtualTransactionId *
+GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid)
+{
+ static VirtualTransactionId *vxids;
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ /*
+ * If first time through, get workspace to remember main XIDs in. We
+ * malloc it permanently to avoid repeated palloc/pfree overhead. Allow
+ * enough result space, remembering to leave room for a terminator.
+ */
+ if (vxids == NULL)
+ {
+ vxids = (VirtualTransactionId *)
+ malloc(sizeof(VirtualTransactionId) * (arrayP->maxProcs + 1));
+ if (vxids == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ /* Exclude prepared transactions */
+ if (proc->pid == 0)
+ continue;
+
+ if (!OidIsValid(dbOid) ||
+ proc->databaseId == dbOid)
+ {
+ /* Fetch xmin just once - can't change on us, but good coding */
+ TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin);
+
+ /*
+ * We ignore an invalid pxmin because this means that backend has
+ * no snapshot currently. We hold a Share lock to avoid contention
+ * with users taking snapshots. That is not a problem because the
+ * current xmin is always at least one higher than the latest
+ * removed xid, so any new snapshot would never conflict with the
+ * test here.
+ */
+ if (!TransactionIdIsValid(limitXmin) ||
+ (TransactionIdIsValid(pxmin) && !TransactionIdFollows(pxmin, limitXmin)))
+ {
+ VirtualTransactionId vxid;
+
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+ if (VirtualTransactionIdIsValid(vxid))
+ vxids[count++] = vxid;
+ }
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ /* add the terminator */
+ vxids[count].backendId = InvalidBackendId;
+ vxids[count].localTransactionId = InvalidLocalTransactionId;
+
+ return vxids;
+}
+
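+/*
+ * Typical recovery-conflict use, sketched (the assumed caller is
+ * ResolveRecoveryConflictWithSnapshot() in standby.c):
+ *
+ *		VirtualTransactionId *backends;
+ *
+ *		backends = GetConflictingVirtualXIDs(latestRemovedXid, node.dbNode);
+ *		... walk the array until the InvalidBackendId terminator, cancelling
+ *		each entry via CancelVirtualTransaction(backends[i],
+ *		PROCSIG_RECOVERY_CONFLICT_SNAPSHOT) or waiting for it to finish ...
+ */
+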
+/*
+ * CancelVirtualTransaction - used in recovery conflict processing
+ *
+ * Returns pid of the process signaled, or 0 if not found.
+ */
+pid_t
+CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode)
+{
+ return SignalVirtualTransaction(vxid, sigmode, true);
+}
+
+pid_t
+SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode,
+ bool conflictPending)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+ pid_t pid = 0;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ VirtualTransactionId procvxid;
+
+ GET_VXID_FROM_PGPROC(procvxid, *proc);
+
+ if (procvxid.backendId == vxid.backendId &&
+ procvxid.localTransactionId == vxid.localTransactionId)
+ {
+ proc->recoveryConflictPending = conflictPending;
+ pid = proc->pid;
+ if (pid != 0)
+ {
+ /*
+ * Kill the pid if it's still here. If not, that's what we
+ * wanted so ignore any errors.
+ */
+ (void) SendProcSignal(pid, sigmode, vxid.backendId);
+ }
+ break;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return pid;
+}
+
+/*
+ * MinimumActiveBackends --- count backends (other than myself) that are
+ * in active transactions. Return true if the count exceeds the
+ * minimum threshold passed. This is used as a heuristic to decide if
+ * a pre-XLOG-flush delay is worthwhile during commit.
+ *
+ * Do not count backends that are blocked waiting for locks, since they are
+ * not going to get to run until someone else commits.
+ */
+bool
+MinimumActiveBackends(int min)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ /* Quick short-circuit if no minimum is specified */
+ if (min == 0)
+ return true;
+
+ /*
+ * Note: for speed, we don't acquire ProcArrayLock. This is a little bit
+ * bogus, but since we are only testing fields for zero or nonzero, it
+ * should be OK. The result is only used for heuristic purposes anyway...
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ /*
+ * Since we're not holding a lock, need to be prepared to deal with
+ * garbage, as someone could have incremented numProcs but not yet
+ * filled the structure.
+ *
+ * If someone just decremented numProcs, 'proc' could also point to a
+ * PGPROC entry that's no longer in the array. It still points to a
+ * PGPROC struct, though, because freed PGPROC entries just go to the
+ * free list and are recycled. Its contents are nonsense in that case,
+ * but that's acceptable for this function.
+ */
+ if (pgprocno == -1)
+ continue; /* do not count deleted entries */
+ if (proc == MyProc)
+ continue; /* do not count myself */
+ if (proc->xid == InvalidTransactionId)
+ continue; /* do not count if no XID assigned */
+ if (proc->pid == 0)
+ continue; /* do not count prepared xacts */
+ if (proc->waitLock != NULL)
+ continue; /* do not count if blocked on a lock */
+ count++;
+ if (count >= min)
+ break;
+ }
+
+ return count >= min;
+}
+
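+/*
+ * Usage sketch (hedged; the assumed caller is the commit_delay logic in the
+ * WAL flush path): only bother sleeping before the flush if at least
+ * CommitSiblings other backends look active:
+ *
+ *		if (CommitDelay > 0 && enableFsync &&
+ *			MinimumActiveBackends(CommitSiblings))
+ *			pg_usleep(CommitDelay);
+ */
+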
+/*
+ * CountDBBackends --- count backends that are using specified database
+ */
+int
+CountDBBackends(Oid databaseid)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if (proc->pid == 0)
+ continue; /* do not count prepared xacts */
+ if (!OidIsValid(databaseid) ||
+ proc->databaseId == databaseid)
+ count++;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return count;
+}
+
+/*
+ * CountDBConnections --- counts database backends ignoring any background
+ * worker processes
+ */
+int
+CountDBConnections(Oid databaseid)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if (proc->pid == 0)
+ continue; /* do not count prepared xacts */
+ if (proc->isBackgroundWorker)
+ continue; /* do not count background workers */
+ if (!OidIsValid(databaseid) ||
+ proc->databaseId == databaseid)
+ count++;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return count;
+}
+
+/*
+ * CancelDBBackends --- cancel backends that are using specified database
+ */
+void
+CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ /* tell all backends to die */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if (databaseid == InvalidOid || proc->databaseId == databaseid)
+ {
+ VirtualTransactionId procvxid;
+ pid_t pid;
+
+ GET_VXID_FROM_PGPROC(procvxid, *proc);
+
+ proc->recoveryConflictPending = conflictPending;
+ pid = proc->pid;
+ if (pid != 0)
+ {
+ /*
+ * Kill the pid if it's still here. If not, that's what we
+ * wanted so ignore any errors.
+ */
+ (void) SendProcSignal(pid, sigmode, procvxid.backendId);
+ }
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * CountUserBackends --- count backends that are used by specified user
+ */
+int
+CountUserBackends(Oid roleid)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if (proc->pid == 0)
+ continue; /* do not count prepared xacts */
+ if (proc->isBackgroundWorker)
+ continue; /* do not count background workers */
+ if (proc->roleId == roleid)
+ count++;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return count;
+}
+
+/*
+ * CountOtherDBBackends -- check for other backends running in the given DB
+ *
+ * If there are other backends in the DB, we will wait a maximum of 5 seconds
+ * for them to exit. Autovacuum backends are encouraged to exit early by
+ * sending them SIGTERM, but normal user backends are just waited for.
+ *
+ * The current backend is always ignored; it is caller's responsibility to
+ * check whether the current backend uses the given DB, if it's important.
+ *
+ * Returns true if there are (still) other backends in the DB, false if not.
+ * Also, *nbackends and *nprepared are set to the number of other backends
+ * and prepared transactions in the DB, respectively.
+ *
+ * This function is used to interlock DROP DATABASE and related commands
+ * against there being any active backends in the target DB --- dropping the
+ * DB while active backends remain would be a Bad Thing. Note that we cannot
+ * detect here the possibility of a newly-started backend that is trying to
+ * connect to the doomed database, so additional interlocking is needed during
+ * backend startup. The caller should normally hold an exclusive lock on the
+ * target DB before calling this, which is one reason we mustn't wait
+ * indefinitely.
+ */
+bool
+CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)
+{
+ ProcArrayStruct *arrayP = procArray;
+
+#define MAXAUTOVACPIDS 10 /* max autovacs to SIGTERM per iteration */
+ int autovac_pids[MAXAUTOVACPIDS];
+ int tries;
+
+ /* 50 tries with 100ms sleep between tries makes 5 sec total wait */
+ for (tries = 0; tries < 50; tries++)
+ {
+ int nautovacs = 0;
+ bool found = false;
+ int index;
+
+ CHECK_FOR_INTERRUPTS();
+
+ *nbackends = *nprepared = 0;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ uint8 statusFlags = ProcGlobal->statusFlags[index];
+
+ if (proc->databaseId != databaseId)
+ continue;
+ if (proc == MyProc)
+ continue;
+
+ found = true;
+
+ if (proc->pid == 0)
+ (*nprepared)++;
+ else
+ {
+ (*nbackends)++;
+ if ((statusFlags & PROC_IS_AUTOVACUUM) &&
+ nautovacs < MAXAUTOVACPIDS)
+ autovac_pids[nautovacs++] = proc->pid;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ if (!found)
+ return false; /* no conflicting backends, so done */
+
+ /*
+ * Send SIGTERM to any conflicting autovacuums before sleeping. We
+ * postpone this step until after the loop because we don't want to
+ * hold ProcArrayLock while issuing kill(). We have no idea what might
+ * block kill() inside the kernel...
+ */
+ for (index = 0; index < nautovacs; index++)
+ (void) kill(autovac_pids[index], SIGTERM); /* ignore any error */
+
+ /* sleep, then try again */
+ pg_usleep(100 * 1000L); /* 100ms */
+ }
+
+ return true; /* timed out, still conflicts */
+}
+
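+/*
+ * Usage sketch (hedged; DROP DATABASE in dbcommands.c is assumed to do
+ * roughly this):
+ *
+ *		int			notherbackends;
+ *		int			npreparedxacts;
+ *
+ *		if (CountOtherDBBackends(db_id, &notherbackends, &npreparedxacts))
+ *			ereport(ERROR,
+ *					(errcode(ERRCODE_OBJECT_IN_USE),
+ *					 errmsg("database \"%s\" is being accessed by other users",
+ *							dbname),
+ *					 ... detail built from notherbackends/npreparedxacts ...));
+ */
+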
+/*
+ * Terminate existing connections to the specified database. This routine
+ * is used by the DROP DATABASE command when user has asked to forcefully
+ * drop the database.
+ *
+ * The current backend is always ignored; it is caller's responsibility to
+ * check whether the current backend uses the given DB, if it's important.
+ *
+ * It refuses to terminate any connections if there is even one backend with
+ * a prepared transaction in the target database.
+ */
+void
+TerminateOtherDBBackends(Oid databaseId)
+{
+ ProcArrayStruct *arrayP = procArray;
+ List *pids = NIL;
+ int nprepared = 0;
+ int i;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (i = 0; i < procArray->numProcs; i++)
+ {
+ int pgprocno = arrayP->pgprocnos[i];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if (proc->databaseId != databaseId)
+ continue;
+ if (proc == MyProc)
+ continue;
+
+ if (proc->pid != 0)
+ pids = lappend_int(pids, proc->pid);
+ else
+ nprepared++;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ if (nprepared > 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_IN_USE),
+ errmsg("database \"%s\" is being used by prepared transactions",
+ get_database_name(databaseId)),
+ errdetail_plural("There is %d prepared transaction using the database.",
+ "There are %d prepared transactions using the database.",
+ nprepared,
+ nprepared)));
+
+ if (pids)
+ {
+ ListCell *lc;
+
+ /*
+ * Check whether we have the necessary rights to terminate other
+ * sessions. We don't terminate any session until we ensure that we
+ * have rights on all the sessions to be terminated. These checks are
+ * the same as we do in pg_terminate_backend.
+ *
+ * Unlike pg_terminate_backend, we don't raise warnings such as "PID %d
+ * is not a PostgreSQL server process", because an already-finished
+ * session is not a problem for us.
+ */
+ foreach(lc, pids)
+ {
+ int pid = lfirst_int(lc);
+ PGPROC *proc = BackendPidGetProc(pid);
+
+ if (proc != NULL)
+ {
+ /* Only allow superusers to signal superuser-owned backends. */
+ if (superuser_arg(proc->roleId) && !superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be a superuser to terminate superuser process")));
+
+ /* Users can signal backends they have role membership in. */
+ if (!has_privs_of_role(GetUserId(), proc->roleId) &&
+ !has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_BACKEND))
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be a member of the role whose process is being terminated or member of pg_signal_backend")));
+ }
+ }
+
+ /*
+ * There's a race condition here: once we release the ProcArrayLock,
+ * it's possible for the session to exit before we issue kill. That
+ * race condition possibility seems too unlikely to worry about. See
+ * pg_signal_backend.
+ */
+ foreach(lc, pids)
+ {
+ int pid = lfirst_int(lc);
+ PGPROC *proc = BackendPidGetProc(pid);
+
+ if (proc != NULL)
+ {
+ /*
+ * If we have setsid(), signal the backend's whole process
+ * group
+ */
+#ifdef HAVE_SETSID
+ (void) kill(-pid, SIGTERM);
+#else
+ (void) kill(pid, SIGTERM);
+#endif
+ }
+ }
+ }
+}
+
+/*
+ * ProcArraySetReplicationSlotXmin
+ *
+ * Install limits to future computations of the xmin horizon to prevent vacuum
+ * and HOT pruning from removing affected rows still needed by clients with
+ * replication slots.
+ */
+void
+ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin,
+ bool already_locked)
+{
+ Assert(!already_locked || LWLockHeldByMe(ProcArrayLock));
+
+ if (!already_locked)
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ procArray->replication_slot_xmin = xmin;
+ procArray->replication_slot_catalog_xmin = catalog_xmin;
+
+ if (!already_locked)
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * ProcArrayGetReplicationSlotXmin
+ *
+ * Return the current slot xmin limits. That's useful to be able to remove
+ * data that's older than those limits.
+ */
+void
+ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
+ TransactionId *catalog_xmin)
+{
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ if (xmin != NULL)
+ *xmin = procArray->replication_slot_xmin;
+
+ if (catalog_xmin != NULL)
+ *catalog_xmin = procArray->replication_slot_catalog_xmin;
+
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * XidCacheRemoveRunningXids
+ *
+ * Remove a bunch of TransactionIds from the list of known-running
+ * subtransactions for my backend. Both the specified xid and those in
+ * the xids[] array (of length nxids) are removed from the subxids cache.
+ * latestXid must be the latest XID among the group.
+ */
+void
+XidCacheRemoveRunningXids(TransactionId xid,
+ int nxids, const TransactionId *xids,
+ TransactionId latestXid)
+{
+ int i,
+ j;
+ XidCacheStatus *mysubxidstat;
+
+ Assert(TransactionIdIsValid(xid));
+
+ /*
+ * We must hold ProcArrayLock exclusively in order to remove transactions
+ * from the PGPROC array. (See src/backend/access/transam/README.) It's
+ * possible this could be relaxed since we know this routine is only used
+ * to abort subtransactions, but pending closer analysis we'd best be
+ * conservative.
+ *
+ * Note that we do not have to be careful about memory ordering of our own
+ * reads wrt. GetNewTransactionId() here - only this process can modify
+ * relevant fields of MyProc/ProcGlobal->xids[]. But we do have to be
+ * careful about our own writes being well ordered.
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ mysubxidstat = &ProcGlobal->subxidStates[MyProc->pgxactoff];
+
+ /*
+ * Under normal circumstances xid and xids[] will be in increasing order,
+ * as will be the entries in subxids. Scan backwards to avoid O(N^2)
+ * behavior when removing a lot of xids.
+ */
+ for (i = nxids - 1; i >= 0; i--)
+ {
+ TransactionId anxid = xids[i];
+
+ for (j = MyProc->subxidStatus.count - 1; j >= 0; j--)
+ {
+ if (TransactionIdEquals(MyProc->subxids.xids[j], anxid))
+ {
+ MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1];
+ pg_write_barrier();
+ mysubxidstat->count--;
+ MyProc->subxidStatus.count--;
+ break;
+ }
+ }
+
+ /*
+ * Ordinarily we should have found it, unless the cache has
+ * overflowed. However it's also possible for this routine to be
+ * invoked multiple times for the same subtransaction, in case of an
+ * error during AbortSubTransaction. So instead of Assert, emit a
+ * debug warning.
+ */
+ if (j < 0 && !MyProc->subxidStatus.overflowed)
+ elog(WARNING, "did not find subXID %u in MyProc", anxid);
+ }
+
+ for (j = MyProc->subxidStatus.count - 1; j >= 0; j--)
+ {
+ if (TransactionIdEquals(MyProc->subxids.xids[j], xid))
+ {
+ MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1];
+ pg_write_barrier();
+ mysubxidstat->count--;
+ MyProc->subxidStatus.count--;
+ break;
+ }
+ }
+ /* Ordinarily we should have found it, unless the cache has overflowed */
+ if (j < 0 && !MyProc->subxidStatus.overflowed)
+ elog(WARNING, "did not find subXID %u in MyProc", xid);
+
+ /* Also advance global latestCompletedXid while holding the lock */
+ MaintainLatestCompletedXid(latestXid);
+
+ /* ... and xactCompletionCount */
+ ShmemVariableCache->xactCompletionCount++;
+
+ LWLockRelease(ProcArrayLock);
+}
+
+#ifdef XIDCACHE_DEBUG
+
+/*
+ * Print stats about effectiveness of XID cache
+ */
+static void
+DisplayXidCache(void)
+{
+ fprintf(stderr,
+ "XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, nooflo: %ld, slow: %ld\n",
+ xc_by_recent_xmin,
+ xc_by_known_xact,
+ xc_by_my_xact,
+ xc_by_latest_xid,
+ xc_by_main_xid,
+ xc_by_child_xid,
+ xc_by_known_assigned,
+ xc_no_overflow,
+ xc_slow_answer);
+}
+#endif /* XIDCACHE_DEBUG */
+
+/*
+ * If rel != NULL, return test state appropriate for relation, otherwise
+ * return state usable for all relations. The latter may consider XIDs as
+ * not-yet-visible-to-everyone that a state for a specific relation would
+ * already consider visible-to-everyone.
+ *
+ * This needs to be called while a snapshot is active or registered, otherwise
+ * there are wraparound and other dangers.
+ *
+ * See comment for GlobalVisState for details.
+ */
+GlobalVisState *
+GlobalVisTestFor(Relation rel)
+{
+ GlobalVisState *state = NULL;
+
+ /* XXX: we should assert that a snapshot is pushed or registered */
+ Assert(RecentXmin);
+
+ switch (GlobalVisHorizonKindForRel(rel))
+ {
+ case VISHORIZON_SHARED:
+ state = &GlobalVisSharedRels;
+ break;
+ case VISHORIZON_CATALOG:
+ state = &GlobalVisCatalogRels;
+ break;
+ case VISHORIZON_DATA:
+ state = &GlobalVisDataRels;
+ break;
+ case VISHORIZON_TEMP:
+ state = &GlobalVisTempRels;
+ break;
+ }
+
+ Assert(FullTransactionIdIsValid(state->definitely_needed) &&
+ FullTransactionIdIsValid(state->maybe_needed));
+
+ return state;
+}
+
+/*
+ * Return true if it's worth updating the accurate maybe_needed boundary.
+ *
+ * As it is somewhat expensive to determine xmin horizons, we don't want to
+ * repeatedly do so when there is a low likelihood of it being beneficial.
+ *
+ * The current heuristic is that we update only if RecentXmin has changed
+ * since the last update. If the oldest currently running transaction has not
+ * finished, it is unlikely that recomputing the horizon would be useful.
+ */
+static bool
+GlobalVisTestShouldUpdate(GlobalVisState *state)
+{
+ /* hasn't been updated yet */
+ if (!TransactionIdIsValid(ComputeXidHorizonsResultLastXmin))
+ return true;
+
+ /*
+ * If the maybe_needed/definitely_needed boundaries are the same, it's
+ * unlikely to be beneficial to refresh boundaries.
+ */
+ if (FullTransactionIdFollowsOrEquals(state->maybe_needed,
+ state->definitely_needed))
+ return false;
+
+ /* does the last snapshot built have a different xmin? */
+ return RecentXmin != ComputeXidHorizonsResultLastXmin;
+}
+
+static void
+GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons)
+{
+ GlobalVisSharedRels.maybe_needed =
+ FullXidRelativeTo(horizons->latest_completed,
+ horizons->shared_oldest_nonremovable);
+ GlobalVisCatalogRels.maybe_needed =
+ FullXidRelativeTo(horizons->latest_completed,
+ horizons->catalog_oldest_nonremovable);
+ GlobalVisDataRels.maybe_needed =
+ FullXidRelativeTo(horizons->latest_completed,
+ horizons->data_oldest_nonremovable);
+ GlobalVisTempRels.maybe_needed =
+ FullXidRelativeTo(horizons->latest_completed,
+ horizons->temp_oldest_nonremovable);
+
+ /*
+ * In longer running transactions it's possible that transactions we
+ * previously needed to treat as running aren't around anymore. So update
+ * definitely_needed to not be earlier than maybe_needed.
+ */
+ GlobalVisSharedRels.definitely_needed =
+ FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed,
+ GlobalVisSharedRels.definitely_needed);
+ GlobalVisCatalogRels.definitely_needed =
+ FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed,
+ GlobalVisCatalogRels.definitely_needed);
+ GlobalVisDataRels.definitely_needed =
+ FullTransactionIdNewer(GlobalVisDataRels.maybe_needed,
+ GlobalVisDataRels.definitely_needed);
+ GlobalVisTempRels.definitely_needed = GlobalVisTempRels.maybe_needed;
+
+ ComputeXidHorizonsResultLastXmin = RecentXmin;
+}
+
+/*
+ * Update boundaries in GlobalVis{Shared,Catalog,Data}Rels
+ * using ComputeXidHorizons().
+ */
+static void
+GlobalVisUpdate(void)
+{
+ ComputeXidHorizonsResult horizons;
+
+ /* updates the horizons as a side-effect */
+ ComputeXidHorizons(&horizons);
+}
+
+/*
+ * Return true if no snapshot still considers fxid to be running.
+ *
+ * The state passed needs to have been initialized for the relation fxid is
+ * from (NULL is also OK), otherwise the result may not be correct.
+ *
+ * See comment for GlobalVisState for details.
+ */
+bool
+GlobalVisTestIsRemovableFullXid(GlobalVisState *state,
+ FullTransactionId fxid)
+{
+ /*
+ * If fxid is older than maybe_needed bound, it definitely is visible to
+ * everyone.
+ */
+ if (FullTransactionIdPrecedes(fxid, state->maybe_needed))
+ return true;
+
+ /*
+ * If fxid is >= definitely_needed bound, it is very likely to still be
+ * considered running.
+ */
+ if (FullTransactionIdFollowsOrEquals(fxid, state->definitely_needed))
+ return false;
+
+ /*
+ * fxid is between maybe_needed and definitely_needed, i.e. there might or
+ * might not exist a snapshot considering fxid running. If it makes sense,
+ * update boundaries and recheck.
+ */
+ if (GlobalVisTestShouldUpdate(state))
+ {
+ GlobalVisUpdate();
+
+ Assert(FullTransactionIdPrecedes(fxid, state->definitely_needed));
+
+ return FullTransactionIdPrecedes(fxid, state->maybe_needed);
+ }
+ else
+ return false;
+}
+
+/*
+ * Wrapper around GlobalVisTestIsRemovableFullXid() for 32bit xids.
+ *
+ * It is crucial that this only gets called for xids from a source that
+ * protects against xid wraparounds (e.g. from a table and thus protected by
+ * relfrozenxid).
+ */
+bool
+GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid)
+{
+ FullTransactionId fxid;
+
+ /*
+ * Convert 32 bit argument to FullTransactionId. We can do so safely
+ * because we know the xid has to, at the very least, be between
+ * [oldestXid, nextFullXid), i.e. within 2 billion of xid. To avoid taking
+ * a lock to determine either, we can just compare with
+ * state->definitely_needed, which was based on those value at the time
+ * the current snapshot was built.
+ */
+ fxid = FullXidRelativeTo(state->definitely_needed, xid);
+
+ return GlobalVisTestIsRemovableFullXid(state, fxid);
+}
+
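+/*
+ * Putting the GlobalVis test API together, a minimal sketch (the pruning
+ * caller shown here is illustrative, not quoted):
+ *
+ *		GlobalVisState *vistest = GlobalVisTestFor(relation);
+ *
+ *		if (GlobalVisTestIsRemovableXid(vistest, dead_tuple_xmax))
+ *			... no snapshot can still see the deleted tuple version,
+ *			so it may be removed ...
+ *
+ * The cheap path only compares against the cached [maybe_needed,
+ * definitely_needed) window; xids falling inside that window trigger a
+ * GlobalVisUpdate() recomputation, as implemented above.
+ */
+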
+/*
+ * Return FullTransactionId below which all transactions are not considered
+ * running anymore.
+ *
+ * Note: This is less efficient than testing with
+ * GlobalVisTestIsRemovableFullXid as it likely requires building an accurate
+ * cutoff, even in the case all the XIDs compared with the cutoff are outside
+ * [maybe_needed, definitely_needed).
+ */
+FullTransactionId
+GlobalVisTestNonRemovableFullHorizon(GlobalVisState *state)
+{
+ /* acquire accurate horizon if not already done */
+ if (GlobalVisTestShouldUpdate(state))
+ GlobalVisUpdate();
+
+ return state->maybe_needed;
+}
+
+/* Convenience wrapper around GlobalVisTestNonRemovableFullHorizon */
+TransactionId
+GlobalVisTestNonRemovableHorizon(GlobalVisState *state)
+{
+ FullTransactionId cutoff;
+
+ cutoff = GlobalVisTestNonRemovableFullHorizon(state);
+
+ return XidFromFullTransactionId(cutoff);
+}
+
+/*
+ * Convenience wrapper around GlobalVisTestFor() and
+ * GlobalVisTestIsRemovableFullXid(), see their comments.
+ */
+bool
+GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid)
+{
+ GlobalVisState *state;
+
+ state = GlobalVisTestFor(rel);
+
+ return GlobalVisTestIsRemovableFullXid(state, fxid);
+}
+
+/*
+ * Convenience wrapper around GlobalVisTestFor() and
+ * GlobalVisTestIsRemovableXid(), see their comments.
+ */
+bool
+GlobalVisCheckRemovableXid(Relation rel, TransactionId xid)
+{
+ GlobalVisState *state;
+
+ state = GlobalVisTestFor(rel);
+
+ return GlobalVisTestIsRemovableXid(state, xid);
+}
+
+/*
+ * Convert a 32 bit transaction id into 64 bit transaction id, by assuming it
+ * is within MaxTransactionId / 2 of XidFromFullTransactionId(rel).
+ *
+ * Be very careful about when to use this function. It can only safely be used
+ * when there is a guarantee that xid is within MaxTransactionId / 2 xids of
+ * rel. That e.g. can be guaranteed if the caller assures a snapshot is
+ * held by the backend and xid is from a table (where vacuum/freezing ensures
+ * the xid has to be within that range), or if xid is from the procarray and
+ * prevents xid wraparound that way.
+ */
+static inline FullTransactionId
+FullXidRelativeTo(FullTransactionId rel, TransactionId xid)
+{
+ TransactionId rel_xid = XidFromFullTransactionId(rel);
+
+ Assert(TransactionIdIsValid(xid));
+ Assert(TransactionIdIsValid(rel_xid));
+
+ /* not guaranteed to find issues, but likely to catch mistakes */
+ AssertTransactionIdInAllowableRange(xid);
+
+ return FullTransactionIdFromU64(U64FromFullTransactionId(rel)
+ + (int32) (xid - rel_xid));
+}
+
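+/*
+ * Worked example for FullXidRelativeTo() (a sketch): suppose rel has epoch 3
+ * and 32-bit part 0x00001000, and xid is 0xFFFFFF00. Then
+ * (int32) (xid - rel_xid) is -0x1100, so the result has epoch 2 and 32-bit
+ * part 0xFFFFFF00, i.e. the 64-bit xid just behind rel rather than nearly
+ * 4 billion ahead of it.
+ */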
+
+/* ----------------------------------------------
+ * KnownAssignedTransactionIds sub-module
+ * ----------------------------------------------
+ */
+
+/*
+ * In Hot Standby mode, we maintain a list of transactions that are (or were)
+ * running on the primary at the current point in WAL. These XIDs must be
+ * treated as running by standby transactions, even though they are not in
+ * the standby server's PGPROC array.
+ *
+ * We record all XIDs that we know have been assigned. That includes all the
+ * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have
+ * been assigned. We can deduce the existence of unobserved XIDs because we
+ * know XIDs are assigned in sequence, with no gaps. The KnownAssignedXids
+ * list expands as new XIDs are observed or inferred, and contracts when
+ * transaction completion records arrive.
+ *
+ * During hot standby we do not fret too much about the distinction between
+ * top-level XIDs and subtransaction XIDs. We store both together in the
+ * KnownAssignedXids list. In backends, this is copied into snapshots in
+ * GetSnapshotData(), taking advantage of the fact that XidInMVCCSnapshot()
+ * doesn't care about the distinction either. Subtransaction XIDs are
+ * effectively treated as top-level XIDs and in the typical case pg_subtrans
+ * links are *not* maintained (which does not affect visibility).
+ *
+ * We have room in KnownAssignedXids and in snapshots to hold maxProcs *
+ * (1 + PGPROC_MAX_CACHED_SUBXIDS) XIDs, so every primary transaction must
+ * report its subtransaction XIDs in a WAL XLOG_XACT_ASSIGNMENT record at
+ * least every PGPROC_MAX_CACHED_SUBXIDS. When we receive one of these
+ * records, we mark the subXIDs as children of the top XID in pg_subtrans,
+ * and then remove them from KnownAssignedXids. This prevents overflow of
+ * KnownAssignedXids and snapshots, at the cost that status checks for these
+ * subXIDs will take a slower path through TransactionIdIsInProgress().
+ * This means that KnownAssignedXids is not necessarily complete for subXIDs,
+ * though it should be complete for top-level XIDs; this is the same situation
+ * that holds with respect to the PGPROC entries in normal running.
+ *
+ * When we throw away subXIDs from KnownAssignedXids, we need to keep track of
+ * that, similarly to tracking overflow of a PGPROC's subxids array. We do
+ * that by remembering the lastOverflowedXid, ie the last thrown-away subXID.
+ * As long as that is within the range of interesting XIDs, we have to assume
+ * that subXIDs are missing from snapshots. (Note that subXID overflow occurs
+ * on primary when 65th subXID arrives, whereas on standby it occurs when 64th
+ * subXID arrives - that is not an error.)
+ *
+ * Should a backend on primary somehow disappear before it can write an abort
+ * record, then we just leave those XIDs in KnownAssignedXids. They actually
+ * aborted but we think they were running; the distinction is irrelevant
+ * because either way any changes done by the transaction are not visible to
+ * backends in the standby. We prune KnownAssignedXids when
+ * XLOG_RUNNING_XACTS arrives, to forestall possible overflow of the
+ * array due to such dead XIDs.
+ */
+
+/*
+ * RecordKnownAssignedTransactionIds
+ * Record the given XID in KnownAssignedXids, as well as any preceding
+ * unobserved XIDs.
+ *
+ * RecordKnownAssignedTransactionIds() should be run for *every* WAL record
+ * associated with a transaction. Must be called for each record after we
+ * have executed StartupCLOG() et al, since we must ExtendCLOG() etc..
+ *
+ * Called during recovery in analogy with and in place of GetNewTransactionId()
+ */
+void
+RecordKnownAssignedTransactionIds(TransactionId xid)
+{
+ Assert(standbyState >= STANDBY_INITIALIZED);
+ Assert(TransactionIdIsValid(xid));
+ Assert(TransactionIdIsValid(latestObservedXid));
+
+ elog(trace_recovery(DEBUG4), "record known xact %u latestObservedXid %u",
+ xid, latestObservedXid);
+
+ /*
+ * When a newly observed xid arrives, it is frequently the case that it is
+ * *not* the next xid in sequence. When this occurs, we must treat the
+ * intervening xids as running also.
+ */
+ if (TransactionIdFollows(xid, latestObservedXid))
+ {
+ TransactionId next_expected_xid;
+
+ /*
+ * Extend subtrans like we do in GetNewTransactionId() during normal
+ * operation using individual extend steps. Note that we do not need
+ * to extend clog since its extensions are WAL logged.
+ *
+ * This part has to be done regardless of standbyState since we
+ * immediately start assigning subtransactions to their toplevel
+ * transactions.
+ */
+ next_expected_xid = latestObservedXid;
+ while (TransactionIdPrecedes(next_expected_xid, xid))
+ {
+ TransactionIdAdvance(next_expected_xid);
+ ExtendSUBTRANS(next_expected_xid);
+ }
+ Assert(next_expected_xid == xid);
+
+ /*
+ * If the KnownAssignedXids machinery isn't up yet, there's nothing
+ * more to do since we don't track assigned xids yet.
+ */
+ if (standbyState <= STANDBY_INITIALIZED)
+ {
+ latestObservedXid = xid;
+ return;
+ }
+
+ /*
+ * Add (latestObservedXid, xid] onto the KnownAssignedXids array.
+ */
+ next_expected_xid = latestObservedXid;
+ TransactionIdAdvance(next_expected_xid);
+ KnownAssignedXidsAdd(next_expected_xid, xid, false);
+
+ /*
+ * Now we can advance latestObservedXid
+ */
+ latestObservedXid = xid;
+
+ /* ShmemVariableCache->nextXid must be beyond any observed xid */
+ AdvanceNextFullTransactionIdPastXid(latestObservedXid);
+ next_expected_xid = latestObservedXid;
+ TransactionIdAdvance(next_expected_xid);
+ }
+}
+
+/*
+ * ExpireTreeKnownAssignedTransactionIds
+ * Remove the given XIDs from KnownAssignedXids.
+ *
+ * Called during recovery in analogy with and in place of ProcArrayEndTransaction()
+ */
+void
+ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids,
+ TransactionId *subxids, TransactionId max_xid)
+{
+ Assert(standbyState >= STANDBY_INITIALIZED);
+
+ /*
+ * Uses same locking as transaction commit
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ KnownAssignedXidsRemoveTree(xid, nsubxids, subxids);
+
+ /* As in ProcArrayEndTransaction, advance latestCompletedXid */
+ MaintainLatestCompletedXidRecovery(max_xid);
+
+ /* ... and xactCompletionCount */
+ ShmemVariableCache->xactCompletionCount++;
+
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * ExpireAllKnownAssignedTransactionIds
+ * Remove all entries in KnownAssignedXids and reset lastOverflowedXid.
+ */
+void
+ExpireAllKnownAssignedTransactionIds(void)
+{
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ KnownAssignedXidsRemovePreceding(InvalidTransactionId);
+
+ /*
+	 * Reset lastOverflowedXid.  Currently, lastOverflowedXid has no use after
+	 * this function has been called, but we reset it anyway for consistency
+	 * with what ExpireOldKnownAssignedTransactionIds() does.
+ */
+ procArray->lastOverflowedXid = InvalidTransactionId;
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * ExpireOldKnownAssignedTransactionIds
+ * Remove KnownAssignedXids entries preceding the given XID and
+ * potentially reset lastOverflowedXid.
+ */
+void
+ExpireOldKnownAssignedTransactionIds(TransactionId xid)
+{
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+	 * Reset lastOverflowedXid if we know that all transactions that might
+	 * have been running have now completed.  Not doing so could leave behind
+	 * a stale lastOverflowedXid value, causing extra snapshots to be marked
+	 * as suboverflowed.
+ */
+ if (TransactionIdPrecedes(procArray->lastOverflowedXid, xid))
+ procArray->lastOverflowedXid = InvalidTransactionId;
+ KnownAssignedXidsRemovePreceding(xid);
+ LWLockRelease(ProcArrayLock);
+}
+
+
+/*
+ * Private module functions to manipulate KnownAssignedXids
+ *
+ * There are 5 main uses of the KnownAssignedXids data structure:
+ *
+ * * backends taking snapshots - all valid XIDs need to be copied out
+ * * backends seeking to determine presence of a specific XID
+ * * startup process adding new known-assigned XIDs
+ * * startup process removing specific XIDs as transactions end
+ * * startup process pruning array when special WAL records arrive
+ *
+ * This data structure is known to be a hot spot during Hot Standby, so we
+ * go to some lengths to make these operations as efficient and as concurrent
+ * as possible.
+ *
+ * The XIDs are stored in an array in sorted order --- TransactionIdPrecedes
+ * order, to be exact --- to allow binary search for specific XIDs. Note:
+ * in general TransactionIdPrecedes would not provide a total order, but
+ * we know that the entries present at any instant should not extend across
+ * a large enough fraction of XID space to wrap around (the primary would
+ * shut down for fear of XID wrap long before that happens). So it's OK to
+ * use TransactionIdPrecedes as a binary-search comparator.
+ *
+ * It's cheap to maintain the sortedness during insertions, since new known
+ * XIDs are always reported in XID order; we just append them at the right.
+ *
+ * To keep individual deletions cheap, we need to allow gaps in the array.
+ * This is implemented by marking array elements as valid or invalid using
+ * the parallel boolean array KnownAssignedXidsValid[]. A deletion is done
+ * by setting KnownAssignedXidsValid[i] to false, *without* clearing the
+ * XID entry itself. This preserves the property that the XID entries are
+ * sorted, so we can do binary searches easily. Periodically we compress
+ * out the unused entries; that's much cheaper than having to compress the
+ * array immediately on every deletion.
+ *
+ * The actually valid items in KnownAssignedXids[] and KnownAssignedXidsValid[]
+ * are those with indexes tail <= i < head; items outside this subscript range
+ * have unspecified contents. When head reaches the end of the array, we
+ * force compression of unused entries rather than wrapping around, since
+ * allowing wraparound would greatly complicate the search logic. We maintain
+ * an explicit tail pointer so that pruning of old XIDs can be done without
+ * immediately moving the array contents. In most cases only a small fraction
+ * of the array contains valid entries at any instant.
+ *
+ * Although only the startup process can ever change the KnownAssignedXids
+ * data structure, we still need interlocking so that standby backends will
+ * not observe invalid intermediate states. The convention is that backends
+ * must hold shared ProcArrayLock to examine the array. To remove XIDs from
+ * the array, the startup process must hold ProcArrayLock exclusively, for
+ * the usual transactional reasons (compare commit/abort of a transaction
+ * during normal running). Compressing unused entries out of the array
+ * likewise requires exclusive lock. To add XIDs to the array, we just insert
+ * them into slots to the right of the head pointer and then advance the head
+ * pointer. This wouldn't require any lock at all, except that on machines
+ * with weak memory ordering we need to be careful that other processors
+ * see the array element changes before they see the head pointer change.
+ * We handle this by using a spinlock to protect reads and writes of the
+ * head/tail pointers. (We could dispense with the spinlock if we were to
+ * create suitable memory access barrier primitives and use those instead.)
+ * The spinlock must be taken to read or write the head/tail pointers unless
+ * the caller holds ProcArrayLock exclusively.
+ *
+ * Algorithmic analysis:
+ *
+ * If we have a maximum of M slots, with N XIDs currently spread across
+ * S elements then we have N <= S <= M always.
+ *
+ * * Adding a new XID is O(1) and needs little locking (unless compression
+ * must happen)
+ * * Compressing the array is O(S) and requires exclusive lock
+ * * Removing an XID is O(logS) and requires exclusive lock
+ * * Taking a snapshot is O(S) and requires shared lock
+ * * Checking for an XID is O(logS) and requires shared lock
+ *
+ * In comparison, using a hash table for KnownAssignedXids would mean that
+ * taking snapshots would be O(M). If we can maintain S << M then the
+ * sorted array technique will deliver significantly faster snapshots.
+ * If we try to keep S too small then we will spend too much time compressing,
+ * so there is an optimal point for any workload mix. We use a heuristic to
+ * decide when to compress the array, though trimming also helps reduce
+ * frequency of compressing. The heuristic requires us to track the number of
+ * currently valid XIDs in the array.
+ */
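+
+/*
+ * Illustrative sketch only (guarded out; not part of the upstream code, and
+ * the names are invented): a lookup against the layout described above.  The
+ * XID array stays sorted even after deletions, because deleting an entry only
+ * clears its slot in the parallel validity array; a binary search can
+ * therefore ignore validity until it has located the slot.
+ */
+#ifdef NOT_USED
+static bool
+sketch_sorted_xid_lookup(const TransactionId *xids, const bool *valid,
+						 int tail, int head, TransactionId xid)
+{
+	int			lo = tail;
+	int			hi = head - 1;
+
+	while (lo <= hi)
+	{
+		int			mid = lo + (hi - lo) / 2;
+
+		if (xids[mid] == xid)
+			return valid[mid];	/* slot found; report it only if still valid */
+		else if (TransactionIdPrecedes(xid, xids[mid]))
+			hi = mid - 1;
+		else
+			lo = mid + 1;
+	}
+	return false;				/* xid is not present between tail and head */
+}
+#endif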
+
+
+/*
+ * Compress KnownAssignedXids by shifting valid data down to the start of the
+ * array, removing any gaps.
+ *
+ * A compression step is forced if "force" is true, otherwise we do it
+ * only if a heuristic indicates it's a good time to do it.
+ *
+ * Caller must hold ProcArrayLock in exclusive mode.
+ */
+static void
+KnownAssignedXidsCompress(bool force)
+{
+ ProcArrayStruct *pArray = procArray;
+ int head,
+ tail;
+ int compress_index;
+ int i;
+
+ /* no spinlock required since we hold ProcArrayLock exclusively */
+ head = pArray->headKnownAssignedXids;
+ tail = pArray->tailKnownAssignedXids;
+
+ if (!force)
+ {
+ /*
+ * If we can choose how much to compress, use a heuristic to avoid
+ * compressing too often or not often enough.
+ *
+ * Heuristic is if we have a large enough current spread and less than
+ * 50% of the elements are currently in use, then compress. This
+		 * should ensure we compress fairly infrequently.  We could compress
+		 * less often, but then the virtual array would spread out more and
+		 * snapshots would become more expensive.
+ */
+ int nelements = head - tail;
+
+ if (nelements < 4 * PROCARRAY_MAXPROCS ||
+ nelements < 2 * pArray->numKnownAssignedXids)
+ return;
+ }
+
+ /*
+ * We compress the array by reading the valid values from tail to head,
+ * re-aligning data to 0th element.
+ */
+ compress_index = 0;
+ for (i = tail; i < head; i++)
+ {
+ if (KnownAssignedXidsValid[i])
+ {
+ KnownAssignedXids[compress_index] = KnownAssignedXids[i];
+ KnownAssignedXidsValid[compress_index] = true;
+ compress_index++;
+ }
+ }
+
+ pArray->tailKnownAssignedXids = 0;
+ pArray->headKnownAssignedXids = compress_index;
+}
+
+/*
+ * Add xids into KnownAssignedXids at the head of the array.
+ *
+ * xids from from_xid to to_xid, inclusive, are added to the array.
+ *
+ * If exclusive_lock is true then caller already holds ProcArrayLock in
+ * exclusive mode, so we need no extra locking here. Else caller holds no
+ * lock, so we need to be sure we maintain sufficient interlocks against
+ * concurrent readers. (Only the startup process ever calls this, so no need
+ * to worry about concurrent writers.)
+ */
+static void
+KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
+ bool exclusive_lock)
+{
+ ProcArrayStruct *pArray = procArray;
+ TransactionId next_xid;
+ int head,
+ tail;
+ int nxids;
+ int i;
+
+ Assert(TransactionIdPrecedesOrEquals(from_xid, to_xid));
+
+ /*
+ * Calculate how many array slots we'll need. Normally this is cheap; in
+ * the unusual case where the XIDs cross the wrap point, we do it the hard
+ * way.
+ */
+ if (to_xid >= from_xid)
+ nxids = to_xid - from_xid + 1;
+ else
+ {
+ nxids = 1;
+ next_xid = from_xid;
+ while (TransactionIdPrecedes(next_xid, to_xid))
+ {
+ nxids++;
+ TransactionIdAdvance(next_xid);
+ }
+ }
+
+ /*
+ * Since only the startup process modifies the head/tail pointers, we
+ * don't need a lock to read them here.
+ */
+ head = pArray->headKnownAssignedXids;
+ tail = pArray->tailKnownAssignedXids;
+
+ Assert(head >= 0 && head <= pArray->maxKnownAssignedXids);
+ Assert(tail >= 0 && tail < pArray->maxKnownAssignedXids);
+
+ /*
+ * Verify that insertions occur in TransactionId sequence. Note that even
+ * if the last existing element is marked invalid, it must still have a
+ * correctly sequenced XID value.
+ */
+ if (head > tail &&
+ TransactionIdFollowsOrEquals(KnownAssignedXids[head - 1], from_xid))
+ {
+ KnownAssignedXidsDisplay(LOG);
+ elog(ERROR, "out-of-order XID insertion in KnownAssignedXids");
+ }
+
+ /*
+ * If our xids won't fit in the remaining space, compress out free space
+ */
+ if (head + nxids > pArray->maxKnownAssignedXids)
+ {
+ /* must hold lock to compress */
+ if (!exclusive_lock)
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ KnownAssignedXidsCompress(true);
+
+ head = pArray->headKnownAssignedXids;
+ /* note: we no longer care about the tail pointer */
+
+ if (!exclusive_lock)
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * If it still won't fit then we're out of memory
+ */
+ if (head + nxids > pArray->maxKnownAssignedXids)
+ elog(ERROR, "too many KnownAssignedXids");
+ }
+
+ /* Now we can insert the xids into the space starting at head */
+ next_xid = from_xid;
+ for (i = 0; i < nxids; i++)
+ {
+ KnownAssignedXids[head] = next_xid;
+ KnownAssignedXidsValid[head] = true;
+ TransactionIdAdvance(next_xid);
+ head++;
+ }
+
+ /* Adjust count of number of valid entries */
+ pArray->numKnownAssignedXids += nxids;
+
+ /*
+ * Now update the head pointer. We use a spinlock to protect this
+ * pointer, not because the update is likely to be non-atomic, but to
+ * ensure that other processors see the above array updates before they
+ * see the head pointer change.
+ *
+ * If we're holding ProcArrayLock exclusively, there's no need to take the
+ * spinlock.
+ */
+ if (exclusive_lock)
+ pArray->headKnownAssignedXids = head;
+ else
+ {
+ SpinLockAcquire(&pArray->known_assigned_xids_lck);
+ pArray->headKnownAssignedXids = head;
+ SpinLockRelease(&pArray->known_assigned_xids_lck);
+ }
+}
+
+/*
+ * KnownAssignedXidsSearch
+ *
+ * Searches KnownAssignedXids for a specific xid and optionally removes it.
+ * Returns true if it was found, false if not.
+ *
+ * Caller must hold ProcArrayLock in shared or exclusive mode.
+ * Exclusive lock must be held for remove = true.
+ */
+static bool
+KnownAssignedXidsSearch(TransactionId xid, bool remove)
+{
+ ProcArrayStruct *pArray = procArray;
+ int first,
+ last;
+ int head;
+ int tail;
+ int result_index = -1;
+
+ if (remove)
+ {
+ /* we hold ProcArrayLock exclusively, so no need for spinlock */
+ tail = pArray->tailKnownAssignedXids;
+ head = pArray->headKnownAssignedXids;
+ }
+ else
+ {
+ /* take spinlock to ensure we see up-to-date array contents */
+ SpinLockAcquire(&pArray->known_assigned_xids_lck);
+ tail = pArray->tailKnownAssignedXids;
+ head = pArray->headKnownAssignedXids;
+ SpinLockRelease(&pArray->known_assigned_xids_lck);
+ }
+
+ /*
+ * Standard binary search. Note we can ignore the KnownAssignedXidsValid
+ * array here, since even invalid entries will contain sorted XIDs.
+ */
+ first = tail;
+ last = head - 1;
+ while (first <= last)
+ {
+ int mid_index;
+ TransactionId mid_xid;
+
+ mid_index = (first + last) / 2;
+ mid_xid = KnownAssignedXids[mid_index];
+
+ if (xid == mid_xid)
+ {
+ result_index = mid_index;
+ break;
+ }
+ else if (TransactionIdPrecedes(xid, mid_xid))
+ last = mid_index - 1;
+ else
+ first = mid_index + 1;
+ }
+
+ if (result_index < 0)
+ return false; /* not in array */
+
+ if (!KnownAssignedXidsValid[result_index])
+ return false; /* in array, but invalid */
+
+ if (remove)
+ {
+ KnownAssignedXidsValid[result_index] = false;
+
+ pArray->numKnownAssignedXids--;
+ Assert(pArray->numKnownAssignedXids >= 0);
+
+ /*
+ * If we're removing the tail element then advance tail pointer over
+ * any invalid elements. This will speed future searches.
+ */
+ if (result_index == tail)
+ {
+ tail++;
+ while (tail < head && !KnownAssignedXidsValid[tail])
+ tail++;
+ if (tail >= head)
+ {
+ /* Array is empty, so we can reset both pointers */
+ pArray->headKnownAssignedXids = 0;
+ pArray->tailKnownAssignedXids = 0;
+ }
+ else
+ {
+ pArray->tailKnownAssignedXids = tail;
+ }
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Is the specified XID present in KnownAssignedXids[]?
+ *
+ * Caller must hold ProcArrayLock in shared or exclusive mode.
+ */
+static bool
+KnownAssignedXidExists(TransactionId xid)
+{
+ Assert(TransactionIdIsValid(xid));
+
+ return KnownAssignedXidsSearch(xid, false);
+}
+
+/*
+ * Remove the specified XID from KnownAssignedXids[].
+ *
+ * Caller must hold ProcArrayLock in exclusive mode.
+ */
+static void
+KnownAssignedXidsRemove(TransactionId xid)
+{
+ Assert(TransactionIdIsValid(xid));
+
+ elog(trace_recovery(DEBUG4), "remove KnownAssignedXid %u", xid);
+
+ /*
+ * Note: we cannot consider it an error to remove an XID that's not
+ * present. We intentionally remove subxact IDs while processing
+ * XLOG_XACT_ASSIGNMENT, to avoid array overflow. Then those XIDs will be
+ * removed again when the top-level xact commits or aborts.
+ *
+ * It might be possible to track such XIDs to distinguish this case from
+ * actual errors, but it would be complicated and probably not worth it.
+ * So, just ignore the search result.
+ */
+ (void) KnownAssignedXidsSearch(xid, true);
+}
+
+/*
+ * KnownAssignedXidsRemoveTree
+ * Remove xid (if it's not InvalidTransactionId) and all the subxids.
+ *
+ * Caller must hold ProcArrayLock in exclusive mode.
+ */
+static void
+KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
+ TransactionId *subxids)
+{
+ int i;
+
+ if (TransactionIdIsValid(xid))
+ KnownAssignedXidsRemove(xid);
+
+ for (i = 0; i < nsubxids; i++)
+ KnownAssignedXidsRemove(subxids[i]);
+
+ /* Opportunistically compress the array */
+ KnownAssignedXidsCompress(false);
+}
+
+/*
+ * Prune KnownAssignedXids up to, but *not* including xid. If xid is invalid
+ * then clear the whole table.
+ *
+ * Caller must hold ProcArrayLock in exclusive mode.
+ */
+static void
+KnownAssignedXidsRemovePreceding(TransactionId removeXid)
+{
+ ProcArrayStruct *pArray = procArray;
+ int count = 0;
+ int head,
+ tail,
+ i;
+
+ if (!TransactionIdIsValid(removeXid))
+ {
+ elog(trace_recovery(DEBUG4), "removing all KnownAssignedXids");
+ pArray->numKnownAssignedXids = 0;
+ pArray->headKnownAssignedXids = pArray->tailKnownAssignedXids = 0;
+ return;
+ }
+
+ elog(trace_recovery(DEBUG4), "prune KnownAssignedXids to %u", removeXid);
+
+ /*
+ * Mark entries invalid starting at the tail. Since array is sorted, we
+ * can stop as soon as we reach an entry >= removeXid.
+ */
+ tail = pArray->tailKnownAssignedXids;
+ head = pArray->headKnownAssignedXids;
+
+ for (i = tail; i < head; i++)
+ {
+ if (KnownAssignedXidsValid[i])
+ {
+ TransactionId knownXid = KnownAssignedXids[i];
+
+ if (TransactionIdFollowsOrEquals(knownXid, removeXid))
+ break;
+
+ if (!StandbyTransactionIdIsPrepared(knownXid))
+ {
+ KnownAssignedXidsValid[i] = false;
+ count++;
+ }
+ }
+ }
+
+ pArray->numKnownAssignedXids -= count;
+ Assert(pArray->numKnownAssignedXids >= 0);
+
+ /*
+ * Advance the tail pointer if we've marked the tail item invalid.
+ */
+ for (i = tail; i < head; i++)
+ {
+ if (KnownAssignedXidsValid[i])
+ break;
+ }
+ if (i >= head)
+ {
+ /* Array is empty, so we can reset both pointers */
+ pArray->headKnownAssignedXids = 0;
+ pArray->tailKnownAssignedXids = 0;
+ }
+ else
+ {
+ pArray->tailKnownAssignedXids = i;
+ }
+
+ /* Opportunistically compress the array */
+ KnownAssignedXidsCompress(false);
+}
+
+/*
+ * KnownAssignedXidsGet - Get an array of xids by scanning KnownAssignedXids.
+ * We filter out anything >= xmax.
+ *
+ * Returns the number of XIDs stored into xarray[]. Caller is responsible
+ * that array is large enough.
+ *
+ * Caller must hold ProcArrayLock in (at least) shared mode.
+ */
+static int
+KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax)
+{
+ TransactionId xtmp = InvalidTransactionId;
+
+ return KnownAssignedXidsGetAndSetXmin(xarray, &xtmp, xmax);
+}
+
+/*
+ * KnownAssignedXidsGetAndSetXmin - as KnownAssignedXidsGet, plus
+ * we reduce *xmin to the lowest xid value seen if not already lower.
+ *
+ * Caller must hold ProcArrayLock in (at least) shared mode.
+ */
+static int
+KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
+ TransactionId xmax)
+{
+ int count = 0;
+ int head,
+ tail;
+ int i;
+
+ /*
+ * Fetch head just once, since it may change while we loop. We can stop
+ * once we reach the initially seen head, since we are certain that an xid
+ * cannot enter and then leave the array while we hold ProcArrayLock. We
+ * might miss newly-added xids, but they should be >= xmax so irrelevant
+ * anyway.
+ *
+ * Must take spinlock to ensure we see up-to-date array contents.
+ */
+ SpinLockAcquire(&procArray->known_assigned_xids_lck);
+ tail = procArray->tailKnownAssignedXids;
+ head = procArray->headKnownAssignedXids;
+ SpinLockRelease(&procArray->known_assigned_xids_lck);
+
+ for (i = tail; i < head; i++)
+ {
+ /* Skip any gaps in the array */
+ if (KnownAssignedXidsValid[i])
+ {
+ TransactionId knownXid = KnownAssignedXids[i];
+
+ /*
+ * Update xmin if required. Only the first XID need be checked,
+ * since the array is sorted.
+ */
+ if (count == 0 &&
+ TransactionIdPrecedes(knownXid, *xmin))
+ *xmin = knownXid;
+
+ /*
+ * Filter out anything >= xmax, again relying on sorted property
+ * of array.
+ */
+ if (TransactionIdIsValid(xmax) &&
+ TransactionIdFollowsOrEquals(knownXid, xmax))
+ break;
+
+ /* Add knownXid into output array */
+ xarray[count++] = knownXid;
+ }
+ }
+
+ return count;
+}
+
+/*
+ * Get oldest XID in the KnownAssignedXids array, or InvalidTransactionId
+ * if nothing there.
+ */
+static TransactionId
+KnownAssignedXidsGetOldestXmin(void)
+{
+ int head,
+ tail;
+ int i;
+
+ /*
+ * Fetch head just once, since it may change while we loop.
+ */
+ SpinLockAcquire(&procArray->known_assigned_xids_lck);
+ tail = procArray->tailKnownAssignedXids;
+ head = procArray->headKnownAssignedXids;
+ SpinLockRelease(&procArray->known_assigned_xids_lck);
+
+ for (i = tail; i < head; i++)
+ {
+ /* Skip any gaps in the array */
+ if (KnownAssignedXidsValid[i])
+ return KnownAssignedXids[i];
+ }
+
+ return InvalidTransactionId;
+}
+
+/*
+ * Display KnownAssignedXids to provide debug trail
+ *
+ * Currently this is only called within startup process, so we need no
+ * special locking.
+ *
+ * Note this is pretty expensive, and much of the expense will be incurred
+ * even if the elog message ends up being discarded.  It's not currently
+ * called in any performance-critical places, however, so there is no need
+ * to be tense about it.
+ */
+static void
+KnownAssignedXidsDisplay(int trace_level)
+{
+ ProcArrayStruct *pArray = procArray;
+ StringInfoData buf;
+ int head,
+ tail,
+ i;
+ int nxids = 0;
+
+ tail = pArray->tailKnownAssignedXids;
+ head = pArray->headKnownAssignedXids;
+
+ initStringInfo(&buf);
+
+ for (i = tail; i < head; i++)
+ {
+ if (KnownAssignedXidsValid[i])
+ {
+ nxids++;
+ appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]);
+ }
+ }
+
+ elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s",
+ nxids,
+ pArray->numKnownAssignedXids,
+ pArray->tailKnownAssignedXids,
+ pArray->headKnownAssignedXids,
+ buf.data);
+
+ pfree(buf.data);
+}
+
+/*
+ * KnownAssignedXidsReset
+ * Resets KnownAssignedXids to be empty
+ */
+static void
+KnownAssignedXidsReset(void)
+{
+ ProcArrayStruct *pArray = procArray;
+
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ pArray->numKnownAssignedXids = 0;
+ pArray->tailKnownAssignedXids = 0;
+ pArray->headKnownAssignedXids = 0;
+
+ LWLockRelease(ProcArrayLock);
+}
diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c
new file mode 100644
index 0000000..defb75a
--- /dev/null
+++ b/src/backend/storage/ipc/procsignal.c
@@ -0,0 +1,685 @@
+/*-------------------------------------------------------------------------
+ *
+ * procsignal.c
+ * Routines for interprocess signaling
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/procsignal.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+
+#include "access/parallel.h"
+#include "port/pg_bitutils.h"
+#include "commands/async.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "replication/walsender.h"
+#include "storage/condition_variable.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "storage/sinval.h"
+#include "tcop/tcopprot.h"
+#include "utils/memutils.h"
+
+/*
+ * The SIGUSR1 signal is multiplexed to support signaling multiple event
+ * types. The specific reason is communicated via flags in shared memory.
+ * We keep a boolean flag for each possible "reason", so that different
+ * reasons can be signaled to a process concurrently. (However, if the same
+ * reason is signaled more than once nearly simultaneously, the process may
+ * observe it only once.)
+ *
+ * Each process that wants to receive signals registers its process ID
+ * in the ProcSignalSlots array. The array is indexed by backend ID to make
+ * slot allocation simple, and to avoid having to search the array when you
+ * know the backend ID of the process you're signaling. (We do support
+ * signaling without backend ID, but it's a bit less efficient.)
+ *
+ * The flags are actually declared as "volatile sig_atomic_t" for maximum
+ * portability. This should ensure that loads and stores of the flag
+ * values are atomic, allowing us to dispense with any explicit locking.
+ *
+ * pss_signalFlags are intended to be set in cases where we don't need to
+ * keep track of whether or not the target process has handled the signal,
+ * but sometimes we need confirmation, as when making a global state change
+ * that cannot be considered complete until all backends have taken notice
+ * of it. For such use cases, we set a bit in pss_barrierCheckMask and then
+ * increment the current "barrier generation"; when the new barrier generation
+ * (or greater) appears in the pss_barrierGeneration flag of every process,
+ * we know that the message has been received everywhere.
+ */
+typedef struct
+{
+ volatile pid_t pss_pid;
+ volatile sig_atomic_t pss_signalFlags[NUM_PROCSIGNALS];
+ pg_atomic_uint64 pss_barrierGeneration;
+ pg_atomic_uint32 pss_barrierCheckMask;
+ ConditionVariable pss_barrierCV;
+} ProcSignalSlot;
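+
+/*
+ * Illustrative sketch only (guarded out; the helper name is invented): the
+ * barrier handshake described above reduces to comparing two monotonically
+ * increasing counters.  A slot has absorbed barrier generation "gen" once the
+ * generation it advertises is at least that large.
+ */
+#ifdef NOT_USED
+static bool
+sketch_slot_has_absorbed(ProcSignalSlot *slot, uint64 gen)
+{
+	/* pss_barrierGeneration only ever moves forward, so one read is enough */
+	return pg_atomic_read_u64(&slot->pss_barrierGeneration) >= gen;
+}
+#endif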
+
+/*
+ * Information that is global to the entire ProcSignal system can be stored
+ * here.
+ *
+ * psh_barrierGeneration is the highest barrier generation in existence.
+ */
+typedef struct
+{
+ pg_atomic_uint64 psh_barrierGeneration;
+ ProcSignalSlot psh_slot[FLEXIBLE_ARRAY_MEMBER];
+} ProcSignalHeader;
+
+/*
+ * We reserve a slot for each possible BackendId, plus one for each
+ * possible auxiliary process type. (This scheme assumes there is not
+ * more than one of any auxiliary process type at a time.)
+ */
+#define NumProcSignalSlots (MaxBackends + NUM_AUXPROCTYPES)
+
+/* Check whether the relevant type bit is set in the flags. */
+#define BARRIER_SHOULD_CHECK(flags, type) \
+ (((flags) & (((uint32) 1) << (uint32) (type))) != 0)
+
+/* Clear the relevant type bit from the flags. */
+#define BARRIER_CLEAR_BIT(flags, type) \
+ ((flags) &= ~(((uint32) 1) << (uint32) (type)))
+
+static ProcSignalHeader *ProcSignal = NULL;
+static ProcSignalSlot *MyProcSignalSlot = NULL;
+
+static bool CheckProcSignal(ProcSignalReason reason);
+static void CleanupProcSignalState(int status, Datum arg);
+static void ResetProcSignalBarrierBits(uint32 flags);
+static bool ProcessBarrierPlaceholder(void);
+
+/*
+ * ProcSignalShmemSize
+ * Compute space needed for procsignal's shared memory
+ */
+Size
+ProcSignalShmemSize(void)
+{
+ Size size;
+
+ size = mul_size(NumProcSignalSlots, sizeof(ProcSignalSlot));
+ size = add_size(size, offsetof(ProcSignalHeader, psh_slot));
+ return size;
+}
+
+/*
+ * ProcSignalShmemInit
+ * Allocate and initialize procsignal's shared memory
+ */
+void
+ProcSignalShmemInit(void)
+{
+ Size size = ProcSignalShmemSize();
+ bool found;
+
+ ProcSignal = (ProcSignalHeader *)
+ ShmemInitStruct("ProcSignal", size, &found);
+
+ /* If we're first, initialize. */
+ if (!found)
+ {
+ int i;
+
+ pg_atomic_init_u64(&ProcSignal->psh_barrierGeneration, 0);
+
+ for (i = 0; i < NumProcSignalSlots; ++i)
+ {
+ ProcSignalSlot *slot = &ProcSignal->psh_slot[i];
+
+ slot->pss_pid = 0;
+ MemSet(slot->pss_signalFlags, 0, sizeof(slot->pss_signalFlags));
+ pg_atomic_init_u64(&slot->pss_barrierGeneration, PG_UINT64_MAX);
+ pg_atomic_init_u32(&slot->pss_barrierCheckMask, 0);
+ ConditionVariableInit(&slot->pss_barrierCV);
+ }
+ }
+}
+
+/*
+ * ProcSignalInit
+ * Register the current process in the procsignal array
+ *
+ * The passed index should be my BackendId if the process has one,
+ * or MaxBackends + aux process type if not.
+ */
+void
+ProcSignalInit(int pss_idx)
+{
+ ProcSignalSlot *slot;
+ uint64 barrier_generation;
+
+ Assert(pss_idx >= 1 && pss_idx <= NumProcSignalSlots);
+
+ slot = &ProcSignal->psh_slot[pss_idx - 1];
+
+ /* sanity check */
+ if (slot->pss_pid != 0)
+ elog(LOG, "process %d taking over ProcSignal slot %d, but it's not empty",
+ MyProcPid, pss_idx);
+
+ /* Clear out any leftover signal reasons */
+ MemSet(slot->pss_signalFlags, 0, NUM_PROCSIGNALS * sizeof(sig_atomic_t));
+
+ /*
+ * Initialize barrier state. Since we're a brand-new process, there
+ * shouldn't be any leftover backend-private state that needs to be
+ * updated. Therefore, we can broadcast the latest barrier generation and
+ * disregard any previously-set check bits.
+ *
+ * NB: This only works if this initialization happens early enough in the
+ * startup sequence that we haven't yet cached any state that might need
+ * to be invalidated. That's also why we have a memory barrier here, to be
+ * sure that any later reads of memory happen strictly after this.
+ */
+ pg_atomic_write_u32(&slot->pss_barrierCheckMask, 0);
+ barrier_generation =
+ pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration);
+ pg_atomic_write_u64(&slot->pss_barrierGeneration, barrier_generation);
+ pg_memory_barrier();
+
+ /* Mark slot with my PID */
+ slot->pss_pid = MyProcPid;
+
+ /* Remember slot location for CheckProcSignal */
+ MyProcSignalSlot = slot;
+
+ /* Set up to release the slot on process exit */
+ on_shmem_exit(CleanupProcSignalState, Int32GetDatum(pss_idx));
+}
+
+/*
+ * CleanupProcSignalState
+ * Remove current process from ProcSignal mechanism
+ *
+ * This function is called via on_shmem_exit() during backend shutdown.
+ */
+static void
+CleanupProcSignalState(int status, Datum arg)
+{
+ int pss_idx = DatumGetInt32(arg);
+ ProcSignalSlot *slot;
+
+ slot = &ProcSignal->psh_slot[pss_idx - 1];
+ Assert(slot == MyProcSignalSlot);
+
+ /*
+ * Clear MyProcSignalSlot, so that a SIGUSR1 received after this point
+ * won't try to access it after it's no longer ours (and perhaps even
+ * after we've unmapped the shared memory segment).
+ */
+ MyProcSignalSlot = NULL;
+
+ /* sanity check */
+ if (slot->pss_pid != MyProcPid)
+ {
+ /*
+		 * Don't ERROR here.  We're exiting anyway, and don't want to get into
+		 * an infinite loop trying to exit.
+ */
+ elog(LOG, "process %d releasing ProcSignal slot %d, but it contains %d",
+ MyProcPid, pss_idx, (int) slot->pss_pid);
+ return; /* XXX better to zero the slot anyway? */
+ }
+
+ /*
+ * Make this slot look like it's absorbed all possible barriers, so that
+ * no barrier waits block on it.
+ */
+ pg_atomic_write_u64(&slot->pss_barrierGeneration, PG_UINT64_MAX);
+ ConditionVariableBroadcast(&slot->pss_barrierCV);
+
+ slot->pss_pid = 0;
+}
+
+/*
+ * SendProcSignal
+ * Send a signal to a Postgres process
+ *
+ * Providing backendId is optional, but it will speed up the operation.
+ *
+ * On success (a signal was sent), zero is returned.
+ * On error, -1 is returned, and errno is set (typically to ESRCH or EPERM).
+ *
+ * Not to be confused with ProcSendSignal
+ */
+int
+SendProcSignal(pid_t pid, ProcSignalReason reason, BackendId backendId)
+{
+ volatile ProcSignalSlot *slot;
+
+ if (backendId != InvalidBackendId)
+ {
+ slot = &ProcSignal->psh_slot[backendId - 1];
+
+ /*
+ * Note: Since there's no locking, it's possible that the target
+ * process detaches from shared memory and exits right after this
+ * test, before we set the flag and send signal. And the signal slot
+ * might even be recycled by a new process, so it's remotely possible
+ * that we set a flag for a wrong process. That's OK, all the signals
+ * are such that no harm is done if they're mistakenly fired.
+ */
+ if (slot->pss_pid == pid)
+ {
+ /* Atomically set the proper flag */
+ slot->pss_signalFlags[reason] = true;
+ /* Send signal */
+ return kill(pid, SIGUSR1);
+ }
+ }
+ else
+ {
+ /*
+ * BackendId not provided, so search the array using pid. We search
+ * the array back to front so as to reduce search overhead. Passing
+ * InvalidBackendId means that the target is most likely an auxiliary
+ * process, which will have a slot near the end of the array.
+ */
+ int i;
+
+ for (i = NumProcSignalSlots - 1; i >= 0; i--)
+ {
+ slot = &ProcSignal->psh_slot[i];
+
+ if (slot->pss_pid == pid)
+ {
+ /* the above note about race conditions applies here too */
+
+ /* Atomically set the proper flag */
+ slot->pss_signalFlags[reason] = true;
+ /* Send signal */
+ return kill(pid, SIGUSR1);
+ }
+ }
+ }
+
+ errno = ESRCH;
+ return -1;
+}
+
+/*
+ * EmitProcSignalBarrier
+ * Send a signal to every Postgres process
+ *
+ * The return value of this function is the barrier "generation" created
+ * by this operation. This value can be passed to WaitForProcSignalBarrier
+ * to wait until it is known that every participant in the ProcSignal
+ * mechanism has absorbed the signal (or started afterwards).
+ *
+ * Note that it would be a bad idea to use this for anything that happens
+ * frequently, as interrupting every backend could cause a noticeable
+ * performance hit.
+ *
+ * Callers are entitled to assume that this function will not throw ERROR
+ * or FATAL.
+ */
+uint64
+EmitProcSignalBarrier(ProcSignalBarrierType type)
+{
+ uint32 flagbit = 1 << (uint32) type;
+ uint64 generation;
+
+ /*
+ * Set all the flags.
+ *
+ * Note that pg_atomic_fetch_or_u32 has full barrier semantics, so this is
+ * totally ordered with respect to anything the caller did before, and
+ * anything that we do afterwards. (This is also true of the later call to
+ * pg_atomic_add_fetch_u64.)
+ */
+ for (int i = 0; i < NumProcSignalSlots; i++)
+ {
+ volatile ProcSignalSlot *slot = &ProcSignal->psh_slot[i];
+
+ pg_atomic_fetch_or_u32(&slot->pss_barrierCheckMask, flagbit);
+ }
+
+ /*
+ * Increment the generation counter.
+ */
+ generation =
+ pg_atomic_add_fetch_u64(&ProcSignal->psh_barrierGeneration, 1);
+
+ /*
+ * Signal all the processes, so that they update their advertised barrier
+ * generation.
+ *
+ * Concurrency is not a problem here. Backends that have exited don't
+ * matter, and new backends that have joined since we entered this
+ * function must already have current state, since the caller is
+ * responsible for making sure that the relevant state is entirely visible
+ * before calling this function in the first place. We still have to wake
+ * them up - because we can't distinguish between such backends and older
+ * backends that need to update state - but they won't actually need to
+ * change any state.
+ */
+ for (int i = NumProcSignalSlots - 1; i >= 0; i--)
+ {
+ volatile ProcSignalSlot *slot = &ProcSignal->psh_slot[i];
+ pid_t pid = slot->pss_pid;
+
+ if (pid != 0)
+ {
+ /* see SendProcSignal for details */
+ slot->pss_signalFlags[PROCSIG_BARRIER] = true;
+ kill(pid, SIGUSR1);
+ }
+ }
+
+ return generation;
+}
+
+/*
+ * WaitForProcSignalBarrier - wait until it is guaranteed that all changes
+ * requested by a specific call to EmitProcSignalBarrier() have taken effect.
+ */
+void
+WaitForProcSignalBarrier(uint64 generation)
+{
+ Assert(generation <= pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration));
+
+ for (int i = NumProcSignalSlots - 1; i >= 0; i--)
+ {
+ ProcSignalSlot *slot = &ProcSignal->psh_slot[i];
+ uint64 oldval;
+
+ /*
+ * It's important that we check only pss_barrierGeneration here and
+ * not pss_barrierCheckMask. Bits in pss_barrierCheckMask get cleared
+ * before the barrier is actually absorbed, but pss_barrierGeneration
+ * is updated only afterward.
+ */
+ oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration);
+ while (oldval < generation)
+ {
+ ConditionVariableSleep(&slot->pss_barrierCV,
+ WAIT_EVENT_PROC_SIGNAL_BARRIER);
+ oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration);
+ }
+ ConditionVariableCancelSleep();
+ }
+
+ /*
+ * The caller is probably calling this function because it wants to read
+ * the shared state or perform further writes to shared state once all
+ * backends are known to have absorbed the barrier. However, the read of
+ * pss_barrierGeneration was performed unlocked; insert a memory barrier
+ * to separate it from whatever follows.
+ */
+ pg_memory_barrier();
+}
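+
+/*
+ * Illustrative usage sketch only (guarded out; the function name is invented
+ * and PROCSIGNAL_BARRIER_PLACEHOLDER stands in for a real barrier type): a
+ * caller makes its shared-state change visible first, then emits the barrier
+ * and waits for every participant to acknowledge it.
+ */
+#ifdef NOT_USED
+static void
+sketch_change_global_state(void)
+{
+	uint64		gen;
+
+	/* ... publish the new shared state here, before emitting the barrier ... */
+
+	gen = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_PLACEHOLDER);
+	WaitForProcSignalBarrier(gen);
+
+	/* every live participant has now absorbed the barrier */
+}
+#endif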
+
+/*
+ * Handle receipt of an interrupt indicating a global barrier event.
+ *
+ * All the actual work is deferred to ProcessProcSignalBarrier(), because we
+ * cannot safely access the barrier generation inside the signal handler, as
+ * 64-bit atomics might use spinlock-based emulation, even for reads.  Since
+ * this routine only gets called when PROCSIG_BARRIER is sent, that won't
+ * cause a lot of unnecessary work.
+ */
+static void
+HandleProcSignalBarrierInterrupt(void)
+{
+ InterruptPending = true;
+ ProcSignalBarrierPending = true;
+ /* latch will be set by procsignal_sigusr1_handler */
+}
+
+/*
+ * Perform global barrier related interrupt checking.
+ *
+ * Any backend that participates in ProcSignal signaling must arrange to
+ * call this function periodically. It is called from CHECK_FOR_INTERRUPTS(),
+ * which is enough for normal backends, but not necessarily for all types of
+ * background processes.
+ */
+void
+ProcessProcSignalBarrier(void)
+{
+ uint64 local_gen;
+ uint64 shared_gen;
+ volatile uint32 flags;
+
+ Assert(MyProcSignalSlot);
+
+ /* Exit quickly if there's no work to do. */
+ if (!ProcSignalBarrierPending)
+ return;
+ ProcSignalBarrierPending = false;
+
+ /*
+	 * It's not unlikely that we'll process multiple barriers at once, before
+	 * the signals for all of them have arrived.  To avoid unnecessary work in
+	 * response to subsequent signals, exit early if we have already processed
+	 * all of them.
+ */
+ local_gen = pg_atomic_read_u64(&MyProcSignalSlot->pss_barrierGeneration);
+ shared_gen = pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration);
+
+ Assert(local_gen <= shared_gen);
+
+ if (local_gen == shared_gen)
+ return;
+
+ /*
+ * Get and clear the flags that are set for this backend. Note that
+ * pg_atomic_exchange_u32 is a full barrier, so we're guaranteed that the
+ * read of the barrier generation above happens before we atomically
+ * extract the flags, and that any subsequent state changes happen
+ * afterward.
+ *
+ * NB: In order to avoid race conditions, we must zero
+ * pss_barrierCheckMask first and only afterwards try to do barrier
+ * processing. If we did it in the other order, someone could send us
+ * another barrier of some type right after we called the
+ * barrier-processing function but before we cleared the bit. We would
+ * have no way of knowing that the bit needs to stay set in that case, so
+ * the need to call the barrier-processing function again would just get
+ * forgotten. So instead, we tentatively clear all the bits and then put
+ * back any for which we don't manage to successfully absorb the barrier.
+ */
+ flags = pg_atomic_exchange_u32(&MyProcSignalSlot->pss_barrierCheckMask, 0);
+
+ /*
+ * If there are no flags set, then we can skip doing any real work.
+ * Otherwise, establish a PG_TRY block, so that we don't lose track of
+ * which types of barrier processing are needed if an ERROR occurs.
+ */
+ if (flags != 0)
+ {
+ bool success = true;
+
+ PG_TRY();
+ {
+ /*
+ * Process each type of barrier. The barrier-processing functions
+ * should normally return true, but may return false if the
+ * barrier can't be absorbed at the current time. This should be
+ * rare, because it's pretty expensive. Every single
+ * CHECK_FOR_INTERRUPTS() will return here until we manage to
+ * absorb the barrier, and that cost will add up in a hurry.
+ *
+ * NB: It ought to be OK to call the barrier-processing functions
+ * unconditionally, but it's more efficient to call only the ones
+ * that might need us to do something based on the flags.
+ */
+ while (flags != 0)
+ {
+ ProcSignalBarrierType type;
+ bool processed = true;
+
+ type = (ProcSignalBarrierType) pg_rightmost_one_pos32(flags);
+ switch (type)
+ {
+ case PROCSIGNAL_BARRIER_PLACEHOLDER:
+ processed = ProcessBarrierPlaceholder();
+ break;
+ }
+
+ /*
+ * To avoid an infinite loop, we must always unset the bit in
+ * flags.
+ */
+ BARRIER_CLEAR_BIT(flags, type);
+
+ /*
+ * If we failed to process the barrier, reset the shared bit
+ * so we try again later, and set a flag so that we don't bump
+ * our generation.
+ */
+ if (!processed)
+ {
+ ResetProcSignalBarrierBits(((uint32) 1) << type);
+ success = false;
+ }
+ }
+ }
+ PG_CATCH();
+ {
+ /*
+ * If an ERROR occurred, we'll need to try again later to handle
+ * that barrier type and any others that haven't been handled yet
+ * or weren't successfully absorbed.
+ */
+ ResetProcSignalBarrierBits(flags);
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ /*
+ * If some barrier types were not successfully absorbed, we will have
+ * to try again later.
+ */
+ if (!success)
+ return;
+ }
+
+ /*
+ * State changes related to all types of barriers that might have been
+ * emitted have now been handled, so we can update our notion of the
+ * generation to the one we observed before beginning the updates. If
+ * things have changed further, it'll get fixed up when this function is
+ * next called.
+ */
+ pg_atomic_write_u64(&MyProcSignalSlot->pss_barrierGeneration, shared_gen);
+ ConditionVariableBroadcast(&MyProcSignalSlot->pss_barrierCV);
+}
+
+/*
+ * If it turns out that we couldn't absorb one or more barrier types, either
+ * because the barrier-processing functions returned false or due to an error,
+ * arrange for processing to be retried later.
+ */
+static void
+ResetProcSignalBarrierBits(uint32 flags)
+{
+ pg_atomic_fetch_or_u32(&MyProcSignalSlot->pss_barrierCheckMask, flags);
+ ProcSignalBarrierPending = true;
+ InterruptPending = true;
+}
+
+static bool
+ProcessBarrierPlaceholder(void)
+{
+ /*
+ * XXX. This is just a placeholder until the first real user of this
+ * machinery gets committed. Rename PROCSIGNAL_BARRIER_PLACEHOLDER to
+ * PROCSIGNAL_BARRIER_SOMETHING_ELSE where SOMETHING_ELSE is something
+ * appropriately descriptive. Get rid of this function and instead have
+ * ProcessBarrierSomethingElse. Most likely, that function should live in
+ * the file pertaining to that subsystem, rather than here.
+ *
+ * The return value should be 'true' if the barrier was successfully
+ * absorbed and 'false' if not. Note that returning 'false' can lead to
+ * very frequent retries, so try hard to make that an uncommon case.
+ */
+ return true;
+}
+
+/*
+ * CheckProcSignal - check to see if a particular reason has been
+ * signaled, and clear the signal flag. Should be called after receiving
+ * SIGUSR1.
+ */
+static bool
+CheckProcSignal(ProcSignalReason reason)
+{
+ volatile ProcSignalSlot *slot = MyProcSignalSlot;
+
+ if (slot != NULL)
+ {
+ /* Careful here --- don't clear flag if we haven't seen it set */
+ if (slot->pss_signalFlags[reason])
+ {
+ slot->pss_signalFlags[reason] = false;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/*
+ * procsignal_sigusr1_handler - handle SIGUSR1 signal.
+ */
+void
+procsignal_sigusr1_handler(SIGNAL_ARGS)
+{
+ int save_errno = errno;
+
+ if (CheckProcSignal(PROCSIG_CATCHUP_INTERRUPT))
+ HandleCatchupInterrupt();
+
+ if (CheckProcSignal(PROCSIG_NOTIFY_INTERRUPT))
+ HandleNotifyInterrupt();
+
+ if (CheckProcSignal(PROCSIG_PARALLEL_MESSAGE))
+ HandleParallelMessageInterrupt();
+
+ if (CheckProcSignal(PROCSIG_WALSND_INIT_STOPPING))
+ HandleWalSndInitStopping();
+
+ if (CheckProcSignal(PROCSIG_BARRIER))
+ HandleProcSignalBarrierInterrupt();
+
+ if (CheckProcSignal(PROCSIG_LOG_MEMORY_CONTEXT))
+ HandleLogMemoryContextInterrupt();
+
+ if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_DATABASE))
+ RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_DATABASE);
+
+ if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_TABLESPACE))
+ RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_TABLESPACE);
+
+ if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_LOCK))
+ RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_LOCK);
+
+ if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT))
+ RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT);
+
+ if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK))
+ RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
+
+ if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN))
+ RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+
+ SetLatch(MyLatch);
+
+ errno = save_errno;
+}
diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c
new file mode 100644
index 0000000..3240af4
--- /dev/null
+++ b/src/backend/storage/ipc/shm_mq.c
@@ -0,0 +1,1288 @@
+/*-------------------------------------------------------------------------
+ *
+ * shm_mq.c
+ * single-reader, single-writer shared memory message queue
+ *
+ * Both the sender and the receiver must have a PGPROC; their respective
+ * process latches are used for synchronization. Only the sender may send,
+ * and only the receiver may receive. This is intended to allow a user
+ * backend to communicate with worker backends that it has registered.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/storage/ipc/shm_mq.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/bgworker.h"
+#include "storage/procsignal.h"
+#include "storage/shm_mq.h"
+#include "storage/spin.h"
+#include "utils/memutils.h"
+
+/*
+ * This structure represents the actual queue, stored in shared memory.
+ *
+ * Some notes on synchronization:
+ *
+ * mq_receiver and mq_bytes_read can only be changed by the receiver; and
+ * mq_sender and mq_bytes_written can only be changed by the sender.
+ * mq_receiver and mq_sender are protected by mq_mutex, although, importantly,
+ * they cannot change once set, and thus may be read without a lock once this
+ * is known to be the case.
+ *
+ * mq_bytes_read and mq_bytes_written are not protected by the mutex. Instead,
+ * they are written atomically using 8-byte loads and stores.  Memory barriers
+ * must be carefully used to synchronize reads and writes of these values with
+ * reads and writes of the actual data in mq_ring.
+ *
+ * mq_detached needs no locking. It can be set by either the sender or the
+ * receiver, but only ever from false to true, so redundant writes don't
+ * matter. It is important that if we set mq_detached and then set the
+ * counterparty's latch, the counterparty must be certain to see the change
+ * after waking up. Since SetLatch begins with a memory barrier and ResetLatch
+ * ends with one, this should be OK.
+ *
+ * mq_ring_size and mq_ring_offset never change after initialization, and
+ * can therefore be read without the lock.
+ *
+ * Importantly, mq_ring can be safely read and written without a lock.
+ * At any given time, the difference between mq_bytes_read and
+ * mq_bytes_written defines the number of bytes within mq_ring that contain
+ * unread data, and mq_bytes_read defines the position where those bytes
+ * begin. The sender can increase the number of unread bytes at any time,
+ * but only the receiver can give license to overwrite those bytes, by
+ * incrementing mq_bytes_read. Therefore, it's safe for the receiver to read
+ * the unread bytes it knows to be present without the lock. Conversely,
+ * the sender can write to the unused portion of the ring buffer without
+ * the lock, because nobody else can be reading or writing those bytes. The
+ * receiver could be making more bytes unused by incrementing mq_bytes_read,
+ * but that's OK. Note that it would be unsafe for the receiver to read any
+ * data it's already marked as read, or to write any data; and it would be
+ * unsafe for the sender to reread any data after incrementing
+ * mq_bytes_written, but fortunately there's no need for any of that.
+ */
+struct shm_mq
+{
+ slock_t mq_mutex;
+ PGPROC *mq_receiver;
+ PGPROC *mq_sender;
+ pg_atomic_uint64 mq_bytes_read;
+ pg_atomic_uint64 mq_bytes_written;
+ Size mq_ring_size;
+ bool mq_detached;
+ uint8 mq_ring_offset;
+ char mq_ring[FLEXIBLE_ARRAY_MEMBER];
+};
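+
+/*
+ * Illustrative sketch only (guarded out; the helper names are invented): the
+ * bookkeeping described above.  Because mq_bytes_written and mq_bytes_read
+ * are monotonically increasing byte counts, the amount of unread data and the
+ * remaining free space fall out of simple subtraction; the counters are only
+ * reduced modulo the ring size when they are turned into ring offsets.
+ */
+#ifdef NOT_USED
+static Size
+sketch_mq_unread_bytes(shm_mq *mq)
+{
+	uint64		written = pg_atomic_read_u64(&mq->mq_bytes_written);
+	uint64		read = pg_atomic_read_u64(&mq->mq_bytes_read);
+
+	return (Size) (written - read);		/* bytes the receiver may consume */
+}
+
+static Size
+sketch_mq_free_space(shm_mq *mq)
+{
+	/* bytes the sender may write without overwriting unread data */
+	return mq->mq_ring_size - sketch_mq_unread_bytes(mq);
+}
+#endif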
+
+/*
+ * This structure is a backend-private handle for access to a queue.
+ *
+ * mqh_queue is a pointer to the queue we've attached, and mqh_segment is
+ * an optional pointer to the dynamic shared memory segment that contains it.
+ * (If mqh_segment is provided, we register an on_dsm_detach callback to
+ * make sure we detach from the queue before detaching from DSM.)
+ *
+ * If this queue is intended to connect the current process with a background
+ * worker that started it, the user can pass a pointer to the worker handle
+ * to shm_mq_attach(), and we'll store it in mqh_handle. The point of this
+ * is to allow us to begin sending to or receiving from that queue before the
+ * process we'll be communicating with has even been started. If it fails
+ * to start, the handle will allow us to notice that and fail cleanly, rather
+ * than waiting forever; see shm_mq_wait_internal. This is mostly useful in
+ * simple cases - e.g. where there are just 2 processes communicating; in
+ * more complex scenarios, every process may not have a BackgroundWorkerHandle
+ * available, or may need to watch for the failure of more than one other
+ * process at a time.
+ *
+ * When a message exists as a contiguous chunk of bytes in the queue - that is,
+ * it is smaller than the size of the ring buffer and does not wrap around
+ * the end - we return the message to the caller as a pointer into the buffer.
+ * For messages that are larger or happen to wrap, we reassemble the message
+ * locally by copying the chunks into a backend-local buffer. mqh_buffer is
+ * the buffer, and mqh_buflen is the number of bytes allocated for it.
+ *
+ * mqh_partial_bytes, mqh_expected_bytes, and mqh_length_word_complete
+ * are used to track the state of non-blocking operations. When the caller
+ * attempts a non-blocking operation that returns SHM_MQ_WOULD_BLOCK, they
+ * are expected to retry the call at a later time with the same argument;
+ * we need to retain enough state to pick up where we left off.
+ * mqh_length_word_complete tracks whether we are done sending or receiving
+ * (whichever we're doing) the entire length word. mqh_partial_bytes tracks
+ * the number of bytes read or written for either the length word or the
+ * message itself, and mqh_expected_bytes - which is used only for reads -
+ * tracks the expected total size of the payload.
+ *
+ * mqh_counterparty_attached tracks whether we know the counterparty to have
+ * attached to the queue at some previous point. This lets us avoid some
+ * mutex acquisitions.
+ *
+ * mqh_context is the memory context in effect at the time we attached to
+ * the shm_mq. The shm_mq_handle itself is allocated in this context, and
+ * we make sure any other allocations we do happen in this context as well,
+ * to avoid nasty surprises.
+ */
+struct shm_mq_handle
+{
+ shm_mq *mqh_queue;
+ dsm_segment *mqh_segment;
+ BackgroundWorkerHandle *mqh_handle;
+ char *mqh_buffer;
+ Size mqh_buflen;
+ Size mqh_consume_pending;
+ Size mqh_partial_bytes;
+ Size mqh_expected_bytes;
+ bool mqh_length_word_complete;
+ bool mqh_counterparty_attached;
+ MemoryContext mqh_context;
+};
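+
+/*
+ * Illustrative sketch only (guarded out; the function and variable names are
+ * invented): the retry pattern implied by the non-blocking state tracking
+ * described above.  After SHM_MQ_WOULD_BLOCK, the caller waits on its process
+ * latch and then retries the call with the same arguments, and the handle's
+ * partial-progress fields let the operation resume where it left off.
+ */
+#ifdef NOT_USED
+static shm_mq_result
+sketch_send_with_retries(shm_mq_handle *mqh, Size nbytes, const void *data)
+{
+	for (;;)
+	{
+		shm_mq_result res = shm_mq_send(mqh, nbytes, data, true);
+
+		if (res != SHM_MQ_WOULD_BLOCK)
+			return res;			/* SHM_MQ_SUCCESS or SHM_MQ_DETACHED */
+
+		/* wait until the receiver sets our latch, then try again */
+		(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+						 WAIT_EVENT_MQ_SEND);
+		ResetLatch(MyLatch);
+		CHECK_FOR_INTERRUPTS();
+	}
+}
+#endif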
+
+static void shm_mq_detach_internal(shm_mq *mq);
+static shm_mq_result shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes,
+ const void *data, bool nowait, Size *bytes_written);
+static shm_mq_result shm_mq_receive_bytes(shm_mq_handle *mqh,
+ Size bytes_needed, bool nowait, Size *nbytesp,
+ void **datap);
+static bool shm_mq_counterparty_gone(shm_mq *mq,
+ BackgroundWorkerHandle *handle);
+static bool shm_mq_wait_internal(shm_mq *mq, PGPROC **ptr,
+ BackgroundWorkerHandle *handle);
+static void shm_mq_inc_bytes_read(shm_mq *mq, Size n);
+static void shm_mq_inc_bytes_written(shm_mq *mq, Size n);
+static void shm_mq_detach_callback(dsm_segment *seg, Datum arg);
+
+/* Minimum queue size is enough for header and at least one chunk of data. */
+const Size shm_mq_minimum_size =
+MAXALIGN(offsetof(shm_mq, mq_ring)) + MAXIMUM_ALIGNOF;
+
+#define MQH_INITIAL_BUFSIZE 8192
+
+/*
+ * Initialize a new shared message queue.
+ */
+shm_mq *
+shm_mq_create(void *address, Size size)
+{
+ shm_mq *mq = address;
+ Size data_offset = MAXALIGN(offsetof(shm_mq, mq_ring));
+
+ /* If the size isn't MAXALIGN'd, just discard the odd bytes. */
+ size = MAXALIGN_DOWN(size);
+
+ /* Queue size must be large enough to hold some data. */
+ Assert(size > data_offset);
+
+ /* Initialize queue header. */
+ SpinLockInit(&mq->mq_mutex);
+ mq->mq_receiver = NULL;
+ mq->mq_sender = NULL;
+ pg_atomic_init_u64(&mq->mq_bytes_read, 0);
+ pg_atomic_init_u64(&mq->mq_bytes_written, 0);
+ mq->mq_ring_size = size - data_offset;
+ mq->mq_detached = false;
+ mq->mq_ring_offset = data_offset - offsetof(shm_mq, mq_ring);
+
+ return mq;
+}
+
+/*
+ * Set the identity of the process that will receive from a shared message
+ * queue.
+ */
+void
+shm_mq_set_receiver(shm_mq *mq, PGPROC *proc)
+{
+ PGPROC *sender;
+
+ SpinLockAcquire(&mq->mq_mutex);
+ Assert(mq->mq_receiver == NULL);
+ mq->mq_receiver = proc;
+ sender = mq->mq_sender;
+ SpinLockRelease(&mq->mq_mutex);
+
+ if (sender != NULL)
+ SetLatch(&sender->procLatch);
+}
+
+/*
+ * Set the identity of the process that will send to a shared message queue.
+ */
+void
+shm_mq_set_sender(shm_mq *mq, PGPROC *proc)
+{
+ PGPROC *receiver;
+
+ SpinLockAcquire(&mq->mq_mutex);
+ Assert(mq->mq_sender == NULL);
+ mq->mq_sender = proc;
+ receiver = mq->mq_receiver;
+ SpinLockRelease(&mq->mq_mutex);
+
+ if (receiver != NULL)
+ SetLatch(&receiver->procLatch);
+}
+
+/*
+ * Get the configured receiver.
+ */
+PGPROC *
+shm_mq_get_receiver(shm_mq *mq)
+{
+ PGPROC *receiver;
+
+ SpinLockAcquire(&mq->mq_mutex);
+ receiver = mq->mq_receiver;
+ SpinLockRelease(&mq->mq_mutex);
+
+ return receiver;
+}
+
+/*
+ * Get the configured sender.
+ */
+PGPROC *
+shm_mq_get_sender(shm_mq *mq)
+{
+ PGPROC *sender;
+
+ SpinLockAcquire(&mq->mq_mutex);
+ sender = mq->mq_sender;
+ SpinLockRelease(&mq->mq_mutex);
+
+ return sender;
+}
+
+/*
+ * Attach to a shared message queue so we can send or receive messages.
+ *
+ * The memory context in effect at the time this function is called should
+ * be one which will last for at least as long as the message queue itself.
+ * We'll allocate the handle in that context, and future allocations that
+ * are needed to buffer incoming data will happen in that context as well.
+ *
+ * If seg != NULL, the queue will be automatically detached when that dynamic
+ * shared memory segment is detached.
+ *
+ * If handle != NULL, the queue can be read or written even before the
+ * other process has attached. We'll wait for it to do so if needed. The
+ * handle must be for a background worker initialized with bgw_notify_pid
+ * equal to our PID.
+ *
+ * shm_mq_detach() should be called when done. This will free the
+ * shm_mq_handle and mark the queue itself as detached, so that our
+ * counterpart won't get stuck waiting for us to fill or drain the queue
+ * after we've already lost interest.
+ */
+shm_mq_handle *
+shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle)
+{
+ shm_mq_handle *mqh = palloc(sizeof(shm_mq_handle));
+
+ Assert(mq->mq_receiver == MyProc || mq->mq_sender == MyProc);
+ mqh->mqh_queue = mq;
+ mqh->mqh_segment = seg;
+ mqh->mqh_handle = handle;
+ mqh->mqh_buffer = NULL;
+ mqh->mqh_buflen = 0;
+ mqh->mqh_consume_pending = 0;
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_expected_bytes = 0;
+ mqh->mqh_length_word_complete = false;
+ mqh->mqh_counterparty_attached = false;
+ mqh->mqh_context = CurrentMemoryContext;
+
+ if (seg != NULL)
+ on_dsm_detach(seg, shm_mq_detach_callback, PointerGetDatum(mq));
+
+ return mqh;
+}
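+
+/*
+ * Illustrative usage sketch only (guarded out; the function name is invented,
+ * and it assumes the dsm_create()/dsm_segment_address() API from dsm.c): the
+ * typical setup sequence on the side that creates the queue and will send on
+ * it.  The counterpart process would map the same segment, call
+ * shm_mq_set_receiver(), and attach in the same way.
+ */
+#ifdef NOT_USED
+static shm_mq_handle *
+sketch_create_outbound_queue(Size queue_size)
+{
+	dsm_segment *seg;
+	shm_mq	   *mq;
+
+	Assert(queue_size >= shm_mq_minimum_size);
+
+	seg = dsm_create(queue_size, 0);
+	mq = shm_mq_create(dsm_segment_address(seg), queue_size);
+
+	shm_mq_set_sender(mq, MyProc);	/* we write; the other process reads */
+
+	/* detaching from the segment will also detach the queue */
+	return shm_mq_attach(mq, seg, NULL);
+}
+#endif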
+
+/*
+ * Associate a BackgroundWorkerHandle with a shm_mq_handle just as if it had
+ * been passed to shm_mq_attach.
+ */
+void
+shm_mq_set_handle(shm_mq_handle *mqh, BackgroundWorkerHandle *handle)
+{
+ Assert(mqh->mqh_handle == NULL);
+ mqh->mqh_handle = handle;
+}
+
+/*
+ * Write a message into a shared message queue.
+ */
+shm_mq_result
+shm_mq_send(shm_mq_handle *mqh, Size nbytes, const void *data, bool nowait)
+{
+ shm_mq_iovec iov;
+
+ iov.data = data;
+ iov.len = nbytes;
+
+ return shm_mq_sendv(mqh, &iov, 1, nowait);
+}
+
+/*
+ * Write a message into a shared message queue, gathered from multiple
+ * addresses.
+ *
+ * When nowait = false, we'll wait on our process latch when the ring buffer
+ * fills up, and then continue writing once the receiver has drained some data.
+ * The process latch is reset after each wait.
+ *
+ * When nowait = true, we do not manipulate the state of the process latch;
+ * instead, if the buffer becomes full, we return SHM_MQ_WOULD_BLOCK. In
+ * this case, the caller should call this function again, with the same
+ * arguments, each time the process latch is set. (Once begun, the sending
+ * of a message cannot be aborted except by detaching from the queue; changing
+ * the length or payload will corrupt the queue.)
+ */
+shm_mq_result
+shm_mq_sendv(shm_mq_handle *mqh, shm_mq_iovec *iov, int iovcnt, bool nowait)
+{
+ shm_mq_result res;
+ shm_mq *mq = mqh->mqh_queue;
+ PGPROC *receiver;
+ Size nbytes = 0;
+ Size bytes_written;
+ int i;
+ int which_iov = 0;
+ Size offset;
+
+ Assert(mq->mq_sender == MyProc);
+
+ /* Compute total size of write. */
+ for (i = 0; i < iovcnt; ++i)
+ nbytes += iov[i].len;
+
+	/* Prevent writing messages that would overwhelm the receiver. */
+ if (nbytes > MaxAllocSize)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot send a message of size %zu via shared memory queue",
+ nbytes)));
+
+ /* Try to write, or finish writing, the length word into the buffer. */
+ while (!mqh->mqh_length_word_complete)
+ {
+ Assert(mqh->mqh_partial_bytes < sizeof(Size));
+ res = shm_mq_send_bytes(mqh, sizeof(Size) - mqh->mqh_partial_bytes,
+ ((char *) &nbytes) + mqh->mqh_partial_bytes,
+ nowait, &bytes_written);
+
+ if (res == SHM_MQ_DETACHED)
+ {
+ /* Reset state in case caller tries to send another message. */
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_length_word_complete = false;
+ return res;
+ }
+ mqh->mqh_partial_bytes += bytes_written;
+
+ if (mqh->mqh_partial_bytes >= sizeof(Size))
+ {
+ Assert(mqh->mqh_partial_bytes == sizeof(Size));
+
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_length_word_complete = true;
+ }
+
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+
+ /* Length word can't be split unless bigger than required alignment. */
+ Assert(mqh->mqh_length_word_complete || sizeof(Size) > MAXIMUM_ALIGNOF);
+ }
+
+ /* Write the actual data bytes into the buffer. */
+ Assert(mqh->mqh_partial_bytes <= nbytes);
+ offset = mqh->mqh_partial_bytes;
+ do
+ {
+ Size chunksize;
+
+ /* Figure out which bytes need to be sent next. */
+ if (offset >= iov[which_iov].len)
+ {
+ offset -= iov[which_iov].len;
+ ++which_iov;
+ if (which_iov >= iovcnt)
+ break;
+ continue;
+ }
+
+ /*
+ * We want to avoid copying the data if at all possible, but every
+ * chunk of bytes we write into the queue has to be MAXALIGN'd, except
+ * the last. Thus, if a chunk other than the last one ends on a
+ * non-MAXALIGN'd boundary, we have to combine the tail end of its
+ * data with data from one or more following chunks until we either
+ * reach the last chunk or accumulate a number of bytes which is
+ * MAXALIGN'd.
+ */
+ if (which_iov + 1 < iovcnt &&
+ offset + MAXIMUM_ALIGNOF > iov[which_iov].len)
+ {
+ char tmpbuf[MAXIMUM_ALIGNOF];
+ int j = 0;
+
+ for (;;)
+ {
+ if (offset < iov[which_iov].len)
+ {
+ tmpbuf[j] = iov[which_iov].data[offset];
+ j++;
+ offset++;
+ if (j == MAXIMUM_ALIGNOF)
+ break;
+ }
+ else
+ {
+ offset -= iov[which_iov].len;
+ which_iov++;
+ if (which_iov >= iovcnt)
+ break;
+ }
+ }
+
+ res = shm_mq_send_bytes(mqh, j, tmpbuf, nowait, &bytes_written);
+
+ if (res == SHM_MQ_DETACHED)
+ {
+ /* Reset state in case caller tries to send another message. */
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_length_word_complete = false;
+ return res;
+ }
+
+ mqh->mqh_partial_bytes += bytes_written;
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ continue;
+ }
+
+ /*
+ * If this is the last chunk, we can write all the data, even if it
+ * isn't a multiple of MAXIMUM_ALIGNOF. Otherwise, we need to
+ * MAXALIGN_DOWN the write size.
+ */
+ chunksize = iov[which_iov].len - offset;
+ if (which_iov + 1 < iovcnt)
+ chunksize = MAXALIGN_DOWN(chunksize);
+ res = shm_mq_send_bytes(mqh, chunksize, &iov[which_iov].data[offset],
+ nowait, &bytes_written);
+
+ if (res == SHM_MQ_DETACHED)
+ {
+ /* Reset state in case caller tries to send another message. */
+ mqh->mqh_length_word_complete = false;
+ mqh->mqh_partial_bytes = 0;
+ return res;
+ }
+
+ mqh->mqh_partial_bytes += bytes_written;
+ offset += bytes_written;
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ } while (mqh->mqh_partial_bytes < nbytes);
+
+ /* Reset for next message. */
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_length_word_complete = false;
+
+ /* If queue has been detached, let caller know. */
+ if (mq->mq_detached)
+ return SHM_MQ_DETACHED;
+
+ /*
+ * If the counterparty is known to have attached, we can read mq_receiver
+ * without acquiring the spinlock and assume it isn't NULL. Otherwise,
+ * more caution is needed.
+ */
+ if (mqh->mqh_counterparty_attached)
+ receiver = mq->mq_receiver;
+ else
+ {
+ SpinLockAcquire(&mq->mq_mutex);
+ receiver = mq->mq_receiver;
+ SpinLockRelease(&mq->mq_mutex);
+ if (receiver == NULL)
+ return SHM_MQ_SUCCESS;
+ mqh->mqh_counterparty_attached = true;
+ }
+
+ /* Notify receiver of the newly-written data, and return. */
+ SetLatch(&receiver->procLatch);
+ return SHM_MQ_SUCCESS;
+}
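+
+/*
+ * Example (minimal sketch): sending a message gathered from a header and a
+ * separately allocated payload without copying them into one buffer first;
+ * "hdr", "payload" and "payload_len" are illustrative.
+ *
+ *		shm_mq_iovec iov[2];
+ *		shm_mq_result res;
+ *
+ *		iov[0].data = (const char *) &hdr;
+ *		iov[0].len = sizeof(hdr);
+ *		iov[1].data = payload;
+ *		iov[1].len = payload_len;
+ *
+ *		res = shm_mq_sendv(mqh, iov, 2, false);
+ *
+ * With nowait = true, a SHM_MQ_WOULD_BLOCK result means the call must be
+ * repeated with identical arguments the next time our latch is set, as
+ * described above.
+ */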
+
+/*
+ * Receive a message from a shared message queue.
+ *
+ * We set *nbytes to the message length and *data to point to the message
+ * payload. If the entire message exists in the queue as a single,
+ * contiguous chunk, *data will point directly into shared memory; otherwise,
+ * it will point to a temporary buffer. This mostly avoids data copying in
+ * the hoped-for case where messages are short compared to the buffer size,
+ * while still allowing longer messages. In either case, the return value
+ * remains valid until the next receive operation is performed on the queue.
+ *
+ * When nowait = false, we'll wait on our process latch when the ring buffer
+ * is empty and we have not yet received a full message. The sender will
+ * set our process latch after more data has been written, and we'll resume
+ * processing. Each call will therefore return a complete message
+ * (unless the sender detaches the queue).
+ *
+ * When nowait = true, we do not manipulate the state of the process latch;
+ * instead, whenever the buffer is empty and we need to read from it, we
+ * return SHM_MQ_WOULD_BLOCK. In this case, the caller should call this
+ * function again after the process latch has been set.
+ */
+shm_mq_result
+shm_mq_receive(shm_mq_handle *mqh, Size *nbytesp, void **datap, bool nowait)
+{
+ shm_mq *mq = mqh->mqh_queue;
+ shm_mq_result res;
+ Size rb = 0;
+ Size nbytes;
+ void *rawdata;
+
+ Assert(mq->mq_receiver == MyProc);
+
+ /* We can't receive data until the sender has attached. */
+ if (!mqh->mqh_counterparty_attached)
+ {
+ if (nowait)
+ {
+ int counterparty_gone;
+
+ /*
+ * We shouldn't return at this point at all unless the sender
+ * hasn't attached yet. However, the correct return value depends
+ * on whether the sender is still attached. If we first test
+ * whether the sender has ever attached and then test whether the
+ * sender has detached, there's a race condition: a sender that
+ * attaches and detaches very quickly might fool us into thinking
+ * the sender never attached at all. So, test whether our
+ * counterparty is definitively gone first, and only afterwards
+ * check whether the sender ever attached in the first place.
+ */
+ counterparty_gone = shm_mq_counterparty_gone(mq, mqh->mqh_handle);
+ if (shm_mq_get_sender(mq) == NULL)
+ {
+ if (counterparty_gone)
+ return SHM_MQ_DETACHED;
+ else
+ return SHM_MQ_WOULD_BLOCK;
+ }
+ }
+ else if (!shm_mq_wait_internal(mq, &mq->mq_sender, mqh->mqh_handle)
+ && shm_mq_get_sender(mq) == NULL)
+ {
+ mq->mq_detached = true;
+ return SHM_MQ_DETACHED;
+ }
+ mqh->mqh_counterparty_attached = true;
+ }
+
+ /*
+ * If we've consumed an amount of data greater than 1/4th of the ring
+ * size, mark it consumed in shared memory. We try to avoid doing this
+ * unnecessarily when only a small amount of data has been consumed,
+ * because SetLatch() is fairly expensive and we don't want to do it too
+ * often.
+ */
+ if (mqh->mqh_consume_pending > mq->mq_ring_size / 4)
+ {
+ shm_mq_inc_bytes_read(mq, mqh->mqh_consume_pending);
+ mqh->mqh_consume_pending = 0;
+ }
+
+ /* Try to read, or finish reading, the length word from the buffer. */
+ while (!mqh->mqh_length_word_complete)
+ {
+ /* Try to receive the message length word. */
+ Assert(mqh->mqh_partial_bytes < sizeof(Size));
+ res = shm_mq_receive_bytes(mqh, sizeof(Size) - mqh->mqh_partial_bytes,
+ nowait, &rb, &rawdata);
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+
+ /*
+ * Hopefully, we'll receive the entire message length word at once.
+ * But if sizeof(Size) > MAXIMUM_ALIGNOF, then it might be split over
+ * multiple reads.
+ */
+ if (mqh->mqh_partial_bytes == 0 && rb >= sizeof(Size))
+ {
+ Size needed;
+
+ nbytes = *(Size *) rawdata;
+
+ /* If we've already got the whole message, we're done. */
+ needed = MAXALIGN(sizeof(Size)) + MAXALIGN(nbytes);
+ if (rb >= needed)
+ {
+ mqh->mqh_consume_pending += needed;
+ *nbytesp = nbytes;
+ *datap = ((char *) rawdata) + MAXALIGN(sizeof(Size));
+ return SHM_MQ_SUCCESS;
+ }
+
+ /*
+ * We don't have the whole message, but we at least have the whole
+ * length word.
+ */
+ mqh->mqh_expected_bytes = nbytes;
+ mqh->mqh_length_word_complete = true;
+ mqh->mqh_consume_pending += MAXALIGN(sizeof(Size));
+ rb -= MAXALIGN(sizeof(Size));
+ }
+ else
+ {
+ Size lengthbytes;
+
+ /* Can't be split unless bigger than required alignment. */
+ Assert(sizeof(Size) > MAXIMUM_ALIGNOF);
+
+ /* Message word is split; need buffer to reassemble. */
+ if (mqh->mqh_buffer == NULL)
+ {
+ mqh->mqh_buffer = MemoryContextAlloc(mqh->mqh_context,
+ MQH_INITIAL_BUFSIZE);
+ mqh->mqh_buflen = MQH_INITIAL_BUFSIZE;
+ }
+ Assert(mqh->mqh_buflen >= sizeof(Size));
+
+ /* Copy partial length word; remember to consume it. */
+ if (mqh->mqh_partial_bytes + rb > sizeof(Size))
+ lengthbytes = sizeof(Size) - mqh->mqh_partial_bytes;
+ else
+ lengthbytes = rb;
+ memcpy(&mqh->mqh_buffer[mqh->mqh_partial_bytes], rawdata,
+ lengthbytes);
+ mqh->mqh_partial_bytes += lengthbytes;
+ mqh->mqh_consume_pending += MAXALIGN(lengthbytes);
+ rb -= lengthbytes;
+
+ /* If we now have the whole word, we're ready to read payload. */
+ if (mqh->mqh_partial_bytes >= sizeof(Size))
+ {
+ Assert(mqh->mqh_partial_bytes == sizeof(Size));
+ mqh->mqh_expected_bytes = *(Size *) mqh->mqh_buffer;
+ mqh->mqh_length_word_complete = true;
+ mqh->mqh_partial_bytes = 0;
+ }
+ }
+ }
+ nbytes = mqh->mqh_expected_bytes;
+
+ /*
+ * Should be disallowed on the sending side already, but better check and
+ * error out on the receiver side as well rather than trying to read a
+ * prohibitively large message.
+ */
+ if (nbytes > MaxAllocSize)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("invalid message size %zu in shared memory queue",
+ nbytes)));
+
+ if (mqh->mqh_partial_bytes == 0)
+ {
+ /*
+ * Try to obtain the whole message in a single chunk. If this works,
+ * we need not copy the data and can return a pointer directly into
+ * shared memory.
+ */
+ res = shm_mq_receive_bytes(mqh, nbytes, nowait, &rb, &rawdata);
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ if (rb >= nbytes)
+ {
+ mqh->mqh_length_word_complete = false;
+ mqh->mqh_consume_pending += MAXALIGN(nbytes);
+ *nbytesp = nbytes;
+ *datap = rawdata;
+ return SHM_MQ_SUCCESS;
+ }
+
+ /*
+ * The message has wrapped the buffer. We'll need to copy it in order
+ * to return it to the client in one chunk. First, make sure we have
+ * a large enough buffer available.
+ */
+ if (mqh->mqh_buflen < nbytes)
+ {
+ Size newbuflen = Max(mqh->mqh_buflen, MQH_INITIAL_BUFSIZE);
+
+ /*
+ * Double the buffer size until the payload fits, but limit to
+ * MaxAllocSize.
+ */
+ while (newbuflen < nbytes)
+ newbuflen *= 2;
+ newbuflen = Min(newbuflen, MaxAllocSize);
+
+ if (mqh->mqh_buffer != NULL)
+ {
+ pfree(mqh->mqh_buffer);
+ mqh->mqh_buffer = NULL;
+ mqh->mqh_buflen = 0;
+ }
+ mqh->mqh_buffer = MemoryContextAlloc(mqh->mqh_context, newbuflen);
+ mqh->mqh_buflen = newbuflen;
+ }
+ }
+
+ /* Loop until we've copied the entire message. */
+ for (;;)
+ {
+ Size still_needed;
+
+ /* Copy as much as we can. */
+ Assert(mqh->mqh_partial_bytes + rb <= nbytes);
+ if (rb > 0)
+ {
+ memcpy(&mqh->mqh_buffer[mqh->mqh_partial_bytes], rawdata, rb);
+ mqh->mqh_partial_bytes += rb;
+ }
+
+ /*
+ * Update count of bytes that can be consumed, accounting for
+ * alignment padding. Note that this will never actually insert any
+ * padding except at the end of a message, because the buffer size is
+ * a multiple of MAXIMUM_ALIGNOF, and each read and write is as well.
+ */
+ Assert(mqh->mqh_partial_bytes == nbytes || rb == MAXALIGN(rb));
+ mqh->mqh_consume_pending += MAXALIGN(rb);
+
+ /* If we got all the data, exit the loop. */
+ if (mqh->mqh_partial_bytes >= nbytes)
+ break;
+
+ /* Wait for some more data. */
+ still_needed = nbytes - mqh->mqh_partial_bytes;
+ res = shm_mq_receive_bytes(mqh, still_needed, nowait, &rb, &rawdata);
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ if (rb > still_needed)
+ rb = still_needed;
+ }
+
+ /* Return the complete message, and reset for next message. */
+ *nbytesp = nbytes;
+ *datap = mqh->mqh_buffer;
+ mqh->mqh_length_word_complete = false;
+ mqh->mqh_partial_bytes = 0;
+ return SHM_MQ_SUCCESS;
+}
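+
+/*
+ * Example (minimal sketch): a blocking receive loop; the processing step is
+ * illustrative.
+ *
+ *		for (;;)
+ *		{
+ *			Size		nbytes;
+ *			void	   *data;
+ *
+ *			if (shm_mq_receive(mqh, &nbytes, &data, false) != SHM_MQ_SUCCESS)
+ *				break;				(sender detached)
+ *			... process the nbytes bytes at data ...
+ *		}
+ *
+ * Because data may point directly into the ring buffer, it must be fully
+ * consumed (or copied) before the next shm_mq_receive() call on the same
+ * handle.
+ */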
+
+/*
+ * Wait for the other process that's supposed to use this queue to attach
+ * to it.
+ *
+ * The return value is SHM_MQ_DETACHED if the worker has already detached or
+ * if it dies; it is SHM_MQ_SUCCESS if we detect that the worker has attached.
+ * Note that we will only be able to detect that the worker has died before
+ * attaching if a background worker handle was passed to shm_mq_attach().
+ */
+shm_mq_result
+shm_mq_wait_for_attach(shm_mq_handle *mqh)
+{
+ shm_mq *mq = mqh->mqh_queue;
+ PGPROC **victim;
+
+ if (shm_mq_get_receiver(mq) == MyProc)
+ victim = &mq->mq_sender;
+ else
+ {
+ Assert(shm_mq_get_sender(mq) == MyProc);
+ victim = &mq->mq_receiver;
+ }
+
+ if (shm_mq_wait_internal(mq, victim, mqh->mqh_handle))
+ return SHM_MQ_SUCCESS;
+ else
+ return SHM_MQ_DETACHED;
+}
+
+/*
+ * Detach from a shared message queue, and destroy the shm_mq_handle.
+ */
+void
+shm_mq_detach(shm_mq_handle *mqh)
+{
+ /* Notify counterparty that we're outta here. */
+ shm_mq_detach_internal(mqh->mqh_queue);
+
+ /* Cancel on_dsm_detach callback, if any. */
+ if (mqh->mqh_segment)
+ cancel_on_dsm_detach(mqh->mqh_segment,
+ shm_mq_detach_callback,
+ PointerGetDatum(mqh->mqh_queue));
+
+ /* Release local memory associated with handle. */
+ if (mqh->mqh_buffer != NULL)
+ pfree(mqh->mqh_buffer);
+ pfree(mqh);
+}
+
+/*
+ * Notify counterparty that we're detaching from shared message queue.
+ *
+ * The purpose of this function is to make sure that the process
+ * with which we're communicating doesn't block forever waiting for us to
+ * fill or drain the queue once we've lost interest. When the sender
+ * detaches, the receiver can read any messages remaining in the queue;
+ * further reads will return SHM_MQ_DETACHED. If the receiver detaches,
+ * further attempts to send messages will likewise return SHM_MQ_DETACHED.
+ *
+ * This is separated out from shm_mq_detach() because if the on_dsm_detach
+ * callback fires, we only want to do this much. We do not try to touch
+ * the local shm_mq_handle, as it may have been pfree'd already.
+ */
+static void
+shm_mq_detach_internal(shm_mq *mq)
+{
+ PGPROC *victim;
+
+ SpinLockAcquire(&mq->mq_mutex);
+ if (mq->mq_sender == MyProc)
+ victim = mq->mq_receiver;
+ else
+ {
+ Assert(mq->mq_receiver == MyProc);
+ victim = mq->mq_sender;
+ }
+ mq->mq_detached = true;
+ SpinLockRelease(&mq->mq_mutex);
+
+ if (victim != NULL)
+ SetLatch(&victim->procLatch);
+}
+
+/*
+ * Get the shm_mq from handle.
+ */
+shm_mq *
+shm_mq_get_queue(shm_mq_handle *mqh)
+{
+ return mqh->mqh_queue;
+}
+
+/*
+ * Write bytes into a shared message queue.
+ */
+static shm_mq_result
+shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data,
+ bool nowait, Size *bytes_written)
+{
+ shm_mq *mq = mqh->mqh_queue;
+ Size sent = 0;
+ uint64 used;
+ Size ringsize = mq->mq_ring_size;
+ Size available;
+
+ while (sent < nbytes)
+ {
+ uint64 rb;
+ uint64 wb;
+
+ /* Compute number of ring buffer bytes used and available. */
+ rb = pg_atomic_read_u64(&mq->mq_bytes_read);
+ wb = pg_atomic_read_u64(&mq->mq_bytes_written);
+ Assert(wb >= rb);
+ used = wb - rb;
+ Assert(used <= ringsize);
+ available = Min(ringsize - used, nbytes - sent);
+
+ /*
+ * Bail out if the queue has been detached. Note that we would be in
+ * trouble if the compiler decided to cache the value of
+ * mq->mq_detached in a register or on the stack across loop
+ * iterations. It probably shouldn't do that anyway since we'll
+ * always return, call an external function that performs a system
+ * call, or reach a memory barrier at some point later in the loop,
+ * but just to be sure, insert a compiler barrier here.
+ */
+ pg_compiler_barrier();
+ if (mq->mq_detached)
+ {
+ *bytes_written = sent;
+ return SHM_MQ_DETACHED;
+ }
+
+ if (available == 0 && !mqh->mqh_counterparty_attached)
+ {
+ /*
+ * The queue is full, so if the receiver isn't yet known to be
+ * attached, we must wait for that to happen.
+ */
+ if (nowait)
+ {
+ if (shm_mq_counterparty_gone(mq, mqh->mqh_handle))
+ {
+ *bytes_written = sent;
+ return SHM_MQ_DETACHED;
+ }
+ if (shm_mq_get_receiver(mq) == NULL)
+ {
+ *bytes_written = sent;
+ return SHM_MQ_WOULD_BLOCK;
+ }
+ }
+ else if (!shm_mq_wait_internal(mq, &mq->mq_receiver,
+ mqh->mqh_handle))
+ {
+ mq->mq_detached = true;
+ *bytes_written = sent;
+ return SHM_MQ_DETACHED;
+ }
+ mqh->mqh_counterparty_attached = true;
+
+ /*
+ * The receiver may have read some data after attaching, so we
+ * must not wait without rechecking the queue state.
+ */
+ }
+ else if (available == 0)
+ {
+ /*
+			 * Since mqh->mqh_counterparty_attached is known to be true at this
+ * point, mq_receiver has been set, and it can't change once set.
+ * Therefore, we can read it without acquiring the spinlock.
+ */
+ Assert(mqh->mqh_counterparty_attached);
+ SetLatch(&mq->mq_receiver->procLatch);
+
+ /* Skip manipulation of our latch if nowait = true. */
+ if (nowait)
+ {
+ *bytes_written = sent;
+ return SHM_MQ_WOULD_BLOCK;
+ }
+
+ /*
+ * Wait for our latch to be set. It might already be set for some
+ * unrelated reason, but that'll just result in one extra trip
+ * through the loop. It's worth it to avoid resetting the latch
+ * at top of loop, because setting an already-set latch is much
+ * cheaper than setting one that has been reset.
+ */
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+ WAIT_EVENT_MQ_SEND);
+
+ /* Reset the latch so we don't spin. */
+ ResetLatch(MyLatch);
+
+ /* An interrupt may have occurred while we were waiting. */
+ CHECK_FOR_INTERRUPTS();
+ }
+ else
+ {
+ Size offset;
+ Size sendnow;
+
+ offset = wb % (uint64) ringsize;
+ sendnow = Min(available, ringsize - offset);
+
+ /*
+ * Write as much data as we can via a single memcpy(). Make sure
+ * these writes happen after the read of mq_bytes_read, above.
+ * This barrier pairs with the one in shm_mq_inc_bytes_read.
+ * (Since we're separating the read of mq_bytes_read from a
+ * subsequent write to mq_ring, we need a full barrier here.)
+ */
+ pg_memory_barrier();
+ memcpy(&mq->mq_ring[mq->mq_ring_offset + offset],
+ (char *) data + sent, sendnow);
+ sent += sendnow;
+
+ /*
+ * Update count of bytes written, with alignment padding. Note
+ * that this will never actually insert any padding except at the
+ * end of a run of bytes, because the buffer size is a multiple of
+ * MAXIMUM_ALIGNOF, and each read is as well.
+ */
+ Assert(sent == nbytes || sendnow == MAXALIGN(sendnow));
+ shm_mq_inc_bytes_written(mq, MAXALIGN(sendnow));
+
+ /*
+ * For efficiency, we don't set the reader's latch here. We'll do
+ * that only when the buffer fills up or after writing an entire
+ * message.
+ */
+ }
+ }
+
+ *bytes_written = sent;
+ return SHM_MQ_SUCCESS;
+}
+
+/*
+ * Wait until at least bytes_needed bytes are available to be read from the
+ * shared message queue, or until the buffer wraps around. If the queue is
+ * detached, returns SHM_MQ_DETACHED. If nowait is specified and a wait
+ * would be required, returns SHM_MQ_WOULD_BLOCK. Otherwise, *datap is set
+ * to the location at which data bytes can be read, *nbytesp is set to the
+ * number of bytes which can be read at that address, and the return value
+ * is SHM_MQ_SUCCESS.
+ */
+static shm_mq_result
+shm_mq_receive_bytes(shm_mq_handle *mqh, Size bytes_needed, bool nowait,
+ Size *nbytesp, void **datap)
+{
+ shm_mq *mq = mqh->mqh_queue;
+ Size ringsize = mq->mq_ring_size;
+ uint64 used;
+ uint64 written;
+
+ for (;;)
+ {
+ Size offset;
+ uint64 read;
+
+ /* Get bytes written, so we can compute what's available to read. */
+ written = pg_atomic_read_u64(&mq->mq_bytes_written);
+
+ /*
+ * Get bytes read. Include bytes we could consume but have not yet
+ * consumed.
+ */
+ read = pg_atomic_read_u64(&mq->mq_bytes_read) +
+ mqh->mqh_consume_pending;
+ used = written - read;
+ Assert(used <= ringsize);
+ offset = read % (uint64) ringsize;
+
+ /* If we have enough data or buffer has wrapped, we're done. */
+ if (used >= bytes_needed || offset + used >= ringsize)
+ {
+ *nbytesp = Min(used, ringsize - offset);
+ *datap = &mq->mq_ring[mq->mq_ring_offset + offset];
+
+ /*
+ * Separate the read of mq_bytes_written, above, from caller's
+ * attempt to read the data itself. Pairs with the barrier in
+ * shm_mq_inc_bytes_written.
+ */
+ pg_read_barrier();
+ return SHM_MQ_SUCCESS;
+ }
+
+ /*
+ * Fall out before waiting if the queue has been detached.
+ *
+ * Note that we don't check for this until *after* considering whether
+ * the data already available is enough, since the receiver can finish
+ * receiving a message stored in the buffer even after the sender has
+ * detached.
+ */
+ if (mq->mq_detached)
+ {
+ /*
+ * If the writer advanced mq_bytes_written and then set
+ * mq_detached, we might not have read the final value of
+ * mq_bytes_written above. Insert a read barrier and then check
+ * again if mq_bytes_written has advanced.
+ */
+ pg_read_barrier();
+ if (written != pg_atomic_read_u64(&mq->mq_bytes_written))
+ continue;
+
+ return SHM_MQ_DETACHED;
+ }
+
+ /*
+ * We didn't get enough data to satisfy the request, so mark any data
+ * previously-consumed as read to make more buffer space.
+ */
+ if (mqh->mqh_consume_pending > 0)
+ {
+ shm_mq_inc_bytes_read(mq, mqh->mqh_consume_pending);
+ mqh->mqh_consume_pending = 0;
+ }
+
+ /* Skip manipulation of our latch if nowait = true. */
+ if (nowait)
+ return SHM_MQ_WOULD_BLOCK;
+
+ /*
+ * Wait for our latch to be set. It might already be set for some
+ * unrelated reason, but that'll just result in one extra trip through
+ * the loop. It's worth it to avoid resetting the latch at top of
+ * loop, because setting an already-set latch is much cheaper than
+ * setting one that has been reset.
+ */
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+ WAIT_EVENT_MQ_RECEIVE);
+
+ /* Reset the latch so we don't spin. */
+ ResetLatch(MyLatch);
+
+ /* An interrupt may have occurred while we were waiting. */
+ CHECK_FOR_INTERRUPTS();
+ }
+}
+
+/*
+ * Test whether a counterparty who may not even be alive yet is definitely gone.
+ */
+static bool
+shm_mq_counterparty_gone(shm_mq *mq, BackgroundWorkerHandle *handle)
+{
+ pid_t pid;
+
+ /* If the queue has been detached, counterparty is definitely gone. */
+ if (mq->mq_detached)
+ return true;
+
+ /* If there's a handle, check worker status. */
+ if (handle != NULL)
+ {
+ BgwHandleStatus status;
+
+ /* Check for unexpected worker death. */
+ status = GetBackgroundWorkerPid(handle, &pid);
+ if (status != BGWH_STARTED && status != BGWH_NOT_YET_STARTED)
+ {
+ /* Mark it detached, just to make it official. */
+ mq->mq_detached = true;
+ return true;
+ }
+ }
+
+ /* Counterparty is not definitively gone. */
+ return false;
+}
+
+/*
+ * This is used when a process is waiting for its counterpart to attach to the
+ * queue. We exit when the other process attaches as expected, or, if
+ * handle != NULL, when the referenced background process or the postmaster
+ * dies. Note that if handle == NULL, and the process fails to attach, we'll
+ * potentially get stuck here forever waiting for a process that may never
+ * start. We do check for interrupts, though.
+ *
+ * ptr is a pointer to the memory address that we're expecting to become
+ * non-NULL when our counterpart attaches to the queue.
+ */
+static bool
+shm_mq_wait_internal(shm_mq *mq, PGPROC **ptr, BackgroundWorkerHandle *handle)
+{
+ bool result = false;
+
+ for (;;)
+ {
+ BgwHandleStatus status;
+ pid_t pid;
+
+ /* Acquire the lock just long enough to check the pointer. */
+ SpinLockAcquire(&mq->mq_mutex);
+ result = (*ptr != NULL);
+ SpinLockRelease(&mq->mq_mutex);
+
+ /* Fail if detached; else succeed if initialized. */
+ if (mq->mq_detached)
+ {
+ result = false;
+ break;
+ }
+ if (result)
+ break;
+
+ if (handle != NULL)
+ {
+ /* Check for unexpected worker death. */
+ status = GetBackgroundWorkerPid(handle, &pid);
+ if (status != BGWH_STARTED && status != BGWH_NOT_YET_STARTED)
+ {
+ result = false;
+ break;
+ }
+ }
+
+ /* Wait to be signaled. */
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+ WAIT_EVENT_MQ_INTERNAL);
+
+ /* Reset the latch so we don't spin. */
+ ResetLatch(MyLatch);
+
+ /* An interrupt may have occurred while we were waiting. */
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ return result;
+}
+
+/*
+ * Increment the number of bytes read.
+ */
+static void
+shm_mq_inc_bytes_read(shm_mq *mq, Size n)
+{
+ PGPROC *sender;
+
+ /*
+ * Separate prior reads of mq_ring from the increment of mq_bytes_read
+ * which follows. This pairs with the full barrier in
+ * shm_mq_send_bytes(). We only need a read barrier here because the
+ * increment of mq_bytes_read is actually a read followed by a dependent
+ * write.
+ */
+ pg_read_barrier();
+
+ /*
+ * There's no need to use pg_atomic_fetch_add_u64 here, because nobody
+ * else can be changing this value. This method should be cheaper.
+ */
+ pg_atomic_write_u64(&mq->mq_bytes_read,
+ pg_atomic_read_u64(&mq->mq_bytes_read) + n);
+
+ /*
+ * We shouldn't have any bytes to read without a sender, so we can read
+ * mq_sender here without a lock. Once it's initialized, it can't change.
+ */
+ sender = mq->mq_sender;
+ Assert(sender != NULL);
+ SetLatch(&sender->procLatch);
+}
+
+/*
+ * Increment the number of bytes written.
+ */
+static void
+shm_mq_inc_bytes_written(shm_mq *mq, Size n)
+{
+ /*
+ * Separate prior reads of mq_ring from the write of mq_bytes_written
+ * which we're about to do. Pairs with the read barrier found in
+ * shm_mq_receive_bytes.
+ */
+ pg_write_barrier();
+
+ /*
+ * There's no need to use pg_atomic_fetch_add_u64 here, because nobody
+ * else can be changing this value. This method avoids taking the bus
+ * lock unnecessarily.
+ */
+ pg_atomic_write_u64(&mq->mq_bytes_written,
+ pg_atomic_read_u64(&mq->mq_bytes_written) + n);
+}
+
+/* Shim for on_dsm_detach callback. */
+static void
+shm_mq_detach_callback(dsm_segment *seg, Datum arg)
+{
+ shm_mq *mq = (shm_mq *) DatumGetPointer(arg);
+
+ shm_mq_detach_internal(mq);
+}
diff --git a/src/backend/storage/ipc/shm_toc.c b/src/backend/storage/ipc/shm_toc.c
new file mode 100644
index 0000000..863b98b
--- /dev/null
+++ b/src/backend/storage/ipc/shm_toc.c
@@ -0,0 +1,272 @@
+/*-------------------------------------------------------------------------
+ *
+ * shm_toc.c
+ * shared memory segment table of contents
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/storage/ipc/shm_toc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "port/atomics.h"
+#include "storage/shm_toc.h"
+#include "storage/spin.h"
+
+typedef struct shm_toc_entry
+{
+ uint64 key; /* Arbitrary identifier */
+ Size offset; /* Offset, in bytes, from TOC start */
+} shm_toc_entry;
+
+struct shm_toc
+{
+ uint64 toc_magic; /* Magic number identifying this TOC */
+ slock_t toc_mutex; /* Spinlock for mutual exclusion */
+ Size toc_total_bytes; /* Bytes managed by this TOC */
+ Size toc_allocated_bytes; /* Bytes allocated of those managed */
+ uint32 toc_nentry; /* Number of entries in TOC */
+ shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER];
+};
+
+/*
+ * Initialize a region of shared memory with a table of contents.
+ */
+shm_toc *
+shm_toc_create(uint64 magic, void *address, Size nbytes)
+{
+ shm_toc *toc = (shm_toc *) address;
+
+ Assert(nbytes > offsetof(shm_toc, toc_entry));
+ toc->toc_magic = magic;
+ SpinLockInit(&toc->toc_mutex);
+
+ /*
+ * The alignment code in shm_toc_allocate() assumes that the starting
+ * value is buffer-aligned.
+ */
+ toc->toc_total_bytes = BUFFERALIGN_DOWN(nbytes);
+ toc->toc_allocated_bytes = 0;
+ toc->toc_nentry = 0;
+
+ return toc;
+}
+
+/*
+ * Attach to an existing table of contents. If the magic number found at
+ * the target address doesn't match our expectations, return NULL.
+ */
+shm_toc *
+shm_toc_attach(uint64 magic, void *address)
+{
+ shm_toc *toc = (shm_toc *) address;
+
+ if (toc->toc_magic != magic)
+ return NULL;
+
+ Assert(toc->toc_total_bytes >= toc->toc_allocated_bytes);
+ Assert(toc->toc_total_bytes > offsetof(shm_toc, toc_entry));
+
+ return toc;
+}
+
+/*
+ * Allocate shared memory from a segment managed by a table of contents.
+ *
+ * This is not a full-blown allocator; there's no way to free memory. It's
+ * just a way of dividing a single physical shared memory segment into logical
+ * chunks that may be used for different purposes.
+ *
+ * We allocate backwards from the end of the segment, so that the TOC entries
+ * can grow forward from the start of the segment.
+ */
+void *
+shm_toc_allocate(shm_toc *toc, Size nbytes)
+{
+ volatile shm_toc *vtoc = toc;
+ Size total_bytes;
+ Size allocated_bytes;
+ Size nentry;
+ Size toc_bytes;
+
+ /*
+ * Make sure request is well-aligned. XXX: MAXALIGN is not enough,
+ * because atomic ops might need a wider alignment. We don't have a
+ * proper definition for the minimum to make atomic ops safe, but
+ * BUFFERALIGN ought to be enough.
+ */
+ nbytes = BUFFERALIGN(nbytes);
+
+ SpinLockAcquire(&toc->toc_mutex);
+
+ total_bytes = vtoc->toc_total_bytes;
+ allocated_bytes = vtoc->toc_allocated_bytes;
+ nentry = vtoc->toc_nentry;
+ toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
+ + allocated_bytes;
+
+ /* Check for memory exhaustion and overflow. */
+ if (toc_bytes + nbytes > total_bytes || toc_bytes + nbytes < toc_bytes)
+ {
+ SpinLockRelease(&toc->toc_mutex);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory")));
+ }
+ vtoc->toc_allocated_bytes += nbytes;
+
+ SpinLockRelease(&toc->toc_mutex);
+
+ return ((char *) toc) + (total_bytes - allocated_bytes - nbytes);
+}
+
+/*
+ * Return the number of bytes that can still be allocated.
+ */
+Size
+shm_toc_freespace(shm_toc *toc)
+{
+ volatile shm_toc *vtoc = toc;
+ Size total_bytes;
+ Size allocated_bytes;
+ Size nentry;
+ Size toc_bytes;
+
+ SpinLockAcquire(&toc->toc_mutex);
+ total_bytes = vtoc->toc_total_bytes;
+ allocated_bytes = vtoc->toc_allocated_bytes;
+ nentry = vtoc->toc_nentry;
+ SpinLockRelease(&toc->toc_mutex);
+
+ toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry);
+ Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes);
+ return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes));
+}
+
+/*
+ * Insert a TOC entry.
+ *
+ * The idea here is that the process setting up the shared memory segment will
+ * register the addresses of data structures within the segment using this
+ * function. Each data structure will be identified using a 64-bit key, which
+ * is assumed to be a well-known or discoverable integer. Other processes
+ * accessing the shared memory segment can pass the same key to
+ * shm_toc_lookup() to discover the addresses of those data structures.
+ *
+ * Since the shared memory segment may be mapped at different addresses within
+ * different backends, we store relative rather than absolute pointers.
+ *
+ * This won't scale well to a large number of keys. Hopefully, that isn't
+ * necessary; if it proves to be, we might need to provide a more sophisticated
+ * data structure here. But the real idea here is just to give someone mapping
+ * a dynamic shared memory the ability to find the bare minimum number of
+ * pointers that they need to bootstrap. If you're storing a lot of stuff in
+ * the TOC, you're doing it wrong.
+ */
+void
+shm_toc_insert(shm_toc *toc, uint64 key, void *address)
+{
+ volatile shm_toc *vtoc = toc;
+ Size total_bytes;
+ Size allocated_bytes;
+ Size nentry;
+ Size toc_bytes;
+ Size offset;
+
+ /* Relativize pointer. */
+ Assert(address > (void *) toc);
+ offset = ((char *) address) - (char *) toc;
+
+ SpinLockAcquire(&toc->toc_mutex);
+
+ total_bytes = vtoc->toc_total_bytes;
+ allocated_bytes = vtoc->toc_allocated_bytes;
+ nentry = vtoc->toc_nentry;
+ toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
+ + allocated_bytes;
+
+ /* Check for memory exhaustion and overflow. */
+ if (toc_bytes + sizeof(shm_toc_entry) > total_bytes ||
+ toc_bytes + sizeof(shm_toc_entry) < toc_bytes ||
+ nentry >= PG_UINT32_MAX)
+ {
+ SpinLockRelease(&toc->toc_mutex);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory")));
+ }
+
+ Assert(offset < total_bytes);
+ vtoc->toc_entry[nentry].key = key;
+ vtoc->toc_entry[nentry].offset = offset;
+
+ /*
+ * By placing a write barrier after filling in the entry and before
+ * updating the number of entries, we make it safe to read the TOC
+ * unlocked.
+ */
+ pg_write_barrier();
+
+ vtoc->toc_nentry++;
+
+ SpinLockRelease(&toc->toc_mutex);
+}
+
+/*
+ * Look up a TOC entry.
+ *
+ * If the key is not found, returns NULL if noError is true, otherwise
+ * throws elog(ERROR).
+ *
+ * Unlike the other functions in this file, this operation acquires no lock;
+ * it uses only barriers. It probably wouldn't hurt concurrency very much even
+ * if it did get a lock, but since it's reasonably likely that a group of
+ * worker processes could each read a series of entries from the same TOC
+ * right around the same time, there seems to be some value in avoiding it.
+ */
+void *
+shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
+{
+ uint32 nentry;
+ uint32 i;
+
+ /*
+ * Read the number of entries before we examine any entry. We assume that
+ * reading a uint32 is atomic.
+ */
+ nentry = toc->toc_nentry;
+ pg_read_barrier();
+
+ /* Now search for a matching entry. */
+ for (i = 0; i < nentry; ++i)
+ {
+ if (toc->toc_entry[i].key == key)
+ return ((char *) toc) + toc->toc_entry[i].offset;
+ }
+
+ /* No matching entry was found. */
+ if (!noError)
+ elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p",
+ key, toc);
+ return NULL;
+}
+
+/*
+ * Estimate how much shared memory will be required to store a TOC and its
+ * dependent data structures.
+ */
+Size
+shm_toc_estimate(shm_toc_estimator *e)
+{
+ Size sz;
+
+ sz = offsetof(shm_toc, toc_entry);
+ sz = add_size(sz, mul_size(e->number_of_keys, sizeof(shm_toc_entry)));
+ sz = add_size(sz, e->space_for_chunks);
+
+ return BUFFERALIGN(sz);
+}
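+
+/*
+ * Example (minimal sketch): typical use of the TOC when setting up a dynamic
+ * shared memory segment.  MY_MAGIC, KEY_STATE, MyState and segsize are
+ * illustrative names, not part of this module.
+ *
+ *		shm_toc_estimator e;
+ *		Size		segsize;
+ *		dsm_segment *seg;
+ *		shm_toc    *toc;
+ *		MyState    *state;
+ *
+ *		shm_toc_initialize_estimator(&e);
+ *		shm_toc_estimate_chunk(&e, sizeof(MyState));
+ *		shm_toc_estimate_keys(&e, 1);
+ *		segsize = shm_toc_estimate(&e);
+ *
+ *		seg = dsm_create(segsize, 0);
+ *		toc = shm_toc_create(MY_MAGIC, dsm_segment_address(seg), segsize);
+ *		state = shm_toc_allocate(toc, sizeof(MyState));
+ *		shm_toc_insert(toc, KEY_STATE, state);
+ *
+ * A process that later maps the segment calls shm_toc_attach() with the same
+ * magic number and then shm_toc_lookup(toc, KEY_STATE, false) to find the
+ * structure again.
+ */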
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
new file mode 100644
index 0000000..4425e99
--- /dev/null
+++ b/src/backend/storage/ipc/shmem.c
@@ -0,0 +1,611 @@
+/*-------------------------------------------------------------------------
+ *
+ * shmem.c
+ * create shared memory and initialize shared memory data structures.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/shmem.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * POSTGRES processes share one or more regions of shared memory.
+ * The shared memory is created by a postmaster and is inherited
+ * by each backend via fork() (or, in some ports, via other OS-specific
+ * methods). The routines in this file are used for allocating and
+ * binding to shared memory data structures.
+ *
+ * NOTES:
+ * (a) There are three kinds of shared memory data structures
+ * available to POSTGRES: fixed-size structures, queues and hash
+ * tables. Fixed-size structures contain things like global variables
+ * for a module and should never be allocated after the shared memory
+ * initialization phase. Hash tables have a fixed maximum size, but
+ * their actual size can vary dynamically. When entries are added
+ * to the table, more space is allocated. Queues link data structures
+ * that have been allocated either within fixed-size structures or as hash
+ * buckets. Each shared data structure has a string name to identify
+ * it (assigned in the module that declares it).
+ *
+ * (b) During initialization, each module looks for its
+ * shared data structures in a hash table called the "Shmem Index".
+ * If the data structure is not present, the caller can allocate
+ * a new one and initialize it. If the data structure is present,
+ * the caller "attaches" to the structure by initializing a pointer
+ * in the local address space.
+ * The shmem index has two purposes: first, it gives us
+ * a simple model of how the world looks when a backend process
+ * initializes. If something is present in the shmem index,
+ * it is initialized. If it is not, it is uninitialized. Second,
+ * the shmem index allows us to allocate shared memory on demand
+ * instead of trying to preallocate structures and hard-wire the
+ * sizes and locations in header files. If you are using a lot
+ * of shared memory in a lot of different places (and changing
+ * things during development), this is important.
+ *
+ * (c) In standard Unix-ish environments, individual backends do not
+ * need to re-establish their local pointers into shared memory, because
+ * they inherit correct values of those variables via fork() from the
+ * postmaster. However, this does not work in the EXEC_BACKEND case.
+ * In ports using EXEC_BACKEND, new backends have to set up their local
+ * pointers using the method described in (b) above.
+ *
+ * (d) memory allocation model: shared memory can never be
+ * freed, once allocated. Each hash table has its own free list,
+ * so hash buckets can be reused when an item is deleted. However,
+ * if one hash table grows very large and then shrinks, its space
+ * cannot be redistributed to other tables. We could build a simple
+ * hash bucket garbage collector if need be. Right now, it seems
+ * unnecessary.
+ */
+
+#include "postgres.h"
+
+#include "access/transam.h"
+#include "fmgr.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "storage/lwlock.h"
+#include "storage/pg_shmem.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "utils/builtins.h"
+
+static void *ShmemAllocRaw(Size size, Size *allocated_size);
+
+/* shared memory global variables */
+
+static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */
+
+static void *ShmemBase; /* start address of shared memory */
+
+static void *ShmemEnd; /* end+1 address of shared memory */
+
+slock_t *ShmemLock; /* spinlock for shared memory and LWLock
+ * allocation */
+
+static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
+
+
+/*
+ * InitShmemAccess() --- set up basic pointers to shared memory.
+ *
+ * Note: the argument should be declared "PGShmemHeader *seghdr",
+ * but we use void to avoid having to include ipc.h in shmem.h.
+ */
+void
+InitShmemAccess(void *seghdr)
+{
+ PGShmemHeader *shmhdr = (PGShmemHeader *) seghdr;
+
+ ShmemSegHdr = shmhdr;
+ ShmemBase = (void *) shmhdr;
+ ShmemEnd = (char *) ShmemBase + shmhdr->totalsize;
+}
+
+/*
+ * InitShmemAllocation() --- set up shared-memory space allocation.
+ *
+ * This should be called only in the postmaster or a standalone backend.
+ */
+void
+InitShmemAllocation(void)
+{
+ PGShmemHeader *shmhdr = ShmemSegHdr;
+ char *aligned;
+
+ Assert(shmhdr != NULL);
+
+ /*
+ * Initialize the spinlock used by ShmemAlloc. We must use
+ * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet.
+ */
+ ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t));
+
+ SpinLockInit(ShmemLock);
+
+ /*
+ * Allocations after this point should go through ShmemAlloc, which
+ * expects to allocate everything on cache line boundaries. Make sure the
+ * first allocation begins on a cache line boundary.
+ */
+ aligned = (char *)
+ (CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset)));
+ shmhdr->freeoffset = aligned - (char *) shmhdr;
+
+ /* ShmemIndex can't be set up yet (need LWLocks first) */
+ shmhdr->index = NULL;
+ ShmemIndex = (HTAB *) NULL;
+
+ /*
+ * Initialize ShmemVariableCache for transaction manager. (This doesn't
+ * really belong here, but not worth moving.)
+ */
+ ShmemVariableCache = (VariableCache)
+ ShmemAlloc(sizeof(*ShmemVariableCache));
+ memset(ShmemVariableCache, 0, sizeof(*ShmemVariableCache));
+}
+
+/*
+ * ShmemAlloc -- allocate max-aligned chunk from shared memory
+ *
+ * Throws error if request cannot be satisfied.
+ *
+ * Assumes ShmemLock and ShmemSegHdr are initialized.
+ */
+void *
+ShmemAlloc(Size size)
+{
+ void *newSpace;
+ Size allocated_size;
+
+ newSpace = ShmemAllocRaw(size, &allocated_size);
+ if (!newSpace)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory (%zu bytes requested)",
+ size)));
+ return newSpace;
+}
+
+/*
+ * ShmemAllocNoError -- allocate max-aligned chunk from shared memory
+ *
+ * As ShmemAlloc, but returns NULL if out of space, rather than erroring.
+ */
+void *
+ShmemAllocNoError(Size size)
+{
+ Size allocated_size;
+
+ return ShmemAllocRaw(size, &allocated_size);
+}
+
+/*
+ * ShmemAllocRaw -- allocate an aligned chunk and return allocated size
+ *
+ * Also sets *allocated_size to the number of bytes allocated, which will
+ * be equal to the number requested plus any padding we choose to add.
+ */
+static void *
+ShmemAllocRaw(Size size, Size *allocated_size)
+{
+ Size newStart;
+ Size newFree;
+ void *newSpace;
+
+ /*
+ * Ensure all space is adequately aligned. We used to only MAXALIGN this
+ * space but experience has proved that on modern systems that is not good
+ * enough. Many parts of the system are very sensitive to critical data
+ * structures getting split across cache line boundaries. To avoid that,
+ * attempt to align the beginning of the allocation to a cache line
+ * boundary. The calling code will still need to be careful about how it
+ * uses the allocated space - e.g. by padding each element in an array of
+ * structures out to a power-of-two size - but without this, even that
+ * won't be sufficient.
+ */
+ size = CACHELINEALIGN(size);
+ *allocated_size = size;
+
+ Assert(ShmemSegHdr != NULL);
+
+ SpinLockAcquire(ShmemLock);
+
+ newStart = ShmemSegHdr->freeoffset;
+
+ newFree = newStart + size;
+ if (newFree <= ShmemSegHdr->totalsize)
+ {
+ newSpace = (void *) ((char *) ShmemBase + newStart);
+ ShmemSegHdr->freeoffset = newFree;
+ }
+ else
+ newSpace = NULL;
+
+ SpinLockRelease(ShmemLock);
+
+ /* note this assert is okay with newSpace == NULL */
+ Assert(newSpace == (void *) CACHELINEALIGN(newSpace));
+
+ return newSpace;
+}
+
+/*
+ * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory
+ *
+ * Allocate space without locking ShmemLock. This should be used for,
+ * and only for, allocations that must happen before ShmemLock is ready.
+ *
+ * We consider maxalign, rather than cachealign, sufficient here.
+ */
+void *
+ShmemAllocUnlocked(Size size)
+{
+ Size newStart;
+ Size newFree;
+ void *newSpace;
+
+ /*
+ * Ensure allocated space is adequately aligned.
+ */
+ size = MAXALIGN(size);
+
+ Assert(ShmemSegHdr != NULL);
+
+ newStart = ShmemSegHdr->freeoffset;
+
+ newFree = newStart + size;
+ if (newFree > ShmemSegHdr->totalsize)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory (%zu bytes requested)",
+ size)));
+ ShmemSegHdr->freeoffset = newFree;
+
+ newSpace = (void *) ((char *) ShmemBase + newStart);
+
+ Assert(newSpace == (void *) MAXALIGN(newSpace));
+
+ return newSpace;
+}
+
+/*
+ * ShmemAddrIsValid -- test if an address refers to shared memory
+ *
+ * Returns true if the pointer points within the shared memory segment.
+ */
+bool
+ShmemAddrIsValid(const void *addr)
+{
+ return (addr >= ShmemBase) && (addr < ShmemEnd);
+}
+
+/*
+ * InitShmemIndex() --- set up or attach to shmem index table.
+ */
+void
+InitShmemIndex(void)
+{
+ HASHCTL info;
+
+ /*
+ * Create the shared memory shmem index.
+ *
+ * Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex
+ * hashtable to exist already, we have a bit of a circularity problem in
+ * initializing the ShmemIndex itself. The special "ShmemIndex" hash
+ * table name will tell ShmemInitStruct to fake it.
+ */
+ info.keysize = SHMEM_INDEX_KEYSIZE;
+ info.entrysize = sizeof(ShmemIndexEnt);
+
+ ShmemIndex = ShmemInitHash("ShmemIndex",
+ SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
+ &info,
+ HASH_ELEM | HASH_STRINGS);
+}
+
+/*
+ * ShmemInitHash -- Create and initialize, or attach to, a
+ * shared memory hash table.
+ *
+ * We assume caller is doing some kind of synchronization
+ * so that two processes don't try to create/initialize the same
+ * table at once. (In practice, all creations are done in the postmaster
+ * process; child processes should always be attaching to existing tables.)
+ *
+ * max_size is the estimated maximum number of hashtable entries. This is
+ * not a hard limit, but the access efficiency will degrade if it is
+ * exceeded substantially (since it's used to compute directory size and
+ * the hash table buckets will get overfull).
+ *
+ * init_size is the number of hashtable entries to preallocate. For a table
+ * whose maximum size is certain, this should be equal to max_size; that
+ * ensures that no run-time out-of-shared-memory failures can occur.
+ *
+ * *infoP and hash_flags must specify at least the entry sizes and key
+ * comparison semantics (see hash_create()). Flag bits and values specific
+ * to shared-memory hash tables are added here, except that callers may
+ * choose to specify HASH_PARTITION and/or HASH_FIXED_SIZE.
+ *
+ * Note: before Postgres 9.0, this function returned NULL for some failure
+ * cases. Now, it always throws error instead, so callers need not check
+ * for NULL.
+ */
+HTAB *
+ShmemInitHash(const char *name, /* table string name for shmem index */
+ long init_size, /* initial table size */
+ long max_size, /* max size of the table */
+ HASHCTL *infoP, /* info about key and bucket size */
+ int hash_flags) /* info about infoP */
+{
+ bool found;
+ void *location;
+
+ /*
+ * Hash tables allocated in shared memory have a fixed directory; it can't
+ * grow or other backends wouldn't be able to find it. So, make sure we
+ * make it big enough to start with.
+ *
+ * The shared memory allocator must be specified too.
+ */
+ infoP->dsize = infoP->max_dsize = hash_select_dirsize(max_size);
+ infoP->alloc = ShmemAllocNoError;
+ hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE;
+
+ /* look it up in the shmem index */
+ location = ShmemInitStruct(name,
+ hash_get_shared_size(infoP, hash_flags),
+ &found);
+
+ /*
+ * if it already exists, attach to it rather than allocate and initialize
+ * new space
+ */
+ if (found)
+ hash_flags |= HASH_ATTACH;
+
+ /* Pass location of hashtable header to hash_create */
+ infoP->hctl = (HASHHDR *) location;
+
+ return hash_create(name, init_size, infoP, hash_flags);
+}
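+
+/*
+ * Example (minimal sketch): creating or attaching to a shared hash table
+ * keyed by an Oid; "MyHash" and "MyHashEntry" are illustrative names.
+ *
+ *		HASHCTL		info;
+ *
+ *		info.keysize = sizeof(Oid);
+ *		info.entrysize = sizeof(MyHashEntry);
+ *		MyHash = ShmemInitHash("My Hash Table",
+ *							   128, 128,
+ *							   &info,
+ *							   HASH_ELEM | HASH_BLOBS);
+ */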
+
+/*
+ * ShmemInitStruct -- Create/attach to a structure in shared memory.
+ *
+ * This is called during initialization to find or allocate
+ * a data structure in shared memory. If no other process
+ * has created the structure, this routine allocates space
+ * for it. If it exists already, a pointer to the existing
+ * structure is returned.
+ *
+ * Returns: pointer to the object. *foundPtr is set true if the object was
+ * already in the shmem index (hence, already initialized).
+ *
+ * Note: before Postgres 9.0, this function returned NULL for some failure
+ * cases. Now, it always throws error instead, so callers need not check
+ * for NULL.
+ */
+void *
+ShmemInitStruct(const char *name, Size size, bool *foundPtr)
+{
+ ShmemIndexEnt *result;
+ void *structPtr;
+
+ LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);
+
+ if (!ShmemIndex)
+ {
+ PGShmemHeader *shmemseghdr = ShmemSegHdr;
+
+ /* Must be trying to create/attach to ShmemIndex itself */
+ Assert(strcmp(name, "ShmemIndex") == 0);
+
+ if (IsUnderPostmaster)
+ {
+ /* Must be initializing a (non-standalone) backend */
+ Assert(shmemseghdr->index != NULL);
+ structPtr = shmemseghdr->index;
+ *foundPtr = true;
+ }
+ else
+ {
+ /*
+ * If the shmem index doesn't exist, we are bootstrapping: we must
+ * be trying to init the shmem index itself.
+ *
+ * Notice that the ShmemIndexLock is released before the shmem
+ * index has been initialized. This should be OK because no other
+ * process can be accessing shared memory yet.
+ */
+ Assert(shmemseghdr->index == NULL);
+ structPtr = ShmemAlloc(size);
+ shmemseghdr->index = structPtr;
+ *foundPtr = false;
+ }
+ LWLockRelease(ShmemIndexLock);
+ return structPtr;
+ }
+
+ /* look it up in the shmem index */
+ result = (ShmemIndexEnt *)
+ hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr);
+
+ if (!result)
+ {
+ LWLockRelease(ShmemIndexLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("could not create ShmemIndex entry for data structure \"%s\"",
+ name)));
+ }
+
+ if (*foundPtr)
+ {
+ /*
+ * Structure is in the shmem index so someone else has allocated it
+ * already. The size better be the same as the size we are trying to
+ * initialize to, or there is a name conflict (or worse).
+ */
+ if (result->size != size)
+ {
+ LWLockRelease(ShmemIndexLock);
+ ereport(ERROR,
+ (errmsg("ShmemIndex entry size is wrong for data structure"
+ " \"%s\": expected %zu, actual %zu",
+ name, size, result->size)));
+ }
+ structPtr = result->location;
+ }
+ else
+ {
+ Size allocated_size;
+
+		/* It isn't in the table yet; allocate and initialize it. */
+ structPtr = ShmemAllocRaw(size, &allocated_size);
+ if (structPtr == NULL)
+ {
+ /* out of memory; remove the failed ShmemIndex entry */
+ hash_search(ShmemIndex, name, HASH_REMOVE, NULL);
+ LWLockRelease(ShmemIndexLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("not enough shared memory for data structure"
+ " \"%s\" (%zu bytes requested)",
+ name, size)));
+ }
+ result->size = size;
+ result->allocated_size = allocated_size;
+ result->location = structPtr;
+ }
+
+ LWLockRelease(ShmemIndexLock);
+
+ Assert(ShmemAddrIsValid(structPtr));
+
+ Assert(structPtr == (void *) CACHELINEALIGN(structPtr));
+
+ return structPtr;
+}
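+
+/*
+ * Example (minimal sketch): the usual pattern in a module's shared-memory
+ * initialization function; MyModuleShmemInit, MyModuleState and
+ * MyModuleStateData are illustrative names.
+ *
+ *		void
+ *		MyModuleShmemInit(void)
+ *		{
+ *			bool		found;
+ *
+ *			MyModuleState = (MyModuleStateData *)
+ *				ShmemInitStruct("My Module State",
+ *								sizeof(MyModuleStateData), &found);
+ *			if (!found)
+ *				... first time through: initialize the contents ...
+ *		}
+ *
+ * In EXEC_BACKEND builds, child backends run the same code and take the
+ * found == true path, which re-establishes their local pointer as described
+ * in note (c) at the top of this file.
+ */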
+
+
+/*
+ * Add two Size values, checking for overflow
+ */
+Size
+add_size(Size s1, Size s2)
+{
+ Size result;
+
+ result = s1 + s2;
+ /* We are assuming Size is an unsigned type here... */
+ if (result < s1 || result < s2)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("requested shared memory size overflows size_t")));
+ return result;
+}
+
+/*
+ * Multiply two Size values, checking for overflow
+ */
+Size
+mul_size(Size s1, Size s2)
+{
+ Size result;
+
+ if (s1 == 0 || s2 == 0)
+ return 0;
+ result = s1 * s2;
+ /* We are assuming Size is an unsigned type here... */
+ if (result / s2 != s1)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("requested shared memory size overflows size_t")));
+ return result;
+}
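+
+/*
+ * Example (minimal sketch): these helpers are typically used when computing
+ * shared memory size requests, so that an oversized configuration fails
+ * cleanly instead of silently overflowing; MyArrayShmemSize, MyArrayHeader,
+ * MyArrayEntry and my_array_entries are illustrative names.
+ *
+ *		Size
+ *		MyArrayShmemSize(void)
+ *		{
+ *			return add_size(MAXALIGN(sizeof(MyArrayHeader)),
+ *							mul_size(my_array_entries,
+ *									 sizeof(MyArrayEntry)));
+ *		}
+ */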
+
+/* SQL SRF showing allocated shared memory */
+Datum
+pg_get_shmem_allocations(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_SIZES_COLS 4
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ TupleDesc tupdesc;
+ Tuplestorestate *tupstore;
+ MemoryContext per_query_ctx;
+ MemoryContext oldcontext;
+ HASH_SEQ_STATUS hstat;
+ ShmemIndexEnt *ent;
+ Size named_allocated = 0;
+ Datum values[PG_GET_SHMEM_SIZES_COLS];
+ bool nulls[PG_GET_SHMEM_SIZES_COLS];
+
+ /* check to see if caller supports us returning a tuplestore */
+ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("set-valued function called in context that cannot accept a set")));
+ if (!(rsinfo->allowedModes & SFRM_Materialize))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("materialize mode required, but it is not allowed in this context")));
+
+ /* Build a tuple descriptor for our result type */
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+ oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+ tupstore = tuplestore_begin_heap(true, false, work_mem);
+ rsinfo->returnMode = SFRM_Materialize;
+ rsinfo->setResult = tupstore;
+ rsinfo->setDesc = tupdesc;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+ hash_seq_init(&hstat, ShmemIndex);
+
+ /* output all allocated entries */
+ memset(nulls, 0, sizeof(nulls));
+ while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+ {
+ values[0] = CStringGetTextDatum(ent->key);
+ values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr);
+ values[2] = Int64GetDatum(ent->size);
+ values[3] = Int64GetDatum(ent->allocated_size);
+ named_allocated += ent->allocated_size;
+
+ tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+ }
+
+ /* output shared memory allocated but not counted via the shmem index */
+ values[0] = CStringGetTextDatum("<anonymous>");
+ nulls[1] = true;
+ values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated);
+ values[3] = values[2];
+ tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+
+ /* output as-of-yet unused shared memory */
+ nulls[0] = true;
+ values[1] = Int64GetDatum(ShmemSegHdr->freeoffset);
+ nulls[1] = false;
+ values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset);
+ values[3] = values[2];
+ tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+
+ LWLockRelease(ShmemIndexLock);
+
+ tuplestore_donestoring(tupstore);
+
+ return (Datum) 0;
+}
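+
+/*
+ * This function typically backs the pg_shmem_allocations system view, so the
+ * allocations can be inspected with, for example:
+ *
+ *		SELECT * FROM pg_shmem_allocations ORDER BY allocated_size DESC;
+ */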
diff --git a/src/backend/storage/ipc/shmqueue.c b/src/backend/storage/ipc/shmqueue.c
new file mode 100644
index 0000000..dc3238c
--- /dev/null
+++ b/src/backend/storage/ipc/shmqueue.c
@@ -0,0 +1,190 @@
+/*-------------------------------------------------------------------------
+ *
+ * shmqueue.c
+ * shared memory linked lists
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/shmqueue.c
+ *
+ * NOTES
+ *
+ * Package for managing doubly-linked lists in shared memory.
+ * The only tricky thing is that SHM_QUEUE will usually be a field
+ * in a larger record. SHMQueueNext has to return a pointer
+ * to the record itself instead of a pointer to the SHMQueue field
+ * of the record. It takes an extra parameter and does some extra
+ * pointer arithmetic to do this correctly.
+ *
+ * NOTE: These are set up so they can be turned into macros some day.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/shmem.h"
+
+
+/*
+ * SHMQueueInit -- make the head of a new queue point
+ * to itself
+ */
+void
+SHMQueueInit(SHM_QUEUE *queue)
+{
+ Assert(ShmemAddrIsValid(queue));
+ queue->prev = queue->next = queue;
+}
+
+/*
+ * SHMQueueIsDetached -- true if element is not currently
+ * in a queue.
+ */
+bool
+SHMQueueIsDetached(const SHM_QUEUE *queue)
+{
+ Assert(ShmemAddrIsValid(queue));
+ return (queue->prev == NULL);
+}
+
+/*
+ * SHMQueueElemInit -- clear an element's links
+ */
+void
+SHMQueueElemInit(SHM_QUEUE *queue)
+{
+ Assert(ShmemAddrIsValid(queue));
+ queue->prev = queue->next = NULL;
+}
+
+/*
+ * SHMQueueDelete -- remove an element from the queue and
+ * close the links
+ */
+void
+SHMQueueDelete(SHM_QUEUE *queue)
+{
+ SHM_QUEUE *nextElem = queue->next;
+ SHM_QUEUE *prevElem = queue->prev;
+
+ Assert(ShmemAddrIsValid(queue));
+ Assert(ShmemAddrIsValid(nextElem));
+ Assert(ShmemAddrIsValid(prevElem));
+
+ prevElem->next = queue->next;
+ nextElem->prev = queue->prev;
+
+ queue->prev = queue->next = NULL;
+}
+
+/*
+ * SHMQueueInsertBefore -- put elem in queue before the given queue
+ * element. Inserting "before" the queue head puts the elem
+ * at the tail of the queue.
+ */
+void
+SHMQueueInsertBefore(SHM_QUEUE *queue, SHM_QUEUE *elem)
+{
+ SHM_QUEUE *prevPtr = queue->prev;
+
+ Assert(ShmemAddrIsValid(queue));
+ Assert(ShmemAddrIsValid(elem));
+
+ elem->next = prevPtr->next;
+ elem->prev = queue->prev;
+ queue->prev = elem;
+ prevPtr->next = elem;
+}
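+
+/*
+ * Illustrative sketch (not part of upstream; "first", "second" and the
+ * "links" field are hypothetical): appending elements in FIFO order is
+ * done by inserting "before" the queue head, since that places each new
+ * element at the tail:
+ *
+ *     SHMQueueInit(&head);
+ *     SHMQueueElemInit(&first->links);
+ *     SHMQueueInsertBefore(&head, &first->links);
+ *     SHMQueueElemInit(&second->links);
+ *     SHMQueueInsertBefore(&head, &second->links);
+ *
+ * Traversal with SHMQueueNext then visits "first" before "second".
+ */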
+
+/*
+ * SHMQueueInsertAfter -- put elem in queue after the given queue
+ * element. Inserting "after" the queue head puts the elem
+ * at the head of the queue.
+ */
+void
+SHMQueueInsertAfter(SHM_QUEUE *queue, SHM_QUEUE *elem)
+{
+ SHM_QUEUE *nextPtr = queue->next;
+
+ Assert(ShmemAddrIsValid(queue));
+ Assert(ShmemAddrIsValid(elem));
+
+ elem->prev = nextPtr->prev;
+ elem->next = queue->next;
+ queue->next = elem;
+ nextPtr->prev = elem;
+}
+
+/*--------------------
+ * SHMQueueNext -- Get the next element from a queue
+ *
+ * To start the iteration, pass the queue head as both queue and curElem.
+ * Returns NULL if no more elements.
+ *
+ * Next element is at curElem->next. If SHMQueue is part of
+ * a larger structure, we want to return a pointer to the
+ * whole structure rather than a pointer to its SHMQueue field.
+ * For example,
+ * struct {
+ * int stuff;
+ * SHMQueue elem;
+ * } ELEMType;
+ * When this element is in a queue, prevElem->next points at struct.elem.
+ * We subtract linkOffset to get the correct start address of the structure.
+ *
+ * calls to SHMQueueNext should take these parameters:
+ * &(queueHead), &(queueHead), offsetof(ELEMType, elem)
+ * or
+ * &(queueHead), &(curElem->elem), offsetof(ELEMType, elem)
+ *--------------------
+ */
+Pointer
+SHMQueueNext(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset)
+{
+ SHM_QUEUE *elemPtr = curElem->next;
+
+ Assert(ShmemAddrIsValid(curElem));
+
+ if (elemPtr == queue) /* back to the queue head? */
+ return NULL;
+
+ return (Pointer) (((char *) elemPtr) - linkOffset);
+}
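+
+/*
+ * Illustrative sketch (not part of upstream): a typical scan over a queue
+ * of the ELEMType records shown in the comment above:
+ *
+ *     ELEMType   *cur;
+ *
+ *     cur = (ELEMType *) SHMQueueNext(&queueHead, &queueHead,
+ *                                     offsetof(ELEMType, elem));
+ *     while (cur != NULL)
+ *     {
+ *         ... work with cur->stuff ...
+ *         cur = (ELEMType *) SHMQueueNext(&queueHead, &cur->elem,
+ *                                         offsetof(ELEMType, elem));
+ *     }
+ */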
+
+/*--------------------
+ * SHMQueuePrev -- Get the previous element from a queue
+ *
+ * Same as SHMQueueNext, just starting at tail and moving towards head.
+ * All other comments and usage applies.
+ */
+Pointer
+SHMQueuePrev(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset)
+{
+ SHM_QUEUE *elemPtr = curElem->prev;
+
+ Assert(ShmemAddrIsValid(curElem));
+
+ if (elemPtr == queue) /* back to the queue head? */
+ return NULL;
+
+ return (Pointer) (((char *) elemPtr) - linkOffset);
+}
+
+/*
+ * SHMQueueEmpty -- true if queue head is only element, false otherwise
+ */
+bool
+SHMQueueEmpty(const SHM_QUEUE *queue)
+{
+ Assert(ShmemAddrIsValid(queue));
+
+ if (queue->prev == queue)
+ {
+ Assert(queue->next == queue);
+ return true;
+ }
+ return false;
+}
diff --git a/src/backend/storage/ipc/signalfuncs.c b/src/backend/storage/ipc/signalfuncs.c
new file mode 100644
index 0000000..de69d60
--- /dev/null
+++ b/src/backend/storage/ipc/signalfuncs.c
@@ -0,0 +1,300 @@
+/*-------------------------------------------------------------------------
+ *
+ * signalfuncs.c
+ * Functions for signaling backends
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/signalfuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+
+#include "catalog/pg_authid.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/syslogger.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+
+
+/*
+ * Send a signal to another backend.
+ *
+ * The signal is delivered if the caller is a superuser, has privileges of
+ * the role owning the backend being signaled, or has privileges of
+ * pg_signal_backend. For "dangerous" signals, an explicit check for
+ * superuser needs to be done prior to calling this function.
+ *
+ * Returns 0 on success, 1 on general failure, 2 on normal permission error
+ * and 3 if the caller needs to be a superuser.
+ *
+ * In the event of a general failure (return code 1), a warning message will
+ * be emitted. For permission errors, doing that is the responsibility of
+ * the caller.
+ */
+#define SIGNAL_BACKEND_SUCCESS 0
+#define SIGNAL_BACKEND_ERROR 1
+#define SIGNAL_BACKEND_NOPERMISSION 2
+#define SIGNAL_BACKEND_NOSUPERUSER 3
+static int
+pg_signal_backend(int pid, int sig)
+{
+ PGPROC *proc = BackendPidGetProc(pid);
+
+ /*
+ * BackendPidGetProc returns NULL if the pid isn't valid; but by the time
+ * we reach kill(), a process for which we get a valid proc here might
+ * have terminated on its own. There's no way to acquire a lock on an
+ * arbitrary process to prevent that. But since so far all the callers of
+ * this mechanism involve some request for ending the process anyway, that
+ * it might end on its own first is not a problem.
+ */
+ if (proc == NULL)
+ {
+ /*
+ * This is just a warning so a loop-through-resultset will not abort
+ * if one backend terminated on its own during the run.
+ */
+ ereport(WARNING,
+ (errmsg("PID %d is not a PostgreSQL server process", pid)));
+ return SIGNAL_BACKEND_ERROR;
+ }
+
+ /* Only allow superusers to signal superuser-owned backends. */
+ if (superuser_arg(proc->roleId) && !superuser())
+ return SIGNAL_BACKEND_NOSUPERUSER;
+
+ /* Users can signal backends they have role membership in. */
+ if (!has_privs_of_role(GetUserId(), proc->roleId) &&
+ !has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_BACKEND))
+ return SIGNAL_BACKEND_NOPERMISSION;
+
+ /*
+ * Can the process we just validated above end, followed by the pid being
+ * recycled for a new process, before reaching here? Then we'd be trying
+ * to kill the wrong thing. Seems near impossible when sequential pid
+ * assignment and wraparound is used. Perhaps it could happen on a system
+ * where pid re-use is randomized. That race condition possibility seems
+ * too unlikely to worry about.
+ */
+
+ /* If we have setsid(), signal the backend's whole process group */
+#ifdef HAVE_SETSID
+ if (kill(-pid, sig))
+#else
+ if (kill(pid, sig))
+#endif
+ {
+ /* Again, just a warning to allow loops */
+ ereport(WARNING,
+ (errmsg("could not send signal to process %d: %m", pid)));
+ return SIGNAL_BACKEND_ERROR;
+ }
+ return SIGNAL_BACKEND_SUCCESS;
+}
+
+/*
+ * Signal to cancel a backend process. This is allowed if you are a member of
+ * the role whose process is being canceled.
+ *
+ * Note that only superusers can signal superuser-owned processes.
+ */
+Datum
+pg_cancel_backend(PG_FUNCTION_ARGS)
+{
+ int r = pg_signal_backend(PG_GETARG_INT32(0), SIGINT);
+
+ if (r == SIGNAL_BACKEND_NOSUPERUSER)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be a superuser to cancel superuser query")));
+
+ if (r == SIGNAL_BACKEND_NOPERMISSION)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be a member of the role whose query is being canceled or member of pg_signal_backend")));
+
+ PG_RETURN_BOOL(r == SIGNAL_BACKEND_SUCCESS);
+}
+
+/*
+ * Wait until there is no backend process with the given PID and return true.
+ * On timeout, a warning is emitted and false is returned.
+ */
+static bool
+pg_wait_until_termination(int pid, int64 timeout)
+{
+ /*
+ * Wait in steps of waittime milliseconds until this function exits or
+ * timeout.
+ */
+ int64 waittime = 100;
+
+ /*
+ * Initially remaining time is the entire timeout specified by the user.
+ */
+ int64 remainingtime = timeout;
+
+ /*
+ * Check whether the backend still exists. If it does, wait for waittime
+ * milliseconds and then check again. Repeat this until the timeout
+ * expires, an error occurs, or a pending interrupt such as a query cancel
+ * gets processed.
+ */
+ do
+ {
+ if (remainingtime < waittime)
+ waittime = remainingtime;
+
+ if (kill(pid, 0) == -1)
+ {
+ if (errno == ESRCH)
+ return true;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("could not check the existence of the backend with PID %d: %m",
+ pid)));
+ }
+
+ /* Process interrupts, if any, before waiting */
+ CHECK_FOR_INTERRUPTS();
+
+ (void) WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+ waittime,
+ WAIT_EVENT_BACKEND_TERMINATION);
+
+ ResetLatch(MyLatch);
+
+ remainingtime -= waittime;
+ } while (remainingtime > 0);
+
+ ereport(WARNING,
+ (errmsg_plural("backend with PID %d did not terminate within %lld millisecond",
+ "backend with PID %d did not terminate within %lld milliseconds",
+ timeout,
+ pid, (long long int) timeout)));
+
+ return false;
+}
+
+/*
+ * Send a signal to terminate a backend process. This is allowed if you are a
+ * member of the role whose process is being terminated. If the timeout input
+ * argument is 0, then this function just signals the backend and returns
+ * true. If timeout is nonzero, then it waits until no process has the given
+ * PID; if the process ends within the timeout, true is returned, and if the
+ * timeout is exceeded, a warning is emitted and false is returned.
+ *
+ * Note that only superusers can signal superuser-owned processes.
+ */
+Datum
+pg_terminate_backend(PG_FUNCTION_ARGS)
+{
+ int pid;
+ int r;
+ int timeout; /* milliseconds */
+
+ pid = PG_GETARG_INT32(0);
+ timeout = PG_GETARG_INT64(1);
+
+ if (timeout < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+ errmsg("\"timeout\" must not be negative")));
+
+ r = pg_signal_backend(pid, SIGTERM);
+
+ if (r == SIGNAL_BACKEND_NOSUPERUSER)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be a superuser to terminate superuser process")));
+
+ if (r == SIGNAL_BACKEND_NOPERMISSION)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be a member of the role whose process is being terminated or member of pg_signal_backend")));
+
+ /* Wait only on success and if actually requested */
+ if (r == SIGNAL_BACKEND_SUCCESS && timeout > 0)
+ PG_RETURN_BOOL(pg_wait_until_termination(pid, timeout));
+ else
+ PG_RETURN_BOOL(r == SIGNAL_BACKEND_SUCCESS);
+}
+
+/*
+ * Signal to reload the database configuration
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+Datum
+pg_reload_conf(PG_FUNCTION_ARGS)
+{
+ if (kill(PostmasterPid, SIGHUP))
+ {
+ ereport(WARNING,
+ (errmsg("failed to send signal to postmaster: %m")));
+ PG_RETURN_BOOL(false);
+ }
+
+ PG_RETURN_BOOL(true);
+}
+
+
+/*
+ * Rotate log file
+ *
+ * This function is kept to support adminpack 1.0.
+ */
+Datum
+pg_rotate_logfile(PG_FUNCTION_ARGS)
+{
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be superuser to rotate log files with adminpack 1.0"),
+ /* translator: %s is a SQL function name */
+ errhint("Consider using %s, which is part of core, instead.",
+ "pg_logfile_rotate()")));
+
+ if (!Logging_collector)
+ {
+ ereport(WARNING,
+ (errmsg("rotation not possible because log collection not active")));
+ PG_RETURN_BOOL(false);
+ }
+
+ SendPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE);
+ PG_RETURN_BOOL(true);
+}
+
+/*
+ * Rotate log file
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+Datum
+pg_rotate_logfile_v2(PG_FUNCTION_ARGS)
+{
+ if (!Logging_collector)
+ {
+ ereport(WARNING,
+ (errmsg("rotation not possible because log collection not active")));
+ PG_RETURN_BOOL(false);
+ }
+
+ SendPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE);
+ PG_RETURN_BOOL(true);
+}
diff --git a/src/backend/storage/ipc/sinval.c b/src/backend/storage/ipc/sinval.c
new file mode 100644
index 0000000..f585d63
--- /dev/null
+++ b/src/backend/storage/ipc/sinval.c
@@ -0,0 +1,205 @@
+/*-------------------------------------------------------------------------
+ *
+ * sinval.c
+ * POSTGRES shared cache invalidation communication code.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/sinval.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/xact.h"
+#include "commands/async.h"
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "storage/sinvaladt.h"
+#include "utils/inval.h"
+
+
+uint64 SharedInvalidMessageCounter;
+
+
+/*
+ * Because backends sitting idle will not be reading sinval events, we
+ * need a way to give an idle backend a swift kick in the rear and make
+ * it catch up before the sinval queue overflows and forces it to go
+ * through a cache reset exercise. This is done by sending
+ * PROCSIG_CATCHUP_INTERRUPT to any backend that gets too far behind.
+ *
+ * The signal handler will set an interrupt pending flag and will set the
+ * process's latch. Whenever starting to read from the client, or when
+ * interrupted while doing so, ProcessClientReadInterrupt() will call
+ * ProcessCatchupInterrupt().
+ */
+volatile sig_atomic_t catchupInterruptPending = false;
+
+
+/*
+ * SendSharedInvalidMessages
+ * Add shared-cache-invalidation message(s) to the global SI message queue.
+ */
+void
+SendSharedInvalidMessages(const SharedInvalidationMessage *msgs, int n)
+{
+ SIInsertDataEntries(msgs, n);
+}
+
+/*
+ * ReceiveSharedInvalidMessages
+ * Process shared-cache-invalidation messages waiting for this backend
+ *
+ * We guarantee to process all messages that had been queued before the
+ * routine was entered. It is of course possible for more messages to get
+ * queued right after our last SIGetDataEntries call.
+ *
+ * NOTE: it is entirely possible for this routine to be invoked recursively
+ * as a consequence of processing inside the invalFunction or resetFunction.
+ * Furthermore, such a recursive call must guarantee that all outstanding
+ * inval messages have been processed before it exits. This is the reason
+ * for the strange-looking choice to use a statically allocated buffer array
+ * and counters; it's so that a recursive call can process messages already
+ * sucked out of sinvaladt.c.
+ */
+void
+ReceiveSharedInvalidMessages(void (*invalFunction) (SharedInvalidationMessage *msg),
+ void (*resetFunction) (void))
+{
+#define MAXINVALMSGS 32
+ static SharedInvalidationMessage messages[MAXINVALMSGS];
+
+ /*
+ * We use volatile here to prevent bugs if a compiler doesn't realize that
+ * recursion is a possibility ...
+ */
+ static volatile int nextmsg = 0;
+ static volatile int nummsgs = 0;
+
+ /* Deal with any messages still pending from an outer recursion */
+ while (nextmsg < nummsgs)
+ {
+ SharedInvalidationMessage msg = messages[nextmsg++];
+
+ SharedInvalidMessageCounter++;
+ invalFunction(&msg);
+ }
+
+ do
+ {
+ int getResult;
+
+ nextmsg = nummsgs = 0;
+
+ /* Try to get some more messages */
+ getResult = SIGetDataEntries(messages, MAXINVALMSGS);
+
+ if (getResult < 0)
+ {
+ /* got a reset message */
+ elog(DEBUG4, "cache state reset");
+ SharedInvalidMessageCounter++;
+ resetFunction();
+ break; /* nothing more to do */
+ }
+
+ /* Process them, being wary that a recursive call might eat some */
+ nextmsg = 0;
+ nummsgs = getResult;
+
+ while (nextmsg < nummsgs)
+ {
+ SharedInvalidationMessage msg = messages[nextmsg++];
+
+ SharedInvalidMessageCounter++;
+ invalFunction(&msg);
+ }
+
+ /*
+ * We only need to loop if the last SIGetDataEntries call (which might
+ * have been within a recursive call) returned a full buffer.
+ */
+ } while (nummsgs == MAXINVALMSGS);
+
+ /*
+ * We are now caught up. If we received a catchup signal, reset that
+ * flag, and call SICleanupQueue(). This is not so much because we need
+ * to flush dead messages right now, as that we want to pass on the
+ * catchup signal to the next slowest backend. "Daisy chaining" the
+ * catchup signal this way avoids creating spikes in system load for what
+ * should be just a background maintenance activity.
+ */
+ if (catchupInterruptPending)
+ {
+ catchupInterruptPending = false;
+ elog(DEBUG4, "sinval catchup complete, cleaning queue");
+ SICleanupQueue(false, 0);
+ }
+}
+
+
+/*
+ * HandleCatchupInterrupt
+ *
+ * This is called when PROCSIG_CATCHUP_INTERRUPT is received.
+ *
+ * We used to call ProcessCatchupEvent directly when idle. These days
+ * we just set a flag to do it later and notify the process of that fact by
+ * setting the process's latch.
+ */
+void
+HandleCatchupInterrupt(void)
+{
+ /*
+ * Note: this is called by a SIGNAL HANDLER. You must be very wary what
+ * you do here.
+ */
+
+ catchupInterruptPending = true;
+
+ /* make sure the event is processed in due course */
+ SetLatch(MyLatch);
+}
+
+/*
+ * ProcessCatchupInterrupt
+ *
+ * The portion of catchup interrupt handling that runs outside of the signal
+ * handler, which allows it to actually process pending invalidations.
+ */
+void
+ProcessCatchupInterrupt(void)
+{
+ while (catchupInterruptPending)
+ {
+ /*
+ * What we need to do here is cause ReceiveSharedInvalidMessages() to
+ * run, which will do the necessary work and also reset the
+ * catchupInterruptPending flag. If we are inside a transaction we
+ * can just call AcceptInvalidationMessages() to do this. If we
+ * aren't, we start and immediately end a transaction; the call to
+ * AcceptInvalidationMessages() happens down inside transaction start.
+ *
+ * It is awfully tempting to just call AcceptInvalidationMessages()
+ * without the rest of the xact start/stop overhead, and I think that
+ * would actually work in the normal case; but I am not sure that
+ * things would clean up nicely if we got an error partway through.
+ */
+ if (IsTransactionOrTransactionBlock())
+ {
+ elog(DEBUG4, "ProcessCatchupEvent inside transaction");
+ AcceptInvalidationMessages();
+ }
+ else
+ {
+ elog(DEBUG4, "ProcessCatchupEvent outside transaction");
+ StartTransactionCommand();
+ CommitTransactionCommand();
+ }
+ }
+}
diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c
new file mode 100644
index 0000000..946bd8e
--- /dev/null
+++ b/src/backend/storage/ipc/sinvaladt.c
@@ -0,0 +1,777 @@
+/*-------------------------------------------------------------------------
+ *
+ * sinvaladt.c
+ * POSTGRES shared cache invalidation data manager.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/sinvaladt.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+
+#include "access/transam.h"
+#include "miscadmin.h"
+#include "storage/backendid.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "storage/procsignal.h"
+#include "storage/shmem.h"
+#include "storage/sinvaladt.h"
+#include "storage/spin.h"
+
+/*
+ * Conceptually, the shared cache invalidation messages are stored in an
+ * infinite array, where maxMsgNum is the next array subscript to store a
+ * submitted message in, minMsgNum is the smallest array subscript containing
+ * a message not yet read by all backends, and we always have maxMsgNum >=
+ * minMsgNum. (They are equal when there are no messages pending.) For each
+ * active backend, there is a nextMsgNum pointer indicating the next message it
+ * needs to read; we have maxMsgNum >= nextMsgNum >= minMsgNum for every
+ * backend.
+ *
+ * (In the current implementation, minMsgNum is a lower bound for the
+ * per-process nextMsgNum values, but it isn't rigorously kept equal to the
+ * smallest nextMsgNum --- it may lag behind. We only update it when
+ * SICleanupQueue is called, and we try not to do that often.)
+ *
+ * In reality, the messages are stored in a circular buffer of MAXNUMMESSAGES
+ * entries. We translate MsgNum values into circular-buffer indexes by
+ * computing MsgNum % MAXNUMMESSAGES (this should be fast as long as
+ * MAXNUMMESSAGES is a constant and a power of 2). As long as maxMsgNum
+ * doesn't exceed minMsgNum by more than MAXNUMMESSAGES, we have enough space
+ * in the buffer. If the buffer does overflow, we recover by setting the
+ * "reset" flag for each backend that has fallen too far behind. A backend
+ * that is in "reset" state is ignored while determining minMsgNum. When
+ * it does finally attempt to receive inval messages, it must discard all
+ * its invalidatable state, since it won't know what it missed.
+ *
+ * To reduce the probability of needing resets, we send a "catchup" interrupt
+ * to any backend that seems to be falling unreasonably far behind. The
+ * normal behavior is that at most one such interrupt is in flight at a time;
+ * when a backend completes processing a catchup interrupt, it executes
+ * SICleanupQueue, which will signal the next-furthest-behind backend if
+ * needed. This avoids undue contention from multiple backends all trying
+ * to catch up at once. However, the furthest-back backend might be stuck
+ * in a state where it can't catch up. Eventually it will get reset, so it
+ * won't cause any more problems for anyone but itself. But we don't want
+ * to find that a bunch of other backends are now too close to the reset
+ * threshold to be saved. So SICleanupQueue is designed to occasionally
+ * send extra catchup interrupts as the queue gets fuller, to backends that
+ * are far behind and haven't gotten one yet. As long as there aren't a lot
+ * of "stuck" backends, we won't need a lot of extra interrupts, since ones
+ * that aren't stuck will propagate their interrupts to the next guy.
+ *
+ * We would have problems if the MsgNum values overflow an integer, so
+ * whenever minMsgNum exceeds MSGNUMWRAPAROUND, we subtract MSGNUMWRAPAROUND
+ * from all the MsgNum variables simultaneously. MSGNUMWRAPAROUND can be
+ * large so that we don't need to do this often. It must be a multiple of
+ * MAXNUMMESSAGES so that the existing circular-buffer entries don't need
+ * to be moved when we do it.
+ *
+ * Access to the shared sinval array is protected by two locks, SInvalReadLock
+ * and SInvalWriteLock. Readers take SInvalReadLock in shared mode; this
+ * authorizes them to modify their own ProcState but not to modify or even
+ * look at anyone else's. When we need to perform array-wide updates,
+ * such as in SICleanupQueue, we take SInvalReadLock in exclusive mode to
+ * lock out all readers. Writers take SInvalWriteLock (always in exclusive
+ * mode) to serialize adding messages to the queue. Note that a writer
+ * can operate in parallel with one or more readers, because the writer
+ * has no need to touch anyone's ProcState, except in the infrequent cases
+ * when SICleanupQueue is needed. The only point of overlap is that
+ * the writer wants to change maxMsgNum while readers need to read it.
+ * We deal with that by having a spinlock that readers must take for just
+ * long enough to read maxMsgNum, while writers take it for just long enough
+ * to write maxMsgNum. (The exact rule is that you need the spinlock to
+ * read maxMsgNum if you are not holding SInvalWriteLock, and you need the
+ * spinlock to write maxMsgNum unless you are holding both locks.)
+ *
+ * Note: since maxMsgNum is an int and hence presumably atomically readable/
+ * writable, the spinlock might seem unnecessary. The reason it is needed
+ * is to provide a memory barrier: we need to be sure that messages written
+ * to the array are actually there before maxMsgNum is increased, and that
+ * readers will see that data after fetching maxMsgNum. Multiprocessors
+ * that have weak memory-ordering guarantees can fail without the memory
+ * barrier instructions that are included in the spinlock sequences.
+ */
+
+
+/*
+ * Configurable parameters.
+ *
+ * MAXNUMMESSAGES: max number of shared-inval messages we can buffer.
+ * Must be a power of 2 for speed.
+ *
+ * MSGNUMWRAPAROUND: how often to reduce MsgNum variables to avoid overflow.
+ * Must be a multiple of MAXNUMMESSAGES. Should be large.
+ *
+ * CLEANUP_MIN: the minimum number of messages that must be in the buffer
+ * before we bother to call SICleanupQueue.
+ *
+ * CLEANUP_QUANTUM: how often (in messages) to call SICleanupQueue once
+ * we exceed CLEANUP_MIN. Should be a power of 2 for speed.
+ *
+ * SIG_THRESHOLD: the minimum number of messages a backend must have fallen
+ * behind before we'll send it PROCSIG_CATCHUP_INTERRUPT.
+ *
+ * WRITE_QUANTUM: the max number of messages to push into the buffer per
+ * iteration of SIInsertDataEntries. Noncritical but should be less than
+ * CLEANUP_QUANTUM, because we only consider calling SICleanupQueue once
+ * per iteration.
+ */
+
+#define MAXNUMMESSAGES 4096
+#define MSGNUMWRAPAROUND (MAXNUMMESSAGES * 262144)
+#define CLEANUP_MIN (MAXNUMMESSAGES / 2)
+#define CLEANUP_QUANTUM (MAXNUMMESSAGES / 16)
+#define SIG_THRESHOLD (MAXNUMMESSAGES / 2)
+#define WRITE_QUANTUM 64
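+
+/*
+ * Illustrative note (not part of upstream): with the values above, message
+ * number N lands in circular-buffer slot N % MAXNUMMESSAGES, e.g. message
+ * 4097 goes into slot 1.  Because MSGNUMWRAPAROUND is a multiple of
+ * MAXNUMMESSAGES, subtracting it from every counter during wraparound
+ * control leaves each pending message in the same slot.
+ */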
+
+/* Per-backend state in shared invalidation structure */
+typedef struct ProcState
+{
+ /* procPid is zero in an inactive ProcState array entry. */
+ pid_t procPid; /* PID of backend, for signaling */
+ PGPROC *proc; /* PGPROC of backend */
+ /* nextMsgNum is meaningless if procPid == 0 or resetState is true. */
+ int nextMsgNum; /* next message number to read */
+ bool resetState; /* backend needs to reset its state */
+ bool signaled; /* backend has been sent catchup signal */
+ bool hasMessages; /* backend has unread messages */
+
+ /*
+ * Backend only sends invalidations, never receives them. This only makes
+ * sense for Startup process during recovery because it doesn't maintain a
+ * relcache, yet it fires inval messages to allow query backends to see
+ * schema changes.
+ */
+ bool sendOnly; /* backend only sends, never receives */
+
+ /*
+ * Next LocalTransactionId to use for each idle backend slot. We keep
+ * this here because it is indexed by BackendId and it is convenient to
+ * copy the value to and from local memory when MyBackendId is set. It's
+ * meaningless in an active ProcState entry.
+ */
+ LocalTransactionId nextLXID;
+} ProcState;
+
+/* Shared cache invalidation memory segment */
+typedef struct SISeg
+{
+ /*
+ * General state information
+ */
+ int minMsgNum; /* oldest message still needed */
+ int maxMsgNum; /* next message number to be assigned */
+ int nextThreshold; /* # of messages to call SICleanupQueue */
+ int lastBackend; /* index of last active procState entry, +1 */
+ int maxBackends; /* size of procState array */
+
+ slock_t msgnumLock; /* spinlock protecting maxMsgNum */
+
+ /*
+ * Circular buffer holding shared-inval messages
+ */
+ SharedInvalidationMessage buffer[MAXNUMMESSAGES];
+
+ /*
+ * Per-backend invalidation state info (has MaxBackends entries).
+ */
+ ProcState procState[FLEXIBLE_ARRAY_MEMBER];
+} SISeg;
+
+static SISeg *shmInvalBuffer; /* pointer to the shared inval buffer */
+
+
+static LocalTransactionId nextLocalTransactionId;
+
+static void CleanupInvalidationState(int status, Datum arg);
+
+
+/*
+ * SInvalShmemSize --- return shared-memory space needed
+ */
+Size
+SInvalShmemSize(void)
+{
+ Size size;
+
+ size = offsetof(SISeg, procState);
+ size = add_size(size, mul_size(sizeof(ProcState), MaxBackends));
+
+ return size;
+}
+
+/*
+ * CreateSharedInvalidationState
+ * Create and initialize the SI message buffer
+ */
+void
+CreateSharedInvalidationState(void)
+{
+ int i;
+ bool found;
+
+ /* Allocate space in shared memory */
+ shmInvalBuffer = (SISeg *)
+ ShmemInitStruct("shmInvalBuffer", SInvalShmemSize(), &found);
+ if (found)
+ return;
+
+ /* Clear message counters, save size of procState array, init spinlock */
+ shmInvalBuffer->minMsgNum = 0;
+ shmInvalBuffer->maxMsgNum = 0;
+ shmInvalBuffer->nextThreshold = CLEANUP_MIN;
+ shmInvalBuffer->lastBackend = 0;
+ shmInvalBuffer->maxBackends = MaxBackends;
+ SpinLockInit(&shmInvalBuffer->msgnumLock);
+
+ /* The buffer[] array is initially all unused, so we need not fill it */
+
+ /* Mark all backends inactive, and initialize nextLXID */
+ for (i = 0; i < shmInvalBuffer->maxBackends; i++)
+ {
+ shmInvalBuffer->procState[i].procPid = 0; /* inactive */
+ shmInvalBuffer->procState[i].proc = NULL;
+ shmInvalBuffer->procState[i].nextMsgNum = 0; /* meaningless */
+ shmInvalBuffer->procState[i].resetState = false;
+ shmInvalBuffer->procState[i].signaled = false;
+ shmInvalBuffer->procState[i].hasMessages = false;
+ shmInvalBuffer->procState[i].nextLXID = InvalidLocalTransactionId;
+ }
+}
+
+/*
+ * SharedInvalBackendInit
+ * Initialize a new backend to operate on the sinval buffer
+ */
+void
+SharedInvalBackendInit(bool sendOnly)
+{
+ int index;
+ ProcState *stateP = NULL;
+ SISeg *segP = shmInvalBuffer;
+
+ /*
+ * This can run in parallel with read operations, but not with write
+ * operations, since SIInsertDataEntries relies on lastBackend to set
+ * hasMessages appropriately.
+ */
+ LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
+
+ /* Look for a free entry in the procState array */
+ for (index = 0; index < segP->lastBackend; index++)
+ {
+ if (segP->procState[index].procPid == 0) /* inactive slot? */
+ {
+ stateP = &segP->procState[index];
+ break;
+ }
+ }
+
+ if (stateP == NULL)
+ {
+ if (segP->lastBackend < segP->maxBackends)
+ {
+ stateP = &segP->procState[segP->lastBackend];
+ Assert(stateP->procPid == 0);
+ segP->lastBackend++;
+ }
+ else
+ {
+ /*
+ * out of procState slots: MaxBackends exceeded -- report normally
+ */
+ MyBackendId = InvalidBackendId;
+ LWLockRelease(SInvalWriteLock);
+ ereport(FATAL,
+ (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+ errmsg("sorry, too many clients already")));
+ }
+ }
+
+ MyBackendId = (stateP - &segP->procState[0]) + 1;
+
+ /* Advertise assigned backend ID in MyProc */
+ MyProc->backendId = MyBackendId;
+
+ /* Fetch next local transaction ID into local memory */
+ nextLocalTransactionId = stateP->nextLXID;
+
+ /* mark myself active, with all extant messages already read */
+ stateP->procPid = MyProcPid;
+ stateP->proc = MyProc;
+ stateP->nextMsgNum = segP->maxMsgNum;
+ stateP->resetState = false;
+ stateP->signaled = false;
+ stateP->hasMessages = false;
+ stateP->sendOnly = sendOnly;
+
+ LWLockRelease(SInvalWriteLock);
+
+ /* register exit routine to mark my entry inactive at exit */
+ on_shmem_exit(CleanupInvalidationState, PointerGetDatum(segP));
+
+ elog(DEBUG4, "my backend ID is %d", MyBackendId);
+}
+
+/*
+ * CleanupInvalidationState
+ * Mark the current backend as no longer active.
+ *
+ * This function is called via on_shmem_exit() during backend shutdown.
+ *
+ * arg is really of type "SISeg*".
+ */
+static void
+CleanupInvalidationState(int status, Datum arg)
+{
+ SISeg *segP = (SISeg *) DatumGetPointer(arg);
+ ProcState *stateP;
+ int i;
+
+ Assert(PointerIsValid(segP));
+
+ LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
+
+ stateP = &segP->procState[MyBackendId - 1];
+
+ /* Update next local transaction ID for next holder of this backendID */
+ stateP->nextLXID = nextLocalTransactionId;
+
+ /* Mark myself inactive */
+ stateP->procPid = 0;
+ stateP->proc = NULL;
+ stateP->nextMsgNum = 0;
+ stateP->resetState = false;
+ stateP->signaled = false;
+
+ /* Recompute index of last active backend */
+ for (i = segP->lastBackend; i > 0; i--)
+ {
+ if (segP->procState[i - 1].procPid != 0)
+ break;
+ }
+ segP->lastBackend = i;
+
+ LWLockRelease(SInvalWriteLock);
+}
+
+/*
+ * BackendIdGetProc
+ * Get the PGPROC structure for a backend, given the backend ID.
+ * The result may be out of date arbitrarily quickly, so the caller
+ * must be careful about how this information is used. NULL is
+ * returned if the backend is not active.
+ */
+PGPROC *
+BackendIdGetProc(int backendID)
+{
+ PGPROC *result = NULL;
+ SISeg *segP = shmInvalBuffer;
+
+ /* Need to lock out additions/removals of backends */
+ LWLockAcquire(SInvalWriteLock, LW_SHARED);
+
+ if (backendID > 0 && backendID <= segP->lastBackend)
+ {
+ ProcState *stateP = &segP->procState[backendID - 1];
+
+ result = stateP->proc;
+ }
+
+ LWLockRelease(SInvalWriteLock);
+
+ return result;
+}
+
+/*
+ * BackendIdGetTransactionIds
+ * Get the xid and xmin of the backend. The result may be out of date
+ * arbitrarily quickly, so the caller must be careful about how this
+ * information is used.
+ */
+void
+BackendIdGetTransactionIds(int backendID, TransactionId *xid, TransactionId *xmin)
+{
+ SISeg *segP = shmInvalBuffer;
+
+ *xid = InvalidTransactionId;
+ *xmin = InvalidTransactionId;
+
+ /* Need to lock out additions/removals of backends */
+ LWLockAcquire(SInvalWriteLock, LW_SHARED);
+
+ if (backendID > 0 && backendID <= segP->lastBackend)
+ {
+ ProcState *stateP = &segP->procState[backendID - 1];
+ PGPROC *proc = stateP->proc;
+
+ if (proc != NULL)
+ {
+ *xid = proc->xid;
+ *xmin = proc->xmin;
+ }
+ }
+
+ LWLockRelease(SInvalWriteLock);
+}
+
+/*
+ * SIInsertDataEntries
+ * Add new invalidation message(s) to the buffer.
+ */
+void
+SIInsertDataEntries(const SharedInvalidationMessage *data, int n)
+{
+ SISeg *segP = shmInvalBuffer;
+
+ /*
+ * N can be arbitrarily large. We divide the work into groups of no more
+ * than WRITE_QUANTUM messages, to be sure that we don't hold the lock for
+ * an unreasonably long time. (This is not so much because we care about
+ * letting in other writers, as that some just-caught-up backend might be
+ * trying to do SICleanupQueue to pass on its signal, and we don't want it
+ * to have to wait a long time.) Also, we need to consider calling
+ * SICleanupQueue every so often.
+ */
+ while (n > 0)
+ {
+ int nthistime = Min(n, WRITE_QUANTUM);
+ int numMsgs;
+ int max;
+ int i;
+
+ n -= nthistime;
+
+ LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
+
+ /*
+ * If the buffer is full, we *must* acquire some space. Clean the
+ * queue and reset anyone who is preventing space from being freed.
+ * Otherwise, clean the queue only when it's exceeded the next
+ * fullness threshold. We have to loop and recheck the buffer state
+ * after any call of SICleanupQueue.
+ */
+ for (;;)
+ {
+ numMsgs = segP->maxMsgNum - segP->minMsgNum;
+ if (numMsgs + nthistime > MAXNUMMESSAGES ||
+ numMsgs >= segP->nextThreshold)
+ SICleanupQueue(true, nthistime);
+ else
+ break;
+ }
+
+ /*
+ * Insert new message(s) into proper slot of circular buffer
+ */
+ max = segP->maxMsgNum;
+ while (nthistime-- > 0)
+ {
+ segP->buffer[max % MAXNUMMESSAGES] = *data++;
+ max++;
+ }
+
+ /* Update current value of maxMsgNum using spinlock */
+ SpinLockAcquire(&segP->msgnumLock);
+ segP->maxMsgNum = max;
+ SpinLockRelease(&segP->msgnumLock);
+
+ /*
+ * Now that the maxMsgNum change is globally visible, we give everyone
+ * a swift kick to make sure they read the newly added messages.
+ * Releasing SInvalWriteLock will enforce a full memory barrier, so
+ * these (unlocked) changes will be committed to memory before we exit
+ * the function.
+ */
+ for (i = 0; i < segP->lastBackend; i++)
+ {
+ ProcState *stateP = &segP->procState[i];
+
+ stateP->hasMessages = true;
+ }
+
+ LWLockRelease(SInvalWriteLock);
+ }
+}
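+
+/*
+ * Illustrative note (not part of upstream): a batch of, say, 150 messages
+ * is pushed as chunks of 64, 64 and 22 (WRITE_QUANTUM is 64), with
+ * SInvalWriteLock released and reacquired between chunks so that a
+ * catching-up reader never has to wait very long.
+ */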
+
+/*
+ * SIGetDataEntries
+ * get next SI message(s) for current backend, if there are any
+ *
+ * Possible return values:
+ * 0: no SI message available
+ * n>0: next n SI messages have been extracted into data[]
+ * -1: SI reset message extracted
+ *
+ * If the return value is less than the array size "datasize", the caller
+ * can assume that there are no more SI messages after the one(s) returned.
+ * Otherwise, another call is needed to collect more messages.
+ *
+ * NB: this can run in parallel with other instances of SIGetDataEntries
+ * executing on behalf of other backends, since each instance will modify only
+ * fields of its own backend's ProcState, and no instance will look at fields
+ * of other backends' ProcStates. We express this by grabbing SInvalReadLock
+ * in shared mode. Note that this is not exactly the normal (read-only)
+ * interpretation of a shared lock! Look closely at the interactions before
+ * allowing SInvalReadLock to be grabbed in shared mode for any other reason!
+ *
+ * NB: this can also run in parallel with SIInsertDataEntries. It is not
+ * guaranteed that we will return any messages added after the routine is
+ * entered.
+ *
+ * Note: we assume that "datasize" is not so large that it might be important
+ * to break our hold on SInvalReadLock into segments.
+ */
+int
+SIGetDataEntries(SharedInvalidationMessage *data, int datasize)
+{
+ SISeg *segP;
+ ProcState *stateP;
+ int max;
+ int n;
+
+ segP = shmInvalBuffer;
+ stateP = &segP->procState[MyBackendId - 1];
+
+ /*
+ * Before starting to take locks, do a quick, unlocked test to see whether
+ * there can possibly be anything to read. On a multiprocessor system,
+ * it's possible that this load could migrate backwards and occur before
+ * we actually enter this function, so we might miss a sinval message that
+ * was just added by some other processor. But they can't migrate
+ * backwards over a preceding lock acquisition, so it should be OK. If we
+ * haven't acquired a lock preventing further relevant
+ * invalidations, any such occurrence is not much different than if the
+ * invalidation had arrived slightly later in the first place.
+ */
+ if (!stateP->hasMessages)
+ return 0;
+
+ LWLockAcquire(SInvalReadLock, LW_SHARED);
+
+ /*
+ * We must reset hasMessages before determining how many messages we're
+ * going to read. That way, if new messages arrive after we have
+ * determined how many we're reading, the flag will get reset and we'll
+ * notice those messages part-way through.
+ *
+ * Note that, if we don't end up reading all of the messages, we had
+ * better be certain to reset this flag before exiting!
+ */
+ stateP->hasMessages = false;
+
+ /* Fetch current value of maxMsgNum using spinlock */
+ SpinLockAcquire(&segP->msgnumLock);
+ max = segP->maxMsgNum;
+ SpinLockRelease(&segP->msgnumLock);
+
+ if (stateP->resetState)
+ {
+ /*
+ * Force reset. We can say we have dealt with any messages added
+ * since the reset, as well; and that means we should clear the
+ * signaled flag, too.
+ */
+ stateP->nextMsgNum = max;
+ stateP->resetState = false;
+ stateP->signaled = false;
+ LWLockRelease(SInvalReadLock);
+ return -1;
+ }
+
+ /*
+ * Retrieve messages and advance backend's counter, until data array is
+ * full or there are no more messages.
+ *
+ * There may be other backends that haven't read the message(s), so we
+ * cannot delete them here. SICleanupQueue() will eventually remove them
+ * from the queue.
+ */
+ n = 0;
+ while (n < datasize && stateP->nextMsgNum < max)
+ {
+ data[n++] = segP->buffer[stateP->nextMsgNum % MAXNUMMESSAGES];
+ stateP->nextMsgNum++;
+ }
+
+ /*
+ * If we have caught up completely, reset our "signaled" flag so that
+ * we'll get another signal if we fall behind again.
+ *
+ * If we haven't caught up completely, reset the hasMessages flag so that
+ * we see the remaining messages next time.
+ */
+ if (stateP->nextMsgNum >= max)
+ stateP->signaled = false;
+ else
+ stateP->hasMessages = true;
+
+ LWLockRelease(SInvalReadLock);
+ return n;
+}
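+
+/*
+ * Illustrative sketch (not part of upstream; the handler names are
+ * hypothetical): a reader typically keeps calling until it gets back fewer
+ * messages than its array holds:
+ *
+ *     SharedInvalidationMessage msgs[32];
+ *     int         n;
+ *
+ *     do
+ *     {
+ *         n = SIGetDataEntries(msgs, lengthof(msgs));
+ *         if (n < 0)
+ *             reset_all_caches();
+ *         else
+ *             apply_messages(msgs, n);
+ *     } while (n == lengthof(msgs));
+ *
+ * This mirrors the loop in ReceiveSharedInvalidMessages() in sinval.c.
+ */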
+
+/*
+ * SICleanupQueue
+ * Remove messages that have been consumed by all active backends
+ *
+ * callerHasWriteLock is true if caller is holding SInvalWriteLock.
+ * minFree is the minimum number of message slots to make free.
+ *
+ * Possible side effects of this routine include marking one or more
+ * backends as "reset" in the array, and sending PROCSIG_CATCHUP_INTERRUPT
+ * to some backend that seems to be getting too far behind. We signal at
+ * most one backend at a time, for reasons explained at the top of the file.
+ *
+ * Caution: because we transiently release write lock when we have to signal
+ * some other backend, it is NOT guaranteed that there are still minFree
+ * free message slots at exit. Caller must recheck and perhaps retry.
+ */
+void
+SICleanupQueue(bool callerHasWriteLock, int minFree)
+{
+ SISeg *segP = shmInvalBuffer;
+ int min,
+ minsig,
+ lowbound,
+ numMsgs,
+ i;
+ ProcState *needSig = NULL;
+
+ /* Lock out all writers and readers */
+ if (!callerHasWriteLock)
+ LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
+ LWLockAcquire(SInvalReadLock, LW_EXCLUSIVE);
+
+ /*
+ * Recompute minMsgNum = minimum of all backends' nextMsgNum, identify the
+ * furthest-back backend that needs signaling (if any), and reset any
+ * backends that are too far back. Note that because we ignore sendOnly
+ * backends here it is possible for them to keep sending messages without
+ * a problem even when they are the only active backend.
+ */
+ min = segP->maxMsgNum;
+ minsig = min - SIG_THRESHOLD;
+ lowbound = min - MAXNUMMESSAGES + minFree;
+
+ for (i = 0; i < segP->lastBackend; i++)
+ {
+ ProcState *stateP = &segP->procState[i];
+ int n = stateP->nextMsgNum;
+
+ /* Ignore if inactive or already in reset state */
+ if (stateP->procPid == 0 || stateP->resetState || stateP->sendOnly)
+ continue;
+
+ /*
+ * If we must free some space and this backend is preventing it, force
+ * him into reset state and then ignore until he catches up.
+ */
+ if (n < lowbound)
+ {
+ stateP->resetState = true;
+ /* no point in signaling him ... */
+ continue;
+ }
+
+ /* Track the global minimum nextMsgNum */
+ if (n < min)
+ min = n;
+
+ /* Also see who's furthest back of the unsignaled backends */
+ if (n < minsig && !stateP->signaled)
+ {
+ minsig = n;
+ needSig = stateP;
+ }
+ }
+ segP->minMsgNum = min;
+
+ /*
+ * When minMsgNum gets really large, decrement all message counters so as
+ * to forestall overflow of the counters. This happens seldom enough that
+ * folding it into the previous loop would be a loser.
+ */
+ if (min >= MSGNUMWRAPAROUND)
+ {
+ segP->minMsgNum -= MSGNUMWRAPAROUND;
+ segP->maxMsgNum -= MSGNUMWRAPAROUND;
+ for (i = 0; i < segP->lastBackend; i++)
+ {
+ /* we don't bother skipping inactive entries here */
+ segP->procState[i].nextMsgNum -= MSGNUMWRAPAROUND;
+ }
+ }
+
+ /*
+ * Determine how many messages are still in the queue, and set the
+ * threshold at which we should repeat SICleanupQueue().
+ */
+ numMsgs = segP->maxMsgNum - segP->minMsgNum;
+ if (numMsgs < CLEANUP_MIN)
+ segP->nextThreshold = CLEANUP_MIN;
+ else
+ segP->nextThreshold = (numMsgs / CLEANUP_QUANTUM + 1) * CLEANUP_QUANTUM;
+
+ /*
+ * Lastly, signal anyone who needs a catchup interrupt. Since
+ * SendProcSignal() might not be fast, we don't want to hold locks while
+ * executing it.
+ */
+ if (needSig)
+ {
+ pid_t his_pid = needSig->procPid;
+ BackendId his_backendId = (needSig - &segP->procState[0]) + 1;
+
+ needSig->signaled = true;
+ LWLockRelease(SInvalReadLock);
+ LWLockRelease(SInvalWriteLock);
+ elog(DEBUG4, "sending sinval catchup signal to PID %d", (int) his_pid);
+ SendProcSignal(his_pid, PROCSIG_CATCHUP_INTERRUPT, his_backendId);
+ if (callerHasWriteLock)
+ LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
+ }
+ else
+ {
+ LWLockRelease(SInvalReadLock);
+ if (!callerHasWriteLock)
+ LWLockRelease(SInvalWriteLock);
+ }
+}
+
+
+/*
+ * GetNextLocalTransactionId --- allocate a new LocalTransactionId
+ *
+ * We split VirtualTransactionIds into two parts so that it is possible
+ * to allocate a new one without any contention for shared memory, except
+ * for a bit of additional overhead during backend startup/shutdown.
+ * The high-order part of a VirtualTransactionId is a BackendId, and the
+ * low-order part is a LocalTransactionId, which we assign from a local
+ * counter. To avoid the risk of a VirtualTransactionId being reused
+ * within a short interval, successive procs occupying the same backend ID
+ * slot should use a consecutive sequence of local IDs, which is implemented
+ * by copying nextLocalTransactionId as seen above.
+ */
+LocalTransactionId
+GetNextLocalTransactionId(void)
+{
+ LocalTransactionId result;
+
+ /* loop to avoid returning InvalidLocalTransactionId at wraparound */
+ do
+ {
+ result = nextLocalTransactionId++;
+ } while (!LocalTransactionIdIsValid(result));
+
+ return result;
+}
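+
+/*
+ * Illustrative sketch (not part of upstream): a full VirtualTransactionId
+ * is the pair (BackendId, LocalTransactionId), assembled like this:
+ *
+ *     VirtualTransactionId vxid;
+ *
+ *     vxid.backendId = MyBackendId;
+ *     vxid.localTransactionId = GetNextLocalTransactionId();
+ *
+ * which is how InitRecoveryTransactionEnvironment() in standby.c builds
+ * the Startup process's vxid.
+ */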
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
new file mode 100644
index 0000000..687ce03
--- /dev/null
+++ b/src/backend/storage/ipc/standby.c
@@ -0,0 +1,1450 @@
+/*-------------------------------------------------------------------------
+ *
+ * standby.c
+ * Misc functions used in Hot Standby mode.
+ *
+ * All functions for handling RM_STANDBY_ID, which relate to
+ * AccessExclusiveLocks and starting snapshots for Hot Standby mode.
+ * Plus conflict recovery processing.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/standby.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "access/xloginsert.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/sinvaladt.h"
+#include "storage/standby.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+#include "utils/ps_status.h"
+#include "utils/timeout.h"
+#include "utils/timestamp.h"
+
+/* User-settable GUC parameters */
+int vacuum_defer_cleanup_age;
+int max_standby_archive_delay = 30 * 1000;
+int max_standby_streaming_delay = 30 * 1000;
+bool log_recovery_conflict_waits = false;
+
+static HTAB *RecoveryLockLists;
+
+/* Flags set by timeout handlers */
+static volatile sig_atomic_t got_standby_deadlock_timeout = false;
+static volatile sig_atomic_t got_standby_delay_timeout = false;
+static volatile sig_atomic_t got_standby_lock_timeout = false;
+
+static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
+ ProcSignalReason reason,
+ uint32 wait_event_info,
+ bool report_waiting);
+static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason);
+static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
+static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
+static const char *get_recovery_conflict_desc(ProcSignalReason reason);
+
+/*
+ * Keep track of all the locks owned by a given transaction.
+ */
+typedef struct RecoveryLockListsEntry
+{
+ TransactionId xid;
+ List *locks;
+} RecoveryLockListsEntry;
+
+/*
+ * InitRecoveryTransactionEnvironment
+ * Initialize tracking of our primary's in-progress transactions.
+ *
+ * We need to issue shared invalidations and hold locks. Holding locks
+ * means others may want to wait on us, so we need to make a lock table
+ * vxact entry like a real transaction. We could create and delete
+ * lock table entries for each transaction but it's simpler just to create
+ * one permanent entry and leave it there all the time. Locks are then
+ * acquired and released as needed. Yes, this means you can see the
+ * Startup process in pg_locks once we have run this.
+ */
+void
+InitRecoveryTransactionEnvironment(void)
+{
+ VirtualTransactionId vxid;
+ HASHCTL hash_ctl;
+
+ /*
+ * Initialize the hash table for tracking the list of locks held by each
+ * transaction.
+ */
+ hash_ctl.keysize = sizeof(TransactionId);
+ hash_ctl.entrysize = sizeof(RecoveryLockListsEntry);
+ RecoveryLockLists = hash_create("RecoveryLockLists",
+ 64,
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS);
+
+ /*
+ * Initialize shared invalidation management for Startup process, being
+ * careful to register ourselves as a sendOnly process so we don't need to
+ * read messages, nor will we get signaled when the queue starts filling
+ * up.
+ */
+ SharedInvalBackendInit(true);
+
+ /*
+ * Lock a virtual transaction id for Startup process.
+ *
+ * We need to do GetNextLocalTransactionId() because
+ * SharedInvalBackendInit() leaves localTransactionId invalid and the lock
+ * manager doesn't like that at all.
+ *
+ * Note that we don't need to run XactLockTableInsert() because nobody
+ * needs to wait on xids. That sounds a little strange, but table locks
+ * are held by vxids and row level locks are held by xids. All queries
+ * hold AccessShareLocks so never block while we write or lock new rows.
+ */
+ vxid.backendId = MyBackendId;
+ vxid.localTransactionId = GetNextLocalTransactionId();
+ VirtualXactLockTableInsert(vxid);
+
+ standbyState = STANDBY_INITIALIZED;
+}
+
+/*
+ * ShutdownRecoveryTransactionEnvironment
+ * Shut down transaction tracking
+ *
+ * Prepare to switch from hot standby mode to normal operation. Shut down
+ * recovery-time transaction tracking.
+ *
+ * This must be called even during shutdown of the startup process if
+ * transaction tracking has been initialized. Otherwise some locks the
+ * tracked transactions were holding will not be released and may interfere
+ * with processes that are still running when the startup process exits
+ * (even though they will exit soon afterwards).
+ */
+void
+ShutdownRecoveryTransactionEnvironment(void)
+{
+ /*
+ * Do nothing if RecoveryLockLists is NULL, which means that transaction
+ * tracking has not yet been initialized or has already been shut down.
+ * This prevents transaction tracking from being shut down unexpectedly
+ * more than once.
+ */
+ if (RecoveryLockLists == NULL)
+ return;
+
+ /* Mark all tracked in-progress transactions as finished. */
+ ExpireAllKnownAssignedTransactionIds();
+
+ /* Release all locks the tracked transactions were holding */
+ StandbyReleaseAllLocks();
+
+ /* Destroy the hash table of locks. */
+ hash_destroy(RecoveryLockLists);
+ RecoveryLockLists = NULL;
+
+ /* Cleanup our VirtualTransaction */
+ VirtualXactLockTableCleanup();
+}
+
+
+/*
+ * -----------------------------------------------------
+ * Standby wait timers and backend cancel logic
+ * -----------------------------------------------------
+ */
+
+/*
+ * Determine the cutoff time at which we want to start canceling conflicting
+ * transactions. Returns zero (a time safely in the past) if we are willing
+ * to wait forever.
+ */
+static TimestampTz
+GetStandbyLimitTime(void)
+{
+ TimestampTz rtime;
+ bool fromStream;
+
+ /*
+ * The cutoff time is the last WAL data receipt time plus the appropriate
+ * delay variable. Delay of -1 means wait forever.
+ */
+ GetXLogReceiptTime(&rtime, &fromStream);
+ if (fromStream)
+ {
+ if (max_standby_streaming_delay < 0)
+ return 0; /* wait forever */
+ return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
+ }
+ else
+ {
+ if (max_standby_archive_delay < 0)
+ return 0; /* wait forever */
+ return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
+ }
+}
+
+#define STANDBY_INITIAL_WAIT_US 1000
+static int standbyWait_us = STANDBY_INITIAL_WAIT_US;
+
+/*
+ * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
+ * We wait here for a while then return. If we decide we can't wait any
+ * more then we return true, if we can wait some more return false.
+ */
+static bool
+WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
+{
+ TimestampTz ltime;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* Are we past the limit time? */
+ ltime = GetStandbyLimitTime();
+ if (ltime && GetCurrentTimestamp() >= ltime)
+ return true;
+
+ /*
+ * Sleep a bit (this is essential to avoid busy-waiting).
+ */
+ pgstat_report_wait_start(wait_event_info);
+ pg_usleep(standbyWait_us);
+ pgstat_report_wait_end();
+
+ /*
+ * Progressively increase the sleep times, but not to more than 1s, since
+ * pg_usleep isn't interruptible on some platforms.
+ */
+ standbyWait_us *= 2;
+ if (standbyWait_us > 1000000)
+ standbyWait_us = 1000000;
+
+ return false;
+}
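+
+/*
+ * Illustrative note (not part of upstream): starting from
+ * STANDBY_INITIAL_WAIT_US the successive sleeps are 1ms, 2ms, 4ms, ...,
+ * so the 1s cap is reached after roughly ten iterations; the counter is
+ * reset to the initial value for each virtual xid we wait for (see
+ * ResolveRecoveryConflictWithVirtualXIDs below).
+ */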
+
+/*
+ * Log the recovery conflict.
+ *
+ * wait_start is the timestamp when the caller started to wait.
+ * now is the timestamp when this function has been called.
+ * wait_list is the list of virtual transaction ids assigned to
+ * conflicting processes. still_waiting indicates whether
+ * the startup process is still waiting for the recovery conflict
+ * to be resolved or not.
+ */
+void
+LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
+ TimestampTz now, VirtualTransactionId *wait_list,
+ bool still_waiting)
+{
+ long secs;
+ int usecs;
+ long msecs;
+ StringInfoData buf;
+ int nprocs = 0;
+
+ /*
+ * There must be no conflicting processes when the recovery conflict has
+ * already been resolved.
+ */
+ Assert(still_waiting || wait_list == NULL);
+
+ TimestampDifference(wait_start, now, &secs, &usecs);
+ msecs = secs * 1000 + usecs / 1000;
+ usecs = usecs % 1000;
+
+ if (wait_list)
+ {
+ VirtualTransactionId *vxids;
+
+ /* Construct a string of list of the conflicting processes */
+ vxids = wait_list;
+ while (VirtualTransactionIdIsValid(*vxids))
+ {
+ PGPROC *proc = BackendIdGetProc(vxids->backendId);
+
+ /* proc can be NULL if the target backend is not active */
+ if (proc)
+ {
+ if (nprocs == 0)
+ {
+ initStringInfo(&buf);
+ appendStringInfo(&buf, "%d", proc->pid);
+ }
+ else
+ appendStringInfo(&buf, ", %d", proc->pid);
+
+ nprocs++;
+ }
+
+ vxids++;
+ }
+ }
+
+ /*
+ * If wait_list is specified, report the list of PIDs of active
+ * conflicting backends in a detail message. Note that if none of the
+ * backends in the list are active, no detail message is logged.
+ */
+ if (still_waiting)
+ {
+ ereport(LOG,
+ errmsg("recovery still waiting after %ld.%03d ms: %s",
+ msecs, usecs, get_recovery_conflict_desc(reason)),
+ nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
+ "Conflicting processes: %s.",
+ nprocs, buf.data) : 0);
+ }
+ else
+ {
+ ereport(LOG,
+ errmsg("recovery finished waiting after %ld.%03d ms: %s",
+ msecs, usecs, get_recovery_conflict_desc(reason)));
+ }
+
+ if (nprocs > 0)
+ pfree(buf.data);
+}
+
+/*
+ * This is the main executioner for any query backend that conflicts with
+ * recovery processing. Judgement has already been passed on it within
+ * a specific rmgr. Here we just issue the orders to the procs. The procs
+ * then throw the required error as instructed.
+ *
+ * If report_waiting is true, "waiting" is reported in PS display and the
+ * wait for recovery conflict is reported in the log, if necessary. If
+ * the caller is responsible for reporting them, report_waiting should be
+ * false. Otherwise, both the caller and this function would redundantly
+ * report the same thing.
+ */
+static void
+ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
+ ProcSignalReason reason, uint32 wait_event_info,
+ bool report_waiting)
+{
+ TimestampTz waitStart = 0;
+ char *new_status = NULL;
+ bool logged_recovery_conflict = false;
+
+ /* Fast exit, to avoid a kernel call if there's no work to be done. */
+ if (!VirtualTransactionIdIsValid(*waitlist))
+ return;
+
+ /* Set the wait start timestamp for reporting */
+ if (report_waiting && (log_recovery_conflict_waits || update_process_title))
+ waitStart = GetCurrentTimestamp();
+
+ while (VirtualTransactionIdIsValid(*waitlist))
+ {
+ /* reset standbyWait_us for each xact we wait for */
+ standbyWait_us = STANDBY_INITIAL_WAIT_US;
+
+ /* wait until the virtual xid is gone */
+ while (!VirtualXactLock(*waitlist, false))
+ {
+ /* Is it time to kill it? */
+ if (WaitExceedsMaxStandbyDelay(wait_event_info))
+ {
+ pid_t pid;
+
+ /*
+ * Now find out who to throw out of the balloon.
+ */
+ Assert(VirtualTransactionIdIsValid(*waitlist));
+ pid = CancelVirtualTransaction(*waitlist, reason);
+
+ /*
+ * Wait a little bit for it to die so that we avoid flooding
+ * an unresponsive backend when system is heavily loaded.
+ */
+ if (pid != 0)
+ pg_usleep(5000L);
+ }
+
+ if (waitStart != 0 && (!logged_recovery_conflict || new_status == NULL))
+ {
+ TimestampTz now = 0;
+ bool maybe_log_conflict;
+ bool maybe_update_title;
+
+ maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict);
+ maybe_update_title = (update_process_title && new_status == NULL);
+
+ /* Get the current timestamp if not reported yet */
+ if (maybe_log_conflict || maybe_update_title)
+ now = GetCurrentTimestamp();
+
+ /*
+ * Report via ps if we have been waiting for more than 500
+ * msec (should that be configurable?)
+ */
+ if (maybe_update_title &&
+ TimestampDifferenceExceeds(waitStart, now, 500))
+ {
+ const char *old_status;
+ int len;
+
+ old_status = get_ps_display(&len);
+ new_status = (char *) palloc(len + 8 + 1);
+ memcpy(new_status, old_status, len);
+ strcpy(new_status + len, " waiting");
+ set_ps_display(new_status);
+ new_status[len] = '\0'; /* truncate off " waiting" */
+ }
+
+ /*
+ * Emit the log message if the startup process is waiting
+ * longer than deadlock_timeout for recovery conflict.
+ */
+ if (maybe_log_conflict &&
+ TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
+ {
+ LogRecoveryConflict(reason, waitStart, now, waitlist, true);
+ logged_recovery_conflict = true;
+ }
+ }
+ }
+
+ /* The virtual transaction is gone now, wait for the next one */
+ waitlist++;
+ }
+
+ /*
+ * Emit the log message if recovery conflict was resolved but the startup
+ * process waited longer than deadlock_timeout for it.
+ */
+ if (logged_recovery_conflict)
+ LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
+ NULL, false);
+
+ /* Reset ps display if we changed it */
+ if (new_status)
+ {
+ set_ps_display(new_status);
+ pfree(new_status);
+ }
+}
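+
+/*
+ * Illustrative aside, not part of this patch: the loop above resets
+ * standbyWait_us for each xact and leaves the actual sleeping to
+ * WaitExceedsMaxStandbyDelay().  The bounded exponential backoff that this
+ * implies can be sketched standalone as below; the constants are stand-ins
+ * for STANDBY_INITIAL_WAIT_US and the cap used by the real code.
+ */
+#define SKETCH_INITIAL_WAIT_US	1000L		/* first sleep: 1 ms */
+#define SKETCH_MAX_WAIT_US		1000000L	/* clamp each sleep at 1 s */
+
+static inline long
+sketch_next_standby_wait(long current_wait_us)
+{
+	/* double the sleep interval each round, but never exceed the cap */
+	long		next = current_wait_us * 2;
+
+	return (next > SKETCH_MAX_WAIT_US) ? SKETCH_MAX_WAIT_US : next;
+}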
+
+void
+ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode node)
+{
+ VirtualTransactionId *backends;
+
+ /*
+ * If we get passed InvalidTransactionId then we do nothing (no conflict).
+ *
+ * This can happen when replaying already-applied WAL records after a
+ * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE
+ * record that marks as frozen a page which was already all-visible. It's
+ * also quite common with records generated during index deletion
+ * (original execution of the deletion can reason that a recovery conflict
+ * which is sufficient for the deletion operation must take place before
+ * replay of the deletion record itself).
+ */
+ if (!TransactionIdIsValid(latestRemovedXid))
+ return;
+
+ backends = GetConflictingVirtualXIDs(latestRemovedXid,
+ node.dbNode);
+
+ ResolveRecoveryConflictWithVirtualXIDs(backends,
+ PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
+ WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT,
+ true);
+}
+
+/*
+ * Variant of ResolveRecoveryConflictWithSnapshot that works with
+ * FullTransactionId values
+ */
+void
+ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid,
+ RelFileNode node)
+{
+ /*
+ * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
+ * so truncate the logged FullTransactionId. If the logged value is very
+ * old, so that XID wrap-around already happened on it, there can't be any
+ * snapshots that still see it.
+ */
+ FullTransactionId nextXid = ReadNextFullTransactionId();
+ uint64 diff;
+
+ diff = U64FromFullTransactionId(nextXid) -
+ U64FromFullTransactionId(latestRemovedFullXid);
+ if (diff < MaxTransactionId / 2)
+ {
+ TransactionId latestRemovedXid;
+
+ latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid);
+ ResolveRecoveryConflictWithSnapshot(latestRemovedXid, node);
+ }
+}
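+
+/*
+ * Illustrative aside, not part of this patch: stripped of the
+ * FullTransactionId accessors, the wrap-around guard above is plain unsigned
+ * 64-bit arithmetic.  With a stand-in constant for MaxTransactionId it
+ * reduces to roughly this:
+ */
+static inline bool
+sketch_removed_xid_may_conflict(uint64 next_full_xid, uint64 removed_full_xid)
+{
+	const uint64 max_xid = UINT64CONST(0xFFFFFFFF);	/* stand-in for MaxTransactionId */
+	uint64		diff = next_full_xid - removed_full_xid;
+
+	/* true only if the removed xid is within half an xid epoch of nextXid */
+	return diff < max_xid / 2;
+}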
+
+void
+ResolveRecoveryConflictWithTablespace(Oid tsid)
+{
+ VirtualTransactionId *temp_file_users;
+
+ /*
+ * Standby users may be currently using this tablespace for their
+ * temporary files. We only care about current users because
+ * the temp_tablespaces parameter will just ignore tablespaces that no longer
+ * exist.
+ *
+ * Ask everybody to cancel their queries immediately so we can ensure no
+ * temp files remain and we can remove the tablespace. Nuke the entire
+ * site from orbit, it's the only way to be sure.
+ *
+ * XXX: We could work out the pids of active backends using this
+ * tablespace by examining the temp filenames in the directory. We would
+ * then convert the pids into VirtualXIDs before attempting to cancel
+ * them.
+ *
+ * We don't wait for commit because drop tablespace is non-transactional.
+ */
+ temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
+ InvalidOid);
+ ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
+ PROCSIG_RECOVERY_CONFLICT_TABLESPACE,
+ WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE,
+ true);
+}
+
+void
+ResolveRecoveryConflictWithDatabase(Oid dbid)
+{
+ /*
+ * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
+ * only waits for transactions and completely idle sessions would block
+ * us. This is rare enough that we do this as simply as possible: no wait,
+ * just force them off immediately.
+ *
+ * No locking is required here because we already acquired
+ * AccessExclusiveLock. Anybody trying to connect while we do this will
+ * block during InitPostgres() and then disconnect when they see the
+ * database has been removed.
+ */
+ while (CountDBBackends(dbid) > 0)
+ {
+ CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true);
+
+ /*
+ * Wait awhile for them to die so that we avoid flooding an
+ * unresponsive backend when the system is heavily loaded.
+ */
+ pg_usleep(10000);
+ }
+}
+
+/*
+ * ResolveRecoveryConflictWithLock is called from ProcSleep()
+ * to resolve conflicts with other backends holding relation locks.
+ *
+ * The WaitLatch sleep normally done in ProcSleep()
+ * (when not InHotStandby) is performed here, for code clarity.
+ *
+ * We either resolve conflicts immediately or set a timeout to wake us at
+ * the limit of our patience.
+ *
+ * Resolve conflicts by sending a cancel to all backends holding a
+ * conflicting lock. As we are already queued to be granted the lock, no
+ * new lock requests conflicting with ours will be granted in the meantime.
+ *
+ * We also must check for deadlocks involving the Startup process and
+ * hot-standby backend processes. If deadlock_timeout is reached in
+ * this function, all the backends holding the conflicting locks are
+ * requested to check themselves for deadlocks.
+ *
+ * logging_conflict should be true if the recovery conflict has not been
+ * logged yet even though logging is enabled. After deadlock_timeout is
+ * reached and the request for deadlock check is sent, we wait again to
+ * be signaled by the release of the lock if logging_conflict is false.
+ * Otherwise we return without waiting again so that the caller can report
+ * the recovery conflict. In this case, then, this function is called again
+ * with logging_conflict=false (because the recovery conflict has already
+ * been logged) and we will wait again for the lock to be released.
+ */
+void
+ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
+{
+ TimestampTz ltime;
+ TimestampTz now;
+
+ Assert(InHotStandby);
+
+ ltime = GetStandbyLimitTime();
+ now = GetCurrentTimestamp();
+
+ /*
+ * Update waitStart if first time through after the startup process
+ * started waiting for the lock. It should not be updated every time
+ * ResolveRecoveryConflictWithLock() is called during the wait.
+ *
+ * Use the current time obtained above for the comparison with ltime as
+ * waitStart (i.e., the time when this process started waiting for the
+ * lock). Fetching the current time again would add overhead, so we reuse
+ * the value we already have.
+ *
+ * Note that waitStart is updated without holding the lock table's
+ * partition lock, to avoid the overhead of an additional lock
+ * acquisition. This can cause "waitstart" in pg_locks to be NULL for a
+ * very short period after the wait has started even though "granted" is
+ * false. This is acceptable in practice because users typically look at
+ * "waitstart" only when a lock has been awaited for a long time.
+ */
+ if (pg_atomic_read_u64(&MyProc->waitStart) == 0)
+ pg_atomic_write_u64(&MyProc->waitStart, now);
+
+ if (now >= ltime && ltime != 0)
+ {
+ /*
+ * We're already behind, so clear a path as quickly as possible.
+ */
+ VirtualTransactionId *backends;
+
+ backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
+
+ /*
+ * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting
+ * "waiting" in PS display by disabling its argument report_waiting
+ * because the caller, WaitOnLock(), has already reported that.
+ */
+ ResolveRecoveryConflictWithVirtualXIDs(backends,
+ PROCSIG_RECOVERY_CONFLICT_LOCK,
+ PG_WAIT_LOCK | locktag.locktag_type,
+ false);
+ }
+ else
+ {
+ /*
+ * Wait (or wait again) until ltime, and check for deadlocks as well
+ * if we will be waiting longer than deadlock_timeout
+ */
+ EnableTimeoutParams timeouts[2];
+ int cnt = 0;
+
+ if (ltime != 0)
+ {
+ got_standby_lock_timeout = false;
+ timeouts[cnt].id = STANDBY_LOCK_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AT;
+ timeouts[cnt].fin_time = ltime;
+ cnt++;
+ }
+
+ got_standby_deadlock_timeout = false;
+ timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AFTER;
+ timeouts[cnt].delay_ms = DeadlockTimeout;
+ cnt++;
+
+ enable_timeouts(timeouts, cnt);
+ }
+
+ /* Wait to be signaled by the release of the Relation Lock */
+ ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
+
+ /*
+ * Exit if ltime is reached. Then all the backends holding conflicting
+ * locks will be canceled in the next ResolveRecoveryConflictWithLock()
+ * call.
+ */
+ if (got_standby_lock_timeout)
+ goto cleanup;
+
+ if (got_standby_deadlock_timeout)
+ {
+ VirtualTransactionId *backends;
+
+ backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
+
+ /* Quick exit if there's no work to be done */
+ if (!VirtualTransactionIdIsValid(*backends))
+ goto cleanup;
+
+ /*
+ * Send signals to all the backends holding the conflicting locks, to
+ * ask them to check themselves for deadlocks.
+ */
+ while (VirtualTransactionIdIsValid(*backends))
+ {
+ SignalVirtualTransaction(*backends,
+ PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
+ false);
+ backends++;
+ }
+
+ /*
+ * Exit if the recovery conflict has not been logged yet even though
+ * logging is enabled, so that the caller can log it. Then
+ * ResolveRecoveryConflictWithLock() is called again and we will wait
+ * again for the lock to be released.
+ */
+ if (logging_conflict)
+ goto cleanup;
+
+ /*
+ * Wait again here to be signaled by the release of the Relation Lock,
+ * to prevent the subsequent ResolveRecoveryConflictWithLock() from
+ * reaching deadlock_timeout and sending another deadlock-check request.
+ * Otherwise the request would keep being sent every deadlock_timeout
+ * until the relation locks are released or ltime is reached.
+ */
+ got_standby_deadlock_timeout = false;
+ ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
+ }
+
+cleanup:
+
+ /*
+ * Clear any timeout requests established above. We assume here that the
+ * Startup process doesn't have any other outstanding timeouts than those
+ * used by this function. If that stops being true, we could cancel the
+ * timeouts individually, but that'd be slower.
+ */
+ disable_all_timeouts(false);
+ got_standby_lock_timeout = false;
+ got_standby_deadlock_timeout = false;
+}
+
+/*
+ * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
+ * to resolve conflicts with other backends holding buffer pins.
+ *
+ * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
+ * (when not InHotStandby) is performed here, for code clarity.
+ *
+ * We either resolve conflicts immediately or set a timeout to wake us at
+ * the limit of our patience.
+ *
+ * Resolve conflicts by sending a PROCSIG signal to all backends to check if
+ * they hold one of the buffer pins that is blocking Startup process. If so,
+ * those backends will take an appropriate error action, ERROR or FATAL.
+ *
+ * We also must check for deadlocks. Deadlocks can occur because a query
+ * waiting on a lock must be waiting behind an AccessExclusiveLock, which
+ * can only be cleared if the Startup process replays a transaction
+ * completion record. If the Startup process is also waiting then that is
+ * a deadlock. The deadlock can arise either if the query is already
+ * waiting and then Startup sleeps, or if Startup is sleeping and the
+ * query then waits on a lock. We protect against only the former sequence
+ * here; the latter sequence is checked prior to the query sleeping, in
+ * CheckRecoveryConflictDeadlock().
+ *
+ * Deadlocks are extremely rare, and relatively expensive to check for,
+ * so we don't do a deadlock check right away ... only if we have had to wait
+ * at least deadlock_timeout.
+ */
+void
+ResolveRecoveryConflictWithBufferPin(void)
+{
+ TimestampTz ltime;
+
+ Assert(InHotStandby);
+
+ ltime = GetStandbyLimitTime();
+
+ if (GetCurrentTimestamp() >= ltime && ltime != 0)
+ {
+ /*
+ * We're already behind, so clear a path as quickly as possible.
+ */
+ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+ }
+ else
+ {
+ /*
+ * Wake up at ltime, and check for deadlocks as well if we will be
+ * waiting longer than deadlock_timeout
+ */
+ EnableTimeoutParams timeouts[2];
+ int cnt = 0;
+
+ if (ltime != 0)
+ {
+ timeouts[cnt].id = STANDBY_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AT;
+ timeouts[cnt].fin_time = ltime;
+ cnt++;
+ }
+
+ got_standby_deadlock_timeout = false;
+ timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AFTER;
+ timeouts[cnt].delay_ms = DeadlockTimeout;
+ cnt++;
+
+ enable_timeouts(timeouts, cnt);
+ }
+
+ /*
+ * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
+ * by one of the timeouts established above.
+ *
+ * We assume that only UnpinBuffer() and the timeout requests established
+ * above can wake us up here. WakeupRecovery() called by walreceiver or
+ * SIGHUP signal handler, etc. cannot do that because it uses a different
+ * latch from the one ProcWaitForSignal() waits on.
+ */
+ ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
+
+ if (got_standby_delay_timeout)
+ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+ else if (got_standby_deadlock_timeout)
+ {
+ /*
+ * Send out a request for hot-standby backends to check themselves for
+ * deadlocks.
+ *
+ * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
+ * to be signaled by UnpinBuffer() again and send a deadlock-check
+ * request if deadlock_timeout elapses. So the request keeps being sent
+ * every deadlock_timeout until the buffer is unpinned or ltime is
+ * reached, which increases the workload of the startup process and the
+ * backends. In practice this may not be very harmful because the buffer
+ * is usually not kept pinned for long, but should we fix this?
+ */
+ SendRecoveryConflictWithBufferPin(
+ PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
+ }
+
+ /*
+ * Clear any timeout requests established above. We assume here that the
+ * Startup process doesn't have any other timeouts than what this function
+ * uses. If that stops being true, we could cancel the timeouts
+ * individually, but that'd be slower.
+ */
+ disable_all_timeouts(false);
+ got_standby_delay_timeout = false;
+ got_standby_deadlock_timeout = false;
+}
+
+static void
+SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
+{
+ Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN ||
+ reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
+
+ /*
+ * We send a signal to all backends to ask them if they are holding the
+ * buffer pin which is delaying the Startup process. We must not set the
+ * conflict flag yet, since most backends will be innocent. Let the
+ * SIGUSR1 handling in each backend decide their own fate.
+ */
+ CancelDBBackends(InvalidOid, reason, false);
+}
+
+/*
+ * In Hot Standby perform early deadlock detection. We abort the lock
+ * wait if we are about to sleep while holding the buffer pin that Startup
+ * process is waiting for.
+ *
+ * Note: this code is pessimistic, because there is no way for it to
+ * determine whether an actual deadlock condition is present: the lock we
+ * need to wait for might be unrelated to any held by the Startup process.
+ * Sooner or later, this mechanism should get ripped out in favor of somehow
+ * accounting for buffer locks in DeadLockCheck(). However, errors here
+ * seem to be very low-probability in practice, so for now it's not worth
+ * the trouble.
+ */
+void
+CheckRecoveryConflictDeadlock(void)
+{
+ Assert(!InRecovery); /* do not call in Startup process */
+
+ if (!HoldingBufferPinThatDelaysRecovery())
+ return;
+
+ /*
+ * Error message should match ProcessInterrupts() but we avoid calling
+ * that because we aren't handling an interrupt at this point. Note that
+ * we only cancel the current transaction here, so if we are in a
+ * subtransaction and the pin is held by a parent, then the Startup
+ * process will continue to wait even though we have avoided deadlock.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
+ errmsg("canceling statement due to conflict with recovery"),
+ errdetail("User transaction caused buffer deadlock with recovery.")));
+}
+
+
+/* --------------------------------
+ * timeout handler routines
+ * --------------------------------
+ */
+
+/*
+ * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is
+ * exceeded.
+ */
+void
+StandbyDeadLockHandler(void)
+{
+ got_standby_deadlock_timeout = true;
+}
+
+/*
+ * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded.
+ */
+void
+StandbyTimeoutHandler(void)
+{
+ got_standby_delay_timeout = true;
+}
+
+/*
+ * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
+ */
+void
+StandbyLockTimeoutHandler(void)
+{
+ got_standby_lock_timeout = true;
+}
+
+/*
+ * -----------------------------------------------------
+ * Locking in Recovery Mode
+ * -----------------------------------------------------
+ *
+ * All locks are held by the Startup process using a single virtual
+ * transaction. This implementation is both simpler and, in some senses,
+ * more correct. The locks held mean "some original transaction held
+ * this lock, so query access is not allowed at this time". So the Startup
+ * process is the proxy by which the original locks are implemented.
+ *
+ * We only keep track of AccessExclusiveLocks, which are only ever held by
+ * one transaction on one relation.
+ *
+ * We keep a hash table of lists of locks in local memory keyed by xid,
+ * RecoveryLockLists, so we can keep track of the various entries made by
+ * the Startup process's virtual xid in the shared lock table.
+ *
+ * List elements use type xl_standby_lock, since the WAL record type exactly
+ * matches the information that we need to keep track of.
+ *
+ * We use session locks rather than normal locks so we don't need
+ * ResourceOwners.
+ */
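+
+/*
+ * Illustrative aside, not part of this patch: the per-xid hash entry
+ * described above pairs the key with the list of locks taken on behalf of
+ * that transaction.  The real RecoveryLockListsEntry is declared earlier in
+ * this file; the shape assumed by the code below is roughly:
+ *
+ *		typedef struct RecoveryLockListsEntry
+ *		{
+ *			TransactionId	xid;		hash key: original lock holder
+ *			List		   *locks;		list of xl_standby_lock entries
+ *		} RecoveryLockListsEntry;
+ */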
+
+
+void
+StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
+{
+ RecoveryLockListsEntry *entry;
+ xl_standby_lock *newlock;
+ LOCKTAG locktag;
+ bool found;
+
+ /* Already processed? */
+ if (!TransactionIdIsValid(xid) ||
+ TransactionIdDidCommit(xid) ||
+ TransactionIdDidAbort(xid))
+ return;
+
+ elog(trace_recovery(DEBUG4),
+ "adding recovery lock: db %u rel %u", dbOid, relOid);
+
+ /* dbOid is InvalidOid when we are locking a shared relation. */
+ Assert(OidIsValid(relOid));
+
+ /* Create a new list for this xid, if we don't have one already. */
+ entry = hash_search(RecoveryLockLists, &xid, HASH_ENTER, &found);
+ if (!found)
+ {
+ entry->xid = xid;
+ entry->locks = NIL;
+ }
+
+ newlock = palloc(sizeof(xl_standby_lock));
+ newlock->xid = xid;
+ newlock->dbOid = dbOid;
+ newlock->relOid = relOid;
+ entry->locks = lappend(entry->locks, newlock);
+
+ SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid);
+
+ (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
+}
+
+static void
+StandbyReleaseLockList(List *locks)
+{
+ ListCell *lc;
+
+ foreach(lc, locks)
+ {
+ xl_standby_lock *lock = (xl_standby_lock *) lfirst(lc);
+ LOCKTAG locktag;
+
+ elog(trace_recovery(DEBUG4),
+ "releasing recovery lock: xid %u db %u rel %u",
+ lock->xid, lock->dbOid, lock->relOid);
+ SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
+ if (!LockRelease(&locktag, AccessExclusiveLock, true))
+ {
+ elog(LOG,
+ "RecoveryLockLists contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
+ lock->xid, lock->dbOid, lock->relOid);
+ Assert(false);
+ }
+ }
+
+ list_free_deep(locks);
+}
+
+static void
+StandbyReleaseLocks(TransactionId xid)
+{
+ RecoveryLockListsEntry *entry;
+
+ if (TransactionIdIsValid(xid))
+ {
+ if ((entry = hash_search(RecoveryLockLists, &xid, HASH_FIND, NULL)))
+ {
+ StandbyReleaseLockList(entry->locks);
+ hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL);
+ }
+ }
+ else
+ StandbyReleaseAllLocks();
+}
+
+/*
+ * Release locks for a transaction tree, starting at xid down, from
+ * RecoveryLockLists.
+ *
+ * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
+ * to remove any AccessExclusiveLocks requested by a transaction.
+ */
+void
+StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
+{
+ int i;
+
+ StandbyReleaseLocks(xid);
+
+ for (i = 0; i < nsubxids; i++)
+ StandbyReleaseLocks(subxids[i]);
+}
+
+/*
+ * Called at end of recovery and when we see a shutdown checkpoint.
+ */
+void
+StandbyReleaseAllLocks(void)
+{
+ HASH_SEQ_STATUS status;
+ RecoveryLockListsEntry *entry;
+
+ elog(trace_recovery(DEBUG2), "release all standby locks");
+
+ hash_seq_init(&status, RecoveryLockLists);
+ while ((entry = hash_seq_search(&status)))
+ {
+ StandbyReleaseLockList(entry->locks);
+ hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL);
+ }
+}
+
+/*
+ * StandbyReleaseOldLocks
+ * Release standby locks held by top-level XIDs that aren't running,
+ * as long as they're not prepared transactions.
+ */
+void
+StandbyReleaseOldLocks(TransactionId oldxid)
+{
+ HASH_SEQ_STATUS status;
+ RecoveryLockListsEntry *entry;
+
+ hash_seq_init(&status, RecoveryLockLists);
+ while ((entry = hash_seq_search(&status)))
+ {
+ Assert(TransactionIdIsValid(entry->xid));
+
+ /* Skip if prepared transaction. */
+ if (StandbyTransactionIdIsPrepared(entry->xid))
+ continue;
+
+ /* Skip if >= oldxid. */
+ if (!TransactionIdPrecedes(entry->xid, oldxid))
+ continue;
+
+ /* Remove all locks and hash table entry. */
+ StandbyReleaseLockList(entry->locks);
+ hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL);
+ }
+}
+
+/*
+ * --------------------------------------------------------------------
+ * Recovery handling for Rmgr RM_STANDBY_ID
+ *
+ * These record types will only be created if XLogStandbyInfoActive()
+ * --------------------------------------------------------------------
+ */
+
+void
+standby_redo(XLogReaderState *record)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+ /* Backup blocks are not used in standby records */
+ Assert(!XLogRecHasAnyBlockRefs(record));
+
+ /* Do nothing if we're not in hot standby mode */
+ if (standbyState == STANDBY_DISABLED)
+ return;
+
+ if (info == XLOG_STANDBY_LOCK)
+ {
+ xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
+ int i;
+
+ for (i = 0; i < xlrec->nlocks; i++)
+ StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
+ xlrec->locks[i].dbOid,
+ xlrec->locks[i].relOid);
+ }
+ else if (info == XLOG_RUNNING_XACTS)
+ {
+ xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
+ RunningTransactionsData running;
+
+ running.xcnt = xlrec->xcnt;
+ running.subxcnt = xlrec->subxcnt;
+ running.subxid_overflow = xlrec->subxid_overflow;
+ running.nextXid = xlrec->nextXid;
+ running.latestCompletedXid = xlrec->latestCompletedXid;
+ running.oldestRunningXid = xlrec->oldestRunningXid;
+ running.xids = xlrec->xids;
+
+ ProcArrayApplyRecoveryInfo(&running);
+ }
+ else if (info == XLOG_INVALIDATIONS)
+ {
+ xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);
+
+ ProcessCommittedInvalidationMessages(xlrec->msgs,
+ xlrec->nmsgs,
+ xlrec->relcacheInitFileInval,
+ xlrec->dbId,
+ xlrec->tsId);
+ }
+ else
+ elog(PANIC, "standby_redo: unknown op code %u", info);
+}
+
+/*
+ * Log details of the current snapshot to WAL. This allows the snapshot state
+ * to be reconstructed on the standby and for logical decoding.
+ *
+ * This is used for Hot Standby as follows:
+ *
+ * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
+ * start from a shutdown checkpoint because we know nothing was running
+ * at that time and our recovery snapshot is known empty. In the more
+ * typical case of an online checkpoint we need to jump through a few
+ * hoops to get a correct recovery snapshot, and this requires a two- or
+ * sometimes three-stage process.
+ *
+ * The initial snapshot must contain all running xids and all current
+ * AccessExclusiveLocks at a point in time on the standby. Assembling
+ * that information while the server is running requires many and
+ * various LWLocks, so we choose to derive that information piece by
+ * piece and then re-assemble that info on the standby. When that
+ * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
+ *
+ * Since locking on the primary when we derive the information is not
+ * strict, we note that there is a time window between the derivation and
+ * writing to WAL of the derived information. That allows race conditions
+ * that we must resolve, since xids and locks may enter or leave the
+ * snapshot during that window. This creates the issue that an xid or
+ * lock may start *after* the snapshot has been derived yet *before* the
+ * snapshot is logged in the running xacts WAL record. We resolve this by
+ * starting to accumulate changes at a point just prior to when we derive
+ * the snapshot on the primary, then ignore duplicates when we later apply
+ * the snapshot from the running xacts record. This is implemented during
+ * CreateCheckpoint() where we use the logical checkpoint location as
+ * our starting point and then write the running xacts record immediately
+ * before writing the main checkpoint WAL record. Since we always start
+ * up from a checkpoint and are immediately at our starting point, we
+ * unconditionally move to STANDBY_INITIALIZED. After this point we
+ * must do 4 things:
+ * * move shared nextXid forwards as we see new xids
+ * * extend the clog and subtrans with each new xid
+ * * keep track of uncommitted known assigned xids
+ * * keep track of uncommitted AccessExclusiveLocks
+ *
+ * When we see a commit/abort we must remove known assigned xids and locks
+ * from the completing transaction. Attempted removals that cannot locate
+ * an entry are expected and must not cause an error when we are in state
+ * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
+ * KnownAssignedXidsRemove().
+ *
+ * Later, when we apply the running xact data we must be careful to ignore
+ * transactions already committed, since those commits raced ahead when
+ * making WAL entries.
+ *
+ * The loose timing also means that locks may be recorded that have a
+ * zero xid, since xids are removed from procs before locks are removed.
+ * So we must prune the lock list down to ensure we hold locks only for
+ * currently running xids, performed by StandbyReleaseOldLocks().
+ * Zero xids should no longer be possible, but we may be replaying WAL
+ * from a time when they were possible.
+ *
+ * For logical decoding only the running xacts information is needed;
+ * there's no need to look at the locking information, but it's logged anyway,
+ * as there's no independent knob to just enable logical decoding. For
+ * details of how this is used, check snapbuild.c's introductory comment.
+ *
+ *
+ * Returns the RecPtr of the last inserted record.
+ */
+XLogRecPtr
+LogStandbySnapshot(void)
+{
+ XLogRecPtr recptr;
+ RunningTransactions running;
+ xl_standby_lock *locks;
+ int nlocks;
+
+ Assert(XLogStandbyInfoActive());
+
+ /*
+ * Get details of any AccessExclusiveLocks being held at the moment.
+ */
+ locks = GetRunningTransactionLocks(&nlocks);
+ if (nlocks > 0)
+ LogAccessExclusiveLocks(nlocks, locks);
+ pfree(locks);
+
+ /*
+ * Log details of all in-progress transactions. This should be the last
+ * record we write, because standby will open up when it sees this.
+ */
+ running = GetRunningTransactionData();
+
+ /*
+ * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
+ * For Hot Standby this can be done before inserting the WAL record
+ * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
+ * the clog. For logical decoding, though, the lock can't be released
+ * early because the clog might be "in the future" from the POV of the
+ * historic snapshot. This would allow for situations where we're waiting
+ * for the end of a transaction listed in the xl_running_xacts record
+ * which, according to the WAL, has committed before the xl_running_xacts
+ * record. Fortunately this routine isn't executed frequently, and it's
+ * only a shared lock.
+ */
+ if (wal_level < WAL_LEVEL_LOGICAL)
+ LWLockRelease(ProcArrayLock);
+
+ recptr = LogCurrentRunningXacts(running);
+
+ /* Release lock if we kept it longer ... */
+ if (wal_level >= WAL_LEVEL_LOGICAL)
+ LWLockRelease(ProcArrayLock);
+
+ /* GetRunningTransactionData() acquired XidGenLock, we must release it */
+ LWLockRelease(XidGenLock);
+
+ return recptr;
+}
+
+/*
+ * Record an enhanced snapshot of running transactions into WAL.
+ *
+ * The definitions of RunningTransactionsData and xl_running_xacts are
+ * similar. We keep them separate because xl_running_xacts is a
+ * contiguous chunk of memory and never exists fully until it is assembled in
+ * WAL. The inserted records are marked as not being important for durability,
+ * to avoid triggering superfluous checkpoint / archiving activity.
+ */
+static XLogRecPtr
+LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
+{
+ xl_running_xacts xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.xcnt = CurrRunningXacts->xcnt;
+ xlrec.subxcnt = CurrRunningXacts->subxcnt;
+ xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow;
+ xlrec.nextXid = CurrRunningXacts->nextXid;
+ xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
+ xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
+
+ /* Header */
+ XLogBeginInsert();
+ XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
+ XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts);
+
+ /* array of TransactionIds */
+ if (xlrec.xcnt > 0)
+ XLogRegisterData((char *) CurrRunningXacts->xids,
+ (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
+
+ recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
+
+ if (CurrRunningXacts->subxid_overflow)
+ elog(trace_recovery(DEBUG2),
+ "snapshot of %u running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
+ CurrRunningXacts->xcnt,
+ LSN_FORMAT_ARGS(recptr),
+ CurrRunningXacts->oldestRunningXid,
+ CurrRunningXacts->latestCompletedXid,
+ CurrRunningXacts->nextXid);
+ else
+ elog(trace_recovery(DEBUG2),
+ "snapshot of %u+%u running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
+ CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
+ LSN_FORMAT_ARGS(recptr),
+ CurrRunningXacts->oldestRunningXid,
+ CurrRunningXacts->latestCompletedXid,
+ CurrRunningXacts->nextXid);
+
+ /*
+ * Ensure running_xacts information is synced to disk not too far in the
+ * future. We don't want to stall anything though (i.e. use XLogFlush()),
+ * so we let the wal writer do it during normal operation.
+ * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
+ * and nudge the WALWriter into action if sleeping. Check
+ * XLogBackgroundFlush() for details why a record might not be flushed
+ * without it.
+ */
+ XLogSetAsyncXactLSN(recptr);
+
+ return recptr;
+}
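+
+/*
+ * Illustrative aside, not part of this patch: the XLogRegisterData() calls
+ * above lay down a fixed-size header followed by the combined array of
+ * top-level and subtransaction ids, so, when any top-level xids are present,
+ * the payload registered for an XLOG_RUNNING_XACTS record amounts to:
+ */
+static inline Size
+sketch_running_xacts_payload_size(int xcnt, int subxcnt)
+{
+	return MinSizeOfXactRunningXacts +
+		(xcnt + subxcnt) * sizeof(TransactionId);
+}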
+
+/*
+ * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
+ * logged, as described in backend/storage/lmgr/README.
+ */
+static void
+LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
+{
+ xl_standby_locks xlrec;
+
+ xlrec.nlocks = nlocks;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks));
+ XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock));
+ XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
+
+ (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
+}
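+
+/*
+ * Illustrative aside, not part of this patch: the two XLogRegisterData()
+ * calls above describe a record consisting of the fixed xl_standby_locks
+ * header (registered up to offsetof(xl_standby_locks, locks)) followed by
+ * an array of xl_standby_lock entries, so the registered payload amounts to:
+ */
+static inline Size
+sketch_standby_locks_payload_size(int nlocks)
+{
+	return offsetof(xl_standby_locks, locks) +
+		nlocks * sizeof(xl_standby_lock);
+}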
+
+/*
+ * Individual logging of AccessExclusiveLocks for use during LockAcquire()
+ */
+void
+LogAccessExclusiveLock(Oid dbOid, Oid relOid)
+{
+ xl_standby_lock xlrec;
+
+ xlrec.xid = GetCurrentTransactionId();
+
+ xlrec.dbOid = dbOid;
+ xlrec.relOid = relOid;
+
+ LogAccessExclusiveLocks(1, &xlrec);
+ MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
+}
+
+/*
+ * Prepare to log an AccessExclusiveLock, for use during LockAcquire()
+ */
+void
+LogAccessExclusiveLockPrepare(void)
+{
+ /*
+ * Ensure that a TransactionId has been assigned to this transaction, for
+ * two reasons, both related to lock release on the standby. First, we
+ * must assign an xid so that RecordTransactionCommit() and
+ * RecordTransactionAbort() do not optimise away the transaction
+ * completion record which recovery relies upon to release locks. It's a
+ * hack, but for a corner case not worth adding code for into the main
+ * commit path. Second, we must assign an xid before the lock is recorded
+ * in shared memory, otherwise a concurrently executing
+ * GetRunningTransactionLocks() might see a lock associated with an
+ * InvalidTransactionId which we later assert cannot happen.
+ */
+ (void) GetCurrentTransactionId();
+}
+
+/*
+ * Emit WAL for invalidations. This currently is only used for commits without
+ * an xid but which contain invalidations.
+ */
+void
+LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
+ bool relcacheInitFileInval)
+{
+ xl_invalidations xlrec;
+
+ /* prepare record */
+ memset(&xlrec, 0, sizeof(xlrec));
+ xlrec.dbId = MyDatabaseId;
+ xlrec.tsId = MyDatabaseTableSpace;
+ xlrec.relcacheInitFileInval = relcacheInitFileInval;
+ xlrec.nmsgs = nmsgs;
+
+ /* perform insertion */
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&xlrec), MinSizeOfInvalidations);
+ XLogRegisterData((char *) msgs,
+ nmsgs * sizeof(SharedInvalidationMessage));
+ XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
+}
+
+/* Return the description of recovery conflict */
+static const char *
+get_recovery_conflict_desc(ProcSignalReason reason)
+{
+ const char *reasonDesc = _("unknown reason");
+
+ switch (reason)
+ {
+ case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
+ reasonDesc = _("recovery conflict on buffer pin");
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_LOCK:
+ reasonDesc = _("recovery conflict on lock");
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
+ reasonDesc = _("recovery conflict on tablespace");
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
+ reasonDesc = _("recovery conflict on snapshot");
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ reasonDesc = _("recovery conflict on buffer deadlock");
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+ reasonDesc = _("recovery conflict on database");
+ break;
+ default:
+ break;
+ }
+
+ return reasonDesc;
+}
diff --git a/src/backend/storage/large_object/Makefile b/src/backend/storage/large_object/Makefile
new file mode 100644
index 0000000..8a6bc36
--- /dev/null
+++ b/src/backend/storage/large_object/Makefile
@@ -0,0 +1,18 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/large_object
+#
+# IDENTIFICATION
+# src/backend/storage/large_object/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/large_object
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ inv_api.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/large_object/inv_api.c b/src/backend/storage/large_object/inv_api.c
new file mode 100644
index 0000000..c98606a
--- /dev/null
+++ b/src/backend/storage/large_object/inv_api.c
@@ -0,0 +1,955 @@
+/*-------------------------------------------------------------------------
+ *
+ * inv_api.c
+ * routines for manipulating inversion fs large objects. This file
+ * contains the user-level large object application interface routines.
+ *
+ *
+ * Note: we access pg_largeobject.data using its C struct declaration.
+ * This is safe because it immediately follows pageno which is an int4 field,
+ * and therefore the data field will always be 4-byte aligned, even if it
+ * is in the short 1-byte-header format. We have to detoast it since it's
+ * quite likely to be in compressed or short format. We also need to check
+ * for NULLs, since initdb will mark loid and pageno but not data as NOT NULL.
+ *
+ * Note: many of these routines leak memory in CurrentMemoryContext, as indeed
+ * does most of the backend code. We expect that CurrentMemoryContext will
+ * be a short-lived context. Data that must persist across function calls
+ * is kept either in CacheMemoryContext (the Relation structs) or in the
+ * memory context given to inv_open (for LargeObjectDesc structs).
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/large_object/inv_api.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "access/detoast.h"
+#include "access/genam.h"
+#include "access/htup_details.h"
+#include "access/sysattr.h"
+#include "access/table.h"
+#include "access/xact.h"
+#include "catalog/dependency.h"
+#include "catalog/indexing.h"
+#include "catalog/objectaccess.h"
+#include "catalog/pg_largeobject.h"
+#include "catalog/pg_largeobject_metadata.h"
+#include "libpq/libpq-fs.h"
+#include "miscadmin.h"
+#include "storage/large_object.h"
+#include "utils/acl.h"
+#include "utils/fmgroids.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+
+
+/*
+ * GUC: backwards-compatibility flag to suppress LO permission checks
+ */
+bool lo_compat_privileges;
+
+/*
+ * All accesses to pg_largeobject and its index make use of a single Relation
+ * reference, so that we only need to open pg_largeobject once per transaction.
+ * To avoid problems when the first such reference occurs inside a
+ * subtransaction, we execute a slightly klugy maneuver to assign ownership of
+ * the Relation reference to TopTransactionResourceOwner.
+ */
+static Relation lo_heap_r = NULL;
+static Relation lo_index_r = NULL;
+
+
+/*
+ * Open pg_largeobject and its index, if not already done in current xact
+ */
+static void
+open_lo_relation(void)
+{
+ ResourceOwner currentOwner;
+
+ if (lo_heap_r && lo_index_r)
+ return; /* already open in current xact */
+
+ /* Arrange for the top xact to own these relation references */
+ currentOwner = CurrentResourceOwner;
+ CurrentResourceOwner = TopTransactionResourceOwner;
+
+ /* Use RowExclusiveLock since we might either read or write */
+ if (lo_heap_r == NULL)
+ lo_heap_r = table_open(LargeObjectRelationId, RowExclusiveLock);
+ if (lo_index_r == NULL)
+ lo_index_r = index_open(LargeObjectLOidPNIndexId, RowExclusiveLock);
+
+ CurrentResourceOwner = currentOwner;
+}
+
+/*
+ * Clean up at main transaction end
+ */
+void
+close_lo_relation(bool isCommit)
+{
+ if (lo_heap_r || lo_index_r)
+ {
+ /*
+ * Only bother to close if committing; else abort cleanup will handle
+ * it
+ */
+ if (isCommit)
+ {
+ ResourceOwner currentOwner;
+
+ currentOwner = CurrentResourceOwner;
+ CurrentResourceOwner = TopTransactionResourceOwner;
+
+ if (lo_index_r)
+ index_close(lo_index_r, NoLock);
+ if (lo_heap_r)
+ table_close(lo_heap_r, NoLock);
+
+ CurrentResourceOwner = currentOwner;
+ }
+ lo_heap_r = NULL;
+ lo_index_r = NULL;
+ }
+}
+
+
+/*
+ * Same as pg_largeobject.c's LargeObjectExists(), except that the snapshot
+ * to read with can be specified.
+ */
+static bool
+myLargeObjectExists(Oid loid, Snapshot snapshot)
+{
+ Relation pg_lo_meta;
+ ScanKeyData skey[1];
+ SysScanDesc sd;
+ HeapTuple tuple;
+ bool retval = false;
+
+ ScanKeyInit(&skey[0],
+ Anum_pg_largeobject_metadata_oid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(loid));
+
+ pg_lo_meta = table_open(LargeObjectMetadataRelationId,
+ AccessShareLock);
+
+ sd = systable_beginscan(pg_lo_meta,
+ LargeObjectMetadataOidIndexId, true,
+ snapshot, 1, skey);
+
+ tuple = systable_getnext(sd);
+ if (HeapTupleIsValid(tuple))
+ retval = true;
+
+ systable_endscan(sd);
+
+ table_close(pg_lo_meta, AccessShareLock);
+
+ return retval;
+}
+
+
+/*
+ * Extract data field from a pg_largeobject tuple, detoasting if needed
+ * and verifying that the length is sane. Returns data pointer (a bytea *),
+ * data length, and an indication of whether to pfree the data pointer.
+ */
+static void
+getdatafield(Form_pg_largeobject tuple,
+ bytea **pdatafield,
+ int *plen,
+ bool *pfreeit)
+{
+ bytea *datafield;
+ int len;
+ bool freeit;
+
+ datafield = &(tuple->data); /* see note at top of file */
+ freeit = false;
+ if (VARATT_IS_EXTENDED(datafield))
+ {
+ datafield = (bytea *)
+ detoast_attr((struct varlena *) datafield);
+ freeit = true;
+ }
+ len = VARSIZE(datafield) - VARHDRSZ;
+ if (len < 0 || len > LOBLKSIZE)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("pg_largeobject entry for OID %u, page %d has invalid data field size %d",
+ tuple->loid, tuple->pageno, len)));
+ *pdatafield = datafield;
+ *plen = len;
+ *pfreeit = freeit;
+}
+
+
+/*
+ * inv_create -- create a new large object
+ *
+ * Arguments:
+ * lobjId - OID to use for new large object, or InvalidOid to pick one
+ *
+ * Returns:
+ * OID of new object
+ *
+ * If lobjId is not InvalidOid, then an error occurs if the OID is already
+ * in use.
+ */
+Oid
+inv_create(Oid lobjId)
+{
+ Oid lobjId_new;
+
+ /*
+ * Create a new largeobject with empty data pages
+ */
+ lobjId_new = LargeObjectCreate(lobjId);
+
+ /*
+ * dependency on the owner of largeobject
+ *
+ * The reason why we use LargeObjectRelationId instead of
+ * LargeObjectMetadataRelationId here is to provide backward compatibility
+ * for applications that rely on knowledge of the internal layout of the
+ * system catalogs. The OID in pg_largeobject_metadata and the loid in
+ * pg_largeobject have the same value, so there is no actual difference here.
+ */
+ recordDependencyOnOwner(LargeObjectRelationId,
+ lobjId_new, GetUserId());
+
+ /* Post creation hook for new large object */
+ InvokeObjectPostCreateHook(LargeObjectRelationId, lobjId_new, 0);
+
+ /*
+ * Advance command counter to make new tuple visible to later operations.
+ */
+ CommandCounterIncrement();
+
+ return lobjId_new;
+}
+
+/*
+ * inv_open -- access an existing large object.
+ *
+ * Returns a large object descriptor, appropriately filled in.
+ * The descriptor and subsidiary data are allocated in the specified
+ * memory context, which must be suitably long-lived for the caller's
+ * purposes. If the returned descriptor has a snapshot associated
+ * with it, the caller must ensure that it also lives long enough,
+ * e.g. by calling RegisterSnapshotOnOwner.
+ */
+LargeObjectDesc *
+inv_open(Oid lobjId, int flags, MemoryContext mcxt)
+{
+ LargeObjectDesc *retval;
+ Snapshot snapshot = NULL;
+ int descflags = 0;
+
+ /*
+ * Historically, no difference is made between (INV_WRITE) and (INV_WRITE
+ * | INV_READ), the caller being allowed to read the large object
+ * descriptor in either case.
+ */
+ if (flags & INV_WRITE)
+ descflags |= IFS_WRLOCK | IFS_RDLOCK;
+ if (flags & INV_READ)
+ descflags |= IFS_RDLOCK;
+
+ if (descflags == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid flags for opening a large object: %d",
+ flags)));
+
+ /* Get snapshot. If write is requested, use an instantaneous snapshot. */
+ if (descflags & IFS_WRLOCK)
+ snapshot = NULL;
+ else
+ snapshot = GetActiveSnapshot();
+
+ /* Can't use LargeObjectExists here because we need to specify snapshot */
+ if (!myLargeObjectExists(lobjId, snapshot))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("large object %u does not exist", lobjId)));
+
+ /* Apply permission checks, again specifying snapshot */
+ if ((descflags & IFS_RDLOCK) != 0)
+ {
+ if (!lo_compat_privileges &&
+ pg_largeobject_aclcheck_snapshot(lobjId,
+ GetUserId(),
+ ACL_SELECT,
+ snapshot) != ACLCHECK_OK)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied for large object %u",
+ lobjId)));
+ }
+ if ((descflags & IFS_WRLOCK) != 0)
+ {
+ if (!lo_compat_privileges &&
+ pg_largeobject_aclcheck_snapshot(lobjId,
+ GetUserId(),
+ ACL_UPDATE,
+ snapshot) != ACLCHECK_OK)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied for large object %u",
+ lobjId)));
+ }
+
+ /* OK to create a descriptor */
+ retval = (LargeObjectDesc *) MemoryContextAlloc(mcxt,
+ sizeof(LargeObjectDesc));
+ retval->id = lobjId;
+ retval->offset = 0;
+ retval->flags = descflags;
+
+ /* caller sets if needed, not used by the functions in this file */
+ retval->subid = InvalidSubTransactionId;
+
+ /*
+ * The snapshot (if any) is just the currently active snapshot. The
+ * caller will replace it with a longer-lived copy if needed.
+ */
+ retval->snapshot = snapshot;
+
+ return retval;
+}
+
+/*
+ * Closes a large object descriptor previously made by inv_open(), and
+ * releases the long-term memory used by it.
+ */
+void
+inv_close(LargeObjectDesc *obj_desc)
+{
+ Assert(PointerIsValid(obj_desc));
+ pfree(obj_desc);
+}
+
+/*
+ * Destroys an existing large object (not to be confused with a descriptor!)
+ *
+ * Note we expect caller to have done any required permissions check.
+ */
+int
+inv_drop(Oid lobjId)
+{
+ ObjectAddress object;
+
+ /*
+ * Delete any comments and dependencies on the large object
+ */
+ object.classId = LargeObjectRelationId;
+ object.objectId = lobjId;
+ object.objectSubId = 0;
+ performDeletion(&object, DROP_CASCADE, 0);
+
+ /*
+ * Advance command counter so that tuple removal will be seen by later
+ * large-object operations in this transaction.
+ */
+ CommandCounterIncrement();
+
+ /* For historical reasons, we always return 1 on success. */
+ return 1;
+}
+
+/*
+ * Determine size of a large object
+ *
+ * NOTE: LOs can contain gaps, just like Unix files. We actually return
+ * the offset of the last byte + 1.
+ */
+static uint64
+inv_getsize(LargeObjectDesc *obj_desc)
+{
+ uint64 lastbyte = 0;
+ ScanKeyData skey[1];
+ SysScanDesc sd;
+ HeapTuple tuple;
+
+ Assert(PointerIsValid(obj_desc));
+
+ open_lo_relation();
+
+ ScanKeyInit(&skey[0],
+ Anum_pg_largeobject_loid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(obj_desc->id));
+
+ sd = systable_beginscan_ordered(lo_heap_r, lo_index_r,
+ obj_desc->snapshot, 1, skey);
+
+ /*
+ * Because the pg_largeobject index is on both loid and pageno, but we
+ * constrain only loid, a backwards scan should visit all pages of the
+ * large object in reverse pageno order. So, it's sufficient to examine
+ * the first valid tuple (== last valid page).
+ */
+ tuple = systable_getnext_ordered(sd, BackwardScanDirection);
+ if (HeapTupleIsValid(tuple))
+ {
+ Form_pg_largeobject data;
+ bytea *datafield;
+ int len;
+ bool pfreeit;
+
+ if (HeapTupleHasNulls(tuple)) /* paranoia */
+ elog(ERROR, "null field found in pg_largeobject");
+ data = (Form_pg_largeobject) GETSTRUCT(tuple);
+ getdatafield(data, &datafield, &len, &pfreeit);
+ lastbyte = (uint64) data->pageno * LOBLKSIZE + len;
+ if (pfreeit)
+ pfree(datafield);
+ }
+
+ systable_endscan_ordered(sd);
+
+ return lastbyte;
+}
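+
+/*
+ * Illustrative aside, not part of this patch: large object byte offsets map
+ * onto pg_largeobject rows as (pageno, in-page offset) pairs.  That is the
+ * arithmetic inv_getsize() above and inv_read()/inv_write() below rely on;
+ * LOBLKSIZE is BLCKSZ/4, i.e. 2048 bytes with the default 8 kB block size.
+ */
+static inline void
+sketch_lo_offset_to_page(uint64 offset, int32 *pageno, int32 *off)
+{
+	*pageno = (int32) (offset / LOBLKSIZE);
+	*off = (int32) (offset % LOBLKSIZE);
+}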
+
+int64
+inv_seek(LargeObjectDesc *obj_desc, int64 offset, int whence)
+{
+ int64 newoffset;
+
+ Assert(PointerIsValid(obj_desc));
+
+ /*
+ * We allow seek/tell if you have either read or write permission, so no
+ * need for a permission check here.
+ */
+
+ /*
+ * Note: overflow in the additions is possible, but since we will reject
+ * negative results, we don't need any extra test for that.
+ */
+ switch (whence)
+ {
+ case SEEK_SET:
+ newoffset = offset;
+ break;
+ case SEEK_CUR:
+ newoffset = obj_desc->offset + offset;
+ break;
+ case SEEK_END:
+ newoffset = inv_getsize(obj_desc) + offset;
+ break;
+ default:
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid whence setting: %d", whence)));
+ newoffset = 0; /* keep compiler quiet */
+ break;
+ }
+
+ /*
+ * use errmsg_internal here because we don't want to expose INT64_FORMAT
+ * in translatable strings; doing better is not worth the trouble
+ */
+ if (newoffset < 0 || newoffset > MAX_LARGE_OBJECT_SIZE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg_internal("invalid large object seek target: " INT64_FORMAT,
+ newoffset)));
+
+ obj_desc->offset = newoffset;
+ return newoffset;
+}
+
+int64
+inv_tell(LargeObjectDesc *obj_desc)
+{
+ Assert(PointerIsValid(obj_desc));
+
+ /*
+ * We allow seek/tell if you have either read or write permission, so no
+ * need for a permission check here.
+ */
+
+ return obj_desc->offset;
+}
+
+int
+inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes)
+{
+ int nread = 0;
+ int64 n;
+ int64 off;
+ int len;
+ int32 pageno = (int32) (obj_desc->offset / LOBLKSIZE);
+ uint64 pageoff;
+ ScanKeyData skey[2];
+ SysScanDesc sd;
+ HeapTuple tuple;
+
+ Assert(PointerIsValid(obj_desc));
+ Assert(buf != NULL);
+
+ if ((obj_desc->flags & IFS_RDLOCK) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied for large object %u",
+ obj_desc->id)));
+
+ if (nbytes <= 0)
+ return 0;
+
+ open_lo_relation();
+
+ ScanKeyInit(&skey[0],
+ Anum_pg_largeobject_loid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(obj_desc->id));
+
+ ScanKeyInit(&skey[1],
+ Anum_pg_largeobject_pageno,
+ BTGreaterEqualStrategyNumber, F_INT4GE,
+ Int32GetDatum(pageno));
+
+ sd = systable_beginscan_ordered(lo_heap_r, lo_index_r,
+ obj_desc->snapshot, 2, skey);
+
+ while ((tuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL)
+ {
+ Form_pg_largeobject data;
+ bytea *datafield;
+ bool pfreeit;
+
+ if (HeapTupleHasNulls(tuple)) /* paranoia */
+ elog(ERROR, "null field found in pg_largeobject");
+ data = (Form_pg_largeobject) GETSTRUCT(tuple);
+
+ /*
+ * We expect the indexscan will deliver pages in order. However,
+ * there may be missing pages if the LO contains unwritten "holes". We
+ * want missing sections to read out as zeroes.
+ */
+ pageoff = ((uint64) data->pageno) * LOBLKSIZE;
+ if (pageoff > obj_desc->offset)
+ {
+ n = pageoff - obj_desc->offset;
+ n = (n <= (nbytes - nread)) ? n : (nbytes - nread);
+ MemSet(buf + nread, 0, n);
+ nread += n;
+ obj_desc->offset += n;
+ }
+
+ if (nread < nbytes)
+ {
+ Assert(obj_desc->offset >= pageoff);
+ off = (int) (obj_desc->offset - pageoff);
+ Assert(off >= 0 && off < LOBLKSIZE);
+
+ getdatafield(data, &datafield, &len, &pfreeit);
+ if (len > off)
+ {
+ n = len - off;
+ n = (n <= (nbytes - nread)) ? n : (nbytes - nread);
+ memcpy(buf + nread, VARDATA(datafield) + off, n);
+ nread += n;
+ obj_desc->offset += n;
+ }
+ if (pfreeit)
+ pfree(datafield);
+ }
+
+ if (nread >= nbytes)
+ break;
+ }
+
+ systable_endscan_ordered(sd);
+
+ return nread;
+}
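+
+/*
+ * Illustrative aside, not part of this patch: a typical server-side calling
+ * sequence for the functions in this file, roughly as the backend's lo_*
+ * fast-path functions would drive them.  Error handling, permission setup
+ * and snapshot registration are omitted; this is a sketch, not how lo_read()
+ * is actually implemented.
+ */
+static inline int
+sketch_read_lo_prefix(Oid loid, char *out, int len)
+{
+	LargeObjectDesc *lod;
+	int			nread;
+
+	lod = inv_open(loid, INV_READ, CurrentMemoryContext);
+	(void) inv_seek(lod, 0, SEEK_SET);
+	nread = inv_read(lod, out, len);
+	inv_close(lod);
+
+	return nread;
+}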
+
+int
+inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes)
+{
+ int nwritten = 0;
+ int n;
+ int off;
+ int len;
+ int32 pageno = (int32) (obj_desc->offset / LOBLKSIZE);
+ ScanKeyData skey[2];
+ SysScanDesc sd;
+ HeapTuple oldtuple;
+ Form_pg_largeobject olddata;
+ bool neednextpage;
+ bytea *datafield;
+ bool pfreeit;
+ union
+ {
+ bytea hdr;
+ /* this is to make the union big enough for a LO data chunk: */
+ char data[LOBLKSIZE + VARHDRSZ];
+ /* ensure union is aligned well enough: */
+ int32 align_it;
+ } workbuf;
+ char *workb = VARDATA(&workbuf.hdr);
+ HeapTuple newtup;
+ Datum values[Natts_pg_largeobject];
+ bool nulls[Natts_pg_largeobject];
+ bool replace[Natts_pg_largeobject];
+ CatalogIndexState indstate;
+
+ Assert(PointerIsValid(obj_desc));
+ Assert(buf != NULL);
+
+ /* enforce writability because snapshot is probably wrong otherwise */
+ if ((obj_desc->flags & IFS_WRLOCK) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied for large object %u",
+ obj_desc->id)));
+
+ if (nbytes <= 0)
+ return 0;
+
+ /* this addition can't overflow because nbytes is only int32 */
+ if ((nbytes + obj_desc->offset) > MAX_LARGE_OBJECT_SIZE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid large object write request size: %d",
+ nbytes)));
+
+ open_lo_relation();
+
+ indstate = CatalogOpenIndexes(lo_heap_r);
+
+ ScanKeyInit(&skey[0],
+ Anum_pg_largeobject_loid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(obj_desc->id));
+
+ ScanKeyInit(&skey[1],
+ Anum_pg_largeobject_pageno,
+ BTGreaterEqualStrategyNumber, F_INT4GE,
+ Int32GetDatum(pageno));
+
+ sd = systable_beginscan_ordered(lo_heap_r, lo_index_r,
+ obj_desc->snapshot, 2, skey);
+
+ oldtuple = NULL;
+ olddata = NULL;
+ neednextpage = true;
+
+ while (nwritten < nbytes)
+ {
+ /*
+ * If possible, get next pre-existing page of the LO. We expect the
+ * indexscan will deliver these in order --- but there may be holes.
+ */
+ if (neednextpage)
+ {
+ if ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL)
+ {
+ if (HeapTupleHasNulls(oldtuple)) /* paranoia */
+ elog(ERROR, "null field found in pg_largeobject");
+ olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple);
+ Assert(olddata->pageno >= pageno);
+ }
+ neednextpage = false;
+ }
+
+ /*
+ * If we have a pre-existing page, see if it is the page we want to
+ * write, or a later one.
+ */
+ if (olddata != NULL && olddata->pageno == pageno)
+ {
+ /*
+ * Update an existing page with fresh data.
+ *
+ * First, load old data into workbuf
+ */
+ getdatafield(olddata, &datafield, &len, &pfreeit);
+ memcpy(workb, VARDATA(datafield), len);
+ if (pfreeit)
+ pfree(datafield);
+
+ /*
+ * Fill any hole
+ */
+ off = (int) (obj_desc->offset % LOBLKSIZE);
+ if (off > len)
+ MemSet(workb + len, 0, off - len);
+
+ /*
+ * Insert appropriate portion of new data
+ */
+ n = LOBLKSIZE - off;
+ n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten);
+ memcpy(workb + off, buf + nwritten, n);
+ nwritten += n;
+ obj_desc->offset += n;
+ off += n;
+ /* compute valid length of new page */
+ len = (len >= off) ? len : off;
+ SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ);
+
+ /*
+ * Form and insert updated tuple
+ */
+ memset(values, 0, sizeof(values));
+ memset(nulls, false, sizeof(nulls));
+ memset(replace, false, sizeof(replace));
+ values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
+ replace[Anum_pg_largeobject_data - 1] = true;
+ newtup = heap_modify_tuple(oldtuple, RelationGetDescr(lo_heap_r),
+ values, nulls, replace);
+ CatalogTupleUpdateWithInfo(lo_heap_r, &newtup->t_self, newtup,
+ indstate);
+ heap_freetuple(newtup);
+
+ /*
+ * We're done with this old page.
+ */
+ oldtuple = NULL;
+ olddata = NULL;
+ neednextpage = true;
+ }
+ else
+ {
+ /*
+ * Write a brand new page.
+ *
+ * First, fill any hole
+ */
+ off = (int) (obj_desc->offset % LOBLKSIZE);
+ if (off > 0)
+ MemSet(workb, 0, off);
+
+ /*
+ * Insert appropriate portion of new data
+ */
+ n = LOBLKSIZE - off;
+ n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten);
+ memcpy(workb + off, buf + nwritten, n);
+ nwritten += n;
+ obj_desc->offset += n;
+ /* compute valid length of new page */
+ len = off + n;
+ SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ);
+
+ /*
+ * Form and insert updated tuple
+ */
+ memset(values, 0, sizeof(values));
+ memset(nulls, false, sizeof(nulls));
+ values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id);
+ values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno);
+ values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
+ newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls);
+ CatalogTupleInsertWithInfo(lo_heap_r, newtup, indstate);
+ heap_freetuple(newtup);
+ }
+ pageno++;
+ }
+
+ systable_endscan_ordered(sd);
+
+ CatalogCloseIndexes(indstate);
+
+ /*
+ * Advance command counter so that my tuple updates will be seen by later
+ * large-object operations in this transaction.
+ */
+ CommandCounterIncrement();
+
+ return nwritten;
+}
+
+void
+inv_truncate(LargeObjectDesc *obj_desc, int64 len)
+{
+ int32 pageno = (int32) (len / LOBLKSIZE);
+ int32 off;
+ ScanKeyData skey[2];
+ SysScanDesc sd;
+ HeapTuple oldtuple;
+ Form_pg_largeobject olddata;
+ union
+ {
+ bytea hdr;
+ /* this is to make the union big enough for a LO data chunk: */
+ char data[LOBLKSIZE + VARHDRSZ];
+ /* ensure union is aligned well enough: */
+ int32 align_it;
+ } workbuf;
+ char *workb = VARDATA(&workbuf.hdr);
+ HeapTuple newtup;
+ Datum values[Natts_pg_largeobject];
+ bool nulls[Natts_pg_largeobject];
+ bool replace[Natts_pg_largeobject];
+ CatalogIndexState indstate;
+
+ Assert(PointerIsValid(obj_desc));
+
+ /* enforce writability because snapshot is probably wrong otherwise */
+ if ((obj_desc->flags & IFS_WRLOCK) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied for large object %u",
+ obj_desc->id)));
+
+ /*
+ * use errmsg_internal here because we don't want to expose INT64_FORMAT
+ * in translatable strings; doing better is not worth the trouble
+ */
+ if (len < 0 || len > MAX_LARGE_OBJECT_SIZE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg_internal("invalid large object truncation target: " INT64_FORMAT,
+ len)));
+
+ open_lo_relation();
+
+ indstate = CatalogOpenIndexes(lo_heap_r);
+
+ /*
+ * Set up to find all pages with desired loid and pageno >= target
+ */
+ ScanKeyInit(&skey[0],
+ Anum_pg_largeobject_loid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(obj_desc->id));
+
+ ScanKeyInit(&skey[1],
+ Anum_pg_largeobject_pageno,
+ BTGreaterEqualStrategyNumber, F_INT4GE,
+ Int32GetDatum(pageno));
+
+ sd = systable_beginscan_ordered(lo_heap_r, lo_index_r,
+ obj_desc->snapshot, 2, skey);
+
+ /*
+ * If possible, get the page the truncation point is in. The truncation
+ * point may be beyond the end of the LO or in a hole.
+ */
+ olddata = NULL;
+ if ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL)
+ {
+ if (HeapTupleHasNulls(oldtuple)) /* paranoia */
+ elog(ERROR, "null field found in pg_largeobject");
+ olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple);
+ Assert(olddata->pageno >= pageno);
+ }
+
+ /*
+ * If we found the page of the truncation point we need to truncate the
+ * data in it. Otherwise if we're in a hole, we need to create a page to
+ * mark the end of data.
+ */
+ if (olddata != NULL && olddata->pageno == pageno)
+ {
+ /* First, load old data into workbuf */
+ bytea *datafield;
+ int pagelen;
+ bool pfreeit;
+
+ getdatafield(olddata, &datafield, &pagelen, &pfreeit);
+ memcpy(workb, VARDATA(datafield), pagelen);
+ if (pfreeit)
+ pfree(datafield);
+
+ /*
+ * Fill any hole
+ */
+ off = len % LOBLKSIZE;
+ if (off > pagelen)
+ MemSet(workb + pagelen, 0, off - pagelen);
+
+ /* compute length of new page */
+ SET_VARSIZE(&workbuf.hdr, off + VARHDRSZ);
+
+ /*
+ * Form and insert updated tuple
+ */
+ memset(values, 0, sizeof(values));
+ memset(nulls, false, sizeof(nulls));
+ memset(replace, false, sizeof(replace));
+ values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
+ replace[Anum_pg_largeobject_data - 1] = true;
+ newtup = heap_modify_tuple(oldtuple, RelationGetDescr(lo_heap_r),
+ values, nulls, replace);
+ CatalogTupleUpdateWithInfo(lo_heap_r, &newtup->t_self, newtup,
+ indstate);
+ heap_freetuple(newtup);
+ }
+ else
+ {
+ /*
+ * If the first page we found was after the truncation point, we're in
+ * a hole that we'll fill, but we need to delete the later page
+ * because the loop below won't visit it again.
+ */
+ if (olddata != NULL)
+ {
+ Assert(olddata->pageno > pageno);
+ CatalogTupleDelete(lo_heap_r, &oldtuple->t_self);
+ }
+
+ /*
+ * Write a brand new page.
+ *
+ * Fill the hole up to the truncation point
+ */
+ off = len % LOBLKSIZE;
+ if (off > 0)
+ MemSet(workb, 0, off);
+
+ /* compute length of new page */
+ SET_VARSIZE(&workbuf.hdr, off + VARHDRSZ);
+
+ /*
+ * Form and insert new tuple
+ */
+ memset(values, 0, sizeof(values));
+ memset(nulls, false, sizeof(nulls));
+ values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id);
+ values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno);
+ values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
+ newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls);
+ CatalogTupleInsertWithInfo(lo_heap_r, newtup, indstate);
+ heap_freetuple(newtup);
+ }
+
+ /*
+ * Delete any pages after the truncation point. If the initial search
+ * didn't find a page, then of course there's nothing more to do.
+ */
+ if (olddata != NULL)
+ {
+ while ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL)
+ {
+ CatalogTupleDelete(lo_heap_r, &oldtuple->t_self);
+ }
+ }
+
+ systable_endscan_ordered(sd);
+
+ CatalogCloseIndexes(indstate);
+
+ /*
+ * Advance command counter so that tuple updates will be seen by later
+ * large-object operations in this transaction.
+ */
+ CommandCounterIncrement();
+}
diff --git a/src/backend/storage/lmgr/.gitignore b/src/backend/storage/lmgr/.gitignore
new file mode 100644
index 0000000..9355cae
--- /dev/null
+++ b/src/backend/storage/lmgr/.gitignore
@@ -0,0 +1,2 @@
+/lwlocknames.c
+/lwlocknames.h
diff --git a/src/backend/storage/lmgr/Makefile b/src/backend/storage/lmgr/Makefile
new file mode 100644
index 0000000..829b792
--- /dev/null
+++ b/src/backend/storage/lmgr/Makefile
@@ -0,0 +1,51 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/lmgr
+#
+# IDENTIFICATION
+# src/backend/storage/lmgr/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/lmgr
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ condition_variable.o \
+ deadlock.o \
+ lmgr.o \
+ lock.o \
+ lwlock.o \
+ lwlocknames.o \
+ predicate.o \
+ proc.o \
+ s_lock.o \
+ spin.o
+
+include $(top_srcdir)/src/backend/common.mk
+
+ifdef TAS
+TASPATH = $(top_builddir)/src/backend/port/tas.o
+endif
+
+s_lock_test: s_lock.c $(top_builddir)/src/port/libpgport.a
+ $(CC) $(CPPFLAGS) $(CFLAGS) -DS_LOCK_TEST=1 $(srcdir)/s_lock.c \
+ $(TASPATH) -L $(top_builddir)/src/port -lpgport -o s_lock_test
+
+# see notes in src/backend/parser/Makefile
+lwlocknames.c: lwlocknames.h
+ touch $@
+
+lwlocknames.h: $(top_srcdir)/src/backend/storage/lmgr/lwlocknames.txt generate-lwlocknames.pl
+ $(PERL) $(srcdir)/generate-lwlocknames.pl $<
+
+check: s_lock_test
+ ./s_lock_test
+
+clean distclean:
+ rm -f s_lock_test
+
+maintainer-clean: clean
+ rm -f lwlocknames.h lwlocknames.c
diff --git a/src/backend/storage/lmgr/README b/src/backend/storage/lmgr/README
new file mode 100644
index 0000000..c96cc7b
--- /dev/null
+++ b/src/backend/storage/lmgr/README
@@ -0,0 +1,739 @@
+src/backend/storage/lmgr/README
+
+Locking Overview
+================
+
+Postgres uses four types of interprocess locks:
+
+* Spinlocks. These are intended for *very* short-term locks. If a lock
+is to be held more than a few dozen instructions, or across any sort of
+kernel call (or even a call to a nontrivial subroutine), don't use a
+spinlock. Spinlocks are primarily used as infrastructure for lightweight
+locks. They are implemented using a hardware atomic-test-and-set
+instruction, if available. Waiting processes busy-loop until they can
+get the lock. There is no provision for deadlock detection, automatic
+release on error, or any other nicety. There is a timeout if the lock
+cannot be gotten after a minute or so (which is approximately forever in
+comparison to the intended lock hold time, so this is certainly an error
+condition).
+
+* Lightweight locks (LWLocks). These locks are typically used to
+interlock access to datastructures in shared memory. LWLocks support
+both exclusive and shared lock modes (for read/write and read-only
+access to a shared object). There is no provision for deadlock
+detection, but the LWLock manager will automatically release held
+LWLocks during elog() recovery, so it is safe to raise an error while
+holding LWLocks. Obtaining or releasing an LWLock is quite fast (a few
+dozen instructions) when there is no contention for the lock. When a
+process has to wait for an LWLock, it blocks on a SysV semaphore so as
+to not consume CPU time. Waiting processes will be granted the lock in
+arrival order. There is no timeout.
+
+* Regular locks (a/k/a heavyweight locks). The regular lock manager
+supports a variety of lock modes with table-driven semantics, and it has
+full deadlock detection and automatic release at transaction end.
+Regular locks should be used for all user-driven lock requests.
+
+* SIReadLock predicate locks. See separate README-SSI file for details.
+
+Acquisition of either a spinlock or a lightweight lock causes query
+cancel and die() interrupts to be held off until all such locks are
+released. No such restriction exists for regular locks, however. Also
+note that we can accept query cancel and die() interrupts while waiting
+for a regular lock, but we will not accept them while waiting for
+spinlocks or LW locks. It is therefore not a good idea to use LW locks
+when the wait time might exceed a few seconds.
+
+The rest of this README file discusses the regular lock manager in detail.
+
+
+Lock Data Structures
+--------------------
+
+Lock methods describe the overall locking behavior. Currently there are
+two lock methods: DEFAULT and USER.
+
+Lock modes describe the type of the lock (read/write or shared/exclusive).
+In principle, each lock method can have its own set of lock modes with
+different conflict rules, but currently DEFAULT and USER methods use
+identical lock mode sets. See src/include/storage/lock.h for more details.
+(Lock modes are also called lock types in some places in the code and
+documentation.)
+
+There are two main methods for recording locks in shared memory. The primary
+mechanism uses two main structures: the per-lockable-object LOCK struct, and
+the per-lock-and-requestor PROCLOCK struct. A LOCK object exists for each
+lockable object that currently has locks held or requested on it. A PROCLOCK
+struct exists for each backend that is holding or requesting lock(s) on each
+LOCK object.
+
+There is also a special "fast path" mechanism which backends may use to
+record a limited number of locks with very specific characteristics: they must
+use the DEFAULT lockmethod; they must represent a lock on a database relation
+(not a shared relation); they must be a "weak" lock which is unlikely to
+conflict (AccessShareLock, RowShareLock, or RowExclusiveLock); and the system
+must be able to quickly verify that no conflicting locks could possibly be
+present. See "Fast Path Locking", below, for more details.
+
+Each backend also maintains an unshared LOCALLOCK structure for each lockable
+object and lock mode that it is currently holding or requesting. The shared
+lock structures only allow a single lock grant to be made per lockable
+object/lock mode/backend. Internally to a backend, however, the same lock may
+be requested and perhaps released multiple times in a transaction, and it can
+also be held both transactionally and session-wide. The internal request
+counts are held in LOCALLOCK so that the shared data structures need not be
+accessed to alter them.
+
+---------------------------------------------------------------------------
+
+The lock manager's LOCK objects contain:
+
+tag -
+ The key fields that are used for hashing locks in the shared memory
+ lock hash table. The contents of the tag essentially define an
+ individual lockable object. See include/storage/lock.h for details
+ about the supported types of lockable objects. This is declared as
+ a separate struct to ensure that we always zero out the correct number
+ of bytes. It is critical that any alignment-padding bytes the compiler
+ might insert in the struct be zeroed out, else the hash computation
+ will be random. (Currently, we are careful to define struct LOCKTAG
+ so that there are no padding bytes.)
+
+grantMask -
+ This bitmask indicates what types of locks are currently held on the
+ given lockable object. It is used (against the lock table's conflict
+ table) to determine if a new lock request will conflict with existing
+ lock types held. Conflicts are determined by bitwise AND operations
+ between the grantMask and the conflict table entry for the requested
+ lock type. Bit i of grantMask is 1 if and only if granted[i] > 0.
+
+waitMask -
+ This bitmask shows the types of locks being waited for. Bit i of waitMask
+ is 1 if and only if requested[i] > granted[i].
+
+procLocks -
+ This is a shared memory queue of all the PROCLOCK structs associated with
+ the lock object. Note that both granted and waiting PROCLOCKs are in this
+ list (indeed, the same PROCLOCK might have some already-granted locks and
+ be waiting for more!).
+
+waitProcs -
+ This is a shared memory queue of all PGPROC structures corresponding to
+ backends that are waiting (sleeping) until another backend releases this
+ lock. The process structure holds the information needed to determine
+ if it should be woken up when the lock is released.
+
+nRequested -
+ Keeps a count of how many times this lock has been attempted to be
+ acquired. The count includes attempts by processes which were put
+ to sleep due to conflicts. It also counts the same backend twice
+ if, for example, a backend process first acquires a read and then
+ acquires a write. (But multiple acquisitions of the same lock/lock mode
+ within a backend are not multiply counted here; they are recorded
+ only in the backend's LOCALLOCK structure.)
+
+requested -
+ Keeps a count of how many locks of each type have been attempted. Only
+ elements 1 through MAX_LOCKMODES-1 are used as they correspond to the lock
+ type defined constants. Summing the values of requested[] should come out
+ equal to nRequested.
+
+nGranted -
+ Keeps count of how many times this lock has been successfully acquired.
+ This count does not include attempts that are waiting due to conflicts.
+ Otherwise the counting rules are the same as for nRequested.
+
+granted -
+ Keeps count of how many locks of each type are currently held. Once again
+ only elements 1 through MAX_LOCKMODES-1 are used (0 is not). Also, like
+ requested[], summing the values of granted[] should total to the value
+ of nGranted.
+
+We should always have 0 <= nGranted <= nRequested, and
+0 <= granted[i] <= requested[i] for each i. When all the request counts
+go to zero, the LOCK object is no longer needed and can be freed.
+
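+As a rough sketch (a simplified standalone model, not the actual
+declarations in lock.h), the counter bookkeeping and its invariants can be
+pictured like this:
+
+    #include <assert.h>
+
+    #define MAX_LOCKMODES 10
+
+    typedef struct SimpleLock
+    {
+        int     grantMask;                  /* bit i set iff granted[i] > 0 */
+        int     waitMask;                   /* bit i set iff requested[i] > granted[i] */
+        int     requested[MAX_LOCKMODES];   /* request counts per lock mode */
+        int     nRequested;                 /* sum of requested[1..] */
+        int     granted[MAX_LOCKMODES];     /* grant counts per lock mode */
+        int     nGranted;                   /* sum of granted[1..] */
+    } SimpleLock;
+
+    /* Check the invariants described above for one lock object. */
+    static void
+    check_lock_invariants(const SimpleLock *lock)
+    {
+        int     sumReq = 0,
+                sumGrant = 0;
+
+        for (int i = 1; i < MAX_LOCKMODES; i++)
+        {
+            assert(0 <= lock->granted[i] && lock->granted[i] <= lock->requested[i]);
+            assert(((lock->grantMask >> i) & 1) == (lock->granted[i] > 0));
+            assert(((lock->waitMask >> i) & 1) == (lock->requested[i] > lock->granted[i]));
+            sumReq += lock->requested[i];
+            sumGrant += lock->granted[i];
+        }
+        assert(sumReq == lock->nRequested && sumGrant == lock->nGranted);
+        assert(0 <= lock->nGranted && lock->nGranted <= lock->nRequested);
+    }
+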
+---------------------------------------------------------------------------
+
+The lock manager's PROCLOCK objects contain:
+
+tag -
+ The key fields that are used for hashing entries in the shared memory
+ PROCLOCK hash table. This is declared as a separate struct to ensure that
+ we always zero out the correct number of bytes. It is critical that any
+ alignment-padding bytes the compiler might insert in the struct be zeroed
+ out, else the hash computation will be random. (Currently, we are careful
+ to define struct PROCLOCKTAG so that there are no padding bytes.)
+
+ tag.myLock
+ Pointer to the shared LOCK object this PROCLOCK is for.
+
+ tag.myProc
+ Pointer to the PGPROC of backend process that owns this PROCLOCK.
+
+ Note: it's OK to use pointers here because a PROCLOCK never outlives
+ either its lock or its proc. The tag is therefore unique for as long
+ as it needs to be, even though the same tag values might mean something
+ else at other times.
+
+holdMask -
+ A bitmask for the lock modes successfully acquired by this PROCLOCK.
+ This should be a subset of the LOCK object's grantMask, and also a
+ subset of the PGPROC object's heldLocks mask (if the PGPROC is
+ currently waiting for another lock mode on this lock).
+
+releaseMask -
+ A bitmask for the lock modes due to be released during LockReleaseAll.
+ This must be a subset of the holdMask. Note that it is modified without
+ taking the partition LWLock, and therefore it is unsafe for any
+ backend except the one owning the PROCLOCK to examine/change it.
+
+lockLink -
+ List link for shared memory queue of all the PROCLOCK objects for the
+ same LOCK.
+
+procLink -
+ List link for shared memory queue of all the PROCLOCK objects for the
+ same backend.
+
+---------------------------------------------------------------------------
+
+
+Lock Manager Internal Locking
+-----------------------------
+
+Before PostgreSQL 8.2, all of the shared-memory data structures used by
+the lock manager were protected by a single LWLock, the LockMgrLock;
+any operation involving these data structures had to exclusively lock
+LockMgrLock. Not too surprisingly, this became a contention bottleneck.
+To reduce contention, the lock manager's data structures have been split
+into multiple "partitions", each protected by an independent LWLock.
+Most operations only need to lock the single partition they are working in.
+Here are the details:
+
+* Each possible lock is assigned to one partition according to a hash of
+its LOCKTAG value. The partition's LWLock is considered to protect all the
+LOCK objects of that partition as well as their subsidiary PROCLOCKs.
+
+* The shared-memory hash tables for LOCKs and PROCLOCKs are organized
+so that different partitions use different hash chains, and thus there
+is no conflict in working with objects in different partitions. This
+is supported directly by dynahash.c's "partitioned table" mechanism
+for the LOCK table: we need only ensure that the partition number is
+taken from the low-order bits of the dynahash hash value for the LOCKTAG.
+To make it work for PROCLOCKs, we have to ensure that a PROCLOCK's hash
+value has the same low-order bits as its associated LOCK. This requires
+a specialized hash function (see proclock_hash).
+
+* Formerly, each PGPROC had a single list of PROCLOCKs belonging to it.
+This has now been split into per-partition lists, so that access to a
+particular PROCLOCK list can be protected by the associated partition's
+LWLock. (This rule allows one backend to manipulate another backend's
+PROCLOCK lists, which was not originally necessary but is now required in
+connection with fast-path locking; see below.)
+
+* The other lock-related fields of a PGPROC are only interesting when
+the PGPROC is waiting for a lock, so we consider that they are protected
+by the partition LWLock of the awaited lock.
+
+For normal lock acquisition and release, it is sufficient to lock the
+partition containing the desired lock. Deadlock checking needs to touch
+multiple partitions in general; for simplicity, we just make it lock all
+the partitions in partition-number order. (To prevent LWLock deadlock,
+we establish the rule that any backend needing to lock more than one
+partition at once must lock them in partition-number order.) It's
+possible that deadlock checking could be done without touching every
+partition in typical cases, but since in a properly functioning system
+deadlock checking should not occur often enough to be performance-critical,
+trying to make this work does not seem a productive use of effort.
+
+A backend's internal LOCALLOCK hash table is not partitioned. We do store
+a copy of the locktag hash code in LOCALLOCK table entries, from which the
+partition number can be computed, but this is a straight speed-for-space
+tradeoff: we could instead recalculate the partition number from the LOCKTAG
+when needed.
+
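+As a concrete sketch of the partition selection (the constants and function
+names here are hypothetical; the real proclock_hash differs in detail):
+
+    #include <stdint.h>
+
+    #define NUM_LOCK_PARTITIONS 16      /* must be a power of two */
+
+    /* Partition number comes from the low-order bits of the hash value. */
+    static inline uint32_t
+    lock_hash_partition(uint32_t lock_hashcode)
+    {
+        return lock_hashcode % NUM_LOCK_PARTITIONS;
+    }
+
+    /*
+     * A PROCLOCK's hash must agree with its LOCK's hash in the low-order
+     * bits so that both land in the same partition.  One way to satisfy
+     * that requirement: keep the LOCK's low bits and mix the owning PGPROC
+     * only into the high bits.
+     */
+    static inline uint32_t
+    proclock_hash_sketch(uint32_t lock_hashcode, uintptr_t proc_pointer)
+    {
+        uint32_t    prochash = (uint32_t) (proc_pointer >> 4) * 2654435761u;
+
+        return (lock_hashcode & (NUM_LOCK_PARTITIONS - 1)) |
+               (prochash & ~(uint32_t) (NUM_LOCK_PARTITIONS - 1));
+    }
+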
+
+Fast Path Locking
+-----------------
+
+Fast path locking is a special purpose mechanism designed to reduce the
+overhead of taking and releasing certain types of locks which are taken
+and released very frequently but rarely conflict. Currently, this includes
+two categories of locks:
+
+(1) Weak relation locks. SELECT, INSERT, UPDATE, and DELETE must acquire a
+lock on every relation they operate on, as well as various system catalogs
+that can be used internally. Many DML operations can proceed in parallel
+against the same table at the same time; only DDL operations such as
+CLUSTER, ALTER TABLE, or DROP -- or explicit user action such as LOCK TABLE
+-- will create lock conflicts with the "weak" locks (AccessShareLock,
+RowShareLock, RowExclusiveLock) acquired by DML operations.
+
+(2) VXID locks. Every transaction takes a lock on its own virtual
+transaction ID. Currently, the only operations that wait for these locks
+are CREATE INDEX CONCURRENTLY and Hot Standby (in the case of a conflict),
+so most VXID locks are taken and released by the owner without anyone else
+needing to care.
+
+The primary locking mechanism does not cope well with this workload. Even
+though the lock manager locks are partitioned, the locktag for any given
+relation still falls in one, and only one, partition. Thus, if many short
+queries are accessing the same relation, the lock manager partition lock for
+that partition becomes a contention bottleneck. This effect is measurable
+even on 2-core servers, and becomes very pronounced as core count increases.
+
+To alleviate this bottleneck, beginning in PostgreSQL 9.2, each backend is
+permitted to record a limited number of locks on unshared relations in an
+array within its PGPROC structure, rather than using the primary lock table.
+This mechanism can only be used when the locker can verify that no conflicting
+locks exist at the time of taking the lock.
+
+A key point of this algorithm is that it must be possible to verify the
+absence of possibly conflicting locks without fighting over a shared LWLock or
+spinlock. Otherwise, this effort would simply move the contention bottleneck
+from one place to another. We accomplish this using an array of 1024 integer
+counters, which are in effect a 1024-way partitioning of the lock space.
+Each counter records the number of "strong" locks (that is, ShareLock,
+ShareRowExclusiveLock, ExclusiveLock, and AccessExclusiveLock) on unshared
+relations that fall into that partition. When this counter is non-zero, the
+fast path mechanism may not be used to take new relation locks within that
+partition. A strong locker bumps the counter and then scans each per-backend
+array for matching fast-path locks; any which are found must be transferred to
+the primary lock table before attempting to acquire the lock, to ensure proper
+lock conflict and deadlock detection.
+
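+A highly simplified sketch of the weak-lock fast path decision (hypothetical
+names and sizes; it ignores the per-backend LWLock and the lock-transfer
+machinery described below):
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    #define FASTPATH_SLOTS_PER_BACKEND 16
+    #define FASTPATH_PARTITIONS        1024
+
+    /* count of strong relation locks per fast-path partition (in the real
+     * system this lives in shared memory; here just a plain array) */
+    static uint32_t fastpath_strong_count[FASTPATH_PARTITIONS];
+
+    /* backend-local record of relations locked via the fast path */
+    typedef struct FastPathSlots
+    {
+        uint32_t    reloid[FASTPATH_SLOTS_PER_BACKEND];
+        int         nused;
+    } FastPathSlots;
+
+    /*
+     * Try to record a weak relation lock in the backend-local array.
+     * Returns false if the fast path cannot be used, in which case the
+     * caller must fall back to the shared lock table.
+     */
+    static bool
+    try_fast_path_lock(FastPathSlots *slots, uint32_t reloid)
+    {
+        uint32_t    partition = reloid % FASTPATH_PARTITIONS;
+
+        if (fastpath_strong_count[partition] != 0)
+            return false;       /* a strong lock may exist: use main table */
+        if (slots->nused >= FASTPATH_SLOTS_PER_BACKEND)
+            return false;       /* out of fast-path slots */
+        slots->reloid[slots->nused++] = reloid;
+        return true;
+    }
+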
+On an SMP system, we must guarantee proper memory synchronization. Here we
+rely on the fact that LWLock acquisition acts as a memory sequence point: if
+A performs a store, A and B both acquire an LWLock in either order, and B
+then performs a load on the same memory location, it is guaranteed to see
+A's store. In this case, each backend's fast-path lock queue is protected
+by an LWLock. A backend wishing to acquire a fast-path lock grabs this
+LWLock before examining FastPathStrongRelationLocks to check for the presence
+of a conflicting strong lock. And the backend attempting to acquire a strong
+lock, because it must transfer any matching weak locks taken via the fast-path
+mechanism to the shared lock table, will acquire every LWLock protecting a
+backend fast-path queue in turn. So, if we examine
+FastPathStrongRelationLocks and see a zero, then either the value is truly
+zero, or if it is a stale value, the strong locker has yet to acquire the
+per-backend LWLock we now hold (or, indeed, even the first per-backend LWLock)
+and will notice any weak lock we take when it does.
+
+Fast-path VXID locks do not use the FastPathStrongRelationLocks table. The
+first lock taken on a VXID is always the ExclusiveLock taken by its owner.
+Any subsequent lockers are share lockers waiting for the VXID to terminate.
+Indeed, the only reason VXID locks use the lock manager at all (rather than
+waiting for the VXID to terminate via some other method) is for deadlock
+detection. Thus, the initial VXID lock can *always* be taken via the fast
+path without checking for conflicts. Any subsequent locker must check
+whether the lock has been transferred to the main lock table, and if not,
+do so. The backend owning the VXID must be careful to clean up any entry
+made in the main lock table at end of transaction.
+
+Deadlock detection does not need to examine the fast-path data structures,
+because any lock that could possibly be involved in a deadlock must have
+been transferred to the main tables beforehand.
+
+
+The Deadlock Detection Algorithm
+--------------------------------
+
+Since we allow user transactions to request locks in any order, deadlock
+is possible. We use a deadlock detection/breaking algorithm that is
+fairly standard in essence, but there are many special considerations
+needed to deal with Postgres' generalized locking model.
+
+A key design consideration is that we want to make routine operations
+(lock grant and release) run quickly when there is no deadlock, and
+avoid the overhead of deadlock handling as much as possible. We do this
+using an "optimistic waiting" approach: if a process cannot acquire the
+lock it wants immediately, it goes to sleep without any deadlock check.
+But it also sets a delay timer, with a delay of DeadlockTimeout
+milliseconds (typically set to one second). If the delay expires before
+the process is granted the lock it wants, it runs the deadlock
+detection/breaking code. Normally this code will determine that there is
+no deadlock condition, and then the process will go back to sleep and
+wait quietly until it is granted the lock. But if a deadlock condition
+does exist, it will be resolved, usually by aborting the detecting
+process' transaction. In this way, we avoid deadlock handling overhead
+whenever the wait time for a lock is less than DeadlockTimeout, while
+not imposing an unreasonable detection delay when a deadlock actually exists.
+
+Lock acquisition (routines LockAcquire and ProcSleep) follows these rules:
+
+1. A lock request is granted immediately if it does not conflict with
+any existing or waiting lock request, or if the process already holds an
+instance of the same lock type (eg, there's no penalty to acquire a read
+lock twice). Note that a process never conflicts with itself, eg it
+can obtain a read lock when it already holds an exclusive lock. (The
+basic conflict test is sketched in the example following rule 2.)
+
+2. Otherwise the process joins the lock's wait queue. Normally it will
+be added to the end of the queue, but there is an exception: if the
+process already holds locks on this same lockable object that conflict
+with the request of any pending waiter, then the process will be
+inserted in the wait queue just ahead of the first such waiter. (If we
+did not make this check, the deadlock detection code would adjust the
+queue order to resolve the conflict, but it's relatively cheap to make
+the check in ProcSleep and avoid a deadlock timeout delay in this case.)
+Note the special case when inserting before the end of the queue: if the
+process's request does not conflict with any existing lock nor any
+waiting request before its insertion point, then go ahead and grant the
+lock without waiting.
+
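+The conflict test referred to in rule 1 is, at bottom, a bitmask operation
+against the conflict table. A minimal sketch (simplified: the real code
+also discounts locks already held by the requesting backend, using the
+per-mode counts rather than just bitmasks):
+
+    #include <stdbool.h>
+
+    typedef int LockMask;       /* bit i represents lock mode i */
+
+    /* conflictTable[mode] has bit i set if 'mode' conflicts with mode i */
+    static bool
+    can_grant_immediately(const LockMask *conflictTable, int reqMode,
+                          LockMask grantMask, LockMask waitMask)
+    {
+        /* grantable at once if it conflicts with nothing granted or awaited */
+        return (conflictTable[reqMode] & (grantMask | waitMask)) == 0;
+    }
+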
+When a lock is released, the lock release routine (ProcLockWakeup) scans
+the lock object's wait queue. Each waiter is awoken if (a) its request
+does not conflict with already-granted locks, and (b) its request does
+not conflict with the requests of prior un-wakable waiters. Rule (b)
+ensures that conflicting requests are granted in order of arrival. There
+are cases where a later waiter must be allowed to go in front of
+conflicting earlier waiters to avoid deadlock, but it is not
+ProcLockWakeup's responsibility to recognize these cases; instead, the
+deadlock detection code will re-order the wait queue when necessary.
+
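+The wakeup scan can be pictured as follows (a sketch over a plain array;
+the real ProcLockWakeup walks a shared-memory queue of PGPROCs):
+
+    #include <stdbool.h>
+
+    typedef int LockMask;       /* bit i represents lock mode i */
+
+    typedef struct Waiter
+    {
+        int     reqMode;        /* lock mode this waiter wants */
+        bool    awakened;       /* set true if we decide to wake it */
+    } Waiter;
+
+    /*
+     * Wake each waiter whose request conflicts neither with already-granted
+     * modes nor with the requests of earlier waiters left asleep.
+     */
+    static void
+    wakeup_scan(const LockMask *conflictTable, LockMask grantMask,
+                Waiter *queue, int nWaiters)
+    {
+        LockMask    blockedModes = 0;   /* modes wanted by sleeping waiters */
+
+        for (int i = 0; i < nWaiters; i++)
+        {
+            LockMask    reqBit = (LockMask) 1 << queue[i].reqMode;
+
+            if ((conflictTable[queue[i].reqMode] & (grantMask | blockedModes)) == 0)
+            {
+                queue[i].awakened = true;
+                grantMask |= reqBit;    /* now counts as granted */
+            }
+            else
+            {
+                queue[i].awakened = false;
+                blockedModes |= reqBit;
+            }
+        }
+    }
+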
+To perform deadlock checking, we use the standard method of viewing the
+various processes as nodes in a directed graph (the waits-for graph or
+WFG). There is a graph edge leading from process A to process B if A
+waits for B, ie, A is waiting for some lock and B holds a conflicting
+lock. There is a deadlock condition if and only if the WFG contains a
+cycle. We detect cycles by searching outward along waits-for edges to
+see if we return to our starting point. There are three possible
+outcomes:
+
+1. All outgoing paths terminate at a running process (which has no
+outgoing edge).
+
+2. A deadlock is detected by looping back to the start point. We
+resolve such a deadlock by canceling the start point's lock request and
+reporting an error in that transaction, which normally leads to
+transaction abort and release of that transaction's held locks. Note
+that it's sufficient to cancel one request to remove the cycle; we don't
+need to kill all the transactions involved.
+
+3. Some path(s) loop back to a node other than the start point. This
+indicates a deadlock, but one that does not involve our starting
+process. We ignore this condition on the grounds that resolving such a
+deadlock is the responsibility of the processes involved --- killing our
+start-point process would not resolve the deadlock. So, cases 1 and 3
+both report "no deadlock".
+
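+The outward search described above amounts to a reachability test in the
+WFG. A toy version over an adjacency list (the real FindLockCycle walks the
+lock tables directly and also collects soft edges) might look like:
+
+    #include <stdbool.h>
+
+    #define MAX_PROCS 64
+
+    typedef struct WfgNode
+    {
+        int     nEdges;
+        int     waitsFor[MAX_PROCS];    /* processes this one waits for */
+    } WfgNode;
+
+    /*
+     * Return true if some waits-for path from 'cur' leads back to 'start'.
+     * visited[] must be zeroed by the caller before the outermost call.
+     */
+    static bool
+    wfg_reaches_start(const WfgNode *graph, int cur, int start, bool *visited)
+    {
+        if (visited[cur])
+            return false;       /* already explored from here */
+        visited[cur] = true;
+        for (int i = 0; i < graph[cur].nEdges; i++)
+        {
+            int     next = graph[cur].waitsFor[i];
+
+            if (next == start)
+                return true;    /* looped back: deadlock involving start */
+            if (wfg_reaches_start(graph, next, start, visited))
+                return true;
+        }
+        return false;           /* no path back to the start point found */
+    }
+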
+Postgres' situation is a little more complex than the standard discussion
+of deadlock detection, for two reasons:
+
+1. A process can be waiting for more than one other process, since there
+might be multiple PROCLOCKs of (non-conflicting) lock types that all
+conflict with the waiter's request. This creates no real difficulty
+however; we simply need to be prepared to trace more than one outgoing
+edge.
+
+2. If a process A is behind a process B in some lock's wait queue, and
+their requested locks conflict, then we must say that A waits for B, since
+ProcLockWakeup will never awaken A before B. This creates additional
+edges in the WFG. We call these "soft" edges, as opposed to the "hard"
+edges induced by locks already held. Note that if B already holds any
+locks conflicting with A's request, then their relationship is a hard edge
+not a soft edge.
+
+A "soft" block, or wait-priority block, has the same potential for
+inducing deadlock as a hard block. However, we may be able to resolve
+a soft block without aborting the transactions involved: we can instead
+rearrange the order of the wait queue. This rearrangement reverses the
+direction of the soft edge between two processes with conflicting requests
+whose queue order is reversed. If we can find a rearrangement that
+eliminates a cycle without creating new ones, then we can avoid an abort.
+Checking for such possible rearrangements is the trickiest part of the
+algorithm.
+
+The workhorse of the deadlock detector is a routine FindLockCycle() which
+is given a starting point process (which must be a waiting process).
+It recursively scans outward across waits-for edges as discussed above.
+If it finds no cycle involving the start point, it returns "false".
+(As discussed above, we can ignore cycles not involving the start point.)
+When such a cycle is found, FindLockCycle() returns "true", and as it
+unwinds it also builds a list of any "soft" edges involved in the cycle.
+If the resulting list is empty then there is a hard deadlock and the
+configuration cannot succeed. However, if the list is not empty, then
+reversing any one of the listed edges through wait-queue rearrangement
+will eliminate that cycle. Since such a reversal might create cycles
+elsewhere, we may need to try every possibility. Therefore, we need to
+be able to invoke FindLockCycle() on hypothetical configurations (wait
+orders) as well as the current real order.
+
+The easiest way to handle this seems to be to have a lookaside table that
+shows the proposed new queue order for each wait queue that we are
+considering rearranging. This table is checked by FindLockCycle, and it
+believes the proposed queue order rather than the real order for each lock
+that has an entry in the lookaside table.
+
+We build a proposed new queue order by doing a "topological sort" of the
+existing entries. Each soft edge that we are currently considering
+reversing creates a property of the partial order that the topological sort
+has to enforce. We must use a sort method that preserves the input
+ordering as much as possible, so as not to gratuitously break arrival
+order for processes not involved in a deadlock. (This is not true of the
+tsort method shown in Knuth, for example, but it's easily done by a simple
+doubly-nested-loop method that emits the first legal candidate at each
+step. Fortunately, we don't need a highly efficient sort algorithm, since
+the number of partial order constraints is not likely to be large.) Note
+that failure of the topological sort tells us we have conflicting ordering
+constraints, and therefore that the last-added soft edge reversal
+conflicts with a prior edge reversal. We need to detect this case to
+avoid an infinite loop in the case where no possible rearrangement will
+work: otherwise, we might try a reversal, find that it still leads to
+a cycle, then try to un-reverse the reversal while trying to get rid of
+that cycle, and so on. Topological sort failure tells us that the un-reversal
+is not a legitimate move in this context.
+
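+One way to realize the order-preserving, doubly-nested-loop sort described
+above (a sketch with hypothetical types; constraints are "before/after"
+pairs of wait-queue positions, and it assumes nWaiters <= MAX_WAITERS):
+
+    #include <stdbool.h>
+
+    #define MAX_WAITERS 32
+
+    typedef struct Constraint
+    {
+        int     before;         /* this waiter must end up ahead ... */
+        int     after;          /* ... of this one */
+    } Constraint;
+
+    /*
+     * At each step emit the first not-yet-output waiter (in original queue
+     * order) that is not constrained to follow a not-yet-output waiter.
+     * Returns false if the constraints are contradictory.
+     */
+    static bool
+    stable_topo_sort(int nWaiters, const Constraint *cons, int nCons,
+                     int *output)
+    {
+        bool    done[MAX_WAITERS] = {false};
+
+        for (int emitted = 0; emitted < nWaiters; emitted++)
+        {
+            int     pick = -1;
+
+            for (int i = 0; i < nWaiters && pick < 0; i++)
+            {
+                if (done[i])
+                    continue;
+                pick = i;
+                for (int c = 0; c < nCons; c++)
+                {
+                    if (cons[c].after == i && !done[cons[c].before])
+                    {
+                        pick = -1;      /* i must still wait its turn */
+                        break;
+                    }
+                }
+            }
+            if (pick < 0)
+                return false;   /* contradictory ordering constraints */
+            done[pick] = true;
+            output[emitted] = pick;
+        }
+        return true;
+    }
+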
+So, the basic step in our rearrangement method is to take a list of
+soft edges in a cycle (as returned by FindLockCycle()) and successively
+try the reversal of each one as a topological-sort constraint added to
+whatever constraints we are already considering. We recursively search
+through all such sets of constraints to see if any one eliminates all
+the deadlock cycles at once. Although this might seem impossibly
+inefficient, it shouldn't be a big problem in practice, because there
+will normally be very few, and not very large, deadlock cycles --- if
+any at all. So the combinatorial inefficiency isn't going to hurt us.
+Besides, it's better to spend some time to guarantee that we've checked
+all possible escape routes than to abort a transaction when we didn't
+really have to.
+
+Each edge reversal constraint can be viewed as requesting that the waiting
+process A be moved to before the blocking process B in the wait queue they
+are both in. This action will reverse the desired soft edge, as well as
+any other soft edges between A and other processes it is advanced over.
+No other edges will be affected. (Note that this is actually a constraint
+on our topological sort method: it must not re-order the queue more than
+necessary.)
+Therefore, we can be sure we have not created any new deadlock cycles if
+neither FindLockCycle(A) nor FindLockCycle(B) discovers any cycle. Given
+the above-defined behavior of FindLockCycle, each of these searches is
+necessary as well as sufficient, since FindLockCycle starting at the
+original start point will not complain about cycles that include A or B
+but not the original start point.
+
+In short then, a proposed rearrangement of the wait queue(s) is determined
+by one or more broken soft edges A->B, fully specified by the output of
+topological sorts of each wait queue involved, and then tested by invoking
+FindLockCycle() starting at the original start point as well as each of
+the mentioned processes (A's and B's). If none of the tests detect a
+cycle, then we have a valid configuration and can implement it by
+reordering the wait queues per the sort outputs (and then applying
+ProcLockWakeup on each reordered queue, in case a waiter has become wakable).
+If any test detects a soft cycle, we can try to resolve it by adding each
+soft link in that cycle, in turn, to the proposed rearrangement list.
+This is repeated recursively until we either find a workable rearrangement
+or determine that none exists. In the latter case, the outer level
+resolves the deadlock by aborting the original start-point transaction.
+
+The particular order in which rearrangements are tried depends on the
+order FindLockCycle() happens to scan in, so if there are multiple
+workable rearrangements of the wait queues, then it is unspecified which
+one will be chosen. What's more important is that we guarantee to try
+every queue rearrangement that could lead to success. (For example,
+if we have A before B before C and the needed order constraints are
+C before A and B before C, we would first discover that A before C
+doesn't work and try the rearrangement C before A before B. This would
+eventually lead to the discovery of the additional constraint B before C.)
+
+Got that?
+
+Miscellaneous Notes
+-------------------
+
+1. It is easily proven that no deadlock will be missed due to our
+asynchronous invocation of deadlock checking. A deadlock cycle in the WFG
+is formed when the last edge in the cycle is added; therefore the last
+process in the cycle to wait (the one from which that edge is outgoing) is
+certain to detect and resolve the cycle when it later runs CheckDeadLock.
+This holds even if that edge addition created multiple cycles; the process
+may indeed abort without ever noticing those additional cycles, but we
+don't particularly care. The only other possible creation of deadlocks is
+during deadlock resolution's rearrangement of wait queues, and we already
+saw that that algorithm will prove that it creates no new deadlocks before
+it attempts to actually execute any rearrangement.
+
+2. It is not certain that a deadlock will be resolved by aborting the
+last-to-wait process. If earlier waiters in the cycle have not yet run
+CheckDeadLock, then the first one to do so will be the victim.
+
+3. No live (wakable) process can be missed by ProcLockWakeup, since it
+examines every member of the wait queue (this was not true in the 7.0
+implementation, BTW). Therefore, if ProcLockWakeup is always invoked
+after a lock is released or a wait queue is rearranged, there can be no
+failure to wake a wakable process. One should also note that
+LockErrorCleanup (abort a waiter due to outside factors) must run
+ProcLockWakeup, in case the canceled waiter was soft-blocking other
+waiters.
+
+4. We can minimize excess rearrangement-trial work by being careful to
+scan the wait queue from the front when looking for soft edges. For
+example, if we have queue order A,B,C and C has deadlock conflicts with
+both A and B, we want to generate the "C before A" constraint first,
+rather than wasting time with "C before B", which won't move C far
+enough up. So we look for soft edges outgoing from C starting at the
+front of the wait queue.
+
+5. The working data structures needed by the deadlock detection code can
+be limited to numbers of entries computed from MaxBackends. Therefore,
+we can allocate the worst-case space needed during backend startup. This
+seems a safer approach than trying to allocate workspace on the fly; we
+don't want to risk having the deadlock detector run out of memory, else
+we really have no guarantees at all that deadlock will be detected.
+
+6. We abuse the deadlock detector to implement autovacuum cancellation.
+When we run the detector and we find that there's an autovacuum worker
+involved in the waits-for graph, we store a pointer to its PGPROC, and
+return a special return code (unless a hard deadlock has been detected).
+The caller can then send a cancellation signal. This implements the
+principle that autovacuum has a low locking priority (eg it must not block
+DDL on the table).
+
+Group Locking
+-------------
+
+As if all of that weren't already complicated enough, PostgreSQL now supports
+parallelism (see src/backend/access/transam/README.parallel), which means that
+we might need to resolve deadlocks that occur between gangs of related
+processes rather than individual processes. This doesn't change the basic
+deadlock detection algorithm very much, but it makes the bookkeeping more
+complicated.
+
+We choose to regard locks held by processes in the same parallel group as
+non-conflicting with the exception of relation extension and page locks. This
+means that two processes in a parallel group can hold a self-exclusive lock on
+the same relation at the same time, or one process can acquire an AccessShareLock
+while the other already holds AccessExclusiveLock. This might seem dangerous and
+could be in some cases (more on that below), but if we didn't do this then
+parallel query would be extremely prone to self-deadlock. For example, a
+parallel query against a relation on which the leader already had
+AccessExclusiveLock would hang, because the workers would try to lock the same
+relation and be blocked by the leader; yet the leader can't finish until it
+receives completion indications from all workers. An undetected deadlock
+results. This is far from the only scenario where such a problem happens. The
+same thing will occur if the leader holds only AccessShareLock, the worker
+seeks AccessShareLock, but between the time the leader attempts to acquire the
+lock and the time the worker attempts to acquire it, some other process queues
+up waiting for an AccessExclusiveLock. In this case, too, an indefinite hang
+results.
+
+It might seem that we could predict which locks the workers will attempt to
+acquire and ensure before going parallel that those locks would be acquired
+successfully. But this is very difficult to make work in a general way. For
+example, a parallel worker's portion of the query plan could involve an
+SQL-callable function which generates a query dynamically, and that query
+might happen to hit a table on which the leader happens to hold
+AccessExclusiveLock. By imposing enough restrictions on what workers can do,
+we could eventually create a situation where their behavior can be adequately
+restricted, but these restrictions would be fairly onerous, and even then, the
+system required to decide whether the workers will succeed at acquiring the
+necessary locks would be complex and possibly buggy.
+
+So, instead, we take the approach of deciding that locks within a lock group
+do not conflict. This eliminates the possibility of an undetected deadlock,
+but also opens up some problem cases: if the leader and worker try to do some
+operation at the same time which would ordinarily be prevented by the
+heavyweight lock mechanism, undefined behavior might result. In practice, the
+dangers are modest. The leader and worker share the same transaction,
+snapshot, and combo CID hash, and neither can perform any DDL or, indeed,
+write any data at all. Thus, for either to read a table locked exclusively by
+the other is safe enough. Problems would occur if the leader initiated
+parallelism from a point in the code at which it had some backend-private
+state that made table access from another process unsafe: for example,
+between a call to SetReindexProcessing and the matching call to
+ResetReindexProcessing, catastrophe could ensue, because the worker won't
+have that state.
+
+To allow parallel inserts and parallel copy, we have ensured that relation
+extension and page locks don't participate in group locking, which means such
+locks can conflict among members of the same group. This is required because
+it is no safer for two related processes to extend the same relation, or to
+perform cleanup in GIN indexes, at the same time than it is for unrelated
+processes to do so. We never acquire a heavyweight lock on any other object
+after taking a relation extension lock, so such a lock can never participate
+in a deadlock cycle. After acquiring page locks we can acquire a relation
+extension lock, but never the reverse, so page locks will not participate in
+deadlocks either. To allow for other
+parallel writes like parallel update or parallel delete, we'll either need to
+(1) further enhance the deadlock detector to handle those tuple locks in a
+different way than other types; or (2) have parallel workers use some other
+mutual exclusion method for such cases. Currently, the parallel mode is
+strictly read-only, but now we have the infrastructure to allow parallel
+inserts and parallel copy.
+
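+In effect, with the relation-extension and page-lock exception just
+described, the conflict test gains one extra clause. A sketch (hypothetical
+names; the real logic lives in the lock manager's conflict checking):
+
+    #include <stdbool.h>
+    #include <stddef.h>
+
+    typedef enum LockTagKind
+    {
+        TAG_RELATION,
+        TAG_RELATION_EXTEND,
+        TAG_PAGE,
+        TAG_OTHER
+    } LockTagKind;
+
+    typedef struct ProcSketch
+    {
+        const struct ProcSketch *lockGroupLeader;   /* NULL if not grouped */
+    } ProcSketch;
+
+    /*
+     * Two otherwise-conflicting requests are treated as non-conflicting
+     * when both backends belong to the same lock group, except for
+     * relation extension and page locks, which keep normal semantics.
+     */
+    static bool
+    same_group_exemption_applies(const ProcSketch *a, const ProcSketch *b,
+                                 LockTagKind tag)
+    {
+        if (tag == TAG_RELATION_EXTEND || tag == TAG_PAGE)
+            return false;
+        return a->lockGroupLeader != NULL &&
+               a->lockGroupLeader == b->lockGroupLeader;
+    }
+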
+Group locking adds three new members to each PGPROC: lockGroupLeader,
+lockGroupMembers, and lockGroupLink. A PGPROC's lockGroupLeader is NULL for
+processes not involved in parallel query. When a process wants to cooperate
+with parallel workers, it becomes a lock group leader, which means setting
+this field to point to its own PGPROC. When a parallel worker starts up, it
+points this field at the leader. The lockGroupMembers field is only used in
+the leader; it is a list of the member PGPROCs of the lock group (the leader
+and all workers). The lockGroupLink field is the list link for this list.
+
+All three of these fields are considered to be protected by a lock manager
+partition lock. The partition lock that protects these fields within a given
+lock group is chosen by taking the leader's pgprocno modulo the number of lock
+manager partitions. This unusual arrangement has a major advantage: the
+deadlock detector can count on the fact that no lockGroupLeader field can
+change while the deadlock detector is running, because it knows that it holds
+all the lock manager locks. Also, holding this single lock allows safe
+manipulation of the lockGroupMembers list for the lock group.
+
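+For example (a sketch; the real fields and list links live in PGPROC), the
+partition lock protecting a lock group's membership fields is derived from
+the leader like this:
+
+    #include <stddef.h>
+
+    #define NUM_LOCK_PARTITIONS 16
+
+    typedef struct GroupProc
+    {
+        int                     pgprocno;
+        const struct GroupProc *lockGroupLeader;    /* NULL if not grouped */
+    } GroupProc;
+
+    /*
+     * All group-membership fields of every member are protected by this
+     * single partition lock, chosen from the leader's pgprocno.
+     */
+    static int
+    lock_group_partition(const GroupProc *proc)
+    {
+        const GroupProc *leader =
+            proc->lockGroupLeader ? proc->lockGroupLeader : proc;
+
+        return leader->pgprocno % NUM_LOCK_PARTITIONS;
+    }
+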
+We need an additional interlock when setting these fields, because a newly
+started parallel worker has to try to join the leader's lock group, but it
+has no guarantee that the group leader is still alive by the time it gets
+started. We try to ensure that the parallel leader dies after all workers
+in normal cases, but also that the system could survive relatively intact
+if that somehow fails to happen. This is one of the precautions against
+such a scenario: the leader relays its PGPROC and also its PID to the
+worker, and the worker fails to join the lock group unless the given PGPROC
+still has the same PID and is still a lock group leader. We assume that
+PIDs are not recycled quickly enough for this interlock to fail.
+
+
+User Locks (Advisory Locks)
+---------------------------
+
+User locks are handled totally on the application side as long-term
+cooperative locks which may extend beyond the normal transaction boundaries.
+Their purpose is to indicate to an application that someone is `working'
+on an item. So it is possible to put a user lock on a tuple's oid,
+retrieve the tuple, work on it for an hour, and then update it and remove
+the lock. While the lock is active, other clients can still read and write
+the tuple, but they can see that it has been locked at the application
+level by someone.
+
+User locks and normal locks are completely orthogonal and they don't
+interfere with each other.
+
+User locks can be acquired either at session level or transaction level.
+A session-level lock request is not automatically released at transaction
+end, but must be explicitly released by the application. (However, any
+remaining locks are always released at session end.) Transaction-level
+user lock requests behave the same as normal lock requests, in that they
+are released at transaction end and do not need explicit unlocking.
+
+Locking during Hot Standby
+--------------------------
+
+The Startup process is the only backend that can make changes during
+recovery; all other backends are read-only. As a result, the Startup
+process does not acquire locks on relations or objects except when the lock
+level is AccessExclusiveLock.
+
+Regular backends are only allowed to take locks on relations or objects
+at RowExclusiveLock or lower. This ensures that they do not conflict with
+each other or with the Startup process, unless AccessExclusiveLocks are
+requested by the Startup process.
+
+Deadlocks involving AccessExclusiveLocks are not possible, so we need
+not be concerned that a user-initiated deadlock can prevent recovery from
+progressing.
+
+AccessExclusiveLocks on the primary node generate WAL records
+that are then applied by the Startup process. Locks are released at end
+of transaction just as they are in normal processing. These locks are
+held by the Startup process, acting as a proxy for the backends that
+originally acquired these locks. Again, these locks cannot conflict with
+one another, so the Startup process cannot deadlock itself either.
+
+Although deadlock is not possible, a regular backend's weak lock can
+prevent the Startup process from making progress in applying WAL, which is
+usually not something that should be tolerated for very long. Mechanisms
+exist to forcibly cancel a regular backend's query if it blocks the
+Startup process for too long.
diff --git a/src/backend/storage/lmgr/README-SSI b/src/backend/storage/lmgr/README-SSI
new file mode 100644
index 0000000..50d2ecc
--- /dev/null
+++ b/src/backend/storage/lmgr/README-SSI
@@ -0,0 +1,646 @@
+src/backend/storage/lmgr/README-SSI
+
+Serializable Snapshot Isolation (SSI) and Predicate Locking
+===========================================================
+
+This code is in the lmgr directory because about 90% of it is an
+implementation of predicate locking, which is required for SSI,
+rather than being directly related to SSI itself. When another use
+for predicate locking justifies the effort to tease these two things
+apart, this README file should probably be split.
+
+
+Credits
+-------
+
+This feature was developed by Kevin Grittner and Dan R. K. Ports,
+with review and suggestions from Joe Conway, Heikki Linnakangas, and
+Jeff Davis. It is based on work published in these papers:
+
+ Michael J. Cahill, Uwe Röhm, and Alan D. Fekete. 2008.
+ Serializable isolation for snapshot databases.
+ In SIGMOD '08: Proceedings of the 2008 ACM SIGMOD
+ international conference on Management of data,
+ pages 729-738, New York, NY, USA. ACM.
+ http://doi.acm.org/10.1145/1376616.1376690
+
+ Michael James Cahill. 2009.
+ Serializable Isolation for Snapshot Databases.
+ Sydney Digital Theses.
+ University of Sydney, School of Information Technologies.
+ http://hdl.handle.net/2123/5353
+
+
+Overview
+--------
+
+With true serializable transactions, if you can show that your
+transaction will do the right thing if there are no concurrent
+transactions, it will do the right thing in any mix of serializable
+transactions or be rolled back with a serialization failure. This
+feature has been implemented in PostgreSQL using SSI.
+
+
+Serializable and Snapshot Transaction Isolation Levels
+------------------------------------------------------
+
+Serializable transaction isolation is attractive for shops with
+active development by many programmers against a complex schema
+because it guarantees data integrity with very little staff time --
+if a transaction can be shown to always do the right thing when it is
+run alone (before or after any other transaction), it will always do
+the right thing in any mix of concurrent serializable transactions.
+Where conflicts with other transactions would result in an
+inconsistent state within the database or an inconsistent view of
+the data, a serializable transaction will block or roll back to
+prevent the anomaly. The SQL standard provides a specific SQLSTATE
+for errors generated when a transaction rolls back for this reason,
+so that transactions can be retried automatically.
+
+Before version 9.1, PostgreSQL did not support a full serializable
+isolation level. A request for serializable transaction isolation
+actually provided snapshot isolation. This has well-known anomalies
+which can allow data corruption or inconsistent views of the data
+during concurrent transactions, although these anomalies only occur
+when certain patterns of read-write dependencies exist within a set
+of concurrent transactions. Where these patterns exist, the anomalies
+can be prevented by introducing conflicts through explicitly
+programmed locks or otherwise unnecessary writes to the database.
+Snapshot isolation is popular because performance is better than
+serializable isolation and the integrity guarantees which it does
+provide allow anomalies to be avoided or managed with reasonable
+effort in many environments.
+
+
+Serializable Isolation Implementation Strategies
+------------------------------------------------
+
+Techniques for implementing full serializable isolation have been
+published and in use in many database products for decades. The
+primary technique which has been used is Strict Two-Phase Locking
+(S2PL), which operates by blocking writes against data which has been
+read by concurrent transactions and blocking any access (read or
+write) against data which has been written by concurrent
+transactions. A cycle in a graph of blocking indicates a deadlock,
+requiring a rollback. Blocking and deadlocks under S2PL in high
+contention workloads can be debilitating, crippling throughput and
+response time.
+
+A new technique for implementing full serializable isolation in an
+MVCC database appears in the literature beginning in 2008. This
+technique, known as Serializable Snapshot Isolation (SSI), has many of
+the advantages of snapshot isolation. In particular, reads don't
+block anything and writes don't block reads. Essentially, it runs
+snapshot isolation but monitors the read-write conflicts between
+transactions to identify dangerous structures in the transaction
+graph which indicate that a set of concurrent transactions might
+produce an anomaly, and rolls back transactions to ensure that no
+anomalies occur. It will produce some false positives (where a
+transaction is rolled back even though there would not have been an
+anomaly), but will never let an anomaly occur. In the two known
+prototype implementations, performance for many workloads (even with
+the need to restart transactions which are rolled back) is very close
+to snapshot isolation and generally far better than an S2PL
+implementation.
+
+
+Apparent Serial Order of Execution
+----------------------------------
+
+One way to understand when snapshot anomalies can occur, and to
+visualize the difference between the serializable implementations
+described above, is to consider that among transactions executing at
+the serializable transaction isolation level, the results are
+required to be consistent with some serial (one-at-a-time) execution
+of the transactions [1]. How is that order determined in each?
+
+In S2PL, each transaction locks any data it accesses. It holds the
+locks until committing, preventing other transactions from making
+conflicting accesses to the same data in the interim. Some
+transactions may have to be rolled back to prevent deadlock. But
+successful transactions can always be viewed as having occurred
+sequentially, in the order they committed.
+
+With snapshot isolation, reads never block writes, nor vice versa, so
+more concurrency is possible. The order in which transactions appear
+to have executed is determined by something more subtle than in S2PL:
+read/write dependencies. If a transaction reads data, it appears to
+execute after the transaction that wrote the data it is reading.
+Similarly, if it updates data, it appears to execute after the
+transaction that wrote the previous version. These dependencies, which
+we call "wr-dependencies" and "ww-dependencies", are consistent with
+the commit order, because the first transaction must have committed
+before the second starts. However, there can also be dependencies
+between two *concurrent* transactions, i.e. where one was running when
+the other acquired its snapshot. These "rw-conflicts" occur when one
+transaction attempts to read data which is not visible to it because
+the transaction which wrote it (or will later write it) is
+concurrent. The reading transaction appears to have executed first,
+regardless of the actual sequence of transaction starts or commits,
+because it sees a database state prior to that in which the other
+transaction leaves it.
+
+Anomalies occur when a cycle is created in the graph of dependencies:
+when a dependency or series of dependencies causes transaction A to
+appear to have executed before transaction B, but another series of
+dependencies causes B to appear before A. If that's the case, then
+the results can't be consistent with any serial execution of the
+transactions.
+
+
+SSI Algorithm
+-------------
+
+As of 9.1, serializable transactions in PostgreSQL are implemented using
+Serializable Snapshot Isolation (SSI), based on the work of Cahill
+et al. Fundamentally, this allows snapshot isolation to run as it
+previously did, while monitoring for conditions which could create a
+serialization anomaly.
+
+SSI is based on the observation [2] that each snapshot isolation
+anomaly corresponds to a cycle that contains a "dangerous structure"
+of two adjacent rw-conflict edges:
+
+      Tin ------> Tpivot ------> Tout
+            rw          rw
+
+SSI works by watching for this dangerous structure, and rolling
+back a transaction when needed to prevent any anomaly. This means it
+only needs to track rw-conflicts between concurrent transactions, not
+wr- and ww-dependencies. It also means there is a risk of false
+positives, because not every dangerous structure is embedded in an
+actual cycle. The number of false positives is low in practice, so
+this represents an acceptable tradeoff for keeping the detection
+overhead low.
+
+The PostgreSQL implementation uses two additional optimizations:
+
+* Tout must commit before any other transaction in the cycle
+ (see proof of Theorem 2.1 of [2]). We only roll back a transaction
+ if Tout commits before Tpivot and Tin.
+
+* if Tin is read-only, there can only be an anomaly if Tout committed
+ before Tin takes its snapshot. This optimization is an original
+ one. Proof:
+
+ - Because there is a cycle, there must be some transaction T0 that
+ precedes Tin in the cycle. (T0 might be the same as Tout.)
+
+ - The edge between T0 and Tin can't be a rw-conflict or ww-dependency,
+ because Tin was read-only, so it must be a wr-dependency.
+ Those can only occur if T0 committed before Tin took its snapshot,
+ else Tin would have ignored T0's output.
+
+ - Because Tout must commit before any other transaction in the
+ cycle, it must commit before T0 commits -- and thus before Tin
+ starts.
+
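+Putting the dangerous-structure rule and the two optimizations together, the
+abort decision can be sketched as follows (a simplified model with
+hypothetical fields; the real test works on the SSI bookkeeping structures
+in predicate.c):
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    typedef struct SsiXact
+    {
+        bool        isReadOnly;
+        bool        committed;
+        uint64_t    commitSeqNo;    /* valid only if committed */
+        uint64_t    snapshotSeqNo;  /* "time" at which the snapshot was taken */
+    } SsiXact;
+
+    /*
+     * Given a dangerous structure Tin --rw--> Tpivot --rw--> Tout, decide
+     * whether some transaction must be rolled back.
+     */
+    static bool
+    dangerous_structure_requires_abort(const SsiXact *tin,
+                                       const SsiXact *tpivot,
+                                       const SsiXact *tout)
+    {
+        /* Optimization 1: Tout must have committed first. */
+        if (!tout->committed)
+            return false;
+        if (tpivot->committed && tpivot->commitSeqNo < tout->commitSeqNo)
+            return false;
+        if (tin->committed && tin->commitSeqNo < tout->commitSeqNo)
+            return false;
+
+        /*
+         * Optimization 2: a read-only Tin is only a problem if Tout
+         * committed before Tin took its snapshot.
+         */
+        if (tin->isReadOnly && tout->commitSeqNo >= tin->snapshotSeqNo)
+            return false;
+
+        return true;
+    }
+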
+
+PostgreSQL Implementation
+-------------------------
+
+ * Since this technique is based on Snapshot Isolation (SI), those
+areas in PostgreSQL which don't use SI can't be brought under SSI.
+This includes system tables, temporary tables, sequences, hint bit
+rewrites, etc. SSI cannot eliminate existing anomalies in these
+areas.
+
+ * Any transaction which is run at a transaction isolation level
+other than SERIALIZABLE will not be affected by SSI. If you want to
+enforce business rules through SSI, all transactions should be run at
+the SERIALIZABLE transaction isolation level, and that should
+probably be set as the default.
+
+ * If all transactions are run at the SERIALIZABLE transaction
+isolation level, business rules can be enforced in triggers or
+application code without ever having a need to acquire an explicit
+lock or to use SELECT FOR SHARE or SELECT FOR UPDATE.
+
+ * Those who want to continue to use snapshot isolation without
+the additional protections of SSI (and the associated costs of
+enforcing those protections), can use the REPEATABLE READ transaction
+isolation level. This level retains its legacy behavior, which
+is identical to the old SERIALIZABLE implementation and fully
+consistent with the standard's requirements for the REPEATABLE READ
+transaction isolation level.
+
+ * Performance under this SSI implementation will be significantly
+improved if transactions which don't modify permanent tables are
+declared to be READ ONLY before they begin reading data.
+
+ * Performance under SSI will tend to degrade more rapidly with a
+large number of active database transactions than under less strict
+isolation levels. Limiting the number of active transactions through
+use of a connection pool or similar techniques may be necessary to
+maintain good performance.
+
+ * Any transaction which must be rolled back to prevent
+serialization anomalies will fail with SQLSTATE 40001, which has a
+standard meaning of "serialization failure".
+
+ * This SSI implementation makes an effort to choose the
+transaction to be canceled such that an immediate retry of the
+transaction will not fail due to conflicts with exactly the same
+transactions. Pursuant to this goal, no transaction is canceled
+until one of the other transactions in the set of conflicts which
+could generate an anomaly has successfully committed. This is
+conceptually similar to how write conflicts are handled. To fully
+implement this guarantee there needs to be a way to roll back the
+active transaction for another process with a serialization failure
+SQLSTATE, even if it is "idle in transaction".
+
+
+Predicate Locking
+-----------------
+
+Both S2PL and SSI require some form of predicate locking to handle
+situations where reads conflict with later inserts or with later
+updates which move data into the selected range. PostgreSQL didn't
+already have predicate locking, so it needed to be added to support
+full serializable transactions under either strategy. Practical
+implementations of predicate locking generally involve acquiring
+locks against data as it is accessed, using multiple granularities
+(tuple, page, table, etc.) with escalation as needed to keep the lock
+count to a number which can be tracked within RAM structures. This
+approach was used in PostgreSQL. Coarse granularities can cause some
+false positive indications of conflict. The number of false positives
+can be influenced by plan choice.
+
+
+Implementation overview
+-----------------------
+
+New RAM structures, inspired by those used to track traditional locks
+in PostgreSQL, but tailored to the needs of SIREAD predicate locking,
+are used. These refer to physical objects actually accessed in the
+course of executing the query, to model the predicates through
+inference. Anyone interested in this subject should review the
+Hellerstein, Stonebraker and Hamilton paper [3], along with the
+locking papers referenced from that and the Cahill papers.
+
+Because the SIREAD locks don't block, traditional locking techniques
+have to be modified. Intent locking (locking higher level objects
+before locking lower level objects) doesn't work with non-blocking
+"locks" (which are, in some respects, more like flags than locks).
+
+A configurable amount of shared memory is reserved at postmaster
+start-up to track predicate locks. This size cannot be changed
+without a restart.
+
+To prevent resource exhaustion, multiple fine-grained locks may
+be promoted to a single coarser-grained lock as needed.
+
+An attempt to acquire an SIREAD lock on a tuple when the same
+transaction already holds an SIREAD lock on the page or the relation
+will be ignored. Likewise, an attempt to lock a page when the
+relation is locked will be ignored, and the acquisition of a coarser
+lock will result in the automatic release of all finer-grained locks
+it covers.
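+
+A hedged sketch of those rules follows; every name in it is invented for
+illustration, and the real logic lives in predicate.c:
+
+    /* Skip a tuple lock that a coarser SIREAD lock already covers. */
+    if (HoldsSIREADLock(xact, RelationTag(rel)) ||
+        HoldsSIREADLock(xact, PageTag(rel, blkno)))
+        return;
+
+    AcquireSIREADLock(xact, TupleTag(rel, blkno, offnum));
+
+    /* Promote if too many tuple locks accumulate on one page. */
+    if (CountTupleLocksOnPage(xact, rel, blkno) > promotion_threshold)
+        PromoteTupleLocksToPageLock(xact, rel, blkno);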
+
+
+Heap locking
+------------
+
+Predicate locks will be acquired for the heap based on the following:
+
+ * For a table scan, the entire relation will be locked.
+
+ * Each tuple read which is visible to the reading transaction
+will be locked, whether or not it meets selection criteria; except
+that there is no need to acquire an SIREAD lock on a tuple when the
+transaction already holds a write lock on any tuple representing the
+row, since a rw-conflict would also create a ww-dependency which
+has more aggressive enforcement and thus will prevent any anomaly.
+
+ * Modifying a heap tuple creates a rw-conflict with any transaction
+that holds a SIREAD lock on that tuple, or on the page or relation
+that contains it.
+
+ * Inserting a new tuple creates a rw-conflict with any transaction
+holding a SIREAD lock on the entire relation. It doesn't conflict with
+page-level locks, because page-level locks are only used to aggregate
+tuple locks. Unlike index page locks, they don't lock "gaps" on the page.
+
+
+Index AM implementations
+------------------------
+
+Since predicate locks only exist to detect writes which conflict with
+earlier reads, and heap tuple locks are acquired to cover all heap
+tuples actually read, including those read through indexes, the index
+tuples which were actually scanned are not of interest in themselves;
+we only care about their "new neighbors" -- later inserts into the
+index which would have been included in the scan had they existed at
+the time. Conceptually, we want to lock the gaps between and
+surrounding index entries within the scanned range.
+
+Correctness requires that any insert into an index generates a
+rw-conflict with a concurrent serializable transaction if, after that
+insert, re-execution of any index scan of the other transaction would
+access the heap for a row not accessed during the previous execution.
+Note that a non-HOT update which expires an old index entry covered
+by the scan and adds a new entry for the modified row's new tuple
+need not generate a conflict, although an update which "moves" a row
+into the scan must generate a conflict. While correctness allows
+false positives, they should be minimized for performance reasons.
+
+Several optimizations are possible, though not all are implemented yet:
+
+ * An index scan which is just finding the right position for an
+index insertion or deletion need not acquire a predicate lock.
+
+ * An index scan which is comparing for equality on the entire key
+for a unique index need not acquire a predicate lock as long as a key
+is found corresponding to a visible tuple which has not been modified
+by another transaction -- there are no "between or around" gaps to
+cover.
+
+ * As long as built-in foreign key enforcement continues to use
+its current "special tricks" to deal with MVCC issues, predicate
+locks should not be needed for scans done by enforcement code.
+
+ * If a search determines that no rows can be found regardless of
+index contents because the search conditions are contradictory (e.g.,
+x = 1 AND x = 2), then no predicate lock is needed.
+
+Other index AM implementation considerations:
+
+ * For an index AM that doesn't have support for predicate locking,
+we just acquire a predicate lock on the whole index for any search.
+
+ * B-tree index searches acquire predicate locks only on the
+index *leaf* pages needed to lock the appropriate index range. If,
+however, a search discovers that no root page has yet been created, a
+predicate lock on the index relation is required.
+
+ * Like a B-tree, GIN searches acquire predicate locks only on the
+leaf pages of the entry tree. When performing an equality scan, and an
+entry has a posting tree, the posting tree root is locked instead, to
+lock only that key value. However, fastupdate=on postpones the
+insertion of tuples into the index structure by temporarily storing
+them in the pending list. That makes us unable to detect rw-conflicts
+using page-level locks. To cope with that, insertions to the pending
+list conflict with all scans.
+
+ * GiST searches can determine that there are no matches at any
+level of the index, so we acquire a predicate lock at each index
+level during a GiST search. An index insert at the leaf level can
+then be trusted to ripple up to all levels and locations where
+conflicting predicate locks may exist. In case of a page split,
+we need to copy predicate locks from the original page to all the
+new pages.
+
+ * Hash index searches acquire predicate locks on the primary
+page of a bucket. A scan that runs concurrently with a bucket split
+acquires locks on both the old and new buckets. During a bucket
+split, a predicate lock is copied from the primary page of the old
+bucket to the primary page of the new bucket.
+
+ * The effects of page splits, overflows, consolidations, and
+removals must be carefully reviewed to ensure that predicate locks
+aren't "lost" during those operations, or kept with pages which could
+get re-used for different parts of the index.
+
+
+Innovations
+-----------
+
+The PostgreSQL implementation of Serializable Snapshot Isolation
+differs from what is described in the cited papers for several
+reasons:
+
+ 1. PostgreSQL didn't have any existing predicate locking. It had
+to be added from scratch.
+
+ 2. The existing in-memory lock structures were not suitable for
+tracking SIREAD locks.
+ * In PostgreSQL, tuple level locks are not held in RAM for
+any length of time; lock information is written to the tuples
+involved in the transactions.
+ * In PostgreSQL, existing lock structures have pointers to
+memory which is related to a session. SIREAD locks need to persist
+past the end of the originating transaction and even the session
+which ran it.
+ * PostgreSQL needs to be able to tolerate a large number of
+transactions executing while one long-running transaction stays open
+-- the in-RAM techniques discussed in the papers wouldn't support
+that.
+
+ 3. Unlike the database products used for the prototypes described
+in the papers, PostgreSQL didn't already have a true serializable
+isolation level distinct from snapshot isolation.
+
+ 4. PostgreSQL supports subtransactions -- an issue not mentioned
+in the papers.
+
+ 5. PostgreSQL doesn't assign a transaction number to a database
+transaction until and unless necessary (normally, when the transaction
+attempts to modify data).
+
+ 6. PostgreSQL has pluggable data types with user-definable
+operators, as well as pluggable index types, not all of which are
+based around data types which support ordering.
+
+ 7. Some possible optimizations became apparent during development
+and testing.
+
+Differences from the implementation described in the papers are
+listed below.
+
+ * New structures needed to be created in shared memory to track
+the proper information for serializable transactions and their SIREAD
+locks.
+
+ * Because PostgreSQL does not have the same concept of an "oldest
+transaction ID" for all serializable transactions as assumed in the
+Cahill thesis, we track the oldest snapshot xmin among serializable
+transactions, and a count of how many active transactions use that
+xmin. When the count hits zero we find the new oldest xmin and run a
+clean-up based on that.
+
+ * Because reads in a subtransaction may cause that subtransaction
+to roll back, thereby affecting what is written by the top level
+transaction, predicate locks must survive a subtransaction rollback.
+As a consequence, all xid usage in SSI, including predicate locking,
+is based on the top level xid. When looking at an xid that comes
+from a tuple's xmin or xmax, for example, we always call
+SubTransGetTopmostTransaction() before doing much else with it.
+
+ * PostgreSQL does not use "update in place" with a rollback log
+for its MVCC implementation. Where possible it uses "HOT" updates on
+the same page (if there is room and no indexed value is changed).
+For non-HOT updates the old tuple is expired in place and a new tuple
+is inserted at a new location. Because of this difference, a tuple
+lock in PostgreSQL doesn't automatically lock any other versions of a
+row. We don't try to copy or expand a tuple lock to any other
+versions of the row, based on the following proof that any additional
+serialization failures we would get from that would be false
+positives:
+
+ o If transaction T1 reads a row version (thus acquiring a
+predicate lock on it) and a second transaction T2 updates that row
+version (thus creating a rw-conflict graph edge from T1 to T2), must a
+third transaction T3 which re-updates the new version of the row also
+have a rw-conflict in from T1 to prevent anomalies? In other words,
+does it matter whether we recognize the edge T1 -> T3?
+
+ o If T1 has a conflict in, it certainly doesn't. Adding the
+edge T1 -> T3 would create a dangerous structure, but we already had
+one from the edge T1 -> T2, so we would have aborted something anyway.
+(T2 has already committed, else T3 could not have updated its output;
+but we would have aborted either T1 or T1's predecessor(s). Hence
+no cycle involving T1 and T3 can survive.)
+
+ o Now let's consider the case where T1 doesn't have a
+rw-conflict in. If that's the case, for this edge T1 -> T3 to make a
+difference, T3 must have a rw-conflict out that induces a cycle in the
+dependency graph, i.e. a conflict out to some transaction preceding T1
+in the graph. (A conflict out to T1 itself would be problematic too,
+but that would mean T1 has a conflict in, the case we already
+eliminated.)
+
+ o So now we're trying to figure out if there can be an
+rw-conflict edge T3 -> T0, where T0 is some transaction that precedes
+T1. For T0 to precede T1, there has to be some edge, or sequence of
+edges, from T0 to T1. At least the last edge has to be a wr-dependency
+or ww-dependency rather than a rw-conflict, because T1 doesn't have a
+rw-conflict in. And that gives us enough information about the order
+of transactions to see that T3 can't have a rw-conflict to T0:
+ - T0 committed before T1 started (the wr/ww-dependency implies this)
+ - T1 started before T2 committed (the T1->T2 rw-conflict implies this)
+ - T2 committed before T3 started (otherwise, T3 would get aborted
+ because of an update conflict)
+
+ o That means T0 committed before T3 started, and therefore
+there can't be a rw-conflict from T3 to T0.
+
+ o So in all cases, we don't need the T1 -> T3 edge to
+recognize cycles. Therefore it's not necessary for T1's SIREAD lock
+on the original tuple version to cover later versions as well.
+
+ * Predicate locking in PostgreSQL starts at the tuple level
+when possible. Multiple fine-grained locks are promoted to a single
+coarser-granularity lock as needed to avoid resource exhaustion. The
+amount of memory used for these structures is configurable, to balance
+RAM usage against SIREAD lock granularity.
+
+ * Each backend keeps a process-local table of the locks it holds.
+To support granularity promotion decisions with low CPU and locking
+overhead, this table also includes the coarser covering locks and the
+number of finer-granularity locks they cover.
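+
+As a sketch only (the names are invented and the real structure differs),
+such a local-table entry might look like:
+
+    typedef struct PredLockTagSketch    /* identifies a relation, page, or tuple */
+    {
+        unsigned    relid;
+        unsigned    blkno;              /* unused for a relation-level lock */
+        unsigned    offnum;             /* unused for page- and relation-level locks */
+    } PredLockTagSketch;
+
+    typedef struct LocalPredLockSketch
+    {
+        PredLockTagSketch tag;          /* the object this entry describes */
+        bool        held;               /* actually held, vs. tracked only as a cover */
+        int         childLocks;         /* finer-granularity locks this one covers */
+    } LocalPredLockSketch;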
+
+ * Conflicts are identified by looking for predicate locks
+when tuples are written, and by looking at the MVCC information when
+tuples are read. There is no matching between two RAM-based locks.
+
+ * Because write locks are stored in the heap tuples rather than a
+RAM-based lock table, the optimization described in the Cahill thesis
+which eliminates an SIREAD lock where there is a write lock is
+implemented by the following:
+ 1. When checking a heap write for conflicts against existing
+predicate locks, a tuple lock on the tuple being written is removed.
+ 2. When acquiring a predicate lock on a heap tuple, we
+return quickly without doing anything if it is a tuple written by the
+reading transaction.
+
+ * Rather than using conflictIn and conflictOut pointers which use
+NULL to indicate no conflict and a self-reference to indicate
+multiple conflicts or conflicts with committed transactions, we use a
+list of rw-conflicts. With the more complete information, false
+positives are reduced and we have sufficient data for more aggressive
+clean-up and other optimizations:
+
+ o We can avoid ever rolling back a transaction until and
+unless there is a pivot where a transaction on the conflict *out*
+side of the pivot committed before either of the other transactions.
+
+ o We can avoid ever rolling back a transaction when the
+transaction on the conflict *in* side of the pivot is explicitly or
+implicitly READ ONLY unless the transaction on the conflict *out*
+side of the pivot committed before the READ ONLY transaction acquired
+its snapshot. (An implicit READ ONLY transaction is one which
+committed without writing, even though it was not explicitly declared
+to be READ ONLY.)
+
+ o We can more aggressively clean up conflicts, predicate
+locks, and SSI transaction information.
+
+ * We allow a READ ONLY transaction to "opt out" of SSI if there are
+no READ WRITE transactions which could cause the READ ONLY
+transaction to ever become part of a "dangerous structure" of
+overlapping transaction dependencies.
+
+ * We allow the user to request that a READ ONLY transaction wait
+until the conditions are right for it to start in the "opt out" state
+described above. We add a DEFERRABLE state to transactions, which is
+specified and maintained in a way similar to READ ONLY. It is
+ignored for transactions that are not both SERIALIZABLE and READ ONLY.
+
+ * When a transaction must be rolled back, we pick among the
+active transactions such that an immediate retry will not fail again
+on conflicts with the same transactions.
+
+ * We use the PostgreSQL SLRU system to hold summarized
+information about older committed transactions to put an upper bound
+on RAM used. Beyond that limit, information spills to disk.
+Performance can degrade in a pessimal situation, but it should be
+tolerable, and transactions won't need to be canceled or blocked
+from starting.
+
+
+R&D Issues
+----------
+
+This is intended to be the place to record specific issues which need
+more detailed review or analysis.
+
+ * WAL file replay. While serializable implementations using S2PL
+can guarantee that the write-ahead log contains commits in a sequence
+consistent with some serial execution of serializable transactions,
+SSI cannot make that guarantee. While the WAL replay is no less
+consistent than under snapshot isolation, it is possible that under
+PITR recovery or hot standby a database could reach a readable state
+where some transactions appear before other transactions which would
+have had to precede them to maintain serializable consistency. In
+essence, if we do nothing, WAL replay will be at snapshot isolation
+even for serializable transactions. Is this OK? If not, how do we
+address it?
+
+ * External replication. Look at how this impacts external
+replication solutions, like Postgres-R, Slony, pgpool, HS/SR, etc.
+This is related to the "WAL file replay" issue.
+
+ * UNIQUE btree search for equality on all columns. Since a search
+of a UNIQUE index using equality tests on all columns will lock the
+heap tuple if an entry is found, it appears that there is no need to
+get a predicate lock on the index in that case. A predicate lock is
+still needed for such a search if a matching index entry which points
+to a visible tuple is not found.
+
+ * Minimize touching of shared memory. Should lists in shared
+memory push entries which have just been returned to the front of the
+available list, so they will be popped back off soon and some memory
+might never be touched, or should we keep adding returned items to
+the end of the available list?
+
+
+References
+----------
+
+[1] http://www.contrib.andrew.cmu.edu/~shadow/sql/sql1992.txt
+Search for serial execution to find the relevant section.
+
+[2] A. Fekete et al. Making Snapshot Isolation Serializable. In ACM
+Transactions on Database Systems 30:2, Jun. 2005.
+http://dx.doi.org/10.1145/1071610.1071615
+
+[3] Joseph M. Hellerstein, Michael Stonebraker and James Hamilton. 2007.
+Architecture of a Database System. Foundations and Trends(R) in
+Databases Vol. 1, No. 2 (2007) 141-259.
+http://db.cs.berkeley.edu/papers/fntdb07-architecture.pdf
+ Of particular interest:
+ * 6.1 A Note on ACID
+ * 6.2 A Brief Review of Serializability
+ * 6.3 Locking and Latching
+ * 6.3.1 Transaction Isolation Levels
+ * 6.5.3 Next-Key Locking: Physical Surrogates for Logical Properties
diff --git a/src/backend/storage/lmgr/README.barrier b/src/backend/storage/lmgr/README.barrier
new file mode 100644
index 0000000..f78e5ac
--- /dev/null
+++ b/src/backend/storage/lmgr/README.barrier
@@ -0,0 +1,197 @@
+Memory Barriers
+===============
+
+Modern CPUs make extensive use of pipelining and out-of-order execution,
+meaning that the CPU is often executing more than one instruction at a
+time, and not necessarily in the order that the source code would suggest.
+Furthermore, even before the CPU gets a chance to reorder operations, the
+compiler may (and often does) reorganize the code for greater efficiency,
+particularly at higher optimization levels. Optimizing compilers and
+out-of-order execution are both critical for good performance, but they
+can lead to surprising results when multiple processes access the same
+memory space.
+
+Example
+=======
+
+Suppose x is a pointer to a structure stored in shared memory, and that the
+entire structure has been initialized to zero bytes. One backend executes
+the following code fragment:
+
+ x->foo = 1;
+ x->bar = 1;
+
+Meanwhile, at approximately the same time, another backend executes this
+code fragment:
+
+ bar = x->bar;
+ foo = x->foo;
+
+The second backend might end up with foo = 1 and bar = 1 (if it executes
+both statements after the first backend), or with foo = 0 and bar = 0 (if
+it executes both statements before the first backend), or with foo = 1 and
+bar = 0 (if the first backend executes the first statement, the second
+backend executes both statements, and then the first backend executes the
+second statement).
+
+Surprisingly, however, the second backend could also end up with foo = 0
+and bar = 1. The compiler might swap the order of the two stores performed
+by the first backend, or the two loads performed by the second backend.
+Even if it doesn't, on a machine with weak memory ordering (such as PowerPC
+or ARM) the CPU might choose to execute either the loads or the stores
+out of order. This surprising result can lead to bugs.
+
+A common pattern where this actually does result in a bug is when adding items
+onto a queue. The writer does this:
+
+ q->items[q->num_items] = new_item;
+ ++q->num_items;
+
+The reader does this:
+
+ num_items = q->num_items;
+ for (i = 0; i < num_items; ++i)
+ /* do something with q->items[i] */
+
+This code turns out to be unsafe, because the writer might increment
+q->num_items before it finishes storing the new item into the appropriate slot.
+More subtly, the reader might prefetch the contents of the q->items array
+before reading q->num_items. Thus, there's still a bug here *even if the
+writer does everything in the order we expect*. We need the writer to update
+the array before bumping the item counter, and the reader to examine the item
+counter before examining the array.
+
+Note that these types of highly counterintuitive bugs can *only* occur when
+multiple processes are interacting with the same memory segment. A given
+process always perceives its *own* writes to memory in program order.
+
+Avoiding Memory Ordering Bugs
+=============================
+
+The simplest (and often best) way to avoid memory ordering bugs is to
+protect the data structures involved with an lwlock. For more details, see
+src/backend/storage/lmgr/README. For instance, in the above example, the
+writer could acquire an lwlock in exclusive mode before appending to the
+queue, and each reader could acquire the same lock in shared mode before
+reading it. If the data structure is not heavily trafficked, this solution is
+generally entirely adequate.
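+
+As a hedged sketch of that approach (the queue structure and its embedded
+LWLock are hypothetical, but LWLockAcquire and LWLockRelease are the real
+primitives), the writer and reader from the earlier example might do:
+
+    LWLockAcquire(&q->lock, LW_EXCLUSIVE);      /* writer */
+    q->items[q->num_items] = new_item;
+    ++q->num_items;
+    LWLockRelease(&q->lock);
+
+    LWLockAcquire(&q->lock, LW_SHARED);         /* reader */
+    num_items = q->num_items;
+    /* examine q->items[0 .. num_items - 1] while still holding the lock */
+    LWLockRelease(&q->lock);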
+
+However, in some cases, it is desirable to avoid the overhead of acquiring
+and releasing locks. In this case, memory barriers may be used to ensure
+that the apparent order of execution is as the programmer desires. In
+PostgreSQL backend code, the pg_memory_barrier() macro may be used to achieve
+this result. In the example above, we can prevent the reader from seeing a
+garbage value by having the writer do this:
+
+ q->items[q->num_items] = new_item;
+ pg_memory_barrier();
+ ++q->num_items;
+
+And by having the reader do this:
+
+ num_items = q->num_items;
+ pg_memory_barrier();
+ for (i = 0; i < num_items; ++i)
+ /* do something with q->items[i] */
+
+The pg_memory_barrier() macro will (1) prevent the compiler from rearranging
+the code in such a way as to allow the memory accesses to occur out of order
+and (2) generate any code (often, inline assembly) that is needed to prevent
+the CPU from executing the memory accesses out of order. Specifically, the
+barrier prevents loads and stores written after the barrier from being
+performed before the barrier, and vice-versa.
+
+Although this code will work, it is needlessly inefficient. On systems with
+strong memory ordering (such as x86), the CPU never reorders loads with other
+loads, nor stores with other stores. It can, however, allow a load to be
+performed before an earlier store.  To avoid emitting unnecessary memory
+instructions, we provide two additional primitives: pg_read_barrier(), and
+pg_write_barrier(). When a memory barrier is being used to separate two
+loads, use pg_read_barrier(); when it is separating two stores, use
+pg_write_barrier(); when it is separating a load and a store (in either
+order), use pg_memory_barrier(). pg_memory_barrier() can always substitute
+for either a read or a write barrier, but is typically more expensive, and
+therefore should be used only when needed.
+
+With these guidelines in mind, the writer can do this:
+
+ q->items[q->num_items] = new_item;
+ pg_write_barrier();
+ ++q->num_items;
+
+And the reader can do this:
+
+ num_items = q->num_items;
+ pg_read_barrier();
+ for (i = 0; i < num_items; ++i)
+ /* do something with q->items[i] */
+
+On machines with strong memory ordering, these weaker barriers will simply
+prevent compiler rearrangement, without emitting any actual machine code.
+On machines with weak memory ordering, they will prevent compiler
+reordering and also emit whatever hardware barrier may be required. Even
+on machines with weak memory ordering, a read or write barrier may be able
+to use a less expensive instruction than a full barrier.
+
+Weaknesses of Memory Barriers
+=============================
+
+While memory barriers are a powerful tool, and much cheaper than locks, they
+are also much less capable than locks. Here are some of the problems.
+
+1. Concurrent writers are unsafe. In the above example of a queue, using
+memory barriers doesn't make it safe for two processes to add items to the
+same queue at the same time. If more than one process can write to the queue,
+a spinlock or lwlock must be used to synchronize access. The readers can
+perhaps proceed without any lock, but the writers may not.
+
+Even very simple write operations often require additional synchronization.
+For example, it's not safe for multiple writers to simultaneously execute
+this code (supposing x is a pointer into shared memory):
+
+ x->foo++;
+
+Although this may compile down to a single machine-language instruction,
+the CPU will execute that instruction by reading the current value of foo,
+adding one to it, and then storing the result back to the original address.
+If two CPUs try to do this simultaneously, both may do their reads before
+either one does their writes. Such a case could be made safe by using an
+atomic variable and an atomic add. See port/atomics.h.
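+
+For example (a sketch, assuming the field were instead declared as a
+pg_atomic_uint32 and initialized with pg_atomic_init_u32()):
+
+    pg_atomic_fetch_add_u32(&x->foo, 1);        /* atomic read-modify-write */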
+
+2. Eight-byte loads and stores aren't necessarily atomic. We assume in
+various places in the source code that an aligned four-byte load or store is
+atomic, and that other processes therefore won't see a half-set value.
+Sadly, the same can't be said for an eight-byte value: on some platforms, an
+aligned eight-byte load or store will generate two four-byte operations. If
+you need an atomic eight-byte read or write, you must either serialize access
+with a lock or use an atomic variable.
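+
+For example (a sketch, assuming a counter declared as a pg_atomic_uint64 and
+initialized with pg_atomic_init_u64()):
+
+    uint64      v = pg_atomic_read_u64(&shared->counter);  /* atomic 8-byte read */
+
+    pg_atomic_fetch_add_u64(&shared->counter, 1);          /* atomic 8-byte add */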
+
+3. No ordering guarantees. While memory barriers ensure that any given
+process performs loads and stores to shared memory in order, they don't
+guarantee synchronization. In the queue example above, we can use memory
+barriers to be sure that readers won't see garbage, but there's nothing to
+say whether a given reader will run before or after a given writer. If this
+matters in a given situation, some other mechanism must be used instead of
+or in addition to memory barriers.
+
+4. Barrier proliferation. Many algorithms that at first seem appealing
+require multiple barriers. If the number of barriers required is more than
+one or two, you may be better off just using a lock. Keep in mind that, on
+some platforms, a barrier may be implemented by acquiring and releasing a
+backend-private spinlock. This may be better than a centralized lock under
+contention, but it may also be slower in the uncontended case.
+
+Further Reading
+===============
+
+Much of the documentation about memory barriers appears to be quite
+Linux-specific. The following papers may be helpful:
+
+Memory Ordering in Modern Microprocessors, by Paul E. McKenney
+* http://www.rdrop.com/users/paulmck/scalability/paper/ordering.2007.09.19a.pdf
+
+Memory Barriers: a Hardware View for Software Hackers, by Paul E. McKenney
+* http://www.rdrop.com/users/paulmck/scalability/paper/whymb.2010.06.07c.pdf
+
+The Linux kernel also has some useful documentation on this topic. Start
+with Documentation/memory-barriers.txt
diff --git a/src/backend/storage/lmgr/condition_variable.c b/src/backend/storage/lmgr/condition_variable.c
new file mode 100644
index 0000000..80d70c1
--- /dev/null
+++ b/src/backend/storage/lmgr/condition_variable.c
@@ -0,0 +1,364 @@
+/*-------------------------------------------------------------------------
+ *
+ * condition_variable.c
+ * Implementation of condition variables. Condition variables provide
+ * a way for one process to wait until a specific condition occurs,
+ * without needing to know the specific identity of the process for
+ * which they are waiting. Waits for condition variables can be
+ * interrupted, unlike LWLock waits. Condition variables are safe
+ * to use within dynamic shared memory segments.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/storage/lmgr/condition_variable.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "portability/instr_time.h"
+#include "storage/condition_variable.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "storage/proclist.h"
+#include "storage/spin.h"
+#include "utils/memutils.h"
+
+/* Initially, we are not prepared to sleep on any condition variable. */
+static ConditionVariable *cv_sleep_target = NULL;
+
+/*
+ * Initialize a condition variable.
+ */
+void
+ConditionVariableInit(ConditionVariable *cv)
+{
+ SpinLockInit(&cv->mutex);
+ proclist_init(&cv->wakeup);
+}
+
+/*
+ * Prepare to wait on a given condition variable.
+ *
+ * This can optionally be called before entering a test/sleep loop.
+ * Doing so is more efficient if we'll need to sleep at least once.
+ * However, if the first test of the exit condition is likely to succeed,
+ * it's more efficient to omit the ConditionVariablePrepareToSleep call.
+ * See comments in ConditionVariableSleep for more detail.
+ *
+ * Caution: "before entering the loop" means you *must* test the exit
+ * condition between calling ConditionVariablePrepareToSleep and calling
+ * ConditionVariableSleep. If that is inconvenient, omit calling
+ * ConditionVariablePrepareToSleep.
+ */
+void
+ConditionVariablePrepareToSleep(ConditionVariable *cv)
+{
+ int pgprocno = MyProc->pgprocno;
+
+ /*
+ * If some other sleep is already prepared, cancel it; this is necessary
+ * because we have just one static variable tracking the prepared sleep,
+ * and also only one cvWaitLink in our PGPROC. It's okay to do this
+ * because whenever control does return to the other test-and-sleep loop,
+ * its ConditionVariableSleep call will just re-establish that sleep as
+ * the prepared one.
+ */
+ if (cv_sleep_target != NULL)
+ ConditionVariableCancelSleep();
+
+ /* Record the condition variable on which we will sleep. */
+ cv_sleep_target = cv;
+
+ /* Add myself to the wait queue. */
+ SpinLockAcquire(&cv->mutex);
+ proclist_push_tail(&cv->wakeup, pgprocno, cvWaitLink);
+ SpinLockRelease(&cv->mutex);
+}
+
+/*
+ * Wait for the given condition variable to be signaled.
+ *
+ * This should be called in a predicate loop that tests for a specific exit
+ * condition and otherwise sleeps, like so:
+ *
+ * ConditionVariablePrepareToSleep(cv); // optional
+ * while (condition for which we are waiting is not true)
+ * ConditionVariableSleep(cv, wait_event_info);
+ * ConditionVariableCancelSleep();
+ *
+ * wait_event_info should be a value from one of the WaitEventXXX enums
+ * defined in pgstat.h. This controls the contents of pg_stat_activity's
+ * wait_event_type and wait_event columns while waiting.
+ */
+void
+ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
+{
+ (void) ConditionVariableTimedSleep(cv, -1 /* no timeout */ ,
+ wait_event_info);
+}
+
+/*
+ * Wait for a condition variable to be signaled or a timeout to be reached.
+ *
+ * Returns true when timeout expires, otherwise returns false.
+ *
+ * See ConditionVariableSleep() for general usage.
+ */
+bool
+ConditionVariableTimedSleep(ConditionVariable *cv, long timeout,
+ uint32 wait_event_info)
+{
+ long cur_timeout = -1;
+ instr_time start_time;
+ instr_time cur_time;
+ int wait_events;
+
+ /*
+ * If the caller didn't prepare to sleep explicitly, then do so now and
+ * return immediately. The caller's predicate loop should immediately
+ * call again if its exit condition is not yet met. This will result in
+ * the exit condition being tested twice before we first sleep. The extra
+ * test can be prevented by calling ConditionVariablePrepareToSleep(cv)
+ * first. Whether it's worth doing that depends on whether you expect the
+ * exit condition to be met initially, in which case skipping the prepare
+ * is recommended because it avoids manipulations of the wait list, or not
+ * met initially, in which case preparing first is better because it
+ * avoids one extra test of the exit condition.
+ *
+ * If we are currently prepared to sleep on some other CV, we just cancel
+ * that and prepare this one; see ConditionVariablePrepareToSleep.
+ */
+ if (cv_sleep_target != cv)
+ {
+ ConditionVariablePrepareToSleep(cv);
+ return false;
+ }
+
+ /*
+ * Record the current time so that we can calculate the remaining timeout
+ * if we are woken up spuriously.
+ */
+ if (timeout >= 0)
+ {
+ INSTR_TIME_SET_CURRENT(start_time);
+ Assert(timeout >= 0 && timeout <= INT_MAX);
+ cur_timeout = timeout;
+ wait_events = WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH;
+ }
+ else
+ wait_events = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH;
+
+ while (true)
+ {
+ bool done = false;
+
+ /*
+ * Wait for latch to be set. (If we're awakened for some other
+ * reason, the code below will cope anyway.)
+ */
+ (void) WaitLatch(MyLatch, wait_events, cur_timeout, wait_event_info);
+
+ /* Reset latch before examining the state of the wait list. */
+ ResetLatch(MyLatch);
+
+ /*
+ * If this process has been taken out of the wait list, then we know
+ * that it has been signaled by ConditionVariableSignal (or
+ * ConditionVariableBroadcast), so we should return to the caller. But
+ * that doesn't guarantee that the exit condition is met, only that we
+ * ought to check it. So we must put the process back into the wait
+ * list, to ensure we don't miss any additional wakeup occurring while
+ * the caller checks its exit condition. We can take ourselves out of
+ * the wait list only when the caller calls
+ * ConditionVariableCancelSleep.
+ *
+ * If we're still in the wait list, then the latch must have been set
+ * by something other than ConditionVariableSignal; though we don't
+ * guarantee not to return spuriously, we'll avoid this obvious case.
+ */
+ SpinLockAcquire(&cv->mutex);
+ if (!proclist_contains(&cv->wakeup, MyProc->pgprocno, cvWaitLink))
+ {
+ done = true;
+ proclist_push_tail(&cv->wakeup, MyProc->pgprocno, cvWaitLink);
+ }
+ SpinLockRelease(&cv->mutex);
+
+ /*
+ * Check for interrupts, and return spuriously if that caused the
+ * current sleep target to change (meaning that interrupt handler code
+ * waited for a different condition variable).
+ */
+ CHECK_FOR_INTERRUPTS();
+ if (cv != cv_sleep_target)
+ done = true;
+
+ /* We were signaled, so return */
+ if (done)
+ return false;
+
+ /* If we're not done, update cur_timeout for next iteration */
+ if (timeout >= 0)
+ {
+ INSTR_TIME_SET_CURRENT(cur_time);
+ INSTR_TIME_SUBTRACT(cur_time, start_time);
+ cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
+
+ /* Have we crossed the timeout threshold? */
+ if (cur_timeout <= 0)
+ return true;
+ }
+ }
+}
+
+/*
+ * Cancel any pending sleep operation.
+ *
+ * We just need to remove ourselves from the wait queue of any condition
+ * variable for which we have previously prepared a sleep.
+ *
+ * Do nothing if nothing is pending; this allows this function to be called
+ * during transaction abort to clean up any unfinished CV sleep.
+ */
+void
+ConditionVariableCancelSleep(void)
+{
+ ConditionVariable *cv = cv_sleep_target;
+ bool signaled = false;
+
+ if (cv == NULL)
+ return;
+
+ SpinLockAcquire(&cv->mutex);
+ if (proclist_contains(&cv->wakeup, MyProc->pgprocno, cvWaitLink))
+ proclist_delete(&cv->wakeup, MyProc->pgprocno, cvWaitLink);
+ else
+ signaled = true;
+ SpinLockRelease(&cv->mutex);
+
+ /*
+ * If we've received a signal, pass it on to another waiting process, if
+ * there is one. Otherwise a call to ConditionVariableSignal() might get
+ * lost, despite there being another process ready to handle it.
+ */
+ if (signaled)
+ ConditionVariableSignal(cv);
+
+ cv_sleep_target = NULL;
+}
+
+/*
+ * Wake up the oldest process sleeping on the CV, if there is any.
+ *
+ * Note: it's difficult to tell whether this has any real effect: we know
+ * whether we took an entry off the list, but the entry might only be a
+ * sentinel. Hence, think twice before proposing that this should return
+ * a flag telling whether it woke somebody.
+ */
+void
+ConditionVariableSignal(ConditionVariable *cv)
+{
+ PGPROC *proc = NULL;
+
+ /* Remove the first process from the wakeup queue (if any). */
+ SpinLockAcquire(&cv->mutex);
+ if (!proclist_is_empty(&cv->wakeup))
+ proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink);
+ SpinLockRelease(&cv->mutex);
+
+ /* If we found someone sleeping, set their latch to wake them up. */
+ if (proc != NULL)
+ SetLatch(&proc->procLatch);
+}
+
+/*
+ * Wake up all processes sleeping on the given CV.
+ *
+ * This guarantees to wake all processes that were sleeping on the CV
+ * at time of call, but processes that add themselves to the list mid-call
+ * will typically not get awakened.
+ */
+void
+ConditionVariableBroadcast(ConditionVariable *cv)
+{
+ int pgprocno = MyProc->pgprocno;
+ PGPROC *proc = NULL;
+ bool have_sentinel = false;
+
+ /*
+ * In some use-cases, it is common for awakened processes to immediately
+ * re-queue themselves. If we just naively try to reduce the wakeup list
+ * to empty, we'll get into a potentially-indefinite loop against such a
+ * process. The semantics we really want are just to be sure that we have
+ * wakened all processes that were in the list at entry. We can use our
+ * own cvWaitLink as a sentinel to detect when we've finished.
+ *
+ * A seeming flaw in this approach is that someone else might signal the
+ * CV and in doing so remove our sentinel entry. But that's fine: since
+ * CV waiters are always added and removed in order, that must mean that
+ * every previous waiter has been wakened, so we're done. We'll get an
+	 * extra "set" on our latch from the other process's signal, which is
+ * slightly inefficient but harmless.
+ *
+ * We can't insert our cvWaitLink as a sentinel if it's already in use in
+ * some other proclist. While that's not expected to be true for typical
+ * uses of this function, we can deal with it by simply canceling any
+ * prepared CV sleep. The next call to ConditionVariableSleep will take
+ * care of re-establishing the lost state.
+ */
+ if (cv_sleep_target != NULL)
+ ConditionVariableCancelSleep();
+
+ /*
+ * Inspect the state of the queue. If it's empty, we have nothing to do.
+ * If there's exactly one entry, we need only remove and signal that
+ * entry. Otherwise, remove the first entry and insert our sentinel.
+ */
+ SpinLockAcquire(&cv->mutex);
+ /* While we're here, let's assert we're not in the list. */
+ Assert(!proclist_contains(&cv->wakeup, pgprocno, cvWaitLink));
+
+ if (!proclist_is_empty(&cv->wakeup))
+ {
+ proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink);
+ if (!proclist_is_empty(&cv->wakeup))
+ {
+ proclist_push_tail(&cv->wakeup, pgprocno, cvWaitLink);
+ have_sentinel = true;
+ }
+ }
+ SpinLockRelease(&cv->mutex);
+
+ /* Awaken first waiter, if there was one. */
+ if (proc != NULL)
+ SetLatch(&proc->procLatch);
+
+ while (have_sentinel)
+ {
+ /*
+ * Each time through the loop, remove the first wakeup list entry, and
+ * signal it unless it's our sentinel. Repeat as long as the sentinel
+ * remains in the list.
+ *
+ * Notice that if someone else removes our sentinel, we will waken one
+ * additional process before exiting. That's intentional, because if
+ * someone else signals the CV, they may be intending to waken some
+ * third process that added itself to the list after we added the
+ * sentinel. Better to give a spurious wakeup (which should be
+ * harmless beyond wasting some cycles) than to lose a wakeup.
+ */
+ proc = NULL;
+ SpinLockAcquire(&cv->mutex);
+ if (!proclist_is_empty(&cv->wakeup))
+ proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink);
+ have_sentinel = proclist_contains(&cv->wakeup, pgprocno, cvWaitLink);
+ SpinLockRelease(&cv->mutex);
+
+ if (proc != NULL && proc != MyProc)
+ SetLatch(&proc->procLatch);
+ }
+}
diff --git a/src/backend/storage/lmgr/deadlock.c b/src/backend/storage/lmgr/deadlock.c
new file mode 100644
index 0000000..67733c0
--- /dev/null
+++ b/src/backend/storage/lmgr/deadlock.c
@@ -0,0 +1,1177 @@
+/*-------------------------------------------------------------------------
+ *
+ * deadlock.c
+ * POSTGRES deadlock detection code
+ *
+ * See src/backend/storage/lmgr/README for a description of the deadlock
+ * detection and resolution algorithms.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/deadlock.c
+ *
+ * Interface:
+ *
+ * DeadLockCheck()
+ * DeadLockReport()
+ * RememberSimpleDeadLock()
+ * InitDeadLockChecking()
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "utils/memutils.h"
+
+
+/*
+ * One edge in the waits-for graph.
+ *
+ * waiter and blocker may or may not be members of a lock group, but if either
+ * is, it will be the leader rather than any other member of the lock group.
+ * The group leaders act as representatives of the whole group even though
+ * those particular processes need not be waiting at all. There will be at
+ * least one member of the waiter's lock group on the wait queue for the given
+ * lock, maybe more.
+ */
+typedef struct
+{
+ PGPROC *waiter; /* the leader of the waiting lock group */
+ PGPROC *blocker; /* the leader of the group it is waiting for */
+ LOCK *lock; /* the lock being waited for */
+ int pred; /* workspace for TopoSort */
+ int link; /* workspace for TopoSort */
+} EDGE;
+
+/* One potential reordering of a lock's wait queue */
+typedef struct
+{
+ LOCK *lock; /* the lock whose wait queue is described */
+ PGPROC **procs; /* array of PGPROC *'s in new wait order */
+ int nProcs;
+} WAIT_ORDER;
+
+/*
+ * Information saved about each edge in a detected deadlock cycle. This
+ * is used to print a diagnostic message upon failure.
+ *
+ * Note: because we want to examine this info after releasing the lock
+ * manager's partition locks, we can't just store LOCK and PGPROC pointers;
+ * we must extract out all the info we want to be able to print.
+ */
+typedef struct
+{
+ LOCKTAG locktag; /* ID of awaited lock object */
+ LOCKMODE lockmode; /* type of lock we're waiting for */
+ int pid; /* PID of blocked backend */
+} DEADLOCK_INFO;
+
+
+static bool DeadLockCheckRecurse(PGPROC *proc);
+static int TestConfiguration(PGPROC *startProc);
+static bool FindLockCycle(PGPROC *checkProc,
+ EDGE *softEdges, int *nSoftEdges);
+static bool FindLockCycleRecurse(PGPROC *checkProc, int depth,
+ EDGE *softEdges, int *nSoftEdges);
+static bool FindLockCycleRecurseMember(PGPROC *checkProc,
+ PGPROC *checkProcLeader,
+ int depth, EDGE *softEdges, int *nSoftEdges);
+static bool ExpandConstraints(EDGE *constraints, int nConstraints);
+static bool TopoSort(LOCK *lock, EDGE *constraints, int nConstraints,
+ PGPROC **ordering);
+
+#ifdef DEBUG_DEADLOCK
+static void PrintLockQueue(LOCK *lock, const char *info);
+#endif
+
+
+/*
+ * Working space for the deadlock detector
+ */
+
+/* Workspace for FindLockCycle */
+static PGPROC **visitedProcs; /* Array of visited procs */
+static int nVisitedProcs;
+
+/* Workspace for TopoSort */
+static PGPROC **topoProcs; /* Array of not-yet-output procs */
+static int *beforeConstraints; /* Counts of remaining before-constraints */
+static int *afterConstraints; /* List head for after-constraints */
+
+/* Output area for ExpandConstraints */
+static WAIT_ORDER *waitOrders; /* Array of proposed queue rearrangements */
+static int nWaitOrders;
+static PGPROC **waitOrderProcs; /* Space for waitOrders queue contents */
+
+/* Current list of constraints being considered */
+static EDGE *curConstraints;
+static int nCurConstraints;
+static int maxCurConstraints;
+
+/* Storage space for results from FindLockCycle */
+static EDGE *possibleConstraints;
+static int nPossibleConstraints;
+static int maxPossibleConstraints;
+static DEADLOCK_INFO *deadlockDetails;
+static int nDeadlockDetails;
+
+/* PGPROC pointer of any blocking autovacuum worker found */
+static PGPROC *blocking_autovacuum_proc = NULL;
+
+
+/*
+ * InitDeadLockChecking -- initialize deadlock checker during backend startup
+ *
+ * This does per-backend initialization of the deadlock checker; primarily,
+ * allocation of working memory for DeadLockCheck. We do this per-backend
+ * since there's no percentage in making the kernel do copy-on-write
+ * inheritance of workspace from the postmaster. We want to allocate the
+ * space at startup because (a) the deadlock checker might be invoked when
+ * there's no free memory left, and (b) the checker is normally run inside a
+ * signal handler, which is a very dangerous place to invoke palloc from.
+ */
+void
+InitDeadLockChecking(void)
+{
+ MemoryContext oldcxt;
+
+ /* Make sure allocations are permanent */
+ oldcxt = MemoryContextSwitchTo(TopMemoryContext);
+
+ /*
+ * FindLockCycle needs at most MaxBackends entries in visitedProcs[] and
+ * deadlockDetails[].
+ */
+ visitedProcs = (PGPROC **) palloc(MaxBackends * sizeof(PGPROC *));
+ deadlockDetails = (DEADLOCK_INFO *) palloc(MaxBackends * sizeof(DEADLOCK_INFO));
+
+ /*
+ * TopoSort needs to consider at most MaxBackends wait-queue entries, and
+ * it needn't run concurrently with FindLockCycle.
+ */
+ topoProcs = visitedProcs; /* re-use this space */
+ beforeConstraints = (int *) palloc(MaxBackends * sizeof(int));
+ afterConstraints = (int *) palloc(MaxBackends * sizeof(int));
+
+ /*
+ * We need to consider rearranging at most MaxBackends/2 wait queues
+ * (since it takes at least two waiters in a queue to create a soft edge),
+ * and the expanded form of the wait queues can't involve more than
+ * MaxBackends total waiters.
+ */
+ waitOrders = (WAIT_ORDER *)
+ palloc((MaxBackends / 2) * sizeof(WAIT_ORDER));
+ waitOrderProcs = (PGPROC **) palloc(MaxBackends * sizeof(PGPROC *));
+
+ /*
+ * Allow at most MaxBackends distinct constraints in a configuration. (Is
+ * this enough? In practice it seems it should be, but I don't quite see
+ * how to prove it. If we run out, we might fail to find a workable wait
+ * queue rearrangement even though one exists.) NOTE that this number
+ * limits the maximum recursion depth of DeadLockCheckRecurse. Making it
+ * really big might potentially allow a stack-overflow problem.
+ */
+ maxCurConstraints = MaxBackends;
+ curConstraints = (EDGE *) palloc(maxCurConstraints * sizeof(EDGE));
+
+ /*
+ * Allow up to 3*MaxBackends constraints to be saved without having to
+ * re-run TestConfiguration. (This is probably more than enough, but we
+ * can survive if we run low on space by doing excess runs of
+ * TestConfiguration to re-compute constraint lists each time needed.) The
+ * last MaxBackends entries in possibleConstraints[] are reserved as
+ * output workspace for FindLockCycle.
+ */
+ maxPossibleConstraints = MaxBackends * 4;
+ possibleConstraints =
+ (EDGE *) palloc(maxPossibleConstraints * sizeof(EDGE));
+
+ MemoryContextSwitchTo(oldcxt);
+}
+
+/*
+ * DeadLockCheck -- Checks for deadlocks for a given process
+ *
+ * This code looks for deadlocks involving the given process. If any
+ * are found, it tries to rearrange lock wait queues to resolve the
+ * deadlock. If resolution is impossible, return DS_HARD_DEADLOCK ---
+ * the caller is then expected to abort the given proc's transaction.
+ *
+ * Caller must already have locked all partitions of the lock tables.
+ *
+ * On failure, deadlock details are recorded in deadlockDetails[] for
+ * subsequent printing by DeadLockReport(). That activity is separate
+ * because (a) we don't want to do it while holding all those LWLocks,
+ * and (b) we are typically invoked inside a signal handler.
+ */
+DeadLockState
+DeadLockCheck(PGPROC *proc)
+{
+ int i,
+ j;
+
+ /* Initialize to "no constraints" */
+ nCurConstraints = 0;
+ nPossibleConstraints = 0;
+ nWaitOrders = 0;
+
+ /* Initialize to not blocked by an autovacuum worker */
+ blocking_autovacuum_proc = NULL;
+
+ /* Search for deadlocks and possible fixes */
+ if (DeadLockCheckRecurse(proc))
+ {
+ /*
+ * Call FindLockCycle one more time, to record the correct
+ * deadlockDetails[] for the basic state with no rearrangements.
+ */
+ int nSoftEdges;
+
+ TRACE_POSTGRESQL_DEADLOCK_FOUND();
+
+ nWaitOrders = 0;
+ if (!FindLockCycle(proc, possibleConstraints, &nSoftEdges))
+ elog(FATAL, "deadlock seems to have disappeared");
+
+ return DS_HARD_DEADLOCK; /* cannot find a non-deadlocked state */
+ }
+
+ /* Apply any needed rearrangements of wait queues */
+ for (i = 0; i < nWaitOrders; i++)
+ {
+ LOCK *lock = waitOrders[i].lock;
+ PGPROC **procs = waitOrders[i].procs;
+ int nProcs = waitOrders[i].nProcs;
+ PROC_QUEUE *waitQueue = &(lock->waitProcs);
+
+ Assert(nProcs == waitQueue->size);
+
+#ifdef DEBUG_DEADLOCK
+ PrintLockQueue(lock, "DeadLockCheck:");
+#endif
+
+ /* Reset the queue and re-add procs in the desired order */
+ ProcQueueInit(waitQueue);
+ for (j = 0; j < nProcs; j++)
+ {
+ SHMQueueInsertBefore(&(waitQueue->links), &(procs[j]->links));
+ waitQueue->size++;
+ }
+
+#ifdef DEBUG_DEADLOCK
+ PrintLockQueue(lock, "rearranged to:");
+#endif
+
+ /* See if any waiters for the lock can be woken up now */
+ ProcLockWakeup(GetLocksMethodTable(lock), lock);
+ }
+
+ /* Return code tells caller if we had to escape a deadlock or not */
+ if (nWaitOrders > 0)
+ return DS_SOFT_DEADLOCK;
+ else if (blocking_autovacuum_proc != NULL)
+ return DS_BLOCKED_BY_AUTOVACUUM;
+ else
+ return DS_NO_DEADLOCK;
+}
+
+/*
+ * Return the PGPROC of the autovacuum that's blocking a process.
+ *
+ * We reset the saved pointer as soon as we pass it back.
+ */
+PGPROC *
+GetBlockingAutoVacuumPgproc(void)
+{
+ PGPROC *ptr;
+
+ ptr = blocking_autovacuum_proc;
+ blocking_autovacuum_proc = NULL;
+
+ return ptr;
+}
+
+/*
+ * DeadLockCheckRecurse -- recursively search for valid orderings
+ *
+ * curConstraints[] holds the current set of constraints being considered
+ * by an outer level of recursion. Add to this each possible solution
+ * constraint for any cycle detected at this level.
+ *
+ * Returns true if no solution exists. Returns false if a deadlock-free
+ * state is attainable, in which case waitOrders[] shows the required
+ * rearrangements of lock wait queues (if any).
+ */
+static bool
+DeadLockCheckRecurse(PGPROC *proc)
+{
+ int nEdges;
+ int oldPossibleConstraints;
+ bool savedList;
+ int i;
+
+ nEdges = TestConfiguration(proc);
+ if (nEdges < 0)
+ return true; /* hard deadlock --- no solution */
+ if (nEdges == 0)
+ return false; /* good configuration found */
+ if (nCurConstraints >= maxCurConstraints)
+ return true; /* out of room for active constraints? */
+ oldPossibleConstraints = nPossibleConstraints;
+ if (nPossibleConstraints + nEdges + MaxBackends <= maxPossibleConstraints)
+ {
+ /* We can save the edge list in possibleConstraints[] */
+ nPossibleConstraints += nEdges;
+ savedList = true;
+ }
+ else
+ {
+ /* Not room; will need to regenerate the edges on-the-fly */
+ savedList = false;
+ }
+
+ /*
+ * Try each available soft edge as an addition to the configuration.
+ */
+ for (i = 0; i < nEdges; i++)
+ {
+ if (!savedList && i > 0)
+ {
+ /* Regenerate the list of possible added constraints */
+ if (nEdges != TestConfiguration(proc))
+ elog(FATAL, "inconsistent results during deadlock check");
+ }
+ curConstraints[nCurConstraints] =
+ possibleConstraints[oldPossibleConstraints + i];
+ nCurConstraints++;
+ if (!DeadLockCheckRecurse(proc))
+ return false; /* found a valid solution! */
+ /* give up on that added constraint, try again */
+ nCurConstraints--;
+ }
+ nPossibleConstraints = oldPossibleConstraints;
+ return true; /* no solution found */
+}
+
+
+/*--------------------
+ * Test a configuration (current set of constraints) for validity.
+ *
+ * Returns:
+ * 0: the configuration is good (no deadlocks)
+ * -1: the configuration has a hard deadlock or is not self-consistent
+ * >0: the configuration has one or more soft deadlocks
+ *
+ * In the soft-deadlock case, one of the soft cycles is chosen arbitrarily
+ * and a list of its soft edges is returned beginning at
+ * possibleConstraints+nPossibleConstraints. The return value is the
+ * number of soft edges.
+ *--------------------
+ */
+static int
+TestConfiguration(PGPROC *startProc)
+{
+ int softFound = 0;
+ EDGE *softEdges = possibleConstraints + nPossibleConstraints;
+ int nSoftEdges;
+ int i;
+
+ /*
+ * Make sure we have room for FindLockCycle's output.
+ */
+ if (nPossibleConstraints + MaxBackends > maxPossibleConstraints)
+ return -1;
+
+ /*
+ * Expand current constraint set into wait orderings. Fail if the
+ * constraint set is not self-consistent.
+ */
+ if (!ExpandConstraints(curConstraints, nCurConstraints))
+ return -1;
+
+ /*
+ * Check for cycles involving startProc or any of the procs mentioned in
+ * constraints. We check startProc last because if it has a soft cycle
+ * still to be dealt with, we want to deal with that first.
+ */
+ for (i = 0; i < nCurConstraints; i++)
+ {
+ if (FindLockCycle(curConstraints[i].waiter, softEdges, &nSoftEdges))
+ {
+ if (nSoftEdges == 0)
+ return -1; /* hard deadlock detected */
+ softFound = nSoftEdges;
+ }
+ if (FindLockCycle(curConstraints[i].blocker, softEdges, &nSoftEdges))
+ {
+ if (nSoftEdges == 0)
+ return -1; /* hard deadlock detected */
+ softFound = nSoftEdges;
+ }
+ }
+ if (FindLockCycle(startProc, softEdges, &nSoftEdges))
+ {
+ if (nSoftEdges == 0)
+ return -1; /* hard deadlock detected */
+ softFound = nSoftEdges;
+ }
+ return softFound;
+}
+
+
+/*
+ * FindLockCycle -- basic check for deadlock cycles
+ *
+ * Scan outward from the given proc to see if there is a cycle in the
+ * waits-for graph that includes this proc. Return true if a cycle
+ * is found, else false. If a cycle is found, we return a list of
+ * the "soft edges", if any, included in the cycle. These edges could
+ * potentially be eliminated by rearranging wait queues. We also fill
+ * deadlockDetails[] with information about the detected cycle; this info
+ * is not used by the deadlock algorithm itself, only to print a useful
+ * message after failing.
+ *
+ * Since we need to be able to check hypothetical configurations that would
+ * exist after wait queue rearrangement, the routine pays attention to the
+ * table of hypothetical queue orders in waitOrders[]. These orders will
+ * be believed in preference to the actual ordering seen in the locktable.
+ */
+static bool
+FindLockCycle(PGPROC *checkProc,
+ EDGE *softEdges, /* output argument */
+ int *nSoftEdges) /* output argument */
+{
+ nVisitedProcs = 0;
+ nDeadlockDetails = 0;
+ *nSoftEdges = 0;
+ return FindLockCycleRecurse(checkProc, 0, softEdges, nSoftEdges);
+}
+
+static bool
+FindLockCycleRecurse(PGPROC *checkProc,
+ int depth,
+ EDGE *softEdges, /* output argument */
+ int *nSoftEdges) /* output argument */
+{
+ int i;
+ dlist_iter iter;
+
+ /*
+ * If this process is a lock group member, check the leader instead. (Note
+ * that we might be the leader, in which case this is a no-op.)
+ */
+ if (checkProc->lockGroupLeader != NULL)
+ checkProc = checkProc->lockGroupLeader;
+
+ /*
+ * Have we already seen this proc?
+ */
+ for (i = 0; i < nVisitedProcs; i++)
+ {
+ if (visitedProcs[i] == checkProc)
+ {
+ /* If we return to starting point, we have a deadlock cycle */
+ if (i == 0)
+ {
+ /*
+ * record total length of cycle --- outer levels will now fill
+ * deadlockDetails[]
+ */
+ Assert(depth <= MaxBackends);
+ nDeadlockDetails = depth;
+
+ return true;
+ }
+
+ /*
+ * Otherwise, we have a cycle but it does not include the start
+ * point, so say "no deadlock".
+ */
+ return false;
+ }
+ }
+ /* Mark proc as seen */
+ Assert(nVisitedProcs < MaxBackends);
+ visitedProcs[nVisitedProcs++] = checkProc;
+
+ /*
+ * If the process is waiting, there is an outgoing waits-for edge to each
+ * process that blocks it.
+ */
+ if (checkProc->links.next != NULL && checkProc->waitLock != NULL &&
+ FindLockCycleRecurseMember(checkProc, checkProc, depth, softEdges,
+ nSoftEdges))
+ return true;
+
+ /*
+ * If the process is not waiting, there could still be outgoing waits-for
+ * edges if it is part of a lock group, because other members of the lock
+ * group might be waiting even though this process is not. (Given lock
+ * groups {A1, A2} and {B1, B2}, if A1 waits for B1 and B2 waits for A2,
+ * that is a deadlock even if neither B1 nor A2 is waiting for anything.)
+ */
+ dlist_foreach(iter, &checkProc->lockGroupMembers)
+ {
+ PGPROC *memberProc;
+
+ memberProc = dlist_container(PGPROC, lockGroupLink, iter.cur);
+
+ if (memberProc->links.next != NULL && memberProc->waitLock != NULL &&
+ memberProc != checkProc &&
+ FindLockCycleRecurseMember(memberProc, checkProc, depth, softEdges,
+ nSoftEdges))
+ return true;
+ }
+
+ return false;
+}
+
+static bool
+FindLockCycleRecurseMember(PGPROC *checkProc,
+ PGPROC *checkProcLeader,
+ int depth,
+ EDGE *softEdges, /* output argument */
+ int *nSoftEdges) /* output argument */
+{
+ PGPROC *proc;
+ LOCK *lock = checkProc->waitLock;
+ PROCLOCK *proclock;
+ SHM_QUEUE *procLocks;
+ LockMethod lockMethodTable;
+ PROC_QUEUE *waitQueue;
+ int queue_size;
+ int conflictMask;
+ int i;
+ int numLockModes,
+ lm;
+
+ /*
+ * The relation extension or page lock can never participate in an actual
+ * deadlock cycle. See Asserts in LockAcquireExtended. So, there is no
+ * advantage in checking wait edges from them.
+ */
+ if (LOCK_LOCKTAG(*lock) == LOCKTAG_RELATION_EXTEND ||
+ (LOCK_LOCKTAG(*lock) == LOCKTAG_PAGE))
+ return false;
+
+ lockMethodTable = GetLocksMethodTable(lock);
+ numLockModes = lockMethodTable->numLockModes;
+ conflictMask = lockMethodTable->conflictTab[checkProc->waitLockMode];
+
+ /*
+ * Scan for procs that already hold conflicting locks. These are "hard"
+ * edges in the waits-for graph.
+ */
+ procLocks = &(lock->procLocks);
+
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, lockLink));
+
+ while (proclock)
+ {
+ PGPROC *leader;
+
+ proc = proclock->tag.myProc;
+ leader = proc->lockGroupLeader == NULL ? proc : proc->lockGroupLeader;
+
+ /* A proc never blocks itself or any other lock group member */
+ if (leader != checkProcLeader)
+ {
+ for (lm = 1; lm <= numLockModes; lm++)
+ {
+ if ((proclock->holdMask & LOCKBIT_ON(lm)) &&
+ (conflictMask & LOCKBIT_ON(lm)))
+ {
+ /* This proc hard-blocks checkProc */
+ if (FindLockCycleRecurse(proc, depth + 1,
+ softEdges, nSoftEdges))
+ {
+ /* fill deadlockDetails[] */
+ DEADLOCK_INFO *info = &deadlockDetails[depth];
+
+ info->locktag = lock->tag;
+ info->lockmode = checkProc->waitLockMode;
+ info->pid = checkProc->pid;
+
+ return true;
+ }
+
+ /*
+ * No deadlock here, but see if this proc is an autovacuum
+ * that is directly hard-blocking our own proc. If so,
+ * report it so that the caller can send a cancel signal
+ * to it, if appropriate. If there's more than one such
+ * proc, it's indeterminate which one will be reported.
+ *
+ * We don't touch autovacuums that are indirectly blocking
+ * us; it's up to the direct blockee to take action. This
+ * rule simplifies understanding the behavior and ensures
+ * that an autovacuum won't be canceled with less than
+ * deadlock_timeout grace period.
+ *
+ * Note we read statusFlags without any locking. This is
+ * OK only for checking the PROC_IS_AUTOVACUUM flag,
+ * because that flag is set at process start and never
+ * reset. There is logic elsewhere to avoid canceling an
+ * autovacuum that is working to prevent XID wraparound
+ * problems (which needs to read a different statusFlags
+ * bit), but we don't do that here to avoid grabbing
+ * ProcArrayLock.
+ */
+ if (checkProc == MyProc &&
+ proc->statusFlags & PROC_IS_AUTOVACUUM)
+ blocking_autovacuum_proc = proc;
+
+ /* We're done looking at this proclock */
+ break;
+ }
+ }
+ }
+
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink,
+ offsetof(PROCLOCK, lockLink));
+ }
+
+ /*
+ * Scan for procs that are ahead of this one in the lock's wait queue.
+ * Those that have conflicting requests soft-block this one. This must be
+ * done after the hard-block search, since if another proc both hard- and
+ * soft-blocks this one, we want to call it a hard edge.
+ *
+ * If there is a proposed re-ordering of the lock's wait order, use that
+ * rather than the current wait order.
+ */
+ for (i = 0; i < nWaitOrders; i++)
+ {
+ if (waitOrders[i].lock == lock)
+ break;
+ }
+
+ if (i < nWaitOrders)
+ {
+ /* Use the given hypothetical wait queue order */
+ PGPROC **procs = waitOrders[i].procs;
+
+ queue_size = waitOrders[i].nProcs;
+
+ for (i = 0; i < queue_size; i++)
+ {
+ PGPROC *leader;
+
+ proc = procs[i];
+ leader = proc->lockGroupLeader == NULL ? proc :
+ proc->lockGroupLeader;
+
+ /*
+ * TopoSort will always return an ordering with group members
+ * adjacent to each other in the wait queue (see comments
+ * therein). So, as soon as we reach a process in the same lock
+ * group as checkProc, we know we've found all the conflicts that
+ * precede any member of the lock group led by checkProcLeader.
+ */
+ if (leader == checkProcLeader)
+ break;
+
+ /* Is there a conflict with this guy's request? */
+ if ((LOCKBIT_ON(proc->waitLockMode) & conflictMask) != 0)
+ {
+ /* This proc soft-blocks checkProc */
+ if (FindLockCycleRecurse(proc, depth + 1,
+ softEdges, nSoftEdges))
+ {
+ /* fill deadlockDetails[] */
+ DEADLOCK_INFO *info = &deadlockDetails[depth];
+
+ info->locktag = lock->tag;
+ info->lockmode = checkProc->waitLockMode;
+ info->pid = checkProc->pid;
+
+ /*
+ * Add this edge to the list of soft edges in the cycle
+ */
+ Assert(*nSoftEdges < MaxBackends);
+ softEdges[*nSoftEdges].waiter = checkProcLeader;
+ softEdges[*nSoftEdges].blocker = leader;
+ softEdges[*nSoftEdges].lock = lock;
+ (*nSoftEdges)++;
+ return true;
+ }
+ }
+ }
+ }
+ else
+ {
+ PGPROC *lastGroupMember = NULL;
+
+ /* Use the true lock wait queue order */
+ waitQueue = &(lock->waitProcs);
+
+ /*
+ * Find the last member of the lock group that is present in the wait
+ * queue. Anything after this is not a soft lock conflict. If group
+ * locking is not in use, then we know immediately which process we're
+ * looking for, but otherwise we've got to search the wait queue to
+ * find the last process actually present.
+ */
+ if (checkProc->lockGroupLeader == NULL)
+ lastGroupMember = checkProc;
+ else
+ {
+ proc = (PGPROC *) waitQueue->links.next;
+ queue_size = waitQueue->size;
+ while (queue_size-- > 0)
+ {
+ if (proc->lockGroupLeader == checkProcLeader)
+ lastGroupMember = proc;
+ proc = (PGPROC *) proc->links.next;
+ }
+ Assert(lastGroupMember != NULL);
+ }
+
+ /*
+ * OK, now rescan (or scan) the queue to identify the soft conflicts.
+ */
+ queue_size = waitQueue->size;
+ proc = (PGPROC *) waitQueue->links.next;
+ while (queue_size-- > 0)
+ {
+ PGPROC *leader;
+
+ leader = proc->lockGroupLeader == NULL ? proc :
+ proc->lockGroupLeader;
+
+ /* Done when we reach the target proc */
+ if (proc == lastGroupMember)
+ break;
+
+ /* Is there a conflict with this guy's request? */
+ if ((LOCKBIT_ON(proc->waitLockMode) & conflictMask) != 0 &&
+ leader != checkProcLeader)
+ {
+ /* This proc soft-blocks checkProc */
+ if (FindLockCycleRecurse(proc, depth + 1,
+ softEdges, nSoftEdges))
+ {
+ /* fill deadlockDetails[] */
+ DEADLOCK_INFO *info = &deadlockDetails[depth];
+
+ info->locktag = lock->tag;
+ info->lockmode = checkProc->waitLockMode;
+ info->pid = checkProc->pid;
+
+ /*
+ * Add this edge to the list of soft edges in the cycle
+ */
+ Assert(*nSoftEdges < MaxBackends);
+ softEdges[*nSoftEdges].waiter = checkProcLeader;
+ softEdges[*nSoftEdges].blocker = leader;
+ softEdges[*nSoftEdges].lock = lock;
+ (*nSoftEdges)++;
+ return true;
+ }
+ }
+
+ proc = (PGPROC *) proc->links.next;
+ }
+ }
+
+ /*
+ * No conflict detected here.
+ */
+ return false;
+}
+
+
+/*
+ * ExpandConstraints -- expand a list of constraints into a set of
+ * specific new orderings for affected wait queues
+ *
+ * Input is a list of soft edges to be reversed. The output is a list
+ * of nWaitOrders WAIT_ORDER structs in waitOrders[], with PGPROC array
+ * workspace in waitOrderProcs[].
+ *
+ * Returns true if able to build an ordering that satisfies all the
+ * constraints, false if not (there are contradictory constraints).
+ */
+static bool
+ExpandConstraints(EDGE *constraints,
+ int nConstraints)
+{
+ int nWaitOrderProcs = 0;
+ int i,
+ j;
+
+ nWaitOrders = 0;
+
+ /*
+ * Scan constraint list backwards. This is because the last-added
+ * constraint is the only one that could fail, and so we want to test it
+ * for inconsistency first.
+ */
+ for (i = nConstraints; --i >= 0;)
+ {
+ LOCK *lock = constraints[i].lock;
+
+ /* Did we already make a list for this lock? */
+ for (j = nWaitOrders; --j >= 0;)
+ {
+ if (waitOrders[j].lock == lock)
+ break;
+ }
+ if (j >= 0)
+ continue;
+ /* No, so allocate a new list */
+ waitOrders[nWaitOrders].lock = lock;
+ waitOrders[nWaitOrders].procs = waitOrderProcs + nWaitOrderProcs;
+ waitOrders[nWaitOrders].nProcs = lock->waitProcs.size;
+ nWaitOrderProcs += lock->waitProcs.size;
+ Assert(nWaitOrderProcs <= MaxBackends);
+
+ /*
+ * Do the topo sort. TopoSort need not examine constraints after this
+ * one, since they must be for different locks.
+ */
+ if (!TopoSort(lock, constraints, i + 1,
+ waitOrders[nWaitOrders].procs))
+ return false;
+ nWaitOrders++;
+ }
+ return true;
+}
+
+
+/*
+ * TopoSort -- topological sort of a wait queue
+ *
+ * Generate a re-ordering of a lock's wait queue that satisfies given
+ * constraints about certain procs preceding others. (Each such constraint
+ * is a fact of a partial ordering.) Minimize rearrangement of the queue
+ * not needed to achieve the partial ordering.
+ *
+ * This is a lot simpler and slower than, for example, the topological sort
+ * algorithm shown in Knuth's Volume 1. However, Knuth's method doesn't
+ * try to minimize the damage to the existing order. In practice we are
+ * not likely to be working with more than a few constraints, so the apparent
+ * slowness of the algorithm won't really matter.
+ *
+ * The initial queue ordering is taken directly from the lock's wait queue.
+ * The output is an array of PGPROC pointers, of length equal to the lock's
+ * wait queue length (the caller is responsible for providing this space).
+ * The partial order is specified by an array of EDGE structs. Each EDGE
+ * is one that we need to reverse, therefore the "waiter" must appear before
+ * the "blocker" in the output array. The EDGE array may well contain
+ * edges associated with other locks; these should be ignored.
+ *
+ * Returns true if able to build an ordering that satisfies all the
+ * constraints, false if not (there are contradictory constraints).
+ */
+static bool
+TopoSort(LOCK *lock,
+ EDGE *constraints,
+ int nConstraints,
+ PGPROC **ordering) /* output argument */
+{
+ PROC_QUEUE *waitQueue = &(lock->waitProcs);
+ int queue_size = waitQueue->size;
+ PGPROC *proc;
+ int i,
+ j,
+ jj,
+ k,
+ kk,
+ last;
+
+ /* First, fill topoProcs[] array with the procs in their current order */
+ proc = (PGPROC *) waitQueue->links.next;
+ for (i = 0; i < queue_size; i++)
+ {
+ topoProcs[i] = proc;
+ proc = (PGPROC *) proc->links.next;
+ }
+
+ /*
+ * Scan the constraints, and for each proc in the array, generate a count
+ * of the number of constraints that say it must be before something else,
+ * plus a list of the constraints that say it must be after something
+ * else. The count for the j'th proc is stored in beforeConstraints[j],
+ * and the head of its list in afterConstraints[j]. Each constraint
+ * stores its list link in constraints[i].link (note any constraint will
+ * be in just one list). The array index for the before-proc of the i'th
+ * constraint is remembered in constraints[i].pred.
+ *
+ * Note that it's not necessarily the case that every constraint affects
+ * this particular wait queue. Prior to group locking, a process could be
+ * waiting for at most one lock. But a lock group can be waiting for
+ * zero, one, or multiple locks. Since topoProcs[] is an array of the
+ * processes actually waiting, while constraints[] is an array of group
+ * leaders, we've got to scan through topoProcs[] for each constraint,
+ * checking whether both a waiter and a blocker for that group are
+ * present. If so, the constraint is relevant to this wait queue; if not,
+ * it isn't.
+ */
+ MemSet(beforeConstraints, 0, queue_size * sizeof(int));
+ MemSet(afterConstraints, 0, queue_size * sizeof(int));
+ for (i = 0; i < nConstraints; i++)
+ {
+ /*
+ * Find a representative process that is on the lock queue and part of
+ * the waiting lock group. This may or may not be the leader, which
+ * may or may not be waiting at all. If there are any other processes
+ * in the same lock group on the queue, set their number of
+ * beforeConstraints to -1 to indicate that they should be emitted
+ * with their groupmates rather than considered separately.
+ *
+ * In this loop and the similar one just below, it's critical that we
+ * consistently select the same representative member of any one lock
+ * group, so that all the constraints are associated with the same
+ * proc, and the -1's are only associated with not-representative
+ * members. We select the last one in the topoProcs array.
+ */
+ proc = constraints[i].waiter;
+ Assert(proc != NULL);
+ jj = -1;
+ for (j = queue_size; --j >= 0;)
+ {
+ PGPROC *waiter = topoProcs[j];
+
+ if (waiter == proc || waiter->lockGroupLeader == proc)
+ {
+ Assert(waiter->waitLock == lock);
+ if (jj == -1)
+ jj = j;
+ else
+ {
+ Assert(beforeConstraints[j] <= 0);
+ beforeConstraints[j] = -1;
+ }
+ }
+ }
+
+ /* If no matching waiter, constraint is not relevant to this lock. */
+ if (jj < 0)
+ continue;
+
+ /*
+ * Similarly, find a representative process that is on the lock queue
+ * and waiting for the blocking lock group. Again, this could be the
+ * leader but does not need to be.
+ */
+ proc = constraints[i].blocker;
+ Assert(proc != NULL);
+ kk = -1;
+ for (k = queue_size; --k >= 0;)
+ {
+ PGPROC *blocker = topoProcs[k];
+
+ if (blocker == proc || blocker->lockGroupLeader == proc)
+ {
+ Assert(blocker->waitLock == lock);
+ if (kk == -1)
+ kk = k;
+ else
+ {
+ Assert(beforeConstraints[k] <= 0);
+ beforeConstraints[k] = -1;
+ }
+ }
+ }
+
+ /* If no matching blocker, constraint is not relevant to this lock. */
+ if (kk < 0)
+ continue;
+
+ Assert(beforeConstraints[jj] >= 0);
+ beforeConstraints[jj]++; /* waiter must come before */
+ /* add this constraint to list of after-constraints for blocker */
+ constraints[i].pred = jj;
+ constraints[i].link = afterConstraints[kk];
+ afterConstraints[kk] = i + 1;
+ }
+
+ /*--------------------
+ * Now scan the topoProcs array backwards. At each step, output the
+ * last proc that has no remaining before-constraints plus any other
+ * members of the same lock group; then decrease the beforeConstraints
+ * count of each of the procs it was constrained against.
+ * i = index of ordering[] entry we want to output this time
+ * j = search index for topoProcs[]
+ * k = temp for scanning constraint list for proc j
+ * last = last non-null index in topoProcs (avoid redundant searches)
+ *--------------------
+ */
+ last = queue_size - 1;
+ for (i = queue_size - 1; i >= 0;)
+ {
+ int c;
+ int nmatches = 0;
+
+ /* Find next candidate to output */
+ while (topoProcs[last] == NULL)
+ last--;
+ for (j = last; j >= 0; j--)
+ {
+ if (topoProcs[j] != NULL && beforeConstraints[j] == 0)
+ break;
+ }
+
+ /* If no available candidate, topological sort fails */
+ if (j < 0)
+ return false;
+
+ /*
+ * Output everything in the lock group. There's no point in
+ * outputting an ordering where members of the same lock group are not
+ * consecutive on the wait queue: if some other waiter is between two
+ * requests that belong to the same group, then either it conflicts
+ * with both of them and is certainly not a solution; or it conflicts
+ * with at most one of them and is thus isomorphic to an ordering
+ * where the group members are consecutive.
+ */
+ proc = topoProcs[j];
+ if (proc->lockGroupLeader != NULL)
+ proc = proc->lockGroupLeader;
+ Assert(proc != NULL);
+ for (c = 0; c <= last; ++c)
+ {
+ if (topoProcs[c] == proc || (topoProcs[c] != NULL &&
+ topoProcs[c]->lockGroupLeader == proc))
+ {
+ ordering[i - nmatches] = topoProcs[c];
+ topoProcs[c] = NULL;
+ ++nmatches;
+ }
+ }
+ Assert(nmatches > 0);
+ i -= nmatches;
+
+ /* Update beforeConstraints counts of its predecessors */
+ for (k = afterConstraints[j]; k > 0; k = constraints[k - 1].link)
+ beforeConstraints[constraints[k - 1].pred]--;
+ }
+
+ /* Done */
+ return true;
+}
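+
+/*
+ * Worked example (hypothetical two-member queue, for illustration only):
+ * with initial wait queue [A, B] and one relevant constraint saying B
+ * (the waiter) must precede A (the blocker), the first pass produces
+ * beforeConstraints = {0, 1} and records the constraint in A's
+ * afterConstraints list.  The backward scan then emits A into the last
+ * ordering[] slot, decrements B's before-count to zero, and emits B into
+ * the first slot, yielding [B, A] -- the minimal change that reverses
+ * the offending edge.
+ */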
+
+#ifdef DEBUG_DEADLOCK
+static void
+PrintLockQueue(LOCK *lock, const char *info)
+{
+ PROC_QUEUE *waitQueue = &(lock->waitProcs);
+ int queue_size = waitQueue->size;
+ PGPROC *proc;
+ int i;
+
+ printf("%s lock %p queue ", info, lock);
+ proc = (PGPROC *) waitQueue->links.next;
+ for (i = 0; i < queue_size; i++)
+ {
+ printf(" %d", proc->pid);
+ proc = (PGPROC *) proc->links.next;
+ }
+ printf("\n");
+ fflush(stdout);
+}
+#endif
+
+/*
+ * Report a detected deadlock, with available details.
+ */
+void
+DeadLockReport(void)
+{
+ StringInfoData clientbuf; /* errdetail for client */
+ StringInfoData logbuf; /* errdetail for server log */
+ StringInfoData locktagbuf;
+ int i;
+
+ initStringInfo(&clientbuf);
+ initStringInfo(&logbuf);
+ initStringInfo(&locktagbuf);
+
+ /* Generate the "waits for" lines sent to the client */
+ for (i = 0; i < nDeadlockDetails; i++)
+ {
+ DEADLOCK_INFO *info = &deadlockDetails[i];
+ int nextpid;
+
+ /* The last proc waits for the first one... */
+ if (i < nDeadlockDetails - 1)
+ nextpid = info[1].pid;
+ else
+ nextpid = deadlockDetails[0].pid;
+
+ /* reset locktagbuf to hold next object description */
+ resetStringInfo(&locktagbuf);
+
+ DescribeLockTag(&locktagbuf, &info->locktag);
+
+ if (i > 0)
+ appendStringInfoChar(&clientbuf, '\n');
+
+ appendStringInfo(&clientbuf,
+ _("Process %d waits for %s on %s; blocked by process %d."),
+ info->pid,
+ GetLockmodeName(info->locktag.locktag_lockmethodid,
+ info->lockmode),
+ locktagbuf.data,
+ nextpid);
+ }
+
+ /* Duplicate all the above for the server ... */
+ appendBinaryStringInfo(&logbuf, clientbuf.data, clientbuf.len);
+
+ /* ... and add info about query strings */
+ for (i = 0; i < nDeadlockDetails; i++)
+ {
+ DEADLOCK_INFO *info = &deadlockDetails[i];
+
+ appendStringInfoChar(&logbuf, '\n');
+
+ appendStringInfo(&logbuf,
+ _("Process %d: %s"),
+ info->pid,
+ pgstat_get_backend_current_activity(info->pid, false));
+ }
+
+ pgstat_report_deadlock();
+
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
+ errmsg("deadlock detected"),
+ errdetail_internal("%s", clientbuf.data),
+ errdetail_log("%s", logbuf.data),
+ errhint("See server log for query details.")));
+}
+
+/*
+ * RememberSimpleDeadLock: set up info for DeadLockReport when ProcSleep
+ * detects a trivial (two-way) deadlock. proc1 wants to block for lockmode
+ * on lock, but proc2 is already waiting and would be blocked by proc1.
+ */
+void
+RememberSimpleDeadLock(PGPROC *proc1,
+ LOCKMODE lockmode,
+ LOCK *lock,
+ PGPROC *proc2)
+{
+ DEADLOCK_INFO *info = &deadlockDetails[0];
+
+ info->locktag = lock->tag;
+ info->lockmode = lockmode;
+ info->pid = proc1->pid;
+ info++;
+ info->locktag = proc2->waitLock->tag;
+ info->lockmode = proc2->waitLockMode;
+ info->pid = proc2->pid;
+ nDeadlockDetails = 2;
+}
diff --git a/src/backend/storage/lmgr/generate-lwlocknames.pl b/src/backend/storage/lmgr/generate-lwlocknames.pl
new file mode 100644
index 0000000..8a44946
--- /dev/null
+++ b/src/backend/storage/lmgr/generate-lwlocknames.pl
@@ -0,0 +1,71 @@
+#!/usr/bin/perl
+#
+# Generate lwlocknames.h and lwlocknames.c from lwlocknames.txt
+# Copyright (c) 2000-2021, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+my $lastlockidx = -1;
+my $continue = "\n";
+
+open my $lwlocknames, '<', $ARGV[0] or die;
+
+# Include PID in suffix in case parallel make runs this multiple times.
+my $htmp = "lwlocknames.h.tmp$$";
+my $ctmp = "lwlocknames.c.tmp$$";
+open my $h, '>', $htmp or die "Could not open $htmp: $!";
+open my $c, '>', $ctmp or die "Could not open $ctmp: $!";
+
+my $autogen =
+ "/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */\n";
+print $h $autogen;
+print $h "/* there is deliberately not an #ifndef LWLOCKNAMES_H here */\n\n";
+print $c $autogen, "\n";
+
+print $c "const char *const IndividualLWLockNames[] = {";
+
+while (<$lwlocknames>)
+{
+ chomp;
+
+ # Skip comments
+ next if /^#/;
+ next if /^\s*$/;
+
+ die "unable to parse lwlocknames.txt"
+ unless /^(\w+)\s+(\d+)$/;
+
+ (my $lockname, my $lockidx) = ($1, $2);
+
+ my $trimmedlockname = $lockname;
+ $trimmedlockname =~ s/Lock$//;
+ die "lock names must end with 'Lock'" if $trimmedlockname eq $lockname;
+
+ die "lwlocknames.txt not in order" if $lockidx < $lastlockidx;
+ die "lwlocknames.txt has duplicates" if $lockidx == $lastlockidx;
+
+ while ($lastlockidx < $lockidx - 1)
+ {
+ ++$lastlockidx;
+ printf $c "%s \"<unassigned:%d>\"", $continue, $lastlockidx;
+ $continue = ",\n";
+ }
+ printf $c "%s \"%s\"", $continue, $trimmedlockname;
+ $lastlockidx = $lockidx;
+ $continue = ",\n";
+
+ print $h "#define $lockname (&MainLWLockArray[$lockidx].lock)\n";
+}
+
+printf $c "\n};\n";
+print $h "\n";
+printf $h "#define NUM_INDIVIDUAL_LWLOCKS %s\n", $lastlockidx + 1;
+
+close $h;
+close $c;
+
+rename($htmp, 'lwlocknames.h') || die "rename: $htmp: $!";
+rename($ctmp, 'lwlocknames.c') || die "rename: $ctmp: $!";
+
+close $lwlocknames;
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
new file mode 100644
index 0000000..2db0424
--- /dev/null
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -0,0 +1,1196 @@
+/*-------------------------------------------------------------------------
+ *
+ * lmgr.c
+ * POSTGRES lock manager code
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/lmgr.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "catalog/catalog.h"
+#include "commands/progress.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/sinvaladt.h"
+#include "utils/inval.h"
+
+
+/*
+ * Per-backend counter for generating speculative insertion tokens.
+ *
+ * This may wrap around, but that's OK as it's only used for the short
+ * duration between inserting a tuple and checking that there are no (unique)
+ * constraint violations. It's theoretically possible that a backend sees a
+ * tuple that was speculatively inserted by another backend, but before it has
+ * started waiting on the token, the other backend completes its insertion,
+ * and then performs 2^32 unrelated insertions. And after all that, the
+ * first backend finally calls SpeculativeInsertionLockAcquire(), with the
+ * intention of waiting for the first insertion to complete, but ends up
+ * waiting for the latest unrelated insertion instead. Even then, nothing
+ * particularly bad happens: in the worst case they deadlock, causing one of
+ * the transactions to abort.
+ */
+static uint32 speculativeInsertionToken = 0;
+
+
+/*
+ * Struct to hold context info for transaction lock waits.
+ *
+ * 'oper' is the operation that needs to wait for the other transaction; 'rel'
+ * and 'ctid' specify the address of the tuple being waited for.
+ */
+typedef struct XactLockTableWaitInfo
+{
+ XLTW_Oper oper;
+ Relation rel;
+ ItemPointer ctid;
+} XactLockTableWaitInfo;
+
+static void XactLockTableWaitErrorCb(void *arg);
+
+/*
+ * RelationInitLockInfo
+ * Initializes the lock information in a relation descriptor.
+ *
+ * relcache.c must call this during creation of any reldesc.
+ */
+void
+RelationInitLockInfo(Relation relation)
+{
+ Assert(RelationIsValid(relation));
+ Assert(OidIsValid(RelationGetRelid(relation)));
+
+ relation->rd_lockInfo.lockRelId.relId = RelationGetRelid(relation);
+
+ if (relation->rd_rel->relisshared)
+ relation->rd_lockInfo.lockRelId.dbId = InvalidOid;
+ else
+ relation->rd_lockInfo.lockRelId.dbId = MyDatabaseId;
+}
+
+/*
+ * SetLocktagRelationOid
+ * Set up a locktag for a relation, given only relation OID
+ */
+static inline void
+SetLocktagRelationOid(LOCKTAG *tag, Oid relid)
+{
+ Oid dbid;
+
+ if (IsSharedRelation(relid))
+ dbid = InvalidOid;
+ else
+ dbid = MyDatabaseId;
+
+ SET_LOCKTAG_RELATION(*tag, dbid, relid);
+}
+
+/*
+ * LockRelationOid
+ *
+ * Lock a relation given only its OID. This should generally be used
+ * before attempting to open the relation's relcache entry.
+ */
+void
+LockRelationOid(Oid relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+ LOCALLOCK *locallock;
+ LockAcquireResult res;
+
+ SetLocktagRelationOid(&tag, relid);
+
+ res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock);
+
+ /*
+ * Now that we have the lock, check for invalidation messages, so that we
+ * will update or flush any stale relcache entry before we try to use it.
+ * RangeVarGetRelid() specifically relies on us for this. We can skip
+ * this in the not-uncommon case that we already had the same type of lock
+ * being requested, since then no one else could have modified the
+ * relcache entry in an undesirable way. (In the case where our own xact
+ * modifies the rel, the relcache update happens via
+ * CommandCounterIncrement, not here.)
+ *
+ * However, in corner cases where code acts on tables (usually catalogs)
+ * recursively, we might get here while still processing invalidation
+ * messages in some outer execution of this function or a sibling. The
+ * "cleared" status of the lock tells us whether we really are done
+ * absorbing relevant inval messages.
+ */
+ if (res != LOCKACQUIRE_ALREADY_CLEAR)
+ {
+ AcceptInvalidationMessages();
+ MarkLockClear(locallock);
+ }
+}
+
+/*
+ * ConditionalLockRelationOid
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ *
+ * NOTE: we do not currently need conditional versions of all the
+ * LockXXX routines in this file, but they could easily be added if needed.
+ */
+bool
+ConditionalLockRelationOid(Oid relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+ LOCALLOCK *locallock;
+ LockAcquireResult res;
+
+ SetLocktagRelationOid(&tag, relid);
+
+ res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock);
+
+ if (res == LOCKACQUIRE_NOT_AVAIL)
+ return false;
+
+ /*
+ * Now that we have the lock, check for invalidation messages; see notes
+ * in LockRelationOid.
+ */
+ if (res != LOCKACQUIRE_ALREADY_CLEAR)
+ {
+ AcceptInvalidationMessages();
+ MarkLockClear(locallock);
+ }
+
+ return true;
+}
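+
+/*
+ * A minimal caller sketch (hypothetical; the helper name and the choice
+ * of ShareLock are illustrative assumptions, not existing backend code):
+ * take the lock only if it is immediately available and let the caller
+ * decide what to do otherwise.
+ *
+ *	static bool
+ *	try_process_relation(Oid relid)
+ *	{
+ *		if (!ConditionalLockRelationOid(relid, ShareLock))
+ *			return false;	(lock not available; skip for now)
+ *		... inspect or modify the relation ...
+ *		UnlockRelationOid(relid, ShareLock);
+ *		return true;
+ *	}
+ */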
+
+/*
+ * UnlockRelationId
+ *
+ * Unlock, given a LockRelId. This is preferred over UnlockRelationOid
+ * for speed reasons.
+ */
+void
+UnlockRelationId(LockRelId *relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * UnlockRelationOid
+ *
+ * Unlock, given only a relation Oid. Use UnlockRelationId if you can.
+ */
+void
+UnlockRelationOid(Oid relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SetLocktagRelationOid(&tag, relid);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * LockRelation
+ *
+ * This is a convenience routine for acquiring an additional lock on an
+ * already-open relation. Never try to do "relation_open(foo, NoLock)"
+ * and then lock with this.
+ */
+void
+LockRelation(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+ LOCALLOCK *locallock;
+ LockAcquireResult res;
+
+ SET_LOCKTAG_RELATION(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock);
+
+ /*
+ * Now that we have the lock, check for invalidation messages; see notes
+ * in LockRelationOid.
+ */
+ if (res != LOCKACQUIRE_ALREADY_CLEAR)
+ {
+ AcceptInvalidationMessages();
+ MarkLockClear(locallock);
+ }
+}
+
+/*
+ * ConditionalLockRelation
+ *
+ * This is a convenience routine for acquiring an additional lock on an
+ * already-open relation. Never try to do "relation_open(foo, NoLock)"
+ * and then lock with this.
+ */
+bool
+ConditionalLockRelation(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+ LOCALLOCK *locallock;
+ LockAcquireResult res;
+
+ SET_LOCKTAG_RELATION(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock);
+
+ if (res == LOCKACQUIRE_NOT_AVAIL)
+ return false;
+
+ /*
+ * Now that we have the lock, check for invalidation messages; see notes
+ * in LockRelationOid.
+ */
+ if (res != LOCKACQUIRE_ALREADY_CLEAR)
+ {
+ AcceptInvalidationMessages();
+ MarkLockClear(locallock);
+ }
+
+ return true;
+}
+
+/*
+ * UnlockRelation
+ *
+ * This is a convenience routine for unlocking a relation without also
+ * closing it.
+ */
+void
+UnlockRelation(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * CheckRelationLockedByMe
+ *
+ * Returns true if current transaction holds a lock on 'relation' of mode
+ * 'lockmode'. If 'orstronger' is true, a stronger lockmode is also OK.
+ * ("Stronger" is defined as "numerically higher", which is a bit
+ * semantically dubious but is OK for the purposes we use this for.)
+ */
+bool
+CheckRelationLockedByMe(Relation relation, LOCKMODE lockmode, bool orstronger)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ if (LockHeldByMe(&tag, lockmode))
+ return true;
+
+ if (orstronger)
+ {
+ LOCKMODE slockmode;
+
+ for (slockmode = lockmode + 1;
+ slockmode <= MaxLockMode;
+ slockmode++)
+ {
+ if (LockHeldByMe(&tag, slockmode))
+ {
+#ifdef NOT_USED
+ /* Sometimes this might be useful for debugging purposes */
+ elog(WARNING, "lock mode %s substituted for %s on relation %s",
+ GetLockmodeName(tag.locktag_lockmethodid, slockmode),
+ GetLockmodeName(tag.locktag_lockmethodid, lockmode),
+ RelationGetRelationName(relation));
+#endif
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
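+
+/*
+ * Typical use is assertion-style verification (a hedged sketch; the
+ * surrounding caller is hypothetical):
+ *
+ *	Assert(CheckRelationLockedByMe(rel, AccessShareLock, true));
+ *
+ * i.e. "we hold at least AccessShareLock on rel", with any numerically
+ * stronger mode accepted because orstronger is true.
+ */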
+
+/*
+ * LockHasWaitersRelation
+ *
+ * This is a function to check whether someone else is waiting for a
+ * lock which we are currently holding.
+ */
+bool
+LockHasWaitersRelation(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ return LockHasWaiters(&tag, lockmode, false);
+}
+
+/*
+ * LockRelationIdForSession
+ *
+ * This routine grabs a session-level lock on the target relation. The
+ * session lock persists across transaction boundaries. It will be removed
+ * when UnlockRelationIdForSession() is called, or if an ereport(ERROR) occurs,
+ * or if the backend exits.
+ *
+ * Note that one should also grab a transaction-level lock on the rel
+ * in any transaction that actually uses the rel, to ensure that the
+ * relcache entry is up to date.
+ */
+void
+LockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId);
+
+ (void) LockAcquire(&tag, lockmode, true, false);
+}
+
+/*
+ * UnlockRelationIdForSession
+ */
+void
+UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId);
+
+ LockRelease(&tag, lockmode, true);
+}
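+
+/*
+ * Sketch of the intended pairing (hypothetical caller; the LockRelId is
+ * assumed to come from an already-open Relation, as elsewhere in this
+ * file):
+ *
+ *	LockRelId	relid = rel->rd_lockInfo.lockRelId;
+ *
+ *	LockRelationIdForSession(&relid, RowExclusiveLock);
+ *	... work that must outlive individual transactions ...
+ *	UnlockRelationIdForSession(&relid, RowExclusiveLock);
+ *
+ * Because the lock is session-level, only the explicit unlock, an
+ * ereport(ERROR), or backend exit releases it.
+ */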
+
+/*
+ * LockRelationForExtension
+ *
+ * This lock tag is used to interlock addition of pages to relations.
+ * We need such locking because bufmgr/smgr definition of P_NEW is not
+ * race-condition-proof.
+ *
+ * We assume the caller is already holding some type of regular lock on
+ * the relation, so no AcceptInvalidationMessages call is needed here.
+ */
+void
+LockRelationForExtension(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION_EXTEND(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+}
+
+/*
+ * ConditionalLockRelationForExtension
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ */
+bool
+ConditionalLockRelationForExtension(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION_EXTEND(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL);
+}
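+
+/*
+ * Illustrative call pattern (hypothetical; real callers live in the heap
+ * and index access methods): try the non-blocking form first and fall
+ * back to waiting, typically in ExclusiveLock mode.
+ *
+ *	if (!ConditionalLockRelationForExtension(rel, ExclusiveLock))
+ *		LockRelationForExtension(rel, ExclusiveLock);
+ *	... add the new page(s) ...
+ *	UnlockRelationForExtension(rel, ExclusiveLock);
+ */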
+
+/*
+ * RelationExtensionLockWaiterCount
+ *
+ * Count the number of processes waiting for the given relation extension lock.
+ */
+int
+RelationExtensionLockWaiterCount(Relation relation)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION_EXTEND(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ return LockWaiterCount(&tag);
+}
+
+/*
+ * UnlockRelationForExtension
+ */
+void
+UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION_EXTEND(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * LockDatabaseFrozenIds
+ *
+ * This allows one backend per database to execute vac_update_datfrozenxid().
+ */
+void
+LockDatabaseFrozenIds(LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+}
+
+/*
+ * LockPage
+ *
+ * Obtain a page-level lock. This is currently used by some index access
+ * methods to lock individual index pages.
+ */
+void
+LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_PAGE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ blkno);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+}
+
+/*
+ * ConditionalLockPage
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ */
+bool
+ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_PAGE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ blkno);
+
+ return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL);
+}
+
+/*
+ * UnlockPage
+ */
+void
+UnlockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_PAGE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ blkno);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * LockTuple
+ *
+ * Obtain a tuple-level lock. This is used in a less-than-intuitive fashion
+ * because we can't afford to keep a separate lock in shared memory for every
+ * tuple. See heap_lock_tuple before using this!
+ */
+void
+LockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_TUPLE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+}
+
+/*
+ * ConditionalLockTuple
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ */
+bool
+ConditionalLockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_TUPLE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+
+ return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL);
+}
+
+/*
+ * UnlockTuple
+ */
+void
+UnlockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_TUPLE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * XactLockTableInsert
+ *
+ * Insert a lock showing that the given transaction ID is running ---
+ * this is done when an XID is acquired by a transaction or subtransaction.
+ * The lock can then be used to wait for the transaction to finish.
+ */
+void
+XactLockTableInsert(TransactionId xid)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_TRANSACTION(tag, xid);
+
+ (void) LockAcquire(&tag, ExclusiveLock, false, false);
+}
+
+/*
+ * XactLockTableDelete
+ *
+ * Delete the lock showing that the given transaction ID is running.
+ * (This is never used for main transaction IDs; those locks are only
+ * released implicitly at transaction end. But we do use it for subtrans IDs.)
+ */
+void
+XactLockTableDelete(TransactionId xid)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_TRANSACTION(tag, xid);
+
+ LockRelease(&tag, ExclusiveLock, false);
+}
+
+/*
+ * XactLockTableWait
+ *
+ * Wait for the specified transaction to commit or abort. If an operation
+ * is specified, an error context callback is set up. If 'oper' is passed as
+ * None, no error context callback is set up.
+ *
+ * Note that this does the right thing for subtransactions: if we wait on a
+ * subtransaction, we will exit as soon as it aborts or its top parent commits.
+ * It takes some extra work to ensure this, because to save on shared memory
+ * the XID lock of a subtransaction is released when it ends, whether
+ * successfully or unsuccessfully. So we have to check if it's "still running"
+ * and if so wait for its parent.
+ */
+void
+XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid,
+ XLTW_Oper oper)
+{
+ LOCKTAG tag;
+ XactLockTableWaitInfo info;
+ ErrorContextCallback callback;
+ bool first = true;
+
+ /*
+ * If an operation is specified, set up our verbose error context
+ * callback.
+ */
+ if (oper != XLTW_None)
+ {
+ Assert(RelationIsValid(rel));
+ Assert(ItemPointerIsValid(ctid));
+
+ info.rel = rel;
+ info.ctid = ctid;
+ info.oper = oper;
+
+ callback.callback = XactLockTableWaitErrorCb;
+ callback.arg = &info;
+ callback.previous = error_context_stack;
+ error_context_stack = &callback;
+ }
+
+ for (;;)
+ {
+ Assert(TransactionIdIsValid(xid));
+ Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
+
+ SET_LOCKTAG_TRANSACTION(tag, xid);
+
+ (void) LockAcquire(&tag, ShareLock, false, false);
+
+ LockRelease(&tag, ShareLock, false);
+
+ if (!TransactionIdIsInProgress(xid))
+ break;
+
+ /*
+ * If the Xid belonged to a subtransaction, then the lock would have
+ * gone away as soon as it was finished; for correct tuple visibility,
+ * the right action is to wait on its parent transaction to go away.
+ * But instead of going levels up one by one, we can just wait for the
+ * topmost transaction to finish with the same end result, which also
+ * incurs less locktable traffic.
+ *
+ * Some uses of this function don't involve tuple visibility -- such
+ * as when building snapshots for logical decoding. It is possible to
+ * see a transaction in ProcArray before it registers itself in the
+ * locktable. The topmost transaction in that case is the same xid,
+ * so we try again after a short sleep. (Don't sleep the first time
+ * through, to avoid slowing down the normal case.)
+ */
+ if (!first)
+ pg_usleep(1000L);
+ first = false;
+ xid = SubTransGetTopmostTransaction(xid);
+ }
+
+ if (oper != XLTW_None)
+ error_context_stack = callback.previous;
+}
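+
+/*
+ * Hedged usage sketch (xwait, rel and tid are assumptions supplied by a
+ * hypothetical tuple-update caller): wait out the transaction that last
+ * modified the tuple, with an error context identifying what we were
+ * doing.
+ *
+ *	XactLockTableWait(xwait, rel, &tid, XLTW_Update);
+ */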
+
+/*
+ * ConditionalXactLockTableWait
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true if the lock was acquired.
+ */
+bool
+ConditionalXactLockTableWait(TransactionId xid)
+{
+ LOCKTAG tag;
+ bool first = true;
+
+ for (;;)
+ {
+ Assert(TransactionIdIsValid(xid));
+ Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
+
+ SET_LOCKTAG_TRANSACTION(tag, xid);
+
+ if (LockAcquire(&tag, ShareLock, false, true) == LOCKACQUIRE_NOT_AVAIL)
+ return false;
+
+ LockRelease(&tag, ShareLock, false);
+
+ if (!TransactionIdIsInProgress(xid))
+ break;
+
+ /* See XactLockTableWait about this case */
+ if (!first)
+ pg_usleep(1000L);
+ first = false;
+ xid = SubTransGetTopmostTransaction(xid);
+ }
+
+ return true;
+}
+
+/*
+ * SpeculativeInsertionLockAcquire
+ *
+ * Insert a lock showing that the given transaction ID is inserting a tuple,
+ * but hasn't yet decided whether it's going to keep it. The lock can then be
+ * used to wait for the decision to go ahead with the insertion, or aborting
+ * it.
+ *
+ * The token is used to distinguish multiple insertions by the same
+ * transaction. It is returned to caller.
+ */
+uint32
+SpeculativeInsertionLockAcquire(TransactionId xid)
+{
+ LOCKTAG tag;
+
+ speculativeInsertionToken++;
+
+ /*
+ * Check for wrap-around. Zero means no token is held, so don't use that.
+ */
+ if (speculativeInsertionToken == 0)
+ speculativeInsertionToken = 1;
+
+ SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, speculativeInsertionToken);
+
+ (void) LockAcquire(&tag, ExclusiveLock, false, false);
+
+ return speculativeInsertionToken;
+}
+
+/*
+ * SpeculativeInsertionLockRelease
+ *
+ * Delete the lock showing that the given transaction is speculatively
+ * inserting a tuple.
+ */
+void
+SpeculativeInsertionLockRelease(TransactionId xid)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, speculativeInsertionToken);
+
+ LockRelease(&tag, ExclusiveLock, false);
+}
+
+/*
+ * SpeculativeInsertionWait
+ *
+ * Wait for the specified transaction to finish or abort the insertion of a
+ * tuple.
+ */
+void
+SpeculativeInsertionWait(TransactionId xid, uint32 token)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, token);
+
+ Assert(TransactionIdIsValid(xid));
+ Assert(token != 0);
+
+ (void) LockAcquire(&tag, ShareLock, false, false);
+ LockRelease(&tag, ShareLock, false);
+}
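+
+/*
+ * Sketch of the token lifecycle (hypothetical inserter/waiter pair, for
+ * illustration only):
+ *
+ *	inserter:  token = SpeculativeInsertionLockAcquire(xid);
+ *	           ... insert the tuple, stamped with token ...
+ *	           SpeculativeInsertionLockRelease(xid);
+ *
+ *	waiter:    SpeculativeInsertionWait(xid, token);
+ *	           (returns once the inserter has decided whether to keep
+ *	           or kill the tuple)
+ */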
+
+/*
+ * XactLockTableWaitErrorCb
+ * Error context callback for transaction lock waits.
+ */
+static void
+XactLockTableWaitErrorCb(void *arg)
+{
+ XactLockTableWaitInfo *info = (XactLockTableWaitInfo *) arg;
+
+ /*
+ * We would like to print schema name too, but that would require a
+ * syscache lookup.
+ */
+ if (info->oper != XLTW_None &&
+ ItemPointerIsValid(info->ctid) && RelationIsValid(info->rel))
+ {
+ const char *cxt;
+
+ switch (info->oper)
+ {
+ case XLTW_Update:
+ cxt = gettext_noop("while updating tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_Delete:
+ cxt = gettext_noop("while deleting tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_Lock:
+ cxt = gettext_noop("while locking tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_LockUpdated:
+ cxt = gettext_noop("while locking updated version (%u,%u) of tuple in relation \"%s\"");
+ break;
+ case XLTW_InsertIndex:
+ cxt = gettext_noop("while inserting index tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_InsertIndexUnique:
+ cxt = gettext_noop("while checking uniqueness of tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_FetchUpdated:
+ cxt = gettext_noop("while rechecking updated tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_RecheckExclusionConstr:
+ cxt = gettext_noop("while checking exclusion constraint on tuple (%u,%u) in relation \"%s\"");
+ break;
+
+ default:
+ return;
+ }
+
+ errcontext(cxt,
+ ItemPointerGetBlockNumber(info->ctid),
+ ItemPointerGetOffsetNumber(info->ctid),
+ RelationGetRelationName(info->rel));
+ }
+}
+
+/*
+ * WaitForLockersMultiple
+ * Wait until no transaction holds locks that conflict with the given
+ * locktags at the given lockmode.
+ *
+ * To do this, obtain the current list of lockers, and wait on their VXIDs
+ * until they are finished.
+ *
+ * Note we don't try to acquire the locks on the given locktags, only the
+ * VXIDs and XIDs of their lock holders; if somebody grabs a conflicting lock
+ * on the objects after we obtained our initial list of lockers, we will not
+ * wait for them.
+ */
+void
+WaitForLockersMultiple(List *locktags, LOCKMODE lockmode, bool progress)
+{
+ List *holders = NIL;
+ ListCell *lc;
+ int total = 0;
+ int done = 0;
+
+ /* Done if no locks to wait for */
+ if (list_length(locktags) == 0)
+ return;
+
+ /* Collect the transactions we need to wait on */
+ foreach(lc, locktags)
+ {
+ LOCKTAG *locktag = lfirst(lc);
+ int count;
+
+ holders = lappend(holders,
+ GetLockConflicts(locktag, lockmode,
+ progress ? &count : NULL));
+ if (progress)
+ total += count;
+ }
+
+ if (progress)
+ pgstat_progress_update_param(PROGRESS_WAITFOR_TOTAL, total);
+
+ /*
+ * Note: GetLockConflicts() never reports our own xid, hence we need not
+ * check for that. Also, prepared xacts are reported and awaited.
+ */
+
+ /* Finally wait for each such transaction to complete */
+ foreach(lc, holders)
+ {
+ VirtualTransactionId *lockholders = lfirst(lc);
+
+ while (VirtualTransactionIdIsValid(*lockholders))
+ {
+ /* If requested, publish who we're going to wait for. */
+ if (progress)
+ {
+ PGPROC *holder = BackendIdGetProc(lockholders->backendId);
+
+ if (holder)
+ pgstat_progress_update_param(PROGRESS_WAITFOR_CURRENT_PID,
+ holder->pid);
+ }
+ VirtualXactLock(*lockholders, true);
+ lockholders++;
+
+ if (progress)
+ pgstat_progress_update_param(PROGRESS_WAITFOR_DONE, ++done);
+ }
+ }
+ if (progress)
+ {
+ const int index[] = {
+ PROGRESS_WAITFOR_TOTAL,
+ PROGRESS_WAITFOR_DONE,
+ PROGRESS_WAITFOR_CURRENT_PID
+ };
+ const int64 values[] = {
+ 0, 0, 0
+ };
+
+ pgstat_progress_update_multi_param(3, index, values);
+ }
+
+ list_free_deep(holders);
+}
+
+/*
+ * WaitForLockers
+ *
+ * Same as WaitForLockersMultiple, for a single lock tag.
+ */
+void
+WaitForLockers(LOCKTAG heaplocktag, LOCKMODE lockmode, bool progress)
+{
+ List *l;
+
+ l = list_make1(&heaplocktag);
+ WaitForLockersMultiple(l, lockmode, progress);
+ list_free(l);
+}
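+
+/*
+ * Hedged example (heapOid is a hypothetical variable; the tag setup
+ * mirrors the relation locktags used earlier in this file): wait for
+ * every transaction currently holding a lock that would conflict with
+ * AccessExclusiveLock on the given heap.
+ *
+ *	LOCKTAG		heaplocktag;
+ *
+ *	SET_LOCKTAG_RELATION(heaplocktag, MyDatabaseId, heapOid);
+ *	WaitForLockers(heaplocktag, AccessExclusiveLock, false);
+ */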
+
+
+/*
+ * LockDatabaseObject
+ *
+ * Obtain a lock on a general object of the current database. Don't use
+ * this for shared objects (such as tablespaces). It's unwise to apply it
+ * to relations, also, since a lock taken this way will NOT conflict with
+ * locks taken via LockRelation and friends.
+ */
+void
+LockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ MyDatabaseId,
+ classid,
+ objid,
+ objsubid);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+
+ /* Make sure syscaches are up-to-date with any changes we waited for */
+ AcceptInvalidationMessages();
+}
+
+/*
+ * UnlockDatabaseObject
+ */
+void
+UnlockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ MyDatabaseId,
+ classid,
+ objid,
+ objsubid);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * LockSharedObject
+ *
+ * Obtain a lock on a shared-across-databases object.
+ */
+void
+LockSharedObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ InvalidOid,
+ classid,
+ objid,
+ objsubid);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+
+ /* Make sure syscaches are up-to-date with any changes we waited for */
+ AcceptInvalidationMessages();
+}
+
+/*
+ * UnlockSharedObject
+ */
+void
+UnlockSharedObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ InvalidOid,
+ classid,
+ objid,
+ objsubid);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * LockSharedObjectForSession
+ *
+ * Obtain a session-level lock on a shared-across-databases object.
+ * See LockRelationIdForSession for notes about session-level locks.
+ */
+void
+LockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ InvalidOid,
+ classid,
+ objid,
+ objsubid);
+
+ (void) LockAcquire(&tag, lockmode, true, false);
+}
+
+/*
+ * UnlockSharedObjectForSession
+ */
+void
+UnlockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ InvalidOid,
+ classid,
+ objid,
+ objsubid);
+
+ LockRelease(&tag, lockmode, true);
+}
+
+
+/*
+ * Append a description of a lockable object to buf.
+ *
+ * Ideally we would print names for the numeric values, but that requires
+ * getting locks on system tables, which might cause problems since this is
+ * typically used to report deadlock situations.
+ */
+void
+DescribeLockTag(StringInfo buf, const LOCKTAG *tag)
+{
+ switch ((LockTagType) tag->locktag_type)
+ {
+ case LOCKTAG_RELATION:
+ appendStringInfo(buf,
+ _("relation %u of database %u"),
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_RELATION_EXTEND:
+ appendStringInfo(buf,
+ _("extension of relation %u of database %u"),
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_DATABASE_FROZEN_IDS:
+ appendStringInfo(buf,
+ _("pg_database.datfrozenxid of database %u"),
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_PAGE:
+ appendStringInfo(buf,
+ _("page %u of relation %u of database %u"),
+ tag->locktag_field3,
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_TUPLE:
+ appendStringInfo(buf,
+ _("tuple (%u,%u) of relation %u of database %u"),
+ tag->locktag_field3,
+ tag->locktag_field4,
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_TRANSACTION:
+ appendStringInfo(buf,
+ _("transaction %u"),
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_VIRTUALTRANSACTION:
+ appendStringInfo(buf,
+ _("virtual transaction %d/%u"),
+ tag->locktag_field1,
+ tag->locktag_field2);
+ break;
+ case LOCKTAG_SPECULATIVE_TOKEN:
+ appendStringInfo(buf,
+ _("speculative token %u of transaction %u"),
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_OBJECT:
+ appendStringInfo(buf,
+ _("object %u of class %u of database %u"),
+ tag->locktag_field3,
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_USERLOCK:
+ /* reserved for old contrib code, now on pgfoundry */
+ appendStringInfo(buf,
+ _("user lock [%u,%u,%u]"),
+ tag->locktag_field1,
+ tag->locktag_field2,
+ tag->locktag_field3);
+ break;
+ case LOCKTAG_ADVISORY:
+ appendStringInfo(buf,
+ _("advisory lock [%u,%u,%u,%u]"),
+ tag->locktag_field1,
+ tag->locktag_field2,
+ tag->locktag_field3,
+ tag->locktag_field4);
+ break;
+ default:
+ appendStringInfo(buf,
+ _("unrecognized locktag type %d"),
+ (int) tag->locktag_type);
+ break;
+ }
+}
+
+/*
+ * GetLockNameFromTagType
+ *
+ * Given locktag type, return the corresponding lock name.
+ */
+const char *
+GetLockNameFromTagType(uint16 locktag_type)
+{
+ if (locktag_type > LOCKTAG_LAST_TYPE)
+ return "???";
+ return LockTagTypeNames[locktag_type];
+}
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
new file mode 100644
index 0000000..818666f
--- /dev/null
+++ b/src/backend/storage/lmgr/lock.c
@@ -0,0 +1,4738 @@
+/*-------------------------------------------------------------------------
+ *
+ * lock.c
+ * POSTGRES primary lock mechanism
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/lock.c
+ *
+ * NOTES
+ * A lock table is a shared memory hash table. When
+ * a process tries to acquire a lock of a type that conflicts
+ * with existing locks, it is put to sleep using the routines
+ * in storage/lmgr/proc.c.
+ *
+ * For the most part, this code should be invoked via lmgr.c
+ * or another lock-management module, not directly.
+ *
+ * Interface:
+ *
+ * InitLocks(), GetLocksMethodTable(), GetLockTagsMethodTable(),
+ * LockAcquire(), LockRelease(), LockReleaseAll(),
+ * LockCheckConflicts(), GrantLock()
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/twophase_rmgr.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/sinvaladt.h"
+#include "storage/spin.h"
+#include "storage/standby.h"
+#include "utils/memutils.h"
+#include "utils/ps_status.h"
+#include "utils/resowner_private.h"
+
+
+/* This configuration variable is used to set the lock table size */
+int max_locks_per_xact; /* set by guc.c */
+
+#define NLOCKENTS() \
+ mul_size(max_locks_per_xact, add_size(MaxBackends, max_prepared_xacts))
+
+
+/*
+ * Data structures defining the semantics of the standard lock methods.
+ *
+ * The conflict table defines the semantics of the various lock modes.
+ */
+static const LOCKMASK LockConflicts[] = {
+ 0,
+
+ /* AccessShareLock */
+ LOCKBIT_ON(AccessExclusiveLock),
+
+ /* RowShareLock */
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* RowExclusiveLock */
+ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* ShareUpdateExclusiveLock */
+ LOCKBIT_ON(ShareUpdateExclusiveLock) |
+ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* ShareLock */
+ LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+ LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* ShareRowExclusiveLock */
+ LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* ExclusiveLock */
+ LOCKBIT_ON(RowShareLock) |
+ LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* AccessExclusiveLock */
+ LOCKBIT_ON(AccessShareLock) | LOCKBIT_ON(RowShareLock) |
+ LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock)
+
+};
+
+/* Names of lock modes, for debug printouts */
+static const char *const lock_mode_names[] =
+{
+ "INVALID",
+ "AccessShareLock",
+ "RowShareLock",
+ "RowExclusiveLock",
+ "ShareUpdateExclusiveLock",
+ "ShareLock",
+ "ShareRowExclusiveLock",
+ "ExclusiveLock",
+ "AccessExclusiveLock"
+};
+
+#ifndef LOCK_DEBUG
+static bool Dummy_trace = false;
+#endif
+
+static const LockMethodData default_lockmethod = {
+ AccessExclusiveLock, /* highest valid lock mode number */
+ LockConflicts,
+ lock_mode_names,
+#ifdef LOCK_DEBUG
+ &Trace_locks
+#else
+ &Dummy_trace
+#endif
+};
+
+static const LockMethodData user_lockmethod = {
+ AccessExclusiveLock, /* highest valid lock mode number */
+ LockConflicts,
+ lock_mode_names,
+#ifdef LOCK_DEBUG
+ &Trace_userlocks
+#else
+ &Dummy_trace
+#endif
+};
+
+/*
+ * map from lock method id to the lock table data structures
+ */
+static const LockMethod LockMethods[] = {
+ NULL,
+ &default_lockmethod,
+ &user_lockmethod
+};
+
+
+/* Record that's written to 2PC state file when a lock is persisted */
+typedef struct TwoPhaseLockRecord
+{
+ LOCKTAG locktag;
+ LOCKMODE lockmode;
+} TwoPhaseLockRecord;
+
+
+/*
+ * Count of the number of fast path lock slots we believe to be used. This
+ * might be higher than the real number if another backend has transferred
+ * our locks to the primary lock table, but it can never be lower than the
+ * real value, since only we can acquire locks on our own behalf.
+ */
+static int FastPathLocalUseCount = 0;
+
+/*
+ * Flag to indicate if the relation extension lock is held by this backend.
+ * This flag is used to ensure that while holding the relation extension lock
+ * we don't try to acquire a heavyweight lock on any other object. This
+ * restriction implies that the relation extension lock won't ever participate
+ * in the deadlock cycle because we can never wait for any other heavyweight
+ * lock after acquiring this lock.
+ *
+ * Such a restriction is okay for relation extension locks because, unlike
+ * other heavyweight locks, they are not held until transaction end; they
+ * are taken for a short duration to extend a particular relation and then
+ * released.
+ */
+static bool IsRelationExtensionLockHeld PG_USED_FOR_ASSERTS_ONLY = false;
+
+/*
+ * Flag to indicate if the page lock is held by this backend. We don't
+ * acquire any other heavyweight lock while holding the page lock except for
+ * relation extension. However, these locks are never taken in reverse order
+ * which implies that page locks will also never participate in the deadlock
+ * cycle.
+ *
+ * Similar to relation extension, page locks are also held for a short
+ * duration, so imposing such a restriction won't hurt.
+ */
+static bool IsPageLockHeld PG_USED_FOR_ASSERTS_ONLY = false;
+
+/* Macros for manipulating proc->fpLockBits */
+#define FAST_PATH_BITS_PER_SLOT 3
+#define FAST_PATH_LOCKNUMBER_OFFSET 1
+#define FAST_PATH_MASK ((1 << FAST_PATH_BITS_PER_SLOT) - 1)
+#define FAST_PATH_GET_BITS(proc, n) \
+ (((proc)->fpLockBits >> (FAST_PATH_BITS_PER_SLOT * n)) & FAST_PATH_MASK)
+#define FAST_PATH_BIT_POSITION(n, l) \
+ (AssertMacro((l) >= FAST_PATH_LOCKNUMBER_OFFSET), \
+ AssertMacro((l) < FAST_PATH_BITS_PER_SLOT+FAST_PATH_LOCKNUMBER_OFFSET), \
+ AssertMacro((n) < FP_LOCK_SLOTS_PER_BACKEND), \
+ ((l) - FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT * (n)))
+#define FAST_PATH_SET_LOCKMODE(proc, n, l) \
+ (proc)->fpLockBits |= UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)
+#define FAST_PATH_CLEAR_LOCKMODE(proc, n, l) \
+ (proc)->fpLockBits &= ~(UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l))
+#define FAST_PATH_CHECK_LOCKMODE(proc, n, l) \
+ ((proc)->fpLockBits & (UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)))
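+
+/*
+ * As a worked example of the layout: each slot consumes
+ * FAST_PATH_BITS_PER_SLOT (3) bits of proc->fpLockBits, one bit per lock
+ * mode from AccessShareLock (1) through RowExclusiveLock (3).  Slot 2
+ * holding RowExclusiveLock therefore maps to
+ * FAST_PATH_BIT_POSITION(2, RowExclusiveLock) = (3 - 1) + 3 * 2 = 8, and
+ * FAST_PATH_CHECK_LOCKMODE(proc, 2, RowExclusiveLock) tests bit 8.
+ */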
+
+/*
+ * The fast-path lock mechanism is concerned only with relation locks on
+ * unshared relations by backends bound to a database. The fast-path
+ * mechanism exists mostly to accelerate acquisition and release of locks
+ * that rarely conflict. Because ShareUpdateExclusiveLock is
+ * self-conflicting, it can't use the fast-path mechanism; but it also does
+ * not conflict with any of the locks that do, so we can ignore it completely.
+ */
+#define EligibleForRelationFastPath(locktag, mode) \
+ ((locktag)->locktag_lockmethodid == DEFAULT_LOCKMETHOD && \
+ (locktag)->locktag_type == LOCKTAG_RELATION && \
+ (locktag)->locktag_field1 == MyDatabaseId && \
+ MyDatabaseId != InvalidOid && \
+ (mode) < ShareUpdateExclusiveLock)
+#define ConflictsWithRelationFastPath(locktag, mode) \
+ ((locktag)->locktag_lockmethodid == DEFAULT_LOCKMETHOD && \
+ (locktag)->locktag_type == LOCKTAG_RELATION && \
+ (locktag)->locktag_field1 != InvalidOid && \
+ (mode) > ShareUpdateExclusiveLock)
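+
+/*
+ * Concretely (a sketch, with reloid standing for any unshared relation in
+ * the current database):
+ *
+ *     LOCKTAG tag;
+ *
+ *     SET_LOCKTAG_RELATION(tag, MyDatabaseId, reloid);
+ *     LockAcquire(&tag, AccessShareLock, false, false);
+ *
+ * passes EligibleForRelationFastPath, since AccessShareLock is weaker than
+ * ShareUpdateExclusiveLock.  The same tag requested in ShareLock or above
+ * goes through the main lock table, and modes above ShareUpdateExclusiveLock
+ * additionally satisfy ConflictsWithRelationFastPath, forcing any existing
+ * fast-path entries for the relation to be transferred first.
+ */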
+
+static bool FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode);
+static bool FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode);
+static bool FastPathTransferRelationLocks(LockMethod lockMethodTable,
+ const LOCKTAG *locktag, uint32 hashcode);
+static PROCLOCK *FastPathGetRelationLockEntry(LOCALLOCK *locallock);
+
+/*
+ * To make the fast-path lock mechanism work, we must have some way of
+ * preventing the use of the fast-path when a conflicting lock might be present.
+ * We partition the locktag space into FAST_PATH_STRONG_LOCK_HASH_PARTITIONS,
+ * and maintain an integer count of the number of "strong" lockers
+ * in each partition. When any "strong" lockers are present (which is
+ * hopefully not very often), the fast-path mechanism can't be used, and we
+ * must fall back to the slower method of pushing matching locks directly
+ * into the main lock tables.
+ *
+ * The deadlock detector does not know anything about the fast path mechanism,
+ * so any locks that might be involved in a deadlock must be transferred from
+ * the fast-path queues to the main lock table.
+ */
+
+#define FAST_PATH_STRONG_LOCK_HASH_BITS 10
+#define FAST_PATH_STRONG_LOCK_HASH_PARTITIONS \
+ (1 << FAST_PATH_STRONG_LOCK_HASH_BITS)
+#define FastPathStrongLockHashPartition(hashcode) \
+ ((hashcode) % FAST_PATH_STRONG_LOCK_HASH_PARTITIONS)
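+
+/*
+ * For example, a "strong" locker derives its partition as
+ *
+ *     uint32 fasthashcode =
+ *         FastPathStrongLockHashPartition(LockTagHashCode(locktag));
+ *
+ * and increments count[fasthashcode] under the mutex before transferring
+ * any matching fast-path locks; see BeginStrongLockAcquire() and
+ * FastPathTransferRelationLocks() below.
+ */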
+
+typedef struct
+{
+ slock_t mutex;
+ uint32 count[FAST_PATH_STRONG_LOCK_HASH_PARTITIONS];
+} FastPathStrongRelationLockData;
+
+static volatile FastPathStrongRelationLockData *FastPathStrongRelationLocks;
+
+
+/*
+ * Pointers to hash tables containing lock state
+ *
+ * The LockMethodLockHash and LockMethodProcLockHash hash tables are in
+ * shared memory; LockMethodLocalHash is local to each backend.
+ */
+static HTAB *LockMethodLockHash;
+static HTAB *LockMethodProcLockHash;
+static HTAB *LockMethodLocalHash;
+
+
+/* private state for error cleanup */
+static LOCALLOCK *StrongLockInProgress;
+static LOCALLOCK *awaitedLock;
+static ResourceOwner awaitedOwner;
+
+
+#ifdef LOCK_DEBUG
+
+/*------
+ * The following configuration options are available for lock debugging:
+ *
+ * TRACE_LOCKS -- give a bunch of output what's going on in this file
+ * TRACE_USERLOCKS -- same but for user locks
+ * TRACE_LOCK_OIDMIN -- do not trace locks for tables below this oid
+ * (use to avoid output on system tables)
+ * TRACE_LOCK_TABLE -- trace locks on this table (oid) unconditionally
+ * DEBUG_DEADLOCKS -- currently dumps locks at untimely occasions ;)
+ *
+ * Furthermore, in storage/lmgr/lwlock.c:
+ * TRACE_LWLOCKS -- trace lightweight locks (pretty useless)
+ *
+ * Define LOCK_DEBUG at compile time to get all these enabled.
+ * --------
+ */
+
+int Trace_lock_oidmin = FirstNormalObjectId;
+bool Trace_locks = false;
+bool Trace_userlocks = false;
+int Trace_lock_table = 0;
+bool Debug_deadlocks = false;
+
+
+inline static bool
+LOCK_DEBUG_ENABLED(const LOCKTAG *tag)
+{
+ return
+ (*(LockMethods[tag->locktag_lockmethodid]->trace_flag) &&
+ ((Oid) tag->locktag_field2 >= (Oid) Trace_lock_oidmin))
+ || (Trace_lock_table &&
+ (tag->locktag_field2 == Trace_lock_table));
+}
+
+
+inline static void
+LOCK_PRINT(const char *where, const LOCK *lock, LOCKMODE type)
+{
+ if (LOCK_DEBUG_ENABLED(&lock->tag))
+ elog(LOG,
+ "%s: lock(%p) id(%u,%u,%u,%u,%u,%u) grantMask(%x) "
+ "req(%d,%d,%d,%d,%d,%d,%d)=%d "
+ "grant(%d,%d,%d,%d,%d,%d,%d)=%d wait(%d) type(%s)",
+ where, lock,
+ lock->tag.locktag_field1, lock->tag.locktag_field2,
+ lock->tag.locktag_field3, lock->tag.locktag_field4,
+ lock->tag.locktag_type, lock->tag.locktag_lockmethodid,
+ lock->grantMask,
+ lock->requested[1], lock->requested[2], lock->requested[3],
+ lock->requested[4], lock->requested[5], lock->requested[6],
+ lock->requested[7], lock->nRequested,
+ lock->granted[1], lock->granted[2], lock->granted[3],
+ lock->granted[4], lock->granted[5], lock->granted[6],
+ lock->granted[7], lock->nGranted,
+ lock->waitProcs.size,
+ LockMethods[LOCK_LOCKMETHOD(*lock)]->lockModeNames[type]);
+}
+
+
+inline static void
+PROCLOCK_PRINT(const char *where, const PROCLOCK *proclockP)
+{
+ if (LOCK_DEBUG_ENABLED(&proclockP->tag.myLock->tag))
+ elog(LOG,
+ "%s: proclock(%p) lock(%p) method(%u) proc(%p) hold(%x)",
+ where, proclockP, proclockP->tag.myLock,
+ PROCLOCK_LOCKMETHOD(*(proclockP)),
+ proclockP->tag.myProc, (int) proclockP->holdMask);
+}
+#else /* not LOCK_DEBUG */
+
+#define LOCK_PRINT(where, lock, type) ((void) 0)
+#define PROCLOCK_PRINT(where, proclockP) ((void) 0)
+#endif /* not LOCK_DEBUG */
+
+
+static uint32 proclock_hash(const void *key, Size keysize);
+static void RemoveLocalLock(LOCALLOCK *locallock);
+static PROCLOCK *SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc,
+ const LOCKTAG *locktag, uint32 hashcode, LOCKMODE lockmode);
+static void GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner);
+static void BeginStrongLockAcquire(LOCALLOCK *locallock, uint32 fasthashcode);
+static void FinishStrongLockAcquire(void);
+static void WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner);
+static void ReleaseLockIfHeld(LOCALLOCK *locallock, bool sessionLock);
+static void LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent);
+static bool UnGrantLock(LOCK *lock, LOCKMODE lockmode,
+ PROCLOCK *proclock, LockMethod lockMethodTable);
+static void CleanUpLock(LOCK *lock, PROCLOCK *proclock,
+ LockMethod lockMethodTable, uint32 hashcode,
+ bool wakeupNeeded);
+static void LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc,
+ LOCKTAG *locktag, LOCKMODE lockmode,
+ bool decrement_strong_lock_count);
+static void GetSingleProcBlockerStatusData(PGPROC *blocked_proc,
+ BlockedProcsData *data);
+
+
+/*
+ * InitLocks -- Initialize the lock manager's data structures.
+ *
+ * This is called from CreateSharedMemoryAndSemaphores(), which see for
+ * more comments. In the normal postmaster case, the shared hash tables
+ * are created here, as well as a locallock hash table that will remain
+ * unused and empty in the postmaster itself. Backends inherit the pointers
+ * to the shared tables via fork(), and also inherit an image of the locallock
+ * hash table, which they proceed to use. In the EXEC_BACKEND case, each
+ * backend re-executes this code to obtain pointers to the already existing
+ * shared hash tables and to create its locallock hash table.
+ */
+void
+InitLocks(void)
+{
+ HASHCTL info;
+ long init_table_size,
+ max_table_size;
+ bool found;
+
+ /*
+ * Compute init/max size to request for lock hashtables. Note these
+ * calculations must agree with LockShmemSize!
+ */
+ max_table_size = NLOCKENTS();
+ init_table_size = max_table_size / 2;
+
+ /*
+ * Allocate hash table for LOCK structs. This stores per-locked-object
+ * information.
+ */
+ info.keysize = sizeof(LOCKTAG);
+ info.entrysize = sizeof(LOCK);
+ info.num_partitions = NUM_LOCK_PARTITIONS;
+
+ LockMethodLockHash = ShmemInitHash("LOCK hash",
+ init_table_size,
+ max_table_size,
+ &info,
+ HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
+
+ /* Assume an average of 2 holders per lock */
+ max_table_size *= 2;
+ init_table_size *= 2;
+
+ /*
+ * Allocate hash table for PROCLOCK structs. This stores
+ * per-lock-per-holder information.
+ */
+ info.keysize = sizeof(PROCLOCKTAG);
+ info.entrysize = sizeof(PROCLOCK);
+ info.hash = proclock_hash;
+ info.num_partitions = NUM_LOCK_PARTITIONS;
+
+ LockMethodProcLockHash = ShmemInitHash("PROCLOCK hash",
+ init_table_size,
+ max_table_size,
+ &info,
+ HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+
+ /*
+ * Allocate fast-path structures.
+ */
+ FastPathStrongRelationLocks =
+ ShmemInitStruct("Fast Path Strong Relation Lock Data",
+ sizeof(FastPathStrongRelationLockData), &found);
+ if (!found)
+ SpinLockInit(&FastPathStrongRelationLocks->mutex);
+
+ /*
+ * Allocate non-shared hash table for LOCALLOCK structs. This stores lock
+ * counts and resource owner information.
+ *
+ * The non-shared table could already exist in this process (this occurs
+ * when the postmaster is recreating shared memory after a backend crash).
+ * If so, delete and recreate it. (We could simply leave it, since it
+ * ought to be empty in the postmaster, but for safety let's zap it.)
+ */
+ if (LockMethodLocalHash)
+ hash_destroy(LockMethodLocalHash);
+
+ info.keysize = sizeof(LOCALLOCKTAG);
+ info.entrysize = sizeof(LOCALLOCK);
+
+ LockMethodLocalHash = hash_create("LOCALLOCK hash",
+ 16,
+ &info,
+ HASH_ELEM | HASH_BLOBS);
+}
+
+
+/*
+ * Fetch the lock method table associated with a given lock
+ */
+LockMethod
+GetLocksMethodTable(const LOCK *lock)
+{
+ LOCKMETHODID lockmethodid = LOCK_LOCKMETHOD(*lock);
+
+ Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods));
+ return LockMethods[lockmethodid];
+}
+
+/*
+ * Fetch the lock method table associated with a given locktag
+ */
+LockMethod
+GetLockTagsMethodTable(const LOCKTAG *locktag)
+{
+ LOCKMETHODID lockmethodid = (LOCKMETHODID) locktag->locktag_lockmethodid;
+
+ Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods));
+ return LockMethods[lockmethodid];
+}
+
+
+/*
+ * Compute the hash code associated with a LOCKTAG.
+ *
+ * To avoid unnecessary recomputations of the hash code, we try to do this
+ * just once per function, and then pass it around as needed. Aside from
+ * passing the hashcode to hash_search_with_hash_value(), we can extract
+ * the lock partition number from the hashcode.
+ */
+uint32
+LockTagHashCode(const LOCKTAG *locktag)
+{
+ return get_hash_value(LockMethodLockHash, (const void *) locktag);
+}
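+
+/*
+ * A typical caller computes the hash once and reuses it both for the hash
+ * table lookups and for picking the partition lock, e.g.
+ *
+ *     uint32  hashcode = LockTagHashCode(locktag);
+ *     LWLock *partitionLock = LockHashPartitionLock(hashcode);
+ *
+ * where LockHashPartitionLock() (see lock.h) maps the low-order bits of the
+ * hash code to one of the NUM_LOCK_PARTITIONS partition LWLocks.
+ */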
+
+/*
+ * Compute the hash code associated with a PROCLOCKTAG.
+ *
+ * Because we want to use just one set of partition locks for both the
+ * LOCK and PROCLOCK hash tables, we have to make sure that PROCLOCKs
+ * fall into the same partition number as their associated LOCKs.
+ * dynahash.c expects the partition number to be the low-order bits of
+ * the hash code, and therefore a PROCLOCKTAG's hash code must have the
+ * same low-order bits as the associated LOCKTAG's hash code. We achieve
+ * this with this specialized hash function.
+ */
+static uint32
+proclock_hash(const void *key, Size keysize)
+{
+ const PROCLOCKTAG *proclocktag = (const PROCLOCKTAG *) key;
+ uint32 lockhash;
+ Datum procptr;
+
+ Assert(keysize == sizeof(PROCLOCKTAG));
+
+ /* Look into the associated LOCK object, and compute its hash code */
+ lockhash = LockTagHashCode(&proclocktag->myLock->tag);
+
+ /*
+ * To make the hash code also depend on the PGPROC, we xor the proc
+ * struct's address into the hash code, left-shifted so that the
+ * partition-number bits don't change. Since this is only a hash, we
+ * don't care if we lose high-order bits of the address; use an
+ * intermediate variable to suppress cast-pointer-to-int warnings.
+ */
+ procptr = PointerGetDatum(proclocktag->myProc);
+ lockhash ^= ((uint32) procptr) << LOG2_NUM_LOCK_PARTITIONS;
+
+ return lockhash;
+}
+
+/*
+ * Compute the hash code associated with a PROCLOCKTAG, given the hashcode
+ * for its underlying LOCK.
+ *
+ * We use this just to avoid redundant calls of LockTagHashCode().
+ */
+static inline uint32
+ProcLockHashCode(const PROCLOCKTAG *proclocktag, uint32 hashcode)
+{
+ uint32 lockhash = hashcode;
+ Datum procptr;
+
+ /*
+ * This must match proclock_hash()!
+ */
+ procptr = PointerGetDatum(proclocktag->myProc);
+ lockhash ^= ((uint32) procptr) << LOG2_NUM_LOCK_PARTITIONS;
+
+ return lockhash;
+}
+
+/*
+ * Given two lock modes, return whether they would conflict.
+ */
+bool
+DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2)
+{
+ LockMethod lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD];
+
+ if (lockMethodTable->conflictTab[mode1] & LOCKBIT_ON(mode2))
+ return true;
+
+ return false;
+}
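+
+/*
+ * With the conflict table above, for instance,
+ * DoLockModesConflict(RowExclusiveLock, ShareLock) returns true, while
+ * DoLockModesConflict(RowExclusiveLock, RowExclusiveLock) returns false,
+ * because RowExclusiveLock does not conflict with itself.
+ */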
+
+/*
+ * LockHeldByMe -- test whether lock 'locktag' is held with mode 'lockmode'
+ * by the current transaction
+ */
+bool
+LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode)
+{
+ LOCALLOCKTAG localtag;
+ LOCALLOCK *locallock;
+
+ /*
+ * See if there is a LOCALLOCK entry for this lock and lockmode
+ */
+ MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */
+ localtag.lock = *locktag;
+ localtag.mode = lockmode;
+
+ locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+ (void *) &localtag,
+ HASH_FIND, NULL);
+
+ return (locallock && locallock->nLocks > 0);
+}
+
+#ifdef USE_ASSERT_CHECKING
+/*
+ * GetLockMethodLocalHash -- return the hash of local locks, for modules that
+ * evaluate assertions based on all locks held.
+ */
+HTAB *
+GetLockMethodLocalHash(void)
+{
+ return LockMethodLocalHash;
+}
+#endif
+
+/*
+ * LockHasWaiters -- look up 'locktag' and check if releasing this
+ * lock would wake up other processes waiting for it.
+ */
+bool
+LockHasWaiters(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
+{
+ LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+ LockMethod lockMethodTable;
+ LOCALLOCKTAG localtag;
+ LOCALLOCK *locallock;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ LWLock *partitionLock;
+ bool hasWaiters = false;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+ if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
+ elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+#ifdef LOCK_DEBUG
+ if (LOCK_DEBUG_ENABLED(locktag))
+ elog(LOG, "LockHasWaiters: lock [%u,%u] %s",
+ locktag->locktag_field1, locktag->locktag_field2,
+ lockMethodTable->lockModeNames[lockmode]);
+#endif
+
+ /*
+ * Find the LOCALLOCK entry for this lock and lockmode
+ */
+ MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */
+ localtag.lock = *locktag;
+ localtag.mode = lockmode;
+
+ locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+ (void *) &localtag,
+ HASH_FIND, NULL);
+
+ /*
+ * let the caller print its own error message, too. Do not ereport(ERROR).
+ */
+ if (!locallock || locallock->nLocks <= 0)
+ {
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ return false;
+ }
+
+ /*
+ * Check the shared lock table.
+ */
+ partitionLock = LockHashPartitionLock(locallock->hashcode);
+
+ LWLockAcquire(partitionLock, LW_SHARED);
+
+ /*
+ * We don't need to re-find the lock or proclock, since we kept their
+ * addresses in the locallock table, and they couldn't have been removed
+ * while we were holding a lock on them.
+ */
+ lock = locallock->lock;
+ LOCK_PRINT("LockHasWaiters: found", lock, lockmode);
+ proclock = locallock->proclock;
+ PROCLOCK_PRINT("LockHasWaiters: found", proclock);
+
+ /*
+ * Double-check that we are actually holding a lock of the type we want to
+ * release.
+ */
+ if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+ {
+ PROCLOCK_PRINT("LockHasWaiters: WRONGTYPE", proclock);
+ LWLockRelease(partitionLock);
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ RemoveLocalLock(locallock);
+ return false;
+ }
+
+ /*
+ * Do the checking.
+ */
+ if ((lockMethodTable->conflictTab[lockmode] & lock->waitMask) != 0)
+ hasWaiters = true;
+
+ LWLockRelease(partitionLock);
+
+ return hasWaiters;
+}
+
+/*
+ * LockAcquire -- Check for lock conflicts, sleep if conflict found,
+ * set lock if/when no conflicts.
+ *
+ * Inputs:
+ * locktag: unique identifier for the lockable object
+ * lockmode: lock mode to acquire
+ * sessionLock: if true, acquire lock for session not current transaction
+ * dontWait: if true, don't wait to acquire lock
+ *
+ * Returns one of:
+ * LOCKACQUIRE_NOT_AVAIL lock not available, and dontWait=true
+ * LOCKACQUIRE_OK lock successfully acquired
+ * LOCKACQUIRE_ALREADY_HELD incremented count for lock already held
+ * LOCKACQUIRE_ALREADY_CLEAR incremented count for lock already clear
+ *
+ * In the normal case where dontWait=false and the caller doesn't need to
+ * distinguish a freshly acquired lock from one already taken earlier in
+ * this same transaction, there is no need to examine the return value.
+ *
+ * Side Effects: The lock is acquired and recorded in lock tables.
+ *
+ * NOTE: if we wait for the lock, there is no way to abort the wait
+ * short of aborting the transaction.
+ */
+LockAcquireResult
+LockAcquire(const LOCKTAG *locktag,
+ LOCKMODE lockmode,
+ bool sessionLock,
+ bool dontWait)
+{
+ return LockAcquireExtended(locktag, lockmode, sessionLock, dontWait,
+ true, NULL);
+}
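+
+/*
+ * Typical call pattern from a higher-level wrapper such as lmgr.c, sketched
+ * here for an ordinary relation lock:
+ *
+ *     LOCKTAG tag;
+ *
+ *     SET_LOCKTAG_RELATION(tag, MyDatabaseId, relid);
+ *     (void) LockAcquire(&tag, AccessShareLock, false, false);
+ *     ...
+ *     LockRelease(&tag, AccessShareLock, false);
+ */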
+
+/*
+ * LockAcquireExtended - allows us to specify additional options
+ *
+ * reportMemoryError specifies whether a lock request that fills the lock
+ * table should generate an ERROR or not. Passing "false" allows the caller
+ * to attempt to recover from lock-table-full situations, perhaps by forcibly
+ * canceling other lock holders and then retrying. Note, however, that the
+ * return code for that is LOCKACQUIRE_NOT_AVAIL, so that it's unsafe to use
+ * in combination with dontWait = true, as the cause of failure couldn't be
+ * distinguished.
+ *
+ * If locallockp isn't NULL, *locallockp receives a pointer to the LOCALLOCK
+ * table entry if a lock is successfully acquired, or NULL if not.
+ */
+LockAcquireResult
+LockAcquireExtended(const LOCKTAG *locktag,
+ LOCKMODE lockmode,
+ bool sessionLock,
+ bool dontWait,
+ bool reportMemoryError,
+ LOCALLOCK **locallockp)
+{
+ LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+ LockMethod lockMethodTable;
+ LOCALLOCKTAG localtag;
+ LOCALLOCK *locallock;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ bool found;
+ ResourceOwner owner;
+ uint32 hashcode;
+ LWLock *partitionLock;
+ bool found_conflict;
+ bool log_lock = false;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+ if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
+ elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+ if (RecoveryInProgress() && !InRecovery &&
+ (locktag->locktag_type == LOCKTAG_OBJECT ||
+ locktag->locktag_type == LOCKTAG_RELATION) &&
+ lockmode > RowExclusiveLock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot acquire lock mode %s on database objects while recovery is in progress",
+ lockMethodTable->lockModeNames[lockmode]),
+ errhint("Only RowExclusiveLock or less can be acquired on database objects during recovery.")));
+
+#ifdef LOCK_DEBUG
+ if (LOCK_DEBUG_ENABLED(locktag))
+ elog(LOG, "LockAcquire: lock [%u,%u] %s",
+ locktag->locktag_field1, locktag->locktag_field2,
+ lockMethodTable->lockModeNames[lockmode]);
+#endif
+
+ /* Identify owner for lock */
+ if (sessionLock)
+ owner = NULL;
+ else
+ owner = CurrentResourceOwner;
+
+ /*
+ * Find or create a LOCALLOCK entry for this lock and lockmode
+ */
+ MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */
+ localtag.lock = *locktag;
+ localtag.mode = lockmode;
+
+ locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+ (void *) &localtag,
+ HASH_ENTER, &found);
+
+ /*
+ * if it's a new locallock object, initialize it
+ */
+ if (!found)
+ {
+ locallock->lock = NULL;
+ locallock->proclock = NULL;
+ locallock->hashcode = LockTagHashCode(&(localtag.lock));
+ locallock->nLocks = 0;
+ locallock->holdsStrongLockCount = false;
+ locallock->lockCleared = false;
+ locallock->numLockOwners = 0;
+ locallock->maxLockOwners = 8;
+ locallock->lockOwners = NULL; /* in case next line fails */
+ locallock->lockOwners = (LOCALLOCKOWNER *)
+ MemoryContextAlloc(TopMemoryContext,
+ locallock->maxLockOwners * sizeof(LOCALLOCKOWNER));
+ }
+ else
+ {
+ /* Make sure there will be room to remember the lock */
+ if (locallock->numLockOwners >= locallock->maxLockOwners)
+ {
+ int newsize = locallock->maxLockOwners * 2;
+
+ locallock->lockOwners = (LOCALLOCKOWNER *)
+ repalloc(locallock->lockOwners,
+ newsize * sizeof(LOCALLOCKOWNER));
+ locallock->maxLockOwners = newsize;
+ }
+ }
+ hashcode = locallock->hashcode;
+
+ if (locallockp)
+ *locallockp = locallock;
+
+ /*
+ * If we already hold the lock, we can just increase the count locally.
+ *
+ * If lockCleared is already set, caller need not worry about absorbing
+ * sinval messages related to the lock's object.
+ */
+ if (locallock->nLocks > 0)
+ {
+ GrantLockLocal(locallock, owner);
+ if (locallock->lockCleared)
+ return LOCKACQUIRE_ALREADY_CLEAR;
+ else
+ return LOCKACQUIRE_ALREADY_HELD;
+ }
+
+ /*
+ * We don't acquire any other heavyweight lock while holding the relation
+ * extension lock. We do allow the same relation extension lock to be
+ * acquired more than once, but that case won't reach here.
+ */
+ Assert(!IsRelationExtensionLockHeld);
+
+ /*
+ * We don't acquire any other heavyweight lock while holding the page lock
+ * except for relation extension.
+ */
+ Assert(!IsPageLockHeld ||
+ (locktag->locktag_type == LOCKTAG_RELATION_EXTEND));
+
+ /*
+ * Prepare to emit a WAL record if acquisition of this lock needs to be
+ * replayed in a standby server.
+ *
+ * Here we prepare to log; after lock is acquired we'll issue log record.
+ * This arrangement simplifies error recovery in case the preparation step
+ * fails.
+ *
+ * Only AccessExclusiveLocks can conflict with lock types that read-only
+ * transactions can acquire in a standby server. Make sure this definition
+ * matches the one in GetRunningTransactionLocks().
+ */
+ if (lockmode >= AccessExclusiveLock &&
+ locktag->locktag_type == LOCKTAG_RELATION &&
+ !RecoveryInProgress() &&
+ XLogStandbyInfoActive())
+ {
+ LogAccessExclusiveLockPrepare();
+ log_lock = true;
+ }
+
+ /*
+ * Attempt to take lock via fast path, if eligible. But if we remember
+ * having filled up the fast path array, we don't attempt to make any
+ * further use of it until we release some locks. It's possible that some
+ * other backend has transferred some of those locks to the shared hash
+ * table, leaving space free, but it's not worth acquiring the LWLock just
+ * to check. It's also possible that we're acquiring a second or third
+ * lock type on a relation we have already locked using the fast-path, but
+ * for now we don't worry about that case either.
+ */
+ if (EligibleForRelationFastPath(locktag, lockmode) &&
+ FastPathLocalUseCount < FP_LOCK_SLOTS_PER_BACKEND)
+ {
+ uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode);
+ bool acquired;
+
+ /*
+ * LWLockAcquire acts as a memory sequencing point, so it's safe to
+ * assume that any strong locker whose increment to
+ * FastPathStrongRelationLocks->counts becomes visible after we test
+ * it has yet to begin to transfer fast-path locks.
+ */
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+ if (FastPathStrongRelationLocks->count[fasthashcode] != 0)
+ acquired = false;
+ else
+ acquired = FastPathGrantRelationLock(locktag->locktag_field2,
+ lockmode);
+ LWLockRelease(&MyProc->fpInfoLock);
+ if (acquired)
+ {
+ /*
+ * The locallock might contain stale pointers to some old shared
+ * objects; we MUST reset these to null before considering the
+ * lock to be acquired via fast-path.
+ */
+ locallock->lock = NULL;
+ locallock->proclock = NULL;
+ GrantLockLocal(locallock, owner);
+ return LOCKACQUIRE_OK;
+ }
+ }
+
+ /*
+ * If this lock could potentially have been taken via the fast-path by
+ * some other backend, we must (temporarily) disable further use of the
+ * fast-path for this lock tag, and migrate any locks already taken via
+ * this method to the main lock table.
+ */
+ if (ConflictsWithRelationFastPath(locktag, lockmode))
+ {
+ uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode);
+
+ BeginStrongLockAcquire(locallock, fasthashcode);
+ if (!FastPathTransferRelationLocks(lockMethodTable, locktag,
+ hashcode))
+ {
+ AbortStrongLockAcquire();
+ if (locallock->nLocks == 0)
+ RemoveLocalLock(locallock);
+ if (locallockp)
+ *locallockp = NULL;
+ if (reportMemoryError)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_locks_per_transaction.")));
+ else
+ return LOCKACQUIRE_NOT_AVAIL;
+ }
+ }
+
+ /*
+ * We didn't find the lock in our LOCALLOCK table, and we didn't manage to
+ * take it via the fast-path, either, so we've got to mess with the shared
+ * lock table.
+ */
+ partitionLock = LockHashPartitionLock(hashcode);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Find or create lock and proclock entries with this tag
+ *
+ * Note: if the locallock object already existed, it might have a pointer
+ * to the lock already ... but we should not assume that that pointer is
+ * valid, since a lock object with zero hold and request counts can go
+ * away anytime. So we have to use SetupLockInTable() to recompute the
+ * lock and proclock pointers, even if they're already set.
+ */
+ proclock = SetupLockInTable(lockMethodTable, MyProc, locktag,
+ hashcode, lockmode);
+ if (!proclock)
+ {
+ AbortStrongLockAcquire();
+ LWLockRelease(partitionLock);
+ if (locallock->nLocks == 0)
+ RemoveLocalLock(locallock);
+ if (locallockp)
+ *locallockp = NULL;
+ if (reportMemoryError)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_locks_per_transaction.")));
+ else
+ return LOCKACQUIRE_NOT_AVAIL;
+ }
+ locallock->proclock = proclock;
+ lock = proclock->tag.myLock;
+ locallock->lock = lock;
+
+ /*
+ * If lock requested conflicts with locks requested by waiters, must join
+ * wait queue. Otherwise, check for conflict with already-held locks.
+ * (That's checked last because it is the most complex check.)
+ */
+ if (lockMethodTable->conflictTab[lockmode] & lock->waitMask)
+ found_conflict = true;
+ else
+ found_conflict = LockCheckConflicts(lockMethodTable, lockmode,
+ lock, proclock);
+
+ if (!found_conflict)
+ {
+ /* No conflict with held or previously requested locks */
+ GrantLock(lock, proclock, lockmode);
+ GrantLockLocal(locallock, owner);
+ }
+ else
+ {
+ /*
+ * We can't acquire the lock immediately. If caller specified no
+ * blocking, remove useless table entries and return
+ * LOCKACQUIRE_NOT_AVAIL without waiting.
+ */
+ if (dontWait)
+ {
+ AbortStrongLockAcquire();
+ if (proclock->holdMask == 0)
+ {
+ uint32 proclock_hashcode;
+
+ proclock_hashcode = ProcLockHashCode(&proclock->tag, hashcode);
+ SHMQueueDelete(&proclock->lockLink);
+ SHMQueueDelete(&proclock->procLink);
+ if (!hash_search_with_hash_value(LockMethodProcLockHash,
+ (void *) &(proclock->tag),
+ proclock_hashcode,
+ HASH_REMOVE,
+ NULL))
+ elog(PANIC, "proclock table corrupted");
+ }
+ else
+ PROCLOCK_PRINT("LockAcquire: NOWAIT", proclock);
+ lock->nRequested--;
+ lock->requested[lockmode]--;
+ LOCK_PRINT("LockAcquire: conditional lock failed", lock, lockmode);
+ Assert((lock->nRequested > 0) && (lock->requested[lockmode] >= 0));
+ Assert(lock->nGranted <= lock->nRequested);
+ LWLockRelease(partitionLock);
+ if (locallock->nLocks == 0)
+ RemoveLocalLock(locallock);
+ if (locallockp)
+ *locallockp = NULL;
+ return LOCKACQUIRE_NOT_AVAIL;
+ }
+
+ /*
+ * Set bitmask of locks this process already holds on this object.
+ */
+ MyProc->heldLocks = proclock->holdMask;
+
+ /*
+ * Sleep till someone wakes me up.
+ */
+
+ TRACE_POSTGRESQL_LOCK_WAIT_START(locktag->locktag_field1,
+ locktag->locktag_field2,
+ locktag->locktag_field3,
+ locktag->locktag_field4,
+ locktag->locktag_type,
+ lockmode);
+
+ WaitOnLock(locallock, owner);
+
+ TRACE_POSTGRESQL_LOCK_WAIT_DONE(locktag->locktag_field1,
+ locktag->locktag_field2,
+ locktag->locktag_field3,
+ locktag->locktag_field4,
+ locktag->locktag_type,
+ lockmode);
+
+ /*
+ * NOTE: do not do any material change of state between here and
+ * return. All required changes in locktable state must have been
+ * done when the lock was granted to us --- see notes in WaitOnLock.
+ */
+
+ /*
+ * Check the proclock entry status, in case something in the ipc
+ * communication doesn't work correctly.
+ */
+ if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+ {
+ AbortStrongLockAcquire();
+ PROCLOCK_PRINT("LockAcquire: INCONSISTENT", proclock);
+ LOCK_PRINT("LockAcquire: INCONSISTENT", lock, lockmode);
+ /* Should we retry ? */
+ LWLockRelease(partitionLock);
+ elog(ERROR, "LockAcquire failed");
+ }
+ PROCLOCK_PRINT("LockAcquire: granted", proclock);
+ LOCK_PRINT("LockAcquire: granted", lock, lockmode);
+ }
+
+ /*
+ * Lock state is fully up-to-date now; if we error out after this, no
+ * special error cleanup is required.
+ */
+ FinishStrongLockAcquire();
+
+ LWLockRelease(partitionLock);
+
+ /*
+ * Emit a WAL record if acquisition of this lock needs to be replayed in a
+ * standby server.
+ */
+ if (log_lock)
+ {
+ /*
+ * Decode the locktag back to the original values, to avoid sending
+ * lots of empty bytes with every message. See lock.h to check how a
+ * locktag is defined for LOCKTAG_RELATION
+ */
+ LogAccessExclusiveLock(locktag->locktag_field1,
+ locktag->locktag_field2);
+ }
+
+ return LOCKACQUIRE_OK;
+}
+
+/*
+ * Find or create LOCK and PROCLOCK objects as needed for a new lock
+ * request.
+ *
+ * Returns the PROCLOCK object, or NULL if we failed to create the objects
+ * for lack of shared memory.
+ *
+ * The appropriate partition lock must be held at entry, and will be
+ * held at exit.
+ */
+static PROCLOCK *
+SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc,
+ const LOCKTAG *locktag, uint32 hashcode, LOCKMODE lockmode)
+{
+ LOCK *lock;
+ PROCLOCK *proclock;
+ PROCLOCKTAG proclocktag;
+ uint32 proclock_hashcode;
+ bool found;
+
+ /*
+ * Find or create a lock with this tag.
+ */
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ (const void *) locktag,
+ hashcode,
+ HASH_ENTER_NULL,
+ &found);
+ if (!lock)
+ return NULL;
+
+ /*
+ * if it's a new lock object, initialize it
+ */
+ if (!found)
+ {
+ lock->grantMask = 0;
+ lock->waitMask = 0;
+ SHMQueueInit(&(lock->procLocks));
+ ProcQueueInit(&(lock->waitProcs));
+ lock->nRequested = 0;
+ lock->nGranted = 0;
+ MemSet(lock->requested, 0, sizeof(int) * MAX_LOCKMODES);
+ MemSet(lock->granted, 0, sizeof(int) * MAX_LOCKMODES);
+ LOCK_PRINT("LockAcquire: new", lock, lockmode);
+ }
+ else
+ {
+ LOCK_PRINT("LockAcquire: found", lock, lockmode);
+ Assert((lock->nRequested >= 0) && (lock->requested[lockmode] >= 0));
+ Assert((lock->nGranted >= 0) && (lock->granted[lockmode] >= 0));
+ Assert(lock->nGranted <= lock->nRequested);
+ }
+
+ /*
+ * Create the hash key for the proclock table.
+ */
+ proclocktag.myLock = lock;
+ proclocktag.myProc = proc;
+
+ proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode);
+
+ /*
+ * Find or create a proclock entry with this tag
+ */
+ proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash,
+ (void *) &proclocktag,
+ proclock_hashcode,
+ HASH_ENTER_NULL,
+ &found);
+ if (!proclock)
+ {
+ /* Oops, not enough shmem for the proclock */
+ if (lock->nRequested == 0)
+ {
+ /*
+ * There are no other requestors of this lock, so garbage-collect
+ * the lock object. We *must* do this to avoid a permanent leak
+ * of shared memory, because there won't be anything to cause
+ * anyone to release the lock object later.
+ */
+ Assert(SHMQueueEmpty(&(lock->procLocks)));
+ if (!hash_search_with_hash_value(LockMethodLockHash,
+ (void *) &(lock->tag),
+ hashcode,
+ HASH_REMOVE,
+ NULL))
+ elog(PANIC, "lock table corrupted");
+ }
+ return NULL;
+ }
+
+ /*
+ * If new, initialize the new entry
+ */
+ if (!found)
+ {
+ uint32 partition = LockHashPartition(hashcode);
+
+ /*
+ * It might seem unsafe to access proclock->groupLeader without a
+ * lock, but it's not really. Either we are initializing a proclock
+ * on our own behalf, in which case our group leader isn't changing
+ * because the group leader for a process can only ever be changed by
+ * the process itself; or else we are transferring a fast-path lock to
+ * the main lock table, in which case that process can't change its
+ * lock group leader without first releasing all of its locks (and in
+ * particular the one we are currently transferring).
+ */
+ proclock->groupLeader = proc->lockGroupLeader != NULL ?
+ proc->lockGroupLeader : proc;
+ proclock->holdMask = 0;
+ proclock->releaseMask = 0;
+ /* Add proclock to appropriate lists */
+ SHMQueueInsertBefore(&lock->procLocks, &proclock->lockLink);
+ SHMQueueInsertBefore(&(proc->myProcLocks[partition]),
+ &proclock->procLink);
+ PROCLOCK_PRINT("LockAcquire: new", proclock);
+ }
+ else
+ {
+ PROCLOCK_PRINT("LockAcquire: found", proclock);
+ Assert((proclock->holdMask & ~lock->grantMask) == 0);
+
+#ifdef CHECK_DEADLOCK_RISK
+
+ /*
+ * Issue warning if we already hold a lower-level lock on this object
+ * and do not hold a lock of the requested level or higher. This
+ * indicates a deadlock-prone coding practice (eg, we'd have a
+ * deadlock if another backend were following the same code path at
+ * about the same time).
+ *
+ * This is not enabled by default, because it may generate log entries
+ * about user-level coding practices that are in fact safe in context.
+ * It can be enabled to help find system-level problems.
+ *
+ * XXX Doing numeric comparison on the lockmodes is a hack; it'd be
+ * better to use a table. For now, though, this works.
+ */
+ {
+ int i;
+
+ for (i = lockMethodTable->numLockModes; i > 0; i--)
+ {
+ if (proclock->holdMask & LOCKBIT_ON(i))
+ {
+ if (i >= (int) lockmode)
+ break; /* safe: we have a lock >= req level */
+ elog(LOG, "deadlock risk: raising lock level"
+ " from %s to %s on object %u/%u/%u",
+ lockMethodTable->lockModeNames[i],
+ lockMethodTable->lockModeNames[lockmode],
+ lock->tag.locktag_field1, lock->tag.locktag_field2,
+ lock->tag.locktag_field3);
+ break;
+ }
+ }
+ }
+#endif /* CHECK_DEADLOCK_RISK */
+ }
+
+ /*
+ * lock->nRequested and lock->requested[] count the total number of
+ * requests, whether granted or waiting, so increment those immediately.
+ * The other counts don't increment till we get the lock.
+ */
+ lock->nRequested++;
+ lock->requested[lockmode]++;
+ Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0));
+
+ /*
+ * We shouldn't already hold the desired lock; else locallock table is
+ * broken.
+ */
+ if (proclock->holdMask & LOCKBIT_ON(lockmode))
+ elog(ERROR, "lock %s on object %u/%u/%u is already held",
+ lockMethodTable->lockModeNames[lockmode],
+ lock->tag.locktag_field1, lock->tag.locktag_field2,
+ lock->tag.locktag_field3);
+
+ return proclock;
+}
+
+/*
+ * Check and set/reset the flag that we hold the relation extension/page lock.
+ *
+ * It is the caller's responsibility to ensure that this function is called
+ * after acquiring/releasing the relation extension/page lock.
+ *
+ * Pass acquired as true if lock is acquired, false otherwise.
+ */
+static inline void
+CheckAndSetLockHeld(LOCALLOCK *locallock, bool acquired)
+{
+#ifdef USE_ASSERT_CHECKING
+ if (LOCALLOCK_LOCKTAG(*locallock) == LOCKTAG_RELATION_EXTEND)
+ IsRelationExtensionLockHeld = acquired;
+ else if (LOCALLOCK_LOCKTAG(*locallock) == LOCKTAG_PAGE)
+ IsPageLockHeld = acquired;
+
+#endif
+}
+
+/*
+ * Subroutine to free a locallock entry
+ */
+static void
+RemoveLocalLock(LOCALLOCK *locallock)
+{
+ int i;
+
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (locallock->lockOwners[i].owner != NULL)
+ ResourceOwnerForgetLock(locallock->lockOwners[i].owner, locallock);
+ }
+ locallock->numLockOwners = 0;
+ if (locallock->lockOwners != NULL)
+ pfree(locallock->lockOwners);
+ locallock->lockOwners = NULL;
+
+ if (locallock->holdsStrongLockCount)
+ {
+ uint32 fasthashcode;
+
+ fasthashcode = FastPathStrongLockHashPartition(locallock->hashcode);
+
+ SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0);
+ FastPathStrongRelationLocks->count[fasthashcode]--;
+ locallock->holdsStrongLockCount = false;
+ SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+ }
+
+ if (!hash_search(LockMethodLocalHash,
+ (void *) &(locallock->tag),
+ HASH_REMOVE, NULL))
+ elog(WARNING, "locallock table corrupted");
+
+ /*
+ * Indicate that the lock is released for certain types of locks
+ */
+ CheckAndSetLockHeld(locallock, false);
+}
+
+/*
+ * LockCheckConflicts -- test whether requested lock conflicts
+ * with those already granted
+ *
+ * Returns true if conflict, false if no conflict.
+ *
+ * NOTES:
+ * Here's what makes this complicated: one process's locks don't
+ * conflict with one another, no matter what purpose they are held for
+ * (eg, session and transaction locks do not conflict). Nor do the locks
+ * of one process in a lock group conflict with those of another process in
+ * the same group. So, we must subtract off these locks when determining
+ * whether the requested new lock conflicts with those already held.
+ */
+bool
+LockCheckConflicts(LockMethod lockMethodTable,
+ LOCKMODE lockmode,
+ LOCK *lock,
+ PROCLOCK *proclock)
+{
+ int numLockModes = lockMethodTable->numLockModes;
+ LOCKMASK myLocks;
+ int conflictMask = lockMethodTable->conflictTab[lockmode];
+ int conflictsRemaining[MAX_LOCKMODES];
+ int totalConflictsRemaining = 0;
+ int i;
+ SHM_QUEUE *procLocks;
+ PROCLOCK *otherproclock;
+
+ /*
+ * first check for global conflicts: If no locks conflict with my request,
+ * then I get the lock.
+ *
+ * Checking for conflict: lock->grantMask represents the types of
+ * currently held locks. conflictTable[lockmode] has a bit set for each
+ * type of lock that conflicts with request. Bitwise compare tells if
+ * there is a conflict.
+ */
+ if (!(conflictMask & lock->grantMask))
+ {
+ PROCLOCK_PRINT("LockCheckConflicts: no conflict", proclock);
+ return false;
+ }
+
+ /*
+ * Rats. Something conflicts. But it could still be my own lock, or a
+ * lock held by another member of my locking group. First, figure out how
+ * many conflicts remain after subtracting out any locks I hold myself.
+ */
+ myLocks = proclock->holdMask;
+ for (i = 1; i <= numLockModes; i++)
+ {
+ if ((conflictMask & LOCKBIT_ON(i)) == 0)
+ {
+ conflictsRemaining[i] = 0;
+ continue;
+ }
+ conflictsRemaining[i] = lock->granted[i];
+ if (myLocks & LOCKBIT_ON(i))
+ --conflictsRemaining[i];
+ totalConflictsRemaining += conflictsRemaining[i];
+ }
+
+ /* If no conflicts remain, we get the lock. */
+ if (totalConflictsRemaining == 0)
+ {
+ PROCLOCK_PRINT("LockCheckConflicts: resolved (simple)", proclock);
+ return false;
+ }
+
+ /* If no group locking, it's definitely a conflict. */
+ if (proclock->groupLeader == MyProc && MyProc->lockGroupLeader == NULL)
+ {
+ Assert(proclock->tag.myProc == MyProc);
+ PROCLOCK_PRINT("LockCheckConflicts: conflicting (simple)",
+ proclock);
+ return true;
+ }
+
+ /*
+ * Relation extension and page locks conflict even between members of the
+ * same lock group.
+ */
+ if (LOCK_LOCKTAG(*lock) == LOCKTAG_RELATION_EXTEND ||
+ (LOCK_LOCKTAG(*lock) == LOCKTAG_PAGE))
+ {
+ PROCLOCK_PRINT("LockCheckConflicts: conflicting (group)",
+ proclock);
+ return true;
+ }
+
+ /*
+ * Locks held in conflicting modes by members of our own lock group are
+ * not real conflicts; we can subtract those out and see if we still have
+ * a conflict. This is O(N) in the number of processes holding or
+ * awaiting locks on this object. We could improve that by making the
+ * shared memory state more complex (and larger) but it doesn't seem worth
+ * it.
+ */
+ procLocks = &(lock->procLocks);
+ otherproclock = (PROCLOCK *)
+ SHMQueueNext(procLocks, procLocks, offsetof(PROCLOCK, lockLink));
+ while (otherproclock != NULL)
+ {
+ if (proclock != otherproclock &&
+ proclock->groupLeader == otherproclock->groupLeader &&
+ (otherproclock->holdMask & conflictMask) != 0)
+ {
+ int intersectMask = otherproclock->holdMask & conflictMask;
+
+ for (i = 1; i <= numLockModes; i++)
+ {
+ if ((intersectMask & LOCKBIT_ON(i)) != 0)
+ {
+ if (conflictsRemaining[i] <= 0)
+ elog(PANIC, "proclocks held do not match lock");
+ conflictsRemaining[i]--;
+ totalConflictsRemaining--;
+ }
+ }
+
+ if (totalConflictsRemaining == 0)
+ {
+ PROCLOCK_PRINT("LockCheckConflicts: resolved (group)",
+ proclock);
+ return false;
+ }
+ }
+ otherproclock = (PROCLOCK *)
+ SHMQueueNext(procLocks, &otherproclock->lockLink,
+ offsetof(PROCLOCK, lockLink));
+ }
+
+ /* Nope, it's a real conflict. */
+ PROCLOCK_PRINT("LockCheckConflicts: conflicting (group)", proclock);
+ return true;
+}
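+
+/*
+ * Worked example: ShareLock is requested, lock->granted[] records two
+ * RowExclusiveLock holders, and one of them belongs to our lock group.  The
+ * first pass leaves conflictsRemaining[RowExclusiveLock] = 2 (we hold no
+ * RowExclusiveLock ourselves), and the group-member scan then subtracts the
+ * teammate's hold, leaving one genuine conflict, so the request must wait.
+ */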
+
+/*
+ * GrantLock -- update the lock and proclock data structures to show
+ * the lock request has been granted.
+ *
+ * NOTE: if proc was blocked, it also needs to be removed from the wait list
+ * and have its waitLock/waitProcLock fields cleared. That's not done here.
+ *
+ * NOTE: the lock grant also has to be recorded in the associated LOCALLOCK
+ * table entry; but since we may be awaking some other process, we can't do
+ * that here; it's done by GrantLockLocal, instead.
+ */
+void
+GrantLock(LOCK *lock, PROCLOCK *proclock, LOCKMODE lockmode)
+{
+ lock->nGranted++;
+ lock->granted[lockmode]++;
+ lock->grantMask |= LOCKBIT_ON(lockmode);
+ if (lock->granted[lockmode] == lock->requested[lockmode])
+ lock->waitMask &= LOCKBIT_OFF(lockmode);
+ proclock->holdMask |= LOCKBIT_ON(lockmode);
+ LOCK_PRINT("GrantLock", lock, lockmode);
+ Assert((lock->nGranted > 0) && (lock->granted[lockmode] > 0));
+ Assert(lock->nGranted <= lock->nRequested);
+}
+
+/*
+ * UnGrantLock -- opposite of GrantLock.
+ *
+ * Updates the lock and proclock data structures to show that the lock
+ * is no longer held nor requested by the current holder.
+ *
+ * Returns true if there were any waiters waiting on the lock that
+ * should now be woken up with ProcLockWakeup.
+ */
+static bool
+UnGrantLock(LOCK *lock, LOCKMODE lockmode,
+ PROCLOCK *proclock, LockMethod lockMethodTable)
+{
+ bool wakeupNeeded = false;
+
+ Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0));
+ Assert((lock->nGranted > 0) && (lock->granted[lockmode] > 0));
+ Assert(lock->nGranted <= lock->nRequested);
+
+ /*
+ * fix the general lock stats
+ */
+ lock->nRequested--;
+ lock->requested[lockmode]--;
+ lock->nGranted--;
+ lock->granted[lockmode]--;
+
+ if (lock->granted[lockmode] == 0)
+ {
+ /* change the conflict mask. No more of this lock type. */
+ lock->grantMask &= LOCKBIT_OFF(lockmode);
+ }
+
+ LOCK_PRINT("UnGrantLock: updated", lock, lockmode);
+
+ /*
+ * We need only run ProcLockWakeup if the released lock conflicts with at
+ * least one of the lock types requested by waiter(s). Otherwise whatever
+ * conflict made them wait must still exist. NOTE: before MVCC, we could
+ * skip wakeup if lock->granted[lockmode] was still positive. But that's
+ * not true anymore, because the remaining granted locks might belong to
+ * some waiter, who could now be awakened because he doesn't conflict with
+ * his own locks.
+ */
+ if (lockMethodTable->conflictTab[lockmode] & lock->waitMask)
+ wakeupNeeded = true;
+
+ /*
+ * Now fix the per-proclock state.
+ */
+ proclock->holdMask &= LOCKBIT_OFF(lockmode);
+ PROCLOCK_PRINT("UnGrantLock: updated", proclock);
+
+ return wakeupNeeded;
+}
+
+/*
+ * CleanUpLock -- clean up after releasing a lock. We garbage-collect the
+ * proclock and lock objects if possible, and call ProcLockWakeup if there
+ * are remaining requests and the caller says it's OK. (Normally, this
+ * should be called after UnGrantLock, and wakeupNeeded is the result from
+ * UnGrantLock.)
+ *
+ * The appropriate partition lock must be held at entry, and will be
+ * held at exit.
+ */
+static void
+CleanUpLock(LOCK *lock, PROCLOCK *proclock,
+ LockMethod lockMethodTable, uint32 hashcode,
+ bool wakeupNeeded)
+{
+ /*
+ * If this was my last hold on this lock, delete my entry in the proclock
+ * table.
+ */
+ if (proclock->holdMask == 0)
+ {
+ uint32 proclock_hashcode;
+
+ PROCLOCK_PRINT("CleanUpLock: deleting", proclock);
+ SHMQueueDelete(&proclock->lockLink);
+ SHMQueueDelete(&proclock->procLink);
+ proclock_hashcode = ProcLockHashCode(&proclock->tag, hashcode);
+ if (!hash_search_with_hash_value(LockMethodProcLockHash,
+ (void *) &(proclock->tag),
+ proclock_hashcode,
+ HASH_REMOVE,
+ NULL))
+ elog(PANIC, "proclock table corrupted");
+ }
+
+ if (lock->nRequested == 0)
+ {
+ /*
+ * The caller just released the last lock, so garbage-collect the lock
+ * object.
+ */
+ LOCK_PRINT("CleanUpLock: deleting", lock, 0);
+ Assert(SHMQueueEmpty(&(lock->procLocks)));
+ if (!hash_search_with_hash_value(LockMethodLockHash,
+ (void *) &(lock->tag),
+ hashcode,
+ HASH_REMOVE,
+ NULL))
+ elog(PANIC, "lock table corrupted");
+ }
+ else if (wakeupNeeded)
+ {
+ /* There are waiters on this lock, so wake them up. */
+ ProcLockWakeup(lockMethodTable, lock);
+ }
+}
+
+/*
+ * GrantLockLocal -- update the locallock data structures to show
+ * the lock request has been granted.
+ *
+ * We expect that LockAcquire made sure there is room to add a new
+ * ResourceOwner entry.
+ */
+static void
+GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner)
+{
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+ int i;
+
+ Assert(locallock->numLockOwners < locallock->maxLockOwners);
+ /* Count the total */
+ locallock->nLocks++;
+ /* Count the per-owner lock */
+ for (i = 0; i < locallock->numLockOwners; i++)
+ {
+ if (lockOwners[i].owner == owner)
+ {
+ lockOwners[i].nLocks++;
+ return;
+ }
+ }
+ lockOwners[i].owner = owner;
+ lockOwners[i].nLocks = 1;
+ locallock->numLockOwners++;
+ if (owner != NULL)
+ ResourceOwnerRememberLock(owner, locallock);
+
+ /* Indicate that the lock is acquired for certain types of locks. */
+ CheckAndSetLockHeld(locallock, true);
+}
+
+/*
+ * BeginStrongLockAcquire - inhibit use of fastpath for a given LOCALLOCK,
+ * and arrange for error cleanup if it fails
+ */
+static void
+BeginStrongLockAcquire(LOCALLOCK *locallock, uint32 fasthashcode)
+{
+ Assert(StrongLockInProgress == NULL);
+ Assert(locallock->holdsStrongLockCount == false);
+
+ /*
+ * Adding to a memory location is not atomic, so we take a spinlock to
+ * ensure we don't collide with someone else trying to bump the count at
+ * the same time.
+ *
+ * XXX: It might be worth considering using an atomic fetch-and-add
+ * instruction here, on architectures where that is supported.
+ */
+
+ SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ FastPathStrongRelationLocks->count[fasthashcode]++;
+ locallock->holdsStrongLockCount = true;
+ StrongLockInProgress = locallock;
+ SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+}
+
+/*
+ * FinishStrongLockAcquire - cancel pending cleanup for a strong lock
+ * acquisition once it's no longer needed
+ */
+static void
+FinishStrongLockAcquire(void)
+{
+ StrongLockInProgress = NULL;
+}
+
+/*
+ * AbortStrongLockAcquire - undo strong lock state changes performed by
+ * BeginStrongLockAcquire.
+ */
+void
+AbortStrongLockAcquire(void)
+{
+ uint32 fasthashcode;
+ LOCALLOCK *locallock = StrongLockInProgress;
+
+ if (locallock == NULL)
+ return;
+
+ fasthashcode = FastPathStrongLockHashPartition(locallock->hashcode);
+ Assert(locallock->holdsStrongLockCount == true);
+ SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0);
+ FastPathStrongRelationLocks->count[fasthashcode]--;
+ locallock->holdsStrongLockCount = false;
+ StrongLockInProgress = NULL;
+ SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+}
+
+/*
+ * GrantAwaitedLock -- call GrantLockLocal for the lock we are doing
+ * WaitOnLock on.
+ *
+ * proc.c needs this for the case where we are booted off the lock by
+ * timeout, but discover that someone granted us the lock anyway.
+ *
+ * We could just export GrantLockLocal, but that would require including
+ * resowner.h in lock.h, which creates circularity.
+ */
+void
+GrantAwaitedLock(void)
+{
+ GrantLockLocal(awaitedLock, awaitedOwner);
+}
+
+/*
+ * MarkLockClear -- mark an acquired lock as "clear"
+ *
+ * This means that we know we have absorbed all sinval messages that other
+ * sessions generated before we acquired this lock, and so we can confidently
+ * assume we know about any catalog changes protected by this lock.
+ */
+void
+MarkLockClear(LOCALLOCK *locallock)
+{
+ Assert(locallock->nLocks > 0);
+ locallock->lockCleared = true;
+}
+
+/*
+ * WaitOnLock -- wait to acquire a lock
+ *
+ * Caller must have set MyProc->heldLocks to reflect locks already held
+ * on the lockable object by this process.
+ *
+ * The appropriate partition lock must be held at entry.
+ */
+static void
+WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner)
+{
+ LOCKMETHODID lockmethodid = LOCALLOCK_LOCKMETHOD(*locallock);
+ LockMethod lockMethodTable = LockMethods[lockmethodid];
+ char *volatile new_status = NULL;
+
+ LOCK_PRINT("WaitOnLock: sleeping on lock",
+ locallock->lock, locallock->tag.mode);
+
+ /* Report change to waiting status */
+ if (update_process_title)
+ {
+ const char *old_status;
+ int len;
+
+ old_status = get_ps_display(&len);
+ new_status = (char *) palloc(len + 8 + 1);
+ memcpy(new_status, old_status, len);
+ strcpy(new_status + len, " waiting");
+ set_ps_display(new_status);
+ new_status[len] = '\0'; /* truncate off " waiting" */
+ }
+
+ awaitedLock = locallock;
+ awaitedOwner = owner;
+
+ /*
+ * NOTE: Think not to put any shared-state cleanup after the call to
+ * ProcSleep, in either the normal or failure path. The lock state must
+ * be fully set by the lock grantor, or by CheckDeadLock if we give up
+ * waiting for the lock. This is necessary because of the possibility
+ * that a cancel/die interrupt will interrupt ProcSleep after someone else
+ * grants us the lock, but before we've noticed it. Hence, after granting,
+ * the locktable state must fully reflect the fact that we own the lock;
+ * we can't do additional work on return.
+ *
+ * We can and do use a PG_TRY block to try to clean up after failure, but
+ * this still has a major limitation: elog(FATAL) can occur while waiting
+ * (eg, a "die" interrupt), and then control won't come back here. So all
+ * cleanup of essential state should happen in LockErrorCleanup, not here.
+ * We can use PG_TRY to clear the "waiting" status flags, since doing that
+ * is unimportant if the process exits.
+ */
+ PG_TRY();
+ {
+ if (ProcSleep(locallock, lockMethodTable) != PROC_WAIT_STATUS_OK)
+ {
+ /*
+ * We failed as a result of a deadlock, see CheckDeadLock(). Quit
+ * now.
+ */
+ awaitedLock = NULL;
+ LOCK_PRINT("WaitOnLock: aborting on lock",
+ locallock->lock, locallock->tag.mode);
+ LWLockRelease(LockHashPartitionLock(locallock->hashcode));
+
+ /*
+ * Now that we aren't holding the partition lock, we can give an
+ * error report including details about the detected deadlock.
+ */
+ DeadLockReport();
+ /* not reached */
+ }
+ }
+ PG_CATCH();
+ {
+ /* In this path, awaitedLock remains set until LockErrorCleanup */
+
+ /* Report change to non-waiting status */
+ if (update_process_title)
+ {
+ set_ps_display(new_status);
+ pfree(new_status);
+ }
+
+ /* and propagate the error */
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ awaitedLock = NULL;
+
+ /* Report change to non-waiting status */
+ if (update_process_title)
+ {
+ set_ps_display(new_status);
+ pfree(new_status);
+ }
+
+ LOCK_PRINT("WaitOnLock: wakeup on lock",
+ locallock->lock, locallock->tag.mode);
+}
+
+/*
+ * Remove a proc from the wait-queue it is on (caller must know it is on one).
+ * This is only used when the proc has failed to get the lock, so we set its
+ * waitStatus to PROC_WAIT_STATUS_ERROR.
+ *
+ * Appropriate partition lock must be held by caller. Also, caller is
+ * responsible for signaling the proc if needed.
+ *
+ * NB: this does not clean up any locallock object that may exist for the lock.
+ */
+void
+RemoveFromWaitQueue(PGPROC *proc, uint32 hashcode)
+{
+ LOCK *waitLock = proc->waitLock;
+ PROCLOCK *proclock = proc->waitProcLock;
+ LOCKMODE lockmode = proc->waitLockMode;
+ LOCKMETHODID lockmethodid = LOCK_LOCKMETHOD(*waitLock);
+
+ /* Make sure proc is waiting */
+ Assert(proc->waitStatus == PROC_WAIT_STATUS_WAITING);
+ Assert(proc->links.next != NULL);
+ Assert(waitLock);
+ Assert(waitLock->waitProcs.size > 0);
+ Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods));
+
+ /* Remove proc from lock's wait queue */
+ SHMQueueDelete(&(proc->links));
+ waitLock->waitProcs.size--;
+
+ /* Undo increments of request counts by waiting process */
+ Assert(waitLock->nRequested > 0);
+ Assert(waitLock->nRequested > proc->waitLock->nGranted);
+ waitLock->nRequested--;
+ Assert(waitLock->requested[lockmode] > 0);
+ waitLock->requested[lockmode]--;
+ /* don't forget to clear waitMask bit if appropriate */
+ if (waitLock->granted[lockmode] == waitLock->requested[lockmode])
+ waitLock->waitMask &= LOCKBIT_OFF(lockmode);
+
+ /* Clean up the proc's own state, and pass it the ok/fail signal */
+ proc->waitLock = NULL;
+ proc->waitProcLock = NULL;
+ proc->waitStatus = PROC_WAIT_STATUS_ERROR;
+
+ /*
+ * Delete the proclock immediately if it represents no already-held locks.
+ * (This must happen now because if the owner of the lock decides to
+ * release it, and the requested/granted counts then go to zero,
+ * LockRelease expects there to be no remaining proclocks.) Then see if
+ * any other waiters for the lock can be woken up now.
+ */
+ CleanUpLock(waitLock, proclock,
+ LockMethods[lockmethodid], hashcode,
+ true);
+}
+
+/*
+ * LockRelease -- look up 'locktag' and release one 'lockmode' lock on it.
+ * Release a session lock if 'sessionLock' is true, else release a
+ * regular transaction lock.
+ *
+ * Side Effects: find any waiting processes that are now wakable,
+ * grant them their requested locks and awaken them.
+ * (We have to grant the lock here to avoid a race between
+ * the waking process and any new process to
+ * come along and request the lock.)
+ */
+bool
+LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
+{
+ LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+ LockMethod lockMethodTable;
+ LOCALLOCKTAG localtag;
+ LOCALLOCK *locallock;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ LWLock *partitionLock;
+ bool wakeupNeeded;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+ if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
+ elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+#ifdef LOCK_DEBUG
+ if (LOCK_DEBUG_ENABLED(locktag))
+ elog(LOG, "LockRelease: lock [%u,%u] %s",
+ locktag->locktag_field1, locktag->locktag_field2,
+ lockMethodTable->lockModeNames[lockmode]);
+#endif
+
+ /*
+ * Find the LOCALLOCK entry for this lock and lockmode
+ */
+ MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */
+ localtag.lock = *locktag;
+ localtag.mode = lockmode;
+
+ locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+ (void *) &localtag,
+ HASH_FIND, NULL);
+
+ /*
+ * let the caller print its own error message, too. Do not ereport(ERROR).
+ */
+ if (!locallock || locallock->nLocks <= 0)
+ {
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ return false;
+ }
+
+ /*
+ * Decrease the count for the resource owner.
+ */
+ {
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+ ResourceOwner owner;
+ int i;
+
+ /* Identify owner for lock */
+ if (sessionLock)
+ owner = NULL;
+ else
+ owner = CurrentResourceOwner;
+
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == owner)
+ {
+ Assert(lockOwners[i].nLocks > 0);
+ if (--lockOwners[i].nLocks == 0)
+ {
+ if (owner != NULL)
+ ResourceOwnerForgetLock(owner, locallock);
+ /* compact out unused slot */
+ locallock->numLockOwners--;
+ if (i < locallock->numLockOwners)
+ lockOwners[i] = lockOwners[locallock->numLockOwners];
+ }
+ break;
+ }
+ }
+ if (i < 0)
+ {
+ /* don't release a lock belonging to another owner */
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ return false;
+ }
+ }
+
+ /*
+ * Decrease the total local count. If we're still holding the lock, we're
+ * done.
+ */
+ locallock->nLocks--;
+
+ if (locallock->nLocks > 0)
+ return true;
+
+ /*
+ * At this point we can no longer suppose we are clear of invalidation
+ * messages related to this lock. Although we'll delete the LOCALLOCK
+ * object before any intentional return from this routine, it seems worth
+ * the trouble to explicitly reset lockCleared right now, just in case
+ * some error prevents us from deleting the LOCALLOCK.
+ */
+ locallock->lockCleared = false;
+
+ /* Attempt fast release of any lock eligible for the fast path. */
+ if (EligibleForRelationFastPath(locktag, lockmode) &&
+ FastPathLocalUseCount > 0)
+ {
+ bool released;
+
+ /*
+ * We might not find the lock here, even if we originally entered it
+ * here. Another backend may have moved it to the main table.
+ */
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+ released = FastPathUnGrantRelationLock(locktag->locktag_field2,
+ lockmode);
+ LWLockRelease(&MyProc->fpInfoLock);
+ if (released)
+ {
+ RemoveLocalLock(locallock);
+ return true;
+ }
+ }
+
+ /*
+ * Otherwise we've got to mess with the shared lock table.
+ */
+ partitionLock = LockHashPartitionLock(locallock->hashcode);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Normally, we don't need to re-find the lock or proclock, since we kept
+ * their addresses in the locallock table, and they couldn't have been
+ * removed while we were holding a lock on them. But it's possible that
+ * the lock was taken fast-path and has since been moved to the main hash
+ * table by another backend, in which case we will need to look up the
+ * objects here. We assume the lock field is NULL if so.
+ */
+ lock = locallock->lock;
+ if (!lock)
+ {
+ PROCLOCKTAG proclocktag;
+
+ Assert(EligibleForRelationFastPath(locktag, lockmode));
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ (const void *) locktag,
+ locallock->hashcode,
+ HASH_FIND,
+ NULL);
+ if (!lock)
+ elog(ERROR, "failed to re-find shared lock object");
+ locallock->lock = lock;
+
+ proclocktag.myLock = lock;
+ proclocktag.myProc = MyProc;
+ locallock->proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash,
+ (void *) &proclocktag,
+ HASH_FIND,
+ NULL);
+ if (!locallock->proclock)
+ elog(ERROR, "failed to re-find shared proclock object");
+ }
+ LOCK_PRINT("LockRelease: found", lock, lockmode);
+ proclock = locallock->proclock;
+ PROCLOCK_PRINT("LockRelease: found", proclock);
+
+ /*
+ * Double-check that we are actually holding a lock of the type we want to
+ * release.
+ */
+ if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+ {
+ PROCLOCK_PRINT("LockRelease: WRONGTYPE", proclock);
+ LWLockRelease(partitionLock);
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ RemoveLocalLock(locallock);
+ return false;
+ }
+
+ /*
+ * Do the releasing. CleanUpLock will waken any now-wakable waiters.
+ */
+ wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable);
+
+ CleanUpLock(lock, proclock,
+ lockMethodTable, locallock->hashcode,
+ wakeupNeeded);
+
+ LWLockRelease(partitionLock);
+
+ RemoveLocalLock(locallock);
+ return true;
+}
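+
+/*
+ * Editorial sketch, not part of the original source: the usual pairing with
+ * LockAcquire, assuming the declarations from lock.h and miscadmin.h that
+ * this file already uses (SET_LOCKTAG_RELATION, LockAcquire, MyDatabaseId).
+ * Most code reaches these routines through the wrappers in lmgr.c, such as
+ * LockRelation()/UnlockRelation().
+ *
+ *    LOCKTAG tag;
+ *
+ *    SET_LOCKTAG_RELATION(tag, MyDatabaseId, relid);
+ *    (void) LockAcquire(&tag, AccessShareLock, false, false);
+ *    ... use the relation ...
+ *    LockRelease(&tag, AccessShareLock, false);
+ *
+ * Passing sessionLock = true to both calls would instead take and release a
+ * session-level hold, which survives transaction end.
+ */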
+
+/*
+ * LockReleaseAll -- Release all locks of the specified lock method that
+ * are held by the current process.
+ *
+ * Well, not necessarily *all* locks. The available behaviors are:
+ * allLocks == true: release all locks including session locks.
+ * allLocks == false: release all non-session locks.
+ */
+void
+LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
+{
+ HASH_SEQ_STATUS status;
+ LockMethod lockMethodTable;
+ int i,
+ numLockModes;
+ LOCALLOCK *locallock;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ int partition;
+ bool have_fast_path_lwlock = false;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+
+#ifdef LOCK_DEBUG
+ if (*(lockMethodTable->trace_flag))
+ elog(LOG, "LockReleaseAll: lockmethod=%d", lockmethodid);
+#endif
+
+ /*
+ * Get rid of our fast-path VXID lock, if appropriate. Note that this is
+ * the only way that the lock we hold on our own VXID can ever get
+ * released: it is always and only released when a toplevel transaction
+ * ends.
+ */
+ if (lockmethodid == DEFAULT_LOCKMETHOD)
+ VirtualXactLockTableCleanup();
+
+ numLockModes = lockMethodTable->numLockModes;
+
+ /*
+ * First we run through the locallock table and get rid of unwanted
+ * entries, then we scan the process's proclocks and get rid of those. We
+ * do this separately because we may have multiple locallock entries
+ * pointing to the same proclock, and we daren't end up with any dangling
+ * pointers. Fast-path locks are cleaned up during the locallock table
+ * scan, though.
+ */
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ /*
+ * If the LOCALLOCK entry is unused, we must've run out of shared
+ * memory while trying to set up this lock. Just forget the local
+ * entry.
+ */
+ if (locallock->nLocks == 0)
+ {
+ RemoveLocalLock(locallock);
+ continue;
+ }
+
+ /* Ignore items that are not of the lockmethod to be removed */
+ if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid)
+ continue;
+
+ /*
+ * If we are asked to release all locks, we can just zap the entry.
+ * Otherwise, must scan to see if there are session locks. We assume
+ * there is at most one lockOwners entry for session locks.
+ */
+ if (!allLocks)
+ {
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+
+ /* If session lock is above array position 0, move it down to 0 */
+ for (i = 0; i < locallock->numLockOwners; i++)
+ {
+ if (lockOwners[i].owner == NULL)
+ lockOwners[0] = lockOwners[i];
+ else
+ ResourceOwnerForgetLock(lockOwners[i].owner, locallock);
+ }
+
+ if (locallock->numLockOwners > 0 &&
+ lockOwners[0].owner == NULL &&
+ lockOwners[0].nLocks > 0)
+ {
+ /* Fix the locallock to show just the session locks */
+ locallock->nLocks = lockOwners[0].nLocks;
+ locallock->numLockOwners = 1;
+ /* We aren't deleting this locallock, so done */
+ continue;
+ }
+ else
+ locallock->numLockOwners = 0;
+ }
+
+ /*
+ * If the lock or proclock pointers are NULL, this lock was taken via
+ * the relation fast-path (and is not known to have been transferred).
+ */
+ if (locallock->proclock == NULL || locallock->lock == NULL)
+ {
+ LOCKMODE lockmode = locallock->tag.mode;
+ Oid relid;
+
+ /* Verify that a fast-path lock is what we've got. */
+ if (!EligibleForRelationFastPath(&locallock->tag.lock, lockmode))
+ elog(PANIC, "locallock table corrupted");
+
+ /*
+ * If we don't currently hold the LWLock that protects our
+ * fast-path data structures, we must acquire it before attempting
+ * to release the lock via the fast-path. We will continue to
+ * hold the LWLock until we're done scanning the locallock table,
+ * unless we hit a transferred fast-path lock. (XXX is this
+ * really such a good idea? There could be a lot of entries ...)
+ */
+ if (!have_fast_path_lwlock)
+ {
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+ have_fast_path_lwlock = true;
+ }
+
+ /* Attempt fast-path release. */
+ relid = locallock->tag.lock.locktag_field2;
+ if (FastPathUnGrantRelationLock(relid, lockmode))
+ {
+ RemoveLocalLock(locallock);
+ continue;
+ }
+
+ /*
+ * Our lock, originally taken via the fast path, has been
+ * transferred to the main lock table. That's going to require
+ * some extra work, so release our fast-path lock before starting.
+ */
+ LWLockRelease(&MyProc->fpInfoLock);
+ have_fast_path_lwlock = false;
+
+ /*
+ * Now dump the lock. We haven't got a pointer to the LOCK or
+ * PROCLOCK in this case, so we have to handle this a bit
+ * differently than a normal lock release. Unfortunately, this
+ * requires an extra LWLock acquire-and-release cycle on the
+ * partitionLock, but hopefully it shouldn't happen often.
+ */
+ LockRefindAndRelease(lockMethodTable, MyProc,
+ &locallock->tag.lock, lockmode, false);
+ RemoveLocalLock(locallock);
+ continue;
+ }
+
+ /* Mark the proclock to show we need to release this lockmode */
+ if (locallock->nLocks > 0)
+ locallock->proclock->releaseMask |= LOCKBIT_ON(locallock->tag.mode);
+
+ /* And remove the locallock hashtable entry */
+ RemoveLocalLock(locallock);
+ }
+
+ /* Done with the fast-path data structures */
+ if (have_fast_path_lwlock)
+ LWLockRelease(&MyProc->fpInfoLock);
+
+ /*
+ * Now, scan each lock partition separately.
+ */
+ for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++)
+ {
+ LWLock *partitionLock;
+ SHM_QUEUE *procLocks = &(MyProc->myProcLocks[partition]);
+ PROCLOCK *nextplock;
+
+ partitionLock = LockHashPartitionLockByIndex(partition);
+
+ /*
+ * If the proclock list for this partition is empty, we can skip
+ * acquiring the partition lock. This optimization is trickier than
+ * it looks, because another backend could be in process of adding
+ * something to our proclock list due to promoting one of our
+ * fast-path locks. However, any such lock must be one that we
+ * decided not to delete above, so it's okay to skip it again now;
+ * we'd just decide not to delete it again. We must, however, be
+ * careful to re-fetch the list header once we've acquired the
+ * partition lock, to be sure we have a valid, up-to-date pointer.
+ * (There is probably no significant risk if pointer fetch/store is
+ * atomic, but we don't wish to assume that.)
+ *
+ * XXX This argument assumes that the locallock table correctly
+ * represents all of our fast-path locks. While allLocks mode
+ * guarantees to clean up all of our normal locks regardless of the
+ * locallock situation, we lose that guarantee for fast-path locks.
+ * This is not ideal.
+ */
+ if (SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, procLink)) == NULL)
+ continue; /* needn't examine this partition */
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ for (proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, procLink));
+ proclock;
+ proclock = nextplock)
+ {
+ bool wakeupNeeded = false;
+
+ /* Get link first, since we may unlink/delete this proclock */
+ nextplock = (PROCLOCK *)
+ SHMQueueNext(procLocks, &proclock->procLink,
+ offsetof(PROCLOCK, procLink));
+
+ Assert(proclock->tag.myProc == MyProc);
+
+ lock = proclock->tag.myLock;
+
+ /* Ignore items that are not of the lockmethod to be removed */
+ if (LOCK_LOCKMETHOD(*lock) != lockmethodid)
+ continue;
+
+ /*
+ * In allLocks mode, force release of all locks even if locallock
+ * table had problems
+ */
+ if (allLocks)
+ proclock->releaseMask = proclock->holdMask;
+ else
+ Assert((proclock->releaseMask & ~proclock->holdMask) == 0);
+
+ /*
+ * Ignore items that have nothing to be released, unless they have
+ * holdMask == 0 and are therefore recyclable
+ */
+ if (proclock->releaseMask == 0 && proclock->holdMask != 0)
+ continue;
+
+ PROCLOCK_PRINT("LockReleaseAll", proclock);
+ LOCK_PRINT("LockReleaseAll", lock, 0);
+ Assert(lock->nRequested >= 0);
+ Assert(lock->nGranted >= 0);
+ Assert(lock->nGranted <= lock->nRequested);
+ Assert((proclock->holdMask & ~lock->grantMask) == 0);
+
+ /*
+ * Release the previously-marked lock modes
+ */
+ for (i = 1; i <= numLockModes; i++)
+ {
+ if (proclock->releaseMask & LOCKBIT_ON(i))
+ wakeupNeeded |= UnGrantLock(lock, i, proclock,
+ lockMethodTable);
+ }
+ Assert((lock->nRequested >= 0) && (lock->nGranted >= 0));
+ Assert(lock->nGranted <= lock->nRequested);
+ LOCK_PRINT("LockReleaseAll: updated", lock, 0);
+
+ proclock->releaseMask = 0;
+
+ /* CleanUpLock will wake up waiters if needed. */
+ CleanUpLock(lock, proclock,
+ lockMethodTable,
+ LockTagHashCode(&lock->tag),
+ wakeupNeeded);
+ } /* loop over PROCLOCKs within this partition */
+
+ LWLockRelease(partitionLock);
+ } /* loop over partitions */
+
+#ifdef LOCK_DEBUG
+ if (*(lockMethodTable->trace_flag))
+ elog(LOG, "LockReleaseAll done");
+#endif
+}
+
+/*
+ * LockReleaseSession -- Release all session locks of the specified lock method
+ * that are held by the current process.
+ */
+void
+LockReleaseSession(LOCKMETHODID lockmethodid)
+{
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ /* Ignore items that are not of the specified lock method */
+ if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid)
+ continue;
+
+ ReleaseLockIfHeld(locallock, true);
+ }
+}
+
+/*
+ * LockReleaseCurrentOwner
+ * Release all locks belonging to CurrentResourceOwner
+ *
+ * If the caller knows what those locks are, it can pass them as an array.
+ * That speeds up the call significantly, when a lot of locks are held.
+ * Otherwise, pass NULL for locallocks, and we'll traverse through our hash
+ * table to find them.
+ */
+void
+LockReleaseCurrentOwner(LOCALLOCK **locallocks, int nlocks)
+{
+ if (locallocks == NULL)
+ {
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ ReleaseLockIfHeld(locallock, false);
+ }
+ else
+ {
+ int i;
+
+ for (i = nlocks - 1; i >= 0; i--)
+ ReleaseLockIfHeld(locallocks[i], false);
+ }
+}
+
+/*
+ * ReleaseLockIfHeld
+ * Release any session-level locks on this lockable object if sessionLock
+ * is true; else, release any locks held by CurrentResourceOwner.
+ *
+ * It is tempting to pass this a ResourceOwner pointer (or NULL for session
+ * locks), but without refactoring LockRelease() we cannot support releasing
+ * locks belonging to resource owners other than CurrentResourceOwner.
+ * If we were to refactor, it'd be a good idea to fix it so we don't have to
+ * do a hashtable lookup of the locallock, too. However, currently this
+ * function isn't used heavily enough to justify refactoring for its
+ * convenience.
+ */
+static void
+ReleaseLockIfHeld(LOCALLOCK *locallock, bool sessionLock)
+{
+ ResourceOwner owner;
+ LOCALLOCKOWNER *lockOwners;
+ int i;
+
+ /* Identify owner for lock (must match LockRelease!) */
+ if (sessionLock)
+ owner = NULL;
+ else
+ owner = CurrentResourceOwner;
+
+ /* Scan to see if there are any locks belonging to the target owner */
+ lockOwners = locallock->lockOwners;
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == owner)
+ {
+ Assert(lockOwners[i].nLocks > 0);
+ if (lockOwners[i].nLocks < locallock->nLocks)
+ {
+ /*
+ * We will still hold this lock after forgetting this
+ * ResourceOwner.
+ */
+ locallock->nLocks -= lockOwners[i].nLocks;
+ /* compact out unused slot */
+ locallock->numLockOwners--;
+ if (owner != NULL)
+ ResourceOwnerForgetLock(owner, locallock);
+ if (i < locallock->numLockOwners)
+ lockOwners[i] = lockOwners[locallock->numLockOwners];
+ }
+ else
+ {
+ Assert(lockOwners[i].nLocks == locallock->nLocks);
+ /* We want to call LockRelease just once */
+ lockOwners[i].nLocks = 1;
+ locallock->nLocks = 1;
+ if (!LockRelease(&locallock->tag.lock,
+ locallock->tag.mode,
+ sessionLock))
+ elog(WARNING, "ReleaseLockIfHeld: failed??");
+ }
+ break;
+ }
+ }
+}
+
+/*
+ * LockReassignCurrentOwner
+ * Reassign all locks belonging to CurrentResourceOwner to belong
+ * to its parent resource owner.
+ *
+ * If the caller knows what those locks are, it can pass them as an array.
+ * That speeds up the call significantly, when a lot of locks are held
+ * (e.g. pg_dump with a large schema). Otherwise, pass NULL for locallocks,
+ * and we'll traverse through our hash table to find them.
+ */
+void
+LockReassignCurrentOwner(LOCALLOCK **locallocks, int nlocks)
+{
+ ResourceOwner parent = ResourceOwnerGetParent(CurrentResourceOwner);
+
+ Assert(parent != NULL);
+
+ if (locallocks == NULL)
+ {
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ LockReassignOwner(locallock, parent);
+ }
+ else
+ {
+ int i;
+
+ for (i = nlocks - 1; i >= 0; i--)
+ LockReassignOwner(locallocks[i], parent);
+ }
+}
+
+/*
+ * Subroutine of LockReassignCurrentOwner. Reassigns a given lock belonging to
+ * CurrentResourceOwner to its parent.
+ */
+static void
+LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent)
+{
+ LOCALLOCKOWNER *lockOwners;
+ int i;
+ int ic = -1;
+ int ip = -1;
+
+ /*
+ * Scan to see if there are any locks belonging to current owner or its
+ * parent
+ */
+ lockOwners = locallock->lockOwners;
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == CurrentResourceOwner)
+ ic = i;
+ else if (lockOwners[i].owner == parent)
+ ip = i;
+ }
+
+ if (ic < 0)
+ return; /* no current locks */
+
+ if (ip < 0)
+ {
+ /* Parent has no slot, so just give it the child's slot */
+ lockOwners[ic].owner = parent;
+ ResourceOwnerRememberLock(parent, locallock);
+ }
+ else
+ {
+ /* Merge child's count with parent's */
+ lockOwners[ip].nLocks += lockOwners[ic].nLocks;
+ /* compact out unused slot */
+ locallock->numLockOwners--;
+ if (ic < locallock->numLockOwners)
+ lockOwners[ic] = lockOwners[locallock->numLockOwners];
+ }
+ ResourceOwnerForgetLock(CurrentResourceOwner, locallock);
+}
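+
+/*
+ * Editorial aside, not part of the original source: the "compact out unused
+ * slot" step above is the standard idiom for deleting from an unordered
+ * array; shown generically (array, i and n are illustrative names only):
+ *
+ *    n--;
+ *    if (i < n)
+ *        array[i] = array[n];
+ *
+ * Order within lockOwners[] is not significant, so moving the last entry
+ * into the vacated slot is all that is needed.
+ */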
+
+/*
+ * FastPathGrantRelationLock
+ * Grant lock using per-backend fast-path array, if there is space.
+ */
+static bool
+FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode)
+{
+ uint32 f;
+ uint32 unused_slot = FP_LOCK_SLOTS_PER_BACKEND;
+
+ /* Scan for existing entry for this relid, remembering empty slot. */
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+ {
+ if (FAST_PATH_GET_BITS(MyProc, f) == 0)
+ unused_slot = f;
+ else if (MyProc->fpRelId[f] == relid)
+ {
+ Assert(!FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode));
+ FAST_PATH_SET_LOCKMODE(MyProc, f, lockmode);
+ return true;
+ }
+ }
+
+ /* If no existing entry, use any empty slot. */
+ if (unused_slot < FP_LOCK_SLOTS_PER_BACKEND)
+ {
+ MyProc->fpRelId[unused_slot] = relid;
+ FAST_PATH_SET_LOCKMODE(MyProc, unused_slot, lockmode);
+ ++FastPathLocalUseCount;
+ return true;
+ }
+
+ /* No existing entry, and no empty slot. */
+ return false;
+}
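+
+/*
+ * Editorial sketch, not part of the original source: how a fast-path grant
+ * is encoded, per the FAST_PATH_* macros defined earlier in this file.  Each
+ * backend owns FP_LOCK_SLOTS_PER_BACKEND slots; slot f pairs a relation OID
+ * in MyProc->fpRelId[f] with a small group of mode bits in fpLockBits,
+ * roughly:
+ *
+ *    bit index = FAST_PATH_BITS_PER_SLOT * f
+ *                + (lockmode - FAST_PATH_LOCKNUMBER_OFFSET)
+ *
+ * Only the weak relation lock modes starting at AccessShareLock fit in a
+ * slot, which is what EligibleForRelationFastPath enforces on the callers.
+ */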
+
+/*
+ * FastPathUnGrantRelationLock
+ * Release fast-path lock, if present. Update backend-private local
+ * use count, while we're at it.
+ */
+static bool
+FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode)
+{
+ uint32 f;
+ bool result = false;
+
+ FastPathLocalUseCount = 0;
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+ {
+ if (MyProc->fpRelId[f] == relid
+ && FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode))
+ {
+ Assert(!result);
+ FAST_PATH_CLEAR_LOCKMODE(MyProc, f, lockmode);
+ result = true;
+ /* we continue iterating so as to update FastPathLocalUseCount */
+ }
+ if (FAST_PATH_GET_BITS(MyProc, f) != 0)
+ ++FastPathLocalUseCount;
+ }
+ return result;
+}
+
+/*
+ * FastPathTransferRelationLocks
+ * Transfer locks matching the given lock tag from per-backend fast-path
+ * arrays to the shared hash table.
+ *
+ * Returns true if successful, false if ran out of shared memory.
+ */
+static bool
+FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag,
+ uint32 hashcode)
+{
+ LWLock *partitionLock = LockHashPartitionLock(hashcode);
+ Oid relid = locktag->locktag_field2;
+ uint32 i;
+
+ /*
+ * Every PGPROC that can potentially hold a fast-path lock is present in
+ * ProcGlobal->allProcs. Prepared transactions are not, but any
+ * outstanding fast-path locks held by prepared transactions are
+ * transferred to the main lock table.
+ */
+ for (i = 0; i < ProcGlobal->allProcCount; i++)
+ {
+ PGPROC *proc = &ProcGlobal->allProcs[i];
+ uint32 f;
+
+ LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE);
+
+ /*
+ * If the target backend isn't referencing the same database as the
+ * lock, then we needn't examine the individual relation IDs at all;
+ * none of them can be relevant.
+ *
+ * proc->databaseId is set at backend startup time and never changes
+ * thereafter, so it might be safe to perform this test before
+ * acquiring &proc->fpInfoLock. In particular, it's certainly safe to
+ * assume that if the target backend holds any fast-path locks, it
+ * must have performed a memory-fencing operation (in particular, an
+ * LWLock acquisition) since setting proc->databaseId. However, it's
+ * less clear that our backend is certain to have performed a memory
+ * fencing operation since the other backend set proc->databaseId. So
+ * for now, we test it after acquiring the LWLock just to be safe.
+ */
+ if (proc->databaseId != locktag->locktag_field1)
+ {
+ LWLockRelease(&proc->fpInfoLock);
+ continue;
+ }
+
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+ {
+ uint32 lockmode;
+
+ /* Look for an allocated slot matching the given relid. */
+ if (relid != proc->fpRelId[f] || FAST_PATH_GET_BITS(proc, f) == 0)
+ continue;
+
+ /* Find or create lock object. */
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+ for (lockmode = FAST_PATH_LOCKNUMBER_OFFSET;
+ lockmode < FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT;
+ ++lockmode)
+ {
+ PROCLOCK *proclock;
+
+ if (!FAST_PATH_CHECK_LOCKMODE(proc, f, lockmode))
+ continue;
+ proclock = SetupLockInTable(lockMethodTable, proc, locktag,
+ hashcode, lockmode);
+ if (!proclock)
+ {
+ LWLockRelease(partitionLock);
+ LWLockRelease(&proc->fpInfoLock);
+ return false;
+ }
+ GrantLock(proclock->tag.myLock, proclock, lockmode);
+ FAST_PATH_CLEAR_LOCKMODE(proc, f, lockmode);
+ }
+ LWLockRelease(partitionLock);
+
+ /* No need to examine remaining slots. */
+ break;
+ }
+ LWLockRelease(&proc->fpInfoLock);
+ }
+ return true;
+}
+
+/*
+ * FastPathGetRelationLockEntry
+ * Return the PROCLOCK for a lock originally taken via the fast-path,
+ * transferring it to the primary lock table if necessary.
+ *
+ * Note: caller takes care of updating the locallock object.
+ */
+static PROCLOCK *
+FastPathGetRelationLockEntry(LOCALLOCK *locallock)
+{
+ LockMethod lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD];
+ LOCKTAG *locktag = &locallock->tag.lock;
+ PROCLOCK *proclock = NULL;
+ LWLock *partitionLock = LockHashPartitionLock(locallock->hashcode);
+ Oid relid = locktag->locktag_field2;
+ uint32 f;
+
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+ {
+ uint32 lockmode;
+
+ /* Look for an allocated slot matching the given relid. */
+ if (relid != MyProc->fpRelId[f] || FAST_PATH_GET_BITS(MyProc, f) == 0)
+ continue;
+
+ /* If we don't have a lock of the given mode, forget it! */
+ lockmode = locallock->tag.mode;
+ if (!FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode))
+ break;
+
+ /* Find or create lock object. */
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ proclock = SetupLockInTable(lockMethodTable, MyProc, locktag,
+ locallock->hashcode, lockmode);
+ if (!proclock)
+ {
+ LWLockRelease(partitionLock);
+ LWLockRelease(&MyProc->fpInfoLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_locks_per_transaction.")));
+ }
+ GrantLock(proclock->tag.myLock, proclock, lockmode);
+ FAST_PATH_CLEAR_LOCKMODE(MyProc, f, lockmode);
+
+ LWLockRelease(partitionLock);
+
+ /* No need to examine remaining slots. */
+ break;
+ }
+
+ LWLockRelease(&MyProc->fpInfoLock);
+
+ /* Lock may have already been transferred by some other backend. */
+ if (proclock == NULL)
+ {
+ LOCK *lock;
+ PROCLOCKTAG proclocktag;
+ uint32 proclock_hashcode;
+
+ LWLockAcquire(partitionLock, LW_SHARED);
+
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ (void *) locktag,
+ locallock->hashcode,
+ HASH_FIND,
+ NULL);
+ if (!lock)
+ elog(ERROR, "failed to re-find shared lock object");
+
+ proclocktag.myLock = lock;
+ proclocktag.myProc = MyProc;
+
+ proclock_hashcode = ProcLockHashCode(&proclocktag, locallock->hashcode);
+ proclock = (PROCLOCK *)
+ hash_search_with_hash_value(LockMethodProcLockHash,
+ (void *) &proclocktag,
+ proclock_hashcode,
+ HASH_FIND,
+ NULL);
+ if (!proclock)
+ elog(ERROR, "failed to re-find shared proclock object");
+ LWLockRelease(partitionLock);
+ }
+
+ return proclock;
+}
+
+/*
+ * GetLockConflicts
+ * Get an array of VirtualTransactionIds of xacts currently holding locks
+ * that would conflict with the specified lock/lockmode.
+ * xacts merely awaiting such a lock are NOT reported.
+ *
+ * The result array is palloc'd and is terminated with an invalid VXID.
+ * *countp, if not null, is updated to the number of items set.
+ *
+ * Of course, the result could be out of date by the time it's returned, so
+ * use of this function has to be thought about carefully. Similarly, a
+ * PGPROC with no "lxid" will be considered non-conflicting regardless of any
+ * lock it holds. Existing callers don't care about a locker after that
+ * locker's pg_xact updates complete. CommitTransaction() clears "lxid" after
+ * pg_xact updates and before releasing locks.
+ *
+ * Note we never include the current xact's vxid in the result array,
+ * since an xact never blocks itself.
+ */
+VirtualTransactionId *
+GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp)
+{
+ static VirtualTransactionId *vxids;
+ LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+ LockMethod lockMethodTable;
+ LOCK *lock;
+ LOCKMASK conflictMask;
+ SHM_QUEUE *procLocks;
+ PROCLOCK *proclock;
+ uint32 hashcode;
+ LWLock *partitionLock;
+ int count = 0;
+ int fast_count = 0;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+ if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
+ elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+ /*
+ * Allocate memory to store results, and fill with InvalidVXID. We only
+ * need enough space for MaxBackends + max_prepared_xacts + a terminator.
+ * When InHotStandby, allocate the array just once, in TopMemoryContext.
+ */
+ if (InHotStandby)
+ {
+ if (vxids == NULL)
+ vxids = (VirtualTransactionId *)
+ MemoryContextAlloc(TopMemoryContext,
+ sizeof(VirtualTransactionId) *
+ (MaxBackends + max_prepared_xacts + 1));
+ }
+ else
+ vxids = (VirtualTransactionId *)
+ palloc0(sizeof(VirtualTransactionId) *
+ (MaxBackends + max_prepared_xacts + 1));
+
+ /* Compute hash code and partition lock, and look up conflicting modes. */
+ hashcode = LockTagHashCode(locktag);
+ partitionLock = LockHashPartitionLock(hashcode);
+ conflictMask = lockMethodTable->conflictTab[lockmode];
+
+ /*
+ * Fast path locks might not have been entered in the primary lock table.
+ * If the lock we're dealing with could conflict with such a lock, we must
+ * examine each backend's fast-path array for conflicts.
+ */
+ if (ConflictsWithRelationFastPath(locktag, lockmode))
+ {
+ int i;
+ Oid relid = locktag->locktag_field2;
+ VirtualTransactionId vxid;
+
+ /*
+ * Iterate over relevant PGPROCs. Anything held by a prepared
+ * transaction will have been transferred to the primary lock table,
+ * so we need not worry about those. This is all a bit fuzzy, because
+ * new locks could be taken after we've visited a particular
+ * partition, but the callers had better be prepared to deal with that
+ * anyway, since the locks could equally well be taken between the
+ * time we return the value and the time the caller does something
+ * with it.
+ */
+ for (i = 0; i < ProcGlobal->allProcCount; i++)
+ {
+ PGPROC *proc = &ProcGlobal->allProcs[i];
+ uint32 f;
+
+ /* A backend never blocks itself */
+ if (proc == MyProc)
+ continue;
+
+ LWLockAcquire(&proc->fpInfoLock, LW_SHARED);
+
+ /*
+ * If the target backend isn't referencing the same database as
+ * the lock, then we needn't examine the individual relation IDs
+ * at all; none of them can be relevant.
+ *
+ * See FastPathTransferRelationLocks() for discussion of why we do
+ * this test after acquiring the lock.
+ */
+ if (proc->databaseId != locktag->locktag_field1)
+ {
+ LWLockRelease(&proc->fpInfoLock);
+ continue;
+ }
+
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+ {
+ uint32 lockmask;
+
+ /* Look for an allocated slot matching the given relid. */
+ if (relid != proc->fpRelId[f])
+ continue;
+ lockmask = FAST_PATH_GET_BITS(proc, f);
+ if (!lockmask)
+ continue;
+ lockmask <<= FAST_PATH_LOCKNUMBER_OFFSET;
+
+ /*
+ * There can only be one entry per relation, so if we found it
+ * and it doesn't conflict, we can skip the rest of the slots.
+ */
+ if ((lockmask & conflictMask) == 0)
+ break;
+
+ /* Conflict! */
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+
+ if (VirtualTransactionIdIsValid(vxid))
+ vxids[count++] = vxid;
+ /* else, xact already committed or aborted */
+
+ /* No need to examine remaining slots. */
+ break;
+ }
+
+ LWLockRelease(&proc->fpInfoLock);
+ }
+ }
+
+ /* Remember how many fast-path conflicts we found. */
+ fast_count = count;
+
+ /*
+ * Look up the lock object matching the tag.
+ */
+ LWLockAcquire(partitionLock, LW_SHARED);
+
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ (const void *) locktag,
+ hashcode,
+ HASH_FIND,
+ NULL);
+ if (!lock)
+ {
+ /*
+ * If the lock object doesn't exist, there is nothing holding a lock
+ * on this lockable object.
+ */
+ LWLockRelease(partitionLock);
+ vxids[count].backendId = InvalidBackendId;
+ vxids[count].localTransactionId = InvalidLocalTransactionId;
+ if (countp)
+ *countp = count;
+ return vxids;
+ }
+
+ /*
+ * Examine each existing holder (or awaiter) of the lock.
+ */
+
+ procLocks = &(lock->procLocks);
+
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, lockLink));
+
+ while (proclock)
+ {
+ if (conflictMask & proclock->holdMask)
+ {
+ PGPROC *proc = proclock->tag.myProc;
+
+ /* A backend never blocks itself */
+ if (proc != MyProc)
+ {
+ VirtualTransactionId vxid;
+
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+
+ if (VirtualTransactionIdIsValid(vxid))
+ {
+ int i;
+
+ /* Avoid duplicate entries. */
+ for (i = 0; i < fast_count; ++i)
+ if (VirtualTransactionIdEquals(vxids[i], vxid))
+ break;
+ if (i >= fast_count)
+ vxids[count++] = vxid;
+ }
+ /* else, xact already committed or aborted */
+ }
+ }
+
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink,
+ offsetof(PROCLOCK, lockLink));
+ }
+
+ LWLockRelease(partitionLock);
+
+ if (count > MaxBackends + max_prepared_xacts) /* should never happen */
+ elog(PANIC, "too many conflicting locks found");
+
+ vxids[count].backendId = InvalidBackendId;
+ vxids[count].localTransactionId = InvalidLocalTransactionId;
+ if (countp)
+ *countp = count;
+ return vxids;
+}
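+
+/*
+ * Editorial sketch, not part of the original source: the typical consumer
+ * pattern for GetLockConflicts, as in the WaitForLockers machinery in
+ * lmgr.c -- walk the returned array up to the invalid-VXID terminator and
+ * wait each conflicting transaction out:
+ *
+ *    VirtualTransactionId *vxids = GetLockConflicts(&tag, lockmode, NULL);
+ *
+ *    while (VirtualTransactionIdIsValid(*vxids))
+ *        VirtualXactLock(*vxids++, true);
+ *
+ * (VirtualXactLock appears later in this file; the loop above is an
+ * illustration, not a verbatim copy of any caller.)
+ */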
+
+/*
+ * Find a lock in the shared lock table and release it. It is the caller's
+ * responsibility to verify that this is a sane thing to do. (For example, it
+ * would be bad to release a lock here if there might still be a LOCALLOCK
+ * object with pointers to it.)
+ *
+ * We currently use this in two situations: first, to release locks held by
+ * prepared transactions on commit (see lock_twophase_postcommit); and second,
+ * to release locks taken via the fast-path, transferred to the main hash
+ * table, and then released (see LockReleaseAll).
+ */
+static void
+LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc,
+ LOCKTAG *locktag, LOCKMODE lockmode,
+ bool decrement_strong_lock_count)
+{
+ LOCK *lock;
+ PROCLOCK *proclock;
+ PROCLOCKTAG proclocktag;
+ uint32 hashcode;
+ uint32 proclock_hashcode;
+ LWLock *partitionLock;
+ bool wakeupNeeded;
+
+ hashcode = LockTagHashCode(locktag);
+ partitionLock = LockHashPartitionLock(hashcode);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Re-find the lock object (it had better be there).
+ */
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ (void *) locktag,
+ hashcode,
+ HASH_FIND,
+ NULL);
+ if (!lock)
+ elog(PANIC, "failed to re-find shared lock object");
+
+ /*
+ * Re-find the proclock object (ditto).
+ */
+ proclocktag.myLock = lock;
+ proclocktag.myProc = proc;
+
+ proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode);
+
+ proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash,
+ (void *) &proclocktag,
+ proclock_hashcode,
+ HASH_FIND,
+ NULL);
+ if (!proclock)
+ elog(PANIC, "failed to re-find shared proclock object");
+
+ /*
+ * Double-check that we are actually holding a lock of the type we want to
+ * release.
+ */
+ if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+ {
+ PROCLOCK_PRINT("lock_twophase_postcommit: WRONGTYPE", proclock);
+ LWLockRelease(partitionLock);
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ return;
+ }
+
+ /*
+ * Do the releasing. CleanUpLock will waken any now-wakable waiters.
+ */
+ wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable);
+
+ CleanUpLock(lock, proclock,
+ lockMethodTable, hashcode,
+ wakeupNeeded);
+
+ LWLockRelease(partitionLock);
+
+ /*
+ * Decrement strong lock count. This logic is needed only for 2PC.
+ */
+ if (decrement_strong_lock_count
+ && ConflictsWithRelationFastPath(locktag, lockmode))
+ {
+ uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode);
+
+ SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0);
+ FastPathStrongRelationLocks->count[fasthashcode]--;
+ SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+ }
+}
+
+/*
+ * CheckForSessionAndXactLocks
+ * Check to see if transaction holds both session-level and xact-level
+ * locks on the same object; if so, throw an error.
+ *
+ * If we have both session- and transaction-level locks on the same object,
+ * PREPARE TRANSACTION must fail. This should never happen with regular
+ * locks, since we only take those at session level in some special operations
+ * like VACUUM. It's possible to hit this with advisory locks, though.
+ *
+ * It would be nice if we could keep the session hold and give away the
+ * transactional hold to the prepared xact. However, that would require two
+ * PROCLOCK objects, and we cannot be sure that another PROCLOCK will be
+ * available when it comes time for PostPrepare_Locks to do the deed.
+ * So for now, we error out while we can still do so safely.
+ *
+ * Since the LOCALLOCK table stores a separate entry for each lockmode,
+ * we can't implement this check by examining LOCALLOCK entries in isolation.
+ * We must build a transient hashtable that is indexed by locktag only.
+ */
+static void
+CheckForSessionAndXactLocks(void)
+{
+ typedef struct
+ {
+ LOCKTAG lock; /* identifies the lockable object */
+ bool sessLock; /* is any lockmode held at session level? */
+ bool xactLock; /* is any lockmode held at xact level? */
+ } PerLockTagEntry;
+
+ HASHCTL hash_ctl;
+ HTAB *lockhtab;
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+
+ /* Create a local hash table keyed by LOCKTAG only */
+ hash_ctl.keysize = sizeof(LOCKTAG);
+ hash_ctl.entrysize = sizeof(PerLockTagEntry);
+ hash_ctl.hcxt = CurrentMemoryContext;
+
+ lockhtab = hash_create("CheckForSessionAndXactLocks table",
+ 256, /* arbitrary initial size */
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+ /* Scan local lock table to find entries for each LOCKTAG */
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+ PerLockTagEntry *hentry;
+ bool found;
+ int i;
+
+ /*
+ * Ignore VXID locks. We don't want those to be held by prepared
+ * transactions, since they aren't meaningful after a restart.
+ */
+ if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+ continue;
+
+ /* Ignore it if we don't actually hold the lock */
+ if (locallock->nLocks <= 0)
+ continue;
+
+ /* Otherwise, find or make an entry in lockhtab */
+ hentry = (PerLockTagEntry *) hash_search(lockhtab,
+ (void *) &locallock->tag.lock,
+ HASH_ENTER, &found);
+ if (!found) /* initialize, if newly created */
+ hentry->sessLock = hentry->xactLock = false;
+
+ /* Scan to see if we hold lock at session or xact level or both */
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == NULL)
+ hentry->sessLock = true;
+ else
+ hentry->xactLock = true;
+ }
+
+ /*
+ * We can throw error immediately when we see both types of locks; no
+ * need to wait around to see if there are more violations.
+ */
+ if (hentry->sessLock && hentry->xactLock)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object")));
+ }
+
+ /* Success, so clean up */
+ hash_destroy(lockhtab);
+}
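+
+/*
+ * Editorial example, not part of the original source: one way to reach the
+ * error above is mixing session-level and xact-level advisory locks on the
+ * same key (assuming max_prepared_transactions > 0, or PREPARE fails for a
+ * different reason first):
+ *
+ *    SELECT pg_advisory_lock(42);          -- session-level hold
+ *    BEGIN;
+ *    SELECT pg_advisory_xact_lock(42);     -- xact-level hold, same object
+ *    PREPARE TRANSACTION 'demo';           -- expected to hit this check
+ */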
+
+/*
+ * AtPrepare_Locks
+ * Do the preparatory work for a PREPARE: make 2PC state file records
+ * for all locks currently held.
+ *
+ * Session-level locks are ignored, as are VXID locks.
+ *
+ * For the most part, we don't need to touch shared memory for this ---
+ * all the necessary state information is in the locallock table.
+ * Fast-path locks are an exception, however: we move any such locks to
+ * the main table before allowing PREPARE TRANSACTION to succeed.
+ */
+void
+AtPrepare_Locks(void)
+{
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+
+ /* First, verify there aren't locks of both xact and session level */
+ CheckForSessionAndXactLocks();
+
+ /* Now do the per-locallock cleanup work */
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ TwoPhaseLockRecord record;
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+ bool haveSessionLock;
+ bool haveXactLock;
+ int i;
+
+ /*
+ * Ignore VXID locks. We don't want those to be held by prepared
+ * transactions, since they aren't meaningful after a restart.
+ */
+ if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+ continue;
+
+ /* Ignore it if we don't actually hold the lock */
+ if (locallock->nLocks <= 0)
+ continue;
+
+ /* Scan to see whether we hold it at session or transaction level */
+ haveSessionLock = haveXactLock = false;
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == NULL)
+ haveSessionLock = true;
+ else
+ haveXactLock = true;
+ }
+
+ /* Ignore it if we have only session lock */
+ if (!haveXactLock)
+ continue;
+
+ /* This can't happen, because we already checked it */
+ if (haveSessionLock)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object")));
+
+ /*
+ * If the local lock was taken via the fast-path, we need to move it
+ * to the primary lock table, or just get a pointer to the existing
+ * primary lock table entry if by chance it's already been
+ * transferred.
+ */
+ if (locallock->proclock == NULL)
+ {
+ locallock->proclock = FastPathGetRelationLockEntry(locallock);
+ locallock->lock = locallock->proclock->tag.myLock;
+ }
+
+ /*
+ * Arrange to not release any strong lock count held by this lock
+ * entry. We must retain the count until the prepared transaction is
+ * committed or rolled back.
+ */
+ locallock->holdsStrongLockCount = false;
+
+ /*
+ * Create a 2PC record.
+ */
+ memcpy(&(record.locktag), &(locallock->tag.lock), sizeof(LOCKTAG));
+ record.lockmode = locallock->tag.mode;
+
+ RegisterTwoPhaseRecord(TWOPHASE_RM_LOCK_ID, 0,
+ &record, sizeof(TwoPhaseLockRecord));
+ }
+}
+
+/*
+ * PostPrepare_Locks
+ * Clean up after successful PREPARE
+ *
+ * Here, we want to transfer ownership of our locks to a dummy PGPROC
+ * that's now associated with the prepared transaction, and we want to
+ * clean out the corresponding entries in the LOCALLOCK table.
+ *
+ * Note: by removing the LOCALLOCK entries, we are leaving dangling
+ * pointers in the transaction's resource owner. This is OK at the
+ * moment since resowner.c doesn't try to free locks retail at a toplevel
+ * transaction commit or abort. We could alternatively zero out nLocks
+ * and leave the LOCALLOCK entries to be garbage-collected by LockReleaseAll,
+ * but that probably costs more cycles.
+ */
+void
+PostPrepare_Locks(TransactionId xid)
+{
+ PGPROC *newproc = TwoPhaseGetDummyProc(xid, false);
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ PROCLOCKTAG proclocktag;
+ int partition;
+
+ /* Can't prepare a lock group follower. */
+ Assert(MyProc->lockGroupLeader == NULL ||
+ MyProc->lockGroupLeader == MyProc);
+
+ /* This is a critical section: any error means big trouble */
+ START_CRIT_SECTION();
+
+ /*
+ * First we run through the locallock table and get rid of unwanted
+ * entries, then we scan the process's proclocks and transfer them to the
+ * target proc.
+ *
+ * We do this separately because we may have multiple locallock entries
+ * pointing to the same proclock, and we daren't end up with any dangling
+ * pointers.
+ */
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+ bool haveSessionLock;
+ bool haveXactLock;
+ int i;
+
+ if (locallock->proclock == NULL || locallock->lock == NULL)
+ {
+ /*
+ * We must've run out of shared memory while trying to set up this
+ * lock. Just forget the local entry.
+ */
+ Assert(locallock->nLocks == 0);
+ RemoveLocalLock(locallock);
+ continue;
+ }
+
+ /* Ignore VXID locks */
+ if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+ continue;
+
+ /* Scan to see whether we hold it at session or transaction level */
+ haveSessionLock = haveXactLock = false;
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == NULL)
+ haveSessionLock = true;
+ else
+ haveXactLock = true;
+ }
+
+ /* Ignore it if we have only session lock */
+ if (!haveXactLock)
+ continue;
+
+ /* This can't happen, because we already checked it */
+ if (haveSessionLock)
+ ereport(PANIC,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object")));
+
+ /* Mark the proclock to show we need to release this lockmode */
+ if (locallock->nLocks > 0)
+ locallock->proclock->releaseMask |= LOCKBIT_ON(locallock->tag.mode);
+
+ /* And remove the locallock hashtable entry */
+ RemoveLocalLock(locallock);
+ }
+
+ /*
+ * Now, scan each lock partition separately.
+ */
+ for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++)
+ {
+ LWLock *partitionLock;
+ SHM_QUEUE *procLocks = &(MyProc->myProcLocks[partition]);
+ PROCLOCK *nextplock;
+
+ partitionLock = LockHashPartitionLockByIndex(partition);
+
+ /*
+ * If the proclock list for this partition is empty, we can skip
+ * acquiring the partition lock. This optimization is safer than the
+ * situation in LockReleaseAll, because we got rid of any fast-path
+ * locks during AtPrepare_Locks, so there cannot be any case where
+ * another backend is adding something to our lists now. For safety,
+ * though, we code this the same way as in LockReleaseAll.
+ */
+ if (SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, procLink)) == NULL)
+ continue; /* needn't examine this partition */
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ for (proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, procLink));
+ proclock;
+ proclock = nextplock)
+ {
+ /* Get link first, since we may unlink/relink this proclock */
+ nextplock = (PROCLOCK *)
+ SHMQueueNext(procLocks, &proclock->procLink,
+ offsetof(PROCLOCK, procLink));
+
+ Assert(proclock->tag.myProc == MyProc);
+
+ lock = proclock->tag.myLock;
+
+ /* Ignore VXID locks */
+ if (lock->tag.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+ continue;
+
+ PROCLOCK_PRINT("PostPrepare_Locks", proclock);
+ LOCK_PRINT("PostPrepare_Locks", lock, 0);
+ Assert(lock->nRequested >= 0);
+ Assert(lock->nGranted >= 0);
+ Assert(lock->nGranted <= lock->nRequested);
+ Assert((proclock->holdMask & ~lock->grantMask) == 0);
+
+ /* Ignore it if nothing to release (must be a session lock) */
+ if (proclock->releaseMask == 0)
+ continue;
+
+ /* Else we should be releasing all locks */
+ if (proclock->releaseMask != proclock->holdMask)
+ elog(PANIC, "we seem to have dropped a bit somewhere");
+
+ /*
+ * We cannot simply modify proclock->tag.myProc to reassign
+ * ownership of the lock, because that's part of the hash key and
+ * the proclock would then be in the wrong hash chain. Instead
+ * use hash_update_hash_key. (We used to create a new hash entry,
+ * but that risks out-of-memory failure if other processes are
+ * busy making proclocks too.) We must unlink the proclock from
+ * our procLink chain and put it into the new proc's chain, too.
+ *
+ * Note: the updated proclock hash key will still belong to the
+ * same hash partition, cf proclock_hash(). So the partition lock
+ * we already hold is sufficient for this.
+ */
+ SHMQueueDelete(&proclock->procLink);
+
+ /*
+ * Create the new hash key for the proclock.
+ */
+ proclocktag.myLock = lock;
+ proclocktag.myProc = newproc;
+
+ /*
+ * Update groupLeader pointer to point to the new proc. (We'd
+ * better not be a member of somebody else's lock group!)
+ */
+ Assert(proclock->groupLeader == proclock->tag.myProc);
+ proclock->groupLeader = newproc;
+
+ /*
+ * Update the proclock. We should not find any existing entry for
+ * the same hash key, since there can be only one entry for any
+ * given lock with my own proc.
+ */
+ if (!hash_update_hash_key(LockMethodProcLockHash,
+ (void *) proclock,
+ (void *) &proclocktag))
+ elog(PANIC, "duplicate entry found while reassigning a prepared transaction's locks");
+
+ /* Re-link into the new proc's proclock list */
+ SHMQueueInsertBefore(&(newproc->myProcLocks[partition]),
+ &proclock->procLink);
+
+ PROCLOCK_PRINT("PostPrepare_Locks: updated", proclock);
+ } /* loop over PROCLOCKs within this partition */
+
+ LWLockRelease(partitionLock);
+ } /* loop over partitions */
+
+ END_CRIT_SECTION();
+}
+
+
+/*
+ * Estimate shared-memory space used for lock tables
+ */
+Size
+LockShmemSize(void)
+{
+ Size size = 0;
+ long max_table_size;
+
+ /* lock hash table */
+ max_table_size = NLOCKENTS();
+ size = add_size(size, hash_estimate_size(max_table_size, sizeof(LOCK)));
+
+ /* proclock hash table */
+ max_table_size *= 2;
+ size = add_size(size, hash_estimate_size(max_table_size, sizeof(PROCLOCK)));
+
+ /*
+ * Since NLOCKENTS is only an estimate, add 10% safety margin.
+ */
+ size = add_size(size, size / 10);
+
+ return size;
+}
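+
+/*
+ * Editorial worked example, not part of the original source, assuming the
+ * NLOCKENTS() definition earlier in this file (max_locks_per_transaction *
+ * (MaxBackends + max_prepared_xacts)): with max_locks_per_transaction = 64
+ * and MaxBackends + max_prepared_xacts = 100, the estimate sizes the lock
+ * hash for 6400 entries and the proclock hash for twice that, and the final
+ * add_size() pads the total by 10% to absorb the imprecision of the guess.
+ */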
+
+/*
+ * GetLockStatusData - Return a summary of the lock manager's internal
+ * status, for use in a user-level reporting function.
+ *
+ * The return data consists of an array of LockInstanceData objects,
+ * which are a lightly abstracted version of the PROCLOCK data structures,
+ * i.e. there is one entry for each unique lock and interested PGPROC.
+ * It is the caller's responsibility to match up related items (such as
+ * references to the same lockable object or PGPROC) if wanted.
+ *
+ * The design goal is to hold the LWLocks for as short a time as possible;
+ * thus, this function simply makes a copy of the necessary data and releases
+ * the locks, allowing the caller to contemplate and format the data for as
+ * long as it pleases.
+ */
+LockData *
+GetLockStatusData(void)
+{
+ LockData *data;
+ PROCLOCK *proclock;
+ HASH_SEQ_STATUS seqstat;
+ int els;
+ int el;
+ int i;
+
+ data = (LockData *) palloc(sizeof(LockData));
+
+ /* Guess how much space we'll need. */
+ els = MaxBackends;
+ el = 0;
+ data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * els);
+
+ /*
+ * First, we iterate through the per-backend fast-path arrays, locking
+ * them one at a time. This might produce an inconsistent picture of the
+ * system state, but taking all of those LWLocks at the same time seems
+ * impractical (in particular, note MAX_SIMUL_LWLOCKS). It shouldn't
+ * matter too much, because none of these locks can be involved in lock
+ * conflicts anyway - any lock that could conflict must be present in the main lock
+ * table. (For the same reason, we don't sweat about making leaderPid
+ * completely valid. We cannot safely dereference another backend's
+ * lockGroupLeader field without holding all lock partition locks, and
+ * it's not worth that.)
+ */
+ for (i = 0; i < ProcGlobal->allProcCount; ++i)
+ {
+ PGPROC *proc = &ProcGlobal->allProcs[i];
+ uint32 f;
+
+ LWLockAcquire(&proc->fpInfoLock, LW_SHARED);
+
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; ++f)
+ {
+ LockInstanceData *instance;
+ uint32 lockbits = FAST_PATH_GET_BITS(proc, f);
+
+ /* Skip unallocated slots. */
+ if (!lockbits)
+ continue;
+
+ if (el >= els)
+ {
+ els += MaxBackends;
+ data->locks = (LockInstanceData *)
+ repalloc(data->locks, sizeof(LockInstanceData) * els);
+ }
+
+ instance = &data->locks[el];
+ SET_LOCKTAG_RELATION(instance->locktag, proc->databaseId,
+ proc->fpRelId[f]);
+ instance->holdMask = lockbits << FAST_PATH_LOCKNUMBER_OFFSET;
+ instance->waitLockMode = NoLock;
+ instance->backend = proc->backendId;
+ instance->lxid = proc->lxid;
+ instance->pid = proc->pid;
+ instance->leaderPid = proc->pid;
+ instance->fastpath = true;
+
+ /*
+ * Successfully taking fast path lock means there were no
+ * conflicting locks.
+ */
+ instance->waitStart = 0;
+
+ el++;
+ }
+
+ if (proc->fpVXIDLock)
+ {
+ VirtualTransactionId vxid;
+ LockInstanceData *instance;
+
+ if (el >= els)
+ {
+ els += MaxBackends;
+ data->locks = (LockInstanceData *)
+ repalloc(data->locks, sizeof(LockInstanceData) * els);
+ }
+
+ vxid.backendId = proc->backendId;
+ vxid.localTransactionId = proc->fpLocalTransactionId;
+
+ instance = &data->locks[el];
+ SET_LOCKTAG_VIRTUALTRANSACTION(instance->locktag, vxid);
+ instance->holdMask = LOCKBIT_ON(ExclusiveLock);
+ instance->waitLockMode = NoLock;
+ instance->backend = proc->backendId;
+ instance->lxid = proc->lxid;
+ instance->pid = proc->pid;
+ instance->leaderPid = proc->pid;
+ instance->fastpath = true;
+ instance->waitStart = 0;
+
+ el++;
+ }
+
+ LWLockRelease(&proc->fpInfoLock);
+ }
+
+ /*
+ * Next, acquire lock on the entire shared lock data structure. We do
+ * this so that, at least for locks in the primary lock table, the state
+ * will be self-consistent.
+ *
+ * Since this is a read-only operation, we take shared instead of
+ * exclusive lock. There's not a whole lot of point to this, because all
+ * the normal operations require exclusive lock, but it doesn't hurt
+ * anything either. It will at least allow two backends to do
+ * GetLockStatusData in parallel.
+ *
+ * Must grab LWLocks in partition-number order to avoid LWLock deadlock.
+ */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED);
+
+ /* Now we can safely count the number of proclocks */
+ data->nelements = el + hash_get_num_entries(LockMethodProcLockHash);
+ if (data->nelements > els)
+ {
+ els = data->nelements;
+ data->locks = (LockInstanceData *)
+ repalloc(data->locks, sizeof(LockInstanceData) * els);
+ }
+
+ /* Now scan the tables to copy the data */
+ hash_seq_init(&seqstat, LockMethodProcLockHash);
+
+ while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat)))
+ {
+ PGPROC *proc = proclock->tag.myProc;
+ LOCK *lock = proclock->tag.myLock;
+ LockInstanceData *instance = &data->locks[el];
+
+ memcpy(&instance->locktag, &lock->tag, sizeof(LOCKTAG));
+ instance->holdMask = proclock->holdMask;
+ if (proc->waitLock == proclock->tag.myLock)
+ instance->waitLockMode = proc->waitLockMode;
+ else
+ instance->waitLockMode = NoLock;
+ instance->backend = proc->backendId;
+ instance->lxid = proc->lxid;
+ instance->pid = proc->pid;
+ instance->leaderPid = proclock->groupLeader->pid;
+ instance->fastpath = false;
+ instance->waitStart = (TimestampTz) pg_atomic_read_u64(&proc->waitStart);
+
+ el++;
+ }
+
+ /*
+ * And release locks. We do this in reverse order for two reasons: (1)
+ * Anyone else who needs more than one of the locks will be trying to lock
+ * them in increasing order; we don't want to release the other process
+ * until it can get all the locks it needs. (2) This avoids O(N^2)
+ * behavior inside LWLockRelease.
+ */
+ for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
+ LWLockRelease(LockHashPartitionLockByIndex(i));
+
+ Assert(el == data->nelements);
+
+ return data;
+}
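+
+/*
+ * Editorial note, not part of the original source: the principal consumer of
+ * GetLockStatusData is pg_lock_status() in lockfuncs.c, which backs the
+ * pg_locks view.  A caller just walks the snapshot, for example:
+ *
+ *    LockData *data = GetLockStatusData();
+ *
+ *    for (int i = 0; i < data->nelements; i++)
+ *    {
+ *        LockInstanceData *instance = &data->locks[i];
+ *        ... format instance->locktag, holdMask, pid, and so on ...
+ *    }
+ *
+ * No LWLocks are held by then, so the caller can take as long as it likes.
+ */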
+
+/*
+ * GetBlockerStatusData - Return a summary of the lock manager's state
+ * concerning locks that are blocking the specified PID or any member of
+ * the PID's lock group, for use in a user-level reporting function.
+ *
+ * For each PID within the lock group that is awaiting some heavyweight lock,
+ * the return data includes an array of LockInstanceData objects, which are
+ * the same data structure used by GetLockStatusData; but unlike that function,
+ * this one reports only the PROCLOCKs associated with the lock that that PID
+ * is blocked on. (Hence, all the locktags should be the same for any one
+ * blocked PID.) In addition, we return an array of the PIDs of those backends
+ * that are ahead of the blocked PID in the lock's wait queue. These can be
+ * compared with the PIDs in the LockInstanceData objects to determine which
+ * waiters are ahead of or behind the blocked PID in the queue.
+ *
+ * If blocked_pid isn't a valid backend PID or nothing in its lock group is
+ * waiting on any heavyweight lock, return empty arrays.
+ *
+ * The design goal is to hold the LWLocks for as short a time as possible;
+ * thus, this function simply makes a copy of the necessary data and releases
+ * the locks, allowing the caller to contemplate and format the data for as
+ * long as it pleases.
+ */
+BlockedProcsData *
+GetBlockerStatusData(int blocked_pid)
+{
+ BlockedProcsData *data;
+ PGPROC *proc;
+ int i;
+
+ data = (BlockedProcsData *) palloc(sizeof(BlockedProcsData));
+
+ /*
+ * Guess how much space we'll need, and preallocate. Most of the time
+ * this will avoid needing to do repalloc while holding the LWLocks. (We
+ * assume, but check with an Assert, that MaxBackends is enough entries
+ * for the procs[] array; the other two could need enlargement, though.)
+ */
+ data->nprocs = data->nlocks = data->npids = 0;
+ data->maxprocs = data->maxlocks = data->maxpids = MaxBackends;
+ data->procs = (BlockedProcData *) palloc(sizeof(BlockedProcData) * data->maxprocs);
+ data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * data->maxlocks);
+ data->waiter_pids = (int *) palloc(sizeof(int) * data->maxpids);
+
+ /*
+ * In order to search the ProcArray for blocked_pid and assume that that
+ * entry won't immediately disappear under us, we must hold ProcArrayLock.
+ * In addition, to examine the lock grouping fields of any other backend,
+ * we must hold all the hash partition locks. (Only one of those locks is
+ * actually relevant for any one lock group, but we can't know which one
+ * ahead of time.) It's fairly annoying to hold all those locks
+ * throughout this, but it's no worse than GetLockStatusData(), and it
+ * does have the advantage that we're guaranteed to return a
+ * self-consistent instantaneous state.
+ */
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ proc = BackendPidGetProcWithLock(blocked_pid);
+
+ /* Nothing to do if it's gone */
+ if (proc != NULL)
+ {
+ /*
+ * Acquire lock on the entire shared lock data structure. See notes
+ * in GetLockStatusData().
+ */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED);
+
+ if (proc->lockGroupLeader == NULL)
+ {
+ /* Easy case, proc is not a lock group member */
+ GetSingleProcBlockerStatusData(proc, data);
+ }
+ else
+ {
+ /* Examine all procs in proc's lock group */
+ dlist_iter iter;
+
+ dlist_foreach(iter, &proc->lockGroupLeader->lockGroupMembers)
+ {
+ PGPROC *memberProc;
+
+ memberProc = dlist_container(PGPROC, lockGroupLink, iter.cur);
+ GetSingleProcBlockerStatusData(memberProc, data);
+ }
+ }
+
+ /*
+ * And release locks. See notes in GetLockStatusData().
+ */
+ for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
+ LWLockRelease(LockHashPartitionLockByIndex(i));
+
+ Assert(data->nprocs <= data->maxprocs);
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return data;
+}
+
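+/*
+ * Illustrative sketch, not part of this file: the procs[], locks[] and
+ * waiter_pids[] arrays returned above are parallel, with each BlockedProcData
+ * carrying offsets into the other two.  A hypothetical consumer could unpack
+ * them like this; the function name and DEBUG output are placeholders.
+ */
+#ifdef NOT_USED
+static void
+ShowBlockerStatus(int blocked_pid)
+{
+	BlockedProcsData *data = GetBlockerStatusData(blocked_pid);
+	int			i;
+
+	for (i = 0; i < data->nprocs; i++)
+	{
+		BlockedProcData *bproc = &data->procs[i];
+		LockInstanceData *lockinsts = &data->locks[bproc->first_lock];
+		int		   *preceding_pids = &data->waiter_pids[bproc->first_waiter];
+		int			j;
+
+		for (j = 0; j < bproc->num_locks; j++)
+			elog(DEBUG1, "pid %d contends with pid %d on the blocking lock",
+				 bproc->pid, lockinsts[j].pid);
+		for (j = 0; j < bproc->num_waiters; j++)
+			elog(DEBUG1, "pid %d is queued behind pid %d",
+				 bproc->pid, preceding_pids[j]);
+	}
+}
+#endif
+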
+/* Accumulate data about one possibly-blocked proc for GetBlockerStatusData */
+static void
+GetSingleProcBlockerStatusData(PGPROC *blocked_proc, BlockedProcsData *data)
+{
+ LOCK *theLock = blocked_proc->waitLock;
+ BlockedProcData *bproc;
+ SHM_QUEUE *procLocks;
+ PROCLOCK *proclock;
+ PROC_QUEUE *waitQueue;
+ PGPROC *proc;
+ int queue_size;
+ int i;
+
+ /* Nothing to do if this proc is not blocked */
+ if (theLock == NULL)
+ return;
+
+ /* Set up a procs[] element */
+ bproc = &data->procs[data->nprocs++];
+ bproc->pid = blocked_proc->pid;
+ bproc->first_lock = data->nlocks;
+ bproc->first_waiter = data->npids;
+
+ /*
+ * We may ignore the proc's fast-path arrays, since nothing in those could
+ * be related to a contended lock.
+ */
+
+ /* Collect all PROCLOCKs associated with theLock */
+ procLocks = &(theLock->procLocks);
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, lockLink));
+ while (proclock)
+ {
+ PGPROC *proc = proclock->tag.myProc;
+ LOCK *lock = proclock->tag.myLock;
+ LockInstanceData *instance;
+
+ if (data->nlocks >= data->maxlocks)
+ {
+ data->maxlocks += MaxBackends;
+ data->locks = (LockInstanceData *)
+ repalloc(data->locks, sizeof(LockInstanceData) * data->maxlocks);
+ }
+
+ instance = &data->locks[data->nlocks];
+ memcpy(&instance->locktag, &lock->tag, sizeof(LOCKTAG));
+ instance->holdMask = proclock->holdMask;
+ if (proc->waitLock == lock)
+ instance->waitLockMode = proc->waitLockMode;
+ else
+ instance->waitLockMode = NoLock;
+ instance->backend = proc->backendId;
+ instance->lxid = proc->lxid;
+ instance->pid = proc->pid;
+ instance->leaderPid = proclock->groupLeader->pid;
+ instance->fastpath = false;
+ data->nlocks++;
+
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink,
+ offsetof(PROCLOCK, lockLink));
+ }
+
+ /* Enlarge waiter_pids[] if it's too small to hold all wait queue PIDs */
+ waitQueue = &(theLock->waitProcs);
+ queue_size = waitQueue->size;
+
+ if (queue_size > data->maxpids - data->npids)
+ {
+ data->maxpids = Max(data->maxpids + MaxBackends,
+ data->npids + queue_size);
+ data->waiter_pids = (int *) repalloc(data->waiter_pids,
+ sizeof(int) * data->maxpids);
+ }
+
+ /* Collect PIDs from the lock's wait queue, stopping at blocked_proc */
+ proc = (PGPROC *) waitQueue->links.next;
+ for (i = 0; i < queue_size; i++)
+ {
+ if (proc == blocked_proc)
+ break;
+ data->waiter_pids[data->npids++] = proc->pid;
+ proc = (PGPROC *) proc->links.next;
+ }
+
+ bproc->num_locks = data->nlocks - bproc->first_lock;
+ bproc->num_waiters = data->npids - bproc->first_waiter;
+}
+
+/*
+ * Returns a list of currently held AccessExclusiveLocks, for use by
+ * LogStandbySnapshot(). The result is a palloc'd array,
+ * with the number of elements returned into *nlocks.
+ *
+ * XXX This currently takes a lock on all partitions of the lock table,
+ * but it's possible to do better. By reference counting locks and storing
+ * the value in the ProcArray entry for each backend we could tell if any
+ * locks need recording without having to acquire the partition locks and
+ * scan the lock table. Whether that's worth the additional overhead
+ * is pretty dubious though.
+ */
+xl_standby_lock *
+GetRunningTransactionLocks(int *nlocks)
+{
+ xl_standby_lock *accessExclusiveLocks;
+ PROCLOCK *proclock;
+ HASH_SEQ_STATUS seqstat;
+ int i;
+ int index;
+ int els;
+
+ /*
+ * Acquire lock on the entire shared lock data structure.
+ *
+ * Must grab LWLocks in partition-number order to avoid LWLock deadlock.
+ */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED);
+
+ /* Now we can safely count the number of proclocks */
+ els = hash_get_num_entries(LockMethodProcLockHash);
+
+ /*
+ * Allocating enough space for all locks in the lock table is overkill,
+ * but it's more convenient and faster than having to enlarge the array.
+ */
+ accessExclusiveLocks = palloc(els * sizeof(xl_standby_lock));
+
+ /* Now scan the tables to copy the data */
+ hash_seq_init(&seqstat, LockMethodProcLockHash);
+
+ /*
+ * If lock is a currently granted AccessExclusiveLock then it will have
+ * just one proclock holder, so locks are never accessed twice in this
+ * particular case. Don't copy this code for use elsewhere because in the
+ * general case this will give you duplicate locks when looking at
+ * non-exclusive lock types.
+ */
+ index = 0;
+ while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat)))
+ {
+ /* make sure this definition matches the one used in LockAcquire */
+ if ((proclock->holdMask & LOCKBIT_ON(AccessExclusiveLock)) &&
+ proclock->tag.myLock->tag.locktag_type == LOCKTAG_RELATION)
+ {
+ PGPROC *proc = proclock->tag.myProc;
+ LOCK *lock = proclock->tag.myLock;
+ TransactionId xid = proc->xid;
+
+ /*
+ * Don't record locks for transactions if we know they have
+ * already issued their WAL record for commit but not yet released
+ * lock. It is still possible that we see locks held by already
+ * complete transactions, if they haven't yet zeroed their xids.
+ */
+ if (!TransactionIdIsValid(xid))
+ continue;
+
+ accessExclusiveLocks[index].xid = xid;
+ accessExclusiveLocks[index].dbOid = lock->tag.locktag_field1;
+ accessExclusiveLocks[index].relOid = lock->tag.locktag_field2;
+
+ index++;
+ }
+ }
+
+ Assert(index <= els);
+
+ /*
+ * And release locks. We do this in reverse order for two reasons: (1)
+ * Anyone else who needs more than one of the locks will be trying to lock
+ * them in increasing order; we don't want to release the other process
+ * until it can get all the locks it needs. (2) This avoids O(N^2)
+ * behavior inside LWLockRelease.
+ */
+ for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
+ LWLockRelease(LockHashPartitionLockByIndex(i));
+
+ *nlocks = index;
+ return accessExclusiveLocks;
+}
+
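+/*
+ * Illustrative sketch, not part of this file: a hypothetical caller, in the
+ * spirit of LogStandbySnapshot(), just walks the palloc'd array of
+ * xid/dbOid/relOid triples and frees it when done.
+ */
+#ifdef NOT_USED
+static void
+ShowRunningTransactionLocks(void)
+{
+	xl_standby_lock *locks;
+	int			nlocks;
+	int			i;
+
+	locks = GetRunningTransactionLocks(&nlocks);
+	for (i = 0; i < nlocks; i++)
+		elog(DEBUG1, "xid %u holds AccessExclusiveLock on relation %u/%u",
+			 locks[i].xid, locks[i].dbOid, locks[i].relOid);
+	pfree(locks);
+}
+#endif
+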
+/* Provide the textual name of any lock mode */
+const char *
+GetLockmodeName(LOCKMETHODID lockmethodid, LOCKMODE mode)
+{
+ Assert(lockmethodid > 0 && lockmethodid < lengthof(LockMethods));
+ Assert(mode > 0 && mode <= LockMethods[lockmethodid]->numLockModes);
+ return LockMethods[lockmethodid]->lockModeNames[mode];
+}
+
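+/*
+ * Illustrative usage sketch, not part of this file: callers typically pass a
+ * lock's own method id and mode; DEFAULT_LOCKMETHOD and AccessExclusiveLock
+ * come from lock.h/lockdefs.h.
+ */
+#ifdef NOT_USED
+static void
+ShowLockmodeName(void)
+{
+	elog(DEBUG1, "mode name: %s",
+		 GetLockmodeName(DEFAULT_LOCKMETHOD, AccessExclusiveLock));
+}
+#endif
+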
+#ifdef LOCK_DEBUG
+/*
+ * Dump all locks in the given proc's myProcLocks lists.
+ *
+ * Caller is responsible for having acquired appropriate LWLocks.
+ */
+void
+DumpLocks(PGPROC *proc)
+{
+ SHM_QUEUE *procLocks;
+ PROCLOCK *proclock;
+ LOCK *lock;
+ int i;
+
+ if (proc == NULL)
+ return;
+
+ if (proc->waitLock)
+ LOCK_PRINT("DumpLocks: waiting on", proc->waitLock, 0);
+
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ {
+ procLocks = &(proc->myProcLocks[i]);
+
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, procLink));
+
+ while (proclock)
+ {
+ Assert(proclock->tag.myProc == proc);
+
+ lock = proclock->tag.myLock;
+
+ PROCLOCK_PRINT("DumpLocks", proclock);
+ LOCK_PRINT("DumpLocks", lock, 0);
+
+ proclock = (PROCLOCK *)
+ SHMQueueNext(procLocks, &proclock->procLink,
+ offsetof(PROCLOCK, procLink));
+ }
+ }
+}
+
+/*
+ * Dump all lmgr locks.
+ *
+ * Caller is responsible for having acquired appropriate LWLocks.
+ */
+void
+DumpAllLocks(void)
+{
+ PGPROC *proc;
+ PROCLOCK *proclock;
+ LOCK *lock;
+ HASH_SEQ_STATUS status;
+
+ proc = MyProc;
+
+ if (proc && proc->waitLock)
+ LOCK_PRINT("DumpAllLocks: waiting on", proc->waitLock, 0);
+
+ hash_seq_init(&status, LockMethodProcLockHash);
+
+ while ((proclock = (PROCLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ PROCLOCK_PRINT("DumpAllLocks", proclock);
+
+ lock = proclock->tag.myLock;
+ if (lock)
+ LOCK_PRINT("DumpAllLocks", lock, 0);
+ else
+ elog(LOG, "DumpAllLocks: proclock->tag.myLock = NULL");
+ }
+}
+#endif /* LOCK_DEBUG */
+
+/*
+ * LOCK 2PC resource manager's routines
+ */
+
+/*
+ * Re-acquire a lock belonging to a transaction that was prepared.
+ *
+ * Because this function is run at db startup, re-acquiring the locks should
+ * never conflict with running transactions because there are none. We
+ * assume that the lock state represented by the stored 2PC files is legal.
+ *
+ * When switching from Hot Standby mode to normal operation, the locks will
+ * be already held by the startup process. The locks are acquired for the new
+ * procs without checking for conflicts, so we don't get a conflict between the
+ * startup process and the dummy procs, even though we will momentarily have
+ * a situation where two procs are holding the same AccessExclusiveLock,
+ * which isn't normally possible because of the conflict. If we're in standby
+ * mode, but a recovery snapshot hasn't been established yet, it's possible
+ * that some but not all of the locks are already held by the startup process.
+ *
+ * This approach is simple, but also a bit dangerous, because if there isn't
+ * enough shared memory to acquire the locks, an error will be thrown, which
+ * is promoted to FATAL, and recovery will abort, bringing down the postmaster.
+ * A safer approach would be to transfer the locks like we do in
+ * AtPrepare_Locks, but then again, in hot standby mode it's possible for
+ * read-only backends to use up all the shared lock memory anyway, so that
+ * replaying the WAL record that needs to acquire a lock will throw an error
+ * and PANIC anyway.
+ */
+void
+lock_twophase_recover(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
+ PGPROC *proc = TwoPhaseGetDummyProc(xid, false);
+ LOCKTAG *locktag;
+ LOCKMODE lockmode;
+ LOCKMETHODID lockmethodid;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ PROCLOCKTAG proclocktag;
+ bool found;
+ uint32 hashcode;
+ uint32 proclock_hashcode;
+ int partition;
+ LWLock *partitionLock;
+ LockMethod lockMethodTable;
+
+ Assert(len == sizeof(TwoPhaseLockRecord));
+ locktag = &rec->locktag;
+ lockmode = rec->lockmode;
+ lockmethodid = locktag->locktag_lockmethodid;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+
+ hashcode = LockTagHashCode(locktag);
+ partition = LockHashPartition(hashcode);
+ partitionLock = LockHashPartitionLock(hashcode);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Find or create a lock with this tag.
+ */
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ (void *) locktag,
+ hashcode,
+ HASH_ENTER_NULL,
+ &found);
+ if (!lock)
+ {
+ LWLockRelease(partitionLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_locks_per_transaction.")));
+ }
+
+ /*
+ * if it's a new lock object, initialize it
+ */
+ if (!found)
+ {
+ lock->grantMask = 0;
+ lock->waitMask = 0;
+ SHMQueueInit(&(lock->procLocks));
+ ProcQueueInit(&(lock->waitProcs));
+ lock->nRequested = 0;
+ lock->nGranted = 0;
+ MemSet(lock->requested, 0, sizeof(int) * MAX_LOCKMODES);
+ MemSet(lock->granted, 0, sizeof(int) * MAX_LOCKMODES);
+ LOCK_PRINT("lock_twophase_recover: new", lock, lockmode);
+ }
+ else
+ {
+ LOCK_PRINT("lock_twophase_recover: found", lock, lockmode);
+ Assert((lock->nRequested >= 0) && (lock->requested[lockmode] >= 0));
+ Assert((lock->nGranted >= 0) && (lock->granted[lockmode] >= 0));
+ Assert(lock->nGranted <= lock->nRequested);
+ }
+
+ /*
+ * Create the hash key for the proclock table.
+ */
+ proclocktag.myLock = lock;
+ proclocktag.myProc = proc;
+
+ proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode);
+
+ /*
+ * Find or create a proclock entry with this tag
+ */
+ proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash,
+ (void *) &proclocktag,
+ proclock_hashcode,
+ HASH_ENTER_NULL,
+ &found);
+ if (!proclock)
+ {
+ /* Oops, not enough shmem for the proclock */
+ if (lock->nRequested == 0)
+ {
+ /*
+ * There are no other requestors of this lock, so garbage-collect
+ * the lock object. We *must* do this to avoid a permanent leak
+ * of shared memory, because there won't be anything to cause
+ * anyone to release the lock object later.
+ */
+ Assert(SHMQueueEmpty(&(lock->procLocks)));
+ if (!hash_search_with_hash_value(LockMethodLockHash,
+ (void *) &(lock->tag),
+ hashcode,
+ HASH_REMOVE,
+ NULL))
+ elog(PANIC, "lock table corrupted");
+ }
+ LWLockRelease(partitionLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_locks_per_transaction.")));
+ }
+
+ /*
+ * If new, initialize the new entry
+ */
+ if (!found)
+ {
+ Assert(proc->lockGroupLeader == NULL);
+ proclock->groupLeader = proc;
+ proclock->holdMask = 0;
+ proclock->releaseMask = 0;
+ /* Add proclock to appropriate lists */
+ SHMQueueInsertBefore(&lock->procLocks, &proclock->lockLink);
+ SHMQueueInsertBefore(&(proc->myProcLocks[partition]),
+ &proclock->procLink);
+ PROCLOCK_PRINT("lock_twophase_recover: new", proclock);
+ }
+ else
+ {
+ PROCLOCK_PRINT("lock_twophase_recover: found", proclock);
+ Assert((proclock->holdMask & ~lock->grantMask) == 0);
+ }
+
+ /*
+ * lock->nRequested and lock->requested[] count the total number of
+ * requests, whether granted or waiting, so increment those immediately.
+ */
+ lock->nRequested++;
+ lock->requested[lockmode]++;
+ Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0));
+
+ /*
+ * We shouldn't already hold the desired lock.
+ */
+ if (proclock->holdMask & LOCKBIT_ON(lockmode))
+ elog(ERROR, "lock %s on object %u/%u/%u is already held",
+ lockMethodTable->lockModeNames[lockmode],
+ lock->tag.locktag_field1, lock->tag.locktag_field2,
+ lock->tag.locktag_field3);
+
+ /*
+ * We ignore any possible conflicts and just grant ourselves the lock, not
+ * only because we don't bother checking, but also to avoid deadlocks when
+ * switching from standby to normal mode. See function comment.
+ */
+ GrantLock(lock, proclock, lockmode);
+
+ /*
+ * Bump strong lock count, to make sure any fast-path lock requests won't
+ * be granted without consulting the primary lock table.
+ */
+ if (ConflictsWithRelationFastPath(&lock->tag, lockmode))
+ {
+ uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode);
+
+ SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ FastPathStrongRelationLocks->count[fasthashcode]++;
+ SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+ }
+
+ LWLockRelease(partitionLock);
+}
+
+/*
+ * Re-acquire a lock belonging to a transaction that was prepared, when
+ * starting up into hot standby mode.
+ */
+void
+lock_twophase_standby_recover(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
+ LOCKTAG *locktag;
+ LOCKMODE lockmode;
+ LOCKMETHODID lockmethodid;
+
+ Assert(len == sizeof(TwoPhaseLockRecord));
+ locktag = &rec->locktag;
+ lockmode = rec->lockmode;
+ lockmethodid = locktag->locktag_lockmethodid;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+
+ if (lockmode == AccessExclusiveLock &&
+ locktag->locktag_type == LOCKTAG_RELATION)
+ {
+ StandbyAcquireAccessExclusiveLock(xid,
+ locktag->locktag_field1 /* dboid */ ,
+ locktag->locktag_field2 /* reloid */ );
+ }
+}
+
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * Find and release the lock indicated by the 2PC record.
+ */
+void
+lock_twophase_postcommit(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
+ PGPROC *proc = TwoPhaseGetDummyProc(xid, true);
+ LOCKTAG *locktag;
+ LOCKMETHODID lockmethodid;
+ LockMethod lockMethodTable;
+
+ Assert(len == sizeof(TwoPhaseLockRecord));
+ locktag = &rec->locktag;
+ lockmethodid = locktag->locktag_lockmethodid;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+
+ LockRefindAndRelease(lockMethodTable, proc, locktag, rec->lockmode, true);
+}
+
+/*
+ * 2PC processing routine for ROLLBACK PREPARED case.
+ *
+ * This is actually just the same as the COMMIT case.
+ */
+void
+lock_twophase_postabort(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ lock_twophase_postcommit(xid, info, recdata, len);
+}
+
+/*
+ * VirtualXactLockTableInsert
+ *
+ * Take vxid lock via the fast-path. There can't be any pre-existing
+ * lockers, as we haven't advertised this vxid via the ProcArray yet.
+ *
+ * Since MyProc->fpLocalTransactionId will normally contain the same data
+ * as MyProc->lxid, you might wonder if we really need both. The
+ * difference is that MyProc->lxid is set and cleared unlocked, and
+ * examined by procarray.c, while fpLocalTransactionId is protected by
+ * fpInfoLock and is used only by the locking subsystem. Doing it this
+ * way makes it easier to verify that there are no funny race conditions.
+ *
+ * We don't bother recording this lock in the local lock table, since it's
+ * only ever released at the end of a transaction. Instead,
+ * LockReleaseAll() calls VirtualXactLockTableCleanup().
+ */
+void
+VirtualXactLockTableInsert(VirtualTransactionId vxid)
+{
+ Assert(VirtualTransactionIdIsValid(vxid));
+
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+
+ Assert(MyProc->backendId == vxid.backendId);
+ Assert(MyProc->fpLocalTransactionId == InvalidLocalTransactionId);
+ Assert(MyProc->fpVXIDLock == false);
+
+ MyProc->fpVXIDLock = true;
+ MyProc->fpLocalTransactionId = vxid.localTransactionId;
+
+ LWLockRelease(&MyProc->fpInfoLock);
+}
+
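+/*
+ * Illustrative sketch, not part of this file: roughly the call pattern used
+ * at transaction start (simplified here), where the backend builds its vxid
+ * and advertises it before the ProcArray exposes it to anyone else.
+ */
+#ifdef NOT_USED
+static void
+AdvertiseMyVirtualXact(void)
+{
+	VirtualTransactionId vxid;
+
+	vxid.backendId = MyBackendId;
+	vxid.localTransactionId = GetNextLocalTransactionId();
+	VirtualXactLockTableInsert(vxid);
+}
+#endif
+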
+/*
+ * VirtualXactLockTableCleanup
+ *
+ * Check whether a VXID lock has been materialized; if so, release it,
+ * unblocking waiters.
+ */
+void
+VirtualXactLockTableCleanup(void)
+{
+ bool fastpath;
+ LocalTransactionId lxid;
+
+ Assert(MyProc->backendId != InvalidBackendId);
+
+ /*
+ * Clean up shared memory state.
+ */
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+
+ fastpath = MyProc->fpVXIDLock;
+ lxid = MyProc->fpLocalTransactionId;
+ MyProc->fpVXIDLock = false;
+ MyProc->fpLocalTransactionId = InvalidLocalTransactionId;
+
+ LWLockRelease(&MyProc->fpInfoLock);
+
+ /*
+ * If fpVXIDLock has been cleared without touching fpLocalTransactionId,
+ * that means someone transferred the lock to the main lock table.
+ */
+ if (!fastpath && LocalTransactionIdIsValid(lxid))
+ {
+ VirtualTransactionId vxid;
+ LOCKTAG locktag;
+
+ vxid.backendId = MyBackendId;
+ vxid.localTransactionId = lxid;
+ SET_LOCKTAG_VIRTUALTRANSACTION(locktag, vxid);
+
+ LockRefindAndRelease(LockMethods[DEFAULT_LOCKMETHOD], MyProc,
+ &locktag, ExclusiveLock, false);
+ }
+}
+
+/*
+ * XactLockForVirtualXact
+ *
+ * If TransactionIdIsValid(xid), this is essentially XactLockTableWait(xid,
+ * NULL, NULL, XLTW_None) or ConditionalXactLockTableWait(xid). Unlike those
+ * functions, it assumes "xid" is never a subtransaction and that "xid" is
+ * prepared, committed, or aborted.
+ *
+ * If !TransactionIdIsValid(xid), this locks every prepared XID having been
+ * known as "vxid" before its PREPARE TRANSACTION.
+ */
+static bool
+XactLockForVirtualXact(VirtualTransactionId vxid,
+ TransactionId xid, bool wait)
+{
+ bool more = false;
+
+ /* There is no point in waiting for 2PCs if there are no 2PCs. */
+ if (max_prepared_xacts == 0)
+ return true;
+
+ do
+ {
+ LockAcquireResult lar;
+ LOCKTAG tag;
+
+ /* Clear state from previous iterations. */
+ if (more)
+ {
+ xid = InvalidTransactionId;
+ more = false;
+ }
+
+ /* If we have no xid, try to find one. */
+ if (!TransactionIdIsValid(xid))
+ xid = TwoPhaseGetXidByVirtualXID(vxid, &more);
+ if (!TransactionIdIsValid(xid))
+ {
+ Assert(!more);
+ return true;
+ }
+
+ /* Check or wait for XID completion. */
+ SET_LOCKTAG_TRANSACTION(tag, xid);
+ lar = LockAcquire(&tag, ShareLock, false, !wait);
+ if (lar == LOCKACQUIRE_NOT_AVAIL)
+ return false;
+ LockRelease(&tag, ShareLock, false);
+ } while (more);
+
+ return true;
+}
+
+/*
+ * VirtualXactLock
+ *
+ * If wait = true, wait as long as the given VXID or any XID acquired by the
+ * same transaction is still running. Then, return true.
+ *
+ * If wait = false, just check whether that VXID or one of those XIDs is still
+ * running, and return true or false.
+ */
+bool
+VirtualXactLock(VirtualTransactionId vxid, bool wait)
+{
+ LOCKTAG tag;
+ PGPROC *proc;
+ TransactionId xid = InvalidTransactionId;
+
+ Assert(VirtualTransactionIdIsValid(vxid));
+
+ if (VirtualTransactionIdIsRecoveredPreparedXact(vxid))
+ /* no vxid lock; localTransactionId is a normal, locked XID */
+ return XactLockForVirtualXact(vxid, vxid.localTransactionId, wait);
+
+ SET_LOCKTAG_VIRTUALTRANSACTION(tag, vxid);
+
+ /*
+ * If a lock table entry must be made, this is the PGPROC on whose behalf
+ * it must be done. Note that the transaction might end or the PGPROC
+ * might be reassigned to a new backend before we get around to examining
+ * it, but it doesn't matter. If we find upon examination that the
+ * relevant lxid is no longer running here, that's enough to prove that
+ * it's no longer running anywhere.
+ */
+ proc = BackendIdGetProc(vxid.backendId);
+ if (proc == NULL)
+ return XactLockForVirtualXact(vxid, InvalidTransactionId, wait);
+
+ /*
+ * We must acquire this lock before checking the backendId and lxid
+ * against the ones we're waiting for. The target backend will only set
+ * or clear lxid while holding this lock.
+ */
+ LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE);
+
+ if (proc->backendId != vxid.backendId
+ || proc->fpLocalTransactionId != vxid.localTransactionId)
+ {
+ /* VXID ended */
+ LWLockRelease(&proc->fpInfoLock);
+ return XactLockForVirtualXact(vxid, InvalidTransactionId, wait);
+ }
+
+ /*
+ * If we aren't asked to wait, there's no need to set up a lock table
+ * entry. The transaction is still in progress, so just return false.
+ */
+ if (!wait)
+ {
+ LWLockRelease(&proc->fpInfoLock);
+ return false;
+ }
+
+ /*
+ * OK, we're going to need to sleep on the VXID. But first, we must set
+ * up the primary lock table entry, if needed (ie, convert the proc's
+ * fast-path lock on its VXID to a regular lock).
+ */
+ if (proc->fpVXIDLock)
+ {
+ PROCLOCK *proclock;
+ uint32 hashcode;
+ LWLock *partitionLock;
+
+ hashcode = LockTagHashCode(&tag);
+
+ partitionLock = LockHashPartitionLock(hashcode);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ proclock = SetupLockInTable(LockMethods[DEFAULT_LOCKMETHOD], proc,
+ &tag, hashcode, ExclusiveLock);
+ if (!proclock)
+ {
+ LWLockRelease(partitionLock);
+ LWLockRelease(&proc->fpInfoLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_locks_per_transaction.")));
+ }
+ GrantLock(proclock->tag.myLock, proclock, ExclusiveLock);
+
+ LWLockRelease(partitionLock);
+
+ proc->fpVXIDLock = false;
+ }
+
+ /*
+ * If the proc has an XID now, we'll avoid a TwoPhaseGetXidByVirtualXID()
+ * search. The proc might have assigned this XID but not yet locked it,
+ * in which case the proc will lock this XID before releasing the VXID.
+ * The fpInfoLock critical section excludes VirtualXactLockTableCleanup(),
+ * so we won't save an XID of a different VXID. It doesn't matter whether
+ * we save this before or after setting up the primary lock table entry.
+ */
+ xid = proc->xid;
+
+ /* Done with proc->fpLockBits */
+ LWLockRelease(&proc->fpInfoLock);
+
+ /* Time to wait. */
+ (void) LockAcquire(&tag, ShareLock, false, false);
+
+ LockRelease(&tag, ShareLock, false);
+ return XactLockForVirtualXact(vxid, xid, wait);
+}
+
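+/*
+ * Illustrative sketch, not part of this file: a hypothetical caller can use
+ * wait = false as a cheap liveness probe and wait = true to block until the
+ * virtual transaction (or any XID it acquired) has ended, much as concurrent
+ * index builds do when waiting out old snapshot holders.
+ */
+#ifdef NOT_USED
+static void
+WaitForVirtualXact(VirtualTransactionId vxid)
+{
+	if (!VirtualXactLock(vxid, false))
+		elog(DEBUG1, "vxid %d/%u still running, waiting for it",
+			 vxid.backendId, vxid.localTransactionId);
+	(void) VirtualXactLock(vxid, true);
+}
+#endif
+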
+/*
+ * LockWaiterCount
+ *
+ * Find the number of lock requesters on this locktag
+ */
+int
+LockWaiterCount(const LOCKTAG *locktag)
+{
+ LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+ LOCK *lock;
+ bool found;
+ uint32 hashcode;
+ LWLock *partitionLock;
+ int waiters = 0;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+
+ hashcode = LockTagHashCode(locktag);
+ partitionLock = LockHashPartitionLock(hashcode);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ (const void *) locktag,
+ hashcode,
+ HASH_FIND,
+ &found);
+ if (found)
+ {
+ Assert(lock != NULL);
+ waiters = lock->nRequested;
+ }
+ LWLockRelease(partitionLock);
+
+ return waiters;
+}
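+
+/*
+ * Illustrative sketch, not part of this file: build a lock tag and ask how
+ * many backends are currently requesting it.  The relation-extension code
+ * uses this pattern to decide how aggressively to extend; dbOid and relOid
+ * are placeholders here.
+ */
+#ifdef NOT_USED
+static int
+CountRelationExtensionWaiters(Oid dbOid, Oid relOid)
+{
+	LOCKTAG		tag;
+
+	SET_LOCKTAG_RELATION_EXTEND(tag, dbOid, relOid);
+	return LockWaiterCount(&tag);
+}
+#endif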
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
new file mode 100644
index 0000000..07eb6f6
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -0,0 +1,1977 @@
+/*-------------------------------------------------------------------------
+ *
+ * lwlock.c
+ * Lightweight lock manager
+ *
+ * Lightweight locks are intended primarily to provide mutual exclusion of
+ * access to shared-memory data structures. Therefore, they offer both
+ * exclusive and shared lock modes (to support read/write and read-only
+ * access to a shared object). There are few other frammishes. User-level
+ * locking should be done with the full lock manager --- which depends on
+ * LWLocks to protect its shared state.
+ *
+ * In addition to exclusive and shared modes, lightweight locks can be used to
+ * wait until a variable changes value. The variable is initially not set
+ * when the lock is acquired with LWLockAcquire, i.e. it remains set to the
+ * value it was set to when the lock was released last, and can be updated
+ * without releasing the lock by calling LWLockUpdateVar. LWLockWaitForVar
+ * waits for the variable to be updated, or until the lock is free. When
+ * releasing the lock with LWLockReleaseClearVar() the value can be set to an
+ * appropriate value for a free lock. The meaning of the variable is up to
+ * the caller, the lightweight lock code just assigns and compares it.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/lwlock.c
+ *
+ * NOTES:
+ *
+ * This used to be a pretty straightforward reader-writer lock
+ * implementation, in which the internal state was protected by a
+ * spinlock. Unfortunately the overhead of taking the spinlock proved to be
+ * too high for workloads/locks that were taken in shared mode very
+ * frequently. Often we were spinning in the (obviously exclusive) spinlock,
+ * while trying to acquire a shared lock that was actually free.
+ *
+ * Thus a new implementation was devised that provides wait-free shared lock
+ * acquisition for locks that aren't exclusively locked.
+ *
+ * The basic idea is to have a single atomic variable 'lockcount' instead of
+ * the formerly separate shared and exclusive counters and to use atomic
+ * operations to acquire the lock. That's fairly easy to do for plain
+ * rw-spinlocks, but a lot harder for something like LWLocks that want to wait
+ * in the OS.
+ *
+ * For lock acquisition we use an atomic compare-and-exchange on the lockcount
+ * variable. For exclusive lock we swap in a sentinel value
+ * (LW_VAL_EXCLUSIVE), for shared locks we count the number of holders.
+ *
+ * To release the lock we use an atomic decrement to release the lock. If the
+ * new value is zero (we get that atomically), we know we can/have to release
+ * waiters.
+ *
+ * Obviously it is important that the sentinel value for exclusive locks
+ * doesn't conflict with the maximum number of possible share lockers -
+ * luckily MAX_BACKENDS makes that easily possible.
+ *
+ *
+ * The attentive reader might have noticed that naively doing the above has a
+ * glaring race condition: We try to lock using the atomic operations and
+ * notice that we have to wait. Unfortunately by the time we have finished
+ * queuing, the former locker very well might have already finished its
+ * work. That's problematic because we're now stuck waiting inside the OS.
+ *
+ * To mitigate those races we use a two-phased attempt at locking:
+ * Phase 1: Try to do it atomically, if we succeed, nice
+ * Phase 2: Add ourselves to the waitqueue of the lock
+ * Phase 3: Try to grab the lock again, if we succeed, remove ourselves from
+ * the queue
+ * Phase 4: Sleep till wake-up, goto Phase 1
+ *
+ * This protects us against the problem above, as nobody can release the lock
+ * too quickly before we're queued, since after Phase 2 we're already queued.
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "postmaster/postmaster.h"
+#include "replication/slot.h"
+#include "storage/ipc.h"
+#include "storage/predicate.h"
+#include "storage/proc.h"
+#include "storage/proclist.h"
+#include "storage/spin.h"
+#include "utils/memutils.h"
+
+#ifdef LWLOCK_STATS
+#include "utils/hsearch.h"
+#endif
+
+
+/* We use the ShmemLock spinlock to protect LWLockCounter */
+extern slock_t *ShmemLock;
+
+#define LW_FLAG_HAS_WAITERS ((uint32) 1 << 30)
+#define LW_FLAG_RELEASE_OK ((uint32) 1 << 29)
+#define LW_FLAG_LOCKED ((uint32) 1 << 28)
+
+#define LW_VAL_EXCLUSIVE ((uint32) 1 << 24)
+#define LW_VAL_SHARED 1
+
+#define LW_LOCK_MASK ((uint32) ((1 << 25)-1))
+/* Must be greater than MAX_BACKENDS - which is 2^23-1, so we're fine. */
+#define LW_SHARED_MASK ((uint32) ((1 << 24)-1))
+
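+/*
+ * Illustrative breakdown of the state word implied by the masks above (bits
+ * 25-27 appear unused here):
+ *
+ *	bit 30      LW_FLAG_HAS_WAITERS    somebody is queued on the lock
+ *	bit 29      LW_FLAG_RELEASE_OK     releasing may wake up waiters
+ *	bit 28      LW_FLAG_LOCKED         the wait list itself is locked
+ *	bit 24      LW_VAL_EXCLUSIVE       lock is held exclusively
+ *	bits 0-23                          number of shared holders
+ *
+ * So a reader can decode a snapshot of the state with expressions such as
+ * (state & LW_VAL_EXCLUSIVE) != 0 and state & LW_SHARED_MASK, as the
+ * LOCK_DEBUG code below does.
+ */
+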
+/*
+ * There are three sorts of LWLock "tranches":
+ *
+ * 1. The individually-named locks defined in lwlocknames.h each have their
+ * own tranche. The names of these tranches appear in IndividualLWLockNames[]
+ * in lwlocknames.c.
+ *
+ * 2. There are some predefined tranches for built-in groups of locks.
+ * These are listed in enum BuiltinTrancheIds in lwlock.h, and their names
+ * appear in BuiltinTrancheNames[] below.
+ *
+ * 3. Extensions can create new tranches, via either RequestNamedLWLockTranche
+ * or LWLockRegisterTranche. The names of these that are known in the current
+ * process appear in LWLockTrancheNames[].
+ *
+ * All these names are user-visible as wait event names, so choose with care
+ * ... and do not forget to update the documentation's list of wait events.
+ */
+extern const char *const IndividualLWLockNames[]; /* in lwlocknames.c */
+
+static const char *const BuiltinTrancheNames[] = {
+ /* LWTRANCHE_XACT_BUFFER: */
+ "XactBuffer",
+ /* LWTRANCHE_COMMITTS_BUFFER: */
+ "CommitTSBuffer",
+ /* LWTRANCHE_SUBTRANS_BUFFER: */
+ "SubtransBuffer",
+ /* LWTRANCHE_MULTIXACTOFFSET_BUFFER: */
+ "MultiXactOffsetBuffer",
+ /* LWTRANCHE_MULTIXACTMEMBER_BUFFER: */
+ "MultiXactMemberBuffer",
+ /* LWTRANCHE_NOTIFY_BUFFER: */
+ "NotifyBuffer",
+ /* LWTRANCHE_SERIAL_BUFFER: */
+ "SerialBuffer",
+ /* LWTRANCHE_WAL_INSERT: */
+ "WALInsert",
+ /* LWTRANCHE_BUFFER_CONTENT: */
+ "BufferContent",
+ /* LWTRANCHE_REPLICATION_ORIGIN_STATE: */
+ "ReplicationOriginState",
+ /* LWTRANCHE_REPLICATION_SLOT_IO: */
+ "ReplicationSlotIO",
+ /* LWTRANCHE_LOCK_FASTPATH: */
+ "LockFastPath",
+ /* LWTRANCHE_BUFFER_MAPPING: */
+ "BufferMapping",
+ /* LWTRANCHE_LOCK_MANAGER: */
+ "LockManager",
+ /* LWTRANCHE_PREDICATE_LOCK_MANAGER: */
+ "PredicateLockManager",
+ /* LWTRANCHE_PARALLEL_HASH_JOIN: */
+ "ParallelHashJoin",
+ /* LWTRANCHE_PARALLEL_QUERY_DSA: */
+ "ParallelQueryDSA",
+ /* LWTRANCHE_PER_SESSION_DSA: */
+ "PerSessionDSA",
+ /* LWTRANCHE_PER_SESSION_RECORD_TYPE: */
+ "PerSessionRecordType",
+ /* LWTRANCHE_PER_SESSION_RECORD_TYPMOD: */
+ "PerSessionRecordTypmod",
+ /* LWTRANCHE_SHARED_TUPLESTORE: */
+ "SharedTupleStore",
+ /* LWTRANCHE_SHARED_TIDBITMAP: */
+ "SharedTidBitmap",
+ /* LWTRANCHE_PARALLEL_APPEND: */
+ "ParallelAppend",
+ /* LWTRANCHE_PER_XACT_PREDICATE_LIST: */
+ "PerXactPredicateList"
+};
+
+StaticAssertDecl(lengthof(BuiltinTrancheNames) ==
+ LWTRANCHE_FIRST_USER_DEFINED - NUM_INDIVIDUAL_LWLOCKS,
+ "missing entries in BuiltinTrancheNames[]");
+
+/*
+ * This is indexed by tranche ID minus LWTRANCHE_FIRST_USER_DEFINED, and
+ * stores the names of all dynamically-created tranches known to the current
+ * process. Any unused entries in the array will contain NULL.
+ */
+static const char **LWLockTrancheNames = NULL;
+static int LWLockTrancheNamesAllocated = 0;
+
+/*
+ * This points to the main array of LWLocks in shared memory. Backends inherit
+ * the pointer by fork from the postmaster (except in the EXEC_BACKEND case,
+ * where we have special measures to pass it down).
+ */
+LWLockPadded *MainLWLockArray = NULL;
+
+/*
+ * We use this structure to keep track of locked LWLocks for release
+ * during error recovery. Normally, only a few will be held at once, but
+ * occasionally the number can be much higher; for example, the pg_buffercache
+ * extension locks all buffer partitions simultaneously.
+ */
+#define MAX_SIMUL_LWLOCKS 200
+
+/* struct representing the LWLocks we're holding */
+typedef struct LWLockHandle
+{
+ LWLock *lock;
+ LWLockMode mode;
+} LWLockHandle;
+
+static int num_held_lwlocks = 0;
+static LWLockHandle held_lwlocks[MAX_SIMUL_LWLOCKS];
+
+/* struct representing the LWLock tranche request for named tranche */
+typedef struct NamedLWLockTrancheRequest
+{
+ char tranche_name[NAMEDATALEN];
+ int num_lwlocks;
+} NamedLWLockTrancheRequest;
+
+static NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray = NULL;
+static int NamedLWLockTrancheRequestsAllocated = 0;
+
+/*
+ * NamedLWLockTrancheRequests is both the valid length of the request array,
+ * and the length of the shared-memory NamedLWLockTrancheArray later on.
+ * This variable and NamedLWLockTrancheArray are non-static so that
+ * postmaster.c can copy them to child processes in EXEC_BACKEND builds.
+ */
+int NamedLWLockTrancheRequests = 0;
+
+/* points to data in shared memory: */
+NamedLWLockTranche *NamedLWLockTrancheArray = NULL;
+
+static bool lock_named_request_allowed = true;
+
+static void InitializeLWLocks(void);
+static inline void LWLockReportWaitStart(LWLock *lock);
+static inline void LWLockReportWaitEnd(void);
+static const char *GetLWTrancheName(uint16 trancheId);
+
+#define T_NAME(lock) \
+ GetLWTrancheName((lock)->tranche)
+
+#ifdef LWLOCK_STATS
+typedef struct lwlock_stats_key
+{
+ int tranche;
+ void *instance;
+} lwlock_stats_key;
+
+typedef struct lwlock_stats
+{
+ lwlock_stats_key key;
+ int sh_acquire_count;
+ int ex_acquire_count;
+ int block_count;
+ int dequeue_self_count;
+ int spin_delay_count;
+} lwlock_stats;
+
+static HTAB *lwlock_stats_htab;
+static lwlock_stats lwlock_stats_dummy;
+#endif
+
+#ifdef LOCK_DEBUG
+bool Trace_lwlocks = false;
+
+inline static void
+PRINT_LWDEBUG(const char *where, LWLock *lock, LWLockMode mode)
+{
+ /* hide statement & context here, otherwise the log is just too verbose */
+ if (Trace_lwlocks)
+ {
+ uint32 state = pg_atomic_read_u32(&lock->state);
+
+ ereport(LOG,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg_internal("%d: %s(%s %p): excl %u shared %u haswaiters %u waiters %u rOK %d",
+ MyProcPid,
+ where, T_NAME(lock), lock,
+ (state & LW_VAL_EXCLUSIVE) != 0,
+ state & LW_SHARED_MASK,
+ (state & LW_FLAG_HAS_WAITERS) != 0,
+ pg_atomic_read_u32(&lock->nwaiters),
+ (state & LW_FLAG_RELEASE_OK) != 0)));
+ }
+}
+
+inline static void
+LOG_LWDEBUG(const char *where, LWLock *lock, const char *msg)
+{
+ /* hide statement & context here, otherwise the log is just too verbose */
+ if (Trace_lwlocks)
+ {
+ ereport(LOG,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg_internal("%s(%s %p): %s", where,
+ T_NAME(lock), lock, msg)));
+ }
+}
+
+#else /* not LOCK_DEBUG */
+#define PRINT_LWDEBUG(a,b,c) ((void)0)
+#define LOG_LWDEBUG(a,b,c) ((void)0)
+#endif /* LOCK_DEBUG */
+
+#ifdef LWLOCK_STATS
+
+static void init_lwlock_stats(void);
+static void print_lwlock_stats(int code, Datum arg);
+static lwlock_stats * get_lwlock_stats_entry(LWLock *lock);
+
+static void
+init_lwlock_stats(void)
+{
+ HASHCTL ctl;
+ static MemoryContext lwlock_stats_cxt = NULL;
+ static bool exit_registered = false;
+
+ if (lwlock_stats_cxt != NULL)
+ MemoryContextDelete(lwlock_stats_cxt);
+
+ /*
+ * The LWLock stats will be updated within a critical section, which
+ * requires allocating new hash entries. Allocations within a critical
+ * section are normally not allowed because running out of memory would
+ * lead to a PANIC, but LWLOCK_STATS is debugging code that's not normally
+ * turned on in production, so that's an acceptable risk. The hash entries
+ * are small, so the risk of running out of memory is minimal in practice.
+ */
+ lwlock_stats_cxt = AllocSetContextCreate(TopMemoryContext,
+ "LWLock stats",
+ ALLOCSET_DEFAULT_SIZES);
+ MemoryContextAllowInCriticalSection(lwlock_stats_cxt, true);
+
+ ctl.keysize = sizeof(lwlock_stats_key);
+ ctl.entrysize = sizeof(lwlock_stats);
+ ctl.hcxt = lwlock_stats_cxt;
+ lwlock_stats_htab = hash_create("lwlock stats", 16384, &ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ if (!exit_registered)
+ {
+ on_shmem_exit(print_lwlock_stats, 0);
+ exit_registered = true;
+ }
+}
+
+static void
+print_lwlock_stats(int code, Datum arg)
+{
+ HASH_SEQ_STATUS scan;
+ lwlock_stats *lwstats;
+
+ hash_seq_init(&scan, lwlock_stats_htab);
+
+ /* Grab an LWLock to keep different backends from mixing reports */
+ LWLockAcquire(&MainLWLockArray[0].lock, LW_EXCLUSIVE);
+
+ while ((lwstats = (lwlock_stats *) hash_seq_search(&scan)) != NULL)
+ {
+ fprintf(stderr,
+ "PID %d lwlock %s %p: shacq %u exacq %u blk %u spindelay %u dequeue self %u\n",
+ MyProcPid, GetLWTrancheName(lwstats->key.tranche),
+ lwstats->key.instance, lwstats->sh_acquire_count,
+ lwstats->ex_acquire_count, lwstats->block_count,
+ lwstats->spin_delay_count, lwstats->dequeue_self_count);
+ }
+
+ LWLockRelease(&MainLWLockArray[0].lock);
+}
+
+static lwlock_stats *
+get_lwlock_stats_entry(LWLock *lock)
+{
+ lwlock_stats_key key;
+ lwlock_stats *lwstats;
+ bool found;
+
+ /*
+ * During shared memory initialization, the hash table doesn't exist yet.
+ * Stats of that phase aren't very interesting, so just collect operations
+ * on all locks in a single dummy entry.
+ */
+ if (lwlock_stats_htab == NULL)
+ return &lwlock_stats_dummy;
+
+ /* Fetch or create the entry. */
+ MemSet(&key, 0, sizeof(key));
+ key.tranche = lock->tranche;
+ key.instance = lock;
+ lwstats = hash_search(lwlock_stats_htab, &key, HASH_ENTER, &found);
+ if (!found)
+ {
+ lwstats->sh_acquire_count = 0;
+ lwstats->ex_acquire_count = 0;
+ lwstats->block_count = 0;
+ lwstats->dequeue_self_count = 0;
+ lwstats->spin_delay_count = 0;
+ }
+ return lwstats;
+}
+#endif /* LWLOCK_STATS */
+
+
+/*
+ * Compute number of LWLocks required by named tranches. These will be
+ * allocated in the main array.
+ */
+static int
+NumLWLocksForNamedTranches(void)
+{
+ int numLocks = 0;
+ int i;
+
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ numLocks += NamedLWLockTrancheRequestArray[i].num_lwlocks;
+
+ return numLocks;
+}
+
+/*
+ * Compute shmem space needed for LWLocks and named tranches.
+ */
+Size
+LWLockShmemSize(void)
+{
+ Size size;
+ int i;
+ int numLocks = NUM_FIXED_LWLOCKS;
+
+ /* Calculate total number of locks needed in the main array. */
+ numLocks += NumLWLocksForNamedTranches();
+
+ /* Space for the LWLock array. */
+ size = mul_size(numLocks, sizeof(LWLockPadded));
+
+ /* Space for dynamic allocation counter, plus room for alignment. */
+ size = add_size(size, sizeof(int) + LWLOCK_PADDED_SIZE);
+
+ /* space for named tranches. */
+ size = add_size(size, mul_size(NamedLWLockTrancheRequests, sizeof(NamedLWLockTranche)));
+
+ /* space for name of each tranche. */
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ size = add_size(size, strlen(NamedLWLockTrancheRequestArray[i].tranche_name) + 1);
+
+ /* Disallow adding any more named tranches. */
+ lock_named_request_allowed = false;
+
+ return size;
+}
+
+/*
+ * Allocate shmem space for the main LWLock array and all tranches and
+ * initialize it. We also register extension LWLock tranches here.
+ */
+void
+CreateLWLocks(void)
+{
+ StaticAssertStmt(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
+ "MAX_BACKENDS too big for lwlock.c");
+
+ StaticAssertStmt(sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
+ "Miscalculated LWLock padding");
+
+ if (!IsUnderPostmaster)
+ {
+ Size spaceLocks = LWLockShmemSize();
+ int *LWLockCounter;
+ char *ptr;
+
+ /* Allocate space */
+ ptr = (char *) ShmemAlloc(spaceLocks);
+
+ /* Leave room for dynamic allocation of tranches */
+ ptr += sizeof(int);
+
+ /* Ensure desired alignment of LWLock array */
+ ptr += LWLOCK_PADDED_SIZE - ((uintptr_t) ptr) % LWLOCK_PADDED_SIZE;
+
+ MainLWLockArray = (LWLockPadded *) ptr;
+
+ /*
+ * Initialize the dynamic-allocation counter for tranches, which is
+ * stored just before the first LWLock.
+ */
+ LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+ *LWLockCounter = LWTRANCHE_FIRST_USER_DEFINED;
+
+ /* Initialize all LWLocks */
+ InitializeLWLocks();
+ }
+
+ /* Register named extension LWLock tranches in the current process. */
+ for (int i = 0; i < NamedLWLockTrancheRequests; i++)
+ LWLockRegisterTranche(NamedLWLockTrancheArray[i].trancheId,
+ NamedLWLockTrancheArray[i].trancheName);
+}
+
+/*
+ * Initialize LWLocks that are fixed and those belonging to named tranches.
+ */
+static void
+InitializeLWLocks(void)
+{
+ int numNamedLocks = NumLWLocksForNamedTranches();
+ int id;
+ int i;
+ int j;
+ LWLockPadded *lock;
+
+ /* Initialize all individual LWLocks in main array */
+ for (id = 0, lock = MainLWLockArray; id < NUM_INDIVIDUAL_LWLOCKS; id++, lock++)
+ LWLockInitialize(&lock->lock, id);
+
+ /* Initialize buffer mapping LWLocks in main array */
+ lock = MainLWLockArray + BUFFER_MAPPING_LWLOCK_OFFSET;
+ for (id = 0; id < NUM_BUFFER_PARTITIONS; id++, lock++)
+ LWLockInitialize(&lock->lock, LWTRANCHE_BUFFER_MAPPING);
+
+ /* Initialize lmgrs' LWLocks in main array */
+ lock = MainLWLockArray + LOCK_MANAGER_LWLOCK_OFFSET;
+ for (id = 0; id < NUM_LOCK_PARTITIONS; id++, lock++)
+ LWLockInitialize(&lock->lock, LWTRANCHE_LOCK_MANAGER);
+
+ /* Initialize predicate lmgrs' LWLocks in main array */
+ lock = MainLWLockArray + PREDICATELOCK_MANAGER_LWLOCK_OFFSET;
+ for (id = 0; id < NUM_PREDICATELOCK_PARTITIONS; id++, lock++)
+ LWLockInitialize(&lock->lock, LWTRANCHE_PREDICATE_LOCK_MANAGER);
+
+ /*
+ * Copy the info about any named tranches into shared memory (so that
+ * other processes can see it), and initialize the requested LWLocks.
+ */
+ if (NamedLWLockTrancheRequests > 0)
+ {
+ char *trancheNames;
+
+ NamedLWLockTrancheArray = (NamedLWLockTranche *)
+ &MainLWLockArray[NUM_FIXED_LWLOCKS + numNamedLocks];
+
+ trancheNames = (char *) NamedLWLockTrancheArray +
+ (NamedLWLockTrancheRequests * sizeof(NamedLWLockTranche));
+ lock = &MainLWLockArray[NUM_FIXED_LWLOCKS];
+
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ {
+ NamedLWLockTrancheRequest *request;
+ NamedLWLockTranche *tranche;
+ char *name;
+
+ request = &NamedLWLockTrancheRequestArray[i];
+ tranche = &NamedLWLockTrancheArray[i];
+
+ name = trancheNames;
+ trancheNames += strlen(request->tranche_name) + 1;
+ strcpy(name, request->tranche_name);
+ tranche->trancheId = LWLockNewTrancheId();
+ tranche->trancheName = name;
+
+ for (j = 0; j < request->num_lwlocks; j++, lock++)
+ LWLockInitialize(&lock->lock, tranche->trancheId);
+ }
+ }
+}
+
+/*
+ * InitLWLockAccess - initialize backend-local state needed to hold LWLocks
+ */
+void
+InitLWLockAccess(void)
+{
+#ifdef LWLOCK_STATS
+ init_lwlock_stats();
+#endif
+}
+
+/*
+ * GetNamedLWLockTranche - returns the base address of LWLock from the
+ * specified tranche.
+ *
+ * The caller needs to retrieve the requested number of LWLocks starting from
+ * the base lock address returned by this API. This can be used for
+ * tranches that are requested via the RequestNamedLWLockTranche() API.
+ */
+LWLockPadded *
+GetNamedLWLockTranche(const char *tranche_name)
+{
+ int lock_pos;
+ int i;
+
+ /*
+ * Obtain the position of base address of LWLock belonging to requested
+ * tranche_name in MainLWLockArray. LWLocks for named tranches are placed
+ * in MainLWLockArray after fixed locks.
+ */
+ lock_pos = NUM_FIXED_LWLOCKS;
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ {
+ if (strcmp(NamedLWLockTrancheRequestArray[i].tranche_name,
+ tranche_name) == 0)
+ return &MainLWLockArray[lock_pos];
+
+ lock_pos += NamedLWLockTrancheRequestArray[i].num_lwlocks;
+ }
+
+ elog(ERROR, "requested tranche is not registered");
+
+ /* just to keep compiler quiet */
+ return NULL;
+}
+
+/*
+ * Allocate a new tranche ID.
+ */
+int
+LWLockNewTrancheId(void)
+{
+ int result;
+ int *LWLockCounter;
+
+ LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+ SpinLockAcquire(ShmemLock);
+ result = (*LWLockCounter)++;
+ SpinLockRelease(ShmemLock);
+
+ return result;
+}
+
+/*
+ * Register a dynamic tranche name in the lookup table of the current process.
+ *
+ * This routine will save a pointer to the tranche name passed as an argument,
+ * so the name should be allocated in a backend-lifetime context
+ * (shared memory, TopMemoryContext, static constant, or similar).
+ *
+ * The tranche name will be user-visible as a wait event name, so try to
+ * use a name that fits the style for those.
+ */
+void
+LWLockRegisterTranche(int tranche_id, const char *tranche_name)
+{
+ /* This should only be called for user-defined tranches. */
+ if (tranche_id < LWTRANCHE_FIRST_USER_DEFINED)
+ return;
+
+ /* Convert to array index. */
+ tranche_id -= LWTRANCHE_FIRST_USER_DEFINED;
+
+ /* If necessary, create or enlarge array. */
+ if (tranche_id >= LWLockTrancheNamesAllocated)
+ {
+ int newalloc;
+
+ newalloc = Max(LWLockTrancheNamesAllocated, 8);
+ while (newalloc <= tranche_id)
+ newalloc *= 2;
+
+ if (LWLockTrancheNames == NULL)
+ LWLockTrancheNames = (const char **)
+ MemoryContextAllocZero(TopMemoryContext,
+ newalloc * sizeof(char *));
+ else
+ {
+ LWLockTrancheNames = (const char **)
+ repalloc(LWLockTrancheNames, newalloc * sizeof(char *));
+ memset(LWLockTrancheNames + LWLockTrancheNamesAllocated,
+ 0,
+ (newalloc - LWLockTrancheNamesAllocated) * sizeof(char *));
+ }
+ LWLockTrancheNamesAllocated = newalloc;
+ }
+
+ LWLockTrancheNames[tranche_id] = tranche_name;
+}
+
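+/*
+ * Illustrative sketch, not part of this file: hypothetical extension code
+ * creating an LWLock in its own shared structure typically allocates a
+ * tranche id once, and every process that uses the lock registers the same
+ * id under a backend-lifetime name (a string literal here).  The struct and
+ * names below are placeholders.
+ */
+#ifdef NOT_USED
+typedef struct MySharedState
+{
+	int			tranche_id;
+	LWLock		lock;
+} MySharedState;
+
+static void
+my_shared_state_create(MySharedState *state)
+{
+	/* creating process: allocate the id and initialize the lock with it */
+	state->tranche_id = LWLockNewTrancheId();
+	LWLockInitialize(&state->lock, state->tranche_id);
+	LWLockRegisterTranche(state->tranche_id, "my_extension");
+}
+
+static void
+my_shared_state_attach(MySharedState *state)
+{
+	/* attaching process: only the local name registration is needed */
+	LWLockRegisterTranche(state->tranche_id, "my_extension");
+}
+#endif
+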
+/*
+ * RequestNamedLWLockTranche
+ * Request that extra LWLocks be allocated during postmaster
+ * startup.
+ *
+ * This is only useful for extensions if called from the _PG_init hook
+ * of a library that is loaded into the postmaster via
+ * shared_preload_libraries. Once shared memory has been allocated, calls
+ * will be ignored. (We could raise an error, but it seems better to make
+ * it a no-op, so that libraries containing such calls can be reloaded if
+ * needed.)
+ *
+ * The tranche name will be user-visible as a wait event name, so try to
+ * use a name that fits the style for those.
+ */
+void
+RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks)
+{
+ NamedLWLockTrancheRequest *request;
+
+ if (IsUnderPostmaster || !lock_named_request_allowed)
+ return; /* too late */
+
+ if (NamedLWLockTrancheRequestArray == NULL)
+ {
+ NamedLWLockTrancheRequestsAllocated = 16;
+ NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *)
+ MemoryContextAlloc(TopMemoryContext,
+ NamedLWLockTrancheRequestsAllocated
+ * sizeof(NamedLWLockTrancheRequest));
+ }
+
+ if (NamedLWLockTrancheRequests >= NamedLWLockTrancheRequestsAllocated)
+ {
+ int i = NamedLWLockTrancheRequestsAllocated;
+
+ while (i <= NamedLWLockTrancheRequests)
+ i *= 2;
+
+ NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *)
+ repalloc(NamedLWLockTrancheRequestArray,
+ i * sizeof(NamedLWLockTrancheRequest));
+ NamedLWLockTrancheRequestsAllocated = i;
+ }
+
+ request = &NamedLWLockTrancheRequestArray[NamedLWLockTrancheRequests];
+ Assert(strlen(tranche_name) + 1 <= NAMEDATALEN);
+ strlcpy(request->tranche_name, tranche_name, NAMEDATALEN);
+ request->num_lwlocks = num_lwlocks;
+ NamedLWLockTrancheRequests++;
+}
+
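+/*
+ * Illustrative sketch, not part of this file: a hypothetical library loaded
+ * via shared_preload_libraries reserves its locks from _PG_init() and later
+ * fetches them by name once shared memory exists; "my_extension" and the
+ * lock count are placeholders.
+ */
+#ifdef NOT_USED
+void
+_PG_init(void)
+{
+	/* must run in the postmaster, before shared memory is sized */
+	RequestNamedLWLockTranche("my_extension", 1);
+}
+
+static LWLock *
+my_extension_lock(void)
+{
+	/* valid once shared memory has been created and tranches initialized */
+	return &(GetNamedLWLockTranche("my_extension"))->lock;
+}
+#endif
+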
+/*
+ * LWLockInitialize - initialize a new lwlock; it's initially unlocked
+ */
+void
+LWLockInitialize(LWLock *lock, int tranche_id)
+{
+ pg_atomic_init_u32(&lock->state, LW_FLAG_RELEASE_OK);
+#ifdef LOCK_DEBUG
+ pg_atomic_init_u32(&lock->nwaiters, 0);
+#endif
+ lock->tranche = tranche_id;
+ proclist_init(&lock->waiters);
+}
+
+/*
+ * Report start of wait event for light-weight locks.
+ *
+ * This function is used by all the light-weight lock calls that need to
+ * wait to acquire the lock. It distinguishes the wait event based on
+ * tranche and lock id.
+ */
+static inline void
+LWLockReportWaitStart(LWLock *lock)
+{
+ pgstat_report_wait_start(PG_WAIT_LWLOCK | lock->tranche);
+}
+
+/*
+ * Report end of wait event for light-weight locks.
+ */
+static inline void
+LWLockReportWaitEnd(void)
+{
+ pgstat_report_wait_end();
+}
+
+/*
+ * Return the name of an LWLock tranche.
+ */
+static const char *
+GetLWTrancheName(uint16 trancheId)
+{
+ /* Individual LWLock? */
+ if (trancheId < NUM_INDIVIDUAL_LWLOCKS)
+ return IndividualLWLockNames[trancheId];
+
+ /* Built-in tranche? */
+ if (trancheId < LWTRANCHE_FIRST_USER_DEFINED)
+ return BuiltinTrancheNames[trancheId - NUM_INDIVIDUAL_LWLOCKS];
+
+ /*
+ * It's an extension tranche, so look in LWLockTrancheNames[]. However,
+ * it's possible that the tranche has never been registered in the current
+ * process, in which case give up and return "extension".
+ */
+ trancheId -= LWTRANCHE_FIRST_USER_DEFINED;
+
+ if (trancheId >= LWLockTrancheNamesAllocated ||
+ LWLockTrancheNames[trancheId] == NULL)
+ return "extension";
+
+ return LWLockTrancheNames[trancheId];
+}
+
+/*
+ * Return an identifier for an LWLock based on the wait class and event.
+ */
+const char *
+GetLWLockIdentifier(uint32 classId, uint16 eventId)
+{
+ Assert(classId == PG_WAIT_LWLOCK);
+ /* The event IDs are just tranche numbers. */
+ return GetLWTrancheName(eventId);
+}
+
+/*
+ * Internal function that tries to atomically acquire the lwlock in the passed
+ * in mode.
+ *
+ * This function will not block waiting for a lock to become free - that's the
+ * caller's job.
+ *
+ * Returns true if the lock isn't free and we need to wait.
+ */
+static bool
+LWLockAttemptLock(LWLock *lock, LWLockMode mode)
+{
+ uint32 old_state;
+
+ AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
+
+ /*
+ * Read once outside the loop, later iterations will get the newer value
+ * via compare & exchange.
+ */
+ old_state = pg_atomic_read_u32(&lock->state);
+
+ /* loop until we've determined whether we could acquire the lock or not */
+ while (true)
+ {
+ uint32 desired_state;
+ bool lock_free;
+
+ desired_state = old_state;
+
+ if (mode == LW_EXCLUSIVE)
+ {
+ lock_free = (old_state & LW_LOCK_MASK) == 0;
+ if (lock_free)
+ desired_state += LW_VAL_EXCLUSIVE;
+ }
+ else
+ {
+ lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0;
+ if (lock_free)
+ desired_state += LW_VAL_SHARED;
+ }
+
+ /*
+ * Attempt to swap in the state we are expecting. If we didn't see the
+ * lock as free, that's just the old value. If we saw it as free,
+ * we'll attempt to mark it acquired. The reason that we always swap
+ * in the value is that this doubles as a memory barrier. We could try
+ * to be smarter and only swap in values if we saw the lock as free,
+ * but benchmarks haven't shown it to be beneficial so far.
+ *
+ * Retry if the value changed since we last looked at it.
+ */
+ if (pg_atomic_compare_exchange_u32(&lock->state,
+ &old_state, desired_state))
+ {
+ if (lock_free)
+ {
+ /* Great! Got the lock. */
+#ifdef LOCK_DEBUG
+ if (mode == LW_EXCLUSIVE)
+ lock->owner = MyProc;
+#endif
+ return false;
+ }
+ else
+ return true; /* somebody else has the lock */
+ }
+ }
+ pg_unreachable();
+}
+
+/*
+ * Lock the LWLock's wait list against concurrent activity.
+ *
+ * NB: even though the wait list is locked, non-conflicting lock operations
+ * may still happen concurrently.
+ *
+ * Time spent holding mutex should be short!
+ */
+static void
+LWLockWaitListLock(LWLock *lock)
+{
+ uint32 old_state;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+ uint32 delays = 0;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ while (true)
+ {
+ /* always try once to acquire lock directly */
+ old_state = pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_LOCKED);
+ if (!(old_state & LW_FLAG_LOCKED))
+ break; /* got lock */
+
+ /* and then spin without atomic operations until lock is released */
+ {
+ SpinDelayStatus delayStatus;
+
+ init_local_spin_delay(&delayStatus);
+
+ while (old_state & LW_FLAG_LOCKED)
+ {
+ perform_spin_delay(&delayStatus);
+ old_state = pg_atomic_read_u32(&lock->state);
+ }
+#ifdef LWLOCK_STATS
+ delays += delayStatus.delays;
+#endif
+ finish_spin_delay(&delayStatus);
+ }
+
+ /*
+ * Retry. The lock might well have been re-acquired by the time we're
+ * attempting to get it again.
+ */
+ }
+
+#ifdef LWLOCK_STATS
+ lwstats->spin_delay_count += delays;
+#endif
+}
+
+/*
+ * Unlock the LWLock's wait list.
+ *
+ * Note that it can be more efficient to manipulate flags and release the
+ * locks in a single atomic operation.
+ */
+static void
+LWLockWaitListUnlock(LWLock *lock)
+{
+ uint32 old_state PG_USED_FOR_ASSERTS_ONLY;
+
+ old_state = pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_LOCKED);
+
+ Assert(old_state & LW_FLAG_LOCKED);
+}
+
+/*
+ * Wakeup all the lockers that currently have a chance to acquire the lock.
+ */
+static void
+LWLockWakeup(LWLock *lock)
+{
+ bool new_release_ok;
+ bool wokeup_somebody = false;
+ proclist_head wakeup;
+ proclist_mutable_iter iter;
+
+ proclist_init(&wakeup);
+
+ new_release_ok = true;
+
+ /* lock wait list while collecting backends to wake up */
+ LWLockWaitListLock(lock);
+
+ proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE)
+ continue;
+
+ proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
+ proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
+
+ if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE)
+ {
+ /*
+ * Prevent additional wakeups until the retrying backend gets to run.
+ * Backends that are just waiting for the lock to become free don't
+ * retry automatically.
+ */
+ new_release_ok = false;
+
+ /*
+ * Don't wake up (further) exclusive waiters.
+ */
+ wokeup_somebody = true;
+ }
+
+ /*
+ * Once we've woken up an exclusive lock, there's no point in waking
+ * up anybody else.
+ */
+ if (waiter->lwWaitMode == LW_EXCLUSIVE)
+ break;
+ }
+
+ Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS);
+
+ /* unset required flags, and release lock, in one fell swoop */
+ {
+ uint32 old_state;
+ uint32 desired_state;
+
+ old_state = pg_atomic_read_u32(&lock->state);
+ while (true)
+ {
+ desired_state = old_state;
+
+ /* compute desired flags */
+
+ if (new_release_ok)
+ desired_state |= LW_FLAG_RELEASE_OK;
+ else
+ desired_state &= ~LW_FLAG_RELEASE_OK;
+
+ if (proclist_is_empty(&wakeup))
+ desired_state &= ~LW_FLAG_HAS_WAITERS;
+
+ desired_state &= ~LW_FLAG_LOCKED; /* release lock */
+
+ if (pg_atomic_compare_exchange_u32(&lock->state, &old_state,
+ desired_state))
+ break;
+ }
+ }
+
+ /* Awaken any waiters I removed from the queue. */
+ proclist_foreach_modify(iter, &wakeup, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ LOG_LWDEBUG("LWLockRelease", lock, "release waiter");
+ proclist_delete(&wakeup, iter.cur, lwWaitLink);
+
+ /*
+ * Guarantee that lwWaiting being unset only becomes visible once the
+ * unlink from the list has completed. Otherwise the target backend
+ * could be woken up for some other reason and enqueue for a new lock - if
+ * that happens before the list unlink happens, the list would end up
+ * being corrupted.
+ *
+ * The barrier pairs with the LWLockWaitListLock() when enqueuing for
+ * another lock.
+ */
+ pg_write_barrier();
+ waiter->lwWaiting = false;
+ PGSemaphoreUnlock(waiter->sem);
+ }
+}
+
+/*
+ * Add ourselves to the end of the queue.
+ *
+ * NB: Mode can be LW_WAIT_UNTIL_FREE here!
+ */
+static void
+LWLockQueueSelf(LWLock *lock, LWLockMode mode)
+{
+ /*
+ * If we don't have a PGPROC structure, there's no way to wait. This
+ * should never occur, since MyProc should only be null during shared
+ * memory initialization.
+ */
+ if (MyProc == NULL)
+ elog(PANIC, "cannot wait without a PGPROC structure");
+
+ if (MyProc->lwWaiting)
+ elog(PANIC, "queueing for lock while waiting on another one");
+
+ LWLockWaitListLock(lock);
+
+ /* setting the flag is protected by the spinlock */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_HAS_WAITERS);
+
+ MyProc->lwWaiting = true;
+ MyProc->lwWaitMode = mode;
+
+ /* LW_WAIT_UNTIL_FREE waiters are always at the front of the queue */
+ if (mode == LW_WAIT_UNTIL_FREE)
+ proclist_push_head(&lock->waiters, MyProc->pgprocno, lwWaitLink);
+ else
+ proclist_push_tail(&lock->waiters, MyProc->pgprocno, lwWaitLink);
+
+ /* Can release the mutex now */
+ LWLockWaitListUnlock(lock);
+
+#ifdef LOCK_DEBUG
+ pg_atomic_fetch_add_u32(&lock->nwaiters, 1);
+#endif
+
+}
+
+/*
+ * Remove ourselves from the waitlist.
+ *
+ * This is used if we queued ourselves because we thought we needed to sleep
+ * but, after further checking, we discovered that we don't actually need to
+ * do so.
+ */
+static void
+LWLockDequeueSelf(LWLock *lock)
+{
+ bool found = false;
+ proclist_mutable_iter iter;
+
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+
+ lwstats->dequeue_self_count++;
+#endif
+
+ LWLockWaitListLock(lock);
+
+ /*
+ * We can't simply remove ourselves from the list; we need to iterate over
+ * all entries, since somebody else could have dequeued us already.
+ */
+ proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
+ {
+ if (iter.cur == MyProc->pgprocno)
+ {
+ found = true;
+ proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
+ break;
+ }
+ }
+
+ if (proclist_is_empty(&lock->waiters) &&
+ (pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS) != 0)
+ {
+ pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_HAS_WAITERS);
+ }
+
+ /* XXX: combine with fetch_and above? */
+ LWLockWaitListUnlock(lock);
+
+ /* clear waiting state again, nice for debugging */
+ if (found)
+ MyProc->lwWaiting = false;
+ else
+ {
+ int extraWaits = 0;
+
+ /*
+ * Somebody else dequeued us and has or will wake us up. Deal with the
+ * superfluous absorption of a wakeup.
+ */
+
+ /*
+ * Reset RELEASE_OK flag if somebody woke us before we removed
+ * ourselves - they'll have set it to false.
+ */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+ /*
+ * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
+ * get reset at some inconvenient point later. Most of the time this
+ * will immediately return.
+ */
+ for (;;)
+ {
+ PGSemaphoreLock(MyProc->sem);
+ if (!MyProc->lwWaiting)
+ break;
+ extraWaits++;
+ }
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(MyProc->sem);
+ }
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+}
+
+/*
+ * LWLockAcquire - acquire a lightweight lock in the specified mode
+ *
+ * If the lock is not available, sleep until it is. Returns true if the lock
+ * was available immediately, false if we had to sleep.
+ *
+ * Side effect: cancel/die interrupts are held off until lock release.
+ */
+bool
+LWLockAcquire(LWLock *lock, LWLockMode mode)
+{
+ PGPROC *proc = MyProc;
+ bool result = true;
+ int extraWaits = 0;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+ PRINT_LWDEBUG("LWLockAcquire", lock, mode);
+
+#ifdef LWLOCK_STATS
+ /* Count lock acquisition attempts */
+ if (mode == LW_EXCLUSIVE)
+ lwstats->ex_acquire_count++;
+ else
+ lwstats->sh_acquire_count++;
+#endif /* LWLOCK_STATS */
+
+ /*
+ * We can't wait if we haven't got a PGPROC. This should only occur
+ * during bootstrap or shared memory initialization. Put an Assert here
+ * to catch unsafe coding practices.
+ */
+ Assert(!(proc == NULL && IsUnderPostmaster));
+
+ /* Ensure we will have room to remember the lock */
+ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
+ elog(ERROR, "too many LWLocks taken");
+
+ /*
+ * Lock out cancel/die interrupts until we exit the code section protected
+ * by the LWLock. This ensures that interrupts will not interfere with
+ * manipulations of data structures in shared memory.
+ */
+ HOLD_INTERRUPTS();
+
+ /*
+ * Loop here to try to acquire lock after each time we are signaled by
+ * LWLockRelease.
+ *
+ * NOTE: it might seem better to have LWLockRelease actually grant us the
+ * lock, rather than retrying and possibly having to go back to sleep. But
+ * in practice that is no good because it means a process swap for every
+ * lock acquisition when two or more processes are contending for the same
+ * lock. Since LWLocks are normally used to protect not-very-long
+ * sections of computation, a process needs to be able to acquire and
+ * release the same lock many times during a single CPU time slice, even
+ * in the presence of contention. The efficiency of being able to do that
+ * outweighs the inefficiency of sometimes wasting a process dispatch
+ * cycle because the lock is not free when a released waiter finally gets
+ * to run. See pgsql-hackers archives for 29-Dec-01.
+ */
+ for (;;)
+ {
+ bool mustwait;
+
+ /*
+ * Try to grab the lock the first time; we're not in the wait queue
+ * yet/anymore.
+ */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (!mustwait)
+ {
+ LOG_LWDEBUG("LWLockAcquire", lock, "immediately acquired lock");
+ break; /* got the lock */
+ }
+
+ /*
+ * Ok, at this point we couldn't grab the lock on the first try. We
+ * cannot simply queue ourselves to the end of the list and wait to be
+ * woken up, because by now the lock could long since have been released.
+ * Instead add us to the queue and try to grab the lock again. If we
+ * succeed we need to revert the queuing and be happy, otherwise we
+ * recheck the lock. If we still couldn't grab it, we know that the
+ * other locker will see our queue entries when releasing since they
+ * existed before we checked for the lock.
+ */
+
+ /* add to the queue */
+ LWLockQueueSelf(lock, mode);
+
+ /* we're now guaranteed to be woken up if necessary */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ /* ok, grabbed the lock the second time round, need to undo queueing */
+ if (!mustwait)
+ {
+ LOG_LWDEBUG("LWLockAcquire", lock, "acquired, undoing queue");
+
+ LWLockDequeueSelf(lock);
+ break;
+ }
+
+ /*
+ * Wait until awakened.
+ *
+ * It is possible that we get awakened for a reason other than being
+ * signaled by LWLockRelease. If so, loop back and wait again. Once
+ * we've gotten the LWLock, re-increment the sema by the number of
+ * additional signals received.
+ */
+ LOG_LWDEBUG("LWLockAcquire", lock, "waiting");
+
+#ifdef LWLOCK_STATS
+ lwstats->block_count++;
+#endif
+
+ LWLockReportWaitStart(lock);
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
+
+ for (;;)
+ {
+ PGSemaphoreLock(proc->sem);
+ if (!proc->lwWaiting)
+ break;
+ extraWaits++;
+ }
+
+ /* Retrying, allow LWLockRelease to release waiters again. */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode);
+ LWLockReportWaitEnd();
+
+ LOG_LWDEBUG("LWLockAcquire", lock, "awakened");
+
+ /* Now loop back and try to acquire lock again. */
+ result = false;
+ }
+
+ if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), mode);
+
+ /* Add lock to list of locks held by this backend */
+ held_lwlocks[num_held_lwlocks].lock = lock;
+ held_lwlocks[num_held_lwlocks++].mode = mode;
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+
+ return result;
+}
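+
+#ifdef LWLOCK_USAGE_EXAMPLE
+/*
+ * Editor's sketch, not upstream code: the canonical caller pattern around a
+ * shared-memory update.  The guard macro, function and "counter" are
+ * hypothetical.  Cancel/die interrupts stay held off between acquire and
+ * release, so the protected section should be kept short.
+ */
+static void
+example_bump_counter(LWLock *lock, uint64 *counter)
+{
+	LWLockAcquire(lock, LW_EXCLUSIVE);
+	(*counter)++;				/* update protected by "lock" */
+	LWLockRelease(lock);
+}
+#endif							/* LWLOCK_USAGE_EXAMPLE */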
+
+/*
+ * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode
+ *
+ * If the lock is not available, return false with no side-effects.
+ *
+ * If successful, cancel/die interrupts are held off until lock release.
+ */
+bool
+LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
+{
+ bool mustwait;
+
+ AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+ PRINT_LWDEBUG("LWLockConditionalAcquire", lock, mode);
+
+ /* Ensure we will have room to remember the lock */
+ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
+ elog(ERROR, "too many LWLocks taken");
+
+ /*
+ * Lock out cancel/die interrupts until we exit the code section protected
+ * by the LWLock. This ensures that interrupts will not interfere with
+ * manipulations of data structures in shared memory.
+ */
+ HOLD_INTERRUPTS();
+
+ /* Check for the lock */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (mustwait)
+ {
+ /* Failed to get lock, so release interrupt holdoff */
+ RESUME_INTERRUPTS();
+
+ LOG_LWDEBUG("LWLockConditionalAcquire", lock, "failed");
+ if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock), mode);
+ }
+ else
+ {
+ /* Add lock to list of locks held by this backend */
+ held_lwlocks[num_held_lwlocks].lock = lock;
+ held_lwlocks[num_held_lwlocks++].mode = mode;
+ if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(T_NAME(lock), mode);
+ }
+ return !mustwait;
+}
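+
+#ifdef LWLOCK_USAGE_EXAMPLE
+/*
+ * Editor's sketch, not upstream code: a typical non-blocking caller of
+ * LWLockConditionalAcquire() -- do the optional work only if the lock is
+ * free right now, and let the caller retry later otherwise.  Names are
+ * hypothetical.
+ */
+static bool
+example_try_bump_counter(LWLock *lock, uint64 *counter)
+{
+	if (!LWLockConditionalAcquire(lock, LW_EXCLUSIVE))
+		return false;			/* lock busy; no side effects */
+	(*counter)++;
+	LWLockRelease(lock);
+	return true;
+}
+#endif							/* LWLOCK_USAGE_EXAMPLE */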
+
+/*
+ * LWLockAcquireOrWait - Acquire lock, or wait until it's free
+ *
+ * The semantics of this function are a bit funky. If the lock is currently
+ * free, it is acquired in the given mode, and the function returns true. If
+ * the lock isn't immediately free, the function waits until it is released
+ * and returns false, but does not acquire the lock.
+ *
+ * This is currently used for WALWriteLock: when a backend flushes the WAL,
+ * holding WALWriteLock, it can flush the commit records of many other
+ * backends as a side-effect. Those other backends need to wait until the
+ * flush finishes, but don't need to acquire the lock anymore. They can just
+ * wake up, observe that their records have already been flushed, and return.
+ */
+bool
+LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
+{
+ PGPROC *proc = MyProc;
+ bool mustwait;
+ int extraWaits = 0;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+ PRINT_LWDEBUG("LWLockAcquireOrWait", lock, mode);
+
+ /* Ensure we will have room to remember the lock */
+ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
+ elog(ERROR, "too many LWLocks taken");
+
+ /*
+ * Lock out cancel/die interrupts until we exit the code section protected
+ * by the LWLock. This ensures that interrupts will not interfere with
+ * manipulations of data structures in shared memory.
+ */
+ HOLD_INTERRUPTS();
+
+ /*
+ * NB: We're using nearly the same twice-in-a-row lock acquisition
+ * protocol as LWLockAcquire(). Check its comments for details.
+ */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (mustwait)
+ {
+ LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
+
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (mustwait)
+ {
+ /*
+ * Wait until awakened. Like in LWLockAcquire, be prepared for
+ * bogus wakeups.
+ */
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "waiting");
+
+#ifdef LWLOCK_STATS
+ lwstats->block_count++;
+#endif
+
+ LWLockReportWaitStart(lock);
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
+
+ for (;;)
+ {
+ PGSemaphoreLock(proc->sem);
+ if (!proc->lwWaiting)
+ break;
+ extraWaits++;
+ }
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode);
+ LWLockReportWaitEnd();
+
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "awakened");
+ }
+ else
+ {
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "acquired, undoing queue");
+
+ /*
+ * Got lock in the second attempt, undo queueing. We need to treat
+ * this as having successfully acquired the lock, otherwise we'd
+ * not necessarily wake up people we've prevented from acquiring
+ * the lock.
+ */
+ LWLockDequeueSelf(lock);
+ }
+ }
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+
+ if (mustwait)
+ {
+ /* Failed to get lock, so release interrupt holdoff */
+ RESUME_INTERRUPTS();
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "failed");
+ if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL(T_NAME(lock), mode);
+ }
+ else
+ {
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "succeeded");
+ /* Add lock to list of locks held by this backend */
+ held_lwlocks[num_held_lwlocks].lock = lock;
+ held_lwlocks[num_held_lwlocks++].mode = mode;
+ if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), mode);
+ }
+
+ return !mustwait;
+}
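+
+#ifdef LWLOCK_USAGE_EXAMPLE
+/*
+ * Editor's sketch, not upstream code: the "flush, or wait for somebody
+ * else's flush" pattern described above.  example_already_flushed() and
+ * example_flush_up_to() are hypothetical helpers standing in for the WAL
+ * machinery.
+ */
+extern bool example_already_flushed(uint64 upto);	/* hypothetical */
+extern void example_flush_up_to(uint64 upto);	/* hypothetical */
+
+static void
+example_group_flush(uint64 upto)
+{
+	while (!example_already_flushed(upto))
+	{
+		if (LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
+		{
+			/* got the lock: flush on behalf of everybody, then stop */
+			example_flush_up_to(upto);
+			LWLockRelease(WALWriteLock);
+			break;
+		}
+		/* else somebody else flushed while we waited; recheck and loop */
+	}
+}
+#endif							/* LWLOCK_USAGE_EXAMPLE */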
+
+/*
+ * Does the lwlock in its current state need to wait for the variable value to
+ * change?
+ *
+ * If we don't need to wait, and it's because the value of the variable has
+ * changed, store the current value in newval.
+ *
+ * *result is set to true if the lock was free, and false otherwise.
+ */
+static bool
+LWLockConflictsWithVar(LWLock *lock,
+ uint64 *valptr, uint64 oldval, uint64 *newval,
+ bool *result)
+{
+ bool mustwait;
+ uint64 value;
+
+ /*
+ * Test first to see if the lock is free right now.
+ *
+ * XXX: the caller uses a spinlock before this, so we don't need a memory
+ * barrier here as far as the current usage is concerned. But that might
+ * not be safe in general.
+ */
+ mustwait = (pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE) != 0;
+
+ if (!mustwait)
+ {
+ *result = true;
+ return false;
+ }
+
+ *result = false;
+
+ /*
+ * Read value using the lwlock's wait list lock, as we can't generally
+ * rely on atomic 64 bit reads/stores. TODO: On platforms with a way to
+ * do atomic 64 bit reads/writes the spinlock should be optimized away.
+ */
+ LWLockWaitListLock(lock);
+ value = *valptr;
+ LWLockWaitListUnlock(lock);
+
+ if (value != oldval)
+ {
+ mustwait = false;
+ *newval = value;
+ }
+ else
+ {
+ mustwait = true;
+ }
+
+ return mustwait;
+}
+
+/*
+ * LWLockWaitForVar - Wait until lock is free, or a variable is updated.
+ *
+ * If the lock is held and *valptr equals oldval, waits until the lock is
+ * either freed, or the lock holder updates *valptr by calling
+ * LWLockUpdateVar. If the lock is free on exit (immediately or after
+ * waiting), returns true. If the lock is still held, but *valptr no longer
+ * matches oldval, returns false and sets *newval to the current value in
+ * *valptr.
+ *
+ * Note: this function ignores shared lock holders; if the lock is held
+ * in shared mode, returns 'true'.
+ */
+bool
+LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
+{
+ PGPROC *proc = MyProc;
+ int extraWaits = 0;
+ bool result = false;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ PRINT_LWDEBUG("LWLockWaitForVar", lock, LW_WAIT_UNTIL_FREE);
+
+ /*
+ * Lock out cancel/die interrupts while we sleep on the lock. There is no
+ * cleanup mechanism to remove us from the wait queue if we got
+ * interrupted.
+ */
+ HOLD_INTERRUPTS();
+
+ /*
+ * Loop here to check the lock's status after each time we are signaled.
+ */
+ for (;;)
+ {
+ bool mustwait;
+
+ mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval,
+ &result);
+
+ if (!mustwait)
+ break; /* the lock was free or value didn't match */
+
+ /*
+ * Add myself to the wait queue. Note that this is racy: somebody else
+ * could wake us up before we're finished queuing. NB: We're using nearly
+ * the same twice-in-a-row lock acquisition protocol as
+ * LWLockAcquire(). Check its comments for details. The only
+ * difference is that we also have to check the variable's values when
+ * checking the state of the lock.
+ */
+ LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
+
+ /*
+ * Set RELEASE_OK flag, to make sure we get woken up as soon as the
+ * lock is released.
+ */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+ /*
+ * We're now guaranteed to be woken up if necessary. Recheck the lock
+ * and variables state.
+ */
+ mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval,
+ &result);
+
+ /* Ok, no conflict after we queued ourselves. Undo queueing. */
+ if (!mustwait)
+ {
+ LOG_LWDEBUG("LWLockWaitForVar", lock, "free, undoing queue");
+
+ LWLockDequeueSelf(lock);
+ break;
+ }
+
+ /*
+ * Wait until awakened.
+ *
+ * It is possible that we get awakened for a reason other than being
+ * signaled by LWLockRelease. If so, loop back and wait again. Once
+ * we've gotten the LWLock, re-increment the sema by the number of
+ * additional signals received.
+ */
+ LOG_LWDEBUG("LWLockWaitForVar", lock, "waiting");
+
+#ifdef LWLOCK_STATS
+ lwstats->block_count++;
+#endif
+
+ LWLockReportWaitStart(lock);
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE);
+
+ for (;;)
+ {
+ PGSemaphoreLock(proc->sem);
+ if (!proc->lwWaiting)
+ break;
+ extraWaits++;
+ }
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), LW_EXCLUSIVE);
+ LWLockReportWaitEnd();
+
+ LOG_LWDEBUG("LWLockWaitForVar", lock, "awakened");
+
+ /* Now loop back and check the status of the lock again. */
+ }
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+
+ /*
+ * Now okay to allow cancel/die interrupts.
+ */
+ RESUME_INTERRUPTS();
+
+ return result;
+}
+
+
+/*
+ * LWLockUpdateVar - Update a variable and wake up waiters atomically
+ *
+ * Sets *valptr to 'val', and wakes up all processes waiting for us with
+ * LWLockWaitForVar(). Setting the value and waking up the processes happen
+ * atomically so that any process calling LWLockWaitForVar() on the same lock
+ * is guaranteed to see the new value, and act accordingly.
+ *
+ * The caller must be holding the lock in exclusive mode.
+ */
+void
+LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val)
+{
+ proclist_head wakeup;
+ proclist_mutable_iter iter;
+
+ PRINT_LWDEBUG("LWLockUpdateVar", lock, LW_EXCLUSIVE);
+
+ proclist_init(&wakeup);
+
+ LWLockWaitListLock(lock);
+
+ Assert(pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE);
+
+ /* Update the lock's value */
+ *valptr = val;
+
+ /*
+ * See if there are any LW_WAIT_UNTIL_FREE waiters that need to be woken
+ * up. They are always in the front of the queue.
+ */
+ proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE)
+ break;
+
+ proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
+ proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
+ }
+
+ /* We are done updating shared state of the lock itself. */
+ LWLockWaitListUnlock(lock);
+
+ /*
+ * Awaken any waiters I removed from the queue.
+ */
+ proclist_foreach_modify(iter, &wakeup, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ proclist_delete(&wakeup, iter.cur, lwWaitLink);
+ /* check comment in LWLockWakeup() about this barrier */
+ pg_write_barrier();
+ waiter->lwWaiting = false;
+ PGSemaphoreUnlock(waiter->sem);
+ }
+}
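+
+#ifdef LWLOCK_USAGE_EXAMPLE
+/*
+ * Editor's sketch, not upstream code: how LWLockUpdateVar() and
+ * LWLockWaitForVar() pair up.  The exclusive lock holder publishes progress
+ * through a shared uint64 without releasing the lock, while waiters sleep
+ * until either the lock is released or the published value moves past the
+ * value they last saw.  Names are hypothetical.
+ */
+static bool
+example_wait_for_progress(LWLock *lock, uint64 *progress, uint64 seen,
+						  uint64 *newest)
+{
+	/* true: lock was (or became) free; false: only *progress advanced */
+	return LWLockWaitForVar(lock, progress, seen, newest);
+}
+
+static void
+example_publish_progress(LWLock *lock, uint64 *progress, uint64 newpos)
+{
+	/* caller must hold "lock" in LW_EXCLUSIVE mode */
+	LWLockUpdateVar(lock, progress, newpos);
+}
+#endif							/* LWLOCK_USAGE_EXAMPLE */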
+
+
+/*
+ * LWLockRelease - release a previously acquired lock
+ */
+void
+LWLockRelease(LWLock *lock)
+{
+ LWLockMode mode;
+ uint32 oldstate;
+ bool check_waiters;
+ int i;
+
+ /*
+ * Remove lock from list of locks held. Usually, but not always, it will
+ * be the latest-acquired lock; so search array backwards.
+ */
+ for (i = num_held_lwlocks; --i >= 0;)
+ if (lock == held_lwlocks[i].lock)
+ break;
+
+ if (i < 0)
+ elog(ERROR, "lock %s is not held", T_NAME(lock));
+
+ mode = held_lwlocks[i].mode;
+
+ num_held_lwlocks--;
+ for (; i < num_held_lwlocks; i++)
+ held_lwlocks[i] = held_lwlocks[i + 1];
+
+ PRINT_LWDEBUG("LWLockRelease", lock, mode);
+
+ /*
+ * Release my hold on lock, after that it can immediately be acquired by
+ * others, even if we still have to wakeup other waiters.
+ */
+ if (mode == LW_EXCLUSIVE)
+ oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_EXCLUSIVE);
+ else
+ oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_SHARED);
+
+ /* nobody else can have that kind of lock */
+ Assert(!(oldstate & LW_VAL_EXCLUSIVE));
+
+ if (TRACE_POSTGRESQL_LWLOCK_RELEASE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_RELEASE(T_NAME(lock));
+
+ /*
+ * We're still waiting for backends to get scheduled, don't wake them up
+ * again.
+ */
+ if ((oldstate & (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK)) ==
+ (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK) &&
+ (oldstate & LW_LOCK_MASK) == 0)
+ check_waiters = true;
+ else
+ check_waiters = false;
+
+ /*
+ * As waking up waiters requires the spinlock to be acquired, only do so
+ * if necessary.
+ */
+ if (check_waiters)
+ {
+ /* XXX: remove before commit? */
+ LOG_LWDEBUG("LWLockRelease", lock, "releasing waiters");
+ LWLockWakeup(lock);
+ }
+
+ /*
+ * Now okay to allow cancel/die interrupts.
+ */
+ RESUME_INTERRUPTS();
+}
+
+/*
+ * LWLockReleaseClearVar - release a previously acquired lock, reset variable
+ */
+void
+LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val)
+{
+ LWLockWaitListLock(lock);
+
+ /*
+ * Set the variable's value before releasing the lock; that prevents a
+ * race condition wherein a new locker acquires the lock, but hasn't yet
+ * set the variable's value.
+ */
+ *valptr = val;
+ LWLockWaitListUnlock(lock);
+
+ LWLockRelease(lock);
+}
+
+
+/*
+ * LWLockReleaseAll - release all currently-held locks
+ *
+ * Used to clean up after ereport(ERROR). An important difference between this
+ * function and retail LWLockRelease calls is that InterruptHoldoffCount is
+ * unchanged by this operation. This is necessary since InterruptHoldoffCount
+ * has been set to an appropriate level earlier in error recovery. We could
+ * decrement it below zero if we allow it to drop for each released lock!
+ */
+void
+LWLockReleaseAll(void)
+{
+ while (num_held_lwlocks > 0)
+ {
+ HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */
+
+ LWLockRelease(held_lwlocks[num_held_lwlocks - 1].lock);
+ }
+}
+
+
+/*
+ * LWLockHeldByMe - test whether my process holds a lock in any mode
+ *
+ * This is meant as debug support only.
+ */
+bool
+LWLockHeldByMe(LWLock *l)
+{
+ int i;
+
+ for (i = 0; i < num_held_lwlocks; i++)
+ {
+ if (held_lwlocks[i].lock == l)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * LWLockAnyHeldByMe - test whether my process holds any of an array of locks
+ *
+ * This is meant as debug support only.
+ */
+bool
+LWLockAnyHeldByMe(LWLock *l, int nlocks, size_t stride)
+{
+ char *held_lock_addr;
+ char *begin;
+ char *end;
+ int i;
+
+ begin = (char *) l;
+ end = begin + nlocks * stride;
+ for (i = 0; i < num_held_lwlocks; i++)
+ {
+ held_lock_addr = (char *) held_lwlocks[i].lock;
+ if (held_lock_addr >= begin &&
+ held_lock_addr < end &&
+ (held_lock_addr - begin) % stride == 0)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * LWLockHeldByMeInMode - test whether my process holds a lock in given mode
+ *
+ * This is meant as debug support only.
+ */
+bool
+LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
+{
+ int i;
+
+ for (i = 0; i < num_held_lwlocks; i++)
+ {
+ if (held_lwlocks[i].lock == l && held_lwlocks[i].mode == mode)
+ return true;
+ }
+ return false;
+}
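+
+#ifdef LWLOCK_USAGE_EXAMPLE
+/*
+ * Editor's sketch, not upstream code: the held-by-me helpers are typically
+ * used in assertions guarding functions whose callers must already hold a
+ * particular lock in a particular mode.  Names are hypothetical.
+ */
+static void
+example_requires_exclusive(LWLock *lock)
+{
+	Assert(LWLockHeldByMeInMode(lock, LW_EXCLUSIVE));
+	/* ... manipulate the structure protected by "lock" ... */
+}
+#endif							/* LWLOCK_USAGE_EXAMPLE */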
diff --git a/src/backend/storage/lmgr/lwlocknames.c b/src/backend/storage/lmgr/lwlocknames.c
new file mode 100644
index 0000000..65f7c5b
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlocknames.c
@@ -0,0 +1,52 @@
+/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */
+
+const char *const IndividualLWLockNames[] = {
+ "<unassigned:0>",
+ "ShmemIndex",
+ "OidGen",
+ "XidGen",
+ "ProcArray",
+ "SInvalRead",
+ "SInvalWrite",
+ "WALBufMapping",
+ "WALWrite",
+ "ControlFile",
+ "<unassigned:10>",
+ "XactSLRU",
+ "SubtransSLRU",
+ "MultiXactGen",
+ "MultiXactOffsetSLRU",
+ "MultiXactMemberSLRU",
+ "RelCacheInit",
+ "CheckpointerComm",
+ "TwoPhaseState",
+ "TablespaceCreate",
+ "BtreeVacuum",
+ "AddinShmemInit",
+ "Autovacuum",
+ "AutovacuumSchedule",
+ "SyncScan",
+ "RelationMapping",
+ "NotifySLRU",
+ "NotifyQueue",
+ "SerializableXactHash",
+ "SerializableFinishedList",
+ "SerializablePredicateList",
+ "SerialSLRU",
+ "SyncRep",
+ "BackgroundWorker",
+ "DynamicSharedMemoryControl",
+ "AutoFile",
+ "ReplicationSlotAllocation",
+ "ReplicationSlotControl",
+ "CommitTsSLRU",
+ "CommitTs",
+ "ReplicationOrigin",
+ "MultiXactTruncation",
+ "OldSnapshotTimeMap",
+ "LogicalRepWorker",
+ "XactTruncation",
+ "<unassigned:45>",
+ "WrapLimitsVacuum",
+ "NotifyQueueTail"
+};
diff --git a/src/backend/storage/lmgr/lwlocknames.h b/src/backend/storage/lmgr/lwlocknames.h
new file mode 100644
index 0000000..e279f72
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlocknames.h
@@ -0,0 +1,50 @@
+/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */
+/* there is deliberately not an #ifndef LWLOCKNAMES_H here */
+
+#define ShmemIndexLock (&MainLWLockArray[1].lock)
+#define OidGenLock (&MainLWLockArray[2].lock)
+#define XidGenLock (&MainLWLockArray[3].lock)
+#define ProcArrayLock (&MainLWLockArray[4].lock)
+#define SInvalReadLock (&MainLWLockArray[5].lock)
+#define SInvalWriteLock (&MainLWLockArray[6].lock)
+#define WALBufMappingLock (&MainLWLockArray[7].lock)
+#define WALWriteLock (&MainLWLockArray[8].lock)
+#define ControlFileLock (&MainLWLockArray[9].lock)
+#define XactSLRULock (&MainLWLockArray[11].lock)
+#define SubtransSLRULock (&MainLWLockArray[12].lock)
+#define MultiXactGenLock (&MainLWLockArray[13].lock)
+#define MultiXactOffsetSLRULock (&MainLWLockArray[14].lock)
+#define MultiXactMemberSLRULock (&MainLWLockArray[15].lock)
+#define RelCacheInitLock (&MainLWLockArray[16].lock)
+#define CheckpointerCommLock (&MainLWLockArray[17].lock)
+#define TwoPhaseStateLock (&MainLWLockArray[18].lock)
+#define TablespaceCreateLock (&MainLWLockArray[19].lock)
+#define BtreeVacuumLock (&MainLWLockArray[20].lock)
+#define AddinShmemInitLock (&MainLWLockArray[21].lock)
+#define AutovacuumLock (&MainLWLockArray[22].lock)
+#define AutovacuumScheduleLock (&MainLWLockArray[23].lock)
+#define SyncScanLock (&MainLWLockArray[24].lock)
+#define RelationMappingLock (&MainLWLockArray[25].lock)
+#define NotifySLRULock (&MainLWLockArray[26].lock)
+#define NotifyQueueLock (&MainLWLockArray[27].lock)
+#define SerializableXactHashLock (&MainLWLockArray[28].lock)
+#define SerializableFinishedListLock (&MainLWLockArray[29].lock)
+#define SerializablePredicateListLock (&MainLWLockArray[30].lock)
+#define SerialSLRULock (&MainLWLockArray[31].lock)
+#define SyncRepLock (&MainLWLockArray[32].lock)
+#define BackgroundWorkerLock (&MainLWLockArray[33].lock)
+#define DynamicSharedMemoryControlLock (&MainLWLockArray[34].lock)
+#define AutoFileLock (&MainLWLockArray[35].lock)
+#define ReplicationSlotAllocationLock (&MainLWLockArray[36].lock)
+#define ReplicationSlotControlLock (&MainLWLockArray[37].lock)
+#define CommitTsSLRULock (&MainLWLockArray[38].lock)
+#define CommitTsLock (&MainLWLockArray[39].lock)
+#define ReplicationOriginLock (&MainLWLockArray[40].lock)
+#define MultiXactTruncationLock (&MainLWLockArray[41].lock)
+#define OldSnapshotTimeMapLock (&MainLWLockArray[42].lock)
+#define LogicalRepWorkerLock (&MainLWLockArray[43].lock)
+#define XactTruncationLock (&MainLWLockArray[44].lock)
+#define WrapLimitsVacuumLock (&MainLWLockArray[46].lock)
+#define NotifyQueueTailLock (&MainLWLockArray[47].lock)
+
+#define NUM_INDIVIDUAL_LWLOCKS 48
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
new file mode 100644
index 0000000..6c7cf6c
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -0,0 +1,55 @@
+# Some commonly-used locks have predefined positions within MainLWLockArray;
+# these are defined here. If you add a lock, add it to the end to avoid
+# renumbering the existing locks; if you remove a lock, consider leaving a gap
+# in the numbering sequence for the benefit of DTrace and other external
+# debugging scripts. Also, do not forget to update the list of wait events
+# in the user documentation.
+
+# 0 is available; was formerly BufFreelistLock
+ShmemIndexLock 1
+OidGenLock 2
+XidGenLock 3
+ProcArrayLock 4
+SInvalReadLock 5
+SInvalWriteLock 6
+WALBufMappingLock 7
+WALWriteLock 8
+ControlFileLock 9
+# 10 was CheckpointLock
+XactSLRULock 11
+SubtransSLRULock 12
+MultiXactGenLock 13
+MultiXactOffsetSLRULock 14
+MultiXactMemberSLRULock 15
+RelCacheInitLock 16
+CheckpointerCommLock 17
+TwoPhaseStateLock 18
+TablespaceCreateLock 19
+BtreeVacuumLock 20
+AddinShmemInitLock 21
+AutovacuumLock 22
+AutovacuumScheduleLock 23
+SyncScanLock 24
+RelationMappingLock 25
+NotifySLRULock 26
+NotifyQueueLock 27
+SerializableXactHashLock 28
+SerializableFinishedListLock 29
+SerializablePredicateListLock 30
+SerialSLRULock 31
+SyncRepLock 32
+BackgroundWorkerLock 33
+DynamicSharedMemoryControlLock 34
+AutoFileLock 35
+ReplicationSlotAllocationLock 36
+ReplicationSlotControlLock 37
+CommitTsSLRULock 38
+CommitTsLock 39
+ReplicationOriginLock 40
+MultiXactTruncationLock 41
+OldSnapshotTimeMapLock 42
+LogicalRepWorkerLock 43
+XactTruncationLock 44
+# 45 was XactTruncationLock until removal of BackendRandomLock
+WrapLimitsVacuumLock 46
+NotifyQueueTailLock 47
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
new file mode 100644
index 0000000..d493aee
--- /dev/null
+++ b/src/backend/storage/lmgr/predicate.c
@@ -0,0 +1,5203 @@
+/*-------------------------------------------------------------------------
+ *
+ * predicate.c
+ * POSTGRES predicate locking
+ * to support full serializable transaction isolation
+ *
+ *
+ * The approach taken is to implement Serializable Snapshot Isolation (SSI)
+ * as initially described in this paper:
+ *
+ * Michael J. Cahill, Uwe Röhm, and Alan D. Fekete. 2008.
+ * Serializable isolation for snapshot databases.
+ * In SIGMOD '08: Proceedings of the 2008 ACM SIGMOD
+ * international conference on Management of data,
+ * pages 729-738, New York, NY, USA. ACM.
+ * http://doi.acm.org/10.1145/1376616.1376690
+ *
+ * and further elaborated in Cahill's doctoral thesis:
+ *
+ * Michael James Cahill. 2009.
+ * Serializable Isolation for Snapshot Databases.
+ * Sydney Digital Theses.
+ * University of Sydney, School of Information Technologies.
+ * http://hdl.handle.net/2123/5353
+ *
+ *
+ * Predicate locks for Serializable Snapshot Isolation (SSI) are SIREAD
+ * locks, which are so different from normal locks that a distinct set of
+ * structures is required to handle them. They are needed to detect
+ * rw-conflicts when the read happens before the write. (When the write
+ * occurs first, the reading transaction can check for a conflict by
+ * examining the MVCC data.)
+ *
+ * (1) Besides tuples actually read, they must cover ranges of tuples
+ * which would have been read based on the predicate. This will
+ * require modelling the predicates through locks against database
+ * objects such as pages, index ranges, or entire tables.
+ *
+ * (2) They must be kept in RAM for quick access. Because of this, it
+ * isn't possible to always maintain tuple-level granularity -- when
+ * the space allocated to store these approaches exhaustion, a
+ * request for a lock may need to scan for situations where a single
+ * transaction holds many fine-grained locks which can be coalesced
+ * into a single coarser-grained lock.
+ *
+ * (3) They never block anything; they are more like flags than locks
+ * in that regard; although they refer to database objects and are
+ * used to identify rw-conflicts with normal write locks.
+ *
+ * (4) While they are associated with a transaction, they must survive
+ * a successful COMMIT of that transaction, and remain until all
+ * overlapping transactions complete. This even means that they
+ * must survive termination of the transaction's process. If a
+ * top level transaction is rolled back, however, it is immediately
+ * flagged so that it can be ignored, and its SIREAD locks can be
+ * released any time after that.
+ *
+ * (5) The only transactions which create SIREAD locks or check for
+ * conflicts with them are serializable transactions.
+ *
+ * (6) When a write lock for a top level transaction is found to cover
+ * an existing SIREAD lock for the same transaction, the SIREAD lock
+ * can be deleted.
+ *
+ * (7) A write from a serializable transaction must ensure that an xact
+ * record exists for the transaction, with the same lifespan (until
+ * all concurrent transactions complete or the transaction is rolled
+ * back) so that rw-dependencies to that transaction can be
+ * detected.
+ *
+ * We use an optimization for read-only transactions. Under certain
+ * circumstances, a read-only transaction's snapshot can be shown to
+ * never have conflicts with other transactions. This is referred to
+ * as a "safe" snapshot (and one known not to be is "unsafe").
+ * However, it can't be determined whether a snapshot is safe until
+ * all concurrent read/write transactions complete.
+ *
+ * Once a read-only transaction is known to have a safe snapshot, it
+ * can release its predicate locks and exempt itself from further
+ * predicate lock tracking. READ ONLY DEFERRABLE transactions run only
+ * on safe snapshots, waiting as necessary for one to be available.
+ *
+ *
+ * Lightweight locks to manage access to the predicate locking shared
+ * memory objects must be taken in this order, and should be released in
+ * reverse order:
+ *
+ * SerializableFinishedListLock
+ * - Protects the list of transactions which have completed but which
+ * may yet matter because they overlap still-active transactions.
+ *
+ * SerializablePredicateListLock
+ * - Protects the linked list of locks held by a transaction. Note
+ * that the locks themselves are also covered by the partition
+ * locks of their respective lock targets; this lock only affects
+ * the linked list connecting the locks related to a transaction.
+ * - All transactions share this single lock (with no partitioning).
+ * - There is never a need for a process other than the one running
+ * an active transaction to walk the list of locks held by that
+ * transaction, except parallel query workers sharing the leader's
+ * transaction. In the parallel case, an extra per-sxact lock is
+ * taken; see below.
+ * - It is relatively infrequent that another process needs to
+ * modify the list for a transaction, but it does happen for such
+ * things as index page splits for pages with predicate locks and
+ * freeing of predicate locked pages by a vacuum process. When
+ * removing a lock in such cases, the lock itself contains the
+ * pointers needed to remove it from the list. When adding a
+ * lock in such cases, the lock can be added using the anchor in
+ * the transaction structure. Neither requires walking the list.
+ * - Cleaning up the list for a terminated transaction is sometimes
+ * not done on a retail basis, in which case no lock is required.
+ * - Due to the above, a process accessing its active transaction's
+ * list always uses a shared lock, regardless of whether it is
+ * walking or maintaining the list. This improves concurrency
+ * for the common access patterns.
+ * - A process which needs to alter the list of a transaction other
+ * than its own active transaction must acquire an exclusive
+ * lock.
+ *
+ * SERIALIZABLEXACT's member 'perXactPredicateListLock'
+ * - Protects the linked list of predicate locks held by a transaction.
+ * Only needed for parallel mode, where multiple backends share the
+ * same SERIALIZABLEXACT object. Not needed if
+ * SerializablePredicateListLock is held exclusively.
+ *
+ * PredicateLockHashPartitionLock(hashcode)
+ * - The same lock protects a target, all locks on that target, and
+ * the linked list of locks on the target.
+ * - When more than one is needed, acquire in ascending address order.
+ * - When all are needed (rare), acquire in ascending index order with
+ * PredicateLockHashPartitionLockByIndex(index).
+ *
+ * SerializableXactHashLock
+ * - Protects both PredXact and SerializableXidHash.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/predicate.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ *
+ * housekeeping for setting up shared memory predicate lock structures
+ * InitPredicateLocks(void)
+ * PredicateLockShmemSize(void)
+ *
+ * predicate lock reporting
+ * GetPredicateLockStatusData(void)
+ * PageIsPredicateLocked(Relation relation, BlockNumber blkno)
+ *
+ * predicate lock maintenance
+ * GetSerializableTransactionSnapshot(Snapshot snapshot)
+ * SetSerializableTransactionSnapshot(Snapshot snapshot,
+ * VirtualTransactionId *sourcevxid)
+ * RegisterPredicateLockingXid(void)
+ * PredicateLockRelation(Relation relation, Snapshot snapshot)
+ * PredicateLockPage(Relation relation, BlockNumber blkno,
+ * Snapshot snapshot)
+ * PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot,
+ * TransactionId insert_xid)
+ * PredicateLockPageSplit(Relation relation, BlockNumber oldblkno,
+ * BlockNumber newblkno)
+ * PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
+ * BlockNumber newblkno)
+ * TransferPredicateLocksToHeapRelation(Relation relation)
+ * ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe)
+ *
+ * conflict detection (may also trigger rollback)
+ * CheckForSerializableConflictOut(Relation relation, TransactionId xid,
+ * Snapshot snapshot)
+ * CheckForSerializableConflictIn(Relation relation, ItemPointer tid,
+ * BlockNumber blkno)
+ * CheckTableForSerializableConflictIn(Relation relation)
+ *
+ * final rollback checking
+ * PreCommit_CheckForSerializationFailure(void)
+ *
+ * two-phase commit support
+ * AtPrepare_PredicateLocks(void);
+ * PostPrepare_PredicateLocks(TransactionId xid);
+ * PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit);
+ * predicatelock_twophase_recover(TransactionId xid, uint16 info,
+ * void *recdata, uint32 len);
+ */
+
+#include "postgres.h"
+
+#include "access/parallel.h"
+#include "access/slru.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/twophase_rmgr.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "storage/predicate.h"
+#include "storage/predicate_internals.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+
+/* Uncomment the next line to test the graceful degradation code. */
+/* #define TEST_SUMMARIZE_SERIAL */
+
+/*
+ * Test the most selective fields first, for performance.
+ *
+ * a is covered by b if all of the following hold:
+ * 1) a.database = b.database
+ * 2) a.relation = b.relation
+ * 3) b.offset is invalid (b is page-granularity or higher)
+ * 4) either of the following:
+ * 4a) a.offset is valid (a is tuple-granularity) and a.page = b.page
+ * or 4b) a.offset is invalid and b.page is invalid (a is
+ * page-granularity and b is relation-granularity
+ */
+#define TargetTagIsCoveredBy(covered_target, covering_target) \
+ ((GET_PREDICATELOCKTARGETTAG_RELATION(covered_target) == /* (2) */ \
+ GET_PREDICATELOCKTARGETTAG_RELATION(covering_target)) \
+ && (GET_PREDICATELOCKTARGETTAG_OFFSET(covering_target) == \
+ InvalidOffsetNumber) /* (3) */ \
+ && (((GET_PREDICATELOCKTARGETTAG_OFFSET(covered_target) != \
+ InvalidOffsetNumber) /* (4a) */ \
+ && (GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \
+ GET_PREDICATELOCKTARGETTAG_PAGE(covered_target))) \
+ || ((GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \
+ InvalidBlockNumber) /* (4b) */ \
+ && (GET_PREDICATELOCKTARGETTAG_PAGE(covered_target) \
+ != InvalidBlockNumber))) \
+ && (GET_PREDICATELOCKTARGETTAG_DB(covered_target) == /* (1) */ \
+ GET_PREDICATELOCKTARGETTAG_DB(covering_target)))
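+
+/*
+ * Editor's worked example (illustrative values only): with this definition a
+ * tuple-level target (db 1, rel 16384, page 5, offset 7) is covered by the
+ * page-level target (db 1, rel 16384, page 5), and the page-level target is
+ * in turn covered by the relation-level target (db 1, rel 16384); a target
+ * is never "covered by" another target of the same granularity.
+ */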
+
+/*
+ * The predicate locking target and lock shared hash tables are partitioned to
+ * reduce contention. To determine which partition a given target belongs to,
+ * compute the tag's hash code with PredicateLockTargetTagHashCode(), then
+ * apply one of these macros.
+ * NB: NUM_PREDICATELOCK_PARTITIONS must be a power of 2!
+ */
+#define PredicateLockHashPartition(hashcode) \
+ ((hashcode) % NUM_PREDICATELOCK_PARTITIONS)
+#define PredicateLockHashPartitionLock(hashcode) \
+ (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + \
+ PredicateLockHashPartition(hashcode)].lock)
+#define PredicateLockHashPartitionLockByIndex(i) \
+ (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + (i)].lock)
+
+#define NPREDICATELOCKTARGETENTS() \
+ mul_size(max_predicate_locks_per_xact, add_size(MaxBackends, max_prepared_xacts))
+
+#define SxactIsOnFinishedList(sxact) (!SHMQueueIsDetached(&((sxact)->finishedLink)))
+
+/*
+ * Note that a sxact is marked "prepared" once it has passed
+ * PreCommit_CheckForSerializationFailure, even if it isn't using
+ * 2PC. This is the point at which it can no longer be aborted.
+ *
+ * The PREPARED flag remains set after commit, so SxactIsCommitted
+ * implies SxactIsPrepared.
+ */
+#define SxactIsCommitted(sxact) (((sxact)->flags & SXACT_FLAG_COMMITTED) != 0)
+#define SxactIsPrepared(sxact) (((sxact)->flags & SXACT_FLAG_PREPARED) != 0)
+#define SxactIsRolledBack(sxact) (((sxact)->flags & SXACT_FLAG_ROLLED_BACK) != 0)
+#define SxactIsDoomed(sxact) (((sxact)->flags & SXACT_FLAG_DOOMED) != 0)
+#define SxactIsReadOnly(sxact) (((sxact)->flags & SXACT_FLAG_READ_ONLY) != 0)
+#define SxactHasSummaryConflictIn(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_IN) != 0)
+#define SxactHasSummaryConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_OUT) != 0)
+/*
+ * The following macro actually means that the specified transaction has a
+ * conflict out *to a transaction which committed ahead of it*. It's hard
+ * to get that into a name of a reasonable length.
+ */
+#define SxactHasConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_CONFLICT_OUT) != 0)
+#define SxactIsDeferrableWaiting(sxact) (((sxact)->flags & SXACT_FLAG_DEFERRABLE_WAITING) != 0)
+#define SxactIsROSafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_SAFE) != 0)
+#define SxactIsROUnsafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_UNSAFE) != 0)
+#define SxactIsPartiallyReleased(sxact) (((sxact)->flags & SXACT_FLAG_PARTIALLY_RELEASED) != 0)
+
+/*
+ * Compute the hash code associated with a PREDICATELOCKTARGETTAG.
+ *
+ * To avoid unnecessary recomputations of the hash code, we try to do this
+ * just once per function, and then pass it around as needed. Aside from
+ * passing the hashcode to hash_search_with_hash_value(), we can extract
+ * the lock partition number from the hashcode.
+ */
+#define PredicateLockTargetTagHashCode(predicatelocktargettag) \
+ get_hash_value(PredicateLockTargetHash, predicatelocktargettag)
+
+/*
+ * Given a predicate lock tag, and the hash for its target,
+ * compute the lock hash.
+ *
+ * To make the hash code also depend on the transaction, we xor the sxid
+ * struct's address into the hash code, left-shifted so that the
+ * partition-number bits don't change. Since this is only a hash, we
+ * don't care if we lose high-order bits of the address; use an
+ * intermediate variable to suppress cast-pointer-to-int warnings.
+ */
+#define PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash) \
+ ((targethash) ^ ((uint32) PointerGetDatum((predicatelocktag)->myXact)) \
+ << LOG2_NUM_PREDICATELOCK_PARTITIONS)
+
+
+/*
+ * The SLRU buffer area through which we access the old xids.
+ */
+static SlruCtlData SerialSlruCtlData;
+
+#define SerialSlruCtl (&SerialSlruCtlData)
+
+#define SERIAL_PAGESIZE BLCKSZ
+#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
+#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE)
+
+/*
+ * Set maximum pages based on the number needed to track all transactions.
+ */
+#define SERIAL_MAX_PAGE (MaxTransactionId / SERIAL_ENTRIESPERPAGE)
+
+#define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1)
+
+#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \
+ (SerialSlruCtl->shared->page_buffer[slotno] + \
+ ((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE))))
+
+#define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE)
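+
+/*
+ * Editor's worked example (assuming the default BLCKSZ of 8192 and an 8-byte
+ * SerCommitSeqNo): each SLRU page holds 8192 / 8 = 1024 entries, so
+ * SerialPage(10000) = 9 and the entry for xid 10000 lives at slot
+ * 10000 % 1024 = 784 within that page.
+ */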
+
+typedef struct SerialControlData
+{
+ int headPage; /* newest initialized page */
+ TransactionId headXid; /* newest valid Xid in the SLRU */
+ TransactionId tailXid; /* oldest xmin we might be interested in */
+} SerialControlData;
+
+typedef struct SerialControlData *SerialControl;
+
+static SerialControl serialControl;
+
+/*
+ * When the oldest committed transaction on the "finished" list is moved to
+ * SLRU, its predicate locks will be moved to this "dummy" transaction,
+ * collapsing duplicate targets. When a duplicate is found, the later
+ * commitSeqNo is used.
+ */
+static SERIALIZABLEXACT *OldCommittedSxact;
+
+
+/*
+ * These configuration variables are used to set the predicate lock table size
+ * and to control promotion of predicate locks to coarser granularity in an
+ * attempt to degrade gracefully (mostly as false-positive serialization
+ * failures) in the face of memory pressure.
+ */
+int max_predicate_locks_per_xact; /* set by guc.c */
+int max_predicate_locks_per_relation; /* set by guc.c */
+int max_predicate_locks_per_page; /* set by guc.c */
+
+/*
+ * This provides a list of objects in order to track transactions
+ * participating in predicate locking. Entries in the list are fixed size,
+ * and reside in shared memory. The memory address of an entry must remain
+ * fixed during its lifetime. The list will be protected from concurrent
+ * update externally; no provision is made in this code to manage that. The
+ * number of entries in the list, and the size allowed for each entry is
+ * fixed upon creation.
+ */
+static PredXactList PredXact;
+
+/*
+ * This provides a pool of RWConflict data elements to use in conflict lists
+ * between transactions.
+ */
+static RWConflictPoolHeader RWConflictPool;
+
+/*
+ * The predicate locking hash tables are in shared memory.
+ * Each backend keeps pointers to them.
+ */
+static HTAB *SerializableXidHash;
+static HTAB *PredicateLockTargetHash;
+static HTAB *PredicateLockHash;
+static SHM_QUEUE *FinishedSerializableTransactions;
+
+/*
+ * Tag for a dummy entry in PredicateLockTargetHash. By temporarily removing
+ * this entry, you can ensure that there's enough scratch space available for
+ * inserting one entry in the hash table. This is an otherwise-invalid tag.
+ */
+static const PREDICATELOCKTARGETTAG ScratchTargetTag = {0, 0, 0, 0};
+static uint32 ScratchTargetTagHash;
+static LWLock *ScratchPartitionLock;
+
+/*
+ * The local hash table used to determine when to combine multiple fine-
+ * grained locks into a single coarser-grained lock.
+ */
+static HTAB *LocalPredicateLockHash = NULL;
+
+/*
+ * Keep a pointer to the currently-running serializable transaction (if any)
+ * for quick reference. Also, remember if we have written anything that could
+ * cause a rw-conflict.
+ */
+static SERIALIZABLEXACT *MySerializableXact = InvalidSerializableXact;
+static bool MyXactDidWrite = false;
+
+/*
+ * The SXACT_FLAG_RO_UNSAFE optimization might lead us to release
+ * MySerializableXact early. If that happens in a parallel query, the leader
+ * needs to defer the destruction of the SERIALIZABLEXACT until end of
+ * transaction, because the workers still have a reference to it. In that
+ * case, the leader stores it here.
+ */
+static SERIALIZABLEXACT *SavedSerializableXact = InvalidSerializableXact;
+
+/* local functions */
+
+static SERIALIZABLEXACT *CreatePredXact(void);
+static void ReleasePredXact(SERIALIZABLEXACT *sxact);
+static SERIALIZABLEXACT *FirstPredXact(void);
+static SERIALIZABLEXACT *NextPredXact(SERIALIZABLEXACT *sxact);
+
+static bool RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer);
+static void SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
+static void SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact, SERIALIZABLEXACT *activeXact);
+static void ReleaseRWConflict(RWConflict conflict);
+static void FlagSxactUnsafe(SERIALIZABLEXACT *sxact);
+
+static bool SerialPagePrecedesLogically(int page1, int page2);
+static void SerialInit(void);
+static void SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo);
+static SerCommitSeqNo SerialGetMinConflictCommitSeqNo(TransactionId xid);
+static void SerialSetActiveSerXmin(TransactionId xid);
+
+static uint32 predicatelock_hash(const void *key, Size keysize);
+static void SummarizeOldestCommittedSxact(void);
+static Snapshot GetSafeSnapshot(Snapshot snapshot);
+static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot,
+ VirtualTransactionId *sourcevxid,
+ int sourcepid);
+static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag);
+static bool GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
+ PREDICATELOCKTARGETTAG *parent);
+static bool CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag);
+static void RemoveScratchTarget(bool lockheld);
+static void RestoreScratchTarget(bool lockheld);
+static void RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target,
+ uint32 targettaghash);
+static void DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag);
+static int MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag);
+static bool CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag);
+static void DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag);
+static void CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
+ uint32 targettaghash,
+ SERIALIZABLEXACT *sxact);
+static void DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash);
+static bool TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag,
+ PREDICATELOCKTARGETTAG newtargettag,
+ bool removeOld);
+static void PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag);
+static void DropAllPredicateLocksFromTable(Relation relation,
+ bool transfer);
+static void SetNewSxactGlobalXmin(void);
+static void ClearOldPredicateLocks(void);
+static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
+ bool summarize);
+static bool XidIsConcurrent(TransactionId xid);
+static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag);
+static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
+static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
+ SERIALIZABLEXACT *writer);
+static void CreateLocalPredicateLockHash(void);
+static void ReleasePredicateLocksLocal(void);
+
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * Does this relation participate in predicate locking? Temporary and system
+ * relations are exempt, as are materialized views.
+ */
+static inline bool
+PredicateLockingNeededForRelation(Relation relation)
+{
+ return !(relation->rd_id < FirstBootstrapObjectId ||
+ RelationUsesLocalBuffers(relation) ||
+ relation->rd_rel->relkind == RELKIND_MATVIEW);
+}
+
+/*
+ * When a public interface method is called for a read, this is the test to
+ * see if we should do a quick return.
+ *
+ * Note: this function has side-effects! If this transaction has been flagged
+ * as RO-safe since the last call, we release all predicate locks and reset
+ * MySerializableXact. That makes subsequent calls to return quickly.
+ *
+ * This is marked as 'inline' to eliminate the function call overhead in the
+ * common case that serialization is not needed.
+ */
+static inline bool
+SerializationNeededForRead(Relation relation, Snapshot snapshot)
+{
+ /* Nothing to do if this is not a serializable transaction */
+ if (MySerializableXact == InvalidSerializableXact)
+ return false;
+
+ /*
+ * Don't acquire locks or conflict when scanning with a special snapshot.
+ * This excludes things like CLUSTER and REINDEX. They use the wholesale
+ * functions TransferPredicateLocksToHeapRelation() and
+ * CheckTableForSerializableConflictIn() to participate in serialization,
+ * but the scans involved don't need serialization.
+ */
+ if (!IsMVCCSnapshot(snapshot))
+ return false;
+
+ /*
+ * Check if we have just become "RO-safe". If we have, immediately release
+ * all locks as they're not needed anymore. This also resets
+ * MySerializableXact, so that subsequent calls to this function can exit
+ * quickly.
+ *
+ * A transaction is flagged as RO_SAFE if all concurrent R/W transactions
+ * commit without having conflicts out to an earlier snapshot, thus
+ * ensuring that no conflicts are possible for this transaction.
+ */
+ if (SxactIsROSafe(MySerializableXact))
+ {
+ ReleasePredicateLocks(false, true);
+ return false;
+ }
+
+ /* Check if the relation doesn't participate in predicate locking */
+ if (!PredicateLockingNeededForRelation(relation))
+ return false;
+
+ return true; /* no excuse to skip predicate locking */
+}
+
+/*
+ * Like SerializationNeededForRead(), but called on writes.
+ * The logic is the same, but there is no snapshot and we can't be RO-safe.
+ */
+static inline bool
+SerializationNeededForWrite(Relation relation)
+{
+ /* Nothing to do if this is not a serializable transaction */
+ if (MySerializableXact == InvalidSerializableXact)
+ return false;
+
+ /* Check if the relation doesn't participate in predicate locking */
+ if (!PredicateLockingNeededForRelation(relation))
+ return false;
+
+ return true; /* no excuse to skip predicate locking */
+}
+
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * These functions are a simple implementation of a list for this specific
+ * type of struct. If there is ever a generalized shared memory list, we
+ * should probably switch to that.
+ */
+static SERIALIZABLEXACT *
+CreatePredXact(void)
+{
+ PredXactListElement ptle;
+
+ ptle = (PredXactListElement)
+ SHMQueueNext(&PredXact->availableList,
+ &PredXact->availableList,
+ offsetof(PredXactListElementData, link));
+ if (!ptle)
+ return NULL;
+
+ SHMQueueDelete(&ptle->link);
+ SHMQueueInsertBefore(&PredXact->activeList, &ptle->link);
+ return &ptle->sxact;
+}
+
+static void
+ReleasePredXact(SERIALIZABLEXACT *sxact)
+{
+ PredXactListElement ptle;
+
+ Assert(ShmemAddrIsValid(sxact));
+
+ ptle = (PredXactListElement)
+ (((char *) sxact)
+ - offsetof(PredXactListElementData, sxact)
+ + offsetof(PredXactListElementData, link));
+ SHMQueueDelete(&ptle->link);
+ SHMQueueInsertBefore(&PredXact->availableList, &ptle->link);
+}
+
+static SERIALIZABLEXACT *
+FirstPredXact(void)
+{
+ PredXactListElement ptle;
+
+ ptle = (PredXactListElement)
+ SHMQueueNext(&PredXact->activeList,
+ &PredXact->activeList,
+ offsetof(PredXactListElementData, link));
+ if (!ptle)
+ return NULL;
+
+ return &ptle->sxact;
+}
+
+static SERIALIZABLEXACT *
+NextPredXact(SERIALIZABLEXACT *sxact)
+{
+ PredXactListElement ptle;
+
+ Assert(ShmemAddrIsValid(sxact));
+
+ ptle = (PredXactListElement)
+ (((char *) sxact)
+ - offsetof(PredXactListElementData, sxact)
+ + offsetof(PredXactListElementData, link));
+ ptle = (PredXactListElement)
+ SHMQueueNext(&PredXact->activeList,
+ &ptle->link,
+ offsetof(PredXactListElementData, link));
+ if (!ptle)
+ return NULL;
+
+ return &ptle->sxact;
+}
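+
+/*
+ * Typical iteration over the active list, as seen for example in
+ * GetSafeSnapshotBlockingPids() below:
+ *
+ *     for (sxact = FirstPredXact(); sxact != NULL;
+ *          sxact = NextPredXact(sxact))
+ *         ... inspect *sxact ...
+ *
+ * Callers are expected to hold SerializableXactHashLock while walking the
+ * list, since the elements live in shared memory.
+ */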
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * These functions manage primitive access to the RWConflict pool and lists.
+ */
+static bool
+RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer)
+{
+ RWConflict conflict;
+
+ Assert(reader != writer);
+
+ /* Check the ends of the purported conflict first. */
+ if (SxactIsDoomed(reader)
+ || SxactIsDoomed(writer)
+ || SHMQueueEmpty(&reader->outConflicts)
+ || SHMQueueEmpty(&writer->inConflicts))
+ return false;
+
+ /* A conflict is possible; walk the list to find out. */
+ conflict = (RWConflict)
+ SHMQueueNext(&reader->outConflicts,
+ &reader->outConflicts,
+ offsetof(RWConflictData, outLink));
+ while (conflict)
+ {
+ if (conflict->sxactIn == writer)
+ return true;
+ conflict = (RWConflict)
+ SHMQueueNext(&reader->outConflicts,
+ &conflict->outLink,
+ offsetof(RWConflictData, outLink));
+ }
+
+ /* No conflict found. */
+ return false;
+}
+
+static void
+SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
+{
+ RWConflict conflict;
+
+ Assert(reader != writer);
+ Assert(!RWConflictExists(reader, writer));
+
+ conflict = (RWConflict)
+ SHMQueueNext(&RWConflictPool->availableList,
+ &RWConflictPool->availableList,
+ offsetof(RWConflictData, outLink));
+ if (!conflict)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("not enough elements in RWConflictPool to record a read/write conflict"),
+ errhint("You might need to run fewer transactions at a time or increase max_connections.")));
+
+ SHMQueueDelete(&conflict->outLink);
+
+ conflict->sxactOut = reader;
+ conflict->sxactIn = writer;
+ SHMQueueInsertBefore(&reader->outConflicts, &conflict->outLink);
+ SHMQueueInsertBefore(&writer->inConflicts, &conflict->inLink);
+}
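+
+/*
+ * Orientation note (illustrative): a stored conflict edge always points
+ * from the reader to the writer.  If transaction T1 reads a row that T2
+ * later writes, FlagRWConflict(T1, T2) normally arrives here with
+ * sxactOut = T1 and sxactIn = T2, so the edge shows up on T1->outConflicts
+ * and on T2->inConflicts.
+ */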
+
+static void
+SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact,
+ SERIALIZABLEXACT *activeXact)
+{
+ RWConflict conflict;
+
+ Assert(roXact != activeXact);
+ Assert(SxactIsReadOnly(roXact));
+ Assert(!SxactIsReadOnly(activeXact));
+
+ conflict = (RWConflict)
+ SHMQueueNext(&RWConflictPool->availableList,
+ &RWConflictPool->availableList,
+ offsetof(RWConflictData, outLink));
+ if (!conflict)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("not enough elements in RWConflictPool to record a potential read/write conflict"),
+ errhint("You might need to run fewer transactions at a time or increase max_connections.")));
+
+ SHMQueueDelete(&conflict->outLink);
+
+ conflict->sxactOut = activeXact;
+ conflict->sxactIn = roXact;
+ SHMQueueInsertBefore(&activeXact->possibleUnsafeConflicts,
+ &conflict->outLink);
+ SHMQueueInsertBefore(&roXact->possibleUnsafeConflicts,
+ &conflict->inLink);
+}
+
+static void
+ReleaseRWConflict(RWConflict conflict)
+{
+ SHMQueueDelete(&conflict->inLink);
+ SHMQueueDelete(&conflict->outLink);
+ SHMQueueInsertBefore(&RWConflictPool->availableList, &conflict->outLink);
+}
+
+static void
+FlagSxactUnsafe(SERIALIZABLEXACT *sxact)
+{
+ RWConflict conflict,
+ nextConflict;
+
+ Assert(SxactIsReadOnly(sxact));
+ Assert(!SxactIsROSafe(sxact));
+
+ sxact->flags |= SXACT_FLAG_RO_UNSAFE;
+
+ /*
+ * We know this isn't a safe snapshot, so we can stop looking for other
+ * potential conflicts.
+ */
+ conflict = (RWConflict)
+ SHMQueueNext(&sxact->possibleUnsafeConflicts,
+ &sxact->possibleUnsafeConflicts,
+ offsetof(RWConflictData, inLink));
+ while (conflict)
+ {
+ nextConflict = (RWConflict)
+ SHMQueueNext(&sxact->possibleUnsafeConflicts,
+ &conflict->inLink,
+ offsetof(RWConflictData, inLink));
+
+ Assert(!SxactIsReadOnly(conflict->sxactOut));
+ Assert(sxact == conflict->sxactIn);
+
+ ReleaseRWConflict(conflict);
+
+ conflict = nextConflict;
+ }
+}
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * Decide whether a Serial page number is "older" for truncation purposes.
+ * Analogous to CLOGPagePrecedes().
+ */
+static bool
+SerialPagePrecedesLogically(int page1, int page2)
+{
+ TransactionId xid1;
+ TransactionId xid2;
+
+ xid1 = ((TransactionId) page1) * SERIAL_ENTRIESPERPAGE;
+ xid1 += FirstNormalTransactionId + 1;
+ xid2 = ((TransactionId) page2) * SERIAL_ENTRIESPERPAGE;
+ xid2 += FirstNormalTransactionId + 1;
+
+ return (TransactionIdPrecedes(xid1, xid2) &&
+ TransactionIdPrecedes(xid1, xid2 + SERIAL_ENTRIESPERPAGE - 1));
+}
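+
+/*
+ * Worked example (assuming the default 8kB BLCKSZ, which makes
+ * SERIAL_ENTRIESPERPAGE 1024, and FirstNormalTransactionId == 3): for
+ * page1 = 10 and page2 = 11, xid1 = 10 * 1024 + 4 = 10244 and
+ * xid2 = 11 * 1024 + 4 = 11268.  Since xid1 precedes both xid2 and
+ * xid2 + 1023 = 12291 in circular XID arithmetic, page 10 is treated as
+ * logically older than page 11.  The second comparison is the wraparound
+ * guard: if the far end of page2's XID range has wrapped around behind
+ * xid1, the pages are not considered ordered.
+ */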
+
+#ifdef USE_ASSERT_CHECKING
+static void
+SerialPagePrecedesLogicallyUnitTests(void)
+{
+ int per_page = SERIAL_ENTRIESPERPAGE,
+ offset = per_page / 2;
+ int newestPage,
+ oldestPage,
+ headPage,
+ targetPage;
+ TransactionId newestXact,
+ oldestXact;
+
+ /* GetNewTransactionId() has assigned the last XID it can safely use. */
+ newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1; /* nothing special */
+ newestXact = newestPage * per_page + offset;
+ Assert(newestXact / per_page == newestPage);
+ oldestXact = newestXact + 1;
+ oldestXact -= 1U << 31;
+ oldestPage = oldestXact / per_page;
+
+ /*
+ * In this scenario, the SLRU headPage pertains to the last ~1000 XIDs
+ * assigned. oldestXact finishes, ~2B XIDs having elapsed since it
+ * started. Further transactions cause us to summarize oldestXact to
+ * tailPage. Function must return false so SerialAdd() doesn't zero
+ * tailPage (which may contain entries for other old, recently-finished
+ * XIDs) and half the SLRU. Reaching this requires burning ~2B XIDs in
+ * single-user mode, a negligible possibility.
+ */
+ headPage = newestPage;
+ targetPage = oldestPage;
+ Assert(!SerialPagePrecedesLogically(headPage, targetPage));
+
+ /*
+ * In this scenario, the SLRU headPage pertains to oldestXact. We're
+ * summarizing an XID near newestXact. (Assume few other XIDs used
+ * SERIALIZABLE, hence the minimal headPage advancement. Assume
+ * oldestXact was long-running and only recently reached the SLRU.)
+ * Function must return true to make SerialAdd() create targetPage.
+ *
+ * Today's implementation mishandles this case, but it doesn't matter
+ * enough to fix. Verify that the defect affects just one page by
+ * asserting correct treatment of its prior page. Reaching this case
+ * requires burning ~2B XIDs in single-user mode, a negligible
+ * possibility. Moreover, if it does happen, the consequence would be
+ * mild, namely a new transaction failing in SimpleLruReadPage().
+ */
+ headPage = oldestPage;
+ targetPage = newestPage;
+ Assert(SerialPagePrecedesLogically(headPage, targetPage - 1));
+#if 0
+ Assert(SerialPagePrecedesLogically(headPage, targetPage));
+#endif
+}
+#endif
+
+/*
+ * Initialize for the tracking of old serializable committed xids.
+ */
+static void
+SerialInit(void)
+{
+ bool found;
+
+ /*
+ * Set up SLRU management of the pg_serial data.
+ */
+ SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
+ SimpleLruInit(SerialSlruCtl, "Serial",
+ NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
+ LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE);
+#ifdef USE_ASSERT_CHECKING
+ SerialPagePrecedesLogicallyUnitTests();
+#endif
+ SlruPagePrecedesUnitTests(SerialSlruCtl, SERIAL_ENTRIESPERPAGE);
+
+ /*
+ * Create or attach to the SerialControl structure.
+ */
+ serialControl = (SerialControl)
+ ShmemInitStruct("SerialControlData", sizeof(SerialControlData), &found);
+
+ Assert(found == IsUnderPostmaster);
+ if (!found)
+ {
+ /*
+ * Set control information to reflect empty SLRU.
+ */
+ serialControl->headPage = -1;
+ serialControl->headXid = InvalidTransactionId;
+ serialControl->tailXid = InvalidTransactionId;
+ }
+}
+
+/*
+ * Record a committed read-write serializable xid and the minimum
+ * commitSeqNo of any transactions to which this xid had a rw-conflict out.
+ * An invalid commitSeqNo means that there were no conflicts out from xid.
+ */
+static void
+SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo)
+{
+ TransactionId tailXid;
+ int targetPage;
+ int slotno;
+ int firstZeroPage;
+ bool isNewPage;
+
+ Assert(TransactionIdIsValid(xid));
+
+ targetPage = SerialPage(xid);
+
+ LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE);
+
+ /*
+ * If no serializable transactions are active, there shouldn't be anything
+ * to push out to the SLRU. Hitting this assert would mean there's
+ * something wrong with the earlier cleanup logic.
+ */
+ tailXid = serialControl->tailXid;
+ Assert(TransactionIdIsValid(tailXid));
+
+ /*
+ * If the SLRU is currently unused, zero out the whole active region from
+ * tailXid to headXid before taking it into use. Otherwise zero out only
+ * any new pages that enter the tailXid-headXid range as we advance
+ * headXid.
+ */
+ if (serialControl->headPage < 0)
+ {
+ firstZeroPage = SerialPage(tailXid);
+ isNewPage = true;
+ }
+ else
+ {
+ firstZeroPage = SerialNextPage(serialControl->headPage);
+ isNewPage = SerialPagePrecedesLogically(serialControl->headPage,
+ targetPage);
+ }
+
+ if (!TransactionIdIsValid(serialControl->headXid)
+ || TransactionIdFollows(xid, serialControl->headXid))
+ serialControl->headXid = xid;
+ if (isNewPage)
+ serialControl->headPage = targetPage;
+
+ if (isNewPage)
+ {
+ /* Initialize intervening pages. */
+ while (firstZeroPage != targetPage)
+ {
+ (void) SimpleLruZeroPage(SerialSlruCtl, firstZeroPage);
+ firstZeroPage = SerialNextPage(firstZeroPage);
+ }
+ slotno = SimpleLruZeroPage(SerialSlruCtl, targetPage);
+ }
+ else
+ slotno = SimpleLruReadPage(SerialSlruCtl, targetPage, true, xid);
+
+ SerialValue(slotno, xid) = minConflictCommitSeqNo;
+ SerialSlruCtl->shared->page_dirty[slotno] = true;
+
+ LWLockRelease(SerialSLRULock);
+}
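+
+/*
+ * Illustration of the zeroing logic above: if headPage is 7 and the new
+ * xid maps to targetPage 10, firstZeroPage starts at 8 and the loop zeroes
+ * pages 8 and 9 before SimpleLruZeroPage() is called for page 10 itself.
+ * If the xid instead falls on the current headPage, isNewPage is false and
+ * the existing page is read back and updated in place.
+ */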
+
+/*
+ * Get the minimum commitSeqNo for any conflict out for the given xid. For
+ * a transaction which exists but has no conflict out, InvalidSerCommitSeqNo
+ * will be returned.
+ */
+static SerCommitSeqNo
+SerialGetMinConflictCommitSeqNo(TransactionId xid)
+{
+ TransactionId headXid;
+ TransactionId tailXid;
+ SerCommitSeqNo val;
+ int slotno;
+
+ Assert(TransactionIdIsValid(xid));
+
+ LWLockAcquire(SerialSLRULock, LW_SHARED);
+ headXid = serialControl->headXid;
+ tailXid = serialControl->tailXid;
+ LWLockRelease(SerialSLRULock);
+
+ if (!TransactionIdIsValid(headXid))
+ return 0;
+
+ Assert(TransactionIdIsValid(tailXid));
+
+ if (TransactionIdPrecedes(xid, tailXid)
+ || TransactionIdFollows(xid, headXid))
+ return 0;
+
+ /*
+ * The following function must be called without holding SerialSLRULock,
+ * but will return with that lock held, which must then be released.
+ */
+ slotno = SimpleLruReadPage_ReadOnly(SerialSlruCtl,
+ SerialPage(xid), xid);
+ val = SerialValue(slotno, xid);
+ LWLockRelease(SerialSLRULock);
+ return val;
+}
+
+/*
+ * Call this whenever there is a new xmin for active serializable
+ * transactions. We don't need to keep information on transactions which
+ * precede that. InvalidTransactionId means none active, so everything in
+ * the SLRU can be discarded.
+ */
+static void
+SerialSetActiveSerXmin(TransactionId xid)
+{
+ LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE);
+
+ /*
+ * When no sxacts are active, nothing overlaps, set the xid values to
+ * invalid to show that there are no valid entries. Don't clear headPage,
+ * though. A new xmin might still land on that page, and we don't want to
+ * repeatedly zero out the same page.
+ */
+ if (!TransactionIdIsValid(xid))
+ {
+ serialControl->tailXid = InvalidTransactionId;
+ serialControl->headXid = InvalidTransactionId;
+ LWLockRelease(SerialSLRULock);
+ return;
+ }
+
+ /*
+ * When we're recovering prepared transactions, the global xmin might move
+ * backwards depending on the order they're recovered. Normally that's not
+ * OK, but during recovery no serializable transactions will commit, so
+ * the SLRU is empty and we can get away with it.
+ */
+ if (RecoveryInProgress())
+ {
+ Assert(serialControl->headPage < 0);
+ if (!TransactionIdIsValid(serialControl->tailXid)
+ || TransactionIdPrecedes(xid, serialControl->tailXid))
+ {
+ serialControl->tailXid = xid;
+ }
+ LWLockRelease(SerialSLRULock);
+ return;
+ }
+
+ Assert(!TransactionIdIsValid(serialControl->tailXid)
+ || TransactionIdFollows(xid, serialControl->tailXid));
+
+ serialControl->tailXid = xid;
+
+ LWLockRelease(SerialSLRULock);
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ *
+ * We don't have any data that needs to survive a restart, but this is a
+ * convenient place to truncate the SLRU.
+ */
+void
+CheckPointPredicate(void)
+{
+ int tailPage;
+
+ LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE);
+
+ /* Exit quickly if the SLRU is currently not in use. */
+ if (serialControl->headPage < 0)
+ {
+ LWLockRelease(SerialSLRULock);
+ return;
+ }
+
+ if (TransactionIdIsValid(serialControl->tailXid))
+ {
+ /* We can truncate the SLRU up to the page containing tailXid */
+ tailPage = SerialPage(serialControl->tailXid);
+ }
+ else
+ {
+ /*----------
+ * The SLRU is no longer needed. Truncate to head before we set head
+ * invalid.
+ *
+ * XXX: It's possible that the SLRU is not needed again until XID
+ * wrap-around has happened, so that the segment containing headPage
+ * that we leave behind will appear to be new again. In that case it
+ * won't be removed until XID horizon advances enough to make it
+ * current again.
+ *
+ * XXX: This should happen in vac_truncate_clog(), not in checkpoints.
+ * Consider this scenario, starting from a system with no in-progress
+ * transactions and VACUUM FREEZE having maximized oldestXact:
+ * - Start a SERIALIZABLE transaction.
+ * - Start, finish, and summarize a SERIALIZABLE transaction, creating
+ * one SLRU page.
+ * - Consume XIDs to reach xidStopLimit.
+ * - Finish all transactions. Due to the long-running SERIALIZABLE
+ * transaction, earlier checkpoints did not touch headPage. The
+ * next checkpoint will change it, but that checkpoint happens after
+ * the end of the scenario.
+ * - VACUUM to advance XID limits.
+ * - Consume ~2M XIDs, crossing the former xidWrapLimit.
+ * - Start, finish, and summarize a SERIALIZABLE transaction.
+ * SerialAdd() declines to create the targetPage, because headPage
+ * is not regarded as in the past relative to that targetPage. The
+ * transaction instigating the summarize fails in
+ * SimpleLruReadPage().
+ */
+ tailPage = serialControl->headPage;
+ serialControl->headPage = -1;
+ }
+
+ LWLockRelease(SerialSLRULock);
+
+ /* Truncate away pages that are no longer required */
+ SimpleLruTruncate(SerialSlruCtl, tailPage);
+
+ /*
+ * Write dirty SLRU pages to disk
+ *
+ * This is not actually necessary from a correctness point of view. We do
+ * it merely as a debugging aid.
+ *
+ * We're doing this after the truncation to avoid writing pages right
+ * before deleting the file in which they sit, which would be completely
+ * pointless.
+ */
+ SimpleLruWriteAll(SerialSlruCtl, true);
+}
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * InitPredicateLocks -- Initialize the predicate locking data structures.
+ *
+ * This is called from CreateSharedMemoryAndSemaphores(), which see for
+ * more comments. In the normal postmaster case, the shared hash tables
+ * are created here. Backends inherit the pointers
+ * to the shared tables via fork(). In the EXEC_BACKEND case, each
+ * backend re-executes this code to obtain pointers to the already existing
+ * shared hash tables.
+ */
+void
+InitPredicateLocks(void)
+{
+ HASHCTL info;
+ long max_table_size;
+ Size requestSize;
+ bool found;
+
+#ifndef EXEC_BACKEND
+ Assert(!IsUnderPostmaster);
+#endif
+
+ /*
+ * Compute size of predicate lock target hashtable. Note these
+ * calculations must agree with PredicateLockShmemSize!
+ */
+ max_table_size = NPREDICATELOCKTARGETENTS();
+
+ /*
+ * Allocate hash table for PREDICATELOCKTARGET structs. This stores
+ * per-predicate-lock-target information.
+ */
+ info.keysize = sizeof(PREDICATELOCKTARGETTAG);
+ info.entrysize = sizeof(PREDICATELOCKTARGET);
+ info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
+
+ PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash",
+ max_table_size,
+ max_table_size,
+ &info,
+ HASH_ELEM | HASH_BLOBS |
+ HASH_PARTITION | HASH_FIXED_SIZE);
+
+ /*
+ * Reserve a dummy entry in the hash table; we use it to make sure there's
+ * always one entry available when we need to split or combine a page,
+ * because running out of space there could mean aborting a
+ * non-serializable transaction.
+ */
+ if (!IsUnderPostmaster)
+ {
+ (void) hash_search(PredicateLockTargetHash, &ScratchTargetTag,
+ HASH_ENTER, &found);
+ Assert(!found);
+ }
+
+ /* Pre-calculate the hash and partition lock of the scratch entry */
+ ScratchTargetTagHash = PredicateLockTargetTagHashCode(&ScratchTargetTag);
+ ScratchPartitionLock = PredicateLockHashPartitionLock(ScratchTargetTagHash);
+
+ /*
+ * Allocate hash table for PREDICATELOCK structs. This stores per
+ * xact-lock-of-a-target information.
+ */
+ info.keysize = sizeof(PREDICATELOCKTAG);
+ info.entrysize = sizeof(PREDICATELOCK);
+ info.hash = predicatelock_hash;
+ info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
+
+ /* Assume an average of 2 xacts per target */
+ max_table_size *= 2;
+
+ PredicateLockHash = ShmemInitHash("PREDICATELOCK hash",
+ max_table_size,
+ max_table_size,
+ &info,
+ HASH_ELEM | HASH_FUNCTION |
+ HASH_PARTITION | HASH_FIXED_SIZE);
+
+ /*
+ * Compute size for serializable transaction hashtable. Note these
+ * calculations must agree with PredicateLockShmemSize!
+ */
+ max_table_size = (MaxBackends + max_prepared_xacts);
+
+ /*
+ * Allocate a list to hold information on transactions participating in
+ * predicate locking.
+ *
+ * Assume an average of 10 predicate locking transactions per backend.
+ * This allows aggressive cleanup while detail is present before data must
+ * be summarized for storage in SLRU and the "dummy" transaction.
+ */
+ max_table_size *= 10;
+
+ PredXact = ShmemInitStruct("PredXactList",
+ PredXactListDataSize,
+ &found);
+ Assert(found == IsUnderPostmaster);
+ if (!found)
+ {
+ int i;
+
+ SHMQueueInit(&PredXact->availableList);
+ SHMQueueInit(&PredXact->activeList);
+ PredXact->SxactGlobalXmin = InvalidTransactionId;
+ PredXact->SxactGlobalXminCount = 0;
+ PredXact->WritableSxactCount = 0;
+ PredXact->LastSxactCommitSeqNo = FirstNormalSerCommitSeqNo - 1;
+ PredXact->CanPartialClearThrough = 0;
+ PredXact->HavePartialClearedThrough = 0;
+ requestSize = mul_size((Size) max_table_size,
+ PredXactListElementDataSize);
+ PredXact->element = ShmemAlloc(requestSize);
+ /* Add all elements to available list, clean. */
+ memset(PredXact->element, 0, requestSize);
+ for (i = 0; i < max_table_size; i++)
+ {
+ LWLockInitialize(&PredXact->element[i].sxact.perXactPredicateListLock,
+ LWTRANCHE_PER_XACT_PREDICATE_LIST);
+ SHMQueueInsertBefore(&(PredXact->availableList),
+ &(PredXact->element[i].link));
+ }
+ PredXact->OldCommittedSxact = CreatePredXact();
+ SetInvalidVirtualTransactionId(PredXact->OldCommittedSxact->vxid);
+ PredXact->OldCommittedSxact->prepareSeqNo = 0;
+ PredXact->OldCommittedSxact->commitSeqNo = 0;
+ PredXact->OldCommittedSxact->SeqNo.lastCommitBeforeSnapshot = 0;
+ SHMQueueInit(&PredXact->OldCommittedSxact->outConflicts);
+ SHMQueueInit(&PredXact->OldCommittedSxact->inConflicts);
+ SHMQueueInit(&PredXact->OldCommittedSxact->predicateLocks);
+ SHMQueueInit(&PredXact->OldCommittedSxact->finishedLink);
+ SHMQueueInit(&PredXact->OldCommittedSxact->possibleUnsafeConflicts);
+ PredXact->OldCommittedSxact->topXid = InvalidTransactionId;
+ PredXact->OldCommittedSxact->finishedBefore = InvalidTransactionId;
+ PredXact->OldCommittedSxact->xmin = InvalidTransactionId;
+ PredXact->OldCommittedSxact->flags = SXACT_FLAG_COMMITTED;
+ PredXact->OldCommittedSxact->pid = 0;
+ }
+ /* This never changes, so let's keep a local copy. */
+ OldCommittedSxact = PredXact->OldCommittedSxact;
+
+ /*
+ * Allocate hash table for SERIALIZABLEXID structs. This stores per-xid
+ * information for serializable transactions which have accessed data.
+ */
+ info.keysize = sizeof(SERIALIZABLEXIDTAG);
+ info.entrysize = sizeof(SERIALIZABLEXID);
+
+ SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash",
+ max_table_size,
+ max_table_size,
+ &info,
+ HASH_ELEM | HASH_BLOBS |
+ HASH_FIXED_SIZE);
+
+ /*
+ * Allocate space for tracking rw-conflicts in lists attached to the
+ * transactions.
+ *
+ * Assume an average of 5 conflicts per transaction. Calculations suggest
+ * that this will prevent resource exhaustion in even the most pessimal
+ * loads up to max_connections = 200 with all 200 connections pounding the
+ * database with serializable transactions. Beyond that, there may be
+ * occasional transactions canceled when trying to flag conflicts. That's
+ * probably OK.
+ */
+ max_table_size *= 5;
+
+ RWConflictPool = ShmemInitStruct("RWConflictPool",
+ RWConflictPoolHeaderDataSize,
+ &found);
+ Assert(found == IsUnderPostmaster);
+ if (!found)
+ {
+ int i;
+
+ SHMQueueInit(&RWConflictPool->availableList);
+ requestSize = mul_size((Size) max_table_size,
+ RWConflictDataSize);
+ RWConflictPool->element = ShmemAlloc(requestSize);
+ /* Add all elements to available list, clean. */
+ memset(RWConflictPool->element, 0, requestSize);
+ for (i = 0; i < max_table_size; i++)
+ {
+ SHMQueueInsertBefore(&(RWConflictPool->availableList),
+ &(RWConflictPool->element[i].outLink));
+ }
+ }
+
+ /*
+ * Create or attach to the header for the list of finished serializable
+ * transactions.
+ */
+ FinishedSerializableTransactions = (SHM_QUEUE *)
+ ShmemInitStruct("FinishedSerializableTransactions",
+ sizeof(SHM_QUEUE),
+ &found);
+ Assert(found == IsUnderPostmaster);
+ if (!found)
+ SHMQueueInit(FinishedSerializableTransactions);
+
+ /*
+ * Initialize the SLRU storage for old committed serializable
+ * transactions.
+ */
+ SerialInit();
+}
+
+/*
+ * Estimate shared-memory space used for predicate lock table
+ */
+Size
+PredicateLockShmemSize(void)
+{
+ Size size = 0;
+ long max_table_size;
+
+ /* predicate lock target hash table */
+ max_table_size = NPREDICATELOCKTARGETENTS();
+ size = add_size(size, hash_estimate_size(max_table_size,
+ sizeof(PREDICATELOCKTARGET)));
+
+ /* predicate lock hash table */
+ max_table_size *= 2;
+ size = add_size(size, hash_estimate_size(max_table_size,
+ sizeof(PREDICATELOCK)));
+
+ /*
+ * Since NPREDICATELOCKTARGETENTS is only an estimate, add 10% safety
+ * margin.
+ */
+ size = add_size(size, size / 10);
+
+ /* transaction list */
+ max_table_size = MaxBackends + max_prepared_xacts;
+ max_table_size *= 10;
+ size = add_size(size, PredXactListDataSize);
+ size = add_size(size, mul_size((Size) max_table_size,
+ PredXactListElementDataSize));
+
+ /* transaction xid table */
+ size = add_size(size, hash_estimate_size(max_table_size,
+ sizeof(SERIALIZABLEXID)));
+
+ /* rw-conflict pool */
+ max_table_size *= 5;
+ size = add_size(size, RWConflictPoolHeaderDataSize);
+ size = add_size(size, mul_size((Size) max_table_size,
+ RWConflictDataSize));
+
+ /* Head for list of finished serializable transactions. */
+ size = add_size(size, sizeof(SHM_QUEUE));
+
+ /* Shared memory structures for SLRU tracking of old committed xids. */
+ size = add_size(size, sizeof(SerialControlData));
+ size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS, 0));
+
+ return size;
+}
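+
+/*
+ * Rough worked example (illustrative only; the real numbers depend on
+ * configuration): with max_pred_locks_per_transaction = 64 and
+ * MaxBackends + max_prepared_xacts around 100, the target hash is sized
+ * for roughly 64 * 100 = 6400 entries, the predicate lock hash for twice
+ * that, the transaction list for 100 * 10 = 1000 SERIALIZABLEXACT slots,
+ * and the rw-conflict pool for 1000 * 5 = 5000 entries, on top of the 10%
+ * hash-table safety margin and the fixed SLRU buffers.
+ */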
+
+
+/*
+ * Compute the hash code associated with a PREDICATELOCKTAG.
+ *
+ * Because we want to use just one set of partition locks for both the
+ * PREDICATELOCKTARGET and PREDICATELOCK hash tables, we have to make sure
+ * that PREDICATELOCKs fall into the same partition number as their
+ * associated PREDICATELOCKTARGETs. dynahash.c expects the partition number
+ * to be the low-order bits of the hash code, and therefore a
+ * PREDICATELOCKTAG's hash code must have the same low-order bits as the
+ * associated PREDICATELOCKTARGETTAG's hash code. We achieve this with this
+ * specialized hash function.
+ */
+static uint32
+predicatelock_hash(const void *key, Size keysize)
+{
+ const PREDICATELOCKTAG *predicatelocktag = (const PREDICATELOCKTAG *) key;
+ uint32 targethash;
+
+ Assert(keysize == sizeof(PREDICATELOCKTAG));
+
+ /* Look into the associated target object, and compute its hash code */
+ targethash = PredicateLockTargetTagHashCode(&predicatelocktag->myTarget->tag);
+
+ return PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash);
+}
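+
+/*
+ * Illustration: if NUM_PREDICATELOCK_PARTITIONS is 16, the partition
+ * number is taken from the low-order 4 bits of the hash code.  The lock's
+ * hash is derived from the target's hash without disturbing those bits
+ * (the owning sxact is mixed in above them), so a PREDICATELOCK and its
+ * PREDICATELOCKTARGET always land in the same partition and are covered by
+ * the same partition LWLock.
+ */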
+
+
+/*
+ * GetPredicateLockStatusData
+ * Return a table containing the internal state of the predicate
+ * lock manager for use in pg_lock_status.
+ *
+ * Like GetLockStatusData, this function tries to hold the partition LWLocks
+ * for as short a time as possible by returning two arrays that simply
+ * contain the PREDICATELOCKTARGETTAG and SERIALIZABLEXACT for each lock
+ * table entry. Multiple copies of the same PREDICATELOCKTARGETTAG and
+ * SERIALIZABLEXACT will likely appear.
+ */
+PredicateLockData *
+GetPredicateLockStatusData(void)
+{
+ PredicateLockData *data;
+ int i;
+ int els,
+ el;
+ HASH_SEQ_STATUS seqstat;
+ PREDICATELOCK *predlock;
+
+ data = (PredicateLockData *) palloc(sizeof(PredicateLockData));
+
+ /*
+ * To ensure consistency, take simultaneous locks on all partition locks
+ * in ascending order, then SerializableXactHashLock.
+ */
+ for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
+ LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+
+ /* Get number of locks and allocate appropriately-sized arrays. */
+ els = hash_get_num_entries(PredicateLockHash);
+ data->nelements = els;
+ data->locktags = (PREDICATELOCKTARGETTAG *)
+ palloc(sizeof(PREDICATELOCKTARGETTAG) * els);
+ data->xacts = (SERIALIZABLEXACT *)
+ palloc(sizeof(SERIALIZABLEXACT) * els);
+
+
+ /* Scan through PredicateLockHash and copy contents */
+ hash_seq_init(&seqstat, PredicateLockHash);
+
+ el = 0;
+
+ while ((predlock = (PREDICATELOCK *) hash_seq_search(&seqstat)))
+ {
+ data->locktags[el] = predlock->tag.myTarget->tag;
+ data->xacts[el] = *predlock->tag.myXact;
+ el++;
+ }
+
+ Assert(el == els);
+
+ /* Release locks in reverse order */
+ LWLockRelease(SerializableXactHashLock);
+ for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
+ LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
+
+ return data;
+}
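+
+/*
+ * The data assembled here is what ultimately backs the pg_locks view,
+ * where SSI predicate locks are reported with mode "SIReadLock"; for
+ * example:
+ *
+ *     SELECT locktype, relation::regclass, page, tuple, pid
+ *     FROM pg_locks WHERE mode = 'SIReadLock';
+ */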
+
+/*
+ * Free up shared memory structures by pushing the oldest sxact (the one at
+ * the front of the FinishedSerializableTransactions list) into summary form.
+ * Each call will free exactly one SERIALIZABLEXACT structure and may also
+ * free one or more of these structures: SERIALIZABLEXID, PREDICATELOCK,
+ * PREDICATELOCKTARGET, RWConflictData.
+ */
+static void
+SummarizeOldestCommittedSxact(void)
+{
+ SERIALIZABLEXACT *sxact;
+
+ LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
+
+ /*
+ * This function is only called if there are no sxact slots available.
+ * Some of them must belong to old, already-finished transactions, so
+ * there should be something in FinishedSerializableTransactions list that
+ * we can summarize. However, there's a race condition: while we were not
+ * holding any locks, a transaction might have ended and cleaned up all
+ * the finished sxact entries already, freeing up their sxact slots. In
+ * that case, we have nothing to do here. The caller will find one of the
+ * slots released by the other backend when it retries.
+ */
+ if (SHMQueueEmpty(FinishedSerializableTransactions))
+ {
+ LWLockRelease(SerializableFinishedListLock);
+ return;
+ }
+
+ /*
+ * Grab the first sxact off the finished list -- this will be the earliest
+ * commit. Remove it from the list.
+ */
+ sxact = (SERIALIZABLEXACT *)
+ SHMQueueNext(FinishedSerializableTransactions,
+ FinishedSerializableTransactions,
+ offsetof(SERIALIZABLEXACT, finishedLink));
+ SHMQueueDelete(&(sxact->finishedLink));
+
+ /* Add to SLRU summary information. */
+ if (TransactionIdIsValid(sxact->topXid) && !SxactIsReadOnly(sxact))
+ SerialAdd(sxact->topXid, SxactHasConflictOut(sxact)
+ ? sxact->SeqNo.earliestOutConflictCommit : InvalidSerCommitSeqNo);
+
+ /* Summarize and release the detail. */
+ ReleaseOneSerializableXact(sxact, false, true);
+
+ LWLockRelease(SerializableFinishedListLock);
+}
+
+/*
+ * GetSafeSnapshot
+ * Obtain and register a snapshot for a READ ONLY DEFERRABLE
+ * transaction. Ensures that the snapshot is "safe", i.e. a
+ * read-only transaction running on it can execute serializably
+ * without further checks. This requires waiting for concurrent
+ * transactions to complete, and retrying with a new snapshot if
+ * one of them could possibly create a conflict.
+ *
+ * As with GetSerializableTransactionSnapshot (which this is a subroutine
+ * for), the passed-in Snapshot pointer should reference a static data
+ * area that can safely be passed to GetSnapshotData.
+ */
+static Snapshot
+GetSafeSnapshot(Snapshot origSnapshot)
+{
+ Snapshot snapshot;
+
+ Assert(XactReadOnly && XactDeferrable);
+
+ while (true)
+ {
+ /*
+ * GetSerializableTransactionSnapshotInt is going to call
+ * GetSnapshotData, so we need to provide it the static snapshot area
+ * our caller passed to us. The pointer returned is actually the same
+ * one passed to it, but we avoid assuming that here.
+ */
+ snapshot = GetSerializableTransactionSnapshotInt(origSnapshot,
+ NULL, InvalidPid);
+
+ if (MySerializableXact == InvalidSerializableXact)
+ return snapshot; /* no concurrent r/w xacts; it's safe */
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Wait for concurrent transactions to finish. Stop early if one of
+ * them marked us as conflicted.
+ */
+ MySerializableXact->flags |= SXACT_FLAG_DEFERRABLE_WAITING;
+ while (!(SHMQueueEmpty(&MySerializableXact->possibleUnsafeConflicts) ||
+ SxactIsROUnsafe(MySerializableXact)))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ProcWaitForSignal(WAIT_EVENT_SAFE_SNAPSHOT);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ }
+ MySerializableXact->flags &= ~SXACT_FLAG_DEFERRABLE_WAITING;
+
+ if (!SxactIsROUnsafe(MySerializableXact))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ break; /* success */
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+
+ /* else, need to retry... */
+ ereport(DEBUG2,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg_internal("deferrable snapshot was unsafe; trying a new one")));
+ ReleasePredicateLocks(false, false);
+ }
+
+ /*
+ * Now we have a safe snapshot, so we don't need to do any further checks.
+ */
+ Assert(SxactIsROSafe(MySerializableXact));
+ ReleasePredicateLocks(false, true);
+
+ return snapshot;
+}
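+
+/*
+ * Usage note: this path is reached by, for example,
+ *
+ *     BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+ *
+ * The session may block here until a safe snapshot is available, after
+ * which the transaction runs with no further SSI overhead.
+ */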
+
+/*
+ * GetSafeSnapshotBlockingPids
+ * If the specified process is currently blocked in GetSafeSnapshot,
+ * write the process IDs of all processes that it is blocked by
+ * into the caller-supplied buffer output[]. The list is truncated at
+ * output_size, and the number of PIDs written into the buffer is
+ * returned. Returns zero if the given PID is not currently blocked
+ * in GetSafeSnapshot.
+ */
+int
+GetSafeSnapshotBlockingPids(int blocked_pid, int *output, int output_size)
+{
+ int num_written = 0;
+ SERIALIZABLEXACT *sxact;
+
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+
+ /* Find blocked_pid's SERIALIZABLEXACT by linear search. */
+ for (sxact = FirstPredXact(); sxact != NULL; sxact = NextPredXact(sxact))
+ {
+ if (sxact->pid == blocked_pid)
+ break;
+ }
+
+ /* Did we find it, and is it currently waiting in GetSafeSnapshot? */
+ if (sxact != NULL && SxactIsDeferrableWaiting(sxact))
+ {
+ RWConflict possibleUnsafeConflict;
+
+ /* Traverse the list of possible unsafe conflicts collecting PIDs. */
+ possibleUnsafeConflict = (RWConflict)
+ SHMQueueNext(&sxact->possibleUnsafeConflicts,
+ &sxact->possibleUnsafeConflicts,
+ offsetof(RWConflictData, inLink));
+
+ while (possibleUnsafeConflict != NULL && num_written < output_size)
+ {
+ output[num_written++] = possibleUnsafeConflict->sxactOut->pid;
+ possibleUnsafeConflict = (RWConflict)
+ SHMQueueNext(&sxact->possibleUnsafeConflicts,
+ &possibleUnsafeConflict->inLink,
+ offsetof(RWConflictData, inLink));
+ }
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+
+ return num_written;
+}
+
+/*
+ * Acquire a snapshot that can be used for the current transaction.
+ *
+ * Make sure we have a SERIALIZABLEXACT reference in MySerializableXact.
+ * It should be current for this process and be contained in PredXact.
+ *
+ * The passed-in Snapshot pointer should reference a static data area that
+ * can safely be passed to GetSnapshotData. The return value is actually
+ * always this same pointer; no new snapshot data structure is allocated
+ * within this function.
+ */
+Snapshot
+GetSerializableTransactionSnapshot(Snapshot snapshot)
+{
+ Assert(IsolationIsSerializable());
+
+ /*
+ * Can't use serializable mode while recovery is still active, as it is,
+ * for example, on a hot standby. We could get here despite the check in
+ * check_XactIsoLevel() if default_transaction_isolation is set to
+ * serializable, so phrase the hint accordingly.
+ */
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot use serializable mode in a hot standby"),
+ errdetail("\"default_transaction_isolation\" is set to \"serializable\"."),
+ errhint("You can use \"SET default_transaction_isolation = 'repeatable read'\" to change the default.")));
+
+ /*
+ * A special optimization is available for SERIALIZABLE READ ONLY
+ * DEFERRABLE transactions -- we can wait for a suitable snapshot and
+ * thereby avoid all SSI overhead once it's running.
+ */
+ if (XactReadOnly && XactDeferrable)
+ return GetSafeSnapshot(snapshot);
+
+ return GetSerializableTransactionSnapshotInt(snapshot,
+ NULL, InvalidPid);
+}
+
+/*
+ * Import a snapshot to be used for the current transaction.
+ *
+ * This is nearly the same as GetSerializableTransactionSnapshot, except that
+ * we don't take a new snapshot, but rather use the data we're handed.
+ *
+ * The caller must have verified that the snapshot came from a serializable
+ * transaction; and if we're read-write, the source transaction must not be
+ * read-only.
+ */
+void
+SetSerializableTransactionSnapshot(Snapshot snapshot,
+ VirtualTransactionId *sourcevxid,
+ int sourcepid)
+{
+ Assert(IsolationIsSerializable());
+
+ /*
+ * If this is called by parallel.c in a parallel worker, we don't want to
+ * create a SERIALIZABLEXACT just yet because the leader's
+ * SERIALIZABLEXACT will be installed with AttachSerializableXact(). We
+ * also don't want to reject SERIALIZABLE READ ONLY DEFERRABLE in this
+ * case, because the leader has already determined that the snapshot it
+ * has passed us is safe. So there is nothing for us to do.
+ */
+ if (IsParallelWorker())
+ return;
+
+ /*
+ * We do not allow SERIALIZABLE READ ONLY DEFERRABLE transactions to
+ * import snapshots, since there's no way to wait for a safe snapshot when
+ * we're using the snap we're told to. (XXX instead of throwing an error,
+ * we could just ignore the XactDeferrable flag?)
+ */
+ if (XactReadOnly && XactDeferrable)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("a snapshot-importing transaction must not be READ ONLY DEFERRABLE")));
+
+ (void) GetSerializableTransactionSnapshotInt(snapshot, sourcevxid,
+ sourcepid);
+}
+
+/*
+ * Guts of GetSerializableTransactionSnapshot
+ *
+ * If sourcevxid is valid, this is actually an import operation and we should
+ * skip calling GetSnapshotData, because the snapshot contents are already
+ * loaded up. HOWEVER: to avoid race conditions, we must check that the
+ * source xact is still running after we acquire SerializableXactHashLock.
+ * We do that by calling ProcArrayInstallImportedXmin.
+ */
+static Snapshot
+GetSerializableTransactionSnapshotInt(Snapshot snapshot,
+ VirtualTransactionId *sourcevxid,
+ int sourcepid)
+{
+ PGPROC *proc;
+ VirtualTransactionId vxid;
+ SERIALIZABLEXACT *sxact,
+ *othersxact;
+
+ /* We only do this for serializable transactions. Once. */
+ Assert(MySerializableXact == InvalidSerializableXact);
+
+ Assert(!RecoveryInProgress());
+
+ /*
+ * Since all parts of a serializable transaction must use the same
+ * snapshot, it is too late to establish one after a parallel operation
+ * has begun.
+ */
+ if (IsInParallelMode())
+ elog(ERROR, "cannot establish serializable snapshot during a parallel operation");
+
+ proc = MyProc;
+ Assert(proc != NULL);
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+
+ /*
+ * First we get the sxact structure, which may involve looping and access
+ * to the "finished" list to free a structure for use.
+ *
+ * We must hold SerializableXactHashLock when taking/checking the snapshot
+ * to avoid race conditions, for much the same reasons that
+ * GetSnapshotData takes the ProcArrayLock. Since we might have to
+ * release SerializableXactHashLock to call SummarizeOldestCommittedSxact,
+ * this means we have to create the sxact first, which is a bit annoying
+ * (in particular, an elog(ERROR) in procarray.c would cause us to leak
+ * the sxact). Consider refactoring to avoid this.
+ */
+#ifdef TEST_SUMMARIZE_SERIAL
+ SummarizeOldestCommittedSxact();
+#endif
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ do
+ {
+ sxact = CreatePredXact();
+ /* If null, push out committed sxact to SLRU summary & retry. */
+ if (!sxact)
+ {
+ LWLockRelease(SerializableXactHashLock);
+ SummarizeOldestCommittedSxact();
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ }
+ } while (!sxact);
+
+ /* Get the snapshot, or check that it's safe to use */
+ if (!sourcevxid)
+ snapshot = GetSnapshotData(snapshot);
+ else if (!ProcArrayInstallImportedXmin(snapshot->xmin, sourcevxid))
+ {
+ ReleasePredXact(sxact);
+ LWLockRelease(SerializableXactHashLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not import the requested snapshot"),
+ errdetail("The source process with PID %d is not running anymore.",
+ sourcepid)));
+ }
+
+ /*
+ * If there are no serializable transactions which are not read-only, we
+ * can "opt out" of predicate locking and conflict checking for a
+ * read-only transaction.
+ *
+ * The reason this is safe is that a read-only transaction can only become
+ * part of a dangerous structure if it overlaps a writable transaction
+ * which in turn overlaps a writable transaction which committed before
+ * the read-only transaction started. A new writable transaction can
+ * overlap this one, but it can't meet the other condition of overlapping
+ * a transaction which committed before this one started.
+ */
+ if (XactReadOnly && PredXact->WritableSxactCount == 0)
+ {
+ ReleasePredXact(sxact);
+ LWLockRelease(SerializableXactHashLock);
+ return snapshot;
+ }
+
+ /* Maintain serializable global xmin info. */
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
+ {
+ Assert(PredXact->SxactGlobalXminCount == 0);
+ PredXact->SxactGlobalXmin = snapshot->xmin;
+ PredXact->SxactGlobalXminCount = 1;
+ SerialSetActiveSerXmin(snapshot->xmin);
+ }
+ else if (TransactionIdEquals(snapshot->xmin, PredXact->SxactGlobalXmin))
+ {
+ Assert(PredXact->SxactGlobalXminCount > 0);
+ PredXact->SxactGlobalXminCount++;
+ }
+ else
+ {
+ Assert(TransactionIdFollows(snapshot->xmin, PredXact->SxactGlobalXmin));
+ }
+
+ /* Initialize the structure. */
+ sxact->vxid = vxid;
+ sxact->SeqNo.lastCommitBeforeSnapshot = PredXact->LastSxactCommitSeqNo;
+ sxact->prepareSeqNo = InvalidSerCommitSeqNo;
+ sxact->commitSeqNo = InvalidSerCommitSeqNo;
+ SHMQueueInit(&(sxact->outConflicts));
+ SHMQueueInit(&(sxact->inConflicts));
+ SHMQueueInit(&(sxact->possibleUnsafeConflicts));
+ sxact->topXid = GetTopTransactionIdIfAny();
+ sxact->finishedBefore = InvalidTransactionId;
+ sxact->xmin = snapshot->xmin;
+ sxact->pid = MyProcPid;
+ SHMQueueInit(&(sxact->predicateLocks));
+ SHMQueueElemInit(&(sxact->finishedLink));
+ sxact->flags = 0;
+ if (XactReadOnly)
+ {
+ sxact->flags |= SXACT_FLAG_READ_ONLY;
+
+ /*
+ * Register all concurrent r/w transactions as possible conflicts; if
+ * all of them commit without any outgoing conflicts to earlier
+ * transactions then this snapshot can be deemed safe (and we can run
+ * without tracking predicate locks).
+ */
+ for (othersxact = FirstPredXact();
+ othersxact != NULL;
+ othersxact = NextPredXact(othersxact))
+ {
+ if (!SxactIsCommitted(othersxact)
+ && !SxactIsDoomed(othersxact)
+ && !SxactIsReadOnly(othersxact))
+ {
+ SetPossibleUnsafeConflict(sxact, othersxact);
+ }
+ }
+ }
+ else
+ {
+ ++(PredXact->WritableSxactCount);
+ Assert(PredXact->WritableSxactCount <=
+ (MaxBackends + max_prepared_xacts));
+ }
+
+ MySerializableXact = sxact;
+ MyXactDidWrite = false; /* haven't written anything yet */
+
+ LWLockRelease(SerializableXactHashLock);
+
+ CreateLocalPredicateLockHash();
+
+ return snapshot;
+}
+
+static void
+CreateLocalPredicateLockHash(void)
+{
+ HASHCTL hash_ctl;
+
+ /* Initialize the backend-local hash table of parent locks */
+ Assert(LocalPredicateLockHash == NULL);
+ hash_ctl.keysize = sizeof(PREDICATELOCKTARGETTAG);
+ hash_ctl.entrysize = sizeof(LOCALPREDICATELOCK);
+ LocalPredicateLockHash = hash_create("Local predicate lock",
+ max_predicate_locks_per_xact,
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS);
+}
+
+/*
+ * Register the top level XID in SerializableXidHash.
+ * Also store it for easy reference in MySerializableXact.
+ */
+void
+RegisterPredicateLockingXid(TransactionId xid)
+{
+ SERIALIZABLEXIDTAG sxidtag;
+ SERIALIZABLEXID *sxid;
+ bool found;
+
+ /*
+ * If we're not tracking predicate lock data for this transaction, we
+ * should ignore the request and return quickly.
+ */
+ if (MySerializableXact == InvalidSerializableXact)
+ return;
+
+ /* We should have a valid XID and be at the top level. */
+ Assert(TransactionIdIsValid(xid));
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /* This should only be done once per transaction. */
+ Assert(MySerializableXact->topXid == InvalidTransactionId);
+
+ MySerializableXact->topXid = xid;
+
+ sxidtag.xid = xid;
+ sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash,
+ &sxidtag,
+ HASH_ENTER, &found);
+ Assert(!found);
+
+ /* Initialize the structure. */
+ sxid->myXact = MySerializableXact;
+ LWLockRelease(SerializableXactHashLock);
+}
+
+
+/*
+ * Check whether there are any predicate locks held by any transaction
+ * for the page at the given block number.
+ *
+ * Note that the transaction may be completed but not yet subject to
+ * cleanup due to overlapping serializable transactions. This must
+ * return valid information regardless of transaction isolation level.
+ *
+ * Also note that this doesn't check for a conflicting relation lock,
+ * just a lock specifically on the given page.
+ *
+ * One use is to support proper behavior during GiST index vacuum.
+ */
+bool
+PageIsPredicateLocked(Relation relation, BlockNumber blkno)
+{
+ PREDICATELOCKTARGETTAG targettag;
+ uint32 targettaghash;
+ LWLock *partitionLock;
+ PREDICATELOCKTARGET *target;
+
+ SET_PREDICATELOCKTARGETTAG_PAGE(targettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ blkno);
+
+ targettaghash = PredicateLockTargetTagHashCode(&targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+ LWLockAcquire(partitionLock, LW_SHARED);
+ target = (PREDICATELOCKTARGET *)
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ &targettag, targettaghash,
+ HASH_FIND, NULL);
+ LWLockRelease(partitionLock);
+
+ return (target != NULL);
+}
+
+
+/*
+ * Check whether a particular lock is held by this transaction.
+ *
+ * Important note: this function may return false even if the lock is
+ * being held, because it uses the local lock table which is not
+ * updated if another transaction modifies our lock list (e.g. to
+ * split an index page). It can also return true when a coarser
+ * granularity lock that covers this target is being held. Be careful
+ * to only use this function in circumstances where such errors are
+ * acceptable!
+ */
+static bool
+PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag)
+{
+ LOCALPREDICATELOCK *lock;
+
+ /* check local hash table */
+ lock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
+ targettag,
+ HASH_FIND, NULL);
+
+ if (!lock)
+ return false;
+
+ /*
+ * Found entry in the table, but still need to check whether it's actually
+ * held -- it could just be a parent of some held lock.
+ */
+ return lock->held;
+}
+
+/*
+ * Return the parent lock tag in the lock hierarchy: the next coarser
+ * lock that covers the provided tag.
+ *
+ * Returns true and sets *parent to the parent tag if one exists,
+ * returns false if none exists.
+ */
+static bool
+GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
+ PREDICATELOCKTARGETTAG *parent)
+{
+ switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
+ {
+ case PREDLOCKTAG_RELATION:
+ /* relation locks have no parent lock */
+ return false;
+
+ case PREDLOCKTAG_PAGE:
+ /* parent lock is relation lock */
+ SET_PREDICATELOCKTARGETTAG_RELATION(*parent,
+ GET_PREDICATELOCKTARGETTAG_DB(*tag),
+ GET_PREDICATELOCKTARGETTAG_RELATION(*tag));
+
+ return true;
+
+ case PREDLOCKTAG_TUPLE:
+ /* parent lock is page lock */
+ SET_PREDICATELOCKTARGETTAG_PAGE(*parent,
+ GET_PREDICATELOCKTARGETTAG_DB(*tag),
+ GET_PREDICATELOCKTARGETTAG_RELATION(*tag),
+ GET_PREDICATELOCKTARGETTAG_PAGE(*tag));
+ return true;
+ }
+
+ /* not reachable */
+ Assert(false);
+ return false;
+}
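+
+/*
+ * Example of the hierarchy (hypothetical OIDs): a tuple tag
+ * (db 16384, rel 16385, page 7, tuple 42) has the page tag
+ * (db 16384, rel 16385, page 7) as its parent, which in turn has the
+ * relation tag (db 16384, rel 16385) as its parent; relation tags have no
+ * parent.
+ */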
+
+/*
+ * Check whether the lock we are considering is already covered by a
+ * coarser lock for our transaction.
+ *
+ * Like PredicateLockExists, this function might return a false
+ * negative, but it will never return a false positive.
+ */
+static bool
+CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag)
+{
+ PREDICATELOCKTARGETTAG targettag,
+ parenttag;
+
+ targettag = *newtargettag;
+
+ /* check parents iteratively until no more */
+ while (GetParentPredicateLockTag(&targettag, &parenttag))
+ {
+ targettag = parenttag;
+ if (PredicateLockExists(&targettag))
+ return true;
+ }
+
+ /* no more parents to check; lock is not covered */
+ return false;
+}
+
+/*
+ * Remove the dummy entry from the predicate lock target hash, to free up some
+ * scratch space. The caller must be holding SerializablePredicateListLock,
+ * and must restore the entry with RestoreScratchTarget() before releasing the
+ * lock.
+ *
+ * If lockheld is true, the caller is already holding the partition lock
+ * of the partition containing the scratch entry.
+ */
+static void
+RemoveScratchTarget(bool lockheld)
+{
+ bool found;
+
+ Assert(LWLockHeldByMe(SerializablePredicateListLock));
+
+ if (!lockheld)
+ LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE);
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ &ScratchTargetTag,
+ ScratchTargetTagHash,
+ HASH_REMOVE, &found);
+ Assert(found);
+ if (!lockheld)
+ LWLockRelease(ScratchPartitionLock);
+}
+
+/*
+ * Re-insert the dummy entry in predicate lock target hash.
+ */
+static void
+RestoreScratchTarget(bool lockheld)
+{
+ bool found;
+
+ Assert(LWLockHeldByMe(SerializablePredicateListLock));
+
+ if (!lockheld)
+ LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE);
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ &ScratchTargetTag,
+ ScratchTargetTagHash,
+ HASH_ENTER, &found);
+ Assert(!found);
+ if (!lockheld)
+ LWLockRelease(ScratchPartitionLock);
+}
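+
+/*
+ * Usage sketch: code that needs a guaranteed-free element in the target
+ * hash (for example to split or combine a target) removes the scratch
+ * entry, performs its insertion knowing at least one free element is
+ * available, and then restores the scratch entry before releasing
+ * SerializablePredicateListLock:
+ *
+ *     RemoveScratchTarget(false);
+ *     ... hash_search_with_hash_value(..., HASH_ENTER, ...) ...
+ *     RestoreScratchTarget(false);
+ */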
+
+/*
+ * Check whether the list of related predicate locks is empty for a
+ * predicate lock target, and remove the target if it is.
+ */
+static void
+RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash)
+{
+ PREDICATELOCKTARGET *rmtarget PG_USED_FOR_ASSERTS_ONLY;
+
+ Assert(LWLockHeldByMe(SerializablePredicateListLock));
+
+ /* Can't remove it until no locks at this target. */
+ if (!SHMQueueEmpty(&target->predicateLocks))
+ return;
+
+ /* Actually remove the target. */
+ rmtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &target->tag,
+ targettaghash,
+ HASH_REMOVE, NULL);
+ Assert(rmtarget == target);
+}
+
+/*
+ * Delete child target locks owned by this process.
+ * This implementation assumes that the usage of each target tag field is
+ * uniform. No need to make this harder than we have to.
+ *
+ * We acquire an LWLock in the case of parallel mode, because worker
+ * backends have access to the leader's SERIALIZABLEXACT. Otherwise,
+ * we aren't acquiring LWLocks for the predicate lock or lock
+ * target structures associated with this transaction unless we're going
+ * to modify them, because no other process is permitted to modify our
+ * locks.
+ */
+static void
+DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag)
+{
+ SERIALIZABLEXACT *sxact;
+ PREDICATELOCK *predlock;
+
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+ sxact = MySerializableXact;
+ if (IsInParallelMode())
+ LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE);
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ &(sxact->predicateLocks),
+ offsetof(PREDICATELOCK, xactLink));
+ while (predlock)
+ {
+ SHM_QUEUE *predlocksxactlink;
+ PREDICATELOCK *nextpredlock;
+ PREDICATELOCKTAG oldlocktag;
+ PREDICATELOCKTARGET *oldtarget;
+ PREDICATELOCKTARGETTAG oldtargettag;
+
+ predlocksxactlink = &(predlock->xactLink);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ predlocksxactlink,
+ offsetof(PREDICATELOCK, xactLink));
+
+ oldlocktag = predlock->tag;
+ Assert(oldlocktag.myXact == sxact);
+ oldtarget = oldlocktag.myTarget;
+ oldtargettag = oldtarget->tag;
+
+ if (TargetTagIsCoveredBy(oldtargettag, *newtargettag))
+ {
+ uint32 oldtargettaghash;
+ LWLock *partitionLock;
+ PREDICATELOCK *rmpredlock PG_USED_FOR_ASSERTS_ONLY;
+
+ oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
+ partitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ SHMQueueDelete(predlocksxactlink);
+ SHMQueueDelete(&(predlock->targetLink));
+ rmpredlock = hash_search_with_hash_value
+ (PredicateLockHash,
+ &oldlocktag,
+ PredicateLockHashCodeFromTargetHashCode(&oldlocktag,
+ oldtargettaghash),
+ HASH_REMOVE, NULL);
+ Assert(rmpredlock == predlock);
+
+ RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash);
+
+ LWLockRelease(partitionLock);
+
+ DecrementParentLocks(&oldtargettag);
+ }
+
+ predlock = nextpredlock;
+ }
+ if (IsInParallelMode())
+ LWLockRelease(&sxact->perXactPredicateListLock);
+ LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ * Returns the promotion limit for a given predicate lock target. This is the
+ * max number of descendant locks allowed before promoting to the specified
+ * tag. Note that the limit includes non-direct descendants (e.g., both tuples
+ * and pages for a relation lock).
+ *
+ * Currently the default limit is 2 for a page lock, and half of the value of
+ * max_pred_locks_per_transaction - 1 for a relation lock, to match behavior
+ * of earlier releases when upgrading.
+ *
+ * TODO SSI: We should probably add additional GUCs to allow a maximum ratio
+ * of page and tuple locks based on the pages in a relation, and the maximum
+ * ratio of tuple locks to tuples in a page. This would provide more
+ * generally "balanced" allocation of locks to where they are most useful,
+ * while still allowing the absolute numbers to prevent one relation from
+ * tying up all predicate lock resources.
+ */
+static int
+MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag)
+{
+ switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
+ {
+ case PREDLOCKTAG_RELATION:
+ return max_predicate_locks_per_relation < 0
+ ? (max_predicate_locks_per_xact
+ / (-max_predicate_locks_per_relation)) - 1
+ : max_predicate_locks_per_relation;
+
+ case PREDLOCKTAG_PAGE:
+ return max_predicate_locks_per_page;
+
+ case PREDLOCKTAG_TUPLE:
+
+ /*
+ * not reachable: nothing is finer-granularity than a tuple, so we
+ * should never try to promote to it.
+ */
+ Assert(false);
+ return 0;
+ }
+
+ /* not reachable */
+ Assert(false);
+ return 0;
+}
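+
+/*
+ * Worked example with the shipped defaults
+ * (max_pred_locks_per_transaction = 64, max_pred_locks_per_page = 2,
+ * max_pred_locks_per_relation = -2): a tuple-to-page promotion triggers
+ * once more than 2 tuples on a page are locked, and a page-to-relation
+ * promotion triggers once more than 64 / 2 - 1 = 31 descendant locks
+ * accumulate under the relation.
+ */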
+
+/*
+ * For all ancestors of a newly-acquired predicate lock, increment
+ * their child count in the parent hash table. If any of them have
+ * more descendants than their promotion threshold, acquire the
+ * coarsest such lock.
+ *
+ * Returns true if a parent lock was acquired and false otherwise.
+ */
+static bool
+CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag)
+{
+ PREDICATELOCKTARGETTAG targettag,
+ nexttag,
+ promotiontag;
+ LOCALPREDICATELOCK *parentlock;
+ bool found,
+ promote;
+
+ promote = false;
+
+ targettag = *reqtag;
+
+ /* check parents iteratively */
+ while (GetParentPredicateLockTag(&targettag, &nexttag))
+ {
+ targettag = nexttag;
+ parentlock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
+ &targettag,
+ HASH_ENTER,
+ &found);
+ if (!found)
+ {
+ parentlock->held = false;
+ parentlock->childLocks = 1;
+ }
+ else
+ parentlock->childLocks++;
+
+ if (parentlock->childLocks >
+ MaxPredicateChildLocks(&targettag))
+ {
+ /*
+ * We should promote to this parent lock. Continue to check its
+ * ancestors, however, both to get their child counts right and to
+ * check whether we should just go ahead and promote to one of
+ * them.
+ */
+ promotiontag = targettag;
+ promote = true;
+ }
+ }
+
+ if (promote)
+ {
+ /* acquire coarsest ancestor eligible for promotion */
+ PredicateLockAcquire(&promotiontag);
+ return true;
+ }
+ else
+ return false;
+}
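+
+/*
+ * Illustrative walk-through: with max_pred_locks_per_page = 2, locking a
+ * third tuple on the same heap page pushes the page entry's childLocks
+ * count past its limit, so PredicateLockAcquire() is re-entered for the
+ * page tag; that recursive acquisition then discards the now-redundant
+ * tuple locks via DeleteChildTargetLocks().
+ */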
+
+/*
+ * When releasing a lock, decrement the child count on all ancestor
+ * locks.
+ *
+ * This is called only when releasing a lock via
+ * DeleteChildTargetLocks (i.e. when a lock becomes redundant because
+ * we've acquired its parent, possibly due to promotion) or when a new
+ * MVCC write lock makes the predicate lock unnecessary. There's no
+ * point in calling it when locks are released at transaction end, as
+ * this information is no longer needed.
+ */
+static void
+DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag)
+{
+ PREDICATELOCKTARGETTAG parenttag,
+ nexttag;
+
+ parenttag = *targettag;
+
+ while (GetParentPredicateLockTag(&parenttag, &nexttag))
+ {
+ uint32 targettaghash;
+ LOCALPREDICATELOCK *parentlock,
+ *rmlock PG_USED_FOR_ASSERTS_ONLY;
+
+ parenttag = nexttag;
+ targettaghash = PredicateLockTargetTagHashCode(&parenttag);
+ parentlock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ &parenttag, targettaghash,
+ HASH_FIND, NULL);
+
+ /*
+ * There's a small chance the parent lock doesn't exist in the lock
+ * table. This can happen if we prematurely removed it because an
+ * index split caused the child refcount to be off.
+ */
+ if (parentlock == NULL)
+ continue;
+
+ parentlock->childLocks--;
+
+ /*
+ * Under similar circumstances the parent lock's refcount might be
+ * zero. This only happens if we're holding that lock (otherwise we
+ * would have removed the entry).
+ */
+ if (parentlock->childLocks < 0)
+ {
+ Assert(parentlock->held);
+ parentlock->childLocks = 0;
+ }
+
+ if ((parentlock->childLocks == 0) && (!parentlock->held))
+ {
+ rmlock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ &parenttag, targettaghash,
+ HASH_REMOVE, NULL);
+ Assert(rmlock == parentlock);
+ }
+ }
+}
+
+/*
+ * Indicate that a predicate lock on the given target is held by the
+ * specified transaction. Has no effect if the lock is already held.
+ *
+ * This updates the lock table and the sxact's lock list, and creates
+ * the lock target if necessary, but does *not* do anything related to
+ * granularity promotion or the local lock table. See
+ * PredicateLockAcquire for that.
+ */
+static void
+CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
+ uint32 targettaghash,
+ SERIALIZABLEXACT *sxact)
+{
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCKTAG locktag;
+ PREDICATELOCK *lock;
+ LWLock *partitionLock;
+ bool found;
+
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+ if (IsInParallelMode())
+ LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /* Make sure that the target is represented. */
+ target = (PREDICATELOCKTARGET *)
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ targettag, targettaghash,
+ HASH_ENTER_NULL, &found);
+ if (!target)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_pred_locks_per_transaction.")));
+ if (!found)
+ SHMQueueInit(&(target->predicateLocks));
+
+ /* We've got the sxact and target, make sure they're joined. */
+ locktag.myTarget = target;
+ locktag.myXact = sxact;
+ lock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash, &locktag,
+ PredicateLockHashCodeFromTargetHashCode(&locktag, targettaghash),
+ HASH_ENTER_NULL, &found);
+ if (!lock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_pred_locks_per_transaction.")));
+
+ if (!found)
+ {
+ SHMQueueInsertBefore(&(target->predicateLocks), &(lock->targetLink));
+ SHMQueueInsertBefore(&(sxact->predicateLocks),
+ &(lock->xactLink));
+ lock->commitSeqNo = InvalidSerCommitSeqNo;
+ }
+
+ LWLockRelease(partitionLock);
+ if (IsInParallelMode())
+ LWLockRelease(&sxact->perXactPredicateListLock);
+ LWLockRelease(SerializablePredicateListLock);
+}
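+
+/*
+ * Note the lock ordering used above, which other paths in this file that
+ * touch a transaction's predicate-lock list also follow:
+ * SerializablePredicateListLock (shared), then the sxact's
+ * perXactPredicateListLock when in parallel mode, then the target's hash
+ * partition lock; releases happen in the reverse order.
+ */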
+
+/*
+ * Acquire a predicate lock on the specified target for the current
+ * connection if not already held. This updates the local lock table
+ * and uses it to implement granularity promotion. It will consolidate
+ * multiple locks into a coarser lock if warranted, and will release
+ * any finer-grained locks covered by the new one.
+ */
+static void
+PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag)
+{
+ uint32 targettaghash;
+ bool found;
+ LOCALPREDICATELOCK *locallock;
+
+ /* Do we have the lock already, or a covering lock? */
+ if (PredicateLockExists(targettag))
+ return;
+
+ if (CoarserLockCovers(targettag))
+ return;
+
+ /* the same hash and LW lock apply to the lock target and the local lock. */
+ targettaghash = PredicateLockTargetTagHashCode(targettag);
+
+ /* Acquire lock in local table */
+ locallock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ targettag, targettaghash,
+ HASH_ENTER, &found);
+ locallock->held = true;
+ if (!found)
+ locallock->childLocks = 0;
+
+ /* Actually create the lock */
+ CreatePredicateLock(targettag, targettaghash, MySerializableXact);
+
+ /*
+ * Lock has been acquired. Check whether it should be promoted to a
+ * coarser granularity, or whether there are finer-granularity locks to
+ * clean up.
+ */
+ if (CheckAndPromotePredicateLockRequest(targettag))
+ {
+ /*
+ * Lock request was promoted to a coarser-granularity lock, and that
+ * lock was acquired. It will delete this lock and any of its
+ * children, so we're done.
+ */
+ }
+ else
+ {
+ /* Clean up any finer-granularity locks */
+ if (GET_PREDICATELOCKTARGETTAG_TYPE(*targettag) != PREDLOCKTAG_TUPLE)
+ DeleteChildTargetLocks(targettag);
+ }
+}
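+
+/*
+ * A minimal sketch of how the pieces above fit together for a single tuple
+ * read under SERIALIZABLE (the tag component names are hypothetical):
+ *
+ *     PREDICATELOCKTARGETTAG tag;
+ *
+ *     SET_PREDICATELOCKTARGETTAG_TUPLE(tag, dbOid, relOid, blkno, offnum);
+ *     PredicateLockAcquire(&tag);
+ *
+ * PredicateLockAcquire() first consults the backend-local table
+ * (PredicateLockExists/CoarserLockCovers), then records the lock in shared
+ * memory (CreatePredicateLock), and finally either promotes to a coarser
+ * lock or prunes finer-grained children, as coded above.
+ */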
+
+
+/*
+ * PredicateLockRelation
+ *
+ * Gets a predicate lock at the relation level.
+ * Skip if not in full serializable transaction isolation level.
+ * Skip if this is a temporary table.
+ * Clear any finer-grained predicate locks this session has on the relation.
+ */
+void
+PredicateLockRelation(Relation relation, Snapshot snapshot)
+{
+ PREDICATELOCKTARGETTAG tag;
+
+ if (!SerializationNeededForRead(relation, snapshot))
+ return;
+
+ SET_PREDICATELOCKTARGETTAG_RELATION(tag,
+ relation->rd_node.dbNode,
+ relation->rd_id);
+ PredicateLockAcquire(&tag);
+}
+
+/*
+ * PredicateLockPage
+ *
+ * Gets a predicate lock at the page level.
+ * Skip if not in full serializable transaction isolation level.
+ * Skip if this is a temporary table.
+ * Skip if a coarser predicate lock already covers this page.
+ * Clear any finer-grained predicate locks this session has on the relation.
+ */
+void
+PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
+{
+ PREDICATELOCKTARGETTAG tag;
+
+ if (!SerializationNeededForRead(relation, snapshot))
+ return;
+
+ SET_PREDICATELOCKTARGETTAG_PAGE(tag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ blkno);
+ PredicateLockAcquire(&tag);
+}
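+
+/*
+ * For illustration (not a call site in this file): an index AM scanning a
+ * leaf page under SERIALIZABLE would typically take the page-level SIREAD
+ * lock roughly like this, with 'rel', 'buf' and 'snapshot' being whatever
+ * the scan already has at hand:
+ *
+ *     PredicateLockPage(rel, BufferGetBlockNumber(buf), snapshot);
+ */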
+
+/*
+ * PredicateLockTID
+ *
+ * Gets a predicate lock at the tuple level.
+ * Skip if not in full serializable transaction isolation level.
+ * Skip if this is a temporary table.
+ */
+void
+PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot,
+ TransactionId tuple_xid)
+{
+ PREDICATELOCKTARGETTAG tag;
+
+ if (!SerializationNeededForRead(relation, snapshot))
+ return;
+
+ /*
+ * Return if this xact wrote it.
+ */
+ if (relation->rd_index == NULL)
+ {
+		/* If we wrote it, we already have a write lock. */
+ if (TransactionIdIsCurrentTransactionId(tuple_xid))
+ return;
+ }
+
+ /*
+ * Do quick-but-not-definitive test for a relation lock first. This will
+ * never cause a return when the relation is *not* locked, but will
+ * occasionally let the check continue when there really *is* a relation
+ * level lock.
+ */
+ SET_PREDICATELOCKTARGETTAG_RELATION(tag,
+ relation->rd_node.dbNode,
+ relation->rd_id);
+ if (PredicateLockExists(&tag))
+ return;
+
+ SET_PREDICATELOCKTARGETTAG_TUPLE(tag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+ PredicateLockAcquire(&tag);
+}
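+
+/*
+ * For illustration (not a call site in this file): a table AM fetching a
+ * visible tuple under SERIALIZABLE would take the tuple-level SIREAD lock
+ * along these lines, where 'tuple' is the fetched heap tuple and 'xid' its
+ * inserting transaction (both names hypothetical):
+ *
+ *     PredicateLockTID(relation, &tuple->t_self, snapshot, xid);
+ */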
+
+
+/*
+ * DeleteLockTarget
+ *
+ * Remove a predicate lock target along with any locks held for it.
+ *
+ * Caller must hold SerializablePredicateListLock and the
+ * appropriate hash partition lock for the target.
+ */
+static void
+DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash)
+{
+ PREDICATELOCK *predlock;
+ SHM_QUEUE *predlocktargetlink;
+ PREDICATELOCK *nextpredlock;
+ bool found;
+
+ Assert(LWLockHeldByMeInMode(SerializablePredicateListLock,
+ LW_EXCLUSIVE));
+ Assert(LWLockHeldByMe(PredicateLockHashPartitionLock(targettaghash)));
+
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ &(target->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ while (predlock)
+ {
+ predlocktargetlink = &(predlock->targetLink);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ predlocktargetlink,
+ offsetof(PREDICATELOCK, targetLink));
+
+ SHMQueueDelete(&(predlock->xactLink));
+ SHMQueueDelete(&(predlock->targetLink));
+
+ hash_search_with_hash_value
+ (PredicateLockHash,
+ &predlock->tag,
+ PredicateLockHashCodeFromTargetHashCode(&predlock->tag,
+ targettaghash),
+ HASH_REMOVE, &found);
+ Assert(found);
+
+ predlock = nextpredlock;
+ }
+ LWLockRelease(SerializableXactHashLock);
+
+ /* Remove the target itself, if possible. */
+ RemoveTargetIfNoLongerUsed(target, targettaghash);
+}
+
+
+/*
+ * TransferPredicateLocksToNewTarget
+ *
+ * Move or copy all the predicate locks for a lock target, for use by
+ * index page splits/combines and other things that create or replace
+ * lock targets. If 'removeOld' is true, the old locks and the target
+ * will be removed.
+ *
+ * Returns true on success, or false if we ran out of shared memory to
+ * allocate the new target or locks. Guaranteed to always succeed if
+ * removeOld is set (by using the scratch entry in PredicateLockTargetHash
+ * for scratch space).
+ *
+ * Warning: the "removeOld" option should be used only with care,
+ * because this function does not (indeed, can not) update other
+ * backends' LocalPredicateLockHash. If we are only adding new
+ * entries, this is not a problem: the local lock table is used only
+ * as a hint, so missing entries for locks that are held are
+ * OK. Having entries for locks that are no longer held, as can happen
+ * when using "removeOld", is not in general OK. We can only use it
+ * safely when replacing a lock with a coarser-granularity lock that
+ * covers it, or if we are absolutely certain that no one will need to
+ * refer to that lock in the future.
+ *
+ * Caller must hold SerializablePredicateListLock exclusively.
+ */
+static bool
+TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag,
+ PREDICATELOCKTARGETTAG newtargettag,
+ bool removeOld)
+{
+ uint32 oldtargettaghash;
+ LWLock *oldpartitionLock;
+ PREDICATELOCKTARGET *oldtarget;
+ uint32 newtargettaghash;
+ LWLock *newpartitionLock;
+ bool found;
+ bool outOfShmem = false;
+
+ Assert(LWLockHeldByMeInMode(SerializablePredicateListLock,
+ LW_EXCLUSIVE));
+
+ oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
+ newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag);
+ oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+ newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash);
+
+ if (removeOld)
+ {
+ /*
+ * Remove the dummy entry to give us scratch space, so we know we'll
+ * be able to create the new lock target.
+ */
+ RemoveScratchTarget(false);
+ }
+
+ /*
+ * We must get the partition locks in ascending sequence to avoid
+ * deadlocks. If old and new partitions are the same, we must request the
+ * lock only once.
+ */
+ if (oldpartitionLock < newpartitionLock)
+ {
+ LWLockAcquire(oldpartitionLock,
+ (removeOld ? LW_EXCLUSIVE : LW_SHARED));
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ }
+ else if (oldpartitionLock > newpartitionLock)
+ {
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(oldpartitionLock,
+ (removeOld ? LW_EXCLUSIVE : LW_SHARED));
+ }
+ else
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Look for the old target. If not found, that's OK; no predicate locks
+ * are affected, so we can just clean up and return. If it does exist,
+ * walk its list of predicate locks and move or copy them to the new
+ * target.
+ */
+ oldtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &oldtargettag,
+ oldtargettaghash,
+ HASH_FIND, NULL);
+
+ if (oldtarget)
+ {
+ PREDICATELOCKTARGET *newtarget;
+ PREDICATELOCK *oldpredlock;
+ PREDICATELOCKTAG newpredlocktag;
+
+ newtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &newtargettag,
+ newtargettaghash,
+ HASH_ENTER_NULL, &found);
+
+ if (!newtarget)
+ {
+ /* Failed to allocate due to insufficient shmem */
+ outOfShmem = true;
+ goto exit;
+ }
+
+ /* If we created a new entry, initialize it */
+ if (!found)
+ SHMQueueInit(&(newtarget->predicateLocks));
+
+ newpredlocktag.myTarget = newtarget;
+
+ /*
+ * Loop through all the locks on the old target, replacing them with
+ * locks on the new target.
+ */
+ oldpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ &(oldtarget->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ while (oldpredlock)
+ {
+ SHM_QUEUE *predlocktargetlink;
+ PREDICATELOCK *nextpredlock;
+ PREDICATELOCK *newpredlock;
+ SerCommitSeqNo oldCommitSeqNo = oldpredlock->commitSeqNo;
+
+ predlocktargetlink = &(oldpredlock->targetLink);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ predlocktargetlink,
+ offsetof(PREDICATELOCK, targetLink));
+ newpredlocktag.myXact = oldpredlock->tag.myXact;
+
+ if (removeOld)
+ {
+ SHMQueueDelete(&(oldpredlock->xactLink));
+ SHMQueueDelete(&(oldpredlock->targetLink));
+
+ hash_search_with_hash_value
+ (PredicateLockHash,
+ &oldpredlock->tag,
+ PredicateLockHashCodeFromTargetHashCode(&oldpredlock->tag,
+ oldtargettaghash),
+ HASH_REMOVE, &found);
+ Assert(found);
+ }
+
+ newpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &newpredlocktag,
+ PredicateLockHashCodeFromTargetHashCode(&newpredlocktag,
+ newtargettaghash),
+ HASH_ENTER_NULL,
+ &found);
+ if (!newpredlock)
+ {
+ /* Out of shared memory. Undo what we've done so far. */
+ LWLockRelease(SerializableXactHashLock);
+ DeleteLockTarget(newtarget, newtargettaghash);
+ outOfShmem = true;
+ goto exit;
+ }
+ if (!found)
+ {
+ SHMQueueInsertBefore(&(newtarget->predicateLocks),
+ &(newpredlock->targetLink));
+ SHMQueueInsertBefore(&(newpredlocktag.myXact->predicateLocks),
+ &(newpredlock->xactLink));
+ newpredlock->commitSeqNo = oldCommitSeqNo;
+ }
+ else
+ {
+ if (newpredlock->commitSeqNo < oldCommitSeqNo)
+ newpredlock->commitSeqNo = oldCommitSeqNo;
+ }
+
+ Assert(newpredlock->commitSeqNo != 0);
+ Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo)
+ || (newpredlock->tag.myXact == OldCommittedSxact));
+
+ oldpredlock = nextpredlock;
+ }
+ LWLockRelease(SerializableXactHashLock);
+
+ if (removeOld)
+ {
+ Assert(SHMQueueEmpty(&oldtarget->predicateLocks));
+ RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash);
+ }
+ }
+
+
+exit:
+ /* Release partition locks in reverse order of acquisition. */
+ if (oldpartitionLock < newpartitionLock)
+ {
+ LWLockRelease(newpartitionLock);
+ LWLockRelease(oldpartitionLock);
+ }
+ else if (oldpartitionLock > newpartitionLock)
+ {
+ LWLockRelease(oldpartitionLock);
+ LWLockRelease(newpartitionLock);
+ }
+ else
+ LWLockRelease(newpartitionLock);
+
+ if (removeOld)
+ {
+ /* We shouldn't run out of memory if we're moving locks */
+ Assert(!outOfShmem);
+
+ /* Put the scratch entry back */
+ RestoreScratchTarget(false);
+ }
+
+ return !outOfShmem;
+}
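+
+/*
+ * The partition-lock dance above is the usual deadlock-avoidance pattern
+ * for taking two locks whose relative order isn't known in advance: always
+ * acquire in a fixed (here, ascending pointer) order, and collapse to a
+ * single acquisition when both tags hash to the same partition, since an
+ * LWLock must not be acquired twice by the same backend.
+ */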
+
+/*
+ * Drop all predicate locks of any granularity from the specified relation,
+ * which can be a heap relation or an index relation. If 'transfer' is true,
+ * acquire a relation lock on the heap for any transactions with any lock(s)
+ * on the specified relation.
+ *
+ * This requires grabbing a lot of LW locks and scanning the entire lock
+ * target table for matches. That makes this more expensive than most
+ * predicate lock management functions, but it will only be called for DDL
+ * type commands that are expensive anyway, and there are fast returns when
+ * no serializable transactions are active or the relation is temporary.
+ *
+ * We don't use the TransferPredicateLocksToNewTarget function because it
+ * acquires its own locks on the partitions of the two targets involved,
+ * and we'll already be holding all partition locks.
+ *
+ * We can't throw an error from here, because the call could be from a
+ * transaction which is not serializable.
+ *
+ * NOTE: This is currently only called with transfer set to true, but that may
+ * change. If we decide to clean up the locks from a table on commit of a
+ * transaction which executed DROP TABLE, the false condition will be useful.
+ */
+static void
+DropAllPredicateLocksFromTable(Relation relation, bool transfer)
+{
+ HASH_SEQ_STATUS seqstat;
+ PREDICATELOCKTARGET *oldtarget;
+ PREDICATELOCKTARGET *heaptarget;
+ Oid dbId;
+ Oid relId;
+ Oid heapId;
+ int i;
+ bool isIndex;
+ bool found;
+ uint32 heaptargettaghash;
+
+ /*
+ * Bail out quickly if there are no serializable transactions running.
+ * It's safe to check this without taking locks because the caller is
+ * holding an ACCESS EXCLUSIVE lock on the relation. No new locks which
+ * would matter here can be acquired while that is held.
+ */
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
+ return;
+
+ if (!PredicateLockingNeededForRelation(relation))
+ return;
+
+ dbId = relation->rd_node.dbNode;
+ relId = relation->rd_id;
+ if (relation->rd_index == NULL)
+ {
+ isIndex = false;
+ heapId = relId;
+ }
+ else
+ {
+ isIndex = true;
+ heapId = relation->rd_index->indrelid;
+ }
+ Assert(heapId != InvalidOid);
+ Assert(transfer || !isIndex); /* index OID only makes sense with
+ * transfer */
+
+ /* Retrieve first time needed, then keep. */
+ heaptargettaghash = 0;
+ heaptarget = NULL;
+
+ /* Acquire locks on all lock partitions */
+ LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
+ for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
+ LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_EXCLUSIVE);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Remove the dummy entry to give us scratch space, so we know we'll be
+ * able to create the new lock target.
+ */
+ if (transfer)
+ RemoveScratchTarget(true);
+
+ /* Scan through target map */
+ hash_seq_init(&seqstat, PredicateLockTargetHash);
+
+ while ((oldtarget = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
+ {
+ PREDICATELOCK *oldpredlock;
+
+ /*
+ * Check whether this is a target which needs attention.
+ */
+ if (GET_PREDICATELOCKTARGETTAG_RELATION(oldtarget->tag) != relId)
+ continue; /* wrong relation id */
+ if (GET_PREDICATELOCKTARGETTAG_DB(oldtarget->tag) != dbId)
+ continue; /* wrong database id */
+ if (transfer && !isIndex
+ && GET_PREDICATELOCKTARGETTAG_TYPE(oldtarget->tag) == PREDLOCKTAG_RELATION)
+ continue; /* already the right lock */
+
+ /*
+ * If we made it here, we have work to do. We make sure the heap
+ * relation lock exists, then we walk the list of predicate locks for
+ * the old target we found, moving all locks to the heap relation lock
+ * -- unless they already hold that.
+ */
+
+ /*
+ * First make sure we have the heap relation target. We only need to
+ * do this once.
+ */
+ if (transfer && heaptarget == NULL)
+ {
+ PREDICATELOCKTARGETTAG heaptargettag;
+
+ SET_PREDICATELOCKTARGETTAG_RELATION(heaptargettag, dbId, heapId);
+ heaptargettaghash = PredicateLockTargetTagHashCode(&heaptargettag);
+ heaptarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &heaptargettag,
+ heaptargettaghash,
+ HASH_ENTER, &found);
+ if (!found)
+ SHMQueueInit(&heaptarget->predicateLocks);
+ }
+
+ /*
+ * Loop through all the locks on the old target, replacing them with
+ * locks on the new target.
+ */
+ oldpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ &(oldtarget->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ while (oldpredlock)
+ {
+ PREDICATELOCK *nextpredlock;
+ PREDICATELOCK *newpredlock;
+ SerCommitSeqNo oldCommitSeqNo;
+ SERIALIZABLEXACT *oldXact;
+
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ &(oldpredlock->targetLink),
+ offsetof(PREDICATELOCK, targetLink));
+
+ /*
+ * Remove the old lock first. This avoids the chance of running
+ * out of lock structure entries for the hash table.
+ */
+ oldCommitSeqNo = oldpredlock->commitSeqNo;
+ oldXact = oldpredlock->tag.myXact;
+
+ SHMQueueDelete(&(oldpredlock->xactLink));
+
+ /*
+ * No need for retail delete from oldtarget list, we're removing
+ * the whole target anyway.
+ */
+ hash_search(PredicateLockHash,
+ &oldpredlock->tag,
+ HASH_REMOVE, &found);
+ Assert(found);
+
+ if (transfer)
+ {
+ PREDICATELOCKTAG newpredlocktag;
+
+ newpredlocktag.myTarget = heaptarget;
+ newpredlocktag.myXact = oldXact;
+ newpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &newpredlocktag,
+ PredicateLockHashCodeFromTargetHashCode(&newpredlocktag,
+ heaptargettaghash),
+ HASH_ENTER,
+ &found);
+ if (!found)
+ {
+ SHMQueueInsertBefore(&(heaptarget->predicateLocks),
+ &(newpredlock->targetLink));
+ SHMQueueInsertBefore(&(newpredlocktag.myXact->predicateLocks),
+ &(newpredlock->xactLink));
+ newpredlock->commitSeqNo = oldCommitSeqNo;
+ }
+ else
+ {
+ if (newpredlock->commitSeqNo < oldCommitSeqNo)
+ newpredlock->commitSeqNo = oldCommitSeqNo;
+ }
+
+ Assert(newpredlock->commitSeqNo != 0);
+ Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo)
+ || (newpredlock->tag.myXact == OldCommittedSxact));
+ }
+
+ oldpredlock = nextpredlock;
+ }
+
+ hash_search(PredicateLockTargetHash, &oldtarget->tag, HASH_REMOVE,
+ &found);
+ Assert(found);
+ }
+
+ /* Put the scratch entry back */
+ if (transfer)
+ RestoreScratchTarget(true);
+
+ /* Release locks in reverse order */
+ LWLockRelease(SerializableXactHashLock);
+ for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
+ LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
+ LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ * TransferPredicateLocksToHeapRelation
+ * For all transactions, transfer all predicate locks for the given
+ * relation to a single relation lock on the heap.
+ */
+void
+TransferPredicateLocksToHeapRelation(Relation relation)
+{
+ DropAllPredicateLocksFromTable(relation, true);
+}
+
+
+/*
+ * PredicateLockPageSplit
+ *
+ * Copies any predicate locks for the old page to the new page.
+ * Skip if this is a temporary table or toast table.
+ *
+ * NOTE: A page split (or overflow) affects all serializable transactions,
+ * even if it occurs in the context of another transaction isolation level.
+ *
+ * NOTE: This currently leaves the local copy of the locks without
+ * information on the new lock, which exists only in shared memory. This
+ * could cause problems if enough page splits occur on locked pages without
+ * the processes that hold the locks ever getting a chance to notice.
+ */
+void
+PredicateLockPageSplit(Relation relation, BlockNumber oldblkno,
+ BlockNumber newblkno)
+{
+ PREDICATELOCKTARGETTAG oldtargettag;
+ PREDICATELOCKTARGETTAG newtargettag;
+ bool success;
+
+ /*
+ * Bail out quickly if there are no serializable transactions running.
+ *
+ * It's safe to do this check without taking any additional locks. Even if
+ * a serializable transaction starts concurrently, we know it can't take
+ * any SIREAD locks on the page being split because the caller is holding
+ * the associated buffer page lock. Memory reordering isn't an issue; the
+ * memory barrier in the LWLock acquisition guarantees that this read
+ * occurs while the buffer page lock is held.
+ */
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
+ return;
+
+ if (!PredicateLockingNeededForRelation(relation))
+ return;
+
+ Assert(oldblkno != newblkno);
+ Assert(BlockNumberIsValid(oldblkno));
+ Assert(BlockNumberIsValid(newblkno));
+
+ SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ oldblkno);
+ SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ newblkno);
+
+ LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
+
+ /*
+ * Try copying the locks over to the new page's tag, creating it if
+ * necessary.
+ */
+ success = TransferPredicateLocksToNewTarget(oldtargettag,
+ newtargettag,
+ false);
+
+ if (!success)
+ {
+ /*
+ * No more predicate lock entries are available. Failure isn't an
+ * option here, so promote the page lock to a relation lock.
+ */
+
+ /* Get the parent relation lock's lock tag */
+ success = GetParentPredicateLockTag(&oldtargettag,
+ &newtargettag);
+ Assert(success);
+
+ /*
+ * Move the locks to the parent. This shouldn't fail.
+ *
+ * Note that here we are removing locks held by other backends,
+ * leading to a possible inconsistency in their local lock hash table.
+ * This is OK because we're replacing it with a lock that covers the
+ * old one.
+ */
+ success = TransferPredicateLocksToNewTarget(oldtargettag,
+ newtargettag,
+ true);
+ Assert(success);
+ }
+
+ LWLockRelease(SerializablePredicateListLock);
+}
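+
+/*
+ * For illustration (not a call site in this file): an index AM splitting a
+ * page that serializable transactions may have SIREAD locks on is expected
+ * to call this with the old and new block numbers, e.g.
+ *
+ *     PredicateLockPageSplit(rel,
+ *                            BufferGetBlockNumber(origbuf),
+ *                            BufferGetBlockNumber(newbuf));
+ *
+ * so that existing page locks also cover the entries that moved to the new
+ * page ('origbuf' and 'newbuf' are hypothetical names).
+ */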
+
+/*
+ * PredicateLockPageCombine
+ *
+ * Combines predicate locks for two existing pages.
+ * Skip if this is a temporary table or toast table.
+ *
+ * NOTE: A page combine affects all serializable transactions, even if it
+ * occurs in the context of another transaction isolation level.
+ */
+void
+PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
+ BlockNumber newblkno)
+{
+ /*
+ * Page combines differ from page splits in that we ought to be able to
+ * remove the locks on the old page after transferring them to the new
+ * page, instead of duplicating them. However, because we can't edit other
+ * backends' local lock tables, removing the old lock would leave them
+ * with an entry in their LocalPredicateLockHash for a lock they're not
+ * holding, which isn't acceptable. So we wind up having to do the same
+ * work as a page split, acquiring a lock on the new page and keeping the
+ * old page locked too. That can lead to some false positives, but should
+ * be rare in practice.
+ */
+ PredicateLockPageSplit(relation, oldblkno, newblkno);
+}
+
+/*
+ * Walk the list of in-progress serializable transactions and find the new
+ * xmin.
+ */
+static void
+SetNewSxactGlobalXmin(void)
+{
+ SERIALIZABLEXACT *sxact;
+
+ Assert(LWLockHeldByMe(SerializableXactHashLock));
+
+ PredXact->SxactGlobalXmin = InvalidTransactionId;
+ PredXact->SxactGlobalXminCount = 0;
+
+ for (sxact = FirstPredXact(); sxact != NULL; sxact = NextPredXact(sxact))
+ {
+ if (!SxactIsRolledBack(sxact)
+ && !SxactIsCommitted(sxact)
+ && sxact != OldCommittedSxact)
+ {
+ Assert(sxact->xmin != InvalidTransactionId);
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)
+ || TransactionIdPrecedes(sxact->xmin,
+ PredXact->SxactGlobalXmin))
+ {
+ PredXact->SxactGlobalXmin = sxact->xmin;
+ PredXact->SxactGlobalXminCount = 1;
+ }
+ else if (TransactionIdEquals(sxact->xmin,
+ PredXact->SxactGlobalXmin))
+ PredXact->SxactGlobalXminCount++;
+ }
+ }
+
+ SerialSetActiveSerXmin(PredXact->SxactGlobalXmin);
+}
+
+/*
+ * ReleasePredicateLocks
+ *
+ * Releases predicate locks based on completion of the current transaction,
+ * whether committed or rolled back. It can also be called for a read only
+ * transaction when it becomes impossible for the transaction to become
+ * part of a dangerous structure.
+ *
+ * We do nothing unless this is a serializable transaction.
+ *
+ * This method must ensure that shared memory hash tables are cleaned
+ * up in some relatively timely fashion.
+ *
+ * If this transaction is committing and is holding any predicate locks,
+ * it must be added to a list of completed serializable transactions still
+ * holding locks.
+ *
+ * If isReadOnlySafe is true, then predicate locks are being released before
+ * the end of the transaction because MySerializableXact has been determined
+ * to be RO_SAFE. In non-parallel mode we can release it completely, but in
+ * parallel mode we partially release the SERIALIZABLEXACT and keep it
+ * around until the end of the transaction, allowing each backend to clear its
+ * MySerializableXact variable and benefit from the optimization in its own
+ * time.
+ */
+void
+ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe)
+{
+ bool needToClear;
+ RWConflict conflict,
+ nextConflict,
+ possibleUnsafeConflict;
+ SERIALIZABLEXACT *roXact;
+
+ /*
+ * We can't trust XactReadOnly here, because a transaction which started
+ * as READ WRITE can show as READ ONLY later, e.g., within
+	 * subtransactions. We want to flag a transaction as READ ONLY if it
+	 * commits without writing, so that de facto READ ONLY transactions get
+	 * the benefit of some RO optimizations; this local variable therefore
+	 * records whether the transaction was declared READ ONLY at the top
+	 * level, which is what the cleanup logic below keys on.
+ */
+ bool topLevelIsDeclaredReadOnly;
+
+ /* We can't be both committing and releasing early due to RO_SAFE. */
+ Assert(!(isCommit && isReadOnlySafe));
+
+ /* Are we at the end of a transaction, that is, a commit or abort? */
+ if (!isReadOnlySafe)
+ {
+ /*
+ * Parallel workers mustn't release predicate locks at the end of
+ * their transaction. The leader will do that at the end of its
+ * transaction.
+ */
+ if (IsParallelWorker())
+ {
+ ReleasePredicateLocksLocal();
+ return;
+ }
+
+ /*
+ * By the time the leader in a parallel query reaches end of
+ * transaction, it has waited for all workers to exit.
+ */
+ Assert(!ParallelContextActive());
+
+ /*
+ * If the leader in a parallel query earlier stashed a partially
+ * released SERIALIZABLEXACT for final clean-up at end of transaction
+ * (because workers might still have been accessing it), then it's
+ * time to restore it.
+ */
+ if (SavedSerializableXact != InvalidSerializableXact)
+ {
+ Assert(MySerializableXact == InvalidSerializableXact);
+ MySerializableXact = SavedSerializableXact;
+ SavedSerializableXact = InvalidSerializableXact;
+ Assert(SxactIsPartiallyReleased(MySerializableXact));
+ }
+ }
+
+ if (MySerializableXact == InvalidSerializableXact)
+ {
+ Assert(LocalPredicateLockHash == NULL);
+ return;
+ }
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+	 * If the transaction is committing but has already been partially
+	 * released, treat this as a rollback: the earlier partial release
+	 * already marked it as rolled back.
+ */
+ if (isCommit && SxactIsPartiallyReleased(MySerializableXact))
+ isCommit = false;
+
+ /*
+ * If we're called in the middle of a transaction because we discovered
+ * that the SXACT_FLAG_RO_SAFE flag was set, then we'll partially release
+ * it (that is, release the predicate locks and conflicts, but not the
+ * SERIALIZABLEXACT itself) if we're the first backend to have noticed.
+ */
+ if (isReadOnlySafe && IsInParallelMode())
+ {
+ /*
+ * The leader needs to stash a pointer to it, so that it can
+ * completely release it at end-of-transaction.
+ */
+ if (!IsParallelWorker())
+ SavedSerializableXact = MySerializableXact;
+
+ /*
+ * The first backend to reach this condition will partially release
+ * the SERIALIZABLEXACT. All others will just clear their
+ * backend-local state so that they stop doing SSI checks for the rest
+ * of the transaction.
+ */
+ if (SxactIsPartiallyReleased(MySerializableXact))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ReleasePredicateLocksLocal();
+ return;
+ }
+ else
+ {
+ MySerializableXact->flags |= SXACT_FLAG_PARTIALLY_RELEASED;
+ /* ... and proceed to perform the partial release below. */
+ }
+ }
+ Assert(!isCommit || SxactIsPrepared(MySerializableXact));
+ Assert(!isCommit || !SxactIsDoomed(MySerializableXact));
+ Assert(!SxactIsCommitted(MySerializableXact));
+ Assert(SxactIsPartiallyReleased(MySerializableXact)
+ || !SxactIsRolledBack(MySerializableXact));
+
+ /* may not be serializable during COMMIT/ROLLBACK PREPARED */
+ Assert(MySerializableXact->pid == 0 || IsolationIsSerializable());
+
+ /* We'd better not already be on the cleanup list. */
+ Assert(!SxactIsOnFinishedList(MySerializableXact));
+
+ topLevelIsDeclaredReadOnly = SxactIsReadOnly(MySerializableXact);
+
+ /*
+ * We don't hold XidGenLock lock here, assuming that TransactionId is
+ * atomic!
+ *
+ * If this value is changing, we don't care that much whether we get the
+ * old or new value -- it is just used to determine how far
+ * SxactGlobalXmin must advance before this transaction can be fully
+ * cleaned up. The worst that could happen is we wait for one more
+ * transaction to complete before freeing some RAM; correctness of visible
+ * behavior is not affected.
+ */
+ MySerializableXact->finishedBefore = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+
+ /*
+ * If it's not a commit it's either a rollback or a read-only transaction
+ * flagged SXACT_FLAG_RO_SAFE, and we can clear our locks immediately.
+ */
+ if (isCommit)
+ {
+ MySerializableXact->flags |= SXACT_FLAG_COMMITTED;
+ MySerializableXact->commitSeqNo = ++(PredXact->LastSxactCommitSeqNo);
+ /* Recognize implicit read-only transaction (commit without write). */
+ if (!MyXactDidWrite)
+ MySerializableXact->flags |= SXACT_FLAG_READ_ONLY;
+ }
+ else
+ {
+ /*
+ * The DOOMED flag indicates that we intend to roll back this
+ * transaction and so it should not cause serialization failures for
+ * other transactions that conflict with it. Note that this flag might
+ * already be set, if another backend marked this transaction for
+ * abort.
+ *
+ * The ROLLED_BACK flag further indicates that ReleasePredicateLocks
+ * has been called, and so the SerializableXact is eligible for
+ * cleanup. This means it should not be considered when calculating
+ * SxactGlobalXmin.
+ */
+ MySerializableXact->flags |= SXACT_FLAG_DOOMED;
+ MySerializableXact->flags |= SXACT_FLAG_ROLLED_BACK;
+
+ /*
+ * If the transaction was previously prepared, but is now failing due
+ * to a ROLLBACK PREPARED or (hopefully very rare) error after the
+ * prepare, clear the prepared flag. This simplifies conflict
+ * checking.
+ */
+ MySerializableXact->flags &= ~SXACT_FLAG_PREPARED;
+ }
+
+ if (!topLevelIsDeclaredReadOnly)
+ {
+ Assert(PredXact->WritableSxactCount > 0);
+ if (--(PredXact->WritableSxactCount) == 0)
+ {
+ /*
+ * Release predicate locks and rw-conflicts in for all committed
+ * transactions. There are no longer any transactions which might
+ * conflict with the locks and no chance for new transactions to
+ * overlap. Similarly, existing conflicts in can't cause pivots,
+ * and any conflicts in which could have completed a dangerous
+ * structure would already have caused a rollback, so any
+ * remaining ones must be benign.
+ */
+ PredXact->CanPartialClearThrough = PredXact->LastSxactCommitSeqNo;
+ }
+ }
+ else
+ {
+ /*
+ * Read-only transactions: clear the list of transactions that might
+ * make us unsafe. Note that we use 'inLink' for the iteration as
+ * opposed to 'outLink' for the r/w xacts.
+ */
+ possibleUnsafeConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts,
+ &MySerializableXact->possibleUnsafeConflicts,
+ offsetof(RWConflictData, inLink));
+ while (possibleUnsafeConflict)
+ {
+ nextConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts,
+ &possibleUnsafeConflict->inLink,
+ offsetof(RWConflictData, inLink));
+
+ Assert(!SxactIsReadOnly(possibleUnsafeConflict->sxactOut));
+ Assert(MySerializableXact == possibleUnsafeConflict->sxactIn);
+
+ ReleaseRWConflict(possibleUnsafeConflict);
+
+ possibleUnsafeConflict = nextConflict;
+ }
+ }
+
+ /* Check for conflict out to old committed transactions. */
+ if (isCommit
+ && !SxactIsReadOnly(MySerializableXact)
+ && SxactHasSummaryConflictOut(MySerializableXact))
+ {
+ /*
+ * we don't know which old committed transaction we conflicted with,
+ * so be conservative and use FirstNormalSerCommitSeqNo here
+ */
+ MySerializableXact->SeqNo.earliestOutConflictCommit =
+ FirstNormalSerCommitSeqNo;
+ MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT;
+ }
+
+ /*
+ * Release all outConflicts to committed transactions. If we're rolling
+ * back clear them all. Set SXACT_FLAG_CONFLICT_OUT if any point to
+ * previously committed transactions.
+ */
+ conflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->outConflicts,
+ &MySerializableXact->outConflicts,
+ offsetof(RWConflictData, outLink));
+ while (conflict)
+ {
+ nextConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->outConflicts,
+ &conflict->outLink,
+ offsetof(RWConflictData, outLink));
+
+ if (isCommit
+ && !SxactIsReadOnly(MySerializableXact)
+ && SxactIsCommitted(conflict->sxactIn))
+ {
+ if ((MySerializableXact->flags & SXACT_FLAG_CONFLICT_OUT) == 0
+ || conflict->sxactIn->prepareSeqNo < MySerializableXact->SeqNo.earliestOutConflictCommit)
+ MySerializableXact->SeqNo.earliestOutConflictCommit = conflict->sxactIn->prepareSeqNo;
+ MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT;
+ }
+
+ if (!isCommit
+ || SxactIsCommitted(conflict->sxactIn)
+ || (conflict->sxactIn->SeqNo.lastCommitBeforeSnapshot >= PredXact->LastSxactCommitSeqNo))
+ ReleaseRWConflict(conflict);
+
+ conflict = nextConflict;
+ }
+
+ /*
+ * Release all inConflicts from committed and read-only transactions. If
+ * we're rolling back, clear them all.
+ */
+ conflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->inConflicts,
+ &MySerializableXact->inConflicts,
+ offsetof(RWConflictData, inLink));
+ while (conflict)
+ {
+ nextConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->inConflicts,
+ &conflict->inLink,
+ offsetof(RWConflictData, inLink));
+
+ if (!isCommit
+ || SxactIsCommitted(conflict->sxactOut)
+ || SxactIsReadOnly(conflict->sxactOut))
+ ReleaseRWConflict(conflict);
+
+ conflict = nextConflict;
+ }
+
+ if (!topLevelIsDeclaredReadOnly)
+ {
+ /*
+ * Remove ourselves from the list of possible conflicts for concurrent
+ * READ ONLY transactions, flagging them as unsafe if we have a
+ * conflict out. If any are waiting DEFERRABLE transactions, wake them
+ * up if they are known safe or known unsafe.
+ */
+ possibleUnsafeConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts,
+ &MySerializableXact->possibleUnsafeConflicts,
+ offsetof(RWConflictData, outLink));
+ while (possibleUnsafeConflict)
+ {
+ nextConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts,
+ &possibleUnsafeConflict->outLink,
+ offsetof(RWConflictData, outLink));
+
+ roXact = possibleUnsafeConflict->sxactIn;
+ Assert(MySerializableXact == possibleUnsafeConflict->sxactOut);
+ Assert(SxactIsReadOnly(roXact));
+
+ /* Mark conflicted if necessary. */
+ if (isCommit
+ && MyXactDidWrite
+ && SxactHasConflictOut(MySerializableXact)
+ && (MySerializableXact->SeqNo.earliestOutConflictCommit
+ <= roXact->SeqNo.lastCommitBeforeSnapshot))
+ {
+ /*
+ * This releases possibleUnsafeConflict (as well as all other
+ * possible conflicts for roXact)
+ */
+ FlagSxactUnsafe(roXact);
+ }
+ else
+ {
+ ReleaseRWConflict(possibleUnsafeConflict);
+
+ /*
+ * If we were the last possible conflict, flag it safe. The
+ * transaction can now safely release its predicate locks (but
+ * that transaction's backend has to do that itself).
+ */
+ if (SHMQueueEmpty(&roXact->possibleUnsafeConflicts))
+ roXact->flags |= SXACT_FLAG_RO_SAFE;
+ }
+
+ /*
+ * Wake up the process for a waiting DEFERRABLE transaction if we
+ * now know it's either safe or conflicted.
+ */
+ if (SxactIsDeferrableWaiting(roXact) &&
+ (SxactIsROUnsafe(roXact) || SxactIsROSafe(roXact)))
+ ProcSendSignal(roXact->pid);
+
+ possibleUnsafeConflict = nextConflict;
+ }
+ }
+
+ /*
+ * Check whether it's time to clean up old transactions. This can only be
+ * done when the last serializable transaction with the oldest xmin among
+ * serializable transactions completes. We then find the "new oldest"
+ * xmin and purge any transactions which finished before this transaction
+ * was launched.
+ */
+ needToClear = false;
+ if (TransactionIdEquals(MySerializableXact->xmin, PredXact->SxactGlobalXmin))
+ {
+ Assert(PredXact->SxactGlobalXminCount > 0);
+ if (--(PredXact->SxactGlobalXminCount) == 0)
+ {
+ SetNewSxactGlobalXmin();
+ needToClear = true;
+ }
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+
+ LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
+
+ /* Add this to the list of transactions to check for later cleanup. */
+ if (isCommit)
+ SHMQueueInsertBefore(FinishedSerializableTransactions,
+ &MySerializableXact->finishedLink);
+
+ /*
+ * If we're releasing a RO_SAFE transaction in parallel mode, we'll only
+ * partially release it. That's necessary because other backends may have
+ * a reference to it. The leader will release the SERIALIZABLEXACT itself
+ * at the end of the transaction after workers have stopped running.
+ */
+ if (!isCommit)
+ ReleaseOneSerializableXact(MySerializableXact,
+ isReadOnlySafe && IsInParallelMode(),
+ false);
+
+ LWLockRelease(SerializableFinishedListLock);
+
+ if (needToClear)
+ ClearOldPredicateLocks();
+
+ ReleasePredicateLocksLocal();
+}
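+
+/*
+ * To summarize how this routine is reached: end-of-transaction paths call
+ * it with isReadOnlySafe = false (isCommit indicating commit vs. rollback),
+ * while the early-release optimization for transactions found to be RO_SAFE
+ * calls it mid-transaction with isCommit = false and isReadOnlySafe = true.
+ */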
+
+static void
+ReleasePredicateLocksLocal(void)
+{
+ MySerializableXact = InvalidSerializableXact;
+ MyXactDidWrite = false;
+
+ /* Delete per-transaction lock table */
+ if (LocalPredicateLockHash != NULL)
+ {
+ hash_destroy(LocalPredicateLockHash);
+ LocalPredicateLockHash = NULL;
+ }
+}
+
+/*
+ * Clear old predicate locks, belonging to committed transactions that are no
+ * longer interesting to any in-progress transaction.
+ */
+static void
+ClearOldPredicateLocks(void)
+{
+ SERIALIZABLEXACT *finishedSxact;
+ PREDICATELOCK *predlock;
+
+ /*
+ * Loop through finished transactions. They are in commit order, so we can
+ * stop as soon as we find one that's still interesting.
+ */
+ LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
+ finishedSxact = (SERIALIZABLEXACT *)
+ SHMQueueNext(FinishedSerializableTransactions,
+ FinishedSerializableTransactions,
+ offsetof(SERIALIZABLEXACT, finishedLink));
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ while (finishedSxact)
+ {
+ SERIALIZABLEXACT *nextSxact;
+
+ nextSxact = (SERIALIZABLEXACT *)
+ SHMQueueNext(FinishedSerializableTransactions,
+ &(finishedSxact->finishedLink),
+ offsetof(SERIALIZABLEXACT, finishedLink));
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)
+ || TransactionIdPrecedesOrEquals(finishedSxact->finishedBefore,
+ PredXact->SxactGlobalXmin))
+ {
+ /*
+ * This transaction committed before any in-progress transaction
+ * took its snapshot. It's no longer interesting.
+ */
+ LWLockRelease(SerializableXactHashLock);
+ SHMQueueDelete(&(finishedSxact->finishedLink));
+ ReleaseOneSerializableXact(finishedSxact, false, false);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+ else if (finishedSxact->commitSeqNo > PredXact->HavePartialClearedThrough
+ && finishedSxact->commitSeqNo <= PredXact->CanPartialClearThrough)
+ {
+ /*
+ * Any active transactions that took their snapshot before this
+ * transaction committed are read-only, so we can clear part of
+ * its state.
+ */
+ LWLockRelease(SerializableXactHashLock);
+
+ if (SxactIsReadOnly(finishedSxact))
+ {
+ /* A read-only transaction can be removed entirely */
+ SHMQueueDelete(&(finishedSxact->finishedLink));
+ ReleaseOneSerializableXact(finishedSxact, false, false);
+ }
+ else
+ {
+ /*
+ * A read-write transaction can only be partially cleared. We
+ * need to keep the SERIALIZABLEXACT but can release the
+ * SIREAD locks and conflicts in.
+ */
+ ReleaseOneSerializableXact(finishedSxact, true, false);
+ }
+
+ PredXact->HavePartialClearedThrough = finishedSxact->commitSeqNo;
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+ else
+ {
+ /* Still interesting. */
+ break;
+ }
+ finishedSxact = nextSxact;
+ }
+ LWLockRelease(SerializableXactHashLock);
+
+ /*
+ * Loop through predicate locks on dummy transaction for summarized data.
+ */
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&OldCommittedSxact->predicateLocks,
+ &OldCommittedSxact->predicateLocks,
+ offsetof(PREDICATELOCK, xactLink));
+ while (predlock)
+ {
+ PREDICATELOCK *nextpredlock;
+ bool canDoPartialCleanup;
+
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&OldCommittedSxact->predicateLocks,
+ &predlock->xactLink,
+ offsetof(PREDICATELOCK, xactLink));
+
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ Assert(predlock->commitSeqNo != 0);
+ Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo);
+ canDoPartialCleanup = (predlock->commitSeqNo <= PredXact->CanPartialClearThrough);
+ LWLockRelease(SerializableXactHashLock);
+
+ /*
+ * If this lock originally belonged to an old enough transaction, we
+ * can release it.
+ */
+ if (canDoPartialCleanup)
+ {
+ PREDICATELOCKTAG tag;
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCKTARGETTAG targettag;
+ uint32 targettaghash;
+ LWLock *partitionLock;
+
+ tag = predlock->tag;
+ target = tag.myTarget;
+ targettag = target->tag;
+ targettaghash = PredicateLockTargetTagHashCode(&targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ SHMQueueDelete(&(predlock->targetLink));
+ SHMQueueDelete(&(predlock->xactLink));
+
+ hash_search_with_hash_value(PredicateLockHash, &tag,
+ PredicateLockHashCodeFromTargetHashCode(&tag,
+ targettaghash),
+ HASH_REMOVE, NULL);
+ RemoveTargetIfNoLongerUsed(target, targettaghash);
+
+ LWLockRelease(partitionLock);
+ }
+
+ predlock = nextpredlock;
+ }
+
+ LWLockRelease(SerializablePredicateListLock);
+ LWLockRelease(SerializableFinishedListLock);
+}
+
+/*
+ * This is the normal way to delete anything from any of the predicate
+ * locking hash tables. Given a transaction which we know can be deleted:
+ * delete all predicate locks held by that transaction and any predicate
+ * lock targets which are now unreferenced by a lock; delete all conflicts
+ * for the transaction; delete all xid values for the transaction; then
+ * delete the transaction.
+ *
+ * When the partial flag is set, we can release all predicate locks and
+ * in-conflict information -- we've established that there are no longer
+ * any overlapping read write transactions for which this transaction could
+ * matter -- but keep the transaction entry itself and any outConflicts.
+ *
+ * When the summarize flag is set, we've run short of room for sxact data
+ * and must summarize to the SLRU. Predicate locks are transferred to a
+ * dummy "old" transaction, with duplicate locks on a single target
+ * collapsing to a single lock with the "latest" commitSeqNo from among
+ * the conflicting locks.
+ */
+static void
+ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
+ bool summarize)
+{
+ PREDICATELOCK *predlock;
+ SERIALIZABLEXIDTAG sxidtag;
+ RWConflict conflict,
+ nextConflict;
+
+ Assert(sxact != NULL);
+ Assert(SxactIsRolledBack(sxact) || SxactIsCommitted(sxact));
+ Assert(partial || !SxactIsOnFinishedList(sxact));
+ Assert(LWLockHeldByMe(SerializableFinishedListLock));
+
+ /*
+ * First release all the predicate locks held by this xact (or transfer
+ * them to OldCommittedSxact if summarize is true)
+ */
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+ if (IsInParallelMode())
+ LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE);
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ &(sxact->predicateLocks),
+ offsetof(PREDICATELOCK, xactLink));
+ while (predlock)
+ {
+ PREDICATELOCK *nextpredlock;
+ PREDICATELOCKTAG tag;
+ SHM_QUEUE *targetLink;
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCKTARGETTAG targettag;
+ uint32 targettaghash;
+ LWLock *partitionLock;
+
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ &(predlock->xactLink),
+ offsetof(PREDICATELOCK, xactLink));
+
+ tag = predlock->tag;
+ targetLink = &(predlock->targetLink);
+ target = tag.myTarget;
+ targettag = target->tag;
+ targettaghash = PredicateLockTargetTagHashCode(&targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ SHMQueueDelete(targetLink);
+
+ hash_search_with_hash_value(PredicateLockHash, &tag,
+ PredicateLockHashCodeFromTargetHashCode(&tag,
+ targettaghash),
+ HASH_REMOVE, NULL);
+ if (summarize)
+ {
+ bool found;
+
+ /* Fold into dummy transaction list. */
+ tag.myXact = OldCommittedSxact;
+ predlock = hash_search_with_hash_value(PredicateLockHash, &tag,
+ PredicateLockHashCodeFromTargetHashCode(&tag,
+ targettaghash),
+ HASH_ENTER_NULL, &found);
+ if (!predlock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_pred_locks_per_transaction.")));
+ if (found)
+ {
+ Assert(predlock->commitSeqNo != 0);
+ Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo);
+ if (predlock->commitSeqNo < sxact->commitSeqNo)
+ predlock->commitSeqNo = sxact->commitSeqNo;
+ }
+ else
+ {
+ SHMQueueInsertBefore(&(target->predicateLocks),
+ &(predlock->targetLink));
+ SHMQueueInsertBefore(&(OldCommittedSxact->predicateLocks),
+ &(predlock->xactLink));
+ predlock->commitSeqNo = sxact->commitSeqNo;
+ }
+ }
+ else
+ RemoveTargetIfNoLongerUsed(target, targettaghash);
+
+ LWLockRelease(partitionLock);
+
+ predlock = nextpredlock;
+ }
+
+ /*
+ * Rather than retail removal, just re-init the head after we've run
+ * through the list.
+ */
+ SHMQueueInit(&sxact->predicateLocks);
+
+ if (IsInParallelMode())
+ LWLockRelease(&sxact->perXactPredicateListLock);
+ LWLockRelease(SerializablePredicateListLock);
+
+ sxidtag.xid = sxact->topXid;
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /* Release all outConflicts (unless 'partial' is true) */
+ if (!partial)
+ {
+ conflict = (RWConflict)
+ SHMQueueNext(&sxact->outConflicts,
+ &sxact->outConflicts,
+ offsetof(RWConflictData, outLink));
+ while (conflict)
+ {
+ nextConflict = (RWConflict)
+ SHMQueueNext(&sxact->outConflicts,
+ &conflict->outLink,
+ offsetof(RWConflictData, outLink));
+ if (summarize)
+ conflict->sxactIn->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
+ ReleaseRWConflict(conflict);
+ conflict = nextConflict;
+ }
+ }
+
+ /* Release all inConflicts. */
+ conflict = (RWConflict)
+ SHMQueueNext(&sxact->inConflicts,
+ &sxact->inConflicts,
+ offsetof(RWConflictData, inLink));
+ while (conflict)
+ {
+ nextConflict = (RWConflict)
+ SHMQueueNext(&sxact->inConflicts,
+ &conflict->inLink,
+ offsetof(RWConflictData, inLink));
+ if (summarize)
+ conflict->sxactOut->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+ ReleaseRWConflict(conflict);
+ conflict = nextConflict;
+ }
+
+ /* Finally, get rid of the xid and the record of the transaction itself. */
+ if (!partial)
+ {
+ if (sxidtag.xid != InvalidTransactionId)
+ hash_search(SerializableXidHash, &sxidtag, HASH_REMOVE, NULL);
+ ReleasePredXact(sxact);
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+}
+
+/*
+ * Tests whether the given top level transaction is concurrent with
+ * (overlaps) our current transaction.
+ *
+ * We need to identify the top level transaction for SSI, anyway, so pass
+ * that to this function to save the overhead of checking the snapshot's
+ * subxip array.
+ */
+static bool
+XidIsConcurrent(TransactionId xid)
+{
+ Snapshot snap;
+ uint32 i;
+
+ Assert(TransactionIdIsValid(xid));
+ Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
+
+ snap = GetTransactionSnapshot();
+
+ if (TransactionIdPrecedes(xid, snap->xmin))
+ return false;
+
+ if (TransactionIdFollowsOrEquals(xid, snap->xmax))
+ return true;
+
+ for (i = 0; i < snap->xcnt; i++)
+ {
+ if (xid == snap->xip[i])
+ return true;
+ }
+
+ return false;
+}
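+
+/*
+ * Worked example of the test above (numbers are hypothetical): with a
+ * snapshot of xmin = 100, xmax = 110 and xip = {103, 107}, an xid of 95 is
+ * not concurrent (it completed before the snapshot), 112 is concurrent (it
+ * follows xmax), 103 is concurrent (still in progress at snapshot time),
+ * and 105 is not concurrent (between xmin and xmax but absent from xip, so
+ * it had already completed when the snapshot was taken).
+ */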
+
+bool
+CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot)
+{
+ if (!SerializationNeededForRead(relation, snapshot))
+ return false;
+
+ /* Check if someone else has already decided that we need to die */
+ if (SxactIsDoomed(MySerializableXact))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."),
+ errhint("The transaction might succeed if retried.")));
+ }
+
+ return true;
+}
+
+/*
+ * CheckForSerializableConflictOut
+ * A table AM is reading a tuple that has been modified. If it determines
+ * that the tuple version it is reading is not visible to us, it should
+ * pass in the top level xid of the transaction that created it.
+ * Otherwise, if it determines that it is visible to us but it has been
+ * deleted or there is a newer version available due to an update, it
+ * should pass in the top level xid of the modifying transaction.
+ *
+ * This function will check for overlap with our own transaction. If the given
+ * xid is also serializable and the transactions overlap (i.e., they cannot see
+ * each other's writes), then we have a conflict out.
+ */
+void
+CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot)
+{
+ SERIALIZABLEXIDTAG sxidtag;
+ SERIALIZABLEXID *sxid;
+ SERIALIZABLEXACT *sxact;
+
+ if (!SerializationNeededForRead(relation, snapshot))
+ return;
+
+ /* Check if someone else has already decided that we need to die */
+ if (SxactIsDoomed(MySerializableXact))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."),
+ errhint("The transaction might succeed if retried.")));
+ }
+ Assert(TransactionIdIsValid(xid));
+
+ if (TransactionIdEquals(xid, GetTopTransactionIdIfAny()))
+ return;
+
+ /*
+ * Find sxact or summarized info for the top level xid.
+ */
+ sxidtag.xid = xid;
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ sxid = (SERIALIZABLEXID *)
+ hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+ if (!sxid)
+ {
+ /*
+ * Transaction not found in "normal" SSI structures. Check whether it
+ * got pushed out to SLRU storage for "old committed" transactions.
+ */
+ SerCommitSeqNo conflictCommitSeqNo;
+
+ conflictCommitSeqNo = SerialGetMinConflictCommitSeqNo(xid);
+ if (conflictCommitSeqNo != 0)
+ {
+ if (conflictCommitSeqNo != InvalidSerCommitSeqNo
+ && (!SxactIsReadOnly(MySerializableXact)
+ || conflictCommitSeqNo
+ <= MySerializableXact->SeqNo.lastCommitBeforeSnapshot))
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on conflict out to old pivot %u.", xid),
+ errhint("The transaction might succeed if retried.")));
+
+ if (SxactHasSummaryConflictIn(MySerializableXact)
+ || !SHMQueueEmpty(&MySerializableXact->inConflicts))
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, with conflict out to old committed transaction %u.", xid),
+ errhint("The transaction might succeed if retried.")));
+
+ MySerializableXact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+ }
+
+ /* It's not serializable or otherwise not important. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ sxact = sxid->myXact;
+ Assert(TransactionIdEquals(sxact->topXid, xid));
+ if (sxact == MySerializableXact || SxactIsDoomed(sxact))
+ {
+ /* Can't conflict with ourself or a transaction that will roll back. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+
+ /*
+ * We have a conflict out to a transaction which has a conflict out to a
+ * summarized transaction. That summarized transaction must have
+ * committed first, and we can't tell when it committed in relation to our
+ * snapshot acquisition, so something needs to be canceled.
+ */
+ if (SxactHasSummaryConflictOut(sxact))
+ {
+ if (!SxactIsPrepared(sxact))
+ {
+ sxact->flags |= SXACT_FLAG_DOOMED;
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ else
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on conflict out to old pivot."),
+ errhint("The transaction might succeed if retried.")));
+ }
+ }
+
+ /*
+ * If this is a read-only transaction and the writing transaction has
+ * committed, and it doesn't have a rw-conflict to a transaction which
+ * committed before it, no conflict.
+ */
+ if (SxactIsReadOnly(MySerializableXact)
+ && SxactIsCommitted(sxact)
+ && !SxactHasSummaryConflictOut(sxact)
+ && (!SxactHasConflictOut(sxact)
+ || MySerializableXact->SeqNo.lastCommitBeforeSnapshot < sxact->SeqNo.earliestOutConflictCommit))
+ {
+ /* Read-only transaction will appear to run first. No conflict. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+
+ if (!XidIsConcurrent(xid))
+ {
+ /* This write was already in our snapshot; no conflict. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+
+ if (RWConflictExists(MySerializableXact, sxact))
+ {
+ /* We don't want duplicate conflict records in the list. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+
+ /*
+ * Flag the conflict. But first, if this conflict creates a dangerous
+ * structure, ereport an error.
+ */
+ FlagRWConflict(MySerializableXact, sxact);
+ LWLockRelease(SerializableXactHashLock);
+}
+
+/*
+ * Check a particular target for rw-dependency conflict in. A subroutine of
+ * CheckForSerializableConflictIn().
+ */
+static void
+CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
+{
+ uint32 targettaghash;
+ LWLock *partitionLock;
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCK *predlock;
+ PREDICATELOCK *mypredlock = NULL;
+ PREDICATELOCKTAG mypredlocktag;
+
+ Assert(MySerializableXact != InvalidSerializableXact);
+
+ /*
+ * The same hash and LW lock apply to the lock target and the lock itself.
+ */
+ targettaghash = PredicateLockTargetTagHashCode(targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+ LWLockAcquire(partitionLock, LW_SHARED);
+ target = (PREDICATELOCKTARGET *)
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ targettag, targettaghash,
+ HASH_FIND, NULL);
+ if (!target)
+ {
+ /* Nothing has this target locked; we're done here. */
+ LWLockRelease(partitionLock);
+ return;
+ }
+
+ /*
+ * Each lock for an overlapping transaction represents a conflict: a
+ * rw-dependency in to this transaction.
+ */
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ &(target->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ while (predlock)
+ {
+ SHM_QUEUE *predlocktargetlink;
+ PREDICATELOCK *nextpredlock;
+ SERIALIZABLEXACT *sxact;
+
+ predlocktargetlink = &(predlock->targetLink);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ predlocktargetlink,
+ offsetof(PREDICATELOCK, targetLink));
+
+ sxact = predlock->tag.myXact;
+ if (sxact == MySerializableXact)
+ {
+ /*
+ * If we're getting a write lock on a tuple, we don't need a
+ * predicate (SIREAD) lock on the same tuple. We can safely remove
+ * our SIREAD lock, but we'll defer doing so until after the loop
+ * because that requires upgrading to an exclusive partition lock.
+ *
+ * We can't use this optimization within a subtransaction because
+ * the subtransaction could roll back, and we would be left
+ * without any lock at the top level.
+ */
+ if (!IsSubTransaction()
+ && GET_PREDICATELOCKTARGETTAG_OFFSET(*targettag))
+ {
+ mypredlock = predlock;
+ mypredlocktag = predlock->tag;
+ }
+ }
+ else if (!SxactIsDoomed(sxact)
+ && (!SxactIsCommitted(sxact)
+ || TransactionIdPrecedes(GetTransactionSnapshot()->xmin,
+ sxact->finishedBefore))
+ && !RWConflictExists(sxact, MySerializableXact))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Re-check after getting exclusive lock because the other
+ * transaction may have flagged a conflict.
+ */
+ if (!SxactIsDoomed(sxact)
+ && (!SxactIsCommitted(sxact)
+ || TransactionIdPrecedes(GetTransactionSnapshot()->xmin,
+ sxact->finishedBefore))
+ && !RWConflictExists(sxact, MySerializableXact))
+ {
+ FlagRWConflict(sxact, MySerializableXact);
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+
+ predlock = nextpredlock;
+ }
+ LWLockRelease(SerializableXactHashLock);
+ LWLockRelease(partitionLock);
+
+ /*
+ * If we found one of our own SIREAD locks to remove, remove it now.
+ *
+ * At this point our transaction already has a RowExclusiveLock on the
+ * relation, so we are OK to drop the predicate lock on the tuple, if
+ * found, without fearing that another write against the tuple will occur
+ * before the MVCC information makes it to the buffer.
+ */
+ if (mypredlock != NULL)
+ {
+ uint32 predlockhashcode;
+ PREDICATELOCK *rmpredlock;
+
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+ if (IsInParallelMode())
+ LWLockAcquire(&MySerializableXact->perXactPredicateListLock, LW_EXCLUSIVE);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Remove the predicate lock from shared memory, if it wasn't removed
+ * while the locks were released. One way that could happen is from
+ * autovacuum cleaning up an index.
+ */
+ predlockhashcode = PredicateLockHashCodeFromTargetHashCode
+ (&mypredlocktag, targettaghash);
+ rmpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &mypredlocktag,
+ predlockhashcode,
+ HASH_FIND, NULL);
+ if (rmpredlock != NULL)
+ {
+ Assert(rmpredlock == mypredlock);
+
+ SHMQueueDelete(&(mypredlock->targetLink));
+ SHMQueueDelete(&(mypredlock->xactLink));
+
+ rmpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &mypredlocktag,
+ predlockhashcode,
+ HASH_REMOVE, NULL);
+ Assert(rmpredlock == mypredlock);
+
+ RemoveTargetIfNoLongerUsed(target, targettaghash);
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+ LWLockRelease(partitionLock);
+ if (IsInParallelMode())
+ LWLockRelease(&MySerializableXact->perXactPredicateListLock);
+ LWLockRelease(SerializablePredicateListLock);
+
+ if (rmpredlock != NULL)
+ {
+ /*
+ * Remove entry in local lock table if it exists. It's OK if it
+ * doesn't exist; that means the lock was transferred to a new
+ * target by a different backend.
+ */
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ targettag, targettaghash,
+ HASH_REMOVE, NULL);
+
+ DecrementParentLocks(targettag);
+ }
+ }
+}
+
+/*
+ * CheckForSerializableConflictIn
+ * We are writing the given tuple. If that indicates a rw-conflict
+ * in from another serializable transaction, take appropriate action.
+ *
+ * Skip checking for any granularity for which a parameter is missing.
+ *
+ * A tuple update or delete is in conflict if we have a predicate lock
+ * against the relation or page in which the tuple exists, or against the
+ * tuple itself.
+ */
+void
+CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
+{
+ PREDICATELOCKTARGETTAG targettag;
+
+ if (!SerializationNeededForWrite(relation))
+ return;
+
+ /* Check if someone else has already decided that we need to die */
+ if (SxactIsDoomed(MySerializableXact))
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict in checking."),
+ errhint("The transaction might succeed if retried.")));
+
+ /*
+ * We're doing a write which might cause rw-conflicts now or later.
+ * Memorize that fact.
+ */
+ MyXactDidWrite = true;
+
+ /*
+ * It is important that we check for locks from the finest granularity to
+ * the coarsest granularity, so that granularity promotion doesn't cause
+ * us to miss a lock. The new (coarser) lock will be acquired before the
+ * old (finer) locks are released.
+ *
+ * It is not possible to take and hold a lock across the checks for all
+ * granularities because each target could be in a separate partition.
+ */
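+	/*
+	 * For example, if another backend is promoting its tuple locks on a page
+	 * of this relation to a single page lock, the page lock is acquired
+	 * before its tuple locks are released; so checking the tuple, then the
+	 * page, then the relation is guaranteed to see at least one of them.
+	 */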
+ if (tid != NULL)
+ {
+ SET_PREDICATELOCKTARGETTAG_TUPLE(targettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+ CheckTargetForConflictsIn(&targettag);
+ }
+
+ if (blkno != InvalidBlockNumber)
+ {
+ SET_PREDICATELOCKTARGETTAG_PAGE(targettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ blkno);
+ CheckTargetForConflictsIn(&targettag);
+ }
+
+ SET_PREDICATELOCKTARGETTAG_RELATION(targettag,
+ relation->rd_node.dbNode,
+ relation->rd_id);
+ CheckTargetForConflictsIn(&targettag);
+}
+
+/*
+ * CheckTableForSerializableConflictIn
+ * The entire table is going through a DDL-style logical mass delete
+ * like TRUNCATE or DROP TABLE. If that causes a rw-conflict in from
+ * another serializable transaction, take appropriate action.
+ *
+ * While these operations do not operate entirely within the bounds of
+ * snapshot isolation, they can occur inside a serializable transaction, and
+ * will logically occur after any reads which saw rows which were destroyed
+ * by these operations, so we do what we can to serialize properly under
+ * SSI.
+ *
+ * The relation passed in must be a heap relation. Any predicate lock of any
+ * granularity on the heap will cause a rw-conflict in to this transaction.
+ * Predicate locks on indexes do not matter because they only exist to guard
+ * against conflicting inserts into the index, and this is a mass *delete*.
+ * When a table is truncated or dropped, the index will also be truncated
+ * or dropped, and we'll deal with locks on the index when that happens.
+ *
+ * Dropping or truncating a table also needs to drop any existing predicate
+ * locks on heap tuples or pages, because they're about to go away. This
+ * should be done before altering the predicate locks because the transaction
+ * could be rolled back because of a conflict, in which case the lock changes
+ * are not needed. (At the moment, we don't actually bother to drop the
+ * existing locks on a dropped or truncated table. That might lead to some
+ * false positives, but it doesn't seem worth the trouble.)
+ */
+void
+CheckTableForSerializableConflictIn(Relation relation)
+{
+ HASH_SEQ_STATUS seqstat;
+ PREDICATELOCKTARGET *target;
+ Oid dbId;
+ Oid heapId;
+ int i;
+
+ /*
+ * Bail out quickly if there are no serializable transactions running.
+ * It's safe to check this without taking locks because the caller is
+ * holding an ACCESS EXCLUSIVE lock on the relation. No new locks which
+ * would matter here can be acquired while that is held.
+ */
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
+ return;
+
+ if (!SerializationNeededForWrite(relation))
+ return;
+
+ /*
+ * We're doing a write which might cause rw-conflicts now or later.
+ * Memorize that fact.
+ */
+ MyXactDidWrite = true;
+
+ Assert(relation->rd_index == NULL); /* not an index relation */
+
+ dbId = relation->rd_node.dbNode;
+ heapId = relation->rd_id;
+
+ LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
+ for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
+ LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /* Scan through target list */
+ hash_seq_init(&seqstat, PredicateLockTargetHash);
+
+ while ((target = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
+ {
+ PREDICATELOCK *predlock;
+
+ /*
+ * Check whether this is a target which needs attention.
+ */
+ if (GET_PREDICATELOCKTARGETTAG_RELATION(target->tag) != heapId)
+ continue; /* wrong relation id */
+ if (GET_PREDICATELOCKTARGETTAG_DB(target->tag) != dbId)
+ continue; /* wrong database id */
+
+ /*
+ * Loop through locks for this target and flag conflicts.
+ */
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ &(target->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ while (predlock)
+ {
+ PREDICATELOCK *nextpredlock;
+
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ &(predlock->targetLink),
+ offsetof(PREDICATELOCK, targetLink));
+
+ if (predlock->tag.myXact != MySerializableXact
+ && !RWConflictExists(predlock->tag.myXact, MySerializableXact))
+ {
+ FlagRWConflict(predlock->tag.myXact, MySerializableXact);
+ }
+
+ predlock = nextpredlock;
+ }
+ }
+
+ /* Release locks in reverse order */
+ LWLockRelease(SerializableXactHashLock);
+ for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
+ LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
+ LWLockRelease(SerializablePredicateListLock);
+}
+
+
+/*
+ * Flag a rw-dependency between two serializable transactions.
+ *
+ * The caller is responsible for ensuring that we have a LW lock on
+ * the transaction hash table.
+ */
+static void
+FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
+{
+ Assert(reader != writer);
+
+ /* First, see if this conflict causes failure. */
+ OnConflict_CheckForSerializationFailure(reader, writer);
+
+ /* Actually do the conflict flagging. */
+ if (reader == OldCommittedSxact)
+ writer->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
+ else if (writer == OldCommittedSxact)
+ reader->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+ else
+ SetRWConflict(reader, writer);
+}
+
+/*----------------------------------------------------------------------------
+ * We are about to add a RW-edge to the dependency graph - check that we don't
+ * introduce a dangerous structure by doing so, and abort one of the
+ * transactions if so.
+ *
+ * A serialization failure can only occur if there is a dangerous structure
+ * in the dependency graph:
+ *
+ * Tin ------> Tpivot ------> Tout
+ * rw rw
+ *
+ * Furthermore, Tout must commit first.
+ *
+ * One more optimization is that if Tin is declared READ ONLY (or commits
+ * without writing), we can only have a problem if Tout committed before Tin
+ * acquired its snapshot.
+ *----------------------------------------------------------------------------
+ */
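+/*
+ * Worked example, for illustration: suppose T1 reads a row that T2 later
+ * updates (T1 --rw--> T2), and T2 reads a row that T3 later updates
+ * (T2 --rw--> T3).  T2 is then the pivot.  If T3 commits before both T1 and
+ * T2, the structure above is complete and one of the three transactions must
+ * be aborted to preserve serializability; if T1 is READ ONLY, there is only
+ * a problem if T3 committed before T1 acquired its snapshot.
+ */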
+static void
+OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
+ SERIALIZABLEXACT *writer)
+{
+ bool failure;
+ RWConflict conflict;
+
+ Assert(LWLockHeldByMe(SerializableXactHashLock));
+
+ failure = false;
+
+ /*------------------------------------------------------------------------
+ * Check for already-committed writer with rw-conflict out flagged
+ * (conflict-flag on W means that T2 committed before W):
+ *
+ * R ------> W ------> T2
+ * rw rw
+ *
+ * That is a dangerous structure, so we must abort. (Since the writer
+ * has already committed, we must be the reader)
+ *------------------------------------------------------------------------
+ */
+ if (SxactIsCommitted(writer)
+ && (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer)))
+ failure = true;
+
+ /*------------------------------------------------------------------------
+ * Check whether the writer has become a pivot with an out-conflict
+ * committed transaction (T2), and T2 committed first:
+ *
+ * R ------> W ------> T2
+ * rw rw
+ *
+ * Because T2 must've committed first, there is no anomaly if:
+ * - the reader committed before T2
+ * - the writer committed before T2
+ * - the reader is a READ ONLY transaction and the reader was concurrent
+ * with T2 (= reader acquired its snapshot before T2 committed)
+ *
+ * We also handle the case that T2 is prepared but not yet committed
+ * here. In that case T2 has already checked for conflicts, so if it
+ * commits first, making the above conflict real, it's too late for it
+ * to abort.
+ *------------------------------------------------------------------------
+ */
+ if (!failure)
+ {
+ if (SxactHasSummaryConflictOut(writer))
+ {
+ failure = true;
+ conflict = NULL;
+ }
+ else
+ conflict = (RWConflict)
+ SHMQueueNext(&writer->outConflicts,
+ &writer->outConflicts,
+ offsetof(RWConflictData, outLink));
+ while (conflict)
+ {
+ SERIALIZABLEXACT *t2 = conflict->sxactIn;
+
+ if (SxactIsPrepared(t2)
+ && (!SxactIsCommitted(reader)
+ || t2->prepareSeqNo <= reader->commitSeqNo)
+ && (!SxactIsCommitted(writer)
+ || t2->prepareSeqNo <= writer->commitSeqNo)
+ && (!SxactIsReadOnly(reader)
+ || t2->prepareSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot))
+ {
+ failure = true;
+ break;
+ }
+ conflict = (RWConflict)
+ SHMQueueNext(&writer->outConflicts,
+ &conflict->outLink,
+ offsetof(RWConflictData, outLink));
+ }
+ }
+
+ /*------------------------------------------------------------------------
+ * Check whether the reader has become a pivot with a writer
+ * that's committed (or prepared):
+ *
+ * T0 ------> R ------> W
+ * rw rw
+ *
+ * Because W must've committed first for an anomaly to occur, there is no
+ * anomaly if:
+ * - T0 committed before the writer
+ * - T0 is READ ONLY, and overlaps the writer
+ *------------------------------------------------------------------------
+ */
+ if (!failure && SxactIsPrepared(writer) && !SxactIsReadOnly(reader))
+ {
+ if (SxactHasSummaryConflictIn(reader))
+ {
+ failure = true;
+ conflict = NULL;
+ }
+ else
+ conflict = (RWConflict)
+ SHMQueueNext(&reader->inConflicts,
+ &reader->inConflicts,
+ offsetof(RWConflictData, inLink));
+ while (conflict)
+ {
+ SERIALIZABLEXACT *t0 = conflict->sxactOut;
+
+ if (!SxactIsDoomed(t0)
+ && (!SxactIsCommitted(t0)
+ || t0->commitSeqNo >= writer->prepareSeqNo)
+ && (!SxactIsReadOnly(t0)
+ || t0->SeqNo.lastCommitBeforeSnapshot >= writer->prepareSeqNo))
+ {
+ failure = true;
+ break;
+ }
+ conflict = (RWConflict)
+ SHMQueueNext(&reader->inConflicts,
+ &conflict->inLink,
+ offsetof(RWConflictData, inLink));
+ }
+ }
+
+ if (failure)
+ {
+ /*
+ * We have to kill a transaction to avoid a possible anomaly from
+ * occurring. If the writer is us, we can just ereport() to cause a
+ * transaction abort. Otherwise we flag the writer for termination,
+		 * causing it to abort when it tries to commit. However, if the writer
+		 * has already prepared, we can't abort it anymore, so we have to kill
+		 * the reader instead.
+ */
+ if (MySerializableXact == writer)
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, during write."),
+ errhint("The transaction might succeed if retried.")));
+ }
+ else if (SxactIsPrepared(writer))
+ {
+ LWLockRelease(SerializableXactHashLock);
+
+ /* if we're not the writer, we have to be the reader */
+ Assert(MySerializableXact == reader);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on conflict out to pivot %u, during read.", writer->topXid),
+ errhint("The transaction might succeed if retried.")));
+ }
+ writer->flags |= SXACT_FLAG_DOOMED;
+ }
+}
+
+/*
+ * PreCommit_CheckForSerializationFailure
+ * Check for dangerous structures in a serializable transaction
+ * at commit.
+ *
+ * We're checking for a dangerous structure as each conflict is recorded.
+ * The only way we could have a problem at commit is if this is the "out"
+ * side of a pivot, and neither the "in" side nor the pivot has yet
+ * committed.
+ *
+ * If a dangerous structure is found, the pivot (the near conflict) is
+ * marked for death, because rolling back another transaction might mean
+ * that we fail without ever making progress. This transaction is
+ * committing writes, so letting it commit ensures progress. If we
+ * canceled the far conflict, it might immediately fail again on retry.
+ */
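+/*
+ * Concretely: at this point we are Tout, about to commit first.  Each "near
+ * conflict" is a pivot candidate Tpivot with Tpivot --rw--> us; each "far
+ * conflict" is some Tin with Tin --rw--> Tpivot.  If Tpivot has neither
+ * committed nor been doomed, and Tin is either us or another uncommitted,
+ * non-doomed read-write transaction, the dangerous structure is complete,
+ * so Tpivot is doomed, unless Tpivot has already prepared, in which case we
+ * abort ourselves instead.
+ */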
+void
+PreCommit_CheckForSerializationFailure(void)
+{
+ RWConflict nearConflict;
+
+ if (MySerializableXact == InvalidSerializableXact)
+ return;
+
+ Assert(IsolationIsSerializable());
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /* Check if someone else has already decided that we need to die */
+ if (SxactIsDoomed(MySerializableXact))
+ {
+ Assert(!SxactIsPartiallyReleased(MySerializableXact));
+ LWLockRelease(SerializableXactHashLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, during commit attempt."),
+ errhint("The transaction might succeed if retried.")));
+ }
+
+ nearConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->inConflicts,
+ &MySerializableXact->inConflicts,
+ offsetof(RWConflictData, inLink));
+ while (nearConflict)
+ {
+ if (!SxactIsCommitted(nearConflict->sxactOut)
+ && !SxactIsDoomed(nearConflict->sxactOut))
+ {
+ RWConflict farConflict;
+
+ farConflict = (RWConflict)
+ SHMQueueNext(&nearConflict->sxactOut->inConflicts,
+ &nearConflict->sxactOut->inConflicts,
+ offsetof(RWConflictData, inLink));
+ while (farConflict)
+ {
+ if (farConflict->sxactOut == MySerializableXact
+ || (!SxactIsCommitted(farConflict->sxactOut)
+ && !SxactIsReadOnly(farConflict->sxactOut)
+ && !SxactIsDoomed(farConflict->sxactOut)))
+ {
+ /*
+ * Normally, we kill the pivot transaction to make sure we
+ * make progress if the failing transaction is retried.
+ * However, we can't kill it if it's already prepared, so
+ * in that case we commit suicide instead.
+ */
+ if (SxactIsPrepared(nearConflict->sxactOut))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on commit attempt with conflict in from prepared pivot."),
+ errhint("The transaction might succeed if retried.")));
+ }
+ nearConflict->sxactOut->flags |= SXACT_FLAG_DOOMED;
+ break;
+ }
+ farConflict = (RWConflict)
+ SHMQueueNext(&nearConflict->sxactOut->inConflicts,
+ &farConflict->inLink,
+ offsetof(RWConflictData, inLink));
+ }
+ }
+
+ nearConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->inConflicts,
+ &nearConflict->inLink,
+ offsetof(RWConflictData, inLink));
+ }
+
+ MySerializableXact->prepareSeqNo = ++(PredXact->LastSxactCommitSeqNo);
+ MySerializableXact->flags |= SXACT_FLAG_PREPARED;
+
+ LWLockRelease(SerializableXactHashLock);
+}
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * Two-phase commit support
+ */
+
+/*
+ * AtPrepare_PredicateLocks
+ * Do the preparatory work for a PREPARE: make 2PC state file
+ * records for all predicate locks currently held.
+ */
+void
+AtPrepare_PredicateLocks(void)
+{
+ PREDICATELOCK *predlock;
+ SERIALIZABLEXACT *sxact;
+ TwoPhasePredicateRecord record;
+ TwoPhasePredicateXactRecord *xactRecord;
+ TwoPhasePredicateLockRecord *lockRecord;
+
+ sxact = MySerializableXact;
+ xactRecord = &(record.data.xactRecord);
+ lockRecord = &(record.data.lockRecord);
+
+ if (MySerializableXact == InvalidSerializableXact)
+ return;
+
+ /* Generate an xact record for our SERIALIZABLEXACT */
+ record.type = TWOPHASEPREDICATERECORD_XACT;
+ xactRecord->xmin = MySerializableXact->xmin;
+ xactRecord->flags = MySerializableXact->flags;
+
+ /*
+	 * Note that we don't include our lists of in- and out-conflicts in the
+	 * statefile, because new conflicts can be added even after the
+ * transaction prepares. We'll just make a conservative assumption during
+ * recovery instead.
+ */
+
+ RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0,
+ &record, sizeof(record));
+
+ /*
+ * Generate a lock record for each lock.
+ *
+ * To do this, we need to walk the predicate lock list in our sxact rather
+ * than using the local predicate lock table because the latter is not
+ * guaranteed to be accurate.
+ */
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+
+ /*
+ * No need to take sxact->perXactPredicateListLock in parallel mode
+ * because there cannot be any parallel workers running while we are
+ * preparing a transaction.
+ */
+ Assert(!IsParallelWorker() && !ParallelContextActive());
+
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ &(sxact->predicateLocks),
+ offsetof(PREDICATELOCK, xactLink));
+
+ while (predlock != NULL)
+ {
+ record.type = TWOPHASEPREDICATERECORD_LOCK;
+ lockRecord->target = predlock->tag.myTarget->tag;
+
+ RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0,
+ &record, sizeof(record));
+
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ &(predlock->xactLink),
+ offsetof(PREDICATELOCK, xactLink));
+ }
+
+ LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ * PostPrepare_PredicateLocks
+ * Clean up after successful PREPARE. Unlike the non-predicate
+ * lock manager, we do not need to transfer locks to a dummy
+ * PGPROC because our SERIALIZABLEXACT will stay around
+ * anyway. We only need to clean up our local state.
+ */
+void
+PostPrepare_PredicateLocks(TransactionId xid)
+{
+ if (MySerializableXact == InvalidSerializableXact)
+ return;
+
+ Assert(SxactIsPrepared(MySerializableXact));
+
+ MySerializableXact->pid = 0;
+
+ hash_destroy(LocalPredicateLockHash);
+ LocalPredicateLockHash = NULL;
+
+ MySerializableXact = InvalidSerializableXact;
+ MyXactDidWrite = false;
+}
+
+/*
+ * PredicateLockTwoPhaseFinish
+ * Release a prepared transaction's predicate locks once it
+ * commits or aborts.
+ */
+void
+PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit)
+{
+ SERIALIZABLEXID *sxid;
+ SERIALIZABLEXIDTAG sxidtag;
+
+ sxidtag.xid = xid;
+
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ sxid = (SERIALIZABLEXID *)
+ hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+ LWLockRelease(SerializableXactHashLock);
+
+ /* xid will not be found if it wasn't a serializable transaction */
+ if (sxid == NULL)
+ return;
+
+ /* Release its locks */
+ MySerializableXact = sxid->myXact;
+ MyXactDidWrite = true; /* conservatively assume that we wrote
+ * something */
+ ReleasePredicateLocks(isCommit, false);
+}
+
+/*
+ * Re-acquire a predicate lock belonging to a transaction that was prepared.
+ */
+void
+predicatelock_twophase_recover(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhasePredicateRecord *record;
+
+ Assert(len == sizeof(TwoPhasePredicateRecord));
+
+ record = (TwoPhasePredicateRecord *) recdata;
+
+ Assert((record->type == TWOPHASEPREDICATERECORD_XACT) ||
+ (record->type == TWOPHASEPREDICATERECORD_LOCK));
+
+ if (record->type == TWOPHASEPREDICATERECORD_XACT)
+ {
+ /* Per-transaction record. Set up a SERIALIZABLEXACT. */
+ TwoPhasePredicateXactRecord *xactRecord;
+ SERIALIZABLEXACT *sxact;
+ SERIALIZABLEXID *sxid;
+ SERIALIZABLEXIDTAG sxidtag;
+ bool found;
+
+ xactRecord = (TwoPhasePredicateXactRecord *) &record->data.xactRecord;
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ sxact = CreatePredXact();
+ if (!sxact)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory")));
+
+ /* vxid for a prepared xact is InvalidBackendId/xid; no pid */
+ sxact->vxid.backendId = InvalidBackendId;
+ sxact->vxid.localTransactionId = (LocalTransactionId) xid;
+ sxact->pid = 0;
+
+ /* a prepared xact hasn't committed yet */
+ sxact->prepareSeqNo = RecoverySerCommitSeqNo;
+ sxact->commitSeqNo = InvalidSerCommitSeqNo;
+ sxact->finishedBefore = InvalidTransactionId;
+
+ sxact->SeqNo.lastCommitBeforeSnapshot = RecoverySerCommitSeqNo;
+
+ /*
+ * Don't need to track this; no transactions running at the time the
+ * recovered xact started are still active, except possibly other
+ * prepared xacts and we don't care whether those are RO_SAFE or not.
+ */
+ SHMQueueInit(&(sxact->possibleUnsafeConflicts));
+
+ SHMQueueInit(&(sxact->predicateLocks));
+ SHMQueueElemInit(&(sxact->finishedLink));
+
+ sxact->topXid = xid;
+ sxact->xmin = xactRecord->xmin;
+ sxact->flags = xactRecord->flags;
+ Assert(SxactIsPrepared(sxact));
+ if (!SxactIsReadOnly(sxact))
+ {
+ ++(PredXact->WritableSxactCount);
+ Assert(PredXact->WritableSxactCount <=
+ (MaxBackends + max_prepared_xacts));
+ }
+
+ /*
+ * We don't know whether the transaction had any conflicts or not, so
+ * we'll conservatively assume that it had both a conflict in and a
+ * conflict out, and represent that with the summary conflict flags.
+ */
+ SHMQueueInit(&(sxact->outConflicts));
+ SHMQueueInit(&(sxact->inConflicts));
+ sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
+ sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+
+ /* Register the transaction's xid */
+ sxidtag.xid = xid;
+ sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash,
+ &sxidtag,
+ HASH_ENTER, &found);
+ Assert(sxid != NULL);
+ Assert(!found);
+ sxid->myXact = (SERIALIZABLEXACT *) sxact;
+
+ /*
+ * Update global xmin. Note that this is a special case compared to
+ * registering a normal transaction, because the global xmin might go
+ * backwards. That's OK, because until recovery is over we're not
+ * going to complete any transactions or create any non-prepared
+		 * transactions, so there's no danger of throwing away information
+		 * that is still needed.
+ */
+ if ((!TransactionIdIsValid(PredXact->SxactGlobalXmin)) ||
+ (TransactionIdFollows(PredXact->SxactGlobalXmin, sxact->xmin)))
+ {
+ PredXact->SxactGlobalXmin = sxact->xmin;
+ PredXact->SxactGlobalXminCount = 1;
+ SerialSetActiveSerXmin(sxact->xmin);
+ }
+ else if (TransactionIdEquals(sxact->xmin, PredXact->SxactGlobalXmin))
+ {
+ Assert(PredXact->SxactGlobalXminCount > 0);
+ PredXact->SxactGlobalXminCount++;
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+ }
+ else if (record->type == TWOPHASEPREDICATERECORD_LOCK)
+ {
+ /* Lock record. Recreate the PREDICATELOCK */
+ TwoPhasePredicateLockRecord *lockRecord;
+ SERIALIZABLEXID *sxid;
+ SERIALIZABLEXACT *sxact;
+ SERIALIZABLEXIDTAG sxidtag;
+ uint32 targettaghash;
+
+ lockRecord = (TwoPhasePredicateLockRecord *) &record->data.lockRecord;
+ targettaghash = PredicateLockTargetTagHashCode(&lockRecord->target);
+
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ sxidtag.xid = xid;
+ sxid = (SERIALIZABLEXID *)
+ hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+ LWLockRelease(SerializableXactHashLock);
+
+ Assert(sxid != NULL);
+ sxact = sxid->myXact;
+ Assert(sxact != InvalidSerializableXact);
+
+ CreatePredicateLock(&lockRecord->target, targettaghash, sxact);
+ }
+}
+
+/*
+ * Prepare to share the current SERIALIZABLEXACT with parallel workers.
+ * Return a handle object that can be used by AttachSerializableXact() in a
+ * parallel worker.
+ */
+SerializableXactHandle
+ShareSerializableXact(void)
+{
+ return MySerializableXact;
+}
+
+/*
+ * Allow parallel workers to import the leader's SERIALIZABLEXACT.
+ */
+void
+AttachSerializableXact(SerializableXactHandle handle)
+{
+
+ Assert(MySerializableXact == InvalidSerializableXact);
+
+ MySerializableXact = (SERIALIZABLEXACT *) handle;
+ if (MySerializableXact != InvalidSerializableXact)
+ CreateLocalPredicateLockHash();
+}
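+
+/*
+ * Typical usage, as described above: the leader obtains a handle with
+ * ShareSerializableXact() and passes it to each parallel worker, which calls
+ * AttachSerializableXact(handle) during startup so that it shares the
+ * leader's SERIALIZABLEXACT rather than creating its own.
+ */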
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
new file mode 100644
index 0000000..c50a419
--- /dev/null
+++ b/src/backend/storage/lmgr/proc.c
@@ -0,0 +1,2012 @@
+/*-------------------------------------------------------------------------
+ *
+ * proc.c
+ * routines to manage per-process shared memory data structure
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/proc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * Interface (a):
+ * ProcSleep(), ProcWakeup(),
+ * ProcQueueAlloc() -- create a shm queue for sleeping processes
+ * ProcQueueInit() -- create a queue without allocing memory
+ *
+ * Waiting for a lock causes the backend to be put to sleep. Whoever releases
+ * the lock wakes the process up again (and gives it an error code so it knows
+ * whether it was awoken on an error condition).
+ *
+ * Interface (b):
+ *
+ * ProcReleaseLocks -- frees the locks associated with current transaction
+ *
+ * ProcKill -- destroys the shared memory state (and locks)
+ * associated with the process.
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/autovacuum.h"
+#include "replication/slot.h"
+#include "replication/syncrep.h"
+#include "replication/walsender.h"
+#include "storage/condition_variable.h"
+#include "storage/ipc.h"
+#include "storage/lmgr.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/procsignal.h"
+#include "storage/spin.h"
+#include "storage/standby.h"
+#include "utils/timeout.h"
+#include "utils/timestamp.h"
+
+/* GUC variables */
+int DeadlockTimeout = 1000;
+int StatementTimeout = 0;
+int LockTimeout = 0;
+int IdleInTransactionSessionTimeout = 0;
+int IdleSessionTimeout = 0;
+bool log_lock_waits = false;
+
+/* Pointer to this process's PGPROC struct, if any */
+PGPROC *MyProc = NULL;
+
+/*
+ * This spinlock protects the freelist of recycled PGPROC structures.
+ * We cannot use an LWLock because the LWLock manager depends on already
+ * having a PGPROC and a wait semaphore! But these structures are touched
+ * relatively infrequently (only at backend startup or shutdown) and not for
+ * very long, so a spinlock is okay.
+ */
+NON_EXEC_STATIC slock_t *ProcStructLock = NULL;
+
+/* Pointers to shared-memory structures */
+PROC_HDR *ProcGlobal = NULL;
+NON_EXEC_STATIC PGPROC *AuxiliaryProcs = NULL;
+PGPROC *PreparedXactProcs = NULL;
+
+/* If we are waiting for a lock, this points to the associated LOCALLOCK */
+static LOCALLOCK *lockAwaited = NULL;
+
+static DeadLockState deadlock_state = DS_NOT_YET_CHECKED;
+
+/* Is a deadlock check pending? */
+static volatile sig_atomic_t got_deadlock_timeout;
+
+static void RemoveProcFromArray(int code, Datum arg);
+static void ProcKill(int code, Datum arg);
+static void AuxiliaryProcKill(int code, Datum arg);
+static void CheckDeadLock(void);
+
+
+/*
+ * Report shared-memory space needed by InitProcGlobal.
+ */
+Size
+ProcGlobalShmemSize(void)
+{
+ Size size = 0;
+ Size TotalProcs =
+ add_size(MaxBackends, add_size(NUM_AUXILIARY_PROCS, max_prepared_xacts));
+
+ /* ProcGlobal */
+ size = add_size(size, sizeof(PROC_HDR));
+ size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC)));
+ size = add_size(size, sizeof(slock_t));
+
+ size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->xids)));
+ size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->subxidStates)));
+ size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->statusFlags)));
+
+ return size;
+}
+
+/*
+ * Report number of semaphores needed by InitProcGlobal.
+ */
+int
+ProcGlobalSemas(void)
+{
+ /*
+ * We need a sema per backend (including autovacuum), plus one for each
+ * auxiliary process.
+ */
+ return MaxBackends + NUM_AUXILIARY_PROCS;
+}
+
+/*
+ * InitProcGlobal -
+ * Initialize the global process table during postmaster or standalone
+ * backend startup.
+ *
+ * We also create all the per-process semaphores we will need to support
+ * the requested number of backends. We used to allocate semaphores
+ * only when backends were actually started up, but that is bad because
+ * it lets Postgres fail under load --- a lot of Unix systems are
+ * (mis)configured with small limits on the number of semaphores, and
+ * running out when trying to start another backend is a common failure.
+ * So, now we grab enough semaphores to support the desired max number
+ * of backends immediately at initialization --- if the sysadmin has set
+ * MaxConnections, max_worker_processes, max_wal_senders, or
+ * autovacuum_max_workers higher than his kernel will support, he'll
+ * find out sooner rather than later.
+ *
+ * Another reason for creating semaphores here is that the semaphore
+ * implementation typically requires us to create semaphores in the
+ * postmaster, not in backends.
+ *
+ * Note: this is NOT called by individual backends under a postmaster,
+ * not even in the EXEC_BACKEND case. The ProcGlobal and AuxiliaryProcs
+ * pointers must be propagated specially for EXEC_BACKEND operation.
+ */
+void
+InitProcGlobal(void)
+{
+ PGPROC *procs;
+ int i,
+ j;
+ bool found;
+ uint32 TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS + max_prepared_xacts;
+
+ /* Create the ProcGlobal shared structure */
+ ProcGlobal = (PROC_HDR *)
+ ShmemInitStruct("Proc Header", sizeof(PROC_HDR), &found);
+ Assert(!found);
+
+ /*
+ * Initialize the data structures.
+ */
+ ProcGlobal->spins_per_delay = DEFAULT_SPINS_PER_DELAY;
+ ProcGlobal->freeProcs = NULL;
+ ProcGlobal->autovacFreeProcs = NULL;
+ ProcGlobal->bgworkerFreeProcs = NULL;
+ ProcGlobal->walsenderFreeProcs = NULL;
+ ProcGlobal->startupProc = NULL;
+ ProcGlobal->startupProcPid = 0;
+ ProcGlobal->startupBufferPinWaitBufId = -1;
+ ProcGlobal->walwriterLatch = NULL;
+ ProcGlobal->checkpointerLatch = NULL;
+ pg_atomic_init_u32(&ProcGlobal->procArrayGroupFirst, INVALID_PGPROCNO);
+ pg_atomic_init_u32(&ProcGlobal->clogGroupFirst, INVALID_PGPROCNO);
+
+ /*
+ * Create and initialize all the PGPROC structures we'll need. There are
+ * five separate consumers: (1) normal backends, (2) autovacuum workers
+ * and the autovacuum launcher, (3) background workers, (4) auxiliary
+ * processes, and (5) prepared transactions. Each PGPROC structure is
+ * dedicated to exactly one of these purposes, and they do not move
+ * between groups.
+ */
+ procs = (PGPROC *) ShmemAlloc(TotalProcs * sizeof(PGPROC));
+ MemSet(procs, 0, TotalProcs * sizeof(PGPROC));
+ ProcGlobal->allProcs = procs;
+ /* XXX allProcCount isn't really all of them; it excludes prepared xacts */
+ ProcGlobal->allProcCount = MaxBackends + NUM_AUXILIARY_PROCS;
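+
+	/*
+	 * The resulting layout of allProcs is, in order: MaxConnections slots
+	 * for normal backends, autovacuum_max_workers + 1 slots for autovacuum
+	 * workers and the launcher, max_worker_processes slots for background
+	 * workers, and max_wal_senders slots for walsenders (these four groups
+	 * make up MaxBackends), then NUM_AUXILIARY_PROCS auxiliary slots, and
+	 * finally max_prepared_xacts slots for prepared transactions.
+	 */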
+
+ /*
+ * Allocate arrays mirroring PGPROC fields in a dense manner. See
+ * PROC_HDR.
+ *
+ * XXX: It might make sense to increase padding for these arrays, given
+ * how hotly they are accessed.
+ */
+ ProcGlobal->xids =
+ (TransactionId *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->xids));
+ MemSet(ProcGlobal->xids, 0, TotalProcs * sizeof(*ProcGlobal->xids));
+ ProcGlobal->subxidStates = (XidCacheStatus *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->subxidStates));
+ MemSet(ProcGlobal->subxidStates, 0, TotalProcs * sizeof(*ProcGlobal->subxidStates));
+ ProcGlobal->statusFlags = (uint8 *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->statusFlags));
+ MemSet(ProcGlobal->statusFlags, 0, TotalProcs * sizeof(*ProcGlobal->statusFlags));
+
+ for (i = 0; i < TotalProcs; i++)
+ {
+ /* Common initialization for all PGPROCs, regardless of type. */
+
+ /*
+ * Set up per-PGPROC semaphore, latch, and fpInfoLock. Prepared xact
+ * dummy PGPROCs don't need these though - they're never associated
+ * with a real process
+ */
+ if (i < MaxBackends + NUM_AUXILIARY_PROCS)
+ {
+ procs[i].sem = PGSemaphoreCreate();
+ InitSharedLatch(&(procs[i].procLatch));
+ LWLockInitialize(&(procs[i].fpInfoLock), LWTRANCHE_LOCK_FASTPATH);
+ }
+ procs[i].pgprocno = i;
+
+ /*
+ * Newly created PGPROCs for normal backends, autovacuum and bgworkers
+ * must be queued up on the appropriate free list. Because there can
+ * only ever be a small, fixed number of auxiliary processes, no free
+ * list is used in that case; InitAuxiliaryProcess() instead uses a
+ * linear search. PGPROCs for prepared transactions are added to a
+ * free list by TwoPhaseShmemInit().
+ */
+ if (i < MaxConnections)
+ {
+ /* PGPROC for normal backend, add to freeProcs list */
+ procs[i].links.next = (SHM_QUEUE *) ProcGlobal->freeProcs;
+ ProcGlobal->freeProcs = &procs[i];
+ procs[i].procgloballist = &ProcGlobal->freeProcs;
+ }
+ else if (i < MaxConnections + autovacuum_max_workers + 1)
+ {
+ /* PGPROC for AV launcher/worker, add to autovacFreeProcs list */
+ procs[i].links.next = (SHM_QUEUE *) ProcGlobal->autovacFreeProcs;
+ ProcGlobal->autovacFreeProcs = &procs[i];
+ procs[i].procgloballist = &ProcGlobal->autovacFreeProcs;
+ }
+ else if (i < MaxConnections + autovacuum_max_workers + 1 + max_worker_processes)
+ {
+ /* PGPROC for bgworker, add to bgworkerFreeProcs list */
+ procs[i].links.next = (SHM_QUEUE *) ProcGlobal->bgworkerFreeProcs;
+ ProcGlobal->bgworkerFreeProcs = &procs[i];
+ procs[i].procgloballist = &ProcGlobal->bgworkerFreeProcs;
+ }
+ else if (i < MaxBackends)
+ {
+ /* PGPROC for walsender, add to walsenderFreeProcs list */
+ procs[i].links.next = (SHM_QUEUE *) ProcGlobal->walsenderFreeProcs;
+ ProcGlobal->walsenderFreeProcs = &procs[i];
+ procs[i].procgloballist = &ProcGlobal->walsenderFreeProcs;
+ }
+
+ /* Initialize myProcLocks[] shared memory queues. */
+ for (j = 0; j < NUM_LOCK_PARTITIONS; j++)
+ SHMQueueInit(&(procs[i].myProcLocks[j]));
+
+ /* Initialize lockGroupMembers list. */
+ dlist_init(&procs[i].lockGroupMembers);
+
+ /*
+		 * Initialize the atomic variables; otherwise it won't be safe to
+ * access them for backends that aren't currently in use.
+ */
+ pg_atomic_init_u32(&(procs[i].procArrayGroupNext), INVALID_PGPROCNO);
+ pg_atomic_init_u32(&(procs[i].clogGroupNext), INVALID_PGPROCNO);
+ pg_atomic_init_u64(&(procs[i].waitStart), 0);
+ }
+
+ /*
+ * Save pointers to the blocks of PGPROC structures reserved for auxiliary
+ * processes and prepared transactions.
+ */
+ AuxiliaryProcs = &procs[MaxBackends];
+ PreparedXactProcs = &procs[MaxBackends + NUM_AUXILIARY_PROCS];
+
+ /* Create ProcStructLock spinlock, too */
+ ProcStructLock = (slock_t *) ShmemAlloc(sizeof(slock_t));
+ SpinLockInit(ProcStructLock);
+}
+
+/*
+ * InitProcess -- initialize a per-process data structure for this backend
+ */
+void
+InitProcess(void)
+{
+ PGPROC *volatile *procgloballist;
+
+ /*
+ * ProcGlobal should be set up already (if we are a backend, we inherit
+ * this by fork() or EXEC_BACKEND mechanism from the postmaster).
+ */
+ if (ProcGlobal == NULL)
+ elog(PANIC, "proc header uninitialized");
+
+ if (MyProc != NULL)
+ elog(ERROR, "you already exist");
+
+ /* Decide which list should supply our PGPROC. */
+ if (IsAnyAutoVacuumProcess())
+ procgloballist = &ProcGlobal->autovacFreeProcs;
+ else if (IsBackgroundWorker)
+ procgloballist = &ProcGlobal->bgworkerFreeProcs;
+ else if (am_walsender)
+ procgloballist = &ProcGlobal->walsenderFreeProcs;
+ else
+ procgloballist = &ProcGlobal->freeProcs;
+
+ /*
+ * Try to get a proc struct from the appropriate free list. If this
+ * fails, we must be out of PGPROC structures (not to mention semaphores).
+ *
+ * While we are holding the ProcStructLock, also copy the current shared
+ * estimate of spins_per_delay to local storage.
+ */
+ SpinLockAcquire(ProcStructLock);
+
+ set_spins_per_delay(ProcGlobal->spins_per_delay);
+
+ MyProc = *procgloballist;
+
+ if (MyProc != NULL)
+ {
+ *procgloballist = (PGPROC *) MyProc->links.next;
+ SpinLockRelease(ProcStructLock);
+ }
+ else
+ {
+ /*
+ * If we reach here, all the PGPROCs are in use. This is one of the
+ * possible places to detect "too many backends", so give the standard
+ * error message. XXX do we need to give a different failure message
+ * in the autovacuum case?
+ */
+ SpinLockRelease(ProcStructLock);
+ if (am_walsender)
+ ereport(FATAL,
+ (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+ errmsg("number of requested standby connections exceeds max_wal_senders (currently %d)",
+ max_wal_senders)));
+ ereport(FATAL,
+ (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+ errmsg("sorry, too many clients already")));
+ }
+
+ /*
+ * Cross-check that the PGPROC is of the type we expect; if this were not
+ * the case, it would get returned to the wrong list.
+ */
+ Assert(MyProc->procgloballist == procgloballist);
+
+ /*
+ * Now that we have a PGPROC, mark ourselves as an active postmaster
+ * child; this is so that the postmaster can detect it if we exit without
+ * cleaning up. (XXX autovac launcher currently doesn't participate in
+ * this; it probably should.)
+ */
+ if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess())
+ MarkPostmasterChildActive();
+
+ /*
+ * Initialize all fields of MyProc, except for those previously
+ * initialized by InitProcGlobal.
+ */
+ SHMQueueElemInit(&(MyProc->links));
+ MyProc->waitStatus = PROC_WAIT_STATUS_OK;
+ MyProc->lxid = InvalidLocalTransactionId;
+ MyProc->fpVXIDLock = false;
+ MyProc->fpLocalTransactionId = InvalidLocalTransactionId;
+ MyProc->xid = InvalidTransactionId;
+ MyProc->xmin = InvalidTransactionId;
+ MyProc->pid = MyProcPid;
+ /* backendId, databaseId and roleId will be filled in later */
+ MyProc->backendId = InvalidBackendId;
+ MyProc->databaseId = InvalidOid;
+ MyProc->roleId = InvalidOid;
+ MyProc->tempNamespaceId = InvalidOid;
+ MyProc->isBackgroundWorker = IsBackgroundWorker;
+ MyProc->delayChkpt = 0;
+ MyProc->statusFlags = 0;
+ /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
+ if (IsAutoVacuumWorkerProcess())
+ MyProc->statusFlags |= PROC_IS_AUTOVACUUM;
+ MyProc->lwWaiting = false;
+ MyProc->lwWaitMode = 0;
+ MyProc->waitLock = NULL;
+ MyProc->waitProcLock = NULL;
+ pg_atomic_write_u64(&MyProc->waitStart, 0);
+#ifdef USE_ASSERT_CHECKING
+ {
+ int i;
+
+ /* Last process should have released all locks. */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ Assert(SHMQueueEmpty(&(MyProc->myProcLocks[i])));
+ }
+#endif
+ MyProc->recoveryConflictPending = false;
+
+ /* Initialize fields for sync rep */
+ MyProc->waitLSN = 0;
+ MyProc->syncRepState = SYNC_REP_NOT_WAITING;
+ SHMQueueElemInit(&(MyProc->syncRepLinks));
+
+ /* Initialize fields for group XID clearing. */
+ MyProc->procArrayGroupMember = false;
+ MyProc->procArrayGroupMemberXid = InvalidTransactionId;
+ Assert(pg_atomic_read_u32(&MyProc->procArrayGroupNext) == INVALID_PGPROCNO);
+
+ /* Check that group locking fields are in a proper initial state. */
+ Assert(MyProc->lockGroupLeader == NULL);
+ Assert(dlist_is_empty(&MyProc->lockGroupMembers));
+
+ /* Initialize wait event information. */
+ MyProc->wait_event_info = 0;
+
+ /* Initialize fields for group transaction status update. */
+ MyProc->clogGroupMember = false;
+ MyProc->clogGroupMemberXid = InvalidTransactionId;
+ MyProc->clogGroupMemberXidStatus = TRANSACTION_STATUS_IN_PROGRESS;
+ MyProc->clogGroupMemberPage = -1;
+ MyProc->clogGroupMemberLsn = InvalidXLogRecPtr;
+ Assert(pg_atomic_read_u32(&MyProc->clogGroupNext) == INVALID_PGPROCNO);
+
+ /*
+ * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch
+ * on it. That allows us to repoint the process latch, which so far
+ * points to process local one, to the shared one.
+ */
+ OwnLatch(&MyProc->procLatch);
+ SwitchToSharedLatch();
+
+ /* now that we have a proc, report wait events to shared memory */
+ pgstat_set_wait_event_storage(&MyProc->wait_event_info);
+
+ /*
+ * We might be reusing a semaphore that belonged to a failed process. So
+ * be careful and reinitialize its value here. (This is not strictly
+ * necessary anymore, but seems like a good idea for cleanliness.)
+ */
+ PGSemaphoreReset(MyProc->sem);
+
+ /*
+ * Arrange to clean up at backend exit.
+ */
+ on_shmem_exit(ProcKill, 0);
+
+ /*
+ * Now that we have a PGPROC, we could try to acquire locks, so initialize
+ * local state needed for LWLocks, and the deadlock checker.
+ */
+ InitLWLockAccess();
+ InitDeadLockChecking();
+}
+
+/*
+ * InitProcessPhase2 -- make MyProc visible in the shared ProcArray.
+ *
+ * This is separate from InitProcess because we can't acquire LWLocks until
+ * we've created a PGPROC, but in the EXEC_BACKEND case ProcArrayAdd won't
+ * work until after we've done CreateSharedMemoryAndSemaphores.
+ */
+void
+InitProcessPhase2(void)
+{
+ Assert(MyProc != NULL);
+
+ /*
+ * Add our PGPROC to the PGPROC array in shared memory.
+ */
+ ProcArrayAdd(MyProc);
+
+ /*
+ * Arrange to clean that up at backend exit.
+ */
+ on_shmem_exit(RemoveProcFromArray, 0);
+}
+
+/*
+ * InitAuxiliaryProcess -- create a per-auxiliary-process data structure
+ *
+ * This is called by bgwriter and similar processes so that they will have a
+ * MyProc value that's real enough to let them wait for LWLocks. The PGPROC
+ * and sema that are assigned are one of the extra ones created during
+ * InitProcGlobal.
+ *
+ * Auxiliary processes are presently not expected to wait for real (lockmgr)
+ * locks, so we need not set up the deadlock checker. They are never added
+ * to the ProcArray or the sinval messaging mechanism, either. They also
+ * don't get a VXID assigned, since this is only useful when we actually
+ * hold lockmgr locks.
+ *
+ * The startup process, however, uses locks but never waits for them in the
+ * normal backend sense. It also takes part in sinval messaging as a sendOnly
+ * process, so it never reads messages from the sinval queue. Hence the
+ * startup process does have a VXID and does show up in pg_locks.
+ */
+void
+InitAuxiliaryProcess(void)
+{
+ PGPROC *auxproc;
+ int proctype;
+
+ /*
+ * ProcGlobal should be set up already (if we are a backend, we inherit
+ * this by fork() or EXEC_BACKEND mechanism from the postmaster).
+ */
+ if (ProcGlobal == NULL || AuxiliaryProcs == NULL)
+ elog(PANIC, "proc header uninitialized");
+
+ if (MyProc != NULL)
+ elog(ERROR, "you already exist");
+
+ /*
+ * We use the ProcStructLock to protect assignment and releasing of
+ * AuxiliaryProcs entries.
+ *
+ * While we are holding the ProcStructLock, also copy the current shared
+ * estimate of spins_per_delay to local storage.
+ */
+ SpinLockAcquire(ProcStructLock);
+
+ set_spins_per_delay(ProcGlobal->spins_per_delay);
+
+ /*
+ * Find a free auxproc ... *big* trouble if there isn't one ...
+ */
+ for (proctype = 0; proctype < NUM_AUXILIARY_PROCS; proctype++)
+ {
+ auxproc = &AuxiliaryProcs[proctype];
+ if (auxproc->pid == 0)
+ break;
+ }
+ if (proctype >= NUM_AUXILIARY_PROCS)
+ {
+ SpinLockRelease(ProcStructLock);
+ elog(FATAL, "all AuxiliaryProcs are in use");
+ }
+
+ /* Mark auxiliary proc as in use by me */
+ /* use volatile pointer to prevent code rearrangement */
+ ((volatile PGPROC *) auxproc)->pid = MyProcPid;
+
+ MyProc = auxproc;
+
+ SpinLockRelease(ProcStructLock);
+
+ /*
+ * Initialize all fields of MyProc, except for those previously
+ * initialized by InitProcGlobal.
+ */
+ SHMQueueElemInit(&(MyProc->links));
+ MyProc->waitStatus = PROC_WAIT_STATUS_OK;
+ MyProc->lxid = InvalidLocalTransactionId;
+ MyProc->fpVXIDLock = false;
+ MyProc->fpLocalTransactionId = InvalidLocalTransactionId;
+ MyProc->xid = InvalidTransactionId;
+ MyProc->xmin = InvalidTransactionId;
+ MyProc->backendId = InvalidBackendId;
+ MyProc->databaseId = InvalidOid;
+ MyProc->roleId = InvalidOid;
+ MyProc->tempNamespaceId = InvalidOid;
+ MyProc->isBackgroundWorker = IsBackgroundWorker;
+ MyProc->delayChkpt = 0;
+ MyProc->statusFlags = 0;
+ MyProc->lwWaiting = false;
+ MyProc->lwWaitMode = 0;
+ MyProc->waitLock = NULL;
+ MyProc->waitProcLock = NULL;
+ pg_atomic_write_u64(&MyProc->waitStart, 0);
+#ifdef USE_ASSERT_CHECKING
+ {
+ int i;
+
+ /* Last process should have released all locks. */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ Assert(SHMQueueEmpty(&(MyProc->myProcLocks[i])));
+ }
+#endif
+
+ /*
+ * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch
+ * on it. That allows us to repoint the process latch, which so far
+	 * points to the process-local one, to the shared one.
+ */
+ OwnLatch(&MyProc->procLatch);
+ SwitchToSharedLatch();
+
+ /* now that we have a proc, report wait events to shared memory */
+ pgstat_set_wait_event_storage(&MyProc->wait_event_info);
+
+ /* Check that group locking fields are in a proper initial state. */
+ Assert(MyProc->lockGroupLeader == NULL);
+ Assert(dlist_is_empty(&MyProc->lockGroupMembers));
+
+ /*
+ * We might be reusing a semaphore that belonged to a failed process. So
+ * be careful and reinitialize its value here. (This is not strictly
+ * necessary anymore, but seems like a good idea for cleanliness.)
+ */
+ PGSemaphoreReset(MyProc->sem);
+
+ /*
+ * Arrange to clean up at process exit.
+ */
+ on_shmem_exit(AuxiliaryProcKill, Int32GetDatum(proctype));
+}
+
+/*
+ * Record the PID and PGPROC structures for the Startup process, for use in
+ * ProcSendSignal(). See comments there for further explanation.
+ */
+void
+PublishStartupProcessInformation(void)
+{
+ SpinLockAcquire(ProcStructLock);
+
+ ProcGlobal->startupProc = MyProc;
+ ProcGlobal->startupProcPid = MyProcPid;
+
+ SpinLockRelease(ProcStructLock);
+}
+
+/*
+ * Used from bufmgr to share the value of the buffer that Startup waits on,
+ * or to reset the value to "not waiting" (-1). This allows processing
+ * of recovery conflicts for buffer pins. The value is set before backends
+ * look at it, so locking is not required, especially since the assignment
+ * is an atomic integer set operation.
+ */
+void
+SetStartupBufferPinWaitBufId(int bufid)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile PROC_HDR *procglobal = ProcGlobal;
+
+ procglobal->startupBufferPinWaitBufId = bufid;
+}
+
+/*
+ * Used by backends when they receive a request to check for buffer pin waits.
+ */
+int
+GetStartupBufferPinWaitBufId(void)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile PROC_HDR *procglobal = ProcGlobal;
+
+ return procglobal->startupBufferPinWaitBufId;
+}
+
+/*
+ * Check whether there are at least N free PGPROC objects.
+ *
+ * Note: this is designed on the assumption that N will generally be small.
+ */
+bool
+HaveNFreeProcs(int n)
+{
+ PGPROC *proc;
+
+ SpinLockAcquire(ProcStructLock);
+
+ proc = ProcGlobal->freeProcs;
+
+ while (n > 0 && proc != NULL)
+ {
+ proc = (PGPROC *) proc->links.next;
+ n--;
+ }
+
+ SpinLockRelease(ProcStructLock);
+
+ return (n <= 0);
+}
+
+/*
+ * Check if the current process is awaiting a lock.
+ */
+bool
+IsWaitingForLock(void)
+{
+ if (lockAwaited == NULL)
+ return false;
+
+ return true;
+}
+
+/*
+ * Cancel any pending wait for lock, when aborting a transaction, and revert
+ * any strong lock count acquisition for a lock being acquired.
+ *
+ * (Normally, this would only happen if we accept a cancel/die
+ * interrupt while waiting; but an ereport(ERROR) before or during the lock
+ * wait is within the realm of possibility, too.)
+ */
+void
+LockErrorCleanup(void)
+{
+ LWLock *partitionLock;
+ DisableTimeoutParams timeouts[2];
+
+ HOLD_INTERRUPTS();
+
+ AbortStrongLockAcquire();
+
+ /* Nothing to do if we weren't waiting for a lock */
+ if (lockAwaited == NULL)
+ {
+ RESUME_INTERRUPTS();
+ return;
+ }
+
+ /*
+ * Turn off the deadlock and lock timeout timers, if they are still
+ * running (see ProcSleep). Note we must preserve the LOCK_TIMEOUT
+ * indicator flag, since this function is executed before
+ * ProcessInterrupts when responding to SIGINT; else we'd lose the
+ * knowledge that the SIGINT came from a lock timeout and not an external
+ * source.
+ */
+ timeouts[0].id = DEADLOCK_TIMEOUT;
+ timeouts[0].keep_indicator = false;
+ timeouts[1].id = LOCK_TIMEOUT;
+ timeouts[1].keep_indicator = true;
+ disable_timeouts(timeouts, 2);
+
+ /* Unlink myself from the wait queue, if on it (might not be anymore!) */
+ partitionLock = LockHashPartitionLock(lockAwaited->hashcode);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ if (MyProc->links.next != NULL)
+ {
+ /* We could not have been granted the lock yet */
+ RemoveFromWaitQueue(MyProc, lockAwaited->hashcode);
+ }
+ else
+ {
+ /*
+ * Somebody kicked us off the lock queue already. Perhaps they
+ * granted us the lock, or perhaps they detected a deadlock. If they
+ * did grant us the lock, we'd better remember it in our local lock
+ * table.
+ */
+ if (MyProc->waitStatus == PROC_WAIT_STATUS_OK)
+ GrantAwaitedLock();
+ }
+
+ lockAwaited = NULL;
+
+ LWLockRelease(partitionLock);
+
+ RESUME_INTERRUPTS();
+}
+
+
+/*
+ * ProcReleaseLocks() -- release locks associated with current transaction
+ * at main transaction commit or abort
+ *
+ * At main transaction commit, we release standard locks except session locks.
+ * At main transaction abort, we release all locks including session locks.
+ *
+ * Advisory locks are released only if they are transaction-level;
+ * session-level holds remain, whether this is a commit or not.
+ *
+ * At subtransaction commit, we don't release any locks (so this func is not
+ * needed at all); we will defer the releasing to the parent transaction.
+ * At subtransaction abort, we release all locks held by the subtransaction;
+ * this is implemented by retail releasing of the locks under control of
+ * the ResourceOwner mechanism.
+ */
+void
+ProcReleaseLocks(bool isCommit)
+{
+ if (!MyProc)
+ return;
+ /* If waiting, get off wait queue (should only be needed after error) */
+ LockErrorCleanup();
+ /* Release standard locks, including session-level if aborting */
+ LockReleaseAll(DEFAULT_LOCKMETHOD, !isCommit);
+ /* Release transaction-level advisory locks */
+ LockReleaseAll(USER_LOCKMETHOD, false);
+}
+
+
+/*
+ * RemoveProcFromArray() -- Remove this process from the shared ProcArray.
+ */
+static void
+RemoveProcFromArray(int code, Datum arg)
+{
+ Assert(MyProc != NULL);
+ ProcArrayRemove(MyProc, InvalidTransactionId);
+}
+
+/*
+ * ProcKill() -- Destroy the per-proc data structure for
+ * this process. Release any of its held LW locks.
+ */
+static void
+ProcKill(int code, Datum arg)
+{
+ PGPROC *proc;
+ PGPROC *volatile *procgloballist;
+
+ Assert(MyProc != NULL);
+
+ /* Make sure we're out of the sync rep lists */
+ SyncRepCleanupAtProcExit();
+
+#ifdef USE_ASSERT_CHECKING
+ {
+ int i;
+
+ /* Last process should have released all locks. */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ Assert(SHMQueueEmpty(&(MyProc->myProcLocks[i])));
+ }
+#endif
+
+ /*
+ * Release any LW locks I am holding. There really shouldn't be any, but
+ * it's cheap to check again before we cut the knees off the LWLock
+ * facility by releasing our PGPROC ...
+ */
+ LWLockReleaseAll();
+
+ /* Cancel any pending condition variable sleep, too */
+ ConditionVariableCancelSleep();
+
+ /* Make sure active replication slots are released */
+ if (MyReplicationSlot != NULL)
+ ReplicationSlotRelease();
+
+ /* Also cleanup all the temporary slots. */
+ ReplicationSlotCleanup();
+
+ /*
+ * Detach from any lock group of which we are a member. If the leader
+	 * exits before all other group members, its PGPROC will remain allocated
+ * until the last group process exits; that process must return the
+ * leader's PGPROC to the appropriate list.
+ */
+ if (MyProc->lockGroupLeader != NULL)
+ {
+ PGPROC *leader = MyProc->lockGroupLeader;
+ LWLock *leader_lwlock = LockHashPartitionLockByProc(leader);
+
+ LWLockAcquire(leader_lwlock, LW_EXCLUSIVE);
+ Assert(!dlist_is_empty(&leader->lockGroupMembers));
+ dlist_delete(&MyProc->lockGroupLink);
+ if (dlist_is_empty(&leader->lockGroupMembers))
+ {
+ leader->lockGroupLeader = NULL;
+ if (leader != MyProc)
+ {
+ procgloballist = leader->procgloballist;
+
+ /* Leader exited first; return its PGPROC. */
+ SpinLockAcquire(ProcStructLock);
+ leader->links.next = (SHM_QUEUE *) *procgloballist;
+ *procgloballist = leader;
+ SpinLockRelease(ProcStructLock);
+ }
+ }
+ else if (leader != MyProc)
+ MyProc->lockGroupLeader = NULL;
+ LWLockRelease(leader_lwlock);
+ }
+
+ /*
+ * Reset MyLatch to the process local one. This is so that signal
+ * handlers et al can continue using the latch after the shared latch
+ * isn't ours anymore.
+ *
+ * Similarly, stop reporting wait events to MyProc->wait_event_info.
+ *
+ * After that clear MyProc and disown the shared latch.
+ */
+ SwitchBackToLocalLatch();
+ pgstat_reset_wait_event_storage();
+
+ proc = MyProc;
+ MyProc = NULL;
+ DisownLatch(&proc->procLatch);
+
+ procgloballist = proc->procgloballist;
+ SpinLockAcquire(ProcStructLock);
+
+ /*
+ * If we're still a member of a locking group, that means we're a leader
+ * which has somehow exited before its children. The last remaining child
+ * will release our PGPROC. Otherwise, release it now.
+ */
+ if (proc->lockGroupLeader == NULL)
+ {
+ /* Since lockGroupLeader is NULL, lockGroupMembers should be empty. */
+ Assert(dlist_is_empty(&proc->lockGroupMembers));
+
+ /* Return PGPROC structure (and semaphore) to appropriate freelist */
+ proc->links.next = (SHM_QUEUE *) *procgloballist;
+ *procgloballist = proc;
+ }
+
+ /* Update shared estimate of spins_per_delay */
+ ProcGlobal->spins_per_delay = update_spins_per_delay(ProcGlobal->spins_per_delay);
+
+ SpinLockRelease(ProcStructLock);
+
+ /*
+ * This process is no longer present in shared memory in any meaningful
+ * way, so tell the postmaster we've cleaned up acceptably well. (XXX
+ * autovac launcher should be included here someday)
+ */
+ if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess())
+ MarkPostmasterChildInactive();
+
+ /* wake autovac launcher if needed -- see comments in FreeWorkerInfo */
+ if (AutovacuumLauncherPid != 0)
+ kill(AutovacuumLauncherPid, SIGUSR2);
+}
+
+/*
+ * AuxiliaryProcKill() -- Cut-down version of ProcKill for auxiliary
+ * processes (bgwriter, etc). The PGPROC and sema are not released, only
+ * marked as not-in-use.
+ */
+static void
+AuxiliaryProcKill(int code, Datum arg)
+{
+ int proctype = DatumGetInt32(arg);
+ PGPROC *auxproc PG_USED_FOR_ASSERTS_ONLY;
+ PGPROC *proc;
+
+ Assert(proctype >= 0 && proctype < NUM_AUXILIARY_PROCS);
+
+ auxproc = &AuxiliaryProcs[proctype];
+
+ Assert(MyProc == auxproc);
+
+ /* Release any LW locks I am holding (see notes above) */
+ LWLockReleaseAll();
+
+ /* Cancel any pending condition variable sleep, too */
+ ConditionVariableCancelSleep();
+
+ /* look at the equivalent ProcKill() code for comments */
+ SwitchBackToLocalLatch();
+ pgstat_reset_wait_event_storage();
+
+ proc = MyProc;
+ MyProc = NULL;
+ DisownLatch(&proc->procLatch);
+
+ SpinLockAcquire(ProcStructLock);
+
+ /* Mark auxiliary proc no longer in use */
+ proc->pid = 0;
+
+ /* Update shared estimate of spins_per_delay */
+ ProcGlobal->spins_per_delay = update_spins_per_delay(ProcGlobal->spins_per_delay);
+
+ SpinLockRelease(ProcStructLock);
+}
+
+/*
+ * AuxiliaryPidGetProc -- get PGPROC for an auxiliary process
+ * given its PID
+ *
+ * Returns NULL if not found.
+ */
+PGPROC *
+AuxiliaryPidGetProc(int pid)
+{
+ PGPROC *result = NULL;
+ int index;
+
+ if (pid == 0) /* never match dummy PGPROCs */
+ return NULL;
+
+ for (index = 0; index < NUM_AUXILIARY_PROCS; index++)
+ {
+ PGPROC *proc = &AuxiliaryProcs[index];
+
+ if (proc->pid == pid)
+ {
+ result = proc;
+ break;
+ }
+ }
+ return result;
+}
+
+/*
+ * ProcQueue package: routines for putting processes to sleep
+ * and waking them up
+ */
+
+/*
+ * ProcQueueAlloc -- alloc/attach to a shared memory process queue
+ *
+ * Returns: a pointer to the queue
+ * Side Effects: Initializes the queue if it wasn't there before
+ */
+#ifdef NOT_USED
+PROC_QUEUE *
+ProcQueueAlloc(const char *name)
+{
+ PROC_QUEUE *queue;
+ bool found;
+
+ queue = (PROC_QUEUE *)
+ ShmemInitStruct(name, sizeof(PROC_QUEUE), &found);
+
+ if (!found)
+ ProcQueueInit(queue);
+
+ return queue;
+}
+#endif
+
+/*
+ * ProcQueueInit -- initialize a shared memory process queue
+ */
+void
+ProcQueueInit(PROC_QUEUE *queue)
+{
+ SHMQueueInit(&(queue->links));
+ queue->size = 0;
+}
+
+
+/*
+ * ProcSleep -- put a process to sleep on the specified lock
+ *
+ * Caller must have set MyProc->heldLocks to reflect locks already held
+ * on the lockable object by this process (under all XIDs).
+ *
+ * The lock table's partition lock must be held at entry, and will be held
+ * at exit.
+ *
+ * Result: PROC_WAIT_STATUS_OK if we acquired the lock, PROC_WAIT_STATUS_ERROR if not (deadlock).
+ *
+ * ASSUME: that no one will fiddle with the queue until after
+ * we release the partition lock.
+ *
+ * NOTES: The process queue is now a priority queue for locking.
+ */
+ProcWaitStatus
+ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
+{
+ LOCKMODE lockmode = locallock->tag.mode;
+ LOCK *lock = locallock->lock;
+ PROCLOCK *proclock = locallock->proclock;
+ uint32 hashcode = locallock->hashcode;
+ LWLock *partitionLock = LockHashPartitionLock(hashcode);
+ PROC_QUEUE *waitQueue = &(lock->waitProcs);
+ LOCKMASK myHeldLocks = MyProc->heldLocks;
+ TimestampTz standbyWaitStart = 0;
+ bool early_deadlock = false;
+ bool allow_autovacuum_cancel = true;
+ bool logged_recovery_conflict = false;
+ ProcWaitStatus myWaitStatus;
+ PGPROC *proc;
+ PGPROC *leader = MyProc->lockGroupLeader;
+ int i;
+
+ /*
+ * If group locking is in use, locks held by members of my locking group
+ * need to be included in myHeldLocks. This is not required for relation
+ * extension or page locks which conflict among group members. However,
+ * including them in myHeldLocks gives group members priority for
+ * acquiring those locks over other backends that are also trying to
+ * acquire them. OTOH, we could avoid giving group members priority for
+ * that kind of lock, but there doesn't appear to be any clear advantage
+ * in doing so.
+ */
+ if (leader != NULL)
+ {
+ SHM_QUEUE *procLocks = &(lock->procLocks);
+ PROCLOCK *otherproclock;
+
+ otherproclock = (PROCLOCK *)
+ SHMQueueNext(procLocks, procLocks, offsetof(PROCLOCK, lockLink));
+ while (otherproclock != NULL)
+ {
+ if (otherproclock->groupLeader == leader)
+ myHeldLocks |= otherproclock->holdMask;
+ otherproclock = (PROCLOCK *)
+ SHMQueueNext(procLocks, &otherproclock->lockLink,
+ offsetof(PROCLOCK, lockLink));
+ }
+ }
+
+ /*
+ * Determine where to add myself in the wait queue.
+ *
+ * Normally I should go at the end of the queue. However, if I already
+ * hold locks that conflict with the request of any previous waiter, put
+ * myself in the queue just in front of the first such waiter. This is not
+ * a necessary step, since deadlock detection would move me to before that
+ * waiter anyway; but it's relatively cheap to detect such a conflict
+ * immediately, and avoid delaying till deadlock timeout.
+ *
+ * Special case: if I find I should go in front of some waiter, check to
+ * see if I conflict with already-held locks or the requests before that
+ * waiter. If not, then just grant myself the requested lock immediately.
+ * This is the same as the test for immediate grant in LockAcquire, except
+ * we are only considering the part of the wait queue before my insertion
+ * point.
+ */
+ if (myHeldLocks != 0)
+ {
+ LOCKMASK aheadRequests = 0;
+
+ proc = (PGPROC *) waitQueue->links.next;
+ for (i = 0; i < waitQueue->size; i++)
+ {
+ /*
+ * If we're part of the same locking group as this waiter, its
+ * locks neither conflict with ours nor contribute to
+ * aheadRequests.
+ */
+ if (leader != NULL && leader == proc->lockGroupLeader)
+ {
+ proc = (PGPROC *) proc->links.next;
+ continue;
+ }
+ /* Must he wait for me? */
+ if (lockMethodTable->conflictTab[proc->waitLockMode] & myHeldLocks)
+ {
+ /* Must I wait for him? */
+ if (lockMethodTable->conflictTab[lockmode] & proc->heldLocks)
+ {
+ /*
+ * Yes, so we have a deadlock. Easiest way to clean up
+ * correctly is to call RemoveFromWaitQueue(), but we
+ * can't do that until we are *on* the wait queue. So, set
+ * a flag to check below, and break out of loop. Also,
+ * record deadlock info for later message.
+ */
+ RememberSimpleDeadLock(MyProc, lockmode, lock, proc);
+ early_deadlock = true;
+ break;
+ }
+ /* I must go before this waiter. Check special case. */
+ if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 &&
+ !LockCheckConflicts(lockMethodTable, lockmode, lock,
+ proclock))
+ {
+ /* Skip the wait and just grant myself the lock. */
+ GrantLock(lock, proclock, lockmode);
+ GrantAwaitedLock();
+ return PROC_WAIT_STATUS_OK;
+ }
+ /* Break out of loop to put myself before him */
+ break;
+ }
+ /* Nope, so advance to next waiter */
+ aheadRequests |= LOCKBIT_ON(proc->waitLockMode);
+ proc = (PGPROC *) proc->links.next;
+ }
+
+ /*
+ * If we fall out of loop normally, proc points to waitQueue head, so
+ * we will insert at tail of queue as desired.
+ */
+ }
+ else
+ {
+ /* I hold no locks, so I can't push in front of anyone. */
+ proc = (PGPROC *) &(waitQueue->links);
+ }
+
+ /*
+ * Insert self into queue, ahead of the given proc (or at tail of queue).
+ */
+ SHMQueueInsertBefore(&(proc->links), &(MyProc->links));
+ waitQueue->size++;
+
+ lock->waitMask |= LOCKBIT_ON(lockmode);
+
+ /* Set up wait information in PGPROC object, too */
+ MyProc->waitLock = lock;
+ MyProc->waitProcLock = proclock;
+ MyProc->waitLockMode = lockmode;
+
+ MyProc->waitStatus = PROC_WAIT_STATUS_WAITING;
+
+ /*
+ * If we detected deadlock, give up without waiting. This must agree with
+ * CheckDeadLock's recovery code.
+ */
+ if (early_deadlock)
+ {
+ RemoveFromWaitQueue(MyProc, hashcode);
+ return PROC_WAIT_STATUS_ERROR;
+ }
+
+ /* mark that we are waiting for a lock */
+ lockAwaited = locallock;
+
+ /*
+ * Release the lock table's partition lock.
+ *
+ * NOTE: this may also cause us to exit critical-section state, possibly
+ * allowing a cancel/die interrupt to be accepted. This is OK because we
+ * have recorded the fact that we are waiting for a lock, and so
+ * LockErrorCleanup will clean up if cancel/die happens.
+ */
+ LWLockRelease(partitionLock);
+
+ /*
+ * Also, now that we will successfully clean up after an ereport, it's
+ * safe to check to see if there's a buffer pin deadlock against the
+ * Startup process. Of course, that's only necessary if we're doing Hot
+ * Standby and are not the Startup process ourselves.
+ */
+ if (RecoveryInProgress() && !InRecovery)
+ CheckRecoveryConflictDeadlock();
+
+ /* Reset deadlock_state before enabling the timeout handler */
+ deadlock_state = DS_NOT_YET_CHECKED;
+ got_deadlock_timeout = false;
+
+ /*
+ * Set timer so we can wake up after awhile and check for a deadlock. If a
+ * deadlock is detected, the handler sets MyProc->waitStatus =
+ * PROC_WAIT_STATUS_ERROR, allowing us to know that we must report failure
+ * rather than success.
+ *
+ * By delaying the check until we've waited for a bit, we can avoid
+ * running the rather expensive deadlock-check code in most cases.
+ *
+ * If LockTimeout is set, also enable the timeout for that. We can save a
+ * few cycles by enabling both timeout sources in one call.
+ *
+ * If InHotStandby we set lock waits slightly later for clarity with other
+ * code.
+ */
+ if (!InHotStandby)
+ {
+ if (LockTimeout > 0)
+ {
+ EnableTimeoutParams timeouts[2];
+
+ timeouts[0].id = DEADLOCK_TIMEOUT;
+ timeouts[0].type = TMPARAM_AFTER;
+ timeouts[0].delay_ms = DeadlockTimeout;
+ timeouts[1].id = LOCK_TIMEOUT;
+ timeouts[1].type = TMPARAM_AFTER;
+ timeouts[1].delay_ms = LockTimeout;
+ enable_timeouts(timeouts, 2);
+ }
+ else
+ enable_timeout_after(DEADLOCK_TIMEOUT, DeadlockTimeout);
+
+ /*
+ * Use the current time obtained for the deadlock timeout timer as
+ * waitStart (i.e., the time when this process started waiting for the
+ * lock). Obtaining the current time again would add overhead, so we
+ * reuse the timestamp already obtained for the timer.
+ *
+ * Note that waitStart is updated without holding the lock table's
+ * partition lock, to avoid the overhead by additional lock
+ * acquisition. This can cause "waitstart" in pg_locks to become NULL
+ * for a very short period of time after the wait started even though
+ * "granted" is false. This is OK in practice because we can assume
+ * that users are likely to look at "waitstart" when waiting for the
+ * lock for a long time.
+ */
+ pg_atomic_write_u64(&MyProc->waitStart,
+ get_timeout_start_time(DEADLOCK_TIMEOUT));
+ }
+ else if (log_recovery_conflict_waits)
+ {
+ /*
+ * Set the wait start timestamp if logging is enabled and in hot
+ * standby.
+ */
+ standbyWaitStart = GetCurrentTimestamp();
+ }
+
+ /*
+ * If somebody wakes us between LWLockRelease and WaitLatch, the latch
+ * will not wait. But a set latch does not necessarily mean that the lock
+ * is free now, as there are many other sources for latch sets than
+ * somebody releasing the lock.
+ *
+ * We process interrupts whenever the latch has been set, so cancel/die
+ * interrupts are processed quickly. This means we must not mind losing
+ * control to a cancel/die interrupt here. We don't, because we have no
+ * shared-state-change work to do after being granted the lock (the
+ * grantor did it all). We do have to worry about canceling the deadlock
+ * timeout and updating the locallock table, but if we lose control to an
+ * error, LockErrorCleanup will fix that up.
+ */
+ do
+ {
+ if (InHotStandby)
+ {
+ bool maybe_log_conflict =
+ (standbyWaitStart != 0 && !logged_recovery_conflict);
+
+ /* Set a timer and wait for that or for the lock to be granted */
+ ResolveRecoveryConflictWithLock(locallock->tag.lock,
+ maybe_log_conflict);
+
+ /*
+ * Emit the log message if the startup process is waiting longer
+ * than deadlock_timeout for recovery conflict on lock.
+ */
+ if (maybe_log_conflict)
+ {
+ TimestampTz now = GetCurrentTimestamp();
+
+ if (TimestampDifferenceExceeds(standbyWaitStart, now,
+ DeadlockTimeout))
+ {
+ VirtualTransactionId *vxids;
+ int cnt;
+
+ vxids = GetLockConflicts(&locallock->tag.lock,
+ AccessExclusiveLock, &cnt);
+
+ /*
+ * Log the recovery conflict and the list of PIDs of
+ * backends holding the conflicting lock. Note that we do
+ * logging even if there are no such backends right now
+ * because the startup process here has already waited
+ * longer than deadlock_timeout.
+ */
+ LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK,
+ standbyWaitStart, now,
+ cnt > 0 ? vxids : NULL, true);
+ logged_recovery_conflict = true;
+ }
+ }
+ }
+ else
+ {
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+ PG_WAIT_LOCK | locallock->tag.lock.locktag_type);
+ ResetLatch(MyLatch);
+ /* check for deadlocks first, as that's probably log-worthy */
+ if (got_deadlock_timeout)
+ {
+ CheckDeadLock();
+ got_deadlock_timeout = false;
+ }
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * waitStatus could change from PROC_WAIT_STATUS_WAITING to something
+ * else asynchronously. Read it just once per loop to prevent
+ * surprising behavior (such as missing log messages).
+ */
+ myWaitStatus = *((volatile ProcWaitStatus *) &MyProc->waitStatus);
+
+ /*
+ * If we are not deadlocked, but are waiting on an autovacuum-induced
+ * task, send a signal to interrupt it.
+ */
+ if (deadlock_state == DS_BLOCKED_BY_AUTOVACUUM && allow_autovacuum_cancel)
+ {
+ PGPROC *autovac = GetBlockingAutoVacuumPgproc();
+ uint8 statusFlags;
+ uint8 lockmethod_copy;
+ LOCKTAG locktag_copy;
+
+ /*
+ * Grab info we need, then release lock immediately. Note this
+ * coding means that there is a tiny chance that the process
+ * terminates its current transaction and starts a different one
+ * before we have a chance to send the signal; the worst possible
+ * consequence is that a for-wraparound vacuum is cancelled. But
+ * that could happen in any case unless we were to do kill() with
+ * the lock held, which is much more undesirable.
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ statusFlags = ProcGlobal->statusFlags[autovac->pgxactoff];
+ lockmethod_copy = lock->tag.locktag_lockmethodid;
+ locktag_copy = lock->tag;
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * Only do it if the worker is not working to protect against Xid
+ * wraparound.
+ */
+ if ((statusFlags & PROC_IS_AUTOVACUUM) &&
+ !(statusFlags & PROC_VACUUM_FOR_WRAPAROUND))
+ {
+ int pid = autovac->pid;
+
+ /* report the case, if configured to do so */
+ if (message_level_is_interesting(DEBUG1))
+ {
+ StringInfoData locktagbuf;
+ StringInfoData logbuf; /* errdetail for server log */
+
+ initStringInfo(&locktagbuf);
+ initStringInfo(&logbuf);
+ DescribeLockTag(&locktagbuf, &locktag_copy);
+ appendStringInfo(&logbuf,
+ "Process %d waits for %s on %s.",
+ MyProcPid,
+ GetLockmodeName(lockmethod_copy, lockmode),
+ locktagbuf.data);
+
+ ereport(DEBUG1,
+ (errmsg_internal("sending cancel to blocking autovacuum PID %d",
+ pid),
+ errdetail_log("%s", logbuf.data)));
+
+ pfree(locktagbuf.data);
+ pfree(logbuf.data);
+ }
+
+ /* send the autovacuum worker Back to Old Kent Road */
+ if (kill(pid, SIGINT) < 0)
+ {
+ /*
+ * There's a race condition here: once we release the
+ * ProcArrayLock, it's possible for the autovac worker to
+ * close up shop and exit before we can do the kill().
+ * Therefore, we do not whinge about no-such-process.
+ * Other errors such as EPERM could conceivably happen if
+ * the kernel recycles the PID fast enough, but such cases
+ * seem improbable enough that it's probably best to issue
+ * a warning if we see some other errno.
+ */
+ if (errno != ESRCH)
+ ereport(WARNING,
+ (errmsg("could not send signal to process %d: %m",
+ pid)));
+ }
+ }
+
+ /* prevent the signal from being sent more than once */
+ allow_autovacuum_cancel = false;
+ }
+
+ /*
+ * If awoken after the deadlock check interrupt has run, and
+ * log_lock_waits is on, then report about the wait.
+ */
+ if (log_lock_waits && deadlock_state != DS_NOT_YET_CHECKED)
+ {
+ StringInfoData buf,
+ lock_waiters_sbuf,
+ lock_holders_sbuf;
+ const char *modename;
+ long secs;
+ int usecs;
+ long msecs;
+ SHM_QUEUE *procLocks;
+ PROCLOCK *proclock;
+ bool first_holder = true,
+ first_waiter = true;
+ int lockHoldersNum = 0;
+
+ initStringInfo(&buf);
+ initStringInfo(&lock_waiters_sbuf);
+ initStringInfo(&lock_holders_sbuf);
+
+ DescribeLockTag(&buf, &locallock->tag.lock);
+ modename = GetLockmodeName(locallock->tag.lock.locktag_lockmethodid,
+ lockmode);
+ TimestampDifference(get_timeout_start_time(DEADLOCK_TIMEOUT),
+ GetCurrentTimestamp(),
+ &secs, &usecs);
+ msecs = secs * 1000 + usecs / 1000;
+ usecs = usecs % 1000;
+
+ /*
+ * We loop over the lock's procLocks to gather a list of all
+ * holders and waiters, so that we can provide more detailed
+ * information for lock debugging purposes.
+ *
+ * lock->procLocks contains all processes which hold or wait for
+ * this lock.
+ */
+
+ LWLockAcquire(partitionLock, LW_SHARED);
+
+ procLocks = &(lock->procLocks);
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, lockLink));
+
+ while (proclock)
+ {
+ /*
+ * we are a waiter if myProc->waitProcLock == proclock; we are
+ * a holder if it is NULL or something different
+ */
+ if (proclock->tag.myProc->waitProcLock == proclock)
+ {
+ if (first_waiter)
+ {
+ appendStringInfo(&lock_waiters_sbuf, "%d",
+ proclock->tag.myProc->pid);
+ first_waiter = false;
+ }
+ else
+ appendStringInfo(&lock_waiters_sbuf, ", %d",
+ proclock->tag.myProc->pid);
+ }
+ else
+ {
+ if (first_holder)
+ {
+ appendStringInfo(&lock_holders_sbuf, "%d",
+ proclock->tag.myProc->pid);
+ first_holder = false;
+ }
+ else
+ appendStringInfo(&lock_holders_sbuf, ", %d",
+ proclock->tag.myProc->pid);
+
+ lockHoldersNum++;
+ }
+
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink,
+ offsetof(PROCLOCK, lockLink));
+ }
+
+ LWLockRelease(partitionLock);
+
+ if (deadlock_state == DS_SOFT_DEADLOCK)
+ ereport(LOG,
+ (errmsg("process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms",
+ MyProcPid, modename, buf.data, msecs, usecs),
+ (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+ "Processes holding the lock: %s. Wait queue: %s.",
+ lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+ else if (deadlock_state == DS_HARD_DEADLOCK)
+ {
+ /*
+ * This message is a bit redundant with the error that will be
+ * reported subsequently, but in some cases the error report
+ * might not make it to the log (eg, if it's caught by an
+ * exception handler), and we want to ensure all long-wait
+ * events get logged.
+ */
+ ereport(LOG,
+ (errmsg("process %d detected deadlock while waiting for %s on %s after %ld.%03d ms",
+ MyProcPid, modename, buf.data, msecs, usecs),
+ (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+ "Processes holding the lock: %s. Wait queue: %s.",
+ lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+ }
+
+ if (myWaitStatus == PROC_WAIT_STATUS_WAITING)
+ ereport(LOG,
+ (errmsg("process %d still waiting for %s on %s after %ld.%03d ms",
+ MyProcPid, modename, buf.data, msecs, usecs),
+ (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+ "Processes holding the lock: %s. Wait queue: %s.",
+ lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+ else if (myWaitStatus == PROC_WAIT_STATUS_OK)
+ ereport(LOG,
+ (errmsg("process %d acquired %s on %s after %ld.%03d ms",
+ MyProcPid, modename, buf.data, msecs, usecs)));
+ else
+ {
+ Assert(myWaitStatus == PROC_WAIT_STATUS_ERROR);
+
+ /*
+ * Currently, the deadlock checker always kicks its own
+ * process, which means that we'll only see
+ * PROC_WAIT_STATUS_ERROR when deadlock_state ==
+ * DS_HARD_DEADLOCK, and there's no need to print redundant
+ * messages. But for completeness and future-proofing, print
+ * a message if it looks like someone else kicked us off the
+ * lock.
+ */
+ if (deadlock_state != DS_HARD_DEADLOCK)
+ ereport(LOG,
+ (errmsg("process %d failed to acquire %s on %s after %ld.%03d ms",
+ MyProcPid, modename, buf.data, msecs, usecs),
+ (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+ "Processes holding the lock: %s. Wait queue: %s.",
+ lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+ }
+
+ /*
+ * At this point we might still need to wait for the lock. Reset
+ * state so we don't print the above messages again.
+ */
+ deadlock_state = DS_NO_DEADLOCK;
+
+ pfree(buf.data);
+ pfree(lock_holders_sbuf.data);
+ pfree(lock_waiters_sbuf.data);
+ }
+ } while (myWaitStatus == PROC_WAIT_STATUS_WAITING);
+
+ /*
+ * Disable the timers, if they are still running. As in LockErrorCleanup,
+ * we must preserve the LOCK_TIMEOUT indicator flag: if a lock timeout has
+ * already caused QueryCancelPending to become set, we want the cancel to
+ * be reported as a lock timeout, not a user cancel.
+ */
+ if (!InHotStandby)
+ {
+ if (LockTimeout > 0)
+ {
+ DisableTimeoutParams timeouts[2];
+
+ timeouts[0].id = DEADLOCK_TIMEOUT;
+ timeouts[0].keep_indicator = false;
+ timeouts[1].id = LOCK_TIMEOUT;
+ timeouts[1].keep_indicator = true;
+ disable_timeouts(timeouts, 2);
+ }
+ else
+ disable_timeout(DEADLOCK_TIMEOUT, false);
+ }
+
+ /*
+ * Emit the log message if recovery conflict on lock was resolved but the
+ * startup process waited longer than deadlock_timeout for it.
+ */
+ if (InHotStandby && logged_recovery_conflict)
+ LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK,
+ standbyWaitStart, GetCurrentTimestamp(),
+ NULL, false);
+
+ /*
+ * Re-acquire the lock table's partition lock. We have to do this to hold
+ * off cancel/die interrupts before we can mess with lockAwaited (else we
+ * might have a missed or duplicated locallock update).
+ */
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /*
+ * We no longer want LockErrorCleanup to do anything.
+ */
+ lockAwaited = NULL;
+
+ /*
+ * If we got the lock, be sure to remember it in the locallock table.
+ */
+ if (MyProc->waitStatus == PROC_WAIT_STATUS_OK)
+ GrantAwaitedLock();
+
+ /*
+ * We don't have to do anything else, because the awaker did all the
+ * necessary update of the lock table and MyProc.
+ */
+ return MyProc->waitStatus;
+}
+
+
+/*
+ * ProcWakeup -- wake up a process by setting its latch.
+ *
+ * Also remove the process from the wait queue and set its links invalid.
+ * RETURN: the next process in the wait queue.
+ *
+ * The appropriate lock partition lock must be held by caller.
+ *
+ * XXX: presently, this code is only used for the "success" case, and only
+ * works correctly for that case. To clean up in failure case, would need
+ * to twiddle the lock's request counts too --- see RemoveFromWaitQueue.
+ * Hence, in practice the waitStatus parameter must be PROC_WAIT_STATUS_OK.
+ */
+PGPROC *
+ProcWakeup(PGPROC *proc, ProcWaitStatus waitStatus)
+{
+ PGPROC *retProc;
+
+ /* Proc should be sleeping ... */
+ if (proc->links.prev == NULL ||
+ proc->links.next == NULL)
+ return NULL;
+ Assert(proc->waitStatus == PROC_WAIT_STATUS_WAITING);
+
+ /* Save next process before we zap the list link */
+ retProc = (PGPROC *) proc->links.next;
+
+ /* Remove process from wait queue */
+ SHMQueueDelete(&(proc->links));
+ (proc->waitLock->waitProcs.size)--;
+
+ /* Clean up process' state and pass it the ok/fail signal */
+ proc->waitLock = NULL;
+ proc->waitProcLock = NULL;
+ proc->waitStatus = waitStatus;
+ pg_atomic_write_u64(&MyProc->waitStart, 0);
+
+ /* And awaken it */
+ SetLatch(&proc->procLatch);
+
+ return retProc;
+}
+
+/*
+ * ProcLockWakeup -- routine for waking up processes when a lock is
+ * released (or a prior waiter is aborted). Scan all waiters
+ * for lock, waken any that are no longer blocked.
+ *
+ * The appropriate lock partition lock must be held by caller.
+ */
+void
+ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock)
+{
+ PROC_QUEUE *waitQueue = &(lock->waitProcs);
+ int queue_size = waitQueue->size;
+ PGPROC *proc;
+ LOCKMASK aheadRequests = 0;
+
+ Assert(queue_size >= 0);
+
+ if (queue_size == 0)
+ return;
+
+ proc = (PGPROC *) waitQueue->links.next;
+
+ while (queue_size-- > 0)
+ {
+ LOCKMODE lockmode = proc->waitLockMode;
+
+ /*
+ * Waken if (a) doesn't conflict with requests of earlier waiters, and
+ * (b) doesn't conflict with already-held locks.
+ */
+ if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 &&
+ !LockCheckConflicts(lockMethodTable, lockmode, lock,
+ proc->waitProcLock))
+ {
+ /* OK to waken */
+ GrantLock(lock, proc->waitProcLock, lockmode);
+ proc = ProcWakeup(proc, PROC_WAIT_STATUS_OK);
+
+ /*
+ * ProcWakeup removes proc from the lock's waiting process queue
+ * and returns the next proc in chain; don't use proc's next-link,
+ * because it's been cleared.
+ */
+ }
+ else
+ {
+ /*
+ * Cannot wake this guy. Remember his request for later checks.
+ */
+ aheadRequests |= LOCKBIT_ON(lockmode);
+ proc = (PGPROC *) proc->links.next;
+ }
+ }
+
+ Assert(waitQueue->size >= 0);
+}
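+
+/*
+ * Worked example of the aheadRequests rule above (a hedged sketch using the
+ * default lock method's conflict table): suppose backends A and D hold
+ * AccessShareLock, B waits for AccessExclusiveLock, and C waits behind B
+ * for AccessShareLock.  When D releases, B cannot be woken because its
+ * request still conflicts with A's held lock, so B's mode goes into
+ * aheadRequests; C is then not woken either, because AccessShareLock
+ * conflicts with the queued AccessExclusiveLock request.  This keeps
+ * later-arriving compatible requests from starving an earlier exclusive
+ * waiter.
+ */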
+
+/*
+ * CheckDeadLock
+ *
+ * We only get to this routine if DEADLOCK_TIMEOUT fired while waiting for a
+ * lock to be released by some other process. Check if there's a deadlock; if
+ * not, just return. (But signal ProcSleep to log a message, if
+ * log_lock_waits is true.) If we have a real deadlock, remove ourselves from
+ * the lock's wait queue and signal an error to ProcSleep.
+ */
+static void
+CheckDeadLock(void)
+{
+ int i;
+
+ /*
+ * Acquire exclusive lock on the entire shared lock data structures. Must
+ * grab LWLocks in partition-number order to avoid LWLock deadlock.
+ *
+ * Note that the deadlock check interrupt had better not be enabled
+ * anywhere that this process itself holds lock partition locks, else this
+ * will wait forever. Also note that LWLockAcquire creates a critical
+ * section, so that this routine cannot be interrupted by cancel/die
+ * interrupts.
+ */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ LWLockAcquire(LockHashPartitionLockByIndex(i), LW_EXCLUSIVE);
+
+ /*
+ * Check to see if we've been awoken by anyone in the interim.
+ *
+ * If we have, we can return and resume our transaction -- happy day.
+ * Before we are awoken, the process releasing the lock grants it to us,
+ * so we know that we don't have to wait anymore.
+ *
+ * We check by looking to see if we've been unlinked from the wait queue.
+ * This is safe because we hold the lock partition lock.
+ */
+ if (MyProc->links.prev == NULL ||
+ MyProc->links.next == NULL)
+ goto check_done;
+
+#ifdef LOCK_DEBUG
+ if (Debug_deadlocks)
+ DumpAllLocks();
+#endif
+
+ /* Run the deadlock check, and set deadlock_state for use by ProcSleep */
+ deadlock_state = DeadLockCheck(MyProc);
+
+ if (deadlock_state == DS_HARD_DEADLOCK)
+ {
+ /*
+ * Oops. We have a deadlock.
+ *
+ * Get this process out of wait state. (Note: we could do this more
+ * efficiently by relying on lockAwaited, but use this coding to
+ * preserve the flexibility to kill some other transaction than the
+ * one detecting the deadlock.)
+ *
+ * RemoveFromWaitQueue sets MyProc->waitStatus to
+ * PROC_WAIT_STATUS_ERROR, so ProcSleep will report an error after we
+ * return from the signal handler.
+ */
+ Assert(MyProc->waitLock != NULL);
+ RemoveFromWaitQueue(MyProc, LockTagHashCode(&(MyProc->waitLock->tag)));
+
+ /*
+ * We're done here. Transaction abort caused by the error that
+ * ProcSleep will raise will cause any other locks we hold to be
+ * released, thus allowing other processes to wake up; we don't need
+ * to do that here. NOTE: an exception is that releasing locks we
+ * hold doesn't consider the possibility of waiters that were blocked
+ * behind us on the lock we just failed to get, and might now be
+ * wakable because we're not in front of them anymore. However,
+ * RemoveFromWaitQueue took care of waking up any such processes.
+ */
+ }
+
+ /*
+ * And release locks. We do this in reverse order for two reasons: (1)
+ * Anyone else who needs more than one of the locks will be trying to lock
+ * them in increasing order; we don't want to release the other process
+ * until it can get all the locks it needs. (2) This avoids O(N^2)
+ * behavior inside LWLockRelease.
+ */
+check_done:
+ for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
+ LWLockRelease(LockHashPartitionLockByIndex(i));
+}
+
+/*
+ * CheckDeadLockAlert - Handle the expiry of deadlock_timeout.
+ *
+ * NB: Runs inside a signal handler, be careful.
+ */
+void
+CheckDeadLockAlert(void)
+{
+ int save_errno = errno;
+
+ got_deadlock_timeout = true;
+
+ /*
+ * Have to set the latch again, even if handle_sig_alarm already did. Back
+ * then got_deadlock_timeout wasn't yet set... It's unlikely that this
+ * ever would be a problem, but setting a set latch again is cheap.
+ *
+ * Note that, when this function runs inside procsignal_sigusr1_handler(),
+ * the handler function sets the latch again after the latch is set here.
+ */
+ SetLatch(MyLatch);
+ errno = save_errno;
+}
+
+/*
+ * ProcWaitForSignal - wait for a signal from another backend.
+ *
+ * As this uses the generic process latch the caller has to be robust against
+ * unrelated wakeups: Always check that the desired state has occurred, and
+ * wait again if not.
+ */
+void
+ProcWaitForSignal(uint32 wait_event_info)
+{
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+ wait_event_info);
+ ResetLatch(MyLatch);
+ CHECK_FOR_INTERRUPTS();
+}
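+
+/*
+ * Illustrative caller pattern (a sketch only; the condition and wait event
+ * names below are hypothetical placeholders, not taken from any real
+ * caller).  Because the latch can be set for unrelated reasons, callers
+ * should loop, re-checking their own condition after each wakeup:
+ *
+ *		while (!shared_state_is_ready())
+ *			ProcWaitForSignal(WAIT_EVENT_SOME_CONDITION);
+ */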
+
+/*
+ * ProcSendSignal - send a signal to a backend identified by PID
+ */
+void
+ProcSendSignal(int pid)
+{
+ PGPROC *proc = NULL;
+
+ if (RecoveryInProgress())
+ {
+ SpinLockAcquire(ProcStructLock);
+
+ /*
+ * Check to see whether it is the Startup process we wish to signal.
+ * This call is made by the buffer manager when it wishes to wake up a
+ * process that has been waiting for a pin so that it can obtain a
+ * cleanup lock using LockBufferForCleanup(). Startup is not a normal
+ * backend, so BackendPidGetProc() will not find it. Hence we remember
+ * the information needed for this special case.
+ */
+ if (pid == ProcGlobal->startupProcPid)
+ proc = ProcGlobal->startupProc;
+
+ SpinLockRelease(ProcStructLock);
+ }
+
+ if (proc == NULL)
+ proc = BackendPidGetProc(pid);
+
+ if (proc != NULL)
+ {
+ SetLatch(&proc->procLatch);
+ }
+}
+
+/*
+ * BecomeLockGroupLeader - designate process as lock group leader
+ *
+ * Once this function has returned, other processes can join the lock group
+ * by calling BecomeLockGroupMember.
+ */
+void
+BecomeLockGroupLeader(void)
+{
+ LWLock *leader_lwlock;
+
+ /* If we already did it, we don't need to do it again. */
+ if (MyProc->lockGroupLeader == MyProc)
+ return;
+
+ /* We had better not be a follower. */
+ Assert(MyProc->lockGroupLeader == NULL);
+
+ /* Create single-member group, containing only ourselves. */
+ leader_lwlock = LockHashPartitionLockByProc(MyProc);
+ LWLockAcquire(leader_lwlock, LW_EXCLUSIVE);
+ MyProc->lockGroupLeader = MyProc;
+ dlist_push_head(&MyProc->lockGroupMembers, &MyProc->lockGroupLink);
+ LWLockRelease(leader_lwlock);
+}
+
+/*
+ * BecomeLockGroupMember - designate process as lock group member
+ *
+ * This is pretty straightforward except for the possibility that the leader
+ * whose group we're trying to join might exit before we manage to do so;
+ * and the PGPROC might get recycled for an unrelated process. To avoid
+ * that, we require the caller to pass the PID of the intended PGPROC as
+ * an interlock. Returns true if we successfully join the intended lock
+ * group, and false if not.
+ */
+bool
+BecomeLockGroupMember(PGPROC *leader, int pid)
+{
+ LWLock *leader_lwlock;
+ bool ok = false;
+
+ /* Group leader can't become member of group */
+ Assert(MyProc != leader);
+
+ /* Can't already be a member of a group */
+ Assert(MyProc->lockGroupLeader == NULL);
+
+ /* PID must be valid. */
+ Assert(pid != 0);
+
+ /*
+ * Get lock protecting the group fields. Note LockHashPartitionLockByProc
+ * accesses leader->pgprocno in a PGPROC that might be free. This is safe
+ * because all PGPROCs' pgprocno fields are set during shared memory
+ * initialization and never change thereafter; so we will acquire the
+ * correct lock even if the leader PGPROC is in the process of being recycled.
+ */
+ leader_lwlock = LockHashPartitionLockByProc(leader);
+ LWLockAcquire(leader_lwlock, LW_EXCLUSIVE);
+
+ /* Is this the leader we're looking for? */
+ if (leader->pid == pid && leader->lockGroupLeader == leader)
+ {
+ /* OK, join the group */
+ ok = true;
+ MyProc->lockGroupLeader = leader;
+ dlist_push_tail(&leader->lockGroupMembers, &MyProc->lockGroupLink);
+ }
+ LWLockRelease(leader_lwlock);
+
+ return ok;
+}
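+
+/*
+ * Usage sketch for the lock-group API above, loosely modeled on how parallel
+ * query workers join their leader's group (leader_pgproc and leader_pid are
+ * hypothetical placeholders for values the leader passes to each worker):
+ *
+ *		In the leader, before launching cooperating processes:
+ *			BecomeLockGroupLeader();
+ *
+ *		In each worker, during startup:
+ *			if (!BecomeLockGroupMember(leader_pgproc, leader_pid))
+ *				elog(ERROR, "lock group leader has already exited");
+ */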
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
new file mode 100644
index 0000000..2dc2d67
--- /dev/null
+++ b/src/backend/storage/lmgr/s_lock.c
@@ -0,0 +1,377 @@
+/*-------------------------------------------------------------------------
+ *
+ * s_lock.c
+ * Hardware-dependent implementation of spinlocks.
+ *
+ * When waiting for a contended spinlock we loop tightly for awhile, then
+ * delay using pg_usleep() and try again. Preferably, "awhile" should be a
+ * small multiple of the maximum time we expect a spinlock to be held. 100
+ * iterations seems about right as an initial guess. However, on a
+ * uniprocessor the loop is a waste of cycles, while in a multi-CPU scenario
+ * it's usually better to spin a bit longer than to call the kernel, so we try
+ * to adapt the spin loop count depending on whether we seem to be in a
+ * uniprocessor or multiprocessor.
+ *
+ * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
+ * be wrong; there are platforms where that can result in a "stuck
+ * spinlock" failure. This has been seen particularly on Alphas; it seems
+ * that the first TAS after returning from kernel space will always fail
+ * on that hardware.
+ *
+ * Once we do decide to block, we use randomly increasing pg_usleep()
+ * delays. The first delay is 1 msec, then the delay randomly increases to
+ * about one second, after which we reset to 1 msec and start again. The
+ * idea here is that in the presence of heavy contention we need to
+ * increase the delay, else the spinlock holder may never get to run and
+ * release the lock. (Consider situation where spinlock holder has been
+ * nice'd down in priority by the scheduler --- it will not get scheduled
+ * until all would-be acquirers are sleeping, so if we always use a 1-msec
+ * sleep, there is a real possibility of starvation.) But we can't just
+ * clamp the delay to an upper bound, else it would take a long time to
+ * make a reasonable number of tries.
+ *
+ * We time out and declare error after NUM_DELAYS delays (thus, exactly
+ * that many tries). With the given settings, this will usually take 2 or
+ * so minutes. It seems better to fix the total number of tries (and thus
+ * the probability of unintended failure) than to fix the total time
+ * spent.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/s_lock.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <time.h>
+#include <unistd.h>
+
+#include "port/atomics.h"
+#include "storage/s_lock.h"
+
+#define MIN_SPINS_PER_DELAY 10
+#define MAX_SPINS_PER_DELAY 1000
+#define NUM_DELAYS 1000
+#define MIN_DELAY_USEC 1000L
+#define MAX_DELAY_USEC 1000000L
+
+
+slock_t dummy_spinlock;
+
+static int spins_per_delay = DEFAULT_SPINS_PER_DELAY;
+
+
+/*
+ * s_lock_stuck() - complain about a stuck spinlock
+ */
+static void
+s_lock_stuck(const char *file, int line, const char *func)
+{
+ if (!func)
+ func = "(unknown)";
+#if defined(S_LOCK_TEST)
+ fprintf(stderr,
+ "\nStuck spinlock detected at %s, %s:%d.\n",
+ func, file, line);
+ exit(1);
+#else
+ elog(PANIC, "stuck spinlock detected at %s, %s:%d",
+ func, file, line);
+#endif
+}
+
+/*
+ * s_lock(lock) - platform-independent portion of waiting for a spinlock.
+ */
+int
+s_lock(volatile slock_t *lock, const char *file, int line, const char *func)
+{
+ SpinDelayStatus delayStatus;
+
+ init_spin_delay(&delayStatus, file, line, func);
+
+ while (TAS_SPIN(lock))
+ {
+ perform_spin_delay(&delayStatus);
+ }
+
+ finish_spin_delay(&delayStatus);
+
+ return delayStatus.delays;
+}
+
+#ifdef USE_DEFAULT_S_UNLOCK
+void
+s_unlock(volatile slock_t *lock)
+{
+#ifdef TAS_ACTIVE_WORD
+ /* HP's PA-RISC */
+ *TAS_ACTIVE_WORD(lock) = -1;
+#else
+ *lock = 0;
+#endif
+}
+#endif
+
+/*
+ * Wait while spinning on a contended spinlock.
+ */
+void
+perform_spin_delay(SpinDelayStatus *status)
+{
+ /* CPU-specific delay each time through the loop */
+ SPIN_DELAY();
+
+ /* Block the process every spins_per_delay tries */
+ if (++(status->spins) >= spins_per_delay)
+ {
+ if (++(status->delays) > NUM_DELAYS)
+ s_lock_stuck(status->file, status->line, status->func);
+
+ if (status->cur_delay == 0) /* first time to delay? */
+ status->cur_delay = MIN_DELAY_USEC;
+
+ pg_usleep(status->cur_delay);
+
+#if defined(S_LOCK_TEST)
+ fprintf(stdout, "*");
+ fflush(stdout);
+#endif
+
+ /* increase delay by a random fraction between 1X and 2X */
+ status->cur_delay += (int) (status->cur_delay *
+ ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
+ /* wrap back to minimum delay when max is exceeded */
+ if (status->cur_delay > MAX_DELAY_USEC)
+ status->cur_delay = MIN_DELAY_USEC;
+
+ status->spins = 0;
+ }
+}
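+
+/*
+ * Example of the resulting backoff schedule (illustrative numbers, since the
+ * growth factor is random): the first sleep is 1 ms, and each later sleep is
+ * between 1x and 2x the previous one, e.g. 1 ms, 1.7 ms, 2.9 ms, ...,
+ * 600 ms, 950 ms; once the delay would exceed one second it wraps back to
+ * 1 ms and the cycle repeats.  A sleep occurs only after every
+ * spins_per_delay unsuccessful spin iterations.
+ */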
+
+/*
+ * After acquiring a spinlock, update estimates about how long to loop.
+ *
+ * If we were able to acquire the lock without delaying, it's a good
+ * indication we are in a multiprocessor. If we had to delay, it's a sign
+ * (but not a sure thing) that we are in a uniprocessor. Hence, we
+ * decrement spins_per_delay slowly when we had to delay, and increase it
+ * rapidly when we didn't. It's expected that spins_per_delay will
+ * converge to the minimum value on a uniprocessor and to the maximum
+ * value on a multiprocessor.
+ *
+ * Note: spins_per_delay is local within our current process. We want to
+ * average these observations across multiple backends, since it's
+ * relatively rare for this function to even get entered, and so a single
+ * backend might not live long enough to converge on a good value. That
+ * is handled by the two routines below.
+ */
+void
+finish_spin_delay(SpinDelayStatus *status)
+{
+ if (status->cur_delay == 0)
+ {
+ /* we never had to delay */
+ if (spins_per_delay < MAX_SPINS_PER_DELAY)
+ spins_per_delay = Min(spins_per_delay + 100, MAX_SPINS_PER_DELAY);
+ }
+ else
+ {
+ if (spins_per_delay > MIN_SPINS_PER_DELAY)
+ spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
+ }
+}
+
+/*
+ * Set local copy of spins_per_delay during backend startup.
+ *
+ * NB: this has to be pretty fast as it is called while holding a spinlock
+ */
+void
+set_spins_per_delay(int shared_spins_per_delay)
+{
+ spins_per_delay = shared_spins_per_delay;
+}
+
+/*
+ * Update shared estimate of spins_per_delay during backend exit.
+ *
+ * NB: this has to be pretty fast as it is called while holding a spinlock
+ */
+int
+update_spins_per_delay(int shared_spins_per_delay)
+{
+ /*
+ * We use an exponential moving average with a relatively slow adaption
+ * rate, so that noise in any one backend's result won't affect the shared
+ * value too much. As long as both inputs are within the allowed range,
+ * the result must be too, so we need not worry about clamping the result.
+ *
+ * We deliberately truncate rather than rounding; this is so that single
+ * adjustments inside a backend can affect the shared estimate (see the
+ * asymmetric adjustment rules above).
+ */
+ return (shared_spins_per_delay * 15 + spins_per_delay) / 16;
+}
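+
+/*
+ * Worked example of the averaging rule above (illustrative numbers): if the
+ * shared estimate is 500 and this backend's spins_per_delay has converged to
+ * 116, the new shared value is (500 * 15 + 116) / 16 = 476, i.e. the shared
+ * estimate moves only 1/16 of the way toward this backend's observation,
+ * damping the effect of any single backend's result.
+ */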
+
+
+/*
+ * Various TAS implementations that cannot live in s_lock.h as no inline
+ * definition exists (yet).
+ * In the future, get rid of tas.[cso] and fold it into this file.
+ *
+ * If you change something here, you will likely need to modify s_lock.h too,
+ * because the definitions for these are split between this file and s_lock.h.
+ */
+
+
+#ifdef HAVE_SPINLOCKS /* skip spinlocks if requested */
+
+
+#if defined(__GNUC__)
+
+/*
+ * All the gcc flavors that are not inlined
+ */
+
+
+/*
+ * Note: all the if-tests here probably ought to be testing gcc version
+ * rather than platform, but I don't have adequate info to know what to
+ * write. Ideally we'd flush all this in favor of the inline version.
+ */
+#if defined(__m68k__) && !defined(__linux__)
+/* really means: extern int tas(slock_t* **lock); */
+static void
+tas_dummy()
+{
+ __asm__ __volatile__(
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(__ELF__)
+/* no underscore for label and % for registers */
+ "\
+.global tas \n\
+tas: \n\
+ movel %sp@(0x4),%a0 \n\
+ tas %a0@ \n\
+ beq _success \n\
+ moveq #-128,%d0 \n\
+ rts \n\
+_success: \n\
+ moveq #0,%d0 \n\
+ rts \n"
+#else
+ "\
+.global _tas \n\
+_tas: \n\
+ movel sp@(0x4),a0 \n\
+ tas a0@ \n\
+ beq _success \n\
+ moveq #-128,d0 \n\
+ rts \n\
+_success: \n\
+ moveq #0,d0 \n\
+ rts \n"
+#endif /* (__NetBSD__ || __OpenBSD__) && __ELF__ */
+ );
+}
+#endif /* __m68k__ && !__linux__ */
+#endif /* __GNUC__ */
+#endif /* HAVE_SPINLOCKS */
+
+
+
+/*****************************************************************************/
+#if defined(S_LOCK_TEST)
+
+/*
+ * test program for verifying a port's spinlock support.
+ */
+
+struct test_lock_struct
+{
+ char pad1;
+ slock_t lock;
+ char pad2;
+};
+
+volatile struct test_lock_struct test_lock;
+
+int
+main()
+{
+ srandom((unsigned int) time(NULL));
+
+ test_lock.pad1 = test_lock.pad2 = 0x44;
+
+ S_INIT_LOCK(&test_lock.lock);
+
+ if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+ {
+ printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+ return 1;
+ }
+
+ if (!S_LOCK_FREE(&test_lock.lock))
+ {
+ printf("S_LOCK_TEST: failed, lock not initialized\n");
+ return 1;
+ }
+
+ S_LOCK(&test_lock.lock);
+
+ if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+ {
+ printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+ return 1;
+ }
+
+ if (S_LOCK_FREE(&test_lock.lock))
+ {
+ printf("S_LOCK_TEST: failed, lock not locked\n");
+ return 1;
+ }
+
+ S_UNLOCK(&test_lock.lock);
+
+ if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+ {
+ printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+ return 1;
+ }
+
+ if (!S_LOCK_FREE(&test_lock.lock))
+ {
+ printf("S_LOCK_TEST: failed, lock not unlocked\n");
+ return 1;
+ }
+
+ S_LOCK(&test_lock.lock);
+
+ if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+ {
+ printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+ return 1;
+ }
+
+ if (S_LOCK_FREE(&test_lock.lock))
+ {
+ printf("S_LOCK_TEST: failed, lock not re-locked\n");
+ return 1;
+ }
+
+ printf("S_LOCK_TEST: this will print %d stars and then\n", NUM_DELAYS);
+ printf(" exit with a 'stuck spinlock' message\n");
+ printf(" if S_LOCK() and TAS() are working.\n");
+ fflush(stdout);
+
+ s_lock(&test_lock.lock, __FILE__, __LINE__);
+
+ printf("S_LOCK_TEST: failed, lock not locked\n");
+ return 1;
+}
+
+#endif /* S_LOCK_TEST */
diff --git a/src/backend/storage/lmgr/spin.c b/src/backend/storage/lmgr/spin.c
new file mode 100644
index 0000000..557672c
--- /dev/null
+++ b/src/backend/storage/lmgr/spin.c
@@ -0,0 +1,180 @@
+/*-------------------------------------------------------------------------
+ *
+ * spin.c
+ * Hardware-independent implementation of spinlocks.
+ *
+ *
+ * For machines that have test-and-set (TAS) instructions, s_lock.h/.c
+ * define the spinlock implementation. This file contains only a stub
+ * implementation for spinlocks using PGSemaphores. Unless semaphores
+ * are implemented in a way that doesn't involve a kernel call, this
+ * is too slow to be very useful :-(
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/spin.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/pg_sema.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+
+
+#ifndef HAVE_SPINLOCKS
+
+/*
+ * No TAS, so spinlocks are implemented as PGSemaphores.
+ */
+
+#ifndef HAVE_ATOMICS
+#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES + NUM_ATOMICS_SEMAPHORES)
+#else
+#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES)
+#endif /* HAVE_ATOMICS */
+
+PGSemaphore *SpinlockSemaArray;
+
+#else /* !HAVE_SPINLOCKS */
+
+#define NUM_EMULATION_SEMAPHORES 0
+
+#endif /* HAVE_SPINLOCKS */
+
+/*
+ * Report the amount of shared memory needed to store semaphores for spinlock
+ * support.
+ */
+Size
+SpinlockSemaSize(void)
+{
+ return NUM_EMULATION_SEMAPHORES * sizeof(PGSemaphore);
+}
+
+/*
+ * Report number of semaphores needed to support spinlocks.
+ */
+int
+SpinlockSemas(void)
+{
+ return NUM_EMULATION_SEMAPHORES;
+}
+
+#ifndef HAVE_SPINLOCKS
+
+/*
+ * Initialize spinlock emulation.
+ *
+ * This must be called after PGReserveSemaphores().
+ */
+void
+SpinlockSemaInit(void)
+{
+ PGSemaphore *spinsemas;
+ int nsemas = SpinlockSemas();
+ int i;
+
+ /*
+ * We must use ShmemAllocUnlocked(), since the spinlock protecting
+ * ShmemAlloc() obviously can't be ready yet.
+ */
+ spinsemas = (PGSemaphore *) ShmemAllocUnlocked(SpinlockSemaSize());
+ for (i = 0; i < nsemas; ++i)
+ spinsemas[i] = PGSemaphoreCreate();
+ SpinlockSemaArray = spinsemas;
+}
+
+/*
+ * s_lock.h hardware-spinlock emulation using semaphores
+ *
+ * We map all spinlocks onto NUM_EMULATION_SEMAPHORES semaphores. It's okay to
+ * map multiple spinlocks onto one semaphore because no process should ever
+ * hold more than one at a time. We just need enough semaphores so that we
+ * aren't adding too much extra contention from that.
+ *
+ * There is one exception to the restriction of only holding one spinlock at a
+ * time, which is that it's ok if emulated atomic operations are nested inside
+ * spinlocks. To avoid the danger of spinlocks and atomics using the same sema,
+ * we make sure "normal" spinlocks and atomics backed by spinlocks use
+ * distinct semaphores (see the nested argument to s_init_lock_sema).
+ *
+ * slock_t is just an int for this implementation; it holds the spinlock
+ * number from 1..NUM_EMULATION_SEMAPHORES. We intentionally ensure that 0
+ * is not a valid value, so that testing with this code can help find
+ * failures to initialize spinlocks.
+ */
+
+static inline void
+s_check_valid(int lockndx)
+{
+ if (unlikely(lockndx <= 0 || lockndx > NUM_EMULATION_SEMAPHORES))
+ elog(ERROR, "invalid spinlock number: %d", lockndx);
+}
+
+void
+s_init_lock_sema(volatile slock_t *lock, bool nested)
+{
+ static uint32 counter = 0;
+ uint32 offset;
+ uint32 sema_total;
+ uint32 idx;
+
+ if (nested)
+ {
+ /*
+ * To allow nesting atomics inside spinlocked sections, use a
+ * different spinlock. See comment above.
+ */
+ offset = 1 + NUM_SPINLOCK_SEMAPHORES;
+ sema_total = NUM_ATOMICS_SEMAPHORES;
+ }
+ else
+ {
+ offset = 1;
+ sema_total = NUM_SPINLOCK_SEMAPHORES;
+ }
+
+ idx = (counter++ % sema_total) + offset;
+
+ /* double check we did things correctly */
+ s_check_valid(idx);
+
+ *lock = idx;
+}
+
+void
+s_unlock_sema(volatile slock_t *lock)
+{
+ int lockndx = *lock;
+
+ s_check_valid(lockndx);
+
+ PGSemaphoreUnlock(SpinlockSemaArray[lockndx - 1]);
+}
+
+bool
+s_lock_free_sema(volatile slock_t *lock)
+{
+ /* We don't currently use S_LOCK_FREE anyway */
+ elog(ERROR, "spin.c does not support S_LOCK_FREE()");
+ return false;
+}
+
+int
+tas_sema(volatile slock_t *lock)
+{
+ int lockndx = *lock;
+
+ s_check_valid(lockndx);
+
+ /* Note that TAS macros return 0 if *success* */
+ return !PGSemaphoreTryLock(SpinlockSemaArray[lockndx - 1]);
+}
+
+#endif /* !HAVE_SPINLOCKS */
diff --git a/src/backend/storage/page/Makefile b/src/backend/storage/page/Makefile
new file mode 100644
index 0000000..da539b1
--- /dev/null
+++ b/src/backend/storage/page/Makefile
@@ -0,0 +1,23 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/page
+#
+# IDENTIFICATION
+# src/backend/storage/page/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/page
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ bufpage.o \
+ checksum.o \
+ itemptr.o
+
+include $(top_srcdir)/src/backend/common.mk
+
+# Provide special optimization flags for checksum.c
+checksum.o: CFLAGS += ${CFLAGS_UNROLL_LOOPS} ${CFLAGS_VECTORIZE}
diff --git a/src/backend/storage/page/README b/src/backend/storage/page/README
new file mode 100644
index 0000000..e30d7ac
--- /dev/null
+++ b/src/backend/storage/page/README
@@ -0,0 +1,64 @@
+src/backend/storage/page/README
+
+Checksums
+---------
+
+Checksums on data pages are designed to detect corruption by the I/O system.
+We do not protect buffers against uncorrectable memory errors, since these
+have a very low measured incidence according to research on large server farms,
+http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed
+2010/12/22 on -hackers list.
+
+Current implementation requires this be enabled system-wide at initdb time, or
+by using the pg_checksums tool on an offline cluster.
+
+The checksum is not valid at all times on a data page!!
+The checksum is valid when the page leaves the shared pool and is checked
+when it later re-enters the shared pool as a result of I/O.
+We set the checksum on a buffer in the shared pool immediately before we
+flush the buffer. As a result we implicitly invalidate the page's checksum
+when we modify the page for a data change or even a hint. This means that
+many or even most pages in shared buffers have invalid page checksums,
+so be careful how you interpret the pd_checksum field.
+
+That means that WAL-logged changes to a page do NOT update the page checksum,
+so full page images may not have a valid checksum. But those page images have
+the WAL CRC covering them and so are verified separately from this
+mechanism. WAL replay should not test the checksum of a full-page image.
+
+The best way to understand this is that WAL CRCs protect records entering the
+WAL stream, and data page verification protects blocks entering the shared
+buffer pool. They are similar in purpose, yet completely separate. Together
+they ensure we are able to detect errors in data re-entering
+PostgreSQL-controlled memory. Note also that the WAL checksum is a 32-bit CRC,
+whereas the page checksum is only 16-bits.
+
+Any write of a data block can cause a torn page if the write is unsuccessful.
+Full page writes, which are stored in WAL, protect us from that. Setting hint
+bits when a page is already dirty is OK because a full page write must already
+have been written for it since the last checkpoint. Setting hint bits on an
+otherwise clean page can allow torn pages; this doesn't normally matter since
+they are just hints, but when the page has checksums, then losing a few bits
+would cause the checksum to be invalid. So if we have full_page_writes = on
+and checksums enabled then we must write a WAL record specifically so that we
+record a full page image in WAL. Hint bit updates should be protected using
+MarkBufferDirtyHint(), which is responsible for writing the full-page image
+when necessary.
+
+Note that when we write a page checksum we include the hopefully zeroed bytes
+that form the hole in the centre of a standard page. Thus, when we read the
+block back from storage we implicitly check that the hole is still all zeroes.
+We do this to ensure that we spot errors that could have destroyed data even
+if they haven't actually done so. Full page images stored in WAL do *not*
+check that the hole is all zero; the data in the hole is simply skipped and
+re-zeroed if the backup block is reapplied. We do this because a failure in
+WAL is a fatal error and prevents further recovery, whereas a checksum failure
+on a normal data block is a hard error but not a critical one for the server,
+even if it is a very bad thing for the user.
+
+New WAL records cannot be written during recovery, so hint bits set during
+recovery must not dirty the page if the buffer is not already dirty, when
+checksums are enabled. Systems in Hot-Standby mode may benefit from hint bits
+being set, but with checksums enabled, a page cannot be dirtied after setting a
+hint bit (due to the torn page risk). So, the standby must wait for full-page
+images containing the hint bit updates to arrive from the primary.
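+
+As a rough sketch of the read-path check described earlier in this README
+(the real logic lives in PageIsVerifiedExtended() in bufpage.c, which also
+handles all-zero pages and the ignore_checksum_failure option):
+
+    if (!PageIsNew(page) && DataChecksumsEnabled())
+    {
+        uint16 checksum = pg_checksum_page((char *) page, blkno);
+
+        if (checksum != ((PageHeader) page)->pd_checksum)
+            ereport(WARNING, ...);   /* page verification failed */
+    }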
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
new file mode 100644
index 0000000..82ca91f
--- /dev/null
+++ b/src/backend/storage/page/bufpage.c
@@ -0,0 +1,1539 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufpage.c
+ * POSTGRES standard buffer page code.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/page/bufpage.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/itup.h"
+#include "access/xlog.h"
+#include "pgstat.h"
+#include "storage/checksum.h"
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+
+
+/* GUC variable */
+bool ignore_checksum_failure = false;
+
+
+/* ----------------------------------------------------------------
+ * Page support functions
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * PageInit
+ * Initializes the contents of a page.
+ * Note that we don't calculate an initial checksum here; that's not done
+ * until it's time to write.
+ */
+void
+PageInit(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_PAGE_LAYOUT_VERSION);
+ /* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */
+}
+
+
+/*
+ * PageIsVerifiedExtended
+ * Check that the page header and checksum (if any) appear valid.
+ *
+ * This is called when a page has just been read in from disk. The idea is
+ * to cheaply detect trashed pages before we go nuts following bogus line
+ * pointers, testing invalid transaction identifiers, etc.
+ *
+ * It turns out to be necessary to allow zeroed pages here too. Even though
+ * this routine is *not* called when deliberately adding a page to a relation,
+ * there are scenarios in which a zeroed page might be found in a table.
+ * (Example: a backend extends a relation, then crashes before it can write
+ * any WAL entry about the new page. The kernel will already have the
+ * zeroed page in the file, and it will stay that way after restart.) So we
+ * allow zeroed pages here, and are careful that the page access macros
+ * treat such a page as empty and without free space. Eventually, VACUUM
+ * will clean up such a page and make it usable.
+ *
+ * If flag PIV_LOG_WARNING is set, a WARNING is logged in the event of
+ * a checksum failure.
+ *
+ * If flag PIV_REPORT_STAT is set, a checksum failure is reported directly
+ * to pgstat.
+ */
+bool
+PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags)
+{
+ PageHeader p = (PageHeader) page;
+ size_t *pagebytes;
+ int i;
+ bool checksum_failure = false;
+ bool header_sane = false;
+ bool all_zeroes = false;
+ uint16 checksum = 0;
+
+ /*
+ * Don't verify page data unless the page passes basic non-zero test
+ */
+ if (!PageIsNew(page))
+ {
+ if (DataChecksumsEnabled())
+ {
+ checksum = pg_checksum_page((char *) page, blkno);
+
+ if (checksum != p->pd_checksum)
+ checksum_failure = true;
+ }
+
+ /*
+ * The following checks don't prove the header is correct, only that
+ * it looks sane enough to allow into the buffer pool. Later usage of
+ * the block can still reveal problems, which is why we offer the
+ * checksum option.
+ */
+ if ((p->pd_flags & ~PD_VALID_FLAG_BITS) == 0 &&
+ p->pd_lower <= p->pd_upper &&
+ p->pd_upper <= p->pd_special &&
+ p->pd_special <= BLCKSZ &&
+ p->pd_special == MAXALIGN(p->pd_special))
+ header_sane = true;
+
+ if (header_sane && !checksum_failure)
+ return true;
+ }
+
+ /* Check all-zeroes case */
+ all_zeroes = true;
+ pagebytes = (size_t *) page;
+ for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++)
+ {
+ if (pagebytes[i] != 0)
+ {
+ all_zeroes = false;
+ break;
+ }
+ }
+
+ if (all_zeroes)
+ return true;
+
+ /*
+ * Throw a WARNING if the checksum fails, but only after we've checked for
+ * the all-zeroes case.
+ */
+ if (checksum_failure)
+ {
+ if ((flags & PIV_LOG_WARNING) != 0)
+ ereport(WARNING,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("page verification failed, calculated checksum %u but expected %u",
+ checksum, p->pd_checksum)));
+
+ if ((flags & PIV_REPORT_STAT) != 0)
+ pgstat_report_checksum_failure();
+
+ if (header_sane && ignore_checksum_failure)
+ return true;
+ }
+
+ return false;
+}
+
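+/*
+ * Illustrative sketch only (nothing in this file calls it this way): a
+ * caller that has just read a block from storage would typically do
+ *
+ *		if (!PageIsVerifiedExtended((Page) buf, blkno,
+ *									PIV_LOG_WARNING | PIV_REPORT_STAT))
+ *			ereport(ERROR,
+ *					(errcode(ERRCODE_DATA_CORRUPTED),
+ *					 errmsg("invalid page in block %u", blkno)));
+ *
+ * where "buf" and "blkno" stand for the caller's own buffer and block number.
+ */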
+
+/*
+ * PageAddItemExtended
+ *
+ * Add an item to a page. Return value is the offset at which it was
+ * inserted, or InvalidOffsetNumber if the item is not inserted for any
+ * reason. A WARNING is issued indicating the reason for the refusal.
+ *
+ * offsetNumber must be either InvalidOffsetNumber to specify finding a
+ * free line pointer, or a value between FirstOffsetNumber and one past
+ * the last existing item, to specify using that particular line pointer.
+ *
+ * If offsetNumber is valid and flag PAI_OVERWRITE is set, we just store
+ * the item at the specified offsetNumber, which must be either a
+ * currently-unused line pointer, or one past the last existing item.
+ *
+ * If offsetNumber is valid and flag PAI_OVERWRITE is not set, insert
+ * the item at the specified offsetNumber, moving existing items later
+ * in the array to make room.
+ *
+ * If offsetNumber is not valid, then assign a slot by finding the first
+ * one that is both unused and deallocated.
+ *
+ * If flag PAI_IS_HEAP is set, we enforce that there can't be more than
+ * MaxHeapTuplesPerPage line pointers on the page.
+ *
+ * !!! EREPORT(ERROR) IS DISALLOWED HERE !!!
+ */
+OffsetNumber
+PageAddItemExtended(Page page,
+ Item item,
+ Size size,
+ OffsetNumber offsetNumber,
+ int flags)
+{
+ PageHeader phdr = (PageHeader) page;
+ Size alignedSize;
+ int lower;
+ int upper;
+ ItemId itemId;
+ OffsetNumber limit;
+ bool needshuffle = false;
+
+ /*
+ * Be wary about corrupted page pointers
+ */
+ if (phdr->pd_lower < SizeOfPageHeaderData ||
+ phdr->pd_lower > phdr->pd_upper ||
+ phdr->pd_upper > phdr->pd_special ||
+ phdr->pd_special > BLCKSZ)
+ ereport(PANIC,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+ phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));
+
+ /*
+ * Select offsetNumber to place the new item at
+ */
+ limit = OffsetNumberNext(PageGetMaxOffsetNumber(page));
+
+ /* was offsetNumber passed in? */
+ if (OffsetNumberIsValid(offsetNumber))
+ {
+ /* yes, check it */
+ if ((flags & PAI_OVERWRITE) != 0)
+ {
+ if (offsetNumber < limit)
+ {
+ itemId = PageGetItemId(phdr, offsetNumber);
+ if (ItemIdIsUsed(itemId) || ItemIdHasStorage(itemId))
+ {
+ elog(WARNING, "will not overwrite a used ItemId");
+ return InvalidOffsetNumber;
+ }
+ }
+ }
+ else
+ {
+ if (offsetNumber < limit)
+ needshuffle = true; /* need to move existing linp's */
+ }
+ }
+ else
+ {
+ /* offsetNumber was not passed in, so find a free slot */
+ /* if no free slot, we'll put it at limit (1st open slot) */
+ if (PageHasFreeLinePointers(phdr))
+ {
+ /*
+ * Scan line pointer array to locate a "recyclable" (unused)
+ * ItemId.
+ *
+ * Always use earlier items first. PageTruncateLinePointerArray
+ * can only truncate unused items when they appear as a contiguous
+ * group at the end of the line pointer array.
+ */
+ for (offsetNumber = FirstOffsetNumber;
+ offsetNumber < limit; /* limit is maxoff+1 */
+ offsetNumber++)
+ {
+ itemId = PageGetItemId(phdr, offsetNumber);
+
+ /*
+ * We check for no storage as well, just to be paranoid;
+ * unused items should never have storage. Assert() that the
+ * invariant is respected too.
+ */
+ Assert(ItemIdIsUsed(itemId) || !ItemIdHasStorage(itemId));
+
+ if (!ItemIdIsUsed(itemId) && !ItemIdHasStorage(itemId))
+ break;
+ }
+ if (offsetNumber >= limit)
+ {
+ /* the hint is wrong, so reset it */
+ PageClearHasFreeLinePointers(phdr);
+ }
+ }
+ else
+ {
+ /* don't bother searching if hint says there's no free slot */
+ offsetNumber = limit;
+ }
+ }
+
+ /* Reject placing items beyond the first unused line pointer */
+ if (offsetNumber > limit)
+ {
+ elog(WARNING, "specified item offset is too large");
+ return InvalidOffsetNumber;
+ }
+
+ /* Reject placing items beyond heap boundary, if heap */
+ if ((flags & PAI_IS_HEAP) != 0 && offsetNumber > MaxHeapTuplesPerPage)
+ {
+ elog(WARNING, "can't put more than MaxHeapTuplesPerPage items in a heap page");
+ return InvalidOffsetNumber;
+ }
+
+ /*
+ * Compute new lower and upper pointers for page, see if it'll fit.
+ *
+ * Note: do arithmetic as signed ints, to avoid mistakes if, say,
+ * alignedSize > pd_upper.
+ */
+ if (offsetNumber == limit || needshuffle)
+ lower = phdr->pd_lower + sizeof(ItemIdData);
+ else
+ lower = phdr->pd_lower;
+
+ alignedSize = MAXALIGN(size);
+
+ upper = (int) phdr->pd_upper - (int) alignedSize;
+
+ if (lower > upper)
+ return InvalidOffsetNumber;
+
+ /*
+ * OK to insert the item. First, shuffle the existing pointers if needed.
+ */
+ itemId = PageGetItemId(phdr, offsetNumber);
+
+ if (needshuffle)
+ memmove(itemId + 1, itemId,
+ (limit - offsetNumber) * sizeof(ItemIdData));
+
+ /* set the line pointer */
+ ItemIdSetNormal(itemId, upper, size);
+
+ /*
+ * Items normally contain no uninitialized bytes. Core bufpage consumers
+ * conform, but this is not a necessary coding rule; a new index AM could
+ * opt to depart from it. However, data type input functions and other
+ * C-language functions that synthesize datums should initialize all
+ * bytes; datumIsEqual() relies on this. Testing here, along with the
+ * similar check in printtup(), helps to catch such mistakes.
+ *
+ * Values of the "name" type retrieved via index-only scans may contain
+ * uninitialized bytes; see comment in btrescan(). Valgrind will report
+ * this as an error, but it is safe to ignore.
+ */
+ VALGRIND_CHECK_MEM_IS_DEFINED(item, size);
+
+ /* copy the item's data onto the page */
+ memcpy((char *) page + upper, item, size);
+
+ /* adjust page header */
+ phdr->pd_lower = (LocationIndex) lower;
+ phdr->pd_upper = (LocationIndex) upper;
+
+ return offsetNumber;
+}
+
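+/*
+ * Illustrative sketch only: building a page from scratch and letting
+ * PageAddItemExtended pick a slot might look like
+ *
+ *		Page		page = (Page) palloc(BLCKSZ);
+ *		char		data[] = "some tuple bytes";
+ *		OffsetNumber off;
+ *
+ *		PageInit(page, BLCKSZ, 0);
+ *		off = PageAddItemExtended(page, (Item) data, sizeof(data),
+ *								  InvalidOffsetNumber, 0);
+ *		if (off == InvalidOffsetNumber)
+ *			elog(WARNING, "item would not fit on page");
+ *
+ * Real callers normally use the PageAddItem() macro from bufpage.h, which
+ * maps its boolean arguments onto the PAI_* flags handled above.
+ */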
+
+/*
+ * PageGetTempPage
+ * Get a temporary page in local memory for special processing.
+ * The returned page is not initialized at all; caller must do that.
+ */
+Page
+PageGetTempPage(Page page)
+{
+ Size pageSize;
+ Page temp;
+
+ pageSize = PageGetPageSize(page);
+ temp = (Page) palloc(pageSize);
+
+ return temp;
+}
+
+/*
+ * PageGetTempPageCopy
+ * Get a temporary page in local memory for special processing.
+ * The page is initialized by copying the contents of the given page.
+ */
+Page
+PageGetTempPageCopy(Page page)
+{
+ Size pageSize;
+ Page temp;
+
+ pageSize = PageGetPageSize(page);
+ temp = (Page) palloc(pageSize);
+
+ memcpy(temp, page, pageSize);
+
+ return temp;
+}
+
+/*
+ * PageGetTempPageCopySpecial
+ * Get a temporary page in local memory for special processing.
+ * The page is PageInit'd with the same special-space size as the
+ * given page, and the special space is copied from the given page.
+ */
+Page
+PageGetTempPageCopySpecial(Page page)
+{
+ Size pageSize;
+ Page temp;
+
+ pageSize = PageGetPageSize(page);
+ temp = (Page) palloc(pageSize);
+
+ PageInit(temp, pageSize, PageGetSpecialSize(page));
+ memcpy(PageGetSpecialPointer(temp),
+ PageGetSpecialPointer(page),
+ PageGetSpecialSize(page));
+
+ return temp;
+}
+
+/*
+ * PageRestoreTempPage
+ * Copy temporary page back to permanent page after special processing
+ * and release the temporary page.
+ */
+void
+PageRestoreTempPage(Page tempPage, Page oldPage)
+{
+ Size pageSize;
+
+ pageSize = PageGetPageSize(tempPage);
+ memcpy((char *) oldPage, (char *) tempPage, pageSize);
+
+ pfree(tempPage);
+}
+
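+/*
+ * Illustrative sketch only: the usual calling pattern for the temp-page
+ * functions above is to build a replacement page image off to the side and
+ * then copy it back over the original, e.g.
+ *
+ *		Page		temp = PageGetTempPageCopySpecial(page);
+ *
+ *		... add the desired tuples to temp with PageAddItemExtended ...
+ *
+ *		PageRestoreTempPage(temp, page);
+ *
+ * Note that PageRestoreTempPage pfree's the temporary page, so it must not
+ * be referenced afterwards.
+ */
+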
+/*
+ * Tuple defrag support for PageRepairFragmentation and PageIndexMultiDelete
+ */
+typedef struct itemIdCompactData
+{
+ uint16 offsetindex; /* linp array index */
+ int16 itemoff; /* page offset of item data */
+ uint16 alignedlen; /* MAXALIGN(item data len) */
+} itemIdCompactData;
+typedef itemIdCompactData *itemIdCompact;
+
+/*
+ * After removing or marking some line pointers unused, move the tuples to
+ * remove the gaps caused by the removed items and reorder them back into
+ * reverse line pointer order in the page.
+ *
+ * This function can often be fairly hot, so it pays to take some measures to
+ * make it as optimal as possible.
+ *
+ * Callers may pass 'presorted' as true if the 'itemidbase' array is sorted in
+ * descending order of itemoff. When this is true we can just memmove()
+ * tuples towards the end of the page. This is quite a common case as it's
+ * the order that tuples are initially inserted into pages. When we call this
+ * function to defragment the tuples in the page then any new line pointers
+ * added to the page will keep that presorted order, so hitting this case is
+ * still very common for tables that are commonly updated.
+ *
+ * When the 'itemidbase' array is not presorted then we're unable to just
+ * memmove() tuples around freely. Doing so could cause us to overwrite the
+ * memory belonging to a tuple we've not moved yet. In this case, we copy all
+ * the tuples that need to be moved into a temporary buffer. We can then
+ * simply memcpy() out of that temp buffer back into the page at the correct
+ * location. Tuples are copied back into the page in the same order as the
+ * 'itemidbase' array, so we end up reordering the tuples back into reverse
+ * line pointer order. This will increase the chances of hitting the
+ * presorted case the next time around.
+ *
+ * Callers must ensure that nitems is > 0
+ */
+static void
+compactify_tuples(itemIdCompact itemidbase, int nitems, Page page, bool presorted)
+{
+ PageHeader phdr = (PageHeader) page;
+ Offset upper;
+ Offset copy_tail;
+ Offset copy_head;
+ itemIdCompact itemidptr;
+ int i;
+
+ /* Code within will not work correctly if nitems == 0 */
+ Assert(nitems > 0);
+
+ if (presorted)
+ {
+
+#ifdef USE_ASSERT_CHECKING
+ {
+ /*
+ * Verify we've not gotten any new callers that are incorrectly
+ * passing a true presorted value.
+ */
+ Offset lastoff = phdr->pd_special;
+
+ for (i = 0; i < nitems; i++)
+ {
+ itemidptr = &itemidbase[i];
+
+ Assert(lastoff > itemidptr->itemoff);
+
+ lastoff = itemidptr->itemoff;
+ }
+ }
+#endif /* USE_ASSERT_CHECKING */
+
+ /*
+ * 'itemidbase' is already in the optimal order, i.e., lower item
+ * pointers have a higher offset. This allows us to memmove() the
+ * tuples up to the end of the page without having to worry about
+ * overwriting other tuples that have not been moved yet.
+ *
+ * There's a good chance that there are tuples already right at the
+ * end of the page that we can simply skip over because they're
+ * already in the correct location within the page. We'll do that
+ * first...
+ */
+ upper = phdr->pd_special;
+ i = 0;
+ do
+ {
+ itemidptr = &itemidbase[i];
+ if (upper != itemidptr->itemoff + itemidptr->alignedlen)
+ break;
+ upper -= itemidptr->alignedlen;
+
+ i++;
+ } while (i < nitems);
+
+ /*
+ * Now that we've found the first tuple that needs to be moved, we can
+ * do the tuple compactification. We try and make the least number of
+ * memmove() calls and only call memmove() when there's a gap. When
+ * we see a gap we just move all tuples after the gap up until the
+ * point of the last move operation.
+ */
+ copy_tail = copy_head = itemidptr->itemoff + itemidptr->alignedlen;
+ for (; i < nitems; i++)
+ {
+ ItemId lp;
+
+ itemidptr = &itemidbase[i];
+ lp = PageGetItemId(page, itemidptr->offsetindex + 1);
+
+ if (copy_head != itemidptr->itemoff + itemidptr->alignedlen)
+ {
+ memmove((char *) page + upper,
+ page + copy_head,
+ copy_tail - copy_head);
+
+ /*
+ * We've now moved all tuples already seen, but not the
+ * current tuple, so we set the copy_tail to the end of this
+ * tuple so it can be moved in another iteration of the loop.
+ */
+ copy_tail = itemidptr->itemoff + itemidptr->alignedlen;
+ }
+ /* shift the target offset down by the length of this tuple */
+ upper -= itemidptr->alignedlen;
+ /* point the copy_head to the start of this tuple */
+ copy_head = itemidptr->itemoff;
+
+ /* update the line pointer to reference the new offset */
+ lp->lp_off = upper;
+
+ }
+
+ /* move the remaining tuples. */
+ memmove((char *) page + upper,
+ page + copy_head,
+ copy_tail - copy_head);
+ }
+ else
+ {
+ PGAlignedBlock scratch;
+ char *scratchptr = scratch.data;
+
+ /*
+ * Non-presorted case: The tuples in the itemidbase array may be in
+ * any order. So, in order to move these to the end of the page we
+ * must make a temp copy of each tuple that needs to be moved before
+ * we copy them back into the page at the new offset.
+ *
+ * If a large percentage of tuples have been pruned (>75%) then we'll
+ * copy these into the temp buffer tuple-by-tuple, otherwise, we'll
+ * just do a single memcpy() for all tuples that need to be moved.
+ * When so many tuples have been removed there's likely to be a lot of
+ * gaps and it's unlikely that many non-movable tuples remain at the
+ * end of the page.
+ */
+ if (nitems < PageGetMaxOffsetNumber(page) / 4)
+ {
+ i = 0;
+ do
+ {
+ itemidptr = &itemidbase[i];
+ memcpy(scratchptr + itemidptr->itemoff, page + itemidptr->itemoff,
+ itemidptr->alignedlen);
+ i++;
+ } while (i < nitems);
+
+ /* Set things up for the compactification code below */
+ i = 0;
+ itemidptr = &itemidbase[0];
+ upper = phdr->pd_special;
+ }
+ else
+ {
+ upper = phdr->pd_special;
+
+ /*
+ * Many tuples are likely to already be in the correct location.
+ * There's no need to copy these into the temp buffer. Instead
+ * we'll just skip forward in the itemidbase array to the position
+ * that we do need to move tuples from so that the code below just
+ * leaves these ones alone.
+ */
+ i = 0;
+ do
+ {
+ itemidptr = &itemidbase[i];
+ if (upper != itemidptr->itemoff + itemidptr->alignedlen)
+ break;
+ upper -= itemidptr->alignedlen;
+
+ i++;
+ } while (i < nitems);
+
+ /* Copy all tuples that need to be moved into the temp buffer */
+ memcpy(scratchptr + phdr->pd_upper,
+ page + phdr->pd_upper,
+ upper - phdr->pd_upper);
+ }
+
+ /*
+ * Do the tuple compactification. itemidptr is already pointing to
+ * the first tuple that we're going to move. Here we collapse the
+ * memcpy calls for adjacent tuples into a single call. This is done
+ * by delaying the memcpy call until we find a gap that needs to be
+ * closed.
+ */
+ copy_tail = copy_head = itemidptr->itemoff + itemidptr->alignedlen;
+ for (; i < nitems; i++)
+ {
+ ItemId lp;
+
+ itemidptr = &itemidbase[i];
+ lp = PageGetItemId(page, itemidptr->offsetindex + 1);
+
+ /* copy pending tuples when we detect a gap */
+ if (copy_head != itemidptr->itemoff + itemidptr->alignedlen)
+ {
+ memcpy((char *) page + upper,
+ scratchptr + copy_head,
+ copy_tail - copy_head);
+
+ /*
+ * We've now copied all tuples already seen, but not the
+ * current tuple, so we set the copy_tail to the end of this
+ * tuple.
+ */
+ copy_tail = itemidptr->itemoff + itemidptr->alignedlen;
+ }
+ /* shift the target offset down by the length of this tuple */
+ upper -= itemidptr->alignedlen;
+ /* point the copy_head to the start of this tuple */
+ copy_head = itemidptr->itemoff;
+
+ /* update the line pointer to reference the new offset */
+ lp->lp_off = upper;
+
+ }
+
+ /* Copy the remaining chunk */
+ memcpy((char *) page + upper,
+ scratchptr + copy_head,
+ copy_tail - copy_head);
+ }
+
+ phdr->pd_upper = upper;
+}
+
+/*
+ * PageRepairFragmentation
+ *
+ * Frees fragmented space on a heap page following pruning.
+ *
+ * This routine is usable for heap pages only, but see PageIndexMultiDelete.
+ *
+ * Never removes unused line pointers. PageTruncateLinePointerArray can
+ * safely remove some unused line pointers. It ought to be safe for this
+ * routine to free unused line pointers in roughly the same way, but it's not
+ * clear that that would be beneficial.
+ *
+ * PageTruncateLinePointerArray is only called during VACUUM's second pass
+ * over the heap. Any unused line pointers that it sees are likely to have
+ * been set to LP_UNUSED (from LP_DEAD) immediately before the time it is
+ * called. On the other hand, many tables have the vast majority of all
+ * required pruning performed opportunistically (not during VACUUM). And so
+ * there is, in general, a good chance that even large groups of unused line
+ * pointers that we see here will be recycled quickly.
+ *
+ * Caller had better have a super-exclusive lock on page's buffer. As a side
+ * effect the page's PD_HAS_FREE_LINES hint bit will be set or unset as
+ * needed.
+ */
+void
+PageRepairFragmentation(Page page)
+{
+ Offset pd_lower = ((PageHeader) page)->pd_lower;
+ Offset pd_upper = ((PageHeader) page)->pd_upper;
+ Offset pd_special = ((PageHeader) page)->pd_special;
+ Offset last_offset;
+ itemIdCompactData itemidbase[MaxHeapTuplesPerPage];
+ itemIdCompact itemidptr;
+ ItemId lp;
+ int nline,
+ nstorage,
+ nunused;
+ int i;
+ Size totallen;
+ bool presorted = true; /* For now */
+
+ /*
+ * It's worth the trouble to be more paranoid here than in most places,
+ * because we are about to reshuffle data in (what is usually) a shared
+ * disk buffer. If we aren't careful then corrupted pointers, lengths,
+ * etc could cause us to clobber adjacent disk buffers, spreading the data
+ * loss further. So, check everything.
+ */
+ if (pd_lower < SizeOfPageHeaderData ||
+ pd_lower > pd_upper ||
+ pd_upper > pd_special ||
+ pd_special > BLCKSZ ||
+ pd_special != MAXALIGN(pd_special))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+ pd_lower, pd_upper, pd_special)));
+
+ /*
+ * Run through the line pointer array and collect data about live items.
+ */
+ nline = PageGetMaxOffsetNumber(page);
+ itemidptr = itemidbase;
+ nunused = totallen = 0;
+ last_offset = pd_special;
+ for (i = FirstOffsetNumber; i <= nline; i++)
+ {
+ lp = PageGetItemId(page, i);
+ if (ItemIdIsUsed(lp))
+ {
+ if (ItemIdHasStorage(lp))
+ {
+ itemidptr->offsetindex = i - 1;
+ itemidptr->itemoff = ItemIdGetOffset(lp);
+
+ if (last_offset > itemidptr->itemoff)
+ last_offset = itemidptr->itemoff;
+ else
+ presorted = false;
+
+ if (unlikely(itemidptr->itemoff < (int) pd_upper ||
+ itemidptr->itemoff >= (int) pd_special))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted line pointer: %u",
+ itemidptr->itemoff)));
+ itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp));
+ totallen += itemidptr->alignedlen;
+ itemidptr++;
+ }
+ }
+ else
+ {
+ /* Unused entries should have lp_len = 0, but make sure */
+ ItemIdSetUnused(lp);
+ nunused++;
+ }
+ }
+
+ nstorage = itemidptr - itemidbase;
+ if (nstorage == 0)
+ {
+ /* Page is completely empty, so just reset it quickly */
+ ((PageHeader) page)->pd_upper = pd_special;
+ }
+ else
+ {
+ /* Need to compact the page the hard way */
+ if (totallen > (Size) (pd_special - pd_lower))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted item lengths: total %u, available space %u",
+ (unsigned int) totallen, pd_special - pd_lower)));
+
+ compactify_tuples(itemidbase, nstorage, page, presorted);
+ }
+
+ /* Set hint bit for PageAddItemExtended */
+ if (nunused > 0)
+ PageSetHasFreeLinePointers(page);
+ else
+ PageClearHasFreeLinePointers(page);
+}
+
+/*
+ * PageTruncateLinePointerArray
+ *
+ * Removes unused line pointers at the end of the line pointer array.
+ *
+ * This routine is usable for heap pages only. It is called by VACUUM during
+ * its second pass over the heap. We expect at least one LP_UNUSED line
+ * pointer on the page (if VACUUM didn't have an LP_DEAD item on the page that
+ * it just set to LP_UNUSED then it should not call here).
+ *
+ * We avoid truncating the line pointer array to 0 items; if necessary we
+ * leave behind a single remaining LP_UNUSED item. This is a little
+ * arbitrary, but it seems like a good idea to avoid leaving a PageIsEmpty()
+ * page behind.
+ *
+ * Caller can have either an exclusive lock or a super-exclusive lock on
+ * page's buffer. The page's PD_HAS_FREE_LINES hint bit will be set or unset
+ * based on whether or not we leave behind any remaining LP_UNUSED items.
+ */
+void
+PageTruncateLinePointerArray(Page page)
+{
+ PageHeader phdr = (PageHeader) page;
+ bool countdone = false,
+ sethint = false;
+ int nunusedend = 0;
+
+ /* Scan line pointer array back-to-front */
+ for (int i = PageGetMaxOffsetNumber(page); i >= FirstOffsetNumber; i--)
+ {
+ ItemId lp = PageGetItemId(page, i);
+
+ if (!countdone && i > FirstOffsetNumber)
+ {
+ /*
+ * Still determining which line pointers from the end of the array
+ * will be truncated away. Either count another line pointer as
+ * safe to truncate, or notice that it's not safe to truncate
+ * additional line pointers (stop counting line pointers).
+ */
+ if (!ItemIdIsUsed(lp))
+ nunusedend++;
+ else
+ countdone = true;
+ }
+ else
+ {
+ /*
+ * Once we've stopped counting we still need to figure out if
+ * there are any remaining LP_UNUSED line pointers somewhere more
+ * towards the front of the array.
+ */
+ if (!ItemIdIsUsed(lp))
+ {
+ /*
+ * This is an unused line pointer that we won't be truncating
+ * away -- so there is at least one. Set hint on page.
+ */
+ sethint = true;
+ break;
+ }
+ }
+ }
+
+ if (nunusedend > 0)
+ {
+ phdr->pd_lower -= sizeof(ItemIdData) * nunusedend;
+
+#ifdef CLOBBER_FREED_MEMORY
+ memset((char *) page + phdr->pd_lower, 0x7F,
+ sizeof(ItemIdData) * nunusedend);
+#endif
+ }
+ else
+ Assert(sethint);
+
+ /* Set hint bit for PageAddItemExtended */
+ if (sethint)
+ PageSetHasFreeLinePointers(page);
+ else
+ PageClearHasFreeLinePointers(page);
+}
+
+/*
+ * PageGetFreeSpace
+ * Returns the size of the free (allocatable) space on a page,
+ * reduced by the space needed for a new line pointer.
+ *
+ * Note: this should usually only be used on index pages. Use
+ * PageGetHeapFreeSpace on heap pages.
+ */
+Size
+PageGetFreeSpace(Page page)
+{
+ int space;
+
+ /*
+ * Use signed arithmetic here so that we behave sensibly if pd_lower >
+ * pd_upper.
+ */
+ space = (int) ((PageHeader) page)->pd_upper -
+ (int) ((PageHeader) page)->pd_lower;
+
+ if (space < (int) sizeof(ItemIdData))
+ return 0;
+ space -= sizeof(ItemIdData);
+
+ return (Size) space;
+}
+
+/*
+ * PageGetFreeSpaceForMultipleTuples
+ * Returns the size of the free (allocatable) space on a page,
+ * reduced by the space needed for multiple new line pointers.
+ *
+ * Note: this should usually only be used on index pages. Use
+ * PageGetHeapFreeSpace on heap pages.
+ */
+Size
+PageGetFreeSpaceForMultipleTuples(Page page, int ntups)
+{
+ int space;
+
+ /*
+ * Use signed arithmetic here so that we behave sensibly if pd_lower >
+ * pd_upper.
+ */
+ space = (int) ((PageHeader) page)->pd_upper -
+ (int) ((PageHeader) page)->pd_lower;
+
+ if (space < (int) (ntups * sizeof(ItemIdData)))
+ return 0;
+ space -= ntups * sizeof(ItemIdData);
+
+ return (Size) space;
+}
+
+/*
+ * PageGetExactFreeSpace
+ * Returns the size of the free (allocatable) space on a page,
+ * without any consideration for adding/removing line pointers.
+ */
+Size
+PageGetExactFreeSpace(Page page)
+{
+ int space;
+
+ /*
+ * Use signed arithmetic here so that we behave sensibly if pd_lower >
+ * pd_upper.
+ */
+ space = (int) ((PageHeader) page)->pd_upper -
+ (int) ((PageHeader) page)->pd_lower;
+
+ if (space < 0)
+ return 0;
+
+ return (Size) space;
+}
+
+
+/*
+ * PageGetHeapFreeSpace
+ * Returns the size of the free (allocatable) space on a page,
+ * reduced by the space needed for a new line pointer.
+ *
+ * The difference between this and PageGetFreeSpace is that this will return
+ * zero if there are already MaxHeapTuplesPerPage line pointers in the page
+ * and none are free. We use this to enforce that no more than
+ * MaxHeapTuplesPerPage line pointers are created on a heap page. (Although
+ * no more tuples than that could fit anyway, in the presence of redirected
+ * or dead line pointers it'd be possible to have too many line pointers.
+ * To avoid breaking code that assumes MaxHeapTuplesPerPage is a hard limit
+ * on the number of line pointers, we make this extra check.)
+ */
+Size
+PageGetHeapFreeSpace(Page page)
+{
+ Size space;
+
+ space = PageGetFreeSpace(page);
+ if (space > 0)
+ {
+ OffsetNumber offnum,
+ nline;
+
+ /*
+ * Are there already MaxHeapTuplesPerPage line pointers in the page?
+ */
+ nline = PageGetMaxOffsetNumber(page);
+ if (nline >= MaxHeapTuplesPerPage)
+ {
+ if (PageHasFreeLinePointers((PageHeader) page))
+ {
+ /*
+ * Since this is just a hint, we must confirm that there is
+ * indeed a free line pointer
+ */
+ for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum))
+ {
+ ItemId lp = PageGetItemId(page, offnum);
+
+ if (!ItemIdIsUsed(lp))
+ break;
+ }
+
+ if (offnum > nline)
+ {
+ /*
+ * The hint is wrong, but we can't clear it here since we
+ * don't have the ability to mark the page dirty.
+ */
+ space = 0;
+ }
+ }
+ else
+ {
+ /*
+ * Although the hint might be wrong, PageAddItem will believe
+ * it anyway, so we must believe it too.
+ */
+ space = 0;
+ }
+ }
+ }
+ return space;
+}
+
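+/*
+ * Illustrative sketch only: callers typically compare the reported free
+ * space against the aligned size of the tuple they intend to insert, e.g.
+ *
+ *		if (PageGetHeapFreeSpace(page) < MAXALIGN(itemsz))
+ *			... the page is full, so find or add another one ...
+ *
+ * where "itemsz" is the caller's tuple length. PageGetFreeSpace and
+ * PageGetFreeSpaceForMultipleTuples play the same role for index pages,
+ * while PageGetExactFreeSpace ignores the cost of new line pointers.
+ */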
+
+/*
+ * PageIndexTupleDelete
+ *
+ * This routine does the work of removing a tuple from an index page.
+ *
+ * Unlike heap pages, we compact out the line pointer for the removed tuple.
+ */
+void
+PageIndexTupleDelete(Page page, OffsetNumber offnum)
+{
+ PageHeader phdr = (PageHeader) page;
+ char *addr;
+ ItemId tup;
+ Size size;
+ unsigned offset;
+ int nbytes;
+ int offidx;
+ int nline;
+
+ /*
+ * As with PageRepairFragmentation, paranoia seems justified.
+ */
+ if (phdr->pd_lower < SizeOfPageHeaderData ||
+ phdr->pd_lower > phdr->pd_upper ||
+ phdr->pd_upper > phdr->pd_special ||
+ phdr->pd_special > BLCKSZ ||
+ phdr->pd_special != MAXALIGN(phdr->pd_special))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+ phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));
+
+ nline = PageGetMaxOffsetNumber(page);
+ if ((int) offnum <= 0 || (int) offnum > nline)
+ elog(ERROR, "invalid index offnum: %u", offnum);
+
+ /* change offset number to offset index */
+ offidx = offnum - 1;
+
+ tup = PageGetItemId(page, offnum);
+ Assert(ItemIdHasStorage(tup));
+ size = ItemIdGetLength(tup);
+ offset = ItemIdGetOffset(tup);
+
+ if (offset < phdr->pd_upper || (offset + size) > phdr->pd_special ||
+ offset != MAXALIGN(offset))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted line pointer: offset = %u, size = %u",
+ offset, (unsigned int) size)));
+
+ /* Amount of space to actually be deleted */
+ size = MAXALIGN(size);
+
+ /*
+ * First, we want to get rid of the pd_linp entry for the index tuple. We
+ * copy all subsequent linp's back one slot in the array. We don't use
+ * PageGetItemId, because we are manipulating the _array_, not individual
+ * linp's.
+ */
+ nbytes = phdr->pd_lower -
+ ((char *) &phdr->pd_linp[offidx + 1] - (char *) phdr);
+
+ if (nbytes > 0)
+ memmove((char *) &(phdr->pd_linp[offidx]),
+ (char *) &(phdr->pd_linp[offidx + 1]),
+ nbytes);
+
+ /*
+ * Now move everything between the old upper bound (beginning of tuple
+ * space) and the beginning of the deleted tuple forward, so that space in
+ * the middle of the page is left free. If we've just deleted the tuple
+ * at the beginning of tuple space, then there's no need to do the copy.
+ */
+
+ /* beginning of tuple space */
+ addr = (char *) page + phdr->pd_upper;
+
+ if (offset > phdr->pd_upper)
+ memmove(addr + size, addr, offset - phdr->pd_upper);
+
+ /* adjust free space boundary pointers */
+ phdr->pd_upper += size;
+ phdr->pd_lower -= sizeof(ItemIdData);
+
+ /*
+ * Finally, we need to adjust the linp entries that remain.
+ *
+ * Anything that used to be before the deleted tuple's data was moved
+ * forward by the size of the deleted tuple.
+ */
+ if (!PageIsEmpty(page))
+ {
+ int i;
+
+ nline--; /* there's one less than when we started */
+ for (i = 1; i <= nline; i++)
+ {
+ ItemId ii = PageGetItemId(phdr, i);
+
+ Assert(ItemIdHasStorage(ii));
+ if (ItemIdGetOffset(ii) <= offset)
+ ii->lp_off += size;
+ }
+ }
+}
+
+
+/*
+ * PageIndexMultiDelete
+ *
+ * This routine handles the case of deleting multiple tuples from an
+ * index page at once. It is considerably faster than a loop around
+ * PageIndexTupleDelete ... however, the caller *must* supply the array
+ * of item numbers to be deleted in item number order!
+ */
+void
+PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
+{
+ PageHeader phdr = (PageHeader) page;
+ Offset pd_lower = phdr->pd_lower;
+ Offset pd_upper = phdr->pd_upper;
+ Offset pd_special = phdr->pd_special;
+ Offset last_offset;
+ itemIdCompactData itemidbase[MaxIndexTuplesPerPage];
+ ItemIdData newitemids[MaxIndexTuplesPerPage];
+ itemIdCompact itemidptr;
+ ItemId lp;
+ int nline,
+ nused;
+ Size totallen;
+ Size size;
+ unsigned offset;
+ int nextitm;
+ OffsetNumber offnum;
+ bool presorted = true; /* For now */
+
+ Assert(nitems <= MaxIndexTuplesPerPage);
+
+ /*
+ * If there aren't very many items to delete, then retail
+ * PageIndexTupleDelete is the best way. Delete the items in reverse
+ * order so we don't have to think about adjusting item numbers for
+ * previous deletions.
+ *
+ * TODO: tune the magic number here
+ */
+ if (nitems <= 2)
+ {
+ while (--nitems >= 0)
+ PageIndexTupleDelete(page, itemnos[nitems]);
+ return;
+ }
+
+ /*
+ * As with PageRepairFragmentation, paranoia seems justified.
+ */
+ if (pd_lower < SizeOfPageHeaderData ||
+ pd_lower > pd_upper ||
+ pd_upper > pd_special ||
+ pd_special > BLCKSZ ||
+ pd_special != MAXALIGN(pd_special))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+ pd_lower, pd_upper, pd_special)));
+
+ /*
+ * Scan the line pointer array and build a list of just the ones we are
+ * going to keep. Notice we do not modify the page yet, since we are
+ * still validity-checking.
+ */
+ nline = PageGetMaxOffsetNumber(page);
+ itemidptr = itemidbase;
+ totallen = 0;
+ nused = 0;
+ nextitm = 0;
+ last_offset = pd_special;
+ for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum))
+ {
+ lp = PageGetItemId(page, offnum);
+ Assert(ItemIdHasStorage(lp));
+ size = ItemIdGetLength(lp);
+ offset = ItemIdGetOffset(lp);
+ if (offset < pd_upper ||
+ (offset + size) > pd_special ||
+ offset != MAXALIGN(offset))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted line pointer: offset = %u, size = %u",
+ offset, (unsigned int) size)));
+
+ if (nextitm < nitems && offnum == itemnos[nextitm])
+ {
+ /* skip item to be deleted */
+ nextitm++;
+ }
+ else
+ {
+ itemidptr->offsetindex = nused; /* where it will go */
+ itemidptr->itemoff = offset;
+
+ if (last_offset > itemidptr->itemoff)
+ last_offset = itemidptr->itemoff;
+ else
+ presorted = false;
+
+ itemidptr->alignedlen = MAXALIGN(size);
+ totallen += itemidptr->alignedlen;
+ newitemids[nused] = *lp;
+ itemidptr++;
+ nused++;
+ }
+ }
+
+ /* this will catch invalid or out-of-order itemnos[] */
+ if (nextitm != nitems)
+ elog(ERROR, "incorrect index offsets supplied");
+
+ if (totallen > (Size) (pd_special - pd_lower))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted item lengths: total %u, available space %u",
+ (unsigned int) totallen, pd_special - pd_lower)));
+
+ /*
+ * Looks good. Overwrite the line pointers with the copy, from which we've
+ * removed all the unused items.
+ */
+ memcpy(phdr->pd_linp, newitemids, nused * sizeof(ItemIdData));
+ phdr->pd_lower = SizeOfPageHeaderData + nused * sizeof(ItemIdData);
+
+ /* and compactify the tuple data */
+ if (nused > 0)
+ compactify_tuples(itemidbase, nused, page, presorted);
+ else
+ phdr->pd_upper = pd_special;
+}
+
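+/*
+ * Illustrative sketch only: the itemnos[] array passed to
+ * PageIndexMultiDelete must already be in ascending item number order, e.g.
+ *
+ *		OffsetNumber deletable[] = {2, 5, 7};
+ *
+ *		PageIndexMultiDelete(page, deletable, lengthof(deletable));
+ */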
+
+/*
+ * PageIndexTupleDeleteNoCompact
+ *
+ * Remove the specified tuple from an index page, but set its line pointer
+ * to "unused" instead of compacting it out, except that it can be removed
+ * if it's the last line pointer on the page.
+ *
+ * This is used for index AMs that require that existing TIDs of live tuples
+ * remain unchanged, and are willing to allow unused line pointers instead.
+ */
+void
+PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum)
+{
+ PageHeader phdr = (PageHeader) page;
+ char *addr;
+ ItemId tup;
+ Size size;
+ unsigned offset;
+ int nline;
+
+ /*
+ * As with PageRepairFragmentation, paranoia seems justified.
+ */
+ if (phdr->pd_lower < SizeOfPageHeaderData ||
+ phdr->pd_lower > phdr->pd_upper ||
+ phdr->pd_upper > phdr->pd_special ||
+ phdr->pd_special > BLCKSZ ||
+ phdr->pd_special != MAXALIGN(phdr->pd_special))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+ phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));
+
+ nline = PageGetMaxOffsetNumber(page);
+ if ((int) offnum <= 0 || (int) offnum > nline)
+ elog(ERROR, "invalid index offnum: %u", offnum);
+
+ tup = PageGetItemId(page, offnum);
+ Assert(ItemIdHasStorage(tup));
+ size = ItemIdGetLength(tup);
+ offset = ItemIdGetOffset(tup);
+
+ if (offset < phdr->pd_upper || (offset + size) > phdr->pd_special ||
+ offset != MAXALIGN(offset))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted line pointer: offset = %u, size = %u",
+ offset, (unsigned int) size)));
+
+ /* Amount of space to actually be deleted */
+ size = MAXALIGN(size);
+
+ /*
+ * Either set the line pointer to "unused", or zap it if it's the last
+ * one. (Note: it's possible that the next-to-last one(s) are already
+ * unused, but we do not trouble to try to compact them out if so.)
+ */
+ if ((int) offnum < nline)
+ ItemIdSetUnused(tup);
+ else
+ {
+ phdr->pd_lower -= sizeof(ItemIdData);
+ nline--; /* there's one less than when we started */
+ }
+
+ /*
+ * Now move everything between the old upper bound (beginning of tuple
+ * space) and the beginning of the deleted tuple forward, so that space in
+ * the middle of the page is left free. If we've just deleted the tuple
+ * at the beginning of tuple space, then there's no need to do the copy.
+ */
+
+ /* beginning of tuple space */
+ addr = (char *) page + phdr->pd_upper;
+
+ if (offset > phdr->pd_upper)
+ memmove(addr + size, addr, offset - phdr->pd_upper);
+
+ /* adjust free space boundary pointer */
+ phdr->pd_upper += size;
+
+ /*
+ * Finally, we need to adjust the linp entries that remain.
+ *
+ * Anything that used to be before the deleted tuple's data was moved
+ * forward by the size of the deleted tuple.
+ */
+ if (!PageIsEmpty(page))
+ {
+ int i;
+
+ for (i = 1; i <= nline; i++)
+ {
+ ItemId ii = PageGetItemId(phdr, i);
+
+ if (ItemIdHasStorage(ii) && ItemIdGetOffset(ii) <= offset)
+ ii->lp_off += size;
+ }
+ }
+}
+
+
+/*
+ * PageIndexTupleOverwrite
+ *
+ * Replace a specified tuple on an index page.
+ *
+ * The new tuple is placed exactly where the old one had been, shifting
+ * other tuples' data up or down as needed to keep the page compacted.
+ * This is better than deleting and reinserting the tuple, because it
+ * avoids any data shifting when the tuple size doesn't change; and
+ * even when it does, we avoid moving the line pointers around.
+ * This could be used by an index AM that doesn't want to unset the
+ * LP_DEAD bit when it happens to be set. It could conceivably also be
+ * used by an index AM that cares about the physical order of tuples as
+ * well as their logical/ItemId order.
+ *
+ * If there's insufficient space for the new tuple, return false. Other
+ * errors represent data-corruption problems, so we just elog.
+ */
+bool
+PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
+ Item newtup, Size newsize)
+{
+ PageHeader phdr = (PageHeader) page;
+ ItemId tupid;
+ int oldsize;
+ unsigned offset;
+ Size alignednewsize;
+ int size_diff;
+ int itemcount;
+
+ /*
+ * As with PageRepairFragmentation, paranoia seems justified.
+ */
+ if (phdr->pd_lower < SizeOfPageHeaderData ||
+ phdr->pd_lower > phdr->pd_upper ||
+ phdr->pd_upper > phdr->pd_special ||
+ phdr->pd_special > BLCKSZ ||
+ phdr->pd_special != MAXALIGN(phdr->pd_special))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+ phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));
+
+ itemcount = PageGetMaxOffsetNumber(page);
+ if ((int) offnum <= 0 || (int) offnum > itemcount)
+ elog(ERROR, "invalid index offnum: %u", offnum);
+
+ tupid = PageGetItemId(page, offnum);
+ Assert(ItemIdHasStorage(tupid));
+ oldsize = ItemIdGetLength(tupid);
+ offset = ItemIdGetOffset(tupid);
+
+ if (offset < phdr->pd_upper || (offset + oldsize) > phdr->pd_special ||
+ offset != MAXALIGN(offset))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted line pointer: offset = %u, size = %u",
+ offset, (unsigned int) oldsize)));
+
+ /*
+ * Determine actual change in space requirement, check for page overflow.
+ */
+ oldsize = MAXALIGN(oldsize);
+ alignednewsize = MAXALIGN(newsize);
+ if (alignednewsize > oldsize + (phdr->pd_upper - phdr->pd_lower))
+ return false;
+
+ /*
+ * Relocate existing data and update line pointers, unless the new tuple
+ * is the same size as the old (after alignment), in which case there's
+ * nothing to do. Notice that what we have to relocate is data before the
+ * target tuple, not data after, so it's convenient to express size_diff
+ * as the amount by which the tuple's size is decreasing, making it the
+ * delta to add to pd_upper and affected line pointers.
+ */
+ size_diff = oldsize - (int) alignednewsize;
+ if (size_diff != 0)
+ {
+ char *addr = (char *) page + phdr->pd_upper;
+ int i;
+
+ /* relocate all tuple data before the target tuple */
+ memmove(addr + size_diff, addr, offset - phdr->pd_upper);
+
+ /* adjust free space boundary pointer */
+ phdr->pd_upper += size_diff;
+
+ /* adjust affected line pointers too */
+ for (i = FirstOffsetNumber; i <= itemcount; i++)
+ {
+ ItemId ii = PageGetItemId(phdr, i);
+
+ /* Allow items without storage; currently only BRIN needs that */
+ if (ItemIdHasStorage(ii) && ItemIdGetOffset(ii) <= offset)
+ ii->lp_off += size_diff;
+ }
+ }
+
+ /* Update the item's tuple length without changing its lp_flags field */
+ tupid->lp_off = offset + size_diff;
+ tupid->lp_len = newsize;
+
+ /* Copy new tuple data onto page */
+ memcpy(PageGetItem(page, tupid), newtup, newsize);
+
+ return true;
+}
+
+
+/*
+ * Set checksum for a page in shared buffers.
+ *
+ * If checksums are disabled, or if the page is not initialized, just return
+ * the input. Otherwise, we must make a copy of the page before calculating
+ * the checksum, to prevent concurrent modifications (e.g. setting hint bits)
+ * from making the final checksum invalid. It doesn't matter if we include or
+ * exclude hints during the copy, as long as we write a valid page and
+ * associated checksum.
+ *
+ * Returns a pointer to the block-sized data that needs to be written. Uses
+ * statically-allocated memory, so the caller must immediately write the
+ * returned page and not refer to it again.
+ */
+char *
+PageSetChecksumCopy(Page page, BlockNumber blkno)
+{
+ static char *pageCopy = NULL;
+
+ /* If we don't need a checksum, just return the passed-in data */
+ if (PageIsNew(page) || !DataChecksumsEnabled())
+ return (char *) page;
+
+ /*
+ * We allocate the copy space once and use it over on each subsequent
+ * call. The point of palloc'ing here, rather than having a static char
+ * array, is first to ensure adequate alignment for the checksumming code
+ * and second to avoid wasting space in processes that never call this.
+ */
+ if (pageCopy == NULL)
+ pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ);
+
+ memcpy(pageCopy, (char *) page, BLCKSZ);
+ ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno);
+ return pageCopy;
+}
+
+/*
+ * Set checksum for a page in private memory.
+ *
+ * This must only be used when we know that no other process can be modifying
+ * the page buffer.
+ */
+void
+PageSetChecksumInplace(Page page, BlockNumber blkno)
+{
+ /* If we don't need a checksum, just return */
+ if (PageIsNew(page) || !DataChecksumsEnabled())
+ return;
+
+ ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno);
+}
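+
+/*
+ * Illustrative sketch only: a caller writing out a page that it built in
+ * private memory would typically do
+ *
+ *		PageSetChecksumInplace(page, blkno);
+ *		smgrextend(reln, MAIN_FORKNUM, blkno, (char *) page, true);
+ *
+ * whereas pages written from shared buffers go through PageSetChecksumCopy,
+ * since other backends may still be setting hint bits on them. Here "reln",
+ * "page" and "blkno" stand for the caller's own variables.
+ */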
diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c
new file mode 100644
index 0000000..6462ddd
--- /dev/null
+++ b/src/backend/storage/page/checksum.c
@@ -0,0 +1,22 @@
+/*-------------------------------------------------------------------------
+ *
+ * checksum.c
+ * Checksum implementation for data pages.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/page/checksum.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/checksum.h"
+/*
+ * The actual code is in storage/checksum_impl.h. This is done so that
+ * external programs can incorporate the checksum code by #include'ing
+ * that file from the exported Postgres headers. (Compare our CRC code.)
+ */
+#include "storage/checksum_impl.h"
diff --git a/src/backend/storage/page/itemptr.c b/src/backend/storage/page/itemptr.c
new file mode 100644
index 0000000..f40d6c2
--- /dev/null
+++ b/src/backend/storage/page/itemptr.c
@@ -0,0 +1,132 @@
+/*-------------------------------------------------------------------------
+ *
+ * itemptr.c
+ * POSTGRES disk item pointer code.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/page/itemptr.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/itemptr.h"
+
+
+/*
+ * ItemPointerEquals
+ * Returns true if both item pointers point to the same item,
+ * otherwise returns false.
+ *
+ * Note:
+ * Asserts that the disk item pointers are both valid!
+ */
+bool
+ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
+{
+ /*
+ * We really want ItemPointerData to be exactly 6 bytes. This is rather a
+ * random place to check, but there is no better place.
+ */
+ StaticAssertStmt(sizeof(ItemPointerData) == 3 * sizeof(uint16),
+ "ItemPointerData struct is improperly padded");
+
+ if (ItemPointerGetBlockNumber(pointer1) ==
+ ItemPointerGetBlockNumber(pointer2) &&
+ ItemPointerGetOffsetNumber(pointer1) ==
+ ItemPointerGetOffsetNumber(pointer2))
+ return true;
+ else
+ return false;
+}
+
+/*
+ * ItemPointerCompare
+ * Generic btree-style comparison for item pointers.
+ */
+int32
+ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
+{
+ /*
+ * Use ItemPointerGet{Offset,Block}NumberNoCheck to avoid asserting
+ * ip_posid != 0, which may not be true for a user-supplied TID.
+ */
+ BlockNumber b1 = ItemPointerGetBlockNumberNoCheck(arg1);
+ BlockNumber b2 = ItemPointerGetBlockNumberNoCheck(arg2);
+
+ if (b1 < b2)
+ return -1;
+ else if (b1 > b2)
+ return 1;
+ else if (ItemPointerGetOffsetNumberNoCheck(arg1) <
+ ItemPointerGetOffsetNumberNoCheck(arg2))
+ return -1;
+ else if (ItemPointerGetOffsetNumberNoCheck(arg1) >
+ ItemPointerGetOffsetNumberNoCheck(arg2))
+ return 1;
+ else
+ return 0;
+}
+
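+/*
+ * Illustrative sketch only: comparing two TIDs built with ItemPointerSet.
+ *
+ *		ItemPointerData a;
+ *		ItemPointerData b;
+ *
+ *		ItemPointerSet(&a, 10, 1);
+ *		ItemPointerSet(&b, 10, 2);
+ *		Assert(ItemPointerCompare(&a, &b) < 0);
+ *		Assert(!ItemPointerEquals(&a, &b));
+ */
+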
+/*
+ * ItemPointerInc
+ * Increment 'pointer' by 1 only paying attention to the ItemPointer's
+ * type's range limits and not MaxOffsetNumber and FirstOffsetNumber.
+ * This may result in 'pointer' becoming !OffsetNumberIsValid.
+ *
+ * If the pointer is already the maximum possible values permitted by the
+ * range of the ItemPointer's types, then do nothing.
+ */
+void
+ItemPointerInc(ItemPointer pointer)
+{
+ BlockNumber blk = ItemPointerGetBlockNumberNoCheck(pointer);
+ OffsetNumber off = ItemPointerGetOffsetNumberNoCheck(pointer);
+
+ if (off == PG_UINT16_MAX)
+ {
+ if (blk != InvalidBlockNumber)
+ {
+ off = 0;
+ blk++;
+ }
+ }
+ else
+ off++;
+
+ ItemPointerSet(pointer, blk, off);
+}
+
+/*
+ * ItemPointerDec
+ * Decrement 'pointer' by 1 only paying attention to the ItemPointer's
+ * type's range limits and not MaxOffsetNumber and FirstOffsetNumber.
+ * This may result in 'pointer' becoming !OffsetNumberIsValid.
+ *
+ * If the pointer is already the minimum possible values permitted by the
+ * range of the ItemPointer's types, then do nothing. This does rely on
+ * FirstOffsetNumber being 1 rather than 0.
+ */
+void
+ItemPointerDec(ItemPointer pointer)
+{
+ BlockNumber blk = ItemPointerGetBlockNumberNoCheck(pointer);
+ OffsetNumber off = ItemPointerGetOffsetNumberNoCheck(pointer);
+
+ if (off == 0)
+ {
+ if (blk != 0)
+ {
+ off = PG_UINT16_MAX;
+ blk--;
+ }
+ }
+ else
+ off--;
+
+ ItemPointerSet(pointer, blk, off);
+}
diff --git a/src/backend/storage/smgr/Makefile b/src/backend/storage/smgr/Makefile
new file mode 100644
index 0000000..596b564
--- /dev/null
+++ b/src/backend/storage/smgr/Makefile
@@ -0,0 +1,19 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/smgr
+#
+# IDENTIFICATION
+# src/backend/storage/smgr/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/smgr
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ md.o \
+ smgr.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/smgr/README b/src/backend/storage/smgr/README
new file mode 100644
index 0000000..e1cfc6c
--- /dev/null
+++ b/src/backend/storage/smgr/README
@@ -0,0 +1,52 @@
+src/backend/storage/smgr/README
+
+Storage Managers
+================
+
+In the original Berkeley Postgres system, there were several storage managers,
+of which only the "magnetic disk" manager remains. (At Berkeley there were
+also managers for the Sony WORM optical disk jukebox and persistent main
+memory, but these were never supported in any externally released Postgres,
+nor in any version of PostgreSQL.) The "magnetic disk" manager is itself
+seriously misnamed, because actually it supports any kind of device for
+which the operating system provides standard filesystem operations; which
+these days is pretty much everything of interest. However, we retain the
+notion of a storage manager switch in case anyone ever wants to reintroduce
+other kinds of storage managers. Removing the switch layer would save
+nothing noticeable anyway, since storage-access operations are surely far
+more expensive than one extra layer of C function calls.
+
+In Berkeley Postgres each relation was tagged with the ID of the storage
+manager to use for it. This is gone. It would probably be more reasonable
+to associate storage managers with tablespaces, should we ever re-introduce
+multiple storage managers into the system catalogs.
+
+The files in this directory, and their contents, are
+
+ smgr.c The storage manager switch dispatch code. The routines in
+ this file call the appropriate storage manager to do storage
+ accesses requested by higher-level code. smgr.c also manages
+ the file handle cache (SMgrRelation table).
+
+ md.c The "magnetic disk" storage manager, which is really just
+ an interface to the kernel's filesystem operations.
+
+Note that md.c in turn relies on src/backend/storage/file/fd.c.
+
+
+Relation Forks
+==============
+
+Since 8.4, a single smgr relation can be comprised of multiple physical
+files, called relation forks. This allows storing additional metadata like
+Free Space information in additional forks, which can be grown and truncated
+independently of the main data file, while still treating it all as a single
+physical relation in system catalogs.
+
+It is assumed that the main fork, fork number 0 or MAIN_FORKNUM, always
+exists. Fork numbers are assigned in src/include/common/relpath.h.
+Functions in smgr.c and md.c take an extra fork number argument, in addition
+to relfilenode and block number, to identify which relation fork you want to
+access. Since most code wants to access the main fork, a shortcut version of
+ReadBuffer that accesses MAIN_FORKNUM is provided in the buffer manager for
+convenience.
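+
+For illustration only, a caller that wants the size of a particular fork
+passes the fork number explicitly; assuming "rnode" identifies an existing
+relation whose FSM fork has been created, this might look like:
+
+	SMgrRelation reln = smgropen(rnode, InvalidBackendId);
+	BlockNumber main_blocks = smgrnblocks(reln, MAIN_FORKNUM);
+	BlockNumber fsm_blocks = smgrnblocks(reln, FSM_FORKNUM);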
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
new file mode 100644
index 0000000..b4bca7e
--- /dev/null
+++ b/src/backend/storage/smgr/md.c
@@ -0,0 +1,1409 @@
+/*-------------------------------------------------------------------------
+ *
+ * md.c
+ * This code manages relations that reside on magnetic disk.
+ *
+ * Or at least, that was what the Berkeley folk had in mind when they named
+ * this file. In reality, what this code provides is an interface from
+ * the smgr API to Unix-like filesystem APIs, so it will work with any type
+ * of device for which the operating system provides filesystem support.
+ * It doesn't matter whether the bits are on spinning rust or some other
+ * storage technology.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/smgr/md.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/file.h>
+
+#include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "commands/tablespace.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "postmaster/bgwriter.h"
+#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "storage/md.h"
+#include "storage/relfilenode.h"
+#include "storage/smgr.h"
+#include "storage/sync.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+
+/*
+ * The magnetic disk storage manager keeps track of open file
+ * descriptors in its own descriptor pool. This is done to make it
+ * easier to support relations that are larger than the operating
+ * system's file size limit (often 2GBytes). In order to do that,
+ * we break relations up into "segment" files that are each shorter than
+ * the OS file size limit. The segment size is set by the RELSEG_SIZE
+ * configuration constant in pg_config.h.
+ *
+ * On disk, a relation must consist of consecutively numbered segment
+ * files in the pattern
+ * -- Zero or more full segments of exactly RELSEG_SIZE blocks each
+ * -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
+ * -- Optionally, any number of inactive segments of size 0 blocks.
+ * The full and partial segments are collectively the "active" segments.
+ * Inactive segments are those that once contained data but are currently
+ * not needed because of an mdtruncate() operation. The reason for leaving
+ * them present at size zero, rather than unlinking them, is that other
+ * backends and/or the checkpointer might be holding open file references to
+ * such segments. If the relation expands again after mdtruncate(), such
+ * that a deactivated segment becomes active again, it is important that
+ * such file references still be valid --- else data might get written
+ * out to an unlinked old copy of a segment file that will eventually
+ * disappear.
+ *
+ * File descriptors are stored in the per-fork md_seg_fds arrays inside
+ * SMgrRelation. The length of these arrays is stored in md_num_open_segs.
+ * Note that a fork's md_num_open_segs having a specific value does not
+ * necessarily mean the relation doesn't have additional segments; we may
+ * just not have opened the next segment yet. (We could not have "all
+ * segments are in the array" as an invariant anyway, since another backend
+ * could extend the relation while we aren't looking.) We do not have
+ * entries for inactive segments, however; as soon as we find a partial
+ * segment, we assume that any subsequent segments are inactive.
+ *
+ * The entire MdfdVec array is palloc'd in the MdCxt memory context.
+ */
+
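+/*
+ * Illustrative sketch only: the segment holding a given block, and the
+ * block's offset within that segment, fall out of simple arithmetic:
+ *
+ *		BlockNumber segno = blkno / ((BlockNumber) RELSEG_SIZE);
+ *		BlockNumber segoff = blkno % ((BlockNumber) RELSEG_SIZE);
+ *
+ * which is what _mdfd_getseg() and the md I/O routines rely on.
+ */
+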
+typedef struct _MdfdVec
+{
+ File mdfd_vfd; /* fd number in fd.c's pool */
+ BlockNumber mdfd_segno; /* segment number, from 0 */
+} MdfdVec;
+
+static MemoryContext MdCxt; /* context for all MdfdVec objects */
+
+
+/* Populate a file tag describing an md.c segment file. */
+#define INIT_MD_FILETAG(a,xx_rnode,xx_forknum,xx_segno) \
+( \
+ memset(&(a), 0, sizeof(FileTag)), \
+ (a).handler = SYNC_HANDLER_MD, \
+ (a).rnode = (xx_rnode), \
+ (a).forknum = (xx_forknum), \
+ (a).segno = (xx_segno) \
+)
+
+
+/*** behavior for mdopen & _mdfd_getseg ***/
+/* ereport if segment not present */
+#define EXTENSION_FAIL (1 << 0)
+/* return NULL if segment not present */
+#define EXTENSION_RETURN_NULL (1 << 1)
+/* create new segments as needed */
+#define EXTENSION_CREATE (1 << 2)
+/* create new segments if needed during recovery */
+#define EXTENSION_CREATE_RECOVERY (1 << 3)
+/*
+ * Allow opening segments which are preceded by segments smaller than
+ * RELSEG_SIZE, e.g. inactive segments (see above). Note that this breaks
+ * mdnblocks() and related functionality henceforth - which currently is ok,
+ * because this is only required in the checkpointer which never uses
+ * mdnblocks().
+ */
+#define EXTENSION_DONT_CHECK_SIZE (1 << 4)
+
+
+/* local routines */
+static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
+ bool isRedo);
+static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior);
+static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
+ MdfdVec *seg);
+static void register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum,
+ BlockNumber segno);
+static void register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum,
+ BlockNumber segno);
+static void _fdvec_resize(SMgrRelation reln,
+ ForkNumber forknum,
+ int nseg);
+static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber segno);
+static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
+ BlockNumber segno, int oflags);
+static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
+ BlockNumber blkno, bool skipFsync, int behavior);
+static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
+ MdfdVec *seg);
+
+
+/*
+ * mdinit() -- Initialize private state for magnetic disk storage manager.
+ */
+void
+mdinit(void)
+{
+ MdCxt = AllocSetContextCreate(TopMemoryContext,
+ "MdSmgr",
+ ALLOCSET_DEFAULT_SIZES);
+}
+
+/*
+ * mdexists() -- Does the physical file exist?
+ *
+ * Note: this will return true for lingering files, with pending deletions
+ */
+bool
+mdexists(SMgrRelation reln, ForkNumber forkNum)
+{
+ /*
+ * Close it first, to ensure that we notice if the fork has been unlinked
+ * since we opened it.
+ */
+ mdclose(reln, forkNum);
+
+ return (mdopenfork(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
+}
+
+/*
+ * mdcreate() -- Create a new relation on magnetic disk.
+ *
+ * If isRedo is true, it's okay for the relation to exist already.
+ */
+void
+mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
+{
+ MdfdVec *mdfd;
+ char *path;
+ File fd;
+
+ if (isRedo && reln->md_num_open_segs[forkNum] > 0)
+ return; /* created and opened already... */
+
+ Assert(reln->md_num_open_segs[forkNum] == 0);
+
+ /*
+ * We may be using the target table space for the first time in this
+ * database, so create a per-database subdirectory if needed.
+ *
+ * XXX this is a fairly ugly violation of module layering, but this seems
+ * to be the best place to put the check. Maybe TablespaceCreateDbspace
+ * should be here and not in commands/tablespace.c? But that would imply
+ * importing a lot of stuff that smgr.c oughtn't know, either.
+ */
+ TablespaceCreateDbspace(reln->smgr_rnode.node.spcNode,
+ reln->smgr_rnode.node.dbNode,
+ isRedo);
+
+ path = relpath(reln->smgr_rnode, forkNum);
+
+ fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+
+ if (fd < 0)
+ {
+ int save_errno = errno;
+
+ if (isRedo)
+ fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+ if (fd < 0)
+ {
+ /* be sure to report the error reported by create, not open */
+ errno = save_errno;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m", path)));
+ }
+ }
+
+ pfree(path);
+
+ _fdvec_resize(reln, forkNum, 1);
+ mdfd = &reln->md_seg_fds[forkNum][0];
+ mdfd->mdfd_vfd = fd;
+ mdfd->mdfd_segno = 0;
+}
+
+/*
+ * mdunlink() -- Unlink a relation.
+ *
+ * Note that we're passed a RelFileNodeBackend --- by the time this is called,
+ * there won't be an SMgrRelation hashtable entry anymore.
+ *
+ * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber
+ * to delete all forks.
+ *
+ * For regular relations, we don't unlink the first segment file of the rel,
+ * but just truncate it to zero length, and record a request to unlink it after
+ * the next checkpoint. Additional segments can be unlinked immediately,
+ * however. Leaving the empty file in place prevents that relfilenode
+ * number from being reused. The scenario this protects us from is:
+ * 1. We delete a relation (and commit, and actually remove its file).
+ * 2. We create a new relation, which by chance gets the same relfilenode as
+ * the just-deleted one (OIDs must've wrapped around for that to happen).
+ * 3. We crash before another checkpoint occurs.
+ * During replay, we would delete the file and then recreate it, which is fine
+ * if the contents of the file were repopulated by subsequent WAL entries.
+ * But if we didn't WAL-log insertions, but instead relied on fsyncing the
+ * file after populating it (as we do at wal_level=minimal), the contents of
+ * the file would be lost forever. By leaving the empty file until after the
+ * next checkpoint, we prevent reassignment of the relfilenode number until
+ * it's safe, because relfilenode assignment skips over any existing file.
+ *
+ * We do not need to go through this dance for temp relations, though, because
+ * we never make WAL entries for temp rels, and so a temp rel poses no threat
+ * to the health of a regular rel that has taken over its relfilenode number.
+ * The fact that temp rels and regular rels have different file naming
+ * patterns provides additional safety.
+ *
+ * All the above applies only to the relation's main fork; other forks can
+ * just be removed immediately, since they are not needed to prevent the
+ * relfilenode number from being recycled. Also, we do not carefully
+ * track whether other forks have been created or not, but just attempt to
+ * unlink them unconditionally; so we should never complain about ENOENT.
+ *
+ * If isRedo is true, it's unsurprising for the relation to be already gone.
+ * Also, we should remove the file immediately instead of queuing a request
+ * for later, since during redo there's no possibility of creating a
+ * conflicting relation.
+ *
+ * Note: any failure should be reported as WARNING not ERROR, because
+ * we are usually not in a transaction anymore when this is called.
+ */
+void
+mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
+{
+ /* Now do the per-fork work */
+ if (forkNum == InvalidForkNumber)
+ {
+ for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++)
+ mdunlinkfork(rnode, forkNum, isRedo);
+ }
+ else
+ mdunlinkfork(rnode, forkNum, isRedo);
+}
+
+/*
+ * Truncate a file to release disk space.
+ */
+static int
+do_truncate(const char *path)
+{
+ int save_errno;
+ int ret;
+
+ ret = pg_truncate(path, 0);
+
+ /* Log a warning here to avoid repetition in callers. */
+ if (ret < 0 && errno != ENOENT)
+ {
+ save_errno = errno;
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\": %m", path)));
+ errno = save_errno;
+ }
+
+ return ret;
+}
+
+static void
+mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
+{
+ char *path;
+ int ret;
+
+ path = relpath(rnode, forkNum);
+
+ /*
+ * Delete or truncate the first segment.
+ */
+ if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
+ {
+ if (!RelFileNodeBackendIsTemp(rnode))
+ {
+ /* Prevent other backends' fds from holding on to the disk space */
+ ret = do_truncate(path);
+
+ /* Forget any pending sync requests for the first segment */
+ register_forget_request(rnode, forkNum, 0 /* first seg */ );
+ }
+ else
+ ret = 0;
+
+ /* Next unlink the file, unless it was already found to be missing */
+ if (ret == 0 || errno != ENOENT)
+ {
+ ret = unlink(path);
+ if (ret < 0 && errno != ENOENT)
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m", path)));
+ }
+ }
+ else
+ {
+ /* Prevent other backends' fds from holding on to the disk space */
+ ret = do_truncate(path);
+
+ /* Register request to unlink first segment later */
+ register_unlink_segment(rnode, forkNum, 0 /* first seg */ );
+ }
+
+ /*
+ * Delete any additional segments.
+ */
+ if (ret >= 0)
+ {
+ char *segpath = (char *) palloc(strlen(path) + 12);
+ BlockNumber segno;
+
+ /*
+ * Note that because we loop until getting ENOENT, we will correctly
+ * remove all inactive segments as well as active ones.
+ */
+ for (segno = 1;; segno++)
+ {
+ sprintf(segpath, "%s.%u", path, segno);
+
+ if (!RelFileNodeBackendIsTemp(rnode))
+ {
+ /*
+ * Prevent other backends' fds from holding on to the disk
+ * space.
+ */
+ if (do_truncate(segpath) < 0 && errno == ENOENT)
+ break;
+
+ /*
+ * Forget any pending sync requests for this segment before we
+ * try to unlink.
+ */
+ register_forget_request(rnode, forkNum, segno);
+ }
+
+ if (unlink(segpath) < 0)
+ {
+ /* ENOENT is expected after the last segment... */
+ if (errno != ENOENT)
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m", segpath)));
+ break;
+ }
+ }
+ pfree(segpath);
+ }
+
+ pfree(path);
+}
+
+/*
+ * mdextend() -- Add a block to the specified relation.
+ *
+ * The semantics are nearly the same as mdwrite(): write at the
+ * specified position. However, this is to be used for the case of
+ * extending a relation (i.e., blocknum is at or beyond the current
+ * EOF). Note that we assume writing a block beyond current EOF
+ * causes intervening file space to become filled with zeroes.
+ */
+void
+mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ char *buffer, bool skipFsync)
+{
+ off_t seekpos;
+ int nbytes;
+ MdfdVec *v;
+
+ /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+ Assert(blocknum >= mdnblocks(reln, forknum));
+#endif
+
+ /*
+ * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
+ * more --- we mustn't create a block whose number actually is
+ * InvalidBlockNumber. (Note that this failure should be unreachable
+ * because of upstream checks in bufmgr.c.)
+ */
+ if (blocknum == InvalidBlockNumber)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot extend file \"%s\" beyond %u blocks",
+ relpath(reln->smgr_rnode, forknum),
+ InvalidBlockNumber)));
+
+ v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
+ {
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not extend file \"%s\": %m",
+ FilePathName(v->mdfd_vfd)),
+ errhint("Check free disk space.")));
+ /* short write: complain appropriately */
+ ereport(ERROR,
+ (errcode(ERRCODE_DISK_FULL),
+ errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
+ FilePathName(v->mdfd_vfd),
+ nbytes, BLCKSZ, blocknum),
+ errhint("Check free disk space.")));
+ }
+
+ if (!skipFsync && !SmgrIsTemp(reln))
+ register_dirty_segment(reln, forknum, v);
+
+ Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
+}
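+
+/*
+ * A worked example of the offset arithmetic used above and in the other
+ * read/write routines, assuming the default build-time values BLCKSZ = 8192
+ * and RELSEG_SIZE = 131072 (1GB segments): block 300000 belongs to segment
+ * 300000 / 131072 = 2, and within that segment's file it lives at
+ *
+ *		seekpos = 8192 * (300000 % 131072) = 8192 * 37856 = 310116352
+ *
+ * bytes from the start of the file.
+ */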
+
+/*
+ * mdopenfork() -- Open one fork of the specified relation.
+ *
+ * Note that we open only the first segment, even when there are multiple segments.
+ *
+ * If first segment is not present, either ereport or return NULL according
+ * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
+ * EXTENSION_CREATE means it's OK to extend an existing relation, not to
+ * invent one out of whole cloth.
+ */
+static MdfdVec *
+mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
+{
+ MdfdVec *mdfd;
+ char *path;
+ File fd;
+
+ /* No work if already open */
+ if (reln->md_num_open_segs[forknum] > 0)
+ return &reln->md_seg_fds[forknum][0];
+
+ path = relpath(reln->smgr_rnode, forknum);
+
+ fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+
+ if (fd < 0)
+ {
+ if ((behavior & EXTENSION_RETURN_NULL) &&
+ FILE_POSSIBLY_DELETED(errno))
+ {
+ pfree(path);
+ return NULL;
+ }
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ }
+
+ pfree(path);
+
+ _fdvec_resize(reln, forknum, 1);
+ mdfd = &reln->md_seg_fds[forknum][0];
+ mdfd->mdfd_vfd = fd;
+ mdfd->mdfd_segno = 0;
+
+ Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
+
+ return mdfd;
+}
+
+/*
+ * mdopen() -- Initialize newly-opened relation.
+ */
+void
+mdopen(SMgrRelation reln)
+{
+ /* mark it not open */
+ for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ reln->md_num_open_segs[forknum] = 0;
+}
+
+/*
+ * mdclose() -- Close the specified relation, if it isn't closed already.
+ */
+void
+mdclose(SMgrRelation reln, ForkNumber forknum)
+{
+ int nopensegs = reln->md_num_open_segs[forknum];
+
+ /* No work if already closed */
+ if (nopensegs == 0)
+ return;
+
+ /* close segments starting from the end */
+ while (nopensegs > 0)
+ {
+ MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
+
+ FileClose(v->mdfd_vfd);
+ _fdvec_resize(reln, forknum, nopensegs - 1);
+ nopensegs--;
+ }
+}
+
+/*
+ * mdprefetch() -- Initiate asynchronous read of the specified block of a relation
+ */
+bool
+mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+#ifdef USE_PREFETCH
+ off_t seekpos;
+ MdfdVec *v;
+
+ v = _mdfd_getseg(reln, forknum, blocknum, false,
+ InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
+ if (v == NULL)
+ return false;
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH);
+#endif /* USE_PREFETCH */
+
+ return true;
+}
+
+/*
+ * mdwriteback() -- Tell the kernel to write pages back to storage.
+ *
+ * This accepts a range of blocks because flushing several pages at once is
+ * considerably more efficient than doing so individually.
+ */
+void
+mdwriteback(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, BlockNumber nblocks)
+{
+ /*
+ * Issue flush requests in as few requests as possible; have to split at
+ * segment boundaries though, since those are actually separate files.
+ */
+ while (nblocks > 0)
+ {
+ BlockNumber nflush = nblocks;
+ off_t seekpos;
+ MdfdVec *v;
+ int segnum_start,
+ segnum_end;
+
+ v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
+ EXTENSION_RETURN_NULL);
+
+ /*
+ * We might be flushing buffers of already-removed relations; that's
+ * OK, just ignore that case.
+ */
+ if (!v)
+ return;
+
+ /* compute offset inside the current segment */
+ segnum_start = blocknum / RELSEG_SIZE;
+
+ /* compute number of desired writes within the current segment */
+ segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
+ if (segnum_start != segnum_end)
+ nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ Assert(nflush >= 1);
+ Assert(nflush <= nblocks);
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
+
+ nblocks -= nflush;
+ blocknum += nflush;
+ }
+}
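+
+/*
+ * For example (again with the default RELSEG_SIZE of 131072), a writeback
+ * request for blocknum = 131070 and nblocks = 5 straddles a segment
+ * boundary: the first loop iteration flushes nflush = 2 blocks at the tail
+ * of segment 0, and the second flushes the remaining 3 blocks at the start
+ * of segment 1.
+ */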
+
+/*
+ * mdread() -- Read the specified block from a relation.
+ */
+void
+mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ char *buffer)
+{
+ off_t seekpos;
+ int nbytes;
+ MdfdVec *v;
+
+ TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
+ reln->smgr_rnode.node.spcNode,
+ reln->smgr_rnode.node.dbNode,
+ reln->smgr_rnode.node.relNode,
+ reln->smgr_rnode.backend);
+
+ v = _mdfd_getseg(reln, forknum, blocknum, false,
+ EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ);
+
+ TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
+ reln->smgr_rnode.node.spcNode,
+ reln->smgr_rnode.node.dbNode,
+ reln->smgr_rnode.node.relNode,
+ reln->smgr_rnode.backend,
+ nbytes,
+ BLCKSZ);
+
+ if (nbytes != BLCKSZ)
+ {
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read block %u in file \"%s\": %m",
+ blocknum, FilePathName(v->mdfd_vfd))));
+
+ /*
+ * Short read: we are at or past EOF, or we read a partial block at
+ * EOF. Normally this is an error; upper levels should never try to
+ * read a nonexistent block. However, if zero_damaged_pages is ON or
+ * we are InRecovery, we should instead return zeroes without
+ * complaining. This allows, for example, the case of trying to
+ * update a block that was later truncated away.
+ */
+ if (zero_damaged_pages || InRecovery)
+ MemSet(buffer, 0, BLCKSZ);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
+ blocknum, FilePathName(v->mdfd_vfd),
+ nbytes, BLCKSZ)));
+ }
+}
+
+/*
+ * mdwrite() -- Write the supplied block at the appropriate location.
+ *
+ * This is to be used only for updating already-existing blocks of a
+ * relation (ie, those before the current EOF). To extend a relation,
+ * use mdextend().
+ */
+void
+mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ char *buffer, bool skipFsync)
+{
+ off_t seekpos;
+ int nbytes;
+ MdfdVec *v;
+
+ /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+ Assert(blocknum < mdnblocks(reln, forknum));
+#endif
+
+ TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
+ reln->smgr_rnode.node.spcNode,
+ reln->smgr_rnode.node.dbNode,
+ reln->smgr_rnode.node.relNode,
+ reln->smgr_rnode.backend);
+
+ v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
+ EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE);
+
+ TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
+ reln->smgr_rnode.node.spcNode,
+ reln->smgr_rnode.node.dbNode,
+ reln->smgr_rnode.node.relNode,
+ reln->smgr_rnode.backend,
+ nbytes,
+ BLCKSZ);
+
+ if (nbytes != BLCKSZ)
+ {
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write block %u in file \"%s\": %m",
+ blocknum, FilePathName(v->mdfd_vfd))));
+ /* short write: complain appropriately */
+ ereport(ERROR,
+ (errcode(ERRCODE_DISK_FULL),
+ errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
+ blocknum,
+ FilePathName(v->mdfd_vfd),
+ nbytes, BLCKSZ),
+ errhint("Check free disk space.")));
+ }
+
+ if (!skipFsync && !SmgrIsTemp(reln))
+ register_dirty_segment(reln, forknum, v);
+}
+
+/*
+ * mdnblocks() -- Get the number of blocks stored in a relation.
+ *
+ * Important side effect: all active segments of the relation are opened
+ * and added to the md_seg_fds array. If this routine has not been
+ * called, then only segments up to the last one actually touched
+ * are present in the array.
+ */
+BlockNumber
+mdnblocks(SMgrRelation reln, ForkNumber forknum)
+{
+ MdfdVec *v;
+ BlockNumber nblocks;
+ BlockNumber segno;
+
+ mdopenfork(reln, forknum, EXTENSION_FAIL);
+
+ /* mdopenfork has opened the first segment */
+ Assert(reln->md_num_open_segs[forknum] > 0);
+
+ /*
+ * Start from the last open segment, to avoid redundant seeks. We have
+ * previously verified that these segments are exactly RELSEG_SIZE long,
+ * and it's useless to recheck that each time.
+ *
+ * NOTE: this assumption could only be wrong if another backend has
+ * truncated the relation. We rely on higher code levels to handle that
+ * scenario by closing and re-opening the md fd, which is handled via
+ * relcache flush. (Since the checkpointer doesn't participate in
+ * relcache flush, it could have segment entries for inactive segments;
+ * that's OK because the checkpointer never needs to compute relation
+ * size.)
+ */
+ segno = reln->md_num_open_segs[forknum] - 1;
+ v = &reln->md_seg_fds[forknum][segno];
+
+ for (;;)
+ {
+ nblocks = _mdnblocks(reln, forknum, v);
+ if (nblocks > ((BlockNumber) RELSEG_SIZE))
+ elog(FATAL, "segment too big");
+ if (nblocks < ((BlockNumber) RELSEG_SIZE))
+ return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
+
+ /*
+ * If segment is exactly RELSEG_SIZE, advance to next one.
+ */
+ segno++;
+
+ /*
+ * We used to pass O_CREAT here, but that has the disadvantage that it
+ * might create a segment which has vanished through some operating
+ * system misadventure. In such a case, creating the segment here
+ * undermines _mdfd_getseg's attempts to notice and report an error
+ * upon access to a missing segment.
+ */
+ v = _mdfd_openseg(reln, forknum, segno, 0);
+ if (v == NULL)
+ return segno * ((BlockNumber) RELSEG_SIZE);
+ }
+}
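+
+/*
+ * To make the arithmetic concrete (default RELSEG_SIZE = 131072): for a fork
+ * consisting of two full segments plus a third holding 100 blocks, the loop
+ * above stops at the partial segment and returns
+ *
+ *		2 * 131072 + 100 = 262244
+ *
+ * blocks.
+ */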
+
+/*
+ * mdtruncate() -- Truncate relation to specified number of blocks.
+ */
+void
+mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
+{
+ BlockNumber curnblk;
+ BlockNumber priorblocks;
+ int curopensegs;
+
+ /*
+ * NOTE: mdnblocks makes sure we have opened all active segments, so that
+ * the truncation loop will get them all!
+ */
+ curnblk = mdnblocks(reln, forknum);
+ if (nblocks > curnblk)
+ {
+ /* Bogus request ... but no complaint if InRecovery */
+ if (InRecovery)
+ return;
+ ereport(ERROR,
+ (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
+ relpath(reln->smgr_rnode, forknum),
+ nblocks, curnblk)));
+ }
+ if (nblocks == curnblk)
+ return; /* no work */
+
+ /*
+ * Truncate segments, starting at the last one. Starting at the end makes
+ * managing the memory for the fd array easier, should there be errors.
+ */
+ curopensegs = reln->md_num_open_segs[forknum];
+ while (curopensegs > 0)
+ {
+ MdfdVec *v;
+
+ priorblocks = (curopensegs - 1) * RELSEG_SIZE;
+
+ v = &reln->md_seg_fds[forknum][curopensegs - 1];
+
+ if (priorblocks > nblocks)
+ {
+ /*
+ * This segment is no longer active. We truncate the file, but do
+ * not delete it, for reasons explained in the header comments.
+ */
+ if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\": %m",
+ FilePathName(v->mdfd_vfd))));
+
+ if (!SmgrIsTemp(reln))
+ register_dirty_segment(reln, forknum, v);
+
+ /* we never drop the 1st segment */
+ Assert(v != &reln->md_seg_fds[forknum][0]);
+
+ FileClose(v->mdfd_vfd);
+ _fdvec_resize(reln, forknum, curopensegs - 1);
+ }
+ else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
+ {
+ /*
+ * This is the last segment we want to keep. Truncate the file to
+ * the right length. NOTE: if nblocks is exactly a multiple K of
+ * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
+ * keep it. This adheres to the invariant given in the header
+ * comments.
+ */
+ BlockNumber lastsegblocks = nblocks - priorblocks;
+
+ if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\" to %u blocks: %m",
+ FilePathName(v->mdfd_vfd),
+ nblocks)));
+ if (!SmgrIsTemp(reln))
+ register_dirty_segment(reln, forknum, v);
+ }
+ else
+ {
+ /*
+ * We still need this segment, so nothing to do for this and any
+ * earlier segment.
+ */
+ break;
+ }
+ curopensegs--;
+ }
+}
+
+/*
+ * mdimmedsync() -- Immediately sync a relation to stable storage.
+ *
+ * Note that only writes already issued are synced; this routine knows
+ * nothing of dirty buffers that may exist inside the buffer manager. We
+ * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
+ * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
+ * some segment, then mdtruncate() renders that segment inactive. If we
+ * crash before the next checkpoint syncs the newly-inactive segment, that
+ * segment may survive recovery, reintroducing unwanted data into the table.
+ */
+void
+mdimmedsync(SMgrRelation reln, ForkNumber forknum)
+{
+ int segno;
+ int min_inactive_seg;
+
+ /*
+ * NOTE: mdnblocks makes sure we have opened all active segments, so that
+ * the fsync loop will get them all!
+ */
+ mdnblocks(reln, forknum);
+
+ min_inactive_seg = segno = reln->md_num_open_segs[forknum];
+
+ /*
+ * Temporarily open inactive segments, then close them after sync. There
+ * may be some inactive segments left open after an fsync() error, but that
+ * is harmless; rather than risk further trouble, we don't bother to clean
+ * them up here. The next mdclose() will soon close them.
+ */
+ while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
+ segno++;
+
+ while (segno > 0)
+ {
+ MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
+
+ if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m",
+ FilePathName(v->mdfd_vfd))));
+
+ /* Close inactive segments immediately */
+ if (segno > min_inactive_seg)
+ {
+ FileClose(v->mdfd_vfd);
+ _fdvec_resize(reln, forknum, segno - 1);
+ }
+
+ segno--;
+ }
+}
+
+/*
+ * register_dirty_segment() -- Mark a relation segment as needing fsync
+ *
+ * If there is a local pending-ops table, just make an entry in it for
+ * ProcessSyncRequests to process later. Otherwise, try to pass off the
+ * fsync request to the checkpointer process. If that fails, just do the
+ * fsync locally before returning (we hope this will not happen often
+ * enough to be a performance problem).
+ */
+static void
+register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
+{
+ FileTag tag;
+
+ INIT_MD_FILETAG(tag, reln->smgr_rnode.node, forknum, seg->mdfd_segno);
+
+ /* Temp relations should never be fsync'd */
+ Assert(!SmgrIsTemp(reln));
+
+ if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
+ {
+ ereport(DEBUG1,
+ (errmsg_internal("could not forward fsync request because request queue is full")));
+
+ if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m",
+ FilePathName(seg->mdfd_vfd))));
+ }
+}
+
+/*
+ * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
+ */
+static void
+register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum,
+ BlockNumber segno)
+{
+ FileTag tag;
+
+ INIT_MD_FILETAG(tag, rnode.node, forknum, segno);
+
+ /* Should never be used with temp relations */
+ Assert(!RelFileNodeBackendIsTemp(rnode));
+
+ RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
+}
+
+/*
+ * register_forget_request() -- forget any fsyncs for a relation fork's segment
+ */
+static void
+register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum,
+ BlockNumber segno)
+{
+ FileTag tag;
+
+ INIT_MD_FILETAG(tag, rnode.node, forknum, segno);
+
+ RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
+}
+
+/*
+ * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
+ */
+void
+ForgetDatabaseSyncRequests(Oid dbid)
+{
+ FileTag tag;
+ RelFileNode rnode;
+
+ rnode.dbNode = dbid;
+ rnode.spcNode = 0;
+ rnode.relNode = 0;
+
+ INIT_MD_FILETAG(tag, rnode, InvalidForkNumber, InvalidBlockNumber);
+
+ RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
+}
+
+/*
+ * DropRelationFiles -- drop files of all given relations
+ */
+void
+DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo)
+{
+ SMgrRelation *srels;
+ int i;
+
+ srels = palloc(sizeof(SMgrRelation) * ndelrels);
+ for (i = 0; i < ndelrels; i++)
+ {
+ SMgrRelation srel = smgropen(delrels[i], InvalidBackendId);
+
+ if (isRedo)
+ {
+ ForkNumber fork;
+
+ for (fork = 0; fork <= MAX_FORKNUM; fork++)
+ XLogDropRelation(delrels[i], fork);
+ }
+ srels[i] = srel;
+ }
+
+ smgrdounlinkall(srels, ndelrels, isRedo);
+
+ for (i = 0; i < ndelrels; i++)
+ smgrclose(srels[i]);
+ pfree(srels);
+}
+
+
+/*
+ * _fdvec_resize() -- Resize the fork's open segments array
+ */
+static void
+_fdvec_resize(SMgrRelation reln,
+ ForkNumber forknum,
+ int nseg)
+{
+ if (nseg == 0)
+ {
+ if (reln->md_num_open_segs[forknum] > 0)
+ {
+ pfree(reln->md_seg_fds[forknum]);
+ reln->md_seg_fds[forknum] = NULL;
+ }
+ }
+ else if (reln->md_num_open_segs[forknum] == 0)
+ {
+ reln->md_seg_fds[forknum] =
+ MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
+ }
+ else
+ {
+ /*
+ * It doesn't seem worthwhile complicating the code to amortize
+ * repalloc() calls. Those are far faster than PathNameOpenFile() or
+ * FileClose(), and the memory context internally will sometimes avoid
+ * doing an actual reallocation.
+ */
+ reln->md_seg_fds[forknum] =
+ repalloc(reln->md_seg_fds[forknum],
+ sizeof(MdfdVec) * nseg);
+ }
+
+ reln->md_num_open_segs[forknum] = nseg;
+}
+
+/*
+ * Return the filename for the specified segment of the relation. The
+ * returned string is palloc'd.
+ */
+static char *
+_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
+{
+ char *path,
+ *fullpath;
+
+ path = relpath(reln->smgr_rnode, forknum);
+
+ if (segno > 0)
+ {
+ fullpath = psprintf("%s.%u", path, segno);
+ pfree(path);
+ }
+ else
+ fullpath = path;
+
+ return fullpath;
+}
+
+/*
+ * Open the specified segment of the relation,
+ * and make a MdfdVec object for it. Returns NULL on failure.
+ */
+static MdfdVec *
+_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
+ int oflags)
+{
+ MdfdVec *v;
+ File fd;
+ char *fullpath;
+
+ fullpath = _mdfd_segpath(reln, forknum, segno);
+
+ /* open the file */
+ fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags);
+
+ pfree(fullpath);
+
+ if (fd < 0)
+ return NULL;
+
+ /*
+ * Segments are always opened in order from lowest to highest, so we must
+ * be adding a new one at the end.
+ */
+ Assert(segno == reln->md_num_open_segs[forknum]);
+
+ _fdvec_resize(reln, forknum, segno + 1);
+
+ /* fill the entry */
+ v = &reln->md_seg_fds[forknum][segno];
+ v->mdfd_vfd = fd;
+ v->mdfd_segno = segno;
+
+ Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
+
+ /* all done */
+ return v;
+}
+
+/*
+ * _mdfd_getseg() -- Find the segment of the relation holding the
+ * specified block.
+ *
+ * If the segment doesn't exist, we ereport, return NULL, or create the
+ * segment, according to "behavior". Note: skipFsync is only used in the
+ * EXTENSION_CREATE case.
+ */
+static MdfdVec *
+_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
+ bool skipFsync, int behavior)
+{
+ MdfdVec *v;
+ BlockNumber targetseg;
+ BlockNumber nextsegno;
+
+ /* some way to handle non-existent segments needs to be specified */
+ Assert(behavior &
+ (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL));
+
+ targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
+
+ /* if an existing and opened segment, we're done */
+ if (targetseg < reln->md_num_open_segs[forknum])
+ {
+ v = &reln->md_seg_fds[forknum][targetseg];
+ return v;
+ }
+
+ /*
+ * The target segment is not yet open. Iterate over all the segments
+ * between the last opened and the target segment. This way missing
+ * segments either raise an error, or get created (according to
+ * 'behavior'). Start with either the last opened, or the first segment if
+ * none was opened before.
+ */
+ if (reln->md_num_open_segs[forknum] > 0)
+ v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
+ else
+ {
+ v = mdopenfork(reln, forknum, behavior);
+ if (!v)
+ return NULL; /* if behavior & EXTENSION_RETURN_NULL */
+ }
+
+ for (nextsegno = reln->md_num_open_segs[forknum];
+ nextsegno <= targetseg; nextsegno++)
+ {
+ BlockNumber nblocks = _mdnblocks(reln, forknum, v);
+ int flags = 0;
+
+ Assert(nextsegno == v->mdfd_segno + 1);
+
+ if (nblocks > ((BlockNumber) RELSEG_SIZE))
+ elog(FATAL, "segment too big");
+
+ if ((behavior & EXTENSION_CREATE) ||
+ (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
+ {
+ /*
+ * Normally we will create new segments only if authorized by the
+ * caller (i.e., we are doing mdextend()). But when doing WAL
+ * recovery, create segments anyway; this allows cases such as
+ * replaying WAL data that has a write into a high-numbered
+ * segment of a relation that was later deleted. We want to go
+ * ahead and create the segments so we can finish out the replay.
+ *
+ * We have to maintain the invariant that segments before the last
+ * active segment are of size RELSEG_SIZE; therefore, if
+ * extending, pad them out with zeroes if needed. (This only
+ * matters if in recovery, or if the caller is extending the
+ * relation discontiguously, but that can happen in hash indexes.)
+ */
+ if (nblocks < ((BlockNumber) RELSEG_SIZE))
+ {
+ char *zerobuf = palloc0(BLCKSZ);
+
+ mdextend(reln, forknum,
+ nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
+ zerobuf, skipFsync);
+ pfree(zerobuf);
+ }
+ flags = O_CREAT;
+ }
+ else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) &&
+ nblocks < ((BlockNumber) RELSEG_SIZE))
+ {
+ /*
+ * When not extending (or explicitly including truncated
+ * segments), only open the next segment if the current one is
+ * exactly RELSEG_SIZE. If not (this branch), either return NULL
+ * or fail.
+ */
+ if (behavior & EXTENSION_RETURN_NULL)
+ {
+ /*
+ * Some callers discern between reasons for _mdfd_getseg()
+ * returning NULL based on errno. As there's no failing
+ * syscall involved in this case, explicitly set errno to
+ * ENOENT, as that seems the closest interpretation.
+ */
+ errno = ENOENT;
+ return NULL;
+ }
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
+ _mdfd_segpath(reln, forknum, nextsegno),
+ blkno, nblocks)));
+ }
+
+ v = _mdfd_openseg(reln, forknum, nextsegno, flags);
+
+ if (v == NULL)
+ {
+ if ((behavior & EXTENSION_RETURN_NULL) &&
+ FILE_POSSIBLY_DELETED(errno))
+ return NULL;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\" (target block %u): %m",
+ _mdfd_segpath(reln, forknum, nextsegno),
+ blkno)));
+ }
+ }
+
+ return v;
+}
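+
+/*
+ * To illustrate the padding logic above (default RELSEG_SIZE = 131072):
+ * suppose segment 0 currently holds only 100 blocks and a caller extends the
+ * relation at block 200000 (segment 1). The loop first writes a zeroed block
+ * at block 131071 via mdextend(), padding segment 0 out to exactly
+ * RELSEG_SIZE blocks, and then opens segment 1 with O_CREAT so the target
+ * block can be written.
+ */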
+
+/*
+ * Get number of blocks present in a single disk file
+ */
+static BlockNumber
+_mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
+{
+ off_t len;
+
+ len = FileSize(seg->mdfd_vfd);
+ if (len < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not seek to end of file \"%s\": %m",
+ FilePathName(seg->mdfd_vfd))));
+ /* note that this calculation will ignore any partial block at EOF */
+ return (BlockNumber) (len / BLCKSZ);
+}
+
+/*
+ * Sync a file to disk, given a file tag. Write the path into an output
+ * buffer so the caller can use it in error messages.
+ *
+ * Return 0 on success, -1 on failure, with errno set.
+ */
+int
+mdsyncfiletag(const FileTag *ftag, char *path)
+{
+ SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId);
+ File file;
+ bool need_to_close;
+ int result,
+ save_errno;
+
+ /* See if we already have the file open, or need to open it. */
+ if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
+ {
+ file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
+ strlcpy(path, FilePathName(file), MAXPGPATH);
+ need_to_close = false;
+ }
+ else
+ {
+ char *p;
+
+ p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
+ strlcpy(path, p, MAXPGPATH);
+ pfree(p);
+
+ file = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+ if (file < 0)
+ return -1;
+ need_to_close = true;
+ }
+
+ /* Sync the file. */
+ result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
+ save_errno = errno;
+
+ if (need_to_close)
+ FileClose(file);
+
+ errno = save_errno;
+ return result;
+}
+
+/*
+ * Unlink a file, given a file tag. Write the path into an output
+ * buffer so the caller can use it in error messages.
+ *
+ * Return 0 on success, -1 on failure, with errno set.
+ */
+int
+mdunlinkfiletag(const FileTag *ftag, char *path)
+{
+ char *p;
+
+ /* Compute the path. */
+ p = relpathperm(ftag->rnode, MAIN_FORKNUM);
+ strlcpy(path, p, MAXPGPATH);
+ pfree(p);
+
+ /* Try to unlink the file. */
+ return unlink(path);
+}
+
+/*
+ * Check if a given candidate request matches a given tag, when processing
+ * a SYNC_FILTER_REQUEST request. This will be called for all pending
+ * requests to find out whether to forget them.
+ */
+bool
+mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
+{
+ /*
+ * For now we only use filter requests as a way to drop all scheduled
+ * callbacks relating to a given database, when dropping the database.
+ * We'll return true for all candidates that have the same database OID as
+ * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
+ */
+ return ftag->rnode.dbNode == candidate->rnode.dbNode;
+}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
new file mode 100644
index 0000000..4dc2464
--- /dev/null
+++ b/src/backend/storage/smgr/smgr.c
@@ -0,0 +1,695 @@
+/*-------------------------------------------------------------------------
+ *
+ * smgr.c
+ * public interface routines to storage manager switch.
+ *
+ * All file system operations in POSTGRES dispatch through these
+ * routines.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/smgr/smgr.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/xlog.h"
+#include "lib/ilist.h"
+#include "storage/bufmgr.h"
+#include "storage/ipc.h"
+#include "storage/md.h"
+#include "storage/smgr.h"
+#include "utils/hsearch.h"
+#include "utils/inval.h"
+
+
+/*
+ * This struct of function pointers defines the API between smgr.c and
+ * any individual storage manager module. Note that smgr subfunctions are
+ * generally expected to report problems via elog(ERROR). An exception is
+ * that smgr_unlink should use elog(WARNING), rather than erroring out,
+ * because we normally unlink relations during post-commit/abort cleanup,
+ * and so it's too late to raise an error. Also, various conditions that
+ * would normally be errors should be allowed during bootstrap and/or WAL
+ * recovery --- see comments in md.c for details.
+ */
+typedef struct f_smgr
+{
+ void (*smgr_init) (void); /* may be NULL */
+ void (*smgr_shutdown) (void); /* may be NULL */
+ void (*smgr_open) (SMgrRelation reln);
+ void (*smgr_close) (SMgrRelation reln, ForkNumber forknum);
+ void (*smgr_create) (SMgrRelation reln, ForkNumber forknum,
+ bool isRedo);
+ bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
+ void (*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum,
+ bool isRedo);
+ void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer, bool skipFsync);
+ bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum);
+ void (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer);
+ void (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer, bool skipFsync);
+ void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, BlockNumber nblocks);
+ BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
+ void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber nblocks);
+ void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
+} f_smgr;
+
+static const f_smgr smgrsw[] = {
+ /* magnetic disk */
+ {
+ .smgr_init = mdinit,
+ .smgr_shutdown = NULL,
+ .smgr_open = mdopen,
+ .smgr_close = mdclose,
+ .smgr_create = mdcreate,
+ .smgr_exists = mdexists,
+ .smgr_unlink = mdunlink,
+ .smgr_extend = mdextend,
+ .smgr_prefetch = mdprefetch,
+ .smgr_read = mdread,
+ .smgr_write = mdwrite,
+ .smgr_writeback = mdwriteback,
+ .smgr_nblocks = mdnblocks,
+ .smgr_truncate = mdtruncate,
+ .smgr_immedsync = mdimmedsync,
+ }
+};
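+
+/*
+ * With md.c as the only entry, every dispatch in this file resolves to the
+ * md routines; for example, with reln->smgr_which == 0,
+ *
+ *		smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer);
+ *
+ * is simply a call to mdread(). A hypothetical additional storage manager
+ * would be added as a second initializer in this array (and selected via
+ * smgr_which when the SMgrRelation is opened).
+ */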
+
+static const int NSmgr = lengthof(smgrsw);
+
+/*
+ * Each backend has a hashtable that stores all extant SMgrRelation objects.
+ * In addition, "unowned" SMgrRelation objects are chained together in a list.
+ */
+static HTAB *SMgrRelationHash = NULL;
+
+static dlist_head unowned_relns;
+
+/* local function prototypes */
+static void smgrshutdown(int code, Datum arg);
+
+
+/*
+ * smgrinit(), smgrshutdown() -- Initialize or shut down storage
+ * managers.
+ *
+ * Note: smgrinit is called during backend startup (normal or standalone
+ * case), *not* during postmaster start. Therefore, any resources created
+ * here or destroyed in smgrshutdown are backend-local.
+ */
+void
+smgrinit(void)
+{
+ int i;
+
+ for (i = 0; i < NSmgr; i++)
+ {
+ if (smgrsw[i].smgr_init)
+ smgrsw[i].smgr_init();
+ }
+
+ /* register the shutdown proc */
+ on_proc_exit(smgrshutdown, 0);
+}
+
+/*
+ * on_proc_exit hook for smgr cleanup during backend shutdown
+ */
+static void
+smgrshutdown(int code, Datum arg)
+{
+ int i;
+
+ for (i = 0; i < NSmgr; i++)
+ {
+ if (smgrsw[i].smgr_shutdown)
+ smgrsw[i].smgr_shutdown();
+ }
+}
+
+/*
+ * smgropen() -- Return an SMgrRelation object, creating it if need be.
+ *
+ * This does not attempt to actually open the underlying file.
+ */
+SMgrRelation
+smgropen(RelFileNode rnode, BackendId backend)
+{
+ RelFileNodeBackend brnode;
+ SMgrRelation reln;
+ bool found;
+
+ if (SMgrRelationHash == NULL)
+ {
+ /* First time through: initialize the hash table */
+ HASHCTL ctl;
+
+ ctl.keysize = sizeof(RelFileNodeBackend);
+ ctl.entrysize = sizeof(SMgrRelationData);
+ SMgrRelationHash = hash_create("smgr relation table", 400,
+ &ctl, HASH_ELEM | HASH_BLOBS);
+ dlist_init(&unowned_relns);
+ }
+
+ /* Look up or create an entry */
+ brnode.node = rnode;
+ brnode.backend = backend;
+ reln = (SMgrRelation) hash_search(SMgrRelationHash,
+ (void *) &brnode,
+ HASH_ENTER, &found);
+
+ /* Initialize it if not present before */
+ if (!found)
+ {
+ /* hash_search already filled in the lookup key */
+ reln->smgr_owner = NULL;
+ reln->smgr_targblock = InvalidBlockNumber;
+ for (int i = 0; i <= MAX_FORKNUM; ++i)
+ reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
+ reln->smgr_which = 0; /* we only have md.c at present */
+
+ /* implementation-specific initialization */
+ smgrsw[reln->smgr_which].smgr_open(reln);
+
+ /* it has no owner yet */
+ dlist_push_tail(&unowned_relns, &reln->node);
+ }
+
+ return reln;
+}
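+
+/*
+ * Typical usage (as in DropRelationFiles() and mdsyncfiletag() in md.c):
+ * callers that act on a relation known only by its RelFileNode obtain a
+ * handle with
+ *
+ *		SMgrRelation reln = smgropen(rnode, InvalidBackendId);
+ *
+ * and the underlying files are then opened lazily on first access.
+ */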
+
+/*
+ * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object
+ *
+ * There can be only one owner at a time; this is sufficient since currently
+ * the only such owners exist in the relcache.
+ */
+void
+smgrsetowner(SMgrRelation *owner, SMgrRelation reln)
+{
+ /* We don't support "disowning" an SMgrRelation here; use smgrclearowner */
+ Assert(owner != NULL);
+
+ /*
+ * First, unhook any old owner. (Normally there shouldn't be any, but it
+ * seems possible that this can happen during swap_relation_files()
+ * depending on the order of processing. It's ok to close the old
+ * relcache entry early in that case.)
+ *
+ * If there isn't an old owner, then the reln should be in the unowned
+ * list, and we need to remove it.
+ */
+ if (reln->smgr_owner)
+ *(reln->smgr_owner) = NULL;
+ else
+ dlist_delete(&reln->node);
+
+ /* Now establish the ownership relationship. */
+ reln->smgr_owner = owner;
+ *owner = reln;
+}
+
+/*
+ * smgrclearowner() -- Remove long-lived reference to an SMgrRelation object
+ * if one exists
+ */
+void
+smgrclearowner(SMgrRelation *owner, SMgrRelation reln)
+{
+ /* Do nothing if the SMgrRelation object is not owned by the owner */
+ if (reln->smgr_owner != owner)
+ return;
+
+ /* unset the owner's reference */
+ *owner = NULL;
+
+ /* unset our reference to the owner */
+ reln->smgr_owner = NULL;
+
+ /* add to list of unowned relations */
+ dlist_push_tail(&unowned_relns, &reln->node);
+}
+
+/*
+ * smgrexists() -- Does the underlying file for a fork exist?
+ */
+bool
+smgrexists(SMgrRelation reln, ForkNumber forknum)
+{
+ return smgrsw[reln->smgr_which].smgr_exists(reln, forknum);
+}
+
+/*
+ * smgrclose() -- Close and delete an SMgrRelation object.
+ */
+void
+smgrclose(SMgrRelation reln)
+{
+ SMgrRelation *owner;
+ ForkNumber forknum;
+
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ smgrsw[reln->smgr_which].smgr_close(reln, forknum);
+
+ owner = reln->smgr_owner;
+
+ if (!owner)
+ dlist_delete(&reln->node);
+
+ if (hash_search(SMgrRelationHash,
+ (void *) &(reln->smgr_rnode),
+ HASH_REMOVE, NULL) == NULL)
+ elog(ERROR, "SMgrRelation hashtable corrupted");
+
+ /*
+ * Unhook the owner pointer, if any. We do this last since in the remote
+ * possibility of failure above, the SMgrRelation object will still exist.
+ */
+ if (owner)
+ *owner = NULL;
+}
+
+/*
+ * smgrcloseall() -- Close all existing SMgrRelation objects.
+ */
+void
+smgrcloseall(void)
+{
+ HASH_SEQ_STATUS status;
+ SMgrRelation reln;
+
+ /* Nothing to do if hashtable not set up */
+ if (SMgrRelationHash == NULL)
+ return;
+
+ hash_seq_init(&status, SMgrRelationHash);
+
+ while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
+ smgrclose(reln);
+}
+
+/*
+ * smgrclosenode() -- Close SMgrRelation object for given RelFileNode,
+ * if one exists.
+ *
+ * This has the same effects as smgrclose(smgropen(rnode)), but it avoids
+ * uselessly creating a hashtable entry only to drop it again when no
+ * such entry exists already.
+ */
+void
+smgrclosenode(RelFileNodeBackend rnode)
+{
+ SMgrRelation reln;
+
+ /* Nothing to do if hashtable not set up */
+ if (SMgrRelationHash == NULL)
+ return;
+
+ reln = (SMgrRelation) hash_search(SMgrRelationHash,
+ (void *) &rnode,
+ HASH_FIND, NULL);
+ if (reln != NULL)
+ smgrclose(reln);
+}
+
+/*
+ * smgrcreate() -- Create a new relation.
+ *
+ * Given an already-created (but presumably unused) SMgrRelation,
+ * cause the underlying disk file or other storage for the fork
+ * to be created.
+ */
+void
+smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
+{
+ smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo);
+}
+
+/*
+ * smgrdosyncall() -- Immediately sync all forks of all given relations
+ *
+ * All forks of all given relations are synced out to the store.
+ *
+ * This is equivalent to FlushRelationBuffers() for each smgr relation,
+ * then calling smgrimmedsync() for all forks of each relation, but it's
+ * significantly quicker so should be preferred when possible.
+ */
+void
+smgrdosyncall(SMgrRelation *rels, int nrels)
+{
+ int i = 0;
+ ForkNumber forknum;
+
+ if (nrels == 0)
+ return;
+
+ FlushRelationsAllBuffers(rels, nrels);
+
+ /*
+ * Sync the physical file(s).
+ */
+ for (i = 0; i < nrels; i++)
+ {
+ int which = rels[i]->smgr_which;
+
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ {
+ if (smgrsw[which].smgr_exists(rels[i], forknum))
+ smgrsw[which].smgr_immedsync(rels[i], forknum);
+ }
+ }
+}
+
+/*
+ * smgrdounlinkall() -- Immediately unlink all forks of all given relations
+ *
+ * All forks of all given relations are removed from the store. This
+ * should not be used during transactional operations, since it can't be
+ * undone.
+ *
+ * If isRedo is true, it is okay for the underlying file(s) to be gone
+ * already.
+ */
+void
+smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
+{
+ int i = 0;
+ RelFileNodeBackend *rnodes;
+ ForkNumber forknum;
+
+ if (nrels == 0)
+ return;
+
+ /*
+ * Get rid of any remaining buffers for the relations. bufmgr will just
+ * drop them without bothering to write the contents.
+ */
+ DropRelFileNodesAllBuffers(rels, nrels);
+
+ /*
+ * create an array which contains all relations to be dropped, and close
+ * each relation's forks at the smgr level while at it
+ */
+ rnodes = palloc(sizeof(RelFileNodeBackend) * nrels);
+ for (i = 0; i < nrels; i++)
+ {
+ RelFileNodeBackend rnode = rels[i]->smgr_rnode;
+ int which = rels[i]->smgr_which;
+
+ rnodes[i] = rnode;
+
+ /* Close the forks at smgr level */
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ smgrsw[which].smgr_close(rels[i], forknum);
+ }
+
+ /*
+ * It'd be nice to tell the stats collector to forget them immediately,
+ * too. But we can't because we don't know the OIDs.
+ */
+
+ /*
+ * Send a shared-inval message to force other backends to close any
+ * dangling smgr references they may have for these rels. We should do
+ * this before starting the actual unlinking, in case we fail partway
+ * through that step. Note that the sinval messages will eventually come
+ * back to this backend, too, and thereby provide a backstop that we
+ * closed our own smgr rel.
+ */
+ for (i = 0; i < nrels; i++)
+ CacheInvalidateSmgr(rnodes[i]);
+
+ /*
+ * Delete the physical file(s).
+ *
+ * Note: smgr_unlink must treat deletion failure as a WARNING, not an
+ * ERROR, because we've already decided to commit or abort the current
+ * xact.
+ */
+
+ for (i = 0; i < nrels; i++)
+ {
+ int which = rels[i]->smgr_which;
+
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ smgrsw[which].smgr_unlink(rnodes[i], forknum, isRedo);
+ }
+
+ pfree(rnodes);
+}
+
+
+/*
+ * smgrextend() -- Add a new block to a file.
+ *
+ * The semantics are nearly the same as smgrwrite(): write at the
+ * specified position. However, this is to be used for the case of
+ * extending a relation (i.e., blocknum is at or beyond the current
+ * EOF). Note that we assume writing a block beyond current EOF
+ * causes intervening file space to become filled with zeroes.
+ */
+void
+smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ char *buffer, bool skipFsync)
+{
+ smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
+ buffer, skipFsync);
+
+ /*
+ * Normally we expect this to increase nblocks by one, but if the cached
+ * value isn't as expected, just invalidate it so the next call asks the
+ * kernel.
+ */
+ if (reln->smgr_cached_nblocks[forknum] == blocknum)
+ reln->smgr_cached_nblocks[forknum] = blocknum + 1;
+ else
+ reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
+}
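+
+/*
+ * Concretely: if smgr_cached_nblocks[forknum] was 10 and we just extended at
+ * block 10, the cache advances to 11; extending at any other block number
+ * invalidates the cache, so the next smgrnblocks() call asks the kernel.
+ */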
+
+/*
+ * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
+ *
+ * In recovery only, this can return false to indicate that a file
+ * doesn't exist (presumably it has been dropped by a later WAL
+ * record).
+ */
+bool
+smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+ return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum);
+}
+
+/*
+ * smgrread() -- read a particular block from a relation into the supplied
+ * buffer.
+ *
+ * This routine is called from the buffer manager in order to
+ * instantiate pages in the shared buffer cache. All storage managers
+ * return pages in the format that POSTGRES expects.
+ */
+void
+smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ char *buffer)
+{
+ smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer);
+}
+
+/*
+ * smgrwrite() -- Write the supplied buffer out.
+ *
+ * This is to be used only for updating already-existing blocks of a
+ * relation (ie, those before the current EOF). To extend a relation,
+ * use smgrextend().
+ *
+ * This is not a synchronous write -- the block is not necessarily
+ * on disk at return, only dumped out to the kernel. However,
+ * provisions will be made to fsync the write before the next checkpoint.
+ *
+ * skipFsync indicates that the caller will make other provisions to
+ * fsync the relation, so we needn't bother. Temporary relations also
+ * do not require fsync.
+ */
+void
+smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ char *buffer, bool skipFsync)
+{
+ smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum,
+ buffer, skipFsync);
+}
+
+
+/*
+ * smgrwriteback() -- Trigger kernel writeback for the supplied range of
+ * blocks.
+ */
+void
+smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ BlockNumber nblocks)
+{
+ smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum,
+ nblocks);
+}
+
+/*
+ * smgrnblocks() -- Calculate the number of blocks in the
+ * supplied relation.
+ */
+BlockNumber
+smgrnblocks(SMgrRelation reln, ForkNumber forknum)
+{
+ BlockNumber result;
+
+ /* Return the cached value for the number of blocks, if we have one. */
+ result = smgrnblocks_cached(reln, forknum);
+ if (result != InvalidBlockNumber)
+ return result;
+
+ result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
+
+ reln->smgr_cached_nblocks[forknum] = result;
+
+ return result;
+}
+
+/*
+ * smgrnblocks_cached() -- Get the cached number of blocks in the supplied
+ * relation.
+ *
+ * Returns InvalidBlockNumber when not in recovery, or when the relation
+ * fork size is not cached.
+ */
+BlockNumber
+smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
+{
+ /*
+ * For now, we only use cached values in recovery due to lack of a shared
+ * invalidation mechanism for changes in file size.
+ */
+ if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
+ return reln->smgr_cached_nblocks[forknum];
+
+ return InvalidBlockNumber;
+}
+
+/*
+ * smgrtruncate() -- Truncate the given forks of supplied relation to
+ * each specified numbers of blocks
+ *
+ * The truncation is done immediately, so this can't be rolled back.
+ *
+ * The caller must hold AccessExclusiveLock on the relation, to ensure that
+ * other backends receive the smgr invalidation event that this function sends
+ * before they access any forks of the relation again.
+ */
+void
+smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks)
+{
+ int i;
+
+ /*
+ * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
+ * just drop them without bothering to write the contents.
+ */
+ DropRelFileNodeBuffers(reln, forknum, nforks, nblocks);
+
+ /*
+ * Send a shared-inval message to force other backends to close any smgr
+ * references they may have for this rel. This is useful because they
+ * might have open file pointers to segments that got removed, and/or
+ * smgr_targblock variables pointing past the new rel end. (The inval
+ * message will come back to our backend, too, causing a
+ * probably-unnecessary local smgr flush. But we don't expect that this
+ * is a performance-critical path.) As in the unlink code, we want to be
+ * sure the message is sent before we start changing things on-disk.
+ */
+ CacheInvalidateSmgr(reln->smgr_rnode);
+
+ /* Do the truncation */
+ for (i = 0; i < nforks; i++)
+ {
+ /* Mark the cached size as invalid, in case we encounter an error below. */
+ reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber;
+
+ smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]);
+
+ /*
+ * We might as well update the local smgr_cached_nblocks values. The
+ * smgr cache inval message that this function sent will cause other
+ * backends to invalidate their copies of smgr_fsm_nblocks and
+ * smgr_vm_nblocks, and these ones too at the next command boundary.
+ * But these ensure they aren't outright wrong until then.
+ */
+ reln->smgr_cached_nblocks[forknum[i]] = nblocks[i];
+ }
+}
+
+/*
+ * smgrimmedsync() -- Force the specified relation to stable storage.
+ *
+ * Synchronously force all previous writes to the specified relation
+ * down to disk.
+ *
+ * This is useful for building completely new relations (eg, new
+ * indexes). Instead of incrementally WAL-logging the index build
+ * steps, we can just write completed index pages to disk with smgrwrite
+ * or smgrextend, and then fsync the completed index file before
+ * committing the transaction. (This is sufficient for purposes of
+ * crash recovery, since it effectively duplicates forcing a checkpoint
+ * for the completed index. But it is *not* sufficient if one wishes
+ * to use the WAL log for PITR or replication purposes: in that case
+ * we have to make WAL entries as well.)
+ *
+ * The preceding writes should specify skipFsync = true to avoid
+ * duplicative fsyncs.
+ *
+ * Note that you need to do FlushRelationBuffers() first if there is
+ * any possibility that there are dirty buffers for the relation;
+ * otherwise the sync is not very meaningful.
+ */
+void
+smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
+{
+ smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
+}
+
+/*
+ * AtEOXact_SMgr
+ *
+ * This routine is called during transaction commit or abort (it doesn't
+ * particularly care which). All transient SMgrRelation objects are closed.
+ *
+ * We do this as a compromise between wanting transient SMgrRelations to
+ * live awhile (to amortize the costs of blind writes of multiple blocks)
+ * and needing them to not live forever (since we're probably holding open
+ * a kernel file descriptor for the underlying file, and we need to ensure
+ * that gets closed reasonably soon if the file gets deleted).
+ */
+void
+AtEOXact_SMgr(void)
+{
+ dlist_mutable_iter iter;
+
+ /*
+ * Zap all unowned SMgrRelations. We rely on smgrclose() to remove each
+ * one from the list.
+ */
+ dlist_foreach_modify(iter, &unowned_relns)
+ {
+ SMgrRelation rel = dlist_container(SMgrRelationData, node,
+ iter.cur);
+
+ Assert(rel->smgr_owner == NULL);
+
+ smgrclose(rel);
+ }
+}
diff --git a/src/backend/storage/sync/Makefile b/src/backend/storage/sync/Makefile
new file mode 100644
index 0000000..be88b44
--- /dev/null
+++ b/src/backend/storage/sync/Makefile
@@ -0,0 +1,18 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/sync
+#
+# IDENTIFICATION
+# src/backend/storage/sync/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/sync
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ sync.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c
new file mode 100644
index 0000000..28cbfe6
--- /dev/null
+++ b/src/backend/storage/sync/sync.c
@@ -0,0 +1,651 @@
+/*-------------------------------------------------------------------------
+ *
+ * sync.c
+ * File synchronization management code.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/sync/sync.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/file.h>
+
+#include "access/commit_ts.h"
+#include "access/clog.h"
+#include "access/multixact.h"
+#include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "commands/tablespace.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "portability/instr_time.h"
+#include "postmaster/bgwriter.h"
+#include "storage/bufmgr.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/md.h"
+#include "utils/hsearch.h"
+#include "utils/inval.h"
+#include "utils/memutils.h"
+
+/*
+ * In some contexts (currently, standalone backends and the checkpointer)
+ * we keep track of pending fsync operations: we need to remember all relation
+ * segments that have been written since the last checkpoint, so that we can
+ * fsync them down to disk before completing the next checkpoint. This hash
+ * table remembers the pending operations. We use a hash table mostly as
+ * a convenient way of merging duplicate requests.
+ *
+ * We use a similar mechanism to remember no-longer-needed files that can
+ * be deleted after the next checkpoint, but we use a linked list instead of
+ * a hash table, because we don't expect there to be any duplicate requests.
+ *
+ * These mechanisms are only used for non-temp relations; we never fsync
+ * temp rels, nor do we need to postpone their deletion (see comments in
+ * mdunlink).
+ *
+ * (Regular backends do not track pending operations locally, but forward
+ * them to the checkpointer.)
+ */
+typedef uint16 CycleCtr; /* can be any convenient integer size */
+
+typedef struct
+{
+ FileTag tag; /* identifies handler and file */
+ CycleCtr cycle_ctr; /* sync_cycle_ctr of oldest request */
+ bool canceled; /* canceled is true if we canceled "recently" */
+} PendingFsyncEntry;
+
+typedef struct
+{
+ FileTag tag; /* identifies handler and file */
+ CycleCtr cycle_ctr; /* checkpoint_cycle_ctr when request was made */
+ bool canceled; /* true if request has been canceled */
+} PendingUnlinkEntry;
+
+static HTAB *pendingOps = NULL;
+static List *pendingUnlinks = NIL;
+static MemoryContext pendingOpsCxt; /* context for the above */
+
+static CycleCtr sync_cycle_ctr = 0;
+static CycleCtr checkpoint_cycle_ctr = 0;
+
+/* Intervals for calling AbsorbSyncRequests */
+#define FSYNCS_PER_ABSORB 10
+#define UNLINKS_PER_ABSORB 10
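+
+/*
+ * Rough overview of the request flow (md.c is the main client; see
+ * register_dirty_segment() and friends there): a process that dirties a
+ * file segment calls RegisterSyncRequest(), which either records the
+ * request in the local pendingOps table (standalone backend, startup
+ * process, or the checkpointer itself) or forwards it to the checkpointer
+ * via ForwardSyncRequest().  The checkpointer's AbsorbSyncRequests() hands
+ * each forwarded request to RememberSyncRequest() below;
+ * ProcessSyncRequests() then performs the accumulated fsyncs at checkpoint
+ * time, and SyncPostCheckpoint() performs the deferred unlinks afterwards.
+ */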
+
+/*
+ * Function pointers for handling sync and unlink requests.
+ */
+typedef struct SyncOps
+{
+ int (*sync_syncfiletag) (const FileTag *ftag, char *path);
+ int (*sync_unlinkfiletag) (const FileTag *ftag, char *path);
+ bool (*sync_filetagmatches) (const FileTag *ftag,
+ const FileTag *candidate);
+} SyncOps;
+
+/*
+ * These indexes must correspond to the values of the SyncRequestHandler enum.
+ */
+static const SyncOps syncsw[] = {
+ /* magnetic disk */
+ [SYNC_HANDLER_MD] = {
+ .sync_syncfiletag = mdsyncfiletag,
+ .sync_unlinkfiletag = mdunlinkfiletag,
+ .sync_filetagmatches = mdfiletagmatches
+ },
+ /* pg_xact */
+ [SYNC_HANDLER_CLOG] = {
+ .sync_syncfiletag = clogsyncfiletag
+ },
+ /* pg_commit_ts */
+ [SYNC_HANDLER_COMMIT_TS] = {
+ .sync_syncfiletag = committssyncfiletag
+ },
+ /* pg_multixact/offsets */
+ [SYNC_HANDLER_MULTIXACT_OFFSET] = {
+ .sync_syncfiletag = multixactoffsetssyncfiletag
+ },
+ /* pg_multixact/members */
+ [SYNC_HANDLER_MULTIXACT_MEMBER] = {
+ .sync_syncfiletag = multixactmemberssyncfiletag
+ }
+};
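+
+/*
+ * For illustration, supporting a hypothetical new handler would require a
+ * new SyncRequestHandler value in sync.h plus a matching entry here (all
+ * names below are made up):
+ *
+ *	[SYNC_HANDLER_FOO] = {
+ *		.sync_syncfiletag = foosyncfiletag,
+ *		.sync_unlinkfiletag = foounlinkfiletag,
+ *		.sync_filetagmatches = foofiletagmatches
+ *	},
+ *
+ * Handlers that never register unlink or filter requests (like the SLRU
+ * handlers above) may leave those callbacks unset.
+ */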
+
+/*
+ * Initialize data structures for tracking file sync requests.
+ */
+void
+InitSync(void)
+{
+ /*
+ * Create pending-operations hashtable if we need it. Currently, we need
+ * it if we are standalone (not under a postmaster) or if we are a startup
+ * or checkpointer auxiliary process.
+ */
+ if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
+ {
+ HASHCTL hash_ctl;
+
+ /*
+ * XXX: The checkpointer needs to add entries to the pending ops table
+ * when absorbing fsync requests. That is done within a critical
+ * section, which isn't usually allowed, but we make an exception. It
+ * means that there's a theoretical possibility that you run out of
+ * memory while absorbing fsync requests, which leads to a PANIC.
+ * Fortunately the hash table is small so that's unlikely to happen in
+ * practice.
+ */
+ pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
+ "Pending ops context",
+ ALLOCSET_DEFAULT_SIZES);
+ MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
+
+ hash_ctl.keysize = sizeof(FileTag);
+ hash_ctl.entrysize = sizeof(PendingFsyncEntry);
+ hash_ctl.hcxt = pendingOpsCxt;
+ pendingOps = hash_create("Pending Ops Table",
+ 100L,
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ pendingUnlinks = NIL;
+ }
+}
+
+/*
+ * SyncPreCheckpoint() -- Do pre-checkpoint work
+ *
+ * To distinguish unlink requests that arrived before this checkpoint
+ * started from those that arrived during the checkpoint, we use a cycle
+ * counter similar to the one we use for fsync requests. That cycle
+ * counter is incremented here.
+ *
+ * This must be called *before* the checkpoint REDO point is determined.
+ * That ensures that we won't delete files too soon. Since this calls
+ * AbsorbSyncRequests(), which performs memory allocations, it cannot be
+ * called within a critical section.
+ *
+ * Note that we can't do anything here that depends on the assumption
+ * that the checkpoint will be completed.
+ */
+void
+SyncPreCheckpoint(void)
+{
+ /*
+ * Operations such as DROP TABLESPACE assume that the next checkpoint will
+ * process all recently forwarded unlink requests, but if they aren't
+ * absorbed prior to advancing the cycle counter, they won't be processed
+ * until a future checkpoint. The following absorb ensures that any
+ * unlink requests forwarded before the checkpoint began will be processed
+ * in the current checkpoint.
+ */
+ AbsorbSyncRequests();
+
+ /*
+ * Any unlink requests arriving after this point will be assigned the next
+ * cycle counter, and won't be unlinked until next checkpoint.
+ */
+ checkpoint_cycle_ctr++;
+}
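+
+/*
+ * For context, the calling sequence during a checkpoint is roughly as
+ * follows (see CreateCheckPoint() in xlog.c):
+ *
+ *		SyncPreCheckpoint();		(before the REDO point is chosen)
+ *		... determine the REDO record location ...
+ *		CheckPointGuts(), which finishes by calling ProcessSyncRequests()
+ *		... write the checkpoint record and update pg_control ...
+ *		SyncPostCheckpoint();		(now safe to unlink)
+ */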
+
+/*
+ * SyncPostCheckpoint() -- Do post-checkpoint work
+ *
+ * Remove any lingering files that can now be safely removed.
+ */
+void
+SyncPostCheckpoint(void)
+{
+ int absorb_counter;
+ ListCell *lc;
+
+ absorb_counter = UNLINKS_PER_ABSORB;
+ foreach(lc, pendingUnlinks)
+ {
+ PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(lc);
+ char path[MAXPGPATH];
+
+ /* Skip over any canceled entries */
+ if (entry->canceled)
+ continue;
+
+ /*
+ * New entries are appended to the end, so if the entry is new we've
+ * reached the end of old entries.
+ *
+ * Note: if just the right number of consecutive checkpoints fail, we
+ * could be fooled here by cycle_ctr wraparound. However, the only
+ * consequence is that we'd delay unlinking for one more checkpoint,
+ * which is perfectly tolerable.
+ */
+ if (entry->cycle_ctr == checkpoint_cycle_ctr)
+ break;
+
+ /* Unlink the file */
+ if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
+ path) < 0)
+ {
+ /*
+ * There's a race condition when the database is dropped at the
+ * same time that we process the pending unlink requests. If the
+ * DROP DATABASE deletes the file before we do, we will get ENOENT
+ * here. rmtree() also has to ignore ENOENT errors, to deal with
+ * the possibility that we delete the file first.
+ */
+ if (errno != ENOENT)
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m", path)));
+ }
+
+ /* Mark the list entry as canceled, just in case */
+ entry->canceled = true;
+
+ /*
+ * As in ProcessSyncRequests, we don't want to stop absorbing fsync
+ * requests for a long time when there are many deletions to be done.
+ * We can safely call AbsorbSyncRequests() at this point in the loop.
+ */
+ if (--absorb_counter <= 0)
+ {
+ AbsorbSyncRequests();
+ absorb_counter = UNLINKS_PER_ABSORB;
+ }
+ }
+
+ /*
+ * If we reached the end of the list, we can just remove the whole list
+ * (remembering to pfree all the PendingUnlinkEntry objects). Otherwise,
+ * we must keep the entries at or after "lc".
+ */
+ if (lc == NULL)
+ {
+ list_free_deep(pendingUnlinks);
+ pendingUnlinks = NIL;
+ }
+ else
+ {
+ int ntodelete = list_cell_number(pendingUnlinks, lc);
+
+ for (int i = 0; i < ntodelete; i++)
+ pfree(list_nth(pendingUnlinks, i));
+
+ pendingUnlinks = list_delete_first_n(pendingUnlinks, ntodelete);
+ }
+}
+
+/*
+ * ProcessSyncRequests() -- Process queued fsync requests.
+ */
+void
+ProcessSyncRequests(void)
+{
+ static bool sync_in_progress = false;
+
+ HASH_SEQ_STATUS hstat;
+ PendingFsyncEntry *entry;
+ int absorb_counter;
+
+ /* Statistics on sync times */
+ int processed = 0;
+ instr_time sync_start,
+ sync_end,
+ sync_diff;
+ uint64 elapsed;
+ uint64 longest = 0;
+ uint64 total_elapsed = 0;
+
+ /*
+ * This is only called during checkpoints, and checkpoints should only
+ * occur in processes that have created a pendingOps.
+ */
+ if (!pendingOps)
+ elog(ERROR, "cannot sync without a pendingOps table");
+
+ /*
+ * If we are in the checkpointer, the sync had better include all fsync
+ * requests that were queued by backends up to this point. The tightest
+ * race condition that could occur is that a buffer that must be written
+ * and fsync'd for the checkpoint could have been dumped by a backend just
+ * before it was visited by BufferSync(). We know the backend will have
+ * queued an fsync request before clearing the buffer's dirtybit, so we
+ * are safe as long as we do an Absorb after completing BufferSync().
+ */
+ AbsorbSyncRequests();
+
+ /*
+ * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
+ * checkpoint), we want to ignore fsync requests that are entered into the
+ * hashtable after this point --- they should be processed next time,
+ * instead. We use sync_cycle_ctr to tell old entries apart from new
+ * ones: new ones will have cycle_ctr equal to the incremented value of
+ * sync_cycle_ctr.
+ *
+ * In normal circumstances, all entries present in the table at this point
+ * will have cycle_ctr exactly equal to the current (about to be old)
+ * value of sync_cycle_ctr. However, if we fail partway through the
+ * fsync'ing loop, then older values of cycle_ctr might remain when we
+ * come back here to try again. Repeated checkpoint failures would
+ * eventually wrap the counter around to the point where an old entry
+ * might appear new, causing us to skip it, possibly allowing a checkpoint
+ * to succeed that should not have. To forestall wraparound, any time the
+ * previous ProcessSyncRequests() failed to complete, run through the
+ * table and forcibly set cycle_ctr = sync_cycle_ctr.
+ *
+ * Think not to merge this loop with the main loop, as the problem is
+ * exactly that that loop may fail before having visited all the entries.
+ * From a performance point of view it doesn't matter anyway, as this path
+ * will never be taken in a system that's functioning normally.
+ */
+ if (sync_in_progress)
+ {
+ /* prior try failed, so update any stale cycle_ctr values */
+ hash_seq_init(&hstat, pendingOps);
+ while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ entry->cycle_ctr = sync_cycle_ctr;
+ }
+ }
+
+ /* Advance counter so that new hashtable entries are distinguishable */
+ sync_cycle_ctr++;
+
+ /* Set flag to detect failure if we don't reach the end of the loop */
+ sync_in_progress = true;
+
+ /* Now scan the hashtable for fsync requests to process */
+ absorb_counter = FSYNCS_PER_ABSORB;
+ hash_seq_init(&hstat, pendingOps);
+ while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ int failures;
+
+ /*
+ * If the entry is new then don't process it this time; it was entered
+ * after we advanced sync_cycle_ctr above and will be handled on a
+ * future pass.  Note "continue" bypasses the hash-remove call at the
+ * bottom of the loop.
+ */
+ if (entry->cycle_ctr == sync_cycle_ctr)
+ continue;
+
+ /* Else assert we haven't missed it */
+ Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);
+
+ /*
+ * If fsync is off then we don't have to bother opening the file at
+ * all. (We delay checking until this point so that changing fsync on
+ * the fly behaves sensibly.)
+ */
+ if (enableFsync)
+ {
+ /*
+ * If in checkpointer, we want to absorb pending requests every so
+ * often to prevent overflow of the fsync request queue. It is
+ * unspecified whether newly-added entries will be visited by
+ * hash_seq_search, but we don't care since we don't need to
+ * process them anyway.
+ */
+ if (--absorb_counter <= 0)
+ {
+ AbsorbSyncRequests();
+ absorb_counter = FSYNCS_PER_ABSORB;
+ }
+
+ /*
+ * The fsync table could contain requests to fsync segments that
+ * have been deleted (unlinked) by the time we get to them. Rather
+ * than just hoping an ENOENT (or EACCES on Windows) error can be
+ * ignored, what we do on error is absorb pending requests and
+ * then retry. Since mdunlink() queues a "cancel" message before
+ * actually unlinking, the fsync request is guaranteed to be
+ * marked canceled after the absorb if it really was this case.
+ * DROP DATABASE likewise has to tell us to forget fsync requests
+ * before it starts deletions.
+ */
+ for (failures = 0; !entry->canceled; failures++)
+ {
+ char path[MAXPGPATH];
+
+ INSTR_TIME_SET_CURRENT(sync_start);
+ if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
+ path) == 0)
+ {
+ /* Success; update statistics about sync timing */
+ INSTR_TIME_SET_CURRENT(sync_end);
+ sync_diff = sync_end;
+ INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+ elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
+ if (elapsed > longest)
+ longest = elapsed;
+ total_elapsed += elapsed;
+ processed++;
+
+ if (log_checkpoints)
+ elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f ms",
+ processed,
+ path,
+ (double) elapsed / 1000);
+
+ break; /* out of retry loop */
+ }
+
+ /*
+ * It is possible that the relation has been dropped or
+ * truncated since the fsync request was entered. Therefore,
+ * allow ENOENT, but only if we didn't fail already on this
+ * file.
+ */
+ if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m",
+ path)));
+ else
+ ereport(DEBUG1,
+ (errcode_for_file_access(),
+ errmsg_internal("could not fsync file \"%s\" but retrying: %m",
+ path)));
+
+ /*
+ * Absorb incoming requests and check to see if a cancel
+ * arrived for this relation fork.
+ */
+ AbsorbSyncRequests();
+ absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
+ } /* end retry loop */
+ }
+
+ /* We are done with this entry, remove it */
+ if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
+ elog(ERROR, "pendingOps corrupted");
+ } /* end loop over hashtable entries */
+
+ /* Return sync performance metrics for report at checkpoint end */
+ CheckpointStats.ckpt_sync_rels = processed;
+ CheckpointStats.ckpt_longest_sync = longest;
+ CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+
+ /* Flag successful completion of ProcessSyncRequests */
+ sync_in_progress = false;
+}
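+
+/*
+ * A concrete example of the cycle-counter logic above: suppose
+ * sync_cycle_ctr is 7 when ProcessSyncRequests() starts.  Every request
+ * absorbed so far carries cycle_ctr 7; the counter is bumped to 8 and the
+ * scan fsyncs exactly the cycle_ctr-7 entries.  Any request absorbed during
+ * the scan (by the periodic AbsorbSyncRequests() calls) is entered with
+ * cycle_ctr 8 and therefore skipped, leaving it for the next checkpoint
+ * rather than letting the current one chase a moving target.
+ */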
+
+/*
+ * RememberSyncRequest() -- callback from checkpointer side of sync request
+ *
+ * We stuff fsync requests into the local hash table for execution
+ * during the checkpointer's next checkpoint. UNLINK requests go into a
+ * separate linked list, however, because they get processed separately.
+ *
+ * See sync.h for more information on the types of sync requests supported.
+ */
+void
+RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
+{
+ Assert(pendingOps);
+
+ if (type == SYNC_FORGET_REQUEST)
+ {
+ PendingFsyncEntry *entry;
+
+ /* Cancel previously entered request */
+ entry = (PendingFsyncEntry *) hash_search(pendingOps,
+ (void *) ftag,
+ HASH_FIND,
+ NULL);
+ if (entry != NULL)
+ entry->canceled = true;
+ }
+ else if (type == SYNC_FILTER_REQUEST)
+ {
+ HASH_SEQ_STATUS hstat;
+ PendingFsyncEntry *entry;
+ ListCell *cell;
+
+ /* Cancel matching fsync requests */
+ hash_seq_init(&hstat, pendingOps);
+ while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ if (entry->tag.handler == ftag->handler &&
+ syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
+ entry->canceled = true;
+ }
+
+ /* Cancel matching unlink requests */
+ foreach(cell, pendingUnlinks)
+ {
+ PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
+
+ if (entry->tag.handler == ftag->handler &&
+ syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
+ entry->canceled = true;
+ }
+ }
+ else if (type == SYNC_UNLINK_REQUEST)
+ {
+ /* Unlink request: put it in the linked list */
+ MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
+ PendingUnlinkEntry *entry;
+
+ entry = palloc(sizeof(PendingUnlinkEntry));
+ entry->tag = *ftag;
+ entry->cycle_ctr = checkpoint_cycle_ctr;
+ entry->canceled = false;
+
+ pendingUnlinks = lappend(pendingUnlinks, entry);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+ else
+ {
+ /* Normal case: enter a request to fsync this segment */
+ MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
+ PendingFsyncEntry *entry;
+ bool found;
+
+ Assert(type == SYNC_REQUEST);
+
+ entry = (PendingFsyncEntry *) hash_search(pendingOps,
+ (void *) ftag,
+ HASH_ENTER,
+ &found);
+ /* if new entry, or was previously canceled, initialize it */
+ if (!found || entry->canceled)
+ {
+ entry->cycle_ctr = sync_cycle_ctr;
+ entry->canceled = false;
+ }
+
+ /*
+ * NB: it's intentional that we don't change cycle_ctr if the entry
+ * already exists. The cycle_ctr must represent the oldest fsync
+ * request that could be in the entry.
+ */
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+}
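+
+/*
+ * For reference, md.c currently issues these request types (see
+ * register_dirty_segment() and related code there): SYNC_REQUEST to
+ * schedule an fsync of a dirtied segment, SYNC_FORGET_REQUEST to cancel
+ * any pending fsync for a segment that is going away, SYNC_UNLINK_REQUEST
+ * to defer removal of a relation's main-fork file until after the next
+ * checkpoint, and SYNC_FILTER_REQUEST (from ForgetDatabaseSyncRequests())
+ * to cancel everything belonging to a database being dropped.
+ */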
+
+/*
+ * Register the sync request locally, or forward it to the checkpointer.
+ *
+ * If retryOnError is true, we'll keep trying if there is no space in the
+ * queue. Return true if we succeeded, or false if there wasn't space.
+ */
+bool
+RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
+ bool retryOnError)
+{
+ bool ret;
+
+ if (pendingOps != NULL)
+ {
+ /* standalone backend or startup process: fsync state is local */
+ RememberSyncRequest(ftag, type);
+ return true;
+ }
+
+ for (;;)
+ {
+ /*
+ * Notify the checkpointer about it. If we fail to queue a message in
+ * retryOnError mode, we have to sleep and try again ... ugly, but
+ * hopefully won't happen often.
+ *
+ * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
+ * error in the case of SYNC_UNLINK_REQUEST would leave the
+ * no-longer-used file still present on disk, which would be bad, so
+ * I'm inclined to assume that the checkpointer will always empty the
+ * queue soon.
+ */
+ ret = ForwardSyncRequest(ftag, type);
+
+ /*
+ * If we are successful in queueing the request, or we failed and were
+ * instructed not to retry on error, break.
+ */
+ if (ret || !retryOnError)
+ break;
+
+ WaitLatch(NULL, WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, 10,
+ WAIT_EVENT_REGISTER_SYNC_REQUEST);
+ }
+
+ return ret;
+}
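+
+/*
+ * A minimal caller sketch, modeled on md.c's register_dirty_segment();
+ * the FileTag initialization macro lives in md.c and is shown here only
+ * for illustration:
+ *
+ *		FileTag		tag;
+ *
+ *		INIT_MD_FILETAG(tag, reln->smgr_rnode.node, forknum, segno);
+ *		if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
+ *		{
+ *			... the queue was full and we chose not to retry, so fall
+ *			... back to fsync'ing the file directly in this backend
+ *		}
+ */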
+
+/*
+ * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
+ * already created the pendingOps during initialization of the startup
+ * process. Calling this function drops the local pendingOps so that
+ * subsequent requests will be forwarded to checkpointer.
+ */
+void
+EnableSyncRequestForwarding(void)
+{
+ /* Perform any pending fsyncs we may have queued up, then drop table */
+ if (pendingOps)
+ {
+ ProcessSyncRequests();
+ hash_destroy(pendingOps);
+ }
+ pendingOps = NULL;
+
+ /*
+ * We should not have any pending unlink requests, since mdunlink doesn't
+ * queue unlink requests when isRedo.
+ */
+ Assert(pendingUnlinks == NIL);
+}