Adding upstream version 16.2. (upstream/16.2)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-13 13:44:03 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-13 13:44:03 +0000
commit: 293913568e6a7a86fd1479e1cff8e2ecb58d6568 (patch)
tree: fc3b469a3ec5ab71b36ea97cc7aaddb838423a0c /src/backend/storage/lmgr
parent: Initial commit. (diff)
download: postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.tar.xz
postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.zip
19 files changed, 18689 insertions, 0 deletions
diff --git a/src/backend/storage/lmgr/.gitignore b/src/backend/storage/lmgr/.gitignore
new file mode 100644
index 0000000..dab4c3f
--- /dev/null
+++ b/src/backend/storage/lmgr/.gitignore
@@ -0,0 +1,3 @@
+/lwlocknames.c
+/lwlocknames.h
+/s_lock_test
diff --git a/src/backend/storage/lmgr/Makefile b/src/backend/storage/lmgr/Makefile
new file mode 100644
index 0000000..b25b7ee
--- /dev/null
+++ b/src/backend/storage/lmgr/Makefile
@@ -0,0 +1,52 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for storage/lmgr: builds the OBJS below, lwlocknames.[ch], and the s_lock_test check program
+#
+# IDENTIFICATION
+#    src/backend/storage/lmgr/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/lmgr
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+	condition_variable.o \
+	deadlock.o \
+	lmgr.o \
+	lock.o \
+	lwlock.o \
+	lwlocknames.o \
+	predicate.o \
+	proc.o \
+	s_lock.o \
+	spin.o
+
+include $(top_srcdir)/src/backend/common.mk
+
+ifdef TAS
+TASPATH = $(top_builddir)/src/backend/port/tas.o
+endif
+
+s_lock_test: s_lock.c $(top_builddir)/src/common/libpgcommon.a $(top_builddir)/src/port/libpgport.a
+	$(CC) $(CPPFLAGS) $(CFLAGS) -DS_LOCK_TEST=1 $(srcdir)/s_lock.c \
+		$(TASPATH) -L $(top_builddir)/src/common -lpgcommon \
+		-L $(top_builddir)/src/port -lpgport -o s_lock_test
+
+# the .h rule's script run presumably writes lwlocknames.c too, so only touch it here; see notes in src/backend/parser/Makefile
+lwlocknames.c: lwlocknames.h
+	touch $@
+
+lwlocknames.h: $(top_srcdir)/src/backend/storage/lmgr/lwlocknames.txt generate-lwlocknames.pl
+	$(PERL) $(srcdir)/generate-lwlocknames.pl $<
+
+check: s_lock_test
+	./s_lock_test
+
+clean distclean:
+	rm -f s_lock_test
+
+maintainer-clean: clean
+	rm -f lwlocknames.h lwlocknames.c
diff --git a/src/backend/storage/lmgr/README b/src/backend/storage/lmgr/README
new file mode 100644
index 0000000..45de0fd
--- /dev/null
+++ b/src/backend/storage/lmgr/README
@@ -0,0 +1,731 @@
+src/backend/storage/lmgr/README
+
+Locking Overview
+================
+
+Postgres uses four types of interprocess locks:
+
+* Spinlocks.  These are intended for *very* short-term locks.  If a lock
+is to be held more than a few dozen instructions, or across any sort of
+kernel call (or even a call to a nontrivial subroutine), don't use a
+spinlock. Spinlocks are primarily used as infrastructure for lightweight
+locks. They are implemented using a hardware atomic-test-and-set
+instruction, if available.  Waiting processes busy-loop until they can
+get the lock. There is no provision for deadlock detection, automatic
+release on error, or any other nicety.  There is a timeout if the lock
+cannot be gotten after a minute or so (which is approximately forever in
+comparison to the intended lock hold time, so this is certainly an error
+condition).
+
+* Lightweight locks (LWLocks).  These locks are typically used to
+interlock access to data structures in shared memory.  LWLocks support
+both exclusive and shared lock modes (for read/write and read-only
+access to a shared object). There is no provision for deadlock
+detection, but the LWLock manager will automatically release held
+LWLocks during elog() recovery, so it is safe to raise an error while
+holding LWLocks.  Obtaining or releasing an LWLock is quite fast (a few
+dozen instructions) when there is no contention for the lock.  When a
+process has to wait for an LWLock, it blocks on a SysV semaphore so as
+to not consume CPU time.  Waiting processes will be granted the lock in
+arrival order.  There is no timeout.
+
+* Regular locks (a/k/a heavyweight locks).  The regular lock manager
+supports a variety of lock modes with table-driven semantics, and it has
+full deadlock detection and automatic release at transaction end.
+Regular locks should be used for all user-driven lock requests.
+
+* SIReadLock predicate locks.  See separate README-SSI file for details.
+
+Acquisition of either a spinlock or a lightweight lock causes query
+cancel and die() interrupts to be held off until all such locks are
+released. No such restriction exists for regular locks, however.  Also
+note that we can accept query cancel and die() interrupts while waiting
+for a regular lock, but we will not accept them while waiting for
+spinlocks or LWLocks. It is therefore not a good idea to use LWLocks
+when the wait time might exceed a few seconds.
+
+The rest of this README file discusses the regular lock manager in detail.
+
+
+Lock Data Structures
+--------------------
+
+Lock methods describe the overall locking behavior.  Currently there are
+two lock methods: DEFAULT and USER.
+
+Lock modes describe the type of the lock (read/write or shared/exclusive).
+In principle, each lock method can have its own set of lock modes with
+different conflict rules, but currently DEFAULT and USER methods use
+identical lock mode sets. See src/include/storage/lock.h for more details.
+(Lock modes are also called lock types in some places in the code and
+documentation.)
+
+There are two main methods for recording locks in shared memory.  The primary
+mechanism uses two main structures: the per-lockable-object LOCK struct, and
+the per-lock-and-requestor PROCLOCK struct.  A LOCK object exists for each
+lockable object that currently has locks held or requested on it.  A PROCLOCK
+struct exists for each backend that is holding or requesting lock(s) on each
+LOCK object.
+
+There is also a special "fast path" mechanism which backends may use to
+record a limited number of locks with very specific characteristics: they must
+use the DEFAULT lockmethod; they must represent a lock on a database relation
+(not a shared relation); they must be a "weak" lock which is unlikely to
+conflict (AccessShareLock, RowShareLock, or RowExclusiveLock); and the system
+must be able to quickly verify that no conflicting locks could possibly be
+present.  See "Fast Path Locking", below, for more details.
+
+Each backend also maintains an unshared LOCALLOCK structure for each lockable
+object and lock mode that it is currently holding or requesting.  The shared
+lock structures only allow a single lock grant to be made per lockable
+object/lock mode/backend.  Internally to a backend, however, the same lock may
+be requested and perhaps released multiple times in a transaction, and it can
+also be held both transactionally and session-wide.  The internal request
+counts are held in LOCALLOCK so that the shared data structures need not be
+accessed to alter them.
+
+---------------------------------------------------------------------------
+
+The lock manager's LOCK objects contain:
+
+tag -
+    The key fields that are used for hashing locks in the shared memory
+    lock hash table.  The contents of the tag essentially define an
+    individual lockable object.  See include/storage/lock.h for details
+    about the supported types of lockable objects.  This is declared as
+    a separate struct to ensure that we always zero out the correct number
+    of bytes.  It is critical that any alignment-padding bytes the compiler
+    might insert in the struct be zeroed out, else the hash computation
+    will be random.  (Currently, we are careful to define struct LOCKTAG
+    so that there are no padding bytes.)
+
+grantMask -
+    This bitmask indicates what types of locks are currently held on the
+    given lockable object.  It is used (against the lock table's conflict
+    table) to determine if a new lock request will conflict with existing
+    lock types held.  Conflicts are determined by bitwise AND operations
+    between the grantMask and the conflict table entry for the requested
+    lock type.  Bit i of grantMask is 1 if and only if granted[i] > 0.
+
+waitMask -
+    This bitmask shows the types of locks being waited for.  Bit i of waitMask
+    is 1 if and only if requested[i] > granted[i].
+
+procLocks -
+    This is a shared memory queue of all the PROCLOCK structs associated with
+    the lock object.  Note that both granted and waiting PROCLOCKs are in this
+    list (indeed, the same PROCLOCK might have some already-granted locks and
+    be waiting for more!).
+
+waitProcs -
+    This is a shared memory queue of all PGPROC structures corresponding to
+    backends that are waiting (sleeping) until another backend releases this
+    lock.  The process structure holds the information needed to determine
+    if it should be woken up when the lock is released.
+
+nRequested -
+    Keeps a count of how many times acquisition of this lock has been
+    attempted.  The count includes attempts by processes which were put
+    to sleep due to conflicts.  It also counts the same backend twice
+    if, for example, a backend process first acquires a read and then
+    acquires a write.  (But multiple acquisitions of the same lock/lock mode
+    within a backend are not multiply counted here; they are recorded
+    only in the backend's LOCALLOCK structure.)
+
+requested -
+    Keeps a count of how many locks of each type have been attempted.  Only
+    elements 1 through MAX_LOCKMODES-1 are used as they correspond to the lock
+    type defined constants.  Summing the values of requested[] should come out
+    equal to nRequested.
+
+nGranted -
+    Keeps count of how many times this lock has been successfully acquired.
+    This count does not include attempts that are waiting due to conflicts.
+    Otherwise the counting rules are the same as for nRequested.
+
+granted -
+    Keeps count of how many locks of each type are currently held.  Once again
+    only elements 1 through MAX_LOCKMODES-1 are used (0 is not).  Also, like
+    requested[], summing the values of granted[] should total to the value
+    of nGranted.
+
+We should always have 0 <= nGranted <= nRequested, and
+0 <= granted[i] <= requested[i] for each i.  When all the request counts
+go to zero, the LOCK object is no longer needed and can be freed.
+
+---------------------------------------------------------------------------
+
+The lock manager's PROCLOCK objects contain:
+
+tag -
+    The key fields that are used for hashing entries in the shared memory
+    PROCLOCK hash table.  This is declared as a separate struct to ensure that
+    we always zero out the correct number of bytes.  It is critical that any
+    alignment-padding bytes the compiler might insert in the struct be zeroed
+    out, else the hash computation will be random.  (Currently, we are careful
+    to define struct PROCLOCKTAG so that there are no padding bytes.)
+
+    tag.myLock
+        Pointer to the shared LOCK object this PROCLOCK is for.
+
+    tag.myProc
+        Pointer to the PGPROC of the backend process that owns this PROCLOCK.
+
+    Note: it's OK to use pointers here because a PROCLOCK never outlives
+    either its lock or its proc.  The tag is therefore unique for as long
+    as it needs to be, even though the same tag values might mean something
+    else at other times.
+
+holdMask -
+    A bitmask for the lock modes successfully acquired by this PROCLOCK.
+    This should be a subset of the LOCK object's grantMask, and also a
+    subset of the PGPROC object's heldLocks mask (if the PGPROC is
+    currently waiting for another lock mode on this lock).
+
+releaseMask -
+    A bitmask for the lock modes due to be released during LockReleaseAll.
+    This must be a subset of the holdMask.  Note that it is modified without
+    taking the partition LWLock, and therefore it is unsafe for any
+    backend except the one owning the PROCLOCK to examine/change it.
+
+lockLink -
+    List link for shared memory queue of all the PROCLOCK objects for the
+    same LOCK.
+
+procLink -
+    List link for shared memory queue of all the PROCLOCK objects for the
+    same backend.
+
+---------------------------------------------------------------------------
+
+
+Lock Manager Internal Locking
+-----------------------------
+
+Before PostgreSQL 8.2, all of the shared-memory data structures used by
+the lock manager were protected by a single LWLock, the LockMgrLock;
+any operation involving these data structures had to exclusively lock
+LockMgrLock.  Not too surprisingly, this became a contention bottleneck.
+To reduce contention, the lock manager's data structures have been split
+into multiple "partitions", each protected by an independent LWLock.
+Most operations only need to lock the single partition they are working in.
+Here are the details:
+
+* Each possible lock is assigned to one partition according to a hash of
+its LOCKTAG value.  The partition's LWLock is considered to protect all the
+LOCK objects of that partition as well as their subsidiary PROCLOCKs.
+
+* The shared-memory hash tables for LOCKs and PROCLOCKs are organized
+so that different partitions use different hash chains, and thus there
+is no conflict in working with objects in different partitions.  This
+is supported directly by dynahash.c's "partitioned table" mechanism
+for the LOCK table: we need only ensure that the partition number is
+taken from the low-order bits of the dynahash hash value for the LOCKTAG.
+To make it work for PROCLOCKs, we have to ensure that a PROCLOCK's hash
+value has the same low-order bits as its associated LOCK.  This requires
+a specialized hash function (see proclock_hash).
+
+* Formerly, each PGPROC had a single list of PROCLOCKs belonging to it.
+This has now been split into per-partition lists, so that access to a
+particular PROCLOCK list can be protected by the associated partition's
+LWLock.  (This rule allows one backend to manipulate another backend's
+PROCLOCK lists, which was not originally necessary but is now required in
+connection with fast-path locking; see below.)
+
+* The other lock-related fields of a PGPROC are only interesting when
+the PGPROC is waiting for a lock, so we consider that they are protected
+by the partition LWLock of the awaited lock.
+
+For normal lock acquisition and release, it is sufficient to lock the
+partition containing the desired lock.  Deadlock checking needs to touch
+multiple partitions in general; for simplicity, we just make it lock all
+the partitions in partition-number order.  (To prevent LWLock deadlock,
+we establish the rule that any backend needing to lock more than one
+partition at once must lock them in partition-number order.)  It's
+possible that deadlock checking could be done without touching every
+partition in typical cases, but since in a properly functioning system
+deadlock checking should not occur often enough to be performance-critical,
+trying to make this work does not seem a productive use of effort.
+
+A backend's internal LOCALLOCK hash table is not partitioned.  We do store
+a copy of the locktag hash code in LOCALLOCK table entries, from which the
+partition number can be computed, but this is a straight speed-for-space
+tradeoff: we could instead recalculate the partition number from the LOCKTAG
+when needed.
+
+
+Fast Path Locking
+-----------------
+
+Fast path locking is a special purpose mechanism designed to reduce the
+overhead of taking and releasing certain types of locks which are taken
+and released very frequently but rarely conflict.  Currently, this includes
+two categories of locks:
+
+(1) Weak relation locks.  SELECT, INSERT, UPDATE, and DELETE must acquire a
+lock on every relation they operate on, as well as various system catalogs
+that can be used internally.  Many DML operations can proceed in parallel
+against the same table at the same time; only DDL operations such as
+CLUSTER, ALTER TABLE, or DROP -- or explicit user action such as LOCK TABLE
+-- will create lock conflicts with the "weak" locks (AccessShareLock,
+RowShareLock, RowExclusiveLock) acquired by DML operations.
+
+(2) VXID locks.  Every transaction takes a lock on its own virtual
+transaction ID.  Currently, the only operations that wait for these locks
+are CREATE INDEX CONCURRENTLY and Hot Standby (in the case of a conflict),
+so most VXID locks are taken and released by the owner without anyone else
+needing to care.
+
+The primary locking mechanism does not cope well with this workload.  Even
+though the lock manager locks are partitioned, the locktag for any given
+relation still falls in one, and only one, partition.  Thus, if many short
+queries are accessing the same relation, the lock manager partition lock for
+that partition becomes a contention bottleneck.  This effect is measurable
+even on 2-core servers, and becomes very pronounced as core count increases.
+
+To alleviate this bottleneck, beginning in PostgreSQL 9.2, each backend is
+permitted to record a limited number of locks on unshared relations in an
+array within its PGPROC structure, rather than using the primary lock table.
+This mechanism can only be used when the locker can verify that no conflicting
+locks exist at the time of taking the lock.
+
+A key point of this algorithm is that it must be possible to verify the
+absence of possibly conflicting locks without fighting over a shared LWLock or
+spinlock.  Otherwise, this effort would simply move the contention bottleneck
+from one place to another.  We accomplish this using an array of 1024 integer
+counters, which are in effect a 1024-way partitioning of the lock space.
+Each counter records the number of "strong" locks (that is, ShareLock,
+ShareRowExclusiveLock, ExclusiveLock, and AccessExclusiveLock) on unshared
+relations that fall into that partition.  When this counter is non-zero, the
+fast path mechanism may not be used to take new relation locks within that
+partition.  A strong locker bumps the counter and then scans each per-backend
+array for matching fast-path locks; any which are found must be transferred to
+the primary lock table before attempting to acquire the lock, to ensure proper
+lock conflict and deadlock detection.
+
+On an SMP system, we must guarantee proper memory synchronization.  Here we
+rely on the fact that LWLock acquisition acts as a memory sequence point: if
+A performs a store, A and B both acquire an LWLock in either order, and B
+then performs a load on the same memory location, it is guaranteed to see
+A's store.  In this case, each backend's fast-path lock queue is protected
+by an LWLock.  A backend wishing to acquire a fast-path lock grabs this
+LWLock before examining FastPathStrongRelationLocks to check for the presence
+of a conflicting strong lock.  And the backend attempting to acquire a strong
+lock, because it must transfer any matching weak locks taken via the fast-path
+mechanism to the shared lock table, will acquire every LWLock protecting a
+backend fast-path queue in turn.  So, if we examine
+FastPathStrongRelationLocks and see a zero, then either the value is truly
+zero, or if it is a stale value, the strong locker has yet to acquire the
+per-backend LWLock we now hold (or, indeed, even the first per-backend LWLock)
+and will notice any weak lock we take when it does.
+
+Fast-path VXID locks do not use the FastPathStrongRelationLocks table.  The
+first lock taken on a VXID is always the ExclusiveLock taken by its owner.
+Any subsequent lockers are share lockers waiting for the VXID to terminate.
+Indeed, the only reason VXID locks use the lock manager at all (rather than
+waiting for the VXID to terminate via some other method) is for deadlock
+detection.  Thus, the initial VXID lock can *always* be taken via the fast
+path without checking for conflicts.  Any subsequent locker must check
+whether the lock has been transferred to the main lock table, and if not,
+do so.  The backend owning the VXID must be careful to clean up any entry
+made in the main lock table at end of transaction.
+
+Deadlock detection does not need to examine the fast-path data structures,
+because any lock that could possibly be involved in a deadlock must have
+been transferred to the main tables beforehand.
+
+
+The Deadlock Detection Algorithm
+--------------------------------
+
+Since we allow user transactions to request locks in any order, deadlock
+is possible.  We use a deadlock detection/breaking algorithm that is
+fairly standard in essence, but there are many special considerations
+needed to deal with Postgres' generalized locking model.
+
+A key design consideration is that we want to make routine operations
+(lock grant and release) run quickly when there is no deadlock, and
+avoid the overhead of deadlock handling as much as possible.  We do this
+using an "optimistic waiting" approach: if a process cannot acquire the
+lock it wants immediately, it goes to sleep without any deadlock check.
+But it also sets a delay timer, with a delay of DeadlockTimeout
+milliseconds (typically set to one second).  If the delay expires before
+the process is granted the lock it wants, it runs the deadlock
+detection/breaking code. Normally this code will determine that there is
+no deadlock condition, and then the process will go back to sleep and
+wait quietly until it is granted the lock.  But if a deadlock condition
+does exist, it will be resolved, usually by aborting the detecting
+process' transaction.  In this way, we avoid deadlock handling overhead
+whenever the wait time for a lock is less than DeadlockTimeout, while
+not imposing an unreasonable delay of detection when there is an error.
+
+Lock acquisition (routines LockAcquire and ProcSleep) follows these rules:
+
+1. A lock request is granted immediately if it does not conflict with
+any existing or waiting lock request, or if the process already holds an
+instance of the same lock type (e.g., there's no penalty to acquire a read
+lock twice).  Note that a process never conflicts with itself; e.g., one
+can obtain a read lock when one already holds an exclusive lock.
+
+2. Otherwise the process joins the lock's wait queue.  Normally it will
+be added to the end of the queue, but there is an exception: if the
+process already holds locks on this same lockable object that conflict
+with the request of any pending waiter, then the process will be
+inserted in the wait queue just ahead of the first such waiter.  (If we
+did not make this check, the deadlock detection code would adjust the
+queue order to resolve the conflict, but it's relatively cheap to make
+the check in ProcSleep and avoid a deadlock timeout delay in this case.)
+Note the special case when inserting before the end of the queue: if the
+process's request does not conflict with any existing lock nor any
+waiting request before its insertion point, then go ahead and grant the
+lock without waiting.
+
+When a lock is released, the lock release routine (ProcLockWakeup) scans
+the lock object's wait queue.  Each waiter is awoken if (a) its request
+does not conflict with already-granted locks, and (b) its request does
+not conflict with the requests of prior un-wakable waiters.  Rule (b)
+ensures that conflicting requests are granted in order of arrival. There
+are cases where a later waiter must be allowed to go in front of
+conflicting earlier waiters to avoid deadlock, but it is not
+ProcLockWakeup's responsibility to recognize these cases; instead, the
+deadlock detection code will re-order the wait queue when necessary.
+
+To perform deadlock checking, we use the standard method of viewing the
+various processes as nodes in a directed graph (the waits-for graph or
+WFG).  There is a graph edge leading from process A to process B if A
+waits for B, i.e., A is waiting for some lock and B holds a conflicting
+lock.  There is a deadlock condition if and only if the WFG contains a
+cycle.  We detect cycles by searching outward along waits-for edges to
+see if we return to our starting point.  There are three possible
+outcomes:
+
+1. All outgoing paths terminate at a running process (which has no
+outgoing edge).
+
+2. A deadlock is detected by looping back to the start point.  We
+resolve such a deadlock by canceling the start point's lock request and
+reporting an error in that transaction, which normally leads to
+transaction abort and release of that transaction's held locks.  Note
+that it's sufficient to cancel one request to remove the cycle; we don't
+need to kill all the transactions involved.
+
+3. Some path(s) loop back to a node other than the start point.  This
+indicates a deadlock, but one that does not involve our starting
+process. We ignore this condition on the grounds that resolving such a
+deadlock is the responsibility of the processes involved --- killing our
+start-point process would not resolve the deadlock.  So, cases 1 and 3
+both report "no deadlock".
+
+Postgres' situation is a little more complex than the standard discussion
+of deadlock detection, for two reasons:
+
+1. A process can be waiting for more than one other process, since there
+might be multiple PROCLOCKs of (non-conflicting) lock types that all
+conflict with the waiter's request.  This creates no real difficulty
+however; we simply need to be prepared to trace more than one outgoing
+edge.
+
+2. If a process A is behind a process B in some lock's wait queue, and
+their requested locks conflict, then we must say that A waits for B, since
+ProcLockWakeup will never awaken A before B.  This creates additional
+edges in the WFG.  We call these "soft" edges, as opposed to the "hard"
+edges induced by locks already held.  Note that if B already holds any
+locks conflicting with A's request, then their relationship is a hard edge
+not a soft edge.
+
+A "soft" block, or wait-priority block, has the same potential for
+inducing deadlock as a hard block.  However, we may be able to resolve
+a soft block without aborting the transactions involved: we can instead
+rearrange the order of the wait queue.  This rearrangement reverses the
+direction of the soft edge between two processes with conflicting requests
+whose queue order is reversed.  If we can find a rearrangement that
+eliminates a cycle without creating new ones, then we can avoid an abort.
+Checking for such possible rearrangements is the trickiest part of the
+algorithm.
+
+The workhorse of the deadlock detector is a routine FindLockCycle() which
+is given a starting point process (which must be a waiting process).
+It recursively scans outward across waits-for edges as discussed above.
+If it finds no cycle involving the start point, it returns "false".
+(As discussed above, we can ignore cycles not involving the start point.)
+When such a cycle is found, FindLockCycle() returns "true", and as it
+unwinds it also builds a list of any "soft" edges involved in the cycle.
+If the resulting list is empty then there is a hard deadlock and the
+configuration cannot succeed.  However, if the list is not empty, then
+reversing any one of the listed edges through wait-queue rearrangement
+will eliminate that cycle.  Since such a reversal might create cycles
+elsewhere, we may need to try every possibility.  Therefore, we need to
+be able to invoke FindLockCycle() on hypothetical configurations (wait
+orders) as well as the current real order.
+
+The easiest way to handle this seems to be to have a lookaside table that
+shows the proposed new queue order for each wait queue that we are
+considering rearranging.  This table is checked by FindLockCycle, and it
+believes the proposed queue order rather than the real order for each lock
+that has an entry in the lookaside table.
+
+We build a proposed new queue order by doing a "topological sort" of the
+existing entries.  Each soft edge that we are currently considering
+reversing creates a property of the partial order that the topological sort
+has to enforce.  We must use a sort method that preserves the input
+ordering as much as possible, so as not to gratuitously break arrival
+order for processes not involved in a deadlock.  (This is not true of the
+tsort method shown in Knuth, for example, but it's easily done by a simple
+doubly-nested-loop method that emits the first legal candidate at each
+step.  Fortunately, we don't need a highly efficient sort algorithm, since
+the number of partial order constraints is not likely to be large.)  Note
+that failure of the topological sort tells us we have conflicting ordering
+constraints, and therefore that the last-added soft edge reversal
+conflicts with a prior edge reversal.  We need to detect this case to
+avoid an infinite loop in the case where no possible rearrangement will
+work: otherwise, we might try a reversal, find that it still leads to
+a cycle, then try to un-reverse the reversal while trying to get rid of
+that cycle, etc etc.  Topological sort failure tells us the un-reversal
+is not a legitimate move in this context.
+
+So, the basic step in our rearrangement method is to take a list of
+soft edges in a cycle (as returned by FindLockCycle()) and successively
+try the reversal of each one as a topological-sort constraint added to
+whatever constraints we are already considering.  We recursively search
+through all such sets of constraints to see if any one eliminates all
+the deadlock cycles at once.  Although this might seem impossibly
+inefficient, it shouldn't be a big problem in practice, because there
+will normally be very few, and not very large, deadlock cycles --- if
+any at all.  So the combinatorial inefficiency isn't going to hurt us.
+Besides, it's better to spend some time to guarantee that we've checked
+all possible escape routes than to abort a transaction when we didn't
+really have to.
+
+Each edge reversal constraint can be viewed as requesting that the waiting
+process A be moved to before the blocking process B in the wait queue they
+are both in.  This action will reverse the desired soft edge, as well as
+any other soft edges between A and other processes it is advanced over.
+No other edges will be affected (note this is actually a constraint on our
+topological sort method to not re-order the queue more than necessary).
+Therefore, we can be sure we have not created any new deadlock cycles if
+neither FindLockCycle(A) nor FindLockCycle(B) discovers any cycle.  Given
+the above-defined behavior of FindLockCycle, each of these searches is
+necessary as well as sufficient, since FindLockCycle starting at the
+original start point will not complain about cycles that include A or B
+but not the original start point.
+
+In short then, a proposed rearrangement of the wait queue(s) is determined
+by one or more broken soft edges A->B, fully specified by the output of
+topological sorts of each wait queue involved, and then tested by invoking
+FindLockCycle() starting at the original start point as well as each of
+the mentioned processes (A's and B's).  If none of the tests detect a
+cycle, then we have a valid configuration and can implement it by
+reordering the wait queues per the sort outputs (and then applying
+ProcLockWakeup on each reordered queue, in case a waiter has become wakable).
+If any test detects a soft cycle, we can try to resolve it by adding each
+soft link in that cycle, in turn, to the proposed rearrangement list.
+This is repeated recursively until we either find a workable rearrangement
+or determine that none exists.  In the latter case, the outer level
+resolves the deadlock by aborting the original start-point transaction.
+
+The particular order in which rearrangements are tried depends on the
+order FindLockCycle() happens to scan in, so if there are multiple
+workable rearrangements of the wait queues, then it is unspecified which
+one will be chosen.  What's more important is that we guarantee to try
+every queue rearrangement that could lead to success.  (For example,
+if we have A before B before C and the needed order constraints are
+C before A and B before C, we would first discover that A before C
+doesn't work and try the rearrangement C before A before B.  This would
+eventually lead to the discovery of the additional constraint B before C.)
+
+Got that?
+
+Miscellaneous Notes
+-------------------
+
+1. It is easily proven that no deadlock will be missed due to our
+asynchronous invocation of deadlock checking.  A deadlock cycle in the WFG
+is formed when the last edge in the cycle is added; therefore the last
+process in the cycle to wait (the one from which that edge is outgoing) is
+certain to detect and resolve the cycle when it later runs CheckDeadLock.
+This holds even if that edge addition created multiple cycles; the process
+may indeed abort without ever noticing those additional cycles, but we
+don't particularly care.  The only other possible creation of deadlocks is
+during deadlock resolution's rearrangement of wait queues, and we already
+saw that that algorithm will prove that it creates no new deadlocks before
+it attempts to actually execute any rearrangement.
+
+2. It is not certain that a deadlock will be resolved by aborting the
+last-to-wait process.  If earlier waiters in the cycle have not yet run
+CheckDeadLock, then the first one to do so will be the victim.
+
+3. No live (wakable) process can be missed by ProcLockWakeup, since it
+examines every member of the wait queue (this was not true in the 7.0
+implementation, BTW).  Therefore, if ProcLockWakeup is always invoked
+after a lock is released or a wait queue is rearranged, there can be no
+failure to wake a wakable process.  One should also note that
+LockErrorCleanup (abort a waiter due to outside factors) must run
+ProcLockWakeup, in case the canceled waiter was soft-blocking other
+waiters.
+
+4. We can minimize excess rearrangement-trial work by being careful to
+scan the wait queue from the front when looking for soft edges.  For
+example, if we have queue order A,B,C and C has deadlock conflicts with
+both A and B, we want to generate the "C before A" constraint first,
+rather than wasting time with "C before B", which won't move C far
+enough up.  So we look for soft edges outgoing from C starting at the
+front of the wait queue.
+
+5. The working data structures needed by the deadlock detection code can
+be limited to numbers of entries computed from MaxBackends.  Therefore,
+we can allocate the worst-case space needed during backend startup. This
+seems a safer approach than trying to allocate workspace on the fly; we
+don't want to risk having the deadlock detector run out of memory, else
+we really have no guarantees at all that deadlock will be detected.
+
+6. We abuse the deadlock detector to implement autovacuum cancellation.
+When we run the detector and we find that there's an autovacuum worker
+involved in the waits-for graph, we store a pointer to its PGPROC, and
+return a special return code (unless a hard deadlock has been detected).
+The caller can then send a cancellation signal.  This implements the
+principle that autovacuum has a low locking priority (eg it must not block
+DDL on the table).
+
+Group Locking
+-------------
+
+As if all of that weren't already complicated enough, PostgreSQL now supports
+parallelism (see src/backend/access/transam/README.parallel), which means that
+we might need to resolve deadlocks that occur between gangs of related
+processes rather than individual processes.  This doesn't change the basic
+deadlock detection algorithm very much, but it makes the bookkeeping more
+complicated.
+
+We choose to regard locks held by processes in the same parallel group as
+non-conflicting with the exception of relation extension lock.  This means that
+two processes in a parallel group can hold a self-exclusive lock on the same
+relation at the same time, or one process can acquire an AccessShareLock while
+the other already holds AccessExclusiveLock.  This might seem dangerous and
+could be in some cases (more on that below), but if we didn't do this then
+parallel query would be extremely prone to self-deadlock.  For example, a
+parallel query against a relation on which the leader already had
+AccessExclusiveLock would hang, because the workers would try to lock the same
+relation and be blocked by the leader; yet the leader can't finish until it
+receives completion indications from all workers.  An undetected deadlock
+results.  This is far from the only scenario where such a problem happens.  The
+same thing will occur if the leader holds only AccessShareLock, the worker
+seeks AccessShareLock, but between the time the leader attempts to acquire the
+lock and the time the worker attempts to acquire it, some other process queues
+up waiting for an AccessExclusiveLock.  In this case, too, an indefinite hang
+results.
+
+It might seem that we could predict which locks the workers will attempt to
+acquire and ensure before going parallel that those locks would be acquired
+successfully.  But this is very difficult to make work in a general way.  For
+example, a parallel worker's portion of the query plan could involve an
+SQL-callable function which generates a query dynamically, and that query
+might happen to hit a table on which the leader happens to hold
+AccessExclusiveLock.  By imposing enough restrictions on what workers can do,
+we could eventually create a situation where their behavior can be adequately
+restricted, but these restrictions would be fairly onerous, and even then, the
+system required to decide whether the workers will succeed at acquiring the
+necessary locks would be complex and possibly buggy.
+
+So, instead, we take the approach of deciding that locks within a lock group
+do not conflict.  This eliminates the possibility of an undetected deadlock,
+but also opens up some problem cases: if the leader and worker try to do some
+operation at the same time which would ordinarily be prevented by the
+heavyweight lock mechanism, undefined behavior might result.  In practice, the
+dangers are modest.  The leader and worker share the same transaction,
+snapshot, and combo CID hash, and neither can perform any DDL or, indeed,
+write any data at all.  Thus, for either to read a table locked exclusively by
+the other is safe enough.  Problems would occur if the leader initiated
+parallelism from a point in the code at which it had some backend-private
+state that made table access from another process unsafe.  For example, after
+calling SetReindexProcessing and before calling ResetReindexProcessing,
+catastrophe could ensue, because the worker won't have that state.  Similarly,
+problems could occur with certain kinds of non-relation locks, such as
+GIN page locks.  It's no safer for two related processes to perform GIN
+cleanup at the same time than for unrelated processes to do the same.
+However, since parallel mode is strictly read-only at present, neither this
+nor most of the similar cases can arise at present.  To allow parallel writes,
+we'll either need to (1) further enhance the deadlock detector to handle those
+types of locks in a different way than other types; or (2) have parallel
+workers use some other mutual exclusion method for such cases.
+
+Group locking adds three new members to each PGPROC: lockGroupLeader,
+lockGroupMembers, and lockGroupLink. A PGPROC's lockGroupLeader is NULL for
+processes not involved in parallel query. When a process wants to cooperate
+with parallel workers, it becomes a lock group leader, which means setting
+this field to point to its own PGPROC. When a parallel worker starts up, it
+points this field at the leader. The lockGroupMembers field is only used in
+the leader; it is a list of the member PGPROCs of the lock group (the leader
+and all workers). The lockGroupLink field is the list link for this list.
+
+All three of these fields are considered to be protected by a lock manager
+partition lock.  The partition lock that protects these fields within a given
+lock group is chosen by taking the leader's pgprocno modulo the number of lock
+manager partitions.  This unusual arrangement has a major advantage: the
+deadlock detector can count on the fact that no lockGroupLeader field can
+change while the deadlock detector is running, because it knows that it holds
+all the lock manager locks.  Also, holding this single lock allows safe
+manipulation of the lockGroupMembers list for the lock group.
+
+We need an additional interlock when setting these fields, because a newly
+started parallel worker has to try to join the leader's lock group, but it
+has no guarantee that the group leader is still alive by the time it gets
+started.  We try to ensure that the parallel leader dies after all workers
+in normal cases, but also that the system could survive relatively intact
+if that somehow fails to happen.  This is one of the precautions against
+such a scenario: the leader relays its PGPROC and also its PID to the
+worker, and the worker fails to join the lock group unless the given PGPROC
+still has the same PID and is still a lock group leader.  We assume that
+PIDs are not recycled quickly enough for this interlock to fail.
+
+
+User Locks (Advisory Locks)
+---------------------------
+
+User locks are handled totally on the application side as long-term
+cooperative locks which may extend beyond the normal transaction boundaries.
+Their purpose is to indicate to an application that someone is "working"
+on an item.  So it is possible to put a user lock on a tuple's oid,
+retrieve the tuple, work on it for an hour and then update it and remove
+the lock.  While the lock is active other clients can still read and write
+the tuple but they can be aware that it has been locked at the application
+level by someone.
+
+User locks and normal locks are completely orthogonal and they don't
+interfere with each other.
+
+User locks can be acquired either at session level or transaction level.
+A session-level lock request is not automatically released at transaction
+end, but must be explicitly released by the application.  (However, any
+remaining locks are always released at session end.)  Transaction-level
+user lock requests behave the same as normal lock requests, in that they
+are released at transaction end and do not need explicit unlocking.
+
+Locking during Hot Standby
+--------------------------
+
+The Startup process is the only backend that can make changes during
+recovery; all other backends are read-only.  As a result the Startup
+process does not acquire locks on relations or objects except when the lock
+level is AccessExclusiveLock.
+
+Regular backends are only allowed to take locks on relations or objects
+at RowExclusiveLock or lower. This ensures that they do not conflict with
+each other or with the Startup process, unless AccessExclusiveLocks are
+requested by the Startup process.
+
+Deadlocks involving AccessExclusiveLocks are not possible, so we need
+not be concerned that a user initiated deadlock can prevent recovery from
+progressing.
+
+AccessExclusiveLocks on the primary node generate WAL records
+that are then applied by the Startup process. Locks are released at end
+of transaction just as they are in normal processing. These locks are
+held by the Startup process, acting as a proxy for the backends that
+originally acquired these locks. Again, these locks cannot conflict with
+one another, so the Startup process cannot deadlock itself either.
+
+Although deadlock is not possible, a regular backend's weak lock can
+prevent the Startup process from making progress in applying WAL, which is
+usually not something that should be tolerated for very long.  Mechanisms
+exist to forcibly cancel a regular backend's query if it blocks the
+Startup process for too long.
diff --git a/src/backend/storage/lmgr/README-SSI b/src/backend/storage/lmgr/README-SSI
new file mode 100644
index 0000000..50d2ecc
--- /dev/null
+++ b/src/backend/storage/lmgr/README-SSI
@@ -0,0 +1,646 @@
+src/backend/storage/lmgr/README-SSI
+
+Serializable Snapshot Isolation (SSI) and Predicate Locking
+===========================================================
+
+This code is in the lmgr directory because about 90% of it is an
+implementation of predicate locking, which is required for SSI,
+rather than being directly related to SSI itself.  When another use
+for predicate locking justifies the effort to tease these two things
+apart, this README file should probably be split.
+
+
+Credits
+-------
+
+This feature was developed by Kevin Grittner and Dan R. K. Ports,
+with review and suggestions from Joe Conway, Heikki Linnakangas, and
+Jeff Davis.  It is based on work published in these papers:
+
+	Michael J. Cahill, Uwe Röhm, and Alan D. Fekete. 2008.
+	Serializable isolation for snapshot databases.
+	In SIGMOD '08: Proceedings of the 2008 ACM SIGMOD
+	international conference on Management of data,
+	pages 729-738, New York, NY, USA. ACM.
+	http://doi.acm.org/10.1145/1376616.1376690
+
+	Michael James Cahill. 2009.
+	Serializable Isolation for Snapshot Databases.
+	Sydney Digital Theses.
+	University of Sydney, School of Information Technologies.
+	http://hdl.handle.net/2123/5353
+
+
+Overview
+--------
+
+With true serializable transactions, if you can show that your
+transaction will do the right thing if there are no concurrent
+transactions, it will do the right thing in any mix of serializable
+transactions or be rolled back with a serialization failure.  This
+feature has been implemented in PostgreSQL using SSI.
+
+
+Serializable and Snapshot Transaction Isolation Levels
+------------------------------------------------------
+
+Serializable transaction isolation is attractive for shops with
+active development by many programmers against a complex schema
+because it guarantees data integrity with very little staff time --
+if a transaction can be shown to always do the right thing when it is
+run alone (before or after any other transaction), it will always do
+the right thing in any mix of concurrent serializable transactions.
+Where conflicts with other transactions would result in an
+inconsistent state within the database or an inconsistent view of
+the data, a serializable transaction will block or roll back to
+prevent the anomaly. The SQL standard provides a specific SQLSTATE
+for errors generated when a transaction rolls back for this reason,
+so that transactions can be retried automatically.
+
+Before version 9.1, PostgreSQL did not support a full serializable
+isolation level. A request for serializable transaction isolation
+actually provided snapshot isolation. This has well known anomalies
+which can allow data corruption or inconsistent views of the data
+during concurrent transactions; although these anomalies only occur
+when certain patterns of read-write dependencies exist within a set
+of concurrent transactions. Where these patterns exist, the anomalies
+can be prevented by introducing conflicts through explicitly
+programmed locks or otherwise unnecessary writes to the database.
+Snapshot isolation is popular because performance is better than
+serializable isolation and the integrity guarantees which it does
+provide allow anomalies to be avoided or managed with reasonable
+effort in many environments.
+
+
+Serializable Isolation Implementation Strategies
+------------------------------------------------
+
+Techniques for implementing full serializable isolation have been
+published and in use in many database products for decades. The
+primary technique which has been used is Strict Two-Phase Locking
+(S2PL), which operates by blocking writes against data which has been
+read by concurrent transactions and blocking any access (read or
+write) against data which has been written by concurrent
+transactions. A cycle in a graph of blocking indicates a deadlock,
+requiring a rollback. Blocking and deadlocks under S2PL in high
+contention workloads can be debilitating, crippling throughput and
+response time.
+
+A new technique for implementing full serializable isolation in an
+MVCC database appears in the literature beginning in 2008. This
+technique, known as Serializable Snapshot Isolation (SSI), has many of
+the advantages of snapshot isolation. In particular, reads don't
+block anything and writes don't block reads. Essentially, it runs
+snapshot isolation but monitors the read-write conflicts between
+transactions to identify dangerous structures in the transaction
+graph which indicate that a set of concurrent transactions might
+produce an anomaly, and rolls back transactions to ensure that no
+anomalies occur. It will produce some false positives (where a
+transaction is rolled back even though there would not have been an
+anomaly), but will never let an anomaly occur. In the two known
+prototype implementations, performance for many workloads (even with
+the need to restart transactions which are rolled back) is very close
+to snapshot isolation and generally far better than an S2PL
+implementation.
+
+
+Apparent Serial Order of Execution
+----------------------------------
+
+One way to understand when snapshot anomalies can occur, and to
+visualize the difference between the serializable implementations
+described above, is to consider that among transactions executing at
+the serializable transaction isolation level, the results are
+required to be consistent with some serial (one-at-a-time) execution
+of the transactions [1]. How is that order determined in each?
+
+In S2PL, each transaction locks any data it accesses. It holds the
+locks until committing, preventing other transactions from making
+conflicting accesses to the same data in the interim. Some
+transactions may have to be rolled back to prevent deadlock. But
+successful transactions can always be viewed as having occurred
+sequentially, in the order they committed.
+
+With snapshot isolation, reads never block writes, nor vice versa, so
+more concurrency is possible. The order in which transactions appear
+to have executed is determined by something more subtle than in S2PL:
+read/write dependencies. If a transaction reads data, it appears to
+execute after the transaction that wrote the data it is reading.
+Similarly, if it updates data, it appears to execute after the
+transaction that wrote the previous version. These dependencies, which
+we call "wr-dependencies" and "ww-dependencies", are consistent with
+the commit order, because the first transaction must have committed
+before the second starts. However, there can also be dependencies
+between two *concurrent* transactions, i.e. where one was running when
+the other acquired its snapshot.  These "rw-conflicts" occur when one
+transaction attempts to read data which is not visible to it because
+the transaction which wrote it (or will later write it) is
+concurrent. The reading transaction appears to have executed first,
+regardless of the actual sequence of transaction starts or commits,
+because it sees a database state prior to that in which the other
+transaction leaves it.
+
+Anomalies occur when a cycle is created in the graph of dependencies:
+when a dependency or series of dependencies causes transaction A to
+appear to have executed before transaction B, but another series of
+dependencies causes B to appear before A. If that's the case, then
+the results can't be consistent with any serial execution of the
+transactions.
+
+
+SSI Algorithm
+-------------
+
+As of 9.1, serializable transactions in PostgreSQL are implemented using
+Serializable Snapshot Isolation (SSI), based on the work of Cahill
+et al. Fundamentally, this allows snapshot isolation to run as it
+previously did, while monitoring for conditions which could create a
+serialization anomaly.
+
+SSI is based on the observation [2] that each snapshot isolation
+anomaly corresponds to a cycle that contains a "dangerous structure"
+of two adjacent rw-conflict edges:
+
+      Tin ------> Tpivot ------> Tout
+            rw             rw
+
+SSI works by watching for this dangerous structure, and rolling
+back a transaction when needed to prevent any anomaly. This means it
+only needs to track rw-conflicts between concurrent transactions, not
+wr- and ww-dependencies. It also means there is a risk of false
+positives, because not every dangerous structure is embedded in an
+actual cycle.  The number of false positives is low in practice, so
+this represents an acceptable tradeoff for keeping the detection
+overhead low.
+
+The PostgreSQL implementation uses two additional optimizations:
+
+* Tout must commit before any other transaction in the cycle
+  (see proof of Theorem 2.1 of [2]). We only roll back a transaction
+  if Tout commits before Tpivot and Tin.
+
+* if Tin is read-only, there can only be an anomaly if Tout committed
+  before Tin takes its snapshot. This optimization is an original
+  one. Proof:
+
+  - Because there is a cycle, there must be some transaction T0 that
+    precedes Tin in the cycle. (T0 might be the same as Tout.)
+
+  - The edge between T0 and Tin can't be a rw-conflict or ww-dependency,
+    because Tin was read-only, so it must be a wr-dependency.
+    Those can only occur if T0 committed before Tin took its snapshot,
+    else Tin would have ignored T0's output.
+
+  - Because Tout must commit before any other transaction in the
+    cycle, it must commit before T0 commits -- and thus before Tin
+    starts.
+
+
+PostgreSQL Implementation
+-------------------------
+
+    * Since this technique is based on Snapshot Isolation (SI), those
+areas in PostgreSQL which don't use SI can't be brought under SSI.
+This includes system tables, temporary tables, sequences, hint bit
+rewrites, etc.  SSI cannot eliminate existing anomalies in these
+areas.
+
+    * Any transaction which is run at a transaction isolation level
+other than SERIALIZABLE will not be affected by SSI.  If you want to
+enforce business rules through SSI, all transactions should be run at
+the SERIALIZABLE transaction isolation level, and that should
+probably be set as the default.
+
+    * If all transactions are run at the SERIALIZABLE transaction
+isolation level, business rules can be enforced in triggers or
+application code without ever having a need to acquire an explicit
+lock or to use SELECT FOR SHARE or SELECT FOR UPDATE.
+
+    * Those who want to continue to use snapshot isolation without
+the additional protections of SSI (and the associated costs of
+enforcing those protections), can use the REPEATABLE READ transaction
+isolation level.  This level retains its legacy behavior, which
+is identical to the old SERIALIZABLE implementation and fully
+consistent with the standard's requirements for the REPEATABLE READ
+transaction isolation level.
+
+    * Performance under this SSI implementation will be significantly
+improved if transactions which don't modify permanent tables are
+declared to be READ ONLY before they begin reading data.
+
+    * Performance under SSI will tend to degrade more rapidly with a
+large number of active database transactions than under less strict
+isolation levels.  Limiting the number of active transactions through
+use of a connection pool or similar techniques may be necessary to
+maintain good performance.
+
+    * Any transaction which must be rolled back to prevent
+serialization anomalies will fail with SQLSTATE 40001, which has a
+standard meaning of "serialization failure".
+
+    * This SSI implementation makes an effort to choose the
+transaction to be canceled such that an immediate retry of the
+transaction will not fail due to conflicts with exactly the same
+transactions.  Pursuant to this goal, no transaction is canceled
+until one of the other transactions in the set of conflicts which
+could generate an anomaly has successfully committed.  This is
+conceptually similar to how write conflicts are handled.  To fully
+implement this guarantee there needs to be a way to roll back the
+active transaction for another process with a serialization failure
+SQLSTATE, even if it is "idle in transaction".
+
+
+Predicate Locking
+-----------------
+
+Both S2PL and SSI require some form of predicate locking to handle
+situations where reads conflict with later inserts or with later
+updates which move data into the selected range.  PostgreSQL didn't
+already have predicate locking, so it needed to be added to support
+full serializable transactions under either strategy. Practical
+implementations of predicate locking generally involve acquiring
+locks against data as it is accessed, using multiple granularities
+(tuple, page, table, etc.) with escalation as needed to keep the lock
+count to a number which can be tracked within RAM structures.  This
+approach was used in PostgreSQL.  Coarse granularities can cause some
+false positive indications of conflict. The number of false positives
+can be influenced by plan choice.
+
+
+Implementation overview
+-----------------------
+
+New RAM structures, inspired by those used to track traditional locks
+in PostgreSQL, but tailored to the needs of SIREAD predicate locking,
+are used.  These refer to physical objects actually accessed in the
+course of executing the query, to model the predicates through
+inference.  Anyone interested in this subject should review the
+Hellerstein, Stonebraker and Hamilton paper [3], along with the
+locking papers referenced from that and the Cahill papers.
+
+Because the SIREAD locks don't block, traditional locking techniques
+have to be modified.  Intent locking (locking higher level objects
+before locking lower level objects) doesn't work with non-blocking
+"locks" (which are, in some respects, more like flags than locks).
+
+A configurable amount of shared memory is reserved at postmaster
+start-up to track predicate locks. This size cannot be changed
+without a restart.
+
+To prevent resource exhaustion, multiple fine-grained locks may
+be promoted to a single coarser-grained lock as needed.
+
+An attempt to acquire an SIREAD lock on a tuple when the same
+transaction already holds an SIREAD lock on the page or the relation
+will be ignored. Likewise, an attempt to lock a page when the
+relation is locked will be ignored, and the acquisition of a coarser
+lock will result in the automatic release of all finer-grained locks
+it covers.
+
+
+Heap locking
+------------
+
+Predicate locks will be acquired for the heap based on the following:
+
+    * For a table scan, the entire relation will be locked.
+
+    * Each tuple read which is visible to the reading transaction
+will be locked, whether or not it meets selection criteria; except
+that there is no need to acquire an SIREAD lock on a tuple when the
+transaction already holds a write lock on any tuple representing the
+row, since a rw-conflict would also create a ww-dependency which
+has more aggressive enforcement and thus will prevent any anomaly.
+
+    * Modifying a heap tuple creates a rw-conflict with any transaction
+that holds a SIREAD lock on that tuple, or on the page or relation
+that contains it.
+
+    * Inserting a new tuple creates a rw-conflict with any transaction
+holding a SIREAD lock on the entire relation. It doesn't conflict with
+page-level locks, because page-level locks are only used to aggregate
+tuple locks. Unlike index page locks, they don't lock "gaps" on the page.
+
+
+Index AM implementations
+------------------------
+
+Since predicate locks only exist to detect writes which conflict with
+earlier reads, and heap tuple locks are acquired to cover all heap
+tuples actually read, including those read through indexes, the index
+tuples which were actually scanned are not of interest in themselves;
+we only care about their "new neighbors" -- later inserts into the
+index which would have been included in the scan had they existed at
+the time.  Conceptually, we want to lock the gaps between and
+surrounding index entries within the scanned range.
+
+Correctness requires that any insert into an index generates a
+rw-conflict with a concurrent serializable transaction if, after that
+insert, re-execution of any index scan of the other transaction would
+access the heap for a row not accessed during the previous execution.
+Note that a non-HOT update which expires an old index entry covered
+by the scan and adds a new entry for the modified row's new tuple
+need not generate a conflict, although an update which "moves" a row
+into the scan must generate a conflict.  While correctness allows
+false positives, they should be minimized for performance reasons.
+
+Several optimizations are possible, though not all are implemented yet:
+
+    * An index scan which is just finding the right position for an
+index insertion or deletion need not acquire a predicate lock.
+
+    * An index scan which is comparing for equality on the entire key
+for a unique index need not acquire a predicate lock as long as a key
+is found corresponding to a visible tuple which has not been modified
+by another transaction -- there are no "between or around" gaps to
+cover.
+
+    * As long as built-in foreign key enforcement continues to use
+its current "special tricks" to deal with MVCC issues, predicate
+locks should not be needed for scans done by enforcement code.
+
+    * If a search determines that no rows can be found regardless of
+index contents because the search conditions are contradictory (e.g.,
+x = 1 AND x = 2), then no predicate lock is needed.
+
+Other index AM implementation considerations:
+
+    * For an index AM that doesn't have support for predicate locking,
+we just acquire a predicate lock on the whole index for any search.
+
+    * B-tree index searches acquire predicate locks only on the
+index *leaf* pages needed to lock the appropriate index range. If,
+however, a search discovers that no root page has yet been created, a
+predicate lock on the index relation is required.
+
+    * Like B-tree searches, GIN searches acquire predicate locks only on
+the leaf pages of the entry tree. When performing an equality scan, if an
+entry has a posting tree, the posting tree root is locked instead, to
+lock only that key value. However, fastupdate=on postpones the
+insertion of tuples into the index structure by temporarily storing them
+in the pending list. That makes us unable to detect rw-conflicts using
+page-level locks. To cope with that, insertions to the pending list
+conflict with all scans.
+
+    * GiST searches can determine that there are no matches at any
+level of the index, so we acquire a predicate lock at each index
+level during a GiST search. An index insert at the leaf level can
+then be trusted to ripple up to all levels and locations where
+conflicting predicate locks may exist. In case there is a page split,
+we need to copy the predicate locks from the original page to all the
+new pages.
+
+    * Hash index searches acquire predicate locks on the primary
+page of a bucket. They acquire locks on both the old and new buckets
+for scans that happen concurrently with page splits. During a bucket
+split, a predicate lock is copied from the primary page of an old
+bucket to the primary page of a new bucket.
+
+    * The effects of page splits, overflows, consolidations, and
+removals must be carefully reviewed to ensure that predicate locks
+aren't "lost" during those operations, or kept with pages which could
+get re-used for different parts of the index.
+
+
+Innovations
+-----------
+
+The PostgreSQL implementation of Serializable Snapshot Isolation
+differs from what is described in the cited papers for several
+reasons:
+
+   1. PostgreSQL didn't have any existing predicate locking. It had
+to be added from scratch.
+
+   2. The existing in-memory lock structures were not suitable for
+tracking SIREAD locks.
+          * In PostgreSQL, tuple level locks are not held in RAM for
+any length of time; lock information is written to the tuples
+involved in the transactions.
+          * In PostgreSQL, existing lock structures have pointers to
+memory which is related to a session. SIREAD locks need to persist
+past the end of the originating transaction and even the session
+which ran it.
+          * PostgreSQL needs to be able to tolerate a large number of
+transactions executing while one long-running transaction stays open
+-- the in-RAM techniques discussed in the papers wouldn't support
+that.
+
+   3. Unlike the database products used for the prototypes described
+in the papers, PostgreSQL didn't already have a true serializable
+isolation level distinct from snapshot isolation.
+
+   4. PostgreSQL supports subtransactions -- an issue not mentioned
+in the papers.
+
+   5. PostgreSQL doesn't assign a transaction number to a database
+transaction until and unless necessary (normally, when the transaction
+attempts to modify data).
+
+   6. PostgreSQL has pluggable data types with user-definable
+operators, as well as pluggable index types, not all of which are
+based around data types which support ordering.
+
+   7. Some possible optimizations became apparent during development
+and testing.
+
+Differences from the implementation described in the papers are
+listed below.
+
+    * New structures needed to be created in shared memory to track
+the proper information for serializable transactions and their SIREAD
+locks.
+
+    * Because PostgreSQL does not have the same concept of an "oldest
+transaction ID" for all serializable transactions as assumed in the
+Cahill thesis, we track the oldest snapshot xmin among serializable
+transactions, and a count of how many active transactions use that
+xmin. When the count hits zero we find the new oldest xmin and run a
+clean-up based on that.
+
+    * Because reads in a subtransaction may cause that subtransaction
+to roll back, thereby affecting what is written by the top level
+transaction, predicate locks must survive a subtransaction rollback.
+As a consequence, all xid usage in SSI, including predicate locking,
+is based on the top level xid.  When looking at an xid that comes
+from a tuple's xmin or xmax, for example, we always call
+SubTransGetTopmostTransaction() before doing much else with it.
+
+    * PostgreSQL does not use "update in place" with a rollback log
+for its MVCC implementation.  Where possible it uses "HOT" updates on
+the same page (if there is room and no indexed value is changed).
+For non-HOT updates the old tuple is expired in place and a new tuple
+is inserted at a new location.  Because of this difference, a tuple
+lock in PostgreSQL doesn't automatically lock any other versions of a
+row.  We don't try to copy or expand a tuple lock to any other
+versions of the row, based on the following proof that any additional
+serialization failures we would get from that would be false
+positives:
+
+          o If transaction T1 reads a row version (thus acquiring a
+predicate lock on it) and a second transaction T2 updates that row
+version (thus creating a rw-conflict graph edge from T1 to T2), must a
+third transaction T3 which re-updates the new version of the row also
+have a rw-conflict in from T1 to prevent anomalies?  In other words,
+does it matter whether we recognize the edge T1 -> T3?
+
+          o If T1 has a conflict in, it certainly doesn't. Adding the
+edge T1 -> T3 would create a dangerous structure, but we already had
+one from the edge T1 -> T2, so we would have aborted something anyway.
+(T2 has already committed, else T3 could not have updated its output;
+but we would have aborted either T1 or T1's predecessor(s).  Hence
+no cycle involving T1 and T3 can survive.)
+
+          o Now let's consider the case where T1 doesn't have a
+rw-conflict in. If that's the case, for this edge T1 -> T3 to make a
+difference, T3 must have a rw-conflict out that induces a cycle in the
+dependency graph, i.e. a conflict out to some transaction preceding T1
+in the graph. (A conflict out to T1 itself would be problematic too,
+but that would mean T1 has a conflict in, the case we already
+eliminated.)
+
+          o So now we're trying to figure out if there can be an
+rw-conflict edge T3 -> T0, where T0 is some transaction that precedes
+T1. For T0 to precede T1, there has to be some edge, or sequence of
+edges, from T0 to T1. At least the last edge has to be a wr-dependency
+or ww-dependency rather than a rw-conflict, because T1 doesn't have a
+rw-conflict in. And that gives us enough information about the order
+of transactions to see that T3 can't have a rw-conflict to T0:
+ - T0 committed before T1 started (the wr/ww-dependency implies this)
+ - T1 started before T2 committed (the T1->T2 rw-conflict implies this)
+ - T2 committed before T3 started (otherwise, T3 would get aborted
+                                   because of an update conflict)
+
+          o That means T0 committed before T3 started, and therefore
+there can't be a rw-conflict from T3 to T0.
+
+          o So in all cases, we don't need the T1 -> T3 edge to
+recognize cycles.  Therefore it's not necessary for T1's SIREAD lock
+on the original tuple version to cover later versions as well.
+
+    * Predicate locking in PostgreSQL starts at the tuple level
+when possible. Multiple fine-grained locks are promoted to a single
+coarser-granularity lock as needed to avoid resource exhaustion.  The
+amount of memory used for these structures is configurable, to balance
+RAM usage against SIREAD lock granularity.
+
+    * Each backend keeps a process-local table of the locks it holds.
+To support granularity promotion decisions with low CPU and locking
+overhead, this table also includes the coarser covering locks and the
+number of finer-granularity locks they cover.
+
+    * Conflicts are identified by looking for predicate locks
+when tuples are written, and by looking at the MVCC information when
+tuples are read. There is no matching between two RAM-based locks.
+
+    * Because write locks are stored in the heap tuples rather than a
+RAM-based lock table, the optimization described in the Cahill thesis
+which eliminates an SIREAD lock where there is a write lock is
+implemented by the following:
+         1. When checking a heap write for conflicts against existing
+predicate locks, a tuple lock on the tuple being written is removed.
+         2. When acquiring a predicate lock on a heap tuple, we
+return quickly without doing anything if it is a tuple written by the
+reading transaction.
+
+    * Rather than using conflictIn and conflictOut pointers which use
+NULL to indicate no conflict and a self-reference to indicate
+multiple conflicts or conflicts with committed transactions, we use a
+list of rw-conflicts. With the more complete information, false
+positives are reduced and we have sufficient data for more aggressive
+clean-up and other optimizations:
+
+          o We can avoid ever rolling back a transaction until and
+unless there is a pivot where a transaction on the conflict *out*
+side of the pivot committed before either of the other transactions.
+
+          o We can avoid ever rolling back a transaction when the
+transaction on the conflict *in* side of the pivot is explicitly or
+implicitly READ ONLY unless the transaction on the conflict *out*
+side of the pivot committed before the READ ONLY transaction acquired
+its snapshot. (An implicit READ ONLY transaction is one which
+committed without writing, even though it was not explicitly declared
+to be READ ONLY.)
+
+          o We can more aggressively clean up conflicts, predicate
+locks, and SSI transaction information.
+
+    * We allow a READ ONLY transaction to "opt out" of SSI if there are
+no READ WRITE transactions which could cause the READ ONLY
+transaction to ever become part of a "dangerous structure" of
+overlapping transaction dependencies.
+
+    * We allow the user to request that a READ ONLY transaction wait
+until the conditions are right for it to start in the "opt out" state
+described above. We add a DEFERRABLE state to transactions, which is
+specified and maintained in a way similar to READ ONLY. It is
+ignored for transactions that are not SERIALIZABLE and READ ONLY.
+
+    * When a transaction must be rolled back, we pick among the
+active transactions such that an immediate retry will not fail again
+on conflicts with the same transactions.
+
+    * We use the PostgreSQL SLRU system to hold summarized
+information about older committed transactions to put an upper bound
+on RAM used. Beyond that limit, information spills to disk.
+Performance can degrade in a pessimal situation, but it should be
+tolerable, and transactions won't need to be canceled or blocked
+from starting.
+
+
+R&D Issues
+----------
+
+This is intended to be the place to record specific issues which need
+more detailed review or analysis.
+
+    * WAL file replay. While serializable implementations using S2PL
+can guarantee that the write-ahead log contains commits in a sequence
+consistent with some serial execution of serializable transactions,
+SSI cannot make that guarantee. While the WAL replay is no less
+consistent than under snapshot isolation, it is possible that under
+PITR recovery or hot standby a database could reach a readable state
+where some transactions appear before other transactions which would
+have had to precede them to maintain serializable consistency. In
+essence, if we do nothing, WAL replay will be at snapshot isolation
+even for serializable transactions. Is this OK? If not, how do we
+address it?
+
+    * External replication. Look at how this impacts external
+replication solutions, like Postgres-R, Slony, pgpool, HS/SR, etc.
+This is related to the "WAL file replay" issue.
+
+    * UNIQUE btree search for equality on all columns. Since a search
+of a UNIQUE index using equality tests on all columns will lock the
+heap tuple if an entry is found, it appears that there is no need to
+get a predicate lock on the index in that case. A predicate lock is
+still needed for such a search if a matching index entry which points
+to a visible tuple is not found.
+
+    * Minimize touching of shared memory. Should lists in shared
+memory push entries which have just been returned to the front of the
+available list, so they will be popped back off soon and some memory
+might never be touched, or should we keep adding returned items to
+the end of the available list?
+
+
+References
+----------
+
+[1] http://www.contrib.andrew.cmu.edu/~shadow/sql/sql1992.txt
+Search for serial execution to find the relevant section.
+
+[2] A. Fekete et al. Making Snapshot Isolation Serializable. In ACM
+Transactions on Database Systems 30:2, Jun. 2005.
+http://dx.doi.org/10.1145/1071610.1071615
+
+[3] Joseph M. Hellerstein, Michael Stonebraker and James Hamilton. 2007.
+Architecture of a Database System. Foundations and Trends(R) in
+Databases Vol. 1, No. 2 (2007) 141-259.
+http://db.cs.berkeley.edu/papers/fntdb07-architecture.pdf
+  Of particular interest:
+    * 6.1 A Note on ACID
+    * 6.2 A Brief Review of Serializability
+    * 6.3 Locking and Latching
+    * 6.3.1 Transaction Isolation Levels
+    * 6.5.3 Next-Key Locking: Physical Surrogates for Logical Properties
diff --git a/src/backend/storage/lmgr/README.barrier b/src/backend/storage/lmgr/README.barrier
new file mode 100644
index 0000000..f78e5ac
--- /dev/null
+++ b/src/backend/storage/lmgr/README.barrier
@@ -0,0 +1,197 @@
+Memory Barriers
+===============
+
+Modern CPUs make extensive use of pipe-lining and out-of-order execution,
+meaning that the CPU is often executing more than one instruction at a
+time, and not necessarily in the order that the source code would suggest.
+Furthermore, even before the CPU gets a chance to reorder operations, the
+compiler may (and often does) reorganize the code for greater efficiency,
+particularly at higher optimization levels.  Optimizing compilers and
+out-of-order execution are both critical for good performance, but they
+can lead to surprising results when multiple processes access the same
+memory space.
+
+Example
+=======
+
+Suppose x is a pointer to a structure stored in shared memory, and that the
+entire structure has been initialized to zero bytes.  One backend executes
+the following code fragment:
+
+    x->foo = 1;
+    x->bar = 1;
+
+Meanwhile, at approximately the same time, another backend executes this
+code fragment:
+
+    bar = x->bar;
+    foo = x->foo;
+
+The second backend might end up with foo = 1 and bar = 1 (if it executes
+both statements after the first backend), or with foo = 0 and bar = 0 (if
+it executes both statements before the first backend), or with foo = 1 and
+bar = 0 (if the first backend executes the first statement, the second
+backend executes both statements, and then the first backend executes the
+second statement).
+
+Surprisingly, however, the second backend could also end up with foo = 0
+and bar = 1.  The compiler might swap the order of the two stores performed
+by the first backend, or the two loads performed by the second backend.
+Even if it doesn't, on a machine with weak memory ordering (such as PowerPC
+or ARM) the CPU might choose to execute either the loads or the stores
+out of order.  This surprising result can lead to bugs.
+
+A common pattern where this actually does result in a bug is when adding items
+onto a queue.  The writer does this:
+
+    q->items[q->num_items] = new_item;
+    ++q->num_items;
+
+The reader does this:
+
+    num_items = q->num_items;
+    for (i = 0; i < num_items; ++i)
+        /* do something with q->items[i] */
+
+This code turns out to be unsafe, because the writer might increment
+q->num_items before it finishes storing the new item into the appropriate slot.
+More subtly, the reader might prefetch the contents of the q->items array
+before reading q->num_items.  Thus, there's still a bug here *even if the
+writer does everything in the order we expect*.  We need the writer to update
+the array before bumping the item counter, and the reader to examine the item
+counter before examining the array.
+
+Note that these types of highly counterintuitive bugs can *only* occur when
+multiple processes are interacting with the same memory segment.  A given
+process always perceives its *own* writes to memory in program order.
+
+Avoiding Memory Ordering Bugs
+=============================
+
+The simplest (and often best) way to avoid memory ordering bugs is to
+protect the data structures involved with an lwlock.  For more details, see
+src/backend/storage/lmgr/README.  For instance, in the above example, the
+writer could acquire an lwlock in exclusive mode before appending to the
+queue, and each reader could acquire the same lock in shared mode before
+reading it.  If the data structure is not heavily trafficked, this solution is
+generally entirely adequate.
+
+However, in some cases, it is desirable to avoid the overhead of acquiring
+and releasing locks.  In this case, memory barriers may be used to ensure
+that the apparent order of execution is as the programmer desires.   In
+PostgreSQL backend code, the pg_memory_barrier() macro may be used to achieve
+this result.  In the example above, we can prevent the reader from seeing a
+garbage value by having the writer do this:
+
+    q->items[q->num_items] = new_item;
+    pg_memory_barrier();
+    ++q->num_items;
+
+And by having the reader do this:
+
+    num_items = q->num_items;
+    pg_memory_barrier();
+    for (i = 0; i < num_items; ++i)
+        /* do something with q->items[i] */
+
+The pg_memory_barrier() macro will (1) prevent the compiler from rearranging
+the code in such a way as to allow the memory accesses to occur out of order
+and (2) generate any code (often, inline assembly) that is needed to prevent
+the CPU from executing the memory accesses out of order.  Specifically, the
+barrier prevents loads and stores written after the barrier from being
+performed before the barrier, and vice-versa.
+
+Although this code will work, it is needlessly inefficient.  On systems with
+strong memory ordering (such as x86), the CPU never reorders loads with other
+loads, nor stores with other stores.  It can, however, allow a load to be
+performed before an earlier store.  To avoid emitting unnecessary memory
+instructions, we provide two additional primitives: pg_read_barrier(), and
+pg_write_barrier().  When a memory barrier is being used to separate two
+loads, use pg_read_barrier(); when it is separating two stores, use
+pg_write_barrier(); when it is separating a load and a store (in either
+order), use pg_memory_barrier().  pg_memory_barrier() can always substitute
+for either a read or a write barrier, but is typically more expensive, and
+therefore should be used only when needed.
+
+With these guidelines in mind, the writer can do this:
+
+    q->items[q->num_items] = new_item;
+    pg_write_barrier();
+    ++q->num_items;
+
+And the reader can do this:
+
+    num_items = q->num_items;
+    pg_read_barrier();
+    for (i = 0; i < num_items; ++i)
+        /* do something with q->items[i] */
+
+On machines with strong memory ordering, these weaker barriers will simply
+prevent compiler rearrangement, without emitting any actual machine code.
+On machines with weak memory ordering, they will prevent compiler
+reordering and also emit whatever hardware barrier may be required.  Even
+on machines with weak memory ordering, a read or write barrier may be able
+to use a less expensive instruction than a full barrier.
+
+Weaknesses of Memory Barriers
+=============================
+
+While memory barriers are a powerful tool, and much cheaper than locks, they
+are also much less capable than locks.  Here are some of the problems.
+
+1. Concurrent writers are unsafe.  In the above example of a queue, using
+memory barriers doesn't make it safe for two processes to add items to the
+same queue at the same time.  If more than one process can write to the queue,
+a spinlock or lwlock must be used to synchronize access. The readers can
+perhaps proceed without any lock, but the writers may not.
+
+Even very simple write operations often require additional synchronization.
+For example, it's not safe for multiple writers to simultaneously execute
+this code (supposing x is a pointer into shared memory):
+
+    x->foo++;
+
+Although this may compile down to a single machine-language instruction,
+the CPU will execute that instruction by reading the current value of foo,
+adding one to it, and then storing the result back to the original address.
+If two CPUs try to do this simultaneously, both may do their reads before
+either one does their writes.  Such a case could be made safe by using an
+atomic variable and an atomic add.  See port/atomics.h.
+
+2. Eight-byte loads and stores aren't necessarily atomic.  We assume in
+various places in the source code that an aligned four-byte load or store is
+atomic, and that other processes therefore won't see a half-set value.
+Sadly, the same can't be said for eight-byte values: on some platforms, an
+aligned eight-byte load or store will generate two four-byte operations.  If
+you need an atomic eight-byte read or write, you must either serialize access
+with a lock or use an atomic variable.
+
+3. No ordering guarantees.  While memory barriers ensure that any given
+process performs loads and stores to shared memory in order, they don't
+guarantee synchronization.  In the queue example above, we can use memory
+barriers to be sure that readers won't see garbage, but there's nothing to
+say whether a given reader will run before or after a given writer.  If this
+matters in a given situation, some other mechanism must be used instead of
+or in addition to memory barriers.
+
+4. Barrier proliferation.  Many algorithms that at first seem appealing
+require multiple barriers.  If the number of barriers required is more than
+one or two, you may be better off just using a lock.  Keep in mind that, on
+some platforms, a barrier may be implemented by acquiring and releasing a
+backend-private spinlock.  This may be better than a centralized lock under
+contention, but it may also be slower in the uncontended case.
+
+Further Reading
+===============
+
+Much of the documentation about memory barriers appears to be quite
+Linux-specific.  The following papers may be helpful:
+
+Memory Ordering in Modern Microprocessors, by Paul E. McKenney
+* http://www.rdrop.com/users/paulmck/scalability/paper/ordering.2007.09.19a.pdf
+
+Memory Barriers: a Hardware View for Software Hackers, by Paul E. McKenney
+* http://www.rdrop.com/users/paulmck/scalability/paper/whymb.2010.06.07c.pdf
+
+The Linux kernel also has some useful documentation on this topic.  Start
+with Documentation/memory-barriers.txt
diff --git a/src/backend/storage/lmgr/condition_variable.c b/src/backend/storage/lmgr/condition_variable.c
new file mode 100644
index 0000000..910a768
--- /dev/null
+++ b/src/backend/storage/lmgr/condition_variable.c
@@ -0,0 +1,360 @@
+/*-------------------------------------------------------------------------
+ *
+ * condition_variable.c
+ *	  Implementation of condition variables.  Condition variables provide
+ *	  a way for one process to wait until a specific condition occurs,
+ *	  without needing to know the specific identity of the process for
+ *	  which they are waiting.  Waits for condition variables can be
+ *	  interrupted, unlike LWLock waits.  Condition variables are safe
+ *	  to use within dynamic shared memory segments.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/storage/lmgr/condition_variable.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "portability/instr_time.h"
+#include "storage/condition_variable.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "storage/proclist.h"
+#include "storage/spin.h"
+#include "utils/memutils.h"
+
+/* Initially, we are not prepared to sleep on any condition variable. */
+static ConditionVariable *cv_sleep_target = NULL;
+
+/*
+ * Initialize a condition variable: set up its mutex and its wakeup list.
+ */
+void
+ConditionVariableInit(ConditionVariable *cv)
+{
+	SpinLockInit(&cv->mutex);
+	proclist_init(&cv->wakeup);
+}
+
+/*
+ * Prepare to wait on a given condition variable.
+ *
+ * This can optionally be called before entering a test/sleep loop.
+ * Doing so is more efficient if we'll need to sleep at least once.
+ * However, if the first test of the exit condition is likely to succeed,
+ * it's more efficient to omit the ConditionVariablePrepareToSleep call.
+ * See comments in ConditionVariableSleep for more detail.
+ *
+ * Caution: "before entering the loop" means you *must* test the exit
+ * condition between calling ConditionVariablePrepareToSleep and calling
+ * ConditionVariableSleep.  If that is inconvenient, omit calling
+ * ConditionVariablePrepareToSleep.
+ */
+void
+ConditionVariablePrepareToSleep(ConditionVariable *cv)
+{
+	int			pgprocno = MyProc->pgprocno;
+
+	/*
+	 * If some other sleep is already prepared, cancel it; this is necessary
+	 * because we have just one static variable tracking the prepared sleep,
+	 * and also only one cvWaitLink in our PGPROC.  It's okay to do this
+	 * because whenever control does return to the other test-and-sleep loop,
+	 * its ConditionVariableSleep call will just re-establish that sleep as
+	 * the prepared one.
+	 */
+	if (cv_sleep_target != NULL)
+		ConditionVariableCancelSleep();
+
+	/* Record the condition variable on which we will sleep. */
+	cv_sleep_target = cv;
+
+	/* Add myself to the tail of the wait queue, holding the CV's spinlock. */
+	SpinLockAcquire(&cv->mutex);
+	proclist_push_tail(&cv->wakeup, pgprocno, cvWaitLink);
+	SpinLockRelease(&cv->mutex);
+}
+
+/*
+ * Wait, with no timeout, for the given condition variable to be signaled.
+ *
+ * This should be called in a predicate loop that tests for a specific exit
+ * condition and otherwise sleeps, like so:
+ *
+ *	 ConditionVariablePrepareToSleep(cv);  // optional
+ *	 while (condition for which we are waiting is not true)
+ *		 ConditionVariableSleep(cv, wait_event_info);
+ *	 ConditionVariableCancelSleep();
+ *
+ * wait_event_info should be a value from one of the WaitEventXXX enums
+ * defined in pgstat.h.  This controls the contents of pg_stat_activity's
+ * wait_event_type and wait_event columns while waiting.
+ */
+void
+ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
+{
+	(void) ConditionVariableTimedSleep(cv, -1 /* no timeout */ ,
+									   wait_event_info);
+}
+
+/*
+ * Wait for a condition variable to be signaled or a timeout to be reached.
+ *
+ * Returns true when timeout (in ms; negative = none) expires, else false.
+ *
+ * See ConditionVariableSleep() for general usage.
+ */
+bool
+ConditionVariableTimedSleep(ConditionVariable *cv, long timeout,
+							uint32 wait_event_info)
+{
+	long		cur_timeout = -1;
+	instr_time	start_time;
+	instr_time	cur_time;
+	int			wait_events;
+
+	/*
+	 * If the caller didn't prepare to sleep explicitly, then do so now and
+	 * return immediately.  The caller's predicate loop should immediately
+	 * call again if its exit condition is not yet met.  This will result in
+	 * the exit condition being tested twice before we first sleep.  The extra
+	 * test can be prevented by calling ConditionVariablePrepareToSleep(cv)
+	 * first.  Whether it's worth doing that depends on whether you expect the
+	 * exit condition to be met initially, in which case skipping the prepare
+	 * is recommended because it avoids manipulations of the wait list, or not
+	 * met initially, in which case preparing first is better because it
+	 * avoids one extra test of the exit condition.
+	 *
+	 * If we are currently prepared to sleep on some other CV, we just cancel
+	 * that and prepare this one; see ConditionVariablePrepareToSleep.
+	 */
+	if (cv_sleep_target != cv)
+	{
+		ConditionVariablePrepareToSleep(cv);
+		return false;
+	}
+
+	/*
+	 * Record the current time so that we can calculate the remaining timeout
+	 * if we are woken up spuriously.
+	 */
+	if (timeout >= 0)
+	{
+		INSTR_TIME_SET_CURRENT(start_time);
+		Assert(timeout >= 0 && timeout <= INT_MAX);
+		cur_timeout = timeout;
+		wait_events = WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH;
+	}
+	else
+		wait_events = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH;
+
+	while (true)
+	{
+		bool		done = false;
+
+		/*
+		 * Wait for latch to be set.  (If we're awakened for some other
+		 * reason, the code below will cope anyway.)
+		 */
+		(void) WaitLatch(MyLatch, wait_events, cur_timeout, wait_event_info);
+
+		/* Reset latch before examining the state of the wait list. */
+		ResetLatch(MyLatch);
+
+		/*
+		 * If this process has been taken out of the wait list, then we know
+		 * that it has been signaled by ConditionVariableSignal (or
+		 * ConditionVariableBroadcast), so we should return to the caller. But
+		 * that doesn't guarantee that the exit condition is met, only that we
+		 * ought to check it.  So we must put the process back into the wait
+		 * list, to ensure we don't miss any additional wakeup occurring while
+		 * the caller checks its exit condition.  We can take ourselves out of
+		 * the wait list only when the caller calls
+		 * ConditionVariableCancelSleep.
+		 *
+		 * If we're still in the wait list, then the latch must have been set
+		 * by something other than ConditionVariableSignal; though we don't
+		 * guarantee not to return spuriously, we'll avoid this obvious case.
+		 */
+		SpinLockAcquire(&cv->mutex);
+		if (!proclist_contains(&cv->wakeup, MyProc->pgprocno, cvWaitLink))
+		{
+			done = true;
+			proclist_push_tail(&cv->wakeup, MyProc->pgprocno, cvWaitLink);
+		}
+		SpinLockRelease(&cv->mutex);
+
+		/*
+		 * Check for interrupts, and return spuriously if that caused the
+		 * current sleep target to change (meaning that interrupt handler code
+		 * waited for a different condition variable).
+		 */
+		CHECK_FOR_INTERRUPTS();
+		if (cv != cv_sleep_target)
+			done = true;
+
+		/* Signaled, or sleep target changed: return so caller can recheck */
+		if (done)
+			return false;
+
+		/* If we're not done, update cur_timeout for next iteration */
+		if (timeout >= 0)
+		{
+			INSTR_TIME_SET_CURRENT(cur_time);
+			INSTR_TIME_SUBTRACT(cur_time, start_time);
+			cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
+
+			/* Have we crossed the timeout threshold? */
+			if (cur_timeout <= 0)
+				return true;
+		}
+	}
+}
+
+/*
+ * Cancel any pending sleep operation.
+ *
+ * We just need to remove ourselves from the wait queue of any condition
+ * variable for which we have previously prepared a sleep.
+ *
+ * Do nothing if nothing is pending; this allows this function to be called
+ * during transaction abort to clean up any unfinished CV sleep.
+ *
+ * Return true if we've been signaled (i.e., already off the wait queue).
+ */
+bool
+ConditionVariableCancelSleep(void)
+{
+	ConditionVariable *cv = cv_sleep_target;
+	bool		signaled = false;
+
+	if (cv == NULL)
+		return false;
+
+	SpinLockAcquire(&cv->mutex);
+	if (proclist_contains(&cv->wakeup, MyProc->pgprocno, cvWaitLink))
+		proclist_delete(&cv->wakeup, MyProc->pgprocno, cvWaitLink);
+	else
+		signaled = true;
+	SpinLockRelease(&cv->mutex);
+
+	cv_sleep_target = NULL;
+
+	return signaled;
+}
+
+/*
+ * Wake up the oldest process sleeping on the CV, if there is any.
+ *
+ * Note: it's difficult to tell whether this has any real effect: we know
+ * whether we took an entry off the list, but the entry might only be a
+ * sentinel.  Hence, think twice before proposing that this should return
+ * a flag telling whether it woke somebody.
+ */
+void
+ConditionVariableSignal(ConditionVariable *cv)
+{
+	PGPROC	   *proc = NULL;
+
+	/* Remove the first process from the wakeup queue (if any). */
+	SpinLockAcquire(&cv->mutex);
+	if (!proclist_is_empty(&cv->wakeup))
+		proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink);
+	SpinLockRelease(&cv->mutex);
+
+	/* If we dequeued a process, set its latch (outside the spinlock). */
+	if (proc != NULL)
+		SetLatch(&proc->procLatch);
+}
+
+/*
+ * Wake up all processes sleeping on the given CV.
+ *
+ * This guarantees to wake all processes that were sleeping on the CV
+ * at time of call, but processes that add themselves to the list mid-call
+ * will typically not get awakened.
+ */
+void
+ConditionVariableBroadcast(ConditionVariable *cv)
+{
+	int			pgprocno = MyProc->pgprocno;
+	PGPROC	   *proc = NULL;
+	bool		have_sentinel = false;
+
+	/*
+	 * In some use-cases, it is common for awakened processes to immediately
+	 * re-queue themselves.  If we just naively try to reduce the wakeup list
+	 * to empty, we'll get into a potentially-indefinite loop against such a
+	 * process.  The semantics we really want are just to be sure that we have
+	 * wakened all processes that were in the list at entry.  We can use our
+	 * own cvWaitLink as a sentinel to detect when we've finished.
+	 *
+	 * A seeming flaw in this approach is that someone else might signal the
+	 * CV and in doing so remove our sentinel entry.  But that's fine: since
+	 * CV waiters are always added and removed in order, that must mean that
+	 * every previous waiter has been wakened, so we're done.  We'll get an
+	 * extra "set" on our latch from someone else's signal, which is
+	 * slightly inefficient but harmless.
+	 *
+	 * We can't insert our cvWaitLink as a sentinel if it's already in use in
+	 * some other proclist.  While that's not expected to be true for typical
+	 * uses of this function, we can deal with it by simply canceling any
+	 * prepared CV sleep.  The next call to ConditionVariableSleep will take
+	 * care of re-establishing the lost state.
+	 */
+	if (cv_sleep_target != NULL)
+		ConditionVariableCancelSleep();
+
+	/*
+	 * Inspect the state of the queue.  If it's empty, we have nothing to do.
+	 * If there's exactly one entry, we need only remove and signal that
+	 * entry.  Otherwise, remove the first entry and insert our sentinel.
+	 */
+	SpinLockAcquire(&cv->mutex);
+	/* While we're here, let's assert we're not in the list. */
+	Assert(!proclist_contains(&cv->wakeup, pgprocno, cvWaitLink));
+
+	if (!proclist_is_empty(&cv->wakeup))
+	{
+		proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink);
+		if (!proclist_is_empty(&cv->wakeup))
+		{
+			proclist_push_tail(&cv->wakeup, pgprocno, cvWaitLink);
+			have_sentinel = true;
+		}
+	}
+	SpinLockRelease(&cv->mutex);
+
+	/* Awaken first waiter, if there was one. */
+	if (proc != NULL)
+		SetLatch(&proc->procLatch);
+
+	while (have_sentinel)
+	{
+		/*
+		 * Each time through the loop, remove the first wakeup list entry, and
+		 * signal it unless it's our sentinel.  Repeat as long as the sentinel
+		 * remains in the list.
+		 *
+		 * Notice that if someone else removes our sentinel, we will waken one
+		 * additional process before exiting.  That's intentional, because if
+		 * someone else signals the CV, they may be intending to waken some
+		 * third process that added itself to the list after we added the
+		 * sentinel.  Better to give a spurious wakeup (which should be
+		 * harmless beyond wasting some cycles) than to lose a wakeup.
+		 */
+		proc = NULL;
+		SpinLockAcquire(&cv->mutex);
+		if (!proclist_is_empty(&cv->wakeup))
+			proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink);
+		have_sentinel = proclist_contains(&cv->wakeup, pgprocno, cvWaitLink);
+		SpinLockRelease(&cv->mutex);
+
+		if (proc != NULL && proc != MyProc)
+			SetLatch(&proc->procLatch);
+	}
+}
diff --git a/src/backend/storage/lmgr/deadlock.c b/src/backend/storage/lmgr/deadlock.c
new file mode 100644
index 0000000..2bdd20b
--- /dev/null
+++ b/src/backend/storage/lmgr/deadlock.c
@@ -0,0 +1,1159 @@
+/*-------------------------------------------------------------------------
+ *
+ * deadlock.c
+ *	  POSTGRES deadlock detection code
+ *
+ * See src/backend/storage/lmgr/README for a description of the deadlock
+ * detection and resolution algorithms.
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/lmgr/deadlock.c
+ *
+ *	Interface:
+ *
+ *	DeadLockCheck()
+ *	DeadLockReport()
+ *	RememberSimpleDeadLock()
+ *	InitDeadLockChecking()
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "utils/memutils.h"
+
+
+/*
+ * One edge in the waits-for graph.
+ *
+ * waiter and blocker may or may not be members of a lock group, but if either
+ * is, it will be the leader rather than any other member of the lock group.
+ * The group leaders act as representatives of the whole group even though
+ * those particular processes need not be waiting at all.  There will be at
+ * least one member of the waiter's lock group on the wait queue for the given
+ * lock, maybe more.
+ *
+ * pred and link are scratch fields written by TopoSort.  pred is the
+ * topoProcs[] index of the waiter's representative on the queue being sorted;
+ * link chains this edge into its blocker's after-constraint list, stored as a
+ * 1-based index into the constraint array with 0 terminating the list.
+ */
+typedef struct
+{
+	PGPROC	   *waiter;			/* the leader of the waiting lock group */
+	PGPROC	   *blocker;		/* the leader of the group it is waiting for */
+	LOCK	   *lock;			/* the lock being waited for */
+	int			pred;			/* workspace for TopoSort */
+	int			link;			/* workspace for TopoSort */
+} EDGE;
+
+/*
+ * One potential reordering of a lock's wait queue.  ExpandConstraints builds
+ * these in waitOrders[]; procs points into the shared waitOrderProcs[] array.
+ */
+typedef struct
+{
+	LOCK	   *lock;			/* the lock whose wait queue is described */
+	PGPROC	  **procs;			/* array of PGPROC *'s in new wait order */
+	int			nProcs;			/* number of entries in procs[] */
+} WAIT_ORDER;
+
+/*
+ * Information saved about each edge in a detected deadlock cycle.  This
+ * is used to print a diagnostic message upon failure.
+ *
+ * Note: because we want to examine this info after releasing the lock
+ * manager's partition locks, we can't just store LOCK and PGPROC pointers;
+ * we must extract out all the info we want to be able to print.
+ *
+ * Entries live in deadlockDetails[] and are indexed by the recursion depth
+ * at which the edge was followed; nDeadlockDetails holds the cycle length.
+ */
+typedef struct
+{
+	LOCKTAG		locktag;		/* ID of awaited lock object */
+	LOCKMODE	lockmode;		/* type of lock we're waiting for */
+	int			pid;			/* PID of blocked backend */
+} DEADLOCK_INFO;
+
+
+static bool DeadLockCheckRecurse(PGPROC *proc);
+static int	TestConfiguration(PGPROC *startProc);
+static bool FindLockCycle(PGPROC *checkProc,
+						  EDGE *softEdges, int *nSoftEdges);
+static bool FindLockCycleRecurse(PGPROC *checkProc, int depth,
+								 EDGE *softEdges, int *nSoftEdges);
+static bool FindLockCycleRecurseMember(PGPROC *checkProc,
+									   PGPROC *checkProcLeader,
+									   int depth, EDGE *softEdges, int *nSoftEdges);
+static bool ExpandConstraints(EDGE *constraints, int nConstraints);
+static bool TopoSort(LOCK *lock, EDGE *constraints, int nConstraints,
+					 PGPROC **ordering);
+
+#ifdef DEBUG_DEADLOCK
+static void PrintLockQueue(LOCK *lock, const char *info);
+#endif
+
+
+/*
+ * Working space for the deadlock detector
+ *
+ * All of the arrays below are allocated once by InitDeadLockChecking and
+ * reused by every subsequent check.
+ */
+
+/* Workspace for FindLockCycle */
+static PGPROC **visitedProcs;	/* Array of visited procs */
+static int	nVisitedProcs;		/* number of valid visitedProcs[] entries */
+
+/* Workspace for TopoSort */
+static PGPROC **topoProcs;		/* Array of not-yet-output procs (shares
+								 * storage with visitedProcs) */
+static int *beforeConstraints;	/* Counts of remaining before-constraints; -1
+								 * marks a non-representative group member */
+static int *afterConstraints;	/* List head for after-constraints, as a
+								 * 1-based constraint index; 0 = empty */
+
+/* Output area for ExpandConstraints */
+static WAIT_ORDER *waitOrders;	/* Array of proposed queue rearrangements */
+static int	nWaitOrders;		/* number of valid waitOrders[] entries */
+static PGPROC **waitOrderProcs; /* Space for waitOrders queue contents */
+
+/* Current list of constraints being considered */
+static EDGE *curConstraints;
+static int	nCurConstraints;	/* number of constraints currently in use */
+static int	maxCurConstraints;	/* allocated length of curConstraints[] */
+
+/* Storage space for results from FindLockCycle */
+static EDGE *possibleConstraints;
+static int	nPossibleConstraints;	/* entries currently saved */
+static int	maxPossibleConstraints; /* allocated length of the array */
+static DEADLOCK_INFO *deadlockDetails;
+static int	nDeadlockDetails;	/* length of the recorded deadlock cycle */
+
+/* PGPROC pointer of any blocking autovacuum worker found */
+static PGPROC *blocking_autovacuum_proc = NULL;
+
+
+/*
+ * InitDeadLockChecking -- initialize deadlock checker during backend startup
+ *
+ * This does per-backend initialization of the deadlock checker; primarily,
+ * allocation of working memory for DeadLockCheck.  We do this per-backend
+ * since there's no percentage in making the kernel do copy-on-write
+ * inheritance of workspace from the postmaster.  We want to allocate the
+ * space at startup because (a) the deadlock checker might be invoked when
+ * there's no free memory left, and (b) the checker is normally run inside a
+ * signal handler, which is a very dangerous place to invoke palloc from.
+ *
+ * Every array is sized as a multiple of MaxBackends and is allocated in
+ * TopMemoryContext; the caller's memory context is restored before return.
+ */
+void
+InitDeadLockChecking(void)
+{
+	MemoryContext oldcxt;
+
+	/* Make sure allocations are permanent */
+	oldcxt = MemoryContextSwitchTo(TopMemoryContext);
+
+	/*
+	 * FindLockCycle needs at most MaxBackends entries in visitedProcs[] and
+	 * deadlockDetails[].
+	 */
+	visitedProcs = (PGPROC **) palloc(MaxBackends * sizeof(PGPROC *));
+	deadlockDetails = (DEADLOCK_INFO *) palloc(MaxBackends * sizeof(DEADLOCK_INFO));
+
+	/*
+	 * TopoSort needs to consider at most MaxBackends wait-queue entries, and
+	 * it needn't run concurrently with FindLockCycle.
+	 */
+	topoProcs = visitedProcs;	/* re-use this space */
+	beforeConstraints = (int *) palloc(MaxBackends * sizeof(int));
+	afterConstraints = (int *) palloc(MaxBackends * sizeof(int));
+
+	/*
+	 * We need to consider rearranging at most MaxBackends/2 wait queues
+	 * (since it takes at least two waiters in a queue to create a soft edge),
+	 * and the expanded form of the wait queues can't involve more than
+	 * MaxBackends total waiters.
+	 */
+	waitOrders = (WAIT_ORDER *)
+		palloc((MaxBackends / 2) * sizeof(WAIT_ORDER));
+	waitOrderProcs = (PGPROC **) palloc(MaxBackends * sizeof(PGPROC *));
+
+	/*
+	 * Allow at most MaxBackends distinct constraints in a configuration. (Is
+	 * this enough?  In practice it seems it should be, but I don't quite see
+	 * how to prove it.  If we run out, we might fail to find a workable wait
+	 * queue rearrangement even though one exists.)  NOTE that this number
+	 * limits the maximum recursion depth of DeadLockCheckRecurse. Making it
+	 * really big might potentially allow a stack-overflow problem.
+	 */
+	maxCurConstraints = MaxBackends;
+	curConstraints = (EDGE *) palloc(maxCurConstraints * sizeof(EDGE));
+
+	/*
+	 * Allow up to 3*MaxBackends constraints to be saved without having to
+	 * re-run TestConfiguration.  (This is probably more than enough, but we
+	 * can survive if we run low on space by doing excess runs of
+	 * TestConfiguration to re-compute constraint lists each time needed.) The
+	 * last MaxBackends entries in possibleConstraints[] are reserved as
+	 * output workspace for FindLockCycle, which is why the array is sized at
+	 * 4*MaxBackends entries in total.
+	 */
+	maxPossibleConstraints = MaxBackends * 4;
+	possibleConstraints =
+		(EDGE *) palloc(maxPossibleConstraints * sizeof(EDGE));
+
+	MemoryContextSwitchTo(oldcxt);
+}
+
+/*
+ * DeadLockCheck -- Checks for deadlocks for a given process
+ *
+ * This code looks for deadlocks involving the given process.  If any
+ * are found, it tries to rearrange lock wait queues to resolve the
+ * deadlock.  If resolution is impossible, return DS_HARD_DEADLOCK ---
+ * the caller is then expected to abort the given proc's transaction.
+ *
+ * Otherwise the result is DS_SOFT_DEADLOCK if at least one wait queue was
+ * rearranged, DS_BLOCKED_BY_AUTOVACUUM if no queue was rearranged but the
+ * search recorded an autovacuum worker in blocking_autovacuum_proc, and
+ * DS_NO_DEADLOCK if neither applies.
+ *
+ * Caller must already have locked all partitions of the lock tables.
+ *
+ * On failure, deadlock details are recorded in deadlockDetails[] for
+ * subsequent printing by DeadLockReport().  That activity is separate
+ * because (a) we don't want to do it while holding all those LWLocks,
+ * and (b) we are typically invoked inside a signal handler.
+ */
+DeadLockState
+DeadLockCheck(PGPROC *proc)
+{
+	/* Initialize to "no constraints" */
+	nCurConstraints = 0;
+	nPossibleConstraints = 0;
+	nWaitOrders = 0;
+
+	/* Initialize to not blocked by an autovacuum worker */
+	blocking_autovacuum_proc = NULL;
+
+	/* Search for deadlocks and possible fixes */
+	if (DeadLockCheckRecurse(proc))
+	{
+		/*
+		 * Call FindLockCycle one more time, to record the correct
+		 * deadlockDetails[] for the basic state with no rearrangements.
+		 * Zeroing nWaitOrders first makes the search use the true wait queue
+		 * orders rather than any hypothetical ones left in waitOrders[].
+		 */
+		int			nSoftEdges;
+
+		TRACE_POSTGRESQL_DEADLOCK_FOUND();
+
+		nWaitOrders = 0;
+		if (!FindLockCycle(proc, possibleConstraints, &nSoftEdges))
+			elog(FATAL, "deadlock seems to have disappeared");
+
+		return DS_HARD_DEADLOCK;	/* cannot find a non-deadlocked state */
+	}
+
+	/* Apply any needed rearrangements of wait queues */
+	for (int i = 0; i < nWaitOrders; i++)
+	{
+		LOCK	   *lock = waitOrders[i].lock;
+		PGPROC	  **procs = waitOrders[i].procs;
+		int			nProcs = waitOrders[i].nProcs;
+		dclist_head *waitQueue = &lock->waitProcs;
+
+		Assert(nProcs == dclist_count(waitQueue));
+
+#ifdef DEBUG_DEADLOCK
+		PrintLockQueue(lock, "DeadLockCheck:");
+#endif
+
+		/* Reset the queue and re-add procs in the desired order */
+		dclist_init(waitQueue);
+		for (int j = 0; j < nProcs; j++)
+			dclist_push_tail(waitQueue, &procs[j]->links);
+
+#ifdef DEBUG_DEADLOCK
+		PrintLockQueue(lock, "rearranged to:");
+#endif
+
+		/* See if any waiters for the lock can be woken up now */
+		ProcLockWakeup(GetLocksMethodTable(lock), lock);
+	}
+
+	/* Return code tells caller if we had to escape a deadlock or not */
+	if (nWaitOrders > 0)
+		return DS_SOFT_DEADLOCK;
+	else if (blocking_autovacuum_proc != NULL)
+		return DS_BLOCKED_BY_AUTOVACUUM;
+	else
+		return DS_NO_DEADLOCK;
+}
+
+/*
+ * Return the PGPROC of the autovacuum that's blocking a process.
+ *
+ * The pointer is whatever the most recent deadlock search recorded; it is
+ * NULL if that search found no autovacuum worker directly hard-blocking
+ * MyProc, or if the value has already been fetched by an earlier call.
+ *
+ * We reset the saved pointer as soon as we pass it back.
+ */
+PGPROC *
+GetBlockingAutoVacuumPgproc(void)
+{
+	PGPROC	   *ptr;
+
+	ptr = blocking_autovacuum_proc;
+	blocking_autovacuum_proc = NULL;
+
+	return ptr;
+}
+
+/*
+ * DeadLockCheckRecurse -- recursively search for valid orderings
+ *
+ * curConstraints[] holds the current set of constraints being considered
+ * by an outer level of recursion.  Add to this each possible solution
+ * constraint for any cycle detected at this level.
+ *
+ * TestConfiguration leaves this level's candidate soft edges starting at
+ * possibleConstraints[nPossibleConstraints].  If they can be kept while still
+ * leaving MaxBackends free entries for deeper levels, nPossibleConstraints is
+ * advanced past them; otherwise deeper levels overwrite them, and the list
+ * is recomputed before each candidate after the first is tried.
+ *
+ * Returns true if no solution exists.  Returns false if a deadlock-free
+ * state is attainable, in which case waitOrders[] shows the required
+ * rearrangements of lock wait queues (if any).  True is also returned when
+ * curConstraints[] is already full and so no further constraint can be tried.
+ */
+static bool
+DeadLockCheckRecurse(PGPROC *proc)
+{
+	int			nEdges;
+	int			oldPossibleConstraints;
+	bool		savedList;
+	int			i;
+
+	nEdges = TestConfiguration(proc);
+	if (nEdges < 0)
+		return true;			/* hard deadlock --- no solution */
+	if (nEdges == 0)
+		return false;			/* good configuration found */
+	if (nCurConstraints >= maxCurConstraints)
+		return true;			/* out of room for active constraints? */
+	oldPossibleConstraints = nPossibleConstraints;
+	if (nPossibleConstraints + nEdges + MaxBackends <= maxPossibleConstraints)
+	{
+		/* We can save the edge list in possibleConstraints[] */
+		nPossibleConstraints += nEdges;
+		savedList = true;
+	}
+	else
+	{
+		/* Not room; will need to regenerate the edges on-the-fly */
+		savedList = false;
+	}
+
+	/*
+	 * Try each available soft edge as an addition to the configuration.
+	 */
+	for (i = 0; i < nEdges; i++)
+	{
+		if (!savedList && i > 0)
+		{
+			/* Regenerate the list of possible added constraints */
+			if (nEdges != TestConfiguration(proc))
+				elog(FATAL, "inconsistent results during deadlock check");
+		}
+		curConstraints[nCurConstraints] =
+			possibleConstraints[oldPossibleConstraints + i];
+		nCurConstraints++;
+		if (!DeadLockCheckRecurse(proc))
+			return false;		/* found a valid solution! */
+		/* give up on that added constraint, try again */
+		nCurConstraints--;
+	}
+	nPossibleConstraints = oldPossibleConstraints;
+	return true;				/* no solution found */
+}
+
+
+/*--------------------
+ * Test a configuration (current set of constraints) for validity.
+ *
+ * Returns:
+ *		0: the configuration is good (no deadlocks)
+ *	   -1: the configuration has a hard deadlock or is not self-consistent
+ *		>0: the configuration has one or more soft deadlocks
+ *
+ * -1 is also returned if fewer than MaxBackends free entries remain in
+ * possibleConstraints[], since FindLockCycle would have no room for output.
+ *
+ * In the soft-deadlock case, one of the soft cycles is chosen arbitrarily
+ * and a list of its soft edges is returned beginning at
+ * possibleConstraints+nPossibleConstraints.  The return value is the
+ * number of soft edges.  (Every FindLockCycle call below writes to that same
+ * location, so the list left there is from the last call that found a cycle.)
+ *--------------------
+ */
+static int
+TestConfiguration(PGPROC *startProc)
+{
+	int			softFound = 0;
+	EDGE	   *softEdges = possibleConstraints + nPossibleConstraints;
+	int			nSoftEdges;
+	int			i;
+
+	/*
+	 * Make sure we have room for FindLockCycle's output.
+	 */
+	if (nPossibleConstraints + MaxBackends > maxPossibleConstraints)
+		return -1;
+
+	/*
+	 * Expand current constraint set into wait orderings.  Fail if the
+	 * constraint set is not self-consistent.
+	 */
+	if (!ExpandConstraints(curConstraints, nCurConstraints))
+		return -1;
+
+	/*
+	 * Check for cycles involving startProc or any of the procs mentioned in
+	 * constraints.  We check startProc last because if it has a soft cycle
+	 * still to be dealt with, we want to deal with that first.
+	 */
+	for (i = 0; i < nCurConstraints; i++)
+	{
+		if (FindLockCycle(curConstraints[i].waiter, softEdges, &nSoftEdges))
+		{
+			if (nSoftEdges == 0)
+				return -1;		/* hard deadlock detected */
+			softFound = nSoftEdges;
+		}
+		if (FindLockCycle(curConstraints[i].blocker, softEdges, &nSoftEdges))
+		{
+			if (nSoftEdges == 0)
+				return -1;		/* hard deadlock detected */
+			softFound = nSoftEdges;
+		}
+	}
+	if (FindLockCycle(startProc, softEdges, &nSoftEdges))
+	{
+		if (nSoftEdges == 0)
+			return -1;			/* hard deadlock detected */
+		softFound = nSoftEdges;
+	}
+	return softFound;
+}
+
+
+/*
+ * FindLockCycle -- basic check for deadlock cycles
+ *
+ * Scan outward from the given proc to see if there is a cycle in the
+ * waits-for graph that includes this proc.  Return true if a cycle
+ * is found, else false.  If a cycle is found, we return a list of
+ * the "soft edges", if any, included in the cycle.  These edges could
+ * potentially be eliminated by rearranging wait queues.  We also fill
+ * deadlockDetails[] with information about the detected cycle; this info
+ * is not used by the deadlock algorithm itself, only to print a useful
+ * message after failing.
+ *
+ * Since we need to be able to check hypothetical configurations that would
+ * exist after wait queue rearrangement, the routine pays attention to the
+ * table of hypothetical queue orders in waitOrders[].  These orders will
+ * be believed in preference to the actual ordering seen in the locktable.
+ *
+ * Each call starts from scratch: the visited-proc list, the deadlockDetails[]
+ * count and *nSoftEdges are all reset to zero before the search begins at
+ * depth 0.
+ */
+static bool
+FindLockCycle(PGPROC *checkProc,
+			  EDGE *softEdges,	/* output argument */
+			  int *nSoftEdges)	/* output argument */
+{
+	nVisitedProcs = 0;
+	nDeadlockDetails = 0;
+	*nSoftEdges = 0;
+	return FindLockCycleRecurse(checkProc, 0, softEdges, nSoftEdges);
+}
+
+static bool
+FindLockCycleRecurse(PGPROC *checkProc,
+					 int depth,
+					 EDGE *softEdges,	/* output argument */
+					 int *nSoftEdges)	/* output argument */
+{
+	int			i;
+	dlist_iter	iter;
+
+	/*
+	 * This is one step of the depth-first search started by FindLockCycle:
+	 * visit checkProc at recursion level 'depth' and follow the waits-for
+	 * edges leaving it and its lock group.  The result is true once the
+	 * search arrives back at the starting proc (visitedProcs[0]), and false
+	 * if no such cycle is found from here.
+	 *
+	 * If this process is a lock group member, check the leader instead. (Note
+	 * that we might be the leader, in which case this is a no-op.)
+	 */
+	if (checkProc->lockGroupLeader != NULL)
+		checkProc = checkProc->lockGroupLeader;
+
+	/*
+	 * Have we already seen this proc?
+	 */
+	for (i = 0; i < nVisitedProcs; i++)
+	{
+		if (visitedProcs[i] == checkProc)
+		{
+			/* If we return to starting point, we have a deadlock cycle */
+			if (i == 0)
+			{
+				/*
+				 * record total length of cycle --- outer levels will now fill
+				 * deadlockDetails[]
+				 */
+				Assert(depth <= MaxBackends);
+				nDeadlockDetails = depth;
+
+				return true;
+			}
+
+			/*
+			 * Otherwise, we have a cycle but it does not include the start
+			 * point, so say "no deadlock".
+			 */
+			return false;
+		}
+	}
+	/* Mark proc as seen */
+	Assert(nVisitedProcs < MaxBackends);
+	visitedProcs[nVisitedProcs++] = checkProc;
+
+	/*
+	 * If the process is waiting, there is an outgoing waits-for edge to each
+	 * process that blocks it.
+	 */
+	if (checkProc->links.next != NULL && checkProc->waitLock != NULL &&
+		FindLockCycleRecurseMember(checkProc, checkProc, depth, softEdges,
+								   nSoftEdges))
+		return true;
+
+	/*
+	 * If the process is not waiting, there could still be outgoing waits-for
+	 * edges if it is part of a lock group, because other members of the lock
+	 * group might be waiting even though this process is not.  (Given lock
+	 * groups {A1, A2} and {B1, B2}, if A1 waits for B1 and B2 waits for A2,
+	 * that is a deadlock even though neither B1 nor A2 is waiting for
+	 * anything.)
+	 */
+	dlist_foreach(iter, &checkProc->lockGroupMembers)
+	{
+		PGPROC	   *memberProc;
+
+		memberProc = dlist_container(PGPROC, lockGroupLink, iter.cur);
+
+		if (memberProc->links.next != NULL && memberProc->waitLock != NULL &&
+			memberProc != checkProc &&
+			FindLockCycleRecurseMember(memberProc, checkProc, depth, softEdges,
+									   nSoftEdges))
+			return true;
+	}
+
+	return false;
+}
+
+static bool
+FindLockCycleRecurseMember(PGPROC *checkProc,
+						   PGPROC *checkProcLeader,
+						   int depth,
+						   EDGE *softEdges, /* output argument */
+						   int *nSoftEdges) /* output argument */
+{
+	PGPROC	   *proc;
+	LOCK	   *lock = checkProc->waitLock;	/* lock checkProc waits for */
+	dlist_iter	proclock_iter;
+	LockMethod	lockMethodTable;
+	int			conflictMask;	/* modes conflicting with checkProc's request */
+	int			i;
+	int			numLockModes,
+				lm;
+
+	/*
+	 * Follow the waits-for edges leaving checkProc, a waiting member of the
+	 * lock group whose leader is checkProcLeader: first the hard edges to
+	 * holders of conflicting locks, then the soft edges to conflicting
+	 * waiters queued ahead of it.  The result is true if some edge leads
+	 * back to the search's starting proc; in that case deadlockDetails[depth]
+	 * is filled in on the way out, and a soft edge is also appended to
+	 * softEdges[].
+	 *
+	 * The relation extension lock can never participate in actual deadlock
+	 * cycle.  See Assert in LockAcquireExtended.  So, there is no advantage
+	 * in checking wait edges from it.
+	 */
+	if (LOCK_LOCKTAG(*lock) == LOCKTAG_RELATION_EXTEND)
+		return false;
+
+	lockMethodTable = GetLocksMethodTable(lock);
+	numLockModes = lockMethodTable->numLockModes;
+	conflictMask = lockMethodTable->conflictTab[checkProc->waitLockMode];
+
+	/*
+	 * Scan for procs that already hold conflicting locks.  These are "hard"
+	 * edges in the waits-for graph.
+	 */
+	dlist_foreach(proclock_iter, &lock->procLocks)
+	{
+		PROCLOCK   *proclock = dlist_container(PROCLOCK, lockLink, proclock_iter.cur);
+		PGPROC	   *leader;
+
+		proc = proclock->tag.myProc;
+		leader = proc->lockGroupLeader == NULL ? proc : proc->lockGroupLeader;
+
+		/* A proc never blocks itself or any other lock group member */
+		if (leader != checkProcLeader)
+		{
+			for (lm = 1; lm <= numLockModes; lm++)
+			{
+				if ((proclock->holdMask & LOCKBIT_ON(lm)) &&
+					(conflictMask & LOCKBIT_ON(lm)))
+				{
+					/* This proc hard-blocks checkProc */
+					if (FindLockCycleRecurse(proc, depth + 1,
+											 softEdges, nSoftEdges))
+					{
+						/* fill deadlockDetails[] */
+						DEADLOCK_INFO *info = &deadlockDetails[depth];
+
+						info->locktag = lock->tag;
+						info->lockmode = checkProc->waitLockMode;
+						info->pid = checkProc->pid;
+
+						return true;
+					}
+
+					/*
+					 * No deadlock here, but see if this proc is an autovacuum
+					 * that is directly hard-blocking our own proc.  If so,
+					 * report it so that the caller can send a cancel signal
+					 * to it, if appropriate.  If there's more than one such
+					 * proc, it's indeterminate which one will be reported.
+					 *
+					 * We don't touch autovacuums that are indirectly blocking
+					 * us; it's up to the direct blockee to take action.  This
+					 * rule simplifies understanding the behavior and ensures
+					 * that an autovacuum won't be canceled with less than
+					 * deadlock_timeout grace period.
+					 *
+					 * Note we read statusFlags without any locking.  This is
+					 * OK only for checking the PROC_IS_AUTOVACUUM flag,
+					 * because that flag is set at process start and never
+					 * reset.  There is logic elsewhere to avoid canceling an
+					 * autovacuum that is working to prevent XID wraparound
+					 * problems (which needs to read a different statusFlags
+					 * bit), but we don't do that here to avoid grabbing
+					 * ProcArrayLock.
+					 */
+					if (checkProc == MyProc &&
+						proc->statusFlags & PROC_IS_AUTOVACUUM)
+						blocking_autovacuum_proc = proc;
+
+					/* We're done looking at this proclock */
+					break;
+				}
+			}
+		}
+	}
+
+	/*
+	 * Scan for procs that are ahead of this one in the lock's wait queue.
+	 * Those that have conflicting requests soft-block this one.  This must be
+	 * done after the hard-block search, since if another proc both hard- and
+	 * soft-blocks this one, we want to call it a hard edge.
+	 *
+	 * If there is a proposed re-ordering of the lock's wait order, use that
+	 * rather than the current wait order.
+	 */
+	for (i = 0; i < nWaitOrders; i++)
+	{
+		if (waitOrders[i].lock == lock)
+			break;
+	}
+
+	if (i < nWaitOrders)
+	{
+		/* Use the given hypothetical wait queue order */
+		PGPROC	  **procs = waitOrders[i].procs;
+		int			queue_size = waitOrders[i].nProcs;
+
+		for (i = 0; i < queue_size; i++)
+		{
+			PGPROC	   *leader;
+
+			proc = procs[i];
+			leader = proc->lockGroupLeader == NULL ? proc :
+				proc->lockGroupLeader;
+
+			/*
+			 * TopoSort will always return an ordering with group members
+			 * adjacent to each other in the wait queue (see comments
+			 * therein). So, as soon as we reach a process in the same lock
+			 * group as checkProc, we know we've found all the conflicts that
+			 * precede any member of the lock group led by checkProcLeader.
+			 */
+			if (leader == checkProcLeader)
+				break;
+
+			/* Is there a conflict with this guy's request? */
+			if ((LOCKBIT_ON(proc->waitLockMode) & conflictMask) != 0)
+			{
+				/* This proc soft-blocks checkProc */
+				if (FindLockCycleRecurse(proc, depth + 1,
+										 softEdges, nSoftEdges))
+				{
+					/* fill deadlockDetails[] */
+					DEADLOCK_INFO *info = &deadlockDetails[depth];
+
+					info->locktag = lock->tag;
+					info->lockmode = checkProc->waitLockMode;
+					info->pid = checkProc->pid;
+
+					/*
+					 * Add this edge to the list of soft edges in the cycle
+					 */
+					Assert(*nSoftEdges < MaxBackends);
+					softEdges[*nSoftEdges].waiter = checkProcLeader;
+					softEdges[*nSoftEdges].blocker = leader;
+					softEdges[*nSoftEdges].lock = lock;
+					(*nSoftEdges)++;
+					return true;
+				}
+			}
+		}
+	}
+	else
+	{
+		PGPROC	   *lastGroupMember = NULL;
+		dlist_iter	proc_iter;
+		dclist_head *waitQueue;
+
+		/* Use the true lock wait queue order */
+		waitQueue = &lock->waitProcs;
+
+		/*
+		 * Find the last member of the lock group that is present in the wait
+		 * queue.  Anything after this is not a soft lock conflict. If group
+		 * locking is not in use, then we know immediately which process we're
+		 * looking for, but otherwise we've got to search the wait queue to
+		 * find the last process actually present.
+		 */
+		if (checkProc->lockGroupLeader == NULL)
+			lastGroupMember = checkProc;
+		else
+		{
+			dclist_foreach(proc_iter, waitQueue)
+			{
+				proc = dlist_container(PGPROC, links, proc_iter.cur);
+
+				if (proc->lockGroupLeader == checkProcLeader)
+					lastGroupMember = proc;
+			}
+			Assert(lastGroupMember != NULL);
+		}
+
+		/*
+		 * OK, now rescan (or scan) the queue to identify the soft conflicts.
+		 */
+		dclist_foreach(proc_iter, waitQueue)
+		{
+			PGPROC	   *leader;
+
+			proc = dlist_container(PGPROC, links, proc_iter.cur);
+
+			leader = proc->lockGroupLeader == NULL ? proc :
+				proc->lockGroupLeader;
+
+			/* Done when we reach the target proc */
+			if (proc == lastGroupMember)
+				break;
+
+			/* Is there a conflict with this guy's request? */
+			if ((LOCKBIT_ON(proc->waitLockMode) & conflictMask) != 0 &&
+				leader != checkProcLeader)
+			{
+				/* This proc soft-blocks checkProc */
+				if (FindLockCycleRecurse(proc, depth + 1,
+										 softEdges, nSoftEdges))
+				{
+					/* fill deadlockDetails[] */
+					DEADLOCK_INFO *info = &deadlockDetails[depth];
+
+					info->locktag = lock->tag;
+					info->lockmode = checkProc->waitLockMode;
+					info->pid = checkProc->pid;
+
+					/*
+					 * Add this edge to the list of soft edges in the cycle
+					 */
+					Assert(*nSoftEdges < MaxBackends);
+					softEdges[*nSoftEdges].waiter = checkProcLeader;
+					softEdges[*nSoftEdges].blocker = leader;
+					softEdges[*nSoftEdges].lock = lock;
+					(*nSoftEdges)++;
+					return true;
+				}
+			}
+		}
+	}
+
+	/*
+	 * No conflict detected here.
+	 */
+	return false;
+}
+
+
+/*
+ * ExpandConstraints -- expand a list of constraints into a set of
+ *		specific new orderings for affected wait queues
+ *
+ * Input is a list of soft edges to be reversed.  The output is a list
+ * of nWaitOrders WAIT_ORDER structs in waitOrders[], with PGPROC array
+ * workspace in waitOrderProcs[].
+ *
+ * waitOrders[] gets one entry per distinct lock named in the constraints.
+ * Each entry's procs[] is a slice of waitOrderProcs[] as long as that lock's
+ * current wait queue, and is filled in by TopoSort.  nWaitOrders is reset to
+ * zero on entry, so previous contents of waitOrders[] are discarded.
+ *
+ * Returns true if able to build an ordering that satisfies all the
+ * constraints, false if not (there are contradictory constraints).
+ */
+static bool
+ExpandConstraints(EDGE *constraints,
+				  int nConstraints)
+{
+	int			nWaitOrderProcs = 0;
+	int			i,
+				j;
+
+	nWaitOrders = 0;
+
+	/*
+	 * Scan constraint list backwards.  This is because the last-added
+	 * constraint is the only one that could fail, and so we want to test it
+	 * for inconsistency first.
+	 */
+	for (i = nConstraints; --i >= 0;)
+	{
+		LOCK	   *lock = constraints[i].lock;
+
+		/* Did we already make a list for this lock? */
+		for (j = nWaitOrders; --j >= 0;)
+		{
+			if (waitOrders[j].lock == lock)
+				break;
+		}
+		if (j >= 0)
+			continue;
+		/* No, so allocate a new list */
+		waitOrders[nWaitOrders].lock = lock;
+		waitOrders[nWaitOrders].procs = waitOrderProcs + nWaitOrderProcs;
+		waitOrders[nWaitOrders].nProcs = dclist_count(&lock->waitProcs);
+		nWaitOrderProcs += dclist_count(&lock->waitProcs);
+		Assert(nWaitOrderProcs <= MaxBackends);
+
+		/*
+		 * Do the topo sort.  TopoSort need not examine constraints after this
+		 * one, since they must be for different locks.
+		 */
+		if (!TopoSort(lock, constraints, i + 1,
+					  waitOrders[nWaitOrders].procs))
+			return false;
+		nWaitOrders++;
+	}
+	return true;
+}
+
+
+/*
+ * TopoSort -- topological sort of a wait queue
+ *
+ * Generate a re-ordering of a lock's wait queue that satisfies given
+ * constraints about certain procs preceding others.  (Each such constraint
+ * is a fact of a partial ordering.)  Minimize rearrangement of the queue
+ * not needed to achieve the partial ordering.
+ *
+ * This is a lot simpler and slower than, for example, the topological sort
+ * algorithm shown in Knuth's Volume 1.  However, Knuth's method doesn't
+ * try to minimize the damage to the existing order.  In practice we are
+ * not likely to be working with more than a few constraints, so the apparent
+ * slowness of the algorithm won't really matter.
+ *
+ * The initial queue ordering is taken directly from the lock's wait queue.
+ * The output is an array of PGPROC pointers, of length equal to the lock's
+ * wait queue length (the caller is responsible for providing this space).
+ * The partial order is specified by an array of EDGE structs.  Each EDGE
+ * is one that we need to reverse, therefore the "waiter" must appear before
+ * the "blocker" in the output array.  The EDGE array may well contain
+ * edges associated with other locks; these should be ignored.
+ *
+ * The pred and link fields of the EDGE structs are overwritten as scratch
+ * space, as are the topoProcs[], beforeConstraints[] and afterConstraints[]
+ * work arrays.
+ *
+ * Returns true if able to build an ordering that satisfies all the
+ * constraints, false if not (there are contradictory constraints).
+ */
+static bool
+TopoSort(LOCK *lock,
+		 EDGE *constraints,
+		 int nConstraints,
+		 PGPROC **ordering)		/* output argument */
+{
+	dclist_head *waitQueue = &lock->waitProcs;
+	int			queue_size = dclist_count(waitQueue);
+	PGPROC	   *proc;
+	int			i,
+				j,
+				jj,
+				k,
+				kk,
+				last;
+	dlist_iter	proc_iter;
+
+	/* First, fill topoProcs[] array with the procs in their current order */
+	i = 0;
+	dclist_foreach(proc_iter, waitQueue)
+	{
+		proc = dlist_container(PGPROC, links, proc_iter.cur);
+		topoProcs[i++] = proc;
+	}
+	Assert(i == queue_size);
+
+	/*
+	 * Scan the constraints, and for each proc in the array, generate a count
+	 * of the number of constraints that say it must be before something else,
+	 * plus a list of the constraints that say it must be after something
+	 * else. The count for the j'th proc is stored in beforeConstraints[j],
+	 * and the head of its list in afterConstraints[j].  Each constraint
+	 * stores its list link in constraints[i].link (note any constraint will
+	 * be in just one list). The array index for the before-proc of the i'th
+	 * constraint is remembered in constraints[i].pred.
+	 *
+	 * Note that it's not necessarily the case that every constraint affects
+	 * this particular wait queue.  Prior to group locking, a process could be
+	 * waiting for at most one lock.  But a lock group can be waiting for
+	 * zero, one, or multiple locks.  Since topoProcs[] is an array of the
+	 * processes actually waiting, while constraints[] is an array of group
+	 * leaders, we've got to scan through topoProcs[] for each constraint,
+	 * checking whether both a waiter and a blocker for that group are
+	 * present.  If so, the constraint is relevant to this wait queue; if not,
+	 * it isn't.
+	 */
+	MemSet(beforeConstraints, 0, queue_size * sizeof(int));
+	MemSet(afterConstraints, 0, queue_size * sizeof(int));
+	for (i = 0; i < nConstraints; i++)
+	{
+		/*
+		 * Find a representative process that is on the lock queue and part of
+		 * the waiting lock group.  This may or may not be the leader, which
+		 * may or may not be waiting at all.  If there are any other processes
+		 * in the same lock group on the queue, set their number of
+		 * beforeConstraints to -1 to indicate that they should be emitted
+		 * with their groupmates rather than considered separately.
+		 *
+		 * In this loop and the similar one just below, it's critical that we
+		 * consistently select the same representative member of any one lock
+		 * group, so that all the constraints are associated with the same
+		 * proc, and the -1's are only associated with not-representative
+		 * members.  We select the last one in the topoProcs array.
+		 */
+		proc = constraints[i].waiter;
+		Assert(proc != NULL);
+		jj = -1;
+		for (j = queue_size; --j >= 0;)
+		{
+			PGPROC	   *waiter = topoProcs[j];
+
+			if (waiter == proc || waiter->lockGroupLeader == proc)
+			{
+				Assert(waiter->waitLock == lock);
+				if (jj == -1)
+					jj = j;
+				else
+				{
+					Assert(beforeConstraints[j] <= 0);
+					beforeConstraints[j] = -1;
+				}
+			}
+		}
+
+		/* If no matching waiter, constraint is not relevant to this lock. */
+		if (jj < 0)
+			continue;
+
+		/*
+		 * Similarly, find a representative process that is on the lock queue
+		 * and waiting for the blocking lock group.  Again, this could be the
+		 * leader but does not need to be.
+		 */
+		proc = constraints[i].blocker;
+		Assert(proc != NULL);
+		kk = -1;
+		for (k = queue_size; --k >= 0;)
+		{
+			PGPROC	   *blocker = topoProcs[k];
+
+			if (blocker == proc || blocker->lockGroupLeader == proc)
+			{
+				Assert(blocker->waitLock == lock);
+				if (kk == -1)
+					kk = k;
+				else
+				{
+					Assert(beforeConstraints[k] <= 0);
+					beforeConstraints[k] = -1;
+				}
+			}
+		}
+
+		/* If no matching blocker, constraint is not relevant to this lock. */
+		if (kk < 0)
+			continue;
+
+		Assert(beforeConstraints[jj] >= 0);
+		beforeConstraints[jj]++;	/* waiter must come before */
+
+		/*
+		 * add this constraint to list of after-constraints for blocker.  List
+		 * entries are stored as constraint index + 1, so that the zeroes
+		 * written by MemSet above mean "empty list" / "end of list".
+		 */
+		constraints[i].pred = jj;
+		constraints[i].link = afterConstraints[kk];
+		afterConstraints[kk] = i + 1;
+	}
+
+	/*--------------------
+	 * Now scan the topoProcs array backwards.  At each step, output the
+	 * last proc that has no remaining before-constraints plus any other
+	 * members of the same lock group; then decrease the beforeConstraints
+	 * count of each of the procs it was constrained against.
+	 * The ordering[] array is therefore filled from its last slot down.
+	 * i = index of ordering[] entry we want to output this time
+	 * j = search index for topoProcs[]
+	 * k = temp for scanning constraint list for proc j
+	 * last = last non-null index in topoProcs (avoid redundant searches)
+	 *--------------------
+	 */
+	last = queue_size - 1;
+	for (i = queue_size - 1; i >= 0;)
+	{
+		int			c;
+		int			nmatches = 0;
+
+		/* Find next candidate to output */
+		while (topoProcs[last] == NULL)
+			last--;
+		for (j = last; j >= 0; j--)
+		{
+			if (topoProcs[j] != NULL && beforeConstraints[j] == 0)
+				break;
+		}
+
+		/* If no available candidate, topological sort fails */
+		if (j < 0)
+			return false;
+
+		/*
+		 * Output everything in the lock group.  There's no point in
+		 * outputting an ordering where members of the same lock group are not
+		 * consecutive on the wait queue: if some other waiter is between two
+		 * requests that belong to the same group, then either it conflicts
+		 * with both of them and is certainly not a solution; or it conflicts
+		 * with at most one of them and is thus isomorphic to an ordering
+		 * where the group members are consecutive.
+		 *
+		 * Note that matches are stored into descending ordering[] slots while
+		 * c ascends, so groupmates come out in the reverse of their
+		 * topoProcs[] order.
+		 */
+		proc = topoProcs[j];
+		if (proc->lockGroupLeader != NULL)
+			proc = proc->lockGroupLeader;
+		Assert(proc != NULL);
+		for (c = 0; c <= last; ++c)
+		{
+			if (topoProcs[c] == proc || (topoProcs[c] != NULL &&
+										 topoProcs[c]->lockGroupLeader == proc))
+			{
+				ordering[i - nmatches] = topoProcs[c];
+				topoProcs[c] = NULL;
+				++nmatches;
+			}
+		}
+		Assert(nmatches > 0);
+		i -= nmatches;
+
+		/* Update beforeConstraints counts of its predecessors */
+		for (k = afterConstraints[j]; k > 0; k = constraints[k - 1].link)
+			beforeConstraints[constraints[k - 1].pred]--;
+	}
+
+	/* Done */
+	return true;
+}
+
+#ifdef DEBUG_DEADLOCK
+/* Debug aid: print 'info', the lock's address, and the PIDs on its wait queue */
+static void
+PrintLockQueue(LOCK *lock, const char *info)
+{
+	dclist_head *queue = &lock->waitProcs;
+	dlist_iter	iter;
+
+	printf("%s lock %p queue ", info, lock);
+	dclist_foreach(iter, queue)
+	{
+		PGPROC	   *waiter = dlist_container(PGPROC, links, iter.cur);
+
+		printf(" %d", waiter->pid);
+	}
+	printf("\n");
+	fflush(stdout);
+}
+#endif
+
+/*
+ * Raise the "deadlock detected" error for the cycle recorded in
+ * deadlockDetails[].  The client gets one "waits for" line per process;
+ * the server log additionally gets each process's current activity string.
+ */
+void
+DeadLockReport(void)
+{
+	StringInfoData forClient;	/* errdetail sent to the client */
+	StringInfoData forLog;		/* errdetail written to the server log */
+	StringInfoData tagDesc;		/* scratch: description of one lock tag */
+	int			n;
+
+	initStringInfo(&forClient);
+	initStringInfo(&forLog);
+	initStringInfo(&tagDesc);
+
+	/* Build the client-visible detail: one line per waiting process */
+	for (n = 0; n < nDeadlockDetails; n++)
+	{
+		DEADLOCK_INFO *cur = &deadlockDetails[n];
+		int			blockerpid;
+
+		/* Each entry is blocked by the next one; the last wraps around */
+		blockerpid = (n < nDeadlockDetails - 1) ?
+			cur[1].pid : deadlockDetails[0].pid;
+
+		/* tagDesc is reused, so clear it before describing this lock */
+		resetStringInfo(&tagDesc);
+		DescribeLockTag(&tagDesc, &cur->locktag);
+
+		/* newline-separate the entries, with no leading newline */
+		if (n > 0)
+			appendStringInfoChar(&forClient, '\n');
+
+		appendStringInfo(&forClient,
+						 _("Process %d waits for %s on %s; blocked by process %d."),
+						 cur->pid,
+						 GetLockmodeName(cur->locktag.locktag_lockmethodid,
+										 cur->lockmode),
+						 tagDesc.data,
+						 blockerpid);
+	}
+
+	/* The log detail starts out as a copy of the client detail ... */
+	appendBinaryStringInfo(&forLog, forClient.data, forClient.len);
+
+	/* ... followed by one "Process N: <activity>" line per process */
+	for (n = 0; n < nDeadlockDetails; n++)
+	{
+		DEADLOCK_INFO *cur = &deadlockDetails[n];
+
+		appendStringInfoChar(&forLog, '\n');
+		appendStringInfo(&forLog,
+						 _("Process %d: %s"),
+						 cur->pid,
+						 pgstat_get_backend_current_activity(cur->pid, false));
+	}
+
+	/* Report the deadlock to pgstat before raising the error */
+	pgstat_report_deadlock();
+
+	ereport(ERROR,
+			(errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
+			 errmsg("deadlock detected"),
+			 errdetail_internal("%s", forClient.data),
+			 errdetail_log("%s", forLog.data),
+			 errhint("See server log for query details.")));
+}
+
+/*
+ * RememberSimpleDeadLock: record a two-process cycle for DeadLockReport,
+ * for use when ProcSleep detects a trivial deadlock.  proc1 wants to wait
+ * for lockmode on lock, while already-waiting proc2 would be blocked by it.
+ */
+void
+RememberSimpleDeadLock(PGPROC *proc1,
+					   LOCKMODE lockmode,
+					   LOCK *lock,
+					   PGPROC *proc2)
+{
+	/* entry 0: the request proc1 is about to wait on */
+	deadlockDetails[0].locktag = lock->tag;
+	deadlockDetails[0].lockmode = lockmode;
+	deadlockDetails[0].pid = proc1->pid;
+	/* entry 1: what proc2 is already waiting for */
+	deadlockDetails[1].locktag = proc2->waitLock->tag;
+	deadlockDetails[1].lockmode = proc2->waitLockMode;
+	deadlockDetails[1].pid = proc2->pid;
+
+	nDeadlockDetails = 2;
+}
diff --git a/src/backend/storage/lmgr/generate-lwlocknames.pl b/src/backend/storage/lmgr/generate-lwlocknames.pl
new file mode 100644
index 0000000..863c882
--- /dev/null
+++ b/src/backend/storage/lmgr/generate-lwlocknames.pl
@@ -0,0 +1,77 @@
+#!/usr/bin/perl
+# Generate lwlocknames.h and lwlocknames.c from lwlocknames.txt
+# Usage: generate-lwlocknames.pl [--outdir DIR] lwlocknames.txt
+# Copyright (c) 2000-2023, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+my $output_path = '.';    # directory for the generated files (--outdir)
+
+my $lastlockidx = -1;    # highest lock index written to the C array so far
+my $continue = "\n";    # separator printed before the next array entry
+
+GetOptions('outdir:s' => \$output_path);
+
+open my $lwlocknames, '<', $ARGV[0] or die;
+
+# Write to PID-suffixed temp files (renamed into place at the end), so that
+my $htmp = "$output_path/lwlocknames.h.tmp$$";    # parallel make runs of
+my $ctmp = "$output_path/lwlocknames.c.tmp$$";    # this script do not clash.
+open my $h, '>', $htmp or die "Could not open $htmp: $!";
+open my $c, '>', $ctmp or die "Could not open $ctmp: $!";
+
+my $autogen =
+  "/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */\n";
+print $h $autogen;
+print $h "/* there is deliberately not an #ifndef LWLOCKNAMES_H here */\n\n";
+print $c $autogen, "\n";
+
+print $c "const char *const IndividualLWLockNames[] = {";
+
+while (<$lwlocknames>)
+{
+	chomp;
+
+	# Skip comment lines and blank lines
+	next if /^#/;
+	next if /^\s*$/;
+
+	die "unable to parse lwlocknames.txt"
+	  unless /^(\w+)\s+(\d+)$/;    # expected form: "<LockName> <index>"
+
+	(my $lockname, my $lockidx) = ($1, $2);
+
+	my $trimmedlockname = $lockname;
+	$trimmedlockname =~ s/Lock$//;    # the C array stores the name sans "Lock"
+	die "lock names must end with 'Lock'" if $trimmedlockname eq $lockname;
+
+	die "lwlocknames.txt not in order" if $lockidx < $lastlockidx;
+	die "lwlocknames.txt has duplicates" if $lockidx == $lastlockidx;
+
+	while ($lastlockidx < $lockidx - 1)    # fill gaps in the index numbering
+	{
+		++$lastlockidx;
+		printf $c "%s	\"<unassigned:%d>\"", $continue, $lastlockidx;
+		$continue = ",\n";
+	}
+	printf $c "%s	\"%s\"", $continue, $trimmedlockname;
+	$lastlockidx = $lockidx;
+	$continue = ",\n";
+
+	print $h "#define $lockname (&MainLWLockArray[$lockidx].lock)\n";
+}
+
+printf $c "\n};\n";
+print $h "\n";
+printf $h "#define NUM_INDIVIDUAL_LWLOCKS		%s\n", $lastlockidx + 1;
+
+close $h;
+close $c;
+
+rename($htmp, "$output_path/lwlocknames.h")
+  || die "rename: $htmp to $output_path/lwlocknames.h: $!";
+rename($ctmp, "$output_path/lwlocknames.c") || die "rename: $ctmp: $!";
+
+close $lwlocknames;
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
new file mode 100644
index 0000000..ee9b89a
--- /dev/null
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -0,0 +1,1270 @@
+/*-------------------------------------------------------------------------
+ *
+ * lmgr.c
+ *	  POSTGRES lock manager code
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/lmgr/lmgr.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "catalog/catalog.h"
+#include "commands/progress.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/sinvaladt.h"
+#include "utils/inval.h"
+
+
+/*
+ * Per-backend counter for generating speculative insertion tokens.  Zero is
+ * never handed out: it means "no token is held" (see
+ * SpeculativeInsertionLockAcquire).
+ *
+ * This may wrap around, but that's OK as it's only used for the short
+ * duration between inserting a tuple and checking that there are no (unique)
+ * constraint violations.  In theory a backend could see a tuple speculatively
+ * inserted by another backend and, before it starts waiting on the token, the
+ * other backend could finish that insertion and perform 2^32 unrelated ones.
+ * The first backend's SpeculativeInsertionWait() would then wait for the
+ * latest unrelated insertion instead of the one it meant.  Even then, nothing
+ * particularly bad happens: in the worst case they deadlock, causing one of
+ * the transactions to abort.
+ */
+static uint32 speculativeInsertionToken = 0;
+
+
+/*
+ * Struct to hold context info for transaction lock waits.
+ *
+ * 'oper' is the operation that needs to wait for the other transaction; 'rel'
+ * and 'ctid' specify the address of the tuple being waited for.
+ */
+typedef struct XactLockTableWaitInfo
+{
+	XLTW_Oper	oper;			/* operation that has to wait */
+	Relation	rel;			/* relation containing the tuple */
+	ItemPointer ctid;			/* TID of the tuple being waited for */
+} XactLockTableWaitInfo;
+
+static void XactLockTableWaitErrorCb(void *arg);
+
+/*
+ * RelationInitLockInfo
+ *		Fill in rd_lockInfo.lockRelId of a relation descriptor.
+ *
+ *		relcache.c must call this during creation of any reldesc.
+ */
+void
+RelationInitLockInfo(Relation relation)
+{
+	LockRelId  *lockid;
+
+	Assert(RelationIsValid(relation));
+	Assert(OidIsValid(RelationGetRelid(relation)));
+
+	/* shared relations are locked under InvalidOid, not our database */
+	lockid = &relation->rd_lockInfo.lockRelId;
+	lockid->relId = RelationGetRelid(relation);
+	lockid->dbId = relation->rd_rel->relisshared ? InvalidOid : MyDatabaseId;
+}
+
+/*
+ * SetLocktagRelationOid
+ *		Build the locktag for a relation when only its OID is known.
+ *
+ *		The result is stored in *tag.
+ */
+static inline void
+SetLocktagRelationOid(LOCKTAG *tag, Oid relid)
+{
+	Oid			dbid;
+
+	/* shared relations use InvalidOid as their database part */
+	dbid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
+
+	SET_LOCKTAG_RELATION(*tag, dbid, relid);
+}
+
+/*
+ *		LockRelationOid
+ *
+ * Lock a relation given only its OID.  This should generally be used
+ * before attempting to open the relation's relcache entry.
+ */
+void
+LockRelationOid(Oid relid, LOCKMODE lockmode)
+{
+	LOCKTAG		locktag;
+	LOCALLOCK  *locallock;
+	LockAcquireResult result;
+
+	SetLocktagRelationOid(&locktag, relid);
+	result = LockAcquireExtended(&locktag, lockmode, false, false, true,
+								 &locallock);
+
+	/*
+	 * Once the lock is held, absorb invalidation messages so that any stale
+	 * relcache entry is updated or flushed before we try to use it;
+	 * RangeVarGetRelid() specifically relies on us for this.  That can be
+	 * skipped in the not-uncommon case that we already held the same type
+	 * of lock, since then no one else could have modified the relcache
+	 * entry in an undesirable way.  (When our own xact modifies the rel,
+	 * the relcache update happens via CommandCounterIncrement, not here.)
+	 *
+	 * "Already held" is not quite enough, though: in corner cases where
+	 * code acts on tables (usually catalogs) recursively, we can get here
+	 * while an outer execution of this function or a sibling is still
+	 * processing invalidation messages.  Only the "cleared" status of the
+	 * lock says whether relevant inval messages have really been absorbed,
+	 * so that is what we test.
+	 */
+	if (result == LOCKACQUIRE_ALREADY_CLEAR)
+		return;
+
+	AcceptInvalidationMessages();
+	MarkLockClear(locallock);
+}
+
+/*
+ *		ConditionalLockRelationOid
+ *
+ * Like LockRelationOid, but never blocks: the lock is taken only if it is
+ * immediately available.  Returns true iff the lock was acquired.
+ *
+ * NOTE: we do not currently need conditional versions of all the
+ * LockXXX routines in this file, but they could easily be added if needed.
+ */
+bool
+ConditionalLockRelationOid(Oid relid, LOCKMODE lockmode)
+{
+	LOCKTAG		locktag;
+	LOCALLOCK  *locallock;
+	LockAcquireResult result;
+
+	SetLocktagRelationOid(&locktag, relid);
+	result = LockAcquireExtended(&locktag, lockmode, false, true, true,
+								 &locallock);
+
+	/* the lock could not be granted without blocking */
+	if (result == LOCKACQUIRE_NOT_AVAIL)
+		return false;
+
+	/*
+	 * We hold the lock now.  Unless it was already marked clear, absorb
+	 * invalidation messages; see notes in LockRelationOid.
+	 */
+	if (result != LOCKACQUIRE_ALREADY_CLEAR)
+	{
+		AcceptInvalidationMessages();
+		MarkLockClear(locallock);
+	}
+	return true;
+}
+
+/*
+ *		LockRelationId
+ *
+ * Same as LockRelationOid, except that the relation is identified by a
+ * LockRelId (database OID plus relation OID) instead of a bare OID.
+ */
+void
+LockRelationId(LockRelId *relid, LOCKMODE lockmode)
+{
+	LOCKTAG		locktag;
+	LOCALLOCK  *locallock;
+	LockAcquireResult result;
+
+	SET_LOCKTAG_RELATION(locktag, relid->dbId, relid->relId);
+	result = LockAcquireExtended(&locktag, lockmode, false, false, true,
+								 &locallock);
+
+	/*
+	 * Now that we have the lock, check for invalidation messages unless the
+	 * lock was already marked clear; see notes in LockRelationOid.
+	 */
+	if (result == LOCKACQUIRE_ALREADY_CLEAR)
+		return;
+
+	AcceptInvalidationMessages();
+	MarkLockClear(locallock);
+}
+
+/*
+ *		UnlockRelationId
+ *
+ * Release a relation lock identified by a LockRelId.  This is preferred
+ * over UnlockRelationOid for speed reasons.
+ */
+void
+UnlockRelationId(LockRelId *relid, LOCKMODE lockmode)
+{
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_RELATION(locktag, relid->dbId, relid->relId);
+
+	LockRelease(&locktag, lockmode, false);
+}
+
+/*
+ *		UnlockRelationOid
+ *
+ * Unlock by relation OID alone.  Use UnlockRelationId if you can.
+ */
+void
+UnlockRelationOid(Oid relid, LOCKMODE lockmode)
+{
+	LOCKTAG		locktag;
+
+	SetLocktagRelationOid(&locktag, relid);
+
+	LockRelease(&locktag, lockmode, false);
+}
+
+/*
+ *		LockRelation
+ *
+ * Convenience routine for acquiring an additional lock on an already-open
+ * relation, using the lock identity stored in its relation descriptor.
+ *
+ * Never try to do "relation_open(foo, NoLock)" and then lock with this.
+ */
+void
+LockRelation(Relation relation, LOCKMODE lockmode)
+{
+	LockRelId  *relid = &relation->rd_lockInfo.lockRelId;
+	LOCKTAG		locktag;
+	LOCALLOCK  *locallock;
+	LockAcquireResult result;
+
+	SET_LOCKTAG_RELATION(locktag, relid->dbId, relid->relId);
+	result = LockAcquireExtended(&locktag, lockmode, false, false, true,
+								 &locallock);
+
+	/*
+	 * Now that we have the lock, check for invalidation messages unless the
+	 * lock was already marked clear; see notes in LockRelationOid.
+	 */
+	if (result == LOCKACQUIRE_ALREADY_CLEAR)
+		return;
+
+	AcceptInvalidationMessages();
+	MarkLockClear(locallock);
+}
+
+/*
+ *		ConditionalLockRelation
+ *
+ * As LockRelation, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.  The same caveat applies: never
+ * do "relation_open(foo, NoLock)" and then lock with this.
+ */
+bool
+ConditionalLockRelation(Relation relation, LOCKMODE lockmode)
+{
+	LockRelId  *relid = &relation->rd_lockInfo.lockRelId;
+	LOCKTAG		locktag;
+	LOCALLOCK  *locallock;
+	LockAcquireResult result;
+
+	SET_LOCKTAG_RELATION(locktag, relid->dbId, relid->relId);
+	result = LockAcquireExtended(&locktag, lockmode, false, true, true,
+								 &locallock);
+
+	/* the lock could not be granted without blocking */
+	if (result == LOCKACQUIRE_NOT_AVAIL)
+		return false;
+
+	/*
+	 * Now that we have the lock, check for invalidation messages unless the
+	 * lock was already marked clear; see notes in LockRelationOid.
+	 */
+	if (result != LOCKACQUIRE_ALREADY_CLEAR)
+	{
+		AcceptInvalidationMessages();
+		MarkLockClear(locallock);
+	}
+
+	return true;
+}
+
+/*
+ *		UnlockRelation
+ *
+ * Convenience routine for releasing a lock on an open relation without
+ * also closing it.
+ */
+void
+UnlockRelation(Relation relation, LOCKMODE lockmode)
+{
+	LockRelId  *relid = &relation->rd_lockInfo.lockRelId;
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_RELATION(locktag, relid->dbId, relid->relId);
+
+	/* not a session-level lock (cf. UnlockRelationIdForSession) */
+	LockRelease(&locktag, lockmode, false);
+}
+
+/*
+ *		CheckRelationLockedByMe
+ *
+ * Returns true if current transaction holds a lock on 'relation' of mode
+ * 'lockmode'.  If 'orstronger' is true, holding any numerically higher
+ * lock mode also counts.
+ *
+ * (Equating "stronger" with "numerically higher" is a bit semantically
+ * dubious, but is OK for the purposes we use this for.)
+ */
+bool
+CheckRelationLockedByMe(Relation relation, LOCKMODE lockmode, bool orstronger)
+{
+	LockRelId  *relid = &relation->rd_lockInfo.lockRelId;
+	LOCKTAG		locktag;
+	LOCKMODE	mode;
+
+	SET_LOCKTAG_RELATION(locktag, relid->dbId, relid->relId);
+
+	/* The exact mode always satisfies the check */
+	if (LockHeldByMe(&locktag, lockmode))
+		return true;
+
+	/* Without 'orstronger' there is nothing else to try */
+	if (!orstronger)
+		return false;
+
+	/* Probe each higher-numbered mode in ascending order */
+	for (mode = lockmode + 1; mode <= MaxLockMode; mode++)
+	{
+		if (!LockHeldByMe(&locktag, mode))
+			continue;
+#ifdef NOT_USED
+		/* Sometimes this might be useful for debugging purposes */
+		elog(WARNING, "lock mode %s substituted for %s on relation %s",
+			 GetLockmodeName(locktag.locktag_lockmethodid, mode),
+			 GetLockmodeName(locktag.locktag_lockmethodid, lockmode),
+			 RelationGetRelationName(relation));
+#endif
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ *		LockHasWaitersRelation
+ *
+ * Check whether someone else is waiting for a lock on 'relation' which
+ * we are currently holding in mode 'lockmode'.
+ */
+bool
+LockHasWaitersRelation(Relation relation, LOCKMODE lockmode)
+{
+	LockRelId  *relid = &relation->rd_lockInfo.lockRelId;
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_RELATION(locktag, relid->dbId, relid->relId);
+
+	/* the answer is whatever LockHasWaiters reports for this tag */
+	return LockHasWaiters(&locktag, lockmode, false);
+}
+
+/*
+ *		LockRelationIdForSession
+ *
+ * Take a session-level lock on the target relation.  Unlike an ordinary
+ * lock, a session lock persists across transaction boundaries; it goes away
+ * when UnlockRelationIdForSession() is called, when an ereport(ERROR)
+ * occurs, or when the backend exits.
+ *
+ * Any transaction that actually uses the rel should also take a
+ * transaction-level lock on it, to ensure that the relcache entry is up to
+ * date.
+ */
+void
+LockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode)
+{
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_RELATION(locktag, relid->dbId, relid->relId);
+
+	(void) LockAcquire(&locktag, lockmode, true, false);
+}
+
+/*
+ *		UnlockRelationIdForSession
+ *			Release a session-level lock taken by LockRelationIdForSession.
+ */
+void
+UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode)
+{
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_RELATION(locktag, relid->dbId, relid->relId);
+	LockRelease(&locktag, lockmode, true);
+}
+
+/*
+ *		LockRelationForExtension
+ *
+ * Acquire the relation extension lock, which interlocks addition of pages
+ * to relations.  We need such locking because bufmgr/smgr definition of
+ * P_NEW is not race-condition-proof.
+ *
+ * We assume the caller is already holding some type of regular lock on
+ * the relation, so no AcceptInvalidationMessages call is needed here.
+ */
+void
+LockRelationForExtension(Relation relation, LOCKMODE lockmode)
+{
+	LockRelId  *relid = &relation->rd_lockInfo.lockRelId;
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_RELATION_EXTEND(locktag, relid->dbId, relid->relId);
+
+	/* a blocking request, so the result code is not examined */
+	(void) LockAcquire(&locktag, lockmode, false, false);
+}
+
+/*
+ *		ConditionalLockRelationForExtension
+ *
+ * As LockRelationForExtension, but only lock if we can get the lock
+ * without blocking.  Returns true iff the lock was acquired.
+ */
+bool
+ConditionalLockRelationForExtension(Relation relation, LOCKMODE lockmode)
+{
+	LockRelId  *relid = &relation->rd_lockInfo.lockRelId;
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_RELATION_EXTEND(locktag, relid->dbId, relid->relId);
+
+	/* anything other than NOT_AVAIL means we now hold the lock */
+	return LockAcquire(&locktag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL;
+}
+
+/*
+ *		RelationExtensionLockWaiterCount
+ *
+ * Return the number of processes waiting for the extension lock of the
+ * given relation.
+ */
+int
+RelationExtensionLockWaiterCount(Relation relation)
+{
+	LockRelId  *relid = &relation->rd_lockInfo.lockRelId;
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_RELATION_EXTEND(locktag, relid->dbId, relid->relId);
+
+	return LockWaiterCount(&locktag);
+}
+
+/*
+ *		UnlockRelationForExtension
+ *			Release the relation extension lock on 'relation'.
+ */
+void
+UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
+{
+	LockRelId  *relid = &relation->rd_lockInfo.lockRelId;
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_RELATION_EXTEND(locktag, relid->dbId, relid->relId);
+
+	LockRelease(&locktag, lockmode, false);
+}
+
+/*
+ *		LockDatabaseFrozenIds
+ *
+ * Take the per-database lock that allows one backend per database to
+ * execute vac_update_datfrozenxid().
+ */
+void
+LockDatabaseFrozenIds(LOCKMODE lockmode)
+{
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag, MyDatabaseId);
+	(void) LockAcquire(&locktag, lockmode, false, false);
+}
+
+/*
+ *		LockPage
+ *
+ * Obtain a page-level lock on block 'blkno' of 'relation', waiting if
+ * necessary.  This is currently used by some index access methods to lock
+ * individual index pages.
+ */
+void
+LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
+{
+	LockRelId  *relid = &relation->rd_lockInfo.lockRelId;
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_PAGE(locktag, relid->dbId, relid->relId, blkno);
+
+	/* a blocking request, so the result code is not examined */
+	(void) LockAcquire(&locktag, lockmode, false, false);
+}
+
+/*
+ *		ConditionalLockPage
+ *
+ * As LockPage, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ */
+bool
+ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
+{
+	LockRelId  *relid = &relation->rd_lockInfo.lockRelId;
+	LOCKTAG		locktag;
+	LockAcquireResult result;
+
+	SET_LOCKTAG_PAGE(locktag, relid->dbId, relid->relId, blkno);
+	result = LockAcquire(&locktag, lockmode, false, true);
+
+	return result != LOCKACQUIRE_NOT_AVAIL;
+}
+
+/*
+ *		UnlockPage
+ *
+ * Release a page-level lock taken with LockPage or ConditionalLockPage.
+ */
+void
+UnlockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
+{
+	LockRelId  *relid = &relation->rd_lockInfo.lockRelId;
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_PAGE(locktag, relid->dbId, relid->relId, blkno);
+
+	LockRelease(&locktag, lockmode, false);
+}
+
+/*
+ *		LockTuple
+ *
+ * Obtain a tuple-level lock on the tuple at 'tid' in 'relation'.  This is
+ * used in a less-than-intuitive fashion because we can't afford to keep a
+ * separate lock in shared memory for every tuple.  See heap_lock_tuple
+ * before using this!
+ */
+void
+LockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode)
+{
+	LockRelId  *relid = &relation->rd_lockInfo.lockRelId;
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_TUPLE(locktag, relid->dbId, relid->relId,
+					  ItemPointerGetBlockNumber(tid),
+					  ItemPointerGetOffsetNumber(tid));
+
+	(void) LockAcquire(&locktag, lockmode, false, false);
+}
+
+/*
+ *		ConditionalLockTuple
+ *
+ * As LockTuple, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ */
+bool
+ConditionalLockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode)
+{
+	LockRelId  *relid = &relation->rd_lockInfo.lockRelId;
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_TUPLE(locktag, relid->dbId, relid->relId,
+					  ItemPointerGetBlockNumber(tid),
+					  ItemPointerGetOffsetNumber(tid));
+
+	/* anything other than NOT_AVAIL means we now hold the lock */
+	return LockAcquire(&locktag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL;
+}
+
+/*
+ *		UnlockTuple
+ *			Release a tuple-level lock on the tuple at 'tid' in 'relation'.
+ */
+void
+UnlockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode)
+{
+	LockRelId  *relid = &relation->rd_lockInfo.lockRelId;
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_TUPLE(locktag, relid->dbId, relid->relId,
+					  ItemPointerGetBlockNumber(tid),
+					  ItemPointerGetOffsetNumber(tid));
+
+	LockRelease(&locktag, lockmode, false);
+}
+
+/*
+ *		XactLockTableInsert
+ *
+ * Take an exclusive lock on the given transaction ID, showing that it is
+ * running.  This is done when an XID is acquired by a transaction or
+ * subtransaction; others can then wait on the lock for it to finish.
+ */
+void
+XactLockTableInsert(TransactionId xid)
+{
+	LOCKTAG		xidtag;
+
+	SET_LOCKTAG_TRANSACTION(xidtag, xid);
+
+	(void) LockAcquire(&xidtag, ExclusiveLock, false, false);
+}
+
+/*
+ *		XactLockTableDelete
+ *
+ * Release the lock showing that the given transaction ID is running.
+ * (Only subtransaction IDs are handled this way; the lock on a main
+ * transaction ID is released implicitly at transaction end.)
+ */
+void
+XactLockTableDelete(TransactionId xid)
+{
+	LOCKTAG		xidtag;
+
+	SET_LOCKTAG_TRANSACTION(xidtag, xid);
+
+	LockRelease(&xidtag, ExclusiveLock, false);
+}
+
+/*
+ *		XactLockTableWait
+ *
+ * Wait for the specified transaction to commit or abort.  Unless 'oper' is
+ * XLTW_None, an error context callback describing the tuple (rel, ctid)
+ * being waited for is installed for the duration of the wait.
+ *
+ * Note that this does the right thing for subtransactions: if we wait on a
+ * subtransaction, we will exit as soon as it aborts or its top parent commits.
+ * It takes some extra work to ensure this, because to save on shared memory
+ * the XID lock of a subtransaction is released when it ends, whether
+ * successfully or unsuccessfully.  So we have to check if it's "still running"
+ * and if so wait for its parent.
+ */
+void
+XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid,
+				  XLTW_Oper oper)
+{
+	LOCKTAG		locktag;
+	XactLockTableWaitInfo waitinfo;
+	ErrorContextCallback errcb;
+	bool		retried = false;
+
+	/*
+	 * If an operation is specified, push our verbose error context callback
+	 * onto the error context stack.
+	 */
+	if (oper != XLTW_None)
+	{
+		Assert(RelationIsValid(rel));
+		Assert(ItemPointerIsValid(ctid));
+
+		waitinfo.oper = oper;
+		waitinfo.rel = rel;
+		waitinfo.ctid = ctid;
+
+		errcb.callback = XactLockTableWaitErrorCb;
+		errcb.arg = &waitinfo;
+		errcb.previous = error_context_stack;
+		error_context_stack = &errcb;
+	}
+
+	for (;;)
+	{
+		Assert(TransactionIdIsValid(xid));
+		Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
+
+		/*
+		 * Block until the XID's lock can be shared, then drop it at once:
+		 * only the wait matters, not holding the lock.
+		 */
+		SET_LOCKTAG_TRANSACTION(locktag, xid);
+		(void) LockAcquire(&locktag, ShareLock, false, false);
+		LockRelease(&locktag, ShareLock, false);
+
+		if (!TransactionIdIsInProgress(xid))
+			break;
+
+		/*
+		 * If the Xid belonged to a subtransaction, its lock went away as
+		 * soon as it finished; for correct tuple visibility we must then
+		 * wait for its parent.  Rather than climbing one level at a time,
+		 * wait for the topmost transaction, which gives the same end result
+		 * with less locktable traffic.
+		 *
+		 * Some callers (e.g. building snapshots for logical decoding) can
+		 * also see a transaction in ProcArray before it has registered
+		 * itself in the locktable.  The topmost transaction is then the
+		 * same xid, so we retry after a short sleep -- but not on the first
+		 * pass, to avoid slowing down the normal case.
+		 */
+		if (retried)
+			pg_usleep(1000L);
+		retried = true;
+		xid = SubTransGetTopmostTransaction(xid);
+	}
+
+	if (oper != XLTW_None)
+		error_context_stack = errcb.previous;
+}
+
+/*
+ *		ConditionalXactLockTableWait
+ *
+ * As XactLockTableWait, but never blocks on the lock.  Returns false if a
+ * needed lock was not immediately available, true otherwise.
+ */
+bool
+ConditionalXactLockTableWait(TransactionId xid)
+{
+	LOCKTAG		locktag;
+	bool		retried = false;
+
+	for (;;)
+	{
+		Assert(TransactionIdIsValid(xid));
+		Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
+
+		SET_LOCKTAG_TRANSACTION(locktag, xid);
+
+		if (LockAcquire(&locktag, ShareLock, false, true) ==
+			LOCKACQUIRE_NOT_AVAIL)
+			return false;
+		LockRelease(&locktag, ShareLock, false);
+
+		if (!TransactionIdIsInProgress(xid))
+			break;
+
+		/* See XactLockTableWait about this case */
+		if (retried)
+			pg_usleep(1000L);
+		retried = true;
+		xid = SubTransGetTopmostTransaction(xid);
+	}
+
+	return true;
+}
+
+/*
+ *		SpeculativeInsertionLockAcquire
+ *
+ * Take an exclusive lock advertising that transaction 'xid' is inserting a
+ * tuple but has not yet decided whether to keep it.  Other backends can
+ * wait on that lock (see SpeculativeInsertionWait) until the insertion is
+ * either confirmed or aborted.
+ *
+ * A fresh token is generated for each call so that multiple insertions by
+ * the same transaction can be told apart; it is part of the lock tag and
+ * is returned to the caller.
+ */
+uint32
+SpeculativeInsertionLockAcquire(TransactionId xid)
+{
+	LOCKTAG		locktag;
+
+	/*
+	 * Advance the per-backend counter.  Zero means no token is held, so
+	 * skip over it if the counter has just wrapped around.
+	 */
+	if (++speculativeInsertionToken == 0)
+		speculativeInsertionToken = 1;
+
+	SET_LOCKTAG_SPECULATIVE_INSERTION(locktag, xid, speculativeInsertionToken);
+
+	(void) LockAcquire(&locktag, ExclusiveLock, false, false);
+
+	return speculativeInsertionToken;
+}
+
+/*
+ *		SpeculativeInsertionLockRelease
+ *
+ * Release the speculative insertion lock of transaction 'xid' for the
+ * current value of speculativeInsertionToken.
+ */
+void
+SpeculativeInsertionLockRelease(TransactionId xid)
+{
+	LOCKTAG		locktag;
+
+	SET_LOCKTAG_SPECULATIVE_INSERTION(locktag, xid, speculativeInsertionToken);
+
+	LockRelease(&locktag, ExclusiveLock, false);
+}
+
+/*
+ *		SpeculativeInsertionWait
+ *
+ * Wait until transaction 'xid' has finished or aborted the speculative
+ * insertion identified by 'token'.
+ */
+void
+SpeculativeInsertionWait(TransactionId xid, uint32 token)
+{
+	LOCKTAG		locktag;
+
+	Assert(TransactionIdIsValid(xid));
+	Assert(token != 0);
+
+	SET_LOCKTAG_SPECULATIVE_INSERTION(locktag, xid, token);
+	/* acquire and release at once: only the wait itself is wanted */
+	(void) LockAcquire(&locktag, ShareLock, false, false);
+	LockRelease(&locktag, ShareLock, false);
+}
+
+/*
+ * XactLockTableWaitErrorCb
+ *		Error context callback for transaction lock waits: adds a context
+ *		line naming the operation, the tuple's (block, offset) and the
+ *		relation.  'arg' points to a XactLockTableWaitInfo.
+ */
+static void
+XactLockTableWaitErrorCb(void *arg)
+{
+	XactLockTableWaitInfo *waitinfo = (XactLockTableWaitInfo *) arg;
+	const char *fmt;
+
+	/* Nothing to report without an operation, a valid TID and a relation */
+	if (waitinfo->oper == XLTW_None ||
+		!ItemPointerIsValid(waitinfo->ctid) ||
+		!RelationIsValid(waitinfo->rel))
+		return;
+
+	/* Schema name is omitted: printing it would need a syscache lookup */
+	switch (waitinfo->oper)
+	{
+		case XLTW_Update:
+			fmt = gettext_noop("while updating tuple (%u,%u) in relation \"%s\"");
+			break;
+		case XLTW_Delete:
+			fmt = gettext_noop("while deleting tuple (%u,%u) in relation \"%s\"");
+			break;
+		case XLTW_Lock:
+			fmt = gettext_noop("while locking tuple (%u,%u) in relation \"%s\"");
+			break;
+		case XLTW_LockUpdated:
+			fmt = gettext_noop("while locking updated version (%u,%u) of tuple in relation \"%s\"");
+			break;
+		case XLTW_InsertIndex:
+			fmt = gettext_noop("while inserting index tuple (%u,%u) in relation \"%s\"");
+			break;
+		case XLTW_InsertIndexUnique:
+			fmt = gettext_noop("while checking uniqueness of tuple (%u,%u) in relation \"%s\"");
+			break;
+		case XLTW_FetchUpdated:
+			fmt = gettext_noop("while rechecking updated tuple (%u,%u) in relation \"%s\"");
+			break;
+		case XLTW_RecheckExclusionConstr:
+			fmt = gettext_noop("while checking exclusion constraint on tuple (%u,%u) in relation \"%s\"");
+			break;
+
+		default:
+			return;
+	}
+
+	errcontext(fmt,
+			   ItemPointerGetBlockNumber(waitinfo->ctid),
+			   ItemPointerGetOffsetNumber(waitinfo->ctid),
+			   RelationGetRelationName(waitinfo->rel));
+}
+
+/*
+ * WaitForLockersMultiple
+ *		Wait until no transaction holds locks that conflict with the given
+ *		locktags at the given lockmode.
+ *
+ * This takes a one-time list of the conflicting lockers of every tag and
+ * waits on each of their VXIDs in turn.  If 'progress' is true, the total,
+ * the number done and the PID being waited for are reported as progress.
+ *
+ * Note we don't try to acquire the locks on the given locktags themselves;
+ * anybody who grabs a conflicting lock after that list was obtained is not
+ * waited for.
+ */
+void
+WaitForLockersMultiple(List *locktags, LOCKMODE lockmode, bool progress)
+{
+	List	   *conflicts = NIL;
+	ListCell   *cell;
+	int			nwaits = 0;
+	int			nfinished = 0;
+
+	/* Done if no locks to wait for */
+	if (locktags == NIL)
+		return;
+
+	/* Collect the conflicting lockers of every tag, counting if asked to */
+	foreach(cell, locktags)
+	{
+		LOCKTAG    *locktag = lfirst(cell);
+		int			nconflicts;
+
+		conflicts = lappend(conflicts,
+							GetLockConflicts(locktag, lockmode,
+											 progress ? &nconflicts : NULL));
+		if (progress)
+			nwaits += nconflicts;
+	}
+
+	if (progress)
+		pgstat_progress_update_param(PROGRESS_WAITFOR_TOTAL, nwaits);
+
+	/*
+	 * Note: GetLockConflicts() never reports our own xid, hence we need not
+	 * check for that.  Also, prepared xacts are reported and awaited.
+	 */
+
+	/* Finally wait for each such transaction to complete */
+	foreach(cell, conflicts)
+	{
+		VirtualTransactionId *vxid;
+
+		/* each array ends at the first invalid VXID */
+		for (vxid = lfirst(cell); VirtualTransactionIdIsValid(*vxid); vxid++)
+		{
+			/* If requested, publish who we're going to wait for. */
+			if (progress)
+			{
+				PGPROC	   *blocker = BackendIdGetProc(vxid->backendId);
+
+				if (blocker)
+					pgstat_progress_update_param(PROGRESS_WAITFOR_CURRENT_PID,
+												 blocker->pid);
+			}
+			VirtualXactLock(*vxid, true);
+			if (progress)
+				pgstat_progress_update_param(PROGRESS_WAITFOR_DONE, ++nfinished);
+		}
+	}
+	/* Reset all three progress parameters now that the waiting is over */
+	if (progress)
+	{
+		const int	index[] = {
+			PROGRESS_WAITFOR_TOTAL,
+			PROGRESS_WAITFOR_DONE,
+			PROGRESS_WAITFOR_CURRENT_PID
+		};
+		const int64 values[] = {
+			0, 0, 0
+		};
+
+		pgstat_progress_update_multi_param(3, index, values);
+	}
+
+	list_free_deep(conflicts);
+}
+
+/*
+ * WaitForLockers
+ *
+ * Single-locktag convenience form of WaitForLockersMultiple: the tag is
+ * wrapped in a temporary one-element list for the duration of the call.
+ */
+void
+WaitForLockers(LOCKTAG heaplocktag, LOCKMODE lockmode, bool progress)
+{
+	List	   *single = list_make1(&heaplocktag);
+
+	WaitForLockersMultiple(single, lockmode, progress);
+
+	/* Only the list itself is freed; the tag lives in our parameter */
+	list_free(single);
+}
+
+
+/*
+ *		LockDatabaseObject
+ *
+ * Lock a general (non-relation) object that belongs to the current
+ * database; the tag is built from MyDatabaseId plus the given class OID,
+ * object OID and sub-id.  Shared objects such as tablespaces must not be
+ * locked this way.  Relations should not be either: a lock taken here will
+ * NOT conflict with locks taken via LockRelation and friends.
+ */
+void
+LockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
+				   LOCKMODE lockmode)
+{
+	LOCKTAG		objtag;
+
+	SET_LOCKTAG_OBJECT(objtag, MyDatabaseId, classid, objid, objsubid);
+
+	/* sessionLock = false, dontWait = false */
+	(void) LockAcquire(&objtag, lockmode, false, false);
+
+	/* We may have waited; bring syscaches up to date with what changed */
+	AcceptInvalidationMessages();
+}
+
+/*
+ *		UnlockDatabaseObject
+ *
+ * Release a lock on an object of the current database.  The tag is built
+ * exactly as in LockDatabaseObject, so the same arguments address the same
+ * lock.
+ */
+void
+UnlockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
+					 LOCKMODE lockmode)
+{
+	LOCKTAG		objtag;
+
+	SET_LOCKTAG_OBJECT(objtag, MyDatabaseId, classid, objid, objsubid);
+
+	LockRelease(&objtag, lockmode, false);
+}
+
+/*
+ *		LockSharedObject
+ *
+ * Lock an object that is shared across databases.  The tag carries
+ * InvalidOid in the database slot instead of MyDatabaseId.
+ */
+void
+LockSharedObject(Oid classid, Oid objid, uint16 objsubid,
+				 LOCKMODE lockmode)
+{
+	LOCKTAG		sharedtag;
+
+	SET_LOCKTAG_OBJECT(sharedtag, InvalidOid, classid, objid, objsubid);
+
+	/* sessionLock = false, dontWait = false */
+	(void) LockAcquire(&sharedtag, lockmode, false, false);
+
+	/* We may have waited; bring syscaches up to date with what changed */
+	AcceptInvalidationMessages();
+}
+
+/*
+ *		UnlockSharedObject
+ *
+ * Release a lock on a shared-across-databases object.  The tag is built
+ * exactly as in LockSharedObject (InvalidOid in the database slot).
+ */
+void
+UnlockSharedObject(Oid classid, Oid objid, uint16 objsubid,
+				   LOCKMODE lockmode)
+{
+	LOCKTAG		sharedtag;
+
+	SET_LOCKTAG_OBJECT(sharedtag, InvalidOid, classid, objid, objsubid);
+
+	LockRelease(&sharedtag, lockmode, false);
+}
+
+/*
+ *		LockSharedObjectForSession
+ *
+ * Session-level variant of locking a shared-across-databases object: the
+ * lock is requested with sessionLock = true.  See LockRelationIdForSession
+ * for notes about session-level locks.  Unlike the transaction-level
+ * variant, no invalidation messages are processed afterwards.
+ */
+void
+LockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid,
+						   LOCKMODE lockmode)
+{
+	LOCKTAG		sharedtag;
+
+	SET_LOCKTAG_OBJECT(sharedtag, InvalidOid, classid, objid, objsubid);
+
+	/* sessionLock = true, dontWait = false */
+	(void) LockAcquire(&sharedtag, lockmode, true, false);
+}
+
+/*
+ *		UnlockSharedObjectForSession
+ *
+ * Release a session-level lock on a shared-across-databases object.  The
+ * tag is built exactly as in LockSharedObjectForSession.
+ */
+void
+UnlockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid,
+							 LOCKMODE lockmode)
+{
+	LOCKTAG		sharedtag;
+
+	SET_LOCKTAG_OBJECT(sharedtag, InvalidOid, classid, objid, objsubid);
+
+	LockRelease(&sharedtag, lockmode, true);
+}
+
+/*
+ *		LockApplyTransactionForSession
+ *
+ * Take a session-level lock on a transaction that is being applied on a
+ * logical replication subscriber.  The tag is built from MyDatabaseId, the
+ * subscription OID, the transaction id and 'objid'.  See
+ * LockRelationIdForSession for notes about session-level locks.
+ */
+void
+LockApplyTransactionForSession(Oid suboid, TransactionId xid, uint16 objid,
+							   LOCKMODE lockmode)
+{
+	LOCKTAG		applytag;
+
+	SET_LOCKTAG_APPLY_TRANSACTION(applytag, MyDatabaseId, suboid, xid, objid);
+
+	/* sessionLock = true, dontWait = false */
+	(void) LockAcquire(&applytag, lockmode, true, false);
+}
+
+/*
+ *		UnlockApplyTransactionForSession
+ *
+ * Release a session-level lock on a transaction being applied by a logical
+ * replication subscriber.  The tag is built exactly as in
+ * LockApplyTransactionForSession.
+ */
+void
+UnlockApplyTransactionForSession(Oid suboid, TransactionId xid, uint16 objid,
+								 LOCKMODE lockmode)
+{
+	LOCKTAG		applytag;
+
+	SET_LOCKTAG_APPLY_TRANSACTION(applytag, MyDatabaseId, suboid, xid, objid);
+
+	LockRelease(&applytag, lockmode, true);
+}
+
+/*
+ * Append a description of a lockable object to buf.
+ *
+ * The text is chosen by the tag's locktag_type and is built purely from the
+ * numeric locktag fields.  Ideally we would print names for the numeric
+ * values, but that requires getting locks on system tables, which might
+ * cause problems since this is typically used to report deadlock situations.
+ *
+ * An unknown locktag_type is reported as "unrecognized locktag type N"
+ * rather than raising an error.
+ */
+void
+DescribeLockTag(StringInfo buf, const LOCKTAG *tag)
+{
+	switch ((LockTagType) tag->locktag_type)
+	{
+		case LOCKTAG_RELATION:
+			appendStringInfo(buf,
+							 _("relation %u of database %u"),
+							 tag->locktag_field2,
+							 tag->locktag_field1);
+			break;
+		case LOCKTAG_RELATION_EXTEND:
+			appendStringInfo(buf,
+							 _("extension of relation %u of database %u"),
+							 tag->locktag_field2,
+							 tag->locktag_field1);
+			break;
+		case LOCKTAG_DATABASE_FROZEN_IDS:
+			appendStringInfo(buf,
+							 _("pg_database.datfrozenxid of database %u"),
+							 tag->locktag_field1);
+			break;
+		case LOCKTAG_PAGE:
+			appendStringInfo(buf,
+							 _("page %u of relation %u of database %u"),
+							 tag->locktag_field3,
+							 tag->locktag_field2,
+							 tag->locktag_field1);
+			break;
+		case LOCKTAG_TUPLE:
+			appendStringInfo(buf,
+							 _("tuple (%u,%u) of relation %u of database %u"),
+							 tag->locktag_field3,
+							 tag->locktag_field4,
+							 tag->locktag_field2,
+							 tag->locktag_field1);
+			break;
+		case LOCKTAG_TRANSACTION:
+			appendStringInfo(buf,
+							 _("transaction %u"),
+							 tag->locktag_field1);
+			break;
+		case LOCKTAG_VIRTUALTRANSACTION:
+
+			/*
+			 * The first component is printed with %d, so cast the unsigned
+			 * tag field to int to make the argument type match the
+			 * conversion specifier.
+			 */
+			appendStringInfo(buf,
+							 _("virtual transaction %d/%u"),
+							 (int) tag->locktag_field1,
+							 tag->locktag_field2);
+			break;
+		case LOCKTAG_SPECULATIVE_TOKEN:
+			appendStringInfo(buf,
+							 _("speculative token %u of transaction %u"),
+							 tag->locktag_field2,
+							 tag->locktag_field1);
+			break;
+		case LOCKTAG_OBJECT:
+			appendStringInfo(buf,
+							 _("object %u of class %u of database %u"),
+							 tag->locktag_field3,
+							 tag->locktag_field2,
+							 tag->locktag_field1);
+			break;
+		case LOCKTAG_USERLOCK:
+			/* reserved for old contrib code, now on pgfoundry */
+			appendStringInfo(buf,
+							 _("user lock [%u,%u,%u]"),
+							 tag->locktag_field1,
+							 tag->locktag_field2,
+							 tag->locktag_field3);
+			break;
+		case LOCKTAG_ADVISORY:
+			appendStringInfo(buf,
+							 _("advisory lock [%u,%u,%u,%u]"),
+							 tag->locktag_field1,
+							 tag->locktag_field2,
+							 tag->locktag_field3,
+							 tag->locktag_field4);
+			break;
+		case LOCKTAG_APPLY_TRANSACTION:
+			appendStringInfo(buf,
+							 _("remote transaction %u of subscription %u of database %u"),
+							 tag->locktag_field3,
+							 tag->locktag_field2,
+							 tag->locktag_field1);
+			break;
+		default:
+			appendStringInfo(buf,
+							 _("unrecognized locktag type %d"),
+							 (int) tag->locktag_type);
+			break;
+	}
+}
+
+/*
+ * GetLockNameFromTagType
+ *
+ *	Map a locktag type code to its name from LockTagTypeNames[].  Codes
+ *	above LOCKTAG_LAST_TYPE have no entry and yield "???" instead.
+ */
+const char *
+GetLockNameFromTagType(uint16 locktag_type)
+{
+	return (locktag_type <= LOCKTAG_LAST_TYPE) ?
+		LockTagTypeNames[locktag_type] : "???";
+}
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
new file mode 100644
index 0000000..ec6240f
--- /dev/null
+++ b/src/backend/storage/lmgr/lock.c
@@ -0,0 +1,4651 @@
+/*-------------------------------------------------------------------------
+ *
+ * lock.c
+ *	  POSTGRES primary lock mechanism
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/lmgr/lock.c
+ *
+ * NOTES
+ *	  A lock table is a shared memory hash table.  When
+ *	  a process tries to acquire a lock of a type that conflicts
+ *	  with existing locks, it is put to sleep using the routines
+ *	  in storage/lmgr/proc.c.
+ *
+ *	  For the most part, this code should be invoked via lmgr.c
+ *	  or another lock-management module, not directly.
+ *
+ *	Interface:
+ *
+ *	InitLocks(), GetLocksMethodTable(), GetLockTagsMethodTable(),
+ *	LockAcquire(), LockRelease(), LockReleaseAll(),
+ *	LockCheckConflicts(), GrantLock()
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/twophase_rmgr.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/sinvaladt.h"
+#include "storage/spin.h"
+#include "storage/standby.h"
+#include "utils/memutils.h"
+#include "utils/ps_status.h"
+#include "utils/resowner_private.h"
+
+
+/*
+ * This configuration variable is used to set the lock table size.
+ *
+ * NLOCKENTS() below is max_locks_per_xact multiplied by the sum of
+ * MaxBackends and max_prepared_xacts, computed with mul_size/add_size;
+ * InitLocks() uses the result as the maximum size of the LOCK hash table.
+ */
+int			max_locks_per_xact; /* set by guc.c */
+
+#define NLOCKENTS() \
+	mul_size(max_locks_per_xact, add_size(MaxBackends, max_prepared_xacts))
+
+
+/*
+ * Data structures defining the semantics of the standard lock methods.
+ *
+ * The conflict table defines the semantics of the various lock modes.
+ *
+ * Each entry is a bitmask, built with LOCKBIT_ON(), of the lock modes that
+ * conflict with the mode named in the comment above it.  The entries are in
+ * the same order as lock_mode_names[], so the array is indexed by lock mode
+ * number with slot 0 unused.
+ */
+static const LOCKMASK LockConflicts[] = {
+	0,							/* slot 0 is not a lock mode: no conflicts */
+
+	/* AccessShareLock: conflicts only with AccessExclusiveLock */
+	LOCKBIT_ON(AccessExclusiveLock),
+
+	/* RowShareLock: conflicts with ExclusiveLock and AccessExclusiveLock */
+	LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+	/*
+	 * RowExclusiveLock: conflicts with ShareLock, ShareRowExclusiveLock,
+	 * ExclusiveLock and AccessExclusiveLock
+	 */
+	LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+	LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+	/*
+	 * ShareUpdateExclusiveLock: conflicts with itself, ShareLock,
+	 * ShareRowExclusiveLock, ExclusiveLock and AccessExclusiveLock
+	 */
+	LOCKBIT_ON(ShareUpdateExclusiveLock) |
+	LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+	LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+	/*
+	 * ShareLock: conflicts with RowExclusiveLock, ShareUpdateExclusiveLock,
+	 * ShareRowExclusiveLock, ExclusiveLock and AccessExclusiveLock, but not
+	 * with itself
+	 */
+	LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+	LOCKBIT_ON(ShareRowExclusiveLock) |
+	LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+	/*
+	 * ShareRowExclusiveLock: conflicts with everything ShareLock conflicts
+	 * with, plus ShareLock and itself
+	 */
+	LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+	LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+	LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+	/* ExclusiveLock: conflicts with every mode except AccessShareLock */
+	LOCKBIT_ON(RowShareLock) |
+	LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+	LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+	LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+	/* AccessExclusiveLock: conflicts with every mode, itself included */
+	LOCKBIT_ON(AccessShareLock) | LOCKBIT_ON(RowShareLock) |
+	LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+	LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+	LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock)
+
+};
+
+/*
+ * Names of lock modes, for debug printouts.
+ *
+ * Entry 0 ("INVALID") is a placeholder, so the eight real modes occupy
+ * indexes 1 through 8 in the same order as the LockConflicts[] entries.
+ */
+static const char *const lock_mode_names[] =
+{
+	"INVALID",
+	"AccessShareLock",
+	"RowShareLock",
+	"RowExclusiveLock",
+	"ShareUpdateExclusiveLock",
+	"ShareLock",
+	"ShareRowExclusiveLock",
+	"ExclusiveLock",
+	"AccessExclusiveLock"
+};
+
+#ifndef LOCK_DEBUG
+static bool Dummy_trace = false;	/* stand-in trace flag when LOCK_DEBUG is off */
+#endif
+
+static const LockMethodData default_lockmethod = {	/* the standard lock method */
+	MaxLockMode,
+	LockConflicts,				/* conflict bitmask table defined above */
+	lock_mode_names,			/* printable mode names defined above */
+#ifdef LOCK_DEBUG
+	&Trace_locks				/* tracing is switched by Trace_locks */
+#else
+	&Dummy_trace				/* always false: tracing is unavailable */
+#endif
+};
+
+static const LockMethodData user_lockmethod = {	/* same tables, separate trace flag */
+	MaxLockMode,
+	LockConflicts,
+	lock_mode_names,
+#ifdef LOCK_DEBUG
+	&Trace_userlocks			/* tracing is switched by Trace_userlocks */
+#else
+	&Dummy_trace				/* always false: tracing is unavailable */
+#endif
+};
+
+/*
+ * map from lock method id to the lock table data structures
+ *
+ * Slot 0 is NULL: callers in this file reject a lock method id of 0 (or one
+ * that is >= lengthof(LockMethods)) before indexing into this array.
+ */
+static const LockMethod LockMethods[] = {
+	NULL,
+	&default_lockmethod,		/* presumably DEFAULT_LOCKMETHOD -- TODO confirm */
+	&user_lockmethod
+};
+
+
+/* Record that's written to 2PC state file when a lock is persisted */
+typedef struct TwoPhaseLockRecord
+{
+	LOCKTAG		locktag;		/* identifies the locked object */
+	LOCKMODE	lockmode;		/* lock mode recorded for that object */
+} TwoPhaseLockRecord;
+
+
+/*
+ * Count of the number of fast path lock slots we believe to be used.  This
+ * might be higher than the real number if another backend has transferred
+ * our locks to the primary lock table, but it can never be lower than the
+ * real value, since only we can acquire locks on our own behalf.
+ *
+ * This is a backend-local (static) counter that starts out at zero.
+ */
+static int	FastPathLocalUseCount = 0;
+
+/*
+ * Flag to indicate if the relation extension lock is held by this backend.
+ * This flag is used to ensure that while holding the relation extension lock
+ * we don't try to acquire a heavyweight lock on any other object.  This
+ * restriction implies that the relation extension lock won't ever participate
+ * in the deadlock cycle because we can never wait for any other heavyweight
+ * lock after acquiring this lock.
+ *
+ * Such a restriction is okay for relation extension locks as unlike other
+ * heavyweight locks these are not held till the transaction end.  These are
+ * taken for a short duration to extend a particular relation and then
+ * released.
+ *
+ * NOTE(review): the PG_USED_FOR_ASSERTS_ONLY marking suggests the flag is
+ * only read inside assertions -- confirm against its users.
+ */
+static bool IsRelationExtensionLockHeld PG_USED_FOR_ASSERTS_ONLY = false;
+
+/*
+ * Macros for manipulating proc->fpLockBits
+ *
+ * Each fast-path slot n owns FAST_PATH_BITS_PER_SLOT consecutive bits of
+ * fpLockBits, one bit per lock mode; lock mode FAST_PATH_LOCKNUMBER_OFFSET
+ * maps to the lowest bit of the slot.
+ *
+ *	FAST_PATH_GET_BITS		- the slot's bits, shifted down to bit 0
+ *	FAST_PATH_BIT_POSITION	- absolute bit number of mode l in slot n
+ *	FAST_PATH_SET_LOCKMODE	- set the bit for mode l in slot n
+ *	FAST_PATH_CLEAR_LOCKMODE - clear the bit for mode l in slot n
+ *	FAST_PATH_CHECK_LOCKMODE - nonzero if the bit for mode l in slot n is set
+ *
+ * Every macro argument is parenthesized in the expansion and each expansion
+ * is a single parenthesized expression, so compound arguments such as
+ * "i + 1" and use inside larger expressions behave as expected.
+ */
+#define FAST_PATH_BITS_PER_SLOT			3
+#define FAST_PATH_LOCKNUMBER_OFFSET		1
+#define FAST_PATH_MASK					((1 << FAST_PATH_BITS_PER_SLOT) - 1)
+#define FAST_PATH_GET_BITS(proc, n) \
+	(((proc)->fpLockBits >> (FAST_PATH_BITS_PER_SLOT * (n))) & FAST_PATH_MASK)
+#define FAST_PATH_BIT_POSITION(n, l) \
+	(AssertMacro((l) >= FAST_PATH_LOCKNUMBER_OFFSET), \
+	 AssertMacro((l) < FAST_PATH_BITS_PER_SLOT+FAST_PATH_LOCKNUMBER_OFFSET), \
+	 AssertMacro((n) < FP_LOCK_SLOTS_PER_BACKEND), \
+	 ((l) - FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT * (n)))
+#define FAST_PATH_SET_LOCKMODE(proc, n, l) \
+	 ((proc)->fpLockBits |= UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l))
+#define FAST_PATH_CLEAR_LOCKMODE(proc, n, l) \
+	 ((proc)->fpLockBits &= ~(UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)))
+#define FAST_PATH_CHECK_LOCKMODE(proc, n, l) \
+	 ((proc)->fpLockBits & (UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)))
+
+/*
+ * The fast-path lock mechanism is concerned only with relation locks on
+ * unshared relations by backends bound to a database.  The fast-path
+ * mechanism exists mostly to accelerate acquisition and release of locks
+ * that rarely conflict.  Because ShareUpdateExclusiveLock is
+ * self-conflicting, it can't use the fast-path mechanism; but it also does
+ * not conflict with any of the locks that do, so we can ignore it completely.
+ *
+ * EligibleForRelationFastPath is true for a default-lock-method
+ * LOCKTAG_RELATION tag whose locktag_field1 equals a valid MyDatabaseId and
+ * whose mode is weaker than ShareUpdateExclusiveLock.
+ *
+ * ConflictsWithRelationFastPath is true for a default-lock-method
+ * LOCKTAG_RELATION tag whose locktag_field1 is not InvalidOid and whose mode
+ * is stronger than ShareUpdateExclusiveLock.
+ */
+#define EligibleForRelationFastPath(locktag, mode) \
+	((locktag)->locktag_lockmethodid == DEFAULT_LOCKMETHOD && \
+	(locktag)->locktag_type == LOCKTAG_RELATION && \
+	(locktag)->locktag_field1 == MyDatabaseId && \
+	MyDatabaseId != InvalidOid && \
+	(mode) < ShareUpdateExclusiveLock)
+#define ConflictsWithRelationFastPath(locktag, mode) \
+	((locktag)->locktag_lockmethodid == DEFAULT_LOCKMETHOD && \
+	(locktag)->locktag_type == LOCKTAG_RELATION && \
+	(locktag)->locktag_field1 != InvalidOid && \
+	(mode) > ShareUpdateExclusiveLock)
+
+static bool FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode);
+static bool FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode);
+static bool FastPathTransferRelationLocks(LockMethod lockMethodTable,
+										  const LOCKTAG *locktag, uint32 hashcode);
+static PROCLOCK *FastPathGetRelationLockEntry(LOCALLOCK *locallock);
+
+/*
+ * To make the fast-path lock mechanism work, we must have some way of
+ * preventing the use of the fast-path when a conflicting lock might be present.
+ * We partition the locktag space into FAST_PATH_STRONG_LOCK_HASH_PARTITIONS,
+ * and maintain an integer count of the number of "strong" lockers
+ * in each partition.  When any "strong" lockers are present (which is
+ * hopefully not very often), the fast-path mechanism can't be used, and we
+ * must fall back to the slower method of pushing matching locks directly
+ * into the main lock tables.
+ *
+ * The deadlock detector does not know anything about the fast path mechanism,
+ * so any locks that might be involved in a deadlock must be transferred from
+ * the fast-path queues to the main lock table.
+ */
+
+#define FAST_PATH_STRONG_LOCK_HASH_BITS			10	/* 2^10 = 1024 partitions */
+#define FAST_PATH_STRONG_LOCK_HASH_PARTITIONS \
+	(1 << FAST_PATH_STRONG_LOCK_HASH_BITS)
+#define FastPathStrongLockHashPartition(hashcode) \
+	((hashcode) % FAST_PATH_STRONG_LOCK_HASH_PARTITIONS)
+
+typedef struct
+{
+	slock_t		mutex;			/* spinlock, initialized by InitLocks() */
+	uint32		count[FAST_PATH_STRONG_LOCK_HASH_PARTITIONS];	/* one counter per partition */
+} FastPathStrongRelationLockData;
+
+static volatile FastPathStrongRelationLockData *FastPathStrongRelationLocks;	/* set up in InitLocks() via ShmemInitStruct() */
+
+
+/*
+ * Pointers to hash tables containing lock state
+ *
+ * The LockMethodLockHash and LockMethodProcLockHash hash tables are in
+ * shared memory; LockMethodLocalHash is local to each backend.
+ *
+ * All three are created (or attached to) by InitLocks().
+ */
+static HTAB *LockMethodLockHash;	/* key LOCKTAG, entry LOCK */
+static HTAB *LockMethodProcLockHash;	/* key PROCLOCKTAG, entry PROCLOCK */
+static HTAB *LockMethodLocalHash;	/* key LOCALLOCKTAG, entry LOCALLOCK */
+
+
+/* private state for error cleanup */
+static LOCALLOCK *StrongLockInProgress;
+static LOCALLOCK *awaitedLock;
+static ResourceOwner awaitedOwner;
+
+
+#ifdef LOCK_DEBUG
+
+/*------
+ * The following configuration options are available for lock debugging:
+ *
+ *	   TRACE_LOCKS		-- give a bunch of output about what's going on in this file
+ *	   TRACE_USERLOCKS	-- same but for user locks
+ *	   TRACE_LOCK_OIDMIN-- do not trace locks for tables below this oid
+ *						   (use to avoid output on system tables)
+ *	   TRACE_LOCK_TABLE -- trace locks on this table (oid) unconditionally
+ *	   DEBUG_DEADLOCKS	-- currently dumps locks at untimely occasions ;)
+ *
+ * Furthermore, but in storage/lmgr/lwlock.c:
+ *	   TRACE_LWLOCKS	-- trace lightweight locks (pretty useless)
+ *
+ * Define LOCK_DEBUG at compile time to get all these enabled.
+ *
+ * LOCK_DEBUG_ENABLED() decides whether a locktag is traced: either its lock
+ * method's trace flag is set and locktag_field2 is at least
+ * Trace_lock_oidmin, or Trace_lock_table is nonzero and equals
+ * locktag_field2.  LOCK_PRINT() and PROCLOCK_PRINT() emit a LOG line
+ * describing a LOCK or PROCLOCK when that test passes; without LOCK_DEBUG
+ * both are defined as no-op macros.
+ * --------
+ */
+
+int			Trace_lock_oidmin = FirstNormalObjectId;
+bool		Trace_locks = false;
+bool		Trace_userlocks = false;
+int			Trace_lock_table = 0;
+bool		Debug_deadlocks = false;
+
+
+inline static bool
+LOCK_DEBUG_ENABLED(const LOCKTAG *tag)
+{
+	return
+		(*(LockMethods[tag->locktag_lockmethodid]->trace_flag) &&
+		 ((Oid) tag->locktag_field2 >= (Oid) Trace_lock_oidmin))
+		|| (Trace_lock_table &&
+			(tag->locktag_field2 == Trace_lock_table));
+}
+
+
+inline static void
+LOCK_PRINT(const char *where, const LOCK *lock, LOCKMODE type)
+{
+	if (LOCK_DEBUG_ENABLED(&lock->tag))
+		elog(LOG,
+			 "%s: lock(%p) id(%u,%u,%u,%u,%u,%u) grantMask(%x) "
+			 "req(%d,%d,%d,%d,%d,%d,%d)=%d "
+			 "grant(%d,%d,%d,%d,%d,%d,%d)=%d wait(%d) type(%s)",
+			 where, lock,
+			 lock->tag.locktag_field1, lock->tag.locktag_field2,
+			 lock->tag.locktag_field3, lock->tag.locktag_field4,
+			 lock->tag.locktag_type, lock->tag.locktag_lockmethodid,
+			 lock->grantMask,
+			 lock->requested[1], lock->requested[2], lock->requested[3],
+			 lock->requested[4], lock->requested[5], lock->requested[6],
+			 lock->requested[7], lock->nRequested,	/* NOTE(review): only modes 1..7 are listed although lock_mode_names has eight modes -- confirm intent */
+			 lock->granted[1], lock->granted[2], lock->granted[3],
+			 lock->granted[4], lock->granted[5], lock->granted[6],
+			 lock->granted[7], lock->nGranted,
+			 dclist_count(&lock->waitProcs),
+			 LockMethods[LOCK_LOCKMETHOD(*lock)]->lockModeNames[type]);
+}
+
+
+inline static void
+PROCLOCK_PRINT(const char *where, const PROCLOCK *proclockP)
+{
+	if (LOCK_DEBUG_ENABLED(&proclockP->tag.myLock->tag))
+		elog(LOG,
+			 "%s: proclock(%p) lock(%p) method(%u) proc(%p) hold(%x)",
+			 where, proclockP, proclockP->tag.myLock,
+			 PROCLOCK_LOCKMETHOD(*(proclockP)),
+			 proclockP->tag.myProc, (int) proclockP->holdMask);
+}
+#else							/* not LOCK_DEBUG */
+
+#define LOCK_PRINT(where, lock, type)  ((void) 0)	/* compiled out */
+#define PROCLOCK_PRINT(where, proclockP)  ((void) 0)	/* compiled out */
+#endif							/* not LOCK_DEBUG */
+
+
+static uint32 proclock_hash(const void *key, Size keysize);
+static void RemoveLocalLock(LOCALLOCK *locallock);
+static PROCLOCK *SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc,
+								  const LOCKTAG *locktag, uint32 hashcode, LOCKMODE lockmode);
+static void GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner);
+static void BeginStrongLockAcquire(LOCALLOCK *locallock, uint32 fasthashcode);
+static void FinishStrongLockAcquire(void);
+static void WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner);
+static void ReleaseLockIfHeld(LOCALLOCK *locallock, bool sessionLock);
+static void LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent);
+static bool UnGrantLock(LOCK *lock, LOCKMODE lockmode,
+						PROCLOCK *proclock, LockMethod lockMethodTable);
+static void CleanUpLock(LOCK *lock, PROCLOCK *proclock,
+						LockMethod lockMethodTable, uint32 hashcode,
+						bool wakeupNeeded);
+static void LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc,
+								 LOCKTAG *locktag, LOCKMODE lockmode,
+								 bool decrement_strong_lock_count);
+static void GetSingleProcBlockerStatusData(PGPROC *blocked_proc,
+										   BlockedProcsData *data);
+
+
+/*
+ * InitLocks -- set up the lock manager's hash tables and fast-path state.
+ *
+ * Called from CreateSharedMemoryAndSemaphores(), which see for more
+ * comments.  In the normal postmaster case the shared hash tables are
+ * created here, along with a locallock hash table that stays unused and
+ * empty in the postmaster itself; backends inherit the shared-table pointers
+ * and an image of the locallock table via fork().  In the EXEC_BACKEND case
+ * each backend re-executes this code to attach to the existing shared
+ * tables and to create its own locallock table.
+ */
+void
+InitLocks(void)
+{
+	HASHCTL		hctl;
+	long		max_locks;
+	long		init_locks;
+	bool		already_exists;
+
+	/*
+	 * Sizing for the shared tables.  These calculations must agree with
+	 * LockShmemSize!
+	 */
+	max_locks = NLOCKENTS();
+	init_locks = max_locks / 2;
+
+	/* LOCK table: one entry per locked object, keyed by LOCKTAG */
+	hctl.keysize = sizeof(LOCKTAG);
+	hctl.entrysize = sizeof(LOCK);
+	hctl.num_partitions = NUM_LOCK_PARTITIONS;
+
+	LockMethodLockHash = ShmemInitHash("LOCK hash",
+									   init_locks,
+									   max_locks,
+									   &hctl,
+									   HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
+
+	/*
+	 * PROCLOCK table: one entry per lock per holder, keyed by PROCLOCKTAG.
+	 * It is sized at twice the LOCK table, assuming an average of 2 holders
+	 * per lock, and uses proclock_hash as its hash function.
+	 */
+	hctl.keysize = sizeof(PROCLOCKTAG);
+	hctl.entrysize = sizeof(PROCLOCK);
+	hctl.hash = proclock_hash;
+	hctl.num_partitions = NUM_LOCK_PARTITIONS;
+
+	LockMethodProcLockHash = ShmemInitHash("PROCLOCK hash",
+										   init_locks * 2,
+										   max_locks * 2,
+										   &hctl,
+										   HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+
+	/* Fast-path strong-lock counters; init the spinlock only on creation */
+	FastPathStrongRelationLocks =
+		ShmemInitStruct("Fast Path Strong Relation Lock Data",
+						sizeof(FastPathStrongRelationLockData),
+						&already_exists);
+	if (!already_exists)
+		SpinLockInit(&FastPathStrongRelationLocks->mutex);
+
+	/*
+	 * LOCALLOCK table: non-shared, holds lock counts and resource owner
+	 * information.
+	 *
+	 * It can already exist in this process when the postmaster is recreating
+	 * shared memory after a backend crash.  It ought to be empty in the
+	 * postmaster, so leaving it would do, but for safety throw it away and
+	 * build a fresh one.
+	 */
+	if (LockMethodLocalHash != NULL)
+		hash_destroy(LockMethodLocalHash);
+
+	hctl.keysize = sizeof(LOCALLOCKTAG);
+	hctl.entrysize = sizeof(LOCALLOCK);
+
+	LockMethodLocalHash = hash_create("LOCALLOCK hash",
+									  16,
+									  &hctl,
+									  HASH_ELEM | HASH_BLOBS);
+}
+
+
+/*
+ * Return the lock method table that governs the given LOCK object.
+ *
+ * The lock's method id must be a valid index into LockMethods[] other than
+ * slot 0; this is only checked by an assertion.
+ */
+LockMethod
+GetLocksMethodTable(const LOCK *lock)
+{
+	LOCKMETHODID methodid = LOCK_LOCKMETHOD(*lock);
+
+	Assert(methodid > 0 && methodid < lengthof(LockMethods));
+
+	return LockMethods[methodid];
+}
+
+/*
+ * Return the lock method table that governs the given locktag.
+ *
+ * The tag's method id must be a valid index into LockMethods[] other than
+ * slot 0; this is only checked by an assertion.
+ */
+LockMethod
+GetLockTagsMethodTable(const LOCKTAG *locktag)
+{
+	LOCKMETHODID methodid = (LOCKMETHODID) locktag->locktag_lockmethodid;
+
+	Assert(methodid > 0 && methodid < lengthof(LockMethods));
+
+	return LockMethods[methodid];
+}
+
+
+/*
+ * Hash a LOCKTAG using the hash function of the shared LOCK table.
+ *
+ * The intent is to compute this only once per function and hand the value
+ * around: it can be passed to hash_search_with_hash_value(), and the lock
+ * partition number can be extracted from it as well.
+ */
+uint32
+LockTagHashCode(const LOCKTAG *locktag)
+{
+	uint32		hashcode;
+
+	hashcode = get_hash_value(LockMethodLockHash, (const void *) locktag);
+
+	return hashcode;
+}
+
+/*
+ * Hash function for the PROCLOCK table (key type PROCLOCKTAG).
+ *
+ * One set of partition locks serves both the LOCK and PROCLOCK hash tables,
+ * so a PROCLOCK must land in the same partition as its LOCK.  dynahash.c
+ * takes the partition number from the low-order bits of the hash code, so
+ * those bits must equal the ones in the associated LOCKTAG's hash code.
+ * This function therefore starts from the LOCK's hash code and only
+ * perturbs the bits above the partition-number bits.
+ */
+static uint32
+proclock_hash(const void *key, Size keysize)
+{
+	const PROCLOCKTAG *tag = (const PROCLOCKTAG *) key;
+	uint32		hash;
+	Datum		procaddr;
+
+	Assert(keysize == sizeof(PROCLOCKTAG));
+
+	/* Start from the hash code of the LOCK this PROCLOCK belongs to */
+	hash = LockTagHashCode(&tag->myLock->tag);
+
+	/*
+	 * Mix in the PGPROC by xor-ing its address, shifted left past the
+	 * partition-number bits.  Losing high-order address bits is harmless for
+	 * a hash.  Going through a Datum variable avoids cast-pointer-to-int
+	 * warnings.
+	 */
+	procaddr = PointerGetDatum(tag->myProc);
+
+	return hash ^ (((uint32) procaddr) << LOG2_NUM_LOCK_PARTITIONS);
+}
+
+/*
+ * Derive a PROCLOCKTAG's hash code from the already-known hash code of its
+ * underlying LOCK.
+ *
+ * This exists only to save a redundant LockTagHashCode() call.
+ */
+static inline uint32
+ProcLockHashCode(const PROCLOCKTAG *proclocktag, uint32 hashcode)
+{
+	Datum		procaddr;
+
+	/*
+	 * This must match proclock_hash()!
+	 */
+	procaddr = PointerGetDatum(proclocktag->myProc);
+
+	return hashcode ^ (((uint32) procaddr) << LOG2_NUM_LOCK_PARTITIONS);
+}
+
+/*
+ * Report whether two lock modes conflict under the default lock method,
+ * i.e. whether mode2's bit is set in the conflict mask of mode1.
+ */
+bool
+DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2)
+{
+	LockMethod	defaultMethod = LockMethods[DEFAULT_LOCKMETHOD];
+
+	return (defaultMethod->conflictTab[mode1] & LOCKBIT_ON(mode2)) != 0;
+}
+
+/*
+ * LockHeldByMe -- does the backend-local lock table record 'locktag' as held
+ *		in mode 'lockmode'?
+ *
+ * Only the LOCALLOCK table is consulted.  The answer is true when an entry
+ * for this tag and mode exists with a positive lock count.
+ */
+bool
+LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode)
+{
+	LOCALLOCKTAG key;
+	LOCALLOCK  *entry;
+
+	/* Build the lookup key; padding must be zeroed since it is hashed */
+	MemSet(&key, 0, sizeof(key));
+	key.lock = *locktag;
+	key.mode = lockmode;
+
+	entry = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+									  &key,
+									  HASH_FIND, NULL);
+	if (entry == NULL)
+		return false;
+
+	return entry->nLocks > 0;
+}
+
+#ifdef USE_ASSERT_CHECKING
+/*
+ * GetLockMethodLocalHash -- return the hash of local locks, for modules that
+ *		evaluate assertions based on all locks held.
+ *
+ * Only compiled when USE_ASSERT_CHECKING is defined.  The returned pointer
+ * is the backend-local LockMethodLocalHash table itself, not a copy.
+ */
+HTAB *
+GetLockMethodLocalHash(void)
+{
+	return LockMethodLocalHash;
+}
+#endif
+
+/*
+ * LockHasWaiters -- would releasing our hold on 'locktag' in mode 'lockmode'
+ *		wake up some other process that is waiting for the lock?
+ *
+ * Returns true when the lock's waitMask contains a mode that conflicts with
+ * 'lockmode'.  If this backend has no local record of holding the lock in
+ * that mode, or the shared PROCLOCK's holdMask does not include it, a
+ * WARNING is emitted and false is returned so that the caller can print its
+ * own error message.  An unrecognized lock method or lock mode raises an
+ * ERROR.
+ *
+ * 'sessionLock' is accepted but not consulted by this function.
+ */
+bool
+LockHasWaiters(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
+{
+	LOCKMETHODID methodid = locktag->locktag_lockmethodid;
+	LockMethod	methodTable;
+	LOCALLOCKTAG key;
+	LOCALLOCK  *mylock;
+	LOCK	   *lock;
+	PROCLOCK   *proclock;
+	LWLock	   *partition;
+	bool		result;
+
+	/* Validate the lock method, then the mode against that method */
+	if (methodid <= 0 || methodid >= lengthof(LockMethods))
+		elog(ERROR, "unrecognized lock method: %d", methodid);
+	methodTable = LockMethods[methodid];
+	if (lockmode <= 0 || lockmode > methodTable->numLockModes)
+		elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+#ifdef LOCK_DEBUG
+	if (LOCK_DEBUG_ENABLED(locktag))
+		elog(LOG, "LockHasWaiters: lock [%u,%u] %s",
+			 locktag->locktag_field1, locktag->locktag_field2,
+			 methodTable->lockModeNames[lockmode]);
+#endif
+
+	/* Look up our LOCALLOCK entry; the key's padding must be zeroed */
+	MemSet(&key, 0, sizeof(key));
+	key.lock = *locktag;
+	key.mode = lockmode;
+
+	mylock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+									   &key,
+									   HASH_FIND, NULL);
+
+	/*
+	 * Not held locally: just warn, do not ereport(ERROR), so that the caller
+	 * can print its own error message, too.
+	 */
+	if (mylock == NULL || mylock->nLocks <= 0)
+	{
+		elog(WARNING, "you don't own a lock of type %s",
+			 methodTable->lockModeNames[lockmode]);
+		return false;
+	}
+
+	/* Examine the shared lock table under its partition lock */
+	partition = LockHashPartitionLock(mylock->hashcode);
+	LWLockAcquire(partition, LW_SHARED);
+
+	/*
+	 * The LOCK and PROCLOCK addresses were saved in the locallock entry and
+	 * cannot have gone away while we hold a lock on them, so there is no
+	 * need to search for them again.
+	 */
+	lock = mylock->lock;
+	LOCK_PRINT("LockHasWaiters: found", lock, lockmode);
+	proclock = mylock->proclock;
+	PROCLOCK_PRINT("LockHasWaiters: found", proclock);
+
+	/* Cross-check that the shared state agrees we hold this mode */
+	if ((proclock->holdMask & LOCKBIT_ON(lockmode)) == 0)
+	{
+		PROCLOCK_PRINT("LockHasWaiters: WRONGTYPE", proclock);
+		LWLockRelease(partition);
+		elog(WARNING, "you don't own a lock of type %s",
+			 methodTable->lockModeNames[lockmode]);
+		RemoveLocalLock(mylock);
+		return false;
+	}
+
+	/* Someone is waiting for a mode that conflicts with ours? */
+	result = (methodTable->conflictTab[lockmode] & lock->waitMask) != 0;
+
+	LWLockRelease(partition);
+
+	return result;
+}
+
+/*
+ * LockAcquire -- Check for lock conflicts, sleep if conflict found,
+ *		set lock if/when no conflicts.
+ *
+ * This is LockAcquireExtended() called with reportMemoryError = true and no
+ * locallock output pointer.
+ *
+ * Inputs:
+ *	locktag: unique identifier for the lockable object
+ *	lockmode: lock mode to acquire
+ *	sessionLock: if true, acquire lock for session not current transaction
+ *	dontWait: if true, don't wait to acquire lock
+ *
+ * Returns one of:
+ *		LOCKACQUIRE_NOT_AVAIL		lock not available, and dontWait=true
+ *		LOCKACQUIRE_OK				lock successfully acquired
+ *		LOCKACQUIRE_ALREADY_HELD	incremented count for lock already held
+ *		LOCKACQUIRE_ALREADY_CLEAR	incremented count for lock already clear
+ *
+ * Callers that pass dontWait=false and do not care whether the lock was
+ * freshly acquired or already taken earlier in the same transaction can
+ * ignore the return value.
+ *
+ * Side Effects: The lock is acquired and recorded in lock tables.
+ *
+ * NOTE: if we wait for the lock, there is no way to abort the wait
+ * short of aborting the transaction.
+ */
+LockAcquireResult
+LockAcquire(const LOCKTAG *locktag,
+			LOCKMODE lockmode,
+			bool sessionLock,
+			bool dontWait)
+{
+	LockAcquireResult result;
+
+	result = LockAcquireExtended(locktag, lockmode, sessionLock, dontWait,
+								 true, NULL);
+
+	return result;
+}
+
+/*
+ * LockAcquireExtended - allows us to specify additional options
+ *
+ * locktag, lockmode, sessionLock and dontWait, as well as the result codes,
+ * are as described for LockAcquire, which is a thin wrapper around this
+ * function.
+ *
+ * reportMemoryError specifies whether a lock request that fills the lock
+ * table should generate an ERROR or not.  Passing "false" allows the caller
+ * to attempt to recover from lock-table-full situations, perhaps by forcibly
+ * canceling other lock holders and then retrying.  Note, however, that the
+ * return code for that is LOCKACQUIRE_NOT_AVAIL, so that it's unsafe to use
+ * in combination with dontWait = true, as the cause of failure couldn't be
+ * distinguished.
+ *
+ * If locallockp isn't NULL, *locallockp receives a pointer to the LOCALLOCK
+ * table entry if a lock is successfully acquired, or NULL if not.
+ */
+LockAcquireResult
+LockAcquireExtended(const LOCKTAG *locktag,
+					LOCKMODE lockmode,
+					bool sessionLock,
+					bool dontWait,
+					bool reportMemoryError,
+					LOCALLOCK **locallockp)
+{
+	LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+	LockMethod	lockMethodTable;
+	LOCALLOCKTAG localtag;
+	LOCALLOCK  *locallock;
+	LOCK	   *lock;
+	PROCLOCK   *proclock;
+	bool		found;
+	ResourceOwner owner;
+	uint32		hashcode;
+	LWLock	   *partitionLock;
+	bool		found_conflict;
+	bool		log_lock = false;
+
+	/* Reject lock methods and lock modes we have no table entry for */
+	if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+		elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+	lockMethodTable = LockMethods[lockmethodid];
+	if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
+		elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+	/*
+	 * While recovery is in progress, a backend that is not itself performing
+	 * recovery may take nothing stronger than RowExclusiveLock on relations
+	 * and other database objects.
+	 */
+	if (RecoveryInProgress() && !InRecovery &&
+		(locktag->locktag_type == LOCKTAG_OBJECT ||
+		 locktag->locktag_type == LOCKTAG_RELATION) &&
+		lockmode > RowExclusiveLock)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("cannot acquire lock mode %s on database objects while recovery is in progress",
+						lockMethodTable->lockModeNames[lockmode]),
+				 errhint("Only RowExclusiveLock or less can be acquired on database objects during recovery.")));
+
+#ifdef LOCK_DEBUG
+	if (LOCK_DEBUG_ENABLED(locktag))
+		elog(LOG, "LockAcquire: lock [%u,%u] %s",
+			 locktag->locktag_field1, locktag->locktag_field2,
+			 lockMethodTable->lockModeNames[lockmode]);
+#endif
+
+	/* Identify owner for lock; session locks have no ResourceOwner */
+	if (sessionLock)
+		owner = NULL;
+	else
+		owner = CurrentResourceOwner;
+
+	/*
+	 * Find or create a LOCALLOCK entry for this lock and lockmode
+	 */
+	MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */
+	localtag.lock = *locktag;
+	localtag.mode = lockmode;
+
+	locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+										  &localtag,
+										  HASH_ENTER, &found);
+
+	/*
+	 * if it's a new locallock object, initialize it
+	 */
+	if (!found)
+	{
+		locallock->lock = NULL;
+		locallock->proclock = NULL;
+		locallock->hashcode = LockTagHashCode(&(localtag.lock));
+		locallock->nLocks = 0;
+		locallock->holdsStrongLockCount = false;
+		locallock->lockCleared = false;
+		locallock->numLockOwners = 0;
+		locallock->maxLockOwners = 8;
+		locallock->lockOwners = NULL;	/* in case next line fails */
+		locallock->lockOwners = (LOCALLOCKOWNER *)
+			MemoryContextAlloc(TopMemoryContext,
+							   locallock->maxLockOwners * sizeof(LOCALLOCKOWNER));
+	}
+	else
+	{
+		/* Make sure there will be room to remember the lock */
+		if (locallock->numLockOwners >= locallock->maxLockOwners)
+		{
+			int			newsize = locallock->maxLockOwners * 2;
+
+			locallock->lockOwners = (LOCALLOCKOWNER *)
+				repalloc(locallock->lockOwners,
+						 newsize * sizeof(LOCALLOCKOWNER));
+			locallock->maxLockOwners = newsize;
+		}
+	}
+	hashcode = locallock->hashcode;
+
+	/*
+	 * Hand the entry back now; every failure exit below that returns
+	 * LOCKACQUIRE_NOT_AVAIL or reports out-of-shared-memory resets
+	 * *locallockp to NULL first.
+	 */
+	if (locallockp)
+		*locallockp = locallock;
+
+	/*
+	 * If we already hold the lock, we can just increase the count locally.
+	 *
+	 * If lockCleared is already set, caller need not worry about absorbing
+	 * sinval messages related to the lock's object.
+	 */
+	if (locallock->nLocks > 0)
+	{
+		GrantLockLocal(locallock, owner);
+		if (locallock->lockCleared)
+			return LOCKACQUIRE_ALREADY_CLEAR;
+		else
+			return LOCKACQUIRE_ALREADY_HELD;
+	}
+
+	/*
+	 * We don't acquire any other heavyweight lock while holding the relation
+	 * extension lock.  Acquiring the same relation extension lock more than
+	 * once is allowed, but that case won't reach here.
+	 */
+	Assert(!IsRelationExtensionLockHeld);
+
+	/*
+	 * Prepare to emit a WAL record if acquisition of this lock needs to be
+	 * replayed in a standby server.
+	 *
+	 * Here we prepare to log; after lock is acquired we'll issue log record.
+	 * This arrangement simplifies error recovery in case the preparation step
+	 * fails.
+	 *
+	 * Only AccessExclusiveLocks can conflict with lock types that read-only
+	 * transactions can acquire in a standby server. Make sure this definition
+	 * matches the one in GetRunningTransactionLocks().
+	 */
+	if (lockmode >= AccessExclusiveLock &&
+		locktag->locktag_type == LOCKTAG_RELATION &&
+		!RecoveryInProgress() &&
+		XLogStandbyInfoActive())
+	{
+		LogAccessExclusiveLockPrepare();
+		log_lock = true;
+	}
+
+	/*
+	 * Attempt to take lock via fast path, if eligible.  But if we remember
+	 * having filled up the fast path array, we don't attempt to make any
+	 * further use of it until we release some locks.  It's possible that some
+	 * other backend has transferred some of those locks to the shared hash
+	 * table, leaving space free, but it's not worth acquiring the LWLock just
+	 * to check.  It's also possible that we're acquiring a second or third
+	 * lock type on a relation we have already locked using the fast-path, but
+	 * for now we don't worry about that case either.
+	 */
+	if (EligibleForRelationFastPath(locktag, lockmode) &&
+		FastPathLocalUseCount < FP_LOCK_SLOTS_PER_BACKEND)
+	{
+		uint32		fasthashcode = FastPathStrongLockHashPartition(hashcode);
+		bool		acquired;
+
+		/*
+		 * LWLockAcquire acts as a memory sequencing point, so it's safe to
+		 * assume that any strong locker whose increment to
+		 * FastPathStrongRelationLocks->counts becomes visible after we test
+		 * it has yet to begin to transfer fast-path locks.
+		 */
+		LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+		if (FastPathStrongRelationLocks->count[fasthashcode] != 0)
+			acquired = false;
+		else
+			acquired = FastPathGrantRelationLock(locktag->locktag_field2,
+												 lockmode);
+		LWLockRelease(&MyProc->fpInfoLock);
+		if (acquired)
+		{
+			/*
+			 * The locallock might contain stale pointers to some old shared
+			 * objects; we MUST reset these to null before considering the
+			 * lock to be acquired via fast-path.
+			 */
+			locallock->lock = NULL;
+			locallock->proclock = NULL;
+			GrantLockLocal(locallock, owner);
+			return LOCKACQUIRE_OK;
+		}
+	}
+
+	/*
+	 * If this lock could potentially have been taken via the fast-path by
+	 * some other backend, we must (temporarily) disable further use of the
+	 * fast-path for this lock tag, and migrate any locks already taken via
+	 * this method to the main lock table.
+	 */
+	if (ConflictsWithRelationFastPath(locktag, lockmode))
+	{
+		uint32		fasthashcode = FastPathStrongLockHashPartition(hashcode);
+
+		BeginStrongLockAcquire(locallock, fasthashcode);
+		if (!FastPathTransferRelationLocks(lockMethodTable, locktag,
+										   hashcode))
+		{
+			/* Transfer failed: undo our bookkeeping, then report or return */
+			AbortStrongLockAcquire();
+			if (locallock->nLocks == 0)
+				RemoveLocalLock(locallock);
+			if (locallockp)
+				*locallockp = NULL;
+			if (reportMemoryError)
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of shared memory"),
+						 errhint("You might need to increase %s.", "max_locks_per_transaction")));
+			else
+				return LOCKACQUIRE_NOT_AVAIL;
+		}
+	}
+
+	/*
+	 * We didn't find the lock in our LOCALLOCK table, and we didn't manage to
+	 * take it via the fast-path, either, so we've got to mess with the shared
+	 * lock table.
+	 */
+	partitionLock = LockHashPartitionLock(hashcode);
+
+	LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+	/*
+	 * Find or create lock and proclock entries with this tag
+	 *
+	 * Note: if the locallock object already existed, it might have a pointer
+	 * to the lock already ... but we should not assume that that pointer is
+	 * valid, since a lock object with zero hold and request counts can go
+	 * away anytime.  So we have to use SetupLockInTable() to recompute the
+	 * lock and proclock pointers, even if they're already set.
+	 */
+	proclock = SetupLockInTable(lockMethodTable, MyProc, locktag,
+								hashcode, lockmode);
+	if (!proclock)
+	{
+		/* No shared memory for the entries: clean up, then report or return */
+		AbortStrongLockAcquire();
+		LWLockRelease(partitionLock);
+		if (locallock->nLocks == 0)
+			RemoveLocalLock(locallock);
+		if (locallockp)
+			*locallockp = NULL;
+		if (reportMemoryError)
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of shared memory"),
+					 errhint("You might need to increase %s.", "max_locks_per_transaction")));
+		else
+			return LOCKACQUIRE_NOT_AVAIL;
+	}
+	locallock->proclock = proclock;
+	lock = proclock->tag.myLock;
+	locallock->lock = lock;
+
+	/*
+	 * If lock requested conflicts with locks requested by waiters, must join
+	 * wait queue.  Otherwise, check for conflict with already-held locks.
+	 * (That's last because most complex check.)
+	 */
+	if (lockMethodTable->conflictTab[lockmode] & lock->waitMask)
+		found_conflict = true;
+	else
+		found_conflict = LockCheckConflicts(lockMethodTable, lockmode,
+											lock, proclock);
+
+	if (!found_conflict)
+	{
+		/* No conflict with held or previously requested locks */
+		GrantLock(lock, proclock, lockmode);
+		GrantLockLocal(locallock, owner);
+	}
+	else
+	{
+		/*
+		 * We can't acquire the lock immediately.  If caller specified no
+		 * blocking, remove useless table entries and return
+		 * LOCKACQUIRE_NOT_AVAIL without waiting.
+		 */
+		if (dontWait)
+		{
+			AbortStrongLockAcquire();
+
+			/*
+			 * If we hold no mode at all on this lock, the proclock serves no
+			 * further purpose: unlink it from both lists and delete it.
+			 */
+			if (proclock->holdMask == 0)
+			{
+				uint32		proclock_hashcode;
+
+				proclock_hashcode = ProcLockHashCode(&proclock->tag, hashcode);
+				dlist_delete(&proclock->lockLink);
+				dlist_delete(&proclock->procLink);
+				if (!hash_search_with_hash_value(LockMethodProcLockHash,
+												 &(proclock->tag),
+												 proclock_hashcode,
+												 HASH_REMOVE,
+												 NULL))
+					elog(PANIC, "proclock table corrupted");
+			}
+			else
+				PROCLOCK_PRINT("LockAcquire: NOWAIT", proclock);
+
+			/* Take back the request counts added by SetupLockInTable */
+			lock->nRequested--;
+			lock->requested[lockmode]--;
+			LOCK_PRINT("LockAcquire: conditional lock failed", lock, lockmode);
+			Assert((lock->nRequested > 0) && (lock->requested[lockmode] >= 0));
+			Assert(lock->nGranted <= lock->nRequested);
+			LWLockRelease(partitionLock);
+			if (locallock->nLocks == 0)
+				RemoveLocalLock(locallock);
+			if (locallockp)
+				*locallockp = NULL;
+			return LOCKACQUIRE_NOT_AVAIL;
+		}
+
+		/*
+		 * Set bitmask of locks this process already holds on this object.
+		 */
+		MyProc->heldLocks = proclock->holdMask;
+
+		/*
+		 * Sleep till someone wakes me up.
+		 */
+
+		TRACE_POSTGRESQL_LOCK_WAIT_START(locktag->locktag_field1,
+										 locktag->locktag_field2,
+										 locktag->locktag_field3,
+										 locktag->locktag_field4,
+										 locktag->locktag_type,
+										 lockmode);
+
+		WaitOnLock(locallock, owner);
+
+		TRACE_POSTGRESQL_LOCK_WAIT_DONE(locktag->locktag_field1,
+										locktag->locktag_field2,
+										locktag->locktag_field3,
+										locktag->locktag_field4,
+										locktag->locktag_type,
+										lockmode);
+
+		/*
+		 * NOTE: do not do any material change of state between here and
+		 * return.  All required changes in locktable state must have been
+		 * done when the lock was granted to us --- see notes in WaitOnLock.
+		 */
+
+		/*
+		 * Check the proclock entry status, in case something in the ipc
+		 * communication doesn't work correctly.
+		 */
+		if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+		{
+			AbortStrongLockAcquire();
+			PROCLOCK_PRINT("LockAcquire: INCONSISTENT", proclock);
+			LOCK_PRINT("LockAcquire: INCONSISTENT", lock, lockmode);
+			/* Should we retry ? */
+			LWLockRelease(partitionLock);
+			elog(ERROR, "LockAcquire failed");
+		}
+		PROCLOCK_PRINT("LockAcquire: granted", proclock);
+		LOCK_PRINT("LockAcquire: granted", lock, lockmode);
+	}
+
+	/*
+	 * Lock state is fully up-to-date now; if we error out after this, no
+	 * special error cleanup is required.
+	 */
+	FinishStrongLockAcquire();
+
+	LWLockRelease(partitionLock);
+
+	/*
+	 * Emit a WAL record if acquisition of this lock needs to be replayed in a
+	 * standby server.
+	 */
+	if (log_lock)
+	{
+		/*
+		 * Decode the locktag back to the original values, to avoid sending
+		 * lots of empty bytes with every message.  See lock.h to check how a
+		 * locktag is defined for LOCKTAG_RELATION
+		 */
+		LogAccessExclusiveLock(locktag->locktag_field1,
+							   locktag->locktag_field2);
+	}
+
+	return LOCKACQUIRE_OK;
+}
+
+/*
+ * Find or create LOCK and PROCLOCK objects as needed for a new lock
+ * request.
+ *
+ * Inputs:
+ *	lockMethodTable: lock method the request belongs to
+ *	proc: process on whose behalf the request is being entered
+ *	locktag: identifier of the lockable object
+ *	hashcode: hash code for locktag, as passed to the shared lock hash table
+ *	lockmode: lock mode being requested
+ *
+ * Returns the PROCLOCK object, or NULL if we failed to create the objects
+ * for lack of shared memory.
+ *
+ * On success, lock->nRequested and lock->requested[lockmode] have been
+ * incremented to account for the new request; nothing is granted here.
+ *
+ * The appropriate partition lock must be held at entry, and will be
+ * held at exit.
+ */
+static PROCLOCK *
+SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc,
+				 const LOCKTAG *locktag, uint32 hashcode, LOCKMODE lockmode)
+{
+	LOCK	   *lock;
+	PROCLOCK   *proclock;
+	PROCLOCKTAG proclocktag;
+	uint32		proclock_hashcode;
+	bool		found;
+
+	/*
+	 * Find or create a lock with this tag.
+	 */
+	lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+												locktag,
+												hashcode,
+												HASH_ENTER_NULL,
+												&found);
+	if (!lock)
+		return NULL;
+
+	/*
+	 * if it's a new lock object, initialize it
+	 */
+	if (!found)
+	{
+		lock->grantMask = 0;
+		lock->waitMask = 0;
+		dlist_init(&lock->procLocks);
+		dclist_init(&lock->waitProcs);
+		lock->nRequested = 0;
+		lock->nGranted = 0;
+		MemSet(lock->requested, 0, sizeof(int) * MAX_LOCKMODES);
+		MemSet(lock->granted, 0, sizeof(int) * MAX_LOCKMODES);
+		LOCK_PRINT("LockAcquire: new", lock, lockmode);
+	}
+	else
+	{
+		LOCK_PRINT("LockAcquire: found", lock, lockmode);
+		Assert((lock->nRequested >= 0) && (lock->requested[lockmode] >= 0));
+		Assert((lock->nGranted >= 0) && (lock->granted[lockmode] >= 0));
+		Assert(lock->nGranted <= lock->nRequested);
+	}
+
+	/*
+	 * Create the hash key for the proclock table.
+	 */
+	proclocktag.myLock = lock;
+	proclocktag.myProc = proc;
+
+	proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode);
+
+	/*
+	 * Find or create a proclock entry with this tag
+	 */
+	proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash,
+														&proclocktag,
+														proclock_hashcode,
+														HASH_ENTER_NULL,
+														&found);
+	if (!proclock)
+	{
+		/* Oops, not enough shmem for the proclock */
+		if (lock->nRequested == 0)
+		{
+			/*
+			 * There are no other requestors of this lock, so garbage-collect
+			 * the lock object.  We *must* do this to avoid a permanent leak
+			 * of shared memory, because there won't be anything to cause
+			 * anyone to release the lock object later.
+			 */
+			Assert(dlist_is_empty(&(lock->procLocks)));
+			if (!hash_search_with_hash_value(LockMethodLockHash,
+											 &(lock->tag),
+											 hashcode,
+											 HASH_REMOVE,
+											 NULL))
+				elog(PANIC, "lock table corrupted");
+		}
+		return NULL;
+	}
+
+	/*
+	 * If new, initialize the new entry
+	 */
+	if (!found)
+	{
+		uint32		partition = LockHashPartition(hashcode);
+
+		/*
+		 * It might seem unsafe to access proclock->groupLeader without a
+		 * lock, but it's not really.  Either we are initializing a proclock
+		 * on our own behalf, in which case our group leader isn't changing
+		 * because the group leader for a process can only ever be changed by
+		 * the process itself; or else we are transferring a fast-path lock to
+		 * the main lock table, in which case that process can't change its
+		 * lock group leader without first releasing all of its locks (and in
+		 * particular the one we are currently transferring).
+		 */
+		proclock->groupLeader = proc->lockGroupLeader != NULL ?
+			proc->lockGroupLeader : proc;
+		proclock->holdMask = 0;
+		proclock->releaseMask = 0;
+		/* Add proclock to appropriate lists */
+		dlist_push_tail(&lock->procLocks, &proclock->lockLink);
+		dlist_push_tail(&proc->myProcLocks[partition], &proclock->procLink);
+		PROCLOCK_PRINT("LockAcquire: new", proclock);
+	}
+	else
+	{
+		PROCLOCK_PRINT("LockAcquire: found", proclock);
+		/* Every mode this proclock holds must also show in the lock's mask */
+		Assert((proclock->holdMask & ~lock->grantMask) == 0);
+
+#ifdef CHECK_DEADLOCK_RISK
+
+		/*
+		 * Issue warning if we already hold a lower-level lock on this object
+		 * and do not hold a lock of the requested level or higher. This
+		 * indicates a deadlock-prone coding practice (eg, we'd have a
+		 * deadlock if another backend were following the same code path at
+		 * about the same time).
+		 *
+		 * This is not enabled by default, because it may generate log entries
+		 * about user-level coding practices that are in fact safe in context.
+		 * It can be enabled to help find system-level problems.
+		 *
+		 * XXX Doing numeric comparison on the lockmodes is a hack; it'd be
+		 * better to use a table.  For now, though, this works.
+		 */
+		{
+			int			i;
+
+			/* Scan from the highest mode down; only the first hit matters */
+			for (i = lockMethodTable->numLockModes; i > 0; i--)
+			{
+				if (proclock->holdMask & LOCKBIT_ON(i))
+				{
+					if (i >= (int) lockmode)
+						break;	/* safe: we have a lock >= req level */
+					elog(LOG, "deadlock risk: raising lock level"
+						 " from %s to %s on object %u/%u/%u",
+						 lockMethodTable->lockModeNames[i],
+						 lockMethodTable->lockModeNames[lockmode],
+						 lock->tag.locktag_field1, lock->tag.locktag_field2,
+						 lock->tag.locktag_field3);
+					break;
+				}
+			}
+		}
+#endif							/* CHECK_DEADLOCK_RISK */
+	}
+
+	/*
+	 * lock->nRequested and lock->requested[] count the total number of
+	 * requests, whether granted or waiting, so increment those immediately.
+	 * The other counts don't increment till we get the lock.
+	 */
+	lock->nRequested++;
+	lock->requested[lockmode]++;
+	Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0));
+
+	/*
+	 * We shouldn't already hold the desired lock; else locallock table is
+	 * broken.
+	 */
+	if (proclock->holdMask & LOCKBIT_ON(lockmode))
+		elog(ERROR, "lock %s on object %u/%u/%u is already held",
+			 lockMethodTable->lockModeNames[lockmode],
+			 lock->tag.locktag_field1, lock->tag.locktag_field2,
+			 lock->tag.locktag_field3);
+
+	return proclock;
+}
+
+/*
+ * CheckAndSetLockHeld -- track whether we hold the relation extension lock.
+ *
+ * When USE_ASSERT_CHECKING is defined and the locallock is for a
+ * LOCKTAG_RELATION_EXTEND lock, store "acquired" in
+ * IsRelationExtensionLockHeld.  For every other lock type, and in builds
+ * without USE_ASSERT_CHECKING, this does nothing.
+ *
+ * It is the caller's responsibility to call this after acquiring the lock
+ * (acquired = true) or after releasing it (acquired = false).
+ */
+static inline void
+CheckAndSetLockHeld(LOCALLOCK *locallock, bool acquired)
+{
+#ifdef USE_ASSERT_CHECKING
+	/* Only relation extension locks are tracked. */
+	if (LOCALLOCK_LOCKTAG(*locallock) != LOCKTAG_RELATION_EXTEND)
+		return;
+
+	IsRelationExtensionLockHeld = acquired;
+#endif
+}
+
+/*
+ * RemoveLocalLock -- subroutine to free a locallock entry
+ *
+ * Makes every ResourceOwner that remembers this entry forget it, frees the
+ * lockOwners array, gives back the fast-path strong-lock count if the entry
+ * still holds one, and finally deletes the entry from LockMethodLocalHash.
+ *
+ * The entry must not be referenced by the caller after this returns.
+ */
+static void
+RemoveLocalLock(LOCALLOCK *locallock)
+{
+	int			i;
+
+	/* Detach from all resource owners; session-level slots have owner NULL */
+	for (i = locallock->numLockOwners - 1; i >= 0; i--)
+	{
+		if (locallock->lockOwners[i].owner != NULL)
+			ResourceOwnerForgetLock(locallock->lockOwners[i].owner, locallock);
+	}
+	locallock->numLockOwners = 0;
+	if (locallock->lockOwners != NULL)
+		pfree(locallock->lockOwners);
+	locallock->lockOwners = NULL;
+
+	/* Give back our contribution to the strong-lock counter, if any */
+	if (locallock->holdsStrongLockCount)
+	{
+		uint32		fasthashcode;
+
+		fasthashcode = FastPathStrongLockHashPartition(locallock->hashcode);
+
+		SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+		Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0);
+		FastPathStrongRelationLocks->count[fasthashcode]--;
+		locallock->holdsStrongLockCount = false;
+		SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+	}
+
+	/*
+	 * Indicate that the lock is released for certain types of locks.
+	 *
+	 * This reads the entry's tag, so it has to be done while the entry is
+	 * still in the hash table: once hash_search(HASH_REMOVE) has run, the
+	 * entry's storage belongs to the hash table again and the pointer must
+	 * not be dereferenced.
+	 */
+	CheckAndSetLockHeld(locallock, false);
+
+	/* This must be the last use of locallock. */
+	if (!hash_search(LockMethodLocalHash,
+					 &(locallock->tag),
+					 HASH_REMOVE, NULL))
+		elog(WARNING, "locallock table corrupted");
+}
+
+/*
+ * LockCheckConflicts -- test whether requested lock conflicts
+ *		with those already granted
+ *
+ * Inputs:
+ *	lockMethodTable: lock method, supplying numLockModes and conflictTab
+ *	lockmode: lock mode being requested
+ *	lock: shared lock object the request is against
+ *	proclock: the requester's PROCLOCK for that lock
+ *
+ * Returns true if conflict, false if no conflict.
+ *
+ * NOTES:
+ *		Here's what makes this complicated: one process's locks don't
+ * conflict with one another, no matter what purpose they are held for
+ * (eg, session and transaction locks do not conflict).  Nor do the locks
+ * of one process in a lock group conflict with those of another process in
+ * the same group.  So, we must subtract off these locks when determining
+ * whether the requested new lock conflicts with those already held.
+ */
+bool
+LockCheckConflicts(LockMethod lockMethodTable,
+				   LOCKMODE lockmode,
+				   LOCK *lock,
+				   PROCLOCK *proclock)
+{
+	int			numLockModes = lockMethodTable->numLockModes;
+	LOCKMASK	myLocks;
+	int			conflictMask = lockMethodTable->conflictTab[lockmode];
+	int			conflictsRemaining[MAX_LOCKMODES];
+	int			totalConflictsRemaining = 0;
+	dlist_iter	proclock_iter;
+	int			i;
+
+	/*
+	 * first check for global conflicts: If no locks conflict with my request,
+	 * then I get the lock.
+	 *
+	 * Checking for conflict: lock->grantMask represents the types of
+	 * currently held locks.  conflictTab[lockmode] has a bit set for each
+	 * type of lock that conflicts with request.   Bitwise compare tells if
+	 * there is a conflict.
+	 */
+	if (!(conflictMask & lock->grantMask))
+	{
+		PROCLOCK_PRINT("LockCheckConflicts: no conflict", proclock);
+		return false;
+	}
+
+	/*
+	 * Rats.  Something conflicts.  But it could still be my own lock, or a
+	 * lock held by another member of my locking group.  First, figure out how
+	 * many conflicts remain after subtracting out any locks I hold myself.
+	 *
+	 * The loops here run over modes 1..numLockModes, so element 0 of
+	 * conflictsRemaining[] is never read or written.
+	 */
+	myLocks = proclock->holdMask;
+	for (i = 1; i <= numLockModes; i++)
+	{
+		if ((conflictMask & LOCKBIT_ON(i)) == 0)
+		{
+			conflictsRemaining[i] = 0;
+			continue;
+		}
+		conflictsRemaining[i] = lock->granted[i];
+		if (myLocks & LOCKBIT_ON(i))
+			--conflictsRemaining[i];
+		totalConflictsRemaining += conflictsRemaining[i];
+	}
+
+	/* If no conflicts remain, we get the lock. */
+	if (totalConflictsRemaining == 0)
+	{
+		PROCLOCK_PRINT("LockCheckConflicts: resolved (simple)", proclock);
+		return false;
+	}
+
+	/* If no group locking, it's definitely a conflict. */
+	if (proclock->groupLeader == MyProc && MyProc->lockGroupLeader == NULL)
+	{
+		Assert(proclock->tag.myProc == MyProc);
+		PROCLOCK_PRINT("LockCheckConflicts: conflicting (simple)",
+					   proclock);
+		return true;
+	}
+
+	/*
+	 * Relation extension locks conflict even between members of the same
+	 * lock group.
+	 */
+	if (LOCK_LOCKTAG(*lock) == LOCKTAG_RELATION_EXTEND)
+	{
+		PROCLOCK_PRINT("LockCheckConflicts: conflicting (group)",
+					   proclock);
+		return true;
+	}
+
+	/*
+	 * Locks held in conflicting modes by members of our own lock group are
+	 * not real conflicts; we can subtract those out and see if we still have
+	 * a conflict.  This is O(N) in the number of processes holding or
+	 * awaiting locks on this object.  We could improve that by making the
+	 * shared memory state more complex (and larger) but it doesn't seem worth
+	 * it.
+	 */
+	dlist_foreach(proclock_iter, &lock->procLocks)
+	{
+		PROCLOCK   *otherproclock =
+			dlist_container(PROCLOCK, lockLink, proclock_iter.cur);
+
+		if (proclock != otherproclock &&
+			proclock->groupLeader == otherproclock->groupLeader &&
+			(otherproclock->holdMask & conflictMask) != 0)
+		{
+			int			intersectMask = otherproclock->holdMask & conflictMask;
+
+			/* Discount one grant for each conflicting mode this member holds */
+			for (i = 1; i <= numLockModes; i++)
+			{
+				if ((intersectMask & LOCKBIT_ON(i)) != 0)
+				{
+					if (conflictsRemaining[i] <= 0)
+						elog(PANIC, "proclocks held do not match lock");
+					conflictsRemaining[i]--;
+					totalConflictsRemaining--;
+				}
+			}
+
+			if (totalConflictsRemaining == 0)
+			{
+				PROCLOCK_PRINT("LockCheckConflicts: resolved (group)",
+							   proclock);
+				return false;
+			}
+		}
+	}
+
+	/* Nope, it's a real conflict. */
+	PROCLOCK_PRINT("LockCheckConflicts: conflicting (group)", proclock);
+	return true;
+}
+
+/*
+ * GrantLock -- record in the shared lock and proclock structures that a
+ *		request for "lockmode" has been granted.
+ *
+ * NOTE: a proc that was blocked must additionally be taken off the wait
+ * list and have its waitLock/waitProcLock fields cleared.  This function
+ * does not do that.
+ *
+ * NOTE: the grant must also be reflected in the matching LOCALLOCK table
+ * entry.  Since we might be waking up some other process, that cannot be
+ * done here; GrantLockLocal takes care of it.
+ */
+void
+GrantLock(LOCK *lock, PROCLOCK *proclock, LOCKMODE lockmode)
+{
+	/* Bump the grant counters, per-mode and overall. */
+	lock->granted[lockmode]++;
+	lock->nGranted++;
+
+	/* Once every request for this mode is granted, no one waits for it. */
+	if (lock->granted[lockmode] == lock->requested[lockmode])
+		lock->waitMask &= LOCKBIT_OFF(lockmode);
+
+	/* Mark the mode as held, both on the lock and by this holder. */
+	lock->grantMask |= LOCKBIT_ON(lockmode);
+	proclock->holdMask |= LOCKBIT_ON(lockmode);
+
+	LOCK_PRINT("GrantLock", lock, lockmode);
+	Assert((lock->granted[lockmode] > 0) && (lock->nGranted > 0));
+	Assert(lock->nRequested >= lock->nGranted);
+}
+
+/*
+ * UnGrantLock -- opposite of GrantLock.
+ *
+ * Adjusts the lock and proclock structures so that "lockmode" is neither
+ * held nor requested any longer by the current holder.
+ *
+ * The result is true when some waiter on this lock might now be grantable,
+ * in which case ProcLockWakeup should be run.
+ */
+static bool
+UnGrantLock(LOCK *lock, LOCKMODE lockmode,
+			PROCLOCK *proclock, LockMethod lockMethodTable)
+{
+	bool		wakeupNeeded;
+
+	Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0));
+	Assert((lock->nGranted > 0) && (lock->granted[lockmode] > 0));
+	Assert(lock->nGranted <= lock->nRequested);
+
+	/* One grant and one request fewer, per-mode and overall. */
+	lock->granted[lockmode]--;
+	lock->nGranted--;
+	lock->requested[lockmode]--;
+	lock->nRequested--;
+
+	/* With no grants of this mode left, it no longer causes conflicts. */
+	if (lock->granted[lockmode] == 0)
+		lock->grantMask &= LOCKBIT_OFF(lockmode);
+
+	LOCK_PRINT("UnGrantLock: updated", lock, lockmode);
+
+	/*
+	 * ProcLockWakeup is only worth running when the mode just released
+	 * conflicts with a mode some waiter is asking for; if it doesn't, the
+	 * conflict that blocked the waiters is still there.  NOTE: before MVCC
+	 * it was enough to check whether lock->granted[lockmode] had dropped to
+	 * zero.  That shortcut is no longer valid, since the grants that remain
+	 * may belong to a waiter, and a waiter never conflicts with its own
+	 * locks.
+	 */
+	wakeupNeeded =
+		(lockMethodTable->conflictTab[lockmode] & lock->waitMask) != 0;
+
+	/* Finally, the holder itself no longer has this mode. */
+	proclock->holdMask &= LOCKBIT_OFF(lockmode);
+	PROCLOCK_PRINT("UnGrantLock: updated", proclock);
+
+	return wakeupNeeded;
+}
+
+/*
+ * CleanUpLock -- tidy up after a lock has been released.
+ *
+ * The proclock is deleted if it no longer holds anything, and the lock
+ * object is deleted if no requests for it remain.  If requests do remain
+ * and the caller passes wakeupNeeded = true, ProcLockWakeup is called.
+ * (The usual pattern is to call this right after UnGrantLock, passing
+ * UnGrantLock's result as wakeupNeeded.)
+ *
+ * The appropriate partition lock must be held at entry, and will be
+ * held at exit.
+ */
+static void
+CleanUpLock(LOCK *lock, PROCLOCK *proclock,
+			LockMethod lockMethodTable, uint32 hashcode,
+			bool wakeupNeeded)
+{
+	/*
+	 * A proclock with an empty hold mask means that was our last hold on
+	 * this lock: unlink it and drop it from the proclock table.
+	 */
+	if (proclock->holdMask == 0)
+	{
+		uint32		proclock_hashcode = ProcLockHashCode(&proclock->tag,
+														 hashcode);
+
+		PROCLOCK_PRINT("CleanUpLock: deleting", proclock);
+		dlist_delete(&proclock->lockLink);
+		dlist_delete(&proclock->procLink);
+		if (hash_search_with_hash_value(LockMethodProcLockHash,
+										&(proclock->tag),
+										proclock_hashcode,
+										HASH_REMOVE,
+										NULL) == NULL)
+			elog(PANIC, "proclock table corrupted");
+	}
+
+	if (lock->nRequested != 0)
+	{
+		/* Requests remain; wake the waiters if the caller asked us to. */
+		if (wakeupNeeded)
+			ProcLockWakeup(lockMethodTable, lock);
+		return;
+	}
+
+	/*
+	 * That was the last request of any kind, so the lock object itself can
+	 * be garbage-collected.
+	 */
+	LOCK_PRINT("CleanUpLock: deleting", lock, 0);
+	Assert(dlist_is_empty(&lock->procLocks));
+	if (hash_search_with_hash_value(LockMethodLockHash,
+									&(lock->tag),
+									hashcode,
+									HASH_REMOVE,
+									NULL) == NULL)
+		elog(PANIC, "lock table corrupted");
+}
+
+/*
+ * GrantLockLocal -- record in the locallock entry that the lock request
+ *		has been granted.
+ *
+ * The total count always goes up by one.  If "owner" already has a slot in
+ * the lockOwners array its per-owner count goes up too; otherwise a new slot
+ * is filled in, which LockAcquire is expected to have made room for.
+ */
+static void
+GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner)
+{
+	LOCALLOCKOWNER *slot = locallock->lockOwners;
+	LOCALLOCKOWNER *stop = slot + locallock->numLockOwners;
+
+	Assert(locallock->numLockOwners < locallock->maxLockOwners);
+
+	/* Overall hold count for this lock and mode. */
+	locallock->nLocks++;
+
+	/* An owner that is already listed just gets its own count bumped. */
+	for (; slot < stop; slot++)
+	{
+		if (slot->owner == owner)
+		{
+			slot->nLocks++;
+			return;
+		}
+	}
+
+	/* Not listed: slot now points at the first unused array element. */
+	slot->owner = owner;
+	slot->nLocks = 1;
+	locallock->numLockOwners++;
+	if (owner != NULL)
+		ResourceOwnerRememberLock(owner, locallock);
+
+	/* Indicate that the lock is acquired for certain types of locks. */
+	CheckAndSetLockHeld(locallock, true);
+}
+
+/*
+ * BeginStrongLockAcquire - block fast-path use for the partition a LOCALLOCK
+ * hashes to, and register the LOCALLOCK so that a failure can be cleaned up
+ *
+ * Increments FastPathStrongRelationLocks->count[fasthashcode], flags the
+ * locallock as holding that count, and remembers it in StrongLockInProgress.
+ */
+static void
+BeginStrongLockAcquire(LOCALLOCK *locallock, uint32 fasthashcode)
+{
+	Assert(StrongLockInProgress == NULL);
+	Assert(!locallock->holdsStrongLockCount);
+
+	/*
+	 * A plain increment of a memory location is not atomic, so the counter
+	 * is updated under a spinlock to keep concurrent updaters from stepping
+	 * on each other.
+	 *
+	 * XXX: On architectures that provide one, an atomic fetch-and-add
+	 * instruction might be worth using here instead.
+	 */
+	SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+
+	FastPathStrongRelationLocks->count[fasthashcode]++;
+	locallock->holdsStrongLockCount = true;
+	StrongLockInProgress = locallock;
+
+	SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+}
+
+/*
+ * FinishStrongLockAcquire - cancel pending cleanup for a strong lock
+ * acquisition once it's no longer needed
+ *
+ * This only clears StrongLockInProgress, so that a later call to
+ * AbortStrongLockAcquire finds nothing to undo.  The strong-lock count and
+ * the LOCALLOCK's holdsStrongLockCount flag are not touched here.
+ */
+static void
+FinishStrongLockAcquire(void)
+{
+	StrongLockInProgress = NULL;
+}
+
+/*
+ * AbortStrongLockAcquire - reverse the strong lock state changes made by
+ * BeginStrongLockAcquire.
+ *
+ * Does nothing when no strong lock acquisition is in progress.
+ */
+void
+AbortStrongLockAcquire(void)
+{
+	LOCALLOCK  *pending = StrongLockInProgress;
+	uint32		partition;
+
+	/* Nothing was begun, or it was already finished or aborted. */
+	if (pending == NULL)
+		return;
+
+	partition = FastPathStrongLockHashPartition(pending->hashcode);
+	Assert(pending->holdsStrongLockCount);
+
+	/* Give the count back and forget the in-progress acquisition. */
+	SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+
+	Assert(FastPathStrongRelationLocks->count[partition] > 0);
+	FastPathStrongRelationLocks->count[partition]--;
+	pending->holdsStrongLockCount = false;
+	StrongLockInProgress = NULL;
+
+	SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+}
+
+/*
+ * GrantAwaitedLock -- call GrantLockLocal for the lock we are doing
+ *		WaitOnLock on.
+ *
+ * The lock and owner used are awaitedLock and awaitedOwner, which WaitOnLock
+ * sets before it goes to sleep.
+ *
+ * proc.c needs this for the case where we are booted off the lock by
+ * timeout, but discover that someone granted us the lock anyway.
+ *
+ * We could just export GrantLockLocal, but that would require including
+ * resowner.h in lock.h, which creates circularity.
+ */
+void
+GrantAwaitedLock(void)
+{
+	GrantLockLocal(awaitedLock, awaitedOwner);
+}
+
+/*
+ * MarkLockClear -- mark an acquired lock as "clear"
+ *
+ * This means that we know we have absorbed all sinval messages that other
+ * sessions generated before we acquired this lock, and so we can confidently
+ * assume we know about any catalog changes protected by this lock.
+ *
+ * Once the flag is set, re-acquiring a lock we already hold makes
+ * LockAcquireExtended return LOCKACQUIRE_ALREADY_CLEAR instead of
+ * LOCKACQUIRE_ALREADY_HELD.
+ */
+void
+MarkLockClear(LOCALLOCK *locallock)
+{
+	/* Only a lock we currently hold can be marked clear */
+	Assert(locallock->nLocks > 0);
+	locallock->lockCleared = true;
+}
+
+/*
+ * WaitOnLock -- wait to acquire a lock
+ *
+ * Caller must have set MyProc->heldLocks to reflect locks already held
+ * on the lockable object by this process.
+ *
+ * The appropriate partition lock must be held at entry.
+ */
+static void
+WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner)
+{
+	LOCKMETHODID lockmethodid = LOCALLOCK_LOCKMETHOD(*locallock);
+	LockMethod	lockMethodTable = LockMethods[lockmethodid];
+
+	LOCK_PRINT("WaitOnLock: sleeping on lock",
+			   locallock->lock, locallock->tag.mode);
+
+	/* adjust the process title to indicate that it's waiting */
+	set_ps_display_suffix("waiting");
+
+	awaitedLock = locallock;	/* this pair is what GrantAwaitedLock passes on */
+	awaitedOwner = owner;
+
+	/*
+	 * NOTE: Think not to put any shared-state cleanup after the call to
+	 * ProcSleep, in either the normal or failure path.  The lock state must
+	 * be fully set by the lock grantor, or by CheckDeadLock if we give up
+	 * waiting for the lock.  This is necessary because of the possibility
+	 * that a cancel/die interrupt will interrupt ProcSleep after someone else
+	 * grants us the lock, but before we've noticed it. Hence, after granting,
+	 * the locktable state must fully reflect the fact that we own the lock;
+	 * we can't do additional work on return.
+	 *
+	 * We can and do use a PG_TRY block to try to clean up after failure, but
+	 * this still has a major limitation: elog(FATAL) can occur while waiting
+	 * (eg, a "die" interrupt), and then control won't come back here. So all
+	 * cleanup of essential state should happen in LockErrorCleanup, not here.
+	 * We can use PG_TRY to clear the "waiting" status flags, since doing that
+	 * is unimportant if the process exits.
+	 */
+	PG_TRY();
+	{
+		if (ProcSleep(locallock, lockMethodTable) != PROC_WAIT_STATUS_OK)
+		{
+			/*
+			 * We failed as a result of a deadlock, see CheckDeadLock(). Quit
+			 * now.
+			 */
+			awaitedLock = NULL;
+			LOCK_PRINT("WaitOnLock: aborting on lock",
+					   locallock->lock, locallock->tag.mode);
+			LWLockRelease(LockHashPartitionLock(locallock->hashcode));	/* presumably the lock held at entry */
+
+			/*
+			 * Now that we aren't holding the partition lock, we can give an
+			 * error report including details about the detected deadlock.
+			 */
+			DeadLockReport();
+			/* not reached */
+		}
+	}
+	PG_CATCH();
+	{
+		/* In this path, awaitedLock remains set until LockErrorCleanup */
+
+		/* reset ps display to remove the suffix */
+		set_ps_display_remove_suffix();
+
+		/* and propagate the error */
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+
+	awaitedLock = NULL;			/* reached only when ProcSleep reported OK */
+
+	/* reset ps display to remove the suffix */
+	set_ps_display_remove_suffix();
+
+	LOCK_PRINT("WaitOnLock: wakeup on lock",
+			   locallock->lock, locallock->tag.mode);
+}
+
+/*
+ * Remove a proc from the wait-queue it is on (caller must know it is on one).
+ * This is only used when the proc has failed to get the lock, so we set its
+ * waitStatus to PROC_WAIT_STATUS_ERROR.
+ *
+ * Appropriate partition lock must be held by caller.  Also, caller is
+ * responsible for signaling the proc if needed.
+ *
+ * NB: this does not clean up any locallock object that may exist for the lock.
+ */
+void
+RemoveFromWaitQueue(PGPROC *proc, uint32 hashcode)
+{
+	LOCK	   *waitLock = proc->waitLock;
+	PROCLOCK   *proclock = proc->waitProcLock;
+	LOCKMODE	lockmode = proc->waitLockMode;
+	LOCKMETHODID lockmethodid = LOCK_LOCKMETHOD(*waitLock);	/* NB: uses *waitLock ahead of Assert(waitLock) below */
+
+	/* Make sure proc is waiting */
+	Assert(proc->waitStatus == PROC_WAIT_STATUS_WAITING);
+	Assert(proc->links.next != NULL);
+	Assert(waitLock);
+	Assert(!dclist_is_empty(&waitLock->waitProcs));
+	Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods));
+
+	/* Remove proc from lock's wait queue */
+	dclist_delete_from_thoroughly(&waitLock->waitProcs, &proc->links);
+
+	/* Undo increments of request counts by waiting process */
+	Assert(waitLock->nRequested > 0);
+	Assert(waitLock->nRequested > proc->waitLock->nGranted);
+	waitLock->nRequested--;
+	Assert(waitLock->requested[lockmode] > 0);
+	waitLock->requested[lockmode]--;
+	/* don't forget to clear waitMask bit if appropriate */
+	if (waitLock->granted[lockmode] == waitLock->requested[lockmode])
+		waitLock->waitMask &= LOCKBIT_OFF(lockmode);
+
+	/* Clean up the proc's own state, and pass it the ok/fail signal */
+	proc->waitLock = NULL;		/* local copies taken above remain usable */
+	proc->waitProcLock = NULL;
+	proc->waitStatus = PROC_WAIT_STATUS_ERROR;
+
+	/*
+	 * Delete the proclock immediately if it represents no already-held locks.
+	 * (This must happen now because if the owner of the lock decides to
+	 * release it, and the requested/granted counts then go to zero,
+	 * LockRelease expects there to be no remaining proclocks.) Then see if
+	 * any other waiters for the lock can be woken up now.
+	 */
+	CleanUpLock(waitLock, proclock,
+				LockMethods[lockmethodid], hashcode,
+				true);			/* the slot where LockRelease passes wakeupNeeded */
+}
+
+/*
+ * LockRelease -- look up 'locktag' and release one 'lockmode' lock on it.
+ *		Release a session lock if 'sessionLock' is true, else release a
+ *		regular transaction lock.
+ *
+ * Side Effects: find any waiting processes that are now wakable,
+ *		grant them their requested locks and awaken them.
+ *		(We have to grant the lock here to avoid a race between
+ *		the waking process and any new process to
+ *		come along and request the lock.)
+ */
+bool
+LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
+{
+	LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+	LockMethod	lockMethodTable;
+	LOCALLOCKTAG localtag;
+	LOCALLOCK  *locallock;
+	LOCK	   *lock;
+	PROCLOCK   *proclock;
+	LWLock	   *partitionLock;
+	bool		wakeupNeeded;
+
+	if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+		elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+	lockMethodTable = LockMethods[lockmethodid];
+	if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
+		elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+#ifdef LOCK_DEBUG
+	if (LOCK_DEBUG_ENABLED(locktag))
+		elog(LOG, "LockRelease: lock [%u,%u] %s",
+			 locktag->locktag_field1, locktag->locktag_field2,
+			 lockMethodTable->lockModeNames[lockmode]);
+#endif
+
+	/*
+	 * Find the LOCALLOCK entry for this lock and lockmode
+	 */
+	MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */
+	localtag.lock = *locktag;
+	localtag.mode = lockmode;
+
+	locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+										  &localtag,
+										  HASH_FIND, NULL);
+
+	/*
+	 * let the caller print its own error message, too. Do not ereport(ERROR).
+	 */
+	if (!locallock || locallock->nLocks <= 0)
+	{
+		elog(WARNING, "you don't own a lock of type %s",
+			 lockMethodTable->lockModeNames[lockmode]);
+		return false;
+	}
+
+	/*
+	 * Decrease the count for the resource owner.
+	 */
+	{
+		LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+		ResourceOwner owner;
+		int			i;
+
+		/* Identify owner for lock */
+		if (sessionLock)
+			owner = NULL;		/* session locks use the NULL-owner slot */
+		else
+			owner = CurrentResourceOwner;
+
+		for (i = locallock->numLockOwners - 1; i >= 0; i--)
+		{
+			if (lockOwners[i].owner == owner)
+			{
+				Assert(lockOwners[i].nLocks > 0);
+				if (--lockOwners[i].nLocks == 0)
+				{
+					if (owner != NULL)
+						ResourceOwnerForgetLock(owner, locallock);
+					/* compact out unused slot */
+					locallock->numLockOwners--;
+					if (i < locallock->numLockOwners)
+						lockOwners[i] = lockOwners[locallock->numLockOwners];
+				}
+				break;			/* stop at the first slot matching this owner */
+			}
+		}
+		if (i < 0)
+		{
+			/* don't release a lock belonging to another owner */
+			elog(WARNING, "you don't own a lock of type %s",
+				 lockMethodTable->lockModeNames[lockmode]);
+			return false;
+		}
+	}
+
+	/*
+	 * Decrease the total local count.  If we're still holding the lock, we're
+	 * done.
+	 */
+	locallock->nLocks--;
+
+	if (locallock->nLocks > 0)
+		return true;			/* no fast-path or shared-table access was needed */
+
+	/*
+	 * At this point we can no longer suppose we are clear of invalidation
+	 * messages related to this lock.  Although we'll delete the LOCALLOCK
+	 * object before any intentional return from this routine, it seems worth
+	 * the trouble to explicitly reset lockCleared right now, just in case
+	 * some error prevents us from deleting the LOCALLOCK.
+	 */
+	locallock->lockCleared = false;
+
+	/* Attempt fast release of any lock eligible for the fast path. */
+	if (EligibleForRelationFastPath(locktag, lockmode) &&
+		FastPathLocalUseCount > 0)
+	{
+		bool		released;
+
+		/*
+		 * We might not find the lock here, even if we originally entered it
+		 * here.  Another backend may have moved it to the main table.
+		 */
+		LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+		released = FastPathUnGrantRelationLock(locktag->locktag_field2,
+											   lockmode);
+		LWLockRelease(&MyProc->fpInfoLock);
+		if (released)
+		{
+			RemoveLocalLock(locallock);
+			return true;		/* released without taking the partition lock */
+		}
+	}
+
+	/*
+	 * Otherwise we've got to mess with the shared lock table.
+	 */
+	partitionLock = LockHashPartitionLock(locallock->hashcode);
+
+	LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+	/*
+	 * Normally, we don't need to re-find the lock or proclock, since we kept
+	 * their addresses in the locallock table, and they couldn't have been
+	 * removed while we were holding a lock on them.  But it's possible that
+	 * the lock was taken fast-path and has since been moved to the main hash
+	 * table by another backend, in which case we will need to look up the
+	 * objects here.  We assume the lock field is NULL if so.
+	 */
+	lock = locallock->lock;
+	if (!lock)
+	{
+		PROCLOCKTAG proclocktag;
+
+		Assert(EligibleForRelationFastPath(locktag, lockmode));
+		lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+													locktag,
+													locallock->hashcode,
+													HASH_FIND,
+													NULL);
+		if (!lock)
+			elog(ERROR, "failed to re-find shared lock object");
+		locallock->lock = lock; /* cache the re-found pointer in the locallock */
+
+		proclocktag.myLock = lock;
+		proclocktag.myProc = MyProc;
+		locallock->proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash,
+													   &proclocktag,
+													   HASH_FIND,
+													   NULL);	/* no precomputed hash value is passed here */
+		if (!locallock->proclock)
+			elog(ERROR, "failed to re-find shared proclock object");
+	}
+	LOCK_PRINT("LockRelease: found", lock, lockmode);
+	proclock = locallock->proclock;
+	PROCLOCK_PRINT("LockRelease: found", proclock);
+
+	/*
+	 * Double-check that we are actually holding a lock of the type we want to
+	 * release.
+	 */
+	if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+	{
+		PROCLOCK_PRINT("LockRelease: WRONGTYPE", proclock);
+		LWLockRelease(partitionLock);	/* dropped before the WARNING is emitted */
+		elog(WARNING, "you don't own a lock of type %s",
+			 lockMethodTable->lockModeNames[lockmode]);
+		RemoveLocalLock(locallock);
+		return false;
+	}
+
+	/*
+	 * Do the releasing.  CleanUpLock will waken any now-wakable waiters.
+	 */
+	wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable);
+
+	CleanUpLock(lock, proclock,
+				lockMethodTable, locallock->hashcode,
+				wakeupNeeded);
+
+	LWLockRelease(partitionLock);
+
+	RemoveLocalLock(locallock); /* local entry goes away after the shared state is updated */
+	return true;
+}
+
+/*
+ * LockReleaseAll -- Release all locks of the specified lock method that
+ *		are held by the current process.
+ *
+ * Well, not necessarily *all* locks.  The available behaviors are:
+ *		allLocks == true: release all locks including session locks.
+ *		allLocks == false: release all non-session locks.
+ */
+void
+LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
+{
+	HASH_SEQ_STATUS status;
+	LockMethod	lockMethodTable;
+	int			i,
+				numLockModes;
+	LOCALLOCK  *locallock;
+	LOCK	   *lock;
+	int			partition;
+	bool		have_fast_path_lwlock = false;	/* true while we hold MyProc->fpInfoLock */
+
+	if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+		elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+	lockMethodTable = LockMethods[lockmethodid];
+
+#ifdef LOCK_DEBUG
+	if (*(lockMethodTable->trace_flag))
+		elog(LOG, "LockReleaseAll: lockmethod=%d", lockmethodid);
+#endif
+
+	/*
+	 * Get rid of our fast-path VXID lock, if appropriate.  Note that this is
+	 * the only way that the lock we hold on our own VXID can ever get
+	 * released: it is always and only released when a toplevel transaction
+	 * ends.
+	 */
+	if (lockmethodid == DEFAULT_LOCKMETHOD)
+		VirtualXactLockTableCleanup();
+
+	numLockModes = lockMethodTable->numLockModes;
+
+	/*
+	 * First we run through the locallock table and get rid of unwanted
+	 * entries, then we scan the process's proclocks and get rid of those. We
+	 * do this separately because we may have multiple locallock entries
+	 * pointing to the same proclock, and we daren't end up with any dangling
+	 * pointers.  Fast-path locks are cleaned up during the locallock table
+	 * scan, though.
+	 */
+	hash_seq_init(&status, LockMethodLocalHash);
+
+	while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+	{
+		/*
+		 * If the LOCALLOCK entry is unused, we must've run out of shared
+		 * memory while trying to set up this lock.  Just forget the local
+		 * entry.
+		 */
+		if (locallock->nLocks == 0)
+		{
+			RemoveLocalLock(locallock);
+			continue;			/* NB: this test precedes the lockmethod filter below */
+		}
+
+		/* Ignore items that are not of the lockmethod to be removed */
+		if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid)
+			continue;
+
+		/*
+		 * If we are asked to release all locks, we can just zap the entry.
+		 * Otherwise, must scan to see if there are session locks. We assume
+		 * there is at most one lockOwners entry for session locks.
+		 */
+		if (!allLocks)
+		{
+			LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+
+			/* If session lock is above array position 0, move it down to 0 */
+			for (i = 0; i < locallock->numLockOwners; i++)
+			{
+				if (lockOwners[i].owner == NULL)
+					lockOwners[0] = lockOwners[i];
+				else
+					ResourceOwnerForgetLock(lockOwners[i].owner, locallock);
+			}
+
+			if (locallock->numLockOwners > 0 &&
+				lockOwners[0].owner == NULL &&
+				lockOwners[0].nLocks > 0)
+			{
+				/* Fix the locallock to show just the session locks */
+				locallock->nLocks = lockOwners[0].nLocks;
+				locallock->numLockOwners = 1;
+				/* We aren't deleting this locallock, so done */
+				continue;
+			}
+			else
+				locallock->numLockOwners = 0;	/* no session-lock slot remains to keep */
+		}
+
+		/*
+		 * If the lock or proclock pointers are NULL, this lock was taken via
+		 * the relation fast-path (and is not known to have been transferred).
+		 */
+		if (locallock->proclock == NULL || locallock->lock == NULL)
+		{
+			LOCKMODE	lockmode = locallock->tag.mode;
+			Oid			relid;
+
+			/* Verify that a fast-path lock is what we've got. */
+			if (!EligibleForRelationFastPath(&locallock->tag.lock, lockmode))
+				elog(PANIC, "locallock table corrupted");
+
+			/*
+			 * If we don't currently hold the LWLock that protects our
+			 * fast-path data structures, we must acquire it before attempting
+			 * to release the lock via the fast-path.  We will continue to
+			 * hold the LWLock until we're done scanning the locallock table,
+			 * unless we hit a transferred fast-path lock.  (XXX is this
+			 * really such a good idea?  There could be a lot of entries ...)
+			 */
+			if (!have_fast_path_lwlock)
+			{
+				LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+				have_fast_path_lwlock = true;
+			}
+
+			/* Attempt fast-path release. */
+			relid = locallock->tag.lock.locktag_field2;
+			if (FastPathUnGrantRelationLock(relid, lockmode))
+			{
+				RemoveLocalLock(locallock);
+				continue;		/* fpInfoLock stays held for the next entry */
+			}
+
+			/*
+			 * Our lock, originally taken via the fast path, has been
+			 * transferred to the main lock table.  That's going to require
+			 * some extra work, so release our fast-path lock before starting.
+			 */
+			LWLockRelease(&MyProc->fpInfoLock);
+			have_fast_path_lwlock = false;
+
+			/*
+			 * Now dump the lock.  We haven't got a pointer to the LOCK or
+			 * PROCLOCK in this case, so we have to handle this a bit
+			 * differently than a normal lock release.  Unfortunately, this
+			 * requires an extra LWLock acquire-and-release cycle on the
+			 * partitionLock, but hopefully it shouldn't happen often.
+			 */
+			LockRefindAndRelease(lockMethodTable, MyProc,
+								 &locallock->tag.lock, lockmode, false);
+			RemoveLocalLock(locallock);
+			continue;
+		}
+
+		/* Mark the proclock to show we need to release this lockmode */
+		if (locallock->nLocks > 0)
+			locallock->proclock->releaseMask |= LOCKBIT_ON(locallock->tag.mode);
+
+		/* And remove the locallock hashtable entry */
+		RemoveLocalLock(locallock);
+	}
+
+	/* Done with the fast-path data structures */
+	if (have_fast_path_lwlock)
+		LWLockRelease(&MyProc->fpInfoLock);
+
+	/*
+	 * Now, scan each lock partition separately.
+	 */
+	for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++)
+	{
+		LWLock	   *partitionLock;
+		dlist_head *procLocks = &MyProc->myProcLocks[partition];
+		dlist_mutable_iter proclock_iter;
+
+		partitionLock = LockHashPartitionLockByIndex(partition);
+
+		/*
+		 * If the proclock list for this partition is empty, we can skip
+		 * acquiring the partition lock.  This optimization is trickier than
+		 * it looks, because another backend could be in process of adding
+		 * something to our proclock list due to promoting one of our
+		 * fast-path locks.  However, any such lock must be one that we
+		 * decided not to delete above, so it's okay to skip it again now;
+		 * we'd just decide not to delete it again.  We must, however, be
+		 * careful to re-fetch the list header once we've acquired the
+		 * partition lock, to be sure we have a valid, up-to-date pointer.
+		 * (There is probably no significant risk if pointer fetch/store is
+		 * atomic, but we don't wish to assume that.)
+		 *
+		 * XXX This argument assumes that the locallock table correctly
+		 * represents all of our fast-path locks.  While allLocks mode
+		 * guarantees to clean up all of our normal locks regardless of the
+		 * locallock situation, we lose that guarantee for fast-path locks.
+		 * This is not ideal.
+		 */
+		if (dlist_is_empty(procLocks))
+			continue;			/* needn't examine this partition */
+
+		LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+		dlist_foreach_modify(proclock_iter, procLocks)
+		{
+			PROCLOCK   *proclock = dlist_container(PROCLOCK, procLink, proclock_iter.cur);
+			bool		wakeupNeeded = false;
+
+			Assert(proclock->tag.myProc == MyProc);
+
+			lock = proclock->tag.myLock;
+
+			/* Ignore items that are not of the lockmethod to be removed */
+			if (LOCK_LOCKMETHOD(*lock) != lockmethodid)
+				continue;
+
+			/*
+			 * In allLocks mode, force release of all locks even if locallock
+			 * table had problems
+			 */
+			if (allLocks)
+				proclock->releaseMask = proclock->holdMask;
+			else
+				Assert((proclock->releaseMask & ~proclock->holdMask) == 0);
+
+			/*
+			 * Ignore items that have nothing to be released, unless they have
+			 * holdMask == 0 and are therefore recyclable
+			 */
+			if (proclock->releaseMask == 0 && proclock->holdMask != 0)
+				continue;
+
+			PROCLOCK_PRINT("LockReleaseAll", proclock);
+			LOCK_PRINT("LockReleaseAll", lock, 0);
+			Assert(lock->nRequested >= 0);
+			Assert(lock->nGranted >= 0);
+			Assert(lock->nGranted <= lock->nRequested);
+			Assert((proclock->holdMask & ~lock->grantMask) == 0);
+
+			/*
+			 * Release the previously-marked lock modes
+			 */
+			for (i = 1; i <= numLockModes; i++)
+			{
+				if (proclock->releaseMask & LOCKBIT_ON(i))
+					wakeupNeeded |= UnGrantLock(lock, i, proclock,
+												lockMethodTable);
+			}
+			Assert((lock->nRequested >= 0) && (lock->nGranted >= 0));
+			Assert(lock->nGranted <= lock->nRequested);
+			LOCK_PRINT("LockReleaseAll: updated", lock, 0);
+
+			proclock->releaseMask = 0;
+
+			/* CleanUpLock will wake up waiters if needed. */
+			CleanUpLock(lock, proclock,
+						lockMethodTable,
+						LockTagHashCode(&lock->tag),	/* hash code is recomputed from the lock's tag */
+						wakeupNeeded);
+		}						/* loop over PROCLOCKs within this partition */
+
+		LWLockRelease(partitionLock);
+	}							/* loop over partitions */
+
+#ifdef LOCK_DEBUG
+	if (*(lockMethodTable->trace_flag))
+		elog(LOG, "LockReleaseAll done");
+#endif
+}
+
+/*
+ * LockReleaseSession -- Drop every session lock of the given lock method
+ *		that the current process holds.
+ */
+void
+LockReleaseSession(LOCKMETHODID lockmethodid)
+{
+	HASH_SEQ_STATUS scan;
+	LOCALLOCK  *entry;
+
+	if (!(lockmethodid > 0 && lockmethodid < lengthof(LockMethods)))
+		elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+
+	hash_seq_init(&scan, LockMethodLocalHash);
+
+	/* Walk the whole local lock table; other lock methods are skipped */
+	for (entry = (LOCALLOCK *) hash_seq_search(&scan);
+		 entry != NULL;
+		 entry = (LOCALLOCK *) hash_seq_search(&scan))
+	{
+		if (LOCALLOCK_LOCKMETHOD(*entry) == lockmethodid)
+			ReleaseLockIfHeld(entry, true);
+	}
+}
+
+/*
+ * LockReleaseCurrentOwner
+ *		Release all locks belonging to CurrentResourceOwner
+ *
+ * A caller that already knows which locallocks are involved can hand them in
+ * as an array of 'nlocks' entries, which speeds the call up significantly
+ * when a lot of locks are held.  With locallocks == NULL we instead walk the
+ * entire local lock hash table to find them.
+ */
+void
+LockReleaseCurrentOwner(LOCALLOCK **locallocks, int nlocks)
+{
+	if (locallocks != NULL)
+	{
+		int			idx = nlocks;
+
+		/* Visit the caller's array from its last entry down to its first */
+		while (--idx >= 0)
+			ReleaseLockIfHeld(locallocks[idx], false);
+	}
+	else
+	{
+		HASH_SEQ_STATUS scan;
+		LOCALLOCK  *entry;
+
+		hash_seq_init(&scan, LockMethodLocalHash);
+		while ((entry = (LOCALLOCK *) hash_seq_search(&scan)) != NULL)
+			ReleaseLockIfHeld(entry, false);
+	}
+}
+
+/*
+ * ReleaseLockIfHeld
+ *		Release any session-level locks on this lockable object if sessionLock
+ *		is true; else, release any locks held by CurrentResourceOwner.
+ *
+ * It is tempting to pass this a ResourceOwner pointer (or NULL for session
+ * locks), but without refactoring LockRelease() we cannot support releasing
+ * locks belonging to resource owners other than CurrentResourceOwner.
+ * If we were to refactor, it'd be a good idea to fix it so we don't have to
+ * do a hashtable lookup of the locallock, too.  However, currently this
+ * function isn't used heavily enough to justify refactoring for its
+ * convenience.
+ */
+static void
+ReleaseLockIfHeld(LOCALLOCK *locallock, bool sessionLock)
+{
+	ResourceOwner owner;
+	LOCALLOCKOWNER *lockOwners;
+	int			i;
+
+	/* Identify owner for lock (must match LockRelease!) */
+	if (sessionLock)
+		owner = NULL;
+	else
+		owner = CurrentResourceOwner;
+
+	/* Scan to see if there are any locks belonging to the target owner */
+	lockOwners = locallock->lockOwners;
+	for (i = locallock->numLockOwners - 1; i >= 0; i--)
+	{
+		if (lockOwners[i].owner == owner)
+		{
+			Assert(lockOwners[i].nLocks > 0);
+			if (lockOwners[i].nLocks < locallock->nLocks)
+			{
+				/*
+				 * We will still hold this lock after forgetting this
+				 * ResourceOwner.
+				 */
+				locallock->nLocks -= lockOwners[i].nLocks;
+				/* compact out unused slot */
+				locallock->numLockOwners--;
+				if (owner != NULL)
+					ResourceOwnerForgetLock(owner, locallock);
+				if (i < locallock->numLockOwners)
+					lockOwners[i] = lockOwners[locallock->numLockOwners];
+			}
+			else
+			{
+				Assert(lockOwners[i].nLocks == locallock->nLocks);
+				/* We want to call LockRelease just once */
+				lockOwners[i].nLocks = 1;	/* so LockRelease's one decrement brings it to zero */
+				locallock->nLocks = 1;	/* likewise for the total local count */
+				if (!LockRelease(&locallock->tag.lock,
+								 locallock->tag.mode,
+								 sessionLock))
+					elog(WARNING, "ReleaseLockIfHeld: failed??");
+			}
+			break;				/* only the first matching slot is processed */
+		}
+	}
+}
+
+/*
+ * LockReassignCurrentOwner
+ *		Hand every lock owned by CurrentResourceOwner over to that owner's
+ *		parent resource owner.
+ *
+ * A caller that already knows which locallocks are involved can pass them as
+ * an array of 'nlocks' entries; that is significantly faster when a lot of
+ * locks are held (e.g. pg_dump with a large schema).  With locallocks ==
+ * NULL we instead walk the entire local lock hash table to find them.
+ */
+void
+LockReassignCurrentOwner(LOCALLOCK **locallocks, int nlocks)
+{
+	ResourceOwner parent;
+
+	parent = ResourceOwnerGetParent(CurrentResourceOwner);
+	Assert(parent != NULL);
+
+	if (locallocks != NULL)
+	{
+		int			idx = nlocks;
+
+		while (--idx >= 0)
+			LockReassignOwner(locallocks[idx], parent);
+	}
+	else
+	{
+		HASH_SEQ_STATUS scan;
+		LOCALLOCK  *entry;
+
+		hash_seq_init(&scan, LockMethodLocalHash);
+		while ((entry = (LOCALLOCK *) hash_seq_search(&scan)) != NULL)
+			LockReassignOwner(entry, parent);
+	}
+}
+
+/*
+ * Subroutine of LockReassignCurrentOwner. Reassigns a given lock belonging to
+ * CurrentResourceOwner to its parent.
+ */
+static void
+LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent)
+{
+	LOCALLOCKOWNER *lockOwners;
+	int			i;
+	int			ic = -1;		/* slot index of CurrentResourceOwner, -1 if none */
+	int			ip = -1;		/* slot index of parent, -1 if none */
+
+	/*
+	 * Scan to see if there are any locks belonging to current owner or its
+	 * parent
+	 */
+	lockOwners = locallock->lockOwners;
+	for (i = locallock->numLockOwners - 1; i >= 0; i--)
+	{
+		if (lockOwners[i].owner == CurrentResourceOwner)
+			ic = i;
+		else if (lockOwners[i].owner == parent)
+			ip = i;
+	}
+
+	if (ic < 0)
+		return;					/* no current locks */
+
+	if (ip < 0)
+	{
+		/* Parent has no slot, so just give it the child's slot */
+		lockOwners[ic].owner = parent;
+		ResourceOwnerRememberLock(parent, locallock);
+	}
+	else
+	{
+		/* Merge child's count with parent's */
+		lockOwners[ip].nLocks += lockOwners[ic].nLocks;
+		/* compact out unused slot */
+		locallock->numLockOwners--;
+		if (ic < locallock->numLockOwners)
+			lockOwners[ic] = lockOwners[locallock->numLockOwners];
+	}
+	ResourceOwnerForgetLock(CurrentResourceOwner, locallock);	/* runs for both branches above */
+}
+
+/*
+ * FastPathGrantRelationLock
+ *		Grant the lock through our per-backend fast-path array; false if no room.
+ */
+static bool
+FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode)
+{
+	uint32		free_slot = FP_LOCK_SLOTS_PER_BACKEND;
+	uint32		slot;
+
+	/* Look for a slot already tracking relid; note a vacant one as we go. */
+	for (slot = 0; slot < FP_LOCK_SLOTS_PER_BACKEND; slot++)
+	{
+		if (FAST_PATH_GET_BITS(MyProc, slot) == 0)
+		{
+			free_slot = slot;
+			continue;
+		}
+		if (MyProc->fpRelId[slot] != relid)
+			continue;
+
+		Assert(!FAST_PATH_CHECK_LOCKMODE(MyProc, slot, lockmode));
+		FAST_PATH_SET_LOCKMODE(MyProc, slot, lockmode);
+		return true;
+	}
+
+	/* relid is not present: fail unless some slot was vacant. */
+	if (free_slot >= FP_LOCK_SLOTS_PER_BACKEND)
+		return false;
+
+	MyProc->fpRelId[free_slot] = relid;
+	FAST_PATH_SET_LOCKMODE(MyProc, free_slot, lockmode);
+	++FastPathLocalUseCount;
+	return true;
+}
+
+/*
+ * FastPathUnGrantRelationLock
+ *		Drop 'lockmode' on 'relid' from our fast-path array if it is recorded
+ *		there, recounting FastPathLocalUseCount over all slots as we go.
+ */
+static bool
+FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode)
+{
+	bool		found = false;
+	uint32		slot = 0;
+
+	FastPathLocalUseCount = 0;
+	while (slot < FP_LOCK_SLOTS_PER_BACKEND)
+	{
+		if (MyProc->fpRelId[slot] == relid &&
+			FAST_PATH_CHECK_LOCKMODE(MyProc, slot, lockmode))
+		{
+			/* no early exit: the recount below must cover every slot */
+			Assert(!found);
+			FAST_PATH_CLEAR_LOCKMODE(MyProc, slot, lockmode);
+			found = true;
+		}
+		FastPathLocalUseCount += (FAST_PATH_GET_BITS(MyProc, slot) != 0) ? 1 : 0;
+		slot++;
+	}
+	return found;
+}
+
+/*
+ * FastPathTransferRelationLocks
+ *		Transfer locks matching the given lock tag from per-backend fast-path
+ *		arrays to the shared hash table.
+ *
+ * Returns true if successful, false if ran out of shared memory.
+ */
+static bool
+FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag,
+							  uint32 hashcode)
+{
+	LWLock	   *partitionLock = LockHashPartitionLock(hashcode);
+	Oid			relid = locktag->locktag_field2;
+	uint32		i;
+
+	/*
+	 * Every PGPROC that can potentially hold a fast-path lock is present in
+	 * ProcGlobal->allProcs.  Prepared transactions are not, but any
+	 * outstanding fast-path locks held by prepared transactions are
+	 * transferred to the main lock table.
+	 */
+	for (i = 0; i < ProcGlobal->allProcCount; i++)
+	{
+		PGPROC	   *proc = &ProcGlobal->allProcs[i];
+		uint32		f;
+
+		LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE);
+
+		/*
+		 * If the target backend isn't referencing the same database as the
+		 * lock, then we needn't examine the individual relation IDs at all;
+		 * none of them can be relevant.
+		 *
+		 * proc->databaseId is set at backend startup time and never changes
+		 * thereafter, so it might be safe to perform this test before
+		 * acquiring &proc->fpInfoLock.  In particular, it's certainly safe to
+		 * assume that if the target backend holds any fast-path locks, it
+		 * must have performed a memory-fencing operation (in particular, an
+		 * LWLock acquisition) since setting proc->databaseId.  However, it's
+		 * less clear that our backend is certain to have performed a memory
+		 * fencing operation since the other backend set proc->databaseId.  So
+		 * for now, we test it after acquiring the LWLock just to be safe.
+		 */
+		if (proc->databaseId != locktag->locktag_field1)
+		{
+			LWLockRelease(&proc->fpInfoLock);
+			continue;
+		}
+
+		for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+		{
+			uint32		lockmode;
+
+			/* Look for an allocated slot matching the given relid. */
+			if (relid != proc->fpRelId[f] || FAST_PATH_GET_BITS(proc, f) == 0)
+				continue;
+
+			/* Find or create lock object. */
+			LWLockAcquire(partitionLock, LW_EXCLUSIVE);	/* taken while proc->fpInfoLock is still held */
+			for (lockmode = FAST_PATH_LOCKNUMBER_OFFSET;
+				 lockmode < FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT;
+				 ++lockmode)
+			{
+				PROCLOCK   *proclock;
+
+				if (!FAST_PATH_CHECK_LOCKMODE(proc, f, lockmode))
+					continue;
+				proclock = SetupLockInTable(lockMethodTable, proc, locktag,
+											hashcode, lockmode);
+				if (!proclock)
+				{
+					LWLockRelease(partitionLock);
+					LWLockRelease(&proc->fpInfoLock);
+					return false;	/* both LWLocks were released just above */
+				}
+				GrantLock(proclock->tag.myLock, proclock, lockmode);
+				FAST_PATH_CLEAR_LOCKMODE(proc, f, lockmode);	/* cleared only after the shared-table grant */
+			}
+			LWLockRelease(partitionLock);
+
+			/* No need to examine remaining slots. */
+			break;
+		}
+		LWLockRelease(&proc->fpInfoLock);
+	}
+	return true;
+}
+
+/*
+ * FastPathGetRelationLockEntry
+ *		Return the PROCLOCK for a lock originally taken via the fast-path,
+ *		transferring it to the primary lock table if necessary.
+ *
+ * Note: caller takes care of updating the locallock object.
+ */
+static PROCLOCK *
+FastPathGetRelationLockEntry(LOCALLOCK *locallock)
+{
+	LockMethod	lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD];
+	LOCKTAG    *locktag = &locallock->tag.lock;
+	PROCLOCK   *proclock = NULL;	/* set only if we do the transfer ourselves below */
+	LWLock	   *partitionLock = LockHashPartitionLock(locallock->hashcode);
+	Oid			relid = locktag->locktag_field2;
+	uint32		f;
+
+	LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+
+	for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+	{
+		uint32		lockmode;
+
+		/* Look for an allocated slot matching the given relid. */
+		if (relid != MyProc->fpRelId[f] || FAST_PATH_GET_BITS(MyProc, f) == 0)
+			continue;
+
+		/* If we don't have a lock of the given mode, forget it! */
+		lockmode = locallock->tag.mode;
+		if (!FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode))
+			break;				/* leaves proclock NULL; handled after the loop */
+
+		/* Find or create lock object. */
+		LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+		proclock = SetupLockInTable(lockMethodTable, MyProc, locktag,
+									locallock->hashcode, lockmode);
+		if (!proclock)
+		{
+			LWLockRelease(partitionLock);
+			LWLockRelease(&MyProc->fpInfoLock);
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of shared memory"),
+					 errhint("You might need to increase %s.", "max_locks_per_transaction")));
+		}
+		GrantLock(proclock->tag.myLock, proclock, lockmode);
+		FAST_PATH_CLEAR_LOCKMODE(MyProc, f, lockmode);
+
+		LWLockRelease(partitionLock);
+
+		/* No need to examine remaining slots. */
+		break;
+	}
+
+	LWLockRelease(&MyProc->fpInfoLock);
+
+	/* Lock may have already been transferred by some other backend. */
+	if (proclock == NULL)
+	{
+		LOCK	   *lock;
+		PROCLOCKTAG proclocktag;
+		uint32		proclock_hashcode;
+
+		LWLockAcquire(partitionLock, LW_SHARED);	/* only HASH_FIND lookups follow */
+
+		lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+													locktag,
+													locallock->hashcode,
+													HASH_FIND,
+													NULL);
+		if (!lock)
+			elog(ERROR, "failed to re-find shared lock object");
+
+		proclocktag.myLock = lock;
+		proclocktag.myProc = MyProc;
+
+		proclock_hashcode = ProcLockHashCode(&proclocktag, locallock->hashcode);
+		proclock = (PROCLOCK *)
+			hash_search_with_hash_value(LockMethodProcLockHash,
+										&proclocktag,
+										proclock_hashcode,
+										HASH_FIND,
+										NULL);
+		if (!proclock)
+			elog(ERROR, "failed to re-find shared proclock object");
+		LWLockRelease(partitionLock);
+	}
+
+	return proclock;
+}
+
+/*
+ * GetLockConflicts
+ *		Get an array of VirtualTransactionIds of xacts currently holding locks
+ *		that would conflict with the specified lock/lockmode.
+ *		xacts merely awaiting such a lock are NOT reported.
+ *
+ * locktag identifies the lockable object and lockmode the mode to test
+ * against; ERROR is raised if the tag's lock method or the mode is out of
+ * range.
+ *
+ * The result array is palloc'd and is terminated with an invalid VXID.
+ * (In hot standby it is instead a single buffer allocated once in
+ * TopMemoryContext and reused, so its contents are overwritten by the next
+ * call.)
+ * *countp, if not null, is updated to the number of items set.
+ *
+ * Of course, the result could be out of date by the time it's returned, so
+ * use of this function has to be thought about carefully.  Similarly, a
+ * PGPROC with no "lxid" will be considered non-conflicting regardless of any
+ * lock it holds.  Existing callers don't care about a locker after that
+ * locker's pg_xact updates complete.  CommitTransaction() clears "lxid" after
+ * pg_xact updates and before releasing locks.
+ *
+ * Note we never include the current xact's vxid in the result array,
+ * since an xact never blocks itself.
+ */
+VirtualTransactionId *
+GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp)
+{
+	static VirtualTransactionId *vxids;
+	LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+	LockMethod	lockMethodTable;
+	LOCK	   *lock;
+	LOCKMASK	conflictMask;
+	dlist_iter	proclock_iter;
+	PROCLOCK   *proclock;
+	uint32		hashcode;
+	LWLock	   *partitionLock;
+	int			count = 0;
+	int			fast_count = 0;
+
+	if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+		elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+	lockMethodTable = LockMethods[lockmethodid];
+	if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
+		elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+	/*
+	 * Allocate memory to store results.  We only need enough space for
+	 * MaxBackends + max_prepared_xacts + a terminator.  In hot standby the
+	 * array is allocated once in TopMemoryContext and reused by later calls;
+	 * otherwise a fresh array is obtained with palloc0 on every call.  Either
+	 * way, the terminating invalid VXID is written explicitly before
+	 * returning.
+	 */
+	if (InHotStandby)
+	{
+		if (vxids == NULL)
+			vxids = (VirtualTransactionId *)
+				MemoryContextAlloc(TopMemoryContext,
+								   sizeof(VirtualTransactionId) *
+								   (MaxBackends + max_prepared_xacts + 1));
+	}
+	else
+		vxids = (VirtualTransactionId *)
+			palloc0(sizeof(VirtualTransactionId) *
+					(MaxBackends + max_prepared_xacts + 1));
+
+	/* Compute hash code and partition lock, and look up conflicting modes. */
+	hashcode = LockTagHashCode(locktag);
+	partitionLock = LockHashPartitionLock(hashcode);
+	conflictMask = lockMethodTable->conflictTab[lockmode];
+
+	/*
+	 * Fast path locks might not have been entered in the primary lock table.
+	 * If the lock we're dealing with could conflict with such a lock, we must
+	 * examine each backend's fast-path array for conflicts.
+	 */
+	if (ConflictsWithRelationFastPath(locktag, lockmode))
+	{
+		int			i;
+		Oid			relid = locktag->locktag_field2;
+		VirtualTransactionId vxid;
+
+		/*
+		 * Iterate over relevant PGPROCs.  Anything held by a prepared
+		 * transaction will have been transferred to the primary lock table,
+		 * so we need not worry about those.  This is all a bit fuzzy, because
+		 * new locks could be taken after we've visited a particular
+		 * partition, but the callers had better be prepared to deal with that
+		 * anyway, since the locks could equally well be taken between the
+		 * time we return the value and the time the caller does something
+		 * with it.
+		 */
+		for (i = 0; i < ProcGlobal->allProcCount; i++)
+		{
+			PGPROC	   *proc = &ProcGlobal->allProcs[i];
+			uint32		f;
+
+			/* A backend never blocks itself */
+			if (proc == MyProc)
+				continue;
+
+			LWLockAcquire(&proc->fpInfoLock, LW_SHARED);
+
+			/*
+			 * If the target backend isn't referencing the same database as
+			 * the lock, then we needn't examine the individual relation IDs
+			 * at all; none of them can be relevant.
+			 *
+			 * See FastPathTransferRelationLocks() for discussion of why we do
+			 * this test after acquiring the lock.
+			 */
+			if (proc->databaseId != locktag->locktag_field1)
+			{
+				LWLockRelease(&proc->fpInfoLock);
+				continue;
+			}
+
+			for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+			{
+				uint32		lockmask;
+
+				/* Look for an allocated slot matching the given relid. */
+				if (relid != proc->fpRelId[f])
+					continue;
+				lockmask = FAST_PATH_GET_BITS(proc, f);
+				if (!lockmask)
+					continue;
+				lockmask <<= FAST_PATH_LOCKNUMBER_OFFSET;
+
+				/*
+				 * There can only be one entry per relation, so if we found it
+				 * and it doesn't conflict, we can skip the rest of the slots.
+				 */
+				if ((lockmask & conflictMask) == 0)
+					break;
+
+				/* Conflict! */
+				GET_VXID_FROM_PGPROC(vxid, *proc);
+
+				if (VirtualTransactionIdIsValid(vxid))
+					vxids[count++] = vxid;
+				/* else, xact already committed or aborted */
+
+				/* No need to examine remaining slots. */
+				break;
+			}
+
+			LWLockRelease(&proc->fpInfoLock);
+		}
+	}
+
+	/*
+	 * Remember how many fast-path conflicts we found; the duplicate check in
+	 * the main-table scan below only looks at these first entries.
+	 */
+	fast_count = count;
+
+	/*
+	 * Look up the lock object matching the tag.
+	 */
+	LWLockAcquire(partitionLock, LW_SHARED);
+
+	lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+												locktag,
+												hashcode,
+												HASH_FIND,
+												NULL);
+	if (!lock)
+	{
+		/*
+		 * If the lock object doesn't exist, there is nothing holding a lock
+		 * on this lockable object in the primary table.  Terminate and
+		 * return whatever the fast-path scan collected.
+		 */
+		LWLockRelease(partitionLock);
+		vxids[count].backendId = InvalidBackendId;
+		vxids[count].localTransactionId = InvalidLocalTransactionId;
+		if (countp)
+			*countp = count;
+		return vxids;
+	}
+
+	/*
+	 * Examine each existing holder (or awaiter) of the lock.  Only entries
+	 * whose holdMask intersects conflictMask are reported.
+	 */
+	dlist_foreach(proclock_iter, &lock->procLocks)
+	{
+		proclock = dlist_container(PROCLOCK, lockLink, proclock_iter.cur);
+
+		if (conflictMask & proclock->holdMask)
+		{
+			PGPROC	   *proc = proclock->tag.myProc;
+
+			/* A backend never blocks itself */
+			if (proc != MyProc)
+			{
+				VirtualTransactionId vxid;
+
+				GET_VXID_FROM_PGPROC(vxid, *proc);
+
+				if (VirtualTransactionIdIsValid(vxid))
+				{
+					int			i;
+
+					/*
+					 * Avoid duplicate entries: skip a vxid already recorded
+					 * by the fast-path scan above.
+					 */
+					for (i = 0; i < fast_count; ++i)
+						if (VirtualTransactionIdEquals(vxids[i], vxid))
+							break;
+					if (i >= fast_count)
+						vxids[count++] = vxid;
+				}
+				/* else, xact already committed or aborted */
+			}
+		}
+	}
+
+	LWLockRelease(partitionLock);
+
+	if (count > MaxBackends + max_prepared_xacts)	/* should never happen */
+		elog(PANIC, "too many conflicting locks found");
+
+	vxids[count].backendId = InvalidBackendId;
+	vxids[count].localTransactionId = InvalidLocalTransactionId;
+	if (countp)
+		*countp = count;
+	return vxids;
+}
+
+/*
+ * Find a lock in the shared lock table and release it.  It is the caller's
+ * responsibility to verify that this is a sane thing to do.  (For example, it
+ * would be bad to release a lock here if there might still be a LOCALLOCK
+ * object with pointers to it.)
+ *
+ * We currently use this in two situations: first, to release locks held by
+ * prepared transactions on commit (see lock_twophase_postcommit); and second,
+ * to release locks taken via the fast-path, transferred to the main hash
+ * table, and then released (see LockReleaseAll).
+ *
+ * lockMethodTable, locktag and lockmode describe the lock to release, and
+ * proc is the PGPROC whose PROCLOCK should hold it.  If
+ * decrement_strong_lock_count is true and the lock conflicts with the
+ * relation fast path, the matching FastPathStrongRelationLocks counter is
+ * decremented after the release.
+ *
+ * Failure to find the LOCK or PROCLOCK is a PANIC.  If the PROCLOCK does not
+ * hold the requested mode, only a WARNING is issued and nothing is released.
+ */
+static void
+LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc,
+					 LOCKTAG *locktag, LOCKMODE lockmode,
+					 bool decrement_strong_lock_count)
+{
+	LOCK	   *lock;
+	PROCLOCK   *proclock;
+	PROCLOCKTAG proclocktag;
+	uint32		hashcode;
+	uint32		proclock_hashcode;
+	LWLock	   *partitionLock;
+	bool		wakeupNeeded;
+
+	hashcode = LockTagHashCode(locktag);
+	partitionLock = LockHashPartitionLock(hashcode);
+
+	LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+	/*
+	 * Re-find the lock object (it had better be there).
+	 */
+	lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+												locktag,
+												hashcode,
+												HASH_FIND,
+												NULL);
+	if (!lock)
+		elog(PANIC, "failed to re-find shared lock object");
+
+	/*
+	 * Re-find the proclock object (ditto).
+	 */
+	proclocktag.myLock = lock;
+	proclocktag.myProc = proc;
+
+	proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode);
+
+	proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash,
+														&proclocktag,
+														proclock_hashcode,
+														HASH_FIND,
+														NULL);
+	if (!proclock)
+		elog(PANIC, "failed to re-find shared proclock object");
+
+	/*
+	 * Double-check that we are actually holding a lock of the type we want to
+	 * release.  If not, drop the partition lock and just warn; the strong
+	 * lock count is left untouched in that case.
+	 */
+	if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+	{
+		PROCLOCK_PRINT("lock_twophase_postcommit: WRONGTYPE", proclock);
+		LWLockRelease(partitionLock);
+		elog(WARNING, "you don't own a lock of type %s",
+			 lockMethodTable->lockModeNames[lockmode]);
+		return;
+	}
+
+	/*
+	 * Do the releasing.  CleanUpLock will waken any now-wakable waiters.
+	 */
+	wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable);
+
+	CleanUpLock(lock, proclock,
+				lockMethodTable, hashcode,
+				wakeupNeeded);
+
+	LWLockRelease(partitionLock);
+
+	/*
+	 * Decrement strong lock count.  This logic is needed only for 2PC.  The
+	 * counter is protected by the FastPathStrongRelationLocks spinlock, which
+	 * is taken only after the partition lock has been released.
+	 */
+	if (decrement_strong_lock_count
+		&& ConflictsWithRelationFastPath(locktag, lockmode))
+	{
+		uint32		fasthashcode = FastPathStrongLockHashPartition(hashcode);
+
+		SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+		Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0);
+		FastPathStrongRelationLocks->count[fasthashcode]--;
+		SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+	}
+}
+
+/*
+ * CheckForSessionAndXactLocks
+ *		Raise an error if the transaction holds both a session-level and a
+ *		transaction-level lock on any single lockable object.
+ *
+ * PREPARE TRANSACTION has to fail when session- and transaction-level locks
+ * coexist on one object.  Regular locks should never get into that state,
+ * because session-level regular locks are only taken by a few special
+ * operations such as VACUUM; advisory locks, however, can.
+ *
+ * Ideally the session hold would stay with us while the transactional hold
+ * moved to the prepared xact.  That needs two PROCLOCK objects, though, and
+ * there is no guarantee a second one can be had by the time
+ * PostPrepare_Locks runs.  So we fail here, while failing is still safe.
+ *
+ * The LOCALLOCK table has one entry per (locktag, lockmode) pair, so looking
+ * at single LOCALLOCK entries is not enough.  Instead, the per-mode findings
+ * are merged in a temporary hash table keyed by locktag alone.
+ */
+static void
+CheckForSessionAndXactLocks(void)
+{
+	typedef struct
+	{
+		LOCKTAG		lock;		/* hash key: the lockable object */
+		bool		sessLock;	/* some lockmode is held at session level */
+		bool		xactLock;	/* some lockmode is held at xact level */
+	} PerLockTagEntry;
+
+	HASHCTL		ctl;
+	HTAB	   *tagtab;
+	HASH_SEQ_STATUS scan;
+	LOCALLOCK  *locallock;
+
+	/* Transient table in the current memory context, keyed by LOCKTAG */
+	ctl.keysize = sizeof(LOCKTAG);
+	ctl.entrysize = sizeof(PerLockTagEntry);
+	ctl.hcxt = CurrentMemoryContext;
+
+	tagtab = hash_create("CheckForSessionAndXactLocks table",
+						 256,	/* arbitrary initial size */
+						 &ctl,
+						 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+	/* Walk every LOCALLOCK and merge its owners into the per-tag entry */
+	hash_seq_init(&scan, LockMethodLocalHash);
+
+	for (;;)
+	{
+		PerLockTagEntry *entry;
+		bool		existed;
+		int			nowners;
+		int			idx;
+
+		locallock = (LOCALLOCK *) hash_seq_search(&scan);
+		if (locallock == NULL)
+			break;
+
+		/*
+		 * VXID locks are never handed to prepared transactions (they mean
+		 * nothing after a restart), and entries with no hold count are not
+		 * actually held; neither kind is of interest here.
+		 */
+		if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION ||
+			locallock->nLocks <= 0)
+			continue;
+
+		/* Fetch the entry for this tag, creating a cleared one if needed */
+		entry = (PerLockTagEntry *) hash_search(tagtab,
+												&locallock->tag.lock,
+												HASH_ENTER, &existed);
+		if (!existed)
+		{
+			entry->sessLock = false;
+			entry->xactLock = false;
+		}
+
+		/* A NULL owner marks a session-level hold, anything else xact-level */
+		nowners = locallock->numLockOwners;
+		for (idx = 0; idx < nowners; idx++)
+		{
+			if (locallock->lockOwners[idx].owner != NULL)
+				entry->xactLock = true;
+			else
+				entry->sessLock = true;
+		}
+
+		/* One violation is enough; no point in looking for more */
+		if (entry->sessLock && entry->xactLock)
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object")));
+	}
+
+	/* No conflict found: discard the transient table */
+	hash_destroy(tagtab);
+}
+
+/*
+ * AtPrepare_Locks
+ *		Do the preparatory work for a PREPARE: make 2PC state file records
+ *		for all locks currently held.
+ *
+ * Session-level locks are ignored, as are VXID locks.
+ *
+ * For the most part, we don't need to touch shared memory for this ---
+ * all the necessary state information is in the locallock table.
+ * Fast-path locks are an exception, however: we move any such locks to
+ * the main table before allowing PREPARE TRANSACTION to succeed.
+ *
+ * One TwoPhaseLockRecord (locktag plus lockmode) is registered per
+ * transaction-level LOCALLOCK entry.  An ERROR is raised, via
+ * CheckForSessionAndXactLocks, if any object is locked at both session and
+ * transaction level.
+ */
+void
+AtPrepare_Locks(void)
+{
+	HASH_SEQ_STATUS status;
+	LOCALLOCK  *locallock;
+
+	/* First, verify there aren't locks of both xact and session level */
+	CheckForSessionAndXactLocks();
+
+	/* Now do the per-locallock cleanup work */
+	hash_seq_init(&status, LockMethodLocalHash);
+
+	while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+	{
+		TwoPhaseLockRecord record;
+		LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+		bool		haveSessionLock;
+		bool		haveXactLock;
+		int			i;
+
+		/*
+		 * Ignore VXID locks.  We don't want those to be held by prepared
+		 * transactions, since they aren't meaningful after a restart.
+		 */
+		if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+			continue;
+
+		/* Ignore it if we don't actually hold the lock */
+		if (locallock->nLocks <= 0)
+			continue;
+
+		/*
+		 * Scan to see whether we hold it at session or transaction level.  A
+		 * NULL owner denotes a session-level hold.
+		 */
+		haveSessionLock = haveXactLock = false;
+		for (i = locallock->numLockOwners - 1; i >= 0; i--)
+		{
+			if (lockOwners[i].owner == NULL)
+				haveSessionLock = true;
+			else
+				haveXactLock = true;
+		}
+
+		/* Ignore it if we have only session lock */
+		if (!haveXactLock)
+			continue;
+
+		/* This can't happen, because we already checked it */
+		if (haveSessionLock)
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object")));
+
+		/*
+		 * If the local lock was taken via the fast-path, we need to move it
+		 * to the primary lock table, or just get a pointer to the existing
+		 * primary lock table entry if by chance it's already been
+		 * transferred.  A NULL proclock pointer is what identifies such a
+		 * locallock here.
+		 */
+		if (locallock->proclock == NULL)
+		{
+			locallock->proclock = FastPathGetRelationLockEntry(locallock);
+			locallock->lock = locallock->proclock->tag.myLock;
+		}
+
+		/*
+		 * Arrange to not release any strong lock count held by this lock
+		 * entry.  We must retain the count until the prepared transaction is
+		 * committed or rolled back.
+		 */
+		locallock->holdsStrongLockCount = false;
+
+		/*
+		 * Create a 2PC record holding the lock tag and mode.
+		 */
+		memcpy(&(record.locktag), &(locallock->tag.lock), sizeof(LOCKTAG));
+		record.lockmode = locallock->tag.mode;
+
+		RegisterTwoPhaseRecord(TWOPHASE_RM_LOCK_ID, 0,
+							   &record, sizeof(TwoPhaseLockRecord));
+	}
+}
+
+/*
+ * PostPrepare_Locks
+ *		Clean up after successful PREPARE
+ *
+ * Here, we want to transfer ownership of our locks to a dummy PGPROC
+ * that's now associated with the prepared transaction, and we want to
+ * clean out the corresponding entries in the LOCALLOCK table.
+ *
+ * xid is the transaction ID handed to TwoPhaseGetDummyProc to obtain that
+ * dummy PGPROC.  The whole transfer runs inside a critical section.
+ *
+ * Note: by removing the LOCALLOCK entries, we are leaving dangling
+ * pointers in the transaction's resource owner.  This is OK at the
+ * moment since resowner.c doesn't try to free locks retail at a toplevel
+ * transaction commit or abort.  We could alternatively zero out nLocks
+ * and leave the LOCALLOCK entries to be garbage-collected by LockReleaseAll,
+ * but that probably costs more cycles.
+ */
+void
+PostPrepare_Locks(TransactionId xid)
+{
+	PGPROC	   *newproc = TwoPhaseGetDummyProc(xid, false);
+	HASH_SEQ_STATUS status;
+	LOCALLOCK  *locallock;
+	LOCK	   *lock;
+	PROCLOCK   *proclock;
+	PROCLOCKTAG proclocktag;
+	int			partition;
+
+	/* Can't prepare a lock group follower. */
+	Assert(MyProc->lockGroupLeader == NULL ||
+		   MyProc->lockGroupLeader == MyProc);
+
+	/* This is a critical section: any error means big trouble */
+	START_CRIT_SECTION();
+
+	/*
+	 * First we run through the locallock table and get rid of unwanted
+	 * entries, then we scan the process's proclocks and transfer them to the
+	 * target proc.
+	 *
+	 * We do this separately because we may have multiple locallock entries
+	 * pointing to the same proclock, and we daren't end up with any dangling
+	 * pointers.
+	 */
+	hash_seq_init(&status, LockMethodLocalHash);
+
+	while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+	{
+		LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+		bool		haveSessionLock;
+		bool		haveXactLock;
+		int			i;
+
+		if (locallock->proclock == NULL || locallock->lock == NULL)
+		{
+			/*
+			 * We must've run out of shared memory while trying to set up this
+			 * lock.  Just forget the local entry.
+			 */
+			Assert(locallock->nLocks == 0);
+			RemoveLocalLock(locallock);
+			continue;
+		}
+
+		/* Ignore VXID locks */
+		if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+			continue;
+
+		/* Scan to see whether we hold it at session or transaction level */
+		haveSessionLock = haveXactLock = false;
+		for (i = locallock->numLockOwners - 1; i >= 0; i--)
+		{
+			if (lockOwners[i].owner == NULL)
+				haveSessionLock = true;
+			else
+				haveXactLock = true;
+		}
+
+		/* Ignore it if we have only session lock */
+		if (!haveXactLock)
+			continue;
+
+		/*
+		 * This can't happen, because we already checked it.  Since we are in
+		 * a critical section, it is reported as PANIC rather than ERROR.
+		 */
+		if (haveSessionLock)
+			ereport(PANIC,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object")));
+
+		/*
+		 * Mark the proclock to show we need to release this lockmode.  The
+		 * accumulated releaseMask is checked against holdMask in the
+		 * per-partition scan below.
+		 */
+		if (locallock->nLocks > 0)
+			locallock->proclock->releaseMask |= LOCKBIT_ON(locallock->tag.mode);
+
+		/* And remove the locallock hashtable entry */
+		RemoveLocalLock(locallock);
+	}
+
+	/*
+	 * Now, scan each lock partition separately.
+	 */
+	for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++)
+	{
+		LWLock	   *partitionLock;
+		dlist_head *procLocks = &(MyProc->myProcLocks[partition]);
+		dlist_mutable_iter proclock_iter;
+
+		partitionLock = LockHashPartitionLockByIndex(partition);
+
+		/*
+		 * If the proclock list for this partition is empty, we can skip
+		 * acquiring the partition lock.  This optimization is safer than the
+		 * situation in LockReleaseAll, because we got rid of any fast-path
+		 * locks during AtPrepare_Locks, so there cannot be any case where
+		 * another backend is adding something to our lists now.  For safety,
+		 * though, we code this the same way as in LockReleaseAll.
+		 */
+		if (dlist_is_empty(procLocks))
+			continue;			/* needn't examine this partition */
+
+		LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+		dlist_foreach_modify(proclock_iter, procLocks)
+		{
+			proclock = dlist_container(PROCLOCK, procLink, proclock_iter.cur);
+
+			Assert(proclock->tag.myProc == MyProc);
+
+			lock = proclock->tag.myLock;
+
+			/* Ignore VXID locks */
+			if (lock->tag.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+				continue;
+
+			PROCLOCK_PRINT("PostPrepare_Locks", proclock);
+			LOCK_PRINT("PostPrepare_Locks", lock, 0);
+			Assert(lock->nRequested >= 0);
+			Assert(lock->nGranted >= 0);
+			Assert(lock->nGranted <= lock->nRequested);
+			Assert((proclock->holdMask & ~lock->grantMask) == 0);
+
+			/* Ignore it if nothing to release (must be a session lock) */
+			if (proclock->releaseMask == 0)
+				continue;
+
+			/* Else we should be releasing all locks */
+			if (proclock->releaseMask != proclock->holdMask)
+				elog(PANIC, "we seem to have dropped a bit somewhere");
+
+			/*
+			 * We cannot simply modify proclock->tag.myProc to reassign
+			 * ownership of the lock, because that's part of the hash key and
+			 * the proclock would then be in the wrong hash chain.  Instead
+			 * use hash_update_hash_key.  (We used to create a new hash entry,
+			 * but that risks out-of-memory failure if other processes are
+			 * busy making proclocks too.)  We must unlink the proclock from
+			 * our procLink chain and put it into the new proc's chain, too.
+			 *
+			 * Note: the updated proclock hash key will still belong to the
+			 * same hash partition, cf proclock_hash().  So the partition lock
+			 * we already hold is sufficient for this.
+			 */
+			dlist_delete(&proclock->procLink);
+
+			/*
+			 * Create the new hash key for the proclock.
+			 */
+			proclocktag.myLock = lock;
+			proclocktag.myProc = newproc;
+
+			/*
+			 * Update groupLeader pointer to point to the new proc.  (We'd
+			 * better not be a member of somebody else's lock group!)
+			 */
+			Assert(proclock->groupLeader == proclock->tag.myProc);
+			proclock->groupLeader = newproc;
+
+			/*
+			 * Update the proclock.  We should not find any existing entry for
+			 * the same hash key, since there can be only one entry for any
+			 * given lock with my own proc.
+			 */
+			if (!hash_update_hash_key(LockMethodProcLockHash,
+									  proclock,
+									  &proclocktag))
+				elog(PANIC, "duplicate entry found while reassigning a prepared transaction's locks");
+
+			/* Re-link into the new proc's proclock list */
+			dlist_push_tail(&newproc->myProcLocks[partition], &proclock->procLink);
+
+			PROCLOCK_PRINT("PostPrepare_Locks: updated", proclock);
+		}						/* loop over PROCLOCKs within this partition */
+
+		LWLockRelease(partitionLock);
+	}							/* loop over partitions */
+
+	END_CRIT_SECTION();
+}
+
+
+/*
+ * LockShmemSize
+ *		Estimate the shared-memory space used by the lock tables.
+ *
+ * The result is the estimated size of the LOCK hash table (NLOCKENTS
+ * entries) plus that of the PROCLOCK hash table (twice as many entries),
+ * padded by 10% because NLOCKENTS is only an estimate.
+ */
+Size
+LockShmemSize(void)
+{
+	long		nlocks = NLOCKENTS();
+	long		nproclocks = nlocks * 2;
+	Size		total;
+
+	/* lock hash table */
+	total = add_size(0, hash_estimate_size(nlocks, sizeof(LOCK)));
+
+	/* proclock hash table */
+	total = add_size(total, hash_estimate_size(nproclocks, sizeof(PROCLOCK)));
+
+	/* 10% safety margin on top of both tables */
+	total = add_size(total, total / 10);
+
+	return total;
+}
+
+/*
+ * GetLockStatusData - Return a summary of the lock manager's internal
+ * status, for use in a user-level reporting function.
+ *
+ * The return data consists of an array of LockInstanceData objects,
+ * which are a lightly abstracted version of the PROCLOCK data structures,
+ * i.e. there is one entry for each unique lock and interested PGPROC.
+ * It is the caller's responsibility to match up related items (such as
+ * references to the same lockable object or PGPROC) if wanted.
+ *
+ * The returned LockData and its locks[] array are palloc'd; nelements is the
+ * number of entries filled in.  Fast-path entries (fastpath = true) come
+ * first, followed by one entry per PROCLOCK in the shared hash table.
+ *
+ * The design goal is to hold the LWLocks for as short a time as possible;
+ * thus, this function simply makes a copy of the necessary data and releases
+ * the locks, allowing the caller to contemplate and format the data for as
+ * long as it pleases.
+ */
+LockData *
+GetLockStatusData(void)
+{
+	LockData   *data;
+	PROCLOCK   *proclock;
+	HASH_SEQ_STATUS seqstat;
+	int			els;
+	int			el;
+	int			i;
+
+	data = (LockData *) palloc(sizeof(LockData));
+
+	/*
+	 * Guess how much space we'll need.  els is the allocated capacity and el
+	 * the number of entries filled so far; the array is enlarged below
+	 * whenever el catches up with els.
+	 */
+	els = MaxBackends;
+	el = 0;
+	data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * els);
+
+	/*
+	 * First, we iterate through the per-backend fast-path arrays, locking
+	 * them one at a time.  This might produce an inconsistent picture of the
+	 * system state, but taking all of those LWLocks at the same time seems
+	 * impractical (in particular, note MAX_SIMUL_LWLOCKS).  It shouldn't
+	 * matter too much, because none of these locks can be involved in lock
+	 * conflicts anyway - anything that might must be present in the main lock
+	 * table.  (For the same reason, we don't sweat about making leaderPid
+	 * completely valid.  We cannot safely dereference another backend's
+	 * lockGroupLeader field without holding all lock partition locks, and
+	 * it's not worth that.)
+	 */
+	for (i = 0; i < ProcGlobal->allProcCount; ++i)
+	{
+		PGPROC	   *proc = &ProcGlobal->allProcs[i];
+		uint32		f;
+
+		LWLockAcquire(&proc->fpInfoLock, LW_SHARED);
+
+		for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; ++f)
+		{
+			LockInstanceData *instance;
+			uint32		lockbits = FAST_PATH_GET_BITS(proc, f);
+
+			/* Skip unallocated slots. */
+			if (!lockbits)
+				continue;
+
+			if (el >= els)
+			{
+				els += MaxBackends;
+				data->locks = (LockInstanceData *)
+					repalloc(data->locks, sizeof(LockInstanceData) * els);
+			}
+
+			instance = &data->locks[el];
+			SET_LOCKTAG_RELATION(instance->locktag, proc->databaseId,
+								 proc->fpRelId[f]);
+			instance->holdMask = lockbits << FAST_PATH_LOCKNUMBER_OFFSET;
+			instance->waitLockMode = NoLock;
+			instance->backend = proc->backendId;
+			instance->lxid = proc->lxid;
+			instance->pid = proc->pid;
+			instance->leaderPid = proc->pid;
+			instance->fastpath = true;
+
+			/*
+			 * Successfully taking fast path lock means there were no
+			 * conflicting locks.
+			 */
+			instance->waitStart = 0;
+
+			el++;
+		}
+
+		if (proc->fpVXIDLock)
+		{
+			VirtualTransactionId vxid;
+			LockInstanceData *instance;
+
+			if (el >= els)
+			{
+				els += MaxBackends;
+				data->locks = (LockInstanceData *)
+					repalloc(data->locks, sizeof(LockInstanceData) * els);
+			}
+
+			vxid.backendId = proc->backendId;
+			vxid.localTransactionId = proc->fpLocalTransactionId;
+
+			instance = &data->locks[el];
+			SET_LOCKTAG_VIRTUALTRANSACTION(instance->locktag, vxid);
+			instance->holdMask = LOCKBIT_ON(ExclusiveLock);
+			instance->waitLockMode = NoLock;
+			instance->backend = proc->backendId;
+			instance->lxid = proc->lxid;
+			instance->pid = proc->pid;
+			instance->leaderPid = proc->pid;
+			instance->fastpath = true;
+			instance->waitStart = 0;
+
+			el++;
+		}
+
+		LWLockRelease(&proc->fpInfoLock);
+	}
+
+	/*
+	 * Next, acquire lock on the entire shared lock data structure.  We do
+	 * this so that, at least for locks in the primary lock table, the state
+	 * will be self-consistent.
+	 *
+	 * Since this is a read-only operation, we take shared instead of
+	 * exclusive lock.  There's not a whole lot of point to this, because all
+	 * the normal operations require exclusive lock, but it doesn't hurt
+	 * anything either. It will at least allow two backends to do
+	 * GetLockStatusData in parallel.
+	 *
+	 * Must grab LWLocks in partition-number order to avoid LWLock deadlock.
+	 */
+	for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+		LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED);
+
+	/*
+	 * Now we can safely count the number of proclocks, and enlarge the array
+	 * once so that the copy loop below never needs to reallocate.
+	 */
+	data->nelements = el + hash_get_num_entries(LockMethodProcLockHash);
+	if (data->nelements > els)
+	{
+		els = data->nelements;
+		data->locks = (LockInstanceData *)
+			repalloc(data->locks, sizeof(LockInstanceData) * els);
+	}
+
+	/* Now scan the tables to copy the data */
+	hash_seq_init(&seqstat, LockMethodProcLockHash);
+
+	while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat)))
+	{
+		PGPROC	   *proc = proclock->tag.myProc;
+		LOCK	   *lock = proclock->tag.myLock;
+		LockInstanceData *instance = &data->locks[el];
+
+		memcpy(&instance->locktag, &lock->tag, sizeof(LOCKTAG));
+		instance->holdMask = proclock->holdMask;
+		if (proc->waitLock == proclock->tag.myLock)
+			instance->waitLockMode = proc->waitLockMode;
+		else
+			instance->waitLockMode = NoLock;
+		instance->backend = proc->backendId;
+		instance->lxid = proc->lxid;
+		instance->pid = proc->pid;
+		instance->leaderPid = proclock->groupLeader->pid;
+		instance->fastpath = false;
+		instance->waitStart = (TimestampTz) pg_atomic_read_u64(&proc->waitStart);
+
+		el++;
+	}
+
+	/*
+	 * And release locks.  We do this in reverse order for two reasons: (1)
+	 * Anyone else who needs more than one of the locks will be trying to lock
+	 * them in increasing order; we don't want to release the other process
+	 * until it can get all the locks it needs. (2) This avoids O(N^2)
+	 * behavior inside LWLockRelease.
+	 */
+	for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
+		LWLockRelease(LockHashPartitionLockByIndex(i));
+
+	Assert(el == data->nelements);
+
+	return data;
+}
+
+/*
+ * GetBlockerStatusData - Return a summary of the lock manager's state
+ * concerning locks that are blocking the specified PID or any member of
+ * the PID's lock group, for use in a user-level reporting function.
+ *
+ * For each PID within the lock group that is awaiting some heavyweight lock,
+ * the return data includes an array of LockInstanceData objects, which are
+ * the same data structure used by GetLockStatusData; but unlike that function,
+ * this one reports only the PROCLOCKs associated with the lock that that PID
+ * is blocked on.  (Hence, all the locktags should be the same for any one
+ * blocked PID.)  In addition, we return an array of the PIDs of those backends
+ * that are ahead of the blocked PID in the lock's wait queue.  These can be
+ * compared with the PIDs in the LockInstanceData objects to determine which
+ * waiters are ahead of or behind the blocked PID in the queue.
+ *
+ * If blocked_pid isn't a valid backend PID or nothing in its lock group is
+ * waiting on any heavyweight lock, return empty arrays.
+ *
+ * The design goal is to hold the LWLocks for as short a time as possible;
+ * thus, this function simply makes a copy of the necessary data and releases
+ * the locks, allowing the caller to contemplate and format the data for as
+ * long as it pleases.
+ */
+BlockedProcsData *
+GetBlockerStatusData(int blocked_pid)
+{
+	BlockedProcsData *data;
+	PGPROC	   *proc;
+	int			partno;
+
+	/*
+	 * Preallocate the result arrays at MaxBackends entries apiece, so that
+	 * we normally need no repalloc while the LWLocks are held.  procs[] is
+	 * assumed (and Assert-checked below) never to need more than that; the
+	 * locks[] and waiter_pids[] arrays are enlarged on demand.
+	 */
+	data = (BlockedProcsData *) palloc(sizeof(BlockedProcsData));
+	data->nprocs = 0;
+	data->nlocks = 0;
+	data->npids = 0;
+	data->maxprocs = MaxBackends;
+	data->maxlocks = MaxBackends;
+	data->maxpids = MaxBackends;
+	data->procs = (BlockedProcData *) palloc(sizeof(BlockedProcData) * data->maxprocs);
+	data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * data->maxlocks);
+	data->waiter_pids = (int *) palloc(sizeof(int) * data->maxpids);
+
+	/*
+	 * ProcArrayLock keeps the ProcArray entry for blocked_pid from vanishing
+	 * while we look at it.  Examining the lock grouping fields of other
+	 * backends additionally requires all the hash partition locks, since we
+	 * can't know ahead of time which one is relevant for the lock group.
+	 * Holding them all is annoying, but no worse than GetLockStatusData(),
+	 * and it guarantees a self-consistent instantaneous state.
+	 */
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	proc = BackendPidGetProcWithLock(blocked_pid);
+	if (proc == NULL)
+	{
+		/* No such backend (any more): hand back the empty arrays */
+		LWLockRelease(ProcArrayLock);
+		return data;
+	}
+
+	/*
+	 * Lock the entire shared lock table; see notes in GetLockStatusData().
+	 */
+	for (partno = 0; partno < NUM_LOCK_PARTITIONS; partno++)
+		LWLockAcquire(LockHashPartitionLockByIndex(partno), LW_SHARED);
+
+	if (proc->lockGroupLeader != NULL)
+	{
+		/* Visit every member of proc's lock group */
+		dlist_iter	member_iter;
+
+		dlist_foreach(member_iter, &proc->lockGroupLeader->lockGroupMembers)
+		{
+			PGPROC	   *member;
+
+			member = dlist_container(PGPROC, lockGroupLink, member_iter.cur);
+			GetSingleProcBlockerStatusData(member, data);
+		}
+	}
+	else
+	{
+		/* proc is not a lock group member, so it is the only candidate */
+		GetSingleProcBlockerStatusData(proc, data);
+	}
+
+	/* Release in reverse order; see notes in GetLockStatusData() */
+	partno = NUM_LOCK_PARTITIONS;
+	while (--partno >= 0)
+		LWLockRelease(LockHashPartitionLockByIndex(partno));
+
+	Assert(data->nprocs <= data->maxprocs);
+
+	LWLockRelease(ProcArrayLock);
+
+	return data;
+}
+
+/*
+ * Accumulate data about one possibly-blocked proc for GetBlockerStatusData:
+ * a procs[] entry, the awaited lock's PROCLOCKs, and the PIDs queued ahead.
+ */
+static void
+GetSingleProcBlockerStatusData(PGPROC *blocked_proc, BlockedProcsData *data)
+{
+	LOCK	   *theLock = blocked_proc->waitLock;
+	BlockedProcData *bproc;
+	dlist_iter	proclock_iter;
+	dlist_iter	proc_iter;
+	dclist_head *waitQueue;
+	int			queue_size;
+
+	/* Nothing to do if this proc is not blocked */
+	if (theLock == NULL)
+		return;
+
+	/* Set up a procs[] element */
+	bproc = &data->procs[data->nprocs++];
+	bproc->pid = blocked_proc->pid;
+	bproc->first_lock = data->nlocks;
+	bproc->first_waiter = data->npids;
+
+	/*
+	 * Collect all PROCLOCKs associated with theLock.  The proc's fast-path
+	 * arrays can be ignored: nothing in them could relate to a contended lock.
+	 */
+	dlist_foreach(proclock_iter, &theLock->procLocks)
+	{
+		PROCLOCK   *proclock =
+			dlist_container(PROCLOCK, lockLink, proclock_iter.cur);
+		PGPROC	   *proc = proclock->tag.myProc;
+		LOCK	   *lock = proclock->tag.myLock;
+		LockInstanceData *instance;
+
+		if (data->nlocks >= data->maxlocks)
+		{
+			data->maxlocks += MaxBackends;
+			data->locks = (LockInstanceData *)
+				repalloc(data->locks, sizeof(LockInstanceData) * data->maxlocks);
+		}
+
+		instance = &data->locks[data->nlocks];
+		memcpy(&instance->locktag, &lock->tag, sizeof(LOCKTAG));
+		instance->holdMask = proclock->holdMask;
+		if (proc->waitLock == lock)
+			instance->waitLockMode = proc->waitLockMode;
+		else
+			instance->waitLockMode = NoLock;
+		instance->backend = proc->backendId;
+		instance->lxid = proc->lxid;
+		instance->pid = proc->pid;
+		instance->leaderPid = proclock->groupLeader->pid;
+		instance->fastpath = false;
+		data->nlocks++;
+	}
+
+	/* Enlarge waiter_pids[] if it's too small to hold all wait queue PIDs */
+	waitQueue = &(theLock->waitProcs);
+	queue_size = dclist_count(waitQueue);
+
+	if (queue_size > data->maxpids - data->npids)
+	{
+		data->maxpids = Max(data->maxpids + MaxBackends,
+							data->npids + queue_size);
+		data->waiter_pids = (int *) repalloc(data->waiter_pids,
+											 sizeof(int) * data->maxpids);
+	}
+
+	/* Collect PIDs from the lock's wait queue, stopping at blocked_proc */
+	dclist_foreach(proc_iter, waitQueue)
+	{
+		PGPROC	   *queued_proc = dlist_container(PGPROC, links, proc_iter.cur);
+
+		if (queued_proc == blocked_proc)
+			break;
+		data->waiter_pids[data->npids++] = queued_proc->pid;
+	}
+
+	bproc->num_locks = data->nlocks - bproc->first_lock;
+	bproc->num_waiters = data->npids - bproc->first_waiter;
+}
+
+/*
+ * Returns a list of currently held AccessExclusiveLocks, for use by
+ * LogStandbySnapshot().  The result is a palloc'd array,
+ * with the number of elements returned into *nlocks.
+ *
+ * XXX This currently takes a lock on all partitions of the lock table,
+ * but it's possible to do better.  By reference counting locks and storing
+ * the value in the ProcArray entry for each backend we could tell if any
+ * locks need recording without having to acquire the partition locks and
+ * scan the lock table.  Whether that's worth the additional overhead
+ * is pretty dubious though.
+ */
+xl_standby_lock *
+GetRunningTransactionLocks(int *nlocks)
+{
+	xl_standby_lock *result;
+	PROCLOCK   *proclock;
+	HASH_SEQ_STATUS scan;
+	int			partno;
+	int			nfound = 0;
+	int			maxlocks;
+
+	/*
+	 * Take every partition lock of the shared lock table, in increasing
+	 * partition-number order so that we cannot deadlock on the LWLocks.
+	 */
+	for (partno = 0; partno < NUM_LOCK_PARTITIONS; partno++)
+		LWLockAcquire(LockHashPartitionLockByIndex(partno), LW_SHARED);
+
+	/* With the whole table locked, the proclocks can safely be counted */
+	maxlocks = hash_get_num_entries(LockMethodProcLockHash);
+
+	/*
+	 * Sizing the result for every proclock in the table is overkill, but it
+	 * is more convenient and faster than having to enlarge the array.
+	 */
+	result = palloc(maxlocks * sizeof(xl_standby_lock));
+
+	/*
+	 * Scan the proclock table and copy out the interesting entries.
+	 *
+	 * A currently granted AccessExclusiveLock has just one proclock holder,
+	 * so no lock is visited twice in this particular case.  Don't copy this
+	 * code for use elsewhere: in the general case it will give you duplicate
+	 * locks when looking at non-exclusive lock types.
+	 */
+	hash_seq_init(&scan, LockMethodProcLockHash);
+	while ((proclock = (PROCLOCK *) hash_seq_search(&scan)))
+	{
+		LOCK	   *lock = proclock->tag.myLock;
+		TransactionId xid;
+
+		/* make sure this definition matches the one used in LockAcquire */
+		if (!(proclock->holdMask & LOCKBIT_ON(AccessExclusiveLock)))
+			continue;
+		if (lock->tag.locktag_type != LOCKTAG_RELATION)
+			continue;
+
+		/*
+		 * Don't record locks of transactions without a valid xid: those are
+		 * known to have issued their commit WAL record already, but not yet
+		 * released the lock.  We may still see locks held by already
+		 * complete transactions, if they haven't yet zeroed their xids.
+		 */
+		xid = proclock->tag.myProc->xid;
+		if (!TransactionIdIsValid(xid))
+			continue;
+
+		/* Append the lock's owner xid, database OID and relation OID */
+		result[nfound].xid = xid;
+		result[nfound].dbOid = lock->tag.locktag_field1;
+		result[nfound].relOid = lock->tag.locktag_field2;
+		nfound++;
+	}
+
+	Assert(nfound <= maxlocks);
+
+	/*
+	 * Release the partition locks in reverse order, for two reasons: (1)
+	 * anyone else who needs more than one of them will be acquiring them in
+	 * increasing order, and we don't want to release such a process until
+	 * it can get all the locks it needs; (2) this avoids O(N^2) behavior
+	 * inside LWLockRelease.
+	 */
+	partno = NUM_LOCK_PARTITIONS;
+	while (--partno >= 0)
+		LWLockRelease(LockHashPartitionLockByIndex(partno));
+
+	/* Tell the caller how many array entries are filled in */
+	*nlocks = nfound;
+	return result;
+}
+
+/* Return the textual name of lock mode "mode" within the given lock method */
+const char *
+GetLockmodeName(LOCKMETHODID lockmethodid, LOCKMODE mode)
+{
+	Assert(lockmethodid > 0 && lockmethodid < lengthof(LockMethods));
+	Assert(mode > 0 && mode <= LockMethods[lockmethodid]->numLockModes);
+	return LockMethods[lockmethodid]->lockModeNames[mode];
+}
+
+#ifdef LOCK_DEBUG
+/*
+ * Dump all locks in the given proc's myProcLocks lists.
+ *
+ * Caller is responsible for having acquired appropriate LWLocks.
+ */
+void
+DumpLocks(PGPROC *proc)
+{
+	int			partition;
+
+	if (proc == NULL)
+		return;
+
+	if (proc->waitLock != NULL)
+		LOCK_PRINT("DumpLocks: waiting on", proc->waitLock, 0);
+
+	/* Walk the proc's per-partition PROCLOCK lists, printing each entry */
+	for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++)
+	{
+		dlist_iter	cursor;
+
+		dlist_foreach(cursor, &proc->myProcLocks[partition])
+		{
+			PROCLOCK   *proclock;
+
+			proclock = dlist_container(PROCLOCK, procLink, cursor.cur);
+			Assert(proclock->tag.myProc == proc);
+			PROCLOCK_PRINT("DumpLocks", proclock);
+			LOCK_PRINT("DumpLocks", proclock->tag.myLock, 0);
+		}
+	}
+}
+
+/*
+ * Dump all lmgr locks.
+ *
+ * Caller is responsible for having acquired appropriate LWLocks.
+ */
+void
+DumpAllLocks(void)
+{
+	PGPROC	   *self = MyProc;
+	PROCLOCK   *entry;
+	HASH_SEQ_STATUS scan;
+
+	/* First report what this backend itself is waiting on, if anything */
+	if (self != NULL && self->waitLock != NULL)
+		LOCK_PRINT("DumpAllLocks: waiting on", self->waitLock, 0);
+
+	/* Then print every PROCLOCK in the shared table, with its LOCK */
+	hash_seq_init(&scan, LockMethodProcLockHash);
+	for (;;)
+	{
+		entry = (PROCLOCK *) hash_seq_search(&scan);
+		if (entry == NULL)
+			break;
+
+		PROCLOCK_PRINT("DumpAllLocks", entry);
+		if (entry->tag.myLock == NULL)
+			elog(LOG, "DumpAllLocks: proclock->tag.myLock = NULL");
+		else
+			LOCK_PRINT("DumpAllLocks", entry->tag.myLock, 0);
+	}
+}
+#endif							/* LOCK_DEBUG */
+
+/*
+ * LOCK 2PC resource manager's routines
+ */
+
+/*
+ * Re-acquire a lock belonging to a transaction that was prepared.
+ *
+ * Because this function is run at db startup, re-acquiring the locks should
+ * never conflict with running transactions because there are none.  We
+ * assume that the lock state represented by the stored 2PC files is legal.
+ *
+ * When switching from Hot Standby mode to normal operation, the locks will
+ * be already held by the startup process. The locks are acquired for the new
+ * procs without checking for conflicts, so we don't get a conflict between the
+ * startup process and the dummy procs, even though we will momentarily have
+ * a situation where two procs are holding the same AccessExclusiveLock,
+ * which isn't normally possible because of the conflict. If we're in standby
+ * mode, but a recovery snapshot hasn't been established yet, it's possible
+ * that some but not all of the locks are already held by the startup process.
+ *
+ * This approach is simple, but also a bit dangerous, because if there isn't
+ * enough shared memory to acquire the locks, an error will be thrown, which
+ * is promoted to FATAL and recovery will abort, bringing down postmaster.
+ * A safer approach would be to transfer the locks like we do in
+ * AtPrepare_Locks, but then again, in hot standby mode it's possible for
+ * read-only backends to use up all the shared lock memory anyway, so that
+ * replaying the WAL record that needs to acquire a lock will throw an error
+ * and PANIC anyway.
+ */
+void
+lock_twophase_recover(TransactionId xid, uint16 info,
+					  void *recdata, uint32 len)
+{
+	TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
+	PGPROC	   *proc = TwoPhaseGetDummyProc(xid, false);
+	LOCKTAG    *locktag;
+	LOCKMODE	lockmode;
+	LOCKMETHODID lockmethodid;
+	LOCK	   *lock;
+	PROCLOCK   *proclock;
+	PROCLOCKTAG proclocktag;
+	bool		found;
+	uint32		hashcode;		/* hash value of locktag */
+	uint32		proclock_hashcode;	/* hash value of proclocktag */
+	int			partition;		/* lock partition for hashcode */
+	LWLock	   *partitionLock;	/* LWLock for that partition */
+	LockMethod	lockMethodTable;
+
+	Assert(len == sizeof(TwoPhaseLockRecord));
+	locktag = &rec->locktag;
+	lockmode = rec->lockmode;
+	lockmethodid = locktag->locktag_lockmethodid;
+
+	if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+		elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+	lockMethodTable = LockMethods[lockmethodid];
+
+	hashcode = LockTagHashCode(locktag);
+	partition = LockHashPartition(hashcode);
+	partitionLock = LockHashPartitionLock(hashcode);
+
+	LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+	/*
+	 * Find or create the shared LOCK object with this tag.
+	 */
+	lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+												locktag,
+												hashcode,
+												HASH_ENTER_NULL,
+												&found);
+	if (!lock)
+	{
+		LWLockRelease(partitionLock);
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of shared memory"),
+				 errhint("You might need to increase %s.", "max_locks_per_transaction")));
+	}
+
+	/*
+	 * If it's a new lock object, initialize it with no holders or waiters.
+	 */
+	if (!found)
+	{
+		lock->grantMask = 0;
+		lock->waitMask = 0;
+		dlist_init(&lock->procLocks);
+		dclist_init(&lock->waitProcs);
+		lock->nRequested = 0;
+		lock->nGranted = 0;
+		MemSet(lock->requested, 0, sizeof(int) * MAX_LOCKMODES);
+		MemSet(lock->granted, 0, sizeof(int) * MAX_LOCKMODES);
+		LOCK_PRINT("lock_twophase_recover: new", lock, lockmode);
+	}
+	else
+	{
+		LOCK_PRINT("lock_twophase_recover: found", lock, lockmode);
+		Assert((lock->nRequested >= 0) && (lock->requested[lockmode] >= 0));
+		Assert((lock->nGranted >= 0) && (lock->granted[lockmode] >= 0));
+		Assert(lock->nGranted <= lock->nRequested);
+	}
+
+	/*
+	 * Create the hash key for the proclock table.
+	 */
+	proclocktag.myLock = lock;
+	proclocktag.myProc = proc;
+
+	proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode);
+
+	/*
+	 * Find or create a proclock entry with this tag.
+	 */
+	proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash,
+														&proclocktag,
+														proclock_hashcode,
+														HASH_ENTER_NULL,
+														&found);
+	if (!proclock)
+	{
+		/* Oops, not enough shmem for the proclock */
+		if (lock->nRequested == 0)
+		{
+			/*
+			 * There are no other requestors of this lock, so garbage-collect
+			 * the lock object.  We *must* do this to avoid a permanent leak
+			 * of shared memory, because there won't be anything to cause
+			 * anyone to release the lock object later.
+			 */
+			Assert(dlist_is_empty(&lock->procLocks));
+			if (!hash_search_with_hash_value(LockMethodLockHash,
+											 &(lock->tag),
+											 hashcode,
+											 HASH_REMOVE,
+											 NULL))
+				elog(PANIC, "lock table corrupted");
+		}
+		LWLockRelease(partitionLock);
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of shared memory"),
+				 errhint("You might need to increase %s.", "max_locks_per_transaction")));
+	}
+
+	/*
+	 * If new, initialize the entry and add it to the lock's and proc's lists.
+	 */
+	if (!found)
+	{
+		Assert(proc->lockGroupLeader == NULL);
+		proclock->groupLeader = proc;
+		proclock->holdMask = 0;
+		proclock->releaseMask = 0;
+		/* Add proclock to appropriate lists */
+		dlist_push_tail(&lock->procLocks, &proclock->lockLink);
+		dlist_push_tail(&proc->myProcLocks[partition],
+						&proclock->procLink);
+		PROCLOCK_PRINT("lock_twophase_recover: new", proclock);
+	}
+	else
+	{
+		PROCLOCK_PRINT("lock_twophase_recover: found", proclock);
+		Assert((proclock->holdMask & ~lock->grantMask) == 0);
+	}
+
+	/*
+	 * lock->nRequested and lock->requested[] count the total number of
+	 * requests, whether granted or waiting, so increment those immediately.
+	 */
+	lock->nRequested++;
+	lock->requested[lockmode]++;
+	Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0));
+
+	/*
+	 * We shouldn't already hold the desired lock; complain if holdMask says so.
+	 */
+	if (proclock->holdMask & LOCKBIT_ON(lockmode))
+		elog(ERROR, "lock %s on object %u/%u/%u is already held",
+			 lockMethodTable->lockModeNames[lockmode],
+			 lock->tag.locktag_field1, lock->tag.locktag_field2,
+			 lock->tag.locktag_field3);
+
+	/*
+	 * We ignore any possible conflicts and just grant ourselves the lock. Not
+	 * only because we don't bother, but also to avoid deadlocks when
+	 * switching from standby to normal mode. See function comment.
+	 */
+	GrantLock(lock, proclock, lockmode);
+
+	/*
+	 * Bump strong lock count, to make sure any fast-path lock requests won't
+	 * be granted without consulting the primary lock table.
+	 */
+	if (ConflictsWithRelationFastPath(&lock->tag, lockmode))
+	{
+		uint32		fasthashcode = FastPathStrongLockHashPartition(hashcode);
+
+		SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+		FastPathStrongRelationLocks->count[fasthashcode]++;
+		SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+	}
+
+	LWLockRelease(partitionLock);
+}
+
+/*
+ * Re-acquire a lock belonging to a transaction that was prepared, when
+ * starting up into hot standby mode.
+ */
+void
+lock_twophase_standby_recover(TransactionId xid, uint16 info,
+							  void *recdata, uint32 len)
+{
+	TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
+	LOCKTAG    *tag;
+	LOCKMETHODID methodid;
+
+	Assert(len == sizeof(TwoPhaseLockRecord));
+	tag = &rec->locktag;
+	methodid = tag->locktag_lockmethodid;
+
+	if (methodid <= 0 || methodid >= lengthof(LockMethods))
+		elog(ERROR, "unrecognized lock method: %d", methodid);
+
+	/* Anything but an AccessExclusiveLock on a relation is ignored here */
+	if (rec->lockmode != AccessExclusiveLock)
+		return;
+	if (tag->locktag_type != LOCKTAG_RELATION)
+		return;
+
+	StandbyAcquireAccessExclusiveLock(xid,
+									  tag->locktag_field1,	/* dboid */
+									  tag->locktag_field2);	/* reloid */
+}
+
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * Find and release the lock indicated by the 2PC record.
+ */
+void
+lock_twophase_postcommit(TransactionId xid, uint16 info,
+						 void *recdata, uint32 len)
+{
+	TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
+	PGPROC	   *dummyProc = TwoPhaseGetDummyProc(xid, true);
+	LOCKTAG    *tag;
+	LOCKMETHODID methodid;
+
+	Assert(len == sizeof(TwoPhaseLockRecord));
+	tag = &rec->locktag;
+	methodid = tag->locktag_lockmethodid;
+
+	/* Refuse a record that names a lock method we don't know */
+	if (methodid <= 0 || methodid >= lengthof(LockMethods))
+		elog(ERROR, "unrecognized lock method: %d", methodid);
+
+	LockRefindAndRelease(LockMethods[methodid], dummyProc,
+						 tag, rec->lockmode, true);
+}
+
+/*
+ * 2PC processing routine for ROLLBACK PREPARED case.  The lock is released
+ * exactly as in the COMMIT PREPARED case, so all arguments are forwarded
+ * unchanged to lock_twophase_postcommit().
+ */
+void
+lock_twophase_postabort(TransactionId xid, uint16 info,
+						void *recdata, uint32 len)
+{
+	lock_twophase_postcommit(xid, info, recdata, len);
+}
+
+/*
+ *		VirtualXactLockTableInsert
+ *
+ *		Take vxid lock via the fast-path.  There can't be any pre-existing
+ *		lockers, as we haven't advertised this vxid via the ProcArray yet.
+ *
+ *		Since MyProc->fpLocalTransactionId will normally contain the same data
+ *		as MyProc->lxid, you might wonder if we really need both.  The
+ *		difference is that MyProc->lxid is set and cleared unlocked, and
+ *		examined by procarray.c, while fpLocalTransactionId is protected by
+ *		fpInfoLock and is used only by the locking subsystem.  Doing it this
+ *		way makes it easier to verify that there are no funny race conditions.
+ *
+ *		We don't bother recording this lock in the local lock table, since it's
+ *		only ever released at the end of a transaction.  Instead,
+ *		LockReleaseAll() calls VirtualXactLockTableCleanup().
+ */
+void
+VirtualXactLockTableInsert(VirtualTransactionId vxid)
+{
+	PGPROC	   *self = MyProc;
+
+	Assert(VirtualTransactionIdIsValid(vxid));
+
+	LWLockAcquire(&self->fpInfoLock, LW_EXCLUSIVE);
+	Assert(self->backendId == vxid.backendId);
+	Assert(self->fpLocalTransactionId == InvalidLocalTransactionId);
+	Assert(!self->fpVXIDLock);
+	/* Record the fast-path VXID lock in our own PGPROC */
+	self->fpVXIDLock = true;
+	self->fpLocalTransactionId = vxid.localTransactionId;
+	LWLockRelease(&self->fpInfoLock);
+}
+
+/*
+ *		VirtualXactLockTableCleanup
+ *
+ *		Check whether a VXID lock has been materialized; if so, release it,
+ *		unblocking waiters.
+ */
+void
+VirtualXactLockTableCleanup(void)
+{
+	bool		wasFastPath;
+	LocalTransactionId oldLxid;
+	VirtualTransactionId vxid;
+	LOCKTAG		vxidTag;
+
+	Assert(MyProc->backendId != InvalidBackendId);
+
+	/*
+	 * Remember the fast-path VXID state, then reset it in shared memory.
+	 */
+	LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+
+	wasFastPath = MyProc->fpVXIDLock;
+	oldLxid = MyProc->fpLocalTransactionId;
+	MyProc->fpVXIDLock = false;
+	MyProc->fpLocalTransactionId = InvalidLocalTransactionId;
+
+	LWLockRelease(&MyProc->fpInfoLock);
+
+	/*
+	 * Only when fpVXIDLock had been cleared while fpLocalTransactionId was
+	 * left set has someone transferred the lock to the main lock table; in
+	 * every other case there is nothing further to release.
+	 */
+	if (wasFastPath || !LocalTransactionIdIsValid(oldLxid))
+		return;
+
+	vxid.backendId = MyBackendId;
+	vxid.localTransactionId = oldLxid;
+	SET_LOCKTAG_VIRTUALTRANSACTION(vxidTag, vxid);
+
+	LockRefindAndRelease(LockMethods[DEFAULT_LOCKMETHOD], MyProc,
+						 &vxidTag, ExclusiveLock, false);
+}
+
+/*
+ *		XactLockForVirtualXact
+ *
+ * If TransactionIdIsValid(xid), this is essentially XactLockTableWait(xid,
+ * NULL, NULL, XLTW_None) or ConditionalXactLockTableWait(xid).  Unlike those
+ * functions, it assumes "xid" is never a subtransaction and that "xid" is
+ * prepared, committed, or aborted.
+ *
+ * If !TransactionIdIsValid(xid), this locks every prepared XID having been
+ * known as "vxid" before its PREPARE TRANSACTION.
+ */
+static bool
+XactLockForVirtualXact(VirtualTransactionId vxid,
+					   TransactionId xid, bool wait)
+{
+	bool		more = false;
+
+	/* There is no point to wait for 2PCs if you have no 2PCs. */
+	if (max_prepared_xacts == 0)
+		return true;
+
+	for (;;)
+	{
+		LOCKTAG		xidTag;
+
+		/* Without an xid in hand, ask the 2PC code for a matching one. */
+		if (!TransactionIdIsValid(xid))
+		{
+			xid = TwoPhaseGetXidByVirtualXID(vxid, &more);
+			if (!TransactionIdIsValid(xid))
+			{
+				Assert(!more);
+				return true;
+			}
+		}
+
+		/* Check or wait for XID completion. */
+		SET_LOCKTAG_TRANSACTION(xidTag, xid);
+		if (LockAcquire(&xidTag, ShareLock, false, !wait) ==
+			LOCKACQUIRE_NOT_AVAIL)
+			return false;
+		LockRelease(&xidTag, ShareLock, false);
+
+		/* Finished unless the lookup reported further matching XIDs. */
+		if (!more)
+			return true;
+
+		/* Reset the state so that the next pass looks up another XID. */
+		xid = InvalidTransactionId;
+		more = false;
+	}
+}
+
+/*
+ *		VirtualXactLock
+ *
+ * If wait = true, wait as long as the given VXID or any XID acquired by the
+ * same transaction is still running.  Then, return true.
+ *
+ * If wait = false, just check whether that VXID or one of those XIDs is still
+ * running, and return true or false.
+ */
+bool
+VirtualXactLock(VirtualTransactionId vxid, bool wait)
+{
+	LOCKTAG		tag;
+	PGPROC	   *proc;
+	TransactionId xid = InvalidTransactionId;
+
+	Assert(VirtualTransactionIdIsValid(vxid));
+
+	if (VirtualTransactionIdIsRecoveredPreparedXact(vxid))
+		/* no vxid lock; localTransactionId is a normal, locked XID */
+		return XactLockForVirtualXact(vxid, vxid.localTransactionId, wait);
+
+	SET_LOCKTAG_VIRTUALTRANSACTION(tag, vxid);
+
+	/*
+	 * If a lock table entry must be made, this is the PGPROC on whose behalf
+	 * it must be done.  Note that the transaction might end or the PGPROC
+	 * might be reassigned to a new backend before we get around to examining
+	 * it, but it doesn't matter.  If we find upon examination that the
+	 * relevant lxid is no longer running here, that's enough to prove that
+	 * it's no longer running anywhere.
+	 */
+	proc = BackendIdGetProc(vxid.backendId);
+	if (proc == NULL)
+		return XactLockForVirtualXact(vxid, InvalidTransactionId, wait);
+
+	/*
+	 * We must acquire this lock before checking the backendId and lxid
+	 * against the ones we're waiting for.  The target backend will only set
+	 * or clear lxid while holding this lock.
+	 */
+	LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE);
+
+	if (proc->backendId != vxid.backendId
+		|| proc->fpLocalTransactionId != vxid.localTransactionId)
+	{
+		/* proc is not (or no longer) running this VXID, so the VXID ended */
+		LWLockRelease(&proc->fpInfoLock);
+		return XactLockForVirtualXact(vxid, InvalidTransactionId, wait);
+	}
+
+	/*
+	 * If we aren't asked to wait, there's no need to set up a lock table
+	 * entry.  The transaction is still in progress, so just return false.
+	 */
+	if (!wait)
+	{
+		LWLockRelease(&proc->fpInfoLock);
+		return false;
+	}
+
+	/*
+	 * OK, we're going to need to sleep on the VXID.  But first, we must set
+	 * up the primary lock table entry, if needed (ie, convert the proc's
+	 * fast-path lock on its VXID to a regular lock).
+	 */
+	if (proc->fpVXIDLock)
+	{
+		PROCLOCK   *proclock;
+		uint32		hashcode;
+		LWLock	   *partitionLock;
+
+		hashcode = LockTagHashCode(&tag);
+
+		partitionLock = LockHashPartitionLock(hashcode);
+		LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+		proclock = SetupLockInTable(LockMethods[DEFAULT_LOCKMETHOD], proc,
+									&tag, hashcode, ExclusiveLock);
+		if (!proclock)
+		{
+			LWLockRelease(partitionLock);
+			LWLockRelease(&proc->fpInfoLock);
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of shared memory"),
+					 errhint("You might need to increase %s.", "max_locks_per_transaction")));
+		}
+		GrantLock(proclock->tag.myLock, proclock, ExclusiveLock);
+
+		LWLockRelease(partitionLock);
+
+		proc->fpVXIDLock = false;
+	}
+
+	/*
+	 * If the proc has an XID now, we'll avoid a TwoPhaseGetXidByVirtualXID()
+	 * search.  The proc might have assigned this XID but not yet locked it,
+	 * in which case the proc will lock this XID before releasing the VXID.
+	 * The fpInfoLock critical section excludes VirtualXactLockTableCleanup(),
+	 * so we won't save an XID of a different VXID.  It doesn't matter whether
+	 * we save this before or after setting up the primary lock table entry.
+	 */
+	xid = proc->xid;
+
+	/* Done with the proc fields that fpInfoLock protects */
+	LWLockRelease(&proc->fpInfoLock);
+
+	/* Time to wait: sleep on the VXID lock, then drop it again right away. */
+	(void) LockAcquire(&tag, ShareLock, false, false);
+
+	LockRelease(&tag, ShareLock, false);
+	return XactLockForVirtualXact(vxid, xid, wait);
+}
+
+/*
+ * LockWaiterCount
+ *
+ * Find the number of lock requesters on this locktag
+ */
+int
+LockWaiterCount(const LOCKTAG *locktag)
+{
+	LOCKMETHODID methodid = locktag->locktag_lockmethodid;
+	uint32		taghash;
+	LWLock	   *partLock;
+	LOCK	   *lock;
+	bool		found;
+	int			nrequested;
+
+	if (methodid <= 0 || methodid >= lengthof(LockMethods))
+		elog(ERROR, "unrecognized lock method: %d", methodid);
+
+	/* Look the lock up while holding its hash partition's LWLock */
+	taghash = LockTagHashCode(locktag);
+	partLock = LockHashPartitionLock(taghash);
+	LWLockAcquire(partLock, LW_EXCLUSIVE);
+
+	lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+												locktag,
+												taghash,
+												HASH_FIND,
+												&found);
+	Assert(!found || lock != NULL);
+
+	/* A lock that is not in the table has no requesters at all */
+	nrequested = found ? lock->nRequested : 0;
+	LWLockRelease(partLock);
+
+	return nrequested;
+}
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
new file mode 100644
index 0000000..01d738f
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -0,0 +1,1973 @@
+/*-------------------------------------------------------------------------
+ *
+ * lwlock.c
+ *	  Lightweight lock manager
+ *
+ * Lightweight locks are intended primarily to provide mutual exclusion of
+ * access to shared-memory data structures.  Therefore, they offer both
+ * exclusive and shared lock modes (to support read/write and read-only
+ * access to a shared object).  There are few other frammishes.  User-level
+ * locking should be done with the full lock manager --- which depends on
+ * LWLocks to protect its shared state.
+ *
+ * In addition to exclusive and shared modes, lightweight locks can be used to
+ * wait until a variable changes value.  The variable is initially not set
+ * when the lock is acquired with LWLockAcquire, i.e. it remains set to the
+ * value it was set to when the lock was released last, and can be updated
+ * without releasing the lock by calling LWLockUpdateVar.  LWLockWaitForVar
+ * waits for the variable to be updated, or until the lock is free.  When
+ * releasing the lock with LWLockReleaseClearVar() the value can be set to an
+ * appropriate value for a free lock.  The meaning of the variable is up to
+ * the caller, the lightweight lock code just assigns and compares it.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/lmgr/lwlock.c
+ *
+ * NOTES:
+ *
+ * This used to be a pretty straight forward reader-writer lock
+ * implementation, in which the internal state was protected by a
+ * spinlock. Unfortunately the overhead of taking the spinlock proved to be
+ * too high for workloads/locks that were taken in shared mode very
+ * frequently. Often we were spinning in the (obviously exclusive) spinlock,
+ * while trying to acquire a shared lock that was actually free.
+ *
+ * Thus a new implementation was devised that provides wait-free shared lock
+ * acquisition for locks that aren't exclusively locked.
+ *
+ * The basic idea is to have a single atomic variable 'lockcount' instead of
+ * the formerly separate shared and exclusive counters and to use atomic
+ * operations to acquire the lock. That's fairly easy to do for plain
+ * rw-spinlocks, but a lot harder for something like LWLocks that want to wait
+ * in the OS.
+ *
+ * For lock acquisition we use an atomic compare-and-exchange on the lockcount
+ * variable. For exclusive lock we swap in a sentinel value
+ * (LW_VAL_EXCLUSIVE), for shared locks we count the number of holders.
+ *
+ * To release the lock we use an atomic decrement. If the new value is zero
+ * (we get that atomically), we know we can/have to release
+ * waiters.
+ *
+ * Obviously it is important that the sentinel value for exclusive locks
+ * doesn't conflict with the maximum number of possible share lockers -
+ * luckily MAX_BACKENDS makes that easily possible.
+ *
+ *
+ * The attentive reader might have noticed that naively doing the above has a
+ * glaring race condition: We try to lock using the atomic operations and
+ * notice that we have to wait. Unfortunately by the time we have finished
+ * queuing, the former locker very well might have already finished its
+ * work. That's problematic because we're now stuck waiting inside the OS.
+ *
+ * To mitigate those races we use a two phased attempt at locking:
+ *	 Phase 1: Try to do it atomically, if we succeed, nice
+ *	 Phase 2: Add ourselves to the waitqueue of the lock
+ *	 Phase 3: Try to grab the lock again, if we succeed, remove ourselves from
+ *			  the queue
+ *	 Phase 4: Sleep till wake-up, goto Phase 1
+ *
+ * This protects us against the problem from above as nobody can release too
+ *	  quick, before we're queued, since after Phase 2 we're already queued.
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "port/pg_bitutils.h"
+#include "postmaster/postmaster.h"
+#include "replication/slot.h"
+#include "storage/ipc.h"
+#include "storage/predicate.h"
+#include "storage/proc.h"
+#include "storage/proclist.h"
+#include "storage/spin.h"
+#include "utils/memutils.h"
+
+#ifdef LWLOCK_STATS
+#include "utils/hsearch.h"
+#endif
+
+
+/* We use the ShmemLock spinlock to protect LWLockCounter */
+extern slock_t *ShmemLock;
+
+#define LW_FLAG_HAS_WAITERS			((uint32) 1 << 30)	/* set by LWLockQueueSelf when a backend enqueues itself */
+#define LW_FLAG_RELEASE_OK			((uint32) 1 << 29)	/* set at initialization; LWLockWakeup sets or clears it depending on whom it woke */
+#define LW_FLAG_LOCKED				((uint32) 1 << 28)	/* wait-list lock bit, taken by LWLockWaitListLock */
+
+#define LW_VAL_EXCLUSIVE			((uint32) 1 << 24)	/* added to the state word by an exclusive acquisition */
+#define LW_VAL_SHARED				1	/* added to the state word once per shared acquisition */
+
+#define LW_LOCK_MASK				((uint32) ((1 << 25)-1))	/* bits 0-24: shared count plus the exclusive bit */
+/*
+ * Must be greater than MAX_BACKENDS - which is 2^23-1, so we're fine.
+ * NOTE(review): MAX_BACKENDS is defined outside this file; confirm that the
+ * 2^23-1 figure is still current.  The StaticAssertDecl below is what
+ * actually enforces the bound.
+ */
+#define LW_SHARED_MASK				((uint32) ((1 << 24)-1))	/* bits 0-23: shared holder count */
+
+StaticAssertDecl(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
+				 "MAX_BACKENDS too big for lwlock.c");
+
+/*
+ * There are three sorts of LWLock "tranches":
+ *
+ * 1. The individually-named locks defined in lwlocknames.h each have their
+ * own tranche.  The names of these tranches appear in IndividualLWLockNames[]
+ * in lwlocknames.c.
+ *
+ * 2. There are some predefined tranches for built-in groups of locks.
+ * These are listed in enum BuiltinTrancheIds in lwlock.h, and their names
+ * appear in BuiltinTrancheNames[] below.
+ *
+ * 3. Extensions can create new tranches, via either RequestNamedLWLockTranche
+ * or LWLockRegisterTranche.  The names of these that are known in the current
+ * process appear in LWLockTrancheNames[].
+ *
+ * All these names are user-visible as wait event names, so choose with care
+ * ... and do not forget to update the documentation's list of wait events.
+ */
+extern const char *const IndividualLWLockNames[];	/* in lwlocknames.c */
+
+static const char *const BuiltinTrancheNames[] = {	/* GetLWTrancheName indexes this by tranche ID minus NUM_INDIVIDUAL_LWLOCKS */
+	/* LWTRANCHE_XACT_BUFFER: */
+	"XactBuffer",
+	/* LWTRANCHE_COMMITTS_BUFFER: */
+	"CommitTsBuffer",
+	/* LWTRANCHE_SUBTRANS_BUFFER: */
+	"SubtransBuffer",
+	/* LWTRANCHE_MULTIXACTOFFSET_BUFFER: */
+	"MultiXactOffsetBuffer",
+	/* LWTRANCHE_MULTIXACTMEMBER_BUFFER: */
+	"MultiXactMemberBuffer",
+	/* LWTRANCHE_NOTIFY_BUFFER: */
+	"NotifyBuffer",
+	/* LWTRANCHE_SERIAL_BUFFER: */
+	"SerialBuffer",
+	/* LWTRANCHE_WAL_INSERT: */
+	"WALInsert",
+	/* LWTRANCHE_BUFFER_CONTENT: */
+	"BufferContent",
+	/* LWTRANCHE_REPLICATION_ORIGIN_STATE: */
+	"ReplicationOriginState",
+	/* LWTRANCHE_REPLICATION_SLOT_IO: */
+	"ReplicationSlotIO",
+	/* LWTRANCHE_LOCK_FASTPATH: */
+	"LockFastPath",
+	/* LWTRANCHE_BUFFER_MAPPING: */
+	"BufferMapping",
+	/* LWTRANCHE_LOCK_MANAGER: */
+	"LockManager",
+	/* LWTRANCHE_PREDICATE_LOCK_MANAGER: */
+	"PredicateLockManager",
+	/* LWTRANCHE_PARALLEL_HASH_JOIN: */
+	"ParallelHashJoin",
+	/* LWTRANCHE_PARALLEL_QUERY_DSA: */
+	"ParallelQueryDSA",
+	/* LWTRANCHE_PER_SESSION_DSA: */
+	"PerSessionDSA",
+	/* LWTRANCHE_PER_SESSION_RECORD_TYPE: */
+	"PerSessionRecordType",
+	/* LWTRANCHE_PER_SESSION_RECORD_TYPMOD: */
+	"PerSessionRecordTypmod",
+	/* LWTRANCHE_SHARED_TUPLESTORE: */
+	"SharedTupleStore",
+	/* LWTRANCHE_SHARED_TIDBITMAP: */
+	"SharedTidBitmap",
+	/* LWTRANCHE_PARALLEL_APPEND: */
+	"ParallelAppend",
+	/* LWTRANCHE_PER_XACT_PREDICATE_LIST: */
+	"PerXactPredicateList",
+	/* LWTRANCHE_PGSTATS_DSA: */
+	"PgStatsDSA",
+	/* LWTRANCHE_PGSTATS_HASH: */
+	"PgStatsHash",
+	/* LWTRANCHE_PGSTATS_DATA: */
+	"PgStatsData",
+	/* LWTRANCHE_LAUNCHER_DSA: */
+	"LogicalRepLauncherDSA",
+	/* LWTRANCHE_LAUNCHER_HASH: */
+	"LogicalRepLauncherHash",
+};
+
+StaticAssertDecl(lengthof(BuiltinTrancheNames) ==
+				 LWTRANCHE_FIRST_USER_DEFINED - NUM_INDIVIDUAL_LWLOCKS,
+				 "missing entries in BuiltinTrancheNames[]");
+
+/*
+ * This is indexed by tranche ID minus LWTRANCHE_FIRST_USER_DEFINED, and
+ * stores the names of all dynamically-created tranches known to the current
+ * process.  Any unused entries in the array will contain NULL.
+ */
+static const char **LWLockTrancheNames = NULL;
+static int	LWLockTrancheNamesAllocated = 0;
+
+/*
+ * This points to the main array of LWLocks in shared memory.  Backends inherit
+ * the pointer by fork from the postmaster (except in the EXEC_BACKEND case,
+ * where we have special measures to pass it down).
+ */
+LWLockPadded *MainLWLockArray = NULL;
+
+/*
+ * We use this structure to keep track of locked LWLocks for release
+ * during error recovery.  Normally, only a few will be held at once, but
+ * occasionally the number can be much higher; for example, the pg_buffercache
+ * extension locks all buffer partitions simultaneously.
+ */
+#define MAX_SIMUL_LWLOCKS	200
+
+/* struct representing the LWLocks we're holding; one element of held_lwlocks[] */
+typedef struct LWLockHandle
+{
+	LWLock	   *lock;			/* the lock this entry refers to */
+	LWLockMode	mode;			/* presumably the mode it was acquired in -- TODO confirm against LWLockAcquire */
+} LWLockHandle;
+
+static int	num_held_lwlocks = 0;
+static LWLockHandle held_lwlocks[MAX_SIMUL_LWLOCKS];
+
+/*
+ * struct representing the LWLock tranche request for named tranche.
+ * Entries are filled in by RequestNamedLWLockTranche().
+ */
+typedef struct NamedLWLockTrancheRequest
+{
+	char		tranche_name[NAMEDATALEN];	/* NUL-terminated name, copied with strlcpy */
+	int			num_lwlocks;	/* number of LWLocks requested for this tranche */
+} NamedLWLockTrancheRequest;
+
+static NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray = NULL;
+static int	NamedLWLockTrancheRequestsAllocated = 0;
+
+/*
+ * NamedLWLockTrancheRequests is both the valid length of the request array,
+ * and the length of the shared-memory NamedLWLockTrancheArray later on.
+ * This variable and NamedLWLockTrancheArray are non-static so that
+ * postmaster.c can copy them to child processes in EXEC_BACKEND builds.
+ */
+int			NamedLWLockTrancheRequests = 0;
+
+/* points to data in shared memory: */
+NamedLWLockTranche *NamedLWLockTrancheArray = NULL;
+
+static void InitializeLWLocks(void);
+static inline void LWLockReportWaitStart(LWLock *lock);
+static inline void LWLockReportWaitEnd(void);
+static const char *GetLWTrancheName(uint16 trancheId);
+
+#define T_NAME(lock) \
+	GetLWTrancheName((lock)->tranche)
+
+#ifdef LWLOCK_STATS
+typedef struct lwlock_stats_key
+{
+	int			tranche;		/* lock->tranche of the lock being counted */
+	void	   *instance;		/* address of the LWLock itself */
+}			lwlock_stats_key;
+
+typedef struct lwlock_stats
+{
+	lwlock_stats_key key;		/* hash key; must be first for HASH_BLOBS lookup by key */
+	int			sh_acquire_count;	/* printed as "shacq"; presumably shared acquisitions -- updated outside this view */
+	int			ex_acquire_count;	/* printed as "exacq"; presumably exclusive acquisitions -- updated outside this view */
+	int			block_count;	/* printed as "blk"; updated outside this view */
+	int			dequeue_self_count; /* bumped on each LWLockDequeueSelf() call */
+	int			spin_delay_count;	/* spin delays accumulated in LWLockWaitListLock() */
+}			lwlock_stats;
+
+static HTAB *lwlock_stats_htab;
+static lwlock_stats lwlock_stats_dummy;
+#endif
+
+#ifdef LOCK_DEBUG
+bool		Trace_lwlocks = false;
+
+/*
+ * When Trace_lwlocks is on, log a one-line decoding of the lock's state
+ * word, tagged with the caller-supplied location string.  'mode' is not
+ * referenced by the body.
+ */
+inline static void
+PRINT_LWDEBUG(const char *where, LWLock *lock, LWLockMode mode)
+{
+	uint32		snapshot;
+
+	if (!Trace_lwlocks)
+		return;
+
+	/* Read the state once so that every printed field is consistent. */
+	snapshot = pg_atomic_read_u32(&lock->state);
+
+	/* hide statement & context here, otherwise the log is just too verbose */
+	ereport(LOG,
+			(errhidestmt(true),
+			 errhidecontext(true),
+			 errmsg_internal("%d: %s(%s %p): excl %u shared %u haswaiters %u waiters %u rOK %d",
+							 MyProcPid,
+							 where, T_NAME(lock), lock,
+							 (snapshot & LW_VAL_EXCLUSIVE) != 0,
+							 snapshot & LW_SHARED_MASK,
+							 (snapshot & LW_FLAG_HAS_WAITERS) != 0,
+							 pg_atomic_read_u32(&lock->nwaiters),
+							 (snapshot & LW_FLAG_RELEASE_OK) != 0)));
+}
+
+/*
+ * When Trace_lwlocks is on, log a free-form message about the given lock,
+ * tagged with the caller-supplied location string.
+ */
+inline static void
+LOG_LWDEBUG(const char *where, LWLock *lock, const char *msg)
+{
+	if (!Trace_lwlocks)
+		return;
+
+	/* hide statement & context here, otherwise the log is just too verbose */
+	ereport(LOG,
+			(errhidestmt(true),
+			 errhidecontext(true),
+			 errmsg_internal("%s(%s %p): %s", where,
+							 T_NAME(lock), lock, msg)));
+}
+
+#else							/* not LOCK_DEBUG */
+#define PRINT_LWDEBUG(a,b,c) ((void)0)
+#define LOG_LWDEBUG(a,b,c) ((void)0)
+#endif							/* LOCK_DEBUG */
+
+#ifdef LWLOCK_STATS
+
+static void init_lwlock_stats(void);
+static void print_lwlock_stats(int code, Datum arg);
+static lwlock_stats * get_lwlock_stats_entry(LWLock *lock);
+
+/*
+ * (Re)create the backend-local hash table that accumulates LWLock
+ * statistics, and make sure the report callback is registered exactly once.
+ */
+static void
+init_lwlock_stats(void)
+{
+	static MemoryContext stats_cxt = NULL;
+	static bool hook_installed = false;
+	HASHCTL		hashinfo;
+
+	/* Discard whatever an earlier call built. */
+	if (stats_cxt != NULL)
+		MemoryContextDelete(stats_cxt);
+
+	/*
+	 * Stats are updated inside critical sections, and that may require
+	 * allocating new hash entries.  Allocation in a critical section is
+	 * normally forbidden because running out of memory would escalate to a
+	 * PANIC.  LWLOCK_STATS is debugging code that is not normally enabled in
+	 * production, and the entries are small, so the risk is accepted here by
+	 * explicitly allowing it for this context.
+	 */
+	stats_cxt = AllocSetContextCreate(TopMemoryContext,
+									  "LWLock stats",
+									  ALLOCSET_DEFAULT_SIZES);
+	MemoryContextAllowInCriticalSection(stats_cxt, true);
+
+	hashinfo.hcxt = stats_cxt;
+	hashinfo.entrysize = sizeof(lwlock_stats);
+	hashinfo.keysize = sizeof(lwlock_stats_key);
+	lwlock_stats_htab = hash_create("lwlock stats", 16384, &hashinfo,
+									HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+	if (hook_installed)
+		return;
+
+	on_shmem_exit(print_lwlock_stats, 0);
+	hook_installed = true;
+}
+
+/*
+ * on_shmem_exit callback: dump every entry of the stats hash table to
+ * stderr.  Neither argument is used.
+ */
+static void
+print_lwlock_stats(int code, Datum arg)
+{
+	HASH_SEQ_STATUS cursor;
+	lwlock_stats *entry;
+
+	hash_seq_init(&cursor, lwlock_stats_htab);
+
+	/* Grab an LWLock to keep different backends from mixing reports */
+	LWLockAcquire(&MainLWLockArray[0].lock, LW_EXCLUSIVE);
+
+	for (entry = (lwlock_stats *) hash_seq_search(&cursor);
+		 entry != NULL;
+		 entry = (lwlock_stats *) hash_seq_search(&cursor))
+	{
+		fprintf(stderr,
+				"PID %d lwlock %s %p: shacq %u exacq %u blk %u spindelay %u dequeue self %u\n",
+				MyProcPid, GetLWTrancheName(entry->key.tranche),
+				entry->key.instance, entry->sh_acquire_count,
+				entry->ex_acquire_count, entry->block_count,
+				entry->spin_delay_count, entry->dequeue_self_count);
+	}
+
+	LWLockRelease(&MainLWLockArray[0].lock);
+}
+
+/*
+ * Return the stats entry for the given lock, creating a zeroed one on first
+ * use.  Before the hash table exists, a single shared dummy entry is handed
+ * out instead.
+ */
+static lwlock_stats *
+get_lwlock_stats_entry(LWLock *lock)
+{
+	lwlock_stats_key lookup;
+	lwlock_stats *entry;
+	bool		existed;
+
+	/*
+	 * The table is not available during shared memory initialization.
+	 * Operations from that phase are of little interest, so they are all
+	 * lumped together in one dummy entry.
+	 */
+	if (lwlock_stats_htab == NULL)
+		return &lwlock_stats_dummy;
+
+	/* Zero the whole key first so padding bytes compare equal. */
+	MemSet(&lookup, 0, sizeof(lookup));
+	lookup.instance = lock;
+	lookup.tranche = lock->tranche;
+
+	entry = hash_search(lwlock_stats_htab, &lookup, HASH_ENTER, &existed);
+	if (existed)
+		return entry;
+
+	/* Freshly created entry: start all counters from zero. */
+	entry->spin_delay_count = 0;
+	entry->dequeue_self_count = 0;
+	entry->block_count = 0;
+	entry->ex_acquire_count = 0;
+	entry->sh_acquire_count = 0;
+
+	return entry;
+}
+#endif							/* LWLOCK_STATS */
+
+
+/*
+ * Total up the num_lwlocks of every pending named-tranche request.  These
+ * locks are allocated in the main array.
+ */
+static int
+NumLWLocksForNamedTranches(void)
+{
+	int			total = 0;
+
+	for (int n = 0; n < NamedLWLockTrancheRequests; n++)
+	{
+		const NamedLWLockTrancheRequest *req = &NamedLWLockTrancheRequestArray[n];
+
+		total += req->num_lwlocks;
+	}
+
+	return total;
+}
+
+/*
+ * Report how much shared memory the LWLock array, the tranche counter, and
+ * the named-tranche bookkeeping (descriptors plus name strings) require.
+ */
+Size
+LWLockShmemSize(void)
+{
+	int			nlocks = NUM_FIXED_LWLOCKS + NumLWLocksForNamedTranches();
+	Size		total;
+
+	/* The main array: fixed locks followed by named-tranche locks. */
+	total = mul_size(nlocks, sizeof(LWLockPadded));
+
+	/* The dynamic tranche-ID counter, plus slack used for alignment. */
+	total = add_size(total, sizeof(int) + LWLOCK_PADDED_SIZE);
+
+	/* One NamedLWLockTranche descriptor per request. */
+	total = add_size(total,
+					 mul_size(NamedLWLockTrancheRequests,
+							  sizeof(NamedLWLockTranche)));
+
+	/* Each tranche name, including its terminating NUL. */
+	for (int n = 0; n < NamedLWLockTrancheRequests; n++)
+	{
+		const char *name = NamedLWLockTrancheRequestArray[n].tranche_name;
+
+		total = add_size(total, strlen(name) + 1);
+	}
+
+	return total;
+}
+
+/*
+ * Set up the main LWLock array.
+ *
+ * When not running under the postmaster, this allocates the shared memory,
+ * aligns the array, seeds the tranche-ID counter and initializes every lock.
+ * In all cases it then registers the named extension tranches in the
+ * current process.
+ */
+void
+CreateLWLocks(void)
+{
+	if (!IsUnderPostmaster)
+	{
+		char	   *base;
+		uintptr_t	misalign;
+		int		   *counter;
+
+		base = (char *) ShmemAlloc(LWLockShmemSize());
+
+		/* Skip over the slot reserved for the tranche-ID counter. */
+		base += sizeof(int);
+
+		/* Advance to the next LWLOCK_PADDED_SIZE boundary. */
+		misalign = ((uintptr_t) base) % LWLOCK_PADDED_SIZE;
+		base += LWLOCK_PADDED_SIZE - misalign;
+
+		MainLWLockArray = (LWLockPadded *) base;
+
+		/*
+		 * The counter handing out dynamic tranche IDs sits in the int
+		 * immediately preceding the first LWLock.
+		 */
+		counter = (int *) (base - sizeof(int));
+		*counter = LWTRANCHE_FIRST_USER_DEFINED;
+
+		InitializeLWLocks();
+	}
+
+	/* Make the named extension tranches known to this process. */
+	for (int n = 0; n < NamedLWLockTrancheRequests; n++)
+	{
+		NamedLWLockTranche *tranche = &NamedLWLockTrancheArray[n];
+
+		LWLockRegisterTranche(tranche->trancheId, tranche->trancheName);
+	}
+}
+
+/*
+ * Initialize every lock in the main array: the individually named locks,
+ * the three built-in partitioned groups, and finally the locks requested
+ * through named tranches.
+ */
+static void
+InitializeLWLocks(void)
+{
+	LWLockPadded *cursor;
+	char	   *nameBuf;
+	int			n;
+
+	/* Individual locks: each one's tranche ID equals its array index. */
+	for (n = 0; n < NUM_INDIVIDUAL_LWLOCKS; n++)
+		LWLockInitialize(&MainLWLockArray[n].lock, n);
+
+	/* Buffer mapping partition locks. */
+	cursor = MainLWLockArray + BUFFER_MAPPING_LWLOCK_OFFSET;
+	for (n = 0; n < NUM_BUFFER_PARTITIONS; n++)
+		LWLockInitialize(&cursor[n].lock, LWTRANCHE_BUFFER_MAPPING);
+
+	/* Lock manager partition locks. */
+	cursor = MainLWLockArray + LOCK_MANAGER_LWLOCK_OFFSET;
+	for (n = 0; n < NUM_LOCK_PARTITIONS; n++)
+		LWLockInitialize(&cursor[n].lock, LWTRANCHE_LOCK_MANAGER);
+
+	/* Predicate lock manager partition locks. */
+	cursor = MainLWLockArray + PREDICATELOCK_MANAGER_LWLOCK_OFFSET;
+	for (n = 0; n < NUM_PREDICATELOCK_PARTITIONS; n++)
+		LWLockInitialize(&cursor[n].lock, LWTRANCHE_PREDICATE_LOCK_MANAGER);
+
+	/* Nothing more to do unless named tranches were requested. */
+	if (NamedLWLockTrancheRequests <= 0)
+		return;
+
+	/*
+	 * Publish the named tranches in shared memory so other processes can see
+	 * them.  The descriptor array starts right after the last named-tranche
+	 * lock, and the name strings are packed right after the descriptors.
+	 */
+	NamedLWLockTrancheArray = (NamedLWLockTranche *)
+		&MainLWLockArray[NUM_FIXED_LWLOCKS + NumLWLocksForNamedTranches()];
+	nameBuf = (char *) NamedLWLockTrancheArray +
+		(NamedLWLockTrancheRequests * sizeof(NamedLWLockTranche));
+	cursor = &MainLWLockArray[NUM_FIXED_LWLOCKS];
+
+	for (n = 0; n < NamedLWLockTrancheRequests; n++)
+	{
+		NamedLWLockTrancheRequest *req = &NamedLWLockTrancheRequestArray[n];
+		NamedLWLockTranche *slot = &NamedLWLockTrancheArray[n];
+		int			k;
+
+		/* Copy the name, then advance past it and its NUL. */
+		strcpy(nameBuf, req->tranche_name);
+		slot->trancheId = LWLockNewTrancheId();
+		slot->trancheName = nameBuf;
+		nameBuf += strlen(req->tranche_name) + 1;
+
+		/* This tranche's locks follow those of the previous request. */
+		for (k = 0; k < req->num_lwlocks; k++, cursor++)
+			LWLockInitialize(&cursor->lock, slot->trancheId);
+	}
+}
+
+/*
+ * InitLWLockAccess - initialize backend-local state needed to hold LWLocks
+ *
+ * The only work done here is (re)building the per-backend statistics hash
+ * table via init_lwlock_stats(), and only when compiled with LWLOCK_STATS;
+ * otherwise the function body is empty.
+ */
+void
+InitLWLockAccess(void)
+{
+#ifdef LWLOCK_STATS
+	init_lwlock_stats();
+#endif
+}
+
+/*
+ * GetNamedLWLockTranche - return the first LWLock of the named tranche.
+ *
+ * The caller uses as many consecutive locks, starting at the returned
+ * address, as it asked for in RequestNamedLWLockTranche().  An ERROR is
+ * raised if no request with this name exists.
+ */
+LWLockPadded *
+GetNamedLWLockTranche(const char *tranche_name)
+{
+	/*
+	 * Named-tranche locks are laid out after the fixed locks, in request
+	 * order, so a tranche's starting index is NUM_FIXED_LWLOCKS plus the
+	 * sizes of all requests that precede it.
+	 */
+	int			base = NUM_FIXED_LWLOCKS;
+
+	for (int n = 0; n < NamedLWLockTrancheRequests; n++)
+	{
+		const NamedLWLockTrancheRequest *req = &NamedLWLockTrancheRequestArray[n];
+
+		if (strcmp(req->tranche_name, tranche_name) != 0)
+		{
+			base += req->num_lwlocks;
+			continue;
+		}
+
+		return &MainLWLockArray[base];
+	}
+
+	elog(ERROR, "requested tranche is not registered");
+
+	/* just to keep compiler quiet */
+	return NULL;
+}
+
+/*
+ * Hand out the next unused tranche ID.
+ *
+ * The counter is the int stored just before MainLWLockArray; it is read and
+ * advanced while holding the ShmemLock spinlock.
+ */
+int
+LWLockNewTrancheId(void)
+{
+	int		   *counter = (int *) ((char *) MainLWLockArray - sizeof(int));
+	int			assigned;
+
+	SpinLockAcquire(ShmemLock);
+	assigned = *counter;
+	*counter = assigned + 1;
+	SpinLockRelease(ShmemLock);
+
+	return assigned;
+}
+
+/*
+ * Record the name of a dynamic tranche in this process's lookup table.
+ *
+ * Only the pointer is stored, not a copy, so the string must live as long
+ * as the backend does (shared memory, TopMemoryContext, a static constant,
+ * or similar).
+ *
+ * The name is user-visible as a wait event name, so pick one that fits the
+ * style of the existing ones.
+ *
+ * IDs below LWTRANCHE_FIRST_USER_DEFINED are silently ignored.
+ */
+void
+LWLockRegisterTranche(int tranche_id, const char *tranche_name)
+{
+	int			slot;
+
+	/* Built-in tranches are not stored in this table. */
+	if (tranche_id < LWTRANCHE_FIRST_USER_DEFINED)
+		return;
+
+	slot = tranche_id - LWTRANCHE_FIRST_USER_DEFINED;
+
+	/* Create the table, or grow it, so that 'slot' is a valid index. */
+	if (slot >= LWLockTrancheNamesAllocated)
+	{
+		int			grown = pg_nextpower2_32(Max(8, slot + 1));
+
+		if (LWLockTrancheNames != NULL)
+			LWLockTrancheNames =
+				repalloc0_array(LWLockTrancheNames, const char *, LWLockTrancheNamesAllocated, grown);
+		else
+			LWLockTrancheNames = (const char **)
+				MemoryContextAllocZero(TopMemoryContext,
+									   grown * sizeof(char *));
+
+		LWLockTrancheNamesAllocated = grown;
+	}
+
+	LWLockTrancheNames[slot] = tranche_name;
+}
+
+/*
+ * RequestNamedLWLockTranche
+ *		Request that extra LWLocks be allocated during postmaster
+ *		startup.
+ *
+ * This may only be called via the shmem_request_hook of a library that is
+ * loaded into the postmaster via shared_preload_libraries.  Calls from
+ * elsewhere will fail with a FATAL error.
+ *
+ * tranche_name is copied into the request array and must be shorter than
+ * NAMEDATALEN bytes (excluding the terminating NUL); a longer name is
+ * rejected with an ERROR.  num_lwlocks is the number of locks to allocate
+ * for the tranche.
+ *
+ * The tranche name will be user-visible as a wait event name, so try to
+ * use a name that fits the style for those.
+ */
+void
+RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks)
+{
+	NamedLWLockTrancheRequest *request;
+
+	if (!process_shmem_requests_in_progress)
+		elog(FATAL, "cannot request additional LWLocks outside shmem_request_hook");
+
+	/*
+	 * Check the length at run time rather than only with an Assert.  In a
+	 * non-assert build an over-long name would be silently truncated by the
+	 * strlcpy below, and GetNamedLWLockTranche(), which compares the stored
+	 * name with strcmp, could then never find the tranche under the name the
+	 * caller actually used.
+	 */
+	if (strlen(tranche_name) >= NAMEDATALEN)
+		elog(ERROR, "tranche name \"%s\" is too long (maximum %d bytes)",
+			 tranche_name, NAMEDATALEN - 1);
+
+	/* First request: create the array with room for 16 entries. */
+	if (NamedLWLockTrancheRequestArray == NULL)
+	{
+		NamedLWLockTrancheRequestsAllocated = 16;
+		NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *)
+			MemoryContextAlloc(TopMemoryContext,
+							   NamedLWLockTrancheRequestsAllocated
+							   * sizeof(NamedLWLockTrancheRequest));
+	}
+
+	/* Array full: grow it to the next power of two. */
+	if (NamedLWLockTrancheRequests >= NamedLWLockTrancheRequestsAllocated)
+	{
+		int			i = pg_nextpower2_32(NamedLWLockTrancheRequests + 1);
+
+		NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *)
+			repalloc(NamedLWLockTrancheRequestArray,
+					 i * sizeof(NamedLWLockTrancheRequest));
+		NamedLWLockTrancheRequestsAllocated = i;
+	}
+
+	request = &NamedLWLockTrancheRequestArray[NamedLWLockTrancheRequests];
+	strlcpy(request->tranche_name, tranche_name, NAMEDATALEN);
+	request->num_lwlocks = num_lwlocks;
+	NamedLWLockTrancheRequests++;
+}
+
+/*
+ * LWLockInitialize - set up a new lwlock in the unlocked state
+ *
+ * The lock starts with an empty wait list and with only LW_FLAG_RELEASE_OK
+ * set in its state word.
+ */
+void
+LWLockInitialize(LWLock *lock, int tranche_id)
+{
+	lock->tranche = tranche_id;
+	proclist_init(&lock->waiters);
+#ifdef LOCK_DEBUG
+	pg_atomic_init_u32(&lock->nwaiters, 0);
+#endif
+	pg_atomic_init_u32(&lock->state, LW_FLAG_RELEASE_OK);
+}
+
+/*
+ * Report start of wait event for light-weight locks.
+ *
+ * This function will be used by all the light-weight lock calls which
+ * need to wait to acquire the lock.  The wait event reported is
+ * PG_WAIT_LWLOCK combined with the lock's tranche ID, so all locks of one
+ * tranche report the same event.
+ */
+static inline void
+LWLockReportWaitStart(LWLock *lock)
+{
+	pgstat_report_wait_start(PG_WAIT_LWLOCK | lock->tranche);
+}
+
+/*
+ * Report end of wait event for light-weight locks.
+ *
+ * Thin wrapper around pgstat_report_wait_end(), the counterpart of
+ * LWLockReportWaitStart().
+ */
+static inline void
+LWLockReportWaitEnd(void)
+{
+	pgstat_report_wait_end();
+}
+
+/*
+ * Map a tranche ID to its user-visible name.
+ *
+ * Individual locks and built-in tranches are looked up in their static
+ * tables.  Extension tranches are looked up in LWLockTrancheNames[]; if the
+ * tranche was never registered in this process, the generic name
+ * "extension" is returned.
+ */
+static const char *
+GetLWTrancheName(uint16 trancheId)
+{
+	const char *registered;
+	int			slot;
+
+	/* Individual LWLock? */
+	if (trancheId < NUM_INDIVIDUAL_LWLOCKS)
+		return IndividualLWLockNames[trancheId];
+
+	/* Built-in tranche? */
+	if (trancheId < LWTRANCHE_FIRST_USER_DEFINED)
+		return BuiltinTrancheNames[trancheId - NUM_INDIVIDUAL_LWLOCKS];
+
+	/* Otherwise it belongs to an extension. */
+	slot = trancheId - LWTRANCHE_FIRST_USER_DEFINED;
+	if (slot >= LWLockTrancheNamesAllocated)
+		return "extension";
+
+	registered = LWLockTrancheNames[slot];
+
+	return (registered != NULL) ? registered : "extension";
+}
+
+/*
+ * Return an identifier for an LWLock based on the wait class and event.
+ *
+ * classId must be PG_WAIT_LWLOCK (checked only by assertion); eventId is
+ * interpreted as a tranche ID and translated by GetLWTrancheName().
+ */
+const char *
+GetLWLockIdentifier(uint32 classId, uint16 eventId)
+{
+	Assert(classId == PG_WAIT_LWLOCK);
+	/* The event IDs are just tranche numbers. */
+	return GetLWTrancheName(eventId);
+}
+
+/*
+ * Internal function that tries to atomically acquire the lwlock in the passed
+ * in mode.
+ *
+ * This function will not block waiting for a lock to become free - that's the
+ * caller's job.
+ *
+ * mode must be LW_EXCLUSIVE or LW_SHARED.  For LW_EXCLUSIVE the lock counts
+ * as free only if none of the LW_LOCK_MASK bits are set in the state word;
+ * for LW_SHARED it counts as free whenever LW_VAL_EXCLUSIVE is not set.  On
+ * success LW_VAL_EXCLUSIVE or LW_VAL_SHARED, respectively, has been added to
+ * the state word.
+ *
+ * Returns true if the lock isn't free and we need to wait.
+ */
+static bool
+LWLockAttemptLock(LWLock *lock, LWLockMode mode)
+{
+	uint32		old_state;
+
+	Assert(mode == LW_EXCLUSIVE || mode == LW_SHARED);
+
+	/*
+	 * Read once outside the loop, later iterations will get the newer value
+	 * via compare & exchange.
+	 */
+	old_state = pg_atomic_read_u32(&lock->state);
+
+	/* loop until we've determined whether we could acquire the lock or not */
+	while (true)
+	{
+		uint32		desired_state;
+		bool		lock_free;
+
+		desired_state = old_state;
+
+		if (mode == LW_EXCLUSIVE)
+		{
+			lock_free = (old_state & LW_LOCK_MASK) == 0;
+			if (lock_free)
+				desired_state += LW_VAL_EXCLUSIVE;
+		}
+		else
+		{
+			lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0;
+			if (lock_free)
+				desired_state += LW_VAL_SHARED;
+		}
+
+		/*
+		 * Attempt to swap in the state we are expecting. If we didn't see
+		 * lock to be free, that's just the old value. If we saw it as free,
+		 * we'll attempt to mark it acquired. The reason that we always swap
+		 * in the value is that this doubles as a memory barrier. We could try
+		 * to be smarter and only swap in values if we saw the lock as free,
+		 * but benchmarks haven't shown it as beneficial so far.
+		 *
+		 * Retry if the value changed since we last looked at it.
+		 */
+		if (pg_atomic_compare_exchange_u32(&lock->state,
+										   &old_state, desired_state))
+		{
+			if (lock_free)
+			{
+				/* Great! Got the lock. */
+#ifdef LOCK_DEBUG
+				if (mode == LW_EXCLUSIVE)
+					lock->owner = MyProc;
+#endif
+				return false;
+			}
+			else
+				return true;	/* somebody else has the lock */
+		}
+	}
+	pg_unreachable();
+}
+
+/*
+ * Lock the LWLock's wait list against concurrent activity.
+ *
+ * NB: even though the wait list is locked, non-conflicting lock operations
+ * may still happen concurrently.
+ *
+ * The wait list lock is the LW_FLAG_LOCKED bit of the state word.  It is
+ * taken with an atomic fetch-or; if the bit was already set we spin, using
+ * plain reads and perform_spin_delay(), until it is seen clear and then try
+ * the fetch-or again.  With LWLOCK_STATS the number of spin delays is added
+ * to the lock's spin_delay_count.
+ *
+ * Time spent holding mutex should be short!
+ */
+static void
+LWLockWaitListLock(LWLock *lock)
+{
+	uint32		old_state;
+#ifdef LWLOCK_STATS
+	lwlock_stats *lwstats;
+	uint32		delays = 0;
+
+	lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+	while (true)
+	{
+		/* always try once to acquire lock directly */
+		old_state = pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_LOCKED);
+		if (!(old_state & LW_FLAG_LOCKED))
+			break;				/* got lock */
+
+		/* and then spin without atomic operations until lock is released */
+		{
+			SpinDelayStatus delayStatus;
+
+			init_local_spin_delay(&delayStatus);
+
+			while (old_state & LW_FLAG_LOCKED)
+			{
+				perform_spin_delay(&delayStatus);
+				old_state = pg_atomic_read_u32(&lock->state);
+			}
+#ifdef LWLOCK_STATS
+			delays += delayStatus.delays;
+#endif
+			finish_spin_delay(&delayStatus);
+		}
+
+		/*
+		 * Retry. The lock might obviously already be re-acquired by the time
+		 * we're attempting to get it again.
+		 */
+	}
+
+#ifdef LWLOCK_STATS
+	lwstats->spin_delay_count += delays;
+#endif
+}
+
+/*
+ * Unlock the LWLock's wait list.
+ *
+ * Clears LW_FLAG_LOCKED in the state word with an atomic fetch-and, and
+ * asserts that the bit was previously set.
+ *
+ * Note that it can be more efficient to manipulate flags and release the
+ * locks in a single atomic operation.
+ */
+static void
+LWLockWaitListUnlock(LWLock *lock)
+{
+	uint32		old_state PG_USED_FOR_ASSERTS_ONLY;
+
+	old_state = pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_LOCKED);
+
+	Assert(old_state & LW_FLAG_LOCKED);
+}
+
+/*
+ * Wakeup all the lockers that currently have a chance to acquire the lock.
+ *
+ * The work happens in three steps:
+ *
+ * 1. With the wait list locked, walk lock->waiters and move the backends to
+ *    be woken onto a private list.  Once a waiter that will retry the lock
+ *    (anything other than LW_WAIT_UNTIL_FREE) has been picked, further
+ *    LW_EXCLUSIVE waiters are skipped, and the walk stops as soon as an
+ *    LW_EXCLUSIVE waiter has been picked.
+ *
+ * 2. Update LW_FLAG_RELEASE_OK and LW_FLAG_HAS_WAITERS and drop the wait
+ *    list lock, all in one compare-and-exchange.
+ *
+ * 3. Mark each picked backend as no longer waiting and signal its semaphore.
+ */
+static void
+LWLockWakeup(LWLock *lock)
+{
+	bool		new_release_ok;
+	bool		wokeup_somebody = false;
+	proclist_head wakeup;
+	proclist_mutable_iter iter;
+
+	proclist_init(&wakeup);
+
+	new_release_ok = true;
+
+	/* lock wait list while collecting backends to wake up */
+	LWLockWaitListLock(lock);
+
+	proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
+	{
+		PGPROC	   *waiter = GetPGProcByNumber(iter.cur);
+
+		if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE)
+			continue;
+
+		proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
+		proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
+
+		if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE)
+		{
+			/*
+			 * Prevent additional wakeups until retryer gets to run. Backends
+			 * that are just waiting for the lock to become free don't retry
+			 * automatically.
+			 */
+			new_release_ok = false;
+
+			/*
+			 * Don't wakeup (further) exclusive locks.
+			 */
+			wokeup_somebody = true;
+		}
+
+		/*
+		 * Signal that the process isn't on the wait list anymore. This allows
+		 * LWLockDequeueSelf() to remove itself of the waitlist with a
+		 * proclist_delete(), rather than having to check if it has been
+		 * removed from the list.
+		 */
+		Assert(waiter->lwWaiting == LW_WS_WAITING);
+		waiter->lwWaiting = LW_WS_PENDING_WAKEUP;
+
+		/*
+		 * Once we've woken up an exclusive lock, there's no point in waking
+		 * up anybody else.
+		 */
+		if (waiter->lwWaitMode == LW_EXCLUSIVE)
+			break;
+	}
+
+	Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS);
+
+	/* unset required flags, and release lock, in one fell swoop */
+	{
+		uint32		old_state;
+		uint32		desired_state;
+
+		old_state = pg_atomic_read_u32(&lock->state);
+		while (true)
+		{
+			desired_state = old_state;
+
+			/* compute desired flags */
+
+			if (new_release_ok)
+				desired_state |= LW_FLAG_RELEASE_OK;
+			else
+				desired_state &= ~LW_FLAG_RELEASE_OK;
+
+			/*
+			 * LW_FLAG_HAS_WAITERS describes the lock's own queue, so test
+			 * lock->waiters rather than the private wakeup list.  The queue
+			 * cannot change here because we still hold the wait list lock.
+			 * Testing the wakeup list instead would leave the flag set after
+			 * the last waiter had been dequeued, and would clear it wrongly
+			 * if a pass ever woke nobody while waiters remained.
+			 */
+			if (proclist_is_empty(&lock->waiters))
+				desired_state &= ~LW_FLAG_HAS_WAITERS;
+
+			desired_state &= ~LW_FLAG_LOCKED;	/* release lock */
+
+			if (pg_atomic_compare_exchange_u32(&lock->state, &old_state,
+											   desired_state))
+				break;
+		}
+	}
+
+	/* Awaken any waiters I removed from the queue. */
+	proclist_foreach_modify(iter, &wakeup, lwWaitLink)
+	{
+		PGPROC	   *waiter = GetPGProcByNumber(iter.cur);
+
+		LOG_LWDEBUG("LWLockRelease", lock, "release waiter");
+		proclist_delete(&wakeup, iter.cur, lwWaitLink);
+
+		/*
+		 * Guarantee that lwWaiting being unset only becomes visible once the
+		 * unlink from the link has completed. Otherwise the target backend
+		 * could be woken up for other reason and enqueue for a new lock - if
+		 * that happens before the list unlink happens, the list would end up
+		 * being corrupted.
+		 *
+		 * The barrier pairs with the LWLockWaitListLock() when enqueuing for
+		 * another lock.
+		 */
+		pg_write_barrier();
+		waiter->lwWaiting = LW_WS_NOT_WAITING;
+		PGSemaphoreUnlock(waiter->sem);
+	}
+}
+
+/*
+ * Add ourselves to the end of the queue.
+ *
+ * With the wait list locked, this sets LW_FLAG_HAS_WAITERS, records the wait
+ * state and mode in MyProc, and links MyProc into lock->waiters.  It PANICs
+ * if there is no PGPROC or if this backend is already waiting on a lock.
+ *
+ * NB: Mode can be LW_WAIT_UNTIL_FREE here!  Such waiters are pushed at the
+ * head of the queue rather than the tail.
+ */
+static void
+LWLockQueueSelf(LWLock *lock, LWLockMode mode)
+{
+	/*
+	 * If we don't have a PGPROC structure, there's no way to wait. This
+	 * should never occur, since MyProc should only be null during shared
+	 * memory initialization.
+	 */
+	if (MyProc == NULL)
+		elog(PANIC, "cannot wait without a PGPROC structure");
+
+	if (MyProc->lwWaiting != LW_WS_NOT_WAITING)
+		elog(PANIC, "queueing for lock while waiting on another one");
+
+	LWLockWaitListLock(lock);
+
+	/* setting the flag is protected by the wait list lock taken above */
+	pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_HAS_WAITERS);
+
+	MyProc->lwWaiting = LW_WS_WAITING;
+	MyProc->lwWaitMode = mode;
+
+	/* LW_WAIT_UNTIL_FREE waiters are always at the front of the queue */
+	if (mode == LW_WAIT_UNTIL_FREE)
+		proclist_push_head(&lock->waiters, MyProc->pgprocno, lwWaitLink);
+	else
+		proclist_push_tail(&lock->waiters, MyProc->pgprocno, lwWaitLink);
+
+	/* Can release the mutex now */
+	LWLockWaitListUnlock(lock);
+
+#ifdef LOCK_DEBUG
+	pg_atomic_fetch_add_u32(&lock->nwaiters, 1);
+#endif
+}
+
+/*
+ * Remove ourselves from the waitlist.
+ *
+ * Used when we queued ourselves expecting to sleep but then found we do not
+ * need to.  If somebody already dequeued us, absorb their wakeup instead so
+ * that it cannot hit us at some later, inconvenient point.
+ */
+static void
+LWLockDequeueSelf(LWLock *lock)
+{
+	bool		on_waitlist;
+
+#ifdef LWLOCK_STATS
+	lwlock_stats *lwstats;
+
+	lwstats = get_lwlock_stats_entry(lock);
+
+	lwstats->dequeue_self_count++;
+#endif
+
+	LWLockWaitListLock(lock);
+
+	/*
+	 * Remove ourselves from the waitlist, unless we've already been removed.
+	 * The removal happens with the wait list lock held, so there's no race in
+	 * this check.
+	 */
+	on_waitlist = MyProc->lwWaiting == LW_WS_WAITING;
+	if (on_waitlist)
+		proclist_delete(&lock->waiters, MyProc->pgprocno, lwWaitLink);
+
+	if (proclist_is_empty(&lock->waiters) &&
+		(pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS) != 0)
+	{
+		pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_HAS_WAITERS);
+	}
+
+	/* XXX: combine the unlock with the fetch_and above? */
+	LWLockWaitListUnlock(lock);
+
+	/* we unlinked ourselves, so reset our own waiting state */
+	if (on_waitlist)
+		MyProc->lwWaiting = LW_WS_NOT_WAITING;
+	else
+	{
+		int			extraWaits = 0;
+
+		/*
+		 * Somebody else dequeued us and has or will wake us up. Deal with the
+		 * superfluous absorption of a wakeup.
+		 */
+
+		/*
+		 * Reset RELEASE_OK flag if somebody woke us before we removed
+		 * ourselves - they'll have set it to false.
+		 */
+		pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+		/*
+		 * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
+		 * get reset at some inconvenient point later. Most of the time this
+		 * will immediately return.
+		 */
+		for (;;)
+		{
+			PGSemaphoreLock(MyProc->sem);
+			if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
+				break;
+			extraWaits++;
+		}
+
+		/*
+		 * Give back any semaphore wakeups we absorbed that were not ours.
+		 */
+		while (extraWaits-- > 0)
+			PGSemaphoreUnlock(MyProc->sem);
+	}
+
+#ifdef LOCK_DEBUG
+	{
+		/* not waiting anymore */
+		uint32		nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+		Assert(nwaiters < MAX_BACKENDS);
+	}
+#endif
+}
+
+/*
+ * LWLockAcquire - acquire a lightweight lock in LW_SHARED or LW_EXCLUSIVE mode
+ *
+ * If the lock is not available, sleep until it is.  Returns true if the lock
+ * was acquired without sleeping, false if we had to sleep at least once.
+ *
+ * Side effect: cancel/die interrupts are held off until lock release.
+ */
+bool
+LWLockAcquire(LWLock *lock, LWLockMode mode)
+{
+	PGPROC	   *proc = MyProc;
+	bool		result = true;
+	int			extraWaits = 0;
+#ifdef LWLOCK_STATS
+	lwlock_stats *lwstats;
+
+	lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+	Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+	PRINT_LWDEBUG("LWLockAcquire", lock, mode);
+
+#ifdef LWLOCK_STATS
+	/* Count lock acquisition attempts */
+	if (mode == LW_EXCLUSIVE)
+		lwstats->ex_acquire_count++;
+	else
+		lwstats->sh_acquire_count++;
+#endif							/* LWLOCK_STATS */
+
+	/*
+	 * We can't wait if we haven't got a PGPROC.  This should only occur
+	 * during bootstrap or shared memory initialization.  Put an Assert here
+	 * to catch unsafe coding practices.
+	 */
+	Assert(!(proc == NULL && IsUnderPostmaster));
+
+	/* Ensure we will have room to remember the lock */
+	if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
+		elog(ERROR, "too many LWLocks taken");
+
+	/*
+	 * Lock out cancel/die interrupts until we exit the code section protected
+	 * by the LWLock.  This ensures that interrupts will not interfere with
+	 * manipulations of data structures in shared memory.
+	 */
+	HOLD_INTERRUPTS();
+
+	/*
+	 * Loop here to try to acquire lock after each time we are signaled by
+	 * LWLockRelease.
+	 *
+	 * NOTE: it might seem better to have LWLockRelease actually grant us the
+	 * lock, rather than retrying and possibly having to go back to sleep. But
+	 * in practice that is no good because it means a process swap for every
+	 * lock acquisition when two or more processes are contending for the same
+	 * lock.  Since LWLocks are normally used to protect not-very-long
+	 * sections of computation, a process needs to be able to acquire and
+	 * release the same lock many times during a single CPU time slice, even
+	 * in the presence of contention.  The efficiency of being able to do that
+	 * outweighs the inefficiency of sometimes wasting a process dispatch
+	 * cycle because the lock is not free when a released waiter finally gets
+	 * to run.  See pgsql-hackers archives for 29-Dec-01.
+	 */
+	for (;;)
+	{
+		bool		mustwait;
+
+		/*
+		 * Try to grab the lock the first time, we're not in the waitqueue
+		 * yet/anymore.
+		 */
+		mustwait = LWLockAttemptLock(lock, mode);
+
+		if (!mustwait)
+		{
+			LOG_LWDEBUG("LWLockAcquire", lock, "immediately acquired lock");
+			break;				/* got the lock */
+		}
+
+		/*
+		 * Ok, at this point we couldn't grab the lock on the first try. We
+		 * cannot simply queue ourselves to the end of the list and wait to be
+		 * woken up because by now the lock could long have been released.
+		 * Instead add us to the queue and try to grab the lock again. If we
+		 * succeed we need to revert the queuing and be happy, otherwise we
+		 * recheck the lock. If we still couldn't grab it, we know that the
+		 * other locker will see our queue entries when releasing since they
+		 * existed before we checked for the lock.
+		 */
+
+		/* add to the queue */
+		LWLockQueueSelf(lock, mode);
+
+		/* we're now guaranteed to be woken up if necessary */
+		mustwait = LWLockAttemptLock(lock, mode);
+
+		/* ok, grabbed the lock the second time round, need to undo queueing */
+		if (!mustwait)
+		{
+			LOG_LWDEBUG("LWLockAcquire", lock, "acquired, undoing queue");
+
+			LWLockDequeueSelf(lock);
+			break;
+		}
+
+		/*
+		 * Wait until awakened.
+		 *
+		 * It is possible that we get awakened for a reason other than being
+		 * signaled by LWLockRelease.  If so, loop back and wait again.  Once
+		 * we've gotten the LWLock, re-increment the sema by the number of
+		 * additional signals received.
+		 */
+		LOG_LWDEBUG("LWLockAcquire", lock, "waiting");
+
+#ifdef LWLOCK_STATS
+		lwstats->block_count++;
+#endif
+
+		LWLockReportWaitStart(lock);
+		if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
+			TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
+
+		for (;;)
+		{
+			PGSemaphoreLock(proc->sem);
+			if (proc->lwWaiting == LW_WS_NOT_WAITING)
+				break;
+			extraWaits++;
+		}
+
+		/* Retrying, allow LWLockRelease to release waiters again. */
+		pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+#ifdef LOCK_DEBUG
+		{
+			/* not waiting anymore */
+			uint32		nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+			Assert(nwaiters < MAX_BACKENDS);
+		}
+#endif
+
+		if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
+			TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode);
+		LWLockReportWaitEnd();
+
+		LOG_LWDEBUG("LWLockAcquire", lock, "awakened");
+
+		/* We slept at least once; loop back and try to acquire lock again. */
+		result = false;
+	}
+
+	if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_ENABLED())
+		TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), mode);
+
+	/* Add lock to list of locks held by this backend */
+	held_lwlocks[num_held_lwlocks].lock = lock;
+	held_lwlocks[num_held_lwlocks++].mode = mode;
+
+	/*
+	 * Fix the process wait semaphore's count for any absorbed wakeups.
+	 */
+	while (extraWaits-- > 0)
+		PGSemaphoreUnlock(proc->sem);
+
+	return result;
+}
+
+/*
+ * LWLockConditionalAcquire - try to take a lightweight lock without waiting
+ *
+ * Returns true if the lock was obtained in the requested mode, in which case
+ * cancel/die interrupts stay held off until the lock is released.  Returns
+ * false, leaving no side-effects behind, if the lock is not available.
+ */
+bool
+LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
+{
+	Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+	PRINT_LWDEBUG("LWLockConditionalAcquire", lock, mode);
+
+	/* Refuse up front if the held-locks array has no free slot */
+	if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
+		elog(ERROR, "too many LWLocks taken");
+
+	/*
+	 * Hold off cancel/die interrupts before touching the lock.  While we own
+	 * it, an interrupt must not be able to interfere with manipulations of
+	 * the shared-memory data structures it protects; on failure the holdoff
+	 * is undone again below.
+	 */
+	HOLD_INTERRUPTS();
+
+	/* One attempt only; a true result means we would have to wait */
+	if (LWLockAttemptLock(lock, mode))
+	{
+		/* Not acquired: undo the interrupt holdoff before reporting failure */
+		RESUME_INTERRUPTS();
+
+		LOG_LWDEBUG("LWLockConditionalAcquire", lock, "failed");
+		if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL_ENABLED())
+			TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock), mode);
+
+		return false;
+	}
+
+	/* Acquired: remember the lock in this backend's held-locks array */
+	held_lwlocks[num_held_lwlocks].lock = lock;
+	held_lwlocks[num_held_lwlocks].mode = mode;
+	num_held_lwlocks++;
+
+	if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_ENABLED())
+		TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(T_NAME(lock), mode);
+
+	return true;
+}
+
+/*
+ * LWLockAcquireOrWait - Acquire lock, or wait until it's free
+ *
+ * The semantics of this function are a bit funky.  If the lock is currently
+ * free, it is acquired in the given mode, and the function returns true.  If
+ * the lock isn't immediately free, the function waits until it is released
+ * and returns false, but does not acquire the lock.
+ *
+ * This is currently used for WALWriteLock: when a backend flushes the WAL,
+ * holding WALWriteLock, it can flush the commit records of many other
+ * backends as a side-effect.  Those other backends need to wait until the
+ * flush finishes, but don't need to acquire the lock anymore.  They can just
+ * wake up, observe that their records have already been flushed, and return.
+ */
+bool
+LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
+{
+	PGPROC	   *proc = MyProc;
+	bool		mustwait;
+	int			extraWaits = 0;
+#ifdef LWLOCK_STATS
+	lwlock_stats *lwstats;
+
+	lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+	Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+	PRINT_LWDEBUG("LWLockAcquireOrWait", lock, mode);
+
+	/* Ensure we will have room to remember the lock */
+	if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
+		elog(ERROR, "too many LWLocks taken");
+
+	/*
+	 * Lock out cancel/die interrupts until we exit the code section protected
+	 * by the LWLock.  This ensures that interrupts will not interfere with
+	 * manipulations of data structures in shared memory.
+	 */
+	HOLD_INTERRUPTS();
+
+	/*
+	 * NB: We're using nearly the same twice-in-a-row lock acquisition
+	 * protocol as LWLockAcquire(). Check its comments for details.
+	 */
+	mustwait = LWLockAttemptLock(lock, mode);
+
+	if (mustwait)
+	{
+		LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
+
+		mustwait = LWLockAttemptLock(lock, mode);
+
+		if (mustwait)
+		{
+			/*
+			 * Wait until awakened.  Like in LWLockAcquire, be prepared for
+			 * bogus wakeups; each one is counted in extraWaits for later.
+			 */
+			LOG_LWDEBUG("LWLockAcquireOrWait", lock, "waiting");
+
+#ifdef LWLOCK_STATS
+			lwstats->block_count++;
+#endif
+
+			LWLockReportWaitStart(lock);
+			if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
+				TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
+
+			for (;;)
+			{
+				PGSemaphoreLock(proc->sem);
+				if (proc->lwWaiting == LW_WS_NOT_WAITING)
+					break;
+				extraWaits++;
+			}
+
+#ifdef LOCK_DEBUG
+			{
+				/* not waiting anymore */
+				uint32		nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+				Assert(nwaiters < MAX_BACKENDS);
+			}
+#endif
+			if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
+				TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode);
+			LWLockReportWaitEnd();
+
+			LOG_LWDEBUG("LWLockAcquireOrWait", lock, "awakened");
+		}
+		else
+		{
+			LOG_LWDEBUG("LWLockAcquireOrWait", lock, "acquired, undoing queue");
+
+			/*
+			 * Got lock in the second attempt, undo queueing. We need to treat
+			 * this as having successfully acquired the lock, otherwise we'd
+			 * not necessarily wake up people we've prevented from acquiring
+			 * the lock.
+			 */
+			LWLockDequeueSelf(lock);
+		}
+	}
+
+	/*
+	 * Give back any semaphore wakeups absorbed while waiting above.
+	 */
+	while (extraWaits-- > 0)
+		PGSemaphoreUnlock(proc->sem);
+
+	if (mustwait)
+	{
+		/* Lock not taken (we only waited), so release interrupt holdoff */
+		RESUME_INTERRUPTS();
+		LOG_LWDEBUG("LWLockAcquireOrWait", lock, "failed");
+		if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL_ENABLED())
+			TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL(T_NAME(lock), mode);
+	}
+	else
+	{
+		LOG_LWDEBUG("LWLockAcquireOrWait", lock, "succeeded");
+		/* Add lock to list of locks held by this backend */
+		held_lwlocks[num_held_lwlocks].lock = lock;
+		held_lwlocks[num_held_lwlocks++].mode = mode;
+		if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_ENABLED())
+			TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), mode);
+	}
+
+	return !mustwait;
+}
+
+/*
+ * Check whether a waiter must keep waiting for 'lock' or its variable.
+ *
+ * Returns true if the lock is held exclusively and *valptr still equals
+ * oldval.  Returns false otherwise: either the lock is not held exclusively
+ * (*result is then set to true), or the value has changed (*result is then
+ * set to false and the value that was read is stored in *newval).
+ * *newval is left untouched unless the value was found to have changed.
+ */
+static bool
+LWLockConflictsWithVar(LWLock *lock,
+					   uint64 *valptr, uint64 oldval, uint64 *newval,
+					   bool *result)
+{
+	uint64		current;
+
+	/*
+	 * Start with a cheap check: is the lock exclusively held right now?
+	 *
+	 * XXX: the caller uses a spinlock before this, so we don't need a memory
+	 * barrier here as far as the current usage is concerned.  But that might
+	 * not be safe in general.
+	 */
+	if ((pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE) == 0)
+	{
+		/* No exclusive holder: report the lock as free, no need to wait */
+		*result = true;
+		return false;
+	}
+
+	/* From here on the lock counts as not free, whatever the value says */
+	*result = false;
+
+	/*
+	 * Fetch the value under the lwlock's wait list lock, as we can't
+	 * generally rely on atomic 64 bit reads/stores.  TODO: On platforms with
+	 * a way to do atomic 64 bit reads/writes this lock could be skipped.
+	 */
+	LWLockWaitListLock(lock);
+	current = *valptr;
+	LWLockWaitListUnlock(lock);
+
+	if (current == oldval)
+	{
+		/* Value unchanged and lock still held: the caller has to wait */
+		return true;
+	}
+
+	/*
+	 * The value differs from what the caller last saw.  Hand the value we
+	 * read back through *newval and tell the caller not to wait.
+	 */
+	*newval = current;
+	return false;
+}
+
+/*
+ * LWLockWaitForVar - Wait until lock is free, or a variable is updated.
+ *
+ * If the lock is held and *valptr equals oldval, waits until the lock is
+ * either freed, or the lock holder updates *valptr by calling
+ * LWLockUpdateVar.  If the lock is free on exit (immediately or after
+ * waiting), returns true.  If the lock is still held, but *valptr no longer
+ * matches oldval, returns false and sets *newval to the current value in
+ * *valptr.
+ *
+ * Note: this function ignores shared lock holders; if the lock is held
+ * in shared mode, returns 'true'.
+ */
+bool
+LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
+{
+	PGPROC	   *proc = MyProc;
+	int			extraWaits = 0;
+	bool		result = false;
+#ifdef LWLOCK_STATS
+	lwlock_stats *lwstats;
+
+	lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+	PRINT_LWDEBUG("LWLockWaitForVar", lock, LW_WAIT_UNTIL_FREE);
+
+	/*
+	 * Lock out cancel/die interrupts while we sleep on the lock.  There is no
+	 * cleanup mechanism to remove us from the wait queue if we got
+	 * interrupted.
+	 */
+	HOLD_INTERRUPTS();
+
+	/*
+	 * Loop here to check the lock's status after each time we are signaled.
+	 */
+	for (;;)
+	{
+		bool		mustwait;
+
+		mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval,
+										  &result);
+
+		if (!mustwait)
+			break;				/* the lock was free or value didn't match */
+
+		/*
+		 * Add myself to wait queue. Note that this is racy, somebody else
+		 * could wake up before we're finished queuing. NB: We're using nearly
+		 * the same twice-in-a-row lock acquisition protocol as
+		 * LWLockAcquire(). Check its comments for details. The only
+		 * difference is that we also have to check the variable's values when
+		 * checking the state of the lock.
+		 */
+		LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
+
+		/*
+		 * Set RELEASE_OK flag, to make sure we get woken up as soon as the
+		 * lock is released.
+		 */
+		pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+		/*
+		 * We're now guaranteed to be woken up if necessary. Recheck the lock
+		 * and variables state.
+		 */
+		mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval,
+										  &result);
+
+		/* Ok, no conflict after we queued ourselves. Undo queueing. */
+		if (!mustwait)
+		{
+			LOG_LWDEBUG("LWLockWaitForVar", lock, "free, undoing queue");
+
+			LWLockDequeueSelf(lock);
+			break;
+		}
+
+		/*
+		 * Wait until awakened.
+		 *
+		 * It is possible that we get awakened for a reason other than being
+		 * signaled by LWLockRelease.  If so, loop back and wait again.  Once
+		 * we're done waiting (this function never takes the LWLock itself),
+		 * re-increment the sema by the number of additional signals received.
+		 */
+		LOG_LWDEBUG("LWLockWaitForVar", lock, "waiting");
+
+#ifdef LWLOCK_STATS
+		lwstats->block_count++;
+#endif
+
+		LWLockReportWaitStart(lock);
+		if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
+			TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE);
+
+		for (;;)
+		{
+			PGSemaphoreLock(proc->sem);
+			if (proc->lwWaiting == LW_WS_NOT_WAITING)
+				break;
+			extraWaits++;
+		}
+
+#ifdef LOCK_DEBUG
+		{
+			/* not waiting anymore */
+			uint32		nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+			Assert(nwaiters < MAX_BACKENDS);
+		}
+#endif
+
+		if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
+			TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), LW_EXCLUSIVE);
+		LWLockReportWaitEnd();
+
+		LOG_LWDEBUG("LWLockWaitForVar", lock, "awakened");
+
+		/* Now loop back and check the status of the lock again. */
+	}
+
+	/*
+	 * Fix the process wait semaphore's count for any absorbed wakeups.
+	 */
+	while (extraWaits-- > 0)
+		PGSemaphoreUnlock(proc->sem);
+
+	/*
+	 * Now okay to allow cancel/die interrupts.
+	 */
+	RESUME_INTERRUPTS();
+
+	return result;
+}
+
+
+/*
+ * LWLockUpdateVar - Update a variable and wake up waiters atomically
+ *
+ * Sets *valptr to 'val', and wakes up all processes waiting for us with
+ * LWLockWaitForVar().  Setting the value and waking up the processes happen
+ * atomically so that any process calling LWLockWaitForVar() on the same lock
+ * is guaranteed to see the new value, and act accordingly.
+ *
+ * The caller must be holding the lock in exclusive mode.
+ */
+void
+LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val)
+{
+	proclist_head wakeup;
+	proclist_mutable_iter iter;
+
+	PRINT_LWDEBUG("LWLockUpdateVar", lock, LW_EXCLUSIVE);
+
+	proclist_init(&wakeup);
+
+	LWLockWaitListLock(lock);
+
+	Assert(pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE);
+
+	/* Store the new value while holding the wait list lock */
+	*valptr = val;
+
+	/*
+	 * See if there are any LW_WAIT_UNTIL_FREE waiters that need to be woken
+	 * up. They are always in the front of the queue, so stop at the first
+	 */
+	proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
+	{
+		PGPROC	   *waiter = GetPGProcByNumber(iter.cur);
+
+		if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE)
+			break;
+
+		proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
+		proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
+
+		/* dequeued but not yet woken up; see LWLockWakeup() */
+		Assert(waiter->lwWaiting == LW_WS_WAITING);
+		waiter->lwWaiting = LW_WS_PENDING_WAKEUP;
+	}
+
+	/* We are done updating shared state of the lock itself. */
+	LWLockWaitListUnlock(lock);
+
+	/*
+	 * Awaken any waiters I removed from the queue.
+	 */
+	proclist_foreach_modify(iter, &wakeup, lwWaitLink)
+	{
+		PGPROC	   *waiter = GetPGProcByNumber(iter.cur);
+
+		proclist_delete(&wakeup, iter.cur, lwWaitLink);
+		/* check comment in LWLockWakeup() about this barrier */
+		pg_write_barrier();
+		waiter->lwWaiting = LW_WS_NOT_WAITING;
+		PGSemaphoreUnlock(waiter->sem);
+	}
+}
+
+
+/*
+ * LWLockRelease - release a previously acquired lock, waking waiters if needed
+ */
+void
+LWLockRelease(LWLock *lock)
+{
+	LWLockMode	mode;
+	uint32		oldstate;
+	bool		check_waiters;
+	int			i;
+
+	/*
+	 * Remove lock from list of locks held.  Usually, but not always, it will
+	 * be the latest-acquired lock; so search array backwards.
+	 */
+	for (i = num_held_lwlocks; --i >= 0;)
+		if (lock == held_lwlocks[i].lock)
+			break;
+
+	if (i < 0)
+		elog(ERROR, "lock %s is not held", T_NAME(lock));
+
+	mode = held_lwlocks[i].mode;
+
+	num_held_lwlocks--;
+	for (; i < num_held_lwlocks; i++)
+		held_lwlocks[i] = held_lwlocks[i + 1];
+
+	PRINT_LWDEBUG("LWLockRelease", lock, mode);
+
+	/*
+	 * Release my hold on lock, after that it can immediately be acquired by
+	 * others, even if we still have to wakeup other waiters.
+	 */
+	if (mode == LW_EXCLUSIVE)
+		oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_EXCLUSIVE);
+	else
+		oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_SHARED);
+
+	/* after our release no exclusive holder can remain */
+	Assert(!(oldstate & LW_VAL_EXCLUSIVE));
+
+	if (TRACE_POSTGRESQL_LWLOCK_RELEASE_ENABLED())
+		TRACE_POSTGRESQL_LWLOCK_RELEASE(T_NAME(lock));
+
+	/*
+	 * Wake waiters only if some exist, RELEASE_OK is set (else previously
+	 * woken backends are still to be scheduled) and no LW_LOCK_MASK bits remain.
+	 */
+	if ((oldstate & (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK)) ==
+		(LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK) &&
+		(oldstate & LW_LOCK_MASK) == 0)
+		check_waiters = true;
+	else
+		check_waiters = false;
+
+	/*
+	 * As waking up waiters requires the spinlock to be acquired, only do so
+	 * if necessary.
+	 */
+	if (check_waiters)
+	{
+		/* XXX: debug logging only; remove before commit? */
+		LOG_LWDEBUG("LWLockRelease", lock, "releasing waiters");
+		LWLockWakeup(lock);
+	}
+
+	/*
+	 * Now okay to allow cancel/die interrupts.
+	 */
+	RESUME_INTERRUPTS();
+}
+
+/*
+ * LWLockReleaseClearVar - release a previously acquired lock, set *valptr to val
+ */
+void
+LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val)
+{
+	LWLockWaitListLock(lock);
+
+	/*
+	 * Set the variable's value before releasing the lock, that prevents a
+	 * race condition wherein a new locker acquires the lock, but hasn't yet
+	 * set the variable's value.
+	 */
+	*valptr = val;
+	LWLockWaitListUnlock(lock);
+
+	LWLockRelease(lock);
+}
+
+
+/*
+ * LWLockReleaseAll - drop every lock this backend still holds
+ *
+ * Meant for cleanup after ereport(ERROR).  Unlike a series of retail
+ * LWLockRelease calls, this leaves InterruptHoldoffCount unchanged: error
+ * recovery has already set that counter to an appropriate level, and letting
+ * it drop once per released lock could take it below zero.
+ */
+void
+LWLockReleaseAll(void)
+{
+	while (num_held_lwlocks > 0)
+	{
+		LWLock	   *newest = held_lwlocks[num_held_lwlocks - 1].lock;
+
+		HOLD_INTERRUPTS();		/* offsets the RESUME_INTERRUPTS in LWLockRelease */
+		LWLockRelease(newest);
+	}
+}
+
+
+/*
+ * LWLockHeldByMe - report whether this backend holds the given lock
+ *
+ * Any lock mode counts.  Intended as debug support only.
+ */
+bool
+LWLockHeldByMe(LWLock *lock)
+{
+	int			idx = num_held_lwlocks;
+
+	while (--idx >= 0)
+	{
+		if (lock == held_lwlocks[idx].lock)
+			return true;
+	}
+	return false;
+}
+
+/*
+ * LWLockAnyHeldByMe - test whether my process holds any of an array of locks
+ * ('nlocks' entries starting at 'lock', spaced 'stride' bytes apart).
+ * This is meant as debug support only.
+ */
+bool
+LWLockAnyHeldByMe(LWLock *lock, int nlocks, size_t stride)
+{
+	char	   *first = (char *) lock;
+	char	   *limit = first + nlocks * stride;
+	int			idx;
+
+	for (idx = 0; idx < num_held_lwlocks; idx++)
+	{
+		char	   *candidate = (char *) held_lwlocks[idx].lock;
+
+		/* skip held locks lying outside [first, limit) */
+		if (candidate < first || candidate >= limit)
+			continue;
+		/* inside the range: it only counts if it sits on a stride boundary */
+		if ((candidate - first) % stride == 0)
+			return true;
+	}
+	return false;
+}
+
+/*
+ * LWLockHeldByMeInMode - report whether this backend holds a lock in 'mode'
+ *
+ * Holding the lock in a different mode does not count.  Debug support only.
+ */
+bool
+LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
+{
+	int			idx;
+
+	for (idx = 0; idx < num_held_lwlocks; idx++)
+	{
+		if (held_lwlocks[idx].mode == mode && held_lwlocks[idx].lock == lock)
+			return true;
+	}
+	return false;
+}
diff --git a/src/backend/storage/lmgr/lwlocknames.c b/src/backend/storage/lmgr/lwlocknames.c
new file mode 100644
index 0000000..65f7c5b
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlocknames.c
@@ -0,0 +1,52 @@
+/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */
+
+const char *const IndividualLWLockNames[] = {
+	"<unassigned:0>",
+	"ShmemIndex",
+	"OidGen",
+	"XidGen",
+	"ProcArray",
+	"SInvalRead",
+	"SInvalWrite",
+	"WALBufMapping",
+	"WALWrite",
+	"ControlFile",
+	"<unassigned:10>",
+	"XactSLRU",
+	"SubtransSLRU",
+	"MultiXactGen",
+	"MultiXactOffsetSLRU",
+	"MultiXactMemberSLRU",
+	"RelCacheInit",
+	"CheckpointerComm",
+	"TwoPhaseState",
+	"TablespaceCreate",
+	"BtreeVacuum",
+	"AddinShmemInit",
+	"Autovacuum",
+	"AutovacuumSchedule",
+	"SyncScan",
+	"RelationMapping",
+	"NotifySLRU",
+	"NotifyQueue",
+	"SerializableXactHash",
+	"SerializableFinishedList",
+	"SerializablePredicateList",
+	"SerialSLRU",
+	"SyncRep",
+	"BackgroundWorker",
+	"DynamicSharedMemoryControl",
+	"AutoFile",
+	"ReplicationSlotAllocation",
+	"ReplicationSlotControl",
+	"CommitTsSLRU",
+	"CommitTs",
+	"ReplicationOrigin",
+	"MultiXactTruncation",
+	"OldSnapshotTimeMap",
+	"LogicalRepWorker",
+	"XactTruncation",
+	"<unassigned:45>",
+	"WrapLimitsVacuum",
+	"NotifyQueueTail"
+};
diff --git a/src/backend/storage/lmgr/lwlocknames.h b/src/backend/storage/lmgr/lwlocknames.h
new file mode 100644
index 0000000..e279f72
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlocknames.h
@@ -0,0 +1,50 @@
+/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */
+/* there is deliberately not an #ifndef LWLOCKNAMES_H here */
+
+#define ShmemIndexLock (&MainLWLockArray[1].lock)
+#define OidGenLock (&MainLWLockArray[2].lock)
+#define XidGenLock (&MainLWLockArray[3].lock)
+#define ProcArrayLock (&MainLWLockArray[4].lock)
+#define SInvalReadLock (&MainLWLockArray[5].lock)
+#define SInvalWriteLock (&MainLWLockArray[6].lock)
+#define WALBufMappingLock (&MainLWLockArray[7].lock)
+#define WALWriteLock (&MainLWLockArray[8].lock)
+#define ControlFileLock (&MainLWLockArray[9].lock)
+#define XactSLRULock (&MainLWLockArray[11].lock)
+#define SubtransSLRULock (&MainLWLockArray[12].lock)
+#define MultiXactGenLock (&MainLWLockArray[13].lock)
+#define MultiXactOffsetSLRULock (&MainLWLockArray[14].lock)
+#define MultiXactMemberSLRULock (&MainLWLockArray[15].lock)
+#define RelCacheInitLock (&MainLWLockArray[16].lock)
+#define CheckpointerCommLock (&MainLWLockArray[17].lock)
+#define TwoPhaseStateLock (&MainLWLockArray[18].lock)
+#define TablespaceCreateLock (&MainLWLockArray[19].lock)
+#define BtreeVacuumLock (&MainLWLockArray[20].lock)
+#define AddinShmemInitLock (&MainLWLockArray[21].lock)
+#define AutovacuumLock (&MainLWLockArray[22].lock)
+#define AutovacuumScheduleLock (&MainLWLockArray[23].lock)
+#define SyncScanLock (&MainLWLockArray[24].lock)
+#define RelationMappingLock (&MainLWLockArray[25].lock)
+#define NotifySLRULock (&MainLWLockArray[26].lock)
+#define NotifyQueueLock (&MainLWLockArray[27].lock)
+#define SerializableXactHashLock (&MainLWLockArray[28].lock)
+#define SerializableFinishedListLock (&MainLWLockArray[29].lock)
+#define SerializablePredicateListLock (&MainLWLockArray[30].lock)
+#define SerialSLRULock (&MainLWLockArray[31].lock)
+#define SyncRepLock (&MainLWLockArray[32].lock)
+#define BackgroundWorkerLock (&MainLWLockArray[33].lock)
+#define DynamicSharedMemoryControlLock (&MainLWLockArray[34].lock)
+#define AutoFileLock (&MainLWLockArray[35].lock)
+#define ReplicationSlotAllocationLock (&MainLWLockArray[36].lock)
+#define ReplicationSlotControlLock (&MainLWLockArray[37].lock)
+#define CommitTsSLRULock (&MainLWLockArray[38].lock)
+#define CommitTsLock (&MainLWLockArray[39].lock)
+#define ReplicationOriginLock (&MainLWLockArray[40].lock)
+#define MultiXactTruncationLock (&MainLWLockArray[41].lock)
+#define OldSnapshotTimeMapLock (&MainLWLockArray[42].lock)
+#define LogicalRepWorkerLock (&MainLWLockArray[43].lock)
+#define XactTruncationLock (&MainLWLockArray[44].lock)
+#define WrapLimitsVacuumLock (&MainLWLockArray[46].lock)
+#define NotifyQueueTailLock (&MainLWLockArray[47].lock)
+
+#define NUM_INDIVIDUAL_LWLOCKS		48
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
new file mode 100644
index 0000000..6c7cf6c
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -0,0 +1,55 @@
+# Some commonly-used locks have predefined positions within MainLWLockArray;
+# these are defined here.  If you add a lock, add it to the end to avoid
+# renumbering the existing locks; if you remove a lock, consider leaving a gap
+# in the numbering sequence for the benefit of DTrace and other external
+# debugging scripts.  Also, do not forget to update the list of wait events
+# in the user documentation.
+
+# 0 is available; was formerly BufFreelistLock
+ShmemIndexLock						1
+OidGenLock							2
+XidGenLock							3
+ProcArrayLock						4
+SInvalReadLock						5
+SInvalWriteLock						6
+WALBufMappingLock					7
+WALWriteLock						8
+ControlFileLock						9
+# 10 was CheckpointLock
+XactSLRULock						11
+SubtransSLRULock					12
+MultiXactGenLock					13
+MultiXactOffsetSLRULock				14
+MultiXactMemberSLRULock				15
+RelCacheInitLock					16
+CheckpointerCommLock				17
+TwoPhaseStateLock					18
+TablespaceCreateLock				19
+BtreeVacuumLock						20
+AddinShmemInitLock					21
+AutovacuumLock						22
+AutovacuumScheduleLock				23
+SyncScanLock						24
+RelationMappingLock					25
+NotifySLRULock						26
+NotifyQueueLock						27
+SerializableXactHashLock			28
+SerializableFinishedListLock		29
+SerializablePredicateListLock		30
+SerialSLRULock						31
+SyncRepLock							32
+BackgroundWorkerLock				33
+DynamicSharedMemoryControlLock		34
+AutoFileLock						35
+ReplicationSlotAllocationLock		36
+ReplicationSlotControlLock			37
+CommitTsSLRULock					38
+CommitTsLock						39
+ReplicationOriginLock				40
+MultiXactTruncationLock				41
+OldSnapshotTimeMapLock				42
+LogicalRepWorkerLock				43
+XactTruncationLock					44
+# 45 was XactTruncationLock until removal of BackendRandomLock
+WrapLimitsVacuumLock				46
+NotifyQueueTailLock					47
diff --git a/src/backend/storage/lmgr/meson.build b/src/backend/storage/lmgr/meson.build
new file mode 100644
index 0000000..0b2c93d
--- /dev/null
+++ b/src/backend/storage/lmgr/meson.build
@@ -0,0 +1,15 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+backend_sources += files(
+  'condition_variable.c',
+  'deadlock.c',
+  'lmgr.c',
+  'lock.c',
+  'lwlock.c',
+  'predicate.c',
+  'proc.c',
+  's_lock.c',
+  'spin.c',
+)
+
+generated_backend_sources += lwlocknames[1]
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
new file mode 100644
index 0000000..1af4121
--- /dev/null
+++ b/src/backend/storage/lmgr/predicate.c
@@ -0,0 +1,4997 @@
+/*-------------------------------------------------------------------------
+ *
+ * predicate.c
+ *	  POSTGRES predicate locking
+ *	  to support full serializable transaction isolation
+ *
+ *
+ * The approach taken is to implement Serializable Snapshot Isolation (SSI)
+ * as initially described in this paper:
+ *
+ *	Michael J. Cahill, Uwe Röhm, and Alan D. Fekete. 2008.
+ *	Serializable isolation for snapshot databases.
+ *	In SIGMOD '08: Proceedings of the 2008 ACM SIGMOD
+ *	international conference on Management of data,
+ *	pages 729-738, New York, NY, USA. ACM.
+ *	http://doi.acm.org/10.1145/1376616.1376690
+ *
+ * and further elaborated in Cahill's doctoral thesis:
+ *
+ *	Michael James Cahill. 2009.
+ *	Serializable Isolation for Snapshot Databases.
+ *	Sydney Digital Theses.
+ *	University of Sydney, School of Information Technologies.
+ *	http://hdl.handle.net/2123/5353
+ *
+ *
+ * Predicate locks for Serializable Snapshot Isolation (SSI) are SIREAD
+ * locks, which are so different from normal locks that a distinct set of
+ * structures is required to handle them.  They are needed to detect
+ * rw-conflicts when the read happens before the write.  (When the write
+ * occurs first, the reading transaction can check for a conflict by
+ * examining the MVCC data.)
+ *
+ * (1)	Besides tuples actually read, they must cover ranges of tuples
+ *		which would have been read based on the predicate.  This will
+ *		require modelling the predicates through locks against database
+ *		objects such as pages, index ranges, or entire tables.
+ *
+ * (2)	They must be kept in RAM for quick access.  Because of this, it
+ *		isn't possible to always maintain tuple-level granularity -- when
+ *		the space allocated to store these approaches exhaustion, a
+ *		request for a lock may need to scan for situations where a single
+ *		transaction holds many fine-grained locks which can be coalesced
+ *		into a single coarser-grained lock.
+ *
+ * (3)	They never block anything; they are more like flags than locks
+ *		in that regard; although they refer to database objects and are
+ *		used to identify rw-conflicts with normal write locks.
+ *
+ * (4)	While they are associated with a transaction, they must survive
+ *		a successful COMMIT of that transaction, and remain until all
+ *		overlapping transactions complete.  This even means that they
+ *		must survive termination of the transaction's process.  If a
+ *		top level transaction is rolled back, however, it is immediately
+ *		flagged so that it can be ignored, and its SIREAD locks can be
+ *		released any time after that.
+ *
+ * (5)	The only transactions which create SIREAD locks or check for
+ *		conflicts with them are serializable transactions.
+ *
+ * (6)	When a write lock for a top level transaction is found to cover
+ *		an existing SIREAD lock for the same transaction, the SIREAD lock
+ *		can be deleted.
+ *
+ * (7)	A write from a serializable transaction must ensure that an xact
+ *		record exists for the transaction, with the same lifespan (until
+ *		all concurrent transactions complete or the transaction is rolled
+ *		back) so that rw-dependencies to that transaction can be
+ *		detected.
+ *
+ * We use an optimization for read-only transactions. Under certain
+ * circumstances, a read-only transaction's snapshot can be shown to
+ * never have conflicts with other transactions.  This is referred to
+ * as a "safe" snapshot (and one known not to be is "unsafe").
+ * However, it can't be determined whether a snapshot is safe until
+ * all concurrent read/write transactions complete.
+ *
+ * Once a read-only transaction is known to have a safe snapshot, it
+ * can release its predicate locks and exempt itself from further
+ * predicate lock tracking. READ ONLY DEFERRABLE transactions run only
+ * on safe snapshots, waiting as necessary for one to be available.
+ *
+ *
+ * Lightweight locks to manage access to the predicate locking shared
+ * memory objects must be taken in this order, and should be released in
+ * reverse order:
+ *
+ *	SerializableFinishedListLock
+ *		- Protects the list of transactions which have completed but which
+ *			may yet matter because they overlap still-active transactions.
+ *
+ *	SerializablePredicateListLock
+ *		- Protects the linked list of locks held by a transaction.  Note
+ *			that the locks themselves are also covered by the partition
+ *			locks of their respective lock targets; this lock only affects
+ *			the linked list connecting the locks related to a transaction.
+ *		- All transactions share this single lock (with no partitioning).
+ *		- There is never a need for a process other than the one running
+ *			an active transaction to walk the list of locks held by that
+ *			transaction, except parallel query workers sharing the leader's
+ *			transaction.  In the parallel case, an extra per-sxact lock is
+ *			taken; see below.
+ *		- It is relatively infrequent that another process needs to
+ *			modify the list for a transaction, but it does happen for such
+ *			things as index page splits for pages with predicate locks and
+ *			freeing of predicate locked pages by a vacuum process.  When
+ *			removing a lock in such cases, the lock itself contains the
+ *			pointers needed to remove it from the list.  When adding a
+ *			lock in such cases, the lock can be added using the anchor in
+ *			the transaction structure.  Neither requires walking the list.
+ *		- Cleaning up the list for a terminated transaction is sometimes
+ *			not done on a retail basis, in which case no lock is required.
+ *		- Due to the above, a process accessing its active transaction's
+ *			list always uses a shared lock, regardless of whether it is
+ *			walking or maintaining the list.  This improves concurrency
+ *			for the common access patterns.
+ *		- A process which needs to alter the list of a transaction other
+ *			than its own active transaction must acquire an exclusive
+ *			lock.
+ *
+ *	SERIALIZABLEXACT's member 'perXactPredicateListLock'
+ *		- Protects the linked list of predicate locks held by a transaction.
+ *			Only needed for parallel mode, where multiple backends share the
+ *			same SERIALIZABLEXACT object.  Not needed if
+ *			SerializablePredicateListLock is held exclusively.
+ *
+ *	PredicateLockHashPartitionLock(hashcode)
+ *		- The same lock protects a target, all locks on that target, and
+ *			the linked list of locks on the target.
+ *		- When more than one is needed, acquire in ascending address order.
+ *		- When all are needed (rare), acquire in ascending index order with
+ *			PredicateLockHashPartitionLockByIndex(index).
+ *
+ *	SerializableXactHashLock
+ *		- Protects both PredXact and SerializableXidHash.
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/lmgr/predicate.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ *
+ * housekeeping for setting up shared memory predicate lock structures
+ *		InitPredicateLocks(void)
+ *		PredicateLockShmemSize(void)
+ *
+ * predicate lock reporting
+ *		GetPredicateLockStatusData(void)
+ *		PageIsPredicateLocked(Relation relation, BlockNumber blkno)
+ *
+ * predicate lock maintenance
+ *		GetSerializableTransactionSnapshot(Snapshot snapshot)
+ *		SetSerializableTransactionSnapshot(Snapshot snapshot,
+ *										   VirtualTransactionId *sourcevxid)
+ *		RegisterPredicateLockingXid(void)
+ *		PredicateLockRelation(Relation relation, Snapshot snapshot)
+ *		PredicateLockPage(Relation relation, BlockNumber blkno,
+ *						Snapshot snapshot)
+ *		PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot,
+ *						 TransactionId tuple_xid)
+ *		PredicateLockPageSplit(Relation relation, BlockNumber oldblkno,
+ *							   BlockNumber newblkno)
+ *		PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
+ *								 BlockNumber newblkno)
+ *		TransferPredicateLocksToHeapRelation(Relation relation)
+ *		ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe)
+ *
+ * conflict detection (may also trigger rollback)
+ *		CheckForSerializableConflictOut(Relation relation, TransactionId xid,
+ *										Snapshot snapshot)
+ *		CheckForSerializableConflictIn(Relation relation, ItemPointer tid,
+ *									   BlockNumber blkno)
+ *		CheckTableForSerializableConflictIn(Relation relation)
+ *
+ * final rollback checking
+ *		PreCommit_CheckForSerializationFailure(void)
+ *
+ * two-phase commit support
+ *		AtPrepare_PredicateLocks(void);
+ *		PostPrepare_PredicateLocks(TransactionId xid);
+ *		PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit);
+ *		predicatelock_twophase_recover(TransactionId xid, uint16 info,
+ *									   void *recdata, uint32 len);
+ */
+
+#include "postgres.h"
+
+#include "access/parallel.h"
+#include "access/slru.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/twophase_rmgr.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "port/pg_lfind.h"
+#include "storage/bufmgr.h"
+#include "storage/predicate.h"
+#include "storage/predicate_internals.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+
+/* Uncomment the next line to test the graceful degradation code. */
+/* #define TEST_SUMMARIZE_SERIAL */
+
+/*
+ * Test the most selective fields first, for performance.
+ *
+ * a is covered by b if all of the following hold:
+ *	1) a.database = b.database
+ *	2) a.relation = b.relation
+ *	3) b.offset is invalid (b is page-granularity or higher)
+ *	4) either of the following:
+ *		4a) a.offset is valid (a is tuple-granularity) and a.page = b.page
+ *	 or 4b) b.page is invalid and a.page is valid (b is
+ *			relation-granularity and a is page-granularity or finer)
+ */
+#define TargetTagIsCoveredBy(covered_target, covering_target)			\
+	((GET_PREDICATELOCKTARGETTAG_RELATION(covered_target) == /* (2) */	\
+	  GET_PREDICATELOCKTARGETTAG_RELATION(covering_target))				\
+	 && (GET_PREDICATELOCKTARGETTAG_OFFSET(covering_target) ==			\
+		 InvalidOffsetNumber)								 /* (3) */	\
+	 && (((GET_PREDICATELOCKTARGETTAG_OFFSET(covered_target) !=			\
+		   InvalidOffsetNumber)								 /* (4a) */ \
+		  && (GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) ==		\
+			  GET_PREDICATELOCKTARGETTAG_PAGE(covered_target)))			\
+		 || ((GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) ==		\
+			  InvalidBlockNumber)							 /* (4b) */ \
+			 && (GET_PREDICATELOCKTARGETTAG_PAGE(covered_target)		\
+				 != InvalidBlockNumber)))								\
+	 && (GET_PREDICATELOCKTARGETTAG_DB(covered_target) ==	 /* (1) */	\
+		 GET_PREDICATELOCKTARGETTAG_DB(covering_target)))
+
+/*
+ * The predicate locking target and lock shared hash tables are partitioned to
+ * reduce contention.  To determine which partition a given target belongs to,
+ * compute the tag's hash code with PredicateLockTargetTagHashCode(), then
+ * apply one of these macros.
+ * NB: NUM_PREDICATELOCK_PARTITIONS must be a power of 2!
+ */
+#define PredicateLockHashPartition(hashcode) \
+	((hashcode) % NUM_PREDICATELOCK_PARTITIONS)
+#define PredicateLockHashPartitionLock(hashcode) \
+	(&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + \
+		PredicateLockHashPartition(hashcode)].lock)
+#define PredicateLockHashPartitionLockByIndex(i) \
+	(&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + (i)].lock)
+
+#define NPREDICATELOCKTARGETENTS() \
+	mul_size(max_predicate_locks_per_xact, add_size(MaxBackends, max_prepared_xacts))
+
+#define SxactIsOnFinishedList(sxact) (!dlist_node_is_detached(&(sxact)->finishedLink))
+
+/*
+ * Note that a sxact is marked "prepared" once it has passed
+ * PreCommit_CheckForSerializationFailure, even if it isn't using
+ * 2PC. This is the point at which it can no longer be aborted.
+ *
+ * The PREPARED flag remains set after commit, so SxactIsCommitted
+ * implies SxactIsPrepared.
+ */
+#define SxactIsCommitted(sxact) (((sxact)->flags & SXACT_FLAG_COMMITTED) != 0)
+#define SxactIsPrepared(sxact) (((sxact)->flags & SXACT_FLAG_PREPARED) != 0)
+#define SxactIsRolledBack(sxact) (((sxact)->flags & SXACT_FLAG_ROLLED_BACK) != 0)
+#define SxactIsDoomed(sxact) (((sxact)->flags & SXACT_FLAG_DOOMED) != 0)
+#define SxactIsReadOnly(sxact) (((sxact)->flags & SXACT_FLAG_READ_ONLY) != 0)
+#define SxactHasSummaryConflictIn(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_IN) != 0)
+#define SxactHasSummaryConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_OUT) != 0)
+/*
+ * The following macro actually means that the specified transaction has a
+ * conflict out *to a transaction which committed ahead of it*.  It's hard
+ * to get that into a name of a reasonable length.
+ */
+#define SxactHasConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_CONFLICT_OUT) != 0)
+#define SxactIsDeferrableWaiting(sxact) (((sxact)->flags & SXACT_FLAG_DEFERRABLE_WAITING) != 0)
+#define SxactIsROSafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_SAFE) != 0)
+#define SxactIsROUnsafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_UNSAFE) != 0)
+#define SxactIsPartiallyReleased(sxact) (((sxact)->flags & SXACT_FLAG_PARTIALLY_RELEASED) != 0)
+
+/*
+ * Compute the hash code associated with a PREDICATELOCKTARGETTAG.
+ *
+ * To avoid unnecessary recomputations of the hash code, we try to do this
+ * just once per function, and then pass it around as needed.  Aside from
+ * passing the hashcode to hash_search_with_hash_value(), we can extract
+ * the lock partition number from the hashcode.
+ */
+#define PredicateLockTargetTagHashCode(predicatelocktargettag) \
+	get_hash_value(PredicateLockTargetHash, predicatelocktargettag)
+
+/*
+ * Given a predicate lock tag, and the hash for its target,
+ * compute the lock hash.
+ *
+ * To make the hash code also depend on the transaction, we xor the sxid
+ * struct's address into the hash code, left-shifted so that the
+ * partition-number bits don't change.  Since this is only a hash, we
+ * don't care if we lose high-order bits of the address; use an
+ * intermediate variable to suppress cast-pointer-to-int warnings.
+ */
+#define PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash) \
+	((targethash) ^ ((uint32) PointerGetDatum((predicatelocktag)->myXact)) \
+	 << LOG2_NUM_PREDICATELOCK_PARTITIONS)
+
+
+/*
+ * The SLRU buffer area through which we access the old xids.
+ */
+static SlruCtlData SerialSlruCtlData;
+
+#define SerialSlruCtl			(&SerialSlruCtlData)
+
+#define SERIAL_PAGESIZE			BLCKSZ
+#define SERIAL_ENTRYSIZE			sizeof(SerCommitSeqNo)
+#define SERIAL_ENTRIESPERPAGE	(SERIAL_PAGESIZE / SERIAL_ENTRYSIZE)
+
+/*
+ * Set maximum pages based on the number needed to track all transactions.
+ */
+#define SERIAL_MAX_PAGE			(MaxTransactionId / SERIAL_ENTRIESPERPAGE)
+
+#define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1)
+
+#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \
+	(SerialSlruCtl->shared->page_buffer[slotno] + \
+	((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE))))
+
+#define SerialPage(xid)	(((uint32) (xid)) / SERIAL_ENTRIESPERPAGE)
+
+typedef struct SerialControlData
+{
+	int			headPage;		/* newest initialized page; < 0 if SLRU unused */
+	TransactionId headXid;		/* newest valid Xid in the SLRU, else invalid */
+	TransactionId tailXid;		/* oldest xmin of possible interest, else invalid */
+}			SerialControlData;
+
+typedef struct SerialControlData *SerialControl;
+
+static SerialControl serialControl;
+
+/*
+ * When the oldest committed transaction on the "finished" list is moved to
+ * SLRU, its predicate locks will be moved to this "dummy" transaction,
+ * collapsing duplicate targets.  When a duplicate is found, the later
+ * commitSeqNo is used.
+ */
+static SERIALIZABLEXACT *OldCommittedSxact;
+
+
+/*
+ * These configuration variables are used to set the predicate lock table size
+ * and to control promotion of predicate locks to coarser granularity in an
+ * attempt to degrade performance (mostly as false positive serialization
+ * failure) gracefully in the face of memory pressure.
+ */
+int			max_predicate_locks_per_xact;	/* in guc_tables.c */
+int			max_predicate_locks_per_relation;	/* in guc_tables.c */
+int			max_predicate_locks_per_page;	/* in guc_tables.c */
+
+/*
+ * This provides a list of objects in order to track transactions
+ * participating in predicate locking.  Entries in the list are fixed size,
+ * and reside in shared memory.  The memory address of an entry must remain
+ * fixed during its lifetime.  The list will be protected from concurrent
+ * update externally; no provision is made in this code to manage that.  The
+ * number of entries in the list, and the size allowed for each entry is
+ * fixed upon creation.
+ */
+static PredXactList PredXact;
+
+/*
+ * This provides a pool of RWConflict data elements to use in conflict lists
+ * between transactions.
+ */
+static RWConflictPoolHeader RWConflictPool;
+
+/*
+ * The predicate locking hash tables are in shared memory.
+ * Each backend keeps pointers to them.
+ */
+static HTAB *SerializableXidHash;
+static HTAB *PredicateLockTargetHash;
+static HTAB *PredicateLockHash;
+static dlist_head *FinishedSerializableTransactions;
+
+/*
+ * Tag for a dummy entry in PredicateLockTargetHash. By temporarily removing
+ * this entry, you can ensure that there's enough scratch space available for
+ * inserting one entry in the hash table. This is an otherwise-invalid tag.
+ */
+static const PREDICATELOCKTARGETTAG ScratchTargetTag = {0, 0, 0, 0};
+static uint32 ScratchTargetTagHash;
+static LWLock *ScratchPartitionLock;
+
+/*
+ * The local hash table used to determine when to combine multiple fine-
+ * grained locks into a single coarser-grained lock.
+ */
+static HTAB *LocalPredicateLockHash = NULL;
+
+/*
+ * Keep a pointer to the currently-running serializable transaction (if any)
+ * for quick reference. Also, remember if we have written anything that could
+ * cause a rw-conflict.
+ */
+static SERIALIZABLEXACT *MySerializableXact = InvalidSerializableXact;
+static bool MyXactDidWrite = false;
+
+/*
+ * The SXACT_FLAG_RO_UNSAFE optimization might lead us to release
+ * MySerializableXact early.  If that happens in a parallel query, the leader
+ * needs to defer the destruction of the SERIALIZABLEXACT until end of
+ * transaction, because the workers still have a reference to it.  In that
+ * case, the leader stores it here.
+ */
+static SERIALIZABLEXACT *SavedSerializableXact = InvalidSerializableXact;
+
+/* local functions */
+
+static SERIALIZABLEXACT *CreatePredXact(void);
+static void ReleasePredXact(SERIALIZABLEXACT *sxact);
+
+static bool RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer);
+static void SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
+static void SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact, SERIALIZABLEXACT *activeXact);
+static void ReleaseRWConflict(RWConflict conflict);
+static void FlagSxactUnsafe(SERIALIZABLEXACT *sxact);
+
+static bool SerialPagePrecedesLogically(int page1, int page2);
+static void SerialInit(void);
+static void SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo);
+static SerCommitSeqNo SerialGetMinConflictCommitSeqNo(TransactionId xid);
+static void SerialSetActiveSerXmin(TransactionId xid);
+
+static uint32 predicatelock_hash(const void *key, Size keysize);
+static void SummarizeOldestCommittedSxact(void);
+static Snapshot GetSafeSnapshot(Snapshot origSnapshot);
+static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot,
+													  VirtualTransactionId *sourcevxid,
+													  int sourcepid);
+static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag);
+static bool GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
+									  PREDICATELOCKTARGETTAG *parent);
+static bool CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag);
+static void RemoveScratchTarget(bool lockheld);
+static void RestoreScratchTarget(bool lockheld);
+static void RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target,
+									   uint32 targettaghash);
+static void DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag);
+static int	MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag);
+static bool CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag);
+static void DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag);
+static void CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
+								uint32 targettaghash,
+								SERIALIZABLEXACT *sxact);
+static void DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash);
+static bool TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag,
+											  PREDICATELOCKTARGETTAG newtargettag,
+											  bool removeOld);
+static void PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag);
+static void DropAllPredicateLocksFromTable(Relation relation,
+										   bool transfer);
+static void SetNewSxactGlobalXmin(void);
+static void ClearOldPredicateLocks(void);
+static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
+									   bool summarize);
+static bool XidIsConcurrent(TransactionId xid);
+static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag);
+static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
+static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
+													SERIALIZABLEXACT *writer);
+static void CreateLocalPredicateLockHash(void);
+static void ReleasePredicateLocksLocal(void);
+
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * Is this relation subject to predicate locking?  Not if its rd_id is below
+ * FirstUnpinnedObjectId, and not if it uses local buffers.
+ */
+static inline bool
+PredicateLockingNeededForRelation(Relation relation)
+{
+	return relation->rd_id >= FirstUnpinnedObjectId &&
+		!RelationUsesLocalBuffers(relation);
+}
+
+/*
+ * When a public interface method is called for a read, this is the test to
+ * see if we should do a quick return.
+ *
+ * Note: this function has side-effects! If this transaction has been flagged
+ * as RO-safe since the last call, we release all predicate locks and reset
+ * MySerializableXact. That makes subsequent calls to return quickly.
+ *
+ * This is marked as 'inline' to eliminate the function call overhead in the
+ * common case that serialization is not needed.
+ */
+static inline bool
+SerializationNeededForRead(Relation relation, Snapshot snapshot)
+{
+	/*
+	 * Quick exits: this backend has no serializable transaction, or the scan
+	 * uses a special (non-MVCC) snapshot.  The latter covers things like
+	 * CLUSTER and REINDEX; they participate in serialization through the
+	 * wholesale functions TransferPredicateLocksToHeapRelation() and
+	 * CheckTableForSerializableConflictIn(), so the scans themselves need
+	 * neither locks nor conflict checks.
+	 */
+	if (MySerializableXact == InvalidSerializableXact ||
+		!IsMVCCSnapshot(snapshot))
+		return false;
+
+	/*
+	 * A transaction is flagged as RO_SAFE if all concurrent R/W transactions
+	 * commit without having conflicts out to an earlier snapshot, thus
+	 * ensuring that no conflicts are possible for this transaction.
+	 *
+	 * If that has just happened to us, the locks we hold are not needed
+	 * anymore, so release them immediately.  Beware of the side effect:
+	 * releasing also resets MySerializableXact.  That is deliberate, since
+	 * it is what lets subsequent calls exit quickly through the first test
+	 * above.
+	 */
+	if (SxactIsROSafe(MySerializableXact))
+	{
+		ReleasePredicateLocks(false, true);
+		return false;
+	}
+
+	/*
+	 * Neither the transaction nor the snapshot lets us skip, so the answer
+	 * depends solely on whether the relation participates in predicate
+	 * locking.
+	 */
+	return PredicateLockingNeededForRelation(relation);
+}
+
+/*
+ * Like SerializationNeededForRead(), but called on writes.
+ * The logic is the same, but there is no snapshot and we can't be RO-safe.
+ */
+static inline bool
+SerializationNeededForWrite(Relation relation)
+{
+	/*
+	 * Two things must both hold before a write needs any SSI work: this
+	 * backend is running a serializable transaction, and the relation being
+	 * written participates in predicate locking.  The tests are made in
+	 * that order, so the relation is not examined at all outside a
+	 * serializable transaction.
+	 */
+	return MySerializableXact != InvalidSerializableXact &&
+		PredicateLockingNeededForRelation(relation);
+}
+
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * These functions are a simple implementation of a list for this specific
+ * type of struct.  If there is ever a generalized shared memory list, we
+ * should probably switch to that.
+ */
+static SERIALIZABLEXACT *
+CreatePredXact(void)
+{
+	dlist_node *node;
+
+	if (dlist_is_empty(&PredXact->availableList))
+		return NULL;			/* no free entries remain */
+
+	/* Move the first free entry over to the tail of the active list. */
+	node = dlist_pop_head_node(&PredXact->availableList);
+	dlist_push_tail(&PredXact->activeList, node);
+	return dlist_container(SERIALIZABLEXACT, xactLink, node);
+}
+
+static void
+ReleasePredXact(SERIALIZABLEXACT *sxact)
+{
+	Assert(ShmemAddrIsValid(sxact));
+
+	dlist_delete(&sxact->xactLink);	/* xactLink is reused for the free list */
+	dlist_push_tail(&PredXact->availableList, &sxact->xactLink);
+}
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * These functions manage primitive access to the RWConflict pool and lists.
+ */
+static bool
+RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer)
+{
+	dlist_iter	iter;
+
+	Assert(reader != writer);
+
+	/* With a doomed transaction at either end, report no conflict. */
+	if (SxactIsDoomed(reader) || SxactIsDoomed(writer))
+		return false;
+
+	/* Likewise if either end's conflict list has no entries at all. */
+	if (dlist_is_empty(&reader->outConflicts) ||
+		dlist_is_empty(&writer->inConflicts))
+		return false;
+
+	/*
+	 * Search the reader's outbound conflicts for one pointing at the writer.
+	 * dlist_foreach() has no const variant, hence the unconstify.
+	 */
+	dlist_foreach(iter, &unconstify(SERIALIZABLEXACT *, reader)->outConflicts)
+	{
+		RWConflict	candidate =
+			dlist_container(RWConflictData, outLink, iter.cur);
+
+		if (candidate->sxactIn == writer)
+			return true;
+	}
+
+	/* Walked the whole list without a match. */
+	return false;
+}
+
+static void
+SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
+{
+	dlist_head *pool = &RWConflictPool->availableList;
+	RWConflict	entry;
+
+	Assert(reader != writer);
+	Assert(!RWConflictExists(reader, writer));
+
+	if (dlist_is_empty(pool))
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("not enough elements in RWConflictPool to record a read/write conflict"),
+				 errhint("You might need to run fewer transactions at a time or increase max_connections.")));
+
+	/* Take an element from the pool and link it into both xacts' lists. */
+	entry = dlist_container(RWConflictData, outLink, dlist_pop_head_node(pool));
+	entry->sxactOut = reader;
+	entry->sxactIn = writer;
+	dlist_push_tail(&reader->outConflicts, &entry->outLink);
+	dlist_push_tail(&writer->inConflicts, &entry->inLink);
+}
+
+static void
+SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact,
+						  SERIALIZABLEXACT *activeXact)
+{
+	dlist_head *pool = &RWConflictPool->availableList;
+	RWConflict	entry;
+
+	Assert(roXact != activeXact);
+	Assert(SxactIsReadOnly(roXact));
+	Assert(!SxactIsReadOnly(activeXact));
+
+	if (dlist_is_empty(pool))
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("not enough elements in RWConflictPool to record a potential read/write conflict"),
+				 errhint("You might need to run fewer transactions at a time or increase max_connections.")));
+
+	/* Take an element from the pool; the active xact is its "out" side. */
+	entry = dlist_container(RWConflictData, outLink, dlist_pop_head_node(pool));
+	entry->sxactOut = activeXact;
+	entry->sxactIn = roXact;
+	dlist_push_tail(&activeXact->possibleUnsafeConflicts, &entry->outLink);
+	dlist_push_tail(&roXact->possibleUnsafeConflicts, &entry->inLink);
+}
+
+static void
+ReleaseRWConflict(RWConflict conflict)
+{
+	dlist_delete(&conflict->inLink);	/* unlink from both lists it is on */
+	dlist_delete(&conflict->outLink);	/* outLink then serves as pool link */
+	dlist_push_tail(&RWConflictPool->availableList, &conflict->outLink);
+}
+
+static void
+FlagSxactUnsafe(SERIALIZABLEXACT *sxact)
+{
+	dlist_mutable_iter iter;
+
+	Assert(SxactIsReadOnly(sxact));
+	Assert(!SxactIsROSafe(sxact));
+
+	sxact->flags |= SXACT_FLAG_RO_UNSAFE;
+
+	/*
+	 * With the snapshot now known to be unsafe, the remaining possible
+	 * conflicts no longer need tracking; hand each one back to the pool.
+	 */
+	dlist_foreach_modify(iter, &sxact->possibleUnsafeConflicts)
+	{
+		RWConflict	pending;
+
+		pending = dlist_container(RWConflictData, inLink, iter.cur);
+		Assert(pending->sxactIn == sxact);
+		Assert(!SxactIsReadOnly(pending->sxactOut));
+
+		ReleaseRWConflict(pending);
+	}
+}
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * Decide whether a Serial page number is "older" for truncation purposes.
+ * Analogous to CLOGPagePrecedes().
+ */
+static bool
+SerialPagePrecedesLogically(int page1, int page2)
+{
+	TransactionId lhs = ((TransactionId) page1) * SERIAL_ENTRIESPERPAGE;
+	TransactionId rhs = ((TransactionId) page2) * SERIAL_ENTRIESPERPAGE;
+
+	/* Both page-start XIDs get the same constant added before comparing. */
+	lhs += FirstNormalTransactionId + 1;
+	rhs += FirstNormalTransactionId + 1;
+
+	if (!TransactionIdPrecedes(lhs, rhs))
+		return false;
+	return TransactionIdPrecedes(lhs, rhs + SERIAL_ENTRIESPERPAGE - 1);
+}
+
+#ifdef USE_ASSERT_CHECKING
+static void
+SerialPagePrecedesLogicallyUnitTests(void)
+{
+	int			per_page = SERIAL_ENTRIESPERPAGE,
+				offset = per_page / 2;	/* halfway into a page */
+	int			newestPage,
+				oldestPage,
+				headPage,
+				targetPage;
+	TransactionId newestXact,
+				oldestXact;
+
+	/* GetNewTransactionId() has assigned the last XID it can safely use. */
+	newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;	/* nothing special */
+	newestXact = newestPage * per_page + offset;
+	Assert(newestXact / per_page == newestPage);
+	oldestXact = newestXact + 1;
+	oldestXact -= 1U << 31;		/* step back by 2^31 XIDs */
+	oldestPage = oldestXact / per_page;
+
+	/*
+	 * In this scenario, the SLRU headPage pertains to the last ~1000 XIDs
+	 * assigned.  oldestXact finishes, ~2B XIDs having elapsed since it
+	 * started.  Further transactions cause us to summarize oldestXact to
+	 * tailPage.  Function must return false so SerialAdd() doesn't zero
+	 * tailPage (which may contain entries for other old, recently-finished
+	 * XIDs) and half the SLRU.  Reaching this requires burning ~2B XIDs in
+	 * single-user mode, a negligible possibility.
+	 */
+	headPage = newestPage;
+	targetPage = oldestPage;
+	Assert(!SerialPagePrecedesLogically(headPage, targetPage));
+
+	/*
+	 * In this scenario, the SLRU headPage pertains to oldestXact.  We're
+	 * summarizing an XID near newestXact.  (Assume few other XIDs used
+	 * SERIALIZABLE, hence the minimal headPage advancement.  Assume
+	 * oldestXact was long-running and only recently reached the SLRU.)
+	 * Function must return true to make SerialAdd() create targetPage.
+	 *
+	 * Today's implementation mishandles this case, but it doesn't matter
+	 * enough to fix.  Verify that the defect affects just one page by
+	 * asserting correct treatment of its prior page.  Reaching this case
+	 * requires burning ~2B XIDs in single-user mode, a negligible
+	 * possibility.  Moreover, if it does happen, the consequence would be
+	 * mild, namely a new transaction failing in SimpleLruReadPage().
+	 */
+	headPage = oldestPage;
+	targetPage = newestPage;
+	Assert(SerialPagePrecedesLogically(headPage, targetPage - 1));
+#if 0							/* the known-mishandled case described above */
+	Assert(SerialPagePrecedesLogically(headPage, targetPage));
+#endif
+}
+#endif							/* USE_ASSERT_CHECKING */
+
+/*
+ * Initialize for the tracking of old serializable committed xids.
+ */
+static void
+SerialInit(void)
+{
+	bool		attached;
+
+	/* Register our page-ordering callback, then set up the pg_serial SLRU. */
+	SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
+	SimpleLruInit(SerialSlruCtl, "Serial",
+				  NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
+				  LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE);
+
+	/* Run the unit tests of the page-ordering logic. */
+#ifdef USE_ASSERT_CHECKING
+	SerialPagePrecedesLogicallyUnitTests();
+#endif
+	SlruPagePrecedesUnitTests(SerialSlruCtl, SERIAL_ENTRIESPERPAGE);
+
+	/* Create the shared SerialControlData struct, or attach to it. */
+	serialControl = (SerialControl)
+		ShmemInitStruct("SerialControlData", sizeof(SerialControlData),
+						&attached);
+	Assert(attached == IsUnderPostmaster);
+
+	/*
+	 * If the struct already existed, leave its contents untouched.
+	 */
+	if (attached)
+		return;
+
+	/* Freshly created: make the control data describe an empty SLRU. */
+	serialControl->headPage = -1;
+	serialControl->headXid = InvalidTransactionId;
+	serialControl->tailXid = InvalidTransactionId;
+}
+
+/*
+ * SerialAdd
+ *		Store, for a committed read-write serializable xid, the minimum
+ *		commitSeqNo among the transactions it had a rw-conflict out to.
+ *
+ * Passing an invalid commitSeqNo records that xid had no conflicts out.
+ */
+static void
+SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo)
+{
+	TransactionId oldestXid;
+	int			xidPage;
+	int			zeroPage;
+	int			slot;
+	bool		extend;
+
+	Assert(TransactionIdIsValid(xid));
+
+	xidPage = SerialPage(xid);
+
+	LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE);
+
+	/*
+	 * With no active serializable transactions there should be nothing to
+	 * push out to the SLRU, so a missing tailXid here would indicate a
+	 * problem in the earlier cleanup logic.
+	 */
+	oldestXid = serialControl->tailXid;
+	Assert(TransactionIdIsValid(oldestXid));
+
+	/*
+	 * Decide which pages need zeroing.  When the SLRU is in use, only pages
+	 * newly entering the tailXid-headXid range (as headXid advances) are
+	 * zeroed; when it is unused, the whole active region starting at the
+	 * page of tailXid is zeroed before being taken into use.
+	 */
+	if (serialControl->headPage >= 0)
+	{
+		zeroPage = SerialNextPage(serialControl->headPage);
+		extend = SerialPagePrecedesLogically(serialControl->headPage,
+											 xidPage);
+	}
+	else
+	{
+		zeroPage = SerialPage(oldestXid);
+		extend = true;
+	}
+
+	/* Advance headXid if this xid is newer than what we had. */
+	if (!TransactionIdIsValid(serialControl->headXid) ||
+		TransactionIdFollows(xid, serialControl->headXid))
+		serialControl->headXid = xid;
+
+	if (!extend)
+	{
+		/* The page is already part of the active range; just read it in. */
+		slot = SimpleLruReadPage(SerialSlruCtl, xidPage, true, xid);
+	}
+	else
+	{
+		serialControl->headPage = xidPage;
+
+		/* Initialize intervening pages, then the target page itself. */
+		for (; zeroPage != xidPage; zeroPage = SerialNextPage(zeroPage))
+			(void) SimpleLruZeroPage(SerialSlruCtl, zeroPage);
+		slot = SimpleLruZeroPage(SerialSlruCtl, xidPage);
+	}
+
+	/* Store the value and mark the buffer as needing a write. */
+	SerialValue(slot, xid) = minConflictCommitSeqNo;
+	SerialSlruCtl->shared->page_dirty[slot] = true;
+
+	LWLockRelease(SerialSLRULock);
+}
+
+/*
+ * SerialGetMinConflictCommitSeqNo
+ *		Look up the minimum commitSeqNo of any conflict out recorded for xid.
+ *
+ * A transaction which exists but has no conflict out yields
+ * InvalidSerCommitSeqNo.  Zero is returned when the SLRU holds no valid
+ * entries or when xid lies outside the tailXid..headXid range.
+ */
+static SerCommitSeqNo
+SerialGetMinConflictCommitSeqNo(TransactionId xid)
+{
+	TransactionId newestXid;
+	TransactionId oldestXid;
+	SerCommitSeqNo result;
+	int			slot;
+
+	Assert(TransactionIdIsValid(xid));
+
+	/* Take a consistent copy of the current range boundaries. */
+	LWLockAcquire(SerialSLRULock, LW_SHARED);
+	newestXid = serialControl->headXid;
+	oldestXid = serialControl->tailXid;
+	LWLockRelease(SerialSLRULock);
+
+	/* No head means there is nothing stored at all. */
+	if (!TransactionIdIsValid(newestXid))
+		return 0;
+
+	Assert(TransactionIdIsValid(oldestXid));
+
+	/* Reject xids on either side of the tracked range. */
+	if (TransactionIdPrecedes(xid, oldestXid))
+		return 0;
+	if (TransactionIdFollows(xid, newestXid))
+		return 0;
+
+	/*
+	 * SimpleLruReadPage_ReadOnly must be entered without SerialSLRULock held;
+	 * it returns with the lock held, so we release it once the value has
+	 * been fetched.
+	 */
+	slot = SimpleLruReadPage_ReadOnly(SerialSlruCtl, SerialPage(xid), xid);
+	result = SerialValue(slot, xid);
+	LWLockRelease(SerialSLRULock);
+
+	return result;
+}
+
+/*
+ * SerialSetActiveSerXmin
+ *		Inform the SLRU bookkeeping of a new xmin for active serializable
+ *		transactions.
+ *
+ * Information about transactions preceding that xmin is no longer needed.
+ * InvalidTransactionId means no serializable transaction is active, so
+ * everything in the SLRU can be discarded.
+ */
+static void
+SerialSetActiveSerXmin(TransactionId xid)
+{
+	LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE);
+
+	if (!TransactionIdIsValid(xid))
+	{
+		/*
+		 * No sxacts are active, so nothing overlaps: invalidate both xid
+		 * bounds to show there are no valid entries.  headPage is left alone
+		 * on purpose, since a new xmin might still land on that page and we
+		 * don't want to zero out the same page repeatedly.
+		 */
+		serialControl->tailXid = InvalidTransactionId;
+		serialControl->headXid = InvalidTransactionId;
+	}
+	else if (RecoveryInProgress())
+	{
+		/*
+		 * While prepared transactions are being recovered, the global xmin
+		 * may move backwards depending on recovery order.  That is normally
+		 * not OK, but no serializable transactions commit during recovery,
+		 * so the SLRU is empty and we can get away with it.
+		 */
+		Assert(serialControl->headPage < 0);
+		if (!TransactionIdIsValid(serialControl->tailXid) ||
+			TransactionIdPrecedes(xid, serialControl->tailXid))
+			serialControl->tailXid = xid;
+	}
+	else
+	{
+		/* Normal operation: the tail may only be set or move forward. */
+		Assert(!TransactionIdIsValid(serialControl->tailXid)
+			   || TransactionIdFollows(xid, serialControl->tailXid));
+		serialControl->tailXid = xid;
+	}
+
+	LWLockRelease(SerialSLRULock);
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ *
+ * We don't have any data that needs to survive a restart, but this is a
+ * convenient place to truncate the SLRU.
+ *
+ * SerialSLRULock is held exclusively only while the truncation point is
+ * derived from serialControl (and headPage is possibly reset); it is
+ * released before SimpleLruTruncate() and SimpleLruWriteAll() are called.
+ * If the SLRU is not in use (headPage < 0) the function returns without
+ * truncating or writing anything.
+ */
+void
+CheckPointPredicate(void)
+{
+	int			tailPage;		/* cutoff page handed to SimpleLruTruncate() */
+
+	LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE);
+
+	/* Exit quickly if the SLRU is currently not in use. */
+	if (serialControl->headPage < 0)
+	{
+		LWLockRelease(SerialSLRULock);
+		return;
+	}
+
+	if (TransactionIdIsValid(serialControl->tailXid))
+	{
+		/* We can truncate the SLRU up to the page containing tailXid */
+		tailPage = SerialPage(serialControl->tailXid);
+	}
+	else
+	{
+		/*----------
+		 * The SLRU is no longer needed. Truncate to head before we set head
+		 * invalid.
+		 *
+		 * XXX: It's possible that the SLRU is not needed again until XID
+		 * wrap-around has happened, so that the segment containing headPage
+		 * that we leave behind will appear to be new again. In that case it
+		 * won't be removed until XID horizon advances enough to make it
+		 * current again.
+		 *
+		 * XXX: This should happen in vac_truncate_clog(), not in checkpoints.
+		 * Consider this scenario, starting from a system with no in-progress
+		 * transactions and VACUUM FREEZE having maximized oldestXact:
+		 * - Start a SERIALIZABLE transaction.
+		 * - Start, finish, and summarize a SERIALIZABLE transaction, creating
+		 *   one SLRU page.
+		 * - Consume XIDs to reach xidStopLimit.
+		 * - Finish all transactions.  Due to the long-running SERIALIZABLE
+		 *   transaction, earlier checkpoints did not touch headPage.  The
+		 *   next checkpoint will change it, but that checkpoint happens after
+		 *   the end of the scenario.
+		 * - VACUUM to advance XID limits.
+		 * - Consume ~2M XIDs, crossing the former xidWrapLimit.
+		 * - Start, finish, and summarize a SERIALIZABLE transaction.
+		 *   SerialAdd() declines to create the targetPage, because headPage
+		 *   is not regarded as in the past relative to that targetPage.  The
+		 *   transaction instigating the summarize fails in
+		 *   SimpleLruReadPage().
+		 */
+		tailPage = serialControl->headPage;
+		serialControl->headPage = -1;	/* negative headPage == SLRU unused */
+	}
+
+	LWLockRelease(SerialSLRULock);
+
+	/* Truncate away pages that are no longer required */
+	SimpleLruTruncate(SerialSlruCtl, tailPage);
+
+	/*
+	 * Write dirty SLRU pages to disk
+	 *
+	 * This is not actually necessary from a correctness point of view. We do
+	 * it merely as a debugging aid.
+	 *
+	 * We're doing this after the truncation to avoid writing pages right
+	 * before deleting the file in which they sit, which would be completely
+	 * pointless.
+	 */
+	SimpleLruWriteAll(SerialSlruCtl, true);
+}
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * InitPredicateLocks -- Initialize the predicate locking data structures.
+ *
+ * This is called from CreateSharedMemoryAndSemaphores(), which see for
+ * more comments.  In the normal postmaster case, the shared hash tables
+ * are created here.  Backends inherit the pointers
+ * to the shared tables via fork().  In the EXEC_BACKEND case, each
+ * backend re-executes this code to obtain pointers to the already existing
+ * shared hash tables.
+ *
+ * The structures are set up in this order: the PREDICATELOCKTARGET hash
+ * (plus its reserved scratch entry), the PREDICATELOCK hash, the PredXact
+ * list together with the OldCommittedSxact dummy entry, the SERIALIZABLEXID
+ * hash, the RWConflictPool, the FinishedSerializableTransactions list
+ * header, and finally the SLRU state via SerialInit().
+ *
+ * max_table_size is reused and rescaled from one structure to the next
+ * (x2, then reset to MaxBackends + max_prepared_xacts and x10, then x5), so
+ * the order of the statements below matters; the same sequence appears in
+ * PredicateLockShmemSize.
+ */
+void
+InitPredicateLocks(void)
+{
+	HASHCTL		info;
+	long		max_table_size;
+	Size		requestSize;
+	bool		found;
+
+#ifndef EXEC_BACKEND
+	Assert(!IsUnderPostmaster);
+#endif
+
+	/*
+	 * Compute size of predicate lock target hashtable. Note these
+	 * calculations must agree with PredicateLockShmemSize!
+	 */
+	max_table_size = NPREDICATELOCKTARGETENTS();
+
+	/*
+	 * Allocate hash table for PREDICATELOCKTARGET structs.  This stores
+	 * per-predicate-lock-target information.
+	 */
+	info.keysize = sizeof(PREDICATELOCKTARGETTAG);
+	info.entrysize = sizeof(PREDICATELOCKTARGET);
+	info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
+
+	PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash",
+											max_table_size,
+											max_table_size,
+											&info,
+											HASH_ELEM | HASH_BLOBS |
+											HASH_PARTITION | HASH_FIXED_SIZE);
+
+	/*
+	 * Reserve a dummy entry in the hash table; we use it to make sure there's
+	 * always one entry available when we need to split or combine a page,
+	 * because running out of space there could mean aborting a
+	 * non-serializable transaction.
+	 */
+	if (!IsUnderPostmaster)
+	{
+		(void) hash_search(PredicateLockTargetHash, &ScratchTargetTag,
+						   HASH_ENTER, &found);
+		Assert(!found);
+	}
+
+	/* Pre-calculate the hash and partition lock of the scratch entry */
+	ScratchTargetTagHash = PredicateLockTargetTagHashCode(&ScratchTargetTag);
+	ScratchPartitionLock = PredicateLockHashPartitionLock(ScratchTargetTagHash);
+
+	/*
+	 * Allocate hash table for PREDICATELOCK structs.  This stores per
+	 * xact-lock-of-a-target information.
+	 */
+	info.keysize = sizeof(PREDICATELOCKTAG);
+	info.entrysize = sizeof(PREDICATELOCK);
+	info.hash = predicatelock_hash;
+	info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
+
+	/* Assume an average of 2 xacts per target */
+	max_table_size *= 2;
+
+	PredicateLockHash = ShmemInitHash("PREDICATELOCK hash",
+									  max_table_size,
+									  max_table_size,
+									  &info,
+									  HASH_ELEM | HASH_FUNCTION |
+									  HASH_PARTITION | HASH_FIXED_SIZE);
+
+	/*
+	 * Compute size for serializable transaction hashtable. Note these
+	 * calculations must agree with PredicateLockShmemSize!
+	 */
+	max_table_size = (MaxBackends + max_prepared_xacts);
+
+	/*
+	 * Allocate a list to hold information on transactions participating in
+	 * predicate locking.
+	 *
+	 * Assume an average of 10 predicate locking transactions per backend.
+	 * This allows aggressive cleanup while detail is present before data must
+	 * be summarized for storage in SLRU and the "dummy" transaction.
+	 */
+	max_table_size *= 10;
+
+	PredXact = ShmemInitStruct("PredXactList",
+							   PredXactListDataSize,
+							   &found);
+	Assert(found == IsUnderPostmaster);
+	if (!found)
+	{
+		int			i;
+
+		dlist_init(&PredXact->availableList);
+		dlist_init(&PredXact->activeList);
+		PredXact->SxactGlobalXmin = InvalidTransactionId;
+		PredXact->SxactGlobalXminCount = 0;
+		PredXact->WritableSxactCount = 0;
+		PredXact->LastSxactCommitSeqNo = FirstNormalSerCommitSeqNo - 1;
+		PredXact->CanPartialClearThrough = 0;
+		PredXact->HavePartialClearedThrough = 0;
+		requestSize = mul_size((Size) max_table_size,
+							   sizeof(SERIALIZABLEXACT));
+		PredXact->element = ShmemAlloc(requestSize);
+		/* Add all elements to available list, clean. */
+		memset(PredXact->element, 0, requestSize);
+		for (i = 0; i < max_table_size; i++)
+		{
+			LWLockInitialize(&PredXact->element[i].perXactPredicateListLock,
+							 LWTRANCHE_PER_XACT_PREDICATE_LIST);
+			dlist_push_tail(&PredXact->availableList, &PredXact->element[i].xactLink);
+		}
+		PredXact->OldCommittedSxact = CreatePredXact();
+		SetInvalidVirtualTransactionId(PredXact->OldCommittedSxact->vxid);
+		PredXact->OldCommittedSxact->prepareSeqNo = 0;
+		PredXact->OldCommittedSxact->commitSeqNo = 0;
+		PredXact->OldCommittedSxact->SeqNo.lastCommitBeforeSnapshot = 0;
+		dlist_init(&PredXact->OldCommittedSxact->outConflicts);
+		dlist_init(&PredXact->OldCommittedSxact->inConflicts);
+		dlist_init(&PredXact->OldCommittedSxact->predicateLocks);
+		dlist_node_init(&PredXact->OldCommittedSxact->finishedLink);
+		dlist_init(&PredXact->OldCommittedSxact->possibleUnsafeConflicts);
+		PredXact->OldCommittedSxact->topXid = InvalidTransactionId;
+		PredXact->OldCommittedSxact->finishedBefore = InvalidTransactionId;
+		PredXact->OldCommittedSxact->xmin = InvalidTransactionId;
+		PredXact->OldCommittedSxact->flags = SXACT_FLAG_COMMITTED;
+		PredXact->OldCommittedSxact->pid = 0;
+		PredXact->OldCommittedSxact->pgprocno = INVALID_PGPROCNO;
+	}
+	/* This never changes, so let's keep a local copy. */
+	OldCommittedSxact = PredXact->OldCommittedSxact;
+
+	/*
+	 * Allocate hash table for SERIALIZABLEXID structs.  This stores per-xid
+	 * information for serializable transactions which have accessed data.
+	 */
+	info.keysize = sizeof(SERIALIZABLEXIDTAG);
+	info.entrysize = sizeof(SERIALIZABLEXID);
+
+	SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash",
+										max_table_size,
+										max_table_size,
+										&info,
+										HASH_ELEM | HASH_BLOBS |
+										HASH_FIXED_SIZE);
+
+	/*
+	 * Allocate space for tracking rw-conflicts in lists attached to the
+	 * transactions.
+	 *
+	 * Assume an average of 5 conflicts per transaction.  Calculations suggest
+	 * that this will prevent resource exhaustion in even the most pessimal
+	 * loads up to max_connections = 200 with all 200 connections pounding the
+	 * database with serializable transactions.  Beyond that, there may be
+	 * occasional transactions canceled when trying to flag conflicts. That's
+	 * probably OK.
+	 */
+	max_table_size *= 5;
+
+	RWConflictPool = ShmemInitStruct("RWConflictPool",
+									 RWConflictPoolHeaderDataSize,
+									 &found);
+	Assert(found == IsUnderPostmaster);
+	if (!found)
+	{
+		int			i;
+
+		dlist_init(&RWConflictPool->availableList);
+		requestSize = mul_size((Size) max_table_size,
+							   RWConflictDataSize);
+		RWConflictPool->element = ShmemAlloc(requestSize);
+		/* Add all elements to available list, clean. */
+		memset(RWConflictPool->element, 0, requestSize);
+		for (i = 0; i < max_table_size; i++)
+		{
+			dlist_push_tail(&RWConflictPool->availableList,
+							&RWConflictPool->element[i].outLink);
+		}
+	}
+
+	/*
+	 * Create or attach to the header for the list of finished serializable
+	 * transactions.
+	 */
+	FinishedSerializableTransactions = (dlist_head *)
+		ShmemInitStruct("FinishedSerializableTransactions",
+						sizeof(dlist_head),
+						&found);
+	Assert(found == IsUnderPostmaster);
+	if (!found)
+		dlist_init(FinishedSerializableTransactions);
+
+	/*
+	 * Initialize the SLRU storage for old committed serializable
+	 * transactions.
+	 */
+	SerialInit();
+}
+
+/*
+ * PredicateLockShmemSize
+ *		Estimate the shared memory needed by the predicate lock manager.
+ *
+ * The entry counts are derived step by step in the same way as in
+ * InitPredicateLocks; keep the two in agreement.
+ */
+Size
+PredicateLockShmemSize(void)
+{
+	Size		total;
+	long		nentries;
+
+	/* predicate lock target hash table */
+	nentries = NPREDICATELOCKTARGETENTS();
+	total = add_size(0, hash_estimate_size(nentries,
+										   sizeof(PREDICATELOCKTARGET)));
+
+	/* predicate lock hash table: twice as many entries as targets */
+	nentries *= 2;
+	total = add_size(total, hash_estimate_size(nentries,
+											   sizeof(PREDICATELOCK)));
+
+	/*
+	 * NPREDICATELOCKTARGETENTS is only an estimate, so pad what we have so
+	 * far with a 10% safety margin.
+	 */
+	total = add_size(total, total / 10);
+
+	/* transaction list: header plus ten entries per backend/prepared xact */
+	nentries = MaxBackends + max_prepared_xacts;
+	nentries *= 10;
+	total = add_size(total, PredXactListDataSize);
+	total = add_size(total, mul_size((Size) nentries,
+									 sizeof(SERIALIZABLEXACT)));
+
+	/* transaction xid table, sized like the transaction list */
+	total = add_size(total, hash_estimate_size(nentries,
+											   sizeof(SERIALIZABLEXID)));
+
+	/* rw-conflict pool: header plus five conflicts per transaction */
+	nentries *= 5;
+	total = add_size(total, RWConflictPoolHeaderDataSize);
+	total = add_size(total, mul_size((Size) nentries, RWConflictDataSize));
+
+	/* Head for list of finished serializable transactions. */
+	total = add_size(total, sizeof(dlist_head));
+
+	/* Shared memory structures for SLRU tracking of old committed xids. */
+	total = add_size(total, sizeof(SerialControlData));
+	total = add_size(total, SimpleLruShmemSize(NUM_SERIAL_BUFFERS, 0));
+
+	return total;
+}
+
+
+/*
+ * predicatelock_hash
+ *		Hash function for PREDICATELOCKTAG keys.
+ *
+ * A single set of partition locks serves both the PREDICATELOCKTARGET and
+ * the PREDICATELOCK hash tables, so every PREDICATELOCK must land in the
+ * same partition as its PREDICATELOCKTARGET.  dynahash.c takes the
+ * partition number from the low-order bits of the hash code; hence the
+ * hash of a PREDICATELOCKTAG has to share its low-order bits with the hash
+ * of the associated PREDICATELOCKTARGETTAG.  This function derives the
+ * lock's hash code from the target's hash code to achieve that.
+ */
+static uint32
+predicatelock_hash(const void *key, Size keysize)
+{
+	const PREDICATELOCKTAG *locktag = (const PREDICATELOCKTAG *) key;
+	uint32		hashOfTarget;
+
+	Assert(keysize == sizeof(PREDICATELOCKTAG));
+
+	/* Hash the tag of the target object this lock belongs to ... */
+	hashOfTarget = PredicateLockTargetTagHashCode(&locktag->myTarget->tag);
+
+	/* ... and derive the lock's own hash code from it. */
+	return PredicateLockHashCodeFromTargetHashCode(locktag, hashOfTarget);
+}
+
+
+/*
+ * GetPredicateLockStatusData
+ *		Build a snapshot of the predicate lock manager's internal state for
+ *		use in pg_lock_status.
+ *
+ * As in GetLockStatusData, the partition LWLocks are held as briefly as
+ * possible: we merely copy the PREDICATELOCKTARGETTAG and SERIALIZABLEXACT
+ * of every lock table entry into two parallel palloc'd arrays.  The same
+ * PREDICATELOCKTARGETTAG or SERIALIZABLEXACT will likely show up more than
+ * once.
+ */
+PredicateLockData *
+GetPredicateLockStatusData(void)
+{
+	PredicateLockData *result;
+	HASH_SEQ_STATUS scan;
+	PREDICATELOCK *lock;
+	int			partition;
+	int			nlocks;
+	int			ncopied = 0;
+
+	result = (PredicateLockData *) palloc(sizeof(PredicateLockData));
+
+	/*
+	 * For a consistent view, hold every partition lock at once (taken in
+	 * ascending order), followed by SerializableXactHashLock.
+	 */
+	for (partition = 0; partition < NUM_PREDICATELOCK_PARTITIONS; partition++)
+		LWLockAcquire(PredicateLockHashPartitionLockByIndex(partition),
+					  LW_SHARED);
+	LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+
+	/* Size the output arrays from the current number of locks. */
+	nlocks = hash_get_num_entries(PredicateLockHash);
+	result->nelements = nlocks;
+	result->locktags = (PREDICATELOCKTARGETTAG *)
+		palloc(sizeof(PREDICATELOCKTARGETTAG) * nlocks);
+	result->xacts = (SERIALIZABLEXACT *)
+		palloc(sizeof(SERIALIZABLEXACT) * nlocks);
+
+	/* Walk PredicateLockHash, copying out each entry's tag and xact. */
+	hash_seq_init(&scan, PredicateLockHash);
+	while ((lock = (PREDICATELOCK *) hash_seq_search(&scan)) != NULL)
+	{
+		result->locktags[ncopied] = lock->tag.myTarget->tag;
+		result->xacts[ncopied] = *lock->tag.myXact;
+		ncopied++;
+	}
+
+	Assert(ncopied == nlocks);
+
+	/* Let go of the locks in the reverse of the acquisition order. */
+	LWLockRelease(SerializableXactHashLock);
+	for (partition = NUM_PREDICATELOCK_PARTITIONS - 1; partition >= 0; partition--)
+		LWLockRelease(PredicateLockHashPartitionLockByIndex(partition));
+
+	return result;
+}
+
+/*
+ * SummarizeOldestCommittedSxact
+ *		Free up shared memory structures by pushing the oldest sxact (the
+ *		one at the front of the FinishedSerializableTransactions list) into
+ *		summary form.
+ *
+ * Each call that finds a finished sxact frees exactly one SERIALIZABLEXACT
+ * structure and may also free one or more of these structures:
+ * SERIALIZABLEXID, PREDICATELOCK, PREDICATELOCKTARGET, RWConflictData.
+ */
+static void
+SummarizeOldestCommittedSxact(void)
+{
+	LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
+
+	/*
+	 * We are only called when no sxact slot is available.  Some slots must
+	 * belong to old, already-finished transactions, so normally the finished
+	 * list has something for us to summarize.  There is a race, though:
+	 * while we held no locks, another transaction may have ended and cleaned
+	 * up all finished sxact entries, releasing their slots.  Then the list is
+	 * empty and there is nothing to do here; the caller will pick up one of
+	 * the slots released by the other backend when it retries.
+	 */
+	if (!dlist_is_empty(FinishedSerializableTransactions))
+	{
+		SERIALIZABLEXACT *oldest;
+
+		/*
+		 * The first entry of the finished list is the earliest commit; take
+		 * it off the list.
+		 */
+		oldest = dlist_head_element(SERIALIZABLEXACT, finishedLink,
+									FinishedSerializableTransactions);
+		dlist_delete_thoroughly(&oldest->finishedLink);
+
+		/* Add to SLRU summary information. */
+		if (TransactionIdIsValid(oldest->topXid) && !SxactIsReadOnly(oldest))
+		{
+			SerCommitSeqNo minConflict;
+
+			if (SxactHasConflictOut(oldest))
+				minConflict = oldest->SeqNo.earliestOutConflictCommit;
+			else
+				minConflict = InvalidSerCommitSeqNo;
+
+			SerialAdd(oldest->topXid, minConflict);
+		}
+
+		/* Summarize and release the detail. */
+		ReleaseOneSerializableXact(oldest, false, true);
+	}
+
+	LWLockRelease(SerializableFinishedListLock);
+}
+
+/*
+ * GetSafeSnapshot
+ *		Obtain and register a snapshot for a READ ONLY DEFERRABLE
+ *		transaction. Ensures that the snapshot is "safe", i.e. a
+ *		read-only transaction running on it can execute serializably
+ *		without further checks. This requires waiting for concurrent
+ *		transactions to complete, and retrying with a new snapshot if
+ *		one of them could possibly create a conflict.
+ *
+ *		As with GetSerializableTransactionSnapshot (which this is a subroutine
+ *		for), the passed-in Snapshot pointer should reference a static data
+ *		area that can safely be passed to GetSnapshotData.
+ *
+ *		Each loop iteration takes a fresh snapshot.  If that leaves
+ *		MySerializableXact unset, the snapshot is returned right away.
+ *		Otherwise SXACT_FLAG_DEFERRABLE_WAITING is set while we sleep in
+ *		ProcWaitForSignal(), with SerializableXactHashLock released during
+ *		each sleep, until our possibleUnsafeConflicts list is empty or we
+ *		have been flagged RO-unsafe.  In the unsafe case the predicate
+ *		locks are released and the loop starts over.
+ */
+static Snapshot
+GetSafeSnapshot(Snapshot origSnapshot)
+{
+	Snapshot	snapshot;
+
+	Assert(XactReadOnly && XactDeferrable);
+
+	while (true)
+	{
+		/*
+		 * GetSerializableTransactionSnapshotInt is going to call
+		 * GetSnapshotData, so we need to provide it the static snapshot area
+		 * our caller passed to us.  The pointer returned is actually the same
+		 * one passed to it, but we avoid assuming that here.
+		 */
+		snapshot = GetSerializableTransactionSnapshotInt(origSnapshot,
+														 NULL, InvalidPid);
+
+		if (MySerializableXact == InvalidSerializableXact)
+			return snapshot;	/* no concurrent r/w xacts; it's safe */
+
+		LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+		/*
+		 * Wait for concurrent transactions to finish. Stop early if one of
+		 * them marked us as conflicted.
+		 */
+		MySerializableXact->flags |= SXACT_FLAG_DEFERRABLE_WAITING;
+		while (!(dlist_is_empty(&MySerializableXact->possibleUnsafeConflicts) ||
+				 SxactIsROUnsafe(MySerializableXact)))
+		{
+			LWLockRelease(SerializableXactHashLock);	/* don't sleep holding it */
+			ProcWaitForSignal(WAIT_EVENT_SAFE_SNAPSHOT);
+			LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+		}
+		MySerializableXact->flags &= ~SXACT_FLAG_DEFERRABLE_WAITING;
+
+		if (!SxactIsROUnsafe(MySerializableXact))
+		{
+			LWLockRelease(SerializableXactHashLock);
+			break;				/* success */
+		}
+
+		LWLockRelease(SerializableXactHashLock);
+
+		/* else, need to retry... */
+		ereport(DEBUG2,
+				(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+				 errmsg_internal("deferrable snapshot was unsafe; trying a new one")));
+		ReleasePredicateLocks(false, false);
+	}
+
+	/*
+	 * Now we have a safe snapshot, so we don't need to do any further checks.
+	 */
+	Assert(SxactIsROSafe(MySerializableXact));
+	ReleasePredicateLocks(false, true);
+
+	return snapshot;
+}
+
+/*
+ * GetSafeSnapshotBlockingPids
+ *		If the specified process is currently blocked in GetSafeSnapshot,
+ *		write the process IDs of all processes that it is blocked by
+ *		into the caller-supplied buffer output[].  The list is truncated at
+ *		output_size, and the number of PIDs written into the buffer is
+ *		returned.  Returns zero if the given PID is not currently blocked
+ *		in GetSafeSnapshot.
+ *
+ *		Nothing is ever stored beyond output[output_size - 1]; in
+ *		particular, an output_size of zero or less leaves output[] untouched
+ *		and yields a result of zero.
+ */
+int
+GetSafeSnapshotBlockingPids(int blocked_pid, int *output, int output_size)
+{
+	int			num_written = 0;
+	dlist_iter	iter;
+	SERIALIZABLEXACT *blocked_sxact = NULL;
+
+	LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+
+	/* Find blocked_pid's SERIALIZABLEXACT by linear search. */
+	dlist_foreach(iter, &PredXact->activeList)
+	{
+		SERIALIZABLEXACT *sxact =
+			dlist_container(SERIALIZABLEXACT, xactLink, iter.cur);
+
+		if (sxact->pid == blocked_pid)
+		{
+			blocked_sxact = sxact;
+			break;
+		}
+	}
+
+	/* Did we find it, and is it currently waiting in GetSafeSnapshot? */
+	if (blocked_sxact != NULL && SxactIsDeferrableWaiting(blocked_sxact))
+	{
+		/* Traverse the list of possible unsafe conflicts collecting PIDs. */
+		dlist_foreach(iter, &blocked_sxact->possibleUnsafeConflicts)
+		{
+			RWConflict	possibleUnsafeConflict;
+
+			/*
+			 * Check the remaining capacity before storing anything, so that
+			 * even a zero-sized buffer is never written to.
+			 */
+			if (num_written >= output_size)
+				break;
+
+			possibleUnsafeConflict =
+				dlist_container(RWConflictData, inLink, iter.cur);
+
+			output[num_written++] = possibleUnsafeConflict->sxactOut->pid;
+		}
+	}
+
+	LWLockRelease(SerializableXactHashLock);
+
+	return num_written;
+}
+
+/*
+ * GetSerializableTransactionSnapshot
+ *		Acquire a snapshot that can be used for the current transaction.
+ *
+ * Make sure we have a SERIALIZABLEXACT reference in MySerializableXact.
+ * It should be current for this process and be contained in PredXact.
+ *
+ * The caller passes a Snapshot pointer referencing a static data area that
+ * is safe to hand to GetSnapshotData.  That very pointer is what gets
+ * returned; this function allocates no new snapshot data structure.
+ */
+Snapshot
+GetSerializableTransactionSnapshot(Snapshot snapshot)
+{
+	Assert(IsolationIsSerializable());
+
+	/*
+	 * Serializable mode is unavailable while recovery is still active, as it
+	 * is on a hot standby, for example.  The check in
+	 * check_transaction_isolation() does not stop us from getting here when
+	 * default_transaction_isolation is set to serializable, which is why the
+	 * hint is phrased the way it is.
+	 */
+	if (RecoveryInProgress())
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot use serializable mode in a hot standby"),
+				 errdetail("\"default_transaction_isolation\" is set to \"serializable\"."),
+				 errhint("You can use \"SET default_transaction_isolation = 'repeatable read'\" to change the default.")));
+
+	/* The ordinary case: take a snapshot and register the sxact directly. */
+	if (!(XactReadOnly && XactDeferrable))
+		return GetSerializableTransactionSnapshotInt(snapshot,
+													 NULL, InvalidPid);
+
+	/*
+	 * SERIALIZABLE READ ONLY DEFERRABLE transactions get a special
+	 * optimization: by waiting for a suitable snapshot we avoid all SSI
+	 * overhead once the transaction is running.
+	 */
+	return GetSafeSnapshot(snapshot);
+}
+
+/*
+ * SetSerializableTransactionSnapshot
+ *		Import a snapshot to be used for the current transaction.
+ *
+ * This works almost like GetSerializableTransactionSnapshot, but instead of
+ * taking a new snapshot we use the data handed to us.
+ *
+ * It is the caller's job to have verified that the snapshot came from a
+ * serializable transaction, and that the source transaction is not
+ * read-only if we are read-write.
+ */
+void
+SetSerializableTransactionSnapshot(Snapshot snapshot,
+								   VirtualTransactionId *sourcevxid,
+								   int sourcepid)
+{
+	Assert(IsolationIsSerializable());
+
+	/*
+	 * When parallel.c calls us in a parallel worker there is nothing to do.
+	 * No SERIALIZABLEXACT should be created yet, because the leader's
+	 * SERIALIZABLEXACT will be installed with AttachSerializableXact().
+	 * Nor should SERIALIZABLE READ ONLY DEFERRABLE be rejected, because the
+	 * leader has already determined that the snapshot it passed us is safe.
+	 */
+	if (!IsParallelWorker())
+	{
+		/*
+		 * SERIALIZABLE READ ONLY DEFERRABLE transactions may not import
+		 * snapshots: there is no way to wait for a safe snapshot when we
+		 * must use the one we are told to.  (XXX instead of throwing an
+		 * error, we could just ignore the XactDeferrable flag?)
+		 */
+		if (XactReadOnly && XactDeferrable)
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("a snapshot-importing transaction must not be READ ONLY DEFERRABLE")));
+
+		(void) GetSerializableTransactionSnapshotInt(snapshot, sourcevxid,
+													 sourcepid);
+	}
+}
+
+/*
+ * Guts of GetSerializableTransactionSnapshot
+ *
+ * If sourcevxid is valid, this is actually an import operation and we should
+ * skip calling GetSnapshotData, because the snapshot contents are already
+ * loaded up.  HOWEVER: to avoid race conditions, we must check that the
+ * source xact is still running after we acquire SerializableXactHashLock.
+ * We do that by calling ProcArrayInstallImportedXmin.
+ */
+static Snapshot
+GetSerializableTransactionSnapshotInt(Snapshot snapshot,
+									  VirtualTransactionId *sourcevxid,
+									  int sourcepid)
+{
+	PGPROC	   *proc;
+	VirtualTransactionId vxid;
+	SERIALIZABLEXACT *sxact,
+			   *othersxact;
+
+	/* We only do this for serializable transactions.  Once. */
+	Assert(MySerializableXact == InvalidSerializableXact);
+
+	Assert(!RecoveryInProgress());
+
+	/*
+	 * Since all parts of a serializable transaction must use the same
+	 * snapshot, it is too late to establish one after a parallel operation
+	 * has begun.
+	 */
+	if (IsInParallelMode())
+		elog(ERROR, "cannot establish serializable snapshot during a parallel operation");
+
+	proc = MyProc;
+	Assert(proc != NULL);
+	GET_VXID_FROM_PGPROC(vxid, *proc);
+
+	/*
+	 * First we get the sxact structure, which may involve looping and access
+	 * to the "finished" list to free a structure for use.
+	 *
+	 * We must hold SerializableXactHashLock when taking/checking the snapshot
+	 * to avoid race conditions, for much the same reasons that
+	 * GetSnapshotData takes the ProcArrayLock.  Since we might have to
+	 * release SerializableXactHashLock to call SummarizeOldestCommittedSxact,
+	 * this means we have to create the sxact first, which is a bit annoying
+	 * (in particular, an elog(ERROR) in procarray.c would cause us to leak
+	 * the sxact).  Consider refactoring to avoid this.
+	 */
+#ifdef TEST_SUMMARIZE_SERIAL
+	SummarizeOldestCommittedSxact();
+#endif
+	LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+	do
+	{
+		sxact = CreatePredXact();
+		/* If null, push out committed sxact to SLRU summary & retry. */
+		if (!sxact)
+		{
+			LWLockRelease(SerializableXactHashLock);
+			SummarizeOldestCommittedSxact();
+			LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+		}
+	} while (!sxact);
+
+	/* Get the snapshot, or check that it's safe to use */
+	if (!sourcevxid)
+		snapshot = GetSnapshotData(snapshot);
+	else if (!ProcArrayInstallImportedXmin(snapshot->xmin, sourcevxid))
+	{
+		ReleasePredXact(sxact);
+		LWLockRelease(SerializableXactHashLock);
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("could not import the requested snapshot"),
+				 errdetail("The source process with PID %d is not running anymore.",
+						   sourcepid)));
+	}
+
+	/*
+	 * If there are no serializable transactions which are not read-only, we
+	 * can "opt out" of predicate locking and conflict checking for a
+	 * read-only transaction.
+	 *
+	 * The reason this is safe is that a read-only transaction can only become
+	 * part of a dangerous structure if it overlaps a writable transaction
+	 * which in turn overlaps a writable transaction which committed before
+	 * the read-only transaction started.  A new writable transaction can
+	 * overlap this one, but it can't meet the other condition of overlapping
+	 * a transaction which committed before this one started.
+	 */
+	if (XactReadOnly && PredXact->WritableSxactCount == 0)
+	{
+		ReleasePredXact(sxact);
+		LWLockRelease(SerializableXactHashLock);
+		return snapshot;
+	}
+
+	/* Initialize the structure. */
+	sxact->vxid = vxid;
+	sxact->SeqNo.lastCommitBeforeSnapshot = PredXact->LastSxactCommitSeqNo;
+	sxact->prepareSeqNo = InvalidSerCommitSeqNo;
+	sxact->commitSeqNo = InvalidSerCommitSeqNo;
+	dlist_init(&(sxact->outConflicts));
+	dlist_init(&(sxact->inConflicts));
+	dlist_init(&(sxact->possibleUnsafeConflicts));
+	sxact->topXid = GetTopTransactionIdIfAny();
+	sxact->finishedBefore = InvalidTransactionId;
+	sxact->xmin = snapshot->xmin;
+	sxact->pid = MyProcPid;
+	sxact->pgprocno = MyProc->pgprocno;
+	dlist_init(&sxact->predicateLocks);
+	dlist_node_init(&sxact->finishedLink);
+	sxact->flags = 0;
+	if (XactReadOnly)
+	{
+		dlist_iter	iter;
+
+		sxact->flags |= SXACT_FLAG_READ_ONLY;
+
+		/*
+		 * Register all concurrent r/w transactions as possible conflicts; if
+		 * all of them commit without any outgoing conflicts to earlier
+		 * transactions then this snapshot can be deemed safe (and we can run
+		 * without tracking predicate locks).
+		 */
+		dlist_foreach(iter, &PredXact->activeList)
+		{
+			othersxact = dlist_container(SERIALIZABLEXACT, xactLink, iter.cur);
+
+			if (!SxactIsCommitted(othersxact)
+				&& !SxactIsDoomed(othersxact)
+				&& !SxactIsReadOnly(othersxact))
+			{
+				SetPossibleUnsafeConflict(sxact, othersxact);
+			}
+		}
+
+		/*
+		 * If we didn't find any possibly unsafe conflicts because every
+		 * uncommitted writable transaction turned out to be doomed, then we
+		 * can "opt out" immediately.  See comments above the earlier check
+		 * for PredXact->WritableSxactCount == 0.
+		 */
+		if (dlist_is_empty(&sxact->possibleUnsafeConflicts))
+		{
+			ReleasePredXact(sxact);
+			LWLockRelease(SerializableXactHashLock);
+			return snapshot;
+		}
+	}
+	else
+	{
+		++(PredXact->WritableSxactCount);
+		Assert(PredXact->WritableSxactCount <=
+			   (MaxBackends + max_prepared_xacts));
+	}
+
+	/* Maintain serializable global xmin info. */
+	if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
+	{
+		Assert(PredXact->SxactGlobalXminCount == 0);
+		PredXact->SxactGlobalXmin = snapshot->xmin;
+		PredXact->SxactGlobalXminCount = 1;
+		SerialSetActiveSerXmin(snapshot->xmin);
+	}
+	else if (TransactionIdEquals(snapshot->xmin, PredXact->SxactGlobalXmin))
+	{
+		Assert(PredXact->SxactGlobalXminCount > 0);
+		PredXact->SxactGlobalXminCount++;
+	}
+	else
+	{
+		Assert(TransactionIdFollows(snapshot->xmin, PredXact->SxactGlobalXmin));
+	}
+
+	MySerializableXact = sxact;
+	MyXactDidWrite = false;		/* haven't written anything yet */
+
+	LWLockRelease(SerializableXactHashLock);
+
+	CreateLocalPredicateLockHash();
+
+	return snapshot;
+}
+
+/*
+ * Build the backend-local hash table of predicate lock entries (held locks
+ * and parent locks).  No such table may exist yet when this is called.
+ */
+static void
+CreateLocalPredicateLockHash(void)
+{
+	HASHCTL		ctl;
+
+	/* A previous table must already have been torn down. */
+	Assert(LocalPredicateLockHash == NULL);
+
+	/* Keys are target tags; entries are LOCALPREDICATELOCK structs. */
+	ctl.entrysize = sizeof(LOCALPREDICATELOCK);
+	ctl.keysize = sizeof(PREDICATELOCKTARGETTAG);
+
+	LocalPredicateLockHash =
+		hash_create("Local predicate lock",
+					max_predicate_locks_per_xact,
+					&ctl,
+					HASH_ELEM | HASH_BLOBS);
+}
+
+/*
+ * RegisterPredicateLockingXid
+ *
+ * Record 'xid' as the top level XID of the current serializable transaction:
+ * store it in MySerializableXact->topXid and create a SerializableXidHash
+ * entry that points back at MySerializableXact.
+ *
+ * Returns immediately, doing nothing, when no predicate lock data is being
+ * tracked for this transaction (MySerializableXact is
+ * InvalidSerializableXact).
+ */
+void
+RegisterPredicateLockingXid(TransactionId xid)
+{
+	SERIALIZABLEXIDTAG key;
+	SERIALIZABLEXID *entry;
+	bool		existed;
+
+	/* Quick exit: nothing is tracked for this transaction. */
+	if (MySerializableXact == InvalidSerializableXact)
+		return;
+
+	/* We should have a valid XID and be at the top level. */
+	Assert(TransactionIdIsValid(xid));
+
+	key.xid = xid;
+
+	LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+	/* The top level XID may be registered only once per transaction. */
+	Assert(MySerializableXact->topXid == InvalidTransactionId);
+	MySerializableXact->topXid = xid;
+
+	/* Create the XID lookup entry and link it to our sxact. */
+	entry = (SERIALIZABLEXID *) hash_search(SerializableXidHash,
+											&key,
+											HASH_ENTER, &existed);
+	Assert(!existed);
+	entry->myXact = MySerializableXact;
+
+	LWLockRelease(SerializableXactHashLock);
+}
+
+
+/*
+ * PageIsPredicateLocked
+ *
+ * Report whether a predicate lock target exists for the page at block
+ * 'blkno' of 'relation', i.e. whether any transaction holds a predicate lock
+ * on exactly that page.
+ *
+ * The holder may be a transaction that has already completed but has not
+ * been cleaned up yet because of overlapping serializable transactions.
+ * The answer must be valid regardless of the caller's isolation level.
+ *
+ * Only a page-level target is looked up; a relation-level lock that would
+ * also conflict is not considered.
+ *
+ * One use is to support proper behavior during GiST index vacuum.
+ */
+bool
+PageIsPredicateLocked(Relation relation, BlockNumber blkno)
+{
+	PREDICATELOCKTARGETTAG pagetag;
+	uint32		hashcode;
+	LWLock	   *partlock;
+	bool		locked;
+
+	SET_PREDICATELOCKTARGETTAG_PAGE(pagetag,
+									relation->rd_locator.dbOid,
+									relation->rd_id,
+									blkno);
+
+	hashcode = PredicateLockTargetTagHashCode(&pagetag);
+	partlock = PredicateLockHashPartitionLock(hashcode);
+
+	/* A shared partition lock is enough for a pure lookup. */
+	LWLockAcquire(partlock, LW_SHARED);
+	locked = (hash_search_with_hash_value(PredicateLockTargetHash,
+										  &pagetag, hashcode,
+										  HASH_FIND, NULL) != NULL);
+	LWLockRelease(partlock);
+
+	return locked;
+}
+
+
+/*
+ * PredicateLockExists
+ *
+ * Report whether this transaction holds the given lock, judging only by the
+ * backend-local lock table.
+ *
+ * Important note: the answer is a hint, not a guarantee.  The local table is
+ * not updated when another transaction modifies our lock list (e.g. to split
+ * an index page), so this may return false for a lock that is in fact held.
+ * It can also return true when a coarser granularity lock that covers this
+ * target is being held.  Use it only where such errors are acceptable!
+ */
+static bool
+PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag)
+{
+	LOCALPREDICATELOCK *entry;
+
+	entry = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
+											   targettag,
+											   HASH_FIND, NULL);
+
+	/*
+	 * Finding an entry is not enough: it could merely be the parent of some
+	 * held lock, so the 'held' flag has the final word.
+	 */
+	return (entry != NULL) && entry->held;
+}
+
+/*
+ * GetParentPredicateLockTag
+ *
+ * Compute the tag one level up in the lock hierarchy, i.e. the next coarser
+ * lock that covers 'tag': tuple -> page -> relation.
+ *
+ * Sets *parent and returns true when such a tag exists; returns false and
+ * leaves *parent untouched when 'tag' is a relation tag, which has no
+ * parent.
+ */
+static bool
+GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
+						  PREDICATELOCKTARGETTAG *parent)
+{
+	int			tagtype = GET_PREDICATELOCKTARGETTAG_TYPE(*tag);
+
+	/* relation locks have no parent lock */
+	if (tagtype == PREDLOCKTAG_RELATION)
+		return false;
+
+	/* a page is covered by the lock on its relation */
+	if (tagtype == PREDLOCKTAG_PAGE)
+	{
+		SET_PREDICATELOCKTARGETTAG_RELATION(*parent,
+											GET_PREDICATELOCKTARGETTAG_DB(*tag),
+											GET_PREDICATELOCKTARGETTAG_RELATION(*tag));
+		return true;
+	}
+
+	/* a tuple is covered by the lock on its page */
+	if (tagtype == PREDLOCKTAG_TUPLE)
+	{
+		SET_PREDICATELOCKTARGETTAG_PAGE(*parent,
+										GET_PREDICATELOCKTARGETTAG_DB(*tag),
+										GET_PREDICATELOCKTARGETTAG_RELATION(*tag),
+										GET_PREDICATELOCKTARGETTAG_PAGE(*tag));
+		return true;
+	}
+
+	/* not reachable */
+	Assert(false);
+	return false;
+}
+
+/*
+ * CoarserLockCovers
+ *
+ * Report whether some ancestor of 'newtargettag' in the lock hierarchy is
+ * recorded as held by our transaction, which would make the new lock
+ * redundant.
+ *
+ * Like PredicateLockExists, this function might return a false negative,
+ * but it will never return a false positive.
+ */
+static bool
+CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag)
+{
+	PREDICATELOCKTARGETTAG current;
+	PREDICATELOCKTARGETTAG ancestor;
+
+	/* climb one level per iteration until we run out of parents */
+	for (current = *newtargettag;
+		 GetParentPredicateLockTag(&current, &ancestor);
+		 current = ancestor)
+	{
+		if (PredicateLockExists(&ancestor))
+			return true;
+	}
+
+	/* reached the top without finding a held ancestor */
+	return false;
+}
+
+/*
+ * RemoveScratchTarget
+ *
+ * Take the dummy (scratch) entry out of the predicate lock target hash so
+ * that its space is available to the caller.  The caller must be holding
+ * SerializablePredicateListLock, and must put the entry back with
+ * RestoreScratchTarget() before releasing that lock.
+ *
+ * Pass lockheld = true when the caller already holds the partition lock of
+ * the partition containing the scratch entry; otherwise that lock is taken
+ * exclusively here for the duration of the removal.
+ */
+static void
+RemoveScratchTarget(bool lockheld)
+{
+	bool		removed;
+	bool		takeLock = !lockheld;
+
+	Assert(LWLockHeldByMe(SerializablePredicateListLock));
+
+	if (takeLock)
+		LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE);
+
+	(void) hash_search_with_hash_value(PredicateLockTargetHash,
+									   &ScratchTargetTag,
+									   ScratchTargetTagHash,
+									   HASH_REMOVE, &removed);
+	/* The scratch entry is expected to be present. */
+	Assert(removed);
+
+	if (takeLock)
+		LWLockRelease(ScratchPartitionLock);
+}
+
+/*
+ * RestoreScratchTarget
+ *
+ * Put the dummy (scratch) entry back into the predicate lock target hash.
+ * The caller must be holding SerializablePredicateListLock.
+ *
+ * Pass lockheld = true when the caller already holds the scratch entry's
+ * partition lock; otherwise it is taken exclusively here for the duration
+ * of the insertion.
+ */
+static void
+RestoreScratchTarget(bool lockheld)
+{
+	bool		existed;
+	bool		takeLock = !lockheld;
+
+	Assert(LWLockHeldByMe(SerializablePredicateListLock));
+
+	if (takeLock)
+		LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE);
+
+	(void) hash_search_with_hash_value(PredicateLockTargetHash,
+									   &ScratchTargetTag,
+									   ScratchTargetTagHash,
+									   HASH_ENTER, &existed);
+	/* The scratch entry is expected to be absent until now. */
+	Assert(!existed);
+
+	if (takeLock)
+		LWLockRelease(ScratchPartitionLock);
+}
+
+/*
+ * RemoveTargetIfNoLongerUsed
+ *
+ * Drop a predicate lock target from PredicateLockTargetHash, but only when
+ * its list of predicate locks is empty; otherwise leave it alone.
+ *
+ * 'targettaghash' is the hash code of target->tag.  The caller must hold
+ * SerializablePredicateListLock.
+ */
+static void
+RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash)
+{
+	Assert(LWLockHeldByMe(SerializablePredicateListLock));
+
+	/* Only a target with no remaining locks may go away. */
+	if (dlist_is_empty(&target->predicateLocks))
+	{
+		PREDICATELOCKTARGET *removed PG_USED_FOR_ASSERTS_ONLY;
+
+		removed = hash_search_with_hash_value(PredicateLockTargetHash,
+											  &target->tag,
+											  targettaghash,
+											  HASH_REMOVE, NULL);
+		Assert(removed == target);
+	}
+}
+
+/*
+ * Delete child target locks owned by this process.
+ *
+ * Walks the predicate lock list of MySerializableXact and, for every lock
+ * whose target tag satisfies TargetTagIsCoveredBy() with respect to
+ * 'newtargettag', unlinks the lock from both its transaction list and its
+ * target list, removes it from PredicateLockHash, drops the target if no
+ * locks remain on it, and finally calls DecrementParentLocks() for the
+ * removed lock's tag so the local child counts of its ancestors stay in
+ * step.
+ *
+ * This implementation is assuming that the usage of each target tag field
+ * is uniform.  No need to make this hard if we don't have to.
+ *
+ * Locking: SerializablePredicateListLock is held in shared mode for the
+ * whole scan, and the hash partition lock of each lock being removed is
+ * held exclusively just around that removal.
+ *
+ * We acquire an LWLock in the case of parallel mode, because worker
+ * backends have access to the leader's SERIALIZABLEXACT.  Otherwise,
+ * we aren't acquiring LWLocks for the predicate lock or lock
+ * target structures associated with this transaction unless we're going
+ * to modify them, because no other process is permitted to modify our
+ * locks.
+ */
+static void
+DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag)
+{
+	SERIALIZABLEXACT *sxact;
+	PREDICATELOCK *predlock;
+	dlist_mutable_iter iter;
+
+	LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+	sxact = MySerializableXact;
+	if (IsInParallelMode())
+		LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE);
+
+	dlist_foreach_modify(iter, &sxact->predicateLocks)
+	{
+		PREDICATELOCKTAG oldlocktag;
+		PREDICATELOCKTARGET *oldtarget;
+		PREDICATELOCKTARGETTAG oldtargettag;
+
+		predlock = dlist_container(PREDICATELOCK, xactLink, iter.cur);
+
+		oldlocktag = predlock->tag;
+		Assert(oldlocktag.myXact == sxact);
+		oldtarget = oldlocktag.myTarget;
+		oldtargettag = oldtarget->tag;
+
+		if (TargetTagIsCoveredBy(oldtargettag, *newtargettag))
+		{
+			uint32		oldtargettaghash;
+			LWLock	   *partitionLock;
+			PREDICATELOCK *rmpredlock PG_USED_FOR_ASSERTS_ONLY;
+
+			oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
+			partitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+
+			LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+			dlist_delete(&predlock->xactLink);
+			dlist_delete(&predlock->targetLink);
+			rmpredlock = hash_search_with_hash_value
+				(PredicateLockHash,
+				 &oldlocktag,
+				 PredicateLockHashCodeFromTargetHashCode(&oldlocktag,
+														 oldtargettaghash),
+				 HASH_REMOVE, NULL);
+			Assert(rmpredlock == predlock);
+
+			RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash);
+
+			LWLockRelease(partitionLock);
+
+			DecrementParentLocks(&oldtargettag);
+		}
+	}
+	if (IsInParallelMode())
+		LWLockRelease(&sxact->perXactPredicateListLock);
+	LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ * MaxPredicateChildLocks
+ *
+ * Report the promotion limit for a predicate lock target: the number of
+ * descendant locks a transaction may hold beneath it before those locks are
+ * promoted to the given tag.  Non-direct descendants count as well (for a
+ * relation lock, both its tuples and its pages).
+ *
+ * For a page target the limit is max_predicate_locks_per_page.  For a
+ * relation target, a non-negative max_predicate_locks_per_relation is used
+ * as is, while a negative value -N yields
+ * (max_predicate_locks_per_xact / N) - 1, to match behavior of earlier
+ * releases when upgrading.
+ *
+ * TODO SSI: We should probably add additional GUCs to allow a maximum ratio
+ * of page and tuple locks based on the pages in a relation, and the maximum
+ * ratio of tuple locks to tuples in a page.  This would provide more
+ * generally "balanced" allocation of locks to where they are most useful,
+ * while still allowing the absolute numbers to prevent one relation from
+ * tying up all predicate lock resources.
+ */
+static int
+MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag)
+{
+	switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
+	{
+		case PREDLOCKTAG_RELATION:
+			/* a non-negative setting is an absolute limit */
+			if (!(max_predicate_locks_per_relation < 0))
+				return max_predicate_locks_per_relation;
+
+			/* a negative setting divides the per-transaction budget */
+			return (max_predicate_locks_per_xact
+					/ (-max_predicate_locks_per_relation)) - 1;
+
+		case PREDLOCKTAG_PAGE:
+			return max_predicate_locks_per_page;
+
+		case PREDLOCKTAG_TUPLE:
+
+			/*
+			 * not reachable: a tuple has no finer-granularity descendants,
+			 * so nothing should ever be promoted to it.
+			 */
+			Assert(false);
+			return 0;
+	}
+
+	/* not reachable */
+	Assert(false);
+	return 0;
+}
+
+/*
+ * CheckAndPromotePredicateLockRequest
+ *
+ * After a predicate lock on 'reqtag' has been acquired, bump the child
+ * count of each of its ancestors in the local lock table, creating
+ * not-held entries for ancestors seen for the first time.  If any ancestor
+ * now has more descendants than its promotion threshold, acquire the
+ * coarsest such ancestor.
+ *
+ * Returns true if a parent lock was acquired and false otherwise.
+ */
+static bool
+CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag)
+{
+	PREDICATELOCKTARGETTAG current = *reqtag;
+	PREDICATELOCKTARGETTAG ancestor;
+	PREDICATELOCKTARGETTAG winner;
+	bool		shouldPromote = false;
+
+	/* visit every ancestor, finest first */
+	while (GetParentPredicateLockTag(&current, &ancestor))
+	{
+		LOCALPREDICATELOCK *entry;
+		bool		existed;
+
+		current = ancestor;
+		entry = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
+												   &current,
+												   HASH_ENTER,
+												   &existed);
+		if (existed)
+			entry->childLocks++;
+		else
+		{
+			/* first descendant seen under this ancestor */
+			entry->held = false;
+			entry->childLocks = 1;
+		}
+
+		if (entry->childLocks > MaxPredicateChildLocks(&current))
+		{
+			/*
+			 * This ancestor is over its threshold.  Keep climbing anyway,
+			 * both so that the remaining ancestors get their counts
+			 * updated and because a still coarser ancestor may qualify
+			 * too, in which case it replaces this choice.
+			 */
+			winner = current;
+			shouldPromote = true;
+		}
+	}
+
+	if (!shouldPromote)
+		return false;
+
+	/* acquire coarsest ancestor eligible for promotion */
+	PredicateLockAcquire(&winner);
+	return true;
+}
+
+/*
+ * DecrementParentLocks
+ *
+ * A lock on 'targettag' is being released: lower the child count of each of
+ * its ancestors in the local lock table, and discard ancestor entries that
+ * end up with no children and are not themselves held.
+ *
+ * This is called only when releasing a lock via
+ * DeleteChildTargetLocks (i.e. when a lock becomes redundant because
+ * we've acquired its parent, possibly due to promotion) or when a new
+ * MVCC write lock makes the predicate lock unnecessary. There's no
+ * point in calling it when locks are released at transaction end, as
+ * this information is no longer needed.
+ */
+static void
+DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag)
+{
+	PREDICATELOCKTARGETTAG current = *targettag;
+	PREDICATELOCKTARGETTAG ancestor;
+
+	while (GetParentPredicateLockTag(&current, &ancestor))
+	{
+		uint32		hashcode;
+		LOCALPREDICATELOCK *entry;
+
+		current = ancestor;
+		hashcode = PredicateLockTargetTagHashCode(&current);
+		entry = (LOCALPREDICATELOCK *)
+			hash_search_with_hash_value(LocalPredicateLockHash,
+										&current, hashcode,
+										HASH_FIND, NULL);
+
+		/*
+		 * The ancestor may be missing from the local table: an index split
+		 * can throw the child refcount off, leading to premature removal of
+		 * the entry.  Just move on to the next level in that case.
+		 */
+		if (entry == NULL)
+			continue;
+
+		entry->childLocks--;
+
+		/*
+		 * For the same reason the count may already have been zero.  That
+		 * only happens for a lock we hold (otherwise the entry would have
+		 * been removed), so clamp the count back to zero.
+		 */
+		if (entry->childLocks < 0)
+		{
+			Assert(entry->held);
+			entry->childLocks = 0;
+		}
+
+		/* An entry that is neither held nor a parent has no purpose left. */
+		if (!entry->held && entry->childLocks == 0)
+		{
+			LOCALPREDICATELOCK *removed PG_USED_FOR_ASSERTS_ONLY;
+
+			removed = (LOCALPREDICATELOCK *)
+				hash_search_with_hash_value(LocalPredicateLockHash,
+											&current, hashcode,
+											HASH_REMOVE, NULL);
+			Assert(removed == entry);
+		}
+	}
+}
+
+/*
+ * Indicate that a predicate lock on the given target is held by the
+ * specified transaction. Has no effect if the lock is already held.
+ *
+ * This updates the lock table and the sxact's lock list, and creates
+ * the lock target if necessary, but does *not* do anything related to
+ * granularity promotion or the local lock table. See
+ * PredicateLockAcquire for that.
+ *
+ * 'targettaghash' must be the hash code of 'targettag'; it selects the
+ * partition lock and is reused for both hash table operations.
+ *
+ * Locks are taken in the order SerializablePredicateListLock (shared), the
+ * sxact's perXactPredicateListLock (exclusive, only in parallel mode), then
+ * the target's hash partition lock (exclusive), and are released in the
+ * reverse order.
+ *
+ * Raises ERROR ("out of shared memory") if either the lock target or the
+ * lock itself cannot be entered into its shared hash table.
+ */
+static void
+CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
+					uint32 targettaghash,
+					SERIALIZABLEXACT *sxact)
+{
+	PREDICATELOCKTARGET *target;
+	PREDICATELOCKTAG locktag;
+	PREDICATELOCK *lock;
+	LWLock	   *partitionLock;
+	bool		found;
+
+	partitionLock = PredicateLockHashPartitionLock(targettaghash);
+
+	LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+	if (IsInParallelMode())
+		LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE);
+	LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+	/*
+	 * Make sure that the target is represented.  A NULL result from the
+	 * HASH_ENTER_NULL request is reported as running out of shared memory;
+	 * a newly created target starts with an empty lock list.
+	 */
+	target = (PREDICATELOCKTARGET *)
+		hash_search_with_hash_value(PredicateLockTargetHash,
+									targettag, targettaghash,
+									HASH_ENTER_NULL, &found);
+	if (!target)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of shared memory"),
+				 errhint("You might need to increase %s.", "max_pred_locks_per_transaction")));
+	if (!found)
+		dlist_init(&target->predicateLocks);
+
+	/*
+	 * We've got the sxact and target, make sure they're joined.  The lock is
+	 * keyed by the (target, sxact) pair.  Only a newly created lock is
+	 * linked into the target's and the sxact's lists and given an invalid
+	 * commitSeqNo; an existing one is left as it is.
+	 */
+	locktag.myTarget = target;
+	locktag.myXact = sxact;
+	lock = (PREDICATELOCK *)
+		hash_search_with_hash_value(PredicateLockHash, &locktag,
+									PredicateLockHashCodeFromTargetHashCode(&locktag, targettaghash),
+									HASH_ENTER_NULL, &found);
+	if (!lock)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of shared memory"),
+				 errhint("You might need to increase %s.", "max_pred_locks_per_transaction")));
+
+	if (!found)
+	{
+		dlist_push_tail(&target->predicateLocks, &lock->targetLink);
+		dlist_push_tail(&sxact->predicateLocks, &lock->xactLink);
+		lock->commitSeqNo = InvalidSerCommitSeqNo;
+	}
+
+	LWLockRelease(partitionLock);
+	if (IsInParallelMode())
+		LWLockRelease(&sxact->perXactPredicateListLock);
+	LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ * PredicateLockAcquire
+ *
+ * Acquire a predicate lock on 'targettag' for the current connection unless
+ * the local lock table already records that lock, or a coarser lock that
+ * covers it, as held.
+ *
+ * The lock is recorded in the local lock table and created in shared memory
+ * for MySerializableXact.  The local table then drives granularity
+ * promotion: if too many locks have accumulated under some ancestor, the
+ * request is consolidated into a coarser lock; otherwise any finer-grained
+ * locks covered by the new lock are released.
+ */
+static void
+PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag)
+{
+	uint32		hashcode;
+	bool		existed;
+	LOCALPREDICATELOCK *entry;
+
+	/* Nothing to do if we have the lock already, or a covering lock. */
+	if (PredicateLockExists(targettag) || CoarserLockCovers(targettag))
+		return;
+
+	/* the same hash and LW lock apply to the lock target and the local lock. */
+	hashcode = PredicateLockTargetTagHashCode(targettag);
+
+	/* Record the lock as held in the local table. */
+	entry = (LOCALPREDICATELOCK *)
+		hash_search_with_hash_value(LocalPredicateLockHash,
+									targettag, hashcode,
+									HASH_ENTER, &existed);
+	if (!existed)
+		entry->childLocks = 0;
+	entry->held = true;
+
+	/* Create the lock itself in the shared tables. */
+	CreatePredicateLock(targettag, hashcode, MySerializableXact);
+
+	/*
+	 * If the request got promoted, a coarser-granularity lock has been
+	 * acquired, and acquiring it deletes this lock along with any of its
+	 * children, so there is nothing more to do here.
+	 */
+	if (CheckAndPromotePredicateLockRequest(targettag))
+		return;
+
+	/*
+	 * No promotion: clean up any finer-granularity locks that the new lock
+	 * covers.  Tuple locks have nothing beneath them.
+	 */
+	if (GET_PREDICATELOCKTARGETTAG_TYPE(*targettag) != PREDLOCKTAG_TUPLE)
+		DeleteChildTargetLocks(targettag);
+}
+
+
+/*
+ *		PredicateLockRelation
+ *
+ * Take a predicate lock that covers the whole relation.
+ *
+ * Nothing happens when SerializationNeededForRead() reports that no
+ * predicate lock is needed for this relation and snapshot: skip if not in
+ * full serializable transaction isolation level, skip if this is a
+ * temporary table.
+ *
+ * Acquiring the relation lock clears any finer-grained predicate locks this
+ * session has on the relation.
+ */
+void
+PredicateLockRelation(Relation relation, Snapshot snapshot)
+{
+	PREDICATELOCKTARGETTAG reltag;
+
+	if (SerializationNeededForRead(relation, snapshot))
+	{
+		SET_PREDICATELOCKTARGETTAG_RELATION(reltag,
+											relation->rd_locator.dbOid,
+											relation->rd_id);
+		PredicateLockAcquire(&reltag);
+	}
+}
+
+/*
+ *		PredicateLockPage
+ *
+ * Take a predicate lock on one page (block 'blkno') of the relation.
+ *
+ * Nothing happens when SerializationNeededForRead() reports that no
+ * predicate lock is needed for this relation and snapshot: skip if not in
+ * full serializable transaction isolation level, skip if this is a
+ * temporary table.
+ *
+ * The request is also skipped when a coarser predicate lock already covers
+ * this page.  Otherwise, acquiring the lock clears any finer-grained
+ * predicate locks this session holds that the new lock covers.
+ */
+void
+PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
+{
+	PREDICATELOCKTARGETTAG pagetag;
+
+	if (SerializationNeededForRead(relation, snapshot))
+	{
+		SET_PREDICATELOCKTARGETTAG_PAGE(pagetag,
+										relation->rd_locator.dbOid,
+										relation->rd_id,
+										blkno);
+		PredicateLockAcquire(&pagetag);
+	}
+}
+
+/*
+ *		PredicateLockTID
+ *
+ * Take a predicate lock on the single tuple identified by 'tid'.
+ *
+ * Nothing happens when SerializationNeededForRead() reports that no
+ * predicate lock is needed for this relation and snapshot: skip if not in
+ * full serializable transaction isolation level, skip if this is a
+ * temporary table.
+ *
+ * 'tuple_xid' is checked against the current transaction when
+ * relation->rd_index is NULL; a tuple we wrote ourselves needs no predicate
+ * lock.
+ */
+void
+PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot,
+				 TransactionId tuple_xid)
+{
+	PREDICATELOCKTARGETTAG reltag;
+	PREDICATELOCKTARGETTAG tupletag;
+
+	if (!SerializationNeededForRead(relation, snapshot))
+		return;
+
+	/* If we wrote it; we already have a write lock. */
+	if (relation->rd_index == NULL &&
+		TransactionIdIsCurrentTransactionId(tuple_xid))
+		return;
+
+	/*
+	 * Cheap, non-definitive check for a relation-level lock.  It never
+	 * causes a return when the relation is *not* locked, but occasionally
+	 * lets us carry on even though a relation lock really *is* held.
+	 */
+	SET_PREDICATELOCKTARGETTAG_RELATION(reltag,
+										relation->rd_locator.dbOid,
+										relation->rd_id);
+	if (PredicateLockExists(&reltag))
+		return;
+
+	SET_PREDICATELOCKTARGETTAG_TUPLE(tupletag,
+									 relation->rd_locator.dbOid,
+									 relation->rd_id,
+									 ItemPointerGetBlockNumber(tid),
+									 ItemPointerGetOffsetNumber(tid));
+	PredicateLockAcquire(&tupletag);
+}
+
+
+/*
+ *		DeleteLockTarget
+ *
+ * Remove a predicate lock target along with any locks held for it.
+ *
+ * Caller must hold SerializablePredicateListLock and the
+ * appropriate hash partition lock for the target.
+ */
+static void
+DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash)
+{
+	dlist_mutable_iter iter;
+
+	Assert(LWLockHeldByMeInMode(SerializablePredicateListLock,
+								LW_EXCLUSIVE));
+	Assert(LWLockHeldByMe(PredicateLockHashPartitionLock(targettaghash)));
+
+	LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+	dlist_foreach_modify(iter, &target->predicateLocks)
+	{
+		PREDICATELOCK *predlock =
+			dlist_container(PREDICATELOCK, targetLink, iter.cur);
+		bool		found;
+
+		dlist_delete(&(predlock->xactLink));
+		dlist_delete(&(predlock->targetLink));
+
+		hash_search_with_hash_value
+			(PredicateLockHash,
+			 &predlock->tag,
+			 PredicateLockHashCodeFromTargetHashCode(&predlock->tag,
+													 targettaghash),
+			 HASH_REMOVE, &found);
+		Assert(found);
+	}
+	LWLockRelease(SerializableXactHashLock);
+
+	/* Remove the target itself, if possible. */
+	RemoveTargetIfNoLongerUsed(target, targettaghash);
+}
+
+
+/*
+ *		TransferPredicateLocksToNewTarget
+ *
+ * Move or copy all the predicate locks for a lock target, for use by
+ * index page splits/combines and other things that create or replace
+ * lock targets. If 'removeOld' is true, the old locks and the target
+ * will be removed.
+ *
+ * 'oldtargettag' identifies the existing target whose locks are transferred
+ * and 'newtargettag' the target that receives them; both are passed by
+ * value.  If no target exists for 'oldtargettag', no predicate locks are
+ * changed and the function returns true.
+ *
+ * Returns true on success, or false if we ran out of shared memory to
+ * allocate the new target or locks. Guaranteed to always succeed if
+ * removeOld is set (by using the scratch entry in PredicateLockTargetHash
+ * for scratch space).
+ *
+ * Warning: the "removeOld" option should be used only with care,
+ * because this function does not (indeed, can not) update other
+ * backends' LocalPredicateLockHash. If we are only adding new
+ * entries, this is not a problem: the local lock table is used only
+ * as a hint, so missing entries for locks that are held are
+ * OK. Having entries for locks that are no longer held, as can happen
+ * when using "removeOld", is not in general OK. We can only use it
+ * safely when replacing a lock with a coarser-granularity lock that
+ * covers it, or if we are absolutely certain that no one will need to
+ * refer to that lock in the future.
+ *
+ * Caller must hold SerializablePredicateListLock exclusively.
+ */
+static bool
+TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag,
+								  PREDICATELOCKTARGETTAG newtargettag,
+								  bool removeOld)
+{
+	uint32		oldtargettaghash;
+	LWLock	   *oldpartitionLock;
+	PREDICATELOCKTARGET *oldtarget;
+	uint32		newtargettaghash;
+	LWLock	   *newpartitionLock;
+	bool		found;
+	bool		outOfShmem = false;
+
+	Assert(LWLockHeldByMeInMode(SerializablePredicateListLock,
+								LW_EXCLUSIVE));
+
+	oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
+	newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag);
+	oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+	newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash);
+
+	if (removeOld)
+	{
+		/*
+		 * Remove the dummy entry to give us scratch space, so we know we'll
+		 * be able to create the new lock target.  It is put back just
+		 * before returning.
+		 */
+		RemoveScratchTarget(false);
+	}
+
+	/*
+	 * We must get the partition locks in ascending sequence to avoid
+	 * deadlocks. If old and new partitions are the same, we must request the
+	 * lock only once.  The new partition is always locked exclusively; the
+	 * old one only needs exclusive mode when its locks are being removed.
+	 */
+	if (oldpartitionLock < newpartitionLock)
+	{
+		LWLockAcquire(oldpartitionLock,
+					  (removeOld ? LW_EXCLUSIVE : LW_SHARED));
+		LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+	}
+	else if (oldpartitionLock > newpartitionLock)
+	{
+		LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+		LWLockAcquire(oldpartitionLock,
+					  (removeOld ? LW_EXCLUSIVE : LW_SHARED));
+	}
+	else
+		LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+
+	/*
+	 * Look for the old target.  If not found, that's OK; no predicate locks
+	 * are affected, so we can just clean up and return. If it does exist,
+	 * walk its list of predicate locks and move or copy them to the new
+	 * target.
+	 */
+	oldtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+											&oldtargettag,
+											oldtargettaghash,
+											HASH_FIND, NULL);
+
+	if (oldtarget)
+	{
+		PREDICATELOCKTARGET *newtarget;
+		PREDICATELOCKTAG newpredlocktag;
+		dlist_mutable_iter iter;
+
+		newtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+												&newtargettag,
+												newtargettaghash,
+												HASH_ENTER_NULL, &found);
+
+		if (!newtarget)
+		{
+			/* Failed to allocate due to insufficient shmem */
+			outOfShmem = true;
+			goto exit;
+		}
+
+		/* If we created a new entry, initialize it */
+		if (!found)
+			dlist_init(&newtarget->predicateLocks);
+
+		newpredlocktag.myTarget = newtarget;
+
+		/*
+		 * Loop through all the locks on the old target, replacing them with
+		 * locks on the new target.  A lock that the same transaction already
+		 * holds on the new target is kept, with its commitSeqNo raised to
+		 * the old lock's value if that one is larger.
+		 */
+		LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+		dlist_foreach_modify(iter, &oldtarget->predicateLocks)
+		{
+			PREDICATELOCK *oldpredlock =
+				dlist_container(PREDICATELOCK, targetLink, iter.cur);
+			PREDICATELOCK *newpredlock;
+			SerCommitSeqNo oldCommitSeqNo = oldpredlock->commitSeqNo;
+
+			newpredlocktag.myXact = oldpredlock->tag.myXact;
+
+			if (removeOld)
+			{
+				dlist_delete(&(oldpredlock->xactLink));
+				dlist_delete(&(oldpredlock->targetLink));
+
+				hash_search_with_hash_value
+					(PredicateLockHash,
+					 &oldpredlock->tag,
+					 PredicateLockHashCodeFromTargetHashCode(&oldpredlock->tag,
+															 oldtargettaghash),
+					 HASH_REMOVE, &found);
+				Assert(found);
+			}
+
+			newpredlock = (PREDICATELOCK *)
+				hash_search_with_hash_value(PredicateLockHash,
+											&newpredlocktag,
+											PredicateLockHashCodeFromTargetHashCode(&newpredlocktag,
+																					newtargettaghash),
+											HASH_ENTER_NULL,
+											&found);
+			if (!newpredlock)
+			{
+				/*
+				 * Out of shared memory. Undo what we've done so far.
+				 * SerializableXactHashLock is released first because
+				 * DeleteLockTarget acquires it itself.
+				 */
+				LWLockRelease(SerializableXactHashLock);
+				DeleteLockTarget(newtarget, newtargettaghash);
+				outOfShmem = true;
+				goto exit;
+			}
+			if (!found)
+			{
+				dlist_push_tail(&(newtarget->predicateLocks),
+								&(newpredlock->targetLink));
+				dlist_push_tail(&(newpredlocktag.myXact->predicateLocks),
+								&(newpredlock->xactLink));
+				newpredlock->commitSeqNo = oldCommitSeqNo;
+			}
+			else
+			{
+				if (newpredlock->commitSeqNo < oldCommitSeqNo)
+					newpredlock->commitSeqNo = oldCommitSeqNo;
+			}
+
+			Assert(newpredlock->commitSeqNo != 0);
+			Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo)
+				   || (newpredlock->tag.myXact == OldCommittedSxact));
+		}
+		LWLockRelease(SerializableXactHashLock);
+
+		if (removeOld)
+		{
+			Assert(dlist_is_empty(&oldtarget->predicateLocks));
+			RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash);
+		}
+	}
+
+
+exit:
+	/* Release partition locks in reverse order of acquisition. */
+	if (oldpartitionLock < newpartitionLock)
+	{
+		LWLockRelease(newpartitionLock);
+		LWLockRelease(oldpartitionLock);
+	}
+	else if (oldpartitionLock > newpartitionLock)
+	{
+		LWLockRelease(oldpartitionLock);
+		LWLockRelease(newpartitionLock);
+	}
+	else
+		LWLockRelease(newpartitionLock);
+
+	if (removeOld)
+	{
+		/* We shouldn't run out of memory if we're moving locks */
+		Assert(!outOfShmem);
+
+		/* Put the scratch entry back */
+		RestoreScratchTarget(false);
+	}
+
+	return !outOfShmem;
+}
+
+/*
+ * Drop all predicate locks of any granularity from the specified relation,
+ * which can be a heap relation or an index relation.  If 'transfer' is true,
+ * acquire a relation lock on the heap for any transactions with any lock(s)
+ * on the specified relation.
+ *
+ * Parameters:
+ *	relation - heap or index relation whose predicate locks are dropped.
+ *	transfer - if true, every dropped lock is replaced (for the same
+ *			   transaction) by a relation-level lock on the heap, i.e. on
+ *			   the relation itself or, for an index, on its indrelid; if
+ *			   false, the locks and their targets are simply removed.
+ *
+ * This requires grabbing a lot of LW locks and scanning the entire lock
+ * target table for matches.  That makes this more expensive than most
+ * predicate lock management functions, but it will only be called for DDL
+ * type commands that are expensive anyway, and there are fast returns when
+ * no serializable transactions are active or the relation is temporary.
+ *
+ * We don't use the TransferPredicateLocksToNewTarget function because it
+ * acquires its own locks on the partitions of the two targets involved,
+ * and we'll already be holding all partition locks.
+ *
+ * We can't throw an error from here, because the call could be from a
+ * transaction which is not serializable.
+ *
+ * NOTE: This is currently only called with transfer set to true, but that may
+ * change.  If we decide to clean up the locks from a table on commit of a
+ * transaction which executed DROP TABLE, the false condition will be useful.
+ */
+static void
+DropAllPredicateLocksFromTable(Relation relation, bool transfer)
+{
+	HASH_SEQ_STATUS seqstat;
+	PREDICATELOCKTARGET *oldtarget;
+	PREDICATELOCKTARGET *heaptarget;
+	Oid			dbId;
+	Oid			relId;
+	Oid			heapId;
+	int			i;
+	bool		isIndex;
+	bool		found;
+	uint32		heaptargettaghash;
+
+	/*
+	 * Bail out quickly if there are no serializable transactions running.
+	 * It's safe to check this without taking locks because the caller is
+	 * holding an ACCESS EXCLUSIVE lock on the relation.  No new locks which
+	 * would matter here can be acquired while that is held.
+	 */
+	if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
+		return;
+
+	if (!PredicateLockingNeededForRelation(relation))
+		return;
+
+	/*
+	 * Work out which relation the locks live on (relId) and which heap they
+	 * should end up on (heapId).  For a heap these are the same; for an index
+	 * the heap is the indexed table.
+	 */
+	dbId = relation->rd_locator.dbOid;
+	relId = relation->rd_id;
+	if (relation->rd_index == NULL)
+	{
+		isIndex = false;
+		heapId = relId;
+	}
+	else
+	{
+		isIndex = true;
+		heapId = relation->rd_index->indrelid;
+	}
+	Assert(heapId != InvalidOid);
+	Assert(transfer || !isIndex);	/* index OID only makes sense with
+									 * transfer */
+
+	/* Retrieve first time needed, then keep. */
+	heaptargettaghash = 0;
+	heaptarget = NULL;
+
+	/* Acquire locks on all lock partitions */
+	LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
+	for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
+		LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_EXCLUSIVE);
+	LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+	/*
+	 * Remove the dummy entry to give us scratch space, so we know we'll be
+	 * able to create the new lock target.
+	 */
+	if (transfer)
+		RemoveScratchTarget(true);
+
+	/* Scan through target map */
+	hash_seq_init(&seqstat, PredicateLockTargetHash);
+
+	while ((oldtarget = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
+	{
+		dlist_mutable_iter iter;
+
+		/*
+		 * Check whether this is a target which needs attention.
+		 */
+		if (GET_PREDICATELOCKTARGETTAG_RELATION(oldtarget->tag) != relId)
+			continue;			/* wrong relation id */
+		if (GET_PREDICATELOCKTARGETTAG_DB(oldtarget->tag) != dbId)
+			continue;			/* wrong database id */
+		if (transfer && !isIndex
+			&& GET_PREDICATELOCKTARGETTAG_TYPE(oldtarget->tag) == PREDLOCKTAG_RELATION)
+			continue;			/* already the right lock */
+
+		/*
+		 * If we made it here, we have work to do.  We make sure the heap
+		 * relation lock exists, then we walk the list of predicate locks for
+		 * the old target we found, moving all locks to the heap relation lock
+		 * -- unless they already hold that.
+		 */
+
+		/*
+		 * First make sure we have the heap relation target.  We only need to
+		 * do this once.
+		 */
+		if (transfer && heaptarget == NULL)
+		{
+			PREDICATELOCKTARGETTAG heaptargettag;
+
+			SET_PREDICATELOCKTARGETTAG_RELATION(heaptargettag, dbId, heapId);
+			heaptargettaghash = PredicateLockTargetTagHashCode(&heaptargettag);
+			heaptarget = hash_search_with_hash_value(PredicateLockTargetHash,
+													 &heaptargettag,
+													 heaptargettaghash,
+													 HASH_ENTER, &found);
+			/* A freshly created target starts with an empty lock list. */
+			if (!found)
+				dlist_init(&heaptarget->predicateLocks);
+		}
+
+		/*
+		 * Loop through all the locks on the old target, replacing them with
+		 * locks on the new target.
+		 */
+		dlist_foreach_modify(iter, &oldtarget->predicateLocks)
+		{
+			PREDICATELOCK *oldpredlock =
+				dlist_container(PREDICATELOCK, targetLink, iter.cur);
+			PREDICATELOCK *newpredlock;
+			SerCommitSeqNo oldCommitSeqNo;
+			SERIALIZABLEXACT *oldXact;
+
+			/*
+			 * Remove the old lock first. This avoids the chance of running
+			 * out of lock structure entries for the hash table.
+			 */
+			oldCommitSeqNo = oldpredlock->commitSeqNo;
+			oldXact = oldpredlock->tag.myXact;
+
+			dlist_delete(&(oldpredlock->xactLink));
+
+			/*
+			 * No need for retail delete from oldtarget list, we're removing
+			 * the whole target anyway.
+			 */
+			hash_search(PredicateLockHash,
+						&oldpredlock->tag,
+						HASH_REMOVE, &found);
+			Assert(found);
+
+			if (transfer)
+			{
+				PREDICATELOCKTAG newpredlocktag;
+
+				/* Same transaction, but now locking the heap relation. */
+				newpredlocktag.myTarget = heaptarget;
+				newpredlocktag.myXact = oldXact;
+				newpredlock = (PREDICATELOCK *)
+					hash_search_with_hash_value(PredicateLockHash,
+												&newpredlocktag,
+												PredicateLockHashCodeFromTargetHashCode(&newpredlocktag,
+																						heaptargettaghash),
+												HASH_ENTER,
+												&found);
+				if (!found)
+				{
+					/* New lock: link it into both lists. */
+					dlist_push_tail(&(heaptarget->predicateLocks),
+									&(newpredlock->targetLink));
+					dlist_push_tail(&(newpredlocktag.myXact->predicateLocks),
+									&(newpredlock->xactLink));
+					newpredlock->commitSeqNo = oldCommitSeqNo;
+				}
+				else
+				{
+					/*
+					 * The transaction already held the heap relation lock;
+					 * keep the larger of the two commitSeqNo values.
+					 */
+					if (newpredlock->commitSeqNo < oldCommitSeqNo)
+						newpredlock->commitSeqNo = oldCommitSeqNo;
+				}
+
+				Assert(newpredlock->commitSeqNo != 0);
+				Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo)
+					   || (newpredlock->tag.myXact == OldCommittedSxact));
+			}
+		}
+
+		/* Every lock on this target has been handled; drop the target. */
+		hash_search(PredicateLockTargetHash, &oldtarget->tag, HASH_REMOVE,
+					&found);
+		Assert(found);
+	}
+
+	/* Put the scratch entry back */
+	if (transfer)
+		RestoreScratchTarget(true);
+
+	/* Release locks in reverse order */
+	LWLockRelease(SerializableXactHashLock);
+	for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
+		LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
+	LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ * TransferPredicateLocksToHeapRelation
+ *		For all transactions, transfer all predicate locks for the given
+ *		relation to a single relation lock on the heap.
+ *
+ * This simply calls DropAllPredicateLocksFromTable() with 'transfer' set to
+ * true; see that function for the locking it performs and for its fast
+ * exits.
+ */
+void
+TransferPredicateLocksToHeapRelation(Relation relation)
+{
+	DropAllPredicateLocksFromTable(relation, true);
+}
+
+
+/*
+ *		PredicateLockPageSplit
+ *
+ * Make every predicate lock held on page 'oldblkno' of 'relation' also
+ * cover page 'newblkno'.  The locks on the old page are kept.
+ * Skip if this is a temporary table or toast table.
+ *
+ * NOTE: Splitting (or overflowing) a page matters to every serializable
+ * transaction, whatever isolation level the splitting transaction itself
+ * runs at.
+ *
+ * NOTE: Only shared memory is updated; the lock holders' backend-local lock
+ * tables are not told about the new lock.  That could become a problem if
+ * locked pages are split often enough without the holding processes getting
+ * in and noticing.
+ */
+void
+PredicateLockPageSplit(Relation relation, BlockNumber oldblkno,
+					   BlockNumber newblkno)
+{
+	PREDICATELOCKTARGETTAG srctag;
+	PREDICATELOCKTARGETTAG dsttag;
+	Oid			dbId;
+	Oid			relId;
+	bool		transferred;
+
+	/*
+	 * Nothing to do when no serializable transaction is running, or when
+	 * this relation never takes predicate locks.
+	 *
+	 * The first test needs no extra locking.  The caller holds the buffer
+	 * page lock for the page being split, so a serializable transaction that
+	 * starts concurrently cannot take SIREAD locks on it.  The memory
+	 * barrier implied by that LWLock acquisition also guarantees this read
+	 * happens while the buffer page lock is held, so memory reordering is
+	 * not a concern.
+	 */
+	if (!TransactionIdIsValid(PredXact->SxactGlobalXmin) ||
+		!PredicateLockingNeededForRelation(relation))
+		return;
+
+	Assert(oldblkno != newblkno);
+	Assert(BlockNumberIsValid(oldblkno));
+	Assert(BlockNumberIsValid(newblkno));
+
+	/* Build the page-level target tags for both pages. */
+	dbId = relation->rd_locator.dbOid;
+	relId = relation->rd_id;
+	SET_PREDICATELOCKTARGETTAG_PAGE(srctag, dbId, relId, oldblkno);
+	SET_PREDICATELOCKTARGETTAG_PAGE(dsttag, dbId, relId, newblkno);
+
+	LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
+
+	/*
+	 * First attempt: copy the locks to the new page's target, creating that
+	 * target if it does not exist yet.
+	 */
+	transferred = TransferPredicateLocksToNewTarget(srctag, dsttag, false);
+
+	if (!transferred)
+	{
+		/*
+		 * We ran out of predicate lock entries.  Since we are not allowed to
+		 * fail, fall back to promoting the page lock to a lock on its parent
+		 * (the relation).
+		 */
+
+		/* Replace the destination tag with the parent's tag. */
+		transferred = GetParentPredicateLockTag(&srctag, &dsttag);
+		Assert(transferred);
+
+		/*
+		 * Move (rather than copy) the locks to the parent; this is not
+		 * expected to fail.
+		 *
+		 * Doing so removes locks that belong to other backends, so their
+		 * local lock hash tables may now disagree with shared memory.  That
+		 * is acceptable because the replacement lock covers the old one.
+		 */
+		transferred = TransferPredicateLocksToNewTarget(srctag, dsttag, true);
+		Assert(transferred);
+	}
+
+	LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ *		PredicateLockPageCombine
+ *
+ * Combines predicate locks for two existing pages.
+ * Skip if this is a temporary table or toast table.
+ *
+ * The arguments are passed unchanged to PredicateLockPageSplit(): locks held
+ * on 'oldblkno' of 'relation' are also applied to 'newblkno'.
+ *
+ * NOTE: A page combine affects all serializable transactions, even if it
+ * occurs in the context of another transaction isolation level.
+ */
+void
+PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
+						 BlockNumber newblkno)
+{
+	/*
+	 * Page combines differ from page splits in that we ought to be able to
+	 * remove the locks on the old page after transferring them to the new
+	 * page, instead of duplicating them. However, because we can't edit other
+	 * backends' local lock tables, removing the old lock would leave them
+	 * with an entry in their LocalPredicateLockHash for a lock they're not
+	 * holding, which isn't acceptable. So we wind up having to do the same
+	 * work as a page split, acquiring a lock on the new page and keeping the
+	 * old page locked too. That can lead to some false positives, but should
+	 * be rare in practice.
+	 */
+	PredicateLockPageSplit(relation, oldblkno, newblkno);
+}
+
+/*
+ * Recompute PredXact->SxactGlobalXmin and PredXact->SxactGlobalXminCount by
+ * scanning the active serializable transactions.
+ *
+ * Transactions that are rolled back or committed, and the OldCommittedSxact
+ * dummy, are ignored.  Among the rest, the oldest xmin becomes the new
+ * global xmin and the count records how many transactions share it.  If no
+ * transaction qualifies, the global xmin is left invalid with a count of
+ * zero.  The result is then passed to SerialSetActiveSerXmin().
+ *
+ * The caller must hold SerializableXactHashLock.
+ */
+static void
+SetNewSxactGlobalXmin(void)
+{
+	dlist_iter	cursor;
+
+	Assert(LWLockHeldByMe(SerializableXactHashLock));
+
+	/* Start from "nothing found" and build the answer up as we scan. */
+	PredXact->SxactGlobalXmin = InvalidTransactionId;
+	PredXact->SxactGlobalXminCount = 0;
+
+	dlist_foreach(cursor, &PredXact->activeList)
+	{
+		SERIALIZABLEXACT *candidate =
+			dlist_container(SERIALIZABLEXACT, xactLink, cursor.cur);
+
+		/* Finished transactions and the dummy sxact don't hold back xmin. */
+		if (SxactIsRolledBack(candidate)
+			|| SxactIsCommitted(candidate)
+			|| candidate == OldCommittedSxact)
+			continue;
+
+		Assert(candidate->xmin != InvalidTransactionId);
+
+		if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)
+			|| TransactionIdPrecedes(candidate->xmin,
+									 PredXact->SxactGlobalXmin))
+		{
+			/* First candidate, or a strictly older xmin: restart the count. */
+			PredXact->SxactGlobalXmin = candidate->xmin;
+			PredXact->SxactGlobalXminCount = 1;
+		}
+		else if (TransactionIdEquals(candidate->xmin,
+									 PredXact->SxactGlobalXmin))
+		{
+			/* Another holder of the current oldest xmin. */
+			PredXact->SxactGlobalXminCount++;
+		}
+	}
+
+	SerialSetActiveSerXmin(PredXact->SxactGlobalXmin);
+}
+
+/*
+ *		ReleasePredicateLocks
+ *
+ * Releases predicate locks based on completion of the current transaction,
+ * whether committed or rolled back.  It can also be called for a read only
+ * transaction when it becomes impossible for the transaction to become
+ * part of a dangerous structure.
+ *
+ * We do nothing unless this is a serializable transaction.
+ *
+ * This method must ensure that shared memory hash tables are cleaned
+ * up in some relatively timely fashion.
+ *
+ * If this transaction is committing and is holding any predicate locks,
+ * it must be added to a list of completed serializable transactions still
+ * holding locks.
+ *
+ * If isReadOnlySafe is true, then predicate locks are being released before
+ * the end of the transaction because MySerializableXact has been determined
+ * to be RO_SAFE.  In non-parallel mode we can release it completely, but in
+ * parallel mode we partially release the SERIALIZABLEXACT and keep it
+ * around until the end of the transaction, allowing each backend to clear its
+ * MySerializableXact variable and benefit from the optimization in its own
+ * time.
+ *
+ * Parameters:
+ *	isCommit - true when the transaction is committing.  It is treated as
+ *			   false if the SERIALIZABLEXACT was already partially released.
+ *	isReadOnlySafe - true when called before end of transaction because the
+ *			   transaction was found to be RO_SAFE.  Must not be true
+ *			   together with isCommit.
+ *
+ * On every return path the backend-local state has been reset, either
+ * because it was already clear or via ReleasePredicateLocksLocal().
+ */
+void
+ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe)
+{
+	bool		partiallyReleasing = false;
+	bool		needToClear;
+	SERIALIZABLEXACT *roXact;
+	dlist_mutable_iter iter;
+
+	/*
+	 * We can't trust XactReadOnly here, because a transaction which started
+	 * as READ WRITE can show as READ ONLY later, e.g., within
+	 * subtransactions.  We want to flag a transaction as READ ONLY if it
+	 * commits without writing so that de facto READ ONLY transactions get the
+	 * benefit of some RO optimizations, so we will use this local variable to
+	 * get some cleanup logic right which is based on whether the transaction
+	 * was declared READ ONLY at the top level.
+	 */
+	bool		topLevelIsDeclaredReadOnly;
+
+	/* We can't be both committing and releasing early due to RO_SAFE. */
+	Assert(!(isCommit && isReadOnlySafe));
+
+	/* Are we at the end of a transaction, that is, a commit or abort? */
+	if (!isReadOnlySafe)
+	{
+		/*
+		 * Parallel workers mustn't release predicate locks at the end of
+		 * their transaction.  The leader will do that at the end of its
+		 * transaction.
+		 */
+		if (IsParallelWorker())
+		{
+			ReleasePredicateLocksLocal();
+			return;
+		}
+
+		/*
+		 * By the time the leader in a parallel query reaches end of
+		 * transaction, it has waited for all workers to exit.
+		 */
+		Assert(!ParallelContextActive());
+
+		/*
+		 * If the leader in a parallel query earlier stashed a partially
+		 * released SERIALIZABLEXACT for final clean-up at end of transaction
+		 * (because workers might still have been accessing it), then it's
+		 * time to restore it.
+		 */
+		if (SavedSerializableXact != InvalidSerializableXact)
+		{
+			Assert(MySerializableXact == InvalidSerializableXact);
+			MySerializableXact = SavedSerializableXact;
+			SavedSerializableXact = InvalidSerializableXact;
+			Assert(SxactIsPartiallyReleased(MySerializableXact));
+		}
+	}
+
+	/* Not a serializable transaction (or already released): nothing to do. */
+	if (MySerializableXact == InvalidSerializableXact)
+	{
+		Assert(LocalPredicateLockHash == NULL);
+		return;
+	}
+
+	LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+	/*
+	 * If the transaction is committing, but it has been partially released
+	 * already, then treat this as a roll back.  It was marked as rolled back.
+	 */
+	if (isCommit && SxactIsPartiallyReleased(MySerializableXact))
+		isCommit = false;
+
+	/*
+	 * If we're called in the middle of a transaction because we discovered
+	 * that the SXACT_FLAG_RO_SAFE flag was set, then we'll partially release
+	 * it (that is, release the predicate locks and conflicts, but not the
+	 * SERIALIZABLEXACT itself) if we're the first backend to have noticed.
+	 */
+	if (isReadOnlySafe && IsInParallelMode())
+	{
+		/*
+		 * The leader needs to stash a pointer to it, so that it can
+		 * completely release it at end-of-transaction.
+		 */
+		if (!IsParallelWorker())
+			SavedSerializableXact = MySerializableXact;
+
+		/*
+		 * The first backend to reach this condition will partially release
+		 * the SERIALIZABLEXACT.  All others will just clear their
+		 * backend-local state so that they stop doing SSI checks for the rest
+		 * of the transaction.
+		 */
+		if (SxactIsPartiallyReleased(MySerializableXact))
+		{
+			LWLockRelease(SerializableXactHashLock);
+			ReleasePredicateLocksLocal();
+			return;
+		}
+		else
+		{
+			MySerializableXact->flags |= SXACT_FLAG_PARTIALLY_RELEASED;
+			partiallyReleasing = true;
+			/* ... and proceed to perform the partial release below. */
+		}
+	}
+	Assert(!isCommit || SxactIsPrepared(MySerializableXact));
+	Assert(!isCommit || !SxactIsDoomed(MySerializableXact));
+	Assert(!SxactIsCommitted(MySerializableXact));
+	Assert(SxactIsPartiallyReleased(MySerializableXact)
+		   || !SxactIsRolledBack(MySerializableXact));
+
+	/* may not be serializable during COMMIT/ROLLBACK PREPARED */
+	Assert(MySerializableXact->pid == 0 || IsolationIsSerializable());
+
+	/* We'd better not already be on the cleanup list. */
+	Assert(!SxactIsOnFinishedList(MySerializableXact));
+
+	/*
+	 * Capture this before the commit path below can set
+	 * SXACT_FLAG_READ_ONLY for a transaction that merely did no writes.
+	 */
+	topLevelIsDeclaredReadOnly = SxactIsReadOnly(MySerializableXact);
+
+	/*
+	 * We don't hold XidGenLock lock here, assuming that TransactionId is
+	 * atomic!
+	 *
+	 * If this value is changing, we don't care that much whether we get the
+	 * old or new value -- it is just used to determine how far
+	 * SxactGlobalXmin must advance before this transaction can be fully
+	 * cleaned up.  The worst that could happen is we wait for one more
+	 * transaction to complete before freeing some RAM; correctness of visible
+	 * behavior is not affected.
+	 */
+	MySerializableXact->finishedBefore = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+
+	/*
+	 * If it's not a commit it's either a rollback or a read-only transaction
+	 * flagged SXACT_FLAG_RO_SAFE, and we can clear our locks immediately.
+	 */
+	if (isCommit)
+	{
+		MySerializableXact->flags |= SXACT_FLAG_COMMITTED;
+		MySerializableXact->commitSeqNo = ++(PredXact->LastSxactCommitSeqNo);
+		/* Recognize implicit read-only transaction (commit without write). */
+		if (!MyXactDidWrite)
+			MySerializableXact->flags |= SXACT_FLAG_READ_ONLY;
+	}
+	else
+	{
+		/*
+		 * The DOOMED flag indicates that we intend to roll back this
+		 * transaction and so it should not cause serialization failures for
+		 * other transactions that conflict with it. Note that this flag might
+		 * already be set, if another backend marked this transaction for
+		 * abort.
+		 *
+		 * The ROLLED_BACK flag further indicates that ReleasePredicateLocks
+		 * has been called, and so the SerializableXact is eligible for
+		 * cleanup. This means it should not be considered when calculating
+		 * SxactGlobalXmin.
+		 */
+		MySerializableXact->flags |= SXACT_FLAG_DOOMED;
+		MySerializableXact->flags |= SXACT_FLAG_ROLLED_BACK;
+
+		/*
+		 * If the transaction was previously prepared, but is now failing due
+		 * to a ROLLBACK PREPARED or (hopefully very rare) error after the
+		 * prepare, clear the prepared flag.  This simplifies conflict
+		 * checking.
+		 */
+		MySerializableXact->flags &= ~SXACT_FLAG_PREPARED;
+	}
+
+	if (!topLevelIsDeclaredReadOnly)
+	{
+		Assert(PredXact->WritableSxactCount > 0);
+		if (--(PredXact->WritableSxactCount) == 0)
+		{
+			/*
+			 * Release predicate locks and rw-conflicts in for all committed
+			 * transactions.  There are no longer any transactions which might
+			 * conflict with the locks and no chance for new transactions to
+			 * overlap.  Similarly, existing conflicts in can't cause pivots,
+			 * and any conflicts in which could have completed a dangerous
+			 * structure would already have caused a rollback, so any
+			 * remaining ones must be benign.
+			 */
+			PredXact->CanPartialClearThrough = PredXact->LastSxactCommitSeqNo;
+		}
+	}
+	else
+	{
+		/*
+		 * Read-only transactions: clear the list of transactions that might
+		 * make us unsafe. Note that we use 'inLink' for the iteration as
+		 * opposed to 'outLink' for the r/w xacts.
+		 */
+		dlist_foreach_modify(iter, &MySerializableXact->possibleUnsafeConflicts)
+		{
+			RWConflict	possibleUnsafeConflict =
+				dlist_container(RWConflictData, inLink, iter.cur);
+
+			Assert(!SxactIsReadOnly(possibleUnsafeConflict->sxactOut));
+			Assert(MySerializableXact == possibleUnsafeConflict->sxactIn);
+
+			ReleaseRWConflict(possibleUnsafeConflict);
+		}
+	}
+
+	/* Check for conflict out to old committed transactions. */
+	if (isCommit
+		&& !SxactIsReadOnly(MySerializableXact)
+		&& SxactHasSummaryConflictOut(MySerializableXact))
+	{
+		/*
+		 * we don't know which old committed transaction we conflicted with,
+		 * so be conservative and use FirstNormalSerCommitSeqNo here
+		 */
+		MySerializableXact->SeqNo.earliestOutConflictCommit =
+			FirstNormalSerCommitSeqNo;
+		MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT;
+	}
+
+	/*
+	 * Release all outConflicts to committed transactions.  If we're rolling
+	 * back clear them all.  Set SXACT_FLAG_CONFLICT_OUT if any point to
+	 * previously committed transactions.
+	 */
+	dlist_foreach_modify(iter, &MySerializableXact->outConflicts)
+	{
+		RWConflict	conflict =
+			dlist_container(RWConflictData, outLink, iter.cur);
+
+		if (isCommit
+			&& !SxactIsReadOnly(MySerializableXact)
+			&& SxactIsCommitted(conflict->sxactIn))
+		{
+			/* Track the smallest prepareSeqNo among committed targets. */
+			if ((MySerializableXact->flags & SXACT_FLAG_CONFLICT_OUT) == 0
+				|| conflict->sxactIn->prepareSeqNo < MySerializableXact->SeqNo.earliestOutConflictCommit)
+				MySerializableXact->SeqNo.earliestOutConflictCommit = conflict->sxactIn->prepareSeqNo;
+			MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT;
+		}
+
+		if (!isCommit
+			|| SxactIsCommitted(conflict->sxactIn)
+			|| (conflict->sxactIn->SeqNo.lastCommitBeforeSnapshot >= PredXact->LastSxactCommitSeqNo))
+			ReleaseRWConflict(conflict);
+	}
+
+	/*
+	 * Release all inConflicts from committed and read-only transactions. If
+	 * we're rolling back, clear them all.
+	 */
+	dlist_foreach_modify(iter, &MySerializableXact->inConflicts)
+	{
+		RWConflict	conflict =
+			dlist_container(RWConflictData, inLink, iter.cur);
+
+		if (!isCommit
+			|| SxactIsCommitted(conflict->sxactOut)
+			|| SxactIsReadOnly(conflict->sxactOut))
+			ReleaseRWConflict(conflict);
+	}
+
+	if (!topLevelIsDeclaredReadOnly)
+	{
+		/*
+		 * Remove ourselves from the list of possible conflicts for concurrent
+		 * READ ONLY transactions, flagging them as unsafe if we have a
+		 * conflict out. If any are waiting DEFERRABLE transactions, wake them
+		 * up if they are known safe or known unsafe.
+		 */
+		dlist_foreach_modify(iter, &MySerializableXact->possibleUnsafeConflicts)
+		{
+			RWConflict	possibleUnsafeConflict =
+				dlist_container(RWConflictData, outLink, iter.cur);
+
+			roXact = possibleUnsafeConflict->sxactIn;
+			Assert(MySerializableXact == possibleUnsafeConflict->sxactOut);
+			Assert(SxactIsReadOnly(roXact));
+
+			/* Mark conflicted if necessary. */
+			if (isCommit
+				&& MyXactDidWrite
+				&& SxactHasConflictOut(MySerializableXact)
+				&& (MySerializableXact->SeqNo.earliestOutConflictCommit
+					<= roXact->SeqNo.lastCommitBeforeSnapshot))
+			{
+				/*
+				 * This releases possibleUnsafeConflict (as well as all other
+				 * possible conflicts for roXact)
+				 */
+				FlagSxactUnsafe(roXact);
+			}
+			else
+			{
+				ReleaseRWConflict(possibleUnsafeConflict);
+
+				/*
+				 * If we were the last possible conflict, flag it safe. The
+				 * transaction can now safely release its predicate locks (but
+				 * that transaction's backend has to do that itself).
+				 */
+				if (dlist_is_empty(&roXact->possibleUnsafeConflicts))
+					roXact->flags |= SXACT_FLAG_RO_SAFE;
+			}
+
+			/*
+			 * Wake up the process for a waiting DEFERRABLE transaction if we
+			 * now know it's either safe or conflicted.
+			 */
+			if (SxactIsDeferrableWaiting(roXact) &&
+				(SxactIsROUnsafe(roXact) || SxactIsROSafe(roXact)))
+				ProcSendSignal(roXact->pgprocno);
+		}
+	}
+
+	/*
+	 * Check whether it's time to clean up old transactions. This can only be
+	 * done when the last serializable transaction with the oldest xmin among
+	 * serializable transactions completes.  We then find the "new oldest"
+	 * xmin and purge any transactions which finished before this transaction
+	 * was launched.
+	 *
+	 * For parallel queries in read-only transactions, it might run twice. We
+	 * only release the reference on the first call.
+	 */
+	needToClear = false;
+	if ((partiallyReleasing ||
+		 !SxactIsPartiallyReleased(MySerializableXact)) &&
+		TransactionIdEquals(MySerializableXact->xmin,
+							PredXact->SxactGlobalXmin))
+	{
+		Assert(PredXact->SxactGlobalXminCount > 0);
+		if (--(PredXact->SxactGlobalXminCount) == 0)
+		{
+			SetNewSxactGlobalXmin();
+			needToClear = true;
+		}
+	}
+
+	LWLockRelease(SerializableXactHashLock);
+
+	LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
+
+	/* Add this to the list of transactions to check for later cleanup. */
+	if (isCommit)
+		dlist_push_tail(FinishedSerializableTransactions,
+						&MySerializableXact->finishedLink);
+
+	/*
+	 * If we're releasing a RO_SAFE transaction in parallel mode, we'll only
+	 * partially release it.  That's necessary because other backends may have
+	 * a reference to it.  The leader will release the SERIALIZABLEXACT itself
+	 * at the end of the transaction after workers have stopped running.
+	 */
+	if (!isCommit)
+		ReleaseOneSerializableXact(MySerializableXact,
+								   isReadOnlySafe && IsInParallelMode(),
+								   false);
+
+	LWLockRelease(SerializableFinishedListLock);
+
+	/* Done after dropping the locks above; it takes its own. */
+	if (needToClear)
+		ClearOldPredicateLocks();
+
+	/* Finally, reset this backend's local SSI state. */
+	ReleasePredicateLocksLocal();
+}
+
+/*
+ * Reset this backend's local predicate-locking state: forget the current
+ * SERIALIZABLEXACT, clear the "did write" flag, and destroy the
+ * per-transaction lock table if one exists.  No shared-memory structure is
+ * modified directly here.
+ */
+static void
+ReleasePredicateLocksLocal(void)
+{
+	MySerializableXact = InvalidSerializableXact;
+	MyXactDidWrite = false;
+
+	/* Nothing more to do when no per-transaction lock table was built. */
+	if (LocalPredicateLockHash == NULL)
+		return;
+
+	/* Delete per-transaction lock table */
+	hash_destroy(LocalPredicateLockHash);
+	LocalPredicateLockHash = NULL;
+}
+
+/*
+ * Clear old predicate locks, belonging to committed transactions that are no
+ * longer interesting to any in-progress transaction.
+ *
+ * This works in two passes:
+ *
+ * 1. Walk FinishedSerializableTransactions from the head.  Entries that
+ *	  finished before SxactGlobalXmin (or all entries, if there is no valid
+ *	  global xmin) are released completely.  Entries whose commitSeqNo lies
+ *	  in (HavePartialClearedThrough, CanPartialClearThrough] are released
+ *	  completely if read-only, or partially otherwise.  The walk stops at the
+ *	  first entry matching neither case.
+ *
+ * 2. Walk the predicate locks attached to OldCommittedSxact and remove those
+ *	  whose commitSeqNo is at or below CanPartialClearThrough.
+ *
+ * SerializableFinishedListLock is held exclusively throughout.
+ * SerializableXactHashLock is held in shared mode only while PredXact and
+ * sxact fields are examined; it is dropped around each call to
+ * ReleaseOneSerializableXact(), which acquires that lock itself.
+ */
+static void
+ClearOldPredicateLocks(void)
+{
+	dlist_mutable_iter iter;
+
+	/*
+	 * Loop through finished transactions. They are in commit order, so we can
+	 * stop as soon as we find one that's still interesting.
+	 */
+	LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
+	LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+	dlist_foreach_modify(iter, FinishedSerializableTransactions)
+	{
+		SERIALIZABLEXACT *finishedSxact =
+			dlist_container(SERIALIZABLEXACT, finishedLink, iter.cur);
+
+		if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)
+			|| TransactionIdPrecedesOrEquals(finishedSxact->finishedBefore,
+											 PredXact->SxactGlobalXmin))
+		{
+			/*
+			 * This transaction committed before any in-progress transaction
+			 * took its snapshot. It's no longer interesting.
+			 */
+			LWLockRelease(SerializableXactHashLock);
+			dlist_delete_thoroughly(&finishedSxact->finishedLink);
+			ReleaseOneSerializableXact(finishedSxact, false, false);
+			LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+		}
+		else if (finishedSxact->commitSeqNo > PredXact->HavePartialClearedThrough
+				 && finishedSxact->commitSeqNo <= PredXact->CanPartialClearThrough)
+		{
+			/*
+			 * Remember the commit sequence number now, while we still hold
+			 * SerializableXactHashLock and the entry is known to be intact.
+			 * In the read-only case below the SERIALIZABLEXACT is released
+			 * completely, so it must not be dereferenced afterwards.
+			 */
+			SerCommitSeqNo clearedSeqNo = finishedSxact->commitSeqNo;
+
+			/*
+			 * Any active transactions that took their snapshot before this
+			 * transaction committed are read-only, so we can clear part of
+			 * its state.
+			 */
+			LWLockRelease(SerializableXactHashLock);
+
+			if (SxactIsReadOnly(finishedSxact))
+			{
+				/* A read-only transaction can be removed entirely */
+				dlist_delete_thoroughly(&(finishedSxact->finishedLink));
+				ReleaseOneSerializableXact(finishedSxact, false, false);
+			}
+			else
+			{
+				/*
+				 * A read-write transaction can only be partially cleared. We
+				 * need to keep the SERIALIZABLEXACT but can release the
+				 * SIREAD locks and conflicts in.
+				 */
+				ReleaseOneSerializableXact(finishedSxact, true, false);
+			}
+
+			PredXact->HavePartialClearedThrough = clearedSeqNo;
+			LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+		}
+		else
+		{
+			/* Still interesting. */
+			break;
+		}
+	}
+	LWLockRelease(SerializableXactHashLock);
+
+	/*
+	 * Loop through predicate locks on dummy transaction for summarized data.
+	 */
+	LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+	dlist_foreach_modify(iter, &OldCommittedSxact->predicateLocks)
+	{
+		PREDICATELOCK *predlock =
+			dlist_container(PREDICATELOCK, xactLink, iter.cur);
+		bool		canDoPartialCleanup;
+
+		/* Read the threshold under the lock, then drop it again. */
+		LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+		Assert(predlock->commitSeqNo != 0);
+		Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo);
+		canDoPartialCleanup = (predlock->commitSeqNo <= PredXact->CanPartialClearThrough);
+		LWLockRelease(SerializableXactHashLock);
+
+		/*
+		 * If this lock originally belonged to an old enough transaction, we
+		 * can release it.
+		 */
+		if (canDoPartialCleanup)
+		{
+			PREDICATELOCKTAG tag;
+			PREDICATELOCKTARGET *target;
+			PREDICATELOCKTARGETTAG targettag;
+			uint32		targettaghash;
+			LWLock	   *partitionLock;
+
+			/* Copy the tags; the lock entry itself is about to be removed. */
+			tag = predlock->tag;
+			target = tag.myTarget;
+			targettag = target->tag;
+			targettaghash = PredicateLockTargetTagHashCode(&targettag);
+			partitionLock = PredicateLockHashPartitionLock(targettaghash);
+
+			LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+			dlist_delete(&(predlock->targetLink));
+			dlist_delete(&(predlock->xactLink));
+
+			hash_search_with_hash_value(PredicateLockHash, &tag,
+										PredicateLockHashCodeFromTargetHashCode(&tag,
+																				targettaghash),
+										HASH_REMOVE, NULL);
+			RemoveTargetIfNoLongerUsed(target, targettaghash);
+
+			LWLockRelease(partitionLock);
+		}
+	}
+
+	LWLockRelease(SerializablePredicateListLock);
+	LWLockRelease(SerializableFinishedListLock);
+}
+
+/*
+ * This is the normal way to delete anything from any of the predicate
+ * locking hash tables.  Given a transaction which we know can be deleted:
+ * delete all predicate locks held by that transaction and any predicate
+ * lock targets which are now unreferenced by a lock; delete all conflicts
+ * for the transaction; delete all xid values for the transaction; then
+ * delete the transaction.
+ *
+ * When the partial flag is set, we can release all predicate locks and
+ * in-conflict information -- we've established that there are no longer
+ * any overlapping read write transactions for which this transaction could
+ * matter -- but keep the transaction entry itself and any outConflicts.
+ *
+ * When the summarize flag is set, we've run short of room for sxact data
+ * and must summarize to the SLRU.  Predicate locks are transferred to a
+ * dummy "old" transaction, with duplicate locks on a single target
+ * collapsing to a single lock with the "latest" commitSeqNo from among
+ * the conflicting locks.
+ */
+static void
+ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
+						   bool summarize)
+{
+	SERIALIZABLEXIDTAG sxidtag;
+	dlist_mutable_iter iter;
+
+	Assert(sxact != NULL);
+	Assert(SxactIsRolledBack(sxact) || SxactIsCommitted(sxact));
+	Assert(partial || !SxactIsOnFinishedList(sxact));
+	Assert(LWLockHeldByMe(SerializableFinishedListLock));
+
+	/*
+	 * First release all the predicate locks held by this xact (or transfer
+	 * them to OldCommittedSxact if summarize is true)
+	 */
+	LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+	if (IsInParallelMode())
+		LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE);
+	dlist_foreach_modify(iter, &sxact->predicateLocks)
+	{
+		PREDICATELOCK *predlock =
+			dlist_container(PREDICATELOCK, xactLink, iter.cur);
+		PREDICATELOCKTAG tag;
+		PREDICATELOCKTARGET *target;
+		PREDICATELOCKTARGETTAG targettag;
+		uint32		targettaghash;
+		LWLock	   *partitionLock;
+
+		tag = predlock->tag;
+		target = tag.myTarget;
+		targettag = target->tag;
+		targettaghash = PredicateLockTargetTagHashCode(&targettag);
+		partitionLock = PredicateLockHashPartitionLock(targettaghash);
+
+		LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+		dlist_delete(&predlock->targetLink);
+
+		hash_search_with_hash_value(PredicateLockHash, &tag,
+									PredicateLockHashCodeFromTargetHashCode(&tag,
+																			targettaghash),
+									HASH_REMOVE, NULL);
+		if (summarize)
+		{
+			bool		found;
+
+			/* Fold into dummy transaction list. */
+			tag.myXact = OldCommittedSxact;
+			predlock = hash_search_with_hash_value(PredicateLockHash, &tag,
+												   PredicateLockHashCodeFromTargetHashCode(&tag,
+																						   targettaghash),
+												   HASH_ENTER_NULL, &found);
+			if (!predlock)
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of shared memory"),
+						 errhint("You might need to increase %s.", "max_pred_locks_per_transaction")));
+			if (found)
+			{
+				Assert(predlock->commitSeqNo != 0);
+				Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo);
+				if (predlock->commitSeqNo < sxact->commitSeqNo)
+					predlock->commitSeqNo = sxact->commitSeqNo;
+			}
+			else
+			{
+				dlist_push_tail(&target->predicateLocks,
+								&predlock->targetLink);
+				dlist_push_tail(&OldCommittedSxact->predicateLocks,
+								&predlock->xactLink);
+				predlock->commitSeqNo = sxact->commitSeqNo;
+			}
+		}
+		else
+			RemoveTargetIfNoLongerUsed(target, targettaghash);
+
+		LWLockRelease(partitionLock);
+	}
+
+	/*
+	 * Rather than retail removal, just re-init the head after we've run
+	 * through the list.
+	 */
+	dlist_init(&sxact->predicateLocks);
+
+	if (IsInParallelMode())
+		LWLockRelease(&sxact->perXactPredicateListLock);
+	LWLockRelease(SerializablePredicateListLock);
+
+	sxidtag.xid = sxact->topXid;
+	LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+	/* Release all outConflicts (unless 'partial' is true) */
+	if (!partial)
+	{
+		dlist_foreach_modify(iter, &sxact->outConflicts)
+		{
+			RWConflict	conflict =
+				dlist_container(RWConflictData, outLink, iter.cur);
+
+			if (summarize)
+				conflict->sxactIn->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
+			ReleaseRWConflict(conflict);
+		}
+	}
+
+	/* Release all inConflicts. */
+	dlist_foreach_modify(iter, &sxact->inConflicts)
+	{
+		RWConflict	conflict =
+			dlist_container(RWConflictData, inLink, iter.cur);
+
+		if (summarize)
+			conflict->sxactOut->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+		ReleaseRWConflict(conflict);
+	}
+
+	/* Finally, get rid of the xid and the record of the transaction itself. */
+	if (!partial)
+	{
+		if (sxidtag.xid != InvalidTransactionId)
+			hash_search(SerializableXidHash, &sxidtag, HASH_REMOVE, NULL);
+		ReleasePredXact(sxact);
+	}
+
+	LWLockRelease(SerializableXactHashLock);
+}
+
+/*
+ * XidIsConcurrent
+ *		Report whether the given top level transaction overlaps (is
+ *		concurrent with) our current transaction, judged against our
+ *		transaction snapshot.
+ *
+ * Callers already have to work out the top level xid for SSI purposes, so
+ * they hand it to us directly; that spares a search of the snapshot's subxip
+ * array.
+ *
+ * The xid must be valid and must not be our own top level xid.
+ */
+static bool
+XidIsConcurrent(TransactionId xid)
+{
+	Snapshot	mySnapshot;
+	bool		concurrent;
+
+	Assert(TransactionIdIsValid(xid));
+	Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
+
+	mySnapshot = GetTransactionSnapshot();
+
+	if (TransactionIdPrecedes(xid, mySnapshot->xmin))
+	{
+		/* Older than the snapshot's xmin: not concurrent. */
+		concurrent = false;
+	}
+	else if (TransactionIdFollowsOrEquals(xid, mySnapshot->xmax))
+	{
+		/* At or beyond the snapshot's xmax: concurrent. */
+		concurrent = true;
+	}
+	else
+	{
+		/* In between: concurrent only if listed in the snapshot's xip array. */
+		concurrent = pg_lfind32(xid, mySnapshot->xip, mySnapshot->xcnt);
+	}
+
+	return concurrent;
+}
+
+/*
+ * CheckForSerializableConflictOutNeeded
+ *		Tell the caller whether a read of the relation under the given
+ *		snapshot requires a conflict-out check.
+ *
+ * Returns false when SerializationNeededForRead() says serialization is not
+ * needed.  Otherwise returns true, unless our own transaction has already
+ * been marked as doomed, in which case a serialization failure is raised.
+ */
+bool
+CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot)
+{
+	bool		needed = SerializationNeededForRead(relation, snapshot);
+
+	/* Check if someone else has already decided that we need to die */
+	if (needed && SxactIsDoomed(MySerializableXact))
+		ereport(ERROR,
+				(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+				 errmsg("could not serialize access due to read/write dependencies among transactions"),
+				 errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."),
+				 errhint("The transaction might succeed if retried.")));
+
+	return needed;
+}
+
+/*
+ * CheckForSerializableConflictOut
+ *		A table AM is reading a tuple that has been modified.  If it determines
+ *		that the tuple version it is reading is not visible to us, it should
+ *		pass in the top level xid of the transaction that created it.
+ *		Otherwise, if it determines that it is visible to us but it has been
+ *		deleted or there is a newer version available due to an update, it
+ *		should pass in the top level xid of the modifying transaction.
+ *
+ * This function will check for overlap with our own transaction.  If the given
+ * xid is also serializable and the transactions overlap (i.e., they cannot see
+ * each other's writes), then we have a conflict out.
+ *
+ * Parameters:
+ *		relation, snapshot - handed to SerializationNeededForRead(); if that
+ *			returns false, nothing else is done.
+ *		xid - top level xid of the other transaction; must be valid.  Nothing
+ *			is done if it equals our own top level xid.
+ *
+ * SerializableXactHashLock is taken exclusively for the lookup and is
+ * released on every normal return path.  A serialization failure may be
+ * raised directly here, or from within FlagRWConflict().
+ *
+ * NOTE(review): the two ereport(ERROR) calls in the summarized-transaction
+ * path are issued while SerializableXactHashLock is still held; presumably
+ * error cleanup releases it -- confirm.
+ */
+void
+CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot)
+{
+	SERIALIZABLEXIDTAG sxidtag;
+	SERIALIZABLEXID *sxid;
+	SERIALIZABLEXACT *sxact;
+
+	if (!SerializationNeededForRead(relation, snapshot))
+		return;
+
+	/* Check if someone else has already decided that we need to die */
+	if (SxactIsDoomed(MySerializableXact))
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+				 errmsg("could not serialize access due to read/write dependencies among transactions"),
+				 errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."),
+				 errhint("The transaction might succeed if retried.")));
+	}
+	Assert(TransactionIdIsValid(xid));
+
+	if (TransactionIdEquals(xid, GetTopTransactionIdIfAny()))
+		return;
+
+	/*
+	 * Find sxact or summarized info for the top level xid.  The hash lock is
+	 * taken exclusively because transaction flags may be updated below.
+	 */
+	sxidtag.xid = xid;
+	LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+	sxid = (SERIALIZABLEXID *)
+		hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+	if (!sxid)
+	{
+		/*
+		 * Transaction not found in "normal" SSI structures.  Check whether it
+		 * got pushed out to SLRU storage for "old committed" transactions.
+		 *
+		 * A result of 0 is treated as "nothing recorded" and no action is
+		 * taken.  For any other result we fail if the recorded sequence
+		 * number is a real one (not InvalidSerCommitSeqNo) and either we are
+		 * not read-only or it is no later than our lastCommitBeforeSnapshot;
+		 * we also fail if we already have any conflict in, since we would
+		 * then be a pivot.  Failing neither test, we just remember that we
+		 * have a conflict out to a summarized transaction.
+		 */
+		SerCommitSeqNo conflictCommitSeqNo;
+
+		conflictCommitSeqNo = SerialGetMinConflictCommitSeqNo(xid);
+		if (conflictCommitSeqNo != 0)
+		{
+			if (conflictCommitSeqNo != InvalidSerCommitSeqNo
+				&& (!SxactIsReadOnly(MySerializableXact)
+					|| conflictCommitSeqNo
+					<= MySerializableXact->SeqNo.lastCommitBeforeSnapshot))
+				ereport(ERROR,
+						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+						 errmsg("could not serialize access due to read/write dependencies among transactions"),
+						 errdetail_internal("Reason code: Canceled on conflict out to old pivot %u.", xid),
+						 errhint("The transaction might succeed if retried.")));
+
+			if (SxactHasSummaryConflictIn(MySerializableXact)
+				|| !dlist_is_empty(&MySerializableXact->inConflicts))
+				ereport(ERROR,
+						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+						 errmsg("could not serialize access due to read/write dependencies among transactions"),
+						 errdetail_internal("Reason code: Canceled on identification as a pivot, with conflict out to old committed transaction %u.", xid),
+						 errhint("The transaction might succeed if retried.")));
+
+			MySerializableXact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+		}
+
+		/*
+		 * It's not serializable or otherwise not important.  (This point is
+		 * also reached after recording a summary conflict out above.)
+		 */
+		LWLockRelease(SerializableXactHashLock);
+		return;
+	}
+	sxact = sxid->myXact;
+	Assert(TransactionIdEquals(sxact->topXid, xid));
+	if (sxact == MySerializableXact || SxactIsDoomed(sxact))
+	{
+		/* Can't conflict with ourself or a transaction that will roll back. */
+		LWLockRelease(SerializableXactHashLock);
+		return;
+	}
+
+	/*
+	 * We have a conflict out to a transaction which has a conflict out to a
+	 * summarized transaction.  That summarized transaction must have
+	 * committed first, and we can't tell when it committed in relation to our
+	 * snapshot acquisition, so something needs to be canceled.
+	 *
+	 * If the other transaction has not yet prepared, it is the one that gets
+	 * doomed and we carry on; otherwise we raise the failure ourselves,
+	 * after releasing the hash lock.
+	 */
+	if (SxactHasSummaryConflictOut(sxact))
+	{
+		if (!SxactIsPrepared(sxact))
+		{
+			sxact->flags |= SXACT_FLAG_DOOMED;
+			LWLockRelease(SerializableXactHashLock);
+			return;
+		}
+		else
+		{
+			LWLockRelease(SerializableXactHashLock);
+			ereport(ERROR,
+					(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+					 errmsg("could not serialize access due to read/write dependencies among transactions"),
+					 errdetail_internal("Reason code: Canceled on conflict out to old pivot."),
+					 errhint("The transaction might succeed if retried.")));
+		}
+	}
+
+	/*
+	 * If this is a read-only transaction and the writing transaction has
+	 * committed, and it doesn't have a rw-conflict to a transaction which
+	 * committed before it, no conflict.
+	 *
+	 * (The summary-conflict-out test below cannot fail at this point, since
+	 * that case either returned or raised an error just above.)
+	 */
+	if (SxactIsReadOnly(MySerializableXact)
+		&& SxactIsCommitted(sxact)
+		&& !SxactHasSummaryConflictOut(sxact)
+		&& (!SxactHasConflictOut(sxact)
+			|| MySerializableXact->SeqNo.lastCommitBeforeSnapshot < sxact->SeqNo.earliestOutConflictCommit))
+	{
+		/* Read-only transaction will appear to run first.  No conflict. */
+		LWLockRelease(SerializableXactHashLock);
+		return;
+	}
+
+	if (!XidIsConcurrent(xid))
+	{
+		/* This write was already in our snapshot; no conflict. */
+		LWLockRelease(SerializableXactHashLock);
+		return;
+	}
+
+	if (RWConflictExists(MySerializableXact, sxact))
+	{
+		/* We don't want duplicate conflict records in the list. */
+		LWLockRelease(SerializableXactHashLock);
+		return;
+	}
+
+	/*
+	 * Flag the conflict.  But first, if this conflict creates a dangerous
+	 * structure, ereport an error.  Both steps happen inside
+	 * FlagRWConflict(), with us as the reader and sxact as the writer.
+	 */
+	FlagRWConflict(MySerializableXact, sxact);
+	LWLockRelease(SerializableXactHashLock);
+}
+
+/*
+ * Check a particular target for rw-dependency conflict in. A subroutine of
+ * CheckForSerializableConflictIn().
+ *
+ * targettag identifies the lock target to examine; MySerializableXact must
+ * be valid.  Every predicate lock on the target that belongs to another
+ * transaction which is not doomed, and which is either uncommitted or has a
+ * finishedBefore that our snapshot's xmin precedes, is flagged as a
+ * rw-conflict in to us unless such a conflict is already recorded.
+ * FlagRWConflict() may raise a serialization failure.
+ *
+ * If one of our own locks is found on the target, it is removed after the
+ * scan, provided we are not in a subtransaction and the tag's offset is
+ * nonzero.
+ *
+ * Locking: the scan runs under the target's partition lock and
+ * SerializableXactHashLock, both shared; the latter is released and
+ * re-taken exclusively around each call to FlagRWConflict().  Removing our
+ * own lock takes SerializablePredicateListLock (shared), our
+ * perXactPredicateListLock (exclusive, parallel mode only), the partition
+ * lock (exclusive) and SerializableXactHashLock (exclusive).
+ */
+static void
+CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
+{
+	uint32		targettaghash;
+	LWLock	   *partitionLock;
+	PREDICATELOCKTARGET *target;
+	PREDICATELOCK *mypredlock = NULL;
+	PREDICATELOCKTAG mypredlocktag;
+	dlist_mutable_iter iter;
+
+	Assert(MySerializableXact != InvalidSerializableXact);
+
+	/*
+	 * The same hash and LW lock apply to the lock target and the lock itself.
+	 */
+	targettaghash = PredicateLockTargetTagHashCode(targettag);
+	partitionLock = PredicateLockHashPartitionLock(targettaghash);
+	LWLockAcquire(partitionLock, LW_SHARED);
+	target = (PREDICATELOCKTARGET *)
+		hash_search_with_hash_value(PredicateLockTargetHash,
+									targettag, targettaghash,
+									HASH_FIND, NULL);
+	if (!target)
+	{
+		/* Nothing has this target locked; we're done here. */
+		LWLockRelease(partitionLock);
+		return;
+	}
+
+	/*
+	 * Each lock for an overlapping transaction represents a conflict: a
+	 * rw-dependency in to this transaction.
+	 */
+	LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+
+	dlist_foreach_modify(iter, &target->predicateLocks)
+	{
+		PREDICATELOCK *predlock =
+			dlist_container(PREDICATELOCK, targetLink, iter.cur);
+		SERIALIZABLEXACT *sxact = predlock->tag.myXact;
+
+		if (sxact == MySerializableXact)
+		{
+			/*
+			 * If we're getting a write lock on a tuple, we don't need a
+			 * predicate (SIREAD) lock on the same tuple. We can safely remove
+			 * our SIREAD lock, but we'll defer doing so until after the loop
+			 * because that requires upgrading to an exclusive partition lock.
+			 *
+			 * We can't use this optimization within a subtransaction because
+			 * the subtransaction could roll back, and we would be left
+			 * without any lock at the top level.
+			 *
+			 * The lock's tag is saved alongside the pointer so that the lock
+			 * can be looked up again once the locks have been re-acquired.
+			 */
+			if (!IsSubTransaction()
+				&& GET_PREDICATELOCKTARGETTAG_OFFSET(*targettag))
+			{
+				mypredlock = predlock;
+				mypredlocktag = predlock->tag;
+			}
+		}
+		else if (!SxactIsDoomed(sxact)
+				 && (!SxactIsCommitted(sxact)
+					 || TransactionIdPrecedes(GetTransactionSnapshot()->xmin,
+											  sxact->finishedBefore))
+				 && !RWConflictExists(sxact, MySerializableXact))
+		{
+			LWLockRelease(SerializableXactHashLock);
+			LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+			/*
+			 * Re-check after getting exclusive lock because the other
+			 * transaction may have flagged a conflict.  The lock was briefly
+			 * not held at all, so the whole condition is evaluated again.
+			 * Afterwards we drop back to a shared lock for the rest of the
+			 * scan.
+			 */
+			if (!SxactIsDoomed(sxact)
+				&& (!SxactIsCommitted(sxact)
+					|| TransactionIdPrecedes(GetTransactionSnapshot()->xmin,
+											 sxact->finishedBefore))
+				&& !RWConflictExists(sxact, MySerializableXact))
+			{
+				FlagRWConflict(sxact, MySerializableXact);
+			}
+
+			LWLockRelease(SerializableXactHashLock);
+			LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+		}
+	}
+	LWLockRelease(SerializableXactHashLock);
+	LWLockRelease(partitionLock);
+
+	/*
+	 * If we found one of our own SIREAD locks to remove, remove it now.
+	 *
+	 * At this point our transaction already has a RowExclusiveLock on the
+	 * relation, so we are OK to drop the predicate lock on the tuple, if
+	 * found, without fearing that another write against the tuple will occur
+	 * before the MVCC information makes it to the buffer.
+	 */
+	if (mypredlock != NULL)
+	{
+		uint32		predlockhashcode;
+		PREDICATELOCK *rmpredlock;
+
+		LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+		if (IsInParallelMode())
+			LWLockAcquire(&MySerializableXact->perXactPredicateListLock, LW_EXCLUSIVE);
+		LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+		LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+		/*
+		 * Remove the predicate lock from shared memory, if it wasn't removed
+		 * while the locks were released.  One way that could happen is from
+		 * autovacuum cleaning up an index.
+		 *
+		 * The lock is therefore looked up again by its saved tag; the saved
+		 * pointer is only touched once the lookup has confirmed that the
+		 * entry still exists.
+		 */
+		predlockhashcode = PredicateLockHashCodeFromTargetHashCode
+			(&mypredlocktag, targettaghash);
+		rmpredlock = (PREDICATELOCK *)
+			hash_search_with_hash_value(PredicateLockHash,
+										&mypredlocktag,
+										predlockhashcode,
+										HASH_FIND, NULL);
+		if (rmpredlock != NULL)
+		{
+			Assert(rmpredlock == mypredlock);
+
+			dlist_delete(&(mypredlock->targetLink));
+			dlist_delete(&(mypredlock->xactLink));
+
+			rmpredlock = (PREDICATELOCK *)
+				hash_search_with_hash_value(PredicateLockHash,
+											&mypredlocktag,
+											predlockhashcode,
+											HASH_REMOVE, NULL);
+			Assert(rmpredlock == mypredlock);
+
+			RemoveTargetIfNoLongerUsed(target, targettaghash);
+		}
+
+		LWLockRelease(SerializableXactHashLock);
+		LWLockRelease(partitionLock);
+		if (IsInParallelMode())
+			LWLockRelease(&MySerializableXact->perXactPredicateListLock);
+		LWLockRelease(SerializablePredicateListLock);
+
+		if (rmpredlock != NULL)
+		{
+			/*
+			 * Remove entry in local lock table if it exists. It's OK if it
+			 * doesn't exist; that means the lock was transferred to a new
+			 * target by a different backend.
+			 *
+			 * This part touches only backend-local state, which is why it is
+			 * done after the shared locks have been released.
+			 */
+			hash_search_with_hash_value(LocalPredicateLockHash,
+										targettag, targettaghash,
+										HASH_REMOVE, NULL);
+
+			DecrementParentLocks(targettag);
+		}
+	}
+}
+
+/*
+ * CheckForSerializableConflictIn
+ *		Called when we are about to write the given tuple.  Should that write
+ *		constitute a rw-conflict in from some other serializable
+ *		transaction, the appropriate action is taken.
+ *
+ * A granularity is skipped when its parameter is missing: no tuple-level
+ * check is made when tid is NULL, and no page-level check when blkno is
+ * InvalidBlockNumber.  The relation-level check is always made.
+ *
+ * An update or delete of a tuple conflicts with a predicate lock on the
+ * tuple itself, on the page holding it, or on the whole relation.
+ */
+void
+CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
+{
+	PREDICATELOCKTARGETTAG tag;
+	Oid			dbOid;
+	Oid			relOid;
+
+	if (!SerializationNeededForWrite(relation))
+		return;
+
+	/* Has some other backend already decided that we have to die? */
+	if (SxactIsDoomed(MySerializableXact))
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+				 errmsg("could not serialize access due to read/write dependencies among transactions"),
+				 errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict in checking."),
+				 errhint("The transaction might succeed if retried.")));
+	}
+
+	/*
+	 * Remember that this transaction has written: the write may give rise to
+	 * rw-conflicts now or at some later point.
+	 */
+	MyXactDidWrite = true;
+
+	dbOid = relation->rd_locator.dbOid;
+	relOid = relation->rd_id;
+
+	/*
+	 * The checks must run from the finest granularity to the coarsest.
+	 * During granularity promotion the new (coarser) lock is acquired before
+	 * the old (finer) ones are released, so checking in this order cannot
+	 * miss a lock.
+	 *
+	 * We cannot hold one lock across all three checks, since each target may
+	 * live in a different partition.
+	 */
+
+	/* Tuple level */
+	if (tid != NULL)
+	{
+		SET_PREDICATELOCKTARGETTAG_TUPLE(tag,
+										 dbOid,
+										 relOid,
+										 ItemPointerGetBlockNumber(tid),
+										 ItemPointerGetOffsetNumber(tid));
+		CheckTargetForConflictsIn(&tag);
+	}
+
+	/* Page level */
+	if (blkno != InvalidBlockNumber)
+	{
+		SET_PREDICATELOCKTARGETTAG_PAGE(tag,
+										dbOid,
+										relOid,
+										blkno);
+		CheckTargetForConflictsIn(&tag);
+	}
+
+	/* Relation level */
+	SET_PREDICATELOCKTARGETTAG_RELATION(tag,
+										dbOid,
+										relOid);
+	CheckTargetForConflictsIn(&tag);
+}
+
+/*
+ * CheckTableForSerializableConflictIn
+ *		The entire table is going through a DDL-style logical mass delete
+ *		like TRUNCATE or DROP TABLE.  If that causes a rw-conflict in from
+ *		another serializable transaction, take appropriate action.
+ *
+ * While these operations do not operate entirely within the bounds of
+ * snapshot isolation, they can occur inside a serializable transaction, and
+ * will logically occur after any reads which saw rows which were destroyed
+ * by these operations, so we do what we can to serialize properly under
+ * SSI.
+ *
+ * The relation passed in must be a heap relation. Any predicate lock of any
+ * granularity on the heap will cause a rw-conflict in to this transaction.
+ * Predicate locks on indexes do not matter because they only exist to guard
+ * against conflicting inserts into the index, and this is a mass *delete*.
+ * When a table is truncated or dropped, the index will also be truncated
+ * or dropped, and we'll deal with locks on the index when that happens.
+ *
+ * Dropping or truncating a table also needs to drop any existing predicate
+ * locks on heap tuples or pages, because they're about to go away. This
+ * should be done before altering the predicate locks because the transaction
+ * could be rolled back because of a conflict, in which case the lock changes
+ * are not needed. (At the moment, we don't actually bother to drop the
+ * existing locks on a dropped or truncated table. That might lead to some
+ * false positives, but it doesn't seem worth the trouble.)
+ *
+ * Locking: SerializablePredicateListLock is taken exclusively, then every
+ * predicate lock partition lock in index order (shared), then
+ * SerializableXactHashLock exclusively; they are released in the reverse
+ * order.  The whole of PredicateLockTargetHash is scanned while they are
+ * held.  FlagRWConflict() may raise a serialization failure.
+ */
+void
+CheckTableForSerializableConflictIn(Relation relation)
+{
+	HASH_SEQ_STATUS seqstat;
+	PREDICATELOCKTARGET *target;
+	Oid			dbId;
+	Oid			heapId;
+	int			i;
+
+	/*
+	 * Bail out quickly if there are no serializable transactions running.
+	 * It's safe to check this without taking locks because the caller is
+	 * holding an ACCESS EXCLUSIVE lock on the relation.  No new locks which
+	 * would matter here can be acquired while that is held.
+	 */
+	if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
+		return;
+
+	if (!SerializationNeededForWrite(relation))
+		return;
+
+	/*
+	 * We're doing a write which might cause rw-conflicts now or later.
+	 * Memorize that fact.
+	 */
+	MyXactDidWrite = true;
+
+	Assert(relation->rd_index == NULL); /* not an index relation */
+
+	dbId = relation->rd_locator.dbOid;
+	heapId = relation->rd_id;
+
+	LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
+	for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
+		LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED);
+	LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+	/* Scan through target list */
+	hash_seq_init(&seqstat, PredicateLockTargetHash);
+
+	while ((target = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
+	{
+		dlist_mutable_iter iter;
+
+		/*
+		 * Check whether this is a target which needs attention.  Only
+		 * targets whose tag carries both our relation id and our database id
+		 * are of interest, whatever their granularity.
+		 */
+		if (GET_PREDICATELOCKTARGETTAG_RELATION(target->tag) != heapId)
+			continue;			/* wrong relation id */
+		if (GET_PREDICATELOCKTARGETTAG_DB(target->tag) != dbId)
+			continue;			/* wrong database id */
+
+		/*
+		 * Loop through locks for this target and flag conflicts.  Our own
+		 * locks are skipped, as are lock holders for which a conflict in to
+		 * us is already on record.
+		 */
+		dlist_foreach_modify(iter, &target->predicateLocks)
+		{
+			PREDICATELOCK *predlock =
+				dlist_container(PREDICATELOCK, targetLink, iter.cur);
+
+			if (predlock->tag.myXact != MySerializableXact
+				&& !RWConflictExists(predlock->tag.myXact, MySerializableXact))
+			{
+				FlagRWConflict(predlock->tag.myXact, MySerializableXact);
+			}
+		}
+	}
+
+	/* Release locks in reverse order */
+	LWLockRelease(SerializableXactHashLock);
+	for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
+		LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
+	LWLockRelease(SerializablePredicateListLock);
+}
+
+
+/*
+ * FlagRWConflict
+ *		Record a rw-dependency running from 'reader' to 'writer'.
+ *
+ * It is up to the caller to hold a LW lock on the transaction hash table.
+ * The two transactions must be distinct.
+ *
+ * Before anything is recorded, the new edge is checked for a resulting
+ * serialization failure.  When either side is the dummy OldCommittedSxact,
+ * only a summary flag is set on the other side; otherwise a full conflict
+ * record is created.
+ */
+static void
+FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
+{
+	Assert(reader != writer);
+
+	/* Does adding this edge make something fail? */
+	OnConflict_CheckForSerializationFailure(reader, writer);
+
+	/* Summarized reader: all we can note is a summary conflict in. */
+	if (reader == OldCommittedSxact)
+	{
+		writer->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
+		return;
+	}
+
+	/* Summarized writer: all we can note is a summary conflict out. */
+	if (writer == OldCommittedSxact)
+	{
+		reader->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+		return;
+	}
+
+	/* Two ordinary transactions: record the conflict itself. */
+	SetRWConflict(reader, writer);
+}
+
+/*----------------------------------------------------------------------------
+ * We are about to add a RW-edge to the dependency graph - check that we don't
+ * introduce a dangerous structure by doing so, and abort one of the
+ * transactions if so.
+ *
+ * A serialization failure can only occur if there is a dangerous structure
+ * in the dependency graph:
+ *
+ *		Tin ------> Tpivot ------> Tout
+ *			  rw			 rw
+ *
+ * Furthermore, Tout must commit first.
+ *
+ * One more optimization is that if Tin is declared READ ONLY (or commits
+ * without writing), we can only have a problem if Tout committed before Tin
+ * acquired its snapshot.
+ *
+ * 'reader' and 'writer' are the two ends of the edge being added.  The
+ * caller must hold SerializableXactHashLock (asserted).
+ *
+ * On failure one of three things happens: if we are the writer, or if the
+ * writer is already prepared, the lock is released and an ERROR is raised in
+ * this backend; otherwise the writer is flagged as doomed and we return with
+ * the lock still held.
+ *----------------------------------------------------------------------------
+ */
+static void
+OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
+										SERIALIZABLEXACT *writer)
+{
+	bool		failure;
+
+	Assert(LWLockHeldByMe(SerializableXactHashLock));
+
+	failure = false;
+
+	/*------------------------------------------------------------------------
+	 * Check for already-committed writer with rw-conflict out flagged
+	 * (conflict-flag on W means that T2 committed before W):
+	 *
+	 *		R ------> W ------> T2
+	 *			rw		  rw
+	 *
+	 * That is a dangerous structure, so we must abort. (Since the writer
+	 * has already committed, we must be the reader)
+	 *------------------------------------------------------------------------
+	 */
+	if (SxactIsCommitted(writer)
+		&& (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer)))
+		failure = true;
+
+	/*------------------------------------------------------------------------
+	 * Check whether the writer has become a pivot with an out-conflict
+	 * committed transaction (T2), and T2 committed first:
+	 *
+	 *		R ------> W ------> T2
+	 *			rw		  rw
+	 *
+	 * Because T2 must've committed first, there is no anomaly if:
+	 * - the reader committed before T2
+	 * - the writer committed before T2
+	 * - the reader is a READ ONLY transaction and the reader was concurrent
+	 *	 with T2 (= reader acquired its snapshot before T2 committed)
+	 *
+	 * We also handle the case that T2 is prepared but not yet committed
+	 * here. In that case T2 has already checked for conflicts, so if it
+	 * commits first, making the above conflict real, it's too late for it
+	 * to abort.
+	 *
+	 * A summary conflict out on the writer counts as a failure outright;
+	 * otherwise each of the writer's recorded out-conflicts is examined,
+	 * using T2's prepareSeqNo for the "before T2" comparisons.
+	 *------------------------------------------------------------------------
+	 */
+	if (!failure && SxactHasSummaryConflictOut(writer))
+		failure = true;
+	else if (!failure)
+	{
+		dlist_iter	iter;
+
+		dlist_foreach(iter, &writer->outConflicts)
+		{
+			RWConflict	conflict =
+				dlist_container(RWConflictData, outLink, iter.cur);
+			SERIALIZABLEXACT *t2 = conflict->sxactIn;
+
+			if (SxactIsPrepared(t2)
+				&& (!SxactIsCommitted(reader)
+					|| t2->prepareSeqNo <= reader->commitSeqNo)
+				&& (!SxactIsCommitted(writer)
+					|| t2->prepareSeqNo <= writer->commitSeqNo)
+				&& (!SxactIsReadOnly(reader)
+					|| t2->prepareSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot))
+			{
+				failure = true;
+				break;
+			}
+		}
+	}
+
+	/*------------------------------------------------------------------------
+	 * Check whether the reader has become a pivot with a writer
+	 * that's committed (or prepared):
+	 *
+	 *		T0 ------> R ------> W
+	 *			 rw		   rw
+	 *
+	 * Because W must've committed first for an anomaly to occur, there is no
+	 * anomaly if:
+	 * - T0 committed before the writer
+	 * - T0 is READ ONLY, and overlaps the writer
+	 *
+	 * This check is skipped entirely when the reader is read-only.  A
+	 * summary conflict in on the reader counts as a failure outright;
+	 * otherwise each of the reader's recorded in-conflicts is examined,
+	 * ignoring any T0 that is already doomed.
+	 *------------------------------------------------------------------------
+	 */
+	if (!failure && SxactIsPrepared(writer) && !SxactIsReadOnly(reader))
+	{
+		if (SxactHasSummaryConflictIn(reader))
+		{
+			failure = true;
+		}
+		else
+		{
+			dlist_iter	iter;
+
+			/*
+			 * The unconstify is needed as we have no const version of
+			 * dlist_foreach().
+			 */
+			dlist_foreach(iter, &unconstify(SERIALIZABLEXACT *, reader)->inConflicts)
+			{
+				const RWConflict conflict =
+					dlist_container(RWConflictData, inLink, iter.cur);
+				const SERIALIZABLEXACT *t0 = conflict->sxactOut;
+
+				if (!SxactIsDoomed(t0)
+					&& (!SxactIsCommitted(t0)
+						|| t0->commitSeqNo >= writer->prepareSeqNo)
+					&& (!SxactIsReadOnly(t0)
+						|| t0->SeqNo.lastCommitBeforeSnapshot >= writer->prepareSeqNo))
+				{
+					failure = true;
+					break;
+				}
+			}
+		}
+	}
+
+	if (failure)
+	{
+		/*
+		 * We have to kill a transaction to avoid a possible anomaly from
+		 * occurring. If the writer is us, we can just ereport() to cause a
+		 * transaction abort. Otherwise we flag the writer for termination,
+		 * causing it to abort when it tries to commit. However, if the writer
+		 * is a prepared transaction, already prepared, we can't abort it
+		 * anymore, so we have to kill the reader instead.
+		 *
+		 * SerializableXactHashLock is released before either ereport(); in
+		 * the flag-the-writer case it stays held for the caller to release.
+		 */
+		if (MySerializableXact == writer)
+		{
+			LWLockRelease(SerializableXactHashLock);
+			ereport(ERROR,
+					(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+					 errmsg("could not serialize access due to read/write dependencies among transactions"),
+					 errdetail_internal("Reason code: Canceled on identification as a pivot, during write."),
+					 errhint("The transaction might succeed if retried.")));
+		}
+		else if (SxactIsPrepared(writer))
+		{
+			LWLockRelease(SerializableXactHashLock);
+
+			/* if we're not the writer, we have to be the reader */
+			Assert(MySerializableXact == reader);
+			ereport(ERROR,
+					(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+					 errmsg("could not serialize access due to read/write dependencies among transactions"),
+					 errdetail_internal("Reason code: Canceled on conflict out to pivot %u, during read.", writer->topXid),
+					 errhint("The transaction might succeed if retried.")));
+		}
+		writer->flags |= SXACT_FLAG_DOOMED;
+	}
+}
+
+/*
+ * PreCommit_CheckForSerializationFailure
+ *		Check for dangerous structures in a serializable transaction
+ *		at commit.
+ *
+ * We're checking for a dangerous structure as each conflict is recorded.
+ * The only way we could have a problem at commit is if this is the "out"
+ * side of a pivot, and neither the "in" side nor the pivot has yet
+ * committed.
+ *
+ * If a dangerous structure is found, the pivot (the near conflict) is
+ * marked for death, because rolling back another transaction might mean
+ * that we fail without ever making progress.  This transaction is
+ * committing writes, so letting it commit ensures progress.  If we
+ * canceled the far conflict, it might immediately fail again on retry.
+ *
+ * Does nothing when this backend has no serializable transaction.
+ * Otherwise all work is done under an exclusive SerializableXactHashLock,
+ * which is released before any ERROR is raised and before returning.
+ *
+ * The scan walks our inConflicts; each transaction on the other end that is
+ * neither committed nor doomed is a candidate pivot, whose own inConflicts
+ * are walked in turn.  A far-side transaction completes a dangerous
+ * structure if it is us, or if it is not committed, not read-only and not
+ * doomed.
+ *
+ * On success our prepareSeqNo is assigned from the pre-incremented
+ * PredXact->LastSxactCommitSeqNo and SXACT_FLAG_PREPARED is set.
+ */
+void
+PreCommit_CheckForSerializationFailure(void)
+{
+	dlist_iter	near_iter;
+
+	if (MySerializableXact == InvalidSerializableXact)
+		return;
+
+	Assert(IsolationIsSerializable());
+
+	LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+	/*
+	 * Check if someone else has already decided that we need to die.  Since
+	 * we set our own DOOMED flag when partially releasing, ignore in that
+	 * case.
+	 */
+	if (SxactIsDoomed(MySerializableXact) &&
+		!SxactIsPartiallyReleased(MySerializableXact))
+	{
+		LWLockRelease(SerializableXactHashLock);
+		ereport(ERROR,
+				(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+				 errmsg("could not serialize access due to read/write dependencies among transactions"),
+				 errdetail_internal("Reason code: Canceled on identification as a pivot, during commit attempt."),
+				 errhint("The transaction might succeed if retried.")));
+	}
+
+	dlist_foreach(near_iter, &MySerializableXact->inConflicts)
+	{
+		RWConflict	nearConflict =
+			dlist_container(RWConflictData, inLink, near_iter.cur);
+
+		if (!SxactIsCommitted(nearConflict->sxactOut)
+			&& !SxactIsDoomed(nearConflict->sxactOut))
+		{
+			dlist_iter	far_iter;
+
+			dlist_foreach(far_iter, &nearConflict->sxactOut->inConflicts)
+			{
+				RWConflict	farConflict =
+					dlist_container(RWConflictData, inLink, far_iter.cur);
+
+				if (farConflict->sxactOut == MySerializableXact
+					|| (!SxactIsCommitted(farConflict->sxactOut)
+						&& !SxactIsReadOnly(farConflict->sxactOut)
+						&& !SxactIsDoomed(farConflict->sxactOut)))
+				{
+					/*
+					 * Normally, we kill the pivot transaction to make sure we
+					 * make progress if the failing transaction is retried.
+					 * However, we can't kill it if it's already prepared, so
+					 * in that case we commit suicide instead.
+					 *
+					 * Once the pivot has been doomed there is no need to
+					 * look at its remaining in-conflicts, so the inner scan
+					 * stops; the outer scan carries on with our next
+					 * in-conflict.
+					 */
+					if (SxactIsPrepared(nearConflict->sxactOut))
+					{
+						LWLockRelease(SerializableXactHashLock);
+						ereport(ERROR,
+								(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+								 errmsg("could not serialize access due to read/write dependencies among transactions"),
+								 errdetail_internal("Reason code: Canceled on commit attempt with conflict in from prepared pivot."),
+								 errhint("The transaction might succeed if retried.")));
+					}
+					nearConflict->sxactOut->flags |= SXACT_FLAG_DOOMED;
+					break;
+				}
+			}
+		}
+	}
+
+	MySerializableXact->prepareSeqNo = ++(PredXact->LastSxactCommitSeqNo);
+	MySerializableXact->flags |= SXACT_FLAG_PREPARED;
+
+	LWLockRelease(SerializableXactHashLock);
+}
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * Two-phase commit support
+ */
+
+/*
+ * AtPrepare_PredicateLocks
+ *		Do the preparatory work for a PREPARE: make 2PC state file
+ *		records for all predicate locks currently held.
+ *
+ * Does nothing when this backend has no serializable transaction.
+ * Otherwise one record describing the transaction itself is registered,
+ * followed by one record per predicate lock on its list.  All records are
+ * built in the same local 'record', which is re-tagged through record.type
+ * and registered in full (sizeof(record)) each time.
+ */
+void
+AtPrepare_PredicateLocks(void)
+{
+	SERIALIZABLEXACT *sxact;
+	TwoPhasePredicateRecord record;
+	TwoPhasePredicateXactRecord *xactRecord;
+	TwoPhasePredicateLockRecord *lockRecord;
+	dlist_iter	iter;
+
+	sxact = MySerializableXact;
+	xactRecord = &(record.data.xactRecord);
+	lockRecord = &(record.data.lockRecord);
+
+	if (MySerializableXact == InvalidSerializableXact)
+		return;
+
+	/*
+	 * Generate an xact record for our SERIALIZABLEXACT.  Only the xmin and
+	 * the flags are saved.
+	 */
+	record.type = TWOPHASEPREDICATERECORD_XACT;
+	xactRecord->xmin = MySerializableXact->xmin;
+	xactRecord->flags = MySerializableXact->flags;
+
+	/*
+	 * Note that we don't include the list of conflicts in or out in the
+	 * statefile, because new conflicts can be added even after the
+	 * transaction prepares. We'll just make a conservative assumption during
+	 * recovery instead.
+	 */
+
+	RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0,
+						   &record, sizeof(record));
+
+	/*
+	 * Generate a lock record for each lock.
+	 *
+	 * To do this, we need to walk the predicate lock list in our sxact rather
+	 * than using the local predicate lock table because the latter is not
+	 * guaranteed to be accurate.
+	 *
+	 * Each lock record carries just the tag of the lock's target.
+	 */
+	LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+
+	/*
+	 * No need to take sxact->perXactPredicateListLock in parallel mode
+	 * because there cannot be any parallel workers running while we are
+	 * preparing a transaction.
+	 */
+	Assert(!IsParallelWorker() && !ParallelContextActive());
+
+	dlist_foreach(iter, &sxact->predicateLocks)
+	{
+		PREDICATELOCK *predlock =
+			dlist_container(PREDICATELOCK, xactLink, iter.cur);
+
+		record.type = TWOPHASEPREDICATERECORD_LOCK;
+		lockRecord->target = predlock->tag.myTarget->tag;
+
+		RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0,
+							   &record, sizeof(record));
+	}
+
+	LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ * PostPrepare_PredicateLocks
+ *		Finish up after a successful PREPARE.  The regular lock manager
+ *		has to hand its locks over to a dummy PGPROC, but we don't:
+ *		the SERIALIZABLEXACT remains in place regardless, so all that
+ *		is left to do here is reset this backend's local state.
+ */
+void
+PostPrepare_PredicateLocks(TransactionId xid)
+{
+	SERIALIZABLEXACT *prepared = MySerializableXact;
+
+	if (prepared == InvalidSerializableXact)
+		return;
+	Assert(SxactIsPrepared(prepared));
+
+	/* Detach the sxact from this process, then drop backend-local state */
+	prepared->pid = 0;
+	prepared->pgprocno = INVALID_PGPROCNO;
+	hash_destroy(LocalPredicateLockHash);
+	LocalPredicateLockHash = NULL;
+	MySerializableXact = InvalidSerializableXact;
+	MyXactDidWrite = false;
+}
+
+/*
+ * PredicateLockTwoPhaseFinish
+ *		Called when a prepared transaction is finally committed or
+ *		aborted: give up the predicate locks it was holding.
+ */
+void
+PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit)
+{
+	SERIALIZABLEXIDTAG key;
+	SERIALIZABLEXID *entry;
+
+	key.xid = xid;
+
+	/* Look the xid up in SerializableXidHash */
+	LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+	entry = (SERIALIZABLEXID *) hash_search(SerializableXidHash, &key,
+											HASH_FIND, NULL);
+	LWLockRelease(SerializableXactHashLock);
+
+	/* A miss means the transaction was not serializable: nothing to do */
+	if (entry == NULL)
+		return;
+
+	/* Adopt its sxact and release the locks, conservatively assuming a write */
+	MySerializableXact = entry->myXact;
+	MyXactDidWrite = true;
+	ReleasePredicateLocks(isCommit, false);
+}
+
+/*
+ * Recover one 2PC state-file record: either an sxact or a predicate lock.
+ */
+void
+predicatelock_twophase_recover(TransactionId xid, uint16 info,
+							   void *recdata, uint32 len)
+{
+	TwoPhasePredicateRecord *record;
+
+	Assert(len == sizeof(TwoPhasePredicateRecord));
+
+	record = (TwoPhasePredicateRecord *) recdata;
+
+	Assert((record->type == TWOPHASEPREDICATERECORD_XACT) ||
+		   (record->type == TWOPHASEPREDICATERECORD_LOCK));
+
+	if (record->type == TWOPHASEPREDICATERECORD_XACT)
+	{
+		/* Per-transaction record. Set up a SERIALIZABLEXACT. */
+		TwoPhasePredicateXactRecord *xactRecord;
+		SERIALIZABLEXACT *sxact;
+		SERIALIZABLEXID *sxid;
+		SERIALIZABLEXIDTAG sxidtag;
+		bool		found;
+
+		xactRecord = (TwoPhasePredicateXactRecord *) &record->data.xactRecord;
+
+		LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+		sxact = CreatePredXact();
+		if (!sxact)
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of shared memory")));
+
+		/* vxid for a prepared xact is InvalidBackendId/xid; no pid */
+		sxact->vxid.backendId = InvalidBackendId;
+		sxact->vxid.localTransactionId = (LocalTransactionId) xid;
+		sxact->pid = 0;
+		sxact->pgprocno = INVALID_PGPROCNO;
+
+		/* a prepared xact hasn't committed yet */
+		sxact->prepareSeqNo = RecoverySerCommitSeqNo;
+		sxact->commitSeqNo = InvalidSerCommitSeqNo;
+		sxact->finishedBefore = InvalidTransactionId;
+
+		sxact->SeqNo.lastCommitBeforeSnapshot = RecoverySerCommitSeqNo;
+
+		/*
+		 * Don't need to track this; no transactions running at the time the
+		 * recovered xact started are still active, except possibly other
+		 * prepared xacts and we don't care whether those are RO_SAFE or not.
+		 */
+		dlist_init(&(sxact->possibleUnsafeConflicts));
+
+		dlist_init(&(sxact->predicateLocks));
+		dlist_node_init(&sxact->finishedLink);
+
+		sxact->topXid = xid;
+		sxact->xmin = xactRecord->xmin;
+		sxact->flags = xactRecord->flags;
+		Assert(SxactIsPrepared(sxact));
+		if (!SxactIsReadOnly(sxact))
+		{
+			++(PredXact->WritableSxactCount);
+			Assert(PredXact->WritableSxactCount <=
+				   (MaxBackends + max_prepared_xacts));
+		}
+
+		/*
+		 * We don't know whether the transaction had any conflicts or not, so
+		 * we'll conservatively assume that it had both a conflict in and a
+		 * conflict out, and represent that with the summary conflict flags.
+		 */
+		dlist_init(&(sxact->outConflicts));
+		dlist_init(&(sxact->inConflicts));
+		sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
+		sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+
+		/* Register the transaction's xid */
+		sxidtag.xid = xid;
+		sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash,
+											   &sxidtag,
+											   HASH_ENTER, &found);
+		Assert(sxid != NULL);
+		Assert(!found);
+		sxid->myXact = (SERIALIZABLEXACT *) sxact;
+
+		/*
+		 * Update global xmin. Note that this is a special case compared to
+		 * registering a normal transaction, because the global xmin might go
+		 * backwards. That's OK, because until recovery is over we're not
+		 * going to complete any transactions or create any non-prepared
+		 * transactions, so there's no danger of throwing away needed data.
+		 */
+		if ((!TransactionIdIsValid(PredXact->SxactGlobalXmin)) ||
+			(TransactionIdFollows(PredXact->SxactGlobalXmin, sxact->xmin)))
+		{
+			PredXact->SxactGlobalXmin = sxact->xmin;
+			PredXact->SxactGlobalXminCount = 1;
+			SerialSetActiveSerXmin(sxact->xmin);
+		}
+		else if (TransactionIdEquals(sxact->xmin, PredXact->SxactGlobalXmin))
+		{
+			Assert(PredXact->SxactGlobalXminCount > 0);
+			PredXact->SxactGlobalXminCount++;
+		}
+
+		LWLockRelease(SerializableXactHashLock);
+	}
+	else if (record->type == TWOPHASEPREDICATERECORD_LOCK)
+	{
+		/* Lock record. Recreate the PREDICATELOCK */
+		TwoPhasePredicateLockRecord *lockRecord;
+		SERIALIZABLEXID *sxid;
+		SERIALIZABLEXACT *sxact;
+		SERIALIZABLEXIDTAG sxidtag;
+		uint32		targettaghash;
+
+		lockRecord = (TwoPhasePredicateLockRecord *) &record->data.lockRecord;
+		targettaghash = PredicateLockTargetTagHashCode(&lockRecord->target);
+
+		LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+		sxidtag.xid = xid;
+		sxid = (SERIALIZABLEXID *)
+			hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+		LWLockRelease(SerializableXactHashLock);
+
+		Assert(sxid != NULL);
+		sxact = sxid->myXact;
+		Assert(sxact != InvalidSerializableXact);
+
+		CreatePredicateLock(&lockRecord->target, targettaghash, sxact);
+	}
+}
+
+/*
+ * Prepare to share the current SERIALIZABLEXACT with parallel workers.
+ * The returned handle is simply MySerializableXact; a parallel worker passes
+ * it to AttachSerializableXact().
+ */
+SerializableXactHandle
+ShareSerializableXact(void)
+{
+	return MySerializableXact;
+}
+
+/*
+ * Parallel worker side: adopt the leader's SERIALIZABLEXACT via its handle.
+ */
+void
+AttachSerializableXact(SerializableXactHandle handle)
+{
+	Assert(MySerializableXact == InvalidSerializableXact);
+
+	MySerializableXact = (SERIALIZABLEXACT *) handle;
+	if (MySerializableXact == InvalidSerializableXact)
+		return;					/* no sxact to attach to */
+	CreateLocalPredicateLockHash();
+}
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
new file mode 100644
index 0000000..e9e445b
--- /dev/null
+++ b/src/backend/storage/lmgr/proc.c
@@ -0,0 +1,1897 @@
+/*-------------------------------------------------------------------------
+ *
+ * proc.c
+ *	  routines to manage per-process shared memory data structure
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/lmgr/proc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * Interface (a):
+ *		ProcSleep(), ProcWakeup(),
+ *
+ * Waiting for a lock causes the backend to be put to sleep.  Whoever releases
+ * the lock wakes the process up again (and gives it an error code so it knows
+ * whether it was awoken on an error condition).
+ *
+ * Interface (b):
+ *
+ * ProcReleaseLocks -- frees the locks associated with current transaction
+ *
+ * ProcKill -- destroys the shared memory state (and locks)
+ * associated with the process.
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/autovacuum.h"
+#include "replication/slot.h"
+#include "replication/syncrep.h"
+#include "replication/walsender.h"
+#include "storage/condition_variable.h"
+#include "storage/ipc.h"
+#include "storage/lmgr.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/procsignal.h"
+#include "storage/spin.h"
+#include "storage/standby.h"
+#include "utils/timeout.h"
+#include "utils/timestamp.h"
+
+/* GUC variables */
+int			DeadlockTimeout = 1000;
+int			StatementTimeout = 0;
+int			LockTimeout = 0;
+int			IdleInTransactionSessionTimeout = 0;
+int			IdleSessionTimeout = 0;
+bool		log_lock_waits = false;
+
+/* Pointer to this process's PGPROC struct, if any */
+PGPROC	   *MyProc = NULL;
+
+/*
+ * This spinlock protects the freelists of recycled PGPROC structures and
+ * the assignment of AuxiliaryProcs entries.  We cannot use an LWLock because
+ * the LWLock manager depends on already having a PGPROC and a wait semaphore!
+ * But these structures are touched relatively infrequently (only at backend
+ * startup or shutdown) and not for very long, so a spinlock is okay.
+ */
+NON_EXEC_STATIC slock_t *ProcStructLock = NULL;
+
+/* Pointers to shared-memory structures (set up by InitProcGlobal) */
+PROC_HDR   *ProcGlobal = NULL;
+NON_EXEC_STATIC PGPROC *AuxiliaryProcs = NULL;
+PGPROC	   *PreparedXactProcs = NULL;
+
+/* If we are waiting for a lock, this points to the associated LOCALLOCK */
+static LOCALLOCK *lockAwaited = NULL;
+
+static DeadLockState deadlock_state = DS_NOT_YET_CHECKED;
+
+/* Is a deadlock check pending? */
+static volatile sig_atomic_t got_deadlock_timeout;
+
+static void RemoveProcFromArray(int code, Datum arg);
+static void ProcKill(int code, Datum arg);
+static void AuxiliaryProcKill(int code, Datum arg);
+static void CheckDeadLock(void);
+
+
+/*
+ * Estimate the shared memory that InitProcGlobal is going to request.
+ */
+Size
+ProcGlobalShmemSize(void)
+{
+	Size		nprocs;
+	Size		total;
+
+	nprocs = add_size(NUM_AUXILIARY_PROCS, max_prepared_xacts);
+	nprocs = add_size(MaxBackends, nprocs);
+
+	/* header, PGPROC array, spinlock, then the dense per-proc arrays */
+	total = add_size(sizeof(PROC_HDR), mul_size(nprocs, sizeof(PGPROC)));
+	total = add_size(total, sizeof(slock_t));
+	total = add_size(total, mul_size(nprocs, sizeof(*ProcGlobal->xids)));
+	total = add_size(total, mul_size(nprocs, sizeof(*ProcGlobal->subxidStates)));
+	total = add_size(total, mul_size(nprocs, sizeof(*ProcGlobal->statusFlags)));
+
+	return total;
+}
+
+/*
+ * Report number of semaphores InitProcGlobal is going to create.
+ */
+int
+ProcGlobalSemas(void)
+{
+	int			nsemas;
+
+	/* one per backend (autovacuum included), one per auxiliary process */
+	nsemas = MaxBackends + NUM_AUXILIARY_PROCS;
+	return nsemas;
+}
+
+/*
+ * InitProcGlobal -
+ *	  Initialize the global process table during postmaster or standalone
+ *	  backend startup.
+ *
+ *	  We also create all the per-process semaphores we will need to support
+ *	  the requested number of backends.  We used to allocate semaphores
+ *	  only when backends were actually started up, but that is bad because
+ *	  it lets Postgres fail under load --- a lot of Unix systems are
+ *	  (mis)configured with small limits on the number of semaphores, and
+ *	  running out when trying to start another backend is a common failure.
+ *	  So, now we grab enough semaphores to support the desired max number
+ *	  of backends immediately at initialization --- if the sysadmin has set
+ *	  MaxConnections, max_worker_processes, max_wal_senders, or
+ *	  autovacuum_max_workers higher than his kernel will support, he'll
+ *	  find out sooner rather than later.
+ *
+ *	  Another reason for creating semaphores here is that the semaphore
+ *	  implementation typically requires us to create semaphores in the
+ *	  postmaster, not in backends.
+ *
+ * Note: this is NOT called by individual backends under a postmaster,
+ * not even in the EXEC_BACKEND case.  The ProcGlobal and AuxiliaryProcs
+ * pointers must be propagated specially for EXEC_BACKEND operation.
+ */
+void
+InitProcGlobal(void)
+{
+	PGPROC	   *procs;
+	int			i,
+				j;
+	bool		found;
+	uint32		TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS + max_prepared_xacts;
+
+	/* Create the ProcGlobal shared structure */
+	ProcGlobal = (PROC_HDR *)
+		ShmemInitStruct("Proc Header", sizeof(PROC_HDR), &found);
+	Assert(!found);
+
+	/*
+	 * Initialize the data structures.
+	 */
+	ProcGlobal->spins_per_delay = DEFAULT_SPINS_PER_DELAY;
+	dlist_init(&ProcGlobal->freeProcs);
+	dlist_init(&ProcGlobal->autovacFreeProcs);
+	dlist_init(&ProcGlobal->bgworkerFreeProcs);
+	dlist_init(&ProcGlobal->walsenderFreeProcs);
+	ProcGlobal->startupBufferPinWaitBufId = -1;
+	ProcGlobal->walwriterLatch = NULL;
+	ProcGlobal->checkpointerLatch = NULL;
+	pg_atomic_init_u32(&ProcGlobal->procArrayGroupFirst, INVALID_PGPROCNO);
+	pg_atomic_init_u32(&ProcGlobal->clogGroupFirst, INVALID_PGPROCNO);
+
+	/*
+	 * Create and initialize all the PGPROC structures we'll need.  There are
+	 * six separate consumers: (1) normal backends, (2) autovacuum workers and
+	 * the autovacuum launcher, (3) background workers, (4) walsenders, (5)
+	 * auxiliary processes, and (6) prepared transactions.  Each PGPROC
+	 * structure is dedicated to exactly one of these purposes, and they do
+	 * not move between groups.
+	 */
+	procs = (PGPROC *) ShmemAlloc(TotalProcs * sizeof(PGPROC));
+	MemSet(procs, 0, TotalProcs * sizeof(PGPROC));
+	ProcGlobal->allProcs = procs;
+	/* XXX allProcCount isn't really all of them; it excludes prepared xacts */
+	ProcGlobal->allProcCount = MaxBackends + NUM_AUXILIARY_PROCS;
+
+	/*
+	 * Allocate arrays mirroring PGPROC fields in a dense manner. See
+	 * PROC_HDR.
+	 *
+	 * XXX: It might make sense to increase padding for these arrays, given
+	 * how hotly they are accessed.
+	 */
+	ProcGlobal->xids =
+		(TransactionId *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->xids));
+	MemSet(ProcGlobal->xids, 0, TotalProcs * sizeof(*ProcGlobal->xids));
+	ProcGlobal->subxidStates = (XidCacheStatus *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->subxidStates));
+	MemSet(ProcGlobal->subxidStates, 0, TotalProcs * sizeof(*ProcGlobal->subxidStates));
+	ProcGlobal->statusFlags = (uint8 *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->statusFlags));
+	MemSet(ProcGlobal->statusFlags, 0, TotalProcs * sizeof(*ProcGlobal->statusFlags));
+
+	for (i = 0; i < TotalProcs; i++)
+	{
+		PGPROC	   *proc = &procs[i];
+
+		/* Common initialization for all PGPROCs, regardless of type. */
+
+		/*
+		 * Set up per-PGPROC semaphore, latch, and fpInfoLock.  Prepared xact
+		 * dummy PGPROCs don't need these though - they're never associated
+		 * with a real process
+		 */
+		if (i < MaxBackends + NUM_AUXILIARY_PROCS)
+		{
+			proc->sem = PGSemaphoreCreate();
+			InitSharedLatch(&(proc->procLatch));
+			LWLockInitialize(&(proc->fpInfoLock), LWTRANCHE_LOCK_FASTPATH);
+		}
+		proc->pgprocno = i;
+
+		/*
+		 * Newly created PGPROCs for normal backends, autovacuum, bgworkers
+		 * and walsenders must be queued up on the appropriate free list.
+		 * Auxiliary processes are few and fixed in number, so they get no
+		 * free list; InitAuxiliaryProcess() uses a linear search instead.
+		 * PGPROCs for prepared transactions are added to a free list by
+		 * TwoPhaseShmemInit().
+		 */
+		if (i < MaxConnections)
+		{
+			/* PGPROC for normal backend, add to freeProcs list */
+			dlist_push_head(&ProcGlobal->freeProcs, &proc->links);
+			proc->procgloballist = &ProcGlobal->freeProcs;
+		}
+		else if (i < MaxConnections + autovacuum_max_workers + 1)
+		{
+			/* PGPROC for AV launcher/worker, add to autovacFreeProcs list */
+			dlist_push_head(&ProcGlobal->autovacFreeProcs, &proc->links);
+			proc->procgloballist = &ProcGlobal->autovacFreeProcs;
+		}
+		else if (i < MaxConnections + autovacuum_max_workers + 1 + max_worker_processes)
+		{
+			/* PGPROC for bgworker, add to bgworkerFreeProcs list */
+			dlist_push_head(&ProcGlobal->bgworkerFreeProcs, &proc->links);
+			proc->procgloballist = &ProcGlobal->bgworkerFreeProcs;
+		}
+		else if (i < MaxBackends)
+		{
+			/* PGPROC for walsender, add to walsenderFreeProcs list */
+			dlist_push_head(&ProcGlobal->walsenderFreeProcs, &proc->links);
+			proc->procgloballist = &ProcGlobal->walsenderFreeProcs;
+		}
+
+		/* Initialize myProcLocks[] shared memory queues. */
+		for (j = 0; j < NUM_LOCK_PARTITIONS; j++)
+			dlist_init(&(proc->myProcLocks[j]));
+
+		/* Initialize lockGroupMembers list. */
+		dlist_init(&proc->lockGroupMembers);
+
+		/*
+		 * Initialize the atomic variables, otherwise, it won't be safe to
+		 * access them for backends that aren't currently in use.
+		 */
+		pg_atomic_init_u32(&(proc->procArrayGroupNext), INVALID_PGPROCNO);
+		pg_atomic_init_u32(&(proc->clogGroupNext), INVALID_PGPROCNO);
+		pg_atomic_init_u64(&(proc->waitStart), 0);
+	}
+
+	/*
+	 * Save pointers to the blocks of PGPROC structures reserved for auxiliary
+	 * processes and prepared transactions.
+	 */
+	AuxiliaryProcs = &procs[MaxBackends];
+	PreparedXactProcs = &procs[MaxBackends + NUM_AUXILIARY_PROCS];
+
+	/* Create ProcStructLock spinlock, too */
+	ProcStructLock = (slock_t *) ShmemAlloc(sizeof(slock_t));
+	SpinLockInit(ProcStructLock);
+}
+
+/*
+ * InitProcess -- claim a free PGPROC for this backend and initialize it
+ */
+void
+InitProcess(void)
+{
+	dlist_head *procgloballist;
+
+	/*
+	 * ProcGlobal should be set up already (if we are a backend, we inherit
+	 * this by fork() or EXEC_BACKEND mechanism from the postmaster).
+	 */
+	if (ProcGlobal == NULL)
+		elog(PANIC, "proc header uninitialized");
+
+	if (MyProc != NULL)
+		elog(ERROR, "you already exist");
+
+	/* Decide which list should supply our PGPROC. */
+	if (IsAnyAutoVacuumProcess())
+		procgloballist = &ProcGlobal->autovacFreeProcs;
+	else if (IsBackgroundWorker)
+		procgloballist = &ProcGlobal->bgworkerFreeProcs;
+	else if (am_walsender)
+		procgloballist = &ProcGlobal->walsenderFreeProcs;
+	else
+		procgloballist = &ProcGlobal->freeProcs;
+
+	/*
+	 * Try to get a proc struct from the appropriate free list.  If this
+	 * fails, we must be out of PGPROC structures (not to mention semaphores).
+	 *
+	 * While we are holding the ProcStructLock, also copy the current shared
+	 * estimate of spins_per_delay to local storage.
+	 */
+	SpinLockAcquire(ProcStructLock);
+
+	set_spins_per_delay(ProcGlobal->spins_per_delay);
+
+	if (!dlist_is_empty(procgloballist))
+	{
+		MyProc = (PGPROC *) dlist_pop_head_node(procgloballist);
+		SpinLockRelease(ProcStructLock);
+	}
+	else
+	{
+		/*
+		 * If we reach here, our free list is empty.  This is one of the
+		 * possible places to detect "too many backends", so give the standard
+		 * error message.  XXX do we need to give a different failure message
+		 * in the autovacuum case?
+		 */
+		SpinLockRelease(ProcStructLock);
+		if (am_walsender)
+			ereport(FATAL,
+					(errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+					 errmsg("number of requested standby connections exceeds max_wal_senders (currently %d)",
+							max_wal_senders)));
+		ereport(FATAL,
+				(errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+				 errmsg("sorry, too many clients already")));
+	}
+
+	/*
+	 * Cross-check that the PGPROC is of the type we expect; if this were not
+	 * the case, it would get returned to the wrong list.
+	 */
+	Assert(MyProc->procgloballist == procgloballist);
+
+	/*
+	 * Now that we have a PGPROC, mark ourselves as an active postmaster
+	 * child; this is so that the postmaster can detect it if we exit without
+	 * cleaning up.  (XXX autovac launcher currently doesn't participate in
+	 * this; it probably should.)
+	 */
+	if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess())
+		MarkPostmasterChildActive();
+
+	/*
+	 * Initialize all fields of MyProc, except for those previously
+	 * initialized by InitProcGlobal.
+	 */
+	dlist_node_init(&MyProc->links);
+	MyProc->waitStatus = PROC_WAIT_STATUS_OK;
+	MyProc->lxid = InvalidLocalTransactionId;
+	MyProc->fpVXIDLock = false;
+	MyProc->fpLocalTransactionId = InvalidLocalTransactionId;
+	MyProc->xid = InvalidTransactionId;
+	MyProc->xmin = InvalidTransactionId;
+	MyProc->pid = MyProcPid;
+	/* backendId, databaseId and roleId will be filled in later */
+	MyProc->backendId = InvalidBackendId;
+	MyProc->databaseId = InvalidOid;
+	MyProc->roleId = InvalidOid;
+	MyProc->tempNamespaceId = InvalidOid;
+	MyProc->isBackgroundWorker = IsBackgroundWorker;
+	MyProc->delayChkptFlags = 0;
+	MyProc->statusFlags = 0;
+	/* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
+	if (IsAutoVacuumWorkerProcess())
+		MyProc->statusFlags |= PROC_IS_AUTOVACUUM;
+	MyProc->lwWaiting = LW_WS_NOT_WAITING;
+	MyProc->lwWaitMode = 0;
+	MyProc->waitLock = NULL;
+	MyProc->waitProcLock = NULL;
+	pg_atomic_write_u64(&MyProc->waitStart, 0);
+#ifdef USE_ASSERT_CHECKING
+	{
+		int			i;
+
+		/* Last process should have released all locks. */
+		for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+			Assert(dlist_is_empty(&(MyProc->myProcLocks[i])));
+	}
+#endif
+	MyProc->recoveryConflictPending = false;
+
+	/* Initialize fields for sync rep */
+	MyProc->waitLSN = 0;
+	MyProc->syncRepState = SYNC_REP_NOT_WAITING;
+	dlist_node_init(&MyProc->syncRepLinks);
+
+	/* Initialize fields for group XID clearing. */
+	MyProc->procArrayGroupMember = false;
+	MyProc->procArrayGroupMemberXid = InvalidTransactionId;
+	Assert(pg_atomic_read_u32(&MyProc->procArrayGroupNext) == INVALID_PGPROCNO);
+
+	/* Check that group locking fields are in a proper initial state. */
+	Assert(MyProc->lockGroupLeader == NULL);
+	Assert(dlist_is_empty(&MyProc->lockGroupMembers));
+
+	/* Initialize wait event information. */
+	MyProc->wait_event_info = 0;
+
+	/* Initialize fields for group transaction status update. */
+	MyProc->clogGroupMember = false;
+	MyProc->clogGroupMemberXid = InvalidTransactionId;
+	MyProc->clogGroupMemberXidStatus = TRANSACTION_STATUS_IN_PROGRESS;
+	MyProc->clogGroupMemberPage = -1;
+	MyProc->clogGroupMemberLsn = InvalidXLogRecPtr;
+	Assert(pg_atomic_read_u32(&MyProc->clogGroupNext) == INVALID_PGPROCNO);
+
+	/*
+	 * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch
+	 * on it.  That allows us to repoint the process latch, which so far
+	 * points to process local one, to the shared one.
+	 */
+	OwnLatch(&MyProc->procLatch);
+	SwitchToSharedLatch();
+
+	/* now that we have a proc, report wait events to shared memory */
+	pgstat_set_wait_event_storage(&MyProc->wait_event_info);
+
+	/*
+	 * We might be reusing a semaphore that belonged to a failed process. So
+	 * be careful and reinitialize its value here.  (This is not strictly
+	 * necessary anymore, but seems like a good idea for cleanliness.)
+	 */
+	PGSemaphoreReset(MyProc->sem);
+
+	/*
+	 * Arrange to clean up at backend exit.
+	 */
+	on_shmem_exit(ProcKill, 0);
+
+	/*
+	 * Now that we have a PGPROC, we could try to acquire locks, so initialize
+	 * local state needed for LWLocks, and the deadlock checker.
+	 */
+	InitLWLockAccess();
+	InitDeadLockChecking();
+}
+
+/*
+ * InitProcessPhase2 -- advertise MyProc in the shared ProcArray.
+ *
+ * This cannot be folded into InitProcess: LWLocks are unusable until we
+ * have a PGPROC, yet in the EXEC_BACKEND case ProcArrayAdd cannot work
+ * until CreateSharedMemoryAndSemaphores has been done.
+ */
+void
+InitProcessPhase2(void)
+{
+	/* We must already have a PGPROC (see InitProcess). */
+	Assert(MyProc != NULL);
+
+	/* Publish our PGPROC in the shared ProcArray. */
+	ProcArrayAdd(MyProc);
+
+	/*
+	 * Now that we are in the array, register the callback that takes us
+	 * out of it again at backend exit.
+	 */
+	on_shmem_exit(RemoveProcFromArray, 0);
+}
+
+/*
+ * InitAuxiliaryProcess -- create a per-auxiliary-process data structure
+ *
+ * This is called by bgwriter and similar processes so that they will have a
+ * MyProc value that's real enough to let them wait for LWLocks.  The PGPROC
+ * and sema that are assigned are one of the extra ones created during
+ * InitProcGlobal.
+ *
+ * Auxiliary processes are presently not expected to wait for real (lockmgr)
+ * locks, so we need not set up the deadlock checker.  They are never added
+ * to the ProcArray or the sinval messaging mechanism, either.  They also
+ * don't get a VXID assigned, since this is only useful when we actually
+ * hold lockmgr locks.
+ *
+ * Startup process however uses locks but never waits for them in the
+ * normal backend sense. Startup process also takes part in sinval messaging
+ * as a sendOnly process, so never reads messages from sinval queue. So
+ * Startup process does have a VXID and does show up in pg_locks.
+ */
+void
+InitAuxiliaryProcess(void)
+{
+	PGPROC	   *auxproc;
+	int			proctype;
+
+	/*
+	 * ProcGlobal should be set up already (if we are a backend, we inherit
+	 * this by fork() or EXEC_BACKEND mechanism from the postmaster).
+	 */
+	if (ProcGlobal == NULL || AuxiliaryProcs == NULL)
+		elog(PANIC, "proc header uninitialized");
+
+	if (MyProc != NULL)
+		elog(ERROR, "you already exist");
+
+	/*
+	 * We use the ProcStructLock to protect assignment and releasing of
+	 * AuxiliaryProcs entries.
+	 *
+	 * While we are holding the ProcStructLock, also copy the current shared
+	 * estimate of spins_per_delay to local storage.
+	 */
+	SpinLockAcquire(ProcStructLock);
+
+	set_spins_per_delay(ProcGlobal->spins_per_delay);
+
+	/*
+	 * Find a free auxproc ... *big* trouble if there isn't one ...
+	 */
+	for (proctype = 0; proctype < NUM_AUXILIARY_PROCS; proctype++)
+	{
+		auxproc = &AuxiliaryProcs[proctype];
+		if (auxproc->pid == 0)	/* pid 0 marks a free slot */
+			break;
+	}
+	if (proctype >= NUM_AUXILIARY_PROCS)
+	{
+		SpinLockRelease(ProcStructLock);
+		elog(FATAL, "all AuxiliaryProcs are in use");
+	}
+
+	/* Mark auxiliary proc as in use by me */
+	/* use volatile pointer to prevent code rearrangement */
+	((volatile PGPROC *) auxproc)->pid = MyProcPid;
+
+	MyProc = auxproc;
+
+	SpinLockRelease(ProcStructLock);
+
+	/*
+	 * Initialize all fields of MyProc, except for those previously
+	 * initialized by InitProcGlobal.
+	 */
+	dlist_node_init(&MyProc->links);
+	MyProc->waitStatus = PROC_WAIT_STATUS_OK;
+	MyProc->lxid = InvalidLocalTransactionId;
+	MyProc->fpVXIDLock = false;
+	MyProc->fpLocalTransactionId = InvalidLocalTransactionId;
+	MyProc->xid = InvalidTransactionId;
+	MyProc->xmin = InvalidTransactionId;
+	MyProc->backendId = InvalidBackendId;
+	MyProc->databaseId = InvalidOid;
+	MyProc->roleId = InvalidOid;
+	MyProc->tempNamespaceId = InvalidOid;
+	MyProc->isBackgroundWorker = IsBackgroundWorker;
+	MyProc->delayChkptFlags = 0;
+	MyProc->statusFlags = 0;
+	MyProc->lwWaiting = LW_WS_NOT_WAITING;
+	MyProc->lwWaitMode = 0;
+	MyProc->waitLock = NULL;
+	MyProc->waitProcLock = NULL;
+	pg_atomic_write_u64(&MyProc->waitStart, 0);
+#ifdef USE_ASSERT_CHECKING
+	{
+		int			i;
+
+		/* Last process should have released all locks. */
+		for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+			Assert(dlist_is_empty(&(MyProc->myProcLocks[i])));
+	}
+#endif
+
+	/*
+	 * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch
+	 * on it.  That allows us to repoint the process latch, which so far
+	 * points to process local one, to the shared one.
+	 */
+	OwnLatch(&MyProc->procLatch);
+	SwitchToSharedLatch();
+
+	/* now that we have a proc, report wait events to shared memory */
+	pgstat_set_wait_event_storage(&MyProc->wait_event_info);
+
+	/* Check that group locking fields are in a proper initial state. */
+	Assert(MyProc->lockGroupLeader == NULL);
+	Assert(dlist_is_empty(&MyProc->lockGroupMembers));
+
+	/*
+	 * We might be reusing a semaphore that belonged to a failed process. So
+	 * be careful and reinitialize its value here.  (This is not strictly
+	 * necessary anymore, but seems like a good idea for cleanliness.)
+	 */
+	PGSemaphoreReset(MyProc->sem);
+
+	/*
+	 * Arrange to clean up at process exit; pass our AuxiliaryProcs index.
+	 */
+	on_shmem_exit(AuxiliaryProcKill, Int32GetDatum(proctype));
+}
+
+/*
+ * Used from bufmgr to publish the id of the buffer that Startup waits on,
+ * or to reset it to "not waiting" (-1), so that recovery conflicts for
+ * buffer pins can be processed.  The value is set before backends look at
+ * it, so locking is not required, especially since the set is an atomic
+ * integer set operation.
+ */
+void
+SetStartupBufferPinWaitBufId(int bufid)
+{
+	/*
+	 * Store through a volatile pointer to prevent code rearrangement.
+	 */
+	((volatile PROC_HDR *) ProcGlobal)->startupBufferPinWaitBufId = bufid;
+}
+
+/*
+ * Called by backends that were asked to check for buffer pin waits.
+ */
+int
+GetStartupBufferPinWaitBufId(void)
+{
+	/*
+	 * Read through a volatile pointer to prevent code rearrangement.
+	 */
+	return ((volatile PROC_HDR *) ProcGlobal)->startupBufferPinWaitBufId;
+}
+
+/*
+ * Test whether at least N PGPROCs are available on the normal-backend free
+ * list.  On a true result *nfree is set to n; on a false result it is set to
+ * the number of free PGPROCs actually found.
+ *
+ * Note: entries are counted under ProcStructLock, so N is assumed small.
+ */
+bool
+HaveNFreeProcs(int n, int *nfree)
+{
+	dlist_iter	cursor;
+	int			seen = 0;
+
+	Assert(n > 0);
+	Assert(nfree);
+
+	/* Walk the free list, stopping as soon as n entries have been seen */
+	SpinLockAcquire(ProcStructLock);
+	dlist_foreach(cursor, &ProcGlobal->freeProcs)
+	{
+		if (++seen == n)
+			break;
+	}
+	SpinLockRelease(ProcStructLock);
+
+	*nfree = seen;
+
+	return (seen == n);
+}
+
+/*
+ * Report whether this process currently has a lock wait pending.
+ */
+bool
+IsWaitingForLock(void)
+{
+	/* lockAwaited points to a LOCALLOCK while we are waiting for a lock */
+	bool		waiting = (lockAwaited != NULL);
+
+	return waiting;
+}
+
+/*
+ * Cancel any pending wait for lock, when aborting a transaction, and revert
+ * any strong lock count acquisition for a lock being acquired.
+ *
+ * (Normally, this would only happen if we accept a cancel/die
+ * interrupt while waiting; but an ereport(ERROR) before or during the lock
+ * wait is within the realm of possibility, too.)
+ */
+void
+LockErrorCleanup(void)
+{
+	LWLock	   *partitionLock;
+	DisableTimeoutParams timeouts[2];
+
+	HOLD_INTERRUPTS();
+
+	AbortStrongLockAcquire();
+
+	/* Nothing more to do if we weren't waiting for a lock */
+	if (lockAwaited == NULL)
+	{
+		RESUME_INTERRUPTS();
+		return;
+	}
+
+	/*
+	 * Turn off the deadlock and lock timeout timers, if they are still
+	 * running (see ProcSleep).  Note we must preserve the LOCK_TIMEOUT
+	 * indicator flag, since this function is executed before
+	 * ProcessInterrupts when responding to SIGINT; else we'd lose the
+	 * knowledge that the SIGINT came from a lock timeout and not an external
+	 * source.
+	 */
+	timeouts[0].id = DEADLOCK_TIMEOUT;
+	timeouts[0].keep_indicator = false;
+	timeouts[1].id = LOCK_TIMEOUT;
+	timeouts[1].keep_indicator = true;
+	disable_timeouts(timeouts, 2);
+
+	/* Unlink myself from the wait queue, if on it (might not be anymore!) */
+	partitionLock = LockHashPartitionLock(lockAwaited->hashcode);
+	LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+	if (!dlist_node_is_detached(&MyProc->links))
+	{
+		/* We could not have been granted the lock yet */
+		RemoveFromWaitQueue(MyProc, lockAwaited->hashcode);
+	}
+	else
+	{
+		/*
+		 * Somebody kicked us off the lock queue already.  Perhaps they
+		 * granted us the lock, or perhaps they detected a deadlock. If they
+		 * did grant us the lock, we'd better remember it in our local lock
+		 * table.
+		 */
+		if (MyProc->waitStatus == PROC_WAIT_STATUS_OK)
+			GrantAwaitedLock();
+	}
+
+	lockAwaited = NULL;			/* no longer waiting, either way */
+
+	LWLockRelease(partitionLock);
+
+	RESUME_INTERRUPTS();
+}
+
+
+/*
+ * ProcReleaseLocks() -- drop the locks of the current top-level transaction
+ *			when it commits or aborts
+ *
+ * isCommit: true at commit, false at abort.
+ *
+ * Standard locks: at commit everything except session locks is released; at
+ * abort session locks are released as well.
+ *
+ * Advisory locks: only transaction-level ones are released; session-level
+ * holds survive both commit and abort.
+ *
+ * Subtransactions do not come through here.  Subtransaction commit releases
+ * nothing (the parent transaction inherits the locks), and subtransaction
+ * abort releases the subtransaction's locks one by one under control of the
+ * ResourceOwner mechanism.
+ *
+ * Does nothing when MyProc is not set.
+ */
+void
+ProcReleaseLocks(bool isCommit)
+{
+	bool		alsoSessionLocks = !isCommit;
+
+	if (MyProc == NULL)
+		return;
+
+	/* Leave any wait queue first (only expected to matter after an error) */
+	LockErrorCleanup();
+
+	/* Standard locks; session-level ones are included only on abort */
+	LockReleaseAll(DEFAULT_LOCKMETHOD, alsoSessionLocks);
+
+	/* Advisory locks: transaction-level ones only */
+	LockReleaseAll(USER_LOCKMETHOD, false);
+}
+
+
+/*
+ * RemoveProcFromArray() -- take this process's PGPROC out of the shared
+ *		ProcArray.
+ *
+ * Both 'code' and 'arg' are ignored.  MyProc must be set.
+ */
+static void
+RemoveProcFromArray(int code, Datum arg)
+{
+	PGPROC	   *self = MyProc;
+
+	Assert(self != NULL);
+	ProcArrayRemove(self, InvalidTransactionId);
+}
+
+/*
+ * ProcKill() -- Destroy the per-proc data structure for
+ *		this process. Release any of its held LW locks.
+ *
+ * Neither 'code' nor 'arg' is examined.  MyProc must be set on entry and is
+ * NULL on return.  Unless this process is a lock group leader that still has
+ * members, its PGPROC has been put back on the list that
+ * proc->procgloballist points to by the time we return.
+ */
+static void
+ProcKill(int code, Datum arg)
+{
+	PGPROC	   *proc;
+	dlist_head *procgloballist;
+
+	Assert(MyProc != NULL);
+
+	/*
+	 * not safe if forked by system(), etc.: the pid recorded in MyProc would
+	 * not be ours, and we must not tear down somebody else's PGPROC
+	 */
+	if (MyProc->pid != (int) getpid())
+		elog(PANIC, "ProcKill() called in child process");
+
+	/* Make sure we're out of the sync rep lists */
+	SyncRepCleanupAtProcExit();
+
+#ifdef USE_ASSERT_CHECKING
+	{
+		int			i;
+
+		/* Last process should have released all locks. */
+		for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+			Assert(dlist_is_empty(&(MyProc->myProcLocks[i])));
+	}
+#endif
+
+	/*
+	 * Release any LW locks I am holding.  There really shouldn't be any, but
+	 * it's cheap to check again before we cut the knees off the LWLock
+	 * facility by releasing our PGPROC ...
+	 */
+	LWLockReleaseAll();
+
+	/* Cancel any pending condition variable sleep, too */
+	ConditionVariableCancelSleep();
+
+	/*
+	 * Detach from any lock group of which we are a member.  If the leader
+	 * exits before all other group members, its PGPROC will remain allocated
+	 * until the last group process exits; that process must return the
+	 * leader's PGPROC to the appropriate list.
+	 *
+	 * Three outcomes are possible below: we were the last member (the
+	 * leader's lockGroupLeader is cleared, and its PGPROC is freed here if
+	 * the leader is some other process); we are a non-leader with members
+	 * remaining (only our own lockGroupLeader is cleared); or we are the
+	 * leader with members remaining (our lockGroupLeader stays set, which is
+	 * what the check further down keys on).
+	 */
+	if (MyProc->lockGroupLeader != NULL)
+	{
+		PGPROC	   *leader = MyProc->lockGroupLeader;
+		LWLock	   *leader_lwlock = LockHashPartitionLockByProc(leader);
+
+		LWLockAcquire(leader_lwlock, LW_EXCLUSIVE);
+		Assert(!dlist_is_empty(&leader->lockGroupMembers));
+		dlist_delete(&MyProc->lockGroupLink);
+		if (dlist_is_empty(&leader->lockGroupMembers))
+		{
+			leader->lockGroupLeader = NULL;
+			if (leader != MyProc)
+			{
+				procgloballist = leader->procgloballist;
+
+				/*
+				 * Leader exited first; return its PGPROC (it goes on the
+				 * head of the leader's own list).
+				 */
+				SpinLockAcquire(ProcStructLock);
+				dlist_push_head(procgloballist, &leader->links);
+				SpinLockRelease(ProcStructLock);
+			}
+		}
+		else if (leader != MyProc)
+			MyProc->lockGroupLeader = NULL;
+		LWLockRelease(leader_lwlock);
+	}
+
+	/*
+	 * Reset MyLatch to the process local one.  This is so that signal
+	 * handlers et al can continue using the latch after the shared latch
+	 * isn't ours anymore.
+	 *
+	 * Similarly, stop reporting wait events to MyProc->wait_event_info.
+	 *
+	 * After that clear MyProc and disown the shared latch.
+	 */
+	SwitchBackToLocalLatch();
+	pgstat_reset_wait_event_storage();
+
+	proc = MyProc;
+	MyProc = NULL;
+	DisownLatch(&proc->procLatch);
+
+	procgloballist = proc->procgloballist;
+	SpinLockAcquire(ProcStructLock);
+
+	/*
+	 * If we're still a member of a locking group, that means we're a leader
+	 * which has somehow exited before its children.  The last remaining child
+	 * will release our PGPROC.  Otherwise, release it now.
+	 */
+	if (proc->lockGroupLeader == NULL)
+	{
+		/* Since lockGroupLeader is NULL, lockGroupMembers should be empty. */
+		Assert(dlist_is_empty(&proc->lockGroupMembers));
+
+		/*
+		 * Return PGPROC structure (and semaphore) to appropriate freelist;
+		 * our own entry goes on the tail of that list
+		 */
+		dlist_push_tail(procgloballist, &proc->links);
+	}
+
+	/* Update shared estimate of spins_per_delay */
+	ProcGlobal->spins_per_delay = update_spins_per_delay(ProcGlobal->spins_per_delay);
+
+	SpinLockRelease(ProcStructLock);
+
+	/*
+	 * This process is no longer present in shared memory in any meaningful
+	 * way, so tell the postmaster we've cleaned up acceptably well. (XXX
+	 * autovac launcher should be included here someday)
+	 */
+	if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess())
+		MarkPostmasterChildInactive();
+
+	/*
+	 * wake autovac launcher if needed -- see comments in FreeWorkerInfo.
+	 * The result of kill() is deliberately not examined here.
+	 */
+	if (AutovacuumLauncherPid != 0)
+		kill(AutovacuumLauncherPid, SIGUSR2);
+}
+
+/*
+ * AuxiliaryProcKill() -- slimmed-down ProcKill for auxiliary processes
+ *		(bgwriter, etc).
+ *
+ * 'arg' carries the index of this process's entry in AuxiliaryProcs; 'code'
+ * is ignored.  The PGPROC and its semaphore are not released, the entry is
+ * merely marked unused by zeroing its pid.
+ */
+static void
+AuxiliaryProcKill(int code, Datum arg)
+{
+	int			slot = DatumGetInt32(arg);
+	PGPROC	   *expected PG_USED_FOR_ASSERTS_ONLY;
+	PGPROC	   *self;
+
+	Assert(slot >= 0 && slot < NUM_AUXILIARY_PROCS);
+
+	/* Refuse to run in a child forked by system() or the like */
+	if ((int) getpid() != MyProc->pid)
+		elog(PANIC, "AuxiliaryProcKill() called in child process");
+
+	/* The slot named by the caller had better be the one we occupy */
+	expected = &AuxiliaryProcs[slot];
+	Assert(expected == MyProc);
+
+	/* Drop any LW locks still held, and any condition variable sleep */
+	LWLockReleaseAll();
+	ConditionVariableCancelSleep();
+
+	/* Same latch / wait-event hand-back sequence as in ProcKill() */
+	SwitchBackToLocalLatch();
+	pgstat_reset_wait_event_storage();
+
+	self = MyProc;
+	MyProc = NULL;
+	DisownLatch(&self->procLatch);
+
+	SpinLockAcquire(ProcStructLock);
+
+	/* A zero pid marks the auxiliary entry as free */
+	self->pid = 0;
+
+	/* Fold our spins_per_delay experience into the shared estimate */
+	ProcGlobal->spins_per_delay =
+		update_spins_per_delay(ProcGlobal->spins_per_delay);
+
+	SpinLockRelease(ProcStructLock);
+}
+
+/*
+ * AuxiliaryPidGetProc -- look up the PGPROC of an auxiliary process by PID
+ *
+ * Scans the AuxiliaryProcs array and returns the first entry whose pid
+ * equals 'pid', or NULL when there is none.  A 'pid' of zero never matches,
+ * so that dummy PGPROCs are not returned.
+ */
+PGPROC *
+AuxiliaryPidGetProc(int pid)
+{
+	int			slot;
+
+	/* zero is what unused entries carry; never report one of those */
+	if (pid == 0)
+		return NULL;
+
+	for (slot = 0; slot < NUM_AUXILIARY_PROCS; slot++)
+	{
+		PGPROC	   *candidate = &AuxiliaryProcs[slot];
+
+		if (candidate->pid == pid)
+			return candidate;
+	}
+
+	return NULL;
+}
+
+
+/*
+ * ProcSleep -- put a process to sleep on the specified lock
+ *
+ * locallock supplies the requested mode, the shared LOCK and PROCLOCK
+ * objects, and the lock tag's hash code.  lockMethodTable supplies the
+ * conflict table (conflictTab) used to decide where to queue.
+ *
+ * Caller must have set MyProc->heldLocks to reflect locks already held
+ * on the lockable object by this process (under all XIDs).
+ *
+ * The lock table's partition lock must be held at entry, and will be held
+ * at exit.
+ *
+ * Result: PROC_WAIT_STATUS_OK if we acquired the lock, PROC_WAIT_STATUS_ERROR if not (deadlock).
+ *
+ * There are two ways to return without ever sleeping: the lock can be
+ * granted on the spot while choosing a queue position, or a deadlock
+ * against an earlier waiter can be recognized at that same point.
+ *
+ * ASSUME: that no one will fiddle with the queue until after
+ *		we release the partition lock.
+ *
+ * NOTES: The process queue is now a priority queue for locking.
+ */
+ProcWaitStatus
+ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
+{
+	LOCKMODE	lockmode = locallock->tag.mode;
+	LOCK	   *lock = locallock->lock;
+	PROCLOCK   *proclock = locallock->proclock;
+	uint32		hashcode = locallock->hashcode;
+	LWLock	   *partitionLock = LockHashPartitionLock(hashcode);
+	dclist_head *waitQueue = &lock->waitProcs;
+	PGPROC	   *insert_before = NULL;
+	LOCKMASK	myHeldLocks = MyProc->heldLocks;
+	TimestampTz standbyWaitStart = 0;
+	bool		early_deadlock = false;
+	bool		allow_autovacuum_cancel = true;
+	bool		logged_recovery_conflict = false;
+	ProcWaitStatus myWaitStatus;
+	PGPROC	   *leader = MyProc->lockGroupLeader;
+
+	/*
+	 * If group locking is in use, locks held by members of my locking group
+	 * need to be included in myHeldLocks.  This is not required for relation
+	 * extension lock which conflict among group members. However, including
+	 * them in myHeldLocks will give group members the priority to get those
+	 * locks as compared to other backends which are also trying to acquire
+	 * those locks.  OTOH, we can avoid giving priority to group members for
+	 * that kind of locks, but there doesn't appear to be a clear advantage of
+	 * the same.
+	 */
+	if (leader != NULL)
+	{
+		dlist_iter	iter;
+
+		dlist_foreach(iter, &lock->procLocks)
+		{
+			PROCLOCK   *otherproclock;
+
+			otherproclock = dlist_container(PROCLOCK, lockLink, iter.cur);
+
+			if (otherproclock->groupLeader == leader)
+				myHeldLocks |= otherproclock->holdMask;
+		}
+	}
+
+	/*
+	 * Determine where to add myself in the wait queue.
+	 *
+	 * Normally I should go at the end of the queue.  However, if I already
+	 * hold locks that conflict with the request of any previous waiter, put
+	 * myself in the queue just in front of the first such waiter. This is not
+	 * a necessary step, since deadlock detection would move me to before that
+	 * waiter anyway; but it's relatively cheap to detect such a conflict
+	 * immediately, and avoid delaying till deadlock timeout.
+	 *
+	 * Special case: if I find I should go in front of some waiter, check to
+	 * see if I conflict with already-held locks or the requests before that
+	 * waiter.  If not, then just grant myself the requested lock immediately.
+	 * This is the same as the test for immediate grant in LockAcquire, except
+	 * we are only considering the part of the wait queue before my insertion
+	 * point.
+	 */
+	if (myHeldLocks != 0 && !dclist_is_empty(waitQueue))
+	{
+		LOCKMASK	aheadRequests = 0;
+		dlist_iter	iter;
+
+		dclist_foreach(iter, waitQueue)
+		{
+			PGPROC	   *proc = dlist_container(PGPROC, links, iter.cur);
+
+			/*
+			 * If we're part of the same locking group as this waiter, its
+			 * locks neither conflict with ours nor contribute to
+			 * aheadRequests.
+			 */
+			if (leader != NULL && leader == proc->lockGroupLeader)
+				continue;
+
+			/* Must he wait for me? */
+			if (lockMethodTable->conflictTab[proc->waitLockMode] & myHeldLocks)
+			{
+				/* Must I wait for him ? */
+				if (lockMethodTable->conflictTab[lockmode] & proc->heldLocks)
+				{
+					/*
+					 * Yes, so we have a deadlock.  Easiest way to clean up
+					 * correctly is to call RemoveFromWaitQueue(), but we
+					 * can't do that until we are *on* the wait queue. So, set
+					 * a flag to check below, and break out of loop.  Also,
+					 * record deadlock info for later message.
+					 */
+					RememberSimpleDeadLock(MyProc, lockmode, lock, proc);
+					early_deadlock = true;
+					break;
+				}
+				/* I must go before this waiter.  Check special case. */
+				if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 &&
+					!LockCheckConflicts(lockMethodTable, lockmode, lock,
+										proclock))
+				{
+					/* Skip the wait and just grant myself the lock. */
+					GrantLock(lock, proclock, lockmode);
+					GrantAwaitedLock();
+					return PROC_WAIT_STATUS_OK;
+				}
+
+				/* Put myself into wait queue before conflicting process */
+				insert_before = proc;
+				break;
+			}
+			/* Nope, so advance to next waiter */
+			aheadRequests |= LOCKBIT_ON(proc->waitLockMode);
+		}
+	}
+
+	/*
+	 * Insert self into queue, at the position determined above.
+	 */
+	if (insert_before)
+		dclist_insert_before(waitQueue, &insert_before->links, &MyProc->links);
+	else
+		dclist_push_tail(waitQueue, &MyProc->links);
+
+	lock->waitMask |= LOCKBIT_ON(lockmode);
+
+	/* Set up wait information in PGPROC object, too */
+	MyProc->waitLock = lock;
+	MyProc->waitProcLock = proclock;
+	MyProc->waitLockMode = lockmode;
+
+	MyProc->waitStatus = PROC_WAIT_STATUS_WAITING;
+
+	/*
+	 * If we detected deadlock, give up without waiting.  This must agree with
+	 * CheckDeadLock's recovery code.
+	 */
+	if (early_deadlock)
+	{
+		RemoveFromWaitQueue(MyProc, hashcode);
+		return PROC_WAIT_STATUS_ERROR;
+	}
+
+	/* mark that we are waiting for a lock */
+	lockAwaited = locallock;
+
+	/*
+	 * Release the lock table's partition lock.
+	 *
+	 * NOTE: this may also cause us to exit critical-section state, possibly
+	 * allowing a cancel/die interrupt to be accepted. This is OK because we
+	 * have recorded the fact that we are waiting for a lock, and so
+	 * LockErrorCleanup will clean up if cancel/die happens.
+	 */
+	LWLockRelease(partitionLock);
+
+	/*
+	 * Also, now that we will successfully clean up after an ereport, it's
+	 * safe to check to see if there's a buffer pin deadlock against the
+	 * Startup process.  Of course, that's only necessary if we're doing Hot
+	 * Standby and are not the Startup process ourselves.
+	 */
+	if (RecoveryInProgress() && !InRecovery)
+		CheckRecoveryConflictDeadlock();
+
+	/* Reset deadlock_state before enabling the timeout handler */
+	deadlock_state = DS_NOT_YET_CHECKED;
+	got_deadlock_timeout = false;
+
+	/*
+	 * Set timer so we can wake up after awhile and check for a deadlock. If a
+	 * deadlock is detected, the handler sets MyProc->waitStatus =
+	 * PROC_WAIT_STATUS_ERROR, allowing us to know that we must report failure
+	 * rather than success.
+	 *
+	 * By delaying the check until we've waited for a bit, we can avoid
+	 * running the rather expensive deadlock-check code in most cases.
+	 *
+	 * If LockTimeout is set, also enable the timeout for that.  We can save a
+	 * few cycles by enabling both timeout sources in one call.
+	 *
+	 * If InHotStandby we set lock waits slightly later for clarity with other
+	 * code.
+	 */
+	if (!InHotStandby)
+	{
+		if (LockTimeout > 0)
+		{
+			EnableTimeoutParams timeouts[2];
+
+			timeouts[0].id = DEADLOCK_TIMEOUT;
+			timeouts[0].type = TMPARAM_AFTER;
+			timeouts[0].delay_ms = DeadlockTimeout;
+			timeouts[1].id = LOCK_TIMEOUT;
+			timeouts[1].type = TMPARAM_AFTER;
+			timeouts[1].delay_ms = LockTimeout;
+			enable_timeouts(timeouts, 2);
+		}
+		else
+			enable_timeout_after(DEADLOCK_TIMEOUT, DeadlockTimeout);
+
+		/*
+		 * Use the current time obtained for the deadlock timeout timer as
+		 * waitStart (i.e., the time when this process started waiting for the
+		 * lock). Since getting the current time newly can cause overhead, we
+		 * reuse the already-obtained time to avoid that overhead.
+		 *
+		 * Note that waitStart is updated without holding the lock table's
+		 * partition lock, to avoid the overhead by additional lock
+		 * acquisition. This can cause "waitstart" in pg_locks to become NULL
+		 * for a very short period of time after the wait started even though
+		 * "granted" is false. This is OK in practice because we can assume
+		 * that users are likely to look at "waitstart" when waiting for the
+		 * lock for a long time.
+		 */
+		pg_atomic_write_u64(&MyProc->waitStart,
+							get_timeout_start_time(DEADLOCK_TIMEOUT));
+	}
+	else if (log_recovery_conflict_waits)
+	{
+		/*
+		 * Set the wait start timestamp if logging is enabled and in hot
+		 * standby.
+		 */
+		standbyWaitStart = GetCurrentTimestamp();
+	}
+
+	/*
+	 * If somebody wakes us between LWLockRelease and WaitLatch, the latch
+	 * will not wait. But a set latch does not necessarily mean that the lock
+	 * is free now, as there are many other sources for latch sets than
+	 * somebody releasing the lock.
+	 *
+	 * We process interrupts whenever the latch has been set, so cancel/die
+	 * interrupts are processed quickly. This means we must not mind losing
+	 * control to a cancel/die interrupt here.  We don't, because we have no
+	 * shared-state-change work to do after being granted the lock (the
+	 * grantor did it all).  We do have to worry about canceling the deadlock
+	 * timeout and updating the locallock table, but if we lose control to an
+	 * error, LockErrorCleanup will fix that up.
+	 */
+	do
+	{
+		if (InHotStandby)
+		{
+			bool		maybe_log_conflict =
+				(standbyWaitStart != 0 && !logged_recovery_conflict);
+
+			/* Set a timer and wait for that or for the lock to be granted */
+			ResolveRecoveryConflictWithLock(locallock->tag.lock,
+											maybe_log_conflict);
+
+			/*
+			 * Emit the log message if the startup process is waiting longer
+			 * than deadlock_timeout for recovery conflict on lock.
+			 */
+			if (maybe_log_conflict)
+			{
+				TimestampTz now = GetCurrentTimestamp();
+
+				if (TimestampDifferenceExceeds(standbyWaitStart, now,
+											   DeadlockTimeout))
+				{
+					VirtualTransactionId *vxids;
+					int			cnt;
+
+					vxids = GetLockConflicts(&locallock->tag.lock,
+											 AccessExclusiveLock, &cnt);
+
+					/*
+					 * Log the recovery conflict and the list of PIDs of
+					 * backends holding the conflicting lock. Note that we do
+					 * logging even if there are no such backends right now
+					 * because the startup process here has already waited
+					 * longer than deadlock_timeout.
+					 */
+					LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK,
+										standbyWaitStart, now,
+										cnt > 0 ? vxids : NULL, true);
+					logged_recovery_conflict = true;
+				}
+			}
+		}
+		else
+		{
+			(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+							 PG_WAIT_LOCK | locallock->tag.lock.locktag_type);
+			ResetLatch(MyLatch);
+			/* check for deadlocks first, as that's probably log-worthy */
+			if (got_deadlock_timeout)
+			{
+				CheckDeadLock();
+				got_deadlock_timeout = false;
+			}
+			CHECK_FOR_INTERRUPTS();
+		}
+
+		/*
+		 * waitStatus could change from PROC_WAIT_STATUS_WAITING to something
+		 * else asynchronously.  Read it just once per loop to prevent
+		 * surprising behavior (such as missing log messages).
+		 */
+		myWaitStatus = *((volatile ProcWaitStatus *) &MyProc->waitStatus);
+
+		/*
+		 * If we are not deadlocked, but are waiting on an autovacuum-induced
+		 * task, send a signal to interrupt it.
+		 */
+		if (deadlock_state == DS_BLOCKED_BY_AUTOVACUUM && allow_autovacuum_cancel)
+		{
+			PGPROC	   *autovac = GetBlockingAutoVacuumPgproc();
+			uint8		statusFlags;
+			uint8		lockmethod_copy;
+			LOCKTAG		locktag_copy;
+
+			/*
+			 * Grab info we need, then release lock immediately.  Note this
+			 * coding means that there is a tiny chance that the process
+			 * terminates its current transaction and starts a different one
+			 * before we have a chance to send the signal; the worst possible
+			 * consequence is that a for-wraparound vacuum is cancelled.  But
+			 * that could happen in any case unless we were to do kill() with
+			 * the lock held, which is much more undesirable.
+			 */
+			LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+			statusFlags = ProcGlobal->statusFlags[autovac->pgxactoff];
+			lockmethod_copy = lock->tag.locktag_lockmethodid;
+			locktag_copy = lock->tag;
+			LWLockRelease(ProcArrayLock);
+
+			/*
+			 * Only do it if the worker is not working to protect against Xid
+			 * wraparound.
+			 */
+			if ((statusFlags & PROC_IS_AUTOVACUUM) &&
+				!(statusFlags & PROC_VACUUM_FOR_WRAPAROUND))
+			{
+				int			pid = autovac->pid;
+
+				/* report the case, if configured to do so */
+				if (message_level_is_interesting(DEBUG1))
+				{
+					StringInfoData locktagbuf;
+					StringInfoData logbuf;	/* errdetail for server log */
+
+					initStringInfo(&locktagbuf);
+					initStringInfo(&logbuf);
+					DescribeLockTag(&locktagbuf, &locktag_copy);
+					appendStringInfo(&logbuf,
+									 "Process %d waits for %s on %s.",
+									 MyProcPid,
+									 GetLockmodeName(lockmethod_copy, lockmode),
+									 locktagbuf.data);
+
+					ereport(DEBUG1,
+							(errmsg_internal("sending cancel to blocking autovacuum PID %d",
+											 pid),
+							 errdetail_log("%s", logbuf.data)));
+
+					pfree(locktagbuf.data);
+					pfree(logbuf.data);
+				}
+
+				/* send the autovacuum worker Back to Old Kent Road */
+				if (kill(pid, SIGINT) < 0)
+				{
+					/*
+					 * There's a race condition here: once we release the
+					 * ProcArrayLock, it's possible for the autovac worker to
+					 * close up shop and exit before we can do the kill().
+					 * Therefore, we do not whinge about no-such-process.
+					 * Other errors such as EPERM could conceivably happen if
+					 * the kernel recycles the PID fast enough, but such cases
+					 * seem improbable enough that it's probably best to issue
+					 * a warning if we see some other errno.
+					 */
+					if (errno != ESRCH)
+						ereport(WARNING,
+								(errmsg("could not send signal to process %d: %m",
+										pid)));
+				}
+			}
+
+			/* make sure this branch is entered at most once per ProcSleep call */
+			allow_autovacuum_cancel = false;
+		}
+
+		/*
+		 * If awoken after the deadlock check interrupt has run, and
+		 * log_lock_waits is on, then report about the wait.
+		 */
+		if (log_lock_waits && deadlock_state != DS_NOT_YET_CHECKED)
+		{
+			StringInfoData buf,
+						lock_waiters_sbuf,
+						lock_holders_sbuf;
+			const char *modename;
+			long		secs;
+			int			usecs;
+			long		msecs;
+			dlist_iter	proc_iter;
+			PROCLOCK   *curproclock;
+			bool		first_holder = true,
+						first_waiter = true;
+			int			lockHoldersNum = 0;
+
+			initStringInfo(&buf);
+			initStringInfo(&lock_waiters_sbuf);
+			initStringInfo(&lock_holders_sbuf);
+
+			DescribeLockTag(&buf, &locallock->tag.lock);
+			modename = GetLockmodeName(locallock->tag.lock.locktag_lockmethodid,
+									   lockmode);
+			TimestampDifference(get_timeout_start_time(DEADLOCK_TIMEOUT),
+								GetCurrentTimestamp(),
+								&secs, &usecs);
+			msecs = secs * 1000 + usecs / 1000;
+			usecs = usecs % 1000;
+
+			/*
+			 * we loop over the lock's procLocks to gather a list of all
+			 * holders and waiters. Thus we will be able to provide more
+			 * detailed information for lock debugging purposes.
+			 *
+			 * lock->procLocks contains all processes which hold or wait for
+			 * this lock.
+			 */
+
+			LWLockAcquire(partitionLock, LW_SHARED);
+
+			dlist_foreach(proc_iter, &lock->procLocks)
+			{
+				curproclock =
+					dlist_container(PROCLOCK, lockLink, proc_iter.cur);
+
+				/*
+				 * we are a waiter if myProc->waitProcLock == curproclock; we
+				 * are a holder if it is NULL or something different
+				 */
+				if (curproclock->tag.myProc->waitProcLock == curproclock)
+				{
+					if (first_waiter)
+					{
+						appendStringInfo(&lock_waiters_sbuf, "%d",
+										 curproclock->tag.myProc->pid);
+						first_waiter = false;
+					}
+					else
+						appendStringInfo(&lock_waiters_sbuf, ", %d",
+										 curproclock->tag.myProc->pid);
+				}
+				else
+				{
+					if (first_holder)
+					{
+						appendStringInfo(&lock_holders_sbuf, "%d",
+										 curproclock->tag.myProc->pid);
+						first_holder = false;
+					}
+					else
+						appendStringInfo(&lock_holders_sbuf, ", %d",
+										 curproclock->tag.myProc->pid);
+
+					lockHoldersNum++;
+				}
+			}
+
+			LWLockRelease(partitionLock);
+
+			if (deadlock_state == DS_SOFT_DEADLOCK)
+				ereport(LOG,
+						(errmsg("process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms",
+								MyProcPid, modename, buf.data, msecs, usecs),
+						 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+											   "Processes holding the lock: %s. Wait queue: %s.",
+											   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+			else if (deadlock_state == DS_HARD_DEADLOCK)
+			{
+				/*
+				 * This message is a bit redundant with the error that will be
+				 * reported subsequently, but in some cases the error report
+				 * might not make it to the log (eg, if it's caught by an
+				 * exception handler), and we want to ensure all long-wait
+				 * events get logged.
+				 */
+				ereport(LOG,
+						(errmsg("process %d detected deadlock while waiting for %s on %s after %ld.%03d ms",
+								MyProcPid, modename, buf.data, msecs, usecs),
+						 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+											   "Processes holding the lock: %s. Wait queue: %s.",
+											   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+			}
+
+			if (myWaitStatus == PROC_WAIT_STATUS_WAITING)
+				ereport(LOG,
+						(errmsg("process %d still waiting for %s on %s after %ld.%03d ms",
+								MyProcPid, modename, buf.data, msecs, usecs),
+						 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+											   "Processes holding the lock: %s. Wait queue: %s.",
+											   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+			else if (myWaitStatus == PROC_WAIT_STATUS_OK)
+				ereport(LOG,
+						(errmsg("process %d acquired %s on %s after %ld.%03d ms",
+								MyProcPid, modename, buf.data, msecs, usecs)));
+			else
+			{
+				Assert(myWaitStatus == PROC_WAIT_STATUS_ERROR);
+
+				/*
+				 * Currently, the deadlock checker always kicks its own
+				 * process, which means that we'll only see
+				 * PROC_WAIT_STATUS_ERROR when deadlock_state ==
+				 * DS_HARD_DEADLOCK, and there's no need to print redundant
+				 * messages.  But for completeness and future-proofing, print
+				 * a message if it looks like someone else kicked us off the
+				 * lock.
+				 */
+				if (deadlock_state != DS_HARD_DEADLOCK)
+					ereport(LOG,
+							(errmsg("process %d failed to acquire %s on %s after %ld.%03d ms",
+									MyProcPid, modename, buf.data, msecs, usecs),
+							 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+												   "Processes holding the lock: %s. Wait queue: %s.",
+												   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+			}
+
+			/*
+			 * At this point we might still need to wait for the lock. Reset
+			 * state so we don't print the above messages again.
+			 */
+			deadlock_state = DS_NO_DEADLOCK;
+
+			pfree(buf.data);
+			pfree(lock_holders_sbuf.data);
+			pfree(lock_waiters_sbuf.data);
+		}
+	} while (myWaitStatus == PROC_WAIT_STATUS_WAITING);
+
+	/*
+	 * Disable the timers, if they are still running.  As in LockErrorCleanup,
+	 * we must preserve the LOCK_TIMEOUT indicator flag: if a lock timeout has
+	 * already caused QueryCancelPending to become set, we want the cancel to
+	 * be reported as a lock timeout, not a user cancel.
+	 */
+	if (!InHotStandby)
+	{
+		if (LockTimeout > 0)
+		{
+			DisableTimeoutParams timeouts[2];
+
+			timeouts[0].id = DEADLOCK_TIMEOUT;
+			timeouts[0].keep_indicator = false;
+			timeouts[1].id = LOCK_TIMEOUT;
+			timeouts[1].keep_indicator = true;
+			disable_timeouts(timeouts, 2);
+		}
+		else
+			disable_timeout(DEADLOCK_TIMEOUT, false);
+	}
+
+	/*
+	 * Emit the log message if recovery conflict on lock was resolved but the
+	 * startup process waited longer than deadlock_timeout for it.
+	 */
+	if (InHotStandby && logged_recovery_conflict)
+		LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK,
+							standbyWaitStart, GetCurrentTimestamp(),
+							NULL, false);
+
+	/*
+	 * Re-acquire the lock table's partition lock.  We have to do this to hold
+	 * off cancel/die interrupts before we can mess with lockAwaited (else we
+	 * might have a missed or duplicated locallock update).
+	 */
+	LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+	/*
+	 * We no longer want LockErrorCleanup to do anything.
+	 */
+	lockAwaited = NULL;
+
+	/*
+	 * If we got the lock, be sure to remember it in the locallock table.
+	 */
+	if (MyProc->waitStatus == PROC_WAIT_STATUS_OK)
+		GrantAwaitedLock();
+
+	/*
+	 * We don't have to do anything else, because the awaker did all the
+	 * necessary update of the lock table and MyProc.
+	 */
+	return MyProc->waitStatus;
+}
+
+
+/*
+ * ProcWakeup -- wake up a process by setting its latch.
+ *
+ *	 Also remove the process from the wait queue and set its links invalid.
+ *
+ * proc: the waiting process to release.  If its links node is already
+ *		detached (it is on no wait queue), nothing at all is done.
+ * waitStatus: the value stored into proc->waitStatus before its latch is set.
+ *
+ * The appropriate lock partition lock must be held by caller.
+ *
+ * XXX: presently, this code is only used for the "success" case, and only
+ * works correctly for that case.  To clean up in failure case, would need
+ * to twiddle the lock's request counts too --- see RemoveFromWaitQueue.
+ * Hence, in practice the waitStatus parameter must be PROC_WAIT_STATUS_OK.
+ */
+void
+ProcWakeup(PGPROC *proc, ProcWaitStatus waitStatus)
+{
+	if (dlist_node_is_detached(&proc->links))
+		return;
+
+	Assert(proc->waitStatus == PROC_WAIT_STATUS_WAITING);
+
+	/* Remove process from wait queue */
+	dclist_delete_from_thoroughly(&proc->waitLock->waitProcs, &proc->links);
+
+	/* Clean up process' state and pass it the ok/fail signal */
+	proc->waitLock = NULL;
+	proc->waitProcLock = NULL;
+	proc->waitStatus = waitStatus;
+
+	/*
+	 * The wait that has just ended is proc's, so it is proc's wait-start
+	 * timestamp that must be cleared.  MyProc is merely the process doing
+	 * the waking here; zeroing its timestamp instead would leave a stale
+	 * value behind in proc.
+	 */
+	pg_atomic_write_u64(&proc->waitStart, 0);
+
+	/* And awaken it */
+	SetLatch(&proc->procLatch);
+}
+
+/*
+ * ProcLockWakeup -- after a lock release (or after an earlier waiter gave
+ *		up), walk the lock's wait queue front to back and wake every waiter
+ *		that is no longer blocked.
+ *
+ * lockMethodTable supplies the conflict table; lock is the shared LOCK whose
+ * waitProcs queue is scanned.
+ *
+ * The appropriate lock partition lock must be held by caller.
+ */
+void
+ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock)
+{
+	dclist_head *waiters = &lock->waitProcs;
+	LOCKMASK	blockedModes = 0;	/* modes wanted by waiters left asleep */
+	dlist_mutable_iter cursor;
+
+	if (dclist_is_empty(waiters))
+		return;
+
+	dclist_foreach_modify(cursor, waiters)
+	{
+		PGPROC	   *waiter = dlist_container(PGPROC, links, cursor.cur);
+		LOCKMODE	wanted = waiter->waitLockMode;
+		bool		blocked;
+
+		/*
+		 * A waiter stays asleep if its request conflicts with the request of
+		 * a waiter ahead of it that we left asleep, or with locks that are
+		 * already held.
+		 */
+		blocked = (lockMethodTable->conflictTab[wanted] & blockedModes) != 0 ||
+			LockCheckConflicts(lockMethodTable, wanted, lock,
+							   waiter->waitProcLock);
+
+		if (blocked)
+		{
+			/* Remember the mode so that later waiters are tested against it */
+			blockedModes |= LOCKBIT_ON(wanted);
+		}
+		else
+		{
+			GrantLock(lock, waiter->waitProcLock, wanted);
+			/* this also unlinks the waiter from the lock's wait queue */
+			ProcWakeup(waiter, PROC_WAIT_STATUS_OK);
+		}
+	}
+}
+
+/*
+ * CheckDeadLock
+ *
+ * Called once DEADLOCK_TIMEOUT has fired while we were waiting for some
+ * other process to release a lock.  If no deadlock is found we simply return
+ * (deadlock_state is still set, which lets ProcSleep log the wait when
+ * log_lock_waits is true).  If there is a real deadlock, we take ourselves
+ * off the lock's wait queue, which signals an error to ProcSleep.
+ */
+static void
+CheckDeadLock(void)
+{
+	int			partno;
+
+	/*
+	 * Lock the whole shared lock table exclusively.  The partition LWLocks
+	 * are taken in increasing partition number to avoid LWLock deadlock.
+	 *
+	 * The deadlock check interrupt must therefore never be enabled at a
+	 * point where this process already holds a lock partition lock, or we
+	 * would wait forever.  Note also that LWLockAcquire creates a critical
+	 * section, so cancel/die interrupts cannot interrupt this routine.
+	 */
+	for (partno = 0; partno < NUM_LOCK_PARTITIONS; partno++)
+		LWLockAcquire(LockHashPartitionLockByIndex(partno), LW_EXCLUSIVE);
+
+	/*
+	 * Somebody may have woken us in the meantime; the process releasing the
+	 * lock grants it to us before waking us, so in that case there is
+	 * nothing left to wait for and nothing to check.  We detect it by
+	 * looking at whether we are still linked into the wait queue, which is
+	 * safe because we hold the lock partition lock.
+	 */
+	if (MyProc->links.prev != NULL &&
+		MyProc->links.next != NULL)
+	{
+#ifdef LOCK_DEBUG
+		if (Debug_deadlocks)
+			DumpAllLocks();
+#endif
+
+		/* Run the check and leave the verdict where ProcSleep will see it */
+		deadlock_state = DeadLockCheck(MyProc);
+
+		if (deadlock_state == DS_HARD_DEADLOCK)
+		{
+			/*
+			 * A genuine deadlock: pull this process out of its wait state.
+			 * (Relying on lockAwaited would be cheaper, but this coding
+			 * keeps open the option of killing a transaction other than the
+			 * one that detected the deadlock.)
+			 *
+			 * RemoveFromWaitQueue sets MyProc->waitStatus to
+			 * PROC_WAIT_STATUS_ERROR, so ProcSleep will report an error
+			 * after we return from the signal handler.
+			 *
+			 * The transaction abort caused by that error releases our other
+			 * locks and thereby wakes whoever waits on them, so that needs
+			 * no attention here.  Processes that were queued behind us on
+			 * the lock we just failed to get are the exception, but
+			 * RemoveFromWaitQueue has already taken care of waking any of
+			 * those that became wakable.
+			 */
+			Assert(MyProc->waitLock != NULL);
+			RemoveFromWaitQueue(MyProc, LockTagHashCode(&(MyProc->waitLock->tag)));
+		}
+	}
+
+	/*
+	 * Let go of the partition locks again, highest partition number first.
+	 * Two reasons for the reverse order: (1) anyone else needing several of
+	 * these locks takes them in increasing order, and we don't want to
+	 * release such a process until it can get all the locks it needs; (2)
+	 * it avoids O(N^2) behavior inside LWLockRelease.
+	 */
+	for (partno = NUM_LOCK_PARTITIONS - 1; partno >= 0; partno--)
+		LWLockRelease(LockHashPartitionLockByIndex(partno));
+}
+
+/*
+ * CheckDeadLockAlert - Handle the expiry of deadlock_timeout.
+ *
+ * NB: Runs inside a signal handler, so it only sets a flag and the latch.
+ */
+void
+CheckDeadLockAlert(void)
+{
+	int			save_errno = errno; /* restored just before returning */
+
+	got_deadlock_timeout = true;
+
+	/*
+	 * Have to set the latch again, even if handle_sig_alarm already did. Back
+	 * then got_deadlock_timeout wasn't yet set... It's unlikely that this
+	 * ever would be a problem, but setting a set latch again is cheap.
+	 *
+	 * Note that, when this function runs inside procsignal_sigusr1_handler(),
+	 * the handler function sets the latch again after the latch is set here.
+	 */
+	SetLatch(MyLatch);
+	errno = save_errno;
+}
+
+/*
+ * ProcWaitForSignal - block on MyLatch until it is set, then reset it.
+ *
+ * wait_event_info is passed through to WaitLatch.  As this uses the generic
+ * process latch the caller has to be robust against unrelated wakeups: always
+ * check that the desired state has occurred, and wait again if not.
+ */
+void
+ProcWaitForSignal(uint32 wait_event_info)
+{
+	(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+					 wait_event_info);
+	ResetLatch(MyLatch);
+	CHECK_FOR_INTERRUPTS();
+}
+
+/*
+ * ProcSendSignal - set the latch of a backend identified by pgprocno
+ */
+void
+ProcSendSignal(int pgprocno)
+{
+	if (pgprocno < 0 || pgprocno >= ProcGlobal->allProcCount)
+		elog(ERROR, "pgprocno out of range");
+	/* pgprocno has passed the range check for ProcGlobal->allProcs */
+	SetLatch(&ProcGlobal->allProcs[pgprocno].procLatch);
+}
+
+/*
+ * BecomeLockGroupLeader - designate process as lock group leader
+ *
+ * Once this function has returned, other processes can join the lock group
+ * by calling BecomeLockGroupMember.
+ */
+void
+BecomeLockGroupLeader(void)
+{
+	LWLock	   *leader_lwlock;	/* held while the group fields are set */
+
+	/* If we already did it, we don't need to do it again. */
+	if (MyProc->lockGroupLeader == MyProc)
+		return;
+
+	/* We had better not be a follower. */
+	Assert(MyProc->lockGroupLeader == NULL);
+
+	/* Create single-member group, containing only ourselves. */
+	leader_lwlock = LockHashPartitionLockByProc(MyProc);
+	LWLockAcquire(leader_lwlock, LW_EXCLUSIVE);
+	MyProc->lockGroupLeader = MyProc;
+	dlist_push_head(&MyProc->lockGroupMembers, &MyProc->lockGroupLink);
+	LWLockRelease(leader_lwlock);
+}
+
+/*
+ * BecomeLockGroupMember - designate process as lock group member
+ *
+ * This is pretty straightforward except for the possibility that the leader
+ * whose group we're trying to join might exit before we manage to do so;
+ * and the PGPROC might get recycled for an unrelated process.  To avoid
+ * that, we require the caller to pass the PID of the intended PGPROC as
+ * an interlock.  Returns true if we successfully join the intended lock
+ * group, and false if not.
+ */
+bool
+BecomeLockGroupMember(PGPROC *leader, int pid)
+{
+	LWLock	   *leader_lwlock;
+	bool		ok = false;		/* becomes true only if we join the group */
+
+	/* Group leader can't become member of group */
+	Assert(MyProc != leader);
+
+	/* Can't already be a member of a group */
+	Assert(MyProc->lockGroupLeader == NULL);
+
+	/* PID must be valid. */
+	Assert(pid != 0);
+
+	/*
+	 * Get lock protecting the group fields.  Note LockHashPartitionLockByProc
+	 * accesses leader->pgprocno in a PGPROC that might be free.  This is safe
+	 * because all PGPROCs' pgprocno fields are set during shared memory
+	 * initialization and never change thereafter; so we will acquire the
+	 * correct lock even if the leader PGPROC is in process of being recycled.
+	 */
+	leader_lwlock = LockHashPartitionLockByProc(leader);
+	LWLockAcquire(leader_lwlock, LW_EXCLUSIVE);
+
+	/* Is this the leader we're looking for: same PID, still leading a group? */
+	if (leader->pid == pid && leader->lockGroupLeader == leader)
+	{
+		/* OK, join the group */
+		ok = true;
+		MyProc->lockGroupLeader = leader;
+		dlist_push_tail(&leader->lockGroupMembers, &MyProc->lockGroupLink);
+	}
+	LWLockRelease(leader_lwlock);
+
+	return ok;
+}
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
new file mode 100644
index 0000000..327ac64
--- /dev/null
+++ b/src/backend/storage/lmgr/s_lock.c
@@ -0,0 +1,324 @@
+/*-------------------------------------------------------------------------
+ *
+ * s_lock.c
+ *	   Hardware-dependent implementation of spinlocks.
+ *
+ * When waiting for a contended spinlock we loop tightly for a while, then
+ * delay using pg_usleep() and try again.  Preferably, "a while" should be a
+ * small multiple of the maximum time we expect a spinlock to be held.  100
+ * iterations seems about right as an initial guess.  However, on a
+ * uniprocessor the loop is a waste of cycles, while in a multi-CPU scenario
+ * it's usually better to spin a bit longer than to call the kernel, so we try
+ * to adapt the spin loop count depending on whether we seem to be in a
+ * uniprocessor or multiprocessor.
+ *
+ * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
+ * be wrong; there are platforms where that can result in a "stuck
+ * spinlock" failure.  This has been seen particularly on Alphas; it seems
+ * that the first TAS after returning from kernel space will always fail
+ * on that hardware.
+ *
+ * Once we do decide to block, we use randomly increasing pg_usleep()
+ * delays. The first delay is 1 msec, then the delay randomly increases to
+ * about one second, after which we reset to 1 msec and start again.  The
+ * idea here is that in the presence of heavy contention we need to
+ * increase the delay, else the spinlock holder may never get to run and
+ * release the lock.  (Consider situation where spinlock holder has been
+ * nice'd down in priority by the scheduler --- it will not get scheduled
+ * until all would-be acquirers are sleeping, so if we always use a 1-msec
+ * sleep, there is a real possibility of starvation.)  But we can't just
+ * clamp the delay to an upper bound, else it would take a long time to
+ * make a reasonable number of tries.
+ *
+ * We time out and declare error after NUM_DELAYS delays (thus, exactly
+ * that many tries).  With the given settings, this will usually take 2 or
+ * so minutes.  It seems better to fix the total number of tries (and thus
+ * the probability of unintended failure) than to fix the total time
+ * spent.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/lmgr/s_lock.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <time.h>
+#include <unistd.h>
+
+#include "common/pg_prng.h"
+#include "port/atomics.h"
+#include "storage/s_lock.h"
+#include "utils/wait_event.h"
+
+#define MIN_SPINS_PER_DELAY 10	/* lower bound used when shrinking spins_per_delay */
+#define MAX_SPINS_PER_DELAY 1000	/* upper bound used when growing spins_per_delay */
+#define NUM_DELAYS			1000	/* sleeps allowed before the lock is declared stuck */
+#define MIN_DELAY_USEC		1000L	/* first sleep length, and the value we wrap back to */
+#define MAX_DELAY_USEC		1000000L	/* sleep length above which we wrap to the minimum */
+
+
+slock_t		dummy_spinlock;
+
+static int	spins_per_delay = DEFAULT_SPINS_PER_DELAY;	/* this process's current estimate */
+
+
+/*
+ * s_lock_stuck() - complain about a stuck spinlock, then exit(1) or PANIC
+ */
+static void
+s_lock_stuck(const char *file, int line, const char *func)
+{
+	if (!func)					/* caller supplied no function name */
+		func = "(unknown)";
+#if defined(S_LOCK_TEST)
+	fprintf(stderr,
+			"\nStuck spinlock detected at %s, %s:%d.\n",
+			func, file, line);
+	exit(1);
+#else
+	elog(PANIC, "stuck spinlock detected at %s, %s:%d",
+		 func, file, line);
+#endif							/* S_LOCK_TEST */
+}
+
+/*
+ * s_lock(lock) - platform-independent portion of waiting for a spinlock.
+ */
+int
+s_lock(volatile slock_t *lock, const char *file, int line, const char *func)
+{
+	SpinDelayStatus delayStatus;	/* also carries file/line/func for reporting */
+
+	init_spin_delay(&delayStatus, file, line, func);
+
+	while (TAS_SPIN(lock))		/* keep retrying while TAS_SPIN is nonzero */
+	{
+		perform_spin_delay(&delayStatus);
+	}
+
+	finish_spin_delay(&delayStatus);
+
+	return delayStatus.delays;	/* delay count kept by perform_spin_delay */
+}
+
+#ifdef USE_DEFAULT_S_UNLOCK
+void
+s_unlock(volatile slock_t *lock)
+{
+#ifdef TAS_ACTIVE_WORD
+	/* HP's PA-RISC: store -1 into the active word rather than 0 into *lock */
+	*TAS_ACTIVE_WORD(lock) = -1;
+#else
+	*lock = 0;					/* presumably 0 is the unlocked value here */
+#endif							/* TAS_ACTIVE_WORD */
+}
+#endif							/* USE_DEFAULT_S_UNLOCK */
+
+/*
+ * Wait while spinning on a contended spinlock.
+ */
+void
+perform_spin_delay(SpinDelayStatus *status)
+{
+	/* CPU-specific delay each time through the loop */
+	SPIN_DELAY();
+
+	/* Block the process every spins_per_delay tries */
+	if (++(status->spins) >= spins_per_delay)
+	{
+		if (++(status->delays) > NUM_DELAYS)	/* too many sleeps: give up */
+			s_lock_stuck(status->file, status->line, status->func);
+
+		if (status->cur_delay == 0) /* first time to delay? */
+			status->cur_delay = MIN_DELAY_USEC;
+
+		/*
+		 * Once we start sleeping, the overhead of reporting a wait event is
+		 * justified. Actively spinning easily stands out in profilers, but
+		 * sleeping with an exponential backoff is harder to spot...
+		 *
+		 * We might want to report something more granular at some point, but
+		 * this is better than nothing.
+		 */
+		pgstat_report_wait_start(WAIT_EVENT_SPIN_DELAY);
+		pg_usleep(status->cur_delay);
+		pgstat_report_wait_end();
+
+#if defined(S_LOCK_TEST)
+		fprintf(stdout, "*");
+		fflush(stdout);
+#endif							/* S_LOCK_TEST */
+
+		/* scale the delay by a random factor between 1X and 2X */
+		status->cur_delay += (int) (status->cur_delay *
+									pg_prng_double(&pg_global_prng_state) + 0.5);
+		/* wrap back to minimum delay when max is exceeded */
+		if (status->cur_delay > MAX_DELAY_USEC)
+			status->cur_delay = MIN_DELAY_USEC;
+
+		status->spins = 0;		/* begin a fresh round of spinning */
+	}
+}
+
+/*
+ * After acquiring a spinlock, update estimates about how long to loop.
+ *
+ * If we were able to acquire the lock without delaying, it's a good
+ * indication we are in a multiprocessor.  If we had to delay, it's a sign
+ * (but not a sure thing) that we are in a uniprocessor. Hence, we
+ * decrement spins_per_delay slowly when we had to delay, and increase it
+ * rapidly when we didn't.  It's expected that spins_per_delay will
+ * converge to the minimum value on a uniprocessor and to the maximum
+ * value on a multiprocessor.
+ *
+ * Note: spins_per_delay is local within our current process. We want to
+ * average these observations across multiple backends, since it's
+ * relatively rare for this function to even get entered, and so a single
+ * backend might not live long enough to converge on a good value.  That
+ * is handled by the two routines below.
+ */
+void
+finish_spin_delay(SpinDelayStatus *status)
+{
+	if (status->cur_delay == 0)
+	{
+		/* we never had to delay: grow by 100, capped at the maximum */
+		if (spins_per_delay < MAX_SPINS_PER_DELAY)
+			spins_per_delay = Min(spins_per_delay + 100, MAX_SPINS_PER_DELAY);
+	}
+	else						/* cur_delay != 0: shrink by 1, floored at the minimum */
+	{
+		if (spins_per_delay > MIN_SPINS_PER_DELAY)
+			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
+	}
+}
+
+/*
+ * Set local copy of spins_per_delay during backend startup.
+ *
+ * NB: this has to be pretty fast as it is called while holding a spinlock
+ */
+void
+set_spins_per_delay(int shared_spins_per_delay)
+{
+	spins_per_delay = shared_spins_per_delay;	/* taken as-is; no range check here */
+}
+
+/*
+ * Return the updated shared estimate of spins_per_delay (at backend exit).
+ *
+ * NB: this has to be pretty fast as it is called while holding a spinlock
+ */
+int
+update_spins_per_delay(int shared_spins_per_delay)
+{
+	/*
+	 * We use an exponential moving average with a relatively slow adaption
+	 * rate, so that noise in any one backend's result won't affect the shared
+	 * value too much.  As long as both inputs are within the allowed range,
+	 * the result must be too, so we need not worry about clamping the result.
+	 *
+	 * We deliberately truncate rather than rounding; this is so that single
+	 * adjustments inside a backend can affect the shared estimate (see the
+	 * asymmetric adjustment rules above).
+	 */
+	return (shared_spins_per_delay * 15 + spins_per_delay) / 16;	/* 15/16 shared, 1/16 local */
+}
+
+
+/*****************************************************************************/
+#if defined(S_LOCK_TEST)
+
+/*
+ * test program for verifying a port's spinlock support.
+ */
+
+struct test_lock_struct
+{
+	char		pad1;			/* sentinel byte placed before the lock */
+	slock_t		lock;
+	char		pad2;			/* sentinel byte placed after the lock */
+};
+
+volatile struct test_lock_struct test_lock;
+
+int
+main()
+{
+	pg_prng_seed(&pg_global_prng_state, (uint64) time(NULL));
+
+	test_lock.pad1 = test_lock.pad2 = 0x44; /* sentinels, rechecked after each step */
+
+	S_INIT_LOCK(&test_lock.lock);
+
+	if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+	{
+		printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+		return 1;
+	}
+
+	if (!S_LOCK_FREE(&test_lock.lock))
+	{
+		printf("S_LOCK_TEST: failed, lock not initialized\n");
+		return 1;
+	}
+
+	S_LOCK(&test_lock.lock);	/* first acquisition */
+
+	if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+	{
+		printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+		return 1;
+	}
+
+	if (S_LOCK_FREE(&test_lock.lock))
+	{
+		printf("S_LOCK_TEST: failed, lock not locked\n");
+		return 1;
+	}
+
+	S_UNLOCK(&test_lock.lock);	/* release it again */
+
+	if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+	{
+		printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+		return 1;
+	}
+
+	if (!S_LOCK_FREE(&test_lock.lock))
+	{
+		printf("S_LOCK_TEST: failed, lock not unlocked\n");
+		return 1;
+	}
+
+	S_LOCK(&test_lock.lock);	/* re-acquire; stays held from here on */
+
+	if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+	{
+		printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+		return 1;
+	}
+
+	if (S_LOCK_FREE(&test_lock.lock))
+	{
+		printf("S_LOCK_TEST: failed, lock not re-locked\n");
+		return 1;
+	}
+
+	printf("S_LOCK_TEST: this will print %d stars and then\n", NUM_DELAYS);
+	printf("             exit with a 'stuck spinlock' message\n");
+	printf("             if S_LOCK() and TAS() are working.\n");
+	fflush(stdout);
+
+	s_lock(&test_lock.lock, __FILE__, __LINE__, __func__);	/* lock is already held */
+
+	printf("S_LOCK_TEST: failed, lock not locked\n");	/* reached only if s_lock() returned */
+	return 1;
+}
+
+#endif							/* S_LOCK_TEST */
diff --git a/src/backend/storage/lmgr/spin.c b/src/backend/storage/lmgr/spin.c
new file mode 100644
index 0000000..6052779
--- /dev/null
+++ b/src/backend/storage/lmgr/spin.c
@@ -0,0 +1,180 @@
+/*-------------------------------------------------------------------------
+ *
+ * spin.c
+ *	   Hardware-independent implementation of spinlocks.
+ *
+ *
+ * For machines that have test-and-set (TAS) instructions, s_lock.h/.c
+ * define the spinlock implementation.  This file contains only a stub
+ * implementation for spinlocks using PGSemaphores.  Unless semaphores
+ * are implemented in a way that doesn't involve a kernel call, this
+ * is too slow to be very useful :-(
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/lmgr/spin.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/pg_sema.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+
+
+#ifndef HAVE_SPINLOCKS
+
+/*
+ * No TAS, so spinlocks are implemented as PGSemaphores.
+ */
+
+#ifndef HAVE_ATOMICS
+#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES + NUM_ATOMICS_SEMAPHORES)
+#else							/* HAVE_ATOMICS */
+#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES)
+#endif							/* !HAVE_ATOMICS */
+
+PGSemaphore *SpinlockSemaArray;
+
+#else							/* HAVE_SPINLOCKS */
+
+#define NUM_EMULATION_SEMAPHORES 0
+
+#endif							/* !HAVE_SPINLOCKS */
+
+/*
+ * Report the amount of shared memory needed to store semaphores for spinlock
+ * support.
+ */
+Size
+SpinlockSemaSize(void)
+{
+	return NUM_EMULATION_SEMAPHORES * sizeof(PGSemaphore);	/* 0 if HAVE_SPINLOCKS */
+}
+
+/*
+ * Report number of semaphores needed to support spinlocks.
+ */
+int
+SpinlockSemas(void)
+{
+	return NUM_EMULATION_SEMAPHORES;	/* 0 if HAVE_SPINLOCKS is defined */
+}
+
+#ifndef HAVE_SPINLOCKS
+
+/*
+ * Initialize spinlock emulation.
+ *
+ * This must be called after PGReserveSemaphores().
+ */
+void
+SpinlockSemaInit(void)
+{
+	PGSemaphore *spinsemas;		/* array under construction */
+	int			nsemas = SpinlockSemas();
+	int			i;
+
+	/*
+	 * We must use ShmemAllocUnlocked(), since the spinlock protecting
+	 * ShmemAlloc() obviously can't be ready yet.
+	 */
+	spinsemas = (PGSemaphore *) ShmemAllocUnlocked(SpinlockSemaSize());
+	for (i = 0; i < nsemas; ++i)
+		spinsemas[i] = PGSemaphoreCreate();
+	SpinlockSemaArray = spinsemas;	/* set the global once every slot is filled */
+}
+
+/*
+ * s_lock.h hardware-spinlock emulation using semaphores
+ *
+ * We map all spinlocks onto NUM_EMULATION_SEMAPHORES semaphores.  It's okay to
+ * map multiple spinlocks onto one semaphore because no process should ever
+ * hold more than one at a time.  We just need enough semaphores so that we
+ * aren't adding too much extra contention from that.
+ *
+ * There is one exception to the restriction of only holding one spinlock at a
+ * time, which is that it's ok if emulated atomic operations are nested inside
+ * spinlocks. To avoid the danger of spinlocks and atomics using the same sema,
+ * we make sure "normal" spinlocks and atomics backed by spinlocks use
+ * distinct semaphores (see the nested argument to s_init_lock_sema).
+ *
+ * slock_t is just an int for this implementation; it holds the spinlock
+ * number from 1..NUM_EMULATION_SEMAPHORES.  We intentionally ensure that 0
+ * is not a valid value, so that testing with this code can help find
+ * failures to initialize spinlocks.
+ */
+/* Raise ERROR unless lockndx is within 1..NUM_EMULATION_SEMAPHORES. */
+static inline void
+s_check_valid(int lockndx)
+{
+	if (unlikely(lockndx <= 0 || lockndx > NUM_EMULATION_SEMAPHORES))
+		elog(ERROR, "invalid spinlock number: %d", lockndx);
+}
+
+void
+s_init_lock_sema(volatile slock_t *lock, bool nested)
+{
+	static uint32 counter = 0;	/* one counter for both nested and normal locks */
+	uint32		offset;			/* first lock number of the chosen range */
+	uint32		sema_total;		/* number of semaphores in that range */
+	uint32		idx;			/* lock number stored into *lock */
+
+	if (nested)
+	{
+		/*
+		 * To allow nesting atomics inside spinlocked sections, use a
+		 * different range of semaphores. See comment above.
+		 */
+		offset = 1 + NUM_SPINLOCK_SEMAPHORES;
+		sema_total = NUM_ATOMICS_SEMAPHORES;
+	}
+	else
+	{
+		offset = 1;
+		sema_total = NUM_SPINLOCK_SEMAPHORES;
+	}
+
+	idx = (counter++ % sema_total) + offset;	/* cycle within the chosen range */
+
+	/* double check we did things correctly */
+	s_check_valid(idx);
+
+	*lock = idx;
+}
+
+void
+s_unlock_sema(volatile slock_t *lock)
+{
+	int			lockndx = *lock;	/* 1-based lock number stored in the slock_t */
+
+	s_check_valid(lockndx);
+
+	PGSemaphoreUnlock(SpinlockSemaArray[lockndx - 1]);	/* array index is 0-based */
+}
+
+bool
+s_lock_free_sema(volatile slock_t *lock)
+{
+	/* We don't currently use S_LOCK_FREE anyway */
+	elog(ERROR, "spin.c does not support S_LOCK_FREE()");
+	return false;				/* presumably unreachable after elog(ERROR) */
+}
+
+int
+tas_sema(volatile slock_t *lock)
+{
+	int			lockndx = *lock;
+
+	s_check_valid(lockndx);
+
+	/* TAS macros return 0 on *success*, so negate PGSemaphoreTryLock's result */
+	return !PGSemaphoreTryLock(SpinlockSemaArray[lockndx - 1]);
+}
+
+#endif							/* !HAVE_SPINLOCKS */
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-13 13:44:03 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-13 13:44:03 +0000
commit	293913568e6a7a86fd1479e1cff8e2ecb58d6568 (patch)
tree	fc3b469a3ec5ab71b36ea97cc7aaddb838423a0c /src/backend/storage/lmgr
parent	Initial commit. (diff)
download	postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.tar.xz postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.zip